View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.procedure;
19  
20  import java.io.IOException;
21  import java.io.InterruptedIOException;
22  import java.util.Arrays;
23  import java.util.List;
24  
25  import org.apache.commons.logging.Log;
26  import org.apache.commons.logging.LogFactory;
27  import org.apache.hadoop.hbase.classification.InterfaceAudience;
28  import org.apache.hadoop.hbase.errorhandling.ForeignException;
29  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
30  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
31  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
32  import org.apache.zookeeper.KeeperException;
33  
34  import com.google.protobuf.InvalidProtocolBufferException;
35  
36  /**
37   * ZooKeeper based {@link ProcedureCoordinatorRpcs} for a {@link ProcedureCoordinator}
38   */
39  @InterfaceAudience.Private
40  public class ZKProcedureCoordinatorRpcs implements ProcedureCoordinatorRpcs {
41    public static final Log LOG = LogFactory.getLog(ZKProcedureCoordinatorRpcs.class);
42    private ZKProcedureUtil zkProc = null;
43    protected ProcedureCoordinator coordinator = null;  // if started this should be non-null
44  
45    ZooKeeperWatcher watcher;
46    String procedureType;
47    String coordName;
48  
49    /**
50     * @param watcher zookeeper watcher. Owned by <tt>this</tt> and closed via {@link #close()}
51     * @param procedureClass procedure type name is a category for when there are multiple kinds of
52     *    procedures.-- this becomes a znode so be aware of the naming restrictions
53     * @param coordName name of the node running the coordinator
54     * @throws KeeperException if an unexpected zk error occurs
55     */
56    public ZKProcedureCoordinatorRpcs(ZooKeeperWatcher watcher,
57        String procedureClass, String coordName) throws KeeperException {
58      this.watcher = watcher;
59      this.procedureType = procedureClass;
60      this.coordName = coordName;
61    }
62  
63    /**
64     * The "acquire" phase.  The coordinator creates a new procType/acquired/ znode dir. If znodes
65     * appear, first acquire to relevant listener or sets watch waiting for notification of
66     * the acquire node
67     *
68     * @param proc the Procedure
69     * @param info data to be stored in the acquire node
70     * @param nodeNames children of the acquire phase
71     * @throws IOException if any failure occurs.
72     */
73    @Override
74    final public void sendGlobalBarrierAcquire(Procedure proc, byte[] info, List<String> nodeNames)
75        throws IOException, IllegalArgumentException {
76      String procName = proc.getName();
77      // start watching for the abort node
78      String abortNode = zkProc.getAbortZNode(procName);
79      try {
80        // check to see if the abort node already exists
81        if (ZKUtil.watchAndCheckExists(zkProc.getWatcher(), abortNode)) {
82          abort(abortNode);
83        }
84        // If we get an abort node watch triggered here, we'll go complete creating the acquired
85        // znode but then handle the acquire znode and bail out
86      } catch (KeeperException e) {
87        LOG.error("Failed to watch abort", e);
88        throw new IOException("Failed while watching abort node:" + abortNode, e);
89      }
90  
91      // create the acquire barrier
92      String acquire = zkProc.getAcquiredBarrierNode(procName);
93      LOG.debug("Creating acquire znode:" + acquire);
94      try {
95        // notify all the procedure listeners to look for the acquire node
96        byte[] data = ProtobufUtil.prependPBMagic(info);
97        ZKUtil.createWithParents(zkProc.getWatcher(), acquire, data);
98        // loop through all the children of the acquire phase and watch for them
99        for (String node : nodeNames) {
100         String znode = ZKUtil.joinZNode(acquire, node);
101         LOG.debug("Watching for acquire node:" + znode);
102         if (ZKUtil.watchAndCheckExists(zkProc.getWatcher(), znode)) {
103           coordinator.memberAcquiredBarrier(procName, node);
104         }
105       }
106     } catch (KeeperException e) {
107       throw new IOException("Failed while creating acquire node:" + acquire, e);
108     }
109   }
110 
111   @Override
112   public void sendGlobalBarrierReached(Procedure proc, List<String> nodeNames) throws IOException {
113     String procName = proc.getName();
114     String reachedNode = zkProc.getReachedBarrierNode(procName);
115     LOG.debug("Creating reached barrier zk node:" + reachedNode);
116     try {
117       // create the reached znode and watch for the reached znodes
118       ZKUtil.createWithParents(zkProc.getWatcher(), reachedNode);
119       // loop through all the children of the acquire phase and watch for them
120       for (String node : nodeNames) {
121         String znode = ZKUtil.joinZNode(reachedNode, node);
122         if (ZKUtil.watchAndCheckExists(zkProc.getWatcher(), znode)) {
123           byte[] dataFromMember = ZKUtil.getData(zkProc.getWatcher(), znode);
124           // ProtobufUtil.isPBMagicPrefix will check null
125           if (dataFromMember != null && dataFromMember.length > 0) {
126             if (!ProtobufUtil.isPBMagicPrefix(dataFromMember)) {
127               throw new IOException(
128                 "Failed to get data from finished node or data is illegally formatted: "
129                     + znode);
130             } else {
131               dataFromMember = Arrays.copyOfRange(dataFromMember, ProtobufUtil.lengthOfPBMagic(),
132                 dataFromMember.length);
133               coordinator.memberFinishedBarrier(procName, node, dataFromMember);
134             }
135           } else {
136             coordinator.memberFinishedBarrier(procName, node, dataFromMember);
137           }
138         }
139       }
140     } catch (KeeperException e) {
141       throw new IOException("Failed while creating reached node:" + reachedNode, e);
142     } catch (InterruptedException e) {
143       throw new InterruptedIOException("Interrupted while creating reached node:" + reachedNode);
144     }
145   }
146 
147 
148   /**
149    * Delete znodes that are no longer in use.
150    */
151   @Override
152   final public void resetMembers(Procedure proc) throws IOException {
153     String procName = proc.getName();
154     boolean stillGettingNotifications = false;
155     do {
156       try {
157         LOG.debug("Attempting to clean out zk node for op:" + procName);
158         zkProc.clearZNodes(procName);
159         stillGettingNotifications = false;
160       } catch (KeeperException.NotEmptyException e) {
161         // recursive delete isn't transactional (yet) so we need to deal with cases where we get
162         // children trickling in
163         stillGettingNotifications = true;
164       } catch (KeeperException e) {
165         throw new IOException("Failed to complete reset procedure " + procName, e);
166       }
167     } while (stillGettingNotifications);
168   }
169 
170   /**
171    * Start monitoring znodes in ZK - subclass hook to start monitoring znodes they are about.
172    * @return true if succeed, false if encountered initialization errors.
173    */
174   final public boolean start(final ProcedureCoordinator coordinator) {
175     if (this.coordinator != null) {
176       throw new IllegalStateException(
177         "ZKProcedureCoordinator already started and already has listener installed");
178     }
179     this.coordinator = coordinator;
180 
181     try {
182       this.zkProc = new ZKProcedureUtil(watcher, procedureType) {
183         @Override
184         public void nodeCreated(String path) {
185           if (!isInProcedurePath(path)) return;
186           LOG.debug("Node created: " + path);
187           logZKTree(this.baseZNode);
188           if (isAcquiredPathNode(path)) {
189             // node wasn't present when we created the watch so zk event triggers acquire
190             coordinator.memberAcquiredBarrier(ZKUtil.getNodeName(ZKUtil.getParent(path)),
191               ZKUtil.getNodeName(path));
192           } else if (isReachedPathNode(path)) {
193             // node was absent when we created the watch so zk event triggers the finished barrier.
194 
195             // TODO Nothing enforces that acquire and reached znodes from showing up in wrong order.
196             String procName = ZKUtil.getNodeName(ZKUtil.getParent(path));
197             String member = ZKUtil.getNodeName(path);
198             // get the data from the procedure member
199             try {
200               byte[] dataFromMember = ZKUtil.getData(watcher, path);
201               // ProtobufUtil.isPBMagicPrefix will check null
202               if (dataFromMember != null && dataFromMember.length > 0) {
203                 if (!ProtobufUtil.isPBMagicPrefix(dataFromMember)) {
204                   ForeignException ee = new ForeignException(coordName,
205                     "Failed to get data from finished node or data is illegally formatted:"
206                         + path);
207                   coordinator.abortProcedure(procName, ee);
208                 } else {
209                   dataFromMember = Arrays.copyOfRange(dataFromMember, ProtobufUtil.lengthOfPBMagic(),
210                     dataFromMember.length);
211                   LOG.debug("Finished data from procedure '" + procName
212                     + "' member '" + member + "': " + new String(dataFromMember));
213                   coordinator.memberFinishedBarrier(procName, member, dataFromMember);
214                 }
215               } else {
216                 coordinator.memberFinishedBarrier(procName, member, dataFromMember);
217               }
218             } catch (KeeperException e) {
219               ForeignException ee = new ForeignException(coordName, e);
220               coordinator.abortProcedure(procName, ee);
221             } catch (InterruptedException e) {
222               ForeignException ee = new ForeignException(coordName, e);
223               coordinator.abortProcedure(procName, ee);
224             }
225           } else if (isAbortPathNode(path)) {
226             abort(path);
227           } else {
228             LOG.debug("Ignoring created notification for node:" + path);
229           }
230         }
231       };
232       zkProc.clearChildZNodes();
233     } catch (KeeperException e) {
234       LOG.error("Unable to start the ZK-based Procedure Coordinator rpcs.", e);
235       return false;
236     }
237 
238     LOG.debug("Starting the controller for procedure member:" + coordName);
239     return true;
240   }
241 
242   /**
243    * This is the abort message being sent by the coordinator to member
244    *
245    * TODO this code isn't actually used but can be used to issue a cancellation from the
246    * coordinator.
247    */
248   @Override
249   final public void sendAbortToMembers(Procedure proc, ForeignException ee) {
250     String procName = proc.getName();
251     LOG.debug("Aborting procedure '" + procName + "' in zk");
252     String procAbortNode = zkProc.getAbortZNode(procName);
253     try {
254       LOG.debug("Creating abort znode:" + procAbortNode);
255       String source = (ee.getSource() == null) ? coordName : ee.getSource();
256       byte[] errorInfo = ProtobufUtil.prependPBMagic(ForeignException.serialize(source, ee));
257       // first create the znode for the procedure
258       ZKUtil.createAndFailSilent(zkProc.getWatcher(), procAbortNode, errorInfo);
259       LOG.debug("Finished creating abort node:" + procAbortNode);
260     } catch (KeeperException e) {
261       // possible that we get this error for the procedure if we already reset the zk state, but in
262       // that case we should still get an error for that procedure anyways
263       zkProc.logZKTree(zkProc.baseZNode);
264       coordinator.rpcConnectionFailure("Failed to post zk node:" + procAbortNode
265           + " to abort procedure '" + procName + "'", new IOException(e));
266     }
267   }
268 
269   /**
270    * Receive a notification and propagate it to the local coordinator
271    * @param abortNode full znode path to the failed procedure information
272    */
273   protected void abort(String abortNode) {
274     String procName = ZKUtil.getNodeName(abortNode);
275     ForeignException ee = null;
276     try {
277       byte[] data = ZKUtil.getData(zkProc.getWatcher(), abortNode);
278       if (data == null || data.length == 0) {
279         // ignore
280         return;
281       } else if (!ProtobufUtil.isPBMagicPrefix(data)) {
282         LOG.warn("Got an error notification for op:" + abortNode
283             + " but we can't read the information. Killing the procedure.");
284         // we got a remote exception, but we can't describe it
285         ee = new ForeignException(coordName, "Data in abort node is illegally formatted.  ignoring content.");
286       } else {
287 
288         data = Arrays.copyOfRange(data, ProtobufUtil.lengthOfPBMagic(), data.length);
289         ee = ForeignException.deserialize(data);
290       }
291     } catch (InvalidProtocolBufferException e) {
292       LOG.warn("Got an error notification for op:" + abortNode
293           + " but we can't read the information. Killing the procedure.");
294       // we got a remote exception, but we can't describe it
295       ee = new ForeignException(coordName, e);
296     } catch (KeeperException e) {
297       coordinator.rpcConnectionFailure("Failed to get data for abort node:" + abortNode
298           + zkProc.getAbortZnode(), new IOException(e));
299     } catch (InterruptedException e) {
300       coordinator.rpcConnectionFailure("Failed to get data for abort node:" + abortNode
301           + zkProc.getAbortZnode(), new IOException(e));
302       Thread.currentThread().interrupt();
303     }
304     coordinator.abortProcedure(procName, ee);
305   }
306 
307   @Override
308   final public void close() throws IOException {
309     zkProc.close();
310   }
311 
312   /**
313    * Used in testing
314    */
315   final ZKProcedureUtil getZkProcedureUtil() {
316     return zkProc;
317   }
318 }