View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.zookeeper;
20  
21  import java.util.List;
22  
23  import org.apache.commons.logging.Log;
24  import org.apache.commons.logging.LogFactory;
25  import org.apache.hadoop.hbase.classification.InterfaceAudience;
26  import org.apache.hadoop.hbase.HConstants;
27  import org.apache.hadoop.hbase.HRegionInfo;
28  import org.apache.hadoop.hbase.RegionTransition;
29  import org.apache.hadoop.hbase.ServerName;
30  import org.apache.hadoop.hbase.exceptions.DeserializationException;
31  import org.apache.hadoop.hbase.executor.EventType;
32  import org.apache.zookeeper.AsyncCallback;
33  import org.apache.zookeeper.KeeperException;
34  import org.apache.zookeeper.KeeperException.Code;
35  import org.apache.zookeeper.data.Stat;
36  
37  // We should not be importing this Type here, nor a RegionTransition, etc.  This class should be
38  // about zk and bytes only.
39  
40  /**
41   * Utility class for doing region assignment in ZooKeeper.  This class extends
42   * stuff done in {@link ZKUtil} to cover specific assignment operations.
43   * <p>
44   * Contains only static methods and constants.
45   * <p>
46   * Used by both the Master and RegionServer.
47   * <p>
48   * All valid transitions outlined below:
49   * <p>
50   * <b>MASTER</b>
51   * <ol>
52   *   <li>
53   *     Master creates an unassigned node as OFFLINE.
54   *     - Cluster startup and table enabling.
55   *   </li>
56   *   <li>
57   *     Master forces an existing unassigned node to OFFLINE.
58   *     - RegionServer failure.
59   *     - Allows transitions from all states to OFFLINE.
60   *   </li>
61   *   <li>
62   *     Master deletes an unassigned node that was in an OPENED state.
63   *     - Normal region transitions.  Besides cluster startup, no other deletions
64   *     of unassigned nodes are allowed.
65   *   </li>
66   *   <li>
67   *     Master deletes all unassigned nodes regardless of state.
68   *     - Cluster startup before any assignment happens.
69   *   </li>
70   * </ol>
71   * <p>
72   * <b>REGIONSERVER</b>
73   * <ol>
74   *   <li>
75   *     RegionServer creates an unassigned node as CLOSING.
76   *     - All region closes will do this in response to a CLOSE RPC from Master.
77   *     - A node can never be transitioned to CLOSING, only created.
78   *   </li>
79   *   <li>
80   *     RegionServer transitions an unassigned node from CLOSING to CLOSED.
81   *     - Normal region closes.  CAS operation.
82   *   </li>
83   *   <li>
84   *     RegionServer transitions an unassigned node from OFFLINE to OPENING.
85   *     - All region opens will do this in response to an OPEN RPC from the Master.
86   *     - Normal region opens.  CAS operation.
87   *   </li>
88   *   <li>
89   *     RegionServer transitions an unassigned node from OPENING to OPENED.
90   *     - Normal region opens.  CAS operation.
91   *   </li>
92   * </ol>
93   */
94  @InterfaceAudience.Private
95  public class ZKAssign {
96    private static final Log LOG = LogFactory.getLog(ZKAssign.class);
97  
98    /**
99     * Gets the full path node name for the unassigned node for the specified
100    * region.
101    * @param zkw zk reference
102    * @param regionName region name
103    * @return full path node name
104    */
105   public static String getNodeName(ZooKeeperWatcher zkw, String regionName) {
106     return ZKUtil.joinZNode(zkw.assignmentZNode, regionName);
107   }
108 
109   /**
110    * Gets the region name from the full path node name of an unassigned node.
111    * @param path full zk path
112    * @return region name
113    */
114   public static String getRegionName(ZooKeeperWatcher zkw, String path) {
115     return path.substring(zkw.assignmentZNode.length()+1);
116   }
117 
118   // Master methods
119 
120   /**
121    * Creates a new unassigned node in the OFFLINE state for the specified region.
122    *
123    * <p>Does not transition nodes from other states.  If a node already exists
124    * for this region, a {@link org.apache.zookeeper.KeeperException.NodeExistsException} 
125    * will be thrown.
126    *
127    * <p>Sets a watcher on the unassigned region node if the method is successful.
128    *
129    * <p>This method should only be used during cluster startup and the enabling
130    * of a table.
131    *
132    * @param zkw zk reference
133    * @param region region to be created as offline
134    * @param serverName server transition will happen on
135    * @throws KeeperException if unexpected zookeeper exception
136    * @throws KeeperException.NodeExistsException if node already exists
137    */
138   public static void createNodeOffline(ZooKeeperWatcher zkw, HRegionInfo region,
139       ServerName serverName)
140   throws KeeperException, KeeperException.NodeExistsException {
141     createNodeOffline(zkw, region, serverName, EventType.M_ZK_REGION_OFFLINE);
142   }
143 
144   public static void createNodeOffline(ZooKeeperWatcher zkw, HRegionInfo region,
145       ServerName serverName, final EventType event)
146   throws KeeperException, KeeperException.NodeExistsException {
147     LOG.debug(zkw.prefix("Creating unassigned node " +
148       region.getEncodedName() + " in OFFLINE state"));
149     RegionTransition rt =
150       RegionTransition.createRegionTransition(event, region.getRegionName(), serverName);
151     String node = getNodeName(zkw, region.getEncodedName());
152     ZKUtil.createAndWatch(zkw, node, rt.toByteArray());
153   }
154 
155   /**
156    * Creates an unassigned node in the OFFLINE state for the specified region.
157    * <p>
158    * Runs asynchronously.  Depends on no pre-existing znode.
159    *
160    * <p>Sets a watcher on the unassigned region node.
161    *
162    * @param zkw zk reference
163    * @param region region to be created as offline
164    * @param serverName server transition will happen on
165    * @param cb
166    * @param ctx
167    * @throws KeeperException if unexpected zookeeper exception
168    * @throws KeeperException.NodeExistsException if node already exists
169    */
170   public static void asyncCreateNodeOffline(ZooKeeperWatcher zkw,
171       HRegionInfo region, ServerName serverName,
172       final AsyncCallback.StringCallback cb, final Object ctx)
173   throws KeeperException {
174     LOG.debug(zkw.prefix("Async create of unassigned node " +
175       region.getEncodedName() + " with OFFLINE state"));
176     RegionTransition rt =
177       RegionTransition.createRegionTransition(
178           EventType.M_ZK_REGION_OFFLINE, region.getRegionName(), serverName);
179     String node = getNodeName(zkw, region.getEncodedName());
180     ZKUtil.asyncCreate(zkw, node, rt.toByteArray(), cb, ctx);
181   }
182 
183   /**
184    * Creates or force updates an unassigned node to the OFFLINE state for the
185    * specified region.
186    * <p>
187    * Attempts to create the node but if it exists will force it to transition to
188    * and OFFLINE state.
189    *
190    * <p>Sets a watcher on the unassigned region node if the method is
191    * successful.
192    *
193    * <p>This method should be used when assigning a region.
194    *
195    * @param zkw zk reference
196    * @param region region to be created as offline
197    * @param serverName server transition will happen on
198    * @return the version of the znode created in OFFLINE state, -1 if
199    *         unsuccessful.
200    * @throws KeeperException if unexpected zookeeper exception
201    * @throws KeeperException.NodeExistsException if node already exists
202    */
203   public static int createOrForceNodeOffline(ZooKeeperWatcher zkw,
204       HRegionInfo region, ServerName serverName) throws KeeperException {
205     LOG.debug(zkw.prefix("Creating (or updating) unassigned node " +
206       region.getEncodedName() + " with OFFLINE state"));
207     RegionTransition rt = RegionTransition.createRegionTransition(EventType.M_ZK_REGION_OFFLINE,
208       region.getRegionName(), serverName, HConstants.EMPTY_BYTE_ARRAY);
209     byte [] data = rt.toByteArray();
210     String node = getNodeName(zkw, region.getEncodedName());
211     zkw.sync(node);
212     int version = ZKUtil.checkExists(zkw, node);
213     if (version == -1) {
214       return ZKUtil.createAndWatch(zkw, node, data);
215     } else {
216       boolean setData = false;
217       try {
218         setData = ZKUtil.setData(zkw, node, data, version);
219         // Setdata throws KeeperException which aborts the Master. So we are
220         // catching it here.
221         // If just before setting the znode to OFFLINE if the RS has made any
222         // change to the
223         // znode state then we need to return -1.
224       } catch (KeeperException kpe) {
225         LOG.info("Version mismatch while setting the node to OFFLINE state.");
226         return -1;
227       }
228       if (!setData) {
229         return -1;
230       } else {
231         // We successfully forced to OFFLINE, reset watch and handle if
232         // the state changed in between our set and the watch
233         byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
234         rt = getRegionTransition(bytes);
235         if (rt.getEventType() != EventType.M_ZK_REGION_OFFLINE) {
236           // state changed, need to process
237           return -1;
238         }
239       }
240     }
241     return version + 1;
242   }
243 
244   /**
245    * Deletes an existing unassigned node that is in the OPENED state for the
246    * specified region.
247    *
248    * <p>If a node does not already exist for this region, a
249    * {@link org.apache.zookeeper.KeeperException.NoNodeException} will be thrown.
250    *
251    * <p>No watcher is set whether this succeeds or not.
252    *
253    * <p>Returns false if the node was not in the proper state but did exist.
254    *
255    * <p>This method is used during normal region transitions when a region
256    * finishes successfully opening.  This is the Master acknowledging completion
257    * of the specified regions transition.
258    *
259    * @param zkw zk reference
260    * @param encodedRegionName opened region to be deleted from zk
261    * @param sn the expected region transition target server name
262    * @throws KeeperException if unexpected zookeeper exception
263    * @throws KeeperException.NoNodeException if node does not exist
264    */
265   public static boolean deleteOpenedNode(ZooKeeperWatcher zkw,
266       String encodedRegionName, ServerName sn)
267   throws KeeperException, KeeperException.NoNodeException {
268     return deleteNode(zkw, encodedRegionName,
269       EventType.RS_ZK_REGION_OPENED, sn);
270   }
271 
272   /**
273    * Deletes an existing unassigned node that is in the OFFLINE state for the
274    * specified region.
275    *
276    * <p>If a node does not already exist for this region, a
277    * {@link org.apache.zookeeper.KeeperException.NoNodeException} will be thrown.
278    *
279    * <p>No watcher is set whether this succeeds or not.
280    *
281    * <p>Returns false if the node was not in the proper state but did exist.
282    *
283    * <p>This method is used during master failover when the regions on an RS
284    * that has died are all set to OFFLINE before being processed.
285    *
286    * @param zkw zk reference
287    * @param encodedRegionName closed region to be deleted from zk
288    * @param sn the expected region transition target server name
289    * @throws KeeperException if unexpected zookeeper exception
290    * @throws KeeperException.NoNodeException if node does not exist
291    */
292   public static boolean deleteOfflineNode(ZooKeeperWatcher zkw,
293       String encodedRegionName, ServerName sn)
294   throws KeeperException, KeeperException.NoNodeException {
295     return deleteNode(zkw, encodedRegionName,
296       EventType.M_ZK_REGION_OFFLINE, sn);
297   }
298 
299   /**
300    * Deletes an existing unassigned node that is in the CLOSED state for the
301    * specified region.
302    *
303    * <p>If a node does not already exist for this region, a
304    * {@link org.apache.zookeeper.KeeperException.NoNodeException} will be thrown.
305    *
306    * <p>No watcher is set whether this succeeds or not.
307    *
308    * <p>Returns false if the node was not in the proper state but did exist.
309    *
310    * <p>This method is used during table disables when a region finishes
311    * successfully closing.  This is the Master acknowledging completion
312    * of the specified regions transition to being closed.
313    *
314    * @param zkw zk reference
315    * @param encodedRegionName closed region to be deleted from zk
316    * @param sn the expected region transition target server name
317    * @throws KeeperException if unexpected zookeeper exception
318    * @throws KeeperException.NoNodeException if node does not exist
319    */
320   public static boolean deleteClosedNode(ZooKeeperWatcher zkw,
321       String encodedRegionName, ServerName sn)
322   throws KeeperException, KeeperException.NoNodeException {
323     return deleteNode(zkw, encodedRegionName,
324       EventType.RS_ZK_REGION_CLOSED, sn);
325   }
326 
327   /**
328    * Deletes an existing unassigned node that is in the CLOSING state for the
329    * specified region.
330    *
331    * <p>If a node does not already exist for this region, a
332    * {@link org.apache.zookeeper.KeeperException.NoNodeException} will be thrown.
333    *
334    * <p>No watcher is set whether this succeeds or not.
335    *
336    * <p>Returns false if the node was not in the proper state but did exist.
337    *
338    * <p>This method is used during table disables when a region finishes
339    * successfully closing.  This is the Master acknowledging completion
340    * of the specified regions transition to being closed.
341    *
342    * @param zkw zk reference
343    * @param region closing region to be deleted from zk
344    * @param sn the expected region transition target server name
345    * @throws KeeperException if unexpected zookeeper exception
346    * @throws KeeperException.NoNodeException if node does not exist
347    */
348   public static boolean deleteClosingNode(ZooKeeperWatcher zkw,
349       HRegionInfo region, ServerName sn)
350   throws KeeperException, KeeperException.NoNodeException {
351     String encodedRegionName = region.getEncodedName();
352     return deleteNode(zkw, encodedRegionName,
353       EventType.M_ZK_REGION_CLOSING, sn);
354   }
355 
356   /**
357    * Deletes an existing unassigned node that is in the specified state for the
358    * specified region.
359    *
360    * <p>If a node does not already exist for this region, a
361    * {@link org.apache.zookeeper.KeeperException.NoNodeException} will be thrown.
362    *
363    * <p>No watcher is set whether this succeeds or not.
364    *
365    * <p>Returns false if the node was not in the proper state but did exist.
366    *
367    * <p>This method is used when a region finishes opening/closing.
368    * The Master acknowledges completion
369    * of the specified regions transition to being closed/opened.
370    *
371    * @param zkw zk reference
372    * @param encodedRegionName region to be deleted from zk
373    * @param expectedState state region must be in for delete to complete
374    * @param sn the expected region transition target server name
375    * @throws KeeperException if unexpected zookeeper exception
376    * @throws KeeperException.NoNodeException if node does not exist
377    */
378   public static boolean deleteNode(ZooKeeperWatcher zkw, String encodedRegionName,
379       EventType expectedState, ServerName sn)
380   throws KeeperException, KeeperException.NoNodeException {
381     return deleteNode(zkw, encodedRegionName, expectedState, sn, -1);
382   }
383 
384   /**
385    * Deletes an existing unassigned node that is in the specified state for the
386    * specified region.
387    *
388    * <p>If a node does not already exist for this region, a
389    * {@link org.apache.zookeeper.KeeperException.NoNodeException} will be thrown.
390    *
391    * <p>No watcher is set whether this succeeds or not.
392    *
393    * <p>Returns false if the node was not in the proper state but did exist.
394    *
395    * <p>This method is used when a region finishes opening/closing.
396    * The Master acknowledges completion
397    * of the specified regions transition to being closed/opened.
398    *
399    * @param zkw zk reference
400    * @param encodedRegionName region to be deleted from zk
401    * @param expectedState state region must be in for delete to complete
402    * @param expectedVersion of the znode that is to be deleted.
403    *        If expectedVersion need not be compared while deleting the znode
404    *        pass -1
405    * @throws KeeperException if unexpected zookeeper exception
406    * @throws KeeperException.NoNodeException if node does not exist
407    */
408   public static boolean deleteNode(ZooKeeperWatcher zkw, String encodedRegionName,
409       EventType expectedState, int expectedVersion)
410   throws KeeperException, KeeperException.NoNodeException {
411     return deleteNode(zkw, encodedRegionName, expectedState, null, expectedVersion);
412   }
413 
414   /**
415    * Deletes an existing unassigned node that is in the specified state for the
416    * specified region.
417    *
418    * <p>If a node does not already exist for this region, a
419    * {@link org.apache.zookeeper.KeeperException.NoNodeException} will be thrown.
420    *
421    * <p>No watcher is set whether this succeeds or not.
422    *
423    * <p>Returns false if the node was not in the proper state but did exist.
424    *
425    * <p>This method is used when a region finishes opening/closing.
426    * The Master acknowledges completion
427    * of the specified regions transition to being closed/opened.
428    *
429    * @param zkw zk reference
430    * @param encodedRegionName region to be deleted from zk
431    * @param expectedState state region must be in for delete to complete
432    * @param serverName the expected region transition target server name
433    * @param expectedVersion of the znode that is to be deleted.
434    *        If expectedVersion need not be compared while deleting the znode
435    *        pass -1
436    * @throws KeeperException if unexpected zookeeper exception
437    * @throws KeeperException.NoNodeException if node does not exist
438    */
439   public static boolean deleteNode(ZooKeeperWatcher zkw, String encodedRegionName,
440       EventType expectedState, ServerName serverName, int expectedVersion)
441   throws KeeperException, KeeperException.NoNodeException {
442     if (LOG.isTraceEnabled()) {
443       LOG.trace(zkw.prefix("Deleting existing unassigned " +
444         "node " + encodedRegionName + " in expected state " + expectedState));
445     }
446     String node = getNodeName(zkw, encodedRegionName);
447     zkw.sync(node);
448     Stat stat = new Stat();
449     byte [] bytes = ZKUtil.getDataNoWatch(zkw, node, stat);
450     if (bytes == null) {
451       // If it came back null, node does not exist.
452       throw KeeperException.create(Code.NONODE);
453     }
454     RegionTransition rt = getRegionTransition(bytes);
455     EventType et = rt.getEventType();
456     if (!et.equals(expectedState)) {
457       LOG.warn(zkw.prefix("Attempting to delete unassigned node " + encodedRegionName + " in " +
458         expectedState + " state but node is in " + et + " state"));
459       return false;
460     }
461     // Verify the server transition happens on is not changed
462     if (serverName != null && !rt.getServerName().equals(serverName)) {
463       LOG.warn(zkw.prefix("Attempting to delete unassigned node " + encodedRegionName
464         + " with target " + serverName + " but node has " + rt.getServerName()));
465       return false;
466     }
467     if (expectedVersion != -1
468         && stat.getVersion() != expectedVersion) {
469       LOG.warn("The node " + encodedRegionName + " we are trying to delete is not" +
470         " the expected one. Got a version mismatch");
471       return false;
472     }
473     if(!ZKUtil.deleteNode(zkw, node, stat.getVersion())) {
474       LOG.warn(zkw.prefix("Attempting to delete " +
475           "unassigned node " + encodedRegionName + " in " + expectedState +
476           " state but after verifying state, we got a version mismatch"));
477       return false;
478     }
479     LOG.debug(zkw.prefix("Deleted unassigned node " +
480         encodedRegionName + " in expected state " + expectedState));
481     return true;
482   }
483 
484   /**
485    * Deletes all unassigned nodes regardless of their state.
486    *
487    * <p>No watchers are set.
488    *
489    * <p>This method is used by the Master during cluster startup to clear out
490    * any existing state from other cluster runs.
491    *
492    * @param zkw zk reference
493    * @throws KeeperException if unexpected zookeeper exception
494    */
495   public static void deleteAllNodes(ZooKeeperWatcher zkw)
496   throws KeeperException {
497     LOG.debug(zkw.prefix("Deleting any existing unassigned nodes"));
498     ZKUtil.deleteChildrenRecursively(zkw, zkw.assignmentZNode);
499   }
500 
501   /**
502    * Creates a new unassigned node in the CLOSING state for the specified
503    * region.
504    *
505    * <p>Does not transition nodes from any states.  If a node already exists
506    * for this region, a {@link org.apache.zookeeper.KeeperException.NodeExistsException} 
507    * will be thrown.
508    *
509    * <p>If creation is successful, returns the version number of the CLOSING
510    * node created.
511    *
512    * <p>Set a watch.
513    *
514    * <p>This method should only be used by a Master when initiating a
515    * close of a region before sending a close request to the region server.
516    *
517    * @param zkw zk reference
518    * @param region region to be created as closing
519    * @param serverName server transition will happen on
520    * @return version of node after transition, -1 if unsuccessful transition
521    * @throws KeeperException if unexpected zookeeper exception
522    * @throws KeeperException.NodeExistsException if node already exists
523    */
524   public static int createNodeClosing(ZooKeeperWatcher zkw, HRegionInfo region,
525       ServerName serverName)
526   throws KeeperException, KeeperException.NodeExistsException {
527     LOG.debug(zkw.prefix("Creating unassigned node " +
528       region.getEncodedName() + " in a CLOSING state"));
529     RegionTransition rt = RegionTransition.createRegionTransition(EventType.M_ZK_REGION_CLOSING,
530       region.getRegionName(), serverName, HConstants.EMPTY_BYTE_ARRAY);
531     String node = getNodeName(zkw, region.getEncodedName());
532     return ZKUtil.createAndWatch(zkw, node, rt.toByteArray());
533   }
534 
535   // RegionServer methods
536 
537   /**
538    * Transitions an existing unassigned node for the specified region which is
539    * currently in the CLOSING state to be in the CLOSED state.
540    *
541    * <p>Does not transition nodes from other states.  If for some reason the
542    * node could not be transitioned, the method returns -1.  If the transition
543    * is successful, the version of the node after transition is returned.
544    *
545    * <p>This method can fail and return false for three different reasons:
546    * <ul><li>Unassigned node for this region does not exist</li>
547    * <li>Unassigned node for this region is not in CLOSING state</li>
548    * <li>After verifying CLOSING state, update fails because of wrong version
549    * (someone else already transitioned the node)</li>
550    * </ul>
551    *
552    * <p>Does not set any watches.
553    *
554    * <p>This method should only be used by a RegionServer when initiating a
555    * close of a region after receiving a CLOSE RPC from the Master.
556    *
557    * @param zkw zk reference
558    * @param region region to be transitioned to closed
559    * @param serverName server transition happens on
560    * @return version of node after transition, -1 if unsuccessful transition
561    * @throws KeeperException if unexpected zookeeper exception
562    */
563   public static int transitionNodeClosed(ZooKeeperWatcher zkw,
564       HRegionInfo region, ServerName serverName, int expectedVersion)
565   throws KeeperException {
566     return transitionNode(zkw, region, serverName,
567         EventType.M_ZK_REGION_CLOSING,
568         EventType.RS_ZK_REGION_CLOSED, expectedVersion);
569   }
570 
571   /**
572    * Transitions an existing unassigned node for the specified region which is
573    * currently in the OFFLINE state to be in the OPENING state.
574    *
575    * <p>Does not transition nodes from other states.  If for some reason the
576    * node could not be transitioned, the method returns -1.  If the transition
577    * is successful, the version of the node written as OPENING is returned.
578    *
579    * <p>This method can fail and return -1 for three different reasons:
580    * <ul><li>Unassigned node for this region does not exist</li>
581    * <li>Unassigned node for this region is not in OFFLINE state</li>
582    * <li>After verifying OFFLINE state, update fails because of wrong version
583    * (someone else already transitioned the node)</li>
584    * </ul>
585    *
586    * <p>Does not set any watches.
587    *
588    * <p>This method should only be used by a RegionServer when initiating an
589    * open of a region after receiving an OPEN RPC from the Master.
590    *
591    * @param zkw zk reference
592    * @param region region to be transitioned to opening
593    * @param serverName server transition happens on
594    * @return version of node after transition, -1 if unsuccessful transition
595    * @throws KeeperException if unexpected zookeeper exception
596    */
597   public static int transitionNodeOpening(ZooKeeperWatcher zkw,
598       HRegionInfo region, ServerName serverName)
599   throws KeeperException {
600     return transitionNodeOpening(zkw, region, serverName,
601       EventType.M_ZK_REGION_OFFLINE);
602   }
603 
604   public static int transitionNodeOpening(ZooKeeperWatcher zkw,
605       HRegionInfo region, ServerName serverName, final EventType beginState)
606   throws KeeperException {
607     return transitionNode(zkw, region, serverName, beginState,
608       EventType.RS_ZK_REGION_OPENING, -1);
609   }
610 
611   /**
612    * Confirm an existing unassigned node for the specified region which is
613    * currently in the OPENING state to be still in the OPENING state on
614    * the specified server.
615    *
616    * <p>If for some reason the check fails, the method returns -1. Otherwise,
617    * the version of the node (same as the expected version) is returned.
618    *
619    * <p>This method can fail and return -1 for three different reasons:
620    * <ul><li>Unassigned node for this region does not exist</li>
621    * <li>Unassigned node for this region is not in OPENING state</li>
622    * <li>After verifying OPENING state, the server name or the version of the
623    * doesn't match)</li>
624    * </ul>
625    *
626    * <p>Does not set any watches.
627    *
628    * <p>This method should only be used by a RegionServer when initiating an
629    * open of a region after receiving an OPEN RPC from the Master.
630    *
631    * @param zkw zk reference
632    * @param region region to be transitioned to opening
633    * @param serverName server transition happens on
634    * @return version of node after transition, -1 if unsuccessful transition
635    * @throws KeeperException if unexpected zookeeper exception
636    */
637   public static int confirmNodeOpening(ZooKeeperWatcher zkw,
638       HRegionInfo region, ServerName serverName, int expectedVersion)
639   throws KeeperException {
640 
641     String encoded = region.getEncodedName();
642     if(LOG.isDebugEnabled()) {
643       LOG.debug(zkw.prefix("Attempting to retransition opening state of node " +
644           HRegionInfo.prettyPrint(encoded)));
645     }
646 
647     String node = getNodeName(zkw, encoded);
648     zkw.sync(node);
649 
650     // Read existing data of the node
651     Stat stat = new Stat();
652     byte [] existingBytes = ZKUtil.getDataNoWatch(zkw, node, stat);
653     if (existingBytes == null) {
654       // Node no longer exists.  Return -1. It means unsuccessful transition.
655       return -1;
656     }
657     RegionTransition rt = getRegionTransition(existingBytes);
658 
659     // Verify it is the expected version
660     if (expectedVersion != -1 && stat.getVersion() != expectedVersion) {
661       LOG.warn(zkw.prefix("Attempt to retransition the opening state of the " +
662           "unassigned node for " + encoded + " failed, " +
663           "the node existed but was version " + stat.getVersion() +
664           " not the expected version " + expectedVersion));
665       return -1;
666     }
667 
668     // Verify it is in expected state
669     EventType et = rt.getEventType();
670     if (!et.equals(EventType.RS_ZK_REGION_OPENING)) {
671       String existingServer = (rt.getServerName() == null)
672           ? "<unknown>" : rt.getServerName().toString();
673       LOG.warn(zkw.prefix("Attempt to retransition the opening state of the unassigned node for "
674           + encoded + " failed, the node existed but was in the state " + et +
675           " set by the server " + existingServer));
676       return -1;
677     }
678 
679     return expectedVersion;
680   }
681 
682   /**
683    * Transitions an existing unassigned node for the specified region which is
684    * currently in the OPENING state to be in the OPENED state.
685    *
686    * <p>Does not transition nodes from other states.  If for some reason the
687    * node could not be transitioned, the method returns -1.  If the transition
688    * is successful, the version of the node after transition is returned.
689    *
690    * <p>This method can fail and return false for three different reasons:
691    * <ul><li>Unassigned node for this region does not exist</li>
692    * <li>Unassigned node for this region is not in OPENING state</li>
693    * <li>After verifying OPENING state, update fails because of wrong version
694    * (this should never actually happen since an RS only does this transition
695    * following a transition to OPENING.  if two RS are conflicting, one would
696    * fail the original transition to OPENING and not this transition)</li>
697    * </ul>
698    *
699    * <p>Does not set any watches.
700    *
701    * <p>This method should only be used by a RegionServer when completing the
702    * open of a region.
703    *
704    * @param zkw zk reference
705    * @param region region to be transitioned to opened
706    * @param serverName server transition happens on
707    * @return version of node after transition, -1 if unsuccessful transition
708    * @throws KeeperException if unexpected zookeeper exception
709    */
710   public static int transitionNodeOpened(ZooKeeperWatcher zkw,
711       HRegionInfo region, ServerName serverName, int expectedVersion)
712   throws KeeperException {
713     return transitionNode(zkw, region, serverName,
714         EventType.RS_ZK_REGION_OPENING,
715         EventType.RS_ZK_REGION_OPENED, expectedVersion);
716   }
717 
718   /**
719    *
720    * @param zkw zk reference
721    * @param region region to be closed
722    * @param expectedVersion expected version of the znode
723    * @return true if the znode exists, has the right version and the right state. False otherwise.
724    * @throws KeeperException
725    */
726   public static boolean checkClosingState(ZooKeeperWatcher zkw, HRegionInfo region,
727                                           int expectedVersion) throws KeeperException {
728 
729     final String encoded = getNodeName(zkw, region.getEncodedName());
730     zkw.sync(encoded);
731 
732     // Read existing data of the node
733     Stat stat = new Stat();
734     byte[] existingBytes = ZKUtil.getDataNoWatch(zkw, encoded, stat);
735 
736     if (existingBytes == null) {
737       LOG.warn(zkw.prefix("Attempt to check the " +
738           "closing node for " + encoded +
739           ". The node does not exist"));
740       return false;
741     }
742 
743     if (expectedVersion != -1 && stat.getVersion() != expectedVersion) {
744       LOG.warn(zkw.prefix("Attempt to check the " +
745           "closing node for " + encoded +
746           ". The node existed but was version " + stat.getVersion() +
747           " not the expected version " + expectedVersion));
748       return false;
749     }
750 
751     RegionTransition rt = getRegionTransition(existingBytes);
752 
753     if (!EventType.M_ZK_REGION_CLOSING.equals(rt.getEventType())) {
754       LOG.warn(zkw.prefix("Attempt to check the " +
755           "closing node for " + encoded +
756           ". The node existed but was in an unexpected state: " + rt.getEventType()));
757       return false;
758     }
759 
760     return true;
761   }
762 
763   /**
764    * Method that actually performs unassigned node transitions.
765    *
766    * <p>Attempts to transition the unassigned node for the specified region
767    * from the expected state to the state in the specified transition data.
768    *
769    * <p>Method first reads existing data and verifies it is in the expected
770    * state.  If the node does not exist or the node is not in the expected
771    * state, the method returns -1.  If the transition is successful, the
772    * version number of the node following the transition is returned.
773    *
774    * <p>If the read state is what is expected, it attempts to write the new
775    * state and data into the node.  When doing this, it includes the expected
776    * version (determined when the existing state was verified) to ensure that
777    * only one transition is successful.  If there is a version mismatch, the
778    * method returns -1.
779    *
780    * <p>If the write is successful, no watch is set and the method returns true.
781    *
782    * @param zkw zk reference
783    * @param region region to be transitioned to opened
784    * @param serverName server transition happens on
785    * @param endState state to transition node to if all checks pass
786    * @param beginState state the node must currently be in to do transition
787    * @param expectedVersion expected version of data before modification, or -1
788    * @return version of node after transition, -1 if unsuccessful transition
789    * @throws KeeperException if unexpected zookeeper exception
790    */
791   public static int transitionNode(ZooKeeperWatcher zkw, HRegionInfo region,
792       ServerName serverName, EventType beginState, EventType endState,
793       int expectedVersion)
794   throws KeeperException {
795     return transitionNode(zkw, region, serverName, beginState, endState, expectedVersion, null);
796   }
797 
798 
799   public static int transitionNode(ZooKeeperWatcher zkw, HRegionInfo region,
800       ServerName serverName, EventType beginState, EventType endState,
801       int expectedVersion, final byte [] payload)
802   throws KeeperException {
803     String encoded = region.getEncodedName();
804     if(LOG.isDebugEnabled()) {
805       LOG.debug(zkw.prefix("Transitioning " + HRegionInfo.prettyPrint(encoded) +
806         " from " + beginState.toString() + " to " + endState.toString()));
807     }
808 
809     String node = getNodeName(zkw, encoded);
810     zkw.sync(node);
811 
812     // Read existing data of the node
813     Stat stat = new Stat();
814     byte [] existingBytes = ZKUtil.getDataNoWatch(zkw, node, stat);
815     if (existingBytes == null) {
816       // Node no longer exists.  Return -1. It means unsuccessful transition.
817       return -1;
818     }
819 
820     // Verify it is the expected version
821     if (expectedVersion != -1 && stat.getVersion() != expectedVersion) {
822       LOG.warn(zkw.prefix("Attempt to transition the " +
823         "unassigned node for " + encoded +
824         " from " + beginState + " to " + endState + " failed, " +
825         "the node existed but was version " + stat.getVersion() +
826         " not the expected version " + expectedVersion));
827         return -1;
828     }
829 
830     if (beginState.equals(EventType.M_ZK_REGION_OFFLINE)
831         && endState.equals(EventType.RS_ZK_REGION_OPENING)
832         && expectedVersion == -1 && stat.getVersion() != 0) {
833       // the below check ensures that double assignment doesnot happen.
834       // When the node is created for the first time then the expected version
835       // that is passed will be -1 and the version in znode will be 0.
836       // In all other cases the version in znode will be > 0.
837       LOG.warn(zkw.prefix("Attempt to transition the " + "unassigned node for "
838           + encoded + " from " + beginState + " to " + endState + " failed, "
839           + "the node existed but was version " + stat.getVersion()
840           + " not the expected version " + expectedVersion));
841       return -1;
842     }
843 
844     RegionTransition rt = getRegionTransition(existingBytes);
845 
846     // Verify the server transition happens on is not changed
847     if (!rt.getServerName().equals(serverName)) {
848       LOG.warn(zkw.prefix("Attempt to transition the " +
849         "unassigned node for " + encoded +
850         " from " + beginState + " to " + endState + " failed, " +
851         "the server that tried to transition was " + serverName +
852         " not the expected " + rt.getServerName()));
853       return -1;
854     }
855 
856     // Verify it is in expected state
857     EventType et = rt.getEventType();
858     if (!et.equals(beginState)) {
859       String existingServer = (rt.getServerName() == null)
860         ? "<unknown>" : rt.getServerName().toString();
861       LOG.warn(zkw.prefix("Attempt to transition the unassigned node for " + encoded
862         + " from " + beginState + " to " + endState + " failed, the node existed but"
863         + " was in the state " + et + " set by the server " + existingServer));
864       return -1;
865     }
866 
867     // Write new data, ensuring data has not changed since we last read it
868     try {
869       rt = RegionTransition.createRegionTransition(
870           endState, region.getRegionName(), serverName, payload);
871       if(!ZKUtil.setData(zkw, node, rt.toByteArray(), stat.getVersion())) {
872         LOG.warn(zkw.prefix("Attempt to transition the " +
873         "unassigned node for " + encoded +
874         " from " + beginState + " to " + endState + " failed, " +
875         "the node existed and was in the expected state but then when " +
876         "setting data we got a version mismatch"));
877         return -1;
878       }
879       if(LOG.isDebugEnabled()) {
880         LOG.debug(zkw.prefix("Transitioned node " + encoded +
881           " from " + beginState + " to " + endState));
882       }
883       return stat.getVersion() + 1;
884     } catch (KeeperException.NoNodeException nne) {
885       LOG.warn(zkw.prefix("Attempt to transition the " +
886         "unassigned node for " + encoded +
887         " from " + beginState + " to " + endState + " failed, " +
888         "the node existed and was in the expected state but then when " +
889         "setting data it no longer existed"));
890       return -1;
891     }
892   }
893 
894   private static RegionTransition getRegionTransition(final byte [] bytes) throws KeeperException {
895     try {
896       return RegionTransition.parseFrom(bytes);
897     } catch (DeserializationException e) {
898       // Convert to a zk exception for now.  Otherwise have to change API
899       throw ZKUtil.convert(e);
900     }
901   }
902 
903   /**
904    * Gets the current data in the unassigned node for the specified region name
905    * or fully-qualified path.
906    *
907    * <p>Returns null if the region does not currently have a node.
908    *
909    * <p>Sets a watch on the node if the node exists.
910    *
911    * @param zkw zk reference
912    * @param pathOrRegionName fully-specified path or region name
913    * @return znode content
914    * @throws KeeperException if unexpected zookeeper exception
915    */
916   public static byte [] getData(ZooKeeperWatcher zkw,
917       String pathOrRegionName)
918   throws KeeperException {
919     String node = getPath(zkw, pathOrRegionName);
920     return ZKUtil.getDataAndWatch(zkw, node);
921   }
922 
923   /**
924    * Gets the current data in the unassigned node for the specified region name
925    * or fully-qualified path.
926    *
927    * <p>Returns null if the region does not currently have a node.
928    *
929    * <p>Sets a watch on the node if the node exists.
930    *
931    * @param zkw zk reference
932    * @param pathOrRegionName fully-specified path or region name
933    * @param stat object to populate the version.
934    * @return znode content
935    * @throws KeeperException if unexpected zookeeper exception
936    */
937   public static byte [] getDataAndWatch(ZooKeeperWatcher zkw,
938       String pathOrRegionName, Stat stat)
939   throws KeeperException {
940     String node = getPath(zkw, pathOrRegionName);
941     return ZKUtil.getDataAndWatch(zkw, node, stat);
942   }
943 
944   /**
945    * Gets the current data in the unassigned node for the specified region name
946    * or fully-qualified path.
947    *
948    * <p>Returns null if the region does not currently have a node.
949    *
950    * <p>Does not set a watch.
951    *
952    * @param zkw zk reference
953    * @param pathOrRegionName fully-specified path or region name
954    * @param stat object to store node info into on getData call
955    * @return znode content
956    * @throws KeeperException if unexpected zookeeper exception
957    */
958   public static byte [] getDataNoWatch(ZooKeeperWatcher zkw,
959       String pathOrRegionName, Stat stat)
960   throws KeeperException {
961     String node = getPath(zkw, pathOrRegionName);
962     return ZKUtil.getDataNoWatch(zkw, node, stat);
963   }
964 
965   /**
966    * @param zkw
967    * @param pathOrRegionName
968    * @return Path to znode
969    */
970   public static String getPath(final ZooKeeperWatcher zkw, final String pathOrRegionName) {
971     return pathOrRegionName.startsWith("/")? pathOrRegionName : getNodeName(zkw, pathOrRegionName);
972   }
973 
974   /**
975    * Get the version of the specified znode
976    * @param zkw zk reference
977    * @param region region's info
978    * @return the version of the znode, -1 if it doesn't exist
979    * @throws KeeperException
980    */
981   public static int getVersion(ZooKeeperWatcher zkw, HRegionInfo region)
982     throws KeeperException {
983     String znode = getNodeName(zkw, region.getEncodedName());
984     return ZKUtil.checkExists(zkw, znode);
985   }
986 
987   /**
988    * Delete the assignment node regardless of its current state.
989    * <p>
990    * Fail silent even if the node does not exist at all.
991    * @param watcher
992    * @param regionInfo
993    * @throws KeeperException
994    */
995   public static void deleteNodeFailSilent(ZooKeeperWatcher watcher,
996       HRegionInfo regionInfo)
997   throws KeeperException {
998     String node = getNodeName(watcher, regionInfo.getEncodedName());
999     ZKUtil.deleteNodeFailSilent(watcher, node);
1000   }
1001 
1002   /**
1003    * Blocks until there are no node in regions in transition.
1004    * <p>
1005    * Used in testing only.
1006    * @param zkw zk reference
1007    * @throws KeeperException
1008    * @throws InterruptedException
1009    */
1010   public static void blockUntilNoRIT(ZooKeeperWatcher zkw)
1011   throws KeeperException, InterruptedException {
1012     while (ZKUtil.nodeHasChildren(zkw, zkw.assignmentZNode)) {
1013       List<String> znodes =
1014         ZKUtil.listChildrenAndWatchForNewChildren(zkw, zkw.assignmentZNode);
1015       if (znodes != null && !znodes.isEmpty()) {
1016         LOG.debug("Waiting on RIT: " + znodes);
1017       }
1018       Thread.sleep(100);
1019     }
1020   }
1021 
1022   /**
1023    * Blocks until there is at least one node in regions in transition.
1024    * <p>
1025    * Used in testing only.
1026    * @param zkw zk reference
1027    * @throws KeeperException
1028    * @throws InterruptedException
1029    */
1030   public static void blockUntilRIT(ZooKeeperWatcher zkw)
1031   throws KeeperException, InterruptedException {
1032     while (!ZKUtil.nodeHasChildren(zkw, zkw.assignmentZNode)) {
1033       List<String> znodes =
1034         ZKUtil.listChildrenAndWatchForNewChildren(zkw, zkw.assignmentZNode);
1035       if (znodes == null || znodes.isEmpty()) {
1036         LOG.debug("No RIT in ZK");
1037       }
1038       Thread.sleep(100);
1039     }
1040   }
1041 
1042   /**
1043    * Presume bytes are serialized unassigned data structure
1044    * @param znodeBytes
1045    * @return String of the deserialized znode bytes.
1046    */
1047   static String toString(final byte[] znodeBytes) {
1048     // This method should not exist.  Used by ZKUtil stringifying RegionTransition.  Have the
1049     // method in here so RegionTransition does not leak into ZKUtil.
1050     try {
1051       RegionTransition rt = RegionTransition.parseFrom(znodeBytes);
1052       return rt.toString();
1053     } catch (DeserializationException e) {
1054       return "";
1055     }
1056   }
1057 }