View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.master;
20  
21  import java.io.IOException;
22  import java.util.concurrent.atomic.AtomicBoolean;
23  
24  import org.apache.commons.logging.Log;
25  import org.apache.commons.logging.LogFactory;
26  import org.apache.hadoop.hbase.classification.InterfaceAudience;
27  import org.apache.hadoop.hbase.Server;
28  import org.apache.hadoop.hbase.ServerName;
29  import org.apache.hadoop.hbase.ZNodeClearer;
30  import org.apache.hadoop.hbase.exceptions.DeserializationException;
31  import org.apache.hadoop.hbase.monitoring.MonitoredTask;
32  import org.apache.hadoop.hbase.zookeeper.MasterAddressTracker;
33  import org.apache.hadoop.hbase.zookeeper.ZKUtil;
34  import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
35  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
36  import org.apache.zookeeper.KeeperException;
37  
38  /**
39   * Handles everything on master-side related to master election.
40   *
41   * <p>Listens and responds to ZooKeeper notifications on the master znode,
42   * both <code>nodeCreated</code> and <code>nodeDeleted</code>.
43   *
44   * <p>Contains blocking methods which will hold up backup masters, waiting
45   * for the active master to fail.
46   *
47   * <p>This class is instantiated in the HMaster constructor and the method
48   * #blockUntilBecomingActiveMaster() is called to wait until becoming
49   * the active master of the cluster.
50   */
51  @InterfaceAudience.Private
52  public class ActiveMasterManager extends ZooKeeperListener {
53    private static final Log LOG = LogFactory.getLog(ActiveMasterManager.class);
54  
55    final AtomicBoolean clusterHasActiveMaster = new AtomicBoolean(false);
56    final AtomicBoolean clusterShutDown = new AtomicBoolean(false);
57  
58    private final ServerName sn;
59    private int infoPort;
60    private final Server master;
61  
62    /**
63     * @param watcher
64     * @param sn ServerName
65     * @param master In an instance of a Master.
66     */
67    ActiveMasterManager(ZooKeeperWatcher watcher, ServerName sn, Server master) {
68      super(watcher);
69      watcher.registerListener(this);
70      this.sn = sn;
71      this.master = master;
72    }
73  
74    // will be set after jetty server is started
75    public void setInfoPort(int infoPort) {
76      this.infoPort = infoPort;
77    }
78  
79    @Override
80    public void nodeCreated(String path) {
81      handle(path);
82    }
83  
84    @Override
85    public void nodeDeleted(String path) {
86  
87      // We need to keep track of the cluster's shutdown status while
88      // we wait on the current master. We consider that, if the cluster
89      // was already in a "shutdown" state when we started, that this master
90      // is part of a new cluster that was started shortly after the old cluster
91      // shut down, so that state is now irrelevant. This means that the shutdown
92      // state must be set while we wait on the active master in order
93      // to shutdown this master. See HBASE-8519.
94      if(path.equals(watcher.clusterStateZNode) && !master.isStopped()) {
95        clusterShutDown.set(true);
96      }
97  
98      handle(path);
99    }
100 
101   void handle(final String path) {
102     if (path.equals(watcher.getMasterAddressZNode()) && !master.isStopped()) {
103       handleMasterNodeChange();
104     }
105   }
106 
107   /**
108    * Handle a change in the master node.  Doesn't matter whether this was called
109    * from a nodeCreated or nodeDeleted event because there are no guarantees
110    * that the current state of the master node matches the event at the time of
111    * our next ZK request.
112    *
113    * <p>Uses the watchAndCheckExists method which watches the master address node
114    * regardless of whether it exists or not.  If it does exist (there is an
115    * active master), it returns true.  Otherwise it returns false.
116    *
117    * <p>A watcher is set which guarantees that this method will get called again if
118    * there is another change in the master node.
119    */
120   private void handleMasterNodeChange() {
121     // Watch the node and check if it exists.
122     try {
123       synchronized(clusterHasActiveMaster) {
124         if (ZKUtil.watchAndCheckExists(watcher, watcher.getMasterAddressZNode())) {
125           // A master node exists, there is an active master
126           LOG.debug("A master is now available");
127           clusterHasActiveMaster.set(true);
128         } else {
129           // Node is no longer there, cluster does not have an active master
130           LOG.debug("No master available. Notifying waiting threads");
131           clusterHasActiveMaster.set(false);
132           // Notify any thread waiting to become the active master
133           clusterHasActiveMaster.notifyAll();
134         }
135       }
136     } catch (KeeperException ke) {
137       master.abort("Received an unexpected KeeperException, aborting", ke);
138     }
139   }
140 
141   /**
142    * Block until becoming the active master.
143    *
144    * Method blocks until there is not another active master and our attempt
145    * to become the new active master is successful.
146    *
147    * This also makes sure that we are watching the master znode so will be
148    * notified if another master dies.
149    * @param checkInterval the interval to check if the master is stopped
150    * @param startupStatus the monitor status to track the progress
151    * @return True if no issue becoming active master else false if another
152    * master was running or if some other problem (zookeeper, stop flag has been
153    * set on this Master)
154    */
155   boolean blockUntilBecomingActiveMaster(
156       int checkInterval, MonitoredTask startupStatus) {
157     String backupZNode = ZKUtil.joinZNode(
158       this.watcher.backupMasterAddressesZNode, this.sn.toString());
159     while (!(master.isAborted() || master.isStopped())) {
160       startupStatus.setStatus("Trying to register in ZK as active master");
161       // Try to become the active master, watch if there is another master.
162       // Write out our ServerName as versioned bytes.
163       try {
164         if (MasterAddressTracker.setMasterAddress(this.watcher,
165             this.watcher.getMasterAddressZNode(), this.sn, infoPort)) {
166 
167           // If we were a backup master before, delete our ZNode from the backup
168           // master directory since we are the active now)
169           if (ZKUtil.checkExists(this.watcher, backupZNode) != -1) {
170             LOG.info("Deleting ZNode for " + backupZNode + " from backup master directory");
171             ZKUtil.deleteNodeFailSilent(this.watcher, backupZNode);
172           }
173           // Save the znode in a file, this will allow to check if we crash in the launch scripts
174           ZNodeClearer.writeMyEphemeralNodeOnDisk(this.sn.toString());
175 
176           // We are the master, return
177           startupStatus.setStatus("Successfully registered as active master.");
178           this.clusterHasActiveMaster.set(true);
179           LOG.info("Registered Active Master=" + this.sn);
180           return true;
181         }
182 
183         // There is another active master running elsewhere or this is a restart
184         // and the master ephemeral node has not expired yet.
185         this.clusterHasActiveMaster.set(true);
186 
187         String msg;
188         byte[] bytes =
189           ZKUtil.getDataAndWatch(this.watcher, this.watcher.getMasterAddressZNode());
190         if (bytes == null) {
191           msg = ("A master was detected, but went down before its address " +
192             "could be read.  Attempting to become the next active master");
193         } else {
194           ServerName currentMaster;
195           try {
196             currentMaster = ServerName.parseFrom(bytes);
197           } catch (DeserializationException e) {
198             LOG.warn("Failed parse", e);
199             // Hopefully next time around we won't fail the parse.  Dangerous.
200             continue;
201           }
202           if (ServerName.isSameHostnameAndPort(currentMaster, this.sn)) {
203             msg = ("Current master has this master's address, " +
204               currentMaster + "; master was restarted? Deleting node.");
205             // Hurry along the expiration of the znode.
206             ZKUtil.deleteNode(this.watcher, this.watcher.getMasterAddressZNode());
207 
208             // We may have failed to delete the znode at the previous step, but
209             //  we delete the file anyway: a second attempt to delete the znode is likely to fail again.
210             ZNodeClearer.deleteMyEphemeralNodeOnDisk();
211           } else {
212             msg = "Another master is the active master, " + currentMaster +
213               "; waiting to become the next active master";
214           }
215         }
216         LOG.info(msg);
217         startupStatus.setStatus(msg);
218       } catch (KeeperException ke) {
219         master.abort("Received an unexpected KeeperException, aborting", ke);
220         return false;
221       }
222       synchronized (this.clusterHasActiveMaster) {
223         while (clusterHasActiveMaster.get() && !master.isStopped()) {
224           try {
225             clusterHasActiveMaster.wait(checkInterval);
226           } catch (InterruptedException e) {
227             // We expect to be interrupted when a master dies,
228             //  will fall out if so
229             LOG.debug("Interrupted waiting for master to die", e);
230           }
231         }
232         if (clusterShutDown.get()) {
233           this.master.stop(
234             "Cluster went down before this master became active");
235         }
236       }
237     }
238     return false;
239   }
240 
241   /**
242    * @return True if cluster has an active master.
243    */
244   boolean hasActiveMaster() {
245     try {
246       if (ZKUtil.checkExists(watcher, watcher.getMasterAddressZNode()) >= 0) {
247         return true;
248       }
249     }
250     catch (KeeperException ke) {
251       LOG.info("Received an unexpected KeeperException when checking " +
252           "isActiveMaster : "+ ke);
253     }
254     return false;
255   }
256 
257   public void stop() {
258     try {
259       synchronized (clusterHasActiveMaster) {
260         // Master is already stopped, wake up the manager
261         // thread so that it can shutdown soon.
262         clusterHasActiveMaster.notifyAll();
263       }
264       // If our address is in ZK, delete it on our way out
265       ServerName activeMaster = null;
266       try {
267         activeMaster = MasterAddressTracker.getMasterAddress(this.watcher);
268       } catch (IOException e) {
269         LOG.warn("Failed get of master address: " + e.toString());
270       }
271       if (activeMaster != null &&  activeMaster.equals(this.sn)) {
272         ZKUtil.deleteNode(watcher, watcher.getMasterAddressZNode());
273         // We may have failed to delete the znode at the previous step, but
274         //  we delete the file anyway: a second attempt to delete the znode is likely to fail again.
275         ZNodeClearer.deleteMyEphemeralNodeOnDisk();
276       }
277     } catch (KeeperException e) {
278       LOG.error(this.watcher.prefix("Error deleting our own master address node"), e);
279     }
280   }
281 }