View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.catalog;
19  
20  import org.apache.commons.logging.Log;
21  import org.apache.commons.logging.LogFactory;
22  import org.apache.hadoop.classification.InterfaceAudience;
23  import org.apache.hadoop.conf.Configuration;
24  import org.apache.hadoop.hbase.Abortable;
25  import org.apache.hadoop.hbase.HRegionInfo;
26  import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException;
27  import org.apache.hadoop.hbase.ServerName;
28  import org.apache.hadoop.hbase.client.HConnection;
29  import org.apache.hadoop.hbase.client.HConnectionManager;
30  import org.apache.hadoop.hbase.client.HTable;
31  import org.apache.hadoop.hbase.client.RetriesExhaustedException;
32  import org.apache.hadoop.hbase.ipc.RpcClient.FailedServerException;
33  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
34  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
35  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
36  import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
37  import org.apache.hadoop.hbase.util.Bytes;
38  import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
39  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
40  import org.apache.hadoop.ipc.RemoteException;
41  
42  import java.io.EOFException;
43  import java.io.IOException;
44  import java.net.ConnectException;
45  import java.net.NoRouteToHostException;
46  import java.net.SocketException;
47  import java.net.SocketTimeoutException;
48  import java.net.UnknownHostException;
49  
50  /**
51   * Tracks the availability of the catalog tables
52   * <code>hbase:meta</code>.
53   *
54   * This class is "read-only" in that the locations of the catalog tables cannot
55   * be explicitly set.  Instead, ZooKeeper is used to learn of the availability
56   * and location of <code>hbase:meta</code>.
57   *
 * <p>Call {@link #start()} to start up operation.  Call {@link #stop()} to
 * interrupt waits and close up shop.
60   */
61  @InterfaceAudience.Private
62  public class CatalogTracker {
63    // TODO JDC 11/30 We don't even have ROOT anymore, revisit
64    // TODO: This class needs a rethink.  The original intent was that it would be
65    // the one-stop-shop for meta locations and that it would get this
66    // info from reading and watching zk state.  The class was to be used by
67    // servers when they needed to know of meta movement but also by
68    // client-side (inside in HTable) so rather than figure meta
69    // locations on fault, the client would instead get notifications out of zk.
70    //
71    // But this original intent is frustrated by the fact that this class has to
72    // read an hbase table, the -ROOT- table, to figure out the hbase:meta region
73    // location which means we depend on an HConnection.  HConnection will do
74    // retrying but also, it has its own mechanism for finding root and meta
75    // locations (and for 'verifying'; it tries the location and if it fails, does
76    // new lookup, etc.).  So, at least for now, HConnection (or HTable) can't
77    // have a CT since CT needs a HConnection (Even then, do want HT to have a CT?
78    // For HT keep up a session with ZK?  Rather, shouldn't we do like asynchbase
79    // where we'd open a connection to zk, read what we need then let the
  // connection go?).  The 'fix' is to make it so both root and meta addresses
  // are wholly up in zk -- rather than one in zk (root) and the other in an
  // hbase table (meta).
82    //
83    // But even then, this class does 'verification' of the location and it does
84    // this by making a call over an HConnection (which will do its own root
85    // and meta lookups).  Isn't this verification 'useless' since when we
86    // return, whatever is dependent on the result of this call then needs to
87    // use HConnection; what we have verified may change in meantime (HConnection
88    // uses the CT primitives, the root and meta trackers finding root locations).
89    //
90    // When meta is moved to zk, this class may make more sense.  In the
  // meantime, it does not cohere.  It should just watch meta and root and
  // NOT do verification -- let that be out in HConnection since it's going to
  // be done there ultimately anyway.
  //
  // This class has spread throughout the codebase.  It needs to be reined in.
  // This class should be used server-side only, even if we move meta location
  // up into zk.  Currently it's used over in the client package. It's used in
98    // MetaReader and MetaEditor classes usually just to get the Configuration
99    // its using (It does this indirectly by asking its HConnection for its
100   // Configuration and even then this is just used to get an HConnection out on
101   // the other end). I made https://issues.apache.org/jira/browse/HBASE-4495 for
102   // doing CT fixup. St.Ack 09/30/2011.
103   //
104 
105   // TODO: Timeouts have never been as advertised in here and its worse now
106   // with retries; i.e. the HConnection retries and pause goes ahead whatever
107   // the passed timeout is.  Fix.
108   private static final Log LOG = LogFactory.getLog(CatalogTracker.class);
109   private final HConnection connection;
110   private final ZooKeeperWatcher zookeeper;
111   private final MetaRegionTracker metaRegionTracker;
112   private boolean instantiatedzkw = false;
113   private Abortable abortable;
114 
115   private boolean stopped = false;
116 
117   static final byte [] META_REGION_NAME =
118     HRegionInfo.FIRST_META_REGIONINFO.getRegionName();
119 
120   /**
121    * Constructs a catalog tracker. Find current state of catalog tables.
122    * Begin active tracking by executing {@link #start()} post construction. Does
123    * not timeout.
124    *
125    * @param conf
126    *          the {@link Configuration} from which a {@link HConnection} will be
127    *          obtained; if problem, this connections
128    *          {@link HConnection#abort(String, Throwable)} will be called.
129    * @throws IOException
130    */
131   public CatalogTracker(final Configuration conf) throws IOException {
132     this(null, conf, null);
133   }
134 
135   /**
136    * Constructs the catalog tracker.  Find current state of catalog tables.
137    * Begin active tracking by executing {@link #start()} post construction.
138    * Does not timeout.
139    * @param zk If zk is null, we'll create an instance (and shut it down
140    * when {@link #stop()} is called) else we'll use what is passed.
141    * @param conf
142    * @param abortable If fatal exception we'll call abort on this.  May be null.
143    * If it is we'll use the Connection associated with the passed
144    * {@link Configuration} as our Abortable.
145    * @throws IOException
146    */
147   public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
148       Abortable abortable)
149   throws IOException {
150     this(zk, conf, HConnectionManager.getConnection(conf), abortable);
151   }
152 
153   public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
154       HConnection connection, Abortable abortable)
155   throws IOException {
156     this.connection = connection;
157     if (abortable == null) {
158       // A connection is abortable.
159       this.abortable = this.connection;
160     }
161     Abortable throwableAborter = new Abortable() {
162 
163       @Override
164       public void abort(String why, Throwable e) {
165         throw new RuntimeException(why, e);
166       }
167 
168       @Override
169       public boolean isAborted() {
170         return true;
171       }
172 
173     };
174     if (zk == null) {
175       // Create our own.  Set flag so we tear it down on stop.
176       this.zookeeper =
177         new ZooKeeperWatcher(conf, "catalogtracker-on-" + connection.toString(),
178           abortable);
179       instantiatedzkw = true;
180     } else {
181       this.zookeeper = zk;
182     }
183     this.metaRegionTracker = new MetaRegionTracker(zookeeper, throwableAborter);
184   }
185 
186   /**
187    * Starts the catalog tracker.
188    * Determines current availability of catalog tables and ensures all further
189    * transitions of either region are tracked.
190    * @throws IOException
191    * @throws InterruptedException
192    */
193   public void start() throws IOException, InterruptedException {
194     LOG.debug("Starting catalog tracker " + this);
195     try {
196       this.metaRegionTracker.start();
197     } catch (RuntimeException e) {
198       Throwable t = e.getCause();
199       this.abortable.abort(e.getMessage(), t);
200       throw new IOException("Attempt to start meta tracker failed.", t);
201     }
202   }
203 
204   /**
205    * Stop working.
206    * Interrupts any ongoing waits.
207    */
208   public void stop() {
209     if (!this.stopped) {
210       LOG.debug("Stopping catalog tracker " + this);
211       this.stopped = true;
212       this.metaRegionTracker.stop();
213       try {
214         if (this.connection != null) {
215           this.connection.close();
216         }
217       } catch (IOException e) {
218         // Although the {@link Closeable} interface throws an {@link
219         // IOException}, in reality, the implementation would never do that.
220         LOG.error("Attempt to close catalog tracker's connection failed.", e);
221       }
222       if (this.instantiatedzkw) {
223         this.zookeeper.close();
224       }
225     }
226   }
227 
228   /**
229    * Gets the current location for <code>hbase:meta</code> or null if location is
230    * not currently available.
231    * @return {@link ServerName} for server hosting <code>hbase:meta</code> or null
232    * if none available
233    * @throws InterruptedException
234    */
235   public ServerName getMetaLocation() throws InterruptedException {
236     return this.metaRegionTracker.getMetaRegionLocation();
237   }
238 
239   /**
240    * Checks whether meta regionserver znode has some non null data.
241    * @return true if data is not null, false otherwise.
242    */
243   public boolean isMetaLocationAvailable() {
244     return this.metaRegionTracker.isLocationAvailable();
245   }
246   /**
247    * Gets the current location for <code>hbase:meta</code> if available and waits
248    * for up to the specified timeout if not immediately available.  Returns null
249    * if the timeout elapses before root is available.
250    * @param timeout maximum time to wait for root availability, in milliseconds
251    * @return {@link ServerName} for server hosting <code>hbase:meta</code> or null
252    * if none available
253    * @throws InterruptedException if interrupted while waiting
254    * @throws NotAllMetaRegionsOnlineException if meta not available before
255    * timeout
256    */
257   public ServerName waitForMeta(final long timeout)
258   throws InterruptedException, NotAllMetaRegionsOnlineException {
259     ServerName sn = metaRegionTracker.waitMetaRegionLocation(timeout);
260     if (sn == null) {
261       throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms");
262     }
263     return sn;
264   }
265 
266   /**
267    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
268    * waiting up to the specified timeout for availability.
269    * @param timeout How long to wait on meta location
270    * @see #waitForMeta for additional information
271    * @return connection to server hosting meta
272    * @throws InterruptedException
273    * @throws NotAllMetaRegionsOnlineException if timed out waiting
274    * @throws IOException
275    * @deprecated Use #getMetaServerConnection(long)
276    */
277   public AdminService.BlockingInterface waitForMetaServerConnection(long timeout)
278   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
279     return getMetaServerConnection(timeout);
280   }
281 
282   /**
283    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
284    * waiting up to the specified timeout for availability.
285    * <p>WARNING: Does not retry.  Use an {@link HTable} instead.
286    * @param timeout How long to wait on meta location
287    * @see #waitForMeta for additional information
288    * @return connection to server hosting meta
289    * @throws InterruptedException
290    * @throws NotAllMetaRegionsOnlineException if timed out waiting
291    * @throws IOException
292    */
293   AdminService.BlockingInterface getMetaServerConnection(long timeout)
294   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
295     return getCachedConnection(waitForMeta(timeout));
296   }
297 
298   /**
299    * Waits indefinitely for availability of <code>hbase:meta</code>.  Used during
300    * cluster startup.  Does not verify meta, just that something has been
301    * set up in zk.
302    * @see #waitForMeta(long)
303    * @throws InterruptedException if interrupted while waiting
304    */
305   public void waitForMeta() throws InterruptedException {
306     while (!this.stopped) {
307       try {
308         if (waitForMeta(100) != null) break;
309       } catch (NotAllMetaRegionsOnlineException e) {
310         if (LOG.isTraceEnabled()) {
311           LOG.trace("hbase:meta still not available, sleeping and retrying." +
312           " Reason: " + e.getMessage());
313         }
314       }
315     }
316   }
317 
318   /**
319    * @param sn ServerName to get a connection against.
320    * @return The AdminProtocol we got when we connected to <code>sn</code>
321    * May have come from cache, may not be good, may have been setup by this
322    * invocation, or may be null.
323    * @throws IOException
324    */
325   private AdminService.BlockingInterface getCachedConnection(ServerName sn)
326   throws IOException {
327     if (sn == null) {
328       return null;
329     }
330     AdminService.BlockingInterface service = null;
331     try {
332       service = connection.getAdmin(sn);
333     } catch (RetriesExhaustedException e) {
334       if (e.getCause() != null && e.getCause() instanceof ConnectException) {
335         // Catch this; presume it means the cached connection has gone bad.
336       } else {
337         throw e;
338       }
339     } catch (SocketTimeoutException e) {
340       LOG.debug("Timed out connecting to " + sn);
341     } catch (NoRouteToHostException e) {
342       LOG.debug("Connecting to " + sn, e);
343     } catch (SocketException e) {
344       LOG.debug("Exception connecting to " + sn);
345     } catch (UnknownHostException e) {
346       LOG.debug("Unknown host exception connecting to  " + sn);
347     } catch (FailedServerException e) {
348       if (LOG.isDebugEnabled()) {
349         LOG.debug("Server " + sn + " is in failed server list.");
350       }
351     } catch (IOException ioe) {
352       Throwable cause = ioe.getCause();
353       if (ioe instanceof ConnectException) {
354         // Catch. Connect refused.
355       } else if (cause != null && cause instanceof EOFException) {
356         // Catch. Other end disconnected us.
357       } else if (cause != null && cause.getMessage() != null &&
358         cause.getMessage().toLowerCase().contains("connection reset")) {
359         // Catch. Connection reset.
360       } else {
361         throw ioe;
362       }
363 
364     }
365     return service;
366   }
367 
368   /**
369    * Verify we can connect to <code>hostingServer</code> and that its carrying
370    * <code>regionName</code>.
371    * @param hostingServer Interface to the server hosting <code>regionName</code>
372    * @param address The servername that goes with the <code>metaServer</code>
373    * Interface.  Used logging.
374    * @param regionName The regionname we are interested in.
375    * @return True if we were able to verify the region located at other side of
376    * the Interface.
377    * @throws IOException
378    */
379   // TODO: We should be able to get the ServerName from the AdminProtocol
380   // rather than have to pass it in.  Its made awkward by the fact that the
381   // HRI is likely a proxy against remote server so the getServerName needs
382   // to be fixed to go to a local method or to a cache before we can do this.
383   private boolean verifyRegionLocation(AdminService.BlockingInterface hostingServer,
384       final ServerName address, final byte [] regionName)
385   throws IOException {
386     if (hostingServer == null) {
387       LOG.info("Passed hostingServer is null");
388       return false;
389     }
390     Throwable t = null;
391     try {
392       // Try and get regioninfo from the hosting server.
393       return ProtobufUtil.getRegionInfo(hostingServer, regionName) != null;
394     } catch (ConnectException e) {
395       t = e;
396     } catch (RetriesExhaustedException e) {
397       t = e;
398     } catch (RemoteException e) {
399       IOException ioe = e.unwrapRemoteException();
400       t = ioe;
401     } catch (IOException e) {
402       Throwable cause = e.getCause();
403       if (cause != null && cause instanceof EOFException) {
404         t = cause;
405       } else if (cause != null && cause.getMessage() != null
406           && cause.getMessage().contains("Connection reset")) {
407         t = cause;
408       } else {
409         t = e;
410       }
411     }
412     LOG.info("Failed verification of " + Bytes.toStringBinary(regionName) +
413       " at address=" + address + ", exception=" + t);
414     return false;
415   }
416 
417   /**
418    * Verify <code>hbase:meta</code> is deployed and accessible.
419    * @param timeout How long to wait on zk for meta address (passed through to
420    * the internal call to {@link #waitForMetaServerConnection(long)}.
421    * @return True if the <code>hbase:meta</code> location is healthy.
422    * @throws IOException
423    * @throws InterruptedException
424    */
425   public boolean verifyMetaRegionLocation(final long timeout)
426   throws InterruptedException, IOException {
427     AdminService.BlockingInterface service = null;
428     try {
429       service = waitForMetaServerConnection(timeout);
430     } catch (NotAllMetaRegionsOnlineException e) {
431       // Pass
432     } catch (ServerNotRunningYetException e) {
433       // Pass -- remote server is not up so can't be carrying root
434     } catch (UnknownHostException e) {
435       // Pass -- server name doesn't resolve so it can't be assigned anything.
436     } catch (RegionServerStoppedException e) {
437       // Pass -- server name sends us to a server that is dying or already dead.
438     }
439     return (service == null)? false:
440       verifyRegionLocation(service,
441           this.metaRegionTracker.getMetaRegionLocation(), META_REGION_NAME);
442   }
443 
444   public HConnection getConnection() {
445     return this.connection;
446   }
447 }