View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.catalog;
19  
20  import com.google.common.base.Stopwatch;
21  import org.apache.commons.logging.Log;
22  import org.apache.commons.logging.LogFactory;
23  import org.apache.hadoop.classification.InterfaceAudience;
24  import org.apache.hadoop.conf.Configuration;
25  import org.apache.hadoop.hbase.Abortable;
26  import org.apache.hadoop.hbase.HRegionInfo;
27  import org.apache.hadoop.hbase.NotAllMetaRegionsOnlineException;
28  import org.apache.hadoop.hbase.ServerName;
29  import org.apache.hadoop.hbase.client.HConnection;
30  import org.apache.hadoop.hbase.client.HConnectionManager;
31  import org.apache.hadoop.hbase.client.HTable;
32  import org.apache.hadoop.hbase.client.RetriesExhaustedException;
33  import org.apache.hadoop.hbase.ipc.RpcClient.FailedServerException;
34  import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
35  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
36  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
37  import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;
38  import org.apache.hadoop.hbase.util.Bytes;
39  import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
40  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
41  import org.apache.hadoop.ipc.RemoteException;
42  
43  import java.io.EOFException;
44  import java.io.IOException;
45  import java.net.ConnectException;
46  import java.net.NoRouteToHostException;
47  import java.net.SocketException;
48  import java.net.SocketTimeoutException;
49  import java.net.UnknownHostException;
50  
51  /**
52   * Tracks the availability of the catalog tables
53   * <code>hbase:meta</code>.
54   *
55   * This class is "read-only" in that the locations of the catalog tables cannot
56   * be explicitly set.  Instead, ZooKeeper is used to learn of the availability
57   * and location of <code>hbase:meta</code>.
58   *
59   * <p>Call {@link #start()} to start up operation.  Call {@link #stop()}} to
60   * interrupt waits and close up shop.
61   */
62  @InterfaceAudience.Private
63  public class CatalogTracker {
64    // TODO JDC 11/30 We don't even have ROOT anymore, revisit
65    // TODO: This class needs a rethink.  The original intent was that it would be
66    // the one-stop-shop for meta locations and that it would get this
67    // info from reading and watching zk state.  The class was to be used by
68    // servers when they needed to know of meta movement but also by
69    // client-side (inside in HTable) so rather than figure meta
70    // locations on fault, the client would instead get notifications out of zk.
71    //
72    // But this original intent is frustrated by the fact that this class has to
73    // read an hbase table, the -ROOT- table, to figure out the hbase:meta region
74    // location which means we depend on an HConnection.  HConnection will do
75    // retrying but also, it has its own mechanism for finding root and meta
76    // locations (and for 'verifying'; it tries the location and if it fails, does
77    // new lookup, etc.).  So, at least for now, HConnection (or HTable) can't
78    // have a CT since CT needs a HConnection (Even then, do want HT to have a CT?
79    // For HT keep up a session with ZK?  Rather, shouldn't we do like asynchbase
80    // where we'd open a connection to zk, read what we need then let the
81    // connection go?).  The 'fix' is make it so both root and meta addresses
82    // are wholey up in zk -- not in zk (root) -- and in an hbase table (meta).
83    //
84    // But even then, this class does 'verification' of the location and it does
85    // this by making a call over an HConnection (which will do its own root
86    // and meta lookups).  Isn't this verification 'useless' since when we
87    // return, whatever is dependent on the result of this call then needs to
88    // use HConnection; what we have verified may change in meantime (HConnection
89    // uses the CT primitives, the root and meta trackers finding root locations).
90    //
91    // When meta is moved to zk, this class may make more sense.  In the
92    // meantime, it does not cohere.  It should just watch meta and root and not
93    // NOT do verification -- let that be out in HConnection since its going to
94    // be done there ultimately anyways.
95    //
96    // This class has spread throughout the codebase.  It needs to be reigned in.
97    // This class should be used server-side only, even if we move meta location
98    // up into zk.  Currently its used over in the client package. Its used in
99    // MetaReader and MetaEditor classes usually just to get the Configuration
100   // its using (It does this indirectly by asking its HConnection for its
101   // Configuration and even then this is just used to get an HConnection out on
102   // the other end). I made https://issues.apache.org/jira/browse/HBASE-4495 for
103   // doing CT fixup. St.Ack 09/30/2011.
104   //
105 
106   // TODO: Timeouts have never been as advertised in here and its worse now
107   // with retries; i.e. the HConnection retries and pause goes ahead whatever
108   // the passed timeout is.  Fix.
109   private static final Log LOG = LogFactory.getLog(CatalogTracker.class);
110   private final HConnection connection;
111   private final ZooKeeperWatcher zookeeper;
112   private final MetaRegionTracker metaRegionTracker;
113   private boolean instantiatedzkw = false;
114   private Abortable abortable;
115 
116   private boolean stopped = false;
117 
118   static final byte [] META_REGION_NAME =
119     HRegionInfo.FIRST_META_REGIONINFO.getRegionName();
120 
121   /**
122    * Constructs a catalog tracker. Find current state of catalog tables.
123    * Begin active tracking by executing {@link #start()} post construction. Does
124    * not timeout.
125    *
126    * @param conf
127    *          the {@link Configuration} from which a {@link HConnection} will be
128    *          obtained; if problem, this connections
129    *          {@link HConnection#abort(String, Throwable)} will be called.
130    * @throws IOException
131    */
132   public CatalogTracker(final Configuration conf) throws IOException {
133     this(null, conf, HConnectionManager.getConnection(conf), null);
134   }
135 
136   /**
137    * Constructs the catalog tracker.  Find current state of catalog tables.
138    * Begin active tracking by executing {@link #start()} post construction.
139    * Does not timeout.
140    * @param zk If zk is null, we'll create an instance (and shut it down
141    * when {@link #stop()} is called) else we'll use what is passed.
142    * @param conf
143    * @param abortable If fatal exception we'll call abort on this.  May be null.
144    * If it is we'll use the Connection associated with the passed
145    * {@link Configuration} as our Abortable.
146    * @throws IOException
147    */
148   public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
149       HConnection connection, Abortable abortable)
150   throws IOException {
151     this.connection = connection;
152     if (abortable == null) {
153       // A connection is abortable.
154       this.abortable = this.connection;
155     } else {
156       this.abortable = abortable;
157     }
158     Abortable throwableAborter = new Abortable() {
159 
160       @Override
161       public void abort(String why, Throwable e) {
162         throw new RuntimeException(why, e);
163       }
164 
165       @Override
166       public boolean isAborted() {
167         return true;
168       }
169 
170     };
171     if (zk == null) {
172       // Create our own.  Set flag so we tear it down on stop.
173       this.zookeeper =
174         new ZooKeeperWatcher(conf, "catalogtracker-on-" + connection.toString(),
175           abortable);
176       instantiatedzkw = true;
177     } else {
178       this.zookeeper = zk;
179     }
180     this.metaRegionTracker = new MetaRegionTracker(zookeeper, throwableAborter);
181   }
182 
183   /**
184    * Starts the catalog tracker.
185    * Determines current availability of catalog tables and ensures all further
186    * transitions of either region are tracked.
187    * @throws IOException
188    * @throws InterruptedException
189    */
190   public void start() throws IOException, InterruptedException {
191     LOG.debug("Starting catalog tracker " + this);
192     try {
193       this.metaRegionTracker.start();
194     } catch (RuntimeException e) {
195       Throwable t = e.getCause();
196       this.abortable.abort(e.getMessage(), t);
197       throw new IOException("Attempt to start meta tracker failed.", t);
198     }
199   }
200 
201   /**
202    * Stop working.
203    * Interrupts any ongoing waits.
204    */
205   public void stop() {
206     if (!this.stopped) {
207       LOG.debug("Stopping catalog tracker " + this);
208       this.stopped = true;
209       this.metaRegionTracker.stop();
210       try {
211         if (this.connection != null) {
212           this.connection.close();
213         }
214       } catch (IOException e) {
215         // Although the {@link Closeable} interface throws an {@link
216         // IOException}, in reality, the implementation would never do that.
217         LOG.error("Attempt to close catalog tracker's connection failed.", e);
218       }
219       if (this.instantiatedzkw) {
220         this.zookeeper.close();
221       }
222     }
223   }
224 
225   /**
226    * Gets the current location for <code>hbase:meta</code> or null if location is
227    * not currently available.
228    * @return {@link ServerName} for server hosting <code>hbase:meta</code> or null
229    * if none available
230    * @throws InterruptedException
231    */
232   public ServerName getMetaLocation() throws InterruptedException {
233     return this.metaRegionTracker.getMetaRegionLocation();
234   }
235 
236   /**
237    * Checks whether meta regionserver znode has some non null data.
238    * @return true if data is not null, false otherwise.
239    */
240   public boolean isMetaLocationAvailable() {
241     return this.metaRegionTracker.isLocationAvailable();
242   }
243   /**
244    * Gets the current location for <code>hbase:meta</code> if available and waits
245    * for up to the specified timeout if not immediately available.  Returns null
246    * if the timeout elapses before root is available.
247    * @param timeout maximum time to wait for root availability, in milliseconds
248    * @return {@link ServerName} for server hosting <code>hbase:meta</code> or null
249    * if none available
250    * @throws InterruptedException if interrupted while waiting
251    * @throws NotAllMetaRegionsOnlineException if meta not available before
252    * timeout
253    */
254   public ServerName waitForMeta(final long timeout)
255   throws InterruptedException, NotAllMetaRegionsOnlineException {
256     ServerName sn = metaRegionTracker.waitMetaRegionLocation(timeout);
257     if (sn == null) {
258       throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms");
259     }
260     return sn;
261   }
262 
263   /**
264    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
265    * waiting up to the specified timeout for availability.
266    * @param timeout How long to wait on meta location
267    * @see #waitForMeta for additional information
268    * @return connection to server hosting meta
269    * @throws InterruptedException
270    * @throws NotAllMetaRegionsOnlineException if timed out waiting
271    * @throws IOException
272    * @deprecated Use #getMetaServerConnection(long)
273    */
274   public AdminService.BlockingInterface waitForMetaServerConnection(long timeout)
275   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
276     return getMetaServerConnection(timeout);
277   }
278 
279   /**
280    * Gets a connection to the server hosting meta, as reported by ZooKeeper,
281    * waiting up to the specified timeout for availability.
282    * <p>WARNING: Does not retry.  Use an {@link HTable} instead.
283    * @param timeout How long to wait on meta location
284    * @see #waitForMeta for additional information
285    * @return connection to server hosting meta
286    * @throws InterruptedException
287    * @throws NotAllMetaRegionsOnlineException if timed out waiting
288    * @throws IOException
289    */
290   AdminService.BlockingInterface getMetaServerConnection(long timeout)
291   throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
292     return getCachedConnection(waitForMeta(timeout));
293   }
294 
295   /**
296    * Waits indefinitely for availability of <code>hbase:meta</code>.  Used during
297    * cluster startup.  Does not verify meta, just that something has been
298    * set up in zk.
299    * @see #waitForMeta(long)
300    * @throws InterruptedException if interrupted while waiting
301    */
302   public void waitForMeta() throws InterruptedException {
303     Stopwatch stopwatch = new Stopwatch().start();
304     while (!this.stopped) {
305       try {
306         if (waitForMeta(100) != null) break;
307         long sleepTime = stopwatch.elapsedMillis();
308         // +1 in case sleepTime=0
309         if ((sleepTime + 1) % 10000 == 0) {
310           LOG.warn("Have been waiting for meta to be assigned for " + sleepTime + "ms");
311         }
312       } catch (NotAllMetaRegionsOnlineException e) {
313         if (LOG.isTraceEnabled()) {
314           LOG.trace("hbase:meta still not available, sleeping and retrying." +
315           " Reason: " + e.getMessage());
316         }
317       }
318     }
319   }
320 
321   /**
322    * @param sn ServerName to get a connection against.
323    * @return The AdminProtocol we got when we connected to <code>sn</code>
324    * May have come from cache, may not be good, may have been setup by this
325    * invocation, or may be null.
326    * @throws IOException
327    */
328   @SuppressWarnings("deprecation")
329   private AdminService.BlockingInterface getCachedConnection(ServerName sn)
330   throws IOException {
331     if (sn == null) {
332       return null;
333     }
334     AdminService.BlockingInterface service = null;
335     try {
336       service = connection.getAdmin(sn);
337     } catch (RetriesExhaustedException e) {
338       if (e.getCause() != null && e.getCause() instanceof ConnectException) {
339         // Catch this; presume it means the cached connection has gone bad.
340       } else {
341         throw e;
342       }
343     } catch (SocketTimeoutException e) {
344       LOG.debug("Timed out connecting to " + sn);
345     } catch (NoRouteToHostException e) {
346       LOG.debug("Connecting to " + sn, e);
347     } catch (SocketException e) {
348       LOG.debug("Exception connecting to " + sn);
349     } catch (UnknownHostException e) {
350       LOG.debug("Unknown host exception connecting to  " + sn);
351     } catch (FailedServerException e) {
352       if (LOG.isDebugEnabled()) {
353         LOG.debug("Server " + sn + " is in failed server list.");
354       }
355     } catch (IOException ioe) {
356       Throwable cause = ioe.getCause();
357       if (ioe instanceof ConnectException) {
358         // Catch. Connect refused.
359       } else if (cause != null && cause instanceof EOFException) {
360         // Catch. Other end disconnected us.
361       } else if (cause != null && cause.getMessage() != null &&
362         cause.getMessage().toLowerCase().contains("connection reset")) {
363         // Catch. Connection reset.
364       } else {
365         throw ioe;
366       }
367 
368     }
369     return service;
370   }
371 
372   /**
373    * Verify we can connect to <code>hostingServer</code> and that its carrying
374    * <code>regionName</code>.
375    * @param hostingServer Interface to the server hosting <code>regionName</code>
376    * @param address The servername that goes with the <code>metaServer</code>
377    * Interface.  Used logging.
378    * @param regionName The regionname we are interested in.
379    * @return True if we were able to verify the region located at other side of
380    * the Interface.
381    * @throws IOException
382    */
383   // TODO: We should be able to get the ServerName from the AdminProtocol
384   // rather than have to pass it in.  Its made awkward by the fact that the
385   // HRI is likely a proxy against remote server so the getServerName needs
386   // to be fixed to go to a local method or to a cache before we can do this.
387   private boolean verifyRegionLocation(AdminService.BlockingInterface hostingServer,
388       final ServerName address, final byte [] regionName)
389   throws IOException {
390     if (hostingServer == null) {
391       LOG.info("Passed hostingServer is null");
392       return false;
393     }
394     Throwable t = null;
395     try {
396       // Try and get regioninfo from the hosting server.
397       return ProtobufUtil.getRegionInfo(hostingServer, regionName) != null;
398     } catch (ConnectException e) {
399       t = e;
400     } catch (RetriesExhaustedException e) {
401       t = e;
402     } catch (RemoteException e) {
403       IOException ioe = e.unwrapRemoteException();
404       t = ioe;
405     } catch (IOException e) {
406       Throwable cause = e.getCause();
407       if (cause != null && cause instanceof EOFException) {
408         t = cause;
409       } else if (cause != null && cause.getMessage() != null
410           && cause.getMessage().contains("Connection reset")) {
411         t = cause;
412       } else {
413         t = e;
414       }
415     }
416     LOG.info("Failed verification of " + Bytes.toStringBinary(regionName) +
417       " at address=" + address + ", exception=" + t);
418     return false;
419   }
420 
421   /**
422    * Verify <code>hbase:meta</code> is deployed and accessible.
423    * @param timeout How long to wait on zk for meta address (passed through to
424    * the internal call to {@link #waitForMetaServerConnection(long)}.
425    * @return True if the <code>hbase:meta</code> location is healthy.
426    * @throws IOException
427    * @throws InterruptedException
428    */
429   public boolean verifyMetaRegionLocation(final long timeout)
430   throws InterruptedException, IOException {
431     AdminService.BlockingInterface service = null;
432     try {
433       service = waitForMetaServerConnection(timeout);
434     } catch (NotAllMetaRegionsOnlineException e) {
435       // Pass
436     } catch (ServerNotRunningYetException e) {
437       // Pass -- remote server is not up so can't be carrying root
438     } catch (UnknownHostException e) {
439       // Pass -- server name doesn't resolve so it can't be assigned anything.
440     } catch (RegionServerStoppedException e) {
441       // Pass -- server name sends us to a server that is dying or already dead.
442     }
443     return (service == null)? false:
444       verifyRegionLocation(service,
445           this.metaRegionTracker.getMetaRegionLocation(), META_REGION_NAME);
446   }
447 
448   public HConnection getConnection() {
449     return this.connection;
450   }
451 
452   @Override
453   public String toString() {
454     return "CatalogTracker{" + "connection=" + connection + ", zookeeper=" + zookeeper +
455         ", metaRegionTracker=" + metaRegionTracker + ", stopped=" + stopped + '}';
456   }
457 }