1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18 package org.apache.hadoop.hbase.catalog;
19
20 import org.apache.commons.logging.Log;
21 import org.apache.commons.logging.LogFactory;
22 import org.apache.hadoop.classification.InterfaceAudience;
23 import org.apache.hadoop.conf.Configuration;
24 import org.apache.hadoop.hbase.Abortable;
25 import org.apache.hadoop.hbase.HRegionInfo;
26 import org.apache.hadoop.hbase.ServerName;
27 import org.apache.hadoop.hbase.client.HConnection;
28 import org.apache.hadoop.hbase.client.HConnectionManager;
29 import org.apache.hadoop.hbase.client.HTable;
30 import org.apache.hadoop.hbase.client.RetriesExhaustedException;
31 import org.apache.hadoop.hbase.exceptions.NotAllMetaRegionsOnlineException;
32 import org.apache.hadoop.hbase.exceptions.ServerNotRunningYetException;
33 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
34 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
35 import org.apache.hadoop.hbase.util.Bytes;
36 import org.apache.hadoop.hbase.zookeeper.MetaRegionTracker;
37 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
38 import org.apache.hadoop.ipc.RemoteException;
39
40 import java.io.EOFException;
41 import java.io.IOException;
42 import java.net.ConnectException;
43 import java.net.NoRouteToHostException;
44 import java.net.SocketException;
45 import java.net.SocketTimeoutException;
46 import java.net.UnknownHostException;
47
48 /**
49 * Tracks the availability of the catalog tables
50 * <code>.META.</code>.
51 *
52 * This class is "read-only" in that the locations of the catalog tables cannot
53 * be explicitly set. Instead, ZooKeeper is used to learn of the availability
54 * and location of <code>.META.</code>.
55 *
 * <p>Call {@link #start()} to start up operation. Call {@link #stop()} to
 * interrupt waits and close up shop.
58 */
59 @InterfaceAudience.Private
60 public class CatalogTracker {
61 // TODO JDC 11/30 We don't even have ROOT anymore, revisit
62 // TODO: This class needs a rethink. The original intent was that it would be
63 // the one-stop-shop for meta locations and that it would get this
64 // info from reading and watching zk state. The class was to be used by
65 // servers when they needed to know of meta movement but also by
66 // client-side (inside in HTable) so rather than figure meta
67 // locations on fault, the client would instead get notifications out of zk.
68 //
69 // But this original intent is frustrated by the fact that this class has to
70 // read an hbase table, the -ROOT- table, to figure out the .META. region
71 // location which means we depend on an HConnection. HConnection will do
72 // retrying but also, it has its own mechanism for finding root and meta
73 // locations (and for 'verifying'; it tries the location and if it fails, does
74 // new lookup, etc.). So, at least for now, HConnection (or HTable) can't
75 // have a CT since CT needs a HConnection (Even then, do want HT to have a CT?
76 // For HT keep up a session with ZK? Rather, shouldn't we do like asynchbase
77 // where we'd open a connection to zk, read what we need then let the
// connection go?). The 'fix' is to make it so both root and meta addresses
// are wholly up in zk -- rather than split between zk (root) and an hbase
// table (meta).
80 //
81 // But even then, this class does 'verification' of the location and it does
82 // this by making a call over an HConnection (which will do its own root
83 // and meta lookups). Isn't this verification 'useless' since when we
84 // return, whatever is dependent on the result of this call then needs to
85 // use HConnection; what we have verified may change in meantime (HConnection
86 // uses the CT primitives, the root and meta trackers finding root locations).
87 //
88 // When meta is moved to zk, this class may make more sense. In the
// meantime, it does not cohere. It should just watch meta and root and
// NOT do verification -- let that be out in HConnection since it's going to
// be done there ultimately anyway.
92 //
// This class has spread throughout the codebase. It needs to be reined in.
// This class should be used server-side only, even if we move meta location
// up into zk. Currently it's used over in the client package. It's used in
96 // MetaReader and MetaEditor classes usually just to get the Configuration
97 // its using (It does this indirectly by asking its HConnection for its
98 // Configuration and even then this is just used to get an HConnection out on
99 // the other end). I made https://issues.apache.org/jira/browse/HBASE-4495 for
100 // doing CT fixup. St.Ack 09/30/2011.
101 //
102
103 // TODO: Timeouts have never been as advertised in here and its worse now
104 // with retries; i.e. the HConnection retries and pause goes ahead whatever
105 // the passed timeout is. Fix.
106 private static final Log LOG = LogFactory.getLog(CatalogTracker.class);
107 private final HConnection connection;
108 private final ZooKeeperWatcher zookeeper;
109 private final MetaRegionTracker metaRegionTracker;
110 private boolean instantiatedzkw = false;
111 private Abortable abortable;
112
113 private boolean stopped = false;
114
115 static final byte [] META_REGION_NAME =
116 HRegionInfo.FIRST_META_REGIONINFO.getRegionName();
117
/**
 * Builds a catalog tracker from nothing but a configuration.
 *
 * <p>Delegates to
 * {@link #CatalogTracker(ZooKeeperWatcher, Configuration, Abortable)} with a
 * null watcher and a null abortable, so the tracker creates its own
 * ZooKeeper watcher and relies on the {@link HConnection} obtained from
 * <code>conf</code>. Nothing is tracked until {@link #start()} is called.
 * Does not timeout.
 *
 * @param conf the {@link Configuration} from which the {@link HConnection}
 * is obtained
 * @throws IOException if the delegate constructor fails
 */
public CatalogTracker(final Configuration conf) throws IOException {
  this(null, conf, null);
}
132
/**
 * Builds a catalog tracker, looking up the {@link HConnection} for
 * <code>conf</code> through {@link HConnectionManager}.
 *
 * <p>Nothing is tracked until {@link #start()} is called. Does not timeout.
 *
 * @param zk watcher to use; when null the tracker creates one of its own and
 * closes it again in {@link #stop()}
 * @param conf configuration used to obtain the connection (and the watcher,
 * when one has to be created)
 * @param abortable called on fatal exception; may be null, in which case the
 * connection associated with <code>conf</code> is used as the abortable
 * @throws IOException if the connection cannot be obtained or the delegate
 * constructor fails
 */
public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
    Abortable abortable)
throws IOException {
  this(zk, conf, HConnectionManager.getConnection(conf), abortable);
}
150
/**
 * Builds a catalog tracker on top of the given connection.
 *
 * <p>Nothing is tracked until {@link #start()} is called.
 *
 * @param zk watcher to use; when null the tracker creates one of its own and
 * closes it again in {@link #stop()}
 * @param conf configuration used when a watcher has to be created
 * @param connection connection used to talk to region servers; it is closed
 * by {@link #stop()}
 * @param abortable called on fatal exception; when null,
 * <code>connection</code> is used as the abortable instead
 * @throws IOException if the ZooKeeper watcher cannot be created
 */
public CatalogTracker(final ZooKeeperWatcher zk, final Configuration conf,
    HConnection connection, Abortable abortable)
throws IOException {
  this.connection = connection;
  if (abortable == null) {
    // A connection is abortable.
    this.abortable = this.connection;
  } else {
    // Keep the caller's abortable; without this assignment the field stays
    // null and start() cannot report a failure.
    this.abortable = abortable;
  }
  // Aborter handed to the meta region tracker. It turns an abort into a
  // RuntimeException, which start() catches and forwards to the abortable
  // resolved above.
  Abortable throwableAborter = new Abortable() {

    @Override
    public void abort(String why, Throwable e) {
      throw new RuntimeException(why, e);
    }

    // NOTE(review): always reports true, even before any abort -- confirm
    // nothing relies on this value.
    @Override
    public boolean isAborted() {
      return true;
    }

  };
  if (zk == null) {
    // Create our own. Set flag so we tear it down on stop. Hand it the
    // resolved abortable so the watcher never ends up with a null one just
    // because the caller passed none.
    this.zookeeper =
      new ZooKeeperWatcher(conf, "catalogtracker-on-" + connection,
        this.abortable);
    instantiatedzkw = true;
  } else {
    this.zookeeper = zk;
  }
  this.metaRegionTracker = new MetaRegionTracker(zookeeper, throwableAborter);
}
183
184 /**
185 * Starts the catalog tracker.
186 * Determines current availability of catalog tables and ensures all further
187 * transitions of either region are tracked.
188 * @throws IOException
189 * @throws InterruptedException
190 */
191 public void start() throws IOException, InterruptedException {
192 LOG.debug("Starting catalog tracker " + this);
193 try {
194 this.metaRegionTracker.start();
195 } catch (RuntimeException e) {
196 Throwable t = e.getCause();
197 this.abortable.abort(e.getMessage(), t);
198 throw new IOException("Attempt to start meta tracker failed.", t);
199 }
200 }
201
202 /**
203 * Stop working.
204 * Interrupts any ongoing waits.
205 */
206 public void stop() {
207 if (!this.stopped) {
208 LOG.debug("Stopping catalog tracker " + this);
209 this.stopped = true;
210 this.metaRegionTracker.stop();
211 try {
212 if (this.connection != null) {
213 this.connection.close();
214 }
215 } catch (IOException e) {
216 // Although the {@link Closeable} interface throws an {@link
217 // IOException}, in reality, the implementation would never do that.
218 LOG.error("Attempt to close catalog tracker's connection failed.", e);
219 }
220 if (this.instantiatedzkw) {
221 this.zookeeper.close();
222 }
223 }
224 }
225
226 /**
227 * Gets the current location for <code>.META.</code> or null if location is
228 * not currently available.
229 * @return {@link ServerName} for server hosting <code>.META.</code> or null
230 * if none available
231 * @throws InterruptedException
232 */
233 public ServerName getMetaLocation() throws InterruptedException {
234 return this.metaRegionTracker.getMetaRegionLocation();
235 }
236
237 /**
238 * Gets the current location for <code>.META.</code> if available and waits
239 * for up to the specified timeout if not immediately available. Returns null
240 * if the timeout elapses before root is available.
241 * @param timeout maximum time to wait for root availability, in milliseconds
242 * @return {@link ServerName} for server hosting <code>.META.</code> or null
243 * if none available
244 * @throws InterruptedException if interrupted while waiting
245 * @throws NotAllMetaRegionsOnlineException if meta not available before
246 * timeout
247 */
248 public ServerName waitForMeta(final long timeout)
249 throws InterruptedException, NotAllMetaRegionsOnlineException {
250 ServerName sn = metaRegionTracker.waitMetaRegionLocation(timeout);
251 if (sn == null) {
252 throw new NotAllMetaRegionsOnlineException("Timed out; " + timeout + "ms");
253 }
254 return sn;
255 }
256
257 /**
258 * Gets a connection to the server hosting meta, as reported by ZooKeeper,
259 * waiting up to the specified timeout for availability.
260 * @param timeout How long to wait on meta location
261 * @see #waitForMeta for additional information
262 * @return connection to server hosting meta
263 * @throws InterruptedException
264 * @throws NotAllMetaRegionsOnlineException if timed out waiting
265 * @throws IOException
266 * @deprecated Use #getMetaServerConnection(long)
267 */
268 public AdminService.BlockingInterface waitForMetaServerConnection(long timeout)
269 throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
270 return getMetaServerConnection(timeout);
271 }
272
273 /**
274 * Gets a connection to the server hosting meta, as reported by ZooKeeper,
275 * waiting up to the specified timeout for availability.
276 * <p>WARNING: Does not retry. Use an {@link HTable} instead.
277 * @param timeout How long to wait on meta location
278 * @see #waitForMeta for additional information
279 * @return connection to server hosting meta
280 * @throws InterruptedException
281 * @throws NotAllMetaRegionsOnlineException if timed out waiting
282 * @throws IOException
283 */
284 AdminService.BlockingInterface getMetaServerConnection(long timeout)
285 throws InterruptedException, NotAllMetaRegionsOnlineException, IOException {
286 return getCachedConnection(waitForMeta(timeout));
287 }
288
289 /**
290 * Waits indefinitely for availability of <code>.META.</code>. Used during
291 * cluster startup. Does not verify meta, just that something has been
292 * set up in zk.
293 * @see #waitForMeta(long)
294 * @throws InterruptedException if interrupted while waiting
295 */
296 public void waitForMeta() throws InterruptedException {
297 while (!this.stopped) {
298 try {
299 if (waitForMeta(100) != null) break;
300 } catch (NotAllMetaRegionsOnlineException e) {
301 if (LOG.isTraceEnabled()) {
302 LOG.info(".META. still not available, sleeping and retrying." +
303 " Reason: " + e.getMessage());
304 }
305 }
306 }
307 }
308
309 /**
310 * @param sn ServerName to get a connection against.
311 * @return The AdminProtocol we got when we connected to <code>sn</code>
312 * May have come from cache, may not be good, may have been setup by this
313 * invocation, or may be null.
314 * @throws IOException
315 */
316 private AdminService.BlockingInterface getCachedConnection(ServerName sn)
317 throws IOException {
318 if (sn == null) {
319 return null;
320 }
321 AdminService.BlockingInterface service = null;
322 try {
323 service = connection.getAdmin(sn);
324 } catch (RetriesExhaustedException e) {
325 if (e.getCause() != null && e.getCause() instanceof ConnectException) {
326 // Catch this; presume it means the cached connection has gone bad.
327 } else {
328 throw e;
329 }
330 } catch (SocketTimeoutException e) {
331 LOG.debug("Timed out connecting to " + sn);
332 } catch (NoRouteToHostException e) {
333 LOG.debug("Connecting to " + sn, e);
334 } catch (SocketException e) {
335 LOG.debug("Exception connecting to " + sn);
336 } catch (UnknownHostException e) {
337 LOG.debug("Unknown host exception connecting to " + sn);
338 } catch (IOException ioe) {
339 Throwable cause = ioe.getCause();
340 if (ioe instanceof ConnectException) {
341 // Catch. Connect refused.
342 } else if (cause != null && cause instanceof EOFException) {
343 // Catch. Other end disconnected us.
344 } else if (cause != null && cause.getMessage() != null &&
345 cause.getMessage().toLowerCase().contains("connection reset")) {
346 // Catch. Connection reset.
347 } else {
348 throw ioe;
349 }
350
351 }
352 return service;
353 }
354
355 /**
356 * Verify we can connect to <code>hostingServer</code> and that its carrying
357 * <code>regionName</code>.
358 * @param hostingServer Interface to the server hosting <code>regionName</code>
359 * @param address The servername that goes with the <code>metaServer</code>
360 * Interface. Used logging.
361 * @param regionName The regionname we are interested in.
362 * @return True if we were able to verify the region located at other side of
363 * the Interface.
364 * @throws IOException
365 */
366 // TODO: We should be able to get the ServerName from the AdminProtocol
367 // rather than have to pass it in. Its made awkward by the fact that the
368 // HRI is likely a proxy against remote server so the getServerName needs
369 // to be fixed to go to a local method or to a cache before we can do this.
370 private boolean verifyRegionLocation(AdminService.BlockingInterface hostingServer,
371 final ServerName address, final byte [] regionName)
372 throws IOException {
373 if (hostingServer == null) {
374 LOG.info("Passed hostingServer is null");
375 return false;
376 }
377 Throwable t = null;
378 try {
379 // Try and get regioninfo from the hosting server.
380 return ProtobufUtil.getRegionInfo(hostingServer, regionName) != null;
381 } catch (ConnectException e) {
382 t = e;
383 } catch (RetriesExhaustedException e) {
384 t = e;
385 } catch (RemoteException e) {
386 IOException ioe = e.unwrapRemoteException();
387 t = ioe;
388 } catch (IOException e) {
389 Throwable cause = e.getCause();
390 if (cause != null && cause instanceof EOFException) {
391 t = cause;
392 } else if (cause != null && cause.getMessage() != null
393 && cause.getMessage().contains("Connection reset")) {
394 t = cause;
395 } else {
396 t = e;
397 }
398 }
399 LOG.info("Failed verification of " + Bytes.toStringBinary(regionName) +
400 " at address=" + address + ", exception=" + t);
401 return false;
402 }
403
404 /**
405 * Verify <code>.META.</code> is deployed and accessible.
406 * @param timeout How long to wait on zk for meta address (passed through to
407 * the internal call to {@link #waitForMetaServerConnection(long)}.
408 * @return True if the <code>.META.</code> location is healthy.
409 * @throws IOException
410 * @throws InterruptedException
411 */
412 public boolean verifyMetaRegionLocation(final long timeout)
413 throws InterruptedException, IOException {
414 AdminService.BlockingInterface service = null;
415 try {
416 service = waitForMetaServerConnection(timeout);
417 } catch (NotAllMetaRegionsOnlineException e) {
418 // Pass
419 } catch (ServerNotRunningYetException e) {
420 // Pass -- remote server is not up so can't be carrying root
421 } catch (UnknownHostException e) {
422 // Pass -- server name doesn't resolve so it can't be assigned anything.
423 }
424 return (service == null)? false:
425 verifyRegionLocation(service,
426 this.metaRegionTracker.getMetaRegionLocation(), META_REGION_NAME);
427 }
428
429 public HConnection getConnection() {
430 return this.connection;
431 }
432 }