/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver.handler;

import java.io.IOException;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.executor.EventHandler;
import org.apache.hadoop.hbase.executor.EventType;
import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.Region;
import org.apache.hadoop.hbase.regionserver.RegionServerAccounting;
import org.apache.hadoop.hbase.regionserver.RegionServerServices;
import org.apache.hadoop.hbase.regionserver.RegionServerServices.PostOpenDeployContext;
import org.apache.hadoop.hbase.util.CancelableProgressable;

/**
 * Handles opening of a region on a region server.
 * <p>
 * This is executed after receiving an OPEN RPC from the master or client.
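 * <p>
 * As a rough, illustrative sketch only (the submission site is not part of this class, and the
 * executor accessor name below is an assumption), a region server would construct the handler
 * and hand it to its event executor along these lines:
 * <pre>
 *   OpenRegionHandler handler =
 *     new OpenRegionHandler(server, rsServices, regionInfo, htd, masterSystemTime);
 *   rsServices.getExecutorService().submit(handler);
 * </pre>
 * {@link #process()} then opens the region, updates meta, and registers the region as online.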
 */
@InterfaceAudience.Private
public class OpenRegionHandler extends EventHandler {
  private static final Log LOG = LogFactory.getLog(OpenRegionHandler.class);

  protected final RegionServerServices rsServices;

  private final HRegionInfo regionInfo;
  private final HTableDescriptor htd;
  private final long masterSystemTime;

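  /**
   * Constructs a handler that opens the given region as an
   * {@link EventType#M_RS_OPEN_REGION} event.
   *
   * @param server server we are running on
   * @param rsServices services the hosting region server exposes to handlers
   * @param regionInfo region to open
   * @param htd table descriptor for the region's table
   * @param masterSystemTime timestamp supplied by the master, passed through to meta updates
   */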
  public OpenRegionHandler(final Server server,
      final RegionServerServices rsServices, HRegionInfo regionInfo,
      HTableDescriptor htd, long masterSystemTime) {
    this(server, rsServices, regionInfo, htd, masterSystemTime, EventType.M_RS_OPEN_REGION);
  }

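  /**
   * Constructor for subclasses that need to report a different {@link EventType}
   * (for example, a handler dedicated to opening the meta region).
   */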
  protected OpenRegionHandler(final Server server,
      final RegionServerServices rsServices, final HRegionInfo regionInfo,
      final HTableDescriptor htd, long masterSystemTime, EventType eventType) {
    super(server, eventType);
    this.rsServices = rsServices;
    this.regionInfo = regionInfo;
    this.htd = htd;
    this.masterSystemTime = masterSystemTime;
  }

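  /** @return The {@link HRegionInfo} for the region this handler is opening. */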
  public HRegionInfo getRegionInfo() {
    return regionInfo;
  }

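  /**
   * Opens the region: verifies it is not already online and is still marked as opening,
   * instantiates it via {@link #openRegion()}, updates meta via
   * {@link #updateMeta(HRegion, long)}, and finally registers the region as online.
   * On any failure the open is cleaned up and FAILED_OPEN is reported to the master.
   */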
  @Override
  public void process() throws IOException {
    boolean openSuccessful = false;
    final String regionName = regionInfo.getRegionNameAsString();
    HRegion region = null;

    try {
      if (this.server.isStopped() || this.rsServices.isStopping()) {
        return;
      }
      final String encodedName = regionInfo.getEncodedName();

      // Two different difficult situations can occur:
      // 1) The opening was cancelled. This is an expected situation.
      // 2) The region is now marked as online while we're supposed to open it. This would be a bug.

      // Check that this region is not already online
      if (this.rsServices.getFromOnlineRegions(encodedName) != null) {
        LOG.error("Region " + encodedName +
            " was already online when we started processing the opening. " +
            "Marking this new attempt as failed");
        return;
      }

      // Check that we're still supposed to open the region.
      // If not, just return.  Someone stole the region from under us.
      if (!isRegionStillOpening()) {
        LOG.error("Region " + encodedName + " opening cancelled");
        return;
      }

      // Open region.  After a successful open, failures in subsequent
      // processing need to do a close as part of cleanup.
      region = openRegion();
      if (region == null) {
        return;
      }

      if (!updateMeta(region, masterSystemTime) || this.server.isStopped() ||
          this.rsServices.isStopping()) {
        return;
      }

      if (!isRegionStillOpening()) {
        return;
      }

      // The region opened successfully; add it to OnlineRegions
      this.rsServices.addToOnlineRegions(region);
      openSuccessful = true;

      // Done!  Successful region open
      LOG.debug("Opened " + regionName + " on " +
        this.server.getServerName());
    } finally {
      // Do all clean up here
      if (!openSuccessful) {
        doCleanUpOnFailedOpen(region);
      }
      final Boolean current = this.rsServices.getRegionsInTransitionInRS().
          remove(this.regionInfo.getEncodedNameAsBytes());

      // Check whether we have hit a race condition on open cancellation....
      // A better solution would be to not have any race condition.
      // this.rsServices.getRegionsInTransitionInRS().remove(
      //  this.regionInfo.getEncodedNameAsBytes(), Boolean.TRUE);
      // would help.
      if (openSuccessful) {
        if (current == null) { // Should NEVER happen, but let's be paranoid.
          LOG.error("Bad state: we've just opened a region that was NOT in transition. Region="
              + regionName);
        } else if (Boolean.FALSE.equals(current)) { // Can happen, if we're really unlucky.
          LOG.error("Race condition: we've finished opening a region, while a close was requested "
              + "on region=" + regionName + ". This can be a critical error, as a region that"
              + " should be closed is now open. Closing it now");
          cleanupFailedOpen(region);
        }
      }
    }
  }

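  /**
   * Cleanup after a failed open: close the region if it was instantiated and, whether or not
   * that succeeds, report {@link TransitionCode#FAILED_OPEN} to the master.
   */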
  private void doCleanUpOnFailedOpen(HRegion region)
      throws IOException {
    try {
      if (region != null) {
        cleanupFailedOpen(region);
      }
    } finally {
      rsServices.reportRegionStateTransition(TransitionCode.FAILED_OPEN, regionInfo);
    }
  }

  /**
   * Update ZK or META.  This can take a while if, for example,
   * hbase:meta is not available -- say the server hosting hbase:meta crashed and we are
   * waiting on it to come back -- so run in a thread and keep updating znode
   * state in the meantime so the master doesn't time out our region-in-transition.
   * Caller must clean up the region if this fails.
   */
  boolean updateMeta(final HRegion r, long masterSystemTime) {
    if (this.server.isStopped() || this.rsServices.isStopping()) {
      return false;
    }
    // Object we do wait/notify on.  Make it boolean.  If set, we're done.
    // Else, wait.
    final AtomicBoolean signaller = new AtomicBoolean(false);
    PostOpenDeployTasksThread t = new PostOpenDeployTasksThread(r,
      this.server, this.rsServices, signaller, masterSystemTime);
    t.start();
    // Post open deploy task:
    //   meta => update meta location in ZK
    //   other region => update meta
    while (!signaller.get() && t.isAlive() && !this.server.isStopped() &&
        !this.rsServices.isStopping() && isRegionStillOpening()) {
      synchronized (signaller) {
        try {
          // Wait for 10 seconds, so that server shutdown
          // won't take too long if this thread happens to run.
          if (!signaller.get()) signaller.wait(10000);
        } catch (InterruptedException e) {
          // Go to the loop check.
        }
      }
    }
    // Is the thread still alive?  We may have left the above loop because the server is
    // stopping or we timed out the edit.  If so, interrupt it.
    if (t.isAlive()) {
      if (!signaller.get()) {
        // Thread still running; interrupt
        LOG.debug("Interrupting thread " + t);
        t.interrupt();
      }
      try {
        t.join();
      } catch (InterruptedException ie) {
        LOG.warn("Interrupted joining " +
          r.getRegionInfo().getRegionNameAsString(), ie);
        Thread.currentThread().interrupt();
      }
    }

    // Was there an exception opening the region?  This should trigger on
    // InterruptedException too.  If so, we failed.
    return (!Thread.interrupted() && t.getException() == null);
  }

  /**
   * Thread to run region post open tasks. Call {@link #getException()} after the thread finishes
   * to check for exceptions running {@link RegionServerServices#postOpenDeployTasks(Region)}.
   */
  static class PostOpenDeployTasksThread extends Thread {
    private Throwable exception = null;
    private final Server server;
    private final RegionServerServices services;
    private final HRegion region;
    private final AtomicBoolean signaller;
    private final long masterSystemTime;

    PostOpenDeployTasksThread(final HRegion region, final Server server,
        final RegionServerServices services, final AtomicBoolean signaller, long masterSystemTime) {
      super("PostOpenDeployTasks:" + region.getRegionInfo().getEncodedName());
      this.setDaemon(true);
      this.server = server;
      this.services = services;
      this.region = region;
      this.signaller = signaller;
      this.masterSystemTime = masterSystemTime;
    }

    @Override
    public void run() {
      try {
        this.services.postOpenDeployTasks(new PostOpenDeployContext(region, masterSystemTime));
      } catch (Throwable e) {
        String msg = "Exception running postOpenDeployTasks; region=" +
          this.region.getRegionInfo().getEncodedName();
        this.exception = e;
        if (e instanceof IOException
            && isRegionStillOpening(region.getRegionInfo(), services)) {
          server.abort(msg, e);
        } else {
          LOG.warn(msg, e);
        }
      }
      // We're done.  Set flag then wake up anyone waiting on thread to complete.
      this.signaller.set(true);
      synchronized (this.signaller) {
        this.signaller.notify();
      }
    }

    /**
     * @return Null or the run exception; call this method after the thread is done.
     */
    Throwable getException() {
      return this.exception;
    }
  }

  /**
   * @return Instance of HRegion if the open succeeded, else null.
   */
  HRegion openRegion() {
    HRegion region = null;
    try {
      // Instantiate the region.  This also periodically tickles OPENING
      // state so the master doesn't time out this region in transition.
      region = HRegion.openHRegion(this.regionInfo, this.htd,
        this.rsServices.getWAL(this.regionInfo),
        this.server.getConfiguration(),
        this.rsServices,
        new CancelableProgressable() {
          @Override
          public boolean progress() {
            if (!isRegionStillOpening()) {
              LOG.warn("Open region aborted since it isn't opening any more");
              return false;
            }
            return true;
          }
        });
    } catch (Throwable t) {
      // We failed open. Our caller will see the 'null' return value
      // and transition the node back to FAILED_OPEN. If that fails,
      // we rely on the Timeout Monitor in the master to reassign.
      LOG.error(
          "Failed open of region=" + this.regionInfo.getRegionNameAsString()
              + ", starting to roll back the global memstore size.", t);
      // Decrease the global memstore size.
      if (this.rsServices != null) {
        RegionServerAccounting rsAccounting =
          this.rsServices.getRegionServerAccounting();
        if (rsAccounting != null) {
          rsAccounting.rollbackRegionReplayEditsSize(this.regionInfo.getRegionName());
        }
      }
    }
    return region;
  }

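  /**
   * Remove the region from the online set and close it. A null region is a no-op.
   */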
  void cleanupFailedOpen(final HRegion region) throws IOException {
    if (region != null) {
      this.rsServices.removeFromOnlineRegions(region, null);
      region.close();
    }
  }

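  /**
   * @return True if the regions-in-transition map still marks this region as opening
   *   (a Boolean.TRUE entry); false if the entry is missing or no longer TRUE, for example
   *   because the open was cancelled.
   */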
  private static boolean isRegionStillOpening(
      HRegionInfo regionInfo, RegionServerServices rsServices) {
    byte[] encodedName = regionInfo.getEncodedNameAsBytes();
    Boolean action = rsServices.getRegionsInTransitionInRS().get(encodedName);
    return Boolean.TRUE.equals(action); // true means opening for RIT
  }

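  /** @return True if this handler's region is still marked as opening in the RIT map. */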
  private boolean isRegionStillOpening() {
    return isRegionStillOpening(regionInfo, rsServices);
  }
}