View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver.handler;
20  
21  import java.io.IOException;
22  import java.util.concurrent.atomic.AtomicBoolean;
23  
24  import org.apache.commons.logging.Log;
25  import org.apache.commons.logging.LogFactory;
26  import org.apache.hadoop.hbase.classification.InterfaceAudience;
27  import org.apache.hadoop.hbase.HRegionInfo;
28  import org.apache.hadoop.hbase.HTableDescriptor;
29  import org.apache.hadoop.hbase.Server;
30  import org.apache.hadoop.hbase.executor.EventHandler;
31  import org.apache.hadoop.hbase.executor.EventType;
32  import org.apache.hadoop.hbase.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
33  import org.apache.hadoop.hbase.regionserver.HRegion;
34  import org.apache.hadoop.hbase.regionserver.RegionServerAccounting;
35  import org.apache.hadoop.hbase.regionserver.RegionServerServices;
36  import org.apache.hadoop.hbase.util.CancelableProgressable;
37  /**
38   * Handles opening of a region on a region server.
39   * <p>
40   * This is executed after receiving an OPEN RPC from the master or client.
41   */
42  @InterfaceAudience.Private
43  public class OpenRegionHandler extends EventHandler {
44    private static final Log LOG = LogFactory.getLog(OpenRegionHandler.class);
45  
46    protected final RegionServerServices rsServices;
47  
48    private final HRegionInfo regionInfo;
49    private final HTableDescriptor htd;
50  
51    public OpenRegionHandler(final Server server,
52        final RegionServerServices rsServices, HRegionInfo regionInfo,
53        HTableDescriptor htd) {
54      this(server, rsServices, regionInfo, htd, EventType.M_RS_OPEN_REGION);
55    }
56  
57    protected OpenRegionHandler(final Server server,
58        final RegionServerServices rsServices, final HRegionInfo regionInfo,
59        final HTableDescriptor htd, EventType eventType) {
60      super(server, eventType);
61      this.rsServices = rsServices;
62      this.regionInfo = regionInfo;
63      this.htd = htd;
64    }
65  
66    public HRegionInfo getRegionInfo() {
67      return regionInfo;
68    }
69  
70    @Override
71    public void process() throws IOException {
72      boolean openSuccessful = false;
73      final String regionName = regionInfo.getRegionNameAsString();
74      HRegion region = null;
75  
76      try {
77        if (this.server.isStopped() || this.rsServices.isStopping()) {
78          return;
79        }
80        final String encodedName = regionInfo.getEncodedName();
81  
82        // 2 different difficult situations can occur
83        // 1) The opening was cancelled. This is an expected situation
84        // 2) The region is now marked as online while we're suppose to open. This would be a bug.
85  
86        // Check that this region is not already online
87        if (this.rsServices.getFromOnlineRegions(encodedName) != null) {
88          LOG.error("Region " + encodedName +
89              " was already online when we started processing the opening. " +
90              "Marking this new attempt as failed");
91          return;
92        }
93  
94        // Check that we're still supposed to open the region.
95        // If fails, just return.  Someone stole the region from under us.
96        if (!isRegionStillOpening()){
97          LOG.error("Region " + encodedName + " opening cancelled");
98          return;
99        }
100 
101       // Open region.  After a successful open, failures in subsequent
102       // processing needs to do a close as part of cleanup.
103       region = openRegion();
104       if (region == null) {
105         return;
106       }
107 
108       if (!updateMeta(region) || this.server.isStopped() ||
109           this.rsServices.isStopping()) {
110         return;
111       }
112 
113       if (!isRegionStillOpening()) {
114         return;
115       }
116 
117       // Successful region open, and add it to OnlineRegions
118       this.rsServices.addToOnlineRegions(region);
119       openSuccessful = true;
120 
121       // Done!  Successful region open
122       LOG.debug("Opened " + regionName + " on " +
123         this.server.getServerName());
124     } finally {
125       // Do all clean up here
126       if (!openSuccessful) {
127         doCleanUpOnFailedOpen(region);
128       }
129       final Boolean current = this.rsServices.getRegionsInTransitionInRS().
130           remove(this.regionInfo.getEncodedNameAsBytes());
131 
132       // Let's check if we have met a race condition on open cancellation....
133       // A better solution would be to not have any race condition.
134       // this.rsServices.getRegionsInTransitionInRS().remove(
135       //  this.regionInfo.getEncodedNameAsBytes(), Boolean.TRUE);
136       // would help.
137       if (openSuccessful) {
138         if (current == null) { // Should NEVER happen, but let's be paranoid.
139           LOG.error("Bad state: we've just opened a region that was NOT in transition. Region="
140               + regionName);
141         } else if (Boolean.FALSE.equals(current)) { // Can happen, if we're
142                                                     // really unlucky.
143           LOG.error("Race condition: we've finished to open a region, while a close was requested "
144               + " on region=" + regionName + ". It can be a critical error, as a region that"
145               + " should be closed is now opened. Closing it now");
146           cleanupFailedOpen(region);
147         }
148       }
149     }
150   }
151 
152   private void doCleanUpOnFailedOpen(HRegion region)
153       throws IOException {
154     try {
155       if (region != null) {
156         cleanupFailedOpen(region);
157       }
158     } finally {
159       rsServices.reportRegionStateTransition(TransitionCode.FAILED_OPEN, regionInfo);
160     }
161   }
162 
163   /**
164    * Update ZK or META.  This can take a while if for example the
165    * hbase:meta is not available -- if server hosting hbase:meta crashed and we are
166    * waiting on it to come back -- so run in a thread and keep updating znode
167    * state meantime so master doesn't timeout our region-in-transition.
168    * Caller must cleanup region if this fails.
169    */
170   boolean updateMeta(final HRegion r) {
171     if (this.server.isStopped() || this.rsServices.isStopping()) {
172       return false;
173     }
174     // Object we do wait/notify on.  Make it boolean.  If set, we're done.
175     // Else, wait.
176     final AtomicBoolean signaller = new AtomicBoolean(false);
177     PostOpenDeployTasksThread t = new PostOpenDeployTasksThread(r,
178       this.server, this.rsServices, signaller);
179     t.start();
180     // Post open deploy task:
181     //   meta => update meta location in ZK
182     //   other region => update meta
183     while (!signaller.get() && t.isAlive() && !this.server.isStopped() &&
184         !this.rsServices.isStopping() && isRegionStillOpening()) {
185       synchronized (signaller) {
186         try {
187           // Wait for 10 seconds, so that server shutdown
188           // won't take too long if this thread happens to run.
189           if (!signaller.get()) signaller.wait(10000);
190         } catch (InterruptedException e) {
191           // Go to the loop check.
192         }
193       }
194     }
195     // Is thread still alive?  We may have left above loop because server is
196     // stopping or we timed out the edit.  Is so, interrupt it.
197     if (t.isAlive()) {
198       if (!signaller.get()) {
199         // Thread still running; interrupt
200         LOG.debug("Interrupting thread " + t);
201         t.interrupt();
202       }
203       try {
204         t.join();
205       } catch (InterruptedException ie) {
206         LOG.warn("Interrupted joining " +
207           r.getRegionInfo().getRegionNameAsString(), ie);
208         Thread.currentThread().interrupt();
209       }
210     }
211 
212     // Was there an exception opening the region?  This should trigger on
213     // InterruptedException too.  If so, we failed.
214     return (!Thread.interrupted() && t.getException() == null);
215   }
216 
217   /**
218    * Thread to run region post open tasks. Call {@link #getException()} after
219    * the thread finishes to check for exceptions running
220    * {@link RegionServerServices#postOpenDeployTasks(HRegion)
221    */
222   static class PostOpenDeployTasksThread extends Thread {
223     private Throwable exception = null;
224     private final Server server;
225     private final RegionServerServices services;
226     private final HRegion region;
227     private final AtomicBoolean signaller;
228 
229     PostOpenDeployTasksThread(final HRegion region, final Server server,
230         final RegionServerServices services, final AtomicBoolean signaller) {
231       super("PostOpenDeployTasks:" + region.getRegionInfo().getEncodedName());
232       this.setDaemon(true);
233       this.server = server;
234       this.services = services;
235       this.region = region;
236       this.signaller = signaller;
237     }
238 
239     public void run() {
240       try {
241         this.services.postOpenDeployTasks(this.region);
242       } catch (Throwable e) {
243         String msg = "Exception running postOpenDeployTasks; region=" +
244           this.region.getRegionInfo().getEncodedName();
245         this.exception = e;
246         if (e instanceof IOException
247             && isRegionStillOpening(region.getRegionInfo(), services)) {
248           server.abort(msg, e);
249         } else {
250           LOG.warn(msg, e);
251         }
252       }
253       // We're done.  Set flag then wake up anyone waiting on thread to complete.
254       this.signaller.set(true);
255       synchronized (this.signaller) {
256         this.signaller.notify();
257       }
258     }
259 
260     /**
261      * @return Null or the run exception; call this method after thread is done.
262      */
263     Throwable getException() {
264       return this.exception;
265     }
266   }
267 
268   /**
269    * @return Instance of HRegion if successful open else null.
270    */
271   HRegion openRegion() {
272     HRegion region = null;
273     try {
274       // Instantiate the region.  This also periodically tickles OPENING
275       // state so master doesn't timeout this region in transition.
276       region = HRegion.openHRegion(this.regionInfo, this.htd,
277         this.rsServices.getWAL(this.regionInfo),
278         this.server.getConfiguration(),
279         this.rsServices,
280         new CancelableProgressable() {
281           public boolean progress() {
282             if (!isRegionStillOpening()) {
283               LOG.warn("Open region aborted since it isn't opening any more");
284               return false;
285             }
286             return true;
287           }
288         });
289     } catch (Throwable t) {
290       // We failed open. Our caller will see the 'null' return value
291       // and transition the node back to FAILED_OPEN. If that fails,
292       // we rely on the Timeout Monitor in the master to reassign.
293       LOG.error(
294           "Failed open of region=" + this.regionInfo.getRegionNameAsString()
295               + ", starting to roll back the global memstore size.", t);
296       // Decrease the global memstore size.
297       if (this.rsServices != null) {
298         RegionServerAccounting rsAccounting =
299           this.rsServices.getRegionServerAccounting();
300         if (rsAccounting != null) {
301           rsAccounting.rollbackRegionReplayEditsSize(this.regionInfo.getRegionName());
302         }
303       }
304     }
305     return region;
306   }
307 
308   void cleanupFailedOpen(final HRegion region) throws IOException {
309     if (region != null) {
310       this.rsServices.removeFromOnlineRegions(region, null);
311       region.close();
312     }
313   }
314 
315   private static boolean isRegionStillOpening(
316       HRegionInfo regionInfo, RegionServerServices rsServices) {
317     byte[] encodedName = regionInfo.getEncodedNameAsBytes();
318     Boolean action = rsServices.getRegionsInTransitionInRS().get(encodedName);
319     return Boolean.TRUE.equals(action); // true means opening for RIT
320   }
321 
322   private boolean isRegionStillOpening() {
323     return isRegionStillOpening(regionInfo, rsServices);
324   }
325 }