001/**
002 *
003 * Licensed to the Apache Software Foundation (ASF) under one
004 * or more contributor license agreements.  See the NOTICE file
005 * distributed with this work for additional information
006 * regarding copyright ownership.  The ASF licenses this file
007 * to you under the Apache License, Version 2.0 (the
008 * "License"); you may not use this file except in compliance
009 * with the License.  You may obtain a copy of the License at
010 *
011 *     http://www.apache.org/licenses/LICENSE-2.0
012 *
013 * Unless required by applicable law or agreed to in writing, software
014 * distributed under the License is distributed on an "AS IS" BASIS,
015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016 * See the License for the specific language governing permissions and
017 * limitations under the License.
018 */
019package org.apache.hadoop.hbase.regionserver.handler;
020
021import java.io.IOException;
022import java.util.concurrent.atomic.AtomicBoolean;
023
024import org.apache.hadoop.hbase.HConstants;
025import org.apache.hadoop.hbase.Server;
026import org.apache.hadoop.hbase.client.RegionInfo;
027import org.apache.hadoop.hbase.client.TableDescriptor;
028import org.apache.hadoop.hbase.executor.EventHandler;
029import org.apache.hadoop.hbase.executor.EventType;
030import org.apache.hadoop.hbase.regionserver.HRegion;
031import org.apache.hadoop.hbase.regionserver.RegionServerServices;
032import org.apache.hadoop.hbase.regionserver.RegionServerServices.PostOpenDeployContext;
033import org.apache.hadoop.hbase.regionserver.RegionServerServices.RegionStateTransitionContext;
034import org.apache.hadoop.hbase.util.CancelableProgressable;
035import org.apache.yetus.audience.InterfaceAudience;
036import org.slf4j.Logger;
037import org.slf4j.LoggerFactory;
038import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
039/**
040 * Handles opening of a region on a region server.
041 * <p>
042 * This is executed after receiving an OPEN RPC from the master or client.
043 */
044@InterfaceAudience.Private
045public class OpenRegionHandler extends EventHandler {
046  private static final Logger LOG = LoggerFactory.getLogger(OpenRegionHandler.class);
047
048  protected final RegionServerServices rsServices;
049
050  private final RegionInfo regionInfo;
051  private final TableDescriptor htd;
052  private final long masterSystemTime;
053
054  public OpenRegionHandler(final Server server,
055      final RegionServerServices rsServices, RegionInfo regionInfo,
056      TableDescriptor htd, long masterSystemTime) {
057    this(server, rsServices, regionInfo, htd, masterSystemTime, EventType.M_RS_OPEN_REGION);
058  }
059
060  protected OpenRegionHandler(final Server server,
061                              final RegionServerServices rsServices, final RegionInfo regionInfo,
062                              final TableDescriptor htd, long masterSystemTime, EventType eventType) {
063    super(server, eventType);
064    this.rsServices = rsServices;
065    this.regionInfo = regionInfo;
066    this.htd = htd;
067    this.masterSystemTime = masterSystemTime;
068  }
069
070  public RegionInfo getRegionInfo() {
071    return regionInfo;
072  }
073
074  @Override
075  public void process() throws IOException {
076    boolean openSuccessful = false;
077    final String regionName = regionInfo.getRegionNameAsString();
078    HRegion region = null;
079
080    try {
081      if (this.server.isStopped() || this.rsServices.isStopping()) {
082        return;
083      }
084      final String encodedName = regionInfo.getEncodedName();
085
086      // 2 different difficult situations can occur
087      // 1) The opening was cancelled. This is an expected situation
088      // 2) The region is now marked as online while we're suppose to open. This would be a bug.
089
090      // Check that this region is not already online
091      if (this.rsServices.getRegion(encodedName) != null) {
092        LOG.error("Region " + encodedName +
093            " was already online when we started processing the opening. " +
094            "Marking this new attempt as failed");
095        return;
096      }
097
098      // Check that we're still supposed to open the region.
099      // If fails, just return.  Someone stole the region from under us.
100      if (!isRegionStillOpening()){
101        LOG.error("Region " + encodedName + " opening cancelled");
102        return;
103      }
104
105      // Open region.  After a successful open, failures in subsequent
106      // processing needs to do a close as part of cleanup.
107      region = openRegion();
108      if (region == null) {
109        return;
110      }
111
112      if (!updateMeta(region, masterSystemTime) || this.server.isStopped() ||
113          this.rsServices.isStopping()) {
114        return;
115      }
116
117      if (!isRegionStillOpening()) {
118        return;
119      }
120
121      // Successful region open, and add it to MutableOnlineRegions
122      this.rsServices.addRegion(region);
123      openSuccessful = true;
124
125      // Done!  Successful region open
126      LOG.debug("Opened " + regionName + " on " + this.server.getServerName());
127    } finally {
128      // Do all clean up here
129      if (!openSuccessful) {
130        doCleanUpOnFailedOpen(region);
131      }
132      final Boolean current = this.rsServices.getRegionsInTransitionInRS().
133          remove(this.regionInfo.getEncodedNameAsBytes());
134
135      // Let's check if we have met a race condition on open cancellation....
136      // A better solution would be to not have any race condition.
137      // this.rsServices.getRegionsInTransitionInRS().remove(
138      //  this.regionInfo.getEncodedNameAsBytes(), Boolean.TRUE);
139      // would help.
140      if (openSuccessful) {
141        if (current == null) { // Should NEVER happen, but let's be paranoid.
142          LOG.error("Bad state: we've just opened a region that was NOT in transition. Region="
143              + regionName);
144        } else if (Boolean.FALSE.equals(current)) { // Can happen, if we're
145                                                    // really unlucky.
146          LOG.error("Race condition: we've finished to open a region, while a close was requested "
147              + " on region=" + regionName + ". It can be a critical error, as a region that"
148              + " should be closed is now opened. Closing it now");
149          cleanupFailedOpen(region);
150        }
151      }
152    }
153  }
154
155  private void doCleanUpOnFailedOpen(HRegion region)
156      throws IOException {
157    try {
158      if (region != null) {
159        cleanupFailedOpen(region);
160      }
161    } finally {
162      rsServices.reportRegionStateTransition(new RegionStateTransitionContext(
163          TransitionCode.FAILED_OPEN, HConstants.NO_SEQNUM, -1, regionInfo));
164    }
165  }
166
167  /**
168   * Update ZK or META.  This can take a while if for example the
169   * hbase:meta is not available -- if server hosting hbase:meta crashed and we are
170   * waiting on it to come back -- so run in a thread and keep updating znode
171   * state meantime so master doesn't timeout our region-in-transition.
172   * Caller must cleanup region if this fails.
173   */
174  boolean updateMeta(final HRegion r, long masterSystemTime) {
175    if (this.server.isStopped() || this.rsServices.isStopping()) {
176      return false;
177    }
178    // Object we do wait/notify on.  Make it boolean.  If set, we're done.
179    // Else, wait.
180    final AtomicBoolean signaller = new AtomicBoolean(false);
181    PostOpenDeployTasksThread t = new PostOpenDeployTasksThread(r,
182      this.server, this.rsServices, signaller, masterSystemTime);
183    t.start();
184    // Post open deploy task:
185    //   meta => update meta location in ZK
186    //   other region => update meta
187    while (!signaller.get() && t.isAlive() && !this.server.isStopped() &&
188        !this.rsServices.isStopping() && isRegionStillOpening()) {
189      synchronized (signaller) {
190        try {
191          // Wait for 10 seconds, so that server shutdown
192          // won't take too long if this thread happens to run.
193          if (!signaller.get()) signaller.wait(10000);
194        } catch (InterruptedException e) {
195          // Go to the loop check.
196        }
197      }
198    }
199    // Is thread still alive?  We may have left above loop because server is
200    // stopping or we timed out the edit.  Is so, interrupt it.
201    if (t.isAlive()) {
202      if (!signaller.get()) {
203        // Thread still running; interrupt
204        LOG.debug("Interrupting thread " + t);
205        t.interrupt();
206      }
207      try {
208        t.join();
209      } catch (InterruptedException ie) {
210        LOG.warn("Interrupted joining " +
211          r.getRegionInfo().getRegionNameAsString(), ie);
212        Thread.currentThread().interrupt();
213      }
214    }
215
216    // Was there an exception opening the region?  This should trigger on
217    // InterruptedException too.  If so, we failed.
218    return (!Thread.interrupted() && t.getException() == null);
219  }
220
221  /**
222   * Thread to run region post open tasks. Call {@link #getException()} after the thread finishes
223   * to check for exceptions running
224   * {@link RegionServerServices#postOpenDeployTasks(PostOpenDeployContext)}
225   */
226  static class PostOpenDeployTasksThread extends Thread {
227    private Throwable exception = null;
228    private final Server server;
229    private final RegionServerServices services;
230    private final HRegion region;
231    private final AtomicBoolean signaller;
232    private final long masterSystemTime;
233
234    PostOpenDeployTasksThread(final HRegion region, final Server server,
235        final RegionServerServices services, final AtomicBoolean signaller, long masterSystemTime) {
236      super("PostOpenDeployTasks:" + region.getRegionInfo().getEncodedName());
237      this.setDaemon(true);
238      this.server = server;
239      this.services = services;
240      this.region = region;
241      this.signaller = signaller;
242      this.masterSystemTime = masterSystemTime;
243    }
244
245    @Override
246    public void run() {
247      try {
248        this.services.postOpenDeployTasks(new PostOpenDeployContext(region, masterSystemTime));
249      } catch (Throwable e) {
250        String msg = "Exception running postOpenDeployTasks; region=" +
251          this.region.getRegionInfo().getEncodedName();
252        this.exception = e;
253        if (e instanceof IOException
254            && isRegionStillOpening(region.getRegionInfo(), services)) {
255          server.abort(msg, e);
256        } else {
257          LOG.warn(msg, e);
258        }
259      }
260      // We're done.  Set flag then wake up anyone waiting on thread to complete.
261      this.signaller.set(true);
262      synchronized (this.signaller) {
263        this.signaller.notify();
264      }
265    }
266
267    /**
268     * @return Null or the run exception; call this method after thread is done.
269     */
270    Throwable getException() {
271      return this.exception;
272    }
273  }
274
275  /**
276   * @return Instance of HRegion if successful open else null.
277   */
278  HRegion openRegion() {
279    HRegion region = null;
280    try {
281      // Instantiate the region.  This also periodically tickles OPENING
282      // state so master doesn't timeout this region in transition.
283      region = HRegion.openHRegion(this.regionInfo, this.htd,
284        this.rsServices.getWAL(this.regionInfo),
285        this.server.getConfiguration(),
286        this.rsServices,
287        new CancelableProgressable() {
288          @Override
289          public boolean progress() {
290            if (!isRegionStillOpening()) {
291              LOG.warn("Open region aborted since it isn't opening any more");
292              return false;
293            }
294            return true;
295          }
296        });
297    } catch (Throwable t) {
298      // We failed open. Our caller will see the 'null' return value
299      // and transition the node back to FAILED_OPEN. If that fails,
300      // we rely on the Timeout Monitor in the master to reassign.
301      LOG.error(
302          "Failed open of region=" + this.regionInfo.getRegionNameAsString(), t);
303    }
304    return region;
305  }
306
307  void cleanupFailedOpen(final HRegion region) throws IOException {
308    if (region != null) {
309      this.rsServices.removeRegion(region, null);
310      region.close();
311    }
312  }
313
314  private static boolean isRegionStillOpening(
315      RegionInfo regionInfo, RegionServerServices rsServices) {
316    byte[] encodedName = regionInfo.getEncodedNameAsBytes();
317    Boolean action = rsServices.getRegionsInTransitionInRS().get(encodedName);
318    return Boolean.TRUE.equals(action); // true means opening for RIT
319  }
320
321  private boolean isRegionStillOpening() {
322    return isRegionStillOpening(regionInfo, rsServices);
323  }
324}