001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.snapshot;
019
020import java.io.IOException;
021import java.util.HashSet;
022import java.util.List;
023import java.util.Set;
024
025import org.apache.hadoop.hbase.ServerName;
026import org.apache.hadoop.hbase.client.RegionInfo;
027import org.apache.hadoop.hbase.client.RegionReplicaUtil;
028import org.apache.hadoop.hbase.errorhandling.ForeignException;
029import org.apache.hadoop.hbase.master.MasterServices;
030import org.apache.hadoop.hbase.mob.MobUtils;
031import org.apache.hadoop.hbase.procedure.Procedure;
032import org.apache.hadoop.hbase.procedure.ProcedureCoordinator;
033import org.apache.hadoop.hbase.snapshot.HBaseSnapshotException;
034import org.apache.hadoop.hbase.util.Pair;
035import org.apache.yetus.audience.InterfaceAudience;
036import org.slf4j.Logger;
037import org.slf4j.LoggerFactory;
038import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
039import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription;
040
041/**
042 * Handle the master side of taking a snapshot of an online table, regardless of snapshot type.
043 * Uses a {@link Procedure} to run the snapshot across all the involved region servers.
044 * @see ProcedureCoordinator
045 */
046@InterfaceAudience.Private
047public class EnabledTableSnapshotHandler extends TakeSnapshotHandler {
048
049  private static final Logger LOG = LoggerFactory.getLogger(EnabledTableSnapshotHandler.class);
050  private final ProcedureCoordinator coordinator;
051
052  public EnabledTableSnapshotHandler(SnapshotDescription snapshot, MasterServices master,
053      final SnapshotManager manager) {
054    super(snapshot, master, manager);
055    this.coordinator = manager.getCoordinator();
056  }
057
058  @Override
059  public EnabledTableSnapshotHandler prepare() throws Exception {
060    return (EnabledTableSnapshotHandler) super.prepare();
061  }
062
063  // TODO consider switching over to using regionnames, rather than server names. This would allow
064  // regions to migrate during a snapshot, and then be involved when they are ready. Still want to
065  // enforce a snapshot time constraints, but lets us be potentially a bit more robust.
066
067  /**
068   * This method kicks off a snapshot procedure.  Other than that it hangs around for various
069   * phases to complete.
070   */
071  @Override
072  protected void snapshotRegions(List<Pair<RegionInfo, ServerName>> regions) throws IOException {
073    Set<String> regionServers = new HashSet<>(regions.size());
074    for (Pair<RegionInfo, ServerName> region : regions) {
075      if (region != null && region.getFirst() != null && region.getSecond() != null) {
076        RegionInfo hri = region.getFirst();
077        if (hri.isOffline() && (hri.isSplit() || hri.isSplitParent())) continue;
078        regionServers.add(region.getSecond().toString());
079      }
080    }
081
082    // start the snapshot on the RS
083    Procedure proc = coordinator.startProcedure(this.monitor, this.snapshot.getName(),
084      this.snapshot.toByteArray(), Lists.newArrayList(regionServers));
085    if (proc == null) {
086      String msg = "Failed to submit distributed procedure for snapshot '"
087          + snapshot.getName() + "'";
088      LOG.error(msg);
089      throw new HBaseSnapshotException(msg);
090    }
091
092    try {
093      // wait for the snapshot to complete.  A timer thread is kicked off that should cancel this
094      // if it takes too long.
095      proc.waitForCompleted();
096      LOG.info("Done waiting - online snapshot for " + this.snapshot.getName());
097
098      // Take the offline regions as disabled
099      for (Pair<RegionInfo, ServerName> region : regions) {
100        RegionInfo regionInfo = region.getFirst();
101        if (regionInfo.isOffline() && (regionInfo.isSplit() || regionInfo.isSplitParent()) &&
102            RegionReplicaUtil.isDefaultReplica(regionInfo)) {
103          LOG.info("Take disabled snapshot of offline region=" + regionInfo);
104          snapshotDisabledRegion(regionInfo);
105        }
106      }
107      // handle the mob files if any.
108      boolean mobEnabled = MobUtils.hasMobColumns(htd);
109      if (mobEnabled) {
110        LOG.info("Taking snapshot for mob files in table " + htd.getTableName());
111        // snapshot the mob files as a offline region.
112        RegionInfo mobRegionInfo = MobUtils.getMobRegionInfo(htd.getTableName());
113        snapshotMobRegion(mobRegionInfo);
114      }
115    } catch (InterruptedException e) {
116      ForeignException ee =
117          new ForeignException("Interrupted while waiting for snapshot to finish", e);
118      monitor.receive(ee);
119      Thread.currentThread().interrupt();
120    } catch (ForeignException e) {
121      monitor.receive(e);
122    }
123  }
124
125  /**
126   * Takes a snapshot of the mob region
127   */
128  private void snapshotMobRegion(final RegionInfo regionInfo)
129      throws IOException {
130    snapshotManifest.addMobRegion(regionInfo);
131    monitor.rethrowException();
132    status.setStatus("Completed referencing HFiles for the mob region of table: " + snapshotTable);
133  }
134
135  @Override
136  protected boolean downgradeToSharedTableLock() {
137    // return true here to change from exclusive lock to shared lock, so we can still assign regions
138    // while taking snapshots. This is important, as region server crash can happen at any time, if
139    // we can not assign regions then the cluster will be in trouble as the regions can not online.
140    return true;
141  }
142}