001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.snapshot;
019
020import java.io.IOException;
021import java.util.HashSet;
022import java.util.List;
023import java.util.Set;
024
025import org.apache.hadoop.hbase.ServerName;
026import org.apache.hadoop.hbase.client.RegionInfo;
027import org.apache.hadoop.hbase.client.RegionReplicaUtil;
028import org.apache.hadoop.hbase.errorhandling.ForeignException;
029import org.apache.hadoop.hbase.master.MasterServices;
030import org.apache.hadoop.hbase.mob.MobUtils;
031import org.apache.hadoop.hbase.procedure.Procedure;
032import org.apache.hadoop.hbase.procedure.ProcedureCoordinator;
033import org.apache.hadoop.hbase.snapshot.HBaseSnapshotException;
034import org.apache.hadoop.hbase.util.Pair;
035import org.apache.yetus.audience.InterfaceAudience;
036import org.slf4j.Logger;
037import org.slf4j.LoggerFactory;
038import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
039import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription;
040
041/**
042 * Handle the master side of taking a snapshot of an online table, regardless of snapshot type.
043 * Uses a {@link Procedure} to run the snapshot across all the involved region servers.
044 * @see ProcedureCoordinator
045 */
046@InterfaceAudience.Private
047public class EnabledTableSnapshotHandler extends TakeSnapshotHandler {
048
049  private static final Logger LOG = LoggerFactory.getLogger(EnabledTableSnapshotHandler.class);
050  private final ProcedureCoordinator coordinator;
051
052  public EnabledTableSnapshotHandler(SnapshotDescription snapshot, MasterServices master,
053      final SnapshotManager manager) {
054    super(snapshot, master, manager);
055    this.coordinator = manager.getCoordinator();
056  }
057
058  @Override
059  public EnabledTableSnapshotHandler prepare() throws Exception {
060    return (EnabledTableSnapshotHandler) super.prepare();
061  }
062
063  // TODO consider switching over to using regionnames, rather than server names. This would allow
064  // regions to migrate during a snapshot, and then be involved when they are ready. Still want to
065  // enforce a snapshot time constraints, but lets us be potentially a bit more robust.
066
067  /**
068   * This method kicks off a snapshot procedure.  Other than that it hangs around for various
069   * phases to complete.
070   */
071  @Override
072  protected void snapshotRegions(List<Pair<RegionInfo, ServerName>> regions)
073      throws HBaseSnapshotException, IOException {
074    Set<String> regionServers = new HashSet<>(regions.size());
075    for (Pair<RegionInfo, ServerName> region : regions) {
076      if (region != null && region.getFirst() != null && region.getSecond() != null) {
077        RegionInfo hri = region.getFirst();
078        if (hri.isOffline() && (hri.isSplit() || hri.isSplitParent())) continue;
079        regionServers.add(region.getSecond().toString());
080      }
081    }
082
083    // start the snapshot on the RS
084    Procedure proc = coordinator.startProcedure(this.monitor, this.snapshot.getName(),
085      this.snapshot.toByteArray(), Lists.newArrayList(regionServers));
086    if (proc == null) {
087      String msg = "Failed to submit distributed procedure for snapshot '"
088          + snapshot.getName() + "'";
089      LOG.error(msg);
090      throw new HBaseSnapshotException(msg);
091    }
092
093    try {
094      // wait for the snapshot to complete.  A timer thread is kicked off that should cancel this
095      // if it takes too long.
096      proc.waitForCompleted();
097      LOG.info("Done waiting - online snapshot for " + this.snapshot.getName());
098
099      // Take the offline regions as disabled
100      for (Pair<RegionInfo, ServerName> region : regions) {
101        RegionInfo regionInfo = region.getFirst();
102        if (regionInfo.isOffline() && (regionInfo.isSplit() || regionInfo.isSplitParent()) &&
103            RegionReplicaUtil.isDefaultReplica(regionInfo)) {
104          LOG.info("Take disabled snapshot of offline region=" + regionInfo);
105          snapshotDisabledRegion(regionInfo);
106        }
107      }
108      // handle the mob files if any.
109      boolean mobEnabled = MobUtils.hasMobColumns(htd);
110      if (mobEnabled) {
111        LOG.info("Taking snapshot for mob files in table " + htd.getTableName());
112        // snapshot the mob files as a offline region.
113        RegionInfo mobRegionInfo = MobUtils.getMobRegionInfo(htd.getTableName());
114        snapshotMobRegion(mobRegionInfo);
115      }
116    } catch (InterruptedException e) {
117      ForeignException ee =
118          new ForeignException("Interrupted while waiting for snapshot to finish", e);
119      monitor.receive(ee);
120      Thread.currentThread().interrupt();
121    } catch (ForeignException e) {
122      monitor.receive(e);
123    }
124  }
125
126  /**
127   * Takes a snapshot of the mob region
128   */
129  private void snapshotMobRegion(final RegionInfo regionInfo)
130      throws IOException {
131    snapshotManifest.addMobRegion(regionInfo);
132    monitor.rethrowException();
133    status.setStatus("Completed referencing HFiles for the mob region of table: " + snapshotTable);
134  }
135
136  @Override
137  protected boolean downgradeToSharedTableLock() {
138    // return true here to change from exclusive lock to shared lock, so we can still assign regions
139    // while taking snapshots. This is important, as region server crash can happen at any time, if
140    // we can not assign regions then the cluster will be in trouble as the regions can not online.
141    return true;
142  }
143}