001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.master.snapshot;
019
020import java.io.IOException;
021import java.util.HashSet;
022import java.util.List;
023import java.util.Set;
024import org.apache.hadoop.hbase.ServerName;
025import org.apache.hadoop.hbase.client.RegionInfo;
026import org.apache.hadoop.hbase.client.RegionReplicaUtil;
027import org.apache.hadoop.hbase.errorhandling.ForeignException;
028import org.apache.hadoop.hbase.master.MasterServices;
029import org.apache.hadoop.hbase.mob.MobUtils;
030import org.apache.hadoop.hbase.procedure.Procedure;
031import org.apache.hadoop.hbase.procedure.ProcedureCoordinator;
032import org.apache.hadoop.hbase.snapshot.HBaseSnapshotException;
033import org.apache.hadoop.hbase.util.Pair;
034import org.apache.yetus.audience.InterfaceAudience;
035import org.slf4j.Logger;
036import org.slf4j.LoggerFactory;
037
038import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
039
040import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription;
041
042/**
043 * Handle the master side of taking a snapshot of an online table, regardless of snapshot type. Uses
044 * a {@link Procedure} to run the snapshot across all the involved region servers.
045 * @see ProcedureCoordinator
046 */
047@InterfaceAudience.Private
048public class EnabledTableSnapshotHandler extends TakeSnapshotHandler {
049
050  private static final Logger LOG = LoggerFactory.getLogger(EnabledTableSnapshotHandler.class);
051  private final ProcedureCoordinator coordinator;
052
053  public EnabledTableSnapshotHandler(SnapshotDescription snapshot, MasterServices master,
054    final SnapshotManager manager) throws IOException {
055    super(snapshot, master, manager);
056    this.coordinator = manager.getCoordinator();
057  }
058
059  @Override
060  public EnabledTableSnapshotHandler prepare() throws Exception {
061    return (EnabledTableSnapshotHandler) super.prepare();
062  }
063
064  // TODO consider switching over to using regionnames, rather than server names. This would allow
065  // regions to migrate during a snapshot, and then be involved when they are ready. Still want to
066  // enforce a snapshot time constraints, but lets us be potentially a bit more robust.
067
068  /**
069   * This method kicks off a snapshot procedure. Other than that it hangs around for various phases
070   * to complete.
071   */
072  @Override
073  protected void snapshotRegions(List<Pair<RegionInfo, ServerName>> regions) throws IOException {
074    Set<String> regionServers = new HashSet<>(regions.size());
075    for (Pair<RegionInfo, ServerName> region : regions) {
076      if (region != null && region.getFirst() != null && region.getSecond() != null) {
077        RegionInfo hri = region.getFirst();
078        if (hri.isOffline() && (hri.isSplit() || hri.isSplitParent())) continue;
079        regionServers.add(region.getSecond().toString());
080      }
081    }
082
083    // start the snapshot on the RS
084    Procedure proc = coordinator.startProcedure(this.monitor, this.snapshot.getName(),
085      this.snapshot.toByteArray(), Lists.newArrayList(regionServers));
086    if (proc == null) {
087      String msg =
088        "Failed to submit distributed procedure for snapshot '" + snapshot.getName() + "'";
089      LOG.error(msg);
090      throw new HBaseSnapshotException(msg);
091    }
092
093    try {
094      // wait for the snapshot to complete. A timer thread is kicked off that should cancel this
095      // if it takes too long.
096      proc.waitForCompleted();
097      LOG.info("Done waiting - online snapshot for " + this.snapshot.getName());
098
099      // Take the offline regions as disabled
100      for (Pair<RegionInfo, ServerName> region : regions) {
101        RegionInfo regionInfo = region.getFirst();
102        if (
103          regionInfo.isOffline() && (regionInfo.isSplit() || regionInfo.isSplitParent())
104            && RegionReplicaUtil.isDefaultReplica(regionInfo)
105        ) {
106          LOG.info("Take disabled snapshot of offline region=" + regionInfo);
107          snapshotDisabledRegion(regionInfo);
108        }
109      }
110      // handle the mob files if any.
111      boolean mobEnabled = MobUtils.hasMobColumns(htd);
112      if (mobEnabled) {
113        LOG.info("Taking snapshot for mob files in table " + htd.getTableName());
114        // snapshot the mob files as a offline region.
115        RegionInfo mobRegionInfo = MobUtils.getMobRegionInfo(htd.getTableName());
116        snapshotMobRegion(mobRegionInfo);
117      }
118    } catch (InterruptedException e) {
119      ForeignException ee =
120        new ForeignException("Interrupted while waiting for snapshot to finish", e);
121      monitor.receive(ee);
122      Thread.currentThread().interrupt();
123    } catch (ForeignException e) {
124      monitor.receive(e);
125    }
126  }
127
128  /**
129   * Takes a snapshot of the mob region
130   */
131  private void snapshotMobRegion(final RegionInfo regionInfo) throws IOException {
132    snapshotManifest.addMobRegion(regionInfo);
133    monitor.rethrowException();
134    status.setStatus("Completed referencing HFiles for the mob region of table: " + snapshotTable);
135  }
136
137  @Override
138  protected boolean downgradeToSharedTableLock() {
139    // return true here to change from exclusive lock to shared lock, so we can still assign regions
140    // while taking snapshots. This is important, as region server crash can happen at any time, if
141    // we can not assign regions then the cluster will be in trouble as the regions can not online.
142    return true;
143  }
144}