001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.snapshot;
019
020import static org.junit.jupiter.api.Assertions.assertEquals;
021import static org.junit.jupiter.api.Assertions.fail;
022
023import java.io.IOException;
024import java.util.Collections;
025import java.util.Comparator;
026import java.util.HashMap;
027import java.util.List;
028import java.util.Map;
029import java.util.concurrent.TimeUnit;
030import java.util.concurrent.TimeoutException;
031import org.apache.hadoop.conf.Configuration;
032import org.apache.hadoop.fs.FileSystem;
033import org.apache.hadoop.fs.Path;
034import org.apache.hadoop.hbase.HBaseTestingUtil;
035import org.apache.hadoop.hbase.HConstants;
036import org.apache.hadoop.hbase.TableName;
037import org.apache.hadoop.hbase.TableNotFoundException;
038import org.apache.hadoop.hbase.client.Admin;
039import org.apache.hadoop.hbase.client.RegionInfo;
040import org.apache.hadoop.hbase.client.SnapshotDescription;
041import org.apache.hadoop.hbase.client.SnapshotType;
042import org.apache.hadoop.hbase.client.Table;
043import org.apache.hadoop.hbase.master.HMaster;
044import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
045import org.apache.hadoop.hbase.regionserver.ConstantSizeRegionSplitPolicy;
046import org.apache.hadoop.hbase.testclassification.LargeTests;
047import org.apache.hadoop.hbase.testclassification.RegionServerTests;
048import org.apache.hadoop.hbase.util.Bytes;
049import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
050import org.junit.jupiter.api.AfterAll;
051import org.junit.jupiter.api.AfterEach;
052import org.junit.jupiter.api.BeforeAll;
053import org.junit.jupiter.api.BeforeEach;
054import org.junit.jupiter.api.Tag;
055import org.junit.jupiter.api.Test;
056import org.junit.jupiter.api.TestInfo;
057import org.slf4j.Logger;
058import org.slf4j.LoggerFactory;
059
060import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
061
062import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
063import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.IsSnapshotDoneRequest;
064import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos.IsSnapshotDoneResponse;
065import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos;
066
/**
 * Test creating/using/deleting snapshots from the client.
 * <p>
 * This is an end-to-end test for the snapshot utility.
 * <p>
 * TODO: This is essentially a clone of TestSnapshotFromClient. It is worth refactoring because
 * there will be a few more flavors of snapshots that need to run these tests.
 */
074@Tag(RegionServerTests.TAG)
075@Tag(LargeTests.TAG)
076public class TestFlushSnapshotFromClient {
077
  private static final Logger LOG = LoggerFactory.getLogger(TestFlushSnapshotFromClient.class);

  /** Shared testing utility; its mini cluster is started in {@code setupCluster}. */
  protected static final HBaseTestingUtil UTIL = new HBaseTestingUtil();
  /** Value passed to {@code UTIL.startMiniCluster} in {@code setupCluster}. */
  protected static final int NUM_RS = 2;
  /** Column family used when creating and loading the test table. */
  protected static final byte[] TEST_FAM = Bytes.toBytes("fam");
  /** Table that is created before and deleted after every test method. */
  protected static final TableName TABLE_NAME = TableName.valueOf("test");
  /** Row count the tests pass to {@code SnapshotTestingUtils.loadData}. */
  protected final int DEFAULT_NUM_ROWS = 100;
  /** Admin handle assigned in {@code setup()} and closed in {@code tearDown()}. */
  protected Admin admin = null;
086
087  @BeforeAll
088  public static void setupCluster(TestInfo testInfo) throws Exception {
089    if (testInfo.getTestClass().orElse(null) != TestFlushSnapshotFromClient.class) {
090      return;
091    }
092    setupConf(UTIL.getConfiguration());
093    UTIL.startMiniCluster(NUM_RS);
094  }
095
096  protected static void setupConf(Configuration conf) {
097    // disable the ui
098    conf.setInt("hbase.regionsever.info.port", -1);
099    // change the flush size to a small amount, regulating number of store files
100    conf.setInt("hbase.hregion.memstore.flush.size", 25000);
101    // so make sure we get a compaction when doing a load, but keep around some
102    // files in the store
103    conf.setInt("hbase.hstore.compaction.min", 10);
104    conf.setInt("hbase.hstore.compactionThreshold", 10);
105    // block writes if we get to 12 store files
106    conf.setInt("hbase.hstore.blockingStoreFiles", 12);
107    // Enable snapshot
108    conf.setBoolean(SnapshotManager.HBASE_SNAPSHOT_ENABLED, true);
109    conf.set(HConstants.HBASE_REGION_SPLIT_POLICY_KEY,
110      ConstantSizeRegionSplitPolicy.class.getName());
111  }
112
  /**
   * Per-test fixture: creates the test table through {@link #createTable()} and then obtains the
   * {@link Admin} used by the test from the shared connection.
   */
  @BeforeEach
  public void setup() throws Exception {
    createTable();
    this.admin = UTIL.getConnection().getAdmin();
  }
118
  /**
   * Creates {@link #TABLE_NAME} with the family {@link #TEST_FAM} by delegating to
   * {@link SnapshotTestingUtils}. The method is {@code protected}, presumably so that subclasses
   * can create the table differently -- confirm against the subclasses.
   */
  protected void createTable() throws Exception {
    SnapshotTestingUtils.createTable(UTIL, TABLE_NAME, TEST_FAM);
  }
122
123  @AfterEach
124  public void tearDown() throws Exception {
125    UTIL.deleteTable(TABLE_NAME);
126    SnapshotTestingUtils.deleteAllSnapshots(this.admin);
127    this.admin.close();
128    SnapshotTestingUtils.deleteArchiveDirectory(UTIL);
129  }
130
131  @AfterAll
132  public static void cleanupTest() throws Exception {
133    try {
134      UTIL.shutdownMiniCluster();
135    } catch (Exception e) {
136      LOG.warn("failure shutting down cluster", e);
137    }
138  }
139
140  /**
141   * Test simple flush snapshotting a table that is online
142   */
143  @Test
144  public void testFlushTableSnapshot() throws Exception {
145    // make sure we don't fail on listing snapshots
146    SnapshotTestingUtils.assertNoSnapshots(admin);
147
148    // put some stuff in the table
149    SnapshotTestingUtils.loadData(UTIL, TABLE_NAME, DEFAULT_NUM_ROWS, TEST_FAM);
150
151    LOG.debug("FS state before snapshot:");
152    UTIL.getHBaseCluster().getMaster().getMasterFileSystem().logFileSystemState(LOG);
153
154    // take a snapshot of the enabled table
155    String snapshotString = "offlineTableSnapshot";
156    byte[] snapshot = Bytes.toBytes(snapshotString);
157    admin.snapshot(snapshotString, TABLE_NAME, SnapshotType.FLUSH);
158    LOG.debug("Snapshot completed.");
159
160    // make sure we have the snapshot
161    List<SnapshotDescription> snapshots =
162      SnapshotTestingUtils.assertOneSnapshotThatMatches(admin, snapshot, TABLE_NAME);
163
164    // make sure its a valid snapshot
165    LOG.debug("FS state after snapshot:");
166    UTIL.getHBaseCluster().getMaster().getMasterFileSystem().logFileSystemState(LOG);
167
168    SnapshotTestingUtils.confirmSnapshotValid(UTIL,
169      ProtobufUtil.createHBaseProtosSnapshotDesc(snapshots.get(0)), TABLE_NAME, TEST_FAM);
170  }
171
172  /**
173   * Test snapshotting a table that is online without flushing
174   */
175  @Test
176  public void testSkipFlushTableSnapshot() throws Exception {
177    // make sure we don't fail on listing snapshots
178    SnapshotTestingUtils.assertNoSnapshots(admin);
179
180    // put some stuff in the table
181    Table table = UTIL.getConnection().getTable(TABLE_NAME);
182    UTIL.loadTable(table, TEST_FAM);
183    UTIL.flush(TABLE_NAME);
184
185    LOG.debug("FS state before snapshot:");
186    UTIL.getHBaseCluster().getMaster().getMasterFileSystem().logFileSystemState(LOG);
187
188    // take a snapshot of the enabled table
189    String snapshotString = "skipFlushTableSnapshot";
190    String snapshot = snapshotString;
191    admin.snapshot(snapshotString, TABLE_NAME, SnapshotType.SKIPFLUSH);
192    LOG.debug("Snapshot completed.");
193
194    // make sure we have the snapshot
195    List<SnapshotDescription> snapshots =
196      SnapshotTestingUtils.assertOneSnapshotThatMatches(admin, snapshot, TABLE_NAME);
197
198    // make sure its a valid snapshot
199    LOG.debug("FS state after snapshot:");
200    UTIL.getHBaseCluster().getMaster().getMasterFileSystem().logFileSystemState(LOG);
201
202    SnapshotTestingUtils.confirmSnapshotValid(UTIL,
203      ProtobufUtil.createHBaseProtosSnapshotDesc(snapshots.get(0)), TABLE_NAME, TEST_FAM);
204
205    admin.deleteSnapshot(snapshot);
206    snapshots = admin.listSnapshots();
207    SnapshotTestingUtils.assertNoSnapshots(admin);
208  }
209
210  /**
211   * Test simple flush snapshotting a table that is online
212   */
213  @Test
214  public void testFlushTableSnapshotWithProcedure() throws Exception {
215    // make sure we don't fail on listing snapshots
216    SnapshotTestingUtils.assertNoSnapshots(admin);
217
218    // put some stuff in the table
219    SnapshotTestingUtils.loadData(UTIL, TABLE_NAME, DEFAULT_NUM_ROWS, TEST_FAM);
220
221    LOG.debug("FS state before snapshot:");
222    UTIL.getHBaseCluster().getMaster().getMasterFileSystem().logFileSystemState(LOG);
223
224    // take a snapshot of the enabled table
225    String snapshotString = "offlineTableSnapshot";
226    byte[] snapshot = Bytes.toBytes(snapshotString);
227    Map<String, String> props = new HashMap<>();
228    props.put("table", TABLE_NAME.getNameAsString());
229    admin.execProcedure(SnapshotManager.ONLINE_SNAPSHOT_CONTROLLER_DESCRIPTION, snapshotString,
230      props);
231
232    LOG.debug("Snapshot completed.");
233
234    // make sure we have the snapshot
235    List<SnapshotDescription> snapshots =
236      SnapshotTestingUtils.assertOneSnapshotThatMatches(admin, snapshot, TABLE_NAME);
237
238    // make sure its a valid snapshot
239    LOG.debug("FS state after snapshot:");
240    UTIL.getHBaseCluster().getMaster().getMasterFileSystem().logFileSystemState(LOG);
241
242    SnapshotTestingUtils.confirmSnapshotValid(UTIL,
243      ProtobufUtil.createHBaseProtosSnapshotDesc(snapshots.get(0)), TABLE_NAME, TEST_FAM);
244  }
245
246  @Test
247  public void testSnapshotFailsOnNonExistantTable() throws Exception {
248    // make sure we don't fail on listing snapshots
249    SnapshotTestingUtils.assertNoSnapshots(admin);
250    TableName tableName = TableName.valueOf("_not_a_table");
251
252    // make sure the table doesn't exist
253    boolean fail = false;
254    do {
255      try {
256        admin.getDescriptor(tableName);
257        fail = true;
258        LOG.error("Table:" + tableName + " already exists, checking a new name");
259        tableName = TableName.valueOf(tableName + "!");
260      } catch (TableNotFoundException e) {
261        fail = false;
262      }
263    } while (fail);
264
265    // snapshot the non-existant table
266    try {
267      admin.snapshot("fail", tableName, SnapshotType.FLUSH);
268      fail("Snapshot succeeded even though there is not table.");
269    } catch (SnapshotCreationException e) {
270      LOG.info("Correctly failed to snapshot a non-existant table:" + e.getMessage());
271    }
272  }
273
274  /**
275   * Helper method for testing async snapshot operations. Just waits for the given snapshot to
276   * complete on the server by repeatedly checking the master.
277   * @param master       the master running the snapshot
278   * @param snapshot     the snapshot to check
279   * @param timeoutNanos the timeout in nano between checks to see if the snapshot is done
280   */
281  private static void waitForSnapshotToComplete(HMaster master,
282    SnapshotProtos.SnapshotDescription snapshot, long timeoutNanos) throws Exception {
283    final IsSnapshotDoneRequest request =
284      IsSnapshotDoneRequest.newBuilder().setSnapshot(snapshot).build();
285    long start = System.nanoTime();
286    while (System.nanoTime() - start < timeoutNanos) {
287      try {
288        IsSnapshotDoneResponse done = master.getMasterRpcServices().isSnapshotDone(null, request);
289        if (done.getDone()) {
290          return;
291        }
292      } catch (ServiceException e) {
293        // ignore UnknownSnapshotException, this is possible as for AsyncAdmin, the method will
294        // return immediately after sending out the request, no matter whether the master has
295        // processed the request or not.
296        if (!(e.getCause() instanceof UnknownSnapshotException)) {
297          throw e;
298        }
299      }
300
301      Thread.sleep(200);
302    }
303    throw new TimeoutException("Timeout waiting for snapshot " + snapshot + " to complete");
304  }
305
306  @Test
307  public void testAsyncFlushSnapshot() throws Exception {
308    SnapshotProtos.SnapshotDescription snapshot = SnapshotProtos.SnapshotDescription.newBuilder()
309      .setName("asyncSnapshot").setTable(TABLE_NAME.getNameAsString())
310      .setType(SnapshotProtos.SnapshotDescription.Type.FLUSH).build();
311
312    // take the snapshot async
313    admin.snapshotAsync(new SnapshotDescription("asyncSnapshot", TABLE_NAME, SnapshotType.FLUSH));
314
315    // constantly loop, looking for the snapshot to complete
316    HMaster master = UTIL.getMiniHBaseCluster().getMaster();
317    waitForSnapshotToComplete(master, snapshot, TimeUnit.MINUTES.toNanos(1));
318    LOG.info(" === Async Snapshot Completed ===");
319    UTIL.getHBaseCluster().getMaster().getMasterFileSystem().logFileSystemState(LOG);
320
321    // make sure we get the snapshot
322    SnapshotTestingUtils.assertOneSnapshotThatMatches(admin, snapshot);
323  }
324
325  @Test
326  public void testSnapshotStateAfterMerge() throws Exception {
327    int numRows = DEFAULT_NUM_ROWS;
328    // make sure we don't fail on listing snapshots
329    SnapshotTestingUtils.assertNoSnapshots(admin);
330    // load the table so we have some data
331    SnapshotTestingUtils.loadData(UTIL, TABLE_NAME, numRows, TEST_FAM);
332
333    // Take a snapshot
334    String snapshotBeforeMergeName = "snapshotBeforeMerge";
335    admin.snapshot(snapshotBeforeMergeName, TABLE_NAME, SnapshotType.FLUSH);
336
337    // Clone the table
338    TableName cloneBeforeMergeName = TableName.valueOf("cloneBeforeMerge");
339    admin.cloneSnapshot(snapshotBeforeMergeName, cloneBeforeMergeName);
340    SnapshotTestingUtils.waitForTableToBeOnline(UTIL, cloneBeforeMergeName);
341
342    // Merge two regions
343    List<RegionInfo> regions = admin.getRegions(TABLE_NAME);
344    Collections.sort(regions, new Comparator<RegionInfo>() {
345      @Override
346      public int compare(RegionInfo r1, RegionInfo r2) {
347        return Bytes.compareTo(r1.getStartKey(), r2.getStartKey());
348      }
349    });
350
351    int numRegions = admin.getRegions(TABLE_NAME).size();
352    int numRegionsAfterMerge = numRegions - 2;
353    admin.mergeRegionsAsync(regions.get(1).getEncodedNameAsBytes(),
354      regions.get(2).getEncodedNameAsBytes(), true);
355    admin.mergeRegionsAsync(regions.get(4).getEncodedNameAsBytes(),
356      regions.get(5).getEncodedNameAsBytes(), true);
357
358    // Verify that there's one region less
359    waitRegionsAfterMerge(numRegionsAfterMerge);
360    assertEquals(numRegionsAfterMerge, admin.getRegions(TABLE_NAME).size());
361
362    // Clone the table
363    TableName cloneAfterMergeName = TableName.valueOf("cloneAfterMerge");
364    admin.cloneSnapshot(snapshotBeforeMergeName, cloneAfterMergeName);
365    SnapshotTestingUtils.waitForTableToBeOnline(UTIL, cloneAfterMergeName);
366
367    verifyRowCount(UTIL, TABLE_NAME, numRows);
368    verifyRowCount(UTIL, cloneBeforeMergeName, numRows);
369    verifyRowCount(UTIL, cloneAfterMergeName, numRows);
370
371    // test that we can delete the snapshot
372    UTIL.deleteTable(cloneAfterMergeName);
373    UTIL.deleteTable(cloneBeforeMergeName);
374  }
375
376  @Test
377  public void testTakeSnapshotAfterMerge() throws Exception {
378    int numRows = DEFAULT_NUM_ROWS;
379    // make sure we don't fail on listing snapshots
380    SnapshotTestingUtils.assertNoSnapshots(admin);
381    // load the table so we have some data
382    SnapshotTestingUtils.loadData(UTIL, TABLE_NAME, numRows, TEST_FAM);
383
384    // Merge two regions
385    List<RegionInfo> regions = admin.getRegions(TABLE_NAME);
386    Collections.sort(regions, new Comparator<RegionInfo>() {
387      @Override
388      public int compare(RegionInfo r1, RegionInfo r2) {
389        return Bytes.compareTo(r1.getStartKey(), r2.getStartKey());
390      }
391    });
392
393    int numRegions = admin.getRegions(TABLE_NAME).size();
394    int numRegionsAfterMerge = numRegions - 2;
395    admin.mergeRegionsAsync(regions.get(1).getEncodedNameAsBytes(),
396      regions.get(2).getEncodedNameAsBytes(), true);
397    admin.mergeRegionsAsync(regions.get(4).getEncodedNameAsBytes(),
398      regions.get(5).getEncodedNameAsBytes(), true);
399
400    waitRegionsAfterMerge(numRegionsAfterMerge);
401    assertEquals(numRegionsAfterMerge, admin.getRegions(TABLE_NAME).size());
402
403    // Take a snapshot
404    String snapshotName = "snapshotAfterMerge";
405    SnapshotTestingUtils.snapshot(admin, snapshotName, TABLE_NAME, SnapshotType.FLUSH, 3);
406
407    // Clone the table
408    TableName cloneName = TableName.valueOf("cloneMerge");
409    admin.cloneSnapshot(snapshotName, cloneName);
410    SnapshotTestingUtils.waitForTableToBeOnline(UTIL, cloneName);
411
412    verifyRowCount(UTIL, TABLE_NAME, numRows);
413    verifyRowCount(UTIL, cloneName, numRows);
414
415    // test that we can delete the snapshot
416    UTIL.deleteTable(cloneName);
417  }
418
419  /**
420   * Basic end-to-end test of simple-flush-based snapshots
421   */
422  @Test
423  public void testFlushCreateListDestroy() throws Exception {
424    LOG.debug("------- Starting Snapshot test -------------");
425    // make sure we don't fail on listing snapshots
426    SnapshotTestingUtils.assertNoSnapshots(admin);
427    // load the table so we have some data
428    SnapshotTestingUtils.loadData(UTIL, TABLE_NAME, DEFAULT_NUM_ROWS, TEST_FAM);
429
430    String snapshotName = "flushSnapshotCreateListDestroy";
431    FileSystem fs = UTIL.getHBaseCluster().getMaster().getMasterFileSystem().getFileSystem();
432    Path rootDir = UTIL.getHBaseCluster().getMaster().getMasterFileSystem().getRootDir();
433    SnapshotTestingUtils.createSnapshotAndValidate(admin, TABLE_NAME, Bytes.toString(TEST_FAM),
434      snapshotName, rootDir, fs, true);
435  }
436
437  private void waitRegionsAfterMerge(final long numRegionsAfterMerge)
438    throws IOException, InterruptedException {
439    // Verify that there's one region less
440    long startTime = EnvironmentEdgeManager.currentTime();
441    while (admin.getRegions(TABLE_NAME).size() != numRegionsAfterMerge) {
442      // This may be flaky... if after 15sec the merge is not complete give up
443      // it will fail in the assertEquals(numRegionsAfterMerge).
444      if ((EnvironmentEdgeManager.currentTime() - startTime) > 15000) {
445        break;
446      }
447      Thread.sleep(100);
448    }
449    SnapshotTestingUtils.waitForTableToBeOnline(UTIL, TABLE_NAME);
450  }
451
  /**
   * Delegates to {@code SnapshotTestingUtils.verifyRowCount} with the arguments unchanged. The
   * method is {@code protected}, presumably so that subclasses can verify differently -- confirm
   * against the subclasses.
   * @param util         the testing utility forwarded to the helper
   * @param tableName    the table whose rows are checked
   * @param expectedRows the expected number of rows
   */
  protected void verifyRowCount(final HBaseTestingUtil util, final TableName tableName,
    long expectedRows) throws IOException {
    SnapshotTestingUtils.verifyRowCount(util, tableName, expectedRows);
  }
456
  /**
   * Delegates to {@code UTIL.countRows} with the given table and families.
   * @param table    the table to count rows in
   * @param families the column families forwarded to the utility
   * @return the value returned by {@code UTIL.countRows}
   */
  protected int countRows(final Table table, final byte[]... families) throws IOException {
    return UTIL.countRows(table, families);
  }
460}