001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver;
019
020import static org.junit.jupiter.api.Assertions.assertEquals;
021import static org.junit.jupiter.api.Assertions.fail;
022
023import java.io.IOException;
024import java.util.List;
025import java.util.stream.Stream;
026import org.apache.hadoop.conf.Configuration;
027import org.apache.hadoop.hbase.HBaseParameterizedTestTemplate;
028import org.apache.hadoop.hbase.HBaseTestingUtil;
029import org.apache.hadoop.hbase.HConstants;
030import org.apache.hadoop.hbase.TableName;
031import org.apache.hadoop.hbase.Waiter.ExplainingPredicate;
032import org.apache.hadoop.hbase.YouAreDeadException;
033import org.apache.hadoop.hbase.client.Get;
034import org.apache.hadoop.hbase.client.Put;
035import org.apache.hadoop.hbase.client.Table;
036import org.apache.hadoop.hbase.testclassification.LargeTests;
037import org.apache.hadoop.hbase.testclassification.RegionServerTests;
038import org.apache.hadoop.hbase.util.Bytes;
039import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
040import org.apache.hadoop.hbase.wal.AsyncFSWALProvider;
041import org.apache.hadoop.hbase.wal.FSHLogProvider;
042import org.apache.hadoop.hbase.wal.WALFactory;
043import org.apache.hadoop.hbase.wal.WALProvider;
044import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
045import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
046import org.junit.jupiter.api.AfterEach;
047import org.junit.jupiter.api.BeforeEach;
048import org.junit.jupiter.api.Tag;
049import org.junit.jupiter.api.TestTemplate;
050import org.junit.jupiter.params.provider.Arguments;
051import org.slf4j.Logger;
052import org.slf4j.LoggerFactory;
053
054/**
055 * This testcase is used to ensure that the compaction marker will fail a compaction if the RS is
056 * already dead. It can not eliminate FNFE when scanning but it does reduce the possibility a lot.
057 */
058@Tag(RegionServerTests.TAG)
059@Tag(LargeTests.TAG)
060@HBaseParameterizedTestTemplate(name = "{index}: wal={0}")
061public class TestCompactionInDeadRegionServer {
062
063  private static final Logger LOG = LoggerFactory.getLogger(TestCompactionInDeadRegionServer.class);
064
065  private static final HBaseTestingUtil UTIL = new HBaseTestingUtil();
066
067  private static final TableName TABLE_NAME = TableName.valueOf("test");
068
069  private static final byte[] CF = Bytes.toBytes("cf");
070
071  private static final byte[] CQ = Bytes.toBytes("cq");
072
073  public static final class IgnoreYouAreDeadRS extends HRegionServer {
074
075    public IgnoreYouAreDeadRS(Configuration conf) throws IOException, InterruptedException {
076      super(conf);
077    }
078
079    @Override
080    protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
081      throws IOException {
082      try {
083        super.tryRegionServerReport(reportStartTime, reportEndTime);
084      } catch (YouAreDeadException e) {
085        // ignore, do not abort
086      }
087    }
088  }
089
090  private final Class<? extends WALProvider> walProvider;
091
092  public TestCompactionInDeadRegionServer(Class<? extends WALProvider> walProvider) {
093    this.walProvider = walProvider;
094  }
095
096  public static Stream<Arguments> parameters() {
097    return Stream.of(Arguments.of(FSHLogProvider.class), Arguments.of(AsyncFSWALProvider.class));
098  }
099
100  @BeforeEach
101  public void setUp() throws Exception {
102    UTIL.getConfiguration().setClass(WALFactory.WAL_PROVIDER, walProvider, WALProvider.class);
103    UTIL.getConfiguration().setInt(HConstants.ZK_SESSION_TIMEOUT, 2000);
104    UTIL.getConfiguration().setClass(HConstants.REGION_SERVER_IMPL, IgnoreYouAreDeadRS.class,
105      HRegionServer.class);
106    UTIL.startMiniCluster(2);
107    Table table = UTIL.createTable(TABLE_NAME, CF);
108    for (int i = 0; i < 10; i++) {
109      table.put(new Put(Bytes.toBytes(i)).addColumn(CF, CQ, Bytes.toBytes(i)));
110    }
111    UTIL.getAdmin().flush(TABLE_NAME);
112    for (int i = 10; i < 20; i++) {
113      table.put(new Put(Bytes.toBytes(i)).addColumn(CF, CQ, Bytes.toBytes(i)));
114    }
115    UTIL.getAdmin().flush(TABLE_NAME);
116  }
117
118  @AfterEach
119  public void tearDown() throws Exception {
120    UTIL.shutdownMiniCluster();
121  }
122
123  @TestTemplate
124  public void test() throws Exception {
125    HRegionServer regionSvr = UTIL.getRSForFirstRegionInTable(TABLE_NAME);
126    HRegion region = regionSvr.getRegions(TABLE_NAME).get(0);
127    String regName = region.getRegionInfo().getEncodedName();
128    List<HRegion> metaRegs = regionSvr.getRegions(TableName.META_TABLE_NAME);
129    if (metaRegs != null && !metaRegs.isEmpty()) {
130      LOG.info("meta is on the same server: " + regionSvr);
131      // when region is on same server as hbase:meta, reassigning meta would abort the server
132      // since WAL is broken.
133      // so the region is moved to a different server
134      HRegionServer otherRs = UTIL.getOtherRegionServer(regionSvr);
135      UTIL.moveRegionAndWait(region.getRegionInfo(), otherRs.getServerName());
136      LOG.info("Moved region: " + regName + " to " + otherRs.getServerName());
137    }
138    HRegionServer rsToSuspend = UTIL.getRSForFirstRegionInTable(TABLE_NAME);
139    region = rsToSuspend.getRegions(TABLE_NAME).get(0);
140
141    ZKWatcher watcher = UTIL.getZooKeeperWatcher();
142    watcher.getRecoverableZooKeeper().delete(
143      ZNodePaths.joinZNode(watcher.getZNodePaths().rsZNode, rsToSuspend.getServerName().toString()),
144      -1);
145    LOG.info("suspending " + rsToSuspend);
146    UTIL.waitFor(60000, 1000, new ExplainingPredicate<Exception>() {
147
148      @Override
149      public boolean evaluate() throws Exception {
150        for (RegionServerThread thread : UTIL.getHBaseCluster().getRegionServerThreads()) {
151          HRegionServer rs = thread.getRegionServer();
152          if (rs != rsToSuspend) {
153            return !rs.getRegions(TABLE_NAME).isEmpty();
154          }
155        }
156        return false;
157      }
158
159      @Override
160      public String explainFailure() throws Exception {
161        return "The region for " + TABLE_NAME + " is still on " + rsToSuspend.getServerName();
162      }
163    });
164    try {
165      region.compact(true);
166      fail("Should fail as our wal file has already been closed, "
167        + "and walDir has also been renamed");
168    } catch (Exception e) {
169      LOG.debug("expected exception: ", e);
170    }
171    Table table = UTIL.getConnection().getTable(TABLE_NAME);
172    // should not hit FNFE
173    for (int i = 0; i < 20; i++) {
174      assertEquals(i, Bytes.toInt(table.get(new Get(Bytes.toBytes(i))).getValue(CF, CQ)));
175    }
176  }
177}