001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.master.assignment; 019 020import java.io.IOException; 021import java.util.Optional; 022import org.apache.hadoop.hbase.HConstants; 023import org.apache.hadoop.hbase.ServerName; 024import org.apache.hadoop.hbase.TableName; 025import org.apache.hadoop.hbase.client.RegionInfo; 026import org.apache.hadoop.hbase.exceptions.UnexpectedStateException; 027import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; 028import org.apache.hadoop.hbase.master.procedure.TableProcedureInterface; 029import org.apache.hadoop.hbase.procedure2.FailedRemoteDispatchException; 030import org.apache.hadoop.hbase.procedure2.Procedure; 031import org.apache.hadoop.hbase.procedure2.ProcedureEvent; 032import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; 033import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; 034import org.apache.hadoop.hbase.procedure2.ProcedureUtil; 035import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; 036import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher; 037import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteProcedure; 038import org.apache.hadoop.hbase.procedure2.RemoteProcedureException; 039import org.apache.hadoop.hbase.util.RetryCounter; 040import org.apache.yetus.audience.InterfaceAudience; 041import org.slf4j.Logger; 042import org.slf4j.LoggerFactory; 043import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil; 044import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseState; 045import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.RegionRemoteProcedureBaseStateData; 046import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos; 047import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode; 048 049/** 050 * The base class for the remote procedures used to open/close a region. 051 * <p/> 052 * Notice that here we do not care about the result of the remote call, if the remote call is 053 * finished, either succeeded or not, we will always finish the procedure. The parent procedure 054 * should take care of the result and try to reschedule if the result is not good. 055 */ 056@InterfaceAudience.Private 057public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedureEnv> 058 implements TableProcedureInterface, RemoteProcedure<MasterProcedureEnv, ServerName> { 059 060 private static final Logger LOG = LoggerFactory.getLogger(RegionRemoteProcedureBase.class); 061 062 protected RegionInfo region; 063 064 protected ServerName targetServer; 065 066 private RegionRemoteProcedureBaseState state = 067 RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH; 068 069 private TransitionCode transitionCode; 070 071 private long seqId; 072 073 private RetryCounter retryCounter; 074 075 protected RegionRemoteProcedureBase() { 076 } 077 078 protected RegionRemoteProcedureBase(TransitRegionStateProcedure parent, RegionInfo region, 079 ServerName targetServer) { 080 this.region = region; 081 this.targetServer = targetServer; 082 parent.attachRemoteProc(this); 083 } 084 085 @Override 086 public Optional<RemoteProcedureDispatcher.RemoteOperation> remoteCallBuild(MasterProcedureEnv env, 087 ServerName remote) { 088 // REPORT_SUCCEED means that this remote open/close request already executed in RegionServer. 089 // So return empty operation and RSProcedureDispatcher no need to send it again. 090 if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED) { 091 return Optional.empty(); 092 } 093 return Optional.of(newRemoteOperation()); 094 } 095 096 protected abstract RemoteProcedureDispatcher.RemoteOperation newRemoteOperation(); 097 098 @Override 099 public void remoteOperationCompleted(MasterProcedureEnv env) { 100 // should not be called since we use reportRegionStateTransition to report the result 101 throw new UnsupportedOperationException(); 102 } 103 104 @Override 105 public void remoteOperationFailed(MasterProcedureEnv env, RemoteProcedureException error) { 106 // should not be called since we use reportRegionStateTransition to report the result 107 throw new UnsupportedOperationException(); 108 } 109 110 private RegionStateNode getRegionNode(MasterProcedureEnv env) { 111 return env.getAssignmentManager().getRegionStates().getRegionStateNode(region); 112 } 113 114 @Override 115 public void remoteCallFailed(MasterProcedureEnv env, ServerName remote, IOException exception) { 116 RegionStateNode regionNode = getRegionNode(env); 117 regionNode.lock(); 118 try { 119 if (!env.getMasterServices().getServerManager().isServerOnline(remote)) { 120 // the SCP will interrupt us, give up 121 LOG.debug("{} for region {}, targetServer {} is dead, SCP will interrupt us, give up", this, 122 regionNode, remote); 123 return; 124 } 125 if (state != RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH) { 126 // not sure how can this happen but anyway let's add a check here to avoid waking the wrong 127 // procedure... 128 LOG.warn("{} for region {}, targetServer={} has already been woken up, ignore", this, 129 regionNode, remote); 130 return; 131 } 132 LOG.warn("The remote operation {} for region {} to server {} failed", this, regionNode, 133 remote, exception); 134 // It is OK to not persist the state here, as we do not need to change the region state if the 135 // remote call is failed. If the master crashed before we actually execute the procedure and 136 // persist the new state, it is fine to retry on the same target server again. 137 state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH_FAIL; 138 regionNode.getProcedureEvent().wake(env.getProcedureScheduler()); 139 } finally { 140 regionNode.unlock(); 141 } 142 } 143 144 @Override 145 public TableName getTableName() { 146 return region.getTable(); 147 } 148 149 @Override 150 protected boolean waitInitialized(MasterProcedureEnv env) { 151 if (TableName.isMetaTableName(getTableName())) { 152 return false; 153 } 154 // First we need meta to be loaded, and second, if meta is not online then we will likely to 155 // fail when updating meta so we wait until it is assigned. 156 AssignmentManager am = env.getAssignmentManager(); 157 return am.waitMetaLoaded(this) || am.waitMetaAssigned(this, region); 158 } 159 160 @Override 161 protected void rollback(MasterProcedureEnv env) throws IOException, InterruptedException { 162 throw new UnsupportedOperationException(); 163 } 164 165 @Override 166 protected boolean abort(MasterProcedureEnv env) { 167 return false; 168 } 169 170 // do some checks to see if the report is valid 171 protected abstract void checkTransition(RegionStateNode regionNode, TransitionCode transitionCode, 172 long seqId) throws UnexpectedStateException; 173 174 // change the in memory state of the regionNode, but do not update meta. 175 protected abstract void updateTransitionWithoutPersistingToMeta(MasterProcedureEnv env, 176 RegionStateNode regionNode, TransitionCode transitionCode, long seqId) throws IOException; 177 178 // A bit strange but the procedure store will throw RuntimeException if we can not persist the 179 // state, so upper layer should take care of this... 180 private void persistAndWake(MasterProcedureEnv env, RegionStateNode regionNode) { 181 env.getMasterServices().getMasterProcedureExecutor().getStore().update(this); 182 regionNode.getProcedureEvent().wake(env.getProcedureScheduler()); 183 } 184 185 // should be called with RegionStateNode locked, to avoid race with the execute method below 186 void reportTransition(MasterProcedureEnv env, RegionStateNode regionNode, ServerName serverName, 187 TransitionCode transitionCode, long seqId) throws IOException { 188 if (state != RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH) { 189 // should be a retry 190 return; 191 } 192 if (!targetServer.equals(serverName)) { 193 throw new UnexpectedStateException("Received report from " + serverName + ", expected " + 194 targetServer + ", " + regionNode + ", proc=" + this); 195 } 196 checkTransition(regionNode, transitionCode, seqId); 197 // this state means we have received the report from RS, does not mean the result is fine, as we 198 // may received a FAILED_OPEN. 199 this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED; 200 this.transitionCode = transitionCode; 201 this.seqId = seqId; 202 // Persist the transition code and openSeqNum(if provided). 203 // We should not update the hbase:meta directly as this may cause races when master restarts, 204 // as the old active master may incorrectly report back to RS and cause the new master to hang 205 // on a OpenRegionProcedure forever. See HBASE-22060 and HBASE-22074 for more details. 206 boolean succ = false; 207 try { 208 persistAndWake(env, regionNode); 209 succ = true; 210 } finally { 211 if (!succ) { 212 this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_DISPATCH; 213 this.transitionCode = null; 214 this.seqId = HConstants.NO_SEQNUM; 215 } 216 } 217 try { 218 updateTransitionWithoutPersistingToMeta(env, regionNode, transitionCode, seqId); 219 } catch (IOException e) { 220 throw new AssertionError("should not happen", e); 221 } 222 } 223 224 void serverCrashed(MasterProcedureEnv env, RegionStateNode regionNode, ServerName serverName) { 225 if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_SERVER_CRASH) { 226 // should be a retry 227 return; 228 } 229 RegionRemoteProcedureBaseState oldState = state; 230 // it is possible that the state is in REGION_REMOTE_PROCEDURE_SERVER_CRASH, think of this 231 // sequence 232 // 1. region is open on the target server and the above reportTransition call is succeeded 233 // 2. before we are woken up and update the meta, the target server crashes, and then we arrive 234 // here 235 this.state = RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_SERVER_CRASH; 236 boolean succ = false; 237 try { 238 persistAndWake(env, regionNode); 239 succ = true; 240 } finally { 241 if (!succ) { 242 this.state = oldState; 243 } 244 } 245 } 246 247 protected abstract void restoreSucceedState(AssignmentManager am, RegionStateNode regionNode, 248 long seqId) throws IOException; 249 250 void stateLoaded(AssignmentManager am, RegionStateNode regionNode) { 251 if (state == RegionRemoteProcedureBaseState.REGION_REMOTE_PROCEDURE_REPORT_SUCCEED) { 252 try { 253 restoreSucceedState(am, regionNode, seqId); 254 } catch (IOException e) { 255 // should not happen as we are just restoring the state 256 throw new AssertionError(e); 257 } 258 } 259 } 260 261 private TransitRegionStateProcedure getParent(MasterProcedureEnv env) { 262 return (TransitRegionStateProcedure) env.getMasterServices().getMasterProcedureExecutor() 263 .getProcedure(getParentProcId()); 264 } 265 266 private void unattach(MasterProcedureEnv env) { 267 getParent(env).unattachRemoteProc(this); 268 } 269 270 @Override 271 protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env) 272 throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException { 273 RegionStateNode regionNode = getRegionNode(env); 274 regionNode.lock(); 275 try { 276 switch (state) { 277 case REGION_REMOTE_PROCEDURE_DISPATCH: { 278 // The code which wakes us up also needs to lock the RSN so here we do not need to 279 // synchronize 280 // on the event. 281 ProcedureEvent<?> event = regionNode.getProcedureEvent(); 282 try { 283 env.getRemoteDispatcher().addOperationToNode(targetServer, this); 284 } catch (FailedRemoteDispatchException e) { 285 LOG.warn("Can not add remote operation {} for region {} to server {}, this usually " + 286 "because the server is alread dead, give up and mark the procedure as complete, " + 287 "the parent procedure will take care of this.", this, region, targetServer, e); 288 unattach(env); 289 return null; 290 } 291 event.suspend(); 292 event.suspendIfNotReady(this); 293 throw new ProcedureSuspendedException(); 294 } 295 case REGION_REMOTE_PROCEDURE_REPORT_SUCCEED: 296 env.getAssignmentManager().persistToMeta(regionNode); 297 unattach(env); 298 return null; 299 case REGION_REMOTE_PROCEDURE_DISPATCH_FAIL: 300 // the remote call is failed so we do not need to change the region state, just return. 301 unattach(env); 302 return null; 303 case REGION_REMOTE_PROCEDURE_SERVER_CRASH: 304 env.getAssignmentManager().regionClosedAbnormally(regionNode); 305 unattach(env); 306 return null; 307 default: 308 throw new IllegalStateException("Unknown state: " + state); 309 } 310 } catch (IOException e) { 311 if (retryCounter == null) { 312 retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration()); 313 } 314 long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); 315 LOG.warn("Failed updating meta, suspend {}secs {}; {};", backoff / 1000, this, regionNode, e); 316 setTimeout(Math.toIntExact(backoff)); 317 setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT); 318 skipPersistence(); 319 throw new ProcedureSuspendedException(); 320 } finally { 321 regionNode.unlock(); 322 } 323 } 324 325 @Override 326 protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) { 327 setState(ProcedureProtos.ProcedureState.RUNNABLE); 328 env.getProcedureScheduler().addFront(this); 329 return false; // 'false' means that this procedure handled the timeout 330 } 331 332 @Override 333 public boolean storeInDispatchedQueue() { 334 return false; 335 } 336 337 @Override 338 protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException { 339 RegionRemoteProcedureBaseStateData.Builder builder = 340 RegionRemoteProcedureBaseStateData.newBuilder().setRegion(ProtobufUtil.toRegionInfo(region)) 341 .setTargetServer(ProtobufUtil.toServerName(targetServer)).setState(state); 342 if (transitionCode != null) { 343 builder.setTransitionCode(transitionCode); 344 builder.setSeqId(seqId); 345 } 346 serializer.serialize(builder.build()); 347 } 348 349 @Override 350 protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException { 351 RegionRemoteProcedureBaseStateData data = 352 serializer.deserialize(RegionRemoteProcedureBaseStateData.class); 353 region = ProtobufUtil.toRegionInfo(data.getRegion()); 354 targetServer = ProtobufUtil.toServerName(data.getTargetServer()); 355 state = data.getState(); 356 if (data.hasTransitionCode()) { 357 transitionCode = data.getTransitionCode(); 358 seqId = data.getSeqId(); 359 } 360 } 361 362 @Override 363 protected void afterReplay(MasterProcedureEnv env) { 364 getParent(env).attachRemoteProc(this); 365 } 366 367 @Override public String getProcName() { 368 return getClass().getSimpleName() + " " + region.getEncodedName(); 369 } 370 371 @Override protected void toStringClassDetails(StringBuilder builder) { 372 builder.append(getProcName()); 373 if (targetServer != null) { 374 builder.append(", server="); 375 builder.append(this.targetServer); 376 } 377 if (this.retryCounter != null) { 378 builder.append(", retry="); 379 builder.append(this.retryCounter); 380 } 381 } 382}