/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master.replication;

import static org.apache.hadoop.hbase.replication.ReplicationUtils.getPeerRemoteWALDir;
import static org.apache.hadoop.hbase.replication.ReplicationUtils.getPeerReplayWALDir;
import static org.apache.hadoop.hbase.replication.ReplicationUtils.getPeerSnapshotWALDir;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.ServerListener;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
import org.apache.hadoop.hbase.replication.ReplicationException;
import org.apache.hadoop.hbase.replication.ReplicationUtils;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
/**
 * The manager for replaying remote wal.
 * <p/>
 * First, it is used to balance the replay work across all the region servers. We record the
 * region servers which are already being used for replaying wals, and avoid sending new replay
 * work to them until the previous replay work is done, at which point we remove the region server
 * from the used worker set. See the comment for {@code UsedReplayWorkersForPeer} for more details.
 * <p/>
 * Second, the logic for managing the remote wal directory is kept here. Before replaying the wals,
 * we rename the remote wal directory; the new name is called the 'replay' directory, see
 * {@link #renameToPeerReplayWALDir(String)}. This is used to prevent further writing of remote
 * wals, which is very important for keeping consistency. Then we start replaying all the wals;
 * once a wal has been replayed, we truncate the file, so that if a crash happens we do not need
 * to replay all the wals again, see {@link #finishReplayWAL(String)} and
 * {@link #isReplayWALFinished(String)}. After replaying all the wals, we rename the 'replay'
 * directory again; the new name is called the 'snapshot' directory. In this directory we keep the
 * names of all the wals that have been replayed, since all the files should have been truncated.
 * When we later transit the original ACTIVE cluster to STANDBY and there are region server
 * crashes, we check the wals in this directory to determine whether a wal should be split and
 * replayed or not. You can see the code in
 * {@link org.apache.hadoop.hbase.regionserver.SplitLogWorker} for more details.
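 * <p/>
 * A rough sketch of the replay lifecycle (the actual driver is {@link RecoverStandbyProcedure};
 * the {@code manager} variable is illustrative, and error handling and procedure scheduling are
 * omitted):
 *
 * <pre>
 * manager.renameToPeerReplayWALDir(peerId); // seal the remote wal dir as 'replay'
 * for (Path wal : manager.getReplayWALsAndCleanUpUnusedFiles(peerId)) {
 *   String walName = manager.removeWALRootPath(wal);
 *   if (!manager.isReplayWALFinished(walName)) {
 *     // dispatch the wal to a region server for replaying...
 *     manager.finishReplayWAL(walName); // truncate to record completion
 *   }
 * }
 * manager.renameToPeerSnapshotWALDir(peerId); // keep the (empty) wal names as 'snapshot'
 * </pre>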
 */
@InterfaceAudience.Private
public class SyncReplicationReplayWALManager {

  private static final Logger LOG = LoggerFactory.getLogger(SyncReplicationReplayWALManager.class);

  private final ServerManager serverManager;

  private final FileSystem fs;

  private final Path walRootDir;

  private final Path remoteWALDir;

  /**
   * This class is used to record the used workers (region servers) for a replication peer. For
   * balancing the replaying remote wal job, we only schedule one remote replay procedure on a
   * region server at a time. So when acquiring a worker, we first get all the region servers for
   * this cluster, and then filter out the used ones.
   * <p/>
   * The {@link ProcedureEvent} is used for notifying procedures that there are available workers
   * now. We used sleeping and retrying before, but it does not work well: if the retry interval
   * is too large, for example with exponential backoff, waking up is not timely, and if the
   * interval is too small, we flood the procedure wal.
   * <p/>
   * The states are only kept in memory, so when restarting, we need to reconstruct this
   * information using the information stored in the related procedures. See the
   * {@code afterReplay} method in {@link RecoverStandbyProcedure} and
   * {@link SyncReplicationReplayWALProcedure} for more details.
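   * <p/>
   * A condensed view of how {@link SyncReplicationReplayWALManager#acquirePeerWorker(String,
   * Procedure)} below drives this class, shown only to illustrate the suspend/wake protocol:
   *
   * <pre>
   * synchronized (usedWorkers) {
   *   Optional&lt;ServerName&gt; worker = usedWorkers.acquire(serverManager);
   *   if (worker.isPresent()) {
   *     return worker.get();
   *   }
   *   usedWorkers.suspend(proc); // wait until release() or a new server triggers wake()
   * }
   * throw new ProcedureSuspendedException();
   * </pre>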
   */
  private static final class UsedReplayWorkersForPeer {

    private final Set<ServerName> usedWorkers = new HashSet<>();

    private final ProcedureEvent<?> event;

    public UsedReplayWorkersForPeer(String peerId) {
      this.event = new ProcedureEvent<>(peerId);
    }

    public void used(ServerName worker) {
      usedWorkers.add(worker);
    }

    public Optional<ServerName> acquire(ServerManager serverManager) {
      Optional<ServerName> worker = serverManager.getOnlineServers().keySet().stream()
        .filter(server -> !usedWorkers.contains(server)).findAny();
      worker.ifPresent(usedWorkers::add);
      return worker;
    }

    public void release(ServerName worker) {
      usedWorkers.remove(worker);
    }

    public void suspend(Procedure<?> proc) {
      event.suspend();
      event.suspendIfNotReady(proc);
    }

    public void wake(MasterProcedureScheduler scheduler) {
      if (!event.isReady()) {
        event.wake(scheduler);
      }
    }
  }

  private final ConcurrentMap<String, UsedReplayWorkersForPeer> usedWorkersByPeer =
    new ConcurrentHashMap<>();

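  /**
   * Besides wiring up the file system and the remote wal directory layout, the constructor
   * registers a {@link ServerListener} so that procedures suspended because no worker was free
   * are woken up as soon as a new region server joins the cluster.
   */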
  public SyncReplicationReplayWALManager(MasterServices services)
      throws IOException, ReplicationException {
    this.serverManager = services.getServerManager();
    this.fs = services.getMasterFileSystem().getWALFileSystem();
    this.walRootDir = services.getMasterFileSystem().getWALRootDir();
    this.remoteWALDir = new Path(this.walRootDir, ReplicationUtils.REMOTE_WAL_DIR_NAME);
    serverManager.registerListener(new ServerListener() {

      @Override
      public void serverAdded(ServerName serverName) {
        MasterProcedureScheduler scheduler =
          services.getMasterProcedureExecutor().getEnvironment().getProcedureScheduler();
        for (UsedReplayWorkersForPeer usedWorkers : usedWorkersByPeer.values()) {
          synchronized (usedWorkers) {
            usedWorkers.wake(scheduler);
          }
        }
      }
    });
  }

  public void registerPeer(String peerId) {
    usedWorkersByPeer.put(peerId, new UsedReplayWorkersForPeer(peerId));
  }

  public void unregisterPeer(String peerId) {
    usedWorkersByPeer.remove(peerId);
  }

  /**
   * Get a worker for replaying remote wal for a given peer. If no worker is available, i.e., all
   * the region servers have been used by others, a {@link ProcedureSuspendedException} will be
   * thrown to suspend the procedure. The procedure will be woken up later when workers become
   * available, either because another procedure releases a worker, or because a new region server
   * joins the cluster.
   */
  public ServerName acquirePeerWorker(String peerId, Procedure<?> proc)
      throws ProcedureSuspendedException {
    UsedReplayWorkersForPeer usedWorkers = usedWorkersByPeer.get(peerId);
    synchronized (usedWorkers) {
      Optional<ServerName> worker = usedWorkers.acquire(serverManager);
      if (worker.isPresent()) {
        return worker.get();
      }
      // no worker available right now, suspend the procedure
      usedWorkers.suspend(proc);
    }
    throw new ProcedureSuspendedException();
  }
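
  // A rough sketch of the expected caller pattern (the real caller is
  // SyncReplicationReplayWALProcedure; the names below are illustrative only). The procedure
  // framework re-executes the procedure after it is woken, so a suspended acquire is retried
  // naturally:
  //
  //   ServerName worker = manager.acquirePeerWorker(peerId, this); // may suspend
  //   // ... dispatch the replay work to the worker ...
  //   manager.releasePeerWorker(peerId, worker, scheduler); // let waiting procedures run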

  public void releasePeerWorker(String peerId, ServerName worker,
      MasterProcedureScheduler scheduler) {
    UsedReplayWorkersForPeer usedWorkers = usedWorkersByPeer.get(peerId);
    synchronized (usedWorkers) {
      usedWorkers.release(worker);
      usedWorkers.wake(scheduler);
    }
  }

  /**
   * Will only be called when loading procedures, where we need to construct the used worker set
   * for each peer.
   */
  public void addUsedPeerWorker(String peerId, ServerName worker) {
    usedWorkersByPeer.get(peerId).used(worker);
  }

  public void createPeerRemoteWALDir(String peerId) throws IOException {
    Path peerRemoteWALDir = getPeerRemoteWALDir(remoteWALDir, peerId);
    if (!fs.exists(peerRemoteWALDir) && !fs.mkdirs(peerRemoteWALDir)) {
      throw new IOException("Unable to mkdir " + peerRemoteWALDir);
    }
  }

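  /**
   * Rename src to dst for the given peer. The operation is idempotent, so a procedure retrying
   * after a crash will not fail here: if src no longer exists but dst does, we assume a previous
   * attempt already did the rename and return quietly; any stale dst left over from a half-done
   * attempt is deleted before renaming.
   */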
  private void rename(Path src, Path dst, String peerId) throws IOException {
    if (fs.exists(src)) {
      deleteDir(dst, peerId);
      if (!fs.rename(src, dst)) {
        throw new IOException(
          "Failed to rename dir from " + src + " to " + dst + " for peer id=" + peerId);
      }
      LOG.info("Renamed dir from {} to {} for peer id={}", src, dst, peerId);
    } else if (!fs.exists(dst)) {
      throw new IOException(
        "Want to rename from " + src + " to " + dst + ", but they both do not exist");
    }
  }

  public void renameToPeerReplayWALDir(String peerId) throws IOException {
    rename(getPeerRemoteWALDir(remoteWALDir, peerId), getPeerReplayWALDir(remoteWALDir, peerId),
      peerId);
  }

  public void renameToPeerSnapshotWALDir(String peerId) throws IOException {
    rename(getPeerReplayWALDir(remoteWALDir, peerId), getPeerSnapshotWALDir(remoteWALDir, peerId),
      peerId);
  }

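  /**
   * List the wals to be replayed for the given peer. First finish any renames that were
   * interrupted half-way (files still carrying {@code ReplicationUtils.RENAME_WAL_SUFFIX}), then
   * return the wal files with the sync replication suffix and delete everything else in the
   * replay directory.
   */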
  public List<Path> getReplayWALsAndCleanUpUnusedFiles(String peerId) throws IOException {
    Path peerReplayWALDir = getPeerReplayWALDir(remoteWALDir, peerId);
    for (FileStatus status : fs.listStatus(peerReplayWALDir,
      p -> p.getName().endsWith(ReplicationUtils.RENAME_WAL_SUFFIX))) {
      Path src = status.getPath();
      String srcName = src.getName();
      String dstName =
        srcName.substring(0, srcName.length() - ReplicationUtils.RENAME_WAL_SUFFIX.length());
      FSUtils.renameFile(fs, src, new Path(src.getParent(), dstName));
    }
    List<Path> wals = new ArrayList<>();
    for (FileStatus status : fs.listStatus(peerReplayWALDir)) {
      Path path = status.getPath();
      if (path.getName().endsWith(ReplicationUtils.SYNC_WAL_SUFFIX)) {
        wals.add(path);
      } else {
        if (!fs.delete(path, true)) {
          LOG.warn("Can not delete unused file: {}", path);
        }
      }
    }
    return wals;
  }

263
264  private void deleteDir(Path dir, String peerId) throws IOException {
265    if (!fs.delete(dir, true) && fs.exists(dir)) {
266      throw new IOException("Failed to remove dir " + dir + " for peer id=" + peerId);
267    }
268  }
269
270  public void removePeerRemoteWALs(String peerId) throws IOException {
271    deleteDir(getPeerRemoteWALDir(remoteWALDir, peerId), peerId);
272    deleteDir(getPeerReplayWALDir(remoteWALDir, peerId), peerId);
273    deleteDir(getPeerSnapshotWALDir(remoteWALDir, peerId), peerId);
274  }
275
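  /**
   * Strip the wal root directory prefix (and the separating '/') from the given path, e.g. with
   * a wal root of {@code /hbase} this would turn {@code /hbase/remoteWALs/peer/w.syncrep} into
   * {@code remoteWALs/peer/w.syncrep} (the paths here are purely illustrative).
   */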
  public String removeWALRootPath(Path path) {
    String pathStr = path.toString();
    // remove the "/" too.
    return pathStr.substring(walRootDir.toString().length() + 1);
  }

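  /**
   * Mark a wal as successfully replayed by truncating it to zero length, so that after a crash we
   * can tell already-replayed wals ({@link #isReplayWALFinished(String)} returns true for a
   * zero-length file) from wals that still need to be replayed.
   */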
  public void finishReplayWAL(String wal) throws IOException {
    Path walPath = new Path(walRootDir, wal);
    fs.truncate(walPath, 0);
  }

  public boolean isReplayWALFinished(String wal) throws IOException {
    Path walPath = new Path(walRootDir, wal);
    return fs.getFileStatus(walPath).getLen() == 0;
  }

  @VisibleForTesting
  public Path getRemoteWALDir() {
    return remoteWALDir;
  }
}