001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.regionserver.wal;
019
020import static org.apache.hadoop.hbase.util.ConcurrentMapUtils.computeIfAbsent;
021
022import java.util.ArrayList;
023import java.util.Collection;
024import java.util.Collections;
025import java.util.HashMap;
026import java.util.List;
027import java.util.Map;
028import java.util.Set;
029import java.util.TreeMap;
030import java.util.concurrent.ConcurrentHashMap;
031import java.util.concurrent.ConcurrentMap;
032import java.util.stream.Collectors;
033import org.apache.hadoop.hbase.HConstants;
034import org.apache.hadoop.hbase.util.Bytes;
035import org.apache.hadoop.hbase.util.ImmutableByteArray;
036import org.apache.yetus.audience.InterfaceAudience;
037import org.slf4j.Logger;
038import org.slf4j.LoggerFactory;
039
040import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
041
042/**
043 * Accounting of sequence ids per region and then by column family. So we can keep our accounting
044 * current, call startCacheFlush and then finishedCacheFlush or abortCacheFlush so this instance can
045 * keep abreast of the state of sequence id persistence. Also call update per append.
046 * <p>
047 * For the implementation, we assume that all the {@code encodedRegionName} passed in are gotten by
048 * {@link org.apache.hadoop.hbase.client.RegionInfo#getEncodedNameAsBytes()}. So it is safe to use
049 * it as a hash key. And for family name, we use {@link ImmutableByteArray} as key. This is because
050 * hash based map is much faster than RBTree or CSLM and here we are on the critical write path. See
051 * HBASE-16278 for more details.
052 * </p>
053 */
054@InterfaceAudience.Private
055class SequenceIdAccounting {
056  private static final Logger LOG = LoggerFactory.getLogger(SequenceIdAccounting.class);
057
058  /**
059   * This lock ties all operations on {@link SequenceIdAccounting#flushingSequenceIds} and
060   * {@link #lowestUnflushedSequenceIds} Maps. {@link #lowestUnflushedSequenceIds} has the
061   * lowest outstanding sequence ids EXCEPT when flushing. When we flush, the current
062   * lowest set for the region/column family are moved (atomically because of this lock) to
063   * {@link #flushingSequenceIds}.
064   * 
065   * <p>The two Maps are tied by this locking object EXCEPT when we go to update the lowest
066   * entry; see {@link #lowestUnflushedSequenceIds}. In here is a putIfAbsent call on
067   * {@link #lowestUnflushedSequenceIds}. In this latter case, we will add this lowest
068   * sequence id if we find that there is no entry for the current column family. There will be no
069   * entry only if we just came up OR we have moved aside current set of lowest sequence ids
070   * because the current set are being flushed (by putting them into {@link #flushingSequenceIds}).
071   * This is how we pick up the next 'lowest' sequence id per region per column family to be used
072   * figuring what is in the next flush.
073   */
074  private final Object tieLock = new Object();
075
076  /**
077   * Map of encoded region names and family names to their OLDEST -- i.e. their first,
078   * the longest-lived, their 'earliest', the 'lowest' -- sequence id.
079   *
080   * <p>When we flush, the current lowest sequence ids get cleared and added to
081   * {@link #flushingSequenceIds}. The next append that comes in, is then added
082   * here to {@link #lowestUnflushedSequenceIds} as the next lowest sequenceid.
083   *
084   * <p>If flush fails, currently server is aborted so no need to restore previous sequence ids.
085   * <p>Needs to be concurrent Maps because we use putIfAbsent updating oldest.
086   */
087  private final ConcurrentMap<byte[], ConcurrentMap<ImmutableByteArray, Long>>
088    lowestUnflushedSequenceIds = new ConcurrentHashMap<>();
089
090  /**
091   * Map of encoded region names and family names to their lowest or OLDEST sequence/edit id
092   * currently being flushed out to hfiles. Entries are moved here from
093   * {@link #lowestUnflushedSequenceIds} while the lock {@link #tieLock} is held
094   * (so movement between the Maps is atomic).
095   */
096  private final Map<byte[], Map<ImmutableByteArray, Long>> flushingSequenceIds = new HashMap<>();
097
098  /**
099   * <p>
100   * Map of region encoded names to the latest/highest region sequence id. Updated on each call to
101   * append.
102   * </p>
103   * <p>
104   * This map uses byte[] as the key, and uses reference equality. It works in our use case as we
105   * use {@link org.apache.hadoop.hbase.client.RegionInfo#getEncodedNameAsBytes()} as keys. For a
106   * given region, it always returns the same array.
107   * </p>
108   */
109  private Map<byte[], Long> highestSequenceIds = new HashMap<>();
110
111  /**
112   * Returns the lowest unflushed sequence id for the region.
113   * @return Lowest outstanding unflushed sequenceid for <code>encodedRegionName</code>. Will
114   * return {@link HConstants#NO_SEQNUM} when none.
115   */
116  long getLowestSequenceId(final byte[] encodedRegionName) {
117    synchronized (this.tieLock) {
118      Map<?, Long> m = this.flushingSequenceIds.get(encodedRegionName);
119      long flushingLowest = m != null ? getLowestSequenceId(m) : Long.MAX_VALUE;
120      m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
121      long unflushedLowest = m != null ? getLowestSequenceId(m) : HConstants.NO_SEQNUM;
122      return Math.min(flushingLowest, unflushedLowest);
123    }
124  }
125
126  /**
127   * @return Lowest outstanding unflushed sequenceid for <code>encodedRegionname</code> and
128   *         <code>familyName</code>. Returned sequenceid may be for an edit currently being
129   *         flushed.
130   */
131  long getLowestSequenceId(final byte[] encodedRegionName, final byte[] familyName) {
132    ImmutableByteArray familyNameWrapper = ImmutableByteArray.wrap(familyName);
133    synchronized (this.tieLock) {
134      Map<ImmutableByteArray, Long> m = this.flushingSequenceIds.get(encodedRegionName);
135      if (m != null) {
136        Long lowest = m.get(familyNameWrapper);
137        if (lowest != null) {
138          return lowest;
139        }
140      }
141      m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
142      if (m != null) {
143        Long lowest = m.get(familyNameWrapper);
144        if (lowest != null) {
145          return lowest;
146        }
147      }
148    }
149    return HConstants.NO_SEQNUM;
150  }
151
152  /**
153   * Reset the accounting of highest sequenceid by regionname.
154   * @return Return the previous accounting Map of regions to the last sequence id written into
155   * each.
156   */
157  Map<byte[], Long> resetHighest() {
158    Map<byte[], Long> old = this.highestSequenceIds;
159    this.highestSequenceIds = new HashMap<>();
160    return old;
161  }
162
163  /**
164   * We've been passed a new sequenceid for the region. Set it as highest seen for this region and
165   * if we are to record oldest, or lowest sequenceids, save it as oldest seen if nothing
166   * currently older.
167   * @param encodedRegionName
168   * @param families
169   * @param sequenceid
170   * @param lowest Whether to keep running account of oldest sequence id.
171   */
172  void update(byte[] encodedRegionName, Set<byte[]> families, long sequenceid,
173      final boolean lowest) {
174    Long l = Long.valueOf(sequenceid);
175    this.highestSequenceIds.put(encodedRegionName, l);
176    if (lowest) {
177      ConcurrentMap<ImmutableByteArray, Long> m = getOrCreateLowestSequenceIds(encodedRegionName);
178      for (byte[] familyName : families) {
179        m.putIfAbsent(ImmutableByteArray.wrap(familyName), l);
180      }
181    }
182  }
183
184  /**
185   * Clear all the records of the given region as it is going to be closed.
186   * <p/>
187   * We will call this once we get the region close marker. We need this because that, if we use
188   * Durability.ASYNC_WAL, after calling startCacheFlush, we may still get some ongoing wal entries
189   * that has not been processed yet, this will lead to orphan records in the
190   * lowestUnflushedSequenceIds and then cause too many WAL files.
191   * <p/>
192   * See HBASE-23157 for more details.
193   */
194  void onRegionClose(byte[] encodedRegionName) {
195    synchronized (tieLock) {
196      this.lowestUnflushedSequenceIds.remove(encodedRegionName);
197      Map<ImmutableByteArray, Long> flushing = this.flushingSequenceIds.remove(encodedRegionName);
198      if (flushing != null) {
199        LOG.warn("Still have flushing records when closing {}, {}",
200          Bytes.toString(encodedRegionName),
201          flushing.entrySet().stream().map(e -> e.getKey().toString() + "->" + e.getValue())
202            .collect(Collectors.joining(",", "{", "}")));
203      }
204    }
205    this.highestSequenceIds.remove(encodedRegionName);
206  }
207
208  /**
209   * Update the store sequence id, e.g., upon executing in-memory compaction
210   */
211  void updateStore(byte[] encodedRegionName, byte[] familyName, Long sequenceId,
212      boolean onlyIfGreater) {
213    if (sequenceId == null) {
214      return;
215    }
216    Long highest = this.highestSequenceIds.get(encodedRegionName);
217    if (highest == null || sequenceId > highest) {
218      this.highestSequenceIds.put(encodedRegionName, sequenceId);
219    }
220    ImmutableByteArray familyNameWrapper = ImmutableByteArray.wrap(familyName);
221    synchronized (this.tieLock) {
222      ConcurrentMap<ImmutableByteArray, Long> m = getOrCreateLowestSequenceIds(encodedRegionName);
223      boolean replaced = false;
224      while (!replaced) {
225        Long oldSeqId = m.get(familyNameWrapper);
226        if (oldSeqId == null) {
227          m.put(familyNameWrapper, sequenceId);
228          replaced = true;
229        } else if (onlyIfGreater) {
230          if (sequenceId > oldSeqId) {
231            replaced = m.replace(familyNameWrapper, oldSeqId, sequenceId);
232          } else {
233            return;
234          }
235        } else { // replace even if sequence id is not greater than oldSeqId
236          m.put(familyNameWrapper, sequenceId);
237          return;
238        }
239      }
240    }
241  }
242
243  @VisibleForTesting
244  ConcurrentMap<ImmutableByteArray, Long> getOrCreateLowestSequenceIds(byte[] encodedRegionName) {
245    // Intentionally, this access is done outside of this.regionSequenceIdLock. Done per append.
246    return computeIfAbsent(this.lowestUnflushedSequenceIds, encodedRegionName,
247      ConcurrentHashMap::new);
248  }
249
250  /**
251   * @param sequenceids Map to search for lowest value.
252   * @return Lowest value found in <code>sequenceids</code>.
253   */
254  private static long getLowestSequenceId(Map<?, Long> sequenceids) {
255    long lowest = HConstants.NO_SEQNUM;
256    for (Long sid: sequenceids.values()) {
257      if (lowest == HConstants.NO_SEQNUM || sid.longValue() < lowest) {
258        lowest = sid.longValue();
259      }
260    }
261    return lowest;
262  }
263
264  /**
265   * @param src
266   * @return New Map that has same keys as <code>src</code> but instead of a Map for a value, it
267   *         instead has found the smallest sequence id and it returns that as the value instead.
268   */
269  private <T extends Map<?, Long>> Map<byte[], Long> flattenToLowestSequenceId(Map<byte[], T> src) {
270    if (src == null || src.isEmpty()) {
271      return null;
272    }
273    Map<byte[], Long> tgt = new HashMap<>();
274    for (Map.Entry<byte[], T> entry : src.entrySet()) {
275      long lowestSeqId = getLowestSequenceId(entry.getValue());
276      if (lowestSeqId != HConstants.NO_SEQNUM) {
277        tgt.put(entry.getKey(), lowestSeqId);
278      }
279    }
280    return tgt;
281  }
282
283  /**
284   * @param encodedRegionName Region to flush.
285   * @param families Families to flush. May be a subset of all families in the region.
286   * @return Returns {@link HConstants#NO_SEQNUM} if we are flushing the whole region OR if
287   * we are flushing a subset of all families but there are no edits in those families not
288   * being flushed; in other words, this is effectively same as a flush of all of the region
289   * though we were passed a subset of regions. Otherwise, it returns the sequence id of the
290   * oldest/lowest outstanding edit.
291   */
292  Long startCacheFlush(final byte[] encodedRegionName, final Set<byte[]> families) {
293    Map<byte[],Long> familytoSeq = new HashMap<>();
294    for (byte[] familyName : families){
295      familytoSeq.put(familyName,HConstants.NO_SEQNUM);
296    }
297    return startCacheFlush(encodedRegionName,familytoSeq);
298  }
299
300  Long startCacheFlush(final byte[] encodedRegionName, final Map<byte[], Long> familyToSeq) {
301    Map<ImmutableByteArray, Long> oldSequenceIds = null;
302    Long lowestUnflushedInRegion = HConstants.NO_SEQNUM;
303    synchronized (tieLock) {
304      Map<ImmutableByteArray, Long> m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
305      if (m != null) {
306        // NOTE: Removal from this.lowestUnflushedSequenceIds must be done in controlled
307        // circumstance because another concurrent thread now may add sequenceids for this family
308        // (see above in getOrCreateLowestSequenceId). Make sure you are ok with this. Usually it
309        // is fine because updates are blocked when this method is called. Make sure!!!
310        for (Map.Entry<byte[], Long> entry : familyToSeq.entrySet()) {
311          ImmutableByteArray familyNameWrapper = ImmutableByteArray.wrap((byte[]) entry.getKey());
312          Long seqId = null;
313          if(entry.getValue() == HConstants.NO_SEQNUM) {
314            seqId = m.remove(familyNameWrapper);
315          } else {
316            seqId = m.replace(familyNameWrapper, entry.getValue());
317          }
318          if (seqId != null) {
319            if (oldSequenceIds == null) {
320              oldSequenceIds = new HashMap<>();
321            }
322            oldSequenceIds.put(familyNameWrapper, seqId);
323          }
324        }
325        if (oldSequenceIds != null && !oldSequenceIds.isEmpty()) {
326          if (this.flushingSequenceIds.put(encodedRegionName, oldSequenceIds) != null) {
327            LOG.warn("Flushing Map not cleaned up for " + Bytes.toString(encodedRegionName) +
328              ", sequenceid=" + oldSequenceIds);
329          }
330        }
331        if (m.isEmpty()) {
332          // Remove it otherwise it will be in oldestUnflushedStoreSequenceIds for ever
333          // even if the region is already moved to other server.
334          // Do not worry about data racing, we held write lock of region when calling
335          // startCacheFlush, so no one can add value to the map we removed.
336          this.lowestUnflushedSequenceIds.remove(encodedRegionName);
337        } else {
338          // Flushing a subset of the region families. Return the sequence id of the oldest entry.
339          lowestUnflushedInRegion = Collections.min(m.values());
340        }
341      }
342    }
343    // Do this check outside lock.
344    if (oldSequenceIds != null && oldSequenceIds.isEmpty()) {
345      // TODO: if we have no oldStoreSeqNum, and WAL is not disabled, presumably either
346      // the region is already flushing (which would make this call invalid), or there
347      // were no appends after last flush, so why are we starting flush? Maybe we should
348      // assert not empty. Less rigorous, but safer, alternative is telling the caller to stop.
349      // For now preserve old logic.
350      LOG.warn("Couldn't find oldest sequenceid for " + Bytes.toString(encodedRegionName));
351    }
352    return lowestUnflushedInRegion;
353  }
354
355  void completeCacheFlush(final byte[] encodedRegionName) {
356    synchronized (tieLock) {
357      this.flushingSequenceIds.remove(encodedRegionName);
358    }
359  }
360
361  void abortCacheFlush(final byte[] encodedRegionName) {
362    // Method is called when we are crashing down because failed write flush AND it is called
363    // if we fail prepare. The below is for the fail prepare case; we restore the old sequence ids.
364    Map<ImmutableByteArray, Long> flushing = null;
365    Map<ImmutableByteArray, Long> tmpMap = new HashMap<>();
366    // Here we are moving sequenceids from flushing back to unflushed; doing opposite of what
367    // happened in startCacheFlush. During prepare phase, we have update lock on the region so
368    // no edits should be coming in via append.
369    synchronized (tieLock) {
370      flushing = this.flushingSequenceIds.remove(encodedRegionName);
371      if (flushing != null) {
372        Map<ImmutableByteArray, Long> unflushed = getOrCreateLowestSequenceIds(encodedRegionName);
373        for (Map.Entry<ImmutableByteArray, Long> e: flushing.entrySet()) {
374          // Set into unflushed the 'old' oldest sequenceid and if any value in flushed with this
375          // value, it will now be in tmpMap.
376          tmpMap.put(e.getKey(), unflushed.put(e.getKey(), e.getValue()));
377        }
378      }
379    }
380
381    // Here we are doing some 'test' to see if edits are going in out of order. What is it for?
382    // Carried over from old code.
383    if (flushing != null) {
384      for (Map.Entry<ImmutableByteArray, Long> e : flushing.entrySet()) {
385        Long currentId = tmpMap.get(e.getKey());
386        if (currentId != null && currentId.longValue() < e.getValue().longValue()) {
387          String errorStr = Bytes.toString(encodedRegionName) + " family "
388              + e.getKey().toString() + " acquired edits out of order current memstore seq="
389              + currentId + ", previous oldest unflushed id=" + e.getValue();
390          LOG.error(errorStr);
391          Runtime.getRuntime().halt(1);
392        }
393      }
394    }
395  }
396
397  /**
398   * See if passed <code>sequenceids</code> are lower -- i.e. earlier -- than any outstanding
399   * sequenceids, sequenceids we are holding on to in this accounting instance.
400   * @param sequenceids Keyed by encoded region name. Cannot be null (doesn't make sense for it to
401   *          be null).
402   * @param keysBlocking An optional collection that is used to return the specific keys that are
403   *          causing this method to return false.
404   * @return true if all sequenceids are lower, older than, the old sequenceids in this instance.
405   */
406  boolean areAllLower(Map<byte[], Long> sequenceids, Collection<byte[]> keysBlocking) {
407    Map<byte[], Long> flushing = null;
408    Map<byte[], Long> unflushed = null;
409    synchronized (this.tieLock) {
410      // Get a flattened -- only the oldest sequenceid -- copy of current flushing and unflushed
411      // data structures to use in tests below.
412      flushing = flattenToLowestSequenceId(this.flushingSequenceIds);
413      unflushed = flattenToLowestSequenceId(this.lowestUnflushedSequenceIds);
414    }
415    boolean result = true;
416    for (Map.Entry<byte[], Long> e : sequenceids.entrySet()) {
417      long oldestFlushing = Long.MAX_VALUE;
418      long oldestUnflushed = Long.MAX_VALUE;
419      if (flushing != null && flushing.containsKey(e.getKey())) {
420        oldestFlushing = flushing.get(e.getKey());
421      }
422      if (unflushed != null && unflushed.containsKey(e.getKey())) {
423        oldestUnflushed = unflushed.get(e.getKey());
424      }
425      long min = Math.min(oldestFlushing, oldestUnflushed);
426      if (min <= e.getValue()) {
427        if (keysBlocking == null) {
428          return false;
429        }
430        result = false;
431        keysBlocking.add(e.getKey());
432        // Continue examining the map so we could log all regions blocking this WAL.
433      }
434    }
435    return result;
436  }
437
438  /**
439   * Iterates over the given Map and compares sequence ids with corresponding entries in
440   * {@link #lowestUnflushedSequenceIds}. If a region in
441   * {@link #lowestUnflushedSequenceIds} has a sequence id less than that passed in
442   * <code>sequenceids</code> then return it.
443   * @param sequenceids Sequenceids keyed by encoded region name.
444   * @return stores of regions found in this instance with sequence ids less than those passed in.
445   */
446  Map<byte[], List<byte[]>> findLower(Map<byte[], Long> sequenceids) {
447    Map<byte[], List<byte[]>> toFlush = null;
448    // Keeping the old behavior of iterating unflushedSeqNums under oldestSeqNumsLock.
449    synchronized (tieLock) {
450      for (Map.Entry<byte[], Long> e : sequenceids.entrySet()) {
451        Map<ImmutableByteArray, Long> m = this.lowestUnflushedSequenceIds.get(e.getKey());
452        if (m == null) {
453          continue;
454        }
455        for (Map.Entry<ImmutableByteArray, Long> me : m.entrySet()) {
456          if (me.getValue() <= e.getValue()) {
457            if (toFlush == null) {
458              toFlush = new TreeMap(Bytes.BYTES_COMPARATOR);
459            }
460            toFlush.computeIfAbsent(e.getKey(), k -> new ArrayList<>())
461              .add(Bytes.toBytes(me.getKey().toString()));
462          }
463        }
464      }
465    }
466    return toFlush;
467  }
468}