View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.regionserver.wal;
19  
20  import java.util.ArrayList;
21  import java.util.Collections;
22  import java.util.HashMap;
23  import java.util.List;
24  import java.util.Map;
25  import java.util.Set;
26  import java.util.TreeMap;
27  import java.util.concurrent.ConcurrentMap;
28  import java.util.concurrent.ConcurrentSkipListMap;
29  
30  import org.apache.commons.logging.Log;
31  import org.apache.commons.logging.LogFactory;
32  import org.apache.hadoop.hbase.HConstants;
33  import org.apache.hadoop.hbase.util.Bytes;
34  
35  import com.google.common.collect.Maps;
36  
37  /**
38   * Accounting of sequence ids per region and then by column family. To keep our accounting
39   * current, call startCacheFlush and then completeCacheFlush or abortCacheFlush so this instance
40   * can keep abreast of the state of sequence id persistence. Also call update per append.
41   */
42  class SequenceIdAccounting {
43    private static final Log LOG = LogFactory.getLog(SequenceIdAccounting.class);
44    /**
45     * This lock ties all operations on {@link SequenceIdAccounting#flushingSequenceIds} and
46     * {@link #lowestUnflushedSequenceIds} Maps. {@link #lowestUnflushedSequenceIds} has the
47     * lowest outstanding sequence ids EXCEPT when flushing. When we flush, the current
48     * lowest set for the region/column family are moved (atomically because of this lock) to
49     * {@link #flushingSequenceIds}.
50     * 
51     * <p>The two Maps are tied by this locking object EXCEPT when we go to update the lowest
52     * entry; see {@link #update(byte[], Set, long, boolean)}. In here is a putIfAbsent call on
53     * {@link #lowestUnflushedSequenceIds}. In this latter case, we will add this lowest
54     * sequence id if we find that there is no entry for the current column family. There will be no
55     * entry only if we just came up OR we have moved aside current set of lowest sequence ids
56     * because the current set are being flushed (by putting them into {@link #flushingSequenceIds}).
57     * This is how we pick up the next 'lowest' sequence id per region per column family to be used
58     * figuring what is in the next flush.
59     */
60    private final Object tieLock = new Object();
61  
62    /**
63     * Map of encoded region names and family names to their OLDEST -- i.e. their first,
64     * the longest-lived, their 'earliest', the 'lowest' -- sequence id.
65     *
66     * <p>When we flush, the current lowest sequence ids get cleared and added to
67     * {@link #flushingSequenceIds}. The next append that comes in, is then added
68     * here to {@link #lowestUnflushedSequenceIds} as the next lowest sequenceid.
69     *
70     * <p>If flush fails, currently server is aborted so no need to restore previous sequence ids.
71     * <p>Needs to be concurrent Maps because we use putIfAbsent updating oldest.
72     */
73    private final ConcurrentMap<byte[], ConcurrentMap<byte[], Long>> lowestUnflushedSequenceIds
74      = new ConcurrentSkipListMap<byte[], ConcurrentMap<byte[], Long>>(
75        Bytes.BYTES_COMPARATOR);
76  
77    /**
78     * Map of encoded region names and family names to their lowest or OLDEST sequence/edit id
79     * currently being flushed out to hfiles. Entries are moved here from
80     * {@link #lowestUnflushedSequenceIds} while the lock {@link #tieLock} is held
81     * (so movement between the Maps is atomic).
82     */
83    private final Map<byte[], Map<byte[], Long>> flushingSequenceIds =
84      new TreeMap<byte[], Map<byte[], Long>>(Bytes.BYTES_COMPARATOR);
85  
86   /**
87    * Map of region encoded names to the latest/highest region sequence id.  Updated on each
88    * call to append.
89    * <p>
90    * This map uses byte[] as the key, and uses reference equality. It works in our use case as we
91    * use {@link HRegionInfo#getEncodedNameAsBytes()} as keys. For a given region, it always returns
92    * the same array.
93    */
94    private Map<byte[], Long> highestSequenceIds = new HashMap<byte[], Long>();
95  
96    /**
97     * Returns the lowest unflushed sequence id for the region.
98     * @param encodedRegionName
99     * @return Lowest outstanding unflushed sequenceid for <code>encodedRegionName</code>. Will
100    * return {@link HConstants#NO_SEQNUM} when none.
101    */
102   long getLowestSequenceId(final byte [] encodedRegionName) {
103     synchronized (this.tieLock)  {
104       Map<byte[], Long> m = this.flushingSequenceIds.get(encodedRegionName);
105       long flushingLowest = m != null? getLowestSequenceId(m): Long.MAX_VALUE;
106       m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
107       long unflushedLowest = m != null? getLowestSequenceId(m): HConstants.NO_SEQNUM;
108       return Math.min(flushingLowest, unflushedLowest);
109     }
110   }
111 
112   /**
113    * @param encodedRegionName
114    * @param familyName 
115    * @return Lowest outstanding unflushed sequenceid for <code>encodedRegionname</code> and
116    * <code>familyName</code>. Returned sequenceid may be for an edit currently being flushed.
117    */
118   long getLowestSequenceId(final byte [] encodedRegionName, final byte [] familyName) {
119     synchronized (this.tieLock) {
120       Map<byte[], Long> m = this.flushingSequenceIds.get(encodedRegionName);
121       if (m != null) {
122         Long lowest = m.get(familyName);
123         if (lowest != null) return lowest;
124       }
125       m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
126       if (m != null) {
127         Long lowest = m.get(familyName);
128         if (lowest != null) return lowest;
129       }
130     }
131     return HConstants.NO_SEQNUM;
132   }
133 
134   /**
135    * Reset the accounting of highest sequenceid by regionname.
136    * @return Return the previous accounting Map of regions to the last sequence id written into
137    * each.
138    */
139   Map<byte[], Long> resetHighest() {
140     Map<byte[], Long> old = this.highestSequenceIds;
141     this.highestSequenceIds = new HashMap<byte[], Long>();
142     return old;
143   }
144 
145   /**
146    * We've been passed a new sequenceid for the region. Set it as highest seen for this region and
147    * if we are to record oldest, or lowest sequenceids, save it as oldest seen if nothing
148    * currently older.
149    * @param encodedRegionName
150    * @param families
151    * @param sequenceid
152    * @param lowest Whether to keep running account of oldest sequence id.
153    */
154   void update(byte[] encodedRegionName, Set<byte[]> families, long sequenceid,
155       final boolean lowest) {
156     Long l = Long.valueOf(sequenceid);
157     this.highestSequenceIds.put(encodedRegionName, l);
158     if (lowest) {
159       ConcurrentMap<byte[], Long> m = getOrCreateLowestSequenceIds(encodedRegionName);
160       for (byte[] familyName : families) {
161         m.putIfAbsent(familyName, l);
162       }
163     }
164   }
165 
166   ConcurrentMap<byte[], Long> getOrCreateLowestSequenceIds(byte[] encodedRegionName) {
167     // Intentionally, this access is done outside of this.regionSequenceIdLock. Done per append.
168     ConcurrentMap<byte[], Long> m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
169     if (m != null) return m;
170     m = new ConcurrentSkipListMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
171     // Another thread may have added it ahead of us.
172     ConcurrentMap<byte[], Long> alreadyPut =
173         this.lowestUnflushedSequenceIds.putIfAbsent(encodedRegionName, m);
174     return alreadyPut == null? m : alreadyPut;
175   }
176 
177   /**
178    * @param sequenceids Map to search for lowest value.
179    * @return Lowest value found in <code>sequenceids</code>.
180    */
181   static long getLowestSequenceId(Map<byte[], Long> sequenceids) {
182     long lowest = HConstants.NO_SEQNUM;
183     for (Long sid: sequenceids.values()) {
184       if (lowest == HConstants.NO_SEQNUM || sid.longValue() < lowest) {
185         lowest = sid.longValue();
186       }
187     }
188     return lowest;
189   }
190 
191   /**
192    * @param src
193    * @return New Map that has same keys as <code>src</code> but instead of a Map for a value, it
194    * instead has found the smallest sequence id and it returns that as the value instead.
195    */
196   private <T extends Map<byte[], Long>> Map<byte[], Long> flattenToLowestSequenceId(
197       Map<byte[], T> src) {
198     if (src == null || src.isEmpty()) return null;
199     Map<byte[], Long> tgt = Maps.newHashMap();
200     for (Map.Entry<byte[], T> entry: src.entrySet()) {
201       long lowestSeqId = getLowestSequenceId(entry.getValue());
202       if (lowestSeqId != HConstants.NO_SEQNUM) {
203         tgt.put(entry.getKey(), lowestSeqId);
204       }
205     }
206     return tgt;
207   }
208 
209   /**
210    * @param encodedRegionName Region to flush.
211    * @param families Families to flush. May be a subset of all families in the region.
212    * @return Returns {@link HConstants#NO_SEQNUM} if we are flushing the whole region OR if
213    * we are flushing a subset of all families but there are no edits in those families not
214    * being flushed; in other words, this is effectively same as a flush of all of the region
215    * though we were passed a subset of regions. Otherwise, it returns the sequence id of the
216    * oldest/lowest outstanding edit.
217    */
218   Long startCacheFlush(final byte[] encodedRegionName, final Set<byte[]> families) {
219     Map<byte[], Long> oldSequenceIds = null;
220     Long lowestUnflushedInRegion = HConstants.NO_SEQNUM;
221     synchronized (tieLock) {
222       Map<byte[], Long> m = this.lowestUnflushedSequenceIds.get(encodedRegionName);
223       if (m != null) {
224         // NOTE: Removal from this.lowestUnflushedSequenceIds must be done in controlled
225         // circumstance because another concurrent thread now may add sequenceids for this family
226         // (see above in getOrCreateLowestSequenceId). Make sure you are ok with this. Usually it
227         // is fine because updates are blocked when this method is called. Make sure!!!
228         for (byte[] familyName: families) {
229           Long seqId = m.remove(familyName);
230           if (seqId != null) {
231             if (oldSequenceIds == null) oldSequenceIds = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
232             oldSequenceIds.put(familyName, seqId);
233           }
234         }
235         if (oldSequenceIds != null && !oldSequenceIds.isEmpty()) {
236           if (this.flushingSequenceIds.put(encodedRegionName, oldSequenceIds) != null) {
237             LOG.warn("Flushing Map not cleaned up for " + Bytes.toString(encodedRegionName) +
238               ", sequenceid=" + oldSequenceIds);
239           }
240         }
241         if (m.isEmpty()) {
242           // Remove it otherwise it will be in oldestUnflushedStoreSequenceIds for ever
243           // even if the region is already moved to other server.
244           // Do not worry about data racing, we held write lock of region when calling
245           // startCacheFlush, so no one can add value to the map we removed.
246           this.lowestUnflushedSequenceIds.remove(encodedRegionName);
247         } else {
248           // Flushing a subset of the region families. Return the sequence id of the oldest entry.
249           lowestUnflushedInRegion = Collections.min(m.values());
250         }
251       }
252     }
253     // Do this check outside lock.
254     if (oldSequenceIds != null && oldSequenceIds.isEmpty()) {
255       // TODO: if we have no oldStoreSeqNum, and WAL is not disabled, presumably either
256       // the region is already flushing (which would make this call invalid), or there
257       // were no appends after last flush, so why are we starting flush? Maybe we should
258       // assert not empty. Less rigorous, but safer, alternative is telling the caller to stop.
259       // For now preserve old logic.
260       LOG.warn("Couldn't find oldest sequenceid for " + Bytes.toString(encodedRegionName));
261     }
262     return lowestUnflushedInRegion;
263   }
264 
265   void completeCacheFlush(final byte [] encodedRegionName) {
266     synchronized (tieLock) {
267       this.flushingSequenceIds.remove(encodedRegionName);
268     }
269   }
270 
271   void abortCacheFlush(final byte[] encodedRegionName) {
272     // Method is called when we are crashing down because failed write flush AND it is called
273     // if we fail prepare. The below is for the fail prepare case; we restore the old sequence ids.
274     Map<byte[], Long> flushing = null;
275     Map<byte[], Long> tmpMap = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
276     // Here we are moving sequenceids from flushing back to unflushed; doing opposite of what
277     // happened in startCacheFlush. During prepare phase, we have update lock on the region so
278     // no edits should be coming in via append.
279     synchronized (tieLock) {
280       flushing = this.flushingSequenceIds.remove(encodedRegionName);
281       if (flushing != null) {
282         Map<byte[], Long> unflushed = getOrCreateLowestSequenceIds(encodedRegionName);
283         for (Map.Entry<byte[], Long> e: flushing.entrySet()) {
284           // Set into unflushed the 'old' oldest sequenceid and if any value in flushed with this
285           // value, it will now be in tmpMap.
286           tmpMap.put(e.getKey(), unflushed.put(e.getKey(), e.getValue()));
287         }
288       }
289     }
290 
291     // Here we are doing some 'test' to see if edits are going in out of order. What is it for?
292     // Carried over from old code.
293     if (flushing != null) {
294       for (Map.Entry<byte[], Long> e : flushing.entrySet()) {
295         Long currentId = tmpMap.get(e.getKey());
296         if (currentId != null && currentId.longValue() <= e.getValue().longValue()) {
297           String errorStr = Bytes.toString(encodedRegionName) + " family " +
298             Bytes.toString(e.getKey()) + " acquired edits out of order current memstore seq=" +
299               currentId + ", previous oldest unflushed id=" + e.getValue();
300           LOG.error(errorStr);
301           Runtime.getRuntime().halt(1);
302         }
303       }
304     }
305   }
306 
307   /**
308    * See if passed <code>sequenceids</code> are lower -- i.e. earlier -- than any outstanding
309    * sequenceids, sequenceids we are holding on to in this accounting instance.
310    * @param sequenceids Keyed by encoded region name. Cannot be null (doesn't make
311    * sense for it to be null).
312    * @return true if all sequenceids are lower, older than, the old sequenceids in this instance.
313    */
314    boolean areAllLower(Map<byte[], Long> sequenceids) {
315      Map<byte[], Long> flushing = null;
316      Map<byte[], Long> unflushed = null;
317      synchronized (this.tieLock) {
318        // Get a flattened -- only the oldest sequenceid -- copy of current flushing and unflushed
319        // data structures to use in tests below.
320        flushing = flattenToLowestSequenceId(this.flushingSequenceIds);
321        unflushed = flattenToLowestSequenceId(this.lowestUnflushedSequenceIds);
322      }
323     for (Map.Entry<byte[], Long> e : sequenceids.entrySet()) {
324       long oldestFlushing = Long.MAX_VALUE;
325       long oldestUnflushed = Long.MAX_VALUE;
326       if (flushing != null) {
327         if (flushing.containsKey(e.getKey())) oldestFlushing = flushing.get(e.getKey());
328       }
329       if (unflushed != null) {
330         if (unflushed.containsKey(e.getKey())) oldestUnflushed = unflushed.get(e.getKey());
331       }
332       long min = Math.min(oldestFlushing, oldestUnflushed);
333       if (min <= e.getValue()) return false;
334     }
335     return true;
336   }
337 
338    /**
339     * Iterates over the given Map and compares sequence ids with corresponding
340     * entries in {@link #oldestUnflushedRegionSequenceIds}. If a region in
341     * {@link #oldestUnflushedRegionSequenceIds} has a sequence id less than that passed
342     * in <code>sequenceids</code> then return it.
343     * @param sequenceids Sequenceids keyed by encoded region name.
344     * @return regions found in this instance with sequence ids less than those passed in.
345     */
346    byte[][] findLower(Map<byte[], Long> sequenceids) {
347      List<byte[]> toFlush = null;
348      // Keeping the old behavior of iterating unflushedSeqNums under oldestSeqNumsLock.
349      synchronized (tieLock) {
350        for (Map.Entry<byte[], Long> e: sequenceids.entrySet()) {
351          Map<byte[], Long> m = this.lowestUnflushedSequenceIds.get(e.getKey());
352          if (m == null) continue;
353          // The lowest sequence id outstanding for this region.
354          long lowest = getLowestSequenceId(m);
355          if (lowest != HConstants.NO_SEQNUM && lowest <= e.getValue()) {
356            if (toFlush == null) toFlush = new ArrayList<byte[]>();
357            toFlush.add(e.getKey());
358          }
359        }
360      }
361      return toFlush == null? null: toFlush.toArray(new byte[][] { HConstants.EMPTY_BYTE_ARRAY });
362    }
363 }