View Javadoc

1   /**
2    * Copyright 2011 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import java.util.concurrent.atomic.AtomicInteger;
23  import java.util.concurrent.atomic.AtomicReference;
24  
25  import org.apache.hadoop.conf.Configuration;
26  import com.google.common.base.Preconditions;
27  
28  /**
29   * A memstore-local allocation buffer.
30   * <p>
31   * The MemStoreLAB is basically a bump-the-pointer allocator that allocates
32   * big (2MB) byte[] chunks from and then doles it out to threads that request
33   * slices into the array.
34   * <p>
35   * The purpose of this class is to combat heap fragmentation in the
36   * regionserver. By ensuring that all KeyValues in a given memstore refer
37   * only to large chunks of contiguous memory, we ensure that large blocks
38   * get freed up when the memstore is flushed.
39   * <p>
40   * Without the MSLAB, the byte array allocated during insertion end up
41   * interleaved throughout the heap, and the old generation gets progressively
42   * more fragmented until a stop-the-world compacting collection occurs.
43   * <p>
44   * TODO: we should probably benchmark whether word-aligning the allocations
45   * would provide a performance improvement - probably would speed up the
46   * Bytes.toLong/Bytes.toInt calls in KeyValue, but some of those are cached
47   * anyway
48   */
49  public class MemStoreLAB {
50    private AtomicReference<Chunk> curChunk = new AtomicReference<Chunk>();
51  
52    final static String CHUNK_SIZE_KEY = "hbase.hregion.memstore.mslab.chunksize";
53    final static int CHUNK_SIZE_DEFAULT = 2048 * 1024;
54    final int chunkSize;
55  
56    final static String MAX_ALLOC_KEY = "hbase.hregion.memstore.mslab.max.allocation";
57    final static int MAX_ALLOC_DEFAULT = 256  * 1024; // allocs bigger than this don't go through allocator
58    final int maxAlloc;
59  
60    public MemStoreLAB() {
61      this(new Configuration());
62    }
63  
64    public MemStoreLAB(Configuration conf) {
65      chunkSize = conf.getInt(CHUNK_SIZE_KEY, CHUNK_SIZE_DEFAULT);
66      maxAlloc = conf.getInt(MAX_ALLOC_KEY, MAX_ALLOC_DEFAULT);
67  
68      // if we don't exclude allocations >CHUNK_SIZE, we'd infiniteloop on one!
69      Preconditions.checkArgument(
70        maxAlloc <= chunkSize,
71        MAX_ALLOC_KEY + " must be less than " + CHUNK_SIZE_KEY);
72    }
73  
74    /**
75     * Allocate a slice of the given length.
76     *
77     * If the size is larger than the maximum size specified for this
78     * allocator, returns null.
79     */
80    public Allocation allocateBytes(int size) {
81      Preconditions.checkArgument(size >= 0, "negative size");
82  
83      // Callers should satisfy large allocations directly from JVM since they
84      // don't cause fragmentation as badly.
85      if (size > maxAlloc) {
86        return null;
87      }
88  
89      while (true) {
90        Chunk c = getOrMakeChunk();
91  
92        // Try to allocate from this chunk
93        int allocOffset = c.alloc(size);
94        if (allocOffset != -1) {
95          // We succeeded - this is the common case - small alloc
96          // from a big buffer
97          return new Allocation(c.data, allocOffset);
98        }
99  
100       // not enough space!
101       // try to retire this chunk
102       tryRetireChunk(c);
103     }
104   }
105 
106   /**
107    * Try to retire the current chunk if it is still
108    * <code>c</code>. Postcondition is that curChunk.get()
109    * != c
110    */
111   private void tryRetireChunk(Chunk c) {
112     @SuppressWarnings("unused")
113     boolean weRetiredIt = curChunk.compareAndSet(c, null);
114     // If the CAS succeeds, that means that we won the race
115     // to retire the chunk. We could use this opportunity to
116     // update metrics on external fragmentation.
117     //
118     // If the CAS fails, that means that someone else already
119     // retired the chunk for us.
120   }
121 
122   /**
123    * Get the current chunk, or, if there is no current chunk,
124    * allocate a new one from the JVM.
125    */
126   private Chunk getOrMakeChunk() {
127     while (true) {
128       // Try to get the chunk
129       Chunk c = curChunk.get();
130       if (c != null) {
131         return c;
132       }
133 
134       // No current chunk, so we want to allocate one. We race
135       // against other allocators to CAS in an uninitialized chunk
136       // (which is cheap to allocate)
137       c = new Chunk(chunkSize);
138       if (curChunk.compareAndSet(null, c)) {
139         // we won race - now we need to actually do the expensive
140         // allocation step
141         c.init();
142         return c;
143       }
144       // someone else won race - that's fine, we'll try to grab theirs
145       // in the next iteration of the loop.
146     }
147   }
148 
149   /**
150    * A chunk of memory out of which allocations are sliced.
151    */
152   private static class Chunk {
153     /** Actual underlying data */
154     private byte[] data;
155 
156     private static final int UNINITIALIZED = -1;
157     private static final int OOM = -2;
158     /**
159      * Offset for the next allocation, or the sentinel value -1
160      * which implies that the chunk is still uninitialized.
161      * */
162     private AtomicInteger nextFreeOffset = new AtomicInteger(UNINITIALIZED);
163 
164     /** Total number of allocations satisfied from this buffer */
165     private AtomicInteger allocCount = new AtomicInteger();
166 
167     /** Size of chunk in bytes */
168     private final int size;
169 
170     /**
171      * Create an uninitialized chunk. Note that memory is not allocated yet, so
172      * this is cheap.
173      * @param size in bytes
174      */
175     private Chunk(int size) {
176       this.size = size;
177     }
178 
179     /**
180      * Actually claim the memory for this chunk. This should only be called from
181      * the thread that constructed the chunk. It is thread-safe against other
182      * threads calling alloc(), who will block until the allocation is complete.
183      */
184     public void init() {
185       assert nextFreeOffset.get() == UNINITIALIZED;
186       try {
187         data = new byte[size];
188       } catch (OutOfMemoryError e) {
189         boolean failInit = nextFreeOffset.compareAndSet(UNINITIALIZED, OOM);
190         assert failInit; // should be true.
191         throw e;
192       }
193       // Mark that it's ready for use
194       boolean initted = nextFreeOffset.compareAndSet(
195           UNINITIALIZED, 0);
196       // We should always succeed the above CAS since only one thread
197       // calls init()!
198       Preconditions.checkState(initted,
199           "Multiple threads tried to init same chunk");
200     }
201 
202     /**
203      * Try to allocate <code>size</code> bytes from the chunk.
204      * @return the offset of the successful allocation, or -1 to indicate not-enough-space
205      */
206     public int alloc(int size) {
207       while (true) {
208         int oldOffset = nextFreeOffset.get();
209         if (oldOffset == UNINITIALIZED) {
210           // The chunk doesn't have its data allocated yet.
211           // Since we found this in curChunk, we know that whoever
212           // CAS-ed it there is allocating it right now. So spin-loop
213           // shouldn't spin long!
214           Thread.yield();
215           continue;
216         }
217         if (oldOffset == OOM) {
218           // doh we ran out of ram. return -1 to chuck this away.
219           return -1;
220         }
221 
222         if (oldOffset + size > data.length) {
223           return -1; // alloc doesn't fit
224         }
225 
226         // Try to atomically claim this chunk
227         if (nextFreeOffset.compareAndSet(oldOffset, oldOffset + size)) {
228           // we got the alloc
229           allocCount.incrementAndGet();
230           return oldOffset;
231         }
232         // we raced and lost alloc, try again
233       }
234     }
235 
236     @Override
237     public String toString() {
238       return "Chunk@" + System.identityHashCode(this) +
239         " allocs=" + allocCount.get() + "waste=" +
240         (data.length - nextFreeOffset.get());
241     }
242   }
243 
244   /**
245    * The result of a single allocation. Contains the chunk that the
246    * allocation points into, and the offset in this array where the
247    * slice begins.
248    */
249   public static class Allocation {
250     private final byte[] data;
251     private final int offset;
252 
253     private Allocation(byte[] data, int off) {
254       this.data = data;
255       this.offset = off;
256     }
257 
258     @Override
259     public String toString() {
260       return "Allocation(data=" + data +
261         " with capacity=" + data.length +
262         ", off=" + offset + ")";
263     }
264 
265     byte[] getData() {
266       return data;
267     }
268 
269     int getOffset() {
270       return offset;
271     }
272   }
273 }