001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.wal; 019 020import java.io.IOException; 021import java.util.ArrayList; 022import java.util.List; 023import java.util.Map; 024import java.util.Set; 025import java.util.TreeSet; 026import org.apache.hadoop.hbase.Cell; 027import org.apache.hadoop.hbase.CellUtil; 028import org.apache.hadoop.hbase.HBaseInterfaceAudience; 029import org.apache.hadoop.hbase.KeyValue; 030import org.apache.hadoop.hbase.PrivateCellUtil; 031import org.apache.hadoop.hbase.client.RegionInfo; 032import org.apache.hadoop.hbase.codec.Codec; 033import org.apache.hadoop.hbase.io.HeapSize; 034import org.apache.hadoop.hbase.util.Bytes; 035import org.apache.hadoop.hbase.util.ClassSize; 036import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; 037import org.apache.yetus.audience.InterfaceAudience; 038 039import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos; 040import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.CompactionDescriptor; 041import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.FlushDescriptor; 042import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.RegionEventDescriptor; 043 044/** 045 * Used in HBase's transaction log (WAL) to represent a collection of edits (Cell/KeyValue objects) 046 * that came in as a single transaction. All the edits for a given transaction are written out as a 047 * single record, in PB format, followed (optionally) by Cells written via the WALCellEncoder. 048 * <p> 049 * This class is LimitedPrivate for CPs to read-only. The {@link #add} methods are classified as 050 * private methods, not for use by CPs. 051 * </p> 052 * <p> 053 * A particular WALEdit 'type' is the 'meta' type used to mark key operational events in the WAL 054 * such as compaction, flush, or region open. These meta types do not traverse hbase memstores. They 055 * are edits made by the hbase system rather than edit data submitted by clients. They only show in 056 * the WAL. These 'Meta' types have not been formally specified (or made into an explicit class 057 * type). They evolved organically. HBASE-8457 suggests codifying a WALEdit 'type' by adding a type 058 * field to WALEdit that gets serialized into the WAL. TODO. Would have to work on the 059 * consumption-side. Reading WALs on replay we seem to consume a Cell-at-a-time rather than by 060 * WALEdit. We are already in the below going out of our way to figure particular types -- e.g. if a 061 * compaction, replay, or close meta Marker -- during normal processing so would make sense to do 062 * this. Current system is an awkward marking of Cell columnfamily as {@link #METAFAMILY} and then 063 * setting qualifier based off meta edit type. For replay-time where we read Cell-at-a-time, there 064 * are utility methods below for figuring meta type. See also 065 * {@link #createBulkLoadEvent(RegionInfo, WALProtos.BulkLoadDescriptor)}, etc., for where we create 066 * meta WALEdit instances. 067 * </p> 068 * <p> 069 * WALEdit will accumulate a Set of all column family names referenced by the Cells 070 * {@link #add(Cell)}'d. This is an optimization. Usually when loading a WALEdit, we have the column 071 * family name to-hand.. just shove it into the WALEdit if available. Doing this, we can save on a 072 * parse of each Cell to figure column family down the line when we go to add the WALEdit to the WAL 073 * file. See the hand-off in FSWALEntry Constructor. 074 * @see WALKey 075 */ 076// TODO: Do not expose this class to Coprocessors. It has set methods. A CP might meddle. 077@InterfaceAudience.LimitedPrivate({ HBaseInterfaceAudience.REPLICATION, 078 HBaseInterfaceAudience.COPROC }) 079public class WALEdit implements HeapSize { 080 // Below defines are for writing WALEdit 'meta' Cells.. 081 // TODO: Get rid of this system of special 'meta' Cells. See HBASE-8457. It suggests 082 // adding a type to WALEdit itself for use denoting meta Edits and their types. 083 public static final byte[] METAFAMILY = Bytes.toBytes("METAFAMILY"); 084 085 /** 086 * @deprecated Since 2.3.0. Not used. 087 */ 088 @Deprecated 089 public static final byte[] METAROW = Bytes.toBytes("METAROW"); 090 091 /** 092 * @deprecated Since 2.3.0. Make it protected, internal-use only. Use 093 * {@link #isCompactionMarker(Cell)} 094 */ 095 @Deprecated 096 @InterfaceAudience.Private 097 public static final byte[] COMPACTION = Bytes.toBytes("HBASE::COMPACTION"); 098 099 /** 100 * @deprecated Since 2.3.0. Make it protected, internal-use only. 101 */ 102 @Deprecated 103 @InterfaceAudience.Private 104 public static final byte[] FLUSH = Bytes.toBytes("HBASE::FLUSH"); 105 106 /** 107 * Qualifier for region event meta 'Marker' WALEdits start with the {@link #REGION_EVENT_PREFIX} 108 * prefix ('HBASE::REGION_EVENT::'). After the prefix, we note the type of the event which we get 109 * from the RegionEventDescriptor protobuf instance type (A RegionEventDescriptor protobuf 110 * instance is written as the meta Marker Cell value). Adding a type suffix means we do not have 111 * to deserialize the protobuf to figure out what type of event this is.. .just read the qualifier 112 * suffix. For example, a close region event descriptor will have a qualifier of 113 * HBASE::REGION_EVENT::REGION_CLOSE. See WAL.proto and the EventType in RegionEventDescriptor 114 * protos for all possible event types. 115 */ 116 private static final String REGION_EVENT_STR = "HBASE::REGION_EVENT"; 117 private static final String REGION_EVENT_PREFIX_STR = REGION_EVENT_STR + "::"; 118 private static final byte[] REGION_EVENT_PREFIX = Bytes.toBytes(REGION_EVENT_PREFIX_STR); 119 120 /** 121 * @deprecated Since 2.3.0. Remove. Not for external use. Not used. 122 */ 123 @Deprecated 124 public static final byte[] REGION_EVENT = Bytes.toBytes(REGION_EVENT_STR); 125 126 /** 127 * We use this define figuring if we are carrying a close event. 128 */ 129 private static final byte[] REGION_EVENT_CLOSE = 130 createRegionEventDescriptorQualifier(RegionEventDescriptor.EventType.REGION_CLOSE); 131 132 @InterfaceAudience.Private 133 public static final byte[] BULK_LOAD = Bytes.toBytes("HBASE::BULK_LOAD"); 134 135 /** 136 * Periodically {@link org.apache.hadoop.hbase.replication.regionserver.ReplicationMarkerChore} 137 * will create marker edits with family as {@link WALEdit#METAFAMILY} and 138 * {@link WALEdit#REPLICATION_MARKER} as qualifier and an empty value. 139 * org.apache.hadoop.hbase.replication.regionserver.ReplicationSourceWALReader will populate the 140 * Replication Marker edit with region_server_name, wal_name and wal_offset encoded in 141 * {@link org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.ReplicationMarkerDescriptor} 142 * object. {@link org.apache.hadoop.hbase.replication.regionserver.Replication} will change the 143 * REPLICATION_SCOPE for this edit to GLOBAL so that it can replicate. On the sink cluster, 144 * {@link org.apache.hadoop.hbase.replication.regionserver.ReplicationSink} will convert the 145 * ReplicationMarkerDescriptor into a Put mutation to REPLICATION_SINK_TRACKER_TABLE_NAME_STR 146 * table. 147 */ 148 @InterfaceAudience.Private 149 public static final byte[] REPLICATION_MARKER = Bytes.toBytes("HBASE::REPLICATION_MARKER"); 150 151 private final transient boolean replay; 152 153 private ArrayList<Cell> cells; 154 155 /** 156 * All the Cell families in <code>cells</code>. Updated by {@link #add(Cell)} and 157 * {@link #add(Map)}. This Set is passed to the FSWALEntry so it does not have to recalculate the 158 * Set of families in a transaction; makes for a bunch of CPU savings. 159 */ 160 private Set<byte[]> families = null; 161 162 public WALEdit() { 163 this(1, false); 164 } 165 166 /** 167 * @deprecated since 2.0.1 and will be removed in 4.0.0. Use {@link #WALEdit(int, boolean)} 168 * instead. 169 * @see #WALEdit(int, boolean) 170 * @see <a href="https://issues.apache.org/jira/browse/HBASE-20781">HBASE-20781</a> 171 */ 172 @Deprecated 173 public WALEdit(boolean replay) { 174 this(1, replay); 175 } 176 177 /** 178 * @deprecated since 2.0.1 and will be removed in 4.0.0. Use {@link #WALEdit(int, boolean)} 179 * instead. 180 * @see #WALEdit(int, boolean) 181 * @see <a href="https://issues.apache.org/jira/browse/HBASE-20781">HBASE-20781</a> 182 */ 183 @Deprecated 184 public WALEdit(int cellCount) { 185 this(cellCount, false); 186 } 187 188 /** 189 * @param cellCount Pass so can pre-size the WALEdit. Optimization. 190 */ 191 public WALEdit(int cellCount, boolean isReplay) { 192 this.replay = isReplay; 193 cells = new ArrayList<>(cellCount); 194 } 195 196 /** 197 * Create a new WALEdit from a existing {@link WALEdit}. 198 */ 199 public WALEdit(WALEdit walEdit) { 200 this.replay = walEdit.replay; 201 cells = new ArrayList<>(walEdit.cells); 202 if (walEdit.families != null) { 203 this.families = new TreeSet<>(Bytes.BYTES_COMPARATOR); 204 this.families.addAll(walEdit.families); 205 } 206 207 } 208 209 private Set<byte[]> getOrCreateFamilies() { 210 if (this.families == null) { 211 this.families = new TreeSet<>(Bytes.BYTES_COMPARATOR); 212 } 213 return this.families; 214 } 215 216 /** 217 * For use by FSWALEntry ONLY. An optimization. 218 * @return All families in {@link #getCells()}; may be null. 219 */ 220 public Set<byte[]> getFamilies() { 221 return this.families; 222 } 223 224 /** 225 * @return True is <code>f</code> is {@link #METAFAMILY} 226 * @deprecated Since 2.3.0. Do not expose. Make protected. 227 */ 228 @Deprecated 229 public static boolean isMetaEditFamily(final byte[] f) { 230 return Bytes.equals(METAFAMILY, f); 231 } 232 233 /** 234 * Replaying WALs can read Cell-at-a-time so need this method in those cases. 235 */ 236 public static boolean isMetaEditFamily(Cell cell) { 237 return CellUtil.matchingFamily(cell, METAFAMILY); 238 } 239 240 /** 241 * @return True if this is a meta edit; has one edit only and its columnfamily is 242 * {@link #METAFAMILY}. 243 */ 244 public boolean isMetaEdit() { 245 return this.families != null && this.families.size() == 1 && this.families.contains(METAFAMILY); 246 } 247 248 /** 249 * @return True when current WALEdit is created by log replay. Replication skips WALEdits from 250 * replay. 251 */ 252 public boolean isReplay() { 253 return this.replay; 254 } 255 256 @InterfaceAudience.Private 257 public WALEdit add(Cell cell, byte[] family) { 258 getOrCreateFamilies().add(family); 259 return addCell(cell); 260 } 261 262 @InterfaceAudience.Private 263 public WALEdit add(Cell cell) { 264 // We clone Family each time we add a Cell. Expensive but safe. For CPU savings, use 265 // add(Map) or add(Cell, family). 266 return add(cell, CellUtil.cloneFamily(cell)); 267 } 268 269 @InterfaceAudience.Private 270 public WALEdit add(List<Cell> cells) { 271 if (cells == null || cells.isEmpty()) { 272 return this; 273 } 274 for (Cell cell : cells) { 275 add(cell); 276 } 277 return this; 278 } 279 280 public boolean isEmpty() { 281 return cells.isEmpty(); 282 } 283 284 public int size() { 285 return cells.size(); 286 } 287 288 public ArrayList<Cell> getCells() { 289 return cells; 290 } 291 292 /** 293 * This is not thread safe. This will change the WALEdit and shouldn't be used unless you are sure 294 * that nothing else depends on the contents being immutable. 295 * @param cells the list of cells that this WALEdit now contains. 296 */ 297 @InterfaceAudience.Private 298 // Used by replay. 299 public void setCells(ArrayList<Cell> cells) { 300 this.cells = cells; 301 this.families = null; 302 } 303 304 /** 305 * Reads WALEdit from cells. 306 * @param cellDecoder Cell decoder. 307 * @param expectedCount Expected cell count. 308 * @return Number of KVs read. 309 */ 310 public int readFromCells(Codec.Decoder cellDecoder, int expectedCount) throws IOException { 311 cells.clear(); 312 cells.ensureCapacity(expectedCount); 313 while (cells.size() < expectedCount && cellDecoder.advance()) { 314 add(cellDecoder.current()); 315 } 316 return cells.size(); 317 } 318 319 @Override 320 public long heapSize() { 321 long ret = ClassSize.ARRAYLIST; 322 for (Cell cell : cells) { 323 ret += cell.heapSize(); 324 } 325 return ret; 326 } 327 328 public long estimatedSerializedSizeOf() { 329 long ret = 0; 330 for (Cell cell : cells) { 331 ret += PrivateCellUtil.estimatedSerializedSizeOf(cell); 332 } 333 return ret; 334 } 335 336 @Override 337 public String toString() { 338 StringBuilder sb = new StringBuilder(); 339 340 sb.append("[#edits: ").append(cells.size()).append(" = <"); 341 for (Cell cell : cells) { 342 sb.append(cell); 343 sb.append("; "); 344 } 345 sb.append(">]"); 346 return sb.toString(); 347 } 348 349 public static WALEdit createFlushWALEdit(RegionInfo hri, FlushDescriptor f) { 350 KeyValue kv = new KeyValue(getRowForRegion(hri), METAFAMILY, FLUSH, 351 EnvironmentEdgeManager.currentTime(), f.toByteArray()); 352 return new WALEdit().add(kv, METAFAMILY); 353 } 354 355 public static FlushDescriptor getFlushDescriptor(Cell cell) throws IOException { 356 return CellUtil.matchingColumn(cell, METAFAMILY, FLUSH) 357 ? FlushDescriptor.parseFrom(CellUtil.cloneValue(cell)) 358 : null; 359 } 360 361 /** 362 * @return A meta Marker WALEdit that has a single Cell whose value is the passed in 363 * <code>regionEventDesc</code> serialized and whose row is this region, columnfamily is 364 * {@link #METAFAMILY} and qualifier is {@link #REGION_EVENT_PREFIX} + 365 * {@link RegionEventDescriptor#getEventType()}; for example 366 * HBASE::REGION_EVENT::REGION_CLOSE. 367 */ 368 public static WALEdit createRegionEventWALEdit(RegionInfo hri, 369 RegionEventDescriptor regionEventDesc) { 370 return createRegionEventWALEdit(getRowForRegion(hri), regionEventDesc); 371 } 372 373 @InterfaceAudience.Private 374 public static WALEdit createRegionEventWALEdit(byte[] rowForRegion, 375 RegionEventDescriptor regionEventDesc) { 376 KeyValue kv = new KeyValue(rowForRegion, METAFAMILY, 377 createRegionEventDescriptorQualifier(regionEventDesc.getEventType()), 378 EnvironmentEdgeManager.currentTime(), regionEventDesc.toByteArray()); 379 return new WALEdit().add(kv, METAFAMILY); 380 } 381 382 /** 383 * @return Cell qualifier for the passed in RegionEventDescriptor Type; e.g. we'll return 384 * something like a byte array with HBASE::REGION_EVENT::REGION_OPEN in it. 385 */ 386 @InterfaceAudience.Private 387 public static byte[] createRegionEventDescriptorQualifier(RegionEventDescriptor.EventType t) { 388 return Bytes.toBytes(REGION_EVENT_PREFIX_STR + t.toString()); 389 } 390 391 /** 392 * Public so can be accessed from regionserver.wal package. 393 * @return True if this is a Marker Edit and it is a RegionClose type. 394 */ 395 public boolean isRegionCloseMarker() { 396 return isMetaEdit() && PrivateCellUtil.matchingQualifier(this.cells.get(0), REGION_EVENT_CLOSE, 397 0, REGION_EVENT_CLOSE.length); 398 } 399 400 /** 401 * @return Returns a RegionEventDescriptor made by deserializing the content of the passed in 402 * <code>cell</code>, IFF the <code>cell</code> is a RegionEventDescriptor type WALEdit. 403 */ 404 public static RegionEventDescriptor getRegionEventDescriptor(Cell cell) throws IOException { 405 return CellUtil.matchingColumnFamilyAndQualifierPrefix(cell, METAFAMILY, REGION_EVENT_PREFIX) 406 ? RegionEventDescriptor.parseFrom(CellUtil.cloneValue(cell)) 407 : null; 408 } 409 410 /** Returns A Marker WALEdit that has <code>c</code> serialized as its value */ 411 public static WALEdit createCompaction(final RegionInfo hri, final CompactionDescriptor c) { 412 byte[] pbbytes = c.toByteArray(); 413 KeyValue kv = new KeyValue(getRowForRegion(hri), METAFAMILY, COMPACTION, 414 EnvironmentEdgeManager.currentTime(), pbbytes); 415 return new WALEdit().add(kv, METAFAMILY); // replication scope null so this won't be replicated 416 } 417 418 public static byte[] getRowForRegion(RegionInfo hri) { 419 byte[] startKey = hri.getStartKey(); 420 if (startKey.length == 0) { 421 // empty row key is not allowed in mutations because it is both the start key and the end key 422 // we return the smallest byte[] that is bigger (in lex comparison) than byte[0]. 423 return new byte[] { 0 }; 424 } 425 return startKey; 426 } 427 428 /** 429 * Deserialized and returns a CompactionDescriptor is the KeyValue contains one. 430 * @param kv the key value 431 * @return deserialized CompactionDescriptor or null. 432 */ 433 public static CompactionDescriptor getCompaction(Cell kv) throws IOException { 434 return isCompactionMarker(kv) ? CompactionDescriptor.parseFrom(CellUtil.cloneValue(kv)) : null; 435 } 436 437 /** 438 * Returns true if the given cell is a serialized {@link CompactionDescriptor} 439 * @see #getCompaction(Cell) 440 */ 441 public static boolean isCompactionMarker(Cell cell) { 442 return CellUtil.matchingColumn(cell, METAFAMILY, COMPACTION); 443 } 444 445 /** 446 * Create a bulk loader WALEdit 447 * @param hri The RegionInfo for the region in which we are bulk loading 448 * @param bulkLoadDescriptor The descriptor for the Bulk Loader 449 * @return The WALEdit for the BulkLoad 450 */ 451 public static WALEdit createBulkLoadEvent(RegionInfo hri, 452 WALProtos.BulkLoadDescriptor bulkLoadDescriptor) { 453 KeyValue kv = new KeyValue(getRowForRegion(hri), METAFAMILY, BULK_LOAD, 454 EnvironmentEdgeManager.currentTime(), bulkLoadDescriptor.toByteArray()); 455 return new WALEdit().add(kv, METAFAMILY); 456 } 457 458 /** 459 * Deserialized and returns a BulkLoadDescriptor from the passed in Cell 460 * @param cell the key value 461 * @return deserialized BulkLoadDescriptor or null. 462 */ 463 public static WALProtos.BulkLoadDescriptor getBulkLoadDescriptor(Cell cell) throws IOException { 464 return CellUtil.matchingColumn(cell, METAFAMILY, BULK_LOAD) 465 ? WALProtos.BulkLoadDescriptor.parseFrom(CellUtil.cloneValue(cell)) 466 : null; 467 } 468 469 /** 470 * Append the given map of family->edits to a WALEdit data structure. This does not write to the 471 * WAL itself. Note that as an optimization, we will stamp the Set of column families into the 472 * WALEdit to save on our having to calculate column families subsequently down in the actual WAL 473 * writing. 474 * @param familyMap map of family->edits 475 */ 476 public void add(Map<byte[], List<Cell>> familyMap) { 477 for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) { 478 // 'foreach' loop NOT used. See HBASE-12023 "...creates too many iterator objects." 479 int listSize = e.getValue().size(); 480 // Add all Cells first and then at end, add the family rather than call {@link #add(Cell)} 481 // and have it clone family each time. Optimization! 482 for (int i = 0; i < listSize; i++) { 483 addCell(e.getValue().get(i)); 484 } 485 addFamily(e.getKey()); 486 } 487 } 488 489 private void addFamily(byte[] family) { 490 getOrCreateFamilies().add(family); 491 } 492 493 private WALEdit addCell(Cell cell) { 494 this.cells.add(cell); 495 return this; 496 } 497 498 /** 499 * Creates a replication tracker edit with {@link #METAFAMILY} family and 500 * {@link #REPLICATION_MARKER} qualifier and has null value. 501 * @param rowKey rowkey 502 * @param timestamp timestamp 503 */ 504 public static WALEdit createReplicationMarkerEdit(byte[] rowKey, long timestamp) { 505 KeyValue kv = 506 new KeyValue(rowKey, METAFAMILY, REPLICATION_MARKER, timestamp, KeyValue.Type.Put); 507 return new WALEdit().add(kv); 508 } 509 510 /** 511 * Checks whether this edit is a replication marker edit. 512 * @param edit edit 513 * @return true if the cell within an edit has column = METAFAMILY and qualifier = 514 * REPLICATION_MARKER, false otherwise 515 */ 516 public static boolean isReplicationMarkerEdit(WALEdit edit) { 517 // Check just the first cell from the edit. ReplicationMarker edit will have only 1 cell. 518 return edit.getCells().size() == 1 519 && CellUtil.matchingColumn(edit.getCells().get(0), METAFAMILY, REPLICATION_MARKER); 520 } 521}