@InterfaceAudience.LimitedPrivate(value="Tools") public class BulkLoadHFilesTool extends org.apache.hadoop.conf.Configured implements BulkLoadHFiles, org.apache.hadoop.util.Tool
An implementation of
BulkLoadHFiles
that can also be executed from the command line as a tool.
Modifier and Type | Class and Description |
---|---|
private static interface |
BulkLoadHFilesTool.BulkHFileVisitor<TFamily> |
BulkLoadHFiles.LoadQueueItem
Modifier and Type | Field and Description |
---|---|
private boolean |
assignSeqIds |
static String |
BULK_LOAD_HFILES_BY_FAMILY
HBASE-24221: Support bulk loading HFiles by family, to avoid bulkLoadHFile waiting a long time
because of compaction on the server side.
|
private boolean |
bulkLoadByFamily |
private String |
bulkToken |
private List<String> |
clusterIds |
static String |
FAIL_IF_NEED_SPLIT_HFILE |
private boolean |
failIfNeedSplitHFile |
private FsDelegationToken |
fsDelegationToken |
private static org.slf4j.Logger |
LOG |
private int |
maxFilesPerRegionPerFamily |
static String |
NAME |
private int |
nrThreads |
private AtomicInteger |
numRetries |
private boolean |
replicate |
(package private) static String |
TMP_DIR |
private UserProvider |
userProvider |
private static String |
VALIDATE_HFILES
Whether to run validation on hfiles before loading.
|
ALWAYS_COPY_FILES, ASSIGN_SEQ_IDS, CREATE_TABLE_CONF_KEY, IGNORE_UNMATCHED_CF_CONF_KEY, MAX_FILES_PER_REGION_PER_FAMILY, RETRY_ON_IO_EXCEPTION
Constructor and Description |
---|
BulkLoadHFilesTool(org.apache.hadoop.conf.Configuration conf) |
Modifier and Type | Method and Description |
---|---|
Map<BulkLoadHFiles.LoadQueueItem,ByteBuffer> |
bulkLoad(TableName tableName,
Map<byte[],List<org.apache.hadoop.fs.Path>> family2Files)
Perform a bulk load of the given map of families to hfiles into the given pre-existing table.
|
Map<BulkLoadHFiles.LoadQueueItem,ByteBuffer> |
bulkLoad(TableName tableName,
org.apache.hadoop.fs.Path dir)
Perform a bulk load of the given directory into the given pre-existing table.
|
protected void |
bulkLoadPhase(AsyncClusterConnection conn,
TableName tableName,
Deque<BulkLoadHFiles.LoadQueueItem> queue,
org.apache.hbase.thirdparty.com.google.common.collect.Multimap<ByteBuffer,BulkLoadHFiles.LoadQueueItem> regionGroups,
boolean copyFiles,
Map<BulkLoadHFiles.LoadQueueItem,ByteBuffer> item2RegionMap)
This takes the LQI's grouped by likely regions and attempts to bulk load them.
|
private boolean |
checkHFilesCountPerRegionPerFamily(org.apache.hbase.thirdparty.com.google.common.collect.Multimap<ByteBuffer,BulkLoadHFiles.LoadQueueItem> regionGroups) |
private void |
checkRegionIndexValid(int idx,
List<Pair<byte[],byte[]>> startEndKeys,
TableName tableName)
We can consider there is a region hole or overlap in the following conditions.
|
private void |
cleanup(AsyncClusterConnection conn,
TableName tableName,
Deque<BulkLoadHFiles.LoadQueueItem> queue,
ExecutorService pool) |
private static void |
copyHFileHalf(org.apache.hadoop.conf.Configuration conf,
org.apache.hadoop.fs.Path inFile,
org.apache.hadoop.fs.Path outFile,
Reference reference,
ColumnFamilyDescriptor familyDescriptor)
Copy half of an HFile into a new HFile.
|
private ExecutorService |
createExecutorService() |
private void |
createTable(TableName tableName,
org.apache.hadoop.fs.Path hfofDir,
AsyncAdmin admin)
If the table is created for the first time, then "completebulkload" reads the files twice.
|
void |
disableReplication()
Disables replication for all bulkloads done via this instance, when bulkload replication is
configured.
|
private static void |
discoverLoadQueue(org.apache.hadoop.conf.Configuration conf,
Deque<BulkLoadHFiles.LoadQueueItem> ret,
org.apache.hadoop.fs.Path hfofDir,
boolean validateHFile)
Walk the given directory for all HFiles, and return a Queue containing all such files.
|
private Map<BulkLoadHFiles.LoadQueueItem,ByteBuffer> |
doBulkLoad(AsyncClusterConnection conn,
TableName tableName,
Map<byte[],List<org.apache.hadoop.fs.Path>> map,
boolean silence,
boolean copyFile)
Perform a bulk load of the given map of families to hfiles into the given pre-existing table.
|
private Map<BulkLoadHFiles.LoadQueueItem,ByteBuffer> |
doBulkLoad(AsyncClusterConnection conn,
TableName tableName,
org.apache.hadoop.fs.Path hfofDir,
boolean silence,
boolean copyFile)
Perform a bulk load of the given directory into the given pre-existing table.
|
private int |
getRegionIndex(List<Pair<byte[],byte[]>> startEndKeys,
byte[] key) |
private String |
getUniqueName() |
private Map<byte[],Collection<BulkLoadHFiles.LoadQueueItem>> |
groupByFamilies(Collection<BulkLoadHFiles.LoadQueueItem> itemsInRegion) |
protected Pair<List<BulkLoadHFiles.LoadQueueItem>,String> |
groupOrSplit(AsyncClusterConnection conn,
TableName tableName,
org.apache.hbase.thirdparty.com.google.common.collect.Multimap<ByteBuffer,BulkLoadHFiles.LoadQueueItem> regionGroups,
BulkLoadHFiles.LoadQueueItem item,
List<Pair<byte[],byte[]>> startEndKeys)
Attempt to assign the given load queue item into its target region group.
|
private Pair<org.apache.hbase.thirdparty.com.google.common.collect.Multimap<ByteBuffer,BulkLoadHFiles.LoadQueueItem>,Set<String>> |
groupOrSplitPhase(AsyncClusterConnection conn,
TableName tableName,
ExecutorService pool,
Deque<BulkLoadHFiles.LoadQueueItem> queue,
List<Pair<byte[],byte[]>> startEndKeys) |
static byte[][] |
inferBoundaries(SortedMap<byte[],Integer> bdryMap)
Infers region boundaries for a new table.
|
void |
initialize() |
private boolean |
isAlwaysCopyFiles() |
private boolean |
isCreateTable() |
boolean |
isReplicationDisabled()
Returns true if replication has been disabled.
|
private boolean |
isSilence() |
void |
loadHFileQueue(AsyncClusterConnection conn,
TableName tableName,
Deque<BulkLoadHFiles.LoadQueueItem> queue,
boolean copyFiles)
Used by the replication sink to load the hfiles from the source cluster.
|
static void |
main(String[] args) |
private Map<BulkLoadHFiles.LoadQueueItem,ByteBuffer> |
performBulkLoad(AsyncClusterConnection conn,
TableName tableName,
Deque<BulkLoadHFiles.LoadQueueItem> queue,
ExecutorService pool,
boolean copyFile) |
private static void |
populateLoadQueue(Deque<BulkLoadHFiles.LoadQueueItem> ret,
Map<byte[],List<org.apache.hadoop.fs.Path>> map)
Populate the Queue with given HFiles
|
static void |
prepareHFileQueue(AsyncClusterConnection conn,
TableName tableName,
Map<byte[],List<org.apache.hadoop.fs.Path>> map,
Deque<BulkLoadHFiles.LoadQueueItem> queue,
boolean silence)
Prepare a collection of
LoadQueueItem from the source hfiles listed in the passed map of family to hfiles, and
validate whether the prepared queue has all the valid table column
families in it. |
static void |
prepareHFileQueue(org.apache.hadoop.conf.Configuration conf,
AsyncClusterConnection conn,
TableName tableName,
org.apache.hadoop.fs.Path hfilesDir,
Deque<BulkLoadHFiles.LoadQueueItem> queue,
boolean validateHFile,
boolean silence)
Prepare a collection of
LoadQueueItem from the list of source hfiles contained in the
passed directory, and validate whether the prepared queue has all the valid table column
families in it. |
int |
run(String[] args) |
void |
setBulkToken(String bulkToken) |
void |
setClusterIds(List<String> clusterIds) |
private static boolean |
shouldCopyHFileMetaKey(byte[] key) |
private List<BulkLoadHFiles.LoadQueueItem> |
splitStoreFile(BulkLoadHFiles.LoadQueueItem item,
TableDescriptor tableDesc,
byte[] splitKey) |
(package private) static void |
splitStoreFile(org.apache.hadoop.conf.Configuration conf,
org.apache.hadoop.fs.Path inFile,
ColumnFamilyDescriptor familyDesc,
byte[] splitKey,
org.apache.hadoop.fs.Path bottomOut,
org.apache.hadoop.fs.Path topOut)
Split a storefile into a top and bottom half, maintaining the metadata, recreating bloom
filters, etc.
|
private void |
tableExists(AsyncClusterConnection conn,
TableName tableName) |
private void |
throwAndLogTableNotFoundException(TableName tn) |
protected CompletableFuture<Collection<BulkLoadHFiles.LoadQueueItem>> |
tryAtomicRegionLoad(AsyncClusterConnection conn,
TableName tableName,
boolean copyFiles,
byte[] first,
Collection<BulkLoadHFiles.LoadQueueItem> lqis)
Attempts to do an atomic load of many hfiles into a region.
|
private void |
usage() |
private static void |
validateFamiliesInHFiles(TableDescriptor tableDesc,
Deque<BulkLoadHFiles.LoadQueueItem> queue,
boolean silence)
Checks whether there is any invalid family name in HFiles to be bulk loaded.
|
private static <TFamily> void |
visitBulkHFiles(org.apache.hadoop.fs.FileSystem fs,
org.apache.hadoop.fs.Path bulkDir,
BulkLoadHFilesTool.BulkHFileVisitor<TFamily> visitor,
boolean validateHFile)
Iterate over the bulkDir hfiles.
|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
create
private static final org.slf4j.Logger LOG
public static final String NAME
private static final String VALIDATE_HFILES
public static final String BULK_LOAD_HFILES_BY_FAMILY
public static final String FAIL_IF_NEED_SPLIT_HFILE
static final String TMP_DIR
private int maxFilesPerRegionPerFamily
private boolean assignSeqIds
private boolean bulkLoadByFamily
private FsDelegationToken fsDelegationToken
private UserProvider userProvider
private int nrThreads
private final AtomicInteger numRetries
private List<String> clusterIds
private boolean replicate
private boolean failIfNeedSplitHFile
public BulkLoadHFilesTool(org.apache.hadoop.conf.Configuration conf)
public void initialize()
private ExecutorService createExecutorService()
private boolean isCreateTable()
private boolean isSilence()
private boolean isAlwaysCopyFiles()
private static boolean shouldCopyHFileMetaKey(byte[] key)
private static void validateFamiliesInHFiles(TableDescriptor tableDesc, Deque<BulkLoadHFiles.LoadQueueItem> queue, boolean silence) throws IOException
IOException
private static void populateLoadQueue(Deque<BulkLoadHFiles.LoadQueueItem> ret, Map<byte[],List<org.apache.hadoop.fs.Path>> map)
private static <TFamily> void visitBulkHFiles(org.apache.hadoop.fs.FileSystem fs, org.apache.hadoop.fs.Path bulkDir, BulkLoadHFilesTool.BulkHFileVisitor<TFamily> visitor, boolean validateHFile) throws IOException
Validation of hfiles can be skipped by setting
VALIDATE_HFILES
to false.
IOException
private static void discoverLoadQueue(org.apache.hadoop.conf.Configuration conf, Deque<BulkLoadHFiles.LoadQueueItem> ret, org.apache.hadoop.fs.Path hfofDir, boolean validateHFile) throws IOException
IOException
public static void prepareHFileQueue(AsyncClusterConnection conn, TableName tableName, Map<byte[],List<org.apache.hadoop.fs.Path>> map, Deque<BulkLoadHFiles.LoadQueueItem> queue, boolean silence) throws IOException
Prepare a collection of
LoadQueueItem
from the source hfiles listed in the passed map of family to hfiles, and
validate whether the prepared queue has all the valid table column
families in it.
map
- map of family to List of hfiles
tableName
- table to which hfiles should be loaded
queue
- queue which needs to be loaded into the table
silence
- true to ignore unmatched column families
IOException
- If any I/O or network error occurred
public static void prepareHFileQueue(org.apache.hadoop.conf.Configuration conf, AsyncClusterConnection conn, TableName tableName, org.apache.hadoop.fs.Path hfilesDir, Deque<BulkLoadHFiles.LoadQueueItem> queue, boolean validateHFile, boolean silence) throws IOException
Prepare a collection of
LoadQueueItem
from the list of source hfiles contained in the
passed directory, and validate whether the prepared queue has all the valid table column
families in it.
hfilesDir
- directory containing list of hfiles to be loaded into the table
queue
- queue which needs to be loaded into the table
validateHFile
- if true, hfiles will be validated for their format
silence
- true to ignore unmatched column families
IOException
- If any I/O or network error occurred
public void loadHFileQueue(AsyncClusterConnection conn, TableName tableName, Deque<BulkLoadHFiles.LoadQueueItem> queue, boolean copyFiles) throws IOException
conn
- Connection to use
tableName
- Table to which these hfiles should be loaded
queue
- LoadQueueItem
entries holding the hfiles yet to be loaded
IOException
@InterfaceAudience.Private protected CompletableFuture<Collection<BulkLoadHFiles.LoadQueueItem>> tryAtomicRegionLoad(AsyncClusterConnection conn, TableName tableName, boolean copyFiles, byte[] first, Collection<BulkLoadHFiles.LoadQueueItem> lqis)
conn
- Connection to use
tableName
- Table to which these hfiles should be loaded
copyFiles
- whether to replicate to peer cluster while bulkloading
first
- the start key of the region
lqis
- hfiles that should be loaded
@InterfaceAudience.Private protected void bulkLoadPhase(AsyncClusterConnection conn, TableName tableName, Deque<BulkLoadHFiles.LoadQueueItem> queue, org.apache.hbase.thirdparty.com.google.common.collect.Multimap<ByteBuffer,BulkLoadHFiles.LoadQueueItem> regionGroups, boolean copyFiles, Map<BulkLoadHFiles.LoadQueueItem,ByteBuffer> item2RegionMap) throws IOException
IOException
private Map<byte[],Collection<BulkLoadHFiles.LoadQueueItem>> groupByFamilies(Collection<BulkLoadHFiles.LoadQueueItem> itemsInRegion)
private boolean checkHFilesCountPerRegionPerFamily(org.apache.hbase.thirdparty.com.google.common.collect.Multimap<ByteBuffer,BulkLoadHFiles.LoadQueueItem> regionGroups)
private Pair<org.apache.hbase.thirdparty.com.google.common.collect.Multimap<ByteBuffer,BulkLoadHFiles.LoadQueueItem>,Set<String>> groupOrSplitPhase(AsyncClusterConnection conn, TableName tableName, ExecutorService pool, Deque<BulkLoadHFiles.LoadQueueItem> queue, List<Pair<byte[],byte[]>> startEndKeys) throws IOException
conn
- the HBase cluster connection
tableName
- the table name of the table to load into
pool
- the ExecutorService
queue
- the queue for LoadQueueItem
startEndKeys
- start and end keys
IOException
private String getUniqueName()
private List<BulkLoadHFiles.LoadQueueItem> splitStoreFile(BulkLoadHFiles.LoadQueueItem item, TableDescriptor tableDesc, byte[] splitKey) throws IOException
IOException
private int getRegionIndex(List<Pair<byte[],byte[]>> startEndKeys, byte[] key)
startEndKeys
- the start/end keys of the regions belonging to this table; the list is in ascending
order by start key
key
- the key for which to find the region it belongs to
private void checkRegionIndexValid(int idx, List<Pair<byte[],byte[]>> startEndKeys, TableName tableName) throws IOException
IOException
@InterfaceAudience.Private protected Pair<List<BulkLoadHFiles.LoadQueueItem>,String> groupOrSplit(AsyncClusterConnection conn, TableName tableName, org.apache.hbase.thirdparty.com.google.common.collect.Multimap<ByteBuffer,BulkLoadHFiles.LoadQueueItem> regionGroups, BulkLoadHFiles.LoadQueueItem item, List<Pair<byte[],byte[]>> startEndKeys) throws IOException
IOException
- if an IO failure is encountered
@InterfaceAudience.Private static void splitStoreFile(org.apache.hadoop.conf.Configuration conf, org.apache.hadoop.fs.Path inFile, ColumnFamilyDescriptor familyDesc, byte[] splitKey, org.apache.hadoop.fs.Path bottomOut, org.apache.hadoop.fs.Path topOut) throws IOException
IOException
private static void copyHFileHalf(org.apache.hadoop.conf.Configuration conf, org.apache.hadoop.fs.Path inFile, org.apache.hadoop.fs.Path outFile, Reference reference, ColumnFamilyDescriptor familyDescriptor) throws IOException
IOException
public static byte[][] inferBoundaries(SortedMap<byte[],Integer> bdryMap)
Algo:
private void createTable(TableName tableName, org.apache.hadoop.fs.Path hfofDir, AsyncAdmin admin) throws IOException
IOException
private Map<BulkLoadHFiles.LoadQueueItem,ByteBuffer> performBulkLoad(AsyncClusterConnection conn, TableName tableName, Deque<BulkLoadHFiles.LoadQueueItem> queue, ExecutorService pool, boolean copyFile) throws IOException
IOException
private void cleanup(AsyncClusterConnection conn, TableName tableName, Deque<BulkLoadHFiles.LoadQueueItem> queue, ExecutorService pool) throws IOException
IOException
private Map<BulkLoadHFiles.LoadQueueItem,ByteBuffer> doBulkLoad(AsyncClusterConnection conn, TableName tableName, Map<byte[],List<org.apache.hadoop.fs.Path>> map, boolean silence, boolean copyFile) throws IOException
map
- map of family to List of hfiles
tableName
- table to load the hfiles into
silence
- true to ignore unmatched column families
copyFile
- always copy hfiles if true
IOException
private Map<BulkLoadHFiles.LoadQueueItem,ByteBuffer> doBulkLoad(AsyncClusterConnection conn, TableName tableName, org.apache.hadoop.fs.Path hfofDir, boolean silence, boolean copyFile) throws IOException
tableName
- table to load the hfiles into
hfofDir
- the directory that was provided as the output path of a job using
HFileOutputFormat
silence
- true to ignore unmatched column families
copyFile
- always copy hfiles if true
IOException
public Map<BulkLoadHFiles.LoadQueueItem,ByteBuffer> bulkLoad(TableName tableName, Map<byte[],List<org.apache.hadoop.fs.Path>> family2Files) throws IOException
BulkLoadHFiles
bulkLoad
in interface BulkLoadHFiles
tableName
- the table to load into
family2Files
- map of family to List of hfiles
TableNotFoundException
- if table does not yet exist
IOException
public Map<BulkLoadHFiles.LoadQueueItem,ByteBuffer> bulkLoad(TableName tableName, org.apache.hadoop.fs.Path dir) throws IOException
BulkLoadHFiles
bulkLoad
in interface BulkLoadHFiles
tableName
- the table to load into
dir
- the directory that was provided as the output path of a job using
HFileOutputFormat
TableNotFoundException
- if table does not yet exist
IOException
private void tableExists(AsyncClusterConnection conn, TableName tableName) throws IOException
TableNotFoundException
- if table does not exist.
IOException
private void throwAndLogTableNotFoundException(TableName tn) throws TableNotFoundException
TableNotFoundException
public void setBulkToken(String bulkToken)
public void setClusterIds(List<String> clusterIds)
private void usage()
public int run(String[] args) throws Exception
run
in interface org.apache.hadoop.util.Tool
Exception
public void disableReplication()
BulkLoadHFiles
disableReplication
in interface BulkLoadHFiles
public boolean isReplicationDisabled()
BulkLoadHFiles
isReplicationDisabled
in interface BulkLoadHFiles
Copyright © 2007–2020 The Apache Software Foundation. All rights reserved.