001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.util;
019
020import java.io.FileNotFoundException;
021import java.io.IOException;
022import java.util.Comparator;
023import java.util.List;
024import java.util.Map;
025import java.util.TreeMap;
026import java.util.concurrent.ConcurrentHashMap;
027import java.util.function.Function;
028import java.util.regex.Matcher;
029import java.util.regex.Pattern;
030
031import edu.umd.cs.findbugs.annotations.Nullable;
032import org.apache.commons.lang3.NotImplementedException;
033import org.apache.hadoop.conf.Configuration;
034import org.apache.hadoop.fs.FSDataInputStream;
035import org.apache.hadoop.fs.FSDataOutputStream;
036import org.apache.hadoop.fs.FileStatus;
037import org.apache.hadoop.fs.FileSystem;
038import org.apache.hadoop.fs.Path;
039import org.apache.hadoop.fs.PathFilter;
040import org.apache.hadoop.hbase.client.CoprocessorDescriptorBuilder;
041import org.apache.hadoop.hbase.coprocessor.MultiRowMutationEndpoint;
042import org.apache.yetus.audience.InterfaceAudience;
043import org.slf4j.Logger;
044import org.slf4j.LoggerFactory;
045import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
046import org.apache.hadoop.hbase.client.TableDescriptor;
047import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
048import org.apache.hadoop.hbase.Coprocessor;
049import org.apache.hadoop.hbase.exceptions.DeserializationException;
050import org.apache.hadoop.hbase.HConstants;
051import org.apache.hadoop.hbase.regionserver.BloomType;
052import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
053import org.apache.hbase.thirdparty.com.google.common.primitives.Ints;
054import org.apache.hadoop.hbase.TableDescriptors;
055import org.apache.hadoop.hbase.TableInfoMissingException;
056import org.apache.hadoop.hbase.TableName;
057
058/**
059 * Implementation of {@link TableDescriptors} that reads descriptors from the
060 * passed filesystem.  It expects descriptors to be in a file in the
061 * {@link #TABLEINFO_DIR} subdir of the table's directory in FS.  Can be read-only
062 *  -- i.e. does not modify the filesystem or can be read and write.
063 *
064 * <p>Also has utility for keeping up the table descriptors tableinfo file.
065 * The table schema file is kept in the {@link #TABLEINFO_DIR} subdir
066 * of the table directory in the filesystem.
067 * It has a {@link #TABLEINFO_FILE_PREFIX} and then a suffix that is the
068 * edit sequenceid: e.g. <code>.tableinfo.0000000003</code>.  This sequenceid
069 * is always increasing.  It starts at zero.  The table schema file with the
070 * highest sequenceid has the most recent schema edit. Usually there is one file
071 * only, the most recent but there may be short periods where there are more
072 * than one file. Old files are eventually cleaned.  Presumption is that there
073 * will not be lots of concurrent clients making table schema edits.  If so,
074 * the below needs a bit of a reworking and perhaps some supporting api in hdfs.
075 */
076@InterfaceAudience.Private
077public class FSTableDescriptors implements TableDescriptors {
078  private static final Logger LOG = LoggerFactory.getLogger(FSTableDescriptors.class);
079  private final FileSystem fs;
080  private final Path rootdir;
081  private final boolean fsreadonly;
082  private volatile boolean usecache;
083  private volatile boolean fsvisited;
084
085  @VisibleForTesting
086  long cachehits = 0;
087  @VisibleForTesting
088  long invocations = 0;
089
090  /**
091   * The file name prefix used to store HTD in HDFS
092   */
093  static final String TABLEINFO_FILE_PREFIX = ".tableinfo";
094  static final String TABLEINFO_DIR = ".tabledesc";
095  static final String TMP_DIR = ".tmp";
096
097  // This cache does not age out the old stuff.  Thinking is that the amount
098  // of data we keep up in here is so small, no need to do occasional purge.
099  // TODO.
100  private final Map<TableName, TableDescriptor> cache = new ConcurrentHashMap<>();
101
102  /**
103   * Table descriptor for <code>hbase:meta</code> catalog table
104   */
105  private final TableDescriptor metaTableDescriptor;
106
107  /**
108   * Construct a FSTableDescriptors instance using the hbase root dir of the given
109   * conf and the filesystem where that root dir lives.
110   * This instance can do write operations (is not read only).
111   */
112  public FSTableDescriptors(final Configuration conf) throws IOException {
113    this(conf, FSUtils.getCurrentFileSystem(conf), FSUtils.getRootDir(conf));
114  }
115
116  public FSTableDescriptors(final Configuration conf, final FileSystem fs, final Path rootdir)
117          throws IOException {
118    this(conf, fs, rootdir, false, true);
119  }
120
121  /**
122   * @param fsreadonly True if we are read-only when it comes to filesystem
123   *                   operations; i.e. on remove, we do not do delete in fs.
124   */
125  public FSTableDescriptors(final Configuration conf, final FileSystem fs,
126                            final Path rootdir, final boolean fsreadonly, final boolean usecache) throws IOException {
127    this(conf, fs, rootdir, fsreadonly, usecache, null);
128  }
129
130  /**
131   * @param fsreadonly True if we are read-only when it comes to filesystem
132   *                   operations; i.e. on remove, we do not do delete in fs.
133   * @param metaObserver Used by HMaster. It need to modify the META_REPLICAS_NUM for meta table descriptor.
134   *                     see HMaster#finishActiveMasterInitialization
135   *                     TODO: This is a workaround. Should remove this ugly code...
136   */
137  public FSTableDescriptors(final Configuration conf, final FileSystem fs,
138                            final Path rootdir, final boolean fsreadonly, final boolean usecache,
139                            Function<TableDescriptorBuilder, TableDescriptorBuilder> metaObserver) throws IOException {
140    this.fs = fs;
141    this.rootdir = rootdir;
142    this.fsreadonly = fsreadonly;
143    this.usecache = usecache;
144    this.metaTableDescriptor = metaObserver == null ? createMetaTableDescriptor(conf)
145          : metaObserver.apply(createMetaTableDescriptorBuilder(conf)).build();
146  }
147
148  @VisibleForTesting
149  public static TableDescriptorBuilder createMetaTableDescriptorBuilder(final Configuration conf) throws IOException {
150    // TODO We used to set CacheDataInL1 for META table. When we have BucketCache in file mode, now
151    // the META table data goes to File mode BC only. Test how that affect the system. If too much,
152    // we have to rethink about adding back the setCacheDataInL1 for META table CFs.
153    return TableDescriptorBuilder.newBuilder(TableName.META_TABLE_NAME)
154      .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(HConstants.CATALOG_FAMILY)
155        .setMaxVersions(conf.getInt(HConstants.HBASE_META_VERSIONS,
156          HConstants.DEFAULT_HBASE_META_VERSIONS))
157        .setInMemory(true)
158        .setBlocksize(conf.getInt(HConstants.HBASE_META_BLOCK_SIZE,
159          HConstants.DEFAULT_HBASE_META_BLOCK_SIZE))
160        .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
161        // Disable blooms for meta.  Needs work.  Seems to mess w/ getClosestOrBefore.
162        .setBloomFilterType(BloomType.NONE)
163        .build())
164      .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(HConstants.TABLE_FAMILY)
165        .setMaxVersions(conf.getInt(HConstants.HBASE_META_VERSIONS,
166          HConstants.DEFAULT_HBASE_META_VERSIONS))
167        .setInMemory(true)
168        .setBlocksize(8 * 1024)
169        .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
170        // Disable blooms for meta.  Needs work.  Seems to mess w/ getClosestOrBefore.
171        .setBloomFilterType(BloomType.NONE)
172        .build())
173      .setCoprocessor(CoprocessorDescriptorBuilder.newBuilder(
174        MultiRowMutationEndpoint.class.getName())
175        .setPriority(Coprocessor.PRIORITY_SYSTEM)
176        .build());
177  }
178
179  @VisibleForTesting
180  public static TableDescriptor createMetaTableDescriptor(final Configuration conf)
181      throws IOException {
182    return createMetaTableDescriptorBuilder(conf).build();
183  }
184
185  @Override
186  public void setCacheOn() throws IOException {
187    this.cache.clear();
188    this.usecache = true;
189  }
190
191  @Override
192  public void setCacheOff() throws IOException {
193    this.usecache = false;
194    this.cache.clear();
195  }
196
197  @VisibleForTesting
198  public boolean isUsecache() {
199    return this.usecache;
200  }
201
202  /**
203   * Get the current table descriptor for the given table, or null if none exists.
204   *
205   * Uses a local cache of the descriptor but still checks the filesystem on each call
206   * to see if a newer file has been created since the cached one was read.
207   */
208  @Override
209  @Nullable
210  public TableDescriptor get(final TableName tablename)
211  throws IOException {
212    invocations++;
213    if (TableName.META_TABLE_NAME.equals(tablename)) {
214      cachehits++;
215      return metaTableDescriptor;
216    }
217    // hbase:meta is already handled. If some one tries to get the descriptor for
218    // .logs, .oldlogs or .corrupt throw an exception.
219    if (HConstants.HBASE_NON_USER_TABLE_DIRS.contains(tablename.getNameAsString())) {
220       throw new IOException("No descriptor found for non table = " + tablename);
221    }
222
223    if (usecache) {
224      // Look in cache of descriptors.
225      TableDescriptor cachedtdm = this.cache.get(tablename);
226      if (cachedtdm != null) {
227        cachehits++;
228        return cachedtdm;
229      }
230    }
231    TableDescriptor tdmt = null;
232    try {
233      tdmt = getTableDescriptorFromFs(fs, rootdir, tablename);
234    } catch (NullPointerException e) {
235      LOG.debug("Exception during readTableDecriptor. Current table name = "
236          + tablename, e);
237    } catch (TableInfoMissingException e) {
238      // ignore. This is regular operation
239    } catch (IOException ioe) {
240      LOG.debug("Exception during readTableDecriptor. Current table name = "
241          + tablename, ioe);
242    }
243    // last HTD written wins
244    if (usecache && tdmt != null) {
245      this.cache.put(tablename, tdmt);
246    }
247
248    return tdmt;
249  }
250
251  /**
252   * Returns a map from table name to table descriptor for all tables.
253   */
254  @Override
255  public Map<String, TableDescriptor> getAll()
256  throws IOException {
257    Map<String, TableDescriptor> tds = new TreeMap<>();
258
259    if (fsvisited && usecache) {
260      for (Map.Entry<TableName, TableDescriptor> entry: this.cache.entrySet()) {
261        tds.put(entry.getKey().toString(), entry.getValue());
262      }
263      // add hbase:meta to the response
264      tds.put(this.metaTableDescriptor.getTableName().getNameAsString(), metaTableDescriptor);
265    } else {
266      LOG.trace("Fetching table descriptors from the filesystem.");
267      boolean allvisited = true;
268      for (Path d : FSUtils.getTableDirs(fs, rootdir)) {
269        TableDescriptor htd = null;
270        try {
271          htd = get(FSUtils.getTableName(d));
272        } catch (FileNotFoundException fnfe) {
273          // inability of retrieving one HTD shouldn't stop getting the remaining
274          LOG.warn("Trouble retrieving htd", fnfe);
275        }
276        if (htd == null) {
277          allvisited = false;
278          continue;
279        } else {
280          tds.put(htd.getTableName().getNameAsString(), htd);
281        }
282        fsvisited = allvisited;
283      }
284    }
285    return tds;
286  }
287
288  /**
289    * Find descriptors by namespace.
290    * @see #get(org.apache.hadoop.hbase.TableName)
291    */
292  @Override
293  public Map<String, TableDescriptor> getByNamespace(String name)
294  throws IOException {
295    Map<String, TableDescriptor> htds = new TreeMap<>();
296    List<Path> tableDirs =
297        FSUtils.getLocalTableDirs(fs, FSUtils.getNamespaceDir(rootdir, name));
298    for (Path d: tableDirs) {
299      TableDescriptor htd = null;
300      try {
301        htd = get(FSUtils.getTableName(d));
302      } catch (FileNotFoundException fnfe) {
303        // inability of retrieving one HTD shouldn't stop getting the remaining
304        LOG.warn("Trouble retrieving htd", fnfe);
305      }
306      if (htd == null) continue;
307      htds.put(FSUtils.getTableName(d).getNameAsString(), htd);
308    }
309    return htds;
310  }
311
312  /**
313   * Adds (or updates) the table descriptor to the FileSystem
314   * and updates the local cache with it.
315   */
316  @Override
317  public void add(TableDescriptor htd) throws IOException {
318    if (fsreadonly) {
319      throw new NotImplementedException("Cannot add a table descriptor - in read only mode");
320    }
321    TableName tableName = htd.getTableName();
322    if (TableName.META_TABLE_NAME.equals(tableName)) {
323      throw new NotImplementedException(HConstants.NOT_IMPLEMENTED);
324    }
325    if (HConstants.HBASE_NON_USER_TABLE_DIRS.contains(tableName.getNameAsString())) {
326      throw new NotImplementedException(
327          "Cannot add a table descriptor for a reserved subdirectory name: "
328              + htd.getTableName().getNameAsString());
329    }
330    updateTableDescriptor(htd);
331  }
332
333  /**
334   * Removes the table descriptor from the local cache and returns it.
335   * If not in read only mode, it also deletes the entire table directory(!)
336   * from the FileSystem.
337   */
338  @Override
339  public TableDescriptor remove(final TableName tablename)
340  throws IOException {
341    if (fsreadonly) {
342      throw new NotImplementedException("Cannot remove a table descriptor - in read only mode");
343    }
344    Path tabledir = getTableDir(tablename);
345    if (this.fs.exists(tabledir)) {
346      if (!this.fs.delete(tabledir, true)) {
347        throw new IOException("Failed delete of " + tabledir.toString());
348      }
349    }
350    TableDescriptor descriptor = this.cache.remove(tablename);
351    return descriptor;
352  }
353
354  /**
355   * Checks if a current table info file exists for the given table
356   *
357   * @param tableName name of table
358   * @return true if exists
359   * @throws IOException
360   */
361  public boolean isTableInfoExists(TableName tableName) throws IOException {
362    return getTableInfoPath(tableName) != null;
363  }
364
365  /**
366   * Find the most current table info file for the given table in the hbase root directory.
367   * @return The file status of the current table info file or null if it does not exist
368   */
369  private FileStatus getTableInfoPath(final TableName tableName) throws IOException {
370    Path tableDir = getTableDir(tableName);
371    return getTableInfoPath(tableDir);
372  }
373
374  private FileStatus getTableInfoPath(Path tableDir)
375  throws IOException {
376    return getTableInfoPath(fs, tableDir, !fsreadonly);
377  }
378
379  /**
380   * Find the most current table info file for the table located in the given table directory.
381   *
382   * Looks within the {@link #TABLEINFO_DIR} subdirectory of the given directory for any table info
383   * files and takes the 'current' one - meaning the one with the highest sequence number if present
384   * or no sequence number at all if none exist (for backward compatibility from before there
385   * were sequence numbers).
386   *
387   * @return The file status of the current table info file or null if it does not exist
388   * @throws IOException
389   */
390  public static FileStatus getTableInfoPath(FileSystem fs, Path tableDir)
391  throws IOException {
392    return getTableInfoPath(fs, tableDir, false);
393  }
394
395  /**
396   * Find the most current table info file for the table in the given table directory.
397   *
398   * Looks within the {@link #TABLEINFO_DIR} subdirectory of the given directory for any table info
399   * files and takes the 'current' one - meaning the one with the highest sequence number if
400   * present or no sequence number at all if none exist (for backward compatibility from before
401   * there were sequence numbers).
402   * If there are multiple table info files found and removeOldFiles is true it also deletes the
403   * older files.
404   *
405   * @return The file status of the current table info file or null if none exist
406   * @throws IOException
407   */
408  private static FileStatus getTableInfoPath(FileSystem fs, Path tableDir, boolean removeOldFiles)
409  throws IOException {
410    Path tableInfoDir = new Path(tableDir, TABLEINFO_DIR);
411    return getCurrentTableInfoStatus(fs, tableInfoDir, removeOldFiles);
412  }
413
414  /**
415   * Find the most current table info file in the given directory
416   *
417   * Looks within the given directory for any table info files
418   * and takes the 'current' one - meaning the one with the highest sequence number if present
419   * or no sequence number at all if none exist (for backward compatibility from before there
420   * were sequence numbers).
421   * If there are multiple possible files found
422   * and the we're not in read only mode it also deletes the older files.
423   *
424   * @return The file status of the current table info file or null if it does not exist
425   * @throws IOException
426   */
427  // only visible for FSTableDescriptorMigrationToSubdir, can be removed with that
428  static FileStatus getCurrentTableInfoStatus(FileSystem fs, Path dir, boolean removeOldFiles)
429  throws IOException {
430    FileStatus [] status = FSUtils.listStatus(fs, dir, TABLEINFO_PATHFILTER);
431    if (status == null || status.length < 1) return null;
432    FileStatus mostCurrent = null;
433    for (FileStatus file : status) {
434      if (mostCurrent == null || TABLEINFO_FILESTATUS_COMPARATOR.compare(file, mostCurrent) < 0) {
435        mostCurrent = file;
436      }
437    }
438    if (removeOldFiles && status.length > 1) {
439      // Clean away old versions
440      for (FileStatus file : status) {
441        Path path = file.getPath();
442        if (!file.equals(mostCurrent)) {
443          if (!fs.delete(file.getPath(), false)) {
444            LOG.warn("Failed cleanup of " + path);
445          } else {
446            LOG.debug("Cleaned up old tableinfo file " + path);
447          }
448        }
449      }
450    }
451    return mostCurrent;
452  }
453
454  /**
455   * Compare {@link FileStatus} instances by {@link Path#getName()}. Returns in
456   * reverse order.
457   */
458  @VisibleForTesting
459  static final Comparator<FileStatus> TABLEINFO_FILESTATUS_COMPARATOR =
460  new Comparator<FileStatus>() {
461    @Override
462    public int compare(FileStatus left, FileStatus right) {
463      return right.compareTo(left);
464    }};
465
466  /**
467   * Return the table directory in HDFS
468   */
469  @VisibleForTesting Path getTableDir(final TableName tableName) {
470    return FSUtils.getTableDir(rootdir, tableName);
471  }
472
473  private static final PathFilter TABLEINFO_PATHFILTER = new PathFilter() {
474    @Override
475    public boolean accept(Path p) {
476      // Accept any file that starts with TABLEINFO_NAME
477      return p.getName().startsWith(TABLEINFO_FILE_PREFIX);
478    }};
479
480  /**
481   * Width of the sequenceid that is a suffix on a tableinfo file.
482   */
483  @VisibleForTesting static final int WIDTH_OF_SEQUENCE_ID = 10;
484
485  /*
486   * @param number Number to use as suffix.
487   * @return Returns zero-prefixed decimal version of passed
488   * number (Does absolute in case number is negative).
489   */
490  private static String formatTableInfoSequenceId(final int number) {
491    byte [] b = new byte[WIDTH_OF_SEQUENCE_ID];
492    int d = Math.abs(number);
493    for (int i = b.length - 1; i >= 0; i--) {
494      b[i] = (byte)((d % 10) + '0');
495      d /= 10;
496    }
497    return Bytes.toString(b);
498  }
499
500  /**
501   * Regex to eat up sequenceid suffix on a .tableinfo file.
502   * Use regex because may encounter oldstyle .tableinfos where there is no
503   * sequenceid on the end.
504   */
505  private static final Pattern TABLEINFO_FILE_REGEX =
506    Pattern.compile(TABLEINFO_FILE_PREFIX + "(\\.([0-9]{" + WIDTH_OF_SEQUENCE_ID + "}))?$");
507
508  /**
509   * @param p Path to a <code>.tableinfo</code> file.
510   * @return The current editid or 0 if none found.
511   */
512  @VisibleForTesting static int getTableInfoSequenceId(final Path p) {
513    if (p == null) return 0;
514    Matcher m = TABLEINFO_FILE_REGEX.matcher(p.getName());
515    if (!m.matches()) throw new IllegalArgumentException(p.toString());
516    String suffix = m.group(2);
517    if (suffix == null || suffix.length() <= 0) return 0;
518    return Integer.parseInt(m.group(2));
519  }
520
521  /**
522   * @param sequenceid
523   * @return Name of tableinfo file.
524   */
525  @VisibleForTesting static String getTableInfoFileName(final int sequenceid) {
526    return TABLEINFO_FILE_PREFIX + "." + formatTableInfoSequenceId(sequenceid);
527  }
528
529  /**
530   * Returns the latest table descriptor for the given table directly from the file system
531   * if it exists, bypassing the local cache.
532   * Returns null if it's not found.
533   */
534  public static TableDescriptor getTableDescriptorFromFs(FileSystem fs,
535      Path hbaseRootDir, TableName tableName) throws IOException {
536    Path tableDir = FSUtils.getTableDir(hbaseRootDir, tableName);
537    return getTableDescriptorFromFs(fs, tableDir);
538  }
539
540  /**
541   * Returns the latest table descriptor for the table located at the given directory
542   * directly from the file system if it exists.
543   * @throws TableInfoMissingException if there is no descriptor
544   */
545  public static TableDescriptor getTableDescriptorFromFs(FileSystem fs, Path tableDir)
546  throws IOException {
547    FileStatus status = getTableInfoPath(fs, tableDir, false);
548    if (status == null) {
549      throw new TableInfoMissingException("No table descriptor file under " + tableDir);
550    }
551    return readTableDescriptor(fs, status);
552  }
553
554  private static TableDescriptor readTableDescriptor(FileSystem fs, FileStatus status)
555      throws IOException {
556    int len = Ints.checkedCast(status.getLen());
557    byte [] content = new byte[len];
558    FSDataInputStream fsDataInputStream = fs.open(status.getPath());
559    try {
560      fsDataInputStream.readFully(content);
561    } finally {
562      fsDataInputStream.close();
563    }
564    TableDescriptor htd = null;
565    try {
566      htd = TableDescriptorBuilder.parseFrom(content);
567    } catch (DeserializationException e) {
568      throw new IOException("content=" + Bytes.toShort(content), e);
569    }
570    return htd;
571  }
572
573  /**
574   * Update table descriptor on the file system
575   * @throws IOException Thrown if failed update.
576   * @throws NotImplementedException if in read only mode
577   */
578  @VisibleForTesting Path updateTableDescriptor(TableDescriptor td)
579  throws IOException {
580    if (fsreadonly) {
581      throw new NotImplementedException("Cannot update a table descriptor - in read only mode");
582    }
583    TableName tableName = td.getTableName();
584    Path tableDir = getTableDir(tableName);
585    Path p = writeTableDescriptor(fs, td, tableDir, getTableInfoPath(tableDir));
586    if (p == null) throw new IOException("Failed update");
587    LOG.info("Updated tableinfo=" + p);
588    if (usecache) {
589      this.cache.put(td.getTableName(), td);
590    }
591    return p;
592  }
593
594  /**
595   * Deletes all the table descriptor files from the file system.
596   * Used in unit tests only.
597   * @throws NotImplementedException if in read only mode
598   */
599  public void deleteTableDescriptorIfExists(TableName tableName) throws IOException {
600    if (fsreadonly) {
601      throw new NotImplementedException("Cannot delete a table descriptor - in read only mode");
602    }
603
604    Path tableDir = getTableDir(tableName);
605    Path tableInfoDir = new Path(tableDir, TABLEINFO_DIR);
606    deleteTableDescriptorFiles(fs, tableInfoDir, Integer.MAX_VALUE);
607  }
608
609  /**
610   * Deletes files matching the table info file pattern within the given directory
611   * whose sequenceId is at most the given max sequenceId.
612   */
613  private static void deleteTableDescriptorFiles(FileSystem fs, Path dir, int maxSequenceId)
614  throws IOException {
615    FileStatus [] status = FSUtils.listStatus(fs, dir, TABLEINFO_PATHFILTER);
616    for (FileStatus file : status) {
617      Path path = file.getPath();
618      int sequenceId = getTableInfoSequenceId(path);
619      if (sequenceId <= maxSequenceId) {
620        boolean success = FSUtils.delete(fs, path, false);
621        if (success) {
622          LOG.debug("Deleted " + path);
623        } else {
624          LOG.error("Failed to delete table descriptor at " + path);
625        }
626      }
627    }
628  }
629
630  /**
631   * Attempts to write a new table descriptor to the given table's directory.
632   * It first writes it to the .tmp dir then uses an atomic rename to move it into place.
633   * It begins at the currentSequenceId + 1 and tries 10 times to find a new sequence number
634   * not already in use.
635   * Removes the current descriptor file if passed in.
636   *
637   * @return Descriptor file or null if we failed write.
638   */
639  private static Path writeTableDescriptor(final FileSystem fs,
640    final TableDescriptor htd, final Path tableDir,
641    final FileStatus currentDescriptorFile)
642  throws IOException {
643    // Get temporary dir into which we'll first write a file to avoid half-written file phenomenon.
644    // This directory is never removed to avoid removing it out from under a concurrent writer.
645    Path tmpTableDir = new Path(tableDir, TMP_DIR);
646    Path tableInfoDir = new Path(tableDir, TABLEINFO_DIR);
647
648    // What is current sequenceid?  We read the current sequenceid from
649    // the current file.  After we read it, another thread could come in and
650    // compete with us writing out next version of file.  The below retries
651    // should help in this case some but its hard to do guarantees in face of
652    // concurrent schema edits.
653    int currentSequenceId = currentDescriptorFile == null ? 0 :
654      getTableInfoSequenceId(currentDescriptorFile.getPath());
655    int newSequenceId = currentSequenceId;
656
657    // Put arbitrary upperbound on how often we retry
658    int retries = 10;
659    int retrymax = currentSequenceId + retries;
660    Path tableInfoDirPath = null;
661    do {
662      newSequenceId += 1;
663      String filename = getTableInfoFileName(newSequenceId);
664      Path tempPath = new Path(tmpTableDir, filename);
665      if (fs.exists(tempPath)) {
666        LOG.debug(tempPath + " exists; retrying up to " + retries + " times");
667        continue;
668      }
669      tableInfoDirPath = new Path(tableInfoDir, filename);
670      try {
671        writeTD(fs, tempPath, htd);
672        fs.mkdirs(tableInfoDirPath.getParent());
673        if (!fs.rename(tempPath, tableInfoDirPath)) {
674          throw new IOException("Failed rename of " + tempPath + " to " + tableInfoDirPath);
675        }
676        LOG.debug("Wrote into " + tableInfoDirPath);
677      } catch (IOException ioe) {
678        // Presume clash of names or something; go around again.
679        LOG.debug("Failed write and/or rename; retrying", ioe);
680        if (!FSUtils.deleteDirectory(fs, tempPath)) {
681          LOG.warn("Failed cleanup of " + tempPath);
682        }
683        tableInfoDirPath = null;
684        continue;
685      }
686      break;
687    } while (newSequenceId < retrymax);
688    if (tableInfoDirPath != null) {
689      // if we succeeded, remove old table info files.
690      deleteTableDescriptorFiles(fs, tableInfoDir, newSequenceId - 1);
691    }
692    return tableInfoDirPath;
693  }
694
695  private static void writeTD(final FileSystem fs, final Path p, final TableDescriptor htd)
696  throws IOException {
697    FSDataOutputStream out = fs.create(p, false);
698    try {
699      // We used to write this file out as a serialized HTD Writable followed by two '\n's and then
700      // the toString version of HTD.  Now we just write out the pb serialization.
701      out.write(TableDescriptorBuilder.toByteArray(htd));
702    } finally {
703      out.close();
704    }
705  }
706
707  /**
708   * Create new TableDescriptor in HDFS. Happens when we are creating table.
709   * Used by tests.
710   * @return True if we successfully created file.
711   */
712  public boolean createTableDescriptor(TableDescriptor htd) throws IOException {
713    return createTableDescriptor(htd, false);
714  }
715
716  /**
717   * Create new TableDescriptor in HDFS. Happens when we are creating table. If
718   * forceCreation is true then even if previous table descriptor is present it
719   * will be overwritten
720   *
721   * @return True if we successfully created file.
722   */
723  public boolean createTableDescriptor(TableDescriptor htd, boolean forceCreation)
724  throws IOException {
725    Path tableDir = getTableDir(htd.getTableName());
726    return createTableDescriptorForTableDirectory(tableDir, htd, forceCreation);
727  }
728
729  /**
730   * Create a new TableDescriptor in HDFS in the specified table directory. Happens when we create
731   * a new table or snapshot a table.
732   * @param tableDir table directory under which we should write the file
733   * @param htd description of the table to write
734   * @param forceCreation if <tt>true</tt>,then even if previous table descriptor is present it will
735   *          be overwritten
736   * @return <tt>true</tt> if the we successfully created the file, <tt>false</tt> if the file
737   *         already exists and we weren't forcing the descriptor creation.
738   * @throws IOException if a filesystem error occurs
739   */
740  public boolean createTableDescriptorForTableDirectory(Path tableDir,
741      TableDescriptor htd, boolean forceCreation) throws IOException {
742    if (fsreadonly) {
743      throw new NotImplementedException("Cannot create a table descriptor - in read only mode");
744    }
745    FileStatus status = getTableInfoPath(fs, tableDir);
746    if (status != null) {
747      LOG.debug("Current path=" + status.getPath());
748      if (!forceCreation) {
749        if (fs.exists(status.getPath()) && status.getLen() > 0) {
750          if (readTableDescriptor(fs, status).equals(htd)) {
751            LOG.trace("TableInfo already exists.. Skipping creation");
752            return false;
753          }
754        }
755      }
756    }
757    Path p = writeTableDescriptor(fs, htd, tableDir, status);
758    return p != null;
759  }
760
761}
762