/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.util;

import edu.umd.cs.findbugs.annotations.Nullable;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.Coprocessor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableDescriptors;
import org.apache.hadoop.hbase.TableInfoMissingException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.CoprocessorDescriptorBuilder;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.coprocessor.MultiRowMutationEndpoint;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.regionserver.BloomType;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hbase.thirdparty.com.google.common.primitives.Ints;

/**
 * Implementation of {@link TableDescriptors} that reads descriptors from the
 * passed filesystem. It expects descriptors to be in a file in the
 * {@link #TABLEINFO_DIR} subdir of the table's directory in the FS. Can be
 * read-only -- i.e. it does not modify the filesystem -- or read-write.
 *
 * <p>Also has utilities for maintaining the table descriptor's tableinfo file.
 * The table schema file is kept in the {@link #TABLEINFO_DIR} subdir
 * of the table directory in the filesystem.
 * Its name starts with {@link #TABLEINFO_FILE_PREFIX} followed by a suffix that
 * is the edit sequenceid: e.g. <code>.tableinfo.0000000003</code>. This sequenceid
 * is always increasing, starting from zero. The table schema file with the
 * highest sequenceid has the most recent schema edit. Usually there is only one
 * file, the most recent, but there may be short periods where there is more
 * than one file. Old files are eventually cleaned up. The presumption is that
 * there will not be lots of concurrent clients making table schema edits; if
 * there are, the below needs a bit of reworking and perhaps some supporting api
 * in hdfs.
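 *
 * <p>For example, a table's directory might look as follows (a sketch; the
 * table path and sequenceids are illustrative):
 * <pre>
 * /hbase/data/default/example/.tabledesc/.tableinfo.0000000002   (old, eventually cleaned up)
 * /hbase/data/default/example/.tabledesc/.tableinfo.0000000003   (current schema)
 * /hbase/data/default/example/.tmp                               (scratch dir for in-flight writes)
 * </pre>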
 */
@InterfaceAudience.Private
public class FSTableDescriptors implements TableDescriptors {
  private static final Logger LOG = LoggerFactory.getLogger(FSTableDescriptors.class);
  private final FileSystem fs;
  private final Path rootdir;
  private final boolean fsreadonly;
  private volatile boolean usecache;
  private volatile boolean fsvisited;

  @VisibleForTesting
  long cachehits = 0;
  @VisibleForTesting
  long invocations = 0;

  /**
   * The file name prefix used to store HTD in HDFS
   */
  static final String TABLEINFO_FILE_PREFIX = ".tableinfo";
  static final String TABLEINFO_DIR = ".tabledesc";
  static final String TMP_DIR = ".tmp";

  // This cache does not age out old entries. The thinking is that the amount
  // of data we keep up in here is so small there is no need for an occasional
  // purge. TODO.
  private final Map<TableName, TableDescriptor> cache = new ConcurrentHashMap<>();

  /**
   * Construct a FSTableDescriptors instance using the hbase root dir of the given conf and the
   * filesystem where that root dir lives. This instance can do write operations (is not read only).
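   *
   * <p>A minimal usage sketch (the table name is illustrative and the
   * configuration is assumed to point at a running cluster's root dir):
   * <pre>
   * Configuration conf = HBaseConfiguration.create();
   * TableDescriptors tds = new FSTableDescriptors(conf);
   * TableDescriptor htd = tds.get(TableName.valueOf("example"));
   * </pre>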
   */
  public FSTableDescriptors(final Configuration conf) throws IOException {
    this(CommonFSUtils.getCurrentFileSystem(conf), CommonFSUtils.getRootDir(conf));
  }

  public FSTableDescriptors(final FileSystem fs, final Path rootdir) {
    this(fs, rootdir, false, true);
  }

  public FSTableDescriptors(final FileSystem fs, final Path rootdir, final boolean fsreadonly,
      final boolean usecache) {
    this.fs = fs;
    this.rootdir = rootdir;
    this.fsreadonly = fsreadonly;
    this.usecache = usecache;
  }

  public static void tryUpdateMetaTableDescriptor(Configuration conf) throws IOException {
    tryUpdateMetaTableDescriptor(conf, CommonFSUtils.getCurrentFileSystem(conf),
      CommonFSUtils.getRootDir(conf), null);
  }

  public static void tryUpdateMetaTableDescriptor(Configuration conf, FileSystem fs, Path rootdir,
      Function<TableDescriptorBuilder, TableDescriptorBuilder> metaObserver) throws IOException {
    // see if we already have meta descriptor on fs. Write one if not.
    try {
      getTableDescriptorFromFs(fs, rootdir, TableName.META_TABLE_NAME);
    } catch (TableInfoMissingException e) {
      TableDescriptorBuilder builder = createMetaTableDescriptorBuilder(conf);
      if (metaObserver != null) {
        builder = metaObserver.apply(builder);
      }
      TableDescriptor td = builder.build();
      LOG.info("Creating new hbase:meta table descriptor {}", td);
      TableName tableName = td.getTableName();
      Path tableDir = CommonFSUtils.getTableDir(rootdir, tableName);
      Path p = writeTableDescriptor(fs, td, tableDir, getTableInfoPath(fs, tableDir, true));
      if (p == null) {
        throw new IOException("Failed update hbase:meta table descriptor");
      }
      LOG.info("Updated hbase:meta table descriptor to {}", p);
    }
  }

  @VisibleForTesting
  public static TableDescriptorBuilder createMetaTableDescriptorBuilder(final Configuration conf)
    throws IOException {
    // TODO We used to set CacheDataInL1 for the META table. When we have BucketCache in file
    // mode, the META table data goes to file-mode BC only. Test how that affects the system. If
    // the impact is too high, we have to rethink adding back setCacheDataInL1 for META table CFs.
    return TableDescriptorBuilder.newBuilder(TableName.META_TABLE_NAME)
      .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(HConstants.CATALOG_FAMILY)
        .setMaxVersions(conf.getInt(HConstants.HBASE_META_VERSIONS,
          HConstants.DEFAULT_HBASE_META_VERSIONS))
        .setInMemory(true)
        .setBlocksize(conf.getInt(HConstants.HBASE_META_BLOCK_SIZE,
          HConstants.DEFAULT_HBASE_META_BLOCK_SIZE))
        .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
        // Disable blooms for meta.  Needs work.  Seems to mess w/ getClosestOrBefore.
        .setBloomFilterType(BloomType.NONE)
        .build())
      .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(HConstants.TABLE_FAMILY)
        .setMaxVersions(conf.getInt(HConstants.HBASE_META_VERSIONS,
          HConstants.DEFAULT_HBASE_META_VERSIONS))
        .setInMemory(true)
        .setBlocksize(8 * 1024)
        .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
        // Disable blooms for meta.  Needs work.  Seems to mess w/ getClosestOrBefore.
        .setBloomFilterType(BloomType.NONE)
        .build())
      .setColumnFamily(ColumnFamilyDescriptorBuilder
        .newBuilder(HConstants.REPLICATION_BARRIER_FAMILY)
        .setMaxVersions(HConstants.ALL_VERSIONS)
        .setInMemory(true)
        .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
        // Disable blooms for meta.  Needs work.  Seems to mess w/ getClosestOrBefore.
        .setBloomFilterType(BloomType.NONE)
        .build())
      .setCoprocessor(CoprocessorDescriptorBuilder.newBuilder(
        MultiRowMutationEndpoint.class.getName())
        .setPriority(Coprocessor.PRIORITY_SYSTEM)
        .build());
  }

  @Override
  public void setCacheOn() throws IOException {
    this.cache.clear();
    this.usecache = true;
  }

  @Override
  public void setCacheOff() throws IOException {
    this.usecache = false;
    this.cache.clear();
  }

  @VisibleForTesting
  public boolean isUsecache() {
    return this.usecache;
  }

  /**
   * Get the current table descriptor for the given table, or null if none exists.
   *
   * Looks first in the local cache of descriptors; on a cache miss, reads the
   * descriptor from the filesystem and, if caching is enabled, caches it.
   */
  @Override
  @Nullable
  public TableDescriptor get(final TableName tablename)
  throws IOException {
    invocations++;
    if (usecache) {
      // Look in cache of descriptors.
      TableDescriptor cachedtdm = this.cache.get(tablename);
      if (cachedtdm != null) {
        cachehits++;
        return cachedtdm;
      }
    }
    TableDescriptor tdmt = null;
    try {
      tdmt = getTableDescriptorFromFs(fs, rootdir, tablename);
    } catch (NullPointerException e) {
      LOG.debug("Exception during readTableDescriptor. Current table name = "
          + tablename, e);
    } catch (TableInfoMissingException e) {
      // ignore. This is regular operation
    } catch (IOException ioe) {
      LOG.debug("Exception during readTableDescriptor. Current table name = "
          + tablename, ioe);
    }
    // last HTD written wins
    if (usecache && tdmt != null) {
      this.cache.put(tablename, tdmt);
    }

    return tdmt;
  }

  /**
   * Returns a map from table name to table descriptor for all tables.
   */
  @Override
  public Map<String, TableDescriptor> getAll() throws IOException {
    Map<String, TableDescriptor> tds = new TreeMap<>();
    if (fsvisited && usecache) {
      for (Map.Entry<TableName, TableDescriptor> entry: this.cache.entrySet()) {
        tds.put(entry.getKey().getNameWithNamespaceInclAsString(), entry.getValue());
      }
    } else {
      LOG.trace("Fetching table descriptors from the filesystem.");
      boolean allvisited = true;
      for (Path d : FSUtils.getTableDirs(fs, rootdir)) {
        TableDescriptor htd = null;
        try {
          htd = get(CommonFSUtils.getTableName(d));
        } catch (FileNotFoundException fnfe) {
          // failure to retrieve one HTD shouldn't stop us from getting the rest
          LOG.warn("Trouble retrieving htd", fnfe);
        }
        if (htd == null) {
          allvisited = false;
          continue;
        } else {
          tds.put(htd.getTableName().getNameWithNamespaceInclAsString(), htd);
        }
      }
      // Only mark the filesystem as fully visited once every descriptor has been read;
      // setting this inside the loop would be skipped by the 'continue' above.
      fsvisited = allvisited;
    }
    return tds;
  }


  /**
   * Find descriptors by namespace.
   * @see #get(org.apache.hadoop.hbase.TableName)
   */
  @Override
  public Map<String, TableDescriptor> getByNamespace(String name)
  throws IOException {
    Map<String, TableDescriptor> htds = new TreeMap<>();
    List<Path> tableDirs =
        FSUtils.getLocalTableDirs(fs, CommonFSUtils.getNamespaceDir(rootdir, name));
    for (Path d: tableDirs) {
      TableDescriptor htd = null;
      try {
        htd = get(CommonFSUtils.getTableName(d));
      } catch (FileNotFoundException fnfe) {
        // failure to retrieve one HTD shouldn't stop us from getting the rest
        LOG.warn("Trouble retrieving htd", fnfe);
      }
      if (htd == null) continue;
      htds.put(CommonFSUtils.getTableName(d).getNameAsString(), htd);
    }
    return htds;
  }

  /**
   * Adds (or updates) the table descriptor to the FileSystem
   * and updates the local cache with it.
   */
  @Override
  public void update(TableDescriptor htd) throws IOException {
    if (fsreadonly) {
      throw new NotImplementedException("Cannot add a table descriptor - in read only mode");
    }
    updateTableDescriptor(htd);
  }

  /**
   * Removes the table descriptor from the local cache and returns it.
   * If not in read only mode, it also deletes the entire table directory(!)
   * from the FileSystem.
   */
  @Override
  public TableDescriptor remove(final TableName tablename) throws IOException {
    if (fsreadonly) {
      throw new NotImplementedException("Cannot remove a table descriptor - in read only mode");
    }
    Path tabledir = getTableDir(tablename);
    if (this.fs.exists(tabledir)) {
      if (!this.fs.delete(tabledir, true)) {
        throw new IOException("Failed delete of " + tabledir.toString());
      }
    }
    TableDescriptor descriptor = this.cache.remove(tablename);
    return descriptor;
  }

  private FileStatus getTableInfoPath(Path tableDir) throws IOException {
    return getTableInfoPath(fs, tableDir, !fsreadonly);
  }

  /**
   * Find the most current table info file for the table located in the given table directory.
   *
   * Looks within the {@link #TABLEINFO_DIR} subdirectory of the given directory for any table info
   * files and takes the 'current' one - meaning the one with the highest sequence number, or,
   * failing that, the one with no sequence number at all (kept for backward compatibility from
   * before there were sequence numbers).
   *
   * @return The file status of the current table info file or null if it does not exist
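   *
   * <p>For example, given the following files under {@link #TABLEINFO_DIR}
   * (sequenceids illustrative):
   * <pre>
   * .tableinfo.0000000002   (older, candidate for cleanup)
   * .tableinfo.0000000003   (the 'current' file whose status is returned)
   * </pre>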
   */
  public static FileStatus getTableInfoPath(FileSystem fs, Path tableDir)
  throws IOException {
    return getTableInfoPath(fs, tableDir, false);
  }

  /**
   * Find the most current table info file for the table in the given table directory.
   *
   * Looks within the {@link #TABLEINFO_DIR} subdirectory of the given directory for any table info
   * files and takes the 'current' one - meaning the one with the highest sequence number, or,
   * failing that, the one with no sequence number at all (kept for backward compatibility from
   * before there were sequence numbers).
   * If multiple table info files are found and removeOldFiles is true, the older files are
   * also deleted.
   *
   * @return The file status of the current table info file or null if none exist
   */
  private static FileStatus getTableInfoPath(FileSystem fs, Path tableDir, boolean removeOldFiles)
      throws IOException {
    Path tableInfoDir = new Path(tableDir, TABLEINFO_DIR);
    return getCurrentTableInfoStatus(fs, tableInfoDir, removeOldFiles);
  }

  /**
   * Find the most current table info file in the given directory.
   *
   * Looks within the given directory for any table info files and takes the
   * 'current' one - meaning the one with the highest sequence number, or, failing
   * that, the one with no sequence number at all (kept for backward compatibility
   * from before there were sequence numbers).
   * If multiple possible files are found and we're not in read only mode, the
   * older files are also deleted.
   *
   * @return The file status of the current table info file or null if it does not exist
   * @throws IOException if listing the directory or deleting old files fails
   */
  // only visible for FSTableDescriptorMigrationToSubdir, can be removed with that
  static FileStatus getCurrentTableInfoStatus(FileSystem fs, Path dir, boolean removeOldFiles)
    throws IOException {
    FileStatus[] status = CommonFSUtils.listStatus(fs, dir, TABLEINFO_PATHFILTER);
    if (status == null || status.length < 1) return null;
    FileStatus mostCurrent = null;
    for (FileStatus file : status) {
      if (mostCurrent == null || TABLEINFO_FILESTATUS_COMPARATOR.compare(file, mostCurrent) < 0) {
        mostCurrent = file;
      }
    }
    if (removeOldFiles && status.length > 1) {
      // Clean away old versions
      for (FileStatus file : status) {
        Path path = file.getPath();
        if (!file.equals(mostCurrent)) {
          if (!fs.delete(file.getPath(), false)) {
            LOG.warn("Failed cleanup of " + path);
          } else {
            LOG.debug("Cleaned up old tableinfo file " + path);
          }
        }
      }
    }
    return mostCurrent;
  }

  /**
   * Compare {@link FileStatus} instances by the names of their {@link Path}s.
   * Sorts in reverse order so the most recent tableinfo file is first.
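   * <p>For example, {@code .tableinfo.0000000003} sorts before
   * {@code .tableinfo.0000000002}.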
   */
  @VisibleForTesting
  static final Comparator<FileStatus> TABLEINFO_FILESTATUS_COMPARATOR =
  new Comparator<FileStatus>() {
    @Override
    public int compare(FileStatus left, FileStatus right) {
      return right.compareTo(left);
    }};

  /**
   * Return the table directory in HDFS
   */
  @VisibleForTesting
  Path getTableDir(final TableName tableName) {
    return CommonFSUtils.getTableDir(rootdir, tableName);
  }

  private static final PathFilter TABLEINFO_PATHFILTER = new PathFilter() {
    @Override
    public boolean accept(Path p) {
      // Accept any file that starts with TABLEINFO_NAME
      return p.getName().startsWith(TABLEINFO_FILE_PREFIX);
    }};

  /**
   * Width of the sequenceid that is a suffix on a tableinfo file.
   */
  @VisibleForTesting static final int WIDTH_OF_SEQUENCE_ID = 10;

  /*
   * @param number Number to use as suffix.
   * @return Zero-prefixed decimal version of the passed number (takes the
   * absolute value in case the number is negative).
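   * e.g. formatTableInfoSequenceId(3) returns "0000000003".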
   */
  private static String formatTableInfoSequenceId(final int number) {
    byte [] b = new byte[WIDTH_OF_SEQUENCE_ID];
    int d = Math.abs(number);
    for (int i = b.length - 1; i >= 0; i--) {
      b[i] = (byte)((d % 10) + '0');
      d /= 10;
    }
    return Bytes.toString(b);
  }

  /**
   * Regex to pick off the sequenceid suffix on a .tableinfo file.
   * A regex is used because we may encounter old-style .tableinfo files that
   * have no sequenceid on the end.
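   * Matches both {@code .tableinfo} (old style) and
   * {@code .tableinfo.0000000003} (with sequenceid suffix).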
   */
  private static final Pattern TABLEINFO_FILE_REGEX =
    Pattern.compile(TABLEINFO_FILE_PREFIX + "(\\.([0-9]{" + WIDTH_OF_SEQUENCE_ID + "}))?$");

  /**
   * @param p Path to a <code>.tableinfo</code> file.
   * @return The current edit sequenceid, or 0 if none found.
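   * e.g. a path ending in {@code .tableinfo.0000000003} yields 3.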
   */
  @VisibleForTesting static int getTableInfoSequenceId(final Path p) {
    if (p == null) return 0;
    Matcher m = TABLEINFO_FILE_REGEX.matcher(p.getName());
    if (!m.matches()) throw new IllegalArgumentException(p.toString());
    String suffix = m.group(2);
    if (suffix == null || suffix.length() <= 0) return 0;
    return Integer.parseInt(m.group(2));
  }

  /**
   * @param sequenceid the sequenceid to encode into the file name
   * @return Name of tableinfo file.
   */
  @VisibleForTesting static String getTableInfoFileName(final int sequenceid) {
    return TABLEINFO_FILE_PREFIX + "." + formatTableInfoSequenceId(sequenceid);
  }

  /**
   * Returns the latest table descriptor for the given table directly from the file system,
   * bypassing the local cache.
   * Throws a {@link TableInfoMissingException} if there is no descriptor file.
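   *
   * <p>For example (root dir resolution via the configuration is assumed):
   * <pre>
   * TableDescriptor htd = FSTableDescriptors.getTableDescriptorFromFs(
   *   fs, CommonFSUtils.getRootDir(conf), TableName.valueOf("example"));
   * </pre>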
   */
  public static TableDescriptor getTableDescriptorFromFs(FileSystem fs,
      Path hbaseRootDir, TableName tableName) throws IOException {
    Path tableDir = CommonFSUtils.getTableDir(hbaseRootDir, tableName);
    return getTableDescriptorFromFs(fs, tableDir);
  }

  /**
   * Returns the latest table descriptor for the table located at the given directory
   * directly from the file system if it exists.
   * @throws TableInfoMissingException if there is no descriptor
   */
  public static TableDescriptor getTableDescriptorFromFs(FileSystem fs, Path tableDir)
  throws IOException {
    FileStatus status = getTableInfoPath(fs, tableDir, false);
    if (status == null) {
      throw new TableInfoMissingException("No table descriptor file under " + tableDir);
    }
    return readTableDescriptor(fs, status);
  }

  private static TableDescriptor readTableDescriptor(FileSystem fs, FileStatus status)
      throws IOException {
    int len = Ints.checkedCast(status.getLen());
    byte [] content = new byte[len];
    FSDataInputStream fsDataInputStream = fs.open(status.getPath());
    try {
      fsDataInputStream.readFully(content);
    } finally {
      fsDataInputStream.close();
    }
    TableDescriptor htd = null;
    try {
      htd = TableDescriptorBuilder.parseFrom(content);
    } catch (DeserializationException e) {
      throw new IOException("content=" + Bytes.toShort(content), e);
    }
    return htd;
  }

  /**
   * Update the table descriptor on the file system.
   * @throws IOException if the update fails
   * @throws NotImplementedException if in read only mode
   */
  @VisibleForTesting
  Path updateTableDescriptor(TableDescriptor td) throws IOException {
    if (fsreadonly) {
      throw new NotImplementedException("Cannot update a table descriptor - in read only mode");
    }
    TableName tableName = td.getTableName();
    Path tableDir = getTableDir(tableName);
    Path p = writeTableDescriptor(fs, td, tableDir, getTableInfoPath(tableDir));
    if (p == null) {
      throw new IOException("Failed update");
    }
    LOG.info("Updated tableinfo=" + p);
    if (usecache) {
      this.cache.put(td.getTableName(), td);
    }
    return p;
  }

  /**
   * Deletes files matching the table info file pattern within the given directory
   * whose sequenceId is at most the given max sequenceId.
   */
  private static void deleteTableDescriptorFiles(FileSystem fs, Path dir, int maxSequenceId)
  throws IOException {
    FileStatus [] status = CommonFSUtils.listStatus(fs, dir, TABLEINFO_PATHFILTER);
    for (FileStatus file : status) {
      Path path = file.getPath();
      int sequenceId = getTableInfoSequenceId(path);
      if (sequenceId <= maxSequenceId) {
        boolean success = CommonFSUtils.delete(fs, path, false);
        if (success) {
          LOG.debug("Deleted " + path);
        } else {
          LOG.error("Failed to delete table descriptor at " + path);
        }
      }
    }
  }

  /**
   * Attempts to write a new table descriptor to the given table's directory.
   * It first writes the descriptor to the .tmp dir, then uses an atomic rename to move
   * it into place. It begins at currentSequenceId + 1 and tries up to 10 times to find a
   * new sequence number not already in use.
   * After a successful write, older descriptor files (up to and including the passed-in
   * current one) are removed.
   *
   * @return Descriptor file or null if the write failed.
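   *
   * <p>Sketch of the protocol for a table whose current sequenceid is 3 (paths are
   * illustrative):
   * <pre>
   * 1. write  tableDir/.tmp/.tableinfo.0000000004
   * 2. rename tableDir/.tmp/.tableinfo.0000000004 to tableDir/.tabledesc/.tableinfo.0000000004
   * 3. delete tableDir/.tabledesc/.tableinfo.0000000003 (and any older files)
   * </pre>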
   */
  private static Path writeTableDescriptor(final FileSystem fs,
    final TableDescriptor htd, final Path tableDir,
    final FileStatus currentDescriptorFile)
  throws IOException {
    // Get temporary dir into which we'll first write a file to avoid half-written file phenomenon.
    // This directory is never removed to avoid removing it out from under a concurrent writer.
    Path tmpTableDir = new Path(tableDir, TMP_DIR);
    Path tableInfoDir = new Path(tableDir, TABLEINFO_DIR);

    // What is the current sequenceid? We read it from the current descriptor file.
    // After we read it, another thread could come in and compete with us writing out
    // the next version of the file. The retries below should help some, but it's hard
    // to make guarantees in the face of concurrent schema edits.
    int currentSequenceId = currentDescriptorFile == null ? 0 :
      getTableInfoSequenceId(currentDescriptorFile.getPath());
    int newSequenceId = currentSequenceId;

    // Put an arbitrary upper bound on how often we retry
    int retries = 10;
    int retrymax = currentSequenceId + retries;
    Path tableInfoDirPath = null;
    do {
      newSequenceId += 1;
      String filename = getTableInfoFileName(newSequenceId);
      Path tempPath = new Path(tmpTableDir, filename);
      if (fs.exists(tempPath)) {
        LOG.debug(tempPath + " exists; retrying up to " + retries + " times");
        continue;
      }
      tableInfoDirPath = new Path(tableInfoDir, filename);
      try {
        writeTD(fs, tempPath, htd);
        fs.mkdirs(tableInfoDirPath.getParent());
        if (!fs.rename(tempPath, tableInfoDirPath)) {
          throw new IOException("Failed rename of " + tempPath + " to " + tableInfoDirPath);
        }
        LOG.debug("Wrote into " + tableInfoDirPath);
      } catch (IOException ioe) {
        // Presume clash of names or something; go around again.
        LOG.debug("Failed write and/or rename; retrying", ioe);
        if (!CommonFSUtils.deleteDirectory(fs, tempPath)) {
          LOG.warn("Failed cleanup of " + tempPath);
        }
        tableInfoDirPath = null;
        continue;
      }
      break;
    } while (newSequenceId < retrymax);
    if (tableInfoDirPath != null) {
      // if we succeeded, remove old table info files.
      deleteTableDescriptorFiles(fs, tableInfoDir, newSequenceId - 1);
    }
    return tableInfoDirPath;
  }

  private static void writeTD(final FileSystem fs, final Path p, final TableDescriptor htd)
  throws IOException {
    FSDataOutputStream out = fs.create(p, false);
    try {
      // We used to write this file out as a serialized HTD Writable followed by two '\n's and then
      // the toString version of HTD.  Now we just write out the pb serialization.
      out.write(TableDescriptorBuilder.toByteArray(htd));
    } finally {
      out.close();
    }
  }

  /**
   * Create a new TableDescriptor in HDFS. Happens when we are creating a table.
   * Used by tests.
   * @return True if we successfully created the file.
   */
  public boolean createTableDescriptor(TableDescriptor htd) throws IOException {
    return createTableDescriptor(htd, false);
  }

  /**
   * Create a new TableDescriptor in HDFS. Happens when we are creating a table. If
   * forceCreation is true, the file will be overwritten even if a previous table
   * descriptor is present.
   *
   * @return True if we successfully created the file.
   */
  public boolean createTableDescriptor(TableDescriptor htd, boolean forceCreation)
  throws IOException {
    Path tableDir = getTableDir(htd.getTableName());
    return createTableDescriptorForTableDirectory(tableDir, htd, forceCreation);
  }


  /**
   * Create a new TableDescriptor in HDFS in the specified table directory. Happens when we create
   * a new table during cluster start or in the Clone and Create Table Procedures. Checks the
   * readOnly flag passed on construction.
   * @param tableDir table directory under which we should write the file
   * @param htd description of the table to write
   * @param forceCreation if <tt>true</tt>, then even if a previous table descriptor is present it
   *          will be overwritten
   * @return <tt>true</tt> if we successfully created the file, <tt>false</tt> if the file
   *         already exists and we weren't forcing the descriptor creation.
   * @throws IOException if a filesystem error occurs
   */
  public boolean createTableDescriptorForTableDirectory(Path tableDir, TableDescriptor htd,
      boolean forceCreation) throws IOException {
    if (this.fsreadonly) {
      throw new NotImplementedException("Cannot create a table descriptor - in read only mode");
    }
    return createTableDescriptorForTableDirectory(this.fs, tableDir, htd, forceCreation);
  }


  /**
   * Create a new TableDescriptor in HDFS in the specified table directory. Happens when we
   * create a new table or snapshot a table. Does not enforce read-only; that is for the
   * caller to determine.
   * @param fs Filesystem to use.
   * @param tableDir table directory under which we should write the file
   * @param htd description of the table to write
   * @param forceCreation if <tt>true</tt>, then even if a previous table descriptor is present it
   *          will be overwritten
   * @return <tt>true</tt> if we successfully created the file, <tt>false</tt> if the file
   *         already exists and we weren't forcing the descriptor creation.
   * @throws IOException if a filesystem error occurs
   */
  public static boolean createTableDescriptorForTableDirectory(FileSystem fs, Path tableDir,
      TableDescriptor htd, boolean forceCreation) throws IOException {
    FileStatus status = getTableInfoPath(fs, tableDir);
    if (status != null) {
      LOG.debug("Current path=" + status.getPath());
      if (!forceCreation) {
        if (fs.exists(status.getPath()) && status.getLen() > 0) {
          if (readTableDescriptor(fs, status).equals(htd)) {
            LOG.trace("TableInfo already exists.. Skipping creation");
            return false;
          }
        }
      }
    }
    return writeTableDescriptor(fs, htd, tableDir, status) != null;
  }
}