/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.util;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.umd.cs.findbugs.annotations.Nullable;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.Coprocessor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableDescriptors;
import org.apache.hadoop.hbase.TableInfoMissingException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.CoprocessorDescriptorBuilder;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.coprocessor.MultiRowMutationEndpoint;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.regionserver.BloomType;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hbase.thirdparty.com.google.common.primitives.Ints;

/**
 * Implementation of {@link TableDescriptors} that reads descriptors from the
 * passed filesystem.  It expects descriptors to be in a file in the
 * {@link #TABLEINFO_DIR} subdir of the table's directory in FS.  Can be
 * read-only -- i.e. does not modify the filesystem -- or read-write.
 *
 * <p>Also has utility for keeping up the table descriptors tableinfo file.
 * The table schema file is kept in the {@link #TABLEINFO_DIR} subdir
 * of the table directory in the filesystem.
 * It has a {@link #TABLEINFO_FILE_PREFIX} and then a suffix that is the
 * edit sequenceid: e.g. <code>.tableinfo.0000000003</code>.  This sequenceid
 * is always increasing.  It starts at zero.  The table schema file with the
 * highest sequenceid has the most recent schema edit. Usually there is only one
 * file, the most recent, but there may be short periods where there is more
 * than one file. Old files are eventually cleaned up.  The presumption is that
 * there will not be lots of concurrent clients making table schema edits.  If
 * there are, the below needs a bit of reworking and perhaps some supporting api in hdfs.
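 *
 * <p>A minimal usage sketch; the configuration and table name below are
 * illustrative, not part of this class:
 * <pre>{@code
 * Configuration conf = HBaseConfiguration.create();
 * TableDescriptors tds = new FSTableDescriptors(conf);
 * TableDescriptor htd = tds.get(TableName.valueOf("exampleTable")); // null if no descriptor
 * }</pre>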
 */
@InterfaceAudience.Private
public class FSTableDescriptors implements TableDescriptors {
  private static final Logger LOG = LoggerFactory.getLogger(FSTableDescriptors.class);
  private final FileSystem fs;
  private final Path rootdir;
  private final boolean fsreadonly;
  private volatile boolean usecache;
  private volatile boolean fsvisited;

  @VisibleForTesting
  long cachehits = 0;
  @VisibleForTesting
  long invocations = 0;

  /**
   * The file name prefix used to store HTD in HDFS
   */
  static final String TABLEINFO_FILE_PREFIX = ".tableinfo";
  static final String TABLEINFO_DIR = ".tabledesc";
  static final String TMP_DIR = ".tmp";
  // This cache does not age out entries. The thinking is that the amount of
  // data kept here is so small there is no need for an occasional purge.
  // TODO.
  private final Map<TableName, TableDescriptor> cache = new ConcurrentHashMap<>();

  /**
   * Table descriptor for <code>hbase:meta</code> catalog table
   */
  private final TableDescriptor metaTableDescriptor;

  /**
   * Construct a FSTableDescriptors instance using the hbase root dir of the given
   * conf and the filesystem where that root dir lives.
   * This instance can do write operations (is not read only).
   */
  public FSTableDescriptors(final Configuration conf) throws IOException {
    this(conf, FSUtils.getCurrentFileSystem(conf), FSUtils.getRootDir(conf));
  }

  public FSTableDescriptors(final Configuration conf, final FileSystem fs, final Path rootdir)
          throws IOException {
    this(conf, fs, rootdir, false, true);
  }

  /**
   * @param fsreadonly True if we are read-only when it comes to filesystem
   *                   operations; i.e. on remove, we do not do delete in fs.
   */
  public FSTableDescriptors(final Configuration conf, final FileSystem fs,
                            final Path rootdir, final boolean fsreadonly, final boolean usecache) throws IOException {
    this(conf, fs, rootdir, fsreadonly, usecache, null);
  }
  /**
   * @param fsreadonly True if we are read-only when it comes to filesystem
   *                   operations; i.e. on remove, we do not do delete in fs.
   * @param metaObserver Used by HMaster. It needs to modify the META_REPLICAS_NUM
   *                     for the meta table descriptor.
   *                     See HMaster#finishActiveMasterInitialization.
   *                     TODO: This is a workaround. Should remove this ugly code...
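   *
   * <p>For illustration only, a hypothetical observer that raises the region
   * replication of the meta descriptor (the value 3 is an assumption):
   * <pre>{@code
   * new FSTableDescriptors(conf, fs, rootdir, false, true,
   *     builder -> builder.setRegionReplication(3));
   * }</pre>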
   */
  public FSTableDescriptors(final Configuration conf, final FileSystem fs,
                            final Path rootdir, final boolean fsreadonly, final boolean usecache,
                            Function<TableDescriptorBuilder, TableDescriptorBuilder> metaObserver) throws IOException {
    this.fs = fs;
    this.rootdir = rootdir;
    this.fsreadonly = fsreadonly;
    this.usecache = usecache;
    this.metaTableDescriptor = metaObserver == null ? createMetaTableDescriptor(conf)
          : metaObserver.apply(createMetaTableDescriptorBuilder(conf)).build();
  }

  @VisibleForTesting
  public static TableDescriptorBuilder createMetaTableDescriptorBuilder(final Configuration conf) throws IOException {
    // TODO We used to set CacheDataInL1 for the META table. Now, when we have
    // BucketCache in file mode, the META table data goes to file-mode BC only.
    // Test how that affects the system. If the cost is too high, we have to
    // rethink adding back setCacheDataInL1 for the META table CFs.
    return TableDescriptorBuilder.newBuilder(TableName.META_TABLE_NAME)
      .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(HConstants.CATALOG_FAMILY)
        .setMaxVersions(conf.getInt(HConstants.HBASE_META_VERSIONS,
          HConstants.DEFAULT_HBASE_META_VERSIONS))
        .setInMemory(true)
        .setBlocksize(conf.getInt(HConstants.HBASE_META_BLOCK_SIZE,
          HConstants.DEFAULT_HBASE_META_BLOCK_SIZE))
        .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
        // Disable blooms for meta.  Needs work.  Seems to mess w/ getClosestOrBefore.
        .setBloomFilterType(BloomType.NONE)
        .build())
      .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(HConstants.TABLE_FAMILY)
        .setMaxVersions(conf.getInt(HConstants.HBASE_META_VERSIONS,
          HConstants.DEFAULT_HBASE_META_VERSIONS))
        .setInMemory(true)
        .setBlocksize(8 * 1024)
        .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
        // Disable blooms for meta.  Needs work.  Seems to mess w/ getClosestOrBefore.
        .setBloomFilterType(BloomType.NONE)
        .build())
      .setColumnFamily(ColumnFamilyDescriptorBuilder
        .newBuilder(HConstants.REPLICATION_BARRIER_FAMILY)
        .setMaxVersions(HConstants.ALL_VERSIONS)
        .setInMemory(true)
        .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
        // Disable blooms for meta.  Needs work.  Seems to mess w/ getClosestOrBefore.
        .setBloomFilterType(BloomType.NONE)
        .build())
      .setCoprocessor(CoprocessorDescriptorBuilder.newBuilder(
        MultiRowMutationEndpoint.class.getName())
        .setPriority(Coprocessor.PRIORITY_SYSTEM)
        .build());
  }

  @VisibleForTesting
  public static TableDescriptor createMetaTableDescriptor(final Configuration conf)
      throws IOException {
    return createMetaTableDescriptorBuilder(conf).build();
  }

  @Override
  public void setCacheOn() throws IOException {
    this.cache.clear();
    this.usecache = true;
  }

  @Override
  public void setCacheOff() throws IOException {
    this.usecache = false;
    this.cache.clear();
  }

  @VisibleForTesting
  public boolean isUsecache() {
    return this.usecache;
  }

  /**
   * Get the current table descriptor for the given table, or null if none exists.
   *
   * If the cache is enabled, returns the cached descriptor when present;
   * otherwise reads the descriptor from the filesystem (and caches it when
   * caching is enabled).
   */
  @Override
  @Nullable
  public TableDescriptor get(final TableName tablename)
  throws IOException {
    invocations++;
    if (TableName.META_TABLE_NAME.equals(tablename)) {
      cachehits++;
      return metaTableDescriptor;
    }
    // hbase:meta is already handled. If someone tries to get the descriptor for
    // .logs, .oldlogs or .corrupt, throw an exception.
    if (HConstants.HBASE_NON_USER_TABLE_DIRS.contains(tablename.getNameAsString())) {
       throw new IOException("No descriptor found for non-table = " + tablename);
    }

    if (usecache) {
      // Look in cache of descriptors.
      TableDescriptor cachedtdm = this.cache.get(tablename);
      if (cachedtdm != null) {
        cachehits++;
        return cachedtdm;
      }
    }
    TableDescriptor tdmt = null;
    try {
      tdmt = getTableDescriptorFromFs(fs, rootdir, tablename);
    } catch (NullPointerException e) {
      LOG.debug("Exception during readTableDescriptor. Current table name = "
          + tablename, e);
    } catch (TableInfoMissingException e) {
      // Ignore. This is regular operation.
    } catch (IOException ioe) {
      LOG.debug("Exception during readTableDescriptor. Current table name = "
          + tablename, ioe);
    }
    // Last HTD written wins.
    if (usecache && tdmt != null) {
      this.cache.put(tablename, tdmt);
    }

    return tdmt;
  }

  /**
   * Returns a map from table name to table descriptor for all tables.
   */
  @Override
  public Map<String, TableDescriptor> getAll()
  throws IOException {
    Map<String, TableDescriptor> tds = new TreeMap<>();

    if (fsvisited && usecache) {
      for (Map.Entry<TableName, TableDescriptor> entry: this.cache.entrySet()) {
        tds.put(entry.getKey().getNameWithNamespaceInclAsString(), entry.getValue());
      }
      // Add hbase:meta to the response.
      tds.put(this.metaTableDescriptor.getTableName().getNameAsString(), metaTableDescriptor);
    } else {
      LOG.trace("Fetching table descriptors from the filesystem.");
      boolean allvisited = true;
      for (Path d : FSUtils.getTableDirs(fs, rootdir)) {
        TableDescriptor htd = null;
        try {
          htd = get(FSUtils.getTableName(d));
        } catch (FileNotFoundException fnfe) {
          // Inability to retrieve one HTD shouldn't stop us getting the remaining ones.
          LOG.warn("Trouble retrieving htd", fnfe);
        }
        if (htd == null) {
          allvisited = false;
          continue;
        }
        tds.put(htd.getTableName().getNameWithNamespaceInclAsString(), htd);
      }
      // Only mark the filesystem as fully visited if every table dir yielded a descriptor.
      fsvisited = allvisited;
    }
    return tds;
  }

  /**
   * Find descriptors by namespace.
   * @see #get(org.apache.hadoop.hbase.TableName)
   */
  @Override
  public Map<String, TableDescriptor> getByNamespace(String name)
  throws IOException {
    Map<String, TableDescriptor> htds = new TreeMap<>();
    List<Path> tableDirs =
        FSUtils.getLocalTableDirs(fs, FSUtils.getNamespaceDir(rootdir, name));
    for (Path d: tableDirs) {
      TableDescriptor htd = null;
      try {
        htd = get(FSUtils.getTableName(d));
      } catch (FileNotFoundException fnfe) {
        // Inability to retrieve one HTD shouldn't stop us getting the remaining ones.
        LOG.warn("Trouble retrieving htd", fnfe);
      }
      if (htd == null) continue;
      htds.put(FSUtils.getTableName(d).getNameAsString(), htd);
    }
    return htds;
  }

  /**
   * Adds (or updates) the table descriptor to the FileSystem
   * and updates the local cache with it.
   */
  @Override
  public void add(TableDescriptor htd) throws IOException {
    if (fsreadonly) {
      throw new NotImplementedException("Cannot add a table descriptor - in read only mode");
    }
    TableName tableName = htd.getTableName();
    if (TableName.META_TABLE_NAME.equals(tableName)) {
      throw new NotImplementedException(HConstants.NOT_IMPLEMENTED);
    }
    if (HConstants.HBASE_NON_USER_TABLE_DIRS.contains(tableName.getNameAsString())) {
      throw new NotImplementedException(
          "Cannot add a table descriptor for a reserved subdirectory name: "
              + htd.getTableName().getNameAsString());
    }
    updateTableDescriptor(htd);
  }

  /**
   * Removes the table descriptor from the local cache and returns it.
   * If not in read only mode, it also deletes the entire table directory(!)
   * from the FileSystem.
   */
  @Override
  public TableDescriptor remove(final TableName tablename)
  throws IOException {
    if (fsreadonly) {
      throw new NotImplementedException("Cannot remove a table descriptor - in read only mode");
    }
    Path tabledir = getTableDir(tablename);
    if (this.fs.exists(tabledir)) {
      if (!this.fs.delete(tabledir, true)) {
        throw new IOException("Failed delete of " + tabledir.toString());
      }
    }
    return this.cache.remove(tablename);
  }

  /**
   * Checks if a current table info file exists for the given table.
   *
   * @param tableName name of table
   * @return true if exists
   * @throws IOException if the filesystem cannot be queried
   */
  public boolean isTableInfoExists(TableName tableName) throws IOException {
    return getTableInfoPath(tableName) != null;
  }

  /**
   * Find the most current table info file for the given table in the hbase root directory.
   * @return The file status of the current table info file or null if it does not exist
   */
  private FileStatus getTableInfoPath(final TableName tableName) throws IOException {
    Path tableDir = getTableDir(tableName);
    return getTableInfoPath(tableDir);
  }

  private FileStatus getTableInfoPath(Path tableDir)
  throws IOException {
    return getTableInfoPath(fs, tableDir, !fsreadonly);
  }

  /**
   * Find the most current table info file for the table located in the given table directory.
   *
   * Looks within the {@link #TABLEINFO_DIR} subdirectory of the given directory for any table info
   * files and takes the 'current' one - meaning the one with the highest sequence number if present
   * or no sequence number at all if none exist (for backward compatibility from before there
   * were sequence numbers).
   *
   * @return The file status of the current table info file or null if it does not exist
   * @throws IOException if the filesystem cannot be queried
   */
  public static FileStatus getTableInfoPath(FileSystem fs, Path tableDir)
  throws IOException {
    return getTableInfoPath(fs, tableDir, false);
  }

  /**
   * Find the most current table info file for the table in the given table directory.
   *
   * Looks within the {@link #TABLEINFO_DIR} subdirectory of the given directory for any table info
   * files and takes the 'current' one - meaning the one with the highest sequence number if
   * present or no sequence number at all if none exist (for backward compatibility from before
   * there were sequence numbers).
   * If there are multiple table info files found and removeOldFiles is true it also deletes the
   * older files.
   *
   * @return The file status of the current table info file or null if none exist
   * @throws IOException if the filesystem cannot be queried
   */
  private static FileStatus getTableInfoPath(FileSystem fs, Path tableDir, boolean removeOldFiles)
  throws IOException {
    Path tableInfoDir = new Path(tableDir, TABLEINFO_DIR);
    return getCurrentTableInfoStatus(fs, tableInfoDir, removeOldFiles);
  }

  /**
   * Find the most current table info file in the given directory.
   *
   * Looks within the given directory for any table info files
   * and takes the 'current' one - meaning the one with the highest sequence number if present
   * or no sequence number at all if none exist (for backward compatibility from before there
   * were sequence numbers).
   * If there are multiple possible files found
   * and we're not in read only mode it also deletes the older files.
   *
   * @return The file status of the current table info file or null if it does not exist
   * @throws IOException if the filesystem cannot be queried
   */
  // only visible for FSTableDescriptorMigrationToSubdir, can be removed with that
  static FileStatus getCurrentTableInfoStatus(FileSystem fs, Path dir, boolean removeOldFiles)
  throws IOException {
    FileStatus [] status = FSUtils.listStatus(fs, dir, TABLEINFO_PATHFILTER);
    if (status == null || status.length < 1) return null;
    FileStatus mostCurrent = null;
    for (FileStatus file : status) {
      if (mostCurrent == null || TABLEINFO_FILESTATUS_COMPARATOR.compare(file, mostCurrent) < 0) {
        mostCurrent = file;
      }
    }
    if (removeOldFiles && status.length > 1) {
      // Clean away old versions
      for (FileStatus file : status) {
        Path path = file.getPath();
        if (!file.equals(mostCurrent)) {
          if (!fs.delete(file.getPath(), false)) {
            LOG.warn("Failed cleanup of " + path);
          } else {
            LOG.debug("Cleaned up old tableinfo file " + path);
          }
        }
      }
    }
    return mostCurrent;
  }

  /**
   * Compare {@link FileStatus} instances by {@link Path#getName()}. Returns in
   * reverse order.
   */
  @VisibleForTesting
  static final Comparator<FileStatus> TABLEINFO_FILESTATUS_COMPARATOR =
  new Comparator<FileStatus>() {
    @Override
    public int compare(FileStatus left, FileStatus right) {
      return right.compareTo(left);
    }};
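
  /*
   * A note on why the reverse ordering above works: the sequenceid suffix is
   * zero-padded to WIDTH_OF_SEQUENCE_ID digits, so plain lexical comparison
   * orders file names numerically; e.g. (illustrative names):
   *   ".tableinfo.0000000010" sorts after ".tableinfo.0000000002"
   * making the comparator's first element the most current file.
   */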

  /**
   * Return the table directory in HDFS
   */
  @VisibleForTesting Path getTableDir(final TableName tableName) {
    return FSUtils.getTableDir(rootdir, tableName);
  }

  private static final PathFilter TABLEINFO_PATHFILTER = new PathFilter() {
    @Override
    public boolean accept(Path p) {
      // Accept any file that starts with TABLEINFO_NAME
      return p.getName().startsWith(TABLEINFO_FILE_PREFIX);
    }};

  /**
   * Width of the sequenceid that is a suffix on a tableinfo file.
   */
  @VisibleForTesting static final int WIDTH_OF_SEQUENCE_ID = 10;

  /*
   * @param number Number to use as suffix.
   * @return Returns zero-prefixed decimal version of passed number (uses the
   * absolute value in case the number is negative).
   */
  private static String formatTableInfoSequenceId(final int number) {
    byte [] b = new byte[WIDTH_OF_SEQUENCE_ID];
    int d = Math.abs(number);
    for (int i = b.length - 1; i >= 0; i--) {
      b[i] = (byte)((d % 10) + '0');
      d /= 10;
    }
    return Bytes.toString(b);
  }

  /**
   * Regex to eat up sequenceid suffix on a .tableinfo file.
   * Use regex because may encounter oldstyle .tableinfos where there is no
   * sequenceid on the end.
   */
  private static final Pattern TABLEINFO_FILE_REGEX =
    Pattern.compile(TABLEINFO_FILE_PREFIX + "(\\.([0-9]{" + WIDTH_OF_SEQUENCE_ID + "}))?$");

  /**
   * @param p Path to a <code>.tableinfo</code> file.
   * @return The current editid or 0 if none found.
   */
  @VisibleForTesting static int getTableInfoSequenceId(final Path p) {
    if (p == null) return 0;
    Matcher m = TABLEINFO_FILE_REGEX.matcher(p.getName());
    if (!m.matches()) throw new IllegalArgumentException(p.toString());
    String suffix = m.group(2);
    if (suffix == null || suffix.length() <= 0) return 0;
    return Integer.parseInt(m.group(2));
  }

  /**
   * @param sequenceid
   * @return Name of tableinfo file.
   */
  @VisibleForTesting static String getTableInfoFileName(final int sequenceid) {
    return TABLEINFO_FILE_PREFIX + "." + formatTableInfoSequenceId(sequenceid);
  }
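
  /*
   * A quick sketch of the naming round trip (values illustrative):
   *   getTableInfoFileName(3)                                   -> ".tableinfo.0000000003"
   *   getTableInfoSequenceId(new Path(".tableinfo.0000000003")) -> 3
   *   getTableInfoSequenceId(new Path(".tableinfo"))            -> 0  // legacy, no suffix
   */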

  /**
   * Returns the latest table descriptor for the given table directly from the file system
   * if it exists, bypassing the local cache.
   * @throws TableInfoMissingException if there is no descriptor
   */
  public static TableDescriptor getTableDescriptorFromFs(FileSystem fs,
      Path hbaseRootDir, TableName tableName) throws IOException {
    Path tableDir = FSUtils.getTableDir(hbaseRootDir, tableName);
    return getTableDescriptorFromFs(fs, tableDir);
  }

  /**
   * Returns the latest table descriptor for the table located at the given directory
   * directly from the file system if it exists.
   * @throws TableInfoMissingException if there is no descriptor
   */
  public static TableDescriptor getTableDescriptorFromFs(FileSystem fs, Path tableDir)
  throws IOException {
    FileStatus status = getTableInfoPath(fs, tableDir, false);
    if (status == null) {
      throw new TableInfoMissingException("No table descriptor file under " + tableDir);
    }
    return readTableDescriptor(fs, status);
  }

  private static TableDescriptor readTableDescriptor(FileSystem fs, FileStatus status)
      throws IOException {
    int len = Ints.checkedCast(status.getLen());
    byte [] content = new byte[len];
    try (FSDataInputStream fsDataInputStream = fs.open(status.getPath())) {
      fsDataInputStream.readFully(content);
    }
    try {
      return TableDescriptorBuilder.parseFrom(content);
    } catch (DeserializationException e) {
      throw new IOException("content=" + Bytes.toStringBinary(content), e);
    }
  }

  /**
   * Update table descriptor on the file system
   * @throws IOException Thrown if failed update.
   * @throws NotImplementedException if in read only mode
   */
  @VisibleForTesting Path updateTableDescriptor(TableDescriptor td)
  throws IOException {
    if (fsreadonly) {
      throw new NotImplementedException("Cannot update a table descriptor - in read only mode");
    }
    TableName tableName = td.getTableName();
    Path tableDir = getTableDir(tableName);
    Path p = writeTableDescriptor(fs, td, tableDir, getTableInfoPath(tableDir));
    if (p == null) throw new IOException("Failed update");
    LOG.info("Updated tableinfo=" + p);
    if (usecache) {
      this.cache.put(td.getTableName(), td);
    }
    return p;
  }

  /**
   * Deletes all the table descriptor files from the file system.
   * Used in unit tests only.
   * @throws NotImplementedException if in read only mode
   */
  public void deleteTableDescriptorIfExists(TableName tableName) throws IOException {
    if (fsreadonly) {
      throw new NotImplementedException("Cannot delete a table descriptor - in read only mode");
    }

    Path tableDir = getTableDir(tableName);
    Path tableInfoDir = new Path(tableDir, TABLEINFO_DIR);
    deleteTableDescriptorFiles(fs, tableInfoDir, Integer.MAX_VALUE);
  }

  /**
   * Deletes files matching the table info file pattern within the given directory
   * whose sequenceId is at most the given max sequenceId.
   */
  private static void deleteTableDescriptorFiles(FileSystem fs, Path dir, int maxSequenceId)
  throws IOException {
    FileStatus [] status = FSUtils.listStatus(fs, dir, TABLEINFO_PATHFILTER);
    if (status == null) {
      // The directory does not exist or holds no matching files; nothing to delete.
      return;
    }
    for (FileStatus file : status) {
      Path path = file.getPath();
      int sequenceId = getTableInfoSequenceId(path);
      if (sequenceId <= maxSequenceId) {
        boolean success = FSUtils.delete(fs, path, false);
        if (success) {
          LOG.debug("Deleted " + path);
        } else {
          LOG.error("Failed to delete table descriptor at " + path);
        }
      }
    }
  }

  /**
   * Attempts to write a new table descriptor to the given table's directory.
   * It first writes the descriptor to the .tmp dir, then uses an atomic rename
   * to move it into place. It begins at currentSequenceId + 1 and tries up to
   * ten times to find a sequence number not already in use.
   * Removes the current descriptor file if passed in.
   *
   * @return Descriptor file or null if we failed write.
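   *
   * <p>The write protocol, sketched (file names illustrative):
   * <pre>
   *   tableDir/.tmp/.tableinfo.0000000004        (1) write here first
   *   tableDir/.tabledesc/.tableinfo.0000000004  (2) atomic rename target
   *   tableDir/.tabledesc/.tableinfo.0000000003  (3) old file, deleted on success
   * </pre>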
   */
  private static Path writeTableDescriptor(final FileSystem fs,
    final TableDescriptor htd, final Path tableDir,
    final FileStatus currentDescriptorFile)
  throws IOException {
    // Get temporary dir into which we'll first write a file to avoid half-written file phenomenon.
    // This directory is never removed to avoid removing it out from under a concurrent writer.
    Path tmpTableDir = new Path(tableDir, TMP_DIR);
    Path tableInfoDir = new Path(tableDir, TABLEINFO_DIR);

    // What is the current sequenceid?  We read it from the current file.  After
    // we read it, another thread could come in and compete with us writing out
    // the next version of the file.  The retries below should help some in this
    // case, but it is hard to give guarantees in the face of concurrent schema
    // edits.
    int currentSequenceId = currentDescriptorFile == null ? 0 :
      getTableInfoSequenceId(currentDescriptorFile.getPath());
    int newSequenceId = currentSequenceId;

    // Put an arbitrary upperbound on how often we retry
    int retries = 10;
    int retrymax = currentSequenceId + retries;
    Path tableInfoDirPath = null;
    do {
      newSequenceId += 1;
      String filename = getTableInfoFileName(newSequenceId);
      Path tempPath = new Path(tmpTableDir, filename);
      if (fs.exists(tempPath)) {
        LOG.debug(tempPath + " exists; retrying up to " + retries + " times");
        continue;
      }
      tableInfoDirPath = new Path(tableInfoDir, filename);
      try {
        writeTD(fs, tempPath, htd);
        fs.mkdirs(tableInfoDirPath.getParent());
        if (!fs.rename(tempPath, tableInfoDirPath)) {
          throw new IOException("Failed rename of " + tempPath + " to " + tableInfoDirPath);
        }
        LOG.debug("Wrote into " + tableInfoDirPath);
      } catch (IOException ioe) {
        // Presume clash of names or something; go around again.
        LOG.debug("Failed write and/or rename; retrying", ioe);
        if (!FSUtils.deleteDirectory(fs, tempPath)) {
          LOG.warn("Failed cleanup of " + tempPath);
        }
        tableInfoDirPath = null;
        continue;
      }
      break;
    } while (newSequenceId < retrymax);
    if (tableInfoDirPath != null) {
      // If we succeeded, remove old table info files.
      deleteTableDescriptorFiles(fs, tableInfoDir, newSequenceId - 1);
    }
    return tableInfoDirPath;
  }

  private static void writeTD(final FileSystem fs, final Path p, final TableDescriptor htd)
  throws IOException {
    try (FSDataOutputStream out = fs.create(p, false)) {
      // We used to write this file out as a serialized HTD Writable followed by two '\n's and then
      // the toString version of HTD.  Now we just write out the pb serialization.
      out.write(TableDescriptorBuilder.toByteArray(htd));
    }
  }

  /**
   * Create a new TableDescriptor in HDFS. Happens when we are creating a table.
   * Used by tests.
   * @return True if we successfully created the file.
   */
  public boolean createTableDescriptor(TableDescriptor htd) throws IOException {
    return createTableDescriptor(htd, false);
  }

  /**
   * Create a new TableDescriptor in HDFS. Happens when we are creating a table.
   * If forceCreation is true, then even if a previous table descriptor is
   * present it will be overwritten.
   *
   * @return True if we successfully created the file.
   */
  public boolean createTableDescriptor(TableDescriptor htd, boolean forceCreation)
  throws IOException {
    Path tableDir = getTableDir(htd.getTableName());
    return createTableDescriptorForTableDirectory(tableDir, htd, forceCreation);
  }

  /**
   * Create a new TableDescriptor in HDFS in the specified table directory. Happens when we create
   * a new table or snapshot a table.
   * @param tableDir table directory under which we should write the file
   * @param htd description of the table to write
   * @param forceCreation if <tt>true</tt>, then even if a previous table descriptor is present it
   *          will be overwritten
   * @return <tt>true</tt> if we successfully created the file, <tt>false</tt> if the file
   *         already exists and we weren't forcing the descriptor creation.
   * @throws IOException if a filesystem error occurs
   */
  public boolean createTableDescriptorForTableDirectory(Path tableDir,
      TableDescriptor htd, boolean forceCreation) throws IOException {
    if (fsreadonly) {
      throw new NotImplementedException("Cannot create a table descriptor - in read only mode");
    }
    FileStatus status = getTableInfoPath(fs, tableDir);
    if (status != null) {
      LOG.debug("Current path=" + status.getPath());
      if (!forceCreation) {
        if (fs.exists(status.getPath()) && status.getLen() > 0) {
          if (readTableDescriptor(fs, status).equals(htd)) {
            LOG.trace("TableInfo already exists. Skipping creation.");
            return false;
          }
        }
      }
    }
    Path p = writeTableDescriptor(fs, htd, tableDir, status);
    return p != null;
  }

}