/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.util;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.primitives.Ints;
import edu.umd.cs.findbugs.annotations.Nullable;
import org.apache.commons.lang.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.Coprocessor;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableDescriptors;
import org.apache.hadoop.hbase.TableInfoMissingException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.regionserver.BloomType;
/**
 * Implementation of {@link TableDescriptors} that reads descriptors from the
 * passed filesystem.  It expects descriptors to be in a file in the
 * {@link #TABLEINFO_DIR} subdir of the table's directory in FS.  Can be
 * read-only -- i.e. it does not modify the filesystem -- or read-write.
 *
 * <p>Also has utility for keeping up the table descriptors tableinfo file.
 * The table schema file is kept in the {@link #TABLEINFO_DIR} subdir
 * of the table directory in the filesystem.
 * It has a {@link #TABLEINFO_FILE_PREFIX} and then a suffix that is the
 * edit sequenceid: e.g. <code>.tableinfo.0000000003</code>.  This sequenceid
 * is always increasing.  It starts at zero.  The table schema file with the
 * highest sequenceid has the most recent schema edit.  Usually there is only
 * one file, the most recent, but there may be short periods where there are
 * more than one.  Old files are eventually cleaned up.  The presumption is
 * that there will not be lots of concurrent clients making table schema
 * edits; if that changes, the below needs a bit of reworking and perhaps some
 * supporting api in hdfs.
 */
@InterfaceAudience.Private
public class FSTableDescriptors implements TableDescriptors {
  private static final Log LOG = LogFactory.getLog(FSTableDescriptors.class);
  private final FileSystem fs;
  private final Path rootdir;
  private final boolean fsreadonly;
  private volatile boolean usecache;
  private volatile boolean fsvisited;

  @VisibleForTesting long cachehits = 0;
  @VisibleForTesting long invocations = 0;

  /** The file name prefix used to store HTD in HDFS */
  static final String TABLEINFO_FILE_PREFIX = ".tableinfo";
  static final String TABLEINFO_DIR = ".tabledesc";
  static final String TMP_DIR = ".tmp";

  // This cache does not age out the old stuff.  Thinking is that the amount
  // of data we keep up in here is so small, no need to do occasional purge.
  // TODO.
  private final Map<TableName, HTableDescriptor> cache =
    new ConcurrentHashMap<TableName, HTableDescriptor>();

  /**
   * Table descriptor for <code>hbase:meta</code> catalog table
   */
  private final HTableDescriptor metaTableDescriptor;

  /**
   * Construct an FSTableDescriptors instance using the hbase root dir of the
   * given conf and the filesystem where that root dir lives.
   * This instance can do write operations (is not read only).
   */
  public FSTableDescriptors(final Configuration conf) throws IOException {
    this(conf, FSUtils.getCurrentFileSystem(conf), FSUtils.getRootDir(conf));
  }

  public FSTableDescriptors(final Configuration conf, final FileSystem fs, final Path rootdir)
  throws IOException {
    this(conf, fs, rootdir, false, true);
  }

  /**
   * @param fsreadonly True if we are read-only when it comes to filesystem
   * operations; i.e. on remove, we do not do delete in fs.
   */
  public FSTableDescriptors(final Configuration conf, final FileSystem fs,
    final Path rootdir, final boolean fsreadonly, final boolean usecache) throws IOException {
    this.fs = fs;
    this.rootdir = rootdir;
    this.fsreadonly = fsreadonly;
    this.usecache = usecache;
    this.metaTableDescriptor = createMetaTableDescriptor(conf);
  }
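
  // Usage sketch (illustrative only; the table name below is hypothetical): a tool
  // that merely inspects descriptors could open a read-only, uncached view so the
  // filesystem is never modified:
  //
  //   FSTableDescriptors fstd =
  //       new FSTableDescriptors(conf, fs, rootdir, true /* fsreadonly */, false /* usecache */);
  //   HTableDescriptor htd = fstd.get(TableName.valueOf("example_table"));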

  @VisibleForTesting
  public static HTableDescriptor createMetaTableDescriptor(final Configuration conf)
      throws IOException {
    HTableDescriptor metaDescriptor = new HTableDescriptor(
        TableName.META_TABLE_NAME,
        new HColumnDescriptor[] {
            new HColumnDescriptor(HConstants.CATALOG_FAMILY)
                .setMaxVersions(conf.getInt(HConstants.HBASE_META_VERSIONS,
                    HConstants.DEFAULT_HBASE_META_VERSIONS))
                .setInMemory(true)
                .setBlocksize(conf.getInt(HConstants.HBASE_META_BLOCK_SIZE,
                    HConstants.DEFAULT_HBASE_META_BLOCK_SIZE))
                .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
                // Disable blooms for meta.  Needs work.  Seems to mess w/ getClosestOrBefore.
                .setBloomFilterType(BloomType.NONE)
                // Enable cache of data blocks in L1 if more than one caching tier deployed:
                // e.g. if using CombinedBlockCache (BucketCache).
                .setCacheDataInL1(true),
            new HColumnDescriptor(HConstants.REPLICATION_BARRIER_FAMILY)
                .setMaxVersions(conf.getInt(HConstants.HBASE_META_VERSIONS,
                    HConstants.DEFAULT_HBASE_META_VERSIONS))
                .setInMemory(true)
                .setBlocksize(conf.getInt(HConstants.HBASE_META_BLOCK_SIZE,
                    HConstants.DEFAULT_HBASE_META_BLOCK_SIZE))
                .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
                // Disable blooms for meta.  Needs work.  Seems to mess w/ getClosestOrBefore.
                .setBloomFilterType(BloomType.NONE)
                // Enable cache of data blocks in L1 if more than one caching tier deployed:
                // e.g. if using CombinedBlockCache (BucketCache).
                .setCacheDataInL1(true),
            new HColumnDescriptor(HConstants.REPLICATION_POSITION_FAMILY)
                .setMaxVersions(conf.getInt(HConstants.HBASE_META_VERSIONS,
                    HConstants.DEFAULT_HBASE_META_VERSIONS))
                .setInMemory(true)
                .setBlocksize(conf.getInt(HConstants.HBASE_META_BLOCK_SIZE,
                    HConstants.DEFAULT_HBASE_META_BLOCK_SIZE))
                .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
                // Disable blooms for meta.  Needs work.  Seems to mess w/ getClosestOrBefore.
                .setBloomFilterType(BloomType.NONE)
                // Enable cache of data blocks in L1 if more than one caching tier deployed:
                // e.g. if using CombinedBlockCache (BucketCache).
                .setCacheDataInL1(true),
            new HColumnDescriptor(HConstants.TABLE_FAMILY)
                // Ten is an arbitrary number.  Keep versions to help debugging.
                .setMaxVersions(10)
                .setInMemory(true)
                .setBlocksize(8 * 1024)
                .setScope(HConstants.REPLICATION_SCOPE_LOCAL)
                // Disable blooms for meta.  Needs work.  Seems to mess w/ getClosestOrBefore.
                .setBloomFilterType(BloomType.NONE)
                // Enable cache of data blocks in L1 if more than one caching tier deployed:
                // e.g. if using CombinedBlockCache (BucketCache).
                .setCacheDataInL1(true)
        }) {};
    metaDescriptor.addCoprocessor(
        "org.apache.hadoop.hbase.coprocessor.MultiRowMutationEndpoint",
        null, Coprocessor.PRIORITY_SYSTEM, null);
    return metaDescriptor;
  }
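
  // Descriptive note: all four families above share the same tuning -- kept
  // in-memory, REPLICATION_SCOPE_LOCAL, bloom filters disabled, data blocks
  // cached in L1 -- and differ mostly in name and version counts.  The empty
  // anonymous subclass body ({}) is presumably there to reach a protected
  // HTableDescriptor constructor from this package.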

  @Override
  public void setCacheOn() throws IOException {
    this.cache.clear();
    this.usecache = true;
  }

  @Override
  public void setCacheOff() throws IOException {
    this.usecache = false;
    this.cache.clear();
  }

  @VisibleForTesting
  public boolean isUsecache() {
    return this.usecache;
  }

  /**
   * Get the current table descriptor for the given table, or null if none exists.
   *
   * Answers from the local cache of descriptors when caching is enabled; on a
   * cache miss (or when caching is off) it reads the descriptor from the
   * filesystem and, if caching is enabled, stores the result.
   */
  @Override
  @Nullable
  public HTableDescriptor get(final TableName tablename)
  throws IOException {
    invocations++;
    if (TableName.META_TABLE_NAME.equals(tablename)) {
      cachehits++;
      return metaTableDescriptor;
    }
    // hbase:meta is already handled. If someone tries to get the descriptor for
    // .logs, .oldlogs or .corrupt, throw an exception.
    if (HConstants.HBASE_NON_USER_TABLE_DIRS.contains(tablename.getNameAsString())) {
      throw new IOException("No descriptor found for non-table = " + tablename);
    }

    if (usecache) {
      // Look in cache of descriptors.
      HTableDescriptor cachedtdm = this.cache.get(tablename);
      if (cachedtdm != null) {
        cachehits++;
        return cachedtdm;
      }
    }
    HTableDescriptor tdmt = null;
    try {
      tdmt = getTableDescriptorFromFs(fs, rootdir, tablename);
    } catch (NullPointerException e) {
      LOG.debug("Exception during readTableDescriptor. Current table name = "
          + tablename, e);
    } catch (TableInfoMissingException e) {
      // Ignore.  This is regular operation.
    } catch (IOException ioe) {
      LOG.debug("Exception during readTableDescriptor. Current table name = "
          + tablename, ioe);
    }
    // last HTD written wins
    if (usecache && tdmt != null) {
      this.cache.put(tablename, tdmt);
    }

    return tdmt;
  }

  /**
   * Returns a map from table name to table descriptor for all tables.
   */
  @Override
  public Map<String, HTableDescriptor> getAllDescriptors()
  throws IOException {
    Map<String, HTableDescriptor> tds = new TreeMap<String, HTableDescriptor>();

    if (fsvisited && usecache) {
      for (Map.Entry<TableName, HTableDescriptor> entry: this.cache.entrySet()) {
        tds.put(entry.getKey().toString(), entry.getValue());
      }
      // add hbase:meta to the response
      tds.put(this.metaTableDescriptor.getNameAsString(), metaTableDescriptor);
    } else {
      LOG.debug("Fetching table descriptors from the filesystem.");
      boolean allvisited = true;
      for (Path d : FSUtils.getTableDirs(fs, rootdir)) {
        HTableDescriptor htd = null;
        try {
          htd = get(FSUtils.getTableName(d));
        } catch (FileNotFoundException fnfe) {
          // Failure to retrieve one HTD shouldn't stop us getting the remaining ones.
          LOG.warn("Trouble retrieving htd", fnfe);
        }
        if (htd == null) {
          allvisited = false;
          continue;
        } else {
          tds.put(htd.getTableName().getNameAsString(), htd);
        }
      }
      // Only record a fully-cached view once every table directory has been visited
      // successfully; updating this inside the loop could leave fsvisited true even
      // though a later directory failed.
      fsvisited = allvisited;
    }
    return tds;
  }
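
  // Descriptive note: fsvisited flips to true only after a scan in which every
  // table directory yielded a descriptor; from then on, while the cache is
  // enabled, getAllDescriptors() is answered purely from the cache (plus the
  // built-in hbase:meta descriptor) without touching the filesystem.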

  /**
   * Returns a map from table name to table descriptor for all tables.
   */
  @Override
  public Map<String, HTableDescriptor> getAll() throws IOException {
    Map<String, HTableDescriptor> htds = new TreeMap<String, HTableDescriptor>();
    Map<String, HTableDescriptor> allDescriptors = getAllDescriptors();
    for (Map.Entry<String, HTableDescriptor> entry : allDescriptors.entrySet()) {
      htds.put(entry.getKey(), entry.getValue());
    }
    return htds;
  }

  /**
   * Find descriptors by namespace.
   * @see #get(org.apache.hadoop.hbase.TableName)
   */
  @Override
  public Map<String, HTableDescriptor> getByNamespace(String name)
  throws IOException {
    Map<String, HTableDescriptor> htds = new TreeMap<String, HTableDescriptor>();
    List<Path> tableDirs =
        FSUtils.getLocalTableDirs(fs, FSUtils.getNamespaceDir(rootdir, name));
    for (Path d: tableDirs) {
      HTableDescriptor htd = null;
      try {
        htd = get(FSUtils.getTableName(d));
      } catch (FileNotFoundException fnfe) {
        // Failure to retrieve one HTD shouldn't stop us getting the remaining ones.
        LOG.warn("Trouble retrieving htd", fnfe);
      }
      if (htd == null) continue;
      htds.put(FSUtils.getTableName(d).getNameAsString(), htd);
    }
    return htds;
  }

  /**
   * Adds (or updates) the table descriptor to the FileSystem
   * and updates the local cache with it.
   */
  @Override
  public void add(HTableDescriptor htd) throws IOException {
    if (fsreadonly) {
      throw new NotImplementedException("Cannot add a table descriptor - in read only mode");
    }
    TableName tableName = htd.getTableName();
    if (TableName.META_TABLE_NAME.equals(tableName)) {
      throw new NotImplementedException("Cannot add a table descriptor for hbase:meta");
    }
    if (HConstants.HBASE_NON_USER_TABLE_DIRS.contains(tableName.getNameAsString())) {
      throw new NotImplementedException(
          "Cannot add a table descriptor for a reserved subdirectory name: "
              + htd.getNameAsString());
    }
    updateTableDescriptor(htd);
  }

  /**
   * Removes the table descriptor from the local cache and returns it.
   * If not in read only mode, it also deletes the entire table directory(!)
   * from the FileSystem.
   */
  @Override
  public HTableDescriptor remove(final TableName tablename)
  throws IOException {
    if (fsreadonly) {
      throw new NotImplementedException("Cannot remove a table descriptor - in read only mode");
    }
    Path tabledir = getTableDir(tablename);
    if (this.fs.exists(tabledir)) {
      if (!this.fs.delete(tabledir, true)) {
        throw new IOException("Failed delete of " + tabledir.toString());
      }
    }
    return this.cache.remove(tablename);
  }

  /**
   * Checks if a current table info file exists for the given table
   *
   * @param tableName name of table
   * @return true if exists
   * @throws IOException
   */
  public boolean isTableInfoExists(TableName tableName) throws IOException {
    return getTableInfoPath(tableName) != null;
  }

  /**
   * Find the most current table info file for the given table in the hbase root directory.
   * @return The file status of the current table info file or null if it does not exist
   */
  private FileStatus getTableInfoPath(final TableName tableName) throws IOException {
    Path tableDir = getTableDir(tableName);
    return getTableInfoPath(tableDir);
  }

  private FileStatus getTableInfoPath(Path tableDir)
  throws IOException {
    return getTableInfoPath(fs, tableDir, !fsreadonly);
  }

  /**
   * Find the most current table info file for the table located in the given table directory.
   *
   * Looks within the {@link #TABLEINFO_DIR} subdirectory of the given directory for any table info
   * files and takes the 'current' one - meaning the one with the highest sequence number, or,
   * failing that, one with no sequence number at all (kept for backward compatibility with files
   * written before there were sequence numbers).
   *
   * @return The file status of the current table info file or null if it does not exist
   * @throws IOException
   */
  public static FileStatus getTableInfoPath(FileSystem fs, Path tableDir)
  throws IOException {
    return getTableInfoPath(fs, tableDir, false);
  }

  /**
   * Find the most current table info file for the table in the given table directory.
   *
   * Looks within the {@link #TABLEINFO_DIR} subdirectory of the given directory for any table info
   * files and takes the 'current' one - meaning the one with the highest sequence number, or,
   * failing that, one with no sequence number at all (kept for backward compatibility with files
   * written before there were sequence numbers).
   * If multiple table info files are found and removeOldFiles is true, the older files are
   * deleted as well.
   *
   * @return The file status of the current table info file or null if none exist
   * @throws IOException
   */
  private static FileStatus getTableInfoPath(FileSystem fs, Path tableDir, boolean removeOldFiles)
  throws IOException {
    Path tableInfoDir = new Path(tableDir, TABLEINFO_DIR);
    return getCurrentTableInfoStatus(fs, tableInfoDir, removeOldFiles);
  }

  /**
   * Find the most current table info file in the given directory.
   *
   * Looks within the given directory for any table info files and takes the
   * 'current' one - meaning the one with the highest sequence number, or, failing
   * that, one with no sequence number at all (kept for backward compatibility with
   * files written before there were sequence numbers).
   * If multiple possible files are found and removeOldFiles is true, the older
   * files are deleted as well.
   *
   * @return The file status of the current table info file or null if it does not exist
   * @throws IOException
   */
  // only visible for FSTableDescriptorMigrationToSubdir, can be removed with that
  static FileStatus getCurrentTableInfoStatus(FileSystem fs, Path dir, boolean removeOldFiles)
  throws IOException {
    FileStatus [] status = FSUtils.listStatus(fs, dir, TABLEINFO_PATHFILTER);
    if (status == null || status.length < 1) return null;
    FileStatus mostCurrent = null;
    for (FileStatus file : status) {
      if (mostCurrent == null || TABLEINFO_FILESTATUS_COMPARATOR.compare(file, mostCurrent) < 0) {
        mostCurrent = file;
      }
    }
    if (removeOldFiles && status.length > 1) {
      // Clean away old versions
      for (FileStatus file : status) {
        Path path = file.getPath();
        if (file != mostCurrent) {
          if (!fs.delete(file.getPath(), false)) {
            LOG.warn("Failed cleanup of " + path);
          } else {
            LOG.debug("Cleaned up old tableinfo file " + path);
          }
        }
      }
    }
    return mostCurrent;
  }

  /**
   * Compare {@link FileStatus} instances by {@link Path#getName()}. Returns in
   * reverse order.
   */
  @VisibleForTesting
  static final Comparator<FileStatus> TABLEINFO_FILESTATUS_COMPARATOR =
  new Comparator<FileStatus>() {
    @Override
    public int compare(FileStatus left, FileStatus right) {
      return right.compareTo(left);
    }};
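
  // Descriptive note on the ordering above: because sequenceids are zero-padded to a
  // fixed width (WIDTH_OF_SEQUENCE_ID), plain lexicographic comparison of file names
  // orders tableinfo files by sequenceid; reversing it makes the file with the
  // highest sequenceid sort first, which is why getCurrentTableInfoStatus() treats
  // compare(file, mostCurrent) < 0 as "file is more current".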

  /**
   * Return the table directory in HDFS
   */
  @VisibleForTesting Path getTableDir(final TableName tableName) {
    return FSUtils.getTableDir(rootdir, tableName);
  }

  private static final PathFilter TABLEINFO_PATHFILTER = new PathFilter() {
    @Override
    public boolean accept(Path p) {
      // Accept any file that starts with TABLEINFO_FILE_PREFIX
      return p.getName().startsWith(TABLEINFO_FILE_PREFIX);
    }};

  /**
   * Width of the sequenceid that is a suffix on a tableinfo file.
   */
  @VisibleForTesting static final int WIDTH_OF_SEQUENCE_ID = 10;

  /*
   * @param number Number to use as suffix.
   * @return Returns zero-prefixed decimal version of passed number (uses the
   * absolute value in case the number is negative).
   */
  private static String formatTableInfoSequenceId(final int number) {
    byte [] b = new byte[WIDTH_OF_SEQUENCE_ID];
    int d = Math.abs(number);
    for (int i = b.length - 1; i >= 0; i--) {
      b[i] = (byte)((d % 10) + '0');
      d /= 10;
    }
    return Bytes.toString(b);
  }

  /**
   * Regex to eat up sequenceid suffix on a .tableinfo file.
   * Use a regex because we may encounter old-style .tableinfo files that have no
   * sequenceid on the end.
   */
  private static final Pattern TABLEINFO_FILE_REGEX =
    Pattern.compile(TABLEINFO_FILE_PREFIX + "(\\.([0-9]{" + WIDTH_OF_SEQUENCE_ID + "}))?$");

  /**
   * @param p Path to a <code>.tableinfo</code> file.
   * @return The current editid or 0 if none found.
   */
  @VisibleForTesting static int getTableInfoSequenceId(final Path p) {
    if (p == null) return 0;
    Matcher m = TABLEINFO_FILE_REGEX.matcher(p.getName());
    if (!m.matches()) throw new IllegalArgumentException(p.toString());
    String suffix = m.group(2);
    if (suffix == null || suffix.length() <= 0) return 0;
    return Integer.parseInt(m.group(2));
  }

  /**
   * @param sequenceid
   * @return Name of tableinfo file.
   */
  @VisibleForTesting static String getTableInfoFileName(final int sequenceid) {
    return TABLEINFO_FILE_PREFIX + "." + formatTableInfoSequenceId(sequenceid);
  }
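
  // For example (illustrative): getTableInfoFileName(3) yields ".tableinfo.0000000003",
  // and getTableInfoSequenceId() maps that name back to 3; a bare ".tableinfo" file
  // from before sequenceids existed parses as 0.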

  /**
   * Returns the latest table descriptor for the given table directly from the file system
   * if it exists, bypassing the local cache.
   * @throws TableInfoMissingException if there is no descriptor
   */
  public static HTableDescriptor getTableDescriptorFromFs(FileSystem fs,
      Path hbaseRootDir, TableName tableName) throws IOException {
    Path tableDir = FSUtils.getTableDir(hbaseRootDir, tableName);
    return getTableDescriptorFromFs(fs, tableDir);
  }

  /**
   * Returns the latest table descriptor for the table located at the given directory
   * directly from the file system if it exists.
   * @throws TableInfoMissingException if there is no descriptor
   */
  public static HTableDescriptor getTableDescriptorFromFs(FileSystem fs, Path tableDir)
  throws IOException {
    FileStatus status = getTableInfoPath(fs, tableDir, false);
    if (status == null) {
      throw new TableInfoMissingException("No table descriptor file under " + tableDir);
    }
    return readTableDescriptor(fs, status);
  }

  private static HTableDescriptor readTableDescriptor(FileSystem fs, FileStatus status)
      throws IOException {
    int len = Ints.checkedCast(status.getLen());
    byte [] content = new byte[len];
    FSDataInputStream fsDataInputStream = fs.open(status.getPath());
    try {
      fsDataInputStream.readFully(content);
    } finally {
      fsDataInputStream.close();
    }
    HTableDescriptor htd = null;
    try {
      htd = HTableDescriptor.parseFrom(content);
    } catch (DeserializationException e) {
      // Include the raw content in a printable form to aid debugging.
      throw new IOException("content=" + Bytes.toStringBinary(content), e);
    }
    return htd;
  }

  /**
   * Update table descriptor on the file system
   * @throws IOException Thrown if failed update.
   * @throws NotImplementedException if in read only mode
   */
  @VisibleForTesting Path updateTableDescriptor(HTableDescriptor td)
  throws IOException {
    if (fsreadonly) {
      throw new NotImplementedException("Cannot update a table descriptor - in read only mode");
    }
    TableName tableName = td.getTableName();
    Path tableDir = getTableDir(tableName);
    Path p = writeTableDescriptor(fs, td, tableDir, getTableInfoPath(tableDir));
    if (p == null) throw new IOException("Failed update");
    LOG.info("Updated tableinfo=" + p);
    if (usecache) {
      this.cache.put(td.getTableName(), td);
    }
    return p;
  }

  /**
   * Deletes all the table descriptor files from the file system for the given table.
   * Used in unit tests only.
   * @throws NotImplementedException if in read only mode
   */
  public void deleteTableDescriptorIfExists(TableName tableName) throws IOException {
    if (fsreadonly) {
      throw new NotImplementedException("Cannot delete a table descriptor - in read only mode");
    }

    Path tableDir = getTableDir(tableName);
    Path tableInfoDir = new Path(tableDir, TABLEINFO_DIR);
    deleteTableDescriptorFiles(fs, tableInfoDir, Integer.MAX_VALUE);
  }

  /**
   * Deletes files matching the table info file pattern within the given directory
   * whose sequenceId is at most the given max sequenceId.
   */
  private static void deleteTableDescriptorFiles(FileSystem fs, Path dir, int maxSequenceId)
  throws IOException {
    FileStatus [] status = FSUtils.listStatus(fs, dir, TABLEINFO_PATHFILTER);
    if (status == null) return; // No matching files; nothing to delete.
    for (FileStatus file : status) {
      Path path = file.getPath();
      int sequenceId = getTableInfoSequenceId(path);
      if (sequenceId <= maxSequenceId) {
        boolean success = FSUtils.delete(fs, path, false);
        if (success) {
          LOG.debug("Deleted table descriptor at " + path);
        } else {
          LOG.error("Failed to delete descriptor at " + path);
        }
      }
    }
  }
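
  // For example (illustrative): with .tableinfo.0000000001 through .tableinfo.0000000003
  // present, deleteTableDescriptorFiles(fs, dir, 2) removes the first two files and
  // leaves .tableinfo.0000000003 in place; this is how writeTableDescriptor() retires
  // superseded descriptors after a successful write.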

  /**
   * Attempts to write a new table descriptor to the given table's directory.
   * It first writes it to the .tmp dir then uses an atomic rename to move it into place.
   * It begins at the currentSequenceId + 1 and tries 10 times to find a new sequence number
   * not already in use.
   * Removes the current descriptor file if passed in.
   *
   * @return Descriptor file or null if we failed write.
   */
  private static Path writeTableDescriptor(final FileSystem fs,
    final HTableDescriptor htd, final Path tableDir,
    final FileStatus currentDescriptorFile)
  throws IOException {
    // Get temporary dir into which we'll first write a file to avoid half-written file phenomenon.
    // This directory is never removed to avoid removing it out from under a concurrent writer.
    Path tmpTableDir = new Path(tableDir, TMP_DIR);
    Path tableInfoDir = new Path(tableDir, TABLEINFO_DIR);

    // What is the current sequenceid?  We read it from the current file.  After we
    // read it, another thread could come in and compete with us writing out the next
    // version of the file.  The retries below help some, but it's hard to give
    // guarantees in the face of concurrent schema edits.
    int currentSequenceId = currentDescriptorFile == null ? 0 :
      getTableInfoSequenceId(currentDescriptorFile.getPath());
    int newSequenceId = currentSequenceId;

    // Put an arbitrary upperbound on how often we retry
    int retries = 10;
    int retrymax = currentSequenceId + retries;
    Path tableInfoDirPath = null;
    do {
      newSequenceId += 1;
      String filename = getTableInfoFileName(newSequenceId);
      Path tempPath = new Path(tmpTableDir, filename);
      if (fs.exists(tempPath)) {
        LOG.debug(tempPath + " exists; retrying up to " + retries + " times");
        continue;
      }
      tableInfoDirPath = new Path(tableInfoDir, filename);
      try {
        writeTD(fs, tempPath, htd);
        fs.mkdirs(tableInfoDirPath.getParent());
        if (!fs.rename(tempPath, tableInfoDirPath)) {
          throw new IOException("Failed rename of " + tempPath + " to " + tableInfoDirPath);
        }
        LOG.debug("Wrote descriptor into: " + tableInfoDirPath);
      } catch (IOException ioe) {
        // Presume clash of names or something; go around again.
        LOG.debug("Failed write and/or rename; retrying", ioe);
        if (!FSUtils.deleteDirectory(fs, tempPath)) {
          LOG.warn("Failed cleanup of " + tempPath);
        }
        tableInfoDirPath = null;
        continue;
      }
      break;
    } while (newSequenceId < retrymax);
    if (tableInfoDirPath != null) {
      // If we succeeded, remove old table info files.
      deleteTableDescriptorFiles(fs, tableInfoDir, newSequenceId - 1);
    }
    return tableInfoDirPath;
  }
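
  // The write protocol above, summarized: (1) read the current sequenceid off the
  // newest tableinfo file, (2) write the new descriptor to .tmp/.tableinfo.<current+1>,
  // (3) rename it into the .tabledesc dir (atomic in HDFS), retrying with higher
  // sequenceids on collision up to the retry cap, and (4) on success, delete every
  // tableinfo file with a lower sequenceid.  Concurrent writers can still race
  // between steps (1) and (3); the retries only narrow that window.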

  private static void writeTD(final FileSystem fs, final Path p, final HTableDescriptor htd)
  throws IOException {
    FSDataOutputStream out = fs.create(p, false);
    try {
      // We used to write this file out as a serialized HTD Writable followed by two '\n's and then
      // the toString version of HTD.  Now we just write out the pb serialization.
      out.write(htd.toByteArray());
    } finally {
      out.close();
    }
  }

  /**
   * Create a new HTableDescriptor in HDFS.  Happens when we are creating a table.
   * Used by tests.
   * @return True if we successfully created the file.
   */
  public boolean createTableDescriptor(HTableDescriptor htd) throws IOException {
    return createTableDescriptor(htd, false);
  }

  /**
   * Create a new HTableDescriptor in HDFS.  Happens when we are creating a table.
   * If forceCreation is true, an existing table descriptor will be overwritten.
   *
   * @return True if we successfully created the file.
   */
  public boolean createTableDescriptor(HTableDescriptor htd, boolean forceCreation)
  throws IOException {
    Path tableDir = getTableDir(htd.getTableName());
    return createTableDescriptorForTableDirectory(tableDir, htd, forceCreation);
  }

  /**
   * Create a new HTableDescriptor in HDFS in the specified table directory. Happens when we create
   * a new table or snapshot a table.
   * @param tableDir table directory under which we should write the file
   * @param htd description of the table to write
   * @param forceCreation if <tt>true</tt>, then even if a previous table descriptor is present it
   *          will be overwritten
   * @return <tt>true</tt> if we successfully created the file, <tt>false</tt> if the file
   *         already exists and we weren't forcing the descriptor creation.
   * @throws IOException if a filesystem error occurs
   */
  public boolean createTableDescriptorForTableDirectory(Path tableDir,
      HTableDescriptor htd, boolean forceCreation) throws IOException {
    if (fsreadonly) {
      throw new NotImplementedException("Cannot create a table descriptor - in read only mode");
    }
    FileStatus status = getTableInfoPath(fs, tableDir);
    if (status != null) {
      LOG.debug("Current tableInfoPath = " + status.getPath());
      if (!forceCreation) {
        if (fs.exists(status.getPath()) && status.getLen() > 0) {
          if (readTableDescriptor(fs, status).equals(htd)) {
            LOG.debug("TableInfo already exists; skipping creation");
            return false;
          }
        }
      }
    }
    Path p = writeTableDescriptor(fs, htd, tableDir, status);
    return p != null;
  }

}