001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.hbase.io.hfile.bucket;
019
020import java.util.concurrent.ConcurrentHashMap;
021import java.util.concurrent.atomic.AtomicInteger;
022import org.apache.yetus.audience.InterfaceAudience;
023import org.slf4j.Logger;
024import org.slf4j.LoggerFactory;
025
026/**
027 * Pool of string values encoded to integer IDs for use in BlockCacheKey. This allows for avoiding
028 * duplicating string values for file names, region and CF values on various BlockCacheKey
029 * instances. Normally, single hfiles have many blocks. This means all blocks from the same file
030 * will have the very same file, region and CF names. On very large BucketCache setups (i.e. file
031 * based cache with TB size order), can save few GBs of memory by avoiding repeating these common
032 * string values on blocks from the same file. The FilePathStringPool is implemented as a singleton,
033 * since the same pool should be shared by all BlockCacheKey instances, as well as the BucketCache
034 * object itself. The Id for an encoded string is an integer. Any new String added to the pool is
035 * assigned the next available integer ID, starting from 0 upwards. That sets the total pool
036 * capacity to Integer.MAX_VALUE. In the event of ID exhaustion (integer overflow when Id values
037 * reach Integer.MAX_VALUE), the encode() method will restart iterating over int values
038 * incrementally from 0 until it finds an unused ID. Strings can be removed from the pool using the
039 * remove() method. BucketCache should call this when evicting all blocks for a given file (see
040 * BucketCache.evictFileBlocksFromCache()).
041 * <p>
042 * Thread-safe implementation that maintains bidirectional mappings between strings and IDs.
043 * </p>
044 */
045@InterfaceAudience.Private
046public class FilePathStringPool {
047  private static final Logger LOG = LoggerFactory.getLogger(FilePathStringPool.class);
048
049  // Bidirectional mappings for string objects re-use
050  private final ConcurrentHashMap<String, Integer> stringToId = new ConcurrentHashMap<>();
051  private final ConcurrentHashMap<Integer, String> idToString = new ConcurrentHashMap<>();
052  private final AtomicInteger nextId = new AtomicInteger(0);
053
054  private static FilePathStringPool instance;
055
056  public static FilePathStringPool getInstance() {
057    synchronized (FilePathStringPool.class) {
058      if (instance == null) {
059        instance = new FilePathStringPool();
060      }
061    }
062    return instance;
063  }
064
065  private FilePathStringPool() {
066    // Private constructor for singleton
067  }
068
069  /**
070   * Gets or creates an integer ID for the given String.
071   * @param string value for the file/region/CF name.
072   * @return the integer ID encoding this string in the pool.
073   */
074  public int encode(String string) {
075    if (string == null) {
076      throw new IllegalArgumentException("string cannot be null");
077    }
078    return stringToId.computeIfAbsent(string, name -> {
079      if (stringToId.size() == Integer.MAX_VALUE) {
080        throw new IllegalStateException(
081          "String pool has reached maximum capacity of " + Integer.MAX_VALUE + " unique strings.");
082      }
083      int id = nextId.getAndIncrement();
084      while (idToString.containsKey(id)) {
085        id = nextId.getAndIncrement();
086        if (id == Integer.MAX_VALUE) {
087          nextId.set(0);
088          LOG.info("Id values reached Integer.MAX_VALUE, restarting from 0");
089        }
090      }
091      idToString.put(id, name);
092      LOG.trace("Encoded new string to ID {}: {}", id, name);
093      return id;
094    });
095  }
096
097  /**
098   * Decodes an integer ID back to its original file name.
099   * @param id the integer ID
100   * @return the original file name, or null if not found
101   */
102  public String decode(int id) {
103    return idToString.get(id);
104  }
105
106  /**
107   * Checks if a given string ID is already being used.
108   * @param id the integer ID to check
109   * @return true if the ID exists
110   */
111  public boolean contains(int id) {
112    return idToString.containsKey(id);
113  }
114
115  /**
116   * Checks if a given string has been encoded.
117   * @param string the value to check
118   * @return true if the string value has been encoded
119   */
120  public boolean contains(String string) {
121    return stringToId.containsKey(string);
122  }
123
124  /**
125   * Gets the number of unique file names currently tracked.
126   * @return the number of entries in the codec
127   */
128  public int size() {
129    return stringToId.size();
130  }
131
132  /**
133   * Removes a string value and its ID from the pool. This should only be called when all blocks for
134   * a file have been evicted from the cache.
135   * @param string the file name to remove
136   * @return true if the file name was removed, false if it wasn't present
137   */
138  public boolean remove(String string) {
139    if (string == null) {
140      return false;
141    }
142    Integer id = stringToId.remove(string);
143    if (id != null) {
144      idToString.remove(id);
145      LOG.debug("Removed string value from pool: {} (ID: {})", string, id);
146      return true;
147    }
148    return false;
149  }
150
151  /**
152   * Clears all mappings from the codec.
153   */
154  public void clear() {
155    stringToId.clear();
156    idToString.clear();
157    nextId.set(0);
158    LOG.info("Cleared all file name mappings from codec");
159  }
160
161  /**
162   * Gets statistics about memory savings from string pooling.
163   * @return a formatted string with compression statistics
164   */
165  public String getPoolStats() {
166    long uniqueStrings = stringToId.size();
167    if (uniqueStrings == 0) {
168      return "No strings encoded";
169    }
170    // Calculate average string length
171    long totalChars = stringToId.keySet().stream().mapToLong(String::length).sum();
172    double avgLength = (double) totalChars / uniqueStrings;
173    return String.format("FilePathStringPool stats: %d unique strings, avg length: %.1f chars, ",
174      uniqueStrings, avgLength);
175  }
176}