001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.hbase.io.hfile.bucket; 019 020import java.util.concurrent.ConcurrentHashMap; 021import java.util.concurrent.atomic.AtomicInteger; 022import org.apache.yetus.audience.InterfaceAudience; 023import org.slf4j.Logger; 024import org.slf4j.LoggerFactory; 025 026/** 027 * Pool of string values encoded to integer IDs for use in BlockCacheKey. This allows for avoiding 028 * duplicating string values for file names, region and CF values on various BlockCacheKey 029 * instances. Normally, single hfiles have many blocks. This means all blocks from the same file 030 * will have the very same file, region and CF names. On very large BucketCache setups (i.e. file 031 * based cache with TB size order), can save few GBs of memory by avoiding repeating these common 032 * string values on blocks from the same file. The FilePathStringPool is implemented as a singleton, 033 * since the same pool should be shared by all BlockCacheKey instances, as well as the BucketCache 034 * object itself. The Id for an encoded string is an integer. Any new String added to the pool is 035 * assigned the next available integer ID, starting from 0 upwards. That sets the total pool 036 * capacity to Integer.MAX_VALUE. In the event of ID exhaustion (integer overflow when Id values 037 * reach Integer.MAX_VALUE), the encode() method will restart iterating over int values 038 * incrementally from 0 until it finds an unused ID. Strings can be removed from the pool using the 039 * remove() method. BucketCache should call this when evicting all blocks for a given file (see 040 * BucketCache.evictFileBlocksFromCache()). 041 * <p> 042 * Thread-safe implementation that maintains bidirectional mappings between strings and IDs. 043 * </p> 044 */ 045@InterfaceAudience.Private 046public class FilePathStringPool { 047 private static final Logger LOG = LoggerFactory.getLogger(FilePathStringPool.class); 048 049 // Bidirectional mappings for string objects re-use 050 private final ConcurrentHashMap<String, Integer> stringToId = new ConcurrentHashMap<>(); 051 private final ConcurrentHashMap<Integer, String> idToString = new ConcurrentHashMap<>(); 052 private final AtomicInteger nextId = new AtomicInteger(0); 053 054 private static FilePathStringPool instance; 055 056 public static FilePathStringPool getInstance() { 057 synchronized (FilePathStringPool.class) { 058 if (instance == null) { 059 instance = new FilePathStringPool(); 060 } 061 } 062 return instance; 063 } 064 065 private FilePathStringPool() { 066 // Private constructor for singleton 067 } 068 069 /** 070 * Gets or creates an integer ID for the given String. 071 * @param string value for the file/region/CF name. 072 * @return the integer ID encoding this string in the pool. 073 */ 074 public int encode(String string) { 075 if (string == null) { 076 throw new IllegalArgumentException("string cannot be null"); 077 } 078 return stringToId.computeIfAbsent(string, name -> { 079 if (stringToId.size() == Integer.MAX_VALUE) { 080 throw new IllegalStateException( 081 "String pool has reached maximum capacity of " + Integer.MAX_VALUE + " unique strings."); 082 } 083 int id = nextId.getAndIncrement(); 084 while (idToString.containsKey(id)) { 085 id = nextId.getAndIncrement(); 086 if (id == Integer.MAX_VALUE) { 087 nextId.set(0); 088 LOG.info("Id values reached Integer.MAX_VALUE, restarting from 0"); 089 } 090 } 091 idToString.put(id, name); 092 LOG.trace("Encoded new string to ID {}: {}", id, name); 093 return id; 094 }); 095 } 096 097 /** 098 * Decodes an integer ID back to its original file name. 099 * @param id the integer ID 100 * @return the original file name, or null if not found 101 */ 102 public String decode(int id) { 103 return idToString.get(id); 104 } 105 106 /** 107 * Checks if a given string ID is already being used. 108 * @param id the integer ID to check 109 * @return true if the ID exists 110 */ 111 public boolean contains(int id) { 112 return idToString.containsKey(id); 113 } 114 115 /** 116 * Checks if a given string has been encoded. 117 * @param string the value to check 118 * @return true if the string value has been encoded 119 */ 120 public boolean contains(String string) { 121 return stringToId.containsKey(string); 122 } 123 124 /** 125 * Gets the number of unique file names currently tracked. 126 * @return the number of entries in the codec 127 */ 128 public int size() { 129 return stringToId.size(); 130 } 131 132 /** 133 * Removes a string value and its ID from the pool. This should only be called when all blocks for 134 * a file have been evicted from the cache. 135 * @param string the file name to remove 136 * @return true if the file name was removed, false if it wasn't present 137 */ 138 public boolean remove(String string) { 139 if (string == null) { 140 return false; 141 } 142 Integer id = stringToId.remove(string); 143 if (id != null) { 144 idToString.remove(id); 145 LOG.debug("Removed string value from pool: {} (ID: {})", string, id); 146 return true; 147 } 148 return false; 149 } 150 151 /** 152 * Clears all mappings from the codec. 153 */ 154 public void clear() { 155 stringToId.clear(); 156 idToString.clear(); 157 nextId.set(0); 158 LOG.info("Cleared all file name mappings from codec"); 159 } 160 161 /** 162 * Gets statistics about memory savings from string pooling. 163 * @return a formatted string with compression statistics 164 */ 165 public String getPoolStats() { 166 long uniqueStrings = stringToId.size(); 167 if (uniqueStrings == 0) { 168 return "No strings encoded"; 169 } 170 // Calculate average string length 171 long totalChars = stringToId.keySet().stream().mapToLong(String::length).sum(); 172 double avgLength = (double) totalChars / uniqueStrings; 173 return String.format("FilePathStringPool stats: %d unique strings, avg length: %.1f chars, ", 174 uniqueStrings, avgLength); 175 } 176}