wchevreuil commented on code in PR #7477:
URL: https://github.com/apache/hbase/pull/7477#discussion_r2568084631
##########
hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/BlockCacheKey.java:
##########
@@ -19,83 +19,164 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.HeapSize;
+import org.apache.hadoop.hbase.io.hfile.bucket.FilePathStringPool;
import org.apache.hadoop.hbase.util.ClassSize;
+import org.apache.hadoop.hbase.util.HFileArchiveUtil;
import org.apache.yetus.audience.InterfaceAudience;
/**
* Cache Key for use with implementations of {@link BlockCache}
*/
@InterfaceAudience.Private
public class BlockCacheKey implements HeapSize, java.io.Serializable {
- private static final long serialVersionUID = -5199992013113130534L;
- private final String hfileName;
+ private static final long serialVersionUID = -5199992013113130535L; //
Changed due to format
+ // change
+
+ // New compressed format using integer file ID (when codec is available)
+ private final int hfileNameId;
+
+ private transient final FilePathStringPool stringPool;
+
+ private final int regionId;
+
+ private final int cfId;
+
private final long offset;
+
private BlockType blockType;
+
private final boolean isPrimaryReplicaBlock;
- private Path filePath;
+ private final boolean archived;
/**
- * Construct a new BlockCacheKey
+ * Constructs a new BlockCacheKey with the file name and offset only. To be
used for cache lookups
+ * only, DO NOT use this for creating keys when inserting into the cache.
Use either the
+ * overriding constructors with the path parameter or the region and cf
parameters, otherwise,
+ * region cache metrics won't be recorded properly.
* @param hfileName The name of the HFile this block belongs to.
* @param offset Offset of the block into the file
*/
public BlockCacheKey(String hfileName, long offset) {
- this(hfileName, offset, true, BlockType.DATA);
+ this(hfileName, null, null, offset, true, BlockType.DATA, false);
}
+ /**
+ * Constructs a new BlockCacheKey with the file name, offset, replica and
type only. To be used
+ * for cache lookups only, DO NOT use this for creating keys when inserting
into the cache. Use
+ * either the overriding constructors with the path parameter or the region
and cf parameters,
+ * otherwise, region cache metrics won't be recorded properly.
+ * @param hfileName The name of the HFile this block belongs to.
+ * @param offset Offset of the block into the file
+ * @param isPrimaryReplica Whether this is from primary replica
+ * @param blockType Type of block
+ */
public BlockCacheKey(String hfileName, long offset, boolean isPrimaryReplica,
BlockType blockType) {
+ this(hfileName, null, null, offset, isPrimaryReplica, blockType, false);
+ }
+
+ /**
+ * Construct a new BlockCacheKey, with file, column family and region
information. This should be
+ * used when inserting keys into the cache, so that region cache metrics are
recorded properly.
+ * @param hfileName The name of the HFile this block belongs to.
+ * @param cfName The column family name
+ * @param regionName The region name
+ * @param offset Offset of the block into the file
+ * @param isPrimaryReplica Whether this is from primary replica
+ * @param blockType Type of block
+ */
+ public BlockCacheKey(String hfileName, String cfName, String regionName,
long offset,
+ boolean isPrimaryReplica, BlockType blockType, boolean archived) {
this.isPrimaryReplicaBlock = isPrimaryReplica;
- this.hfileName = hfileName;
this.offset = offset;
this.blockType = blockType;
+ this.stringPool = FilePathStringPool.getInstance();
+ // Use string pool for file, region and cf values
+ this.hfileNameId = stringPool.encode(hfileName);
+ this.regionId = (regionName != null) ? stringPool.encode(regionName) : -1;
+ this.cfId = (cfName != null) ? stringPool.encode(cfName) : -1;
+ this.archived = archived;
}
+ /**
+ * Construct a new BlockCacheKey using a file path. File, column family and
region information
+ * will be extracted from the passed path. This should be used when
inserting keys into the cache,
+ * so that region cache metrics are recorded properly.
+ * @param hfilePath The path to the HFile
+ * @param offset Offset of the block into the file
+ * @param isPrimaryReplica Whether this is from primary replica
+ * @param blockType Type of block
+ */
public BlockCacheKey(Path hfilePath, long offset, boolean isPrimaryReplica,
BlockType blockType) {
- this.filePath = hfilePath;
- this.isPrimaryReplicaBlock = isPrimaryReplica;
- this.hfileName = hfilePath.getName();
- this.offset = offset;
- this.blockType = blockType;
+ this(hfilePath.getName(), hfilePath.getParent().getName(),
+ hfilePath.getParent().getParent().getName(), offset, isPrimaryReplica,
blockType,
+ HFileArchiveUtil.isHFileArchived(hfilePath));
}
@Override
public int hashCode() {
- return hfileName.hashCode() * 127 + (int) (offset ^ (offset >>> 32));
+ return hfileNameId * 127 + (int) (offset ^ (offset >>> 32));
Review Comment:
IDs are provided by the FilePathStringPool.encode() method, which keeps a
ConcurrentHashMap of all IDs currently in use, and atomically calculates new IDs for
strings not yet pooled. It also enforces a limit of Integer.MAX_VALUE
possible IDs; if we reach that value, FilePathStringPool.encode() throws an error.
When a string is removed from the pool (FilePathStringPool.remove()), its
associated ID is released and can be reused. Removal of a string should only
be done when we are sure no BlockCacheKey relies on that string. It's up to
BucketCache to perform string removal, as it's the only component with enough
information about when a given string is no longer used (i.e. when we remove
all blocks for a given file and all files from a given region).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]