[
https://issues.apache.org/jira/browse/HBASE-29299?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Work on HBASE-29299 started by Eungsop Yoo.
-------------------------------------------
> Reopen initialReader of HStoreFile to refresh metadata when read failed
> -----------------------------------------------------------------------
>
> Key: HBASE-29299
> URL: https://issues.apache.org/jira/browse/HBASE-29299
> Project: HBase
> Issue Type: Bug
> Affects Versions: 2.4.18, 2.5.11
> Reporter: Eungsop Yoo
> Assignee: Eungsop Yoo
> Priority: Major
>
> I discovered an issue while testing Erasure Coding. If more DataNodes go down
> than the number of parity units, the Scan naturally fails. However, even after
> the downed DataNodes are restarted, the Scan continues to fail. The issue does
> not occur every time, but it happens with high probability. The root cause is
> that the initialReader inside the HStoreFile holds cached HDFS block-location
> metadata that never gets refreshed. Therefore, I modified the logic to close
> the initialReader and reopen it when a read exception occurs.
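> A minimal sketch of the idea (illustrative only: the helper name reopenReader is hypothetical, and the actual change in StoreFileScanner/HStoreFile may differ; closeStoreFile/initReader are taken from the public HStoreFile API as an assumption):
> {code}
> import java.io.IOException;
>
> import org.apache.hadoop.hbase.regionserver.HStoreFile;
> import org.slf4j.Logger;
> import org.slf4j.LoggerFactory;
>
> /** Illustrative sketch only; not the actual patch. */
> final class ReopenReaderSketch {
>   private static final Logger LOG = LoggerFactory.getLogger(ReopenReaderSketch.class);
>
>   /**
>    * Close the store file's initialReader and open a new one so that the stale
>    * HDFS block-location metadata cached by the old reader is discarded.
>    */
>   static void reopenReader(HStoreFile sf) throws IOException {
>     LOG.warn("Read failed on {}; reopening reader to refresh HDFS metadata", sf.getPath());
>     sf.closeStoreFile(true); // close initialReader and evict its cached blocks
>     sf.initReader();         // recreate initialReader, fetching fresh block locations
>   }
> }
> {code}
> Something along these lines would run when the seek in StoreFileScanner throws an IOException, before the read is retried.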
> Here is the log captured when the scan fails:
> {code}
> org.apache.hadoop.hbase.client.RetriesExhaustedException: Failed after attempts=8, exceptions:
> 2025-05-07T08:17:57.123Z, RpcRetryingCaller{globalStartTime=2025-05-07T08:17:57.084Z, pause=100, maxAttempts=8}, java.io.IOException: java.io.IOException: Could not seek StoreFileScanner[HFileScanner for reader reader=hdfs://hbase-alpha25/hbase/data/default/test1/8a9fd0285a94ed3a8a16f595842e17fa/c/0ca5ca4cd7d14fe993d19e4632b2fb52, compression=none, cacheConf=cacheDataOnRead=true, cacheDataOnWrite=false, cacheIndexesOnWrite=false, cacheBloomsOnWrite=false, cacheEvictOnClose=false, cacheDataCompressed=false, prefetchOnOpen=false, firstKey=Optional[user00000000000000000000000000000000256006064453599002/c:field0/1745905948161/Put/seqid=0], lastKey=Optional[user00000000000000000000000000000000511999723045682420/c:field3/1745905845638/Put/seqid=0], avgKeyLen=73, avgValueLen=30, entries=134592, length=15040759, cur=null] to key org.apache.hadoop.hbase.PrivateCellUtil$FirstOnRowDeleteFamilyCell@1e25b769
>     at org.apache.hadoop.hbase.regionserver.StoreFileScanner.seek(StoreFileScanner.java:232)
>     at org.apache.hadoop.hbase.regionserver.StoreScanner.seekScanners(StoreScanner.java:416)
>     at org.apache.hadoop.hbase.regionserver.StoreScanner.<init>(StoreScanner.java:260)
>     at org.apache.hadoop.hbase.regionserver.HStore.createScanner(HStore.java:1712)
>     at org.apache.hadoop.hbase.regionserver.HStore.getScanner(HStore.java:1703)
>     at org.apache.hadoop.hbase.regionserver.RegionScannerImpl.initializeScanners(RegionScannerImpl.java:166)
>     at org.apache.hadoop.hbase.regionserver.RegionScannerImpl.<init>(RegionScannerImpl.java:146)
>     at org.apache.hadoop.hbase.regionserver.HRegion.instantiateRegionScanner(HRegion.java:3019)
>     at org.apache.hadoop.hbase.regionserver.HRegion.lambda$getScanner$3(HRegion.java:3004)
>     at org.apache.hadoop.hbase.trace.TraceUtil.trace(TraceUtil.java:216)
>     at org.apache.hadoop.hbase.regionserver.HRegion.getScanner(HRegion.java:2990)
>     at org.apache.hadoop.hbase.regionserver.HRegion.getScanner(HRegion.java:2985)
>     at org.apache.hadoop.hbase.regionserver.HRegion.getScanner(HRegion.java:2979)
>     at org.apache.hadoop.hbase.regionserver.RSRpcServices.newRegionScanner(RSRpcServices.java:3203)
>     at org.apache.hadoop.hbase.regionserver.RSRpcServices.scan(RSRpcServices.java:3580)
>     at org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:45006)
>     at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:415)
>     at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:124)
>     at org.apache.hadoop.hbase.ipc.RpcHandler.run(RpcHandler.java:102)
>     at org.apache.hadoop.hbase.ipc.RpcHandler.run(RpcHandler.java:82)
> Caused by: java.io.IOException: Encountered an exception when invoking ByteBuffer positioned read when trying to read 0 bytes from position 0
>     at org.apache.hadoop.hbase.io.util.BlockIOUtils.preadWithExtraDirectly(BlockIOUtils.java:368)
>     at org.apache.hadoop.hbase.io.util.BlockIOUtils.preadWithExtra(BlockIOUtils.java:311)
>     at org.apache.hadoop.hbase.io.hfile.HFileBlock$FSReaderImpl.readAtOffset(HFileBlock.java:1481)
>     at org.apache.hadoop.hbase.io.hfile.HFileBlock$FSReaderImpl.readBlockDataInternal(HFileBlock.java:1719)
>     at org.apache.hadoop.hbase.io.hfile.HFileBlock$FSReaderImpl.readBlockData(HFileBlock.java:1519)
>     at org.apache.hadoop.hbase.io.hfile.HFileReaderImpl.readBlock(HFileReaderImpl.java:1331)
>     at org.apache.hadoop.hbase.io.hfile.HFileReaderImpl.readBlock(HFileReaderImpl.java:1252)
>     at org.apache.hadoop.hbase.io.hfile.HFileReaderImpl$HFileScannerImpl.readAndUpdateNewBlock(HFileReaderImpl.java:943)
>     at org.apache.hadoop.hbase.io.hfile.HFileReaderImpl$HFileScannerImpl.seekTo(HFileReaderImpl.java:932)
>     at org.apache.hadoop.hbase.regionserver.StoreFileScanner.seekAtOrAfter(StoreFileScanner.java:311)
>     at org.apache.hadoop.hbase.regionserver.StoreFileScanner.seek(StoreFileScanner.java:214)
>     ... 19 more
> Caused by: java.lang.reflect.InvocationTargetException
>     at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:118)
>     at java.base/java.lang.reflect.Method.invoke(Method.java:580)
>     at org.apache.hadoop.hbase.io.util.BlockIOUtils.preadWithExtraDirectly(BlockIOUtils.java:363)
>     ... 29 more
> Caused by: java.io.IOException: 3 missing blocks, the stripe is: AlignedStripe(Offset=0, length=33, fetchedChunksNum=0, missingChunksNum=3); locatedBlocks is: LocatedBlocks{; fileLength=15040759; underConstruction=false; blocks=[LocatedStripedBlock{BP-5442367-10.202.27.120-1743751500104:blk_-9223372036854771360_190437; getBlockSize()=15040759; corrupt=false; offset=0; locs=[DatanodeInfoWithStorage[10.202.5.226:1004,DS-7207429b-7335-4e37-9848-4d9b88ab83e0,DISK], DatanodeInfoWithStorage[10.202.4.17:1004,DS-42a094e9-a2df-4317-b7bb-7685c3a4e13e,DISK], DatanodeInfoWithStorage[10.203.21.242:1004,DS-19008e58-49f0-4820-945a-0533a6fb4d0a,DISK], DatanodeInfoWithStorage[10.202.15.79:1004,DS-1816a8ec-7dbd-4889-afc7-063ecb6521ec,DISK], DatanodeInfoWithStorage[10.202.12.73:1004,DS-d5c48efe-cfdd-4ea9-9eb3-59af4afe3824,DISK]]; indices=[0, 1, 2, 3, 4]}]; lastLocatedBlock=LocatedStripedBlock{BP-5442367-10.202.27.120-1743751500104:blk_-9223372036854771360_190437; getBlockSize()=15040759; corrupt=false; offset=0; locs=[DatanodeInfoWithStorage[10.202.5.226:1004,DS-7207429b-7335-4e37-9848-4d9b88ab83e0,DISK], DatanodeInfoWithStorage[10.202.4.17:1004,DS-42a094e9-a2df-4317-b7bb-7685c3a4e13e,DISK], DatanodeInfoWithStorage[10.203.21.242:1004,DS-19008e58-49f0-4820-945a-0533a6fb4d0a,DISK], DatanodeInfoWithStorage[10.202.15.79:1004,DS-1816a8ec-7dbd-4889-afc7-063ecb6521ec,DISK], DatanodeInfoWithStorage[10.202.12.73:1004,DS-d5c48efe-cfdd-4ea9-9eb3-59af4afe3824,DISK]]; indices=[0, 1, 2, 3, 4]}; isLastBlockComplete=true; ecPolicy=ErasureCodingPolicy=[Name=RS-3-2-1024k, Schema=[ECSchema=[Codec=rs, numDataUnits=3, numParityUnits=2]], CellSize=1048576, Id=2]}
>     at org.apache.hadoop.hdfs.StripeReader.checkMissingBlocks(StripeReader.java:180)
>     at org.apache.hadoop.hdfs.StripeReader.readDataForDecoding(StripeReader.java:198)
>     at org.apache.hadoop.hdfs.StripeReader.readStripe(StripeReader.java:344)
>     at org.apache.hadoop.hdfs.DFSStripedInputStream.fetchBlockByteRange(DFSStripedInputStream.java:506)
>     at org.apache.hadoop.hdfs.DFSInputStream.pread(DFSInputStream.java:1499)
>     at org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:1708)
>     at org.apache.hadoop.fs.FSDataInputStream.read(FSDataInputStream.java:259)
>     at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103)
>     ... 31 more
> {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)