chennurchaitanya commented on issue #8338:
URL: https://github.com/apache/iceberg/issues/8338#issuecomment-1686592953
@nastra, please find the full stack trace below.

```
User class threw exception: org.apache.spark.SparkException: Job aborted due to stage failure: Task 10726 in stage 0.0 failed 4 times, most recent failure: Lost task 10726.3 in stage 0.0 (TID 10767) (lgppishdp0035.gso.aexp.com executor 4): org.apache.iceberg.exceptions.NotFoundException: File does not exist: s3a://XXXXXXXXXXX/data/event_ts_day=1970-01-01/00007-167930-e397f970-eb12-41ad-9e10-e855c8fd6e53-00001.parquet
	at org.apache.iceberg.hadoop.HadoopInputFile.lazyStat(HadoopInputFile.java:164)
	at org.apache.iceberg.hadoop.HadoopInputFile.getStat(HadoopInputFile.java:200)
	at org.apache.iceberg.parquet.ParquetIO.file(ParquetIO.java:51)
	at org.apache.iceberg.parquet.ReadConf.newReader(ReadConf.java:231)
	at org.apache.iceberg.parquet.ReadConf.<init>(ReadConf.java:80)
	at org.apache.iceberg.parquet.VectorizedParquetReader.init(VectorizedParquetReader.java:90)
	at org.apache.iceberg.parquet.VectorizedParquetReader.iterator(VectorizedParquetReader.java:99)
	at org.apache.iceberg.spark.source.BatchDataReader.open(BatchDataReader.java:79)
	at org.apache.iceberg.spark.source.BatchDataReader.open(BatchDataReader.java:36)
	at org.apache.iceberg.spark.source.BaseReader.next(BaseReader.java:135)
	at org.apache.spark.sql.execution.datasources.v2.PartitionIterator.hasNext(DataSourceRDD.scala:93)
	at org.apache.spark.sql.execution.datasources.v2.MetricsIterator.hasNext(DataSourceRDD.scala:130)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
	at org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer$$anon$1.next(InMemoryRelation.scala:87)
	at org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer$$anon$1.next(InMemoryRelation.scala:79)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.storage.memory.PartiallyUnrolledIterator.next(MemoryStore.scala:783)
	at org.apache.spark.serializer.SerializationStream.writeAll(Serializer.scala:140)
	at org.apache.spark.serializer.SerializerManager.dataSerializeStream(SerializerManager.scala:177)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$3(BlockManager.scala:1502)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$3$adapted(BlockManager.scala:1500)
	at org.apache.spark.storage.DiskStore.put(DiskStore.scala:70)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1500)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1420)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1484)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1307)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:384)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:335)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1491)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.io.FileNotFoundException: No such file or directory: s3a://XXXXXXXXX/data/event_ts_day=1970-01-01/00007-167930-e397f970-eb12-41ad-9e10-e855c8fd6e53-00001.parquet
	at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3345)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3174)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:3042)
	at org.apache.iceberg.hadoop.HadoopInputFile.lazyStat(HadoopInputFile.java:162)
	... 57 more
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1160)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2642)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.iceberg.exceptions.NotFoundException: File does not exist: s3a://XXXXXXXXXX/data/event_ts_day=1970-01-01/00007-167930-e397f970-eb12-41ad-9e10-e855c8fd6e53-00001.parquet
	at org.apache.iceberg.hadoop.HadoopInputFile.lazyStat(HadoopInputFile.java:164)
	at org.apache.iceberg.hadoop.HadoopInputFile.getStat(HadoopInputFile.java:200)
	at org.apache.iceberg.parquet.ParquetIO.file(ParquetIO.java:51)
	at org.apache.iceberg.parquet.ReadConf.newReader(ReadConf.java:231)
	at org.apache.iceberg.parquet.ReadConf.<init>(ReadConf.java:80)
	at org.apache.iceberg.parquet.VectorizedParquetReader.init(VectorizedParquetReader.java:90)
	at org.apache.iceberg.parquet.VectorizedParquetReader.iterator(VectorizedParquetReader.java:99)
	at org.apache.iceberg.spark.source.BatchDataReader.open(BatchDataReader.java:79)
	at org.apache.iceberg.spark.source.BatchDataReader.open(BatchDataReader.java:36)
	at org.apache.iceberg.spark.source.BaseReader.next(BaseReader.java:135)
	at org.apache.spark.sql.execution.datasources.v2.PartitionIterator.hasNext(DataSourceRDD.scala:93)
	at org.apache.spark.sql.execution.datasources.v2.MetricsIterator.hasNext(DataSourceRDD.scala:130)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
	at org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer$$anon$1.next(InMemoryRelation.scala:87)
	at org.apache.spark.sql.execution.columnar.DefaultCachedBatchSerializer$$anon$1.next(InMemoryRelation.scala:79)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.storage.memory.PartiallyUnrolledIterator.next(MemoryStore.scala:783)
	at org.apache.spark.serializer.SerializationStream.writeAll(Serializer.scala:140)
	at org.apache.spark.serializer.SerializerManager.dataSerializeStream(SerializerManager.scala:177)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$3(BlockManager.scala:1502)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$3$adapted(BlockManager.scala:1500)
	at org.apache.spark.storage.DiskStore.put(DiskStore.scala:70)
	at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1500)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1420)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1484)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1307)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:384)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:335)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1491)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.io.FileNotFoundException: No such file or directory: s3a://XXXXXXXXXX/data/event_ts_day=1970-01-01/00007-167930-e397f970-eb12-41ad-9e10-e855c8fd6e53-00001.parquet
	at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3345)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3174)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:3042)
	at org.apache.iceberg.hadoop.HadoopInputFile.lazyStat(HadoopInputFile.java:162)
	... 57 more
```
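For what it's worth, one way to check whether the missing file is still referenced by table metadata is to query Iceberg's metadata tables: `files` lists the data files in the current snapshot, while `all_files` covers all snapshots that are still valid. This is only a diagnostic sketch, not part of the original report; `db.tbl` is a placeholder for the actual table identifier, and it assumes a Spark session with the Iceberg catalog already configured.

```scala
// Hypothetical diagnostic query; db.tbl is a placeholder, not the real table.
// A hit in db.tbl.files means the current snapshot still references an object that
// was deleted outside Iceberg (e.g. a bucket lifecycle rule or a manual cleanup);
// a hit only in db.tbl.all_files means the reference comes from an older snapshot.
spark.sql(
  """SELECT file_path, record_count
    |FROM db.tbl.all_files
    |WHERE file_path LIKE '%00007-167930-e397f970-eb12-41ad-9e10-e855c8fd6e53-00001.parquet'
  """.stripMargin).show(truncate = false)
```

If the query returns no rows at all, the snapshot the failing job was reading has likely been expired in the meantime, which would point toward a race between the read and snapshot expiry or cleanup.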
