Omega359 commented on issue #8863: URL: https://github.com/apache/iceberg/issues/8863#issuecomment-1778246891
I've just encountered this exception as well, but the circumstances are somewhat different. One process is writing out an Iceberg table primarily via appends with the occasional delete. Upon completion, a new cluster is spun up that reads that table. This issue presented itself during that process: ``` 23/10/24 22:25:51 ERROR BaseReader: Error reading file(s): s3://vdcint-transaction-dev-txn/AR_IDw/dev/transaction_2023_10_ndleq8xl/source_alias=MC3/transaction_date_year=2016/00081-10437993-40ecbe40-af70-41a4-9fde-61bc6a6abeb2-00001.parquet java.lang.IllegalArgumentException: requirement failed: length (-135733377) cannot be smaller than -1 at scala.Predef$.require(Predef.scala:281) ~[scala-library-2.12.15.jar:?] at org.apache.spark.rdd.InputFileBlockHolder$.set(InputFileBlockHolder.scala:79) ~[spark-core_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.spark.rdd.InputFileBlockHolder.set(InputFileBlockHolder.scala) ~[spark-core_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.iceberg.spark.source.BatchDataReader.open(BatchDataReader.java:89) ~[app.jar:?] at org.apache.iceberg.spark.source.BatchDataReader.open(BatchDataReader.java:41) ~[app.jar:?] at org.apache.iceberg.spark.source.BaseReader.next(BaseReader.java:141) ~[app.jar:?] at org.apache.spark.sql.execution.datasources.v2.PartitionIterator.hasNext(DataSourceRDD.scala:120) ~[spark-sql_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.spark.sql.execution.datasources.v2.MetricsIterator.hasNext(DataSourceRDD.scala:158) ~[spark-sql_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.$anonfun$hasNext$1(DataSourceRDD.scala:63) ~[spark-sql_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.$anonfun$hasNext$1$adapted(DataSourceRDD.scala:63) ~[spark-sql_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at scala.Option.exists(Option.scala:376) ~[scala-library-2.12.15.jar:?] 
at org.apache.spark.sql.execution.datasources.v2.DataSourceRDD$$anon$1.hasNext(DataSourceRDD.scala:63) ~[spark-sql_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37) ~[spark-core_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) ~[scala-library-2.12.15.jar:?] at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.columnartorow_nextBatch_0$(Unknown Source) ~[?:?] at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.hashAgg_doAggregateWithoutKey_0$(Unknown Source) ~[?:?] at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source) ~[?:?] at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:35) ~[spark-sql_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.hasNext(Unknown Source) ~[?:?] at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:959) ~[spark-sql_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) ~[scala-library-2.12.15.jar:?] 
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:142) ~[spark-core_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59) ~[spark-core_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:101) ~[spark-core_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53) ~[spark-core_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161) ~[spark-core_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.spark.scheduler.Task.run(Task.scala:141) ~[spark-core_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:554) ~[spark-core_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1541) ~[spark-core_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:557) ~[spark-core_2.12-3.4.1-amzn-1.jar:3.4.1-amzn-1] at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) ~[?:?] at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) ~[?:?] at java.lang.Thread.run(Thread.java:833) ~[?:?] ``` Iceberg 1.4.0, Spark 3.4.1 (EMR 6.14.0), JDK 17. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org