amogh-jahagirdar commented on code in PR #6717: URL: https://github.com/apache/iceberg/pull/6717#discussion_r1092832125
########## spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java: ########## @@ -654,6 +658,13 @@ private Table load(Identifier ident) { return new SparkTable(table, snapshotId, !cacheEnabled); } + Matcher branch = BRANCH.matcher(ident.name()); + if (branch.matches()) { + Snapshot snapshot = table.snapshot(branch.group(1)); + if (snapshot != null) { + return new SparkTable(table, snapshot.snapshotId(), !cacheEnabled); + } + } Review Comment: Nit: could we just have each if block match and resolve the snapshot ID, and then just have one `return new SparkTable(table, snapshotId, !cacheEnabled)` at the end. That'll reduce duplication, considering we also need to handle the tag case. ########## spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkCatalog.java: ########## @@ -722,6 +741,9 @@ private Table loadFromPathIdentifier(PathIdentifier ident) { long snapshotIdAsOfTime = SnapshotUtil.snapshotIdAsOfTime(table, asOfTimestamp); return new SparkTable(table, snapshotIdAsOfTime, !cacheEnabled); + } else if (branch != null && table.snapshot(branch) != null) { + return new SparkTable(table, table.snapshot(branch).snapshotId(), !cacheEnabled); + } else { return new SparkTable(table, snapshotId, !cacheEnabled); Review Comment: Can we resolve to a snapshot ID and then return the table? Imo that makes the logic a bit easier to read, since it isolates resolving the snapshot ID from the various options. 
########## spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java: ########## @@ -370,4 +370,42 @@ public void testSnapshotSelectionByTimestampAndBranchOrTagFails() throws IOExcep .isInstanceOf(IllegalArgumentException.class) .hasMessageStartingWith("Cannot override ref, already set snapshot id="); } + + @Test + public void testSnapshotSelectionByBranchWithSchemaChange() throws IOException { + String tableLocation = temp.newFolder("iceberg-table").toString(); + + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.unpartitioned(); + Table table = tables.create(SCHEMA, spec, tableLocation); + + // produce the first snapshot + List<SimpleRecord> firstBatchRecords = + Lists.newArrayList( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); + Dataset<Row> firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class); + firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation); + + table.manageSnapshots().createBranch("branch", table.currentSnapshot().snapshotId()).commit(); + + Dataset<Row> currentSnapshotResult = + spark.read().format("iceberg").option("branch", "branch").load(tableLocation); + List<SimpleRecord> currentSnapshotRecords = + currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List<SimpleRecord> expectedRecords = Lists.newArrayList(); + expectedRecords.addAll(firstBatchRecords); + Assert.assertEquals( + "Current snapshot rows should match", expectedRecords, currentSnapshotRecords); + + table.updateSchema().deleteColumn("data").commit(); + + Dataset<Row> deleteSnapshotResult = + spark.read().format("iceberg").option("branch", "branch").load(tableLocation); + List<SimpleRecord> deletedSnapshotRecords = + deleteSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + List<SimpleRecord> expectedRecordsAfterDeletion = Lists.newArrayList(); + 
expectedRecordsAfterDeletion.addAll(firstBatchRecords); + Assert.assertEquals( + "Current snapshot rows should match", expectedRecords, deletedSnapshotRecords); Review Comment: I think some inline comments here can make it easier to understand what the test is doing -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org For additional commands, e-mail: issues-help@iceberg.apache.org