zeroshade commented on code in PR #497:
URL: https://github.com/apache/iceberg-go/pull/497#discussion_r2231653692
##########
table/table_test.go:
##########
@@ -1202,6 +1202,1059 @@ func TestTableWriting(t *testing.T) {
suite.Run(t, &TableWritingTestSuite{formatVersion: 2})
}
// WriteOperationsTestSuite provides comprehensive testing for advanced write
// operations against a table backed by the local filesystem and a mocked catalog.
type WriteOperationsTestSuite struct {
	suite.Suite
	ctx         context.Context // shared context for all table operations (set in SetupSuite)
	location    string          // per-test table root directory (fresh TempDir, set in SetupTest)
	tableSchema *iceberg.Schema // Iceberg schema: id (required int64), data (string), ts (timestamp)
	arrSchema   *arrow.Schema   // Arrow schema mirroring tableSchema (timestamps in microseconds)
	arrTable    arrow.Table     // 3-row fixture table built once in SetupSuite, released in TearDownSuite
}
+
+func TestWriteOperations(t *testing.T) {
+ suite.Run(t, new(WriteOperationsTestSuite))
+}
+
+func (s *WriteOperationsTestSuite) SetupSuite() {
+ s.ctx = context.Background()
+ mem := memory.DefaultAllocator
+
+ // Create a test schema
+ s.tableSchema = iceberg.NewSchema(0,
+ iceberg.NestedField{ID: 1, Name: "id", Type:
iceberg.PrimitiveTypes.Int64, Required: true},
+ iceberg.NestedField{ID: 2, Name: "data", Type:
iceberg.PrimitiveTypes.String},
+ iceberg.NestedField{ID: 3, Name: "ts", Type:
iceberg.PrimitiveTypes.Timestamp})
+
+ s.arrSchema = arrow.NewSchema([]arrow.Field{
+ {Name: "id", Type: arrow.PrimitiveTypes.Int64, Nullable: false},
+ {Name: "data", Type: arrow.BinaryTypes.String, Nullable: true},
+ {Name: "ts", Type: &arrow.TimestampType{Unit:
arrow.Microsecond}, Nullable: true},
+ }, nil)
+
+ // Create test data
+ var err error
+ s.arrTable, err = array.TableFromJSON(mem, s.arrSchema, []string{
+ `[
+ {"id": 1, "data": "foo", "ts": 1672531200000000},
+ {"id": 2, "data": "bar", "ts": 1672534800000000},
+ {"id": 3, "data": "baz", "ts": 1672538400000000}
+ ]`,
+ })
+ s.Require().NoError(err)
+}
+
+func (s *WriteOperationsTestSuite) SetupTest() {
+ s.location = filepath.ToSlash(strings.Replace(s.T().TempDir(), "#", "",
-1))
+}
+
// TearDownSuite releases the Arrow fixture table allocated in SetupSuite.
func (s *WriteOperationsTestSuite) TearDownSuite() {
	s.arrTable.Release()
}
+
+func (s *WriteOperationsTestSuite) getMetadataLoc() string {
+ return fmt.Sprintf("%s/metadata/%05d-%s.metadata.json",
+ s.location, 1, uuid.New().String())
+}
+
// writeParquet writes arrTbl to filePath as a parquet file, using a chunk
// size equal to the full row count (i.e. a single write of all rows).
func (s *WriteOperationsTestSuite) writeParquet(fio iceio.WriteFileIO, filePath string, arrTbl arrow.Table) {
	fo, err := fio.Create(filePath)
	s.Require().NoError(err)

	// NOTE(review): fo is never explicitly closed here — presumably
	// pqarrow.WriteTable closes the underlying sink when it finishes;
	// confirm, or add a deferred close.
	s.Require().NoError(pqarrow.WriteTable(arrTbl, fo, arrTbl.NumRows(),
		nil, pqarrow.DefaultWriterProps()))
}
+
+func (s *WriteOperationsTestSuite) createTable(identifier table.Identifier,
formatVersion int, spec iceberg.PartitionSpec) *table.Table {
+ meta, err := table.NewMetadata(s.tableSchema, &spec,
table.UnsortedSortOrder,
+ s.location, iceberg.Properties{"format-version":
strconv.Itoa(formatVersion)})
+ s.Require().NoError(err)
+
+ return table.New(
+ identifier,
+ meta,
+ s.getMetadataLoc(),
+ func(ctx context.Context) (iceio.IO, error) {
+ return iceio.LocalFS{}, nil
+ },
+ &mockedCatalog{},
+ )
+}
+
+func (s *WriteOperationsTestSuite) createTableWithData(identifier
table.Identifier, numFiles int) (*table.Table, []string) {
+ tbl := s.createTable(identifier, 2, *iceberg.UnpartitionedSpec)
+
+ files := make([]string, 0, numFiles)
+ fs := s.getFS(tbl)
+
+ for i := 0; i < numFiles; i++ {
+ filePath := fmt.Sprintf("%s/data/test-%d.parquet", s.location,
i)
+ s.writeParquet(fs, filePath, s.arrTable)
+ files = append(files, filePath)
+ }
+
+ // Add files to table
+ tx := tbl.NewTransaction()
+ s.Require().NoError(tx.AddFiles(s.ctx, files, nil, false))
+
+ committedTbl, err := tx.Commit(s.ctx)
+ s.Require().NoError(err)
+
+ return committedTbl, files
+}
+
+func (s *WriteOperationsTestSuite) getFS(tbl *table.Table) iceio.WriteFileIO {
+ fs, err := tbl.FS(s.ctx)
+ s.Require().NoError(err)
+
+ return fs.(iceio.WriteFileIO)
+}
+
// =============================================================================
// SNAPSHOT VALIDATION HELPERS
// =============================================================================
+
+// validateSnapshotFiles checks that the snapshot contains exactly the
expected files
+func (s *WriteOperationsTestSuite) validateSnapshotFiles(snapshot
*table.Snapshot, fs iceio.IO, expectedFiles []string) {
+ s.Require().NotNil(snapshot, "Snapshot should not be nil")
+
+ // Get actual files from snapshot using public API
+ manifests, err := snapshot.Manifests(fs)
+ s.Require().NoError(err, "Failed to read manifests from snapshot")
+
+ var actualFiles []string
+ for _, manifest := range manifests {
+ entries, err := manifest.FetchEntries(fs, false)
+ s.Require().NoError(err, "Failed to fetch entries from
manifest")
+
+ for _, entry := range entries {
+ // Only include data files (not delete files)
+ if entry.DataFile().ContentType() ==
iceberg.EntryContentData {
+ actualFiles = append(actualFiles,
entry.DataFile().FilePath())
+ }
+ }
+ }
+
+ // Sort for comparison
+ expectedSorted := make([]string, len(expectedFiles))
+ copy(expectedSorted, expectedFiles)
+ slices.Sort(expectedSorted)
+ slices.Sort(actualFiles)
+
+ s.Equal(len(expectedSorted), len(actualFiles), "File count mismatch -
expected %d files, got %d", len(expectedSorted), len(actualFiles))
+ s.Equal(expectedSorted, actualFiles, "File paths don't match
expected.\nExpected: %v\nActual: %v", expectedSorted, actualFiles)
+
+ s.T().Logf("Snapshot file validation passed: %d files match expected",
len(actualFiles))
+}
+
+// validateSnapshotSummary checks operation type and summary properties
+func (s *WriteOperationsTestSuite) validateSnapshotSummary(snapshot
*table.Snapshot, expectedOp table.Operation, expectedCounts map[string]string) {
+ s.Require().NotNil(snapshot, "Snapshot should not be nil")
+ s.Require().NotNil(snapshot.Summary, "Snapshot summary should not be
nil")
+
+ s.Equal(expectedOp, snapshot.Summary.Operation, "Snapshot operation
mismatch")
+
+ if expectedCounts != nil {
+ for key, expectedValue := range expectedCounts {
+ actualValue, exists := snapshot.Summary.Properties[key]
+ s.True(exists, "Summary property %s should exist", key)
+ s.Equal(expectedValue, actualValue, "Summary property
%s mismatch - expected %s, got %s", key, expectedValue, actualValue)
+ }
+ }
+
+ s.T().Logf("Snapshot summary validation passed: operation=%s,
properties=%d", expectedOp, len(expectedCounts))
+}
+
+// validateManifestStructure validates manifest files and returns total entry
count
+func (s *WriteOperationsTestSuite) validateManifestStructure(snapshot
*table.Snapshot, fs iceio.IO) int {
+ s.Require().NotNil(snapshot, "Snapshot should not be nil")
+
+ manifests, err := snapshot.Manifests(fs)
+ s.Require().NoError(err, "Failed to read manifests from snapshot")
+ s.Greater(len(manifests), 0, "Should have at least one manifest")
+
+ totalEntries := 0
+ for i, manifest := range manifests {
+ // Validate manifest is readable
+ entries, err := manifest.FetchEntries(fs, false)
+ s.Require().NoError(err, "Failed to fetch entries from manifest
%d", i)
+ totalEntries += len(entries)
+
+ // Validate manifest metadata
+ s.Greater(manifest.Length(), int64(0), "Manifest %d should have
positive length", i)
+ s.NotEmpty(manifest.FilePath(), "Manifest %d should have valid
path", i)
+
+ s.T().Logf("📄 Manifest %d: %s (%d entries, %d bytes)", i,
manifest.FilePath(), len(entries), manifest.Length())
+ }
+
+ s.T().Logf("Manifest structure validation passed: %d manifests, %d
total entries", len(manifests), totalEntries)
+
+ return totalEntries
+}
+
+// validateSnapshotState performs comprehensive validation of snapshot state
+func (s *WriteOperationsTestSuite) validateSnapshotState(snapshot
*table.Snapshot, fs iceio.IO, expectedFiles []string, expectedOp
table.Operation, expectedCounts map[string]string) {
+ s.T().Logf("🔍 Validating snapshot state (ID: %d)", snapshot.SnapshotID)
+
+ // Validate all components
+ s.validateSnapshotFiles(snapshot, fs, expectedFiles)
+ s.validateSnapshotSummary(snapshot, expectedOp, expectedCounts)
+ entryCount := s.validateManifestStructure(snapshot, fs)
+
+ // Ensure manifest entries match file count
+ s.Equal(len(expectedFiles), entryCount, "Manifest entry count should
match expected file count")
+
+ s.T().Logf("🎉 Complete snapshot validation passed for snapshot %d",
snapshot.SnapshotID)
+}
+
+// validateDataIntegrity scans the table and validates row count and basic data
+func (s *WriteOperationsTestSuite) validateDataIntegrity(tbl *table.Table,
expectedRowCount int64) {
+ scan := tbl.Scan()
+ results, err := scan.ToArrowTable(s.ctx)
+ s.Require().NoError(err, "Failed to scan table for data integrity
check")
+ defer results.Release()
+
+ s.Equal(expectedRowCount, results.NumRows(), "Row count mismatch -
expected %d, got %d", expectedRowCount, results.NumRows())
+
+ // Basic data validation - ensure we can read the data
+ s.Equal(3, int(results.NumCols()), "Should have 3 columns (id, data,
ts)")
+
+ s.T().Logf("Data integrity validation passed: %d rows, %d columns",
results.NumRows(), results.NumCols())
+}
+
+// getSnapshotFiles extracts the list of data file paths from a snapshot for
comparison
+func (s *WriteOperationsTestSuite) getSnapshotFiles(snapshot *table.Snapshot,
fs iceio.IO) []string {
+ if snapshot == nil {
+ return []string{}
+ }
+
+ manifests, err := snapshot.Manifests(fs)
+ s.Require().NoError(err, "Failed to read manifests from snapshot")
+
+ var files []string
+ for _, manifest := range manifests {
+ entries, err := manifest.FetchEntries(fs, false)
+ s.Require().NoError(err, "Failed to fetch entries from
manifest")
+
+ for _, entry := range entries {
+ // Only include data files (not delete files)
+ if entry.DataFile().ContentType() ==
iceberg.EntryContentData {
+ files = append(files,
entry.DataFile().FilePath())
+ }
+ }
+ }
+
+ // Sort for consistent comparison
+ slices.Sort(files)
+
+ return files
+}
+
// =============================================================================
// TESTS WITH ENHANCED VALIDATION
// =============================================================================
+
// NOTE(review): this runner is redundant — TestWriteOperations above already
// runs the entire WriteOperationsTestSuite, so keeping both makes every test
// in the suite execute twice per "go test" invocation. Consider removing this
// function and letting the suite's own TestRewriteFiles method be discovered
// as a subtest of TestWriteOperations.
func TestRewriteFiles(t *testing.T) {
	suite.Run(t, &WriteOperationsTestSuite{})
}
+
+func (s *WriteOperationsTestSuite) TestRewriteFiles() {
+ s.Run("RewriteDataFiles", func() {
Review Comment:
When you use the test suite pattern, having a function
`TestRewriteDataFiles` will create the subtest for you. You don't need to
manually add `s.Run` here. Since there's no commonality between the subtests it
would be better for them to be separate functions.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]