Fokko commented on code in PR #369: URL: https://github.com/apache/iceberg-go/pull/369#discussion_r2021749556
########## table/internal/parquet_files.go: ########## @@ -170,6 +172,86 @@ func (parquetFormat) PrimitiveTypeToPhysicalType(typ iceberg.PrimitiveType) stri } } +func (parquetFormat) GetWriteProperties(props iceberg.Properties) any { + writerProps := []parquet.WriterProperty{ + parquet.WithDictionaryDefault(false), + parquet.WithMaxRowGroupLength(int64(props.GetInt("write.parquet.row-group-limit", 1048576))), + parquet.WithDataPageSize(int64(props.GetInt("write.parquet.page-size-bytes", 1024*1024))), + parquet.WithDataPageVersion(parquet.DataPageV2), + parquet.WithBatchSize(int64(props.GetInt("write.parquet.page-row-limit", 20000))), + parquet.WithDictionaryPageSizeLimit(int64(props.GetInt("write.parquet.dict-size-bytes", 2*1024*1024))), Review Comment: Should we move these in constants? Is that common in Go? ########## table/table_test.go: ########## @@ -835,6 +839,271 @@ func (t *TableWritingTestSuite) TestReplaceDataFiles() { }, staged.CurrentSnapshot().Summary) } +func (t *TableWritingTestSuite) TestWriteSpecialCharacterColumn() { + ident := table.Identifier{"default", "write_special_character_column"} + colNameWithSpecialChar := "letter/abc" + + s := iceberg.NewSchema(0, + iceberg.NestedField{ID: 1, Name: colNameWithSpecialChar, Type: iceberg.PrimitiveTypes.String}, + iceberg.NestedField{ID: 2, Name: "id", Type: iceberg.PrimitiveTypes.Int32}, + iceberg.NestedField{ID: 3, Name: "name", Type: iceberg.PrimitiveTypes.String, Required: true}, + iceberg.NestedField{ID: 4, Name: "address", Required: true, Type: &iceberg.StructType{ + FieldList: []iceberg.NestedField{ + {ID: 5, Name: "street", Type: iceberg.PrimitiveTypes.String, Required: true}, + {ID: 6, Name: "city", Type: iceberg.PrimitiveTypes.String, Required: true}, + {ID: 7, Name: "zip", Type: iceberg.PrimitiveTypes.Int32, Required: true}, + {ID: 8, Name: colNameWithSpecialChar, Type: iceberg.PrimitiveTypes.String, Required: true}, + }, + }}) + + arrowSchema := arrow.NewSchema([]arrow.Field{ + {Name: colNameWithSpecialChar, Type: arrow.BinaryTypes.String, Nullable: true}, + {Name: "id", Type: arrow.PrimitiveTypes.Int32, Nullable: true}, + {Name: "name", Type: arrow.BinaryTypes.String}, + {Name: "address", Type: arrow.StructOf( + arrow.Field{Name: "street", Type: arrow.BinaryTypes.String}, + arrow.Field{Name: "city", Type: arrow.BinaryTypes.String}, + arrow.Field{Name: "zip", Type: arrow.PrimitiveTypes.Int32}, + arrow.Field{Name: colNameWithSpecialChar, Type: arrow.BinaryTypes.String}, + )}, + }, nil) + + arrowTable, err := array.TableFromJSON(memory.DefaultAllocator, arrowSchema, []string{ + `[ + { + "letter/abc": "a", + "id": 1, + "name": "AB", + "address": {"street": "123", "city": "SFO", "zip": 12345, "letter/abc": "a"} + }, + { + "letter/abc": null, + "id": 2, + "name": "CD", + "address": {"street": "456", "city": "SW", "zip": 67890, "letter/abc": "b"} + }, + { + "letter/abc": "z", + "id": 3, + "name": "EF", + "address": {"street": "789", "city": "Random", "zip": 10112, "letter/abc": "c"} + } + ]`, + }) + t.Require().NoError(err) + defer arrowTable.Release() + + tbl := t.createTable(ident, t.formatVersion, *iceberg.UnpartitionedSpec, s) + rdr := array.NewTableReader(arrowTable, 1) + defer rdr.Release() + + tx := tbl.NewTransaction() + t.Require().NoError(tx.Append(t.ctx, rdr, nil)) + + scan, err := tx.Scan() + t.Require().NoError(err) + + result, err := scan.ToArrowTable(t.ctx) + t.Require().NoError(err) + defer result.Release() + + t.True(array.TableEqual(arrowTable, result), "expected:\n %s\ngot:\n %s", arrowTable, result) +} + +func (t *TableWritingTestSuite) getInMemCatalog() catalog.Catalog { + cat, err := catalog.Load(context.Background(), "default", iceberg.Properties{ + "uri": ":memory:", + "type": "sql", + sql.DriverKey: sqliteshim.ShimName, + sql.DialectKey: string(sql.SQLite), + "warehouse": "file://" + t.location, + }) + t.Require().NoError(err) + + return cat +} + +func (t *TableWritingTestSuite) createTableWithProps(identifier table.Identifier, props iceberg.Properties, sc *iceberg.Schema) *table.Table { + cat := t.getInMemCatalog() + cat.DropTable(t.ctx, identifier) + cat.DropNamespace(t.ctx, catalog.NamespaceFromIdent(identifier)) + + t.Require().NoError(cat.CreateNamespace(t.ctx, catalog.NamespaceFromIdent(identifier), nil)) + tbl, err := cat.CreateTable(t.ctx, identifier, sc, catalog.WithProperties(props), + catalog.WithLocation(t.location)) + + t.Require().NoError(err) + + return tbl +} + +func tableSchema() *iceberg.Schema { + return iceberg.NewSchema(0, + iceberg.NestedField{ID: 1, Name: "bool", Type: iceberg.PrimitiveTypes.Bool}, + iceberg.NestedField{ID: 2, Name: "string", Type: iceberg.PrimitiveTypes.String}, + iceberg.NestedField{ID: 3, Name: "string_long", Type: iceberg.PrimitiveTypes.String}, + iceberg.NestedField{ID: 4, Name: "int", Type: iceberg.PrimitiveTypes.Int32}, + iceberg.NestedField{ID: 5, Name: "long", Type: iceberg.PrimitiveTypes.Int64}, + iceberg.NestedField{ID: 6, Name: "float", Type: iceberg.PrimitiveTypes.Float32}, + iceberg.NestedField{ID: 7, Name: "double", Type: iceberg.PrimitiveTypes.Float64}, + iceberg.NestedField{ID: 8, Name: "time", Type: iceberg.PrimitiveTypes.Time}, + iceberg.NestedField{ID: 9, Name: "timestamp", Type: iceberg.PrimitiveTypes.Timestamp}, + iceberg.NestedField{ID: 10, Name: "timestamptz", Type: iceberg.PrimitiveTypes.TimestampTz}, + iceberg.NestedField{ID: 11, Name: "date", Type: iceberg.PrimitiveTypes.Date}, + iceberg.NestedField{ID: 12, Name: "uuid", Type: iceberg.PrimitiveTypes.UUID}, + iceberg.NestedField{ID: 13, Name: "binary", Type: iceberg.PrimitiveTypes.Binary}, + iceberg.NestedField{ID: 14, Name: "fixed", Type: iceberg.FixedTypeOf(16)}, + ) Review Comment: Should we add decimal as well? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org