laskoviymishka commented on code in PR #1138: URL: https://github.com/apache/iceberg-go/pull/1138#discussion_r3402581162
########## table/internal/geo_codec.go: ########## @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package internal Review Comment: This is `package internal`, and `WKTToWKB`'s only callers are `_test.go` files, but go-geom is now a direct require in go.mod — so every consumer of `table/internal` (scanner, DV writer) transitively pulls it into the production module graph. I'd move this into a `_test.go` file in `package internal_test` (the test already lives there) or a small testutil package, so go-geom stays test-only. Non-blocking from my side, and I haven't seen it raised on the threads — if the maintainers are fine carrying it that's a reasonable call — but I'd lean toward keeping it out of the production deps. ########## table/arrow_utils.go: ########## @@ -1826,3 +1846,152 @@ func positionDeleteRecordsToDataFilesDV(ctx context.Context, rootLocation string } } } + +func checkCRSString(rawCrs json.RawMessage) bool { + b := bytes.TrimSpace(rawCrs) + + return len(b) > 0 && b[0] == '"' +} + +func checkCRSSJSON(rawCrs json.RawMessage) bool { + b := bytes.TrimSpace(rawCrs) + + return len(b) > 0 && b[0] == '{' +} + +func geoArrowCRSToIcebergCRS(meta geoarrow.Metadata) (string, error) { + if len(meta.CRS) == 0 { + return "srid:0", nil + } + + switch { + case checkCRSString(meta.CRS): + var crs string + if len(meta.CRS) > 0 { + if err := json.Unmarshal(meta.CRS, &crs); err != nil { + return "", fmt.Errorf("invalid geoarrow CRS metadata: %w", err) + } + } + + if strings.EqualFold(crs, "OGC:CRS84") || strings.EqualFold(crs, "EPSG:4326") { + return "OGC:CRS84", nil + } + + switch meta.CRSType { + case geoarrow.CRSTypeSRID: + return "srid:" + crs, nil + case geoarrow.CRSTypeWKT22019: + return "", errors.New("CRS type wkt2:2019 not supported") + default: + if len(crs) <= 32 { + return crs, nil + } + + return "", errors.New("crs length too long") + } + case checkCRSSJSON(meta.CRS): + var crs map[string]json.RawMessage + if len(meta.CRS) > 0 { + if err := json.Unmarshal(meta.CRS, &crs); err != nil { + return "", fmt.Errorf("invalid geoarrow CRS metadata: %w", err) + } + } + + idRaw, ok := crs["id"] + if !ok || len(idRaw) == 0 { + return "", errors.New("unsupported CRS") + } + + var id map[string]json.RawMessage + if err := json.Unmarshal(idRaw, &id); err != nil { + return "", errors.New("unsupported CRS") + } + + var authority string + if err := json.Unmarshal(id["authority"], &authority); err != nil || authority == "" { + return "", errors.New("unsupported CRS") + } + + codeRaw := id["code"] + if len(codeRaw) == 0 { + return "", errors.New("unsupported CRS") + } + + var code string + if err := json.Unmarshal(codeRaw, &code); err != nil { + var codeNum json.Number + if err := json.Unmarshal(codeRaw, &codeNum); err != nil || codeNum.String() == "" { + return "", errors.New("unsupported CRS") + } + code = codeNum.String() + } + if code == "" { + return "", errors.New("unsupported CRS") + } + + authorityCode := authority + ":" + code + if strings.EqualFold(authorityCode, "OGC:CRS84") || strings.EqualFold(authorityCode, "EPSG:4326") { + return "OGC:CRS84", nil + } + + return authorityCode, nil + default: + return "", errors.New("unsupported CRS: CRS must either be omitted, a string, or a JSON object") + } +} + +func geoArrowMetadataToIcebergType(meta geoarrow.Metadata) (iceberg.Type, error) { + crs, err := geoArrowCRSToIcebergCRS(meta) + if err != nil { + return nil, err + } + + switch meta.Edges { + case geoarrow.EdgePlanar: + return iceberg.GeometryTypeOf(crs) + case geoarrow.EdgeVincenty, geoarrow.EdgeKarney, geoarrow.EdgeThomas, geoarrow.EdgeAndoyer, geoarrow.EdgeSpherical: + return iceberg.GeographyTypeOf(crs, string(meta.Edges)) + default: + return nil, fmt.Errorf("unsupported geoarrow edges %q", meta.Edges) + } +} + +var authorityCodeCRS = regexp.MustCompile(`^[A-Za-z][A-Za-z0-9_-]*:[A-Za-z0-9_.-]+$`) + +func icebergCRSToGeoArrowMetadata(crs string) geoarrow.Metadata { + lowerCRS := strings.ToLower(crs) + if strings.HasPrefix(lowerCRS, "srid:") { + id := lowerCRS[len("srid:"):] Review Comment: The id is sliced out of the lowercased string, so a non-numeric SRID gets silently case-folded: `srid:MyDB` goes in, `mydb` gets marshalled, and it round-trips back as `srid:mydb`. Numeric SRIDs are fine, which is why it's latent. I'd slice the original `crs` instead — `id := crs[len("srid:"):]` — and keep the `HasPrefix` check on `lowerCRS`. The `iceberg_to_arrow_round_trip` loop only iterates `typeCases`, which has `geomSRID0` but no non-zero SRID, so the common path isn't exercised either. Adding a `srid:4326` case to `typeCases` covers the happy path and would have caught this — worth pairing the fix with that test case. ########## table/arrow_utils.go: ########## @@ -1826,3 +1846,152 @@ func positionDeleteRecordsToDataFilesDV(ctx context.Context, rootLocation string } } } + +func checkCRSString(rawCrs json.RawMessage) bool { + b := bytes.TrimSpace(rawCrs) + + return len(b) > 0 && b[0] == '"' +} + +func checkCRSSJSON(rawCrs json.RawMessage) bool { Review Comment: Minor: `checkCRSSJSON` has a double S — rename to `checkCRSJSON` at the def and the call site. ########## table/arrow_utils.go: ########## @@ -1826,3 +1846,152 @@ func positionDeleteRecordsToDataFilesDV(ctx context.Context, rootLocation string } } } + +func checkCRSString(rawCrs json.RawMessage) bool { + b := bytes.TrimSpace(rawCrs) + + return len(b) > 0 && b[0] == '"' +} + +func checkCRSSJSON(rawCrs json.RawMessage) bool { + b := bytes.TrimSpace(rawCrs) + + return len(b) > 0 && b[0] == '{' +} + +func geoArrowCRSToIcebergCRS(meta geoarrow.Metadata) (string, error) { + if len(meta.CRS) == 0 { + return "srid:0", nil + } + + switch { + case checkCRSString(meta.CRS): Review Comment: A small cleanup cluster in this function: The `if len(meta.CRS) > 0` wrappers inside both this `checkCRSString` arm and the `checkCRSSJSON` arm below are dead — the guard at the top of the function already returns on empty, and the `check*` funcs only return true for non-empty input (staticcheck SA9003). I'd drop both wrappers. On the write path (line 1984), `strings.EqualFold(lowerCRS, "EPSG:4326")` runs a case-fold against an already-lowercased string — `lowerCRS == "epsg:4326"` is enough. And in the JSON-object code parsing (line 1923), `|| codeNum.String() == ""` is unreachable — a successful `json.Number` unmarshal is never empty. I'd drop just that clause; the trailing `if code == ""` is reachable for `{"code":""}`, so leave that one. ########## table/internal/geo_codec.go: ########## @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package internal + +import ( + "github.com/geoarrow/geoarrow-go" + + "github.com/twpayne/go-geom/encoding/wkb" + "github.com/twpayne/go-geom/encoding/wkt" +) + +// WKTToWKB is a helper which converts Well Known Text (WKT) to Well Known Bytes (WKB). +// Note that return bytes are little endian. +func WKTToWKB(s string) (geoarrow.WKBBytes, error) { + geometry, err := wkt.Unmarshal(s) Review Comment: `geo_codec_test.go` is all happy-path — no invalid-WKT case. I'd add a `wantErr` row (e.g. `"not wkt"`) so the error path is exercised. While we're here, the `wkt.Unmarshal` / `wkb.Marshal` errors return bare; wrapping them (`fmt.Errorf("parse WKT: %w", err)`) makes failures legible at the call site. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
