This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new 0f57504 fix(arrow/compute/exprs): Handle large types in expr handling
(#440)
0f57504 is described below
commit 0f575047aea671161430ed0a4af2be9de0e92b66
Author: Matt Topol <[email protected]>
AuthorDate: Wed Jul 16 11:01:13 2025 -0400
fix(arrow/compute/exprs): Handle large types in expr handling (#440)
### Rationale for this change
An issue was discovered in https://github.com/apache/iceberg-go when
working with compacted files and parquet files written by pyiceberg.
Essentially it boiled down to the fact that if an expression contained a
Field Reference to a LargeString field, evaluation would fail as it
expected `String` and not `LargeString` when converting from the
Substrait types. This difference shouldn't cause an issue so we need to
ensure that LargeString is allowed when String is expected, same for
LargeBinary and Binary
### What changes are included in this PR?
Simply adds a case to the error condition when the types aren't equal
so that String + LargeString or Binary + LargeBinary combinations are allowed.
### Are these changes tested?
A unit test has been added to test it.
### Are there any user-facing changes?
Just the new case being allowed when it was previously disallowed.
---
arrow/compute/exprs/exec.go | 21 +++++++++++++++++---
arrow/compute/exprs/exec_test.go | 41 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 59 insertions(+), 3 deletions(-)
diff --git a/arrow/compute/exprs/exec.go b/arrow/compute/exprs/exec.go
index 174cb0d..57fb7c8 100644
--- a/arrow/compute/exprs/exec.go
+++ b/arrow/compute/exprs/exec.go
@@ -395,6 +395,11 @@ func literalToDatum(mem memory.Allocator, lit
expr.Literal, ext ExtensionIDSet)
// You can provide an allocator to use through the context via
compute.WithAllocator.
//
// You can provide the ExtensionIDSet to use through the context via
WithExtensionIDSet.
+//
+// Note: Substrait expressions do not have an equivalent to
LargeString/LargeBinary, so
+// field references which expect a string or binary will allow their Large
counterparts
+// to be used as well. Most of the time, this shouldn't be an issue as the
compute kernels
+// should handle any casting that might be necessary.
func ExecuteScalarExpression(ctx context.Context, inputSchema *arrow.Schema,
expression expr.Expression, partialInput compute.Datum) (compute.Datum, error) {
if expression == nil {
return nil, arrow.ErrInvalid
@@ -479,9 +484,19 @@ func execFieldRef(ctx context.Context, e
*expr.FieldReference, input compute.Exe
} else if err != nil {
return nil, err
}
- if !arrow.TypeEqual(out.(compute.ArrayLikeDatum).Type(), expectedType) {
- return nil, fmt.Errorf("%w: referenced field %s was %s, but
should have been %s",
- arrow.ErrInvalid, ref,
out.(compute.ArrayLikeDatum).Type(), expectedType)
+
+ dt := out.(compute.ArrayLikeDatum).Type()
+ if !arrow.TypeEqual(dt, expectedType) {
+ // substrait doesn't have a LARGE_STRING or LARGE_BINARY type,
so we
+ // need to special case check if we got a LARGE_STRING or
LARGE_BINARY
+ // type when we expected a STRING or BINARY type.
+ switch {
+ case expectedType.ID() == arrow.STRING && dt.ID() ==
arrow.LARGE_STRING:
+ case expectedType.ID() == arrow.BINARY && dt.ID() ==
arrow.LARGE_BINARY:
+ default:
+ return nil, fmt.Errorf("%w: referenced field %s was %s,
but should have been %s",
+ arrow.ErrInvalid, ref, dt, expectedType)
+ }
}
return out, nil
diff --git a/arrow/compute/exprs/exec_test.go b/arrow/compute/exprs/exec_test.go
index 9812643..2d95810 100644
--- a/arrow/compute/exprs/exec_test.go
+++ b/arrow/compute/exprs/exec_test.go
@@ -621,3 +621,44 @@ func Test_Types(t *testing.T) {
})
}
}
+
+func TestLargeTypes(t *testing.T) {
+ sc := arrow.NewSchema([]arrow.Field{
+ {Name: "str", Type: arrow.BinaryTypes.LargeString, Nullable:
true},
+ {Name: "bin", Type: arrow.BinaryTypes.LargeBinary, Nullable:
true},
+ }, nil)
+
+ rec, _, err := array.RecordFromJSON(memory.DefaultAllocator, sc,
strings.NewReader(`[
+ {"str": "hello world", "bin": "Zm9vYmFy"}
+ ]`))
+ require.NoError(t, err)
+ defer rec.Release()
+
+ dr := compute.NewDatumWithoutOwning(rec)
+
+ t.Run("large_string ref", func(t *testing.T) {
+ bldr := exprs.NewExprBuilder(extSet)
+ require.NoError(t, bldr.SetInputSchema(sc))
+
+ e, err := bldr.FieldRef("str").BuildExpr()
+ require.NoError(t, err, "Failed to build field reference
expression")
+
+ ctx := context.Background()
+ result, err := exprs.ExecuteScalarExpression(ctx, sc, e, dr)
+ require.NoError(t, err, "Failed to execute scalar expression")
+ defer result.Release()
+ })
+
+ t.Run("large_binary ref", func(t *testing.T) {
+ bldr := exprs.NewExprBuilder(extSet)
+ require.NoError(t, bldr.SetInputSchema(sc))
+
+ e, err := bldr.FieldRef("bin").BuildExpr()
+ require.NoError(t, err, "Failed to build field reference
expression")
+
+ ctx := context.Background()
+ result, err := exprs.ExecuteScalarExpression(ctx, sc, e, dr)
+ require.NoError(t, err, "Failed to execute scalar expression")
+ defer result.Release()
+ })
+}