This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new 0f57504 fix(arrow/compute/exprs): Handle large types in expr handling
(#440)
0f57504 is described below
commit 0f575047aea671161430ed0a4af2be9de0e92b66
Author: Matt Topol <[email protected]>
AuthorDate: Wed Jul 16 11:01:13 2025 -0400
fix(arrow/compute/exprs): Handle large types in expr handling (#440)
### Rationale for this change
An issue was discovered in https://github.com/apache/iceberg-go when
working with compacted files and parquet files written by pyiceberg.
Essentially it boiled down to the fact that if an expression contained a
Field Reference to a LargeString field, evaluation would fail as it
expected `String` and not `LargeString` when converting from the
Substrait types. This difference shouldn't cause an issue so we need to
ensure that LargeString is allowed when String is expected, same for
LargeBinary and Binary
### What changes are included in this PR?
Simply adds a case to the error condition when the types aren't equal
so that String + LargeString or Binary + LargeBinary combinations are allowed.
### Are these changes tested?
A unit test has been added to test it.
### Are there any user-facing changes?
Just the new case being allowed when it was previously disallowed.
---
arrow/compute/exprs/exec.go | 21 +++++++++++++++++---
arrow/compute/exprs/exec_test.go | 41 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 59 insertions(+), 3 deletions(-)
diff --git a/arrow/compute/exprs/exec.go b/arrow/compute/exprs/exec.go
index 174cb0d..57fb7c8 100644
--- a/arrow/compute/exprs/exec.go
+++ b/arrow/compute/exprs/exec.go
@@ -395,6 +395,11 @@ func literalToDatum(mem memory.Allocator, lit
expr.Literal, ext ExtensionIDSet)
// You can provide an allocator to use through the context via
compute.WithAllocator.
//
// You can provide the ExtensionIDSet to use through the context via
WithExtensionIDSet.
+//
+// Note: Substrait expressions do not have an equivalent to
LargeString/LargeBinary, so
+// field references which expect a string or binary will allow their Large
counterparts
+// to be used as well. Most of the time, this shouldn't be an issue as the
compute kernels
+// should handle any casting that might be necessary.
func ExecuteScalarExpression(ctx context.Context, inputSchema *arrow.Schema,
expression expr.Expression, partialInput compute.Datum) (compute.Datum, error) {
if expression == nil {
return nil, arrow.ErrInvalid
@@ -479,9 +484,19 @@ func execFieldRef(ctx context.Context, e
*expr.FieldReference, input compute.Exe
} else if err != nil {
return nil, err
}
- if !arrow.TypeEqual(out.(compute.ArrayLikeDatum).Type(), expectedType) {
- return nil, fmt.Errorf("%w: referenced field %s was %s, but
should have been %s",
- arrow.ErrInvalid, ref,
out.(compute.ArrayLikeDatum).Type(), expectedType)
+
+ dt := out.(compute.ArrayLikeDatum).Type()
+ if !arrow.TypeEqual(dt, expectedType) {
+ // substrait doesn't have a LARGE_STRING or LARGE_BINARY type,
so we
+ // need to special case check if we got a LARGE_STRING or
LARGE_BINARY
+ // type when we expected a STRING or BINARY type.
+ switch {
+ case expectedType.ID() == arrow.STRING && dt.ID() ==
arrow.LARGE_STRING:
+ case expectedType.ID() == arrow.BINARY && dt.ID() ==
arrow.LARGE_BINARY:
+ default:
+ return nil, fmt.Errorf("%w: referenced field %s was %s,
but should have been %s",
+ arrow.ErrInvalid, ref, dt, expectedType)
+ }
}
return out, nil
diff --git a/arrow/compute/exprs/exec_test.go b/arrow/compute/exprs/exec_test.go
index 9812643..2d95810 100644
--- a/arrow/compute/exprs/exec_test.go
+++ b/arrow/compute/exprs/exec_test.go
@@ -621,3 +621,44 @@ func Test_Types(t *testing.T) {
})
}
}
+
+func TestLargeTypes(t *testing.T) {
+ sc := arrow.NewSchema([]arrow.Field{
+ {Name: "str", Type: arrow.BinaryTypes.LargeString, Nullable:
true},
+ {Name: "bin", Type: arrow.BinaryTypes.LargeBinary, Nullable:
true},
+ }, nil)
+
+ rec, _, err := array.RecordFromJSON(memory.DefaultAllocator, sc,
strings.NewReader(`[
+ {"str": "hello world", "bin": "Zm9vYmFy"}
+ ]`))
+ require.NoError(t, err)
+ defer rec.Release()
+
+ dr := compute.NewDatumWithoutOwning(rec)
+
+ t.Run("large_string ref", func(t *testing.T) {
+ bldr := exprs.NewExprBuilder(extSet)
+ require.NoError(t, bldr.SetInputSchema(sc))
+
+ e, err := bldr.FieldRef("str").BuildExpr()
+ require.NoError(t, err, "Failed to build field reference
expression")
+
+ ctx := context.Background()
+ result, err := exprs.ExecuteScalarExpression(ctx, sc, e, dr)
+ require.NoError(t, err, "Failed to execute scalar expression")
+ defer result.Release()
+ })
+
+ t.Run("large_binary ref", func(t *testing.T) {
+ bldr := exprs.NewExprBuilder(extSet)
+ require.NoError(t, bldr.SetInputSchema(sc))
+
+ e, err := bldr.FieldRef("bin").BuildExpr()
+ require.NoError(t, err, "Failed to build field reference
expression")
+
+ ctx := context.Background()
+ result, err := exprs.ExecuteScalarExpression(ctx, sc, e, dr)
+ require.NoError(t, err, "Failed to execute scalar expression")
+ defer result.Release()
+ })
+}