andygrove commented on code in PR #4017:
URL: https://github.com/apache/datafusion-comet/pull/4017#discussion_r3140626275


##########
native/spark-expr/src/string_funcs/substring.rs:
##########
@@ -113,3 +119,629 @@ impl PhysicalExpr for SubstringExpr {
         )))
     }
 }
+
+/// Implement Spark's substring semantics for negative start positions.
+/// Spark: start = numChars + pos, end = start + len, clamp both, empty if 
start >= end.
+/// Arrow: start = max(0, numChars + pos), take len chars — differs when start 
is clamped.
+fn spark_substring_negative_start(
+    array: &ArrayRef,
+    start: i64,
+    len: u64,
+) -> datafusion::common::Result<ArrayRef> {
+    use arrow::array::{
+        BinaryArray, DictionaryArray, GenericBinaryBuilder, 
GenericStringBuilder, LargeBinaryArray,
+    };
+
+    match array.data_type() {
+        DataType::Utf8 => {
+            let str_array = as_string_array(array);
+            let mut builder = GenericStringBuilder::<i32>::new();
+            for i in 0..str_array.len() {
+                if str_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    
builder.append_value(spark_substr_negative(str_array.value(i), start, len));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::LargeUtf8 => {
+            let str_array = as_largestring_array(array);
+            let mut builder = GenericStringBuilder::<i64>::new();
+            for i in 0..str_array.len() {
+                if str_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    
builder.append_value(spark_substr_negative(str_array.value(i), start, len));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::Binary => {
+            let bin_array = 
array.as_any().downcast_ref::<BinaryArray>().unwrap();
+            let mut builder = GenericBinaryBuilder::<i32>::new();
+            for i in 0..bin_array.len() {
+                if bin_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    builder.append_value(spark_binary_substr_negative(
+                        bin_array.value(i),
+                        start,
+                        len,
+                    ));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::LargeBinary => {
+            let bin_array = 
array.as_any().downcast_ref::<LargeBinaryArray>().unwrap();
+            let mut builder = GenericBinaryBuilder::<i64>::new();
+            for i in 0..bin_array.len() {
+                if bin_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    builder.append_value(spark_binary_substr_negative(
+                        bin_array.value(i),
+                        start,
+                        len,
+                    ));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::Dictionary(_, _) => {
+            let dict = as_dictionary_array::<Int32Type>(array);
+            let values = spark_substring_negative_start(dict.values(), start, 
len)?;
+            let result = DictionaryArray::try_new(dict.keys().clone(), 
values)?;
+            Ok(Arc::new(result) as ArrayRef)
+        }
+        _ => Ok(Arc::clone(array)),

Review Comment:
   Should this be an error rather than just returning the input data?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to