kazuyukitanimura commented on code in PR #4017:
URL: https://github.com/apache/datafusion-comet/pull/4017#discussion_r3140745084


##########
native/spark-expr/src/string_funcs/substring.rs:
##########
@@ -113,3 +119,629 @@ impl PhysicalExpr for SubstringExpr {
         )))
     }
 }
+
+/// Implement Spark's substring semantics for negative start positions.
+/// Spark: start = numChars + pos, end = start + len, clamp both, empty if 
start >= end.
+/// Arrow: start = max(0, numChars + pos), take len chars — differs when start 
is clamped.
+fn spark_substring_negative_start(
+    array: &ArrayRef,
+    start: i64,
+    len: u64,
+) -> datafusion::common::Result<ArrayRef> {
+    use arrow::array::{
+        BinaryArray, DictionaryArray, GenericBinaryBuilder, 
GenericStringBuilder, LargeBinaryArray,
+    };
+
+    match array.data_type() {
+        DataType::Utf8 => {
+            let str_array = as_string_array(array);
+            let mut builder = GenericStringBuilder::<i32>::new();
+            for i in 0..str_array.len() {
+                if str_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    
builder.append_value(spark_substr_negative(str_array.value(i), start, len));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::LargeUtf8 => {
+            let str_array = as_largestring_array(array);
+            let mut builder = GenericStringBuilder::<i64>::new();
+            for i in 0..str_array.len() {
+                if str_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    
builder.append_value(spark_substr_negative(str_array.value(i), start, len));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::Binary => {
+            let bin_array = 
array.as_any().downcast_ref::<BinaryArray>().unwrap();
+            let mut builder = GenericBinaryBuilder::<i32>::new();
+            for i in 0..bin_array.len() {
+                if bin_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    builder.append_value(spark_binary_substr_negative(
+                        bin_array.value(i),
+                        start,
+                        len,
+                    ));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::LargeBinary => {
+            let bin_array = 
array.as_any().downcast_ref::<LargeBinaryArray>().unwrap();
+            let mut builder = GenericBinaryBuilder::<i64>::new();
+            for i in 0..bin_array.len() {
+                if bin_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    builder.append_value(spark_binary_substr_negative(
+                        bin_array.value(i),
+                        start,
+                        len,
+                    ));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::Dictionary(_, _) => {
+            let dict = as_dictionary_array::<Int32Type>(array);
+            let values = spark_substring_negative_start(dict.values(), start, 
len)?;
+            let result = DictionaryArray::try_new(dict.keys().clone(), 
values)?;
+            Ok(Arc::new(result) as ArrayRef)
+        }
+        _ => Ok(Arc::clone(array)),

Review Comment:
   Thank you, updated



##########
native/spark-expr/src/string_funcs/substring.rs:
##########
@@ -113,3 +119,629 @@ impl PhysicalExpr for SubstringExpr {
         )))
     }
 }
+
+/// Implement Spark's substring semantics for negative start positions.
+/// Spark: start = numChars + pos, end = start + len, clamp both, empty if 
start >= end.
+/// Arrow: start = max(0, numChars + pos), take len chars — differs when start 
is clamped.
+fn spark_substring_negative_start(
+    array: &ArrayRef,
+    start: i64,
+    len: u64,
+) -> datafusion::common::Result<ArrayRef> {
+    use arrow::array::{
+        BinaryArray, DictionaryArray, GenericBinaryBuilder, 
GenericStringBuilder, LargeBinaryArray,
+    };
+
+    match array.data_type() {
+        DataType::Utf8 => {
+            let str_array = as_string_array(array);
+            let mut builder = GenericStringBuilder::<i32>::new();
+            for i in 0..str_array.len() {
+                if str_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    
builder.append_value(spark_substr_negative(str_array.value(i), start, len));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::LargeUtf8 => {
+            let str_array = as_largestring_array(array);
+            let mut builder = GenericStringBuilder::<i64>::new();
+            for i in 0..str_array.len() {
+                if str_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    
builder.append_value(spark_substr_negative(str_array.value(i), start, len));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::Binary => {
+            let bin_array = 
array.as_any().downcast_ref::<BinaryArray>().unwrap();
+            let mut builder = GenericBinaryBuilder::<i32>::new();
+            for i in 0..bin_array.len() {
+                if bin_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    builder.append_value(spark_binary_substr_negative(
+                        bin_array.value(i),
+                        start,
+                        len,
+                    ));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::LargeBinary => {
+            let bin_array = 
array.as_any().downcast_ref::<LargeBinaryArray>().unwrap();
+            let mut builder = GenericBinaryBuilder::<i64>::new();
+            for i in 0..bin_array.len() {
+                if bin_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    builder.append_value(spark_binary_substr_negative(
+                        bin_array.value(i),
+                        start,
+                        len,
+                    ));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::Dictionary(_, _) => {
+            let dict = as_dictionary_array::<Int32Type>(array);
+            let values = spark_substring_negative_start(dict.values(), start, 
len)?;
+            let result = DictionaryArray::try_new(dict.keys().clone(), 
values)?;
+            Ok(Arc::new(result) as ArrayRef)
+        }
+        _ => Ok(Arc::clone(array)),
+    }
+}
+
+fn spark_substr_negative(s: &str, pos: i64, len: u64) -> String {
+    let num_chars = s.chars().count() as i64;
+    let start = num_chars + pos;
+    let end = start.saturating_add(len as i64).min(num_chars);
+    let start = start.max(0);
+
+    if start >= end {
+        return String::new();
+    }
+
+    s.chars()
+        .skip(start as usize)
+        .take((end - start) as usize)
+        .collect()
+}

Review Comment:
   Updated



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to