andygrove commented on code in PR #4017:
URL: https://github.com/apache/datafusion-comet/pull/4017#discussion_r3140653932


##########
native/spark-expr/src/string_funcs/substring.rs:
##########
@@ -113,3 +119,629 @@ impl PhysicalExpr for SubstringExpr {
         )))
     }
 }
+
+/// Implement Spark's substring semantics for negative start positions.
+/// Spark: start = numChars + pos, end = start + len, clamp both, empty if 
start >= end.
+/// Arrow: start = max(0, numChars + pos), take len chars — differs when start 
is clamped.
+fn spark_substring_negative_start(
+    array: &ArrayRef,
+    start: i64,
+    len: u64,
+) -> datafusion::common::Result<ArrayRef> {
+    use arrow::array::{
+        BinaryArray, DictionaryArray, GenericBinaryBuilder, 
GenericStringBuilder, LargeBinaryArray,
+    };
+
+    match array.data_type() {
+        DataType::Utf8 => {
+            let str_array = as_string_array(array);
+            let mut builder = GenericStringBuilder::<i32>::new();
+            for i in 0..str_array.len() {
+                if str_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    
builder.append_value(spark_substr_negative(str_array.value(i), start, len));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::LargeUtf8 => {
+            let str_array = as_largestring_array(array);
+            let mut builder = GenericStringBuilder::<i64>::new();
+            for i in 0..str_array.len() {
+                if str_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    
builder.append_value(spark_substr_negative(str_array.value(i), start, len));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::Binary => {
+            let bin_array = 
array.as_any().downcast_ref::<BinaryArray>().unwrap();
+            let mut builder = GenericBinaryBuilder::<i32>::new();
+            for i in 0..bin_array.len() {
+                if bin_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    builder.append_value(spark_binary_substr_negative(
+                        bin_array.value(i),
+                        start,
+                        len,
+                    ));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::LargeBinary => {
+            let bin_array = 
array.as_any().downcast_ref::<LargeBinaryArray>().unwrap();
+            let mut builder = GenericBinaryBuilder::<i64>::new();
+            for i in 0..bin_array.len() {
+                if bin_array.is_null(i) {
+                    builder.append_null();
+                } else {
+                    builder.append_value(spark_binary_substr_negative(
+                        bin_array.value(i),
+                        start,
+                        len,
+                    ));
+                }
+            }
+            Ok(Arc::new(builder.finish()) as ArrayRef)
+        }
+        DataType::Dictionary(_, _) => {
+            let dict = as_dictionary_array::<Int32Type>(array);
+            let values = spark_substring_negative_start(dict.values(), start, 
len)?;
+            let result = DictionaryArray::try_new(dict.keys().clone(), 
values)?;
+            Ok(Arc::new(result) as ArrayRef)
+        }
+        _ => Ok(Arc::clone(array)),
+    }
+}
+
+fn spark_substr_negative(s: &str, pos: i64, len: u64) -> String {
+    let num_chars = s.chars().count() as i64;
+    let start = num_chars + pos;
+    let end = start.saturating_add(len as i64).min(num_chars);
+    let start = start.max(0);
+
+    if start >= end {
+        return String::new();
+    }
+
+    s.chars()
+        .skip(start as usize)
+        .take((end - start) as usize)
+        .collect()
+}

Review Comment:
   Claude recommended an optimized version to avoid an intermediate string 
allocation per row. I have not verified.
   
   ```rust
   fn spark_substr_negative(s: &str, pos: i64, len: u64) -> &str {              
                                                                                
                                                          
         let num_chars = s.chars().count() as i64;                              
         let end = (num_chars + pos).saturating_add(len as i64).min(num_chars); 
                                                                                
                                                            
         let start = (num_chars + pos).max(0);
         if start >= end {                                                      
                                                                                
                                                            
             return "";                                                         
                                                                                
                                                            
         }
                                                                                
                                                                                
                                                            
         // Translate char indices [start, end) to byte offsets in a single 
forward pass.                                                                   
                                                                
         let mut it = s.char_indices();
         let byte_start = it.by_ref().nth(start as usize).map(|(b, _)| 
b).unwrap_or(s.len());                                                          
                                                                     
         let span = (end - start - 1) as usize;                                 
                                                                                
                                                            
         let byte_end = it.nth(span).map(|(b, _)| b).unwrap_or(s.len());
                                                                                
                                                                                
                                                            
         &s[byte_start..byte_end]                                               
     }         
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to