This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 03bdb2a190 GH-40278: [C++] Support casting string to duration in CSV 
converter (#46035)
03bdb2a190 is described below

commit 03bdb2a1901a10fc1c0f41bcd7dfa0f68a9a5aff
Author: Zihan Qi <[email protected]>
AuthorDate: Wed May 7 17:56:48 2025 +0200

    GH-40278: [C++] Support casting string to duration in CSV converter (#46035)
    
    ### Rationale for this change
    Currently, the Arrow C++ CSV converter does not support parsing strings 
into `duration` types. This limits CSV ingestion capabilities when handling 
datasets with time-based intervals represented as numeric strings (e.g., 
`1000`, `2000000`). This PR adds support for parsing such strings into Arrow's 
`DurationType`.
    
    Note: Human-readable duration formats such as `1s`, `2m`, or `3h` are not 
supported in this PR. Support for those formats may be considered in a future 
enhancement.
    
    ### What changes are included in this PR?
    - Added `DurationValueDecoder` using `StringConverter<DurationType>`
    - Registered support in both standard and dictionary converters
    - Added unit tests covering:
      - Basic parsing across all time units (s, ms, µs, ns)
      - Null and custom null values
      - Whitespace handling and error cases
    
    ### Are these changes tested?
    Yes, conversion logic is fully covered by new tests in `converter_test.cc`.
    
    ### Are there any user-facing changes?
    Yes, users can now convert duration strings in CSV files to Arrow 
`duration` arrays by specifying the appropriate schema type.
    
    * GitHub Issue: #40278
    
    Lead-authored-by: Zihan Qi <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/csv/converter.cc      | 28 +++++++++++++++++
 cpp/src/arrow/csv/converter_test.cc | 62 +++++++++++++++++++++++++++++++++++++
 docs/source/cpp/csv.rst             |  1 +
 docs/source/python/csv.rst          |  7 ++++-
 4 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index 3825364fa9..3e991126d1 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -470,6 +470,32 @@ struct MultipleParsersTimestampValueDecoder : public 
ValueDecoder {
   std::vector<const TimestampParser*> parsers_;
 };
 
+// Value decoder for durations: trims the cell, then parses it with
+// StringConverter<DurationType> into an int64_t; a failed parse is reported
+// via GenericConversionError.
+struct DurationValueDecoder : public ValueDecoder {
+  using value_type = int64_t;
+
+  explicit DurationValueDecoder(const std::shared_ptr<DataType>& type,
+                                const ConvertOptions& options)
+      : ValueDecoder(type, options),
+        concrete_type_(checked_cast<const DurationType&>(*type)),
+        string_converter_() {}
+
+  Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type* 
out) {  // `quoted` is not consulted by this decoder
+    TrimWhiteSpace(&data, &size);
+    if (ARROW_PREDICT_FALSE(!string_converter_.Convert(
+            concrete_type_, reinterpret_cast<const char*>(data), size, out))) {
+      return GenericConversionError(type_, data, size);
+    }
+    return Status::OK();
+  }
+
+ protected:
+  const DurationType& concrete_type_;  // bound to *type from the constructor
+  arrow::internal::StringConverter<DurationType> string_converter_;
+};
+
 /////////////////////////////////////////////////////////////////////////
 // Concrete Converter hierarchy
 
@@ -702,6 +728,7 @@ Result<std::shared_ptr<Converter>> Converter::Make(const 
std::shared_ptr<DataTyp
     NUMERIC_CONVERTER_CASE(Type::DATE64, Date64Type)
     NUMERIC_CONVERTER_CASE(Type::TIME32, Time32Type)
     NUMERIC_CONVERTER_CASE(Type::TIME64, Time64Type)
+    NUMERIC_CONVERTER_CASE(Type::DURATION, DurationType)
     CONVERTER_CASE(Type::BOOL, (PrimitiveConverter<BooleanType, 
BooleanValueDecoder>))
     CONVERTER_CASE(Type::BINARY,
                    (PrimitiveConverter<BinaryType, BinaryValueDecoder<false>>))
@@ -785,6 +812,7 @@ Result<std::shared_ptr<DictionaryConverter>> 
DictionaryConverter::Make(
     CONVERTER_CASE(Type::UINT64, UInt64Type, NumericValueDecoder<UInt64Type>)
     CONVERTER_CASE(Type::FLOAT, FloatType, NumericValueDecoder<FloatType>)
     CONVERTER_CASE(Type::DOUBLE, DoubleType, NumericValueDecoder<DoubleType>)
+    CONVERTER_CASE(Type::DURATION, DurationType, DurationValueDecoder)
     REAL_CONVERTER_CASE(Type::DECIMAL, Decimal128Type, DecimalValueDecoder)
     CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryType,
                    FixedSizeBinaryValueDecoder)
diff --git a/cpp/src/arrow/csv/converter_test.cc 
b/cpp/src/arrow/csv/converter_test.cc
index f4491d7441..5dc078e7fd 100644
--- a/cpp/src/arrow/csv/converter_test.cc
+++ b/cpp/src/arrow/csv/converter_test.cc
@@ -660,6 +660,68 @@ TEST(TimestampConversion, UserDefinedParsersWithZone) {
   AssertConversionError(type, {"01/02/1970,1970-01-03T00:00:00+0000\n"}, {0}, 
options);
 }
 
+TEST(DurationConversion, Basics) {  // positive and negative integers, every unit
+  auto type = duration(TimeUnit::SECOND);  // expected values are listed per column
+  AssertConversion<DurationType, int64_t>(
+      type, {"1,120\n", "10800,345600\n", "-1,-120\n", "-10800,-345600\n"},
+      {{1, 10800, -1, -10800}, {120, 345600, -120, -345600}});
+
+  type = duration(TimeUnit::MILLI);
+  AssertConversion<DurationType, int64_t>(
+      type, {"1000,120000\n", "10800000,345600000\n", "500,0\n", 
"-1000,-120000\n"},
+      {{1000, 10800000, 500, -1000}, {120000, 345600000, 0, -120000}});
+
+  type = duration(TimeUnit::MICRO);
+  AssertConversion<DurationType, int64_t>(
+      type, {"1000000,500000\n", "120000000,10800000000\n", 
"-500000,-1000000\n"},
+      {{1000000, 120000000, -500000}, {500000, 10800000000, -1000000}});
+
+  type = duration(TimeUnit::NANO);
+  AssertConversion<DurationType, int64_t>(
+      type,
+      {"1000000000,500000000\n", "120000000000,10800000000000\n", "7000,9\n",
+       "-7000,-9\n"},
+      {{1000000000, 120000000000, 7000, -7000}, {500000000, 10800000000000, 9, 
-9}});
+}
+
+TEST(DurationConversion, Nulls) {  // "N/A" and an empty cell under default options
+  auto type = duration(TimeUnit::MILLI);
+  AssertConversion<DurationType, int64_t>(type, {"1000,N/A\n", ",10800000\n"},
+                                          {{1000, 0}, {0, 10800000}},
+                                          {{true, false}, {false, true}});  // presumably per-column validity
+}
+
+TEST(DurationConversion, CustomNulls) {  // user-supplied null spellings
+  auto options = ConvertOptions::Defaults();
+  options.null_values = {"xxx", "zzz"};
+
+  auto type = duration(TimeUnit::SECOND);
+  AssertConversion<DurationType, int64_t>(type, {"1,xxx\n"}, {{1}, {0}},
+                                          {{true}, {false}}, options);
+
+  options.quoted_strings_can_be_null = false;  // a quoted "xxx" must now be an error
+  AssertConversionError(type, {"\"1\",\"xxx\"\n"}, {1}, options);
+
+  AssertConversion<DurationType, int64_t>(type, {"1,xxx\n", "zzz,120\n"},  // unquoted spellings still convert
+                                          {{1, 0}, {0, 120}},
+                                          {{true, false}, {false, true}}, 
options);
+}
+
+TEST(DurationConversion, Whitespace) {  // spaces around a value are accepted
+  auto type = duration(TimeUnit::MILLI);
+  AssertConversion<DurationType, int64_t>(type,
+                                          {" 1000 , 120000 \n", " 500 , 
10800000 \n"},
+                                          {{1000, 500}, {120000, 10800000}});
+}
+
+TEST(DurationConversion, Invalid) {  // each input below must fail to convert
+  auto type = duration(TimeUnit::SECOND);
+  AssertConversionError(type, {"xyz\n"}, {0});
+  AssertConversionError(type, {"123abc\n"}, {0});
+  AssertConversionError(type, {"1.5\n"}, {0});  // floats not allowed
+  AssertConversionError(type, {"s1\n"}, {0});   // bad format
+}
+
 Decimal128 Dec128(std::string_view value) {
   Decimal128 dec;
   int32_t scale = 0;
diff --git a/docs/source/cpp/csv.rst b/docs/source/cpp/csv.rst
index 6078ec5892..bcb17bdc58 100644
--- a/docs/source/cpp/csv.rst
+++ b/docs/source/cpp/csv.rst
@@ -265,6 +265,7 @@ can be chosen from the following list:
 * Binary and Large Binary
 * String and Large String (with optional UTF8 input validation)
 * Fixed-Size Binary
+* Duration (integer strings expressed in the schema unit, e.g., "60000" for 
duration[ms])
 * Dictionary with index type Int32 and value type one of the following:
   Binary, String, LargeBinary, LargeString,  Int32, UInt32, Int64, UInt64,
   Float32, Float64, Decimal128
diff --git a/docs/source/python/csv.rst b/docs/source/python/csv.rst
index f2c344a6fb..5eb68e9ccd 100644
--- a/docs/source/python/csv.rst
+++ b/docs/source/python/csv.rst
@@ -30,7 +30,7 @@ The features currently offered are the following:
 * fetching column names from the first row in the CSV file
 * column-wise type inference and conversion to one of ``null``, ``int64``,
   ``float64``, ``date32``, ``time32[s]``, ``timestamp[s]``, ``timestamp[ns]``,
-  ``string`` or ``binary`` data
+  ``duration`` (from numeric strings), ``string`` or ``binary`` data
 * opportunistic dictionary encoding of ``string`` and ``binary`` columns
   (disabled by default)
 * detecting various spellings of null values such as ``NaN`` or ``#N/A``
@@ -125,6 +125,11 @@ a :class:`ConvertOptions` instance and pass it to 
:func:`read_csv`::
        }
    ))
 
+.. note::
+   To read a column as ``duration``, the CSV values must be integer strings
+   expressed in the column's unit (e.g. ``60000`` for 60 seconds when
+   using ``duration[ms]``).
+
 Available convert options are:
 
 .. autosummary::

Reply via email to