This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 03bdb2a190 GH-40278: [C++] Support casting string to duration in CSV converter (#46035)
03bdb2a190 is described below
commit 03bdb2a1901a10fc1c0f41bcd7dfa0f68a9a5aff
Author: Zihan Qi <[email protected]>
AuthorDate: Wed May 7 17:56:48 2025 +0200
GH-40278: [C++] Support casting string to duration in CSV converter (#46035)
### Rationale for this change
Currently, the Arrow C++ CSV converter does not support parsing strings
into `duration` types. This limits CSV ingestion capabilities when handling
datasets with time-based intervals represented as numeric strings (e.g.,
`1000`, `2000000`). This PR adds support for parsing such strings into Arrow's
`DurationType`.
Note: Human-readable duration formats such as `1s`, `2m`, or `3h` are not
supported in this PR. Support for those formats may be considered in a future
enhancement.
### What changes are included in this PR?
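- Added `DurationValueDecoder`, which parses values with `StringConverter<DurationType>`
- Registered duration support in both the standard converter factory (`Converter::Make`) and the dictionary converter factory (`DictionaryConverter::Make`)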
- Added unit tests covering:
  - Basic parsing across all time units (s, ms, µs, ns)
  - Null and custom null values
  - Whitespace handling and error cases
### Are these changes tested?
Yes, conversion logic is fully covered by new tests in `converter_test.cc`.
### Are there any user-facing changes?
Yes, users can now convert numeric strings in CSV files to Arrow
`duration` arrays by specifying a `duration` type for the column.
* GitHub Issue: #40278
Lead-authored-by: Zihan Qi <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/csv/converter.cc | 28 +++++++++++++++++
cpp/src/arrow/csv/converter_test.cc | 62 +++++++++++++++++++++++++++++++++++++
docs/source/cpp/csv.rst | 1 +
docs/source/python/csv.rst | 7 ++++-
4 files changed, 97 insertions(+), 1 deletion(-)
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index 3825364fa9..3e991126d1 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -470,6 +470,32 @@ struct MultipleParsersTimestampValueDecoder : public
ValueDecoder {
std::vector<const TimestampParser*> parsers_;
};
+//
+// Value decoder for durations
+//
+struct DurationValueDecoder : public ValueDecoder {
+ using value_type = int64_t;
+
+ explicit DurationValueDecoder(const std::shared_ptr<DataType>& type,
+ const ConvertOptions& options)
+ : ValueDecoder(type, options),
+ concrete_type_(checked_cast<const DurationType&>(*type)),
+ string_converter_() {}
+
+ Status Decode(const uint8_t* data, uint32_t size, bool quoted, value_type*
out) {
+ TrimWhiteSpace(&data, &size);
+ if (ARROW_PREDICT_FALSE(!string_converter_.Convert(
+ concrete_type_, reinterpret_cast<const char*>(data), size, out))) {
+ return GenericConversionError(type_, data, size);
+ }
+ return Status::OK();
+ }
+
+ protected:
+ const DurationType& concrete_type_;
+ arrow::internal::StringConverter<DurationType> string_converter_;
+};
+
/////////////////////////////////////////////////////////////////////////
// Concrete Converter hierarchy
@@ -702,6 +728,7 @@ Result<std::shared_ptr<Converter>> Converter::Make(const
std::shared_ptr<DataTyp
NUMERIC_CONVERTER_CASE(Type::DATE64, Date64Type)
NUMERIC_CONVERTER_CASE(Type::TIME32, Time32Type)
NUMERIC_CONVERTER_CASE(Type::TIME64, Time64Type)
+ NUMERIC_CONVERTER_CASE(Type::DURATION, DurationType)
CONVERTER_CASE(Type::BOOL, (PrimitiveConverter<BooleanType,
BooleanValueDecoder>))
CONVERTER_CASE(Type::BINARY,
(PrimitiveConverter<BinaryType, BinaryValueDecoder<false>>))
@@ -785,6 +812,7 @@ Result<std::shared_ptr<DictionaryConverter>>
DictionaryConverter::Make(
CONVERTER_CASE(Type::UINT64, UInt64Type, NumericValueDecoder<UInt64Type>)
CONVERTER_CASE(Type::FLOAT, FloatType, NumericValueDecoder<FloatType>)
CONVERTER_CASE(Type::DOUBLE, DoubleType, NumericValueDecoder<DoubleType>)
+ CONVERTER_CASE(Type::DURATION, DurationType, DurationValueDecoder)
REAL_CONVERTER_CASE(Type::DECIMAL, Decimal128Type, DecimalValueDecoder)
CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryType,
FixedSizeBinaryValueDecoder)
diff --git a/cpp/src/arrow/csv/converter_test.cc
b/cpp/src/arrow/csv/converter_test.cc
index f4491d7441..5dc078e7fd 100644
--- a/cpp/src/arrow/csv/converter_test.cc
+++ b/cpp/src/arrow/csv/converter_test.cc
@@ -660,6 +660,68 @@ TEST(TimestampConversion, UserDefinedParsersWithZone) {
AssertConversionError(type, {"01/02/1970,1970-01-03T00:00:00+0000\n"}, {0},
options);
}
+TEST(DurationConversion, Basics) {
+ auto type = duration(TimeUnit::SECOND);
+ AssertConversion<DurationType, int64_t>(
+ type, {"1,120\n", "10800,345600\n", "-1,-120\n", "-10800,-345600\n"},
+ {{1, 10800, -1, -10800}, {120, 345600, -120, -345600}});
+
+ type = duration(TimeUnit::MILLI);
+ AssertConversion<DurationType, int64_t>(
+ type, {"1000,120000\n", "10800000,345600000\n", "500,0\n",
"-1000,-120000\n"},
+ {{1000, 10800000, 500, -1000}, {120000, 345600000, 0, -120000}});
+
+ type = duration(TimeUnit::MICRO);
+ AssertConversion<DurationType, int64_t>(
+ type, {"1000000,500000\n", "120000000,10800000000\n",
"-500000,-1000000\n"},
+ {{1000000, 120000000, -500000}, {500000, 10800000000, -1000000}});
+
+ type = duration(TimeUnit::NANO);
+ AssertConversion<DurationType, int64_t>(
+ type,
+ {"1000000000,500000000\n", "120000000000,10800000000000\n", "7000,9\n",
+ "-7000,-9\n"},
+ {{1000000000, 120000000000, 7000, -7000}, {500000000, 10800000000000, 9,
-9}});
+}
+
+TEST(DurationConversion, Nulls) {
+ auto type = duration(TimeUnit::MILLI);
+ AssertConversion<DurationType, int64_t>(type, {"1000,N/A\n", ",10800000\n"},
+ {{1000, 0}, {0, 10800000}},
+ {{true, false}, {false, true}});
+}
+
+TEST(DurationConversion, CustomNulls) {
+ auto options = ConvertOptions::Defaults();
+ options.null_values = {"xxx", "zzz"};
+
+ auto type = duration(TimeUnit::SECOND);
+ AssertConversion<DurationType, int64_t>(type, {"1,xxx\n"}, {{1}, {0}},
+ {{true}, {false}}, options);
+
+ options.quoted_strings_can_be_null = false;
+ AssertConversionError(type, {"\"1\",\"xxx\"\n"}, {1}, options);
+
+ AssertConversion<DurationType, int64_t>(type, {"1,xxx\n", "zzz,120\n"},
+ {{1, 0}, {0, 120}},
+ {{true, false}, {false, true}},
options);
+}
+
+TEST(DurationConversion, Whitespace) {
+ auto type = duration(TimeUnit::MILLI);
+ AssertConversion<DurationType, int64_t>(type,
+ {" 1000 , 120000 \n", " 500 ,
10800000 \n"},
+ {{1000, 500}, {120000, 10800000}});
+}
+
+TEST(DurationConversion, Invalid) {
+ auto type = duration(TimeUnit::SECOND);
+ AssertConversionError(type, {"xyz\n"}, {0});
+ AssertConversionError(type, {"123abc\n"}, {0});
+ AssertConversionError(type, {"1.5\n"}, {0}); // floats not allowed
+ AssertConversionError(type, {"s1\n"}, {0}); // bad format
+}
+
Decimal128 Dec128(std::string_view value) {
Decimal128 dec;
int32_t scale = 0;
diff --git a/docs/source/cpp/csv.rst b/docs/source/cpp/csv.rst
index 6078ec5892..bcb17bdc58 100644
--- a/docs/source/cpp/csv.rst
+++ b/docs/source/cpp/csv.rst
@@ -265,6 +265,7 @@ can be chosen from the following list:
* Binary and Large Binary
* String and Large String (with optional UTF8 input validation)
* Fixed-Size Binary
+* Duration (numeric strings matching the schema unit, e.g., "60000" for duration[ms])
* Dictionary with index type Int32 and value type one of the following:
Binary, String, LargeBinary, LargeString, Int32, UInt32, Int64, UInt64,
Float32, Float64, Decimal128
diff --git a/docs/source/python/csv.rst b/docs/source/python/csv.rst
index f2c344a6fb..5eb68e9ccd 100644
--- a/docs/source/python/csv.rst
+++ b/docs/source/python/csv.rst
@@ -30,7 +30,7 @@ The features currently offered are the following:
* fetching column names from the first row in the CSV file
* column-wise type inference and conversion to one of ``null``, ``int64``,
``float64``, ``date32``, ``time32[s]``, ``timestamp[s]``, ``timestamp[ns]``,
- ``string`` or ``binary`` data
+ ``duration`` (from numeric strings), ``string`` or ``binary`` data
* opportunistic dictionary encoding of ``string`` and ``binary`` columns
(disabled by default)
* detecting various spellings of null values such as ``NaN`` or ``#N/A``
@@ -125,6 +125,11 @@ a :class:`ConvertOptions` instance and pass it to
:func:`read_csv`::
}
))
+.. note::
+ To assign a column as ``duration``, the CSV values must be numeric strings
+ that match the expected unit (e.g. ``60000`` for 60 seconds when
+ using ``duration[ms]``).
+
Available convert options are:
.. autosummary::