This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 240cbf4f83 Move examples in arrow-csv to docstrings, polish up docs
(#9001)
240cbf4f83 is described below
commit 240cbf4f838387445b0209db4b14dbb277b05a12
Author: Andrew Lamb <[email protected]>
AuthorDate: Fri Dec 19 12:52:31 2025 -0500
Move examples in arrow-csv to docstrings, polish up docs (#9001)
# Which issue does this PR close?
# Rationale for this change
While reviewing @xanderbailey's PR in
https://github.com/apache/arrow-rs/pull/8960, I found that there are
examples for arrow-csv and they are hard to find. Also, each example adds
an extra binary and thus slows down CI and tests. For example the
`whitespace_handling` example makes a new 2.9MB binary:
```shell
cargo run -p arrow-csv --example whitespace_handling
...
du -s -h target/debug/examples/whitespace_handling
2.9M target/debug/examples/whitespace_handling
```
Let's consolidate the examples to make them easier to find
# What changes are included in this PR?
1. Consolidate the examples
2. Improve other csv docs
# Are these changes tested?
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
# Are there any user-facing changes?
Docs only, no functional changes
---
arrow-csv/examples/README.md | 21 --------
arrow-csv/examples/csv_calculation.rs | 56 --------------------
arrow-csv/examples/whitespace_handling.rs | 86 -------------------------------
arrow-csv/src/lib.rs | 4 +-
arrow-csv/src/reader/mod.rs | 58 ++++++++++++++++++---
arrow-csv/src/writer.rs | 47 ++++++-----------
6 files changed, 71 insertions(+), 201 deletions(-)
diff --git a/arrow-csv/examples/README.md b/arrow-csv/examples/README.md
deleted file mode 100644
index 340413e76d..0000000000
--- a/arrow-csv/examples/README.md
+++ /dev/null
@@ -1,21 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-# Examples
-- [`csv_calculation.rs`](csv_calculation.rs): performs a simple calculation
using the CSV reader
\ No newline at end of file
diff --git a/arrow-csv/examples/csv_calculation.rs
b/arrow-csv/examples/csv_calculation.rs
deleted file mode 100644
index 6ce963e2b0..0000000000
--- a/arrow-csv/examples/csv_calculation.rs
+++ /dev/null
@@ -1,56 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use arrow_array::cast::AsArray;
-use arrow_array::types::Int16Type;
-use arrow_csv::ReaderBuilder;
-
-use arrow_schema::{DataType, Field, Schema};
-use std::fs::File;
-use std::sync::Arc;
-
-fn main() {
- // read csv from file
- let file = File::open("arrow-csv/test/data/example.csv").unwrap();
- let csv_schema = Schema::new(vec![
- Field::new("c1", DataType::Int16, true),
- Field::new("c2", DataType::Float32, true),
- Field::new("c3", DataType::Utf8, true),
- Field::new("c4", DataType::Boolean, true),
- ]);
- let mut reader = ReaderBuilder::new(Arc::new(csv_schema))
- .with_header(true)
- .build(file)
- .unwrap();
-
- match reader.next() {
- Some(r) => match r {
- Ok(r) => {
- // get the column(0) max value
- let col = r.column(0).as_primitive::<Int16Type>();
- let max = col.iter().max().flatten();
- println!("max value column(0): {max:?}")
- }
- Err(e) => {
- println!("{e:?}");
- }
- },
- None => {
- println!("csv is empty");
- }
- }
-}
diff --git a/arrow-csv/examples/whitespace_handling.rs
b/arrow-csv/examples/whitespace_handling.rs
deleted file mode 100644
index 77bb1a8a8c..0000000000
--- a/arrow-csv/examples/whitespace_handling.rs
+++ /dev/null
@@ -1,86 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use arrow_array::*;
-use arrow_csv::WriterBuilder;
-use arrow_schema::*;
-use std::sync::Arc;
-
-fn main() {
- // Create a sample schema with string columns
- let schema = Schema::new(vec![
- Field::new("name", DataType::Utf8, false),
- Field::new("city", DataType::Utf8, false),
- Field::new("country", DataType::Utf8, false),
- ]);
-
- // Create sample data with leading and trailing whitespace
- let name = StringArray::from(vec![
- " John Doe ",
- " Jane Smith",
- "Bob Johnson ",
- "Alice Williams",
- ]);
- let city = StringArray::from(vec![
- " New York ",
- "Los Angeles ",
- " Chicago",
- "Houston",
- ]);
- let country = StringArray::from(vec![" USA ", " USA ", " USA ", "
USA "]);
-
- let batch = RecordBatch::try_new(
- Arc::new(schema),
- vec![Arc::new(name), Arc::new(city), Arc::new(country)],
- )
- .unwrap();
-
- println!("Original CSV (with whitespace):");
- let mut buf = Vec::new();
- let mut writer = WriterBuilder::new().build(&mut buf);
- writer.write(&batch).unwrap();
- drop(writer);
- println!("{}", String::from_utf8(buf).unwrap());
-
- println!("\nCSV with ignore_leading_whitespace:");
- let mut buf = Vec::new();
- let mut writer = WriterBuilder::new()
- .with_ignore_leading_whitespace(true)
- .build(&mut buf);
- writer.write(&batch).unwrap();
- drop(writer);
- println!("{}", String::from_utf8(buf).unwrap());
-
- println!("\nCSV with ignore_trailing_whitespace:");
- let mut buf = Vec::new();
- let mut writer = WriterBuilder::new()
- .with_ignore_trailing_whitespace(true)
- .build(&mut buf);
- writer.write(&batch).unwrap();
- drop(writer);
- println!("{}", String::from_utf8(buf).unwrap());
-
- println!("\nCSV with both ignore_leading_whitespace and
ignore_trailing_whitespace:");
- let mut buf = Vec::new();
- let mut writer = WriterBuilder::new()
- .with_ignore_leading_whitespace(true)
- .with_ignore_trailing_whitespace(true)
- .build(&mut buf);
- writer.write(&batch).unwrap();
- drop(writer);
- println!("{}", String::from_utf8(buf).unwrap());
-}
diff --git a/arrow-csv/src/lib.rs b/arrow-csv/src/lib.rs
index 54c4fc03f5..4c4b040981 100644
--- a/arrow-csv/src/lib.rs
+++ b/arrow-csv/src/lib.rs
@@ -15,7 +15,9 @@
// specific language governing permissions and limitations
// under the License.
-//! Transfer data between the Arrow memory format and CSV (comma-separated
values).
+//! Transfer data between the [Apache Arrow] memory format and CSV
(comma-separated values).
+//!
+//! [Apache Arrow]: https://arrow.apache.org/
#![doc(
html_logo_url =
"https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_white-bg.svg",
diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index 0a72b57e85..e26072fea9 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-//! CSV Reader
+//! CSV Reading: [`Reader`] and [`ReaderBuilder`]
//!
//! # Basic Usage
//!
@@ -42,6 +42,46 @@
//! let batch = csv.next().unwrap().unwrap();
//! ```
//!
+//! # Example: Numeric calculations on CSV
+//! This code finds the maximum value in column 0 of a CSV file containing
+//! ```csv
+//! c1,c2,c3,c4
+//! 1,1.1,"hong kong",true
+//! 3,323.12,"XiAn",false
+//! 10,131323.12,"cheng du",false
+//! ```
+//!
+//! ```
+//! # use arrow_array::cast::AsArray;
+//! # use arrow_array::types::Int16Type;
+//! # use arrow_csv::ReaderBuilder;
+//! # use arrow_schema::{DataType, Field, Schema};
+//! # use std::fs::File;
+//! # use std::sync::Arc;
+//! // Open the example file
+//! let file = File::open("test/data/example.csv").unwrap();
+//! let csv_schema = Schema::new(vec![
+//! Field::new("c1", DataType::Int16, true),
+//! Field::new("c2", DataType::Float32, true),
+//! Field::new("c3", DataType::Utf8, true),
+//! Field::new("c4", DataType::Boolean, true),
+//! ]);
+//! let mut reader = ReaderBuilder::new(Arc::new(csv_schema))
+//! .with_header(true)
+//! .build(file)
+//! .unwrap();
+//! // find the maximum value in column 0 across all batches
+//! let mut max_c0 = 0;
+//! while let Some(r) = reader.next() {
+//! let r = r.unwrap(); // handle error
+//! // get the max value in column(0) for this batch
+//! let col = r.column(0).as_primitive::<Int16Type>();
+//! let batch_max = col.iter().max().flatten().unwrap_or_default();
+//! max_c0 = max_c0.max(batch_max);
+//! }
+//! assert_eq!(max_c0, 10);
+//! ```
+//!
//! # Async Usage
//!
//! The lower-level [`Decoder`] can be integrated with various forms of async
data streams,
@@ -441,13 +481,18 @@ pub fn infer_schema_from_files(
type Bounds = Option<(usize, usize)>;
/// CSV file reader using [`std::io::BufReader`]
+///
+/// See [`ReaderBuilder`] to construct a CSV reader with options and the
+/// [module-level documentation](crate::reader) for more details and examples
pub type Reader<R> = BufReader<StdBufReader<R>>;
-/// CSV file reader
+/// CSV file reader implementation. See [`Reader`] for usage
+///
+/// Despite having the same name as [`std::io::BufReader`], this structure does
+/// not buffer reads itself
pub struct BufReader<R> {
/// File reader
reader: R,
-
/// The decoder
decoder: Decoder,
}
@@ -1053,7 +1098,7 @@ fn build_boolean_array(
.map(|e| Arc::new(e) as ArrayRef)
}
-/// CSV file reader builder
+/// Builder for CSV [`Reader`]s
#[derive(Debug)]
pub struct ReaderBuilder {
/// Schema of the CSV file
@@ -1071,9 +1116,10 @@ pub struct ReaderBuilder {
}
impl ReaderBuilder {
- /// Create a new builder for configuring CSV parsing options.
+ /// Create a new builder for configuring [`Reader`] CSV parsing options.
///
- /// To convert a builder into a reader, call `ReaderBuilder::build`
+ /// To convert a builder into a reader, call [`ReaderBuilder::build`]. See
+ /// the [module-level documentation](crate::reader) for more details and
examples.
///
/// # Example
///
diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs
index fcf30a80dc..c38d1cdec3 100644
--- a/arrow-csv/src/writer.rs
+++ b/arrow-csv/src/writer.rs
@@ -15,13 +15,12 @@
// specific language governing permissions and limitations
// under the License.
-//! CSV Writer
+//! CSV Writing: [`Writer`] and [`WriterBuilder`]
//!
//! This CSV writer allows Arrow data (in record batches) to be written as CSV
files.
//! The writer does not support writing `ListArray` and `StructArray`.
//!
//! # Example
-//!
//! ```
//! # use arrow_array::*;
//! # use arrow_array::types::*;
@@ -75,14 +74,13 @@
//! - `DataType::LargeUtf8`
//! - `DataType::Utf8View`
//!
-//! ## Example with whitespace handling
+//! ## Example: Use [`WriterBuilder`] to control whitespace handling
//!
//! ```
//! # use arrow_array::*;
//! # use arrow_csv::WriterBuilder;
//! # use arrow_schema::*;
//! # use std::sync::Arc;
-//!
//! let schema = Schema::new(vec for examples.
#[derive(Debug)]
pub struct Writer<W: Write> {
/// The object to write to
@@ -248,12 +229,15 @@ pub struct Writer<W: Write> {
impl<W: Write> Writer<W> {
/// Create a new CsvWriter from a writable object, with default options
+ ///
+ /// See [`WriterBuilder`] for configuration options, and the [module
+ /// documentation](crate::writer) for examples.
pub fn new(writer: W) -> Self {
let delimiter = b',';
WriterBuilder::new().with_delimiter(delimiter).build(writer)
}
- /// Write a RecordBatch to a writable object
+ /// Write a RecordBatch to the underlying writer
pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> {
let num_columns = batch.num_columns();
if self.beginning {
@@ -418,9 +402,10 @@ impl Default for WriterBuilder {
}
impl WriterBuilder {
- /// Create a new builder for configuring CSV writing options.
+ /// Create a new builder for configuring CSV [`Writer`] options.
///
- /// To convert a builder into a writer, call `WriterBuilder::build`
+ /// To convert a builder into a writer, call [`WriterBuilder::build`]. See
+ /// the [module documentation](crate::writer) for more examples.
///
/// # Example
///