This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 785697094f docs(parquet): add example for preserving dictionary encoding (#9116)
785697094f is described below
commit 785697094fddffa7e5e38428a2ee4dbfa3c9af48
Author: Andrea Bozzo <[email protected]>
AuthorDate: Sat Jan 10 13:22:45 2026 +0100
docs(parquet): add example for preserving dictionary encoding (#9116)
This PR adds a second example to `ArrowReaderOptions::with_schema`
demonstrating how to preserve dictionary encoding when reading Parquet
string columns.
Closes #9095
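
For quick reference, the pattern the new doctest documents is passing a schema hint with a `Dictionary` data type to `ArrowReaderOptions::with_schema` (a minimal sketch of the core calls, assuming `file` is an open Parquet file whose `city` column was written as `Utf8`; the full runnable example is in the diff below):

```rust
// Sketch only; see the doctest in the diff for a complete, self-contained example.
let dict_schema = Arc::new(Schema::new(vec![Field::new(
    "city",
    DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
    false,
)]));
let options = ArrowReaderOptions::new().with_schema(dict_schema);
let reader = ParquetRecordBatchReaderBuilder::try_new_with_options(file, options)?
    .build()?;
```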
---
parquet/src/arrow/arrow_reader/mod.rs | 53 +++++++++++++++++++++++++++++++++++
1 file changed, 53 insertions(+)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index a626076ebd..6a3f76b388 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -508,6 +508,59 @@ impl ArrowReaderOptions {
/// let mut reader = builder.build().unwrap();
/// let _batch = reader.next().unwrap().unwrap();
/// ```
+ ///
+ /// # Example: Preserving Dictionary Encoding
+ ///
+ /// By default, Parquet string columns are read as `StringArray` (or `LargeStringArray`),
+ /// even if the underlying Parquet data uses dictionary encoding. You can preserve
+ /// the dictionary encoding by specifying a `Dictionary` type in the schema hint:
+ ///
+ /// ```
+ /// use std::sync::Arc;
+ /// use tempfile::tempfile;
+ /// use arrow_array::{ArrayRef, RecordBatch, StringArray};
+ /// use arrow_schema::{DataType, Field, Schema};
+ /// use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder};
+ /// use parquet::arrow::ArrowWriter;
+ ///
+ /// // Write a Parquet file with string data
+ /// let file = tempfile().unwrap();
+ /// let schema = Arc::new(Schema::new(vec![
+ /// Field::new("city", DataType::Utf8, false)
+ /// ]));
+ /// let cities = StringArray::from(vec!["Berlin", "Berlin", "Paris", "Berlin", "Paris"]);
+ /// let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(cities) as ArrayRef]).unwrap();
+ ///
+ /// let mut writer = ArrowWriter::try_new(file.try_clone().unwrap(), batch.schema(), None).unwrap();
+ /// writer.write(&batch).unwrap();
+ /// writer.close().unwrap();
+ ///
+ /// // Read the file back, requesting dictionary encoding preservation
+ /// let dict_schema = Arc::new(Schema::new(vec![
+ /// Field::new("city", DataType::Dictionary(
+ /// Box::new(DataType::Int32),
+ /// Box::new(DataType::Utf8)
+ /// ), false)
+ /// ]));
+ /// let options = ArrowReaderOptions::new().with_schema(dict_schema);
+ /// let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(
+ /// file.try_clone().unwrap(),
+ /// options
+ /// ).unwrap();
+ ///
+ /// let mut reader = builder.build().unwrap();
+ /// let batch = reader.next().unwrap().unwrap();
+ ///
+ /// // The column is now a DictionaryArray
+ /// assert!(matches!(
+ /// batch.column(0).data_type(),
+ /// DataType::Dictionary(_, _)
+ /// ));
+ /// ```
+ ///
+ /// **Note**: Dictionary encoding preservation works best when:
+ /// 1. The original column was dictionary encoded (the default for string columns)
+ /// 2. There are a small number of distinct values
pub fn with_schema(self, schema: SchemaRef) -> Self {
Self {
supplied_schema: Some(schema),