This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 785697094f docs(parquet): add example for preserving dictionary encoding (#9116)
785697094f is described below
commit 785697094fddffa7e5e38428a2ee4dbfa3c9af48
Author: Andrea Bozzo <[email protected]>
AuthorDate: Sat Jan 10 13:22:45 2026 +0100
docs(parquet): add example for preserving dictionary encoding (#9116)
This PR adds a second example to `ArrowReaderOptions::with_schema`
demonstrating how to preserve dictionary encoding when reading Parquet
string columns.
Closes #9095
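
For quick reference, the pattern the new doctest documents is passing a schema hint with a `Dictionary` data type to `ArrowReaderOptions::with_schema` (a minimal sketch of the core calls, assuming `file` is an open Parquet file whose `city` column was written as `Utf8`; the full runnable example is in the diff below):

```rust
// Sketch only; see the doctest in the diff for a complete, self-contained example.
let dict_schema = Arc::new(Schema::new(vec![Field::new(
    "city",
    DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
    false,
)]));
let options = ArrowReaderOptions::new().with_schema(dict_schema);
let reader = ParquetRecordBatchReaderBuilder::try_new_with_options(file, options)?
    .build()?;
```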
---
parquet/src/arrow/arrow_reader/mod.rs | 53 +++++++++++++++++++++++++++++++++++
1 file changed, 53 insertions(+)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index a626076ebd..6a3f76b388 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -508,6 +508,59 @@ impl ArrowReaderOptions {
/// let mut reader = builder.build().unwrap();
/// let _batch = reader.next().unwrap().unwrap();
/// ```
+ ///
+ /// # Example: Preserving Dictionary Encoding
+ ///
+ /// By default, Parquet string columns are read as `StringArray` (or `LargeStringArray`),
+ /// even if the underlying Parquet data uses dictionary encoding. You can preserve
+ /// the dictionary encoding by specifying a `Dictionary` type in the schema hint:
+ ///
+ /// ```
+ /// use std::sync::Arc;
+ /// use tempfile::tempfile;
+ /// use arrow_array::{ArrayRef, RecordBatch, StringArray};
+ /// use arrow_schema::{DataType, Field, Schema};
+ /// use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder};
+ /// use parquet::arrow::ArrowWriter;
+ ///
+ /// // Write a Parquet file with string data
+ /// let file = tempfile().unwrap();
+ /// let schema = Arc::new(Schema::new(vec![
+ /// Field::new("city", DataType::Utf8, false)
+ /// ]));
+ /// let cities = StringArray::from(vec!["Berlin", "Berlin", "Paris", "Berlin", "Paris"]);
+ /// let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(cities) as ArrayRef]).unwrap();
+ ///
+ /// let mut writer = ArrowWriter::try_new(file.try_clone().unwrap(), batch.schema(), None).unwrap();
+ /// writer.write(&batch).unwrap();
+ /// writer.close().unwrap();
+ ///
+ /// // Read the file back, requesting dictionary encoding preservation
+ /// let dict_schema = Arc::new(Schema::new(vec![
+ /// Field::new("city", DataType::Dictionary(
+ /// Box::new(DataType::Int32),
+ /// Box::new(DataType::Utf8)
+ /// ), false)
+ /// ]));
+ /// let options = ArrowReaderOptions::new().with_schema(dict_schema);
+ /// let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(
+ /// file.try_clone().unwrap(),
+ /// options
+ /// ).unwrap();
+ ///
+ /// let mut reader = builder.build().unwrap();
+ /// let batch = reader.next().unwrap().unwrap();
+ ///
+ /// // The column is now a DictionaryArray
+ /// assert!(matches!(
+ /// batch.column(0).data_type(),
+ /// DataType::Dictionary(_, _)
+ /// ));
+ /// ```
+ ///
+ /// **Note**: Dictionary encoding preservation works best when:
+ /// 1. The original column was dictionary encoded (the default for string columns)
+ /// 2. There are a small number of distinct values
pub fn with_schema(self, schema: SchemaRef) -> Self {
Self {
supplied_schema: Some(schema),