This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 885e7bf956 Add more examples of using Parquet encryption (#7374)
885e7bf956 is described below

commit 885e7bf9567940c6be4ab162984a42f3fdb99d78
Author: Adam Reeve <[email protected]>
AuthorDate: Sat Apr 5 02:30:04 2025 +1300

    Add more examples of using Parquet encryption (#7374)
    
    * Add encryption round-trip example
    
    * Add examples for building FileEncryptionProperties and FileDecryptionProperties
    
    * Add an example of using a KeyRetriever
    
    * Apply suggestions from code review
    
    Co-authored-by: Matthijs Brobbel <[email protected]>
    
    ---------
    
    Co-authored-by: Matthijs Brobbel <[email protected]>
---
 parquet/src/encryption/decrypt.rs | 111 +++++++++++++++++++++++++++++++++++++-
 parquet/src/encryption/encrypt.rs |  37 +++++++++++++
 parquet/src/encryption/mod.rs     |  92 ++++++++++++++++++++++++++++++-
 3 files changed, 237 insertions(+), 3 deletions(-)

diff --git a/parquet/src/encryption/decrypt.rs 
b/parquet/src/encryption/decrypt.rs
index 41e5757f36..0927421344 100644
--- a/parquet/src/encryption/decrypt.rs
+++ b/parquet/src/encryption/decrypt.rs
@@ -28,6 +28,77 @@ use std::io::Read;
 use std::sync::Arc;
 
 /// Trait for retrieving an encryption key using the key's metadata
+///
+/// # Example
+///
+/// This shows how you might use a `KeyRetriever` to decrypt a Parquet file
+/// if you have a set of known encryption keys with identifiers, but at read 
time
+/// you may not know which columns were encrypted and which keys were used.
+///
+/// In practice, the key metadata might instead store an encrypted key that 
must
+/// be decrypted with a Key Management Server.
+///
+/// ```
+/// # use std::collections::HashMap;
+/// # use std::sync::{Arc, Mutex};
+/// # use parquet::encryption::decrypt::{FileDecryptionProperties, 
KeyRetriever};
+/// # use parquet::encryption::encrypt::FileEncryptionProperties;
+/// # use parquet::errors::ParquetError;
+/// // Define known encryption keys
+/// let mut keys = HashMap::new();
+/// keys.insert("kf".to_owned(), b"0123456789012345".to_vec());
+/// keys.insert("kc1".to_owned(), b"1234567890123450".to_vec());
+/// keys.insert("kc2".to_owned(), b"1234567890123451".to_vec());
+///
+/// // Create encryption properties for writing a file,
+/// // and specify the key identifiers as the key metadata.
+/// let encryption_properties = 
FileEncryptionProperties::builder(keys.get("kf").unwrap().clone())
+///     .with_footer_key_metadata("kf".into())
+///     .with_column_key_and_metadata("x", keys.get("kc1").unwrap().clone(), 
"kc1".as_bytes().into())
+///     .with_column_key_and_metadata("y", keys.get("kc2").unwrap().clone(), 
"kc2".as_bytes().into())
+///     .build()?;
+///
+/// // Write an encrypted file with the properties
+/// // ...
+///
+/// // Define a KeyRetriever that can get encryption keys using their 
identifiers
+/// struct CustomKeyRetriever {
+///     keys: Mutex<HashMap<String, Vec<u8>>>,
+/// }
+///
+/// impl KeyRetriever for CustomKeyRetriever {
+///     fn retrieve_key(&self, key_metadata: &[u8]) -> 
parquet::errors::Result<Vec<u8>> {
+///         // Metadata is bytes, so convert it to a string identifier
+///         let key_metadata = std::str::from_utf8(key_metadata).map_err(|e| {
+///             ParquetError::General(format!("Could not convert key metadata 
to string: {e}"))
+///         })?;
+///         // Lookup the key
+///         let keys = self.keys.lock().unwrap();
+///         match keys.get(key_metadata) {
+///             Some(key) => Ok(key.clone()),
+///             None => Err(ParquetError::General(format!(
+///                 "Could not retrieve key for metadata {key_metadata:?}"
+///             ))),
+///         }
+///     }
+/// }
+///
+/// let key_retriever = Arc::new(CustomKeyRetriever {
+///     keys: Mutex::new(keys),
+/// });
+///
+/// // Create decryption properties for reading an encrypted file.
+/// // Note that we don't need to specify which columns are encrypted,
+/// // this is determined by the file metadata and the required keys will be 
retrieved
+/// // dynamically using our key retriever.
+/// let decryption_properties = 
FileDecryptionProperties::with_key_retriever(key_retriever)
+///     .build()?;
+///
+/// // Read an encrypted file with the decryption properties
+/// // ...
+///
+/// # Ok::<(), parquet::errors::ParquetError>(())
+/// ```
 pub trait KeyRetriever: Send + Sync {
     /// Retrieve a decryption key given the key metadata
     fn retrieve_key(&self, key_metadata: &[u8]) -> Result<Vec<u8>>;
@@ -195,7 +266,43 @@ impl PartialEq for DecryptionKeys {
     }
 }
 
-/// FileDecryptionProperties hold keys and AAD data required to decrypt a 
Parquet file.
+/// `FileDecryptionProperties` hold keys and AAD data required to decrypt a 
Parquet file.
+///
+/// When reading Arrow data, the `FileDecryptionProperties` should be included in the
+/// [`ArrowReaderOptions`](crate::arrow::arrow_reader::ArrowReaderOptions) using
+/// [`with_file_decryption_properties`](crate::arrow::arrow_reader::ArrowReaderOptions::with_file_decryption_properties).
+///
+/// # Examples
+///
+/// Create `FileDecryptionProperties` for a file encrypted with uniform 
encryption,
+/// where all metadata and data are encrypted with the footer key:
+/// ```
+/// # use parquet::encryption::decrypt::FileDecryptionProperties;
+/// let file_decryption_properties = FileDecryptionProperties::builder(b"0123456789012345".into())
+///     .build()?;
+/// # Ok::<(), parquet::errors::ParquetError>(())
+/// ```
+///
+/// Create properties for a file where columns are encrypted with different 
keys:
+/// ```
+/// # use parquet::encryption::decrypt::FileDecryptionProperties;
+/// let file_decryption_properties = FileDecryptionProperties::builder(b"0123456789012345".into())
+///     .with_column_key("x", b"1234567890123450".into())
+///     .with_column_key("y", b"1234567890123451".into())
+///     .build()?;
+/// # Ok::<(), parquet::errors::ParquetError>(())
+/// ```
+///
+/// Specify additional authenticated data, used to protect against data 
replacement.
+/// This must match the AAD prefix provided when the file was written, 
otherwise
+/// data decryption will fail.
+/// ```
+/// # use parquet::encryption::decrypt::FileDecryptionProperties;
+/// let file_decryption_properties = FileDecryptionProperties::builder(b"0123456789012345".into())
+///     .with_aad_prefix("example_file".into())
+///     .build()?;
+/// # Ok::<(), parquet::errors::ParquetError>(())
+/// ```
 #[derive(Clone, PartialEq)]
 pub struct FileDecryptionProperties {
     keys: DecryptionKeys,
@@ -277,6 +384,8 @@ impl std::fmt::Debug for FileDecryptionProperties {
 }
 
 /// Builder for [`FileDecryptionProperties`]
+///
+/// See [`FileDecryptionProperties`] for example usage.
 pub struct DecryptionPropertiesBuilder {
     footer_key: Option<Vec<u8>>,
     key_retriever: Option<Arc<dyn KeyRetriever>>,
diff --git a/parquet/src/encryption/encrypt.rs 
b/parquet/src/encryption/encrypt.rs
index 13cab64fa6..9a801434c0 100644
--- a/parquet/src/encryption/encrypt.rs
+++ b/parquet/src/encryption/encrypt.rs
@@ -53,6 +53,41 @@ impl EncryptionKey {
 
 #[derive(Debug, Clone, PartialEq)]
 /// Defines how data in a Parquet file should be encrypted
+///
+/// The `FileEncryptionProperties` should be included in the 
[`WriterProperties`](crate::file::properties::WriterProperties)
+/// used to write a file by using 
[`WriterPropertiesBuilder::with_file_encryption_properties`](crate::file::properties::WriterPropertiesBuilder::with_file_encryption_properties).
+///
+/// # Examples
+///
+/// Create `FileEncryptionProperties` for a file encrypted with uniform 
encryption,
+/// where all metadata and data are encrypted with the footer key:
+/// ```
+/// # use parquet::encryption::encrypt::FileEncryptionProperties;
+/// let file_encryption_properties = 
FileEncryptionProperties::builder(b"0123456789012345".into())
+///     .build()?;
+/// # Ok::<(), parquet::errors::ParquetError>(())
+/// ```
+///
+/// Create properties for a file where columns are encrypted with different 
keys.
+/// Any columns without a key specified will be unencrypted:
+/// ```
+/// # use parquet::encryption::encrypt::FileEncryptionProperties;
+/// let file_encryption_properties = 
FileEncryptionProperties::builder(b"0123456789012345".into())
+///     .with_column_key("x", b"1234567890123450".into())
+///     .with_column_key("y", b"1234567890123451".into())
+///     .build()?;
+/// # Ok::<(), parquet::errors::ParquetError>(())
+/// ```
+///
+/// Specify additional authenticated data, used to protect against data 
replacement.
+/// This should represent the file identity:
+/// ```
+/// # use parquet::encryption::encrypt::FileEncryptionProperties;
+/// let file_encryption_properties = 
FileEncryptionProperties::builder(b"0123456789012345".into())
+///     .with_aad_prefix("example_file".into())
+///     .build()?;
+/// # Ok::<(), parquet::errors::ParquetError>(())
+/// ```
 pub struct FileEncryptionProperties {
     encrypt_footer: bool,
     footer_key: EncryptionKey,
@@ -141,6 +176,8 @@ impl FileEncryptionProperties {
 }
 
 /// Builder for [`FileEncryptionProperties`]
+///
+/// See [`FileEncryptionProperties`] for example usage.
 pub struct EncryptionPropertiesBuilder {
     encrypt_footer: bool,
     footer_key: EncryptionKey,
diff --git a/parquet/src/encryption/mod.rs b/parquet/src/encryption/mod.rs
index 062c351ac1..c1f4ca0da3 100644
--- a/parquet/src/encryption/mod.rs
+++ b/parquet/src/encryption/mod.rs
@@ -15,8 +15,96 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Encryption implementation specific to Parquet, as described
-//! in the 
[spec](https://github.com/apache/parquet-format/blob/master/Encryption.md).
+//! This module implements Parquet Modular Encryption, as described in the
+//! 
[specification](https://github.com/apache/parquet-format/blob/master/Encryption.md).
+//!
+//! # Example of writing and reading an encrypted Parquet file
+//!
+//! ```
+//! use arrow::array::{ArrayRef, Float32Array, Int32Array, RecordBatch};
+//! use parquet::arrow::arrow_reader::{ArrowReaderOptions, 
ParquetRecordBatchReaderBuilder};
+//! use parquet::arrow::ArrowWriter;
+//! use parquet::encryption::decrypt::FileDecryptionProperties;
+//! use parquet::encryption::encrypt::FileEncryptionProperties;
+//! use parquet::errors::Result;
+//! use parquet::file::properties::WriterProperties;
+//! use std::fs::File;
+//! use std::sync::Arc;
+//! use tempfile::TempDir;
+//!
+//! // Define 16 byte AES encryption keys to use.
+//! static FOOTER_KEY: &[u8; 16] = b"0123456789012345";
+//! static COLUMN_KEY_1: &[u8; 16] = b"1234567890123450";
+//! static COLUMN_KEY_2: &[u8; 16] = b"1234567890123451";
+//!
+//! let temp_dir = TempDir::new()?;
+//! let file_path = temp_dir.path().join("encrypted_example.parquet");
+//!
+//! // Create file encryption properties, which define how the file is 
encrypted.
+//! // We will specify a key to encrypt the footer metadata,
+//! // then separate keys for different columns.
+//! // This allows fine-grained control of access to different columns within 
a Parquet file.
+//! // Note that any columns without an encryption key specified will be left unencrypted.
+//! // If only a footer key is specified, then all columns are encrypted with 
the footer key.
+//! let encryption_properties = 
FileEncryptionProperties::builder(FOOTER_KEY.into())
+//!     .with_column_key("x", COLUMN_KEY_1.into())
+//!     .with_column_key("y", COLUMN_KEY_2.into())
+//!     // We also set an AAD prefix, which is optional.
+//!     // This contributes to the "additional authenticated data" that is 
used to verify file
+//!     // integrity and prevents data being swapped with data encrypted with 
the same key.
+//!     .with_aad_prefix(b"example_aad".into())
+//!     // Specify that the AAD prefix is stored in the file, so readers don't 
need
+//!     // to provide it to read the data, but can optionally provide it if 
they want to
+//!     // verify file integrity.
+//!     .with_aad_prefix_storage(true)
+//!     .build()?;
+//!
+//! let writer_properties = WriterProperties::builder()
+//!     .with_file_encryption_properties(encryption_properties)
+//!     .build();
+//!
+//! // Write the encrypted Parquet file
+//! {
+//!     let file = File::create(&file_path)?;
+//!
+//!     let ids = Int32Array::from(vec![0, 1, 2, 3, 4, 5]);
+//!     let x_vals = Float32Array::from(vec![0.0, 0.1, 0.2, 0.3, 0.4, 0.5]);
+//!     let y_vals = Float32Array::from(vec![1.0, 1.1, 1.2, 1.3, 1.4, 1.5]);
+//!     let batch = RecordBatch::try_from_iter(vec![
+//!       ("id", Arc::new(ids) as ArrayRef),
+//!       ("x", Arc::new(x_vals) as ArrayRef),
+//!       ("y", Arc::new(y_vals) as ArrayRef),
+//!     ])?;
+//!
+//!     let mut writer = ArrowWriter::try_new(file, batch.schema(), 
Some(writer_properties))?;
+//!
+//!     writer.write(&batch)?;
+//!     writer.close()?;
+//! }
+//!
+//! // In order to read the encrypted Parquet file, we need to know the 
encryption
+//! // keys used to encrypt it.
+//! // We don't need to provide the AAD prefix as it was stored in the file metadata,
+//! // but we could pass it to `with_aad_prefix` to verify the file hasn't been tampered with.
+//! let decryption_properties = 
FileDecryptionProperties::builder(FOOTER_KEY.into())
+//!     .with_column_key("x", COLUMN_KEY_1.into())
+//!     .with_column_key("y", COLUMN_KEY_2.into())
+//!     .build()?;
+//!
+//! let reader_options =
+//!     
ArrowReaderOptions::new().with_file_decryption_properties(decryption_properties);
+//!
+//! // Read the file using the configured decryption properties
+//! let file = File::open(&file_path)?;
+//!
+//! let builder = ParquetRecordBatchReaderBuilder::try_new_with_options(file, 
reader_options)?;
+//! let record_reader = builder.build()?;
+//! for batch in record_reader {
+//!     let batch = batch?;
+//!     println!("Read batch: {batch:?}");
+//! }
+//! # Ok::<(), parquet::errors::ParquetError>(())
+//! ```
 
 pub(crate) mod ciphers;
 pub mod decrypt;

Reply via email to