LiaCastaneda commented on code in PR #21765: URL: https://github.com/apache/datafusion/pull/21765#discussion_r3356276108
########## datafusion/physical-plan/src/aggregates/group_values/single_group_by/dictionary.rs: ########## @@ -0,0 +1,783 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::aggregates::group_values::GroupValues; +use crate::hash_utils::RandomState; +use arrow::array::{ + Array, ArrayRef, DictionaryArray, LargeStringArray, LargeStringBuilder, ListArray, + ListBuilder, PrimitiveArray, PrimitiveBuilder, StringArray, StringBuilder, + StringViewArray, StringViewBuilder, +}; +use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType, DataType}; +use datafusion_common::DataFusionError::{Internal, NotImplemented}; +use datafusion_common::Result; +use datafusion_common::hash_utils::create_hashes; +use datafusion_expr::EmitTo; +use hashbrown::HashTable; +use hashbrown::hash_table::Entry as HashTableEntry; +use std::borrow::Cow; +use std::marker::PhantomData; +use std::sync::Arc; + +/// Heuristic for sizing the values buffer of string builders during emit: +/// dictionary-encoded values are short by design (categorical strings, short +/// identifiers), so 16 B/item avoids the realloc-doubling chain in the common +/// case while keeping over-allocation cheap when values are smaller. 
+const AVG_BYTES_PER_DICT_VALUE: usize = 16; +const INITIAL_PRE_ALLOCATION: usize = 8 * 1024; // avoid re-allocation`s for small-medium groups + +macro_rules! decode_list { + ($raw:expr, $builder:expr) => {{ + let mut builder = $builder; + for raw_bytes in $raw { + match raw_bytes { + None => builder.append_null(), + Some(raw_vector) => { + let mut offset = 0; + while offset < raw_vector.len() { + let len = i64::from_ne_bytes( + raw_vector[offset..offset + 8] + .try_into() + .expect("slice of length 8"), + ); + offset += 8; + if len == -1 { + builder.values().append_null(); + } else { + let s = unsafe { + std::str::from_utf8_unchecked( + &raw_vector[offset..offset + len as usize], + ) + }; + builder.values().append_value(s); + offset += len as usize; + } + } + builder.append(true); + } + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + }}; +} +macro_rules! decode_scalar_string { + ($raw:expr, $builder:expr) => {{ + let mut builder = $builder; + for raw_bytes in $raw { + match raw_bytes { + Some(raw_vector) => { + let s = unsafe { std::str::from_utf8_unchecked(raw_vector) }; + builder.append_value(s); + } + None => builder.append_null(), + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + }}; +} +/// Entry stored inside `unique_dict_value_mapping`. Caching `hash` on the entry +struct DictEntry { + hash: u64, + group_id: usize, + bytes: Vec<u8>, +} + +pub struct GroupValuesDictionary<K: ArrowDictionaryKeyType + Send> { + // stores the order new unique elements are seen for self.emit() + seen_elements: Vec<Option<Vec<u8>>>, + value_dt: DataType, + _phantom: PhantomData<K>, + // keeps track of which values weve already seen, keyed by raw value hash. + unique_dict_value_mapping: HashTable<DictEntry>, + + random_state: RandomState, + + // cache the group id for nulls since they all map to the same group + null_group_id: Option<usize>, + intern_called: bool, + // 0. 
cache pointer of arrays, this avoids having to re-compute hashing for arrays weve already seen on past iterations + // 1. avoid re-allocating buffer inbetween calls, instead of allocating a new vector each time re-use inbetween calls + values_cache: (Option<ArrayRef>, Vec<u64>), +} + +impl<K: ArrowDictionaryKeyType + Send> GroupValuesDictionary<K> { + pub fn new(data_type: &DataType) -> Self { + Self { + seen_elements: Vec::new(), + unique_dict_value_mapping: HashTable::with_capacity(INITIAL_PRE_ALLOCATION), + value_dt: data_type.clone(), + _phantom: PhantomData, + random_state: RandomState::with_seed(0), + null_group_id: None, + intern_called: false, + values_cache: (None, Vec::new()), + } + } + + /// Returns the existing `group_id` for the value with this hash and bytes, + /// or inserts a new entry and returns the freshly assigned group_id. + fn lookup_or_insert_in_table(&mut self, hash: u64, raw: &[u8]) -> usize { + match self + .unique_dict_value_mapping + .entry(hash, |e| e.bytes == raw, |e| e.hash) + { + HashTableEntry::Occupied(o) => o.get().group_id, + HashTableEntry::Vacant(v) => { + let new_group_id = self.seen_elements.len(); + self.seen_elements.push(Some(raw.to_vec())); // replace this with raw buffer to avoid this double copy #TODO see https://github.com/apache/datafusion/issues/22078 + v.insert(DictEntry { + hash, + group_id: new_group_id, + bytes: raw.to_vec(), + }); + new_group_id + } + } + } + fn compute_value_hashes(&mut self, values: &ArrayRef) -> Result<()> { + self.values_cache.1.clear(); + self.values_cache.1.resize(values.len(), 0); + create_hashes( + [Arc::clone(values)], + &self.random_state, + &mut self.values_cache.1, + )?; + Ok(()) + //Ok(hashes) + } + + fn get_raw_bytes(values: &ArrayRef, index: usize) -> Cow<'_, [u8]> { + match values.data_type() { + DataType::Utf8 => Cow::Borrowed( + values + .as_any() + .downcast_ref::<StringArray>() + .expect("Expected StringArray") + .value(index) + .as_bytes(), + ), + DataType::LargeUtf8 => 
Cow::Borrowed( + values + .as_any() + .downcast_ref::<LargeStringArray>() + .expect("Expected LargeStringArray") + .value(index) + .as_bytes(), + ), + DataType::Utf8View => Cow::Borrowed( + values + .as_any() + .downcast_ref::<StringViewArray>() + .expect("Expected StringViewArray") + .value(index) + .as_bytes(), + ), + DataType::List(_) => { + let list_array = values + .as_any() + .downcast_ref::<ListArray>() + .expect("Expected ListArray"); + + debug_assert!(!list_array.is_null(index)); + + let start = list_array.value_offsets()[index] as usize; + let end = list_array.value_offsets()[index + 1] as usize; + let child = list_array.values(); + + let mut bytes = Vec::new(); + for i in start..end { + if child.is_null(i) { + // acts as a marker for transform_into_array to write a null + bytes.extend_from_slice(&(-1i64).to_ne_bytes()); + } else { + let raw = Self::get_raw_bytes(child, i); + bytes.extend_from_slice(&(raw.len() as i64).to_ne_bytes()); + bytes.extend_from_slice(&raw); + } + } + Cow::Owned(bytes) + } + other => unimplemented!("get_raw_bytes not implemented for {other:?}"), + } + } + + #[inline] + fn get_null_group_id(&mut self) -> usize { + if let Some(group_id) = self.null_group_id { + group_id + } else { + let new_group_id = self.seen_elements.len(); + self.seen_elements.push(None); + self.null_group_id = Some(new_group_id); + new_group_id + } + } + fn transform_into_array(&self, raw: &[Option<Vec<u8>>]) -> Result<ArrayRef> { + let item_capacity = raw.len(); + let data_capacity = item_capacity * AVG_BYTES_PER_DICT_VALUE; + match &self.value_dt { + DataType::Utf8 => decode_scalar_string!( + raw, + StringBuilder::with_capacity(item_capacity, data_capacity) + ), + DataType::LargeUtf8 => decode_scalar_string!( + raw, + LargeStringBuilder::with_capacity(item_capacity, data_capacity) + ), + DataType::Utf8View => decode_scalar_string!( + raw, + StringViewBuilder::with_capacity(item_capacity) + ), + DataType::List(field) => match field.data_type() { + 
DataType::Utf8 => decode_list!( + raw, + ListBuilder::with_capacity( + StringBuilder::with_capacity(item_capacity, data_capacity), + item_capacity, + ) + ), + DataType::LargeUtf8 => decode_list!( + raw, + ListBuilder::with_capacity( + LargeStringBuilder::with_capacity(item_capacity, data_capacity), + item_capacity, + ) + ), + DataType::Utf8View => decode_list!( + raw, + ListBuilder::with_capacity( + StringViewBuilder::with_capacity(item_capacity), + item_capacity, + ) + ), + other => Err(NotImplemented(format!( + "transform_into_array not implemented for List<{other:?}>" + ))), + }, + other => Err(NotImplemented(format!( + "transform_into_array not implemented for {other:?}" + ))), + } + } + #[inline] + fn keys_to_usize(key_array: &PrimitiveArray<K>) -> Vec<Option<usize>> { + (0..key_array.len()) + .map(|i| { + if key_array.is_null(i) { + None + } else { + Some(key_array.value(i).to_usize().unwrap()) + } + }) + .collect() + } +} + +impl<K: ArrowDictionaryKeyType + Send> GroupValues for GroupValuesDictionary<K> { + fn size(&self) -> usize { + let seen_elements_size = self.seen_elements.capacity() + * size_of::<Option<Vec<u8>>>() + + self + .seen_elements + .iter() + .filter_map(|opt| opt.as_ref()) + .map(|inner| inner.capacity()) + .sum::<usize>(); + + let unique_mapping_size = self.unique_dict_value_mapping.capacity() + * size_of::<DictEntry>() + + self + .unique_dict_value_mapping + .iter() + .map(|e| e.bytes.capacity()) + .sum::<usize>(); + + let values_cache_size = self.values_cache.1.capacity() * size_of::<u64>() + + self + .values_cache + .0 + .as_ref() + .map(|a| a.get_array_memory_size()) + .unwrap_or(0); + + size_of::<Self>() + seen_elements_size + unique_mapping_size + values_cache_size + } + fn len(&self) -> usize { + self.seen_elements.len() + } + fn is_empty(&self) -> bool { + self.seen_elements.is_empty() + } + fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()> { + assert_eq!( + cols.len(), + 1, + "GroupValuesDictionary only 
supports a single column" + ); + let array = Arc::clone(&cols[0]); + groups.clear(); // zero out buffer + let dict_array = array + .as_any() + .downcast_ref::<DictionaryArray<K>>() + .ok_or_else(|| { + Internal(format!( + "GroupValuesDictionary expected DictionaryArray but got {:?}", + array.data_type() + )) + })?; + + let values = dict_array.values(); + let key_array = dict_array.keys(); + if key_array.is_empty() { + return Ok(()); + } + let keys_as_usize: Vec<Option<usize>> = Self::keys_to_usize(key_array); Review Comment: Maybe this `keys_as_usize` allocation can be removed, and we can iterate over the keys and convert them on the fly? ########## datafusion/physical-plan/src/aggregates/group_values/single_group_by/dictionary.rs: ########## @@ -0,0 +1,783 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::aggregates::group_values::GroupValues; +use crate::hash_utils::RandomState; +use arrow::array::{ + Array, ArrayRef, DictionaryArray, LargeStringArray, LargeStringBuilder, ListArray, + ListBuilder, PrimitiveArray, PrimitiveBuilder, StringArray, StringBuilder, + StringViewArray, StringViewBuilder, +}; +use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType, DataType}; +use datafusion_common::DataFusionError::{Internal, NotImplemented}; +use datafusion_common::Result; +use datafusion_common::hash_utils::create_hashes; +use datafusion_expr::EmitTo; +use hashbrown::HashTable; +use hashbrown::hash_table::Entry as HashTableEntry; +use std::borrow::Cow; +use std::marker::PhantomData; +use std::sync::Arc; + +/// Heuristic for sizing the values buffer of string builders during emit: +/// dictionary-encoded values are short by design (categorical strings, short +/// identifiers), so 16 B/item avoids the realloc-doubling chain in the common +/// case while keeping over-allocation cheap when values are smaller. +const AVG_BYTES_PER_DICT_VALUE: usize = 16; +const INITIAL_PRE_ALLOCATION: usize = 8 * 1024; // avoid re-allocation`s for small-medium groups + +macro_rules! decode_list { + ($raw:expr, $builder:expr) => {{ + let mut builder = $builder; + for raw_bytes in $raw { + match raw_bytes { + None => builder.append_null(), + Some(raw_vector) => { + let mut offset = 0; + while offset < raw_vector.len() { + let len = i64::from_ne_bytes( + raw_vector[offset..offset + 8] + .try_into() + .expect("slice of length 8"), + ); + offset += 8; + if len == -1 { + builder.values().append_null(); + } else { + let s = unsafe { + std::str::from_utf8_unchecked( + &raw_vector[offset..offset + len as usize], + ) + }; + builder.values().append_value(s); + offset += len as usize; + } + } + builder.append(true); + } + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + }}; +} +macro_rules! 
decode_scalar_string { + ($raw:expr, $builder:expr) => {{ + let mut builder = $builder; + for raw_bytes in $raw { + match raw_bytes { + Some(raw_vector) => { + let s = unsafe { std::str::from_utf8_unchecked(raw_vector) }; + builder.append_value(s); + } + None => builder.append_null(), + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + }}; +} +/// Entry stored inside `unique_dict_value_mapping`. Caching `hash` on the entry +struct DictEntry { + hash: u64, + group_id: usize, + bytes: Vec<u8>, +} + +pub struct GroupValuesDictionary<K: ArrowDictionaryKeyType + Send> { + // stores the order new unique elements are seen for self.emit() + seen_elements: Vec<Option<Vec<u8>>>, + value_dt: DataType, + _phantom: PhantomData<K>, + // keeps track of which values weve already seen, keyed by raw value hash. + unique_dict_value_mapping: HashTable<DictEntry>, + + random_state: RandomState, + + // cache the group id for nulls since they all map to the same group + null_group_id: Option<usize>, + intern_called: bool, + // 0. cache pointer of arrays, this avoids having to re-compute hashing for arrays weve already seen on past iterations + // 1. avoid re-allocating buffer inbetween calls, instead of allocating a new vector each time re-use inbetween calls + values_cache: (Option<ArrayRef>, Vec<u64>), +} + +impl<K: ArrowDictionaryKeyType + Send> GroupValuesDictionary<K> { + pub fn new(data_type: &DataType) -> Self { + Self { + seen_elements: Vec::new(), + unique_dict_value_mapping: HashTable::with_capacity(INITIAL_PRE_ALLOCATION), + value_dt: data_type.clone(), + _phantom: PhantomData, + random_state: RandomState::with_seed(0), + null_group_id: None, + intern_called: false, + values_cache: (None, Vec::new()), + } + } + + /// Returns the existing `group_id` for the value with this hash and bytes, + /// or inserts a new entry and returns the freshly assigned group_id. 
+ fn lookup_or_insert_in_table(&mut self, hash: u64, raw: &[u8]) -> usize { + match self + .unique_dict_value_mapping + .entry(hash, |e| e.bytes == raw, |e| e.hash) + { + HashTableEntry::Occupied(o) => o.get().group_id, + HashTableEntry::Vacant(v) => { + let new_group_id = self.seen_elements.len(); + self.seen_elements.push(Some(raw.to_vec())); // replace this with raw buffer to avoid this double copy #TODO see https://github.com/apache/datafusion/issues/22078 + v.insert(DictEntry { + hash, + group_id: new_group_id, + bytes: raw.to_vec(), + }); + new_group_id + } + } + } + fn compute_value_hashes(&mut self, values: &ArrayRef) -> Result<()> { + self.values_cache.1.clear(); + self.values_cache.1.resize(values.len(), 0); + create_hashes( + [Arc::clone(values)], + &self.random_state, + &mut self.values_cache.1, + )?; + Ok(()) + //Ok(hashes) + } + + fn get_raw_bytes(values: &ArrayRef, index: usize) -> Cow<'_, [u8]> { + match values.data_type() { + DataType::Utf8 => Cow::Borrowed( + values + .as_any() + .downcast_ref::<StringArray>() + .expect("Expected StringArray") + .value(index) + .as_bytes(), + ), + DataType::LargeUtf8 => Cow::Borrowed( + values + .as_any() + .downcast_ref::<LargeStringArray>() + .expect("Expected LargeStringArray") + .value(index) + .as_bytes(), + ), + DataType::Utf8View => Cow::Borrowed( + values + .as_any() + .downcast_ref::<StringViewArray>() + .expect("Expected StringViewArray") + .value(index) + .as_bytes(), + ), + DataType::List(_) => { + let list_array = values + .as_any() + .downcast_ref::<ListArray>() + .expect("Expected ListArray"); + + debug_assert!(!list_array.is_null(index)); + + let start = list_array.value_offsets()[index] as usize; + let end = list_array.value_offsets()[index + 1] as usize; + let child = list_array.values(); + + let mut bytes = Vec::new(); + for i in start..end { + if child.is_null(i) { + // acts as a marker for transform_into_array to write a null + bytes.extend_from_slice(&(-1i64).to_ne_bytes()); + } else { + let 
raw = Self::get_raw_bytes(child, i); + bytes.extend_from_slice(&(raw.len() as i64).to_ne_bytes()); + bytes.extend_from_slice(&raw); + } + } + Cow::Owned(bytes) + } + other => unimplemented!("get_raw_bytes not implemented for {other:?}"), + } + } + + #[inline] + fn get_null_group_id(&mut self) -> usize { + if let Some(group_id) = self.null_group_id { + group_id + } else { + let new_group_id = self.seen_elements.len(); + self.seen_elements.push(None); + self.null_group_id = Some(new_group_id); + new_group_id + } + } + fn transform_into_array(&self, raw: &[Option<Vec<u8>>]) -> Result<ArrayRef> { + let item_capacity = raw.len(); + let data_capacity = item_capacity * AVG_BYTES_PER_DICT_VALUE; + match &self.value_dt { + DataType::Utf8 => decode_scalar_string!( + raw, + StringBuilder::with_capacity(item_capacity, data_capacity) + ), + DataType::LargeUtf8 => decode_scalar_string!( + raw, + LargeStringBuilder::with_capacity(item_capacity, data_capacity) + ), + DataType::Utf8View => decode_scalar_string!( + raw, + StringViewBuilder::with_capacity(item_capacity) + ), + DataType::List(field) => match field.data_type() { + DataType::Utf8 => decode_list!( + raw, + ListBuilder::with_capacity( + StringBuilder::with_capacity(item_capacity, data_capacity), + item_capacity, + ) + ), + DataType::LargeUtf8 => decode_list!( + raw, + ListBuilder::with_capacity( + LargeStringBuilder::with_capacity(item_capacity, data_capacity), + item_capacity, + ) + ), + DataType::Utf8View => decode_list!( + raw, + ListBuilder::with_capacity( + StringViewBuilder::with_capacity(item_capacity), + item_capacity, + ) + ), + other => Err(NotImplemented(format!( + "transform_into_array not implemented for List<{other:?}>" + ))), + }, + other => Err(NotImplemented(format!( + "transform_into_array not implemented for {other:?}" + ))), + } + } + #[inline] + fn keys_to_usize(key_array: &PrimitiveArray<K>) -> Vec<Option<usize>> { + (0..key_array.len()) + .map(|i| { + if key_array.is_null(i) { + None + } else { + 
Some(key_array.value(i).to_usize().unwrap()) + } + }) + .collect() + } +} + +impl<K: ArrowDictionaryKeyType + Send> GroupValues for GroupValuesDictionary<K> { + fn size(&self) -> usize { + let seen_elements_size = self.seen_elements.capacity() + * size_of::<Option<Vec<u8>>>() + + self + .seen_elements + .iter() + .filter_map(|opt| opt.as_ref()) + .map(|inner| inner.capacity()) + .sum::<usize>(); + + let unique_mapping_size = self.unique_dict_value_mapping.capacity() + * size_of::<DictEntry>() + + self + .unique_dict_value_mapping + .iter() + .map(|e| e.bytes.capacity()) + .sum::<usize>(); + + let values_cache_size = self.values_cache.1.capacity() * size_of::<u64>() + + self + .values_cache + .0 + .as_ref() + .map(|a| a.get_array_memory_size()) + .unwrap_or(0); + + size_of::<Self>() + seen_elements_size + unique_mapping_size + values_cache_size + } + fn len(&self) -> usize { + self.seen_elements.len() + } + fn is_empty(&self) -> bool { + self.seen_elements.is_empty() + } + fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()> { + assert_eq!( + cols.len(), + 1, + "GroupValuesDictionary only supports a single column" + ); + let array = Arc::clone(&cols[0]); + groups.clear(); // zero out buffer + let dict_array = array + .as_any() + .downcast_ref::<DictionaryArray<K>>() + .ok_or_else(|| { + Internal(format!( + "GroupValuesDictionary expected DictionaryArray but got {:?}", + array.data_type() + )) + })?; Review Comment: ```suggestion array.as_dictionary(); ``` ########## datafusion/physical-plan/src/aggregates/group_values/single_group_by/dictionary.rs: ########## @@ -0,0 +1,783 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::aggregates::group_values::GroupValues; +use crate::hash_utils::RandomState; +use arrow::array::{ + Array, ArrayRef, DictionaryArray, LargeStringArray, LargeStringBuilder, ListArray, + ListBuilder, PrimitiveArray, PrimitiveBuilder, StringArray, StringBuilder, + StringViewArray, StringViewBuilder, +}; +use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType, DataType}; +use datafusion_common::DataFusionError::{Internal, NotImplemented}; +use datafusion_common::Result; +use datafusion_common::hash_utils::create_hashes; +use datafusion_expr::EmitTo; +use hashbrown::HashTable; +use hashbrown::hash_table::Entry as HashTableEntry; +use std::borrow::Cow; +use std::marker::PhantomData; +use std::sync::Arc; + +/// Heuristic for sizing the values buffer of string builders during emit: +/// dictionary-encoded values are short by design (categorical strings, short +/// identifiers), so 16 B/item avoids the realloc-doubling chain in the common +/// case while keeping over-allocation cheap when values are smaller. +const AVG_BYTES_PER_DICT_VALUE: usize = 16; +const INITIAL_PRE_ALLOCATION: usize = 8 * 1024; // avoid re-allocation`s for small-medium groups + +macro_rules! 
decode_list { + ($raw:expr, $builder:expr) => {{ + let mut builder = $builder; + for raw_bytes in $raw { + match raw_bytes { + None => builder.append_null(), + Some(raw_vector) => { + let mut offset = 0; + while offset < raw_vector.len() { + let len = i64::from_ne_bytes( + raw_vector[offset..offset + 8] + .try_into() + .expect("slice of length 8"), + ); + offset += 8; + if len == -1 { + builder.values().append_null(); + } else { + let s = unsafe { + std::str::from_utf8_unchecked( + &raw_vector[offset..offset + len as usize], + ) + }; + builder.values().append_value(s); + offset += len as usize; + } + } + builder.append(true); + } + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + }}; +} +macro_rules! decode_scalar_string { + ($raw:expr, $builder:expr) => {{ + let mut builder = $builder; + for raw_bytes in $raw { + match raw_bytes { + Some(raw_vector) => { + let s = unsafe { std::str::from_utf8_unchecked(raw_vector) }; + builder.append_value(s); + } + None => builder.append_null(), + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + }}; +} +/// Entry stored inside `unique_dict_value_mapping`. Caching `hash` on the entry +struct DictEntry { + hash: u64, + group_id: usize, + bytes: Vec<u8>, +} + +pub struct GroupValuesDictionary<K: ArrowDictionaryKeyType + Send> { + // stores the order new unique elements are seen for self.emit() + seen_elements: Vec<Option<Vec<u8>>>, + value_dt: DataType, + _phantom: PhantomData<K>, + // keeps track of which values weve already seen, keyed by raw value hash. + unique_dict_value_mapping: HashTable<DictEntry>, + + random_state: RandomState, + + // cache the group id for nulls since they all map to the same group + null_group_id: Option<usize>, + intern_called: bool, + // 0. cache pointer of arrays, this avoids having to re-compute hashing for arrays weve already seen on past iterations + // 1. 
avoid re-allocating buffer inbetween calls, instead of allocating a new vector each time re-use inbetween calls + values_cache: (Option<ArrayRef>, Vec<u64>), +} + +impl<K: ArrowDictionaryKeyType + Send> GroupValuesDictionary<K> { + pub fn new(data_type: &DataType) -> Self { + Self { + seen_elements: Vec::new(), + unique_dict_value_mapping: HashTable::with_capacity(INITIAL_PRE_ALLOCATION), + value_dt: data_type.clone(), + _phantom: PhantomData, + random_state: RandomState::with_seed(0), + null_group_id: None, + intern_called: false, + values_cache: (None, Vec::new()), + } + } + + /// Returns the existing `group_id` for the value with this hash and bytes, + /// or inserts a new entry and returns the freshly assigned group_id. + fn lookup_or_insert_in_table(&mut self, hash: u64, raw: &[u8]) -> usize { + match self + .unique_dict_value_mapping + .entry(hash, |e| e.bytes == raw, |e| e.hash) + { + HashTableEntry::Occupied(o) => o.get().group_id, + HashTableEntry::Vacant(v) => { + let new_group_id = self.seen_elements.len(); + self.seen_elements.push(Some(raw.to_vec())); // replace this with raw buffer to avoid this double copy #TODO see https://github.com/apache/datafusion/issues/22078 + v.insert(DictEntry { + hash, + group_id: new_group_id, + bytes: raw.to_vec(), + }); + new_group_id + } + } + } + fn compute_value_hashes(&mut self, values: &ArrayRef) -> Result<()> { + self.values_cache.1.clear(); + self.values_cache.1.resize(values.len(), 0); + create_hashes( + [Arc::clone(values)], + &self.random_state, + &mut self.values_cache.1, + )?; + Ok(()) + //Ok(hashes) + } + + fn get_raw_bytes(values: &ArrayRef, index: usize) -> Cow<'_, [u8]> { + match values.data_type() { + DataType::Utf8 => Cow::Borrowed( + values + .as_any() + .downcast_ref::<StringArray>() + .expect("Expected StringArray") + .value(index) + .as_bytes(), + ), + DataType::LargeUtf8 => Cow::Borrowed( + values + .as_any() + .downcast_ref::<LargeStringArray>() + .expect("Expected LargeStringArray") + 
.value(index) + .as_bytes(), + ), + DataType::Utf8View => Cow::Borrowed( + values + .as_any() + .downcast_ref::<StringViewArray>() + .expect("Expected StringViewArray") + .value(index) + .as_bytes(), + ), + DataType::List(_) => { + let list_array = values + .as_any() + .downcast_ref::<ListArray>() + .expect("Expected ListArray"); + + debug_assert!(!list_array.is_null(index)); + + let start = list_array.value_offsets()[index] as usize; + let end = list_array.value_offsets()[index + 1] as usize; + let child = list_array.values(); + + let mut bytes = Vec::new(); + for i in start..end { + if child.is_null(i) { + // acts as a marker for transform_into_array to write a null + bytes.extend_from_slice(&(-1i64).to_ne_bytes()); + } else { + let raw = Self::get_raw_bytes(child, i); + bytes.extend_from_slice(&(raw.len() as i64).to_ne_bytes()); + bytes.extend_from_slice(&raw); + } + } + Cow::Owned(bytes) + } + other => unimplemented!("get_raw_bytes not implemented for {other:?}"), + } + } + + #[inline] + fn get_null_group_id(&mut self) -> usize { + if let Some(group_id) = self.null_group_id { + group_id + } else { + let new_group_id = self.seen_elements.len(); + self.seen_elements.push(None); + self.null_group_id = Some(new_group_id); + new_group_id + } + } + fn transform_into_array(&self, raw: &[Option<Vec<u8>>]) -> Result<ArrayRef> { + let item_capacity = raw.len(); + let data_capacity = item_capacity * AVG_BYTES_PER_DICT_VALUE; + match &self.value_dt { + DataType::Utf8 => decode_scalar_string!( + raw, + StringBuilder::with_capacity(item_capacity, data_capacity) + ), + DataType::LargeUtf8 => decode_scalar_string!( + raw, + LargeStringBuilder::with_capacity(item_capacity, data_capacity) + ), + DataType::Utf8View => decode_scalar_string!( + raw, + StringViewBuilder::with_capacity(item_capacity) + ), + DataType::List(field) => match field.data_type() { + DataType::Utf8 => decode_list!( + raw, + ListBuilder::with_capacity( + StringBuilder::with_capacity(item_capacity, 
data_capacity), + item_capacity, + ) + ), + DataType::LargeUtf8 => decode_list!( + raw, + ListBuilder::with_capacity( + LargeStringBuilder::with_capacity(item_capacity, data_capacity), + item_capacity, + ) + ), + DataType::Utf8View => decode_list!( + raw, + ListBuilder::with_capacity( + StringViewBuilder::with_capacity(item_capacity), + item_capacity, + ) + ), + other => Err(NotImplemented(format!( + "transform_into_array not implemented for List<{other:?}>" + ))), + }, + other => Err(NotImplemented(format!( + "transform_into_array not implemented for {other:?}" + ))), + } + } + #[inline] + fn keys_to_usize(key_array: &PrimitiveArray<K>) -> Vec<Option<usize>> { + (0..key_array.len()) + .map(|i| { + if key_array.is_null(i) { + None + } else { + Some(key_array.value(i).to_usize().unwrap()) + } + }) + .collect() + } +} + +impl<K: ArrowDictionaryKeyType + Send> GroupValues for GroupValuesDictionary<K> { + fn size(&self) -> usize { + let seen_elements_size = self.seen_elements.capacity() + * size_of::<Option<Vec<u8>>>() + + self + .seen_elements + .iter() + .filter_map(|opt| opt.as_ref()) + .map(|inner| inner.capacity()) + .sum::<usize>(); + + let unique_mapping_size = self.unique_dict_value_mapping.capacity() + * size_of::<DictEntry>() + + self + .unique_dict_value_mapping + .iter() + .map(|e| e.bytes.capacity()) + .sum::<usize>(); + + let values_cache_size = self.values_cache.1.capacity() * size_of::<u64>() + + self + .values_cache + .0 + .as_ref() + .map(|a| a.get_array_memory_size()) + .unwrap_or(0); + + size_of::<Self>() + seen_elements_size + unique_mapping_size + values_cache_size + } + fn len(&self) -> usize { + self.seen_elements.len() + } + fn is_empty(&self) -> bool { + self.seen_elements.is_empty() + } + fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()> { + assert_eq!( + cols.len(), + 1, + "GroupValuesDictionary only supports a single column" + ); + let array = Arc::clone(&cols[0]); + groups.clear(); // zero out buffer + let 
dict_array = array + .as_any() + .downcast_ref::<DictionaryArray<K>>() + .ok_or_else(|| { + Internal(format!( + "GroupValuesDictionary expected DictionaryArray but got {:?}", + array.data_type() + )) + })?; + + let values = dict_array.values(); + let key_array = dict_array.keys(); + if key_array.is_empty() { + return Ok(()); + } + let keys_as_usize: Vec<Option<usize>> = Self::keys_to_usize(key_array); + + let cache_hit = self + .values_cache + .0 + .as_ref() + .map(|cached| Arc::ptr_eq(cached, values)) + .unwrap_or(false); + + if !cache_hit { + // first time seeing this values array to compute hashes, and cache the pointer, cheap operation Review Comment: What do you think about rephrasing this to something like ```suggestion // values array changed since last batch - recompute hashes and update cached pointer ``` ########## datafusion/physical-plan/src/aggregates/group_values/single_group_by/dictionary.rs: ########## @@ -0,0 +1,783 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::aggregates::group_values::GroupValues; +use crate::hash_utils::RandomState; +use arrow::array::{ + Array, ArrayRef, DictionaryArray, LargeStringArray, LargeStringBuilder, ListArray, + ListBuilder, PrimitiveArray, PrimitiveBuilder, StringArray, StringBuilder, + StringViewArray, StringViewBuilder, +}; +use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType, DataType}; +use datafusion_common::DataFusionError::{Internal, NotImplemented}; +use datafusion_common::Result; +use datafusion_common::hash_utils::create_hashes; +use datafusion_expr::EmitTo; +use hashbrown::HashTable; +use hashbrown::hash_table::Entry as HashTableEntry; +use std::borrow::Cow; +use std::marker::PhantomData; +use std::sync::Arc; + +/// Heuristic for sizing the values buffer of string builders during emit: +/// dictionary-encoded values are short by design (categorical strings, short +/// identifiers), so 16 B/item avoids the realloc-doubling chain in the common +/// case while keeping over-allocation cheap when values are smaller. +const AVG_BYTES_PER_DICT_VALUE: usize = 16; +const INITIAL_PRE_ALLOCATION: usize = 8 * 1024; // avoid re-allocation`s for small-medium groups + +macro_rules! decode_list { + ($raw:expr, $builder:expr) => {{ + let mut builder = $builder; + for raw_bytes in $raw { + match raw_bytes { + None => builder.append_null(), + Some(raw_vector) => { + let mut offset = 0; + while offset < raw_vector.len() { + let len = i64::from_ne_bytes( + raw_vector[offset..offset + 8] + .try_into() + .expect("slice of length 8"), + ); + offset += 8; + if len == -1 { + builder.values().append_null(); + } else { + let s = unsafe { + std::str::from_utf8_unchecked( + &raw_vector[offset..offset + len as usize], + ) + }; + builder.values().append_value(s); + offset += len as usize; + } + } + builder.append(true); + } + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + }}; +} +macro_rules! 
decode_scalar_string { + ($raw:expr, $builder:expr) => {{ + let mut builder = $builder; + for raw_bytes in $raw { + match raw_bytes { + Some(raw_vector) => { + let s = unsafe { std::str::from_utf8_unchecked(raw_vector) }; + builder.append_value(s); + } + None => builder.append_null(), + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + }}; +} +/// Entry stored inside `unique_dict_value_mapping`. Caching `hash` on the entry +struct DictEntry { + hash: u64, + group_id: usize, + bytes: Vec<u8>, +} + +pub struct GroupValuesDictionary<K: ArrowDictionaryKeyType + Send> { + // stores the order new unique elements are seen for self.emit() + seen_elements: Vec<Option<Vec<u8>>>, + value_dt: DataType, + _phantom: PhantomData<K>, + // keeps track of which values weve already seen, keyed by raw value hash. + unique_dict_value_mapping: HashTable<DictEntry>, + + random_state: RandomState, + + // cache the group id for nulls since they all map to the same group + null_group_id: Option<usize>, + intern_called: bool, + // 0. cache pointer of arrays, this avoids having to re-compute hashing for arrays weve already seen on past iterations + // 1. avoid re-allocating buffer inbetween calls, instead of allocating a new vector each time re-use inbetween calls + values_cache: (Option<ArrayRef>, Vec<u64>), +} + +impl<K: ArrowDictionaryKeyType + Send> GroupValuesDictionary<K> { + pub fn new(data_type: &DataType) -> Self { + Self { + seen_elements: Vec::new(), + unique_dict_value_mapping: HashTable::with_capacity(INITIAL_PRE_ALLOCATION), + value_dt: data_type.clone(), + _phantom: PhantomData, + random_state: RandomState::with_seed(0), + null_group_id: None, + intern_called: false, + values_cache: (None, Vec::new()), + } + } + + /// Returns the existing `group_id` for the value with this hash and bytes, + /// or inserts a new entry and returns the freshly assigned group_id. 
+ fn lookup_or_insert_in_table(&mut self, hash: u64, raw: &[u8]) -> usize { + match self + .unique_dict_value_mapping + .entry(hash, |e| e.bytes == raw, |e| e.hash) + { + HashTableEntry::Occupied(o) => o.get().group_id, + HashTableEntry::Vacant(v) => { + let new_group_id = self.seen_elements.len(); + self.seen_elements.push(Some(raw.to_vec())); // replace this with raw buffer to avoid this double copy #TODO see https://github.com/apache/datafusion/issues/22078 + v.insert(DictEntry { + hash, + group_id: new_group_id, + bytes: raw.to_vec(), + }); + new_group_id + } + } + } + fn compute_value_hashes(&mut self, values: &ArrayRef) -> Result<()> { + self.values_cache.1.clear(); + self.values_cache.1.resize(values.len(), 0); + create_hashes( + [Arc::clone(values)], + &self.random_state, + &mut self.values_cache.1, + )?; + Ok(()) + //Ok(hashes) + } + + fn get_raw_bytes(values: &ArrayRef, index: usize) -> Cow<'_, [u8]> { + match values.data_type() { + DataType::Utf8 => Cow::Borrowed( + values + .as_any() + .downcast_ref::<StringArray>() + .expect("Expected StringArray") + .value(index) + .as_bytes(), + ), + DataType::LargeUtf8 => Cow::Borrowed( + values + .as_any() + .downcast_ref::<LargeStringArray>() + .expect("Expected LargeStringArray") + .value(index) + .as_bytes(), + ), + DataType::Utf8View => Cow::Borrowed( + values + .as_any() + .downcast_ref::<StringViewArray>() + .expect("Expected StringViewArray") + .value(index) + .as_bytes(), + ), + DataType::List(_) => { + let list_array = values + .as_any() + .downcast_ref::<ListArray>() + .expect("Expected ListArray"); + + debug_assert!(!list_array.is_null(index)); + + let start = list_array.value_offsets()[index] as usize; + let end = list_array.value_offsets()[index + 1] as usize; + let child = list_array.values(); + + let mut bytes = Vec::new(); + for i in start..end { + if child.is_null(i) { + // acts as a marker for transform_into_array to write a null + bytes.extend_from_slice(&(-1i64).to_ne_bytes()); + } else { + let 
raw = Self::get_raw_bytes(child, i); + bytes.extend_from_slice(&(raw.len() as i64).to_ne_bytes()); + bytes.extend_from_slice(&raw); + } + } + Cow::Owned(bytes) + } + other => unimplemented!("get_raw_bytes not implemented for {other:?}"), + } + } + + #[inline] + fn get_null_group_id(&mut self) -> usize { + if let Some(group_id) = self.null_group_id { + group_id + } else { + let new_group_id = self.seen_elements.len(); + self.seen_elements.push(None); + self.null_group_id = Some(new_group_id); + new_group_id + } + } + fn transform_into_array(&self, raw: &[Option<Vec<u8>>]) -> Result<ArrayRef> { + let item_capacity = raw.len(); + let data_capacity = item_capacity * AVG_BYTES_PER_DICT_VALUE; + match &self.value_dt { + DataType::Utf8 => decode_scalar_string!( + raw, + StringBuilder::with_capacity(item_capacity, data_capacity) + ), + DataType::LargeUtf8 => decode_scalar_string!( + raw, + LargeStringBuilder::with_capacity(item_capacity, data_capacity) + ), + DataType::Utf8View => decode_scalar_string!( + raw, + StringViewBuilder::with_capacity(item_capacity) + ), + DataType::List(field) => match field.data_type() { + DataType::Utf8 => decode_list!( + raw, + ListBuilder::with_capacity( + StringBuilder::with_capacity(item_capacity, data_capacity), + item_capacity, + ) + ), + DataType::LargeUtf8 => decode_list!( + raw, + ListBuilder::with_capacity( + LargeStringBuilder::with_capacity(item_capacity, data_capacity), + item_capacity, + ) + ), + DataType::Utf8View => decode_list!( + raw, + ListBuilder::with_capacity( + StringViewBuilder::with_capacity(item_capacity), + item_capacity, + ) + ), + other => Err(NotImplemented(format!( + "transform_into_array not implemented for List<{other:?}>" + ))), + }, + other => Err(NotImplemented(format!( + "transform_into_array not implemented for {other:?}" + ))), + } + } + #[inline] + fn keys_to_usize(key_array: &PrimitiveArray<K>) -> Vec<Option<usize>> { + (0..key_array.len()) + .map(|i| { + if key_array.is_null(i) { + None + } else { + 
Some(key_array.value(i).to_usize().unwrap()) + } + }) + .collect() + } +} + +impl<K: ArrowDictionaryKeyType + Send> GroupValues for GroupValuesDictionary<K> { + fn size(&self) -> usize { + let seen_elements_size = self.seen_elements.capacity() + * size_of::<Option<Vec<u8>>>() + + self + .seen_elements + .iter() + .filter_map(|opt| opt.as_ref()) + .map(|inner| inner.capacity()) + .sum::<usize>(); + + let unique_mapping_size = self.unique_dict_value_mapping.capacity() + * size_of::<DictEntry>() + + self + .unique_dict_value_mapping + .iter() + .map(|e| e.bytes.capacity()) + .sum::<usize>(); + + let values_cache_size = self.values_cache.1.capacity() * size_of::<u64>() + + self + .values_cache + .0 + .as_ref() + .map(|a| a.get_array_memory_size()) + .unwrap_or(0); + + size_of::<Self>() + seen_elements_size + unique_mapping_size + values_cache_size + } + fn len(&self) -> usize { + self.seen_elements.len() + } + fn is_empty(&self) -> bool { + self.seen_elements.is_empty() + } + fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()> { + assert_eq!( + cols.len(), + 1, + "GroupValuesDictionary only supports a single column" + ); + let array = Arc::clone(&cols[0]); + groups.clear(); // zero out buffer + let dict_array = array + .as_any() + .downcast_ref::<DictionaryArray<K>>() + .ok_or_else(|| { + Internal(format!( + "GroupValuesDictionary expected DictionaryArray but got {:?}", + array.data_type() + )) + })?; + + let values = dict_array.values(); + let key_array = dict_array.keys(); + if key_array.is_empty() { + return Ok(()); + } + let keys_as_usize: Vec<Option<usize>> = Self::keys_to_usize(key_array); + + let cache_hit = self + .values_cache + .0 + .as_ref() + .map(|cached| Arc::ptr_eq(cached, values)) + .unwrap_or(false); + + if !cache_hit { + // first time seeing this values array to compute hashes, and cache the pointer, cheap operation + self.compute_value_hashes(values)?; + self.values_cache.0 = Some(Arc::clone(values)); + } + let mut 
key_to_group: Vec<Option<usize>> = vec![None; values.len()]; + if self.intern_called { + for key_opt in keys_as_usize.iter() { + let Some(original_key) = *key_opt else { + continue; + }; + if values.is_null(original_key) { + continue; + } + if key_to_group[original_key].is_some() { + continue; + } + + let hash = self.values_cache.1[original_key]; + let raw = Self::get_raw_bytes(values, original_key); + if let Some(entry) = self + .unique_dict_value_mapping + .find(hash, |e| e.bytes == raw.as_ref()) + { + key_to_group[original_key] = Some(entry.group_id); + continue; + } + } + } + // iterate keys array (n iterations) + // only d insertions at most, repeated work is cached + for key_opt in keys_as_usize.iter() { Review Comment: This code is quite algorithmic, especially for people who won't read the PR description. I think it would be useful to add some documentation since there are too many nested branches without explanation -- wdyt about refactoring this loop with some inline comments? 
``` for i in 0..key_array.len() { // null key -> null row if key_array.is_null(i) { groups.push(self.get_null_group_id()); continue; } let key = key_array.value(i).to_usize().unwrap(); // already resolved this dictionary index earlier in this batch if let Some(group_id) = key_to_group[key] { groups.push(group_id); continue; } // non-null key pointing to a null value slot — treat as null if values.is_null(key) { let gid = self.get_null_group_id(); key_to_group[key] = Some(gid); groups.push(gid); continue; } // new value — insert into hash table and assign a new group id let hash = self.values_cache.1[key]; let raw = Self::get_raw_bytes(values, key); let gid = self.lookup_or_insert_in_table(hash, raw.as_ref()); key_to_group[key] = Some(gid); groups.push(gid); } ``` ########## datafusion/physical-plan/src/aggregates/group_values/single_group_by/dictionary.rs: ########## @@ -0,0 +1,783 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::aggregates::group_values::GroupValues; +use crate::hash_utils::RandomState; +use arrow::array::{ + Array, ArrayRef, DictionaryArray, LargeStringArray, LargeStringBuilder, ListArray, + ListBuilder, PrimitiveArray, PrimitiveBuilder, StringArray, StringBuilder, + StringViewArray, StringViewBuilder, +}; +use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType, DataType}; +use datafusion_common::DataFusionError::{Internal, NotImplemented}; +use datafusion_common::Result; +use datafusion_common::hash_utils::create_hashes; +use datafusion_expr::EmitTo; +use hashbrown::HashTable; +use hashbrown::hash_table::Entry as HashTableEntry; +use std::borrow::Cow; +use std::marker::PhantomData; +use std::sync::Arc; + +/// Heuristic for sizing the values buffer of string builders during emit: +/// dictionary-encoded values are short by design (categorical strings, short +/// identifiers), so 16 B/item avoids the realloc-doubling chain in the common +/// case while keeping over-allocation cheap when values are smaller. +const AVG_BYTES_PER_DICT_VALUE: usize = 16; +const INITIAL_PRE_ALLOCATION: usize = 8 * 1024; // avoid re-allocation`s for small-medium groups + +macro_rules! decode_list { + ($raw:expr, $builder:expr) => {{ + let mut builder = $builder; + for raw_bytes in $raw { + match raw_bytes { + None => builder.append_null(), + Some(raw_vector) => { + let mut offset = 0; + while offset < raw_vector.len() { + let len = i64::from_ne_bytes( + raw_vector[offset..offset + 8] + .try_into() + .expect("slice of length 8"), + ); + offset += 8; + if len == -1 { + builder.values().append_null(); + } else { + let s = unsafe { + std::str::from_utf8_unchecked( + &raw_vector[offset..offset + len as usize], + ) + }; + builder.values().append_value(s); + offset += len as usize; + } + } + builder.append(true); + } + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + }}; +} +macro_rules! 
decode_scalar_string { + ($raw:expr, $builder:expr) => {{ + let mut builder = $builder; + for raw_bytes in $raw { + match raw_bytes { + Some(raw_vector) => { + let s = unsafe { std::str::from_utf8_unchecked(raw_vector) }; + builder.append_value(s); + } + None => builder.append_null(), + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + }}; +} +/// Entry stored inside `unique_dict_value_mapping`. Caching `hash` on the entry +struct DictEntry { + hash: u64, + group_id: usize, + bytes: Vec<u8>, +} + +pub struct GroupValuesDictionary<K: ArrowDictionaryKeyType + Send> { + // stores the order new unique elements are seen for self.emit() + seen_elements: Vec<Option<Vec<u8>>>, + value_dt: DataType, + _phantom: PhantomData<K>, + // keeps track of which values weve already seen, keyed by raw value hash. + unique_dict_value_mapping: HashTable<DictEntry>, + + random_state: RandomState, + + // cache the group id for nulls since they all map to the same group + null_group_id: Option<usize>, + intern_called: bool, + // 0. cache pointer of arrays, this avoids having to re-compute hashing for arrays weve already seen on past iterations + // 1. avoid re-allocating buffer inbetween calls, instead of allocating a new vector each time re-use inbetween calls + values_cache: (Option<ArrayRef>, Vec<u64>), +} + +impl<K: ArrowDictionaryKeyType + Send> GroupValuesDictionary<K> { + pub fn new(data_type: &DataType) -> Self { + Self { + seen_elements: Vec::new(), + unique_dict_value_mapping: HashTable::with_capacity(INITIAL_PRE_ALLOCATION), + value_dt: data_type.clone(), + _phantom: PhantomData, + random_state: RandomState::with_seed(0), + null_group_id: None, + intern_called: false, + values_cache: (None, Vec::new()), + } + } + + /// Returns the existing `group_id` for the value with this hash and bytes, + /// or inserts a new entry and returns the freshly assigned group_id. 
+ fn lookup_or_insert_in_table(&mut self, hash: u64, raw: &[u8]) -> usize { + match self + .unique_dict_value_mapping + .entry(hash, |e| e.bytes == raw, |e| e.hash) + { + HashTableEntry::Occupied(o) => o.get().group_id, + HashTableEntry::Vacant(v) => { + let new_group_id = self.seen_elements.len(); + self.seen_elements.push(Some(raw.to_vec())); // replace this with raw buffer to avoid this double copy #TODO see https://github.com/apache/datafusion/issues/22078 + v.insert(DictEntry { + hash, + group_id: new_group_id, + bytes: raw.to_vec(), + }); + new_group_id + } + } + } + fn compute_value_hashes(&mut self, values: &ArrayRef) -> Result<()> { + self.values_cache.1.clear(); + self.values_cache.1.resize(values.len(), 0); + create_hashes( + [Arc::clone(values)], + &self.random_state, + &mut self.values_cache.1, + )?; + Ok(()) + //Ok(hashes) + } + + fn get_raw_bytes(values: &ArrayRef, index: usize) -> Cow<'_, [u8]> { + match values.data_type() { + DataType::Utf8 => Cow::Borrowed( + values + .as_any() + .downcast_ref::<StringArray>() + .expect("Expected StringArray") + .value(index) + .as_bytes(), + ), + DataType::LargeUtf8 => Cow::Borrowed( + values + .as_any() + .downcast_ref::<LargeStringArray>() + .expect("Expected LargeStringArray") + .value(index) + .as_bytes(), + ), + DataType::Utf8View => Cow::Borrowed( + values + .as_any() + .downcast_ref::<StringViewArray>() + .expect("Expected StringViewArray") + .value(index) + .as_bytes(), + ), + DataType::List(_) => { + let list_array = values + .as_any() + .downcast_ref::<ListArray>() + .expect("Expected ListArray"); + + debug_assert!(!list_array.is_null(index)); + + let start = list_array.value_offsets()[index] as usize; + let end = list_array.value_offsets()[index + 1] as usize; + let child = list_array.values(); + + let mut bytes = Vec::new(); + for i in start..end { + if child.is_null(i) { + // acts as a marker for transform_into_array to write a null + bytes.extend_from_slice(&(-1i64).to_ne_bytes()); + } else { + let 
raw = Self::get_raw_bytes(child, i); + bytes.extend_from_slice(&(raw.len() as i64).to_ne_bytes()); + bytes.extend_from_slice(&raw); + } + } + Cow::Owned(bytes) + } + other => unimplemented!("get_raw_bytes not implemented for {other:?}"), + } + } + + #[inline] + fn get_null_group_id(&mut self) -> usize { + if let Some(group_id) = self.null_group_id { + group_id + } else { + let new_group_id = self.seen_elements.len(); + self.seen_elements.push(None); + self.null_group_id = Some(new_group_id); + new_group_id + } + } + fn transform_into_array(&self, raw: &[Option<Vec<u8>>]) -> Result<ArrayRef> { + let item_capacity = raw.len(); + let data_capacity = item_capacity * AVG_BYTES_PER_DICT_VALUE; + match &self.value_dt { + DataType::Utf8 => decode_scalar_string!( + raw, + StringBuilder::with_capacity(item_capacity, data_capacity) + ), + DataType::LargeUtf8 => decode_scalar_string!( + raw, + LargeStringBuilder::with_capacity(item_capacity, data_capacity) + ), + DataType::Utf8View => decode_scalar_string!( + raw, + StringViewBuilder::with_capacity(item_capacity) + ), + DataType::List(field) => match field.data_type() { + DataType::Utf8 => decode_list!( + raw, + ListBuilder::with_capacity( + StringBuilder::with_capacity(item_capacity, data_capacity), + item_capacity, + ) + ), + DataType::LargeUtf8 => decode_list!( + raw, + ListBuilder::with_capacity( + LargeStringBuilder::with_capacity(item_capacity, data_capacity), + item_capacity, + ) + ), + DataType::Utf8View => decode_list!( + raw, + ListBuilder::with_capacity( + StringViewBuilder::with_capacity(item_capacity), + item_capacity, + ) + ), + other => Err(NotImplemented(format!( + "transform_into_array not implemented for List<{other:?}>" + ))), + }, + other => Err(NotImplemented(format!( + "transform_into_array not implemented for {other:?}" + ))), + } + } + #[inline] + fn keys_to_usize(key_array: &PrimitiveArray<K>) -> Vec<Option<usize>> { + (0..key_array.len()) + .map(|i| { + if key_array.is_null(i) { + None + } else { + 
Some(key_array.value(i).to_usize().unwrap()) + } + }) + .collect() + } +} + +impl<K: ArrowDictionaryKeyType + Send> GroupValues for GroupValuesDictionary<K> { + fn size(&self) -> usize { + let seen_elements_size = self.seen_elements.capacity() + * size_of::<Option<Vec<u8>>>() + + self + .seen_elements + .iter() + .filter_map(|opt| opt.as_ref()) + .map(|inner| inner.capacity()) + .sum::<usize>(); + + let unique_mapping_size = self.unique_dict_value_mapping.capacity() + * size_of::<DictEntry>() + + self + .unique_dict_value_mapping + .iter() + .map(|e| e.bytes.capacity()) + .sum::<usize>(); + + let values_cache_size = self.values_cache.1.capacity() * size_of::<u64>() + + self + .values_cache + .0 + .as_ref() + .map(|a| a.get_array_memory_size()) Review Comment: `get_array_memory_size()` overcounts if the array has underlying buffers that are shared with another array (the `RecordBatch` that owns this values array may already be tracked by the memory pool). This could cause the memory pool to falsely report that its limit was exceeded when it actually wasn't, although in practice I don't think this is very probable. Maybe it's worth using a more accurate method like `a.to_data().get_array_memory_size()` ([example](https://github.com/apache/datafusion/blob/b7761dc59155f04cd7d4b45eb7c5d4a1c3587d32/datafusion/functions-aggregate/src/array_agg.rs#L472)) ########## datafusion/physical-plan/src/aggregates/group_values/single_group_by/dictionary.rs: ########## @@ -0,0 +1,783 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::aggregates::group_values::GroupValues; +use crate::hash_utils::RandomState; +use arrow::array::{ + Array, ArrayRef, DictionaryArray, LargeStringArray, LargeStringBuilder, ListArray, + ListBuilder, PrimitiveArray, PrimitiveBuilder, StringArray, StringBuilder, + StringViewArray, StringViewBuilder, +}; +use arrow::datatypes::{ArrowDictionaryKeyType, ArrowNativeType, DataType}; +use datafusion_common::DataFusionError::{Internal, NotImplemented}; +use datafusion_common::Result; +use datafusion_common::hash_utils::create_hashes; +use datafusion_expr::EmitTo; +use hashbrown::HashTable; +use hashbrown::hash_table::Entry as HashTableEntry; +use std::borrow::Cow; +use std::marker::PhantomData; +use std::sync::Arc; + +/// Heuristic for sizing the values buffer of string builders during emit: +/// dictionary-encoded values are short by design (categorical strings, short +/// identifiers), so 16 B/item avoids the realloc-doubling chain in the common +/// case while keeping over-allocation cheap when values are smaller. +const AVG_BYTES_PER_DICT_VALUE: usize = 16; +const INITIAL_PRE_ALLOCATION: usize = 8 * 1024; // avoid re-allocation`s for small-medium groups + +macro_rules! 
decode_list { + ($raw:expr, $builder:expr) => {{ + let mut builder = $builder; + for raw_bytes in $raw { + match raw_bytes { + None => builder.append_null(), + Some(raw_vector) => { + let mut offset = 0; + while offset < raw_vector.len() { + let len = i64::from_ne_bytes( + raw_vector[offset..offset + 8] + .try_into() + .expect("slice of length 8"), + ); + offset += 8; + if len == -1 { + builder.values().append_null(); + } else { + let s = unsafe { + std::str::from_utf8_unchecked( + &raw_vector[offset..offset + len as usize], + ) + }; + builder.values().append_value(s); + offset += len as usize; + } + } + builder.append(true); + } + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + }}; +} +macro_rules! decode_scalar_string { + ($raw:expr, $builder:expr) => {{ + let mut builder = $builder; + for raw_bytes in $raw { + match raw_bytes { + Some(raw_vector) => { + let s = unsafe { std::str::from_utf8_unchecked(raw_vector) }; + builder.append_value(s); + } + None => builder.append_null(), + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) + }}; +} +/// Entry stored inside `unique_dict_value_mapping`. Caching `hash` on the entry +struct DictEntry { + hash: u64, + group_id: usize, + bytes: Vec<u8>, +} + +pub struct GroupValuesDictionary<K: ArrowDictionaryKeyType + Send> { + // stores the order new unique elements are seen for self.emit() + seen_elements: Vec<Option<Vec<u8>>>, + value_dt: DataType, + _phantom: PhantomData<K>, + // keeps track of which values weve already seen, keyed by raw value hash. + unique_dict_value_mapping: HashTable<DictEntry>, + + random_state: RandomState, + + // cache the group id for nulls since they all map to the same group + null_group_id: Option<usize>, + intern_called: bool, + // 0. cache pointer of arrays, this avoids having to re-compute hashing for arrays weve already seen on past iterations + // 1. 
avoid re-allocating buffer inbetween calls, instead of allocating a new vector each time re-use inbetween calls + values_cache: (Option<ArrayRef>, Vec<u64>), +} + +impl<K: ArrowDictionaryKeyType + Send> GroupValuesDictionary<K> { + pub fn new(data_type: &DataType) -> Self { + Self { + seen_elements: Vec::new(), + unique_dict_value_mapping: HashTable::with_capacity(INITIAL_PRE_ALLOCATION), + value_dt: data_type.clone(), + _phantom: PhantomData, + random_state: RandomState::with_seed(0), + null_group_id: None, + intern_called: false, + values_cache: (None, Vec::new()), + } + } + + /// Returns the existing `group_id` for the value with this hash and bytes, + /// or inserts a new entry and returns the freshly assigned group_id. + fn lookup_or_insert_in_table(&mut self, hash: u64, raw: &[u8]) -> usize { + match self + .unique_dict_value_mapping + .entry(hash, |e| e.bytes == raw, |e| e.hash) + { + HashTableEntry::Occupied(o) => o.get().group_id, + HashTableEntry::Vacant(v) => { + let new_group_id = self.seen_elements.len(); + self.seen_elements.push(Some(raw.to_vec())); // replace this with raw buffer to avoid this double copy #TODO see https://github.com/apache/datafusion/issues/22078 + v.insert(DictEntry { + hash, + group_id: new_group_id, + bytes: raw.to_vec(), + }); + new_group_id + } + } + } + fn compute_value_hashes(&mut self, values: &ArrayRef) -> Result<()> { + self.values_cache.1.clear(); + self.values_cache.1.resize(values.len(), 0); + create_hashes( + [Arc::clone(values)], + &self.random_state, + &mut self.values_cache.1, + )?; + Ok(()) + //Ok(hashes) + } + + fn get_raw_bytes(values: &ArrayRef, index: usize) -> Cow<'_, [u8]> { + match values.data_type() { + DataType::Utf8 => Cow::Borrowed( + values + .as_any() + .downcast_ref::<StringArray>() + .expect("Expected StringArray") + .value(index) + .as_bytes(), + ), + DataType::LargeUtf8 => Cow::Borrowed( + values + .as_any() + .downcast_ref::<LargeStringArray>() + .expect("Expected LargeStringArray") + 
.value(index) + .as_bytes(), + ), + DataType::Utf8View => Cow::Borrowed( + values + .as_any() + .downcast_ref::<StringViewArray>() + .expect("Expected StringViewArray") + .value(index) + .as_bytes(), + ), + DataType::List(_) => { + let list_array = values + .as_any() + .downcast_ref::<ListArray>() + .expect("Expected ListArray"); + + debug_assert!(!list_array.is_null(index)); + + let start = list_array.value_offsets()[index] as usize; + let end = list_array.value_offsets()[index + 1] as usize; + let child = list_array.values(); + + let mut bytes = Vec::new(); + for i in start..end { + if child.is_null(i) { + // acts as a marker for transform_into_array to write a null + bytes.extend_from_slice(&(-1i64).to_ne_bytes()); + } else { + let raw = Self::get_raw_bytes(child, i); + bytes.extend_from_slice(&(raw.len() as i64).to_ne_bytes()); + bytes.extend_from_slice(&raw); + } + } + Cow::Owned(bytes) + } + other => unimplemented!("get_raw_bytes not implemented for {other:?}"), + } + } + + #[inline] + fn get_null_group_id(&mut self) -> usize { + if let Some(group_id) = self.null_group_id { + group_id + } else { + let new_group_id = self.seen_elements.len(); + self.seen_elements.push(None); + self.null_group_id = Some(new_group_id); + new_group_id + } + } + fn transform_into_array(&self, raw: &[Option<Vec<u8>>]) -> Result<ArrayRef> { + let item_capacity = raw.len(); + let data_capacity = item_capacity * AVG_BYTES_PER_DICT_VALUE; + match &self.value_dt { + DataType::Utf8 => decode_scalar_string!( + raw, + StringBuilder::with_capacity(item_capacity, data_capacity) + ), + DataType::LargeUtf8 => decode_scalar_string!( + raw, + LargeStringBuilder::with_capacity(item_capacity, data_capacity) + ), + DataType::Utf8View => decode_scalar_string!( + raw, + StringViewBuilder::with_capacity(item_capacity) + ), + DataType::List(field) => match field.data_type() { + DataType::Utf8 => decode_list!( + raw, + ListBuilder::with_capacity( + StringBuilder::with_capacity(item_capacity, 
data_capacity), + item_capacity, + ) + ), + DataType::LargeUtf8 => decode_list!( + raw, + ListBuilder::with_capacity( + LargeStringBuilder::with_capacity(item_capacity, data_capacity), + item_capacity, + ) + ), + DataType::Utf8View => decode_list!( + raw, + ListBuilder::with_capacity( + StringViewBuilder::with_capacity(item_capacity), + item_capacity, + ) + ), + other => Err(NotImplemented(format!( + "transform_into_array not implemented for List<{other:?}>" + ))), + }, + other => Err(NotImplemented(format!( + "transform_into_array not implemented for {other:?}" + ))), + } + } + #[inline] + fn keys_to_usize(key_array: &PrimitiveArray<K>) -> Vec<Option<usize>> { + (0..key_array.len()) + .map(|i| { + if key_array.is_null(i) { + None + } else { + Some(key_array.value(i).to_usize().unwrap()) + } + }) + .collect() + } +} + +impl<K: ArrowDictionaryKeyType + Send> GroupValues for GroupValuesDictionary<K> { + fn size(&self) -> usize { + let seen_elements_size = self.seen_elements.capacity() + * size_of::<Option<Vec<u8>>>() + + self + .seen_elements + .iter() + .filter_map(|opt| opt.as_ref()) + .map(|inner| inner.capacity()) + .sum::<usize>(); + + let unique_mapping_size = self.unique_dict_value_mapping.capacity() + * size_of::<DictEntry>() + + self + .unique_dict_value_mapping + .iter() + .map(|e| e.bytes.capacity()) + .sum::<usize>(); + + let values_cache_size = self.values_cache.1.capacity() * size_of::<u64>() + + self + .values_cache + .0 + .as_ref() + .map(|a| a.get_array_memory_size()) + .unwrap_or(0); + + size_of::<Self>() + seen_elements_size + unique_mapping_size + values_cache_size + } + fn len(&self) -> usize { + self.seen_elements.len() + } + fn is_empty(&self) -> bool { + self.seen_elements.is_empty() + } + fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()> { + assert_eq!( + cols.len(), + 1, + "GroupValuesDictionary only supports a single column" + ); + let array = Arc::clone(&cols[0]); + groups.clear(); // zero out buffer + let 
dict_array = array + .as_any() + .downcast_ref::<DictionaryArray<K>>() + .ok_or_else(|| { + Internal(format!( + "GroupValuesDictionary expected DictionaryArray but got {:?}", + array.data_type() + )) + })?; + + let values = dict_array.values(); + let key_array = dict_array.keys(); + if key_array.is_empty() { + return Ok(()); + } + let keys_as_usize: Vec<Option<usize>> = Self::keys_to_usize(key_array); + + let cache_hit = self + .values_cache + .0 + .as_ref() + .map(|cached| Arc::ptr_eq(cached, values)) + .unwrap_or(false); + + if !cache_hit { + // first time seeing this values array to compute hashes, and cache the pointer, cheap operation + self.compute_value_hashes(values)?; + self.values_cache.0 = Some(Arc::clone(values)); + } + let mut key_to_group: Vec<Option<usize>> = vec![None; values.len()]; + if self.intern_called { Review Comment: Can we add a small comment on why we have this branch? ``` // For values already seen in prior batches, pre-fill `key_to_group` // so the main loop below can skip hash table lookups for those keys. // This pre-scan is skipped on the first batch since the hash table is empty. ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
