gene-bordegaray commented on code in PR #22002:
URL: https://github.com/apache/datafusion/pull/22002#discussion_r3178372433
##########
datafusion/physical-expr/src/partitioning.rs:
##########
@@ -133,13 +175,327 @@ impl Display for Partitioning {
.join(", ");
write!(f, "Hash([{phy_exprs_str}], {size})")
}
+ Partitioning::Custom(custom) => write!(f, "{custom}"),
Partitioning::UnknownPartitioning(size) => {
write!(f, "UnknownPartitioning({size})")
}
}
}
}
+/// Ordered range partitioning for one or more expressions.
+///
+/// Each [`PartitionRange`] describes the value range for one output partition.
+/// Ranges are interpreted lexicographically across [`Self::exprs`]. This type
+/// records the partitioning contract; callers are responsible for constructing
+/// non-overlapping ranges that accurately describe the source.
+#[derive(Debug, Clone)]
+pub struct RangePartitioning {
+ exprs: Vec<Arc<dyn PhysicalExpr>>,
+ ranges: Vec<PartitionRange>,
+}
+
+impl RangePartitioning {
+ /// Create a new [`RangePartitioning`].
+ ///
+ /// Each bound must have the same arity as `exprs`.
+ pub fn try_new(
+ exprs: Vec<Arc<dyn PhysicalExpr>>,
+ ranges: Vec<PartitionRange>,
+ ) -> Result<Self> {
+ if exprs.is_empty() {
+ return plan_err!("RangePartitioning requires at least one
expression");
+ }
+ if ranges.is_empty() {
+ return plan_err!("RangePartitioning requires at least one range");
+ }
+
+ for range in &ranges {
+ range.validate(exprs.len())?;
+ }
+
+ Ok(Self { exprs, ranges })
+ }
+
+ /// Expressions whose values determine the partition range.
+ pub fn exprs(&self) -> &[Arc<dyn PhysicalExpr>] {
+ &self.exprs
+ }
+
+ /// Per-partition ranges, in partition index order.
+ pub fn ranges(&self) -> &[PartitionRange] {
+ &self.ranges
+ }
+
+ /// Number of range partitions.
+ pub fn partition_count(&self) -> usize {
+ self.ranges.len()
+ }
+
+ /// Returns how this range partitioning satisfies a hash distribution
Review Comment:
this is something we will have to be careful of and raises the question:
Should we allow different types of partitioning to satisfy each other? This
may provide more optimizations like skipping repartitions, but can see this
getting quite brittle
This is also why the idea of "compatibility" was introduced:
- `satisfaction` - decides whether a partitioning is needed
- `compatibility` - decides whether partition specific behavior is valid
(like dynamic filter routing)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]