wgtmac commented on code in PR #373:
URL: https://github.com/apache/iceberg-cpp/pull/373#discussion_r2581765410


##########
src/iceberg/expression/aggregate.h:
##########
@@ -127,6 +128,10 @@ class ICEBERG_EXPORT BoundAggregate : public 
Aggregate<BoundTerm>, public Bound
   }
 
   Result<Literal> Evaluate(const StructLike& data) const override = 0;
+  virtual Result<Literal> Evaluate(const DataFile& file) const;
+
+  /// \brief Whether metrics in the data file are sufficient to evaluate.
+  virtual bool HasValue(const DataFile& file) const { return false; }

Review Comment:
   Same here



##########
src/iceberg/expression/aggregate.cc:
##########
@@ -211,10 +445,34 @@ Result<Literal> MaxAggregate::Evaluate(const StructLike& 
data) const {
   return term()->Evaluate(data);
 }
 
+Result<Literal> MaxAggregate::Evaluate(const DataFile& file) const {
+  auto field_id = GetFieldId(term());
+  auto upper = GetMapValue(file.upper_bounds, field_id);
+  auto ptype = GetPrimitiveType(*term());
+  if (!upper.has_value()) {
+    SingleValueStructLike data(Literal::Null(ptype));
+    return term()->Evaluate(data);
+  }
+
+  ICEBERG_ASSIGN_OR_RAISE(auto literal, Literal::Deserialize(*upper, ptype));
+  SingleValueStructLike data(std::move(literal));

Review Comment:
   Should we consider reusing SingleValueStructLike? The Java impl does this in 
the ValueAggregate (which we don't have yet)



##########
src/iceberg/expression/aggregate.cc:
##########
@@ -38,6 +40,58 @@ std::shared_ptr<PrimitiveType> GetPrimitiveType(const 
BoundTerm& term) {
   return internal::checked_pointer_cast<PrimitiveType>(term.type());
 }
 
+Result<Scalar> LiteralToScalar(const Literal& literal) {
+  if (literal.IsNull()) {
+    return Scalar{std::monostate{}};
+  }
+
+  switch (literal.type()->type_id()) {
+    case TypeId::kBoolean:
+      return Scalar{std::get<bool>(literal.value())};
+    case TypeId::kInt:
+    case TypeId::kDate:
+      return Scalar{std::get<int32_t>(literal.value())};
+    case TypeId::kLong:
+    case TypeId::kTime:
+    case TypeId::kTimestamp:
+    case TypeId::kTimestampTz:
+      return Scalar{std::get<int64_t>(literal.value())};
+    case TypeId::kFloat:
+      return Scalar{std::get<float>(literal.value())};
+    case TypeId::kDouble:
+      return Scalar{std::get<double>(literal.value())};
+    case TypeId::kString: {
+      const auto& str = std::get<std::string>(literal.value());
+      return Scalar{std::string_view(str)};
+    }
+    case TypeId::kBinary:
+    case TypeId::kFixed: {
+      const auto& bytes = std::get<std::vector<uint8_t>>(literal.value());
+      return Scalar{
+          std::string_view(reinterpret_cast<const char*>(bytes.data()), 
bytes.size())};
+    }
+    case TypeId::kDecimal:
+      return Scalar{std::get<Decimal>(literal.value())};
+    default:
+      return NotSupported("Cannot convert literal of type {} to Scalar",
+                          literal.type()->ToString());
+  }
+}
+
+class SingleValueStructLike : public StructLike {
+ public:
+  explicit SingleValueStructLike(Literal literal) : 
literal_(std::move(literal)) {}
+
+  Result<Scalar> GetField(size_t /*pos*/) const override {

Review Comment:
   Should we return error if pos is not 0?



##########
src/iceberg/expression/aggregate.h:
##########
@@ -143,11 +148,16 @@ class ICEBERG_EXPORT BoundAggregate : public 
Aggregate<BoundTerm>, public Bound
 class ICEBERG_EXPORT CountAggregate : public BoundAggregate {
  public:
   Result<Literal> Evaluate(const StructLike& data) const final;
+  Result<Literal> Evaluate(const DataFile& file) const override;
 
   std::unique_ptr<Aggregator> NewAggregator() const override;
 
   /// \brief Count for a single row. Subclasses implement this.
   virtual Result<int64_t> CountFor(const StructLike& data) const = 0;
+  /// \brief Count using metrics from a data file. Nullopt if not available.
+  virtual Result<std::optional<int64_t>> CountFor(const DataFile& file) const;

Review Comment:
   Is it better to declare it as `Result<int64_t> CountFor(const DataFile& 
file) const`? We will call `HasValue` before `CountFor`, so it is fine to 
return an error if it does not have value.



##########
src/iceberg/expression/aggregate.cc:
##########
@@ -173,6 +366,23 @@ Result<int64_t> CountNonNullAggregate::CountFor(const 
StructLike& data) const {
       [](const auto& val) { return val.IsNull() ? 0 : 1; });
 }
 
+Result<std::optional<int64_t>> CountNonNullAggregate::CountFor(
+    const DataFile& file) const {
+  auto field_id = GetFieldId(term());
+  auto value_count = GetMapValue(file.value_counts, field_id);
+  auto null_count = GetMapValue(file.null_value_counts, field_id).value_or(0);

Review Comment:
   Perhaps it is simpler if we directly call HasValue in CountFor and return 
error if false?



##########
src/iceberg/expression/aggregate.cc:
##########
@@ -173,6 +366,23 @@ Result<int64_t> CountNonNullAggregate::CountFor(const 
StructLike& data) const {
       [](const auto& val) { return val.IsNull() ? 0 : 1; });
 }
 
+Result<std::optional<int64_t>> CountNonNullAggregate::CountFor(
+    const DataFile& file) const {
+  auto field_id = GetFieldId(term());
+  auto value_count = GetMapValue(file.value_counts, field_id);
+  auto null_count = GetMapValue(file.null_value_counts, field_id).value_or(0);

Review Comment:
   Is this correct? `HasValue` has guaranteed that `field_id` exists in the 
`file.null_value_counts`, so we don't need to call `value_or(0)`, right? We can 
simply return `NotFound` error if any of value_count and null_count is missing.



##########
src/iceberg/expression/aggregate.h:
##########
@@ -127,6 +128,10 @@ class ICEBERG_EXPORT BoundAggregate : public 
Aggregate<BoundTerm>, public Bound
   }
 
   Result<Literal> Evaluate(const StructLike& data) const override = 0;
+  virtual Result<Literal> Evaluate(const DataFile& file) const;

Review Comment:
   Why not pure virtual?



##########
src/iceberg/expression/aggregate.h:
##########
@@ -143,11 +148,16 @@ class ICEBERG_EXPORT BoundAggregate : public 
Aggregate<BoundTerm>, public Bound
 class ICEBERG_EXPORT CountAggregate : public BoundAggregate {
  public:
   Result<Literal> Evaluate(const StructLike& data) const final;
+  Result<Literal> Evaluate(const DataFile& file) const override;
 
   std::unique_ptr<Aggregator> NewAggregator() const override;
 
   /// \brief Count for a single row. Subclasses implement this.
   virtual Result<int64_t> CountFor(const StructLike& data) const = 0;
+  /// \brief Count using metrics from a data file. Nullopt if not available.
+  virtual Result<std::optional<int64_t>> CountFor(const DataFile& file) const;

Review Comment:
   BTW, why not making it a pure virtual function?



##########
src/iceberg/expression/aggregate.cc:
##########
@@ -38,6 +40,58 @@ std::shared_ptr<PrimitiveType> GetPrimitiveType(const 
BoundTerm& term) {
   return internal::checked_pointer_cast<PrimitiveType>(term.type());
 }
 
+Result<Scalar> LiteralToScalar(const Literal& literal) {

Review Comment:
   Move this to struct_like.h/.cc?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to