wgtmac commented on code in PR #156:
URL: https://github.com/apache/iceberg-cpp/pull/156#discussion_r2253515467


##########
src/iceberg/transform.h:
##########
@@ -170,8 +172,8 @@ class ICEBERG_EXPORT TransformFunction {
  public:
   virtual ~TransformFunction() = default;
   TransformFunction(TransformType transform_type, std::shared_ptr<Type> 
source_type);
-  /// \brief Transform an input array to a new array
-  virtual Result<ArrowArray> Transform(const ArrowArray& data) = 0;
+  /// \brief Transform an input Literal to a new Literal
+  virtual Result<std::optional<Literal>> Transform(const Literal& literal) = 0;

Review Comment:
   Does it make sense to introduce a `void` variant to Literal? It does not 
deserve to pay for an extra optional value check every time when the void 
transform is a rare case.



##########
test/transform_test.cc:
##########
@@ -193,4 +187,316 @@ TEST(TransformResultTypeTest, NegativeCases) {
   }
 }
 
+TEST(TransformLiteralTest, IdentityTransform) {
+  struct Case {
+    std::shared_ptr<Type> source_type;
+    Literal source;
+    Literal expected;
+  };
+
+  const std::vector<Case> cases = {
+      {.source_type = iceberg::boolean(),
+       .source = Literal::Boolean(true),
+       .expected = Literal::Boolean(true)},
+      {.source_type = iceberg::int32(),
+       .source = Literal::Int(42),
+       .expected = Literal::Int(42)},
+      {.source_type = iceberg::int32(),
+       .source = Literal::Date(30000),
+       .expected = Literal::Date(30000)},
+      {.source_type = iceberg::int64(),
+       .source = Literal::Long(1234567890),
+       .expected = Literal::Long(1234567890)},
+      {.source_type = iceberg::timestamp(),
+       .source = Literal::Timestamp(1622547800000000),
+       .expected = Literal::Timestamp(1622547800000000)},
+      {.source_type = iceberg::timestamp_tz(),
+       .source = Literal::TimestampTz(1622547800000000),
+       .expected = Literal::TimestampTz(1622547800000000)},
+      {.source_type = iceberg::float32(),
+       .source = Literal::Float(3.14),
+       .expected = Literal::Float(3.14)},
+      {.source_type = iceberg::float64(),
+       .source = Literal::Double(1.23e-5),
+       .expected = Literal::Double(1.23e-5)},
+      {.source_type = iceberg::string(),
+       .source = Literal::String("Hello, World!"),
+       .expected = Literal::String("Hello, World!")},
+      {.source_type = iceberg::binary(),
+       .source = Literal::Binary({0x01, 0x02, 0x03}),
+       .expected = Literal::Binary({0x01, 0x02, 0x03})},
+  };
+
+  for (const auto& c : cases) {
+    auto transform = Transform::Identity();
+    auto transformPtr = transform->Bind(c.source_type);
+    ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind identity 
transform";
+
+    auto result = transformPtr.value()->Transform(c.source);
+    ASSERT_TRUE(result.has_value())
+        << "Failed to transform literal: " << c.source.ToString();
+
+    EXPECT_EQ(result.value(), c.expected)
+        << "Unexpected result for source: " << c.source.ToString();
+  }
+}
+
+TEST(TransformLiteralTest, BucketTransform) {
+  constexpr int32_t num_buckets = 4;
+  auto transform = Transform::Bucket(num_buckets);
+
+  struct Case {
+    std::shared_ptr<Type> source_type;
+    Literal source;
+    Literal expected;
+  };
+
+  const std::vector<Case> cases = {
+      {.source_type = iceberg::int32(),
+       .source = Literal::Int(42),
+       .expected = Literal::Int(3)},
+      {.source_type = iceberg::date(),
+       .source = Literal::Date(30000),
+       .expected = Literal::Int(2)},
+      {.source_type = iceberg::int64(),
+       .source = Literal::Long(1234567890),
+       .expected = Literal::Int(3)},
+      {.source_type = iceberg::timestamp(),
+       .source = Literal::Timestamp(1622547800000000),
+       .expected = Literal::Int(1)},
+      {.source_type = iceberg::timestamp_tz(),
+       .source = Literal::TimestampTz(1622547800000000),
+       .expected = Literal::Int(1)},
+      {.source_type = iceberg::string(),
+       .source = Literal::String("test"),
+       .expected = Literal::Int(3)},
+  };
+
+  for (const auto& c : cases) {
+    auto transformPtr = transform->Bind(c.source_type);
+    ASSERT_TRUE(transformPtr.has_value()) << "Failed to bind bucket transform";
+    auto result = transformPtr.value()->Transform(c.source);
+    ASSERT_TRUE(result.has_value())
+        << "Failed to transform literal: " << c.source.ToString();
+
+    EXPECT_EQ(result.value(), c.expected)
+        << "Unexpected result for source: " << c.source.ToString();
+  }
+}
+
+TEST(TransformLiteralTest, TruncateTransform) {
+  struct Case {
+    std::shared_ptr<Type> source_type;
+    int32_t width;
+    Literal source;
+    Literal expected;
+  };
+
+  const std::vector<Case> cases = {
+      {.source_type = iceberg::int32(),
+       .width = 5,
+       .source = Literal::Int(123456),
+       .expected = Literal::Int(123455)},
+      {.source_type = iceberg::string(),
+       .width = 5,
+       .source = Literal::String("Hello, World!"),
+       .expected = Literal::String("Hello")},
+      {.source_type = iceberg::string(),
+       .width = 5,
+       .source = Literal::String("😜🧐🤔🤪🥳"),
+       // Truncate to 5 bytes, the safe point should be four bytes which fits 
the first

Review Comment:
   No, truncation should be based on the number of unicode codepoints (a.k.a. 
emoji symbols) not the number of characters. Please take a look at 
https://github.com/apache/iceberg/blob/acecf3cb697a7fa6bfcb1b58fc5d7d70914436e7/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java#L41-L55



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to