wgtmac commented on code in PR #238: URL: https://github.com/apache/iceberg-cpp/pull/238#discussion_r2428946710
########## src/iceberg/util/truncate_util.cc: ########## @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/truncate_util.h" + +#include <cstdint> +#include <memory> +#include <utility> + +#include "iceberg/expression/literal.h" +#include "iceberg/result.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +namespace { +template <TypeId type_id> +Literal TruncateLiteralImpl(const Literal& literal, int32_t width) { + std::unreachable(); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kInt>(const Literal& literal, int32_t width) { + int32_t v = std::get<int32_t>(literal.value()); + return Literal::Int(TruncateUtils::TruncateInteger(v, width)); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kLong>(const Literal& literal, int32_t width) { + int64_t v = std::get<int64_t>(literal.value()); + return Literal::Long(TruncateUtils::TruncateInteger(v, width)); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kDecimal>(const Literal& literal, int32_t width) { + const auto& decimal = std::get<Decimal>(literal.value()); + const auto& type = std::dynamic_pointer_cast<DecimalType>(literal.type()); + ICEBERG_DCHECK(type != nullptr, "Literal type must be DecimalType"); Review Comment: ```suggestion const auto& type = internal::checked_cast<DecimalType>(*literal.type()); ``` ########## src/iceberg/util/truncate_util.cc: ########## @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/truncate_util.h" + +#include <cstdint> +#include <memory> +#include <utility> + +#include "iceberg/expression/literal.h" +#include "iceberg/result.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +namespace { +template <TypeId type_id> +Literal TruncateLiteralImpl(const Literal& literal, int32_t width) { + std::unreachable(); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kInt>(const Literal& literal, int32_t width) { + int32_t v = std::get<int32_t>(literal.value()); + return Literal::Int(TruncateUtils::TruncateInteger(v, width)); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kLong>(const Literal& literal, int32_t width) { + int64_t v = std::get<int64_t>(literal.value()); + return Literal::Long(TruncateUtils::TruncateInteger(v, width)); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kDecimal>(const Literal& literal, int32_t width) { + const auto& decimal = std::get<Decimal>(literal.value()); + const auto& type = std::dynamic_pointer_cast<DecimalType>(literal.type()); + ICEBERG_DCHECK(type != nullptr, "Literal type must be DecimalType"); + return Literal::MakeDecimal(TruncateUtils::TruncateDecimal(decimal, width), + type->precision(), type->scale()); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kString>(const Literal& literal, int32_t width) { + // Strings are truncated to a valid UTF-8 string with no more than L code points. Review Comment: ```suggestion // Strings are truncated to a valid UTF-8 string with no more than `width` code points. ``` ########## src/iceberg/util/decimal.h: ########## @@ -164,6 +165,12 @@ class ICEBERG_EXPORT Decimal : public util::Formattable { /// \return error status if the length is an invalid value static Result<Decimal> FromBigEndian(const uint8_t* data, int32_t length); + /// \brief Convert Decimal's unscaled value to two’s-complement big-endian binary, using + /// the minimum number of bytes for the value. + /// \param value The unscaled value. + /// \return A vector containing the big-endian bytes. + static std::vector<uint8_t> ToBigEndian(int128_t value); Review Comment: Why not making it a member function? ########## src/iceberg/util/bucket_util.h: ########## @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <cstdint> +#include <span> + +#include "iceberg/expression/literal.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" + +namespace iceberg { + +class ICEBERG_EXPORT BucketUtils { + public: + /// \brief Hash a 32-bit integer using MurmurHash3 and return a 32-bit hash value. + /// \param value The input integer to hash. + /// \note Integer and long hash results must be identical for all integer values. This + /// ensures that schema evolution does not change bucket partition values if integer + /// types are promoted. + /// \return A 32-bit hash value. + static inline int32_t HashInt(int32_t value) { + return HashLong(static_cast<int64_t>(value)); + } + + /// \brief Hash a 64-bit integer using MurmurHash3 and return a 32-bit hash value. + /// \param value The input long to hash. + /// \return A 32-bit hash value. + static int32_t HashLong(int64_t value); + + /// \brief Hash a byte array using MurmurHash3 and return a 32-bit hash value. + /// \param bytes The input byte array to hash. + /// \return A 32-bit hash value. + static int32_t HashBytes(std::span<const uint8_t> bytes); + + /// \brief Compute the bucket index for a given literal and number of buckets. + /// \param literal The input literal to hash. + /// \param num_buckets The number of buckets to hash into. + /// \return A Result containing the bucket index or an error. + /// The bucket index is in the range [0, num_buckets - 1]. + static Result<Literal> BucketIndex(const Literal& literal, int32_t num_buckets); Review Comment: Is it better to return `int32_t`? The caller can wrap it in a Literal. ########## src/iceberg/util/truncate_util.cc: ########## @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/truncate_util.h" + +#include <cstdint> +#include <memory> +#include <utility> + +#include "iceberg/expression/literal.h" +#include "iceberg/result.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +namespace { +template <TypeId type_id> +Literal TruncateLiteralImpl(const Literal& literal, int32_t width) { + std::unreachable(); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kInt>(const Literal& literal, int32_t width) { + int32_t v = std::get<int32_t>(literal.value()); + return Literal::Int(TruncateUtils::TruncateInteger(v, width)); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kLong>(const Literal& literal, int32_t width) { + int64_t v = std::get<int64_t>(literal.value()); + return Literal::Long(TruncateUtils::TruncateInteger(v, width)); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kDecimal>(const Literal& literal, int32_t width) { + const auto& decimal = std::get<Decimal>(literal.value()); + const auto& type = std::dynamic_pointer_cast<DecimalType>(literal.type()); + ICEBERG_DCHECK(type != nullptr, "Literal type must be DecimalType"); + return Literal::MakeDecimal(TruncateUtils::TruncateDecimal(decimal, width), + type->precision(), type->scale()); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kString>(const Literal& literal, int32_t width) { + // Strings are truncated to a valid UTF-8 string with no more than L code points. + const auto& str = std::get<std::string>(literal.value()); + return Literal::String(TruncateUtils::TruncateUTF8(str, width)); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kBinary>(const Literal& literal, int32_t width) { + /// In contrast to strings, binary values do not have an assumed encoding and are + /// truncated to L bytes. + const auto& data = std::get<std::vector<uint8_t>>(literal.value()); + if (data.size() <= width) { + return literal; + } + return Literal::Binary(std::vector<uint8_t>(data.begin(), data.begin() + width)); +} + +} // namespace + +Decimal TruncateUtils::TruncateDecimal(const Decimal& decimal, int32_t width) { + return decimal - (((decimal % width) + width) % width); +} + +#define DISPATCH_TRUNCATE_LITERAL(TYPE_ID) \ + case TYPE_ID: \ + return TruncateLiteralImpl<TYPE_ID>(literal, width); + +Result<Literal> TruncateUtils::TruncateLiteral(const Literal& literal, int32_t width) { + if (literal.IsNull()) [[unlikely]] { + // Return null as is + return literal; + } + + if (literal.IsAboveMax()) [[unlikely]] { + return NotSupported("Cannot truncate AboveMax"); + } + + if (literal.IsBelowMin()) [[unlikely]] { + return NotSupported("Cannot truncate BelowMin"); Review Comment: ```suggestion if (literal.IsAboveMax() || literal.IsBelowMin()) [[unlikely]] { return NotSupported("Cannot truncate {}", literal->ToString()); ``` ########## src/iceberg/util/temporal_util.cc: ########## @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/temporal_util.h" + +#include <chrono> +#include <utility> + +#include "iceberg/result.h" + +namespace iceberg { + +namespace { + +using namespace std::chrono; // NOLINT + +template <TypeId type_id> +Result<Literal> ExtractYearImpl(const Literal& literal) { + std::unreachable(); +} + +template <> +Result<Literal> ExtractYearImpl<TypeId::kDate>(const Literal& literal) { + auto value = std::get<int32_t>(literal.value()); + auto epoch = sys_days(year{1970} / January / 1); Review Comment: Can we add something like below for the boilerplate code? ```cpp constexpr auto kEpoch = year{1970} / January / 1; inline const auto kEpochDays = sys_days(kEpoch); inline const auto kEpochYmd = year_month_day(kEpochDays); inline year_month_day DateToYmd(int32_t days_since_epoch) { return {kEpochDays + days{days_since_epoch}}; } inline year_month_day TimestampToYmd(int64_t micros_since_epoch) { return {floor<days>(sys_time<microseconds>(microseconds{micros_since_epoch}))}; } inline int32_t CalculateMonthsSinceEpoch(const year_month_day& ymd) { auto delta = ymd.year() - kEpochYmd.year(); // Calculate the month as months from 1970-01 // Note: January is month 1, so we subtract 1 to get zero-based month count. return static_cast<int32_t>(delta.count() * 12 + static_cast<unsigned>(ymd.month()) - 1); } ``` ########## src/iceberg/util/truncate_util.cc: ########## @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/truncate_util.h" + +#include <cstdint> +#include <memory> +#include <utility> + +#include "iceberg/expression/literal.h" +#include "iceberg/result.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +namespace { +template <TypeId type_id> +Literal TruncateLiteralImpl(const Literal& literal, int32_t width) { + std::unreachable(); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kInt>(const Literal& literal, int32_t width) { + int32_t v = std::get<int32_t>(literal.value()); + return Literal::Int(TruncateUtils::TruncateInteger(v, width)); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kLong>(const Literal& literal, int32_t width) { + int64_t v = std::get<int64_t>(literal.value()); + return Literal::Long(TruncateUtils::TruncateInteger(v, width)); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kDecimal>(const Literal& literal, int32_t width) { + const auto& decimal = std::get<Decimal>(literal.value()); + const auto& type = std::dynamic_pointer_cast<DecimalType>(literal.type()); + ICEBERG_DCHECK(type != nullptr, "Literal type must be DecimalType"); + return Literal::MakeDecimal(TruncateUtils::TruncateDecimal(decimal, width), + type->precision(), type->scale()); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kString>(const Literal& literal, int32_t width) { + // Strings are truncated to a valid UTF-8 string with no more than L code points. + const auto& str = std::get<std::string>(literal.value()); + return Literal::String(TruncateUtils::TruncateUTF8(str, width)); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kBinary>(const Literal& literal, int32_t width) { + /// In contrast to strings, binary values do not have an assumed encoding and are + /// truncated to L bytes. Review Comment: ```suggestion // In contrast to strings, binary values do not have an assumed encoding and are // truncated to `width` bytes. ``` We only need `///` for docstrings so here using `//` is enough. ########## src/iceberg/util/truncate_util.cc: ########## @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/truncate_util.h" + +#include <cstdint> +#include <memory> +#include <utility> + +#include "iceberg/expression/literal.h" +#include "iceberg/result.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +namespace { +template <TypeId type_id> +Literal TruncateLiteralImpl(const Literal& literal, int32_t width) { + std::unreachable(); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kInt>(const Literal& literal, int32_t width) { + int32_t v = std::get<int32_t>(literal.value()); + return Literal::Int(TruncateUtils::TruncateInteger(v, width)); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kLong>(const Literal& literal, int32_t width) { + int64_t v = std::get<int64_t>(literal.value()); + return Literal::Long(TruncateUtils::TruncateInteger(v, width)); +} + +template <> +Literal TruncateLiteralImpl<TypeId::kDecimal>(const Literal& literal, int32_t width) { + const auto& decimal = std::get<Decimal>(literal.value()); + const auto& type = std::dynamic_pointer_cast<DecimalType>(literal.type()); + ICEBERG_DCHECK(type != nullptr, "Literal type must be DecimalType"); + return Literal::MakeDecimal(TruncateUtils::TruncateDecimal(decimal, width), + type->precision(), type->scale()); Review Comment: ```suggestion return Literal::MakeDecimal(TruncateUtils::TruncateDecimal(decimal, width), type.precision(), type.scale()); ``` ########## src/iceberg/util/temporal_util.cc: ########## @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/temporal_util.h" + +#include <chrono> +#include <utility> + +#include "iceberg/result.h" + Review Comment: ```suggestion ``` ########## src/iceberg/util/decimal.cc: ########## @@ -505,6 +504,36 @@ Result<Decimal> Decimal::FromBigEndian(const uint8_t* bytes, int32_t length) { return Decimal(static_cast<int128_t>(result)); } +std::vector<uint8_t> Decimal::ToBigEndian(int128_t value) { + std::vector<uint8_t> bytes(kMaxDecimalBytes); + + auto uvalue = static_cast<uint128_t>(value); + std::memcpy(bytes.data(), &uvalue, 16); Review Comment: ```suggestion std::memcpy(bytes.data(), &uvalue, kMaxDecimalBytes); ``` ########## src/iceberg/util/bucket_util.cc: ########## @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/bucket_util.h" + +#include <utility> + +#include "iceberg/type.h" +#include "iceberg/util/endian.h" +#include "iceberg/util/macros.h" +#include "iceberg/util/murmurhash3_internal.h" + +namespace iceberg { + +namespace { +template <TypeId type_id> +int32_t HashLiteral(const Literal& literal) { + std::unreachable(); +} + +template <> +int32_t HashLiteral<TypeId::kInt>(const Literal& literal) { + return BucketUtils::HashInt(std::get<int32_t>(literal.value())); +} + +template <> +int32_t HashLiteral<TypeId::kDate>(const Literal& literal) { + return BucketUtils::HashInt(std::get<int32_t>(literal.value())); +} + +template <> +int32_t HashLiteral<TypeId::kLong>(const Literal& literal) { + return BucketUtils::HashLong(std::get<int64_t>(literal.value())); +} + +template <> +int32_t HashLiteral<TypeId::kTime>(const Literal& literal) { + return BucketUtils::HashLong(std::get<int64_t>(literal.value())); +} + +template <> +int32_t HashLiteral<TypeId::kTimestamp>(const Literal& literal) { + return BucketUtils::HashLong(std::get<int64_t>(literal.value())); +} + +template <> +int32_t HashLiteral<TypeId::kTimestampTz>(const Literal& literal) { + return BucketUtils::HashLong(std::get<int64_t>(literal.value())); +} + +template <> +int32_t HashLiteral<TypeId::kDecimal>(const Literal& literal) { + const auto& decimal = std::get<Decimal>(literal.value()); + return BucketUtils::HashBytes(Decimal::ToBigEndian(decimal.value())); +} + +template <> +int32_t HashLiteral<TypeId::kString>(const Literal& literal) { + const auto& str = std::get<std::string>(literal.value()); + return BucketUtils::HashBytes( + std::span<const uint8_t>(reinterpret_cast<const uint8_t*>(str.data()), str.size())); +} + +template <> +int32_t HashLiteral<TypeId::kUuid>(const Literal& literal) { + const auto& uuid = std::get<Uuid>(literal.value()); + return BucketUtils::HashBytes(uuid.bytes()); +} + +template <> +int32_t HashLiteral<TypeId::kBinary>(const Literal& literal) { + const auto& binary = std::get<std::vector<uint8_t>>(literal.value()); + return BucketUtils::HashBytes(binary); +} + +template <> +int32_t HashLiteral<TypeId::kFixed>(const Literal& literal) { + const auto& fixed = std::get<std::vector<uint8_t>>(literal.value()); + return BucketUtils::HashBytes(fixed); +} + +} // namespace + +int32_t BucketUtils::HashBytes(std::span<const uint8_t> bytes) { + int32_t hash_value = 0; + MurmurHash3_x86_32(bytes.data(), bytes.size(), 0, &hash_value); + return hash_value; +} + +int32_t BucketUtils::HashLong(int64_t value) { + int32_t hash_value = 0; + value = ToLittleEndian(value); + MurmurHash3_x86_32(&value, sizeof(int64_t), 0, &hash_value); + return hash_value; +} + +#define DISPATCH_HASH_LITERAL(TYPE_ID) \ + case TYPE_ID: \ + hash_value = HashLiteral<TYPE_ID>(literal); \ + break; + +Result<Literal> BucketUtils::BucketIndex(const Literal& literal, int32_t num_buckets) { + ICEBERG_DCHECK(num_buckets > 0, "Number of buckets must be positive"); Review Comment: I'd prefer this to be an error instead of an assert. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
