wgtmac commented on code in PR #185: URL: https://github.com/apache/iceberg-cpp/pull/185#discussion_r2312908195
########## src/iceberg/util/conversions.h: ########## @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <span> +#include <vector> + +#include "iceberg/expression/literal.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { Review Comment: ```suggestion namespace iceberg { ``` ########## src/iceberg/CMakeLists.txt: ########## @@ -50,7 +50,8 @@ set(ICEBERG_SOURCES arrow_c_data_guard_internal.cc util/murmurhash3_internal.cc util/timepoint.cc - util/gzip_internal.cc) + util/gzip_internal.cc + util/conversions.cc) Review Comment: Please sort them alphabetically. ########## src/iceberg/util/conversions.cc: ########## @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/conversions.h" + +#include <array> +#include <cstring> +#include <span> +#include <string> + +#include "iceberg/util/endian.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +/// \brief Write a value in little-endian format to the buffer. +template <EndianConvertible T> +void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) { + value = ToLittleEndian(value); + const auto* bytes = reinterpret_cast<const uint8_t*>(&value); + buffer.insert(buffer.end(), bytes, bytes + sizeof(T)); +} + +/// \brief Read a value in little-endian format from the data. +template <EndianConvertible T> +Result<T> ReadLittleEndian(std::span<const uint8_t> data) { + if (data.size() < sizeof(T)) [[unlikely]] { + return InvalidArgument("Insufficient data to read {} bytes, got {}", sizeof(T), + data.size()); + } + + T value; + std::memcpy(&value, data.data(), sizeof(T)); + return FromLittleEndian(value); +} + +Result<std::vector<uint8_t>> Conversions::ToBytes(const PrimitiveType& type, + const Literal::Value& value) { + std::vector<uint8_t> result; + const auto type_id = type.type_id(); + + switch (type_id) { + case TypeId::kBoolean: { + // 0x00 for false, 0x01 for true + result.push_back(std::get<bool>(value) ? 0x01 : 0x00); + return result; + } + + case TypeId::kInt: { + // Stored as 4-byte little-endian + WriteLittleEndian(result, std::get<int32_t>(value)); + return result; + } + + case TypeId::kDate: { + // Stores days from 1970-01-01 in a 4-byte little-endian int + WriteLittleEndian(result, std::get<int32_t>(value)); + return result; + } + + case TypeId::kLong: { + // Stored as 8-byte little-endian + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTime: { + // Stores microseconds from midnight in an 8-byte little-endian long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTimestamp: { + // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian + // long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTimestampTz: { + // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte + // little-endian long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kFloat: { + // Stored as 4-byte little-endian + WriteLittleEndian(result, std::get<float>(value)); + return result; + } + + case TypeId::kDouble: { + // Stored as 8-byte little-endian + WriteLittleEndian(result, std::get<double>(value)); + return result; + } + + case TypeId::kString: { + // UTF-8 bytes (without length) + const auto& str = std::get<std::string>(value); + result.insert(result.end(), str.begin(), str.end()); + return result; + } + + case TypeId::kBinary: { + // Binary value (without length) + const auto& binary_data = std::get<std::vector<uint8_t>>(value); + result.insert(result.end(), binary_data.begin(), binary_data.end()); + return result; + } + + case TypeId::kFixed: { + // Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or + // std::vector<uint8_t> + if (std::holds_alternative<std::array<uint8_t, 16>>(value)) { + const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value); + result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end()); + } else if (std::holds_alternative<std::vector<uint8_t>>(value)) { + result = std::get<std::vector<uint8_t>>(value); + } else { + std::string actual_type = std::visit( + [](auto&& arg) -> std::string { return typeid(arg).name(); }, value); + + return InvalidArgument("Invalid value type for Fixed literal, got type: {}", + actual_type); + } + return result; + } + // TODO(Li Feiyang): Add support for UUID and Decimal + + default: + return NotSupported("Serialization for type {} is not supported", type.ToString()); + } +} + +Result<std::vector<uint8_t>> Conversions::ToBytes(const Literal& literal) { + // Cannot serialize special values + if (literal.IsAboveMax()) { + return NotSupported("Cannot serialize AboveMax"); + } + if (literal.IsBelowMin()) { + return NotSupported("Cannot serialize BelowMin"); + } + if (literal.IsNull()) { + return NotSupported("Cannot serialize null"); + } + + return ToBytes(*literal.type(), literal.value()); +} + +Result<Literal::Value> Conversions::FromBytes(const PrimitiveType& type, + std::span<const uint8_t> data) { + // Empty data represents null value + if (data.empty()) { + return Literal::Value{std::monostate{}}; + } + + const auto type_id = type.type_id(); + + switch (type_id) { + case TypeId::kBoolean: { + if (data.size() != 1) { + return InvalidArgument("Boolean requires 1 byte, got {}", data.size()); + } + ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<uint8_t>(data)); + // 0x00 for false, non-zero byte for true + return Literal::Value{static_cast<bool>(value != 0x00)}; + } + + case TypeId::kInt: { + if (data.size() != sizeof(int32_t)) { + return InvalidArgument("Int requires {} bytes, got {}", sizeof(int32_t), + data.size()); + } + ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data)); + return Literal::Value{value}; + } + + case TypeId::kDate: { + if (data.size() != sizeof(int32_t)) { + return InvalidArgument("Date requires {} bytes, got {}", sizeof(int32_t), + data.size()); + } + ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data)); + return Literal::Value{value}; + } + + case TypeId::kLong: + case TypeId::kTime: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + int64_t value; + if (data.size() == 8) { + // Standard 8-byte long + ICEBERG_ASSIGN_OR_RAISE(auto long_value, ReadLittleEndian<int64_t>(data)); + value = long_value; + } else if (data.size() == 4) { + // Type was promoted from int to long + ICEBERG_ASSIGN_OR_RAISE(auto int_value, ReadLittleEndian<int32_t>(data)); + value = static_cast<int64_t>(int_value); + } else { + auto type_name_view = ToString(type_id); + std::string type_name{type_name_view}; + if (!type_name.empty()) { + type_name[0] = static_cast<char>(std::toupper(type_name[0])); + } + return InvalidArgument("{} requires 4 or 8 bytes, got {}", type_name, + data.size()); + } + + return Literal::Value{value}; + } + + case TypeId::kFloat: { + if (data.size() != sizeof(float)) { + return InvalidArgument("Float requires {} bytes, got {}", sizeof(float), + data.size()); + } + ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<float>(data)); + return Literal::Value{value}; + } + + case TypeId::kDouble: { + if (data.size() == 8) { + // Standard 8-byte double + ICEBERG_ASSIGN_OR_RAISE(auto double_value, ReadLittleEndian<double>(data)); + return Literal::Value{double_value}; + } else if (data.size() == 4) { + // Type was promoted from float to double + ICEBERG_ASSIGN_OR_RAISE(auto float_value, ReadLittleEndian<float>(data)); + return Literal::Value{static_cast<double>(float_value)}; + } else { + return InvalidArgument("Double requires 4 or 8 bytes, got {}", data.size()); + } + } + + case TypeId::kString: { + return Literal::Value{ + std::string(reinterpret_cast<const char*>(data.data()), data.size())}; + } + + case TypeId::kBinary: { + return Literal::Value{std::vector<uint8_t>(data.begin(), data.end())}; + } + + case TypeId::kFixed: { + if (data.size() == 16) { + std::array<uint8_t, 16> fixed_bytes; + std::ranges::copy(data, fixed_bytes.begin()); + return Literal::Value{fixed_bytes}; + } else { + return Literal::Value{std::vector<uint8_t>(data.begin(), data.end())}; + } + } + // TODO(Li Feiyang): Add support for UUID and Decimal + + default: + return NotSupported("Deserialization for type {} is not supported", + type.ToString()); + } +} + +Result<Literal> Conversions::FromBytes(std::shared_ptr<PrimitiveType> type, + std::span<const uint8_t> data) { + if (!type) { + return InvalidArgument("Type cannot be null"); + } + + ICEBERG_ASSIGN_OR_RAISE(auto value, FromBytes(*type, data)); + + // If we got a null value (monostate), create a null Literal + if (std::holds_alternative<std::monostate>(value)) { + return Literal::Null(type); + } + + return Literal(value, type); Review Comment: Use `std::move`? ########## src/iceberg/util/conversions.cc: ########## @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/conversions.h" + +#include <array> +#include <cstring> +#include <span> +#include <string> + +#include "iceberg/util/endian.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +/// \brief Write a value in little-endian format to the buffer. +template <EndianConvertible T> +void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) { + value = ToLittleEndian(value); + const auto* bytes = reinterpret_cast<const uint8_t*>(&value); + buffer.insert(buffer.end(), bytes, bytes + sizeof(T)); +} + +/// \brief Read a value in little-endian format from the data. +template <EndianConvertible T> +Result<T> ReadLittleEndian(std::span<const uint8_t> data) { + if (data.size() < sizeof(T)) [[unlikely]] { + return InvalidArgument("Insufficient data to read {} bytes, got {}", sizeof(T), + data.size()); + } + + T value; + std::memcpy(&value, data.data(), sizeof(T)); + return FromLittleEndian(value); +} + +Result<std::vector<uint8_t>> Conversions::ToBytes(const PrimitiveType& type, + const Literal::Value& value) { + std::vector<uint8_t> result; + const auto type_id = type.type_id(); + + switch (type_id) { + case TypeId::kBoolean: { + // 0x00 for false, 0x01 for true + result.push_back(std::get<bool>(value) ? 0x01 : 0x00); + return result; + } + + case TypeId::kInt: { + // Stored as 4-byte little-endian + WriteLittleEndian(result, std::get<int32_t>(value)); + return result; + } + + case TypeId::kDate: { + // Stores days from 1970-01-01 in a 4-byte little-endian int + WriteLittleEndian(result, std::get<int32_t>(value)); + return result; + } + + case TypeId::kLong: { + // Stored as 8-byte little-endian + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTime: { + // Stores microseconds from midnight in an 8-byte little-endian long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTimestamp: { + // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian + // long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTimestampTz: { + // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte + // little-endian long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kFloat: { + // Stored as 4-byte little-endian + WriteLittleEndian(result, std::get<float>(value)); + return result; + } + + case TypeId::kDouble: { + // Stored as 8-byte little-endian + WriteLittleEndian(result, std::get<double>(value)); + return result; + } + + case TypeId::kString: { + // UTF-8 bytes (without length) + const auto& str = std::get<std::string>(value); + result.insert(result.end(), str.begin(), str.end()); + return result; + } + + case TypeId::kBinary: { + // Binary value (without length) + const auto& binary_data = std::get<std::vector<uint8_t>>(value); + result.insert(result.end(), binary_data.begin(), binary_data.end()); + return result; + } + + case TypeId::kFixed: { + // Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or + // std::vector<uint8_t> + if (std::holds_alternative<std::array<uint8_t, 16>>(value)) { + const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value); + result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end()); + } else if (std::holds_alternative<std::vector<uint8_t>>(value)) { + result = std::get<std::vector<uint8_t>>(value); + } else { + std::string actual_type = std::visit( + [](auto&& arg) -> std::string { return typeid(arg).name(); }, value); + + return InvalidArgument("Invalid value type for Fixed literal, got type: {}", + actual_type); + } + return result; + } + // TODO(Li Feiyang): Add support for UUID and Decimal + + default: + return NotSupported("Serialization for type {} is not supported", type.ToString()); + } +} + +Result<std::vector<uint8_t>> Conversions::ToBytes(const Literal& literal) { + // Cannot serialize special values + if (literal.IsAboveMax()) { + return NotSupported("Cannot serialize AboveMax"); + } + if (literal.IsBelowMin()) { + return NotSupported("Cannot serialize BelowMin"); + } + if (literal.IsNull()) { + return NotSupported("Cannot serialize null"); + } + + return ToBytes(*literal.type(), literal.value()); +} + +Result<Literal::Value> Conversions::FromBytes(const PrimitiveType& type, + std::span<const uint8_t> data) { + // Empty data represents null value Review Comment: Are you sure about this? Should we return error instead? ########## src/iceberg/expression/literal.cc: ########## @@ -151,11 +152,11 @@ Literal Literal::Binary(std::vector<uint8_t> value) { Result<Literal> Literal::Deserialize(std::span<const uint8_t> data, std::shared_ptr<PrimitiveType> type) { - return NotImplemented("Deserialization of Literal is not implemented yet"); + return Conversions::FromBytes(std::move(type), data); Review Comment: Do we actually need to add `std::move` on `std::span`? ########## src/iceberg/util/conversions.h: ########## @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <span> +#include <vector> + +#include "iceberg/expression/literal.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { +class ICEBERG_EXPORT Conversions { + public: + static Result<std::vector<uint8_t>> ToBytes(const PrimitiveType& type, + const Literal::Value& value); + + static Result<std::vector<uint8_t>> ToBytes(const Literal& literal); + + static Result<Literal::Value> FromBytes(const PrimitiveType& type, + std::span<const uint8_t> data); + + static Result<Literal> FromBytes(std::shared_ptr<PrimitiveType> type, + std::span<const uint8_t> data); +}; Review Comment: ```suggestion }; ``` ########## src/iceberg/util/conversions.h: ########## @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <span> +#include <vector> + +#include "iceberg/expression/literal.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { +class ICEBERG_EXPORT Conversions { Review Comment: Please add comment to public classes and functions. ########## src/iceberg/util/conversions.cc: ########## @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/conversions.h" + +#include <array> +#include <cstring> +#include <span> +#include <string> + +#include "iceberg/util/endian.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +/// \brief Write a value in little-endian format to the buffer. +template <EndianConvertible T> +void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) { Review Comment: ```suggestion void WriteLittleEndian(T value, std::vector<uint8_t>* buffer) { ``` Usually the output parameter is placed at last and use pointer instead of reference type. ########## src/iceberg/util/conversions.cc: ########## @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/conversions.h" + +#include <array> +#include <cstring> +#include <span> +#include <string> + +#include "iceberg/util/endian.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +/// \brief Write a value in little-endian format to the buffer. +template <EndianConvertible T> +void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) { + value = ToLittleEndian(value); + const auto* bytes = reinterpret_cast<const uint8_t*>(&value); + buffer.insert(buffer.end(), bytes, bytes + sizeof(T)); +} + +/// \brief Read a value in little-endian format from the data. +template <EndianConvertible T> +Result<T> ReadLittleEndian(std::span<const uint8_t> data) { + if (data.size() < sizeof(T)) [[unlikely]] { + return InvalidArgument("Insufficient data to read {} bytes, got {}", sizeof(T), + data.size()); + } + + T value; + std::memcpy(&value, data.data(), sizeof(T)); + return FromLittleEndian(value); +} + +Result<std::vector<uint8_t>> Conversions::ToBytes(const PrimitiveType& type, + const Literal::Value& value) { + std::vector<uint8_t> result; + const auto type_id = type.type_id(); + + switch (type_id) { + case TypeId::kBoolean: { + // 0x00 for false, 0x01 for true + result.push_back(std::get<bool>(value) ? 0x01 : 0x00); + return result; + } + + case TypeId::kInt: { + // Stored as 4-byte little-endian + WriteLittleEndian(result, std::get<int32_t>(value)); + return result; + } + + case TypeId::kDate: { + // Stores days from 1970-01-01 in a 4-byte little-endian int + WriteLittleEndian(result, std::get<int32_t>(value)); + return result; + } + + case TypeId::kLong: { + // Stored as 8-byte little-endian + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTime: { + // Stores microseconds from midnight in an 8-byte little-endian long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTimestamp: { + // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian + // long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTimestampTz: { + // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte + // little-endian long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kFloat: { + // Stored as 4-byte little-endian + WriteLittleEndian(result, std::get<float>(value)); + return result; + } + + case TypeId::kDouble: { + // Stored as 8-byte little-endian + WriteLittleEndian(result, std::get<double>(value)); + return result; + } + + case TypeId::kString: { + // UTF-8 bytes (without length) + const auto& str = std::get<std::string>(value); + result.insert(result.end(), str.begin(), str.end()); + return result; + } + + case TypeId::kBinary: { + // Binary value (without length) + const auto& binary_data = std::get<std::vector<uint8_t>>(value); + result.insert(result.end(), binary_data.begin(), binary_data.end()); + return result; + } + + case TypeId::kFixed: { + // Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or + // std::vector<uint8_t> + if (std::holds_alternative<std::array<uint8_t, 16>>(value)) { + const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value); + result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end()); + } else if (std::holds_alternative<std::vector<uint8_t>>(value)) { + result = std::get<std::vector<uint8_t>>(value); + } else { + std::string actual_type = std::visit( + [](auto&& arg) -> std::string { return typeid(arg).name(); }, value); + + return InvalidArgument("Invalid value type for Fixed literal, got type: {}", + actual_type); + } + return result; + } + // TODO(Li Feiyang): Add support for UUID and Decimal + + default: + return NotSupported("Serialization for type {} is not supported", type.ToString()); + } +} + +Result<std::vector<uint8_t>> Conversions::ToBytes(const Literal& literal) { + // Cannot serialize special values + if (literal.IsAboveMax()) { + return NotSupported("Cannot serialize AboveMax"); + } + if (literal.IsBelowMin()) { + return NotSupported("Cannot serialize BelowMin"); + } + if (literal.IsNull()) { + return NotSupported("Cannot serialize null"); + } + + return ToBytes(*literal.type(), literal.value()); +} + +Result<Literal::Value> Conversions::FromBytes(const PrimitiveType& type, + std::span<const uint8_t> data) { + // Empty data represents null value + if (data.empty()) { + return Literal::Value{std::monostate{}}; + } + + const auto type_id = type.type_id(); + + switch (type_id) { + case TypeId::kBoolean: { Review Comment: Can we use template FromBytes like below to make the switch case more compact? ``` template <TypeId type_id> Result<Literal::Value> FromBytes(std::span<const uint8_t> data) ``` ########## test/literal_test.cc: ########## @@ -383,4 +383,142 @@ TEST(LiteralTest, DoubleZeroComparison) { EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less); } +void CheckBinaryRoundTrip(const std::vector<uint8_t>& input_bytes, + const Literal& expected_literal, + std::shared_ptr<PrimitiveType> type) { + // Deserialize from bytes + auto literal_result = Literal::Deserialize(input_bytes, type); + ASSERT_TRUE(literal_result.has_value()); + + // Check type and value are correct + EXPECT_EQ(literal_result->type()->type_id(), expected_literal.type()->type_id()); + EXPECT_EQ(literal_result->ToString(), expected_literal.ToString()); + + // Serialize back to bytes + auto bytes_result = literal_result->Serialize(); + ASSERT_TRUE(bytes_result.has_value()); + EXPECT_EQ(*bytes_result, input_bytes); + + // Deserialize again to verify + auto final_literal = Literal::Deserialize(*bytes_result, type); + ASSERT_TRUE(final_literal.has_value()); + EXPECT_EQ(final_literal->type()->type_id(), expected_literal.type()->type_id()); + EXPECT_EQ(final_literal->ToString(), expected_literal.ToString()); +} + +// binary serialization tests +TEST(LiteralSerializationTest, BinaryBoolean) { Review Comment: Can you please use parameterized test pattern to reuse a single test case for different types? ########## test/manifest_list_reader_test.cc: ########## @@ -20,9 +20,10 @@ #include <arrow/filesystem/localfs.h> #include <avro/GenericDatum.hh> #include <gtest/gtest.h> +#include <iceberg/avro/avro_register.h> Review Comment: ```suggestion ``` ########## src/iceberg/util/conversions.cc: ########## @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/conversions.h" + +#include <array> +#include <cstring> +#include <span> +#include <string> + +#include "iceberg/util/endian.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +/// \brief Write a value in little-endian format to the buffer. +template <EndianConvertible T> +void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) { + value = ToLittleEndian(value); + const auto* bytes = reinterpret_cast<const uint8_t*>(&value); + buffer.insert(buffer.end(), bytes, bytes + sizeof(T)); +} + +/// \brief Read a value in little-endian format from the data. +template <EndianConvertible T> +Result<T> ReadLittleEndian(std::span<const uint8_t> data) { + if (data.size() < sizeof(T)) [[unlikely]] { + return InvalidArgument("Insufficient data to read {} bytes, got {}", sizeof(T), + data.size()); + } + + T value; + std::memcpy(&value, data.data(), sizeof(T)); + return FromLittleEndian(value); +} + +Result<std::vector<uint8_t>> Conversions::ToBytes(const PrimitiveType& type, + const Literal::Value& value) { + std::vector<uint8_t> result; + const auto type_id = type.type_id(); + + switch (type_id) { + case TypeId::kBoolean: { + // 0x00 for false, 0x01 for true + result.push_back(std::get<bool>(value) ? 0x01 : 0x00); + return result; + } + + case TypeId::kInt: { + // Stored as 4-byte little-endian + WriteLittleEndian(result, std::get<int32_t>(value)); + return result; + } + + case TypeId::kDate: { + // Stores days from 1970-01-01 in a 4-byte little-endian int + WriteLittleEndian(result, std::get<int32_t>(value)); + return result; + } + + case TypeId::kLong: { + // Stored as 8-byte little-endian + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTime: { + // Stores microseconds from midnight in an 8-byte little-endian long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTimestamp: { + // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian + // long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTimestampTz: { + // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte + // little-endian long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kFloat: { + // Stored as 4-byte little-endian + WriteLittleEndian(result, std::get<float>(value)); + return result; + } + + case TypeId::kDouble: { + // Stored as 8-byte little-endian + WriteLittleEndian(result, std::get<double>(value)); + return result; + } + + case TypeId::kString: { + // UTF-8 bytes (without length) + const auto& str = std::get<std::string>(value); + result.insert(result.end(), str.begin(), str.end()); + return result; + } + + case TypeId::kBinary: { + // Binary value (without length) + const auto& binary_data = std::get<std::vector<uint8_t>>(value); + result.insert(result.end(), binary_data.begin(), binary_data.end()); + return result; + } + + case TypeId::kFixed: { + // Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or + // std::vector<uint8_t> + if (std::holds_alternative<std::array<uint8_t, 16>>(value)) { + const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value); + result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end()); + } else if (std::holds_alternative<std::vector<uint8_t>>(value)) { + result = std::get<std::vector<uint8_t>>(value); + } else { + std::string actual_type = std::visit( + [](auto&& arg) -> std::string { return typeid(arg).name(); }, value); + + return InvalidArgument("Invalid value type for Fixed literal, got type: {}", + actual_type); + } + return result; + } + // TODO(Li Feiyang): Add support for UUID and Decimal + + default: + return NotSupported("Serialization for type {} is not supported", type.ToString()); + } +} + +Result<std::vector<uint8_t>> Conversions::ToBytes(const Literal& literal) { + // Cannot serialize special values + if (literal.IsAboveMax()) { + return NotSupported("Cannot serialize AboveMax"); + } + if (literal.IsBelowMin()) { + return NotSupported("Cannot serialize BelowMin"); + } + if (literal.IsNull()) { + return NotSupported("Cannot serialize null"); + } + + return ToBytes(*literal.type(), literal.value()); +} + +Result<Literal::Value> Conversions::FromBytes(const PrimitiveType& type, + std::span<const uint8_t> data) { + // Empty data represents null value + if (data.empty()) { + return Literal::Value{std::monostate{}}; + } + + const auto type_id = type.type_id(); + + switch (type_id) { + case TypeId::kBoolean: { + if (data.size() != 1) { + return InvalidArgument("Boolean requires 1 byte, got {}", data.size()); + } + ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<uint8_t>(data)); + // 0x00 for false, non-zero byte for true + return Literal::Value{static_cast<bool>(value != 0x00)}; + } + + case TypeId::kInt: { + if (data.size() != sizeof(int32_t)) { + return InvalidArgument("Int requires {} bytes, got {}", sizeof(int32_t), + data.size()); + } + ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data)); + return Literal::Value{value}; + } + + case TypeId::kDate: { + if (data.size() != sizeof(int32_t)) { + return InvalidArgument("Date requires {} bytes, got {}", sizeof(int32_t), + data.size()); + } + ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data)); + return Literal::Value{value}; + } + + case TypeId::kLong: + case TypeId::kTime: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + int64_t value; + if (data.size() == 8) { + // Standard 8-byte long + ICEBERG_ASSIGN_OR_RAISE(auto long_value, ReadLittleEndian<int64_t>(data)); + value = long_value; + } else if (data.size() == 4) { Review Comment: Isn't this branch applied to `case TypeId::kLong` only? ########## src/iceberg/util/conversions.cc: ########## @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/conversions.h" + +#include <array> +#include <cstring> +#include <span> +#include <string> + +#include "iceberg/util/endian.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +/// \brief Write a value in little-endian format to the buffer. +template <EndianConvertible T> +void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) { + value = ToLittleEndian(value); + const auto* bytes = reinterpret_cast<const uint8_t*>(&value); + buffer.insert(buffer.end(), bytes, bytes + sizeof(T)); +} + +/// \brief Read a value in little-endian format from the data. +template <EndianConvertible T> +Result<T> ReadLittleEndian(std::span<const uint8_t> data) { + if (data.size() < sizeof(T)) [[unlikely]] { + return InvalidArgument("Insufficient data to read {} bytes, got {}", sizeof(T), + data.size()); + } + + T value; + std::memcpy(&value, data.data(), sizeof(T)); + return FromLittleEndian(value); +} + +Result<std::vector<uint8_t>> Conversions::ToBytes(const PrimitiveType& type, + const Literal::Value& value) { + std::vector<uint8_t> result; + const auto type_id = type.type_id(); + + switch (type_id) { + case TypeId::kBoolean: { + // 0x00 for false, 0x01 for true Review Comment: Is it too verbose to add these comments? ########## src/iceberg/expression/literal.h: ########## @@ -144,8 +144,8 @@ class ICEBERG_EXPORT Literal { Literal(Value value, std::shared_ptr<PrimitiveType> type); friend class LiteralCaster; + friend class Conversions; Review Comment: Sort them alphabetically? ########## src/iceberg/util/conversions.cc: ########## @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/conversions.h" + +#include <array> +#include <cstring> +#include <span> +#include <string> + +#include "iceberg/util/endian.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +/// \brief Write a value in little-endian format to the buffer. +template <EndianConvertible T> +void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) { + value = ToLittleEndian(value); + const auto* bytes = reinterpret_cast<const uint8_t*>(&value); + buffer.insert(buffer.end(), bytes, bytes + sizeof(T)); +} + +/// \brief Read a value in little-endian format from the data. +template <EndianConvertible T> +Result<T> ReadLittleEndian(std::span<const uint8_t> data) { + if (data.size() < sizeof(T)) [[unlikely]] { + return InvalidArgument("Insufficient data to read {} bytes, got {}", sizeof(T), + data.size()); + } + + T value; + std::memcpy(&value, data.data(), sizeof(T)); + return FromLittleEndian(value); +} + +Result<std::vector<uint8_t>> Conversions::ToBytes(const PrimitiveType& type, + const Literal::Value& value) { + std::vector<uint8_t> result; + const auto type_id = type.type_id(); + + switch (type_id) { + case TypeId::kBoolean: { + // 0x00 for false, 0x01 for true + result.push_back(std::get<bool>(value) ? 0x01 : 0x00); + return result; + } + + case TypeId::kInt: { + // Stored as 4-byte little-endian + WriteLittleEndian(result, std::get<int32_t>(value)); + return result; + } + + case TypeId::kDate: { + // Stores days from 1970-01-01 in a 4-byte little-endian int + WriteLittleEndian(result, std::get<int32_t>(value)); + return result; + } + + case TypeId::kLong: { + // Stored as 8-byte little-endian + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTime: { + // Stores microseconds from midnight in an 8-byte little-endian long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTimestamp: { + // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian + // long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kTimestampTz: { + // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte + // little-endian long + WriteLittleEndian(result, std::get<int64_t>(value)); + return result; + } + + case TypeId::kFloat: { + // Stored as 4-byte little-endian + WriteLittleEndian(result, std::get<float>(value)); + return result; + } + + case TypeId::kDouble: { + // Stored as 8-byte little-endian + WriteLittleEndian(result, std::get<double>(value)); + return result; + } + + case TypeId::kString: { + // UTF-8 bytes (without length) + const auto& str = std::get<std::string>(value); + result.insert(result.end(), str.begin(), str.end()); + return result; + } + + case TypeId::kBinary: { + // Binary value (without length) + const auto& binary_data = std::get<std::vector<uint8_t>>(value); + result.insert(result.end(), binary_data.begin(), binary_data.end()); + return result; + } + + case TypeId::kFixed: { + // Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or + // std::vector<uint8_t> + if (std::holds_alternative<std::array<uint8_t, 16>>(value)) { + const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value); + result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end()); + } else if (std::holds_alternative<std::vector<uint8_t>>(value)) { + result = std::get<std::vector<uint8_t>>(value); + } else { + std::string actual_type = std::visit( + [](auto&& arg) -> std::string { return typeid(arg).name(); }, value); + + return InvalidArgument("Invalid value type for Fixed literal, got type: {}", + actual_type); + } + return result; + } + // TODO(Li Feiyang): Add support for UUID and Decimal + + default: + return NotSupported("Serialization for type {} is not supported", type.ToString()); + } +} + +Result<std::vector<uint8_t>> Conversions::ToBytes(const Literal& literal) { + // Cannot serialize special values + if (literal.IsAboveMax()) { + return NotSupported("Cannot serialize AboveMax"); + } + if (literal.IsBelowMin()) { + return NotSupported("Cannot serialize BelowMin"); + } + if (literal.IsNull()) { + return NotSupported("Cannot serialize null"); + } + + return ToBytes(*literal.type(), literal.value()); +} + +Result<Literal::Value> Conversions::FromBytes(const PrimitiveType& type, + std::span<const uint8_t> data) { + // Empty data represents null value + if (data.empty()) { + return Literal::Value{std::monostate{}}; + } + + const auto type_id = type.type_id(); + + switch (type_id) { + case TypeId::kBoolean: { + if (data.size() != 1) { + return InvalidArgument("Boolean requires 1 byte, got {}", data.size()); + } + ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<uint8_t>(data)); + // 0x00 for false, non-zero byte for true + return Literal::Value{static_cast<bool>(value != 0x00)}; + } + + case TypeId::kInt: { + if (data.size() != sizeof(int32_t)) { + return InvalidArgument("Int requires {} bytes, got {}", sizeof(int32_t), + data.size()); + } + ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data)); + return Literal::Value{value}; + } + + case TypeId::kDate: { + if (data.size() != sizeof(int32_t)) { + return InvalidArgument("Date requires {} bytes, got {}", sizeof(int32_t), + data.size()); + } + ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data)); + return Literal::Value{value}; + } + + case TypeId::kLong: + case TypeId::kTime: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: { + int64_t value; + if (data.size() == 8) { + // Standard 8-byte long + ICEBERG_ASSIGN_OR_RAISE(auto long_value, ReadLittleEndian<int64_t>(data)); + value = long_value; + } else if (data.size() == 4) { + // Type was promoted from int to long + ICEBERG_ASSIGN_OR_RAISE(auto int_value, ReadLittleEndian<int32_t>(data)); + value = static_cast<int64_t>(int_value); + } else { + auto type_name_view = ToString(type_id); + std::string type_name{type_name_view}; + if (!type_name.empty()) { + type_name[0] = static_cast<char>(std::toupper(type_name[0])); + } + return InvalidArgument("{} requires 4 or 8 bytes, got {}", type_name, Review Comment: Shouldn't we use `ToString(type_id)` in the error message directly? ########## test/literal_test.cc: ########## @@ -383,4 +383,142 @@ TEST(LiteralTest, DoubleZeroComparison) { EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less); } +void CheckBinaryRoundTrip(const std::vector<uint8_t>& input_bytes, + const Literal& expected_literal, + std::shared_ptr<PrimitiveType> type) { + // Deserialize from bytes + auto literal_result = Literal::Deserialize(input_bytes, type); + ASSERT_TRUE(literal_result.has_value()); + + // Check type and value are correct + EXPECT_EQ(literal_result->type()->type_id(), expected_literal.type()->type_id()); + EXPECT_EQ(literal_result->ToString(), expected_literal.ToString()); + + // Serialize back to bytes + auto bytes_result = literal_result->Serialize(); + ASSERT_TRUE(bytes_result.has_value()); + EXPECT_EQ(*bytes_result, input_bytes); + + // Deserialize again to verify + auto final_literal = Literal::Deserialize(*bytes_result, type); + ASSERT_TRUE(final_literal.has_value()); + EXPECT_EQ(final_literal->type()->type_id(), expected_literal.type()->type_id()); + EXPECT_EQ(final_literal->ToString(), expected_literal.ToString()); +} + +// binary serialization tests +TEST(LiteralSerializationTest, BinaryBoolean) { + CheckBinaryRoundTrip({1}, Literal::Boolean(true), boolean()); + CheckBinaryRoundTrip({0}, Literal::Boolean(false), boolean()); +} + +TEST(LiteralSerializationTest, BinaryInt) { + CheckBinaryRoundTrip({32, 0, 0, 0}, Literal::Int(32), int32()); +} + +TEST(LiteralSerializationTest, BinaryLong) { + CheckBinaryRoundTrip({32, 0, 0, 0, 0, 0, 0, 0}, Literal::Long(32), int64()); +} + +TEST(LiteralSerializationTest, BinaryFloat) { + CheckBinaryRoundTrip({0, 0, 128, 63}, Literal::Float(1.0f), float32()); +} + +TEST(LiteralSerializationTest, BinaryDouble) { + CheckBinaryRoundTrip({0, 0, 0, 0, 0, 0, 240, 63}, Literal::Double(1.0), float64()); +} + +TEST(LiteralSerializationTest, BinaryString) { + CheckBinaryRoundTrip({105, 99, 101, 98, 101, 114, 103}, Literal::String("iceberg"), + string()); +} + +TEST(LiteralSerializationTest, BinaryData) { + std::vector<uint8_t> data = {0x01, 0x02, 0x03, 0xFF}; + CheckBinaryRoundTrip(data, Literal::Binary(data), binary()); +} + +// Type promotion tests +TEST(LiteralSerializationTest, TypePromotion) { + // 4-byte int data can be deserialized as long + std::vector<uint8_t> int_data = {32, 0, 0, 0}; + auto long_result = Literal::Deserialize(int_data, int64()); + ASSERT_TRUE(long_result.has_value()); + EXPECT_EQ(long_result->type()->type_id(), TypeId::kLong); + EXPECT_EQ(long_result->ToString(), "32"); + + auto long_bytes = long_result->Serialize(); + ASSERT_TRUE(long_bytes.has_value()); + EXPECT_EQ(long_bytes->size(), 8); + + // 4-byte float data can be deserialized as double + std::vector<uint8_t> float_data = {0, 0, 128, 63}; + auto double_result = Literal::Deserialize(float_data, float64()); + ASSERT_TRUE(double_result.has_value()); + EXPECT_EQ(double_result->type()->type_id(), TypeId::kDouble); + EXPECT_EQ(double_result->ToString(), "1.000000"); + + auto double_bytes = double_result->Serialize(); + ASSERT_TRUE(double_bytes.has_value()); + EXPECT_EQ(double_bytes->size(), 8); +} + +// Edge case serialization tests +TEST(LiteralSerializationTest, EdgeCases) { + // Negative integers + CheckBinaryRoundTrip({224, 255, 255, 255}, Literal::Int(-32), int32()); + CheckBinaryRoundTrip({224, 255, 255, 255, 255, 255, 255, 255}, Literal::Long(-32), + int64()); + + // Empty string special handling: empty string -> empty bytes -> null conversion + auto empty_string = Literal::String(""); + auto empty_bytes = empty_string.Serialize(); + ASSERT_TRUE(empty_bytes.has_value()); + EXPECT_TRUE(empty_bytes->empty()); + + // Empty bytes deserialize to null, not empty string + auto null_result = Literal::Deserialize(*empty_bytes, string()); + ASSERT_TRUE(null_result.has_value()); + EXPECT_TRUE(null_result->IsNull()); + + // Special floating point value serialization + auto nan_float = Literal::Float(std::numeric_limits<float>::quiet_NaN()); + auto nan_bytes = nan_float.Serialize(); + ASSERT_TRUE(nan_bytes.has_value()); + EXPECT_EQ(nan_bytes->size(), 4); + + auto inf_float = Literal::Float(std::numeric_limits<float>::infinity()); + auto inf_bytes = inf_float.Serialize(); + ASSERT_TRUE(inf_bytes.has_value()); + EXPECT_EQ(inf_bytes->size(), 4); +} + +// Error case serialization tests +TEST(LiteralSerializationTest, ErrorCases) { + // AboveMax/BelowMin values cannot be serialized + auto long_literal = + Literal::Long(static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1); + auto above_max_result = long_literal.CastTo(int32()); + ASSERT_TRUE(above_max_result.has_value()); + EXPECT_TRUE(above_max_result->IsAboveMax()); Review Comment: Can we directly create AboveMax? ########## src/iceberg/util/conversions.cc: ########## @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/conversions.h" + +#include <array> +#include <cstring> +#include <span> +#include <string> + +#include "iceberg/util/endian.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +/// \brief Write a value in little-endian format to the buffer. +template <EndianConvertible T> +void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) { Review Comment: BTW, why not `std::vector<uint8_t> WriteLittleEndian(T value)`? ########## test/literal_test.cc: ########## @@ -383,4 +383,142 @@ TEST(LiteralTest, DoubleZeroComparison) { EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less); } +void CheckBinaryRoundTrip(const std::vector<uint8_t>& input_bytes, + const Literal& expected_literal, + std::shared_ptr<PrimitiveType> type) { + // Deserialize from bytes + auto literal_result = Literal::Deserialize(input_bytes, type); + ASSERT_TRUE(literal_result.has_value()); + + // Check type and value are correct + EXPECT_EQ(literal_result->type()->type_id(), expected_literal.type()->type_id()); + EXPECT_EQ(literal_result->ToString(), expected_literal.ToString()); + + // Serialize back to bytes + auto bytes_result = literal_result->Serialize(); + ASSERT_TRUE(bytes_result.has_value()); + EXPECT_EQ(*bytes_result, input_bytes); + + // Deserialize again to verify + auto final_literal = Literal::Deserialize(*bytes_result, type); + ASSERT_TRUE(final_literal.has_value()); + EXPECT_EQ(final_literal->type()->type_id(), expected_literal.type()->type_id()); + EXPECT_EQ(final_literal->ToString(), expected_literal.ToString()); +} + +// binary serialization tests +TEST(LiteralSerializationTest, BinaryBoolean) { + CheckBinaryRoundTrip({1}, Literal::Boolean(true), boolean()); + CheckBinaryRoundTrip({0}, Literal::Boolean(false), boolean()); +} + +TEST(LiteralSerializationTest, BinaryInt) { + CheckBinaryRoundTrip({32, 0, 0, 0}, Literal::Int(32), int32()); +} + +TEST(LiteralSerializationTest, BinaryLong) { + CheckBinaryRoundTrip({32, 0, 0, 0, 0, 0, 0, 0}, Literal::Long(32), int64()); +} + +TEST(LiteralSerializationTest, BinaryFloat) { + CheckBinaryRoundTrip({0, 0, 128, 63}, Literal::Float(1.0f), float32()); +} + +TEST(LiteralSerializationTest, BinaryDouble) { + CheckBinaryRoundTrip({0, 0, 0, 0, 0, 0, 240, 63}, Literal::Double(1.0), float64()); +} + +TEST(LiteralSerializationTest, BinaryString) { + CheckBinaryRoundTrip({105, 99, 101, 98, 101, 114, 103}, Literal::String("iceberg"), + string()); +} + +TEST(LiteralSerializationTest, BinaryData) { + std::vector<uint8_t> data = {0x01, 0x02, 0x03, 0xFF}; + CheckBinaryRoundTrip(data, Literal::Binary(data), binary()); +} + +// Type promotion tests +TEST(LiteralSerializationTest, TypePromotion) { + // 4-byte int data can be deserialized as long + std::vector<uint8_t> int_data = {32, 0, 0, 0}; + auto long_result = Literal::Deserialize(int_data, int64()); + ASSERT_TRUE(long_result.has_value()); + EXPECT_EQ(long_result->type()->type_id(), TypeId::kLong); + EXPECT_EQ(long_result->ToString(), "32"); + + auto long_bytes = long_result->Serialize(); + ASSERT_TRUE(long_bytes.has_value()); + EXPECT_EQ(long_bytes->size(), 8); + + // 4-byte float data can be deserialized as double + std::vector<uint8_t> float_data = {0, 0, 128, 63}; + auto double_result = Literal::Deserialize(float_data, float64()); + ASSERT_TRUE(double_result.has_value()); + EXPECT_EQ(double_result->type()->type_id(), TypeId::kDouble); + EXPECT_EQ(double_result->ToString(), "1.000000"); + + auto double_bytes = double_result->Serialize(); + ASSERT_TRUE(double_bytes.has_value()); + EXPECT_EQ(double_bytes->size(), 8); +} + +// Edge case serialization tests +TEST(LiteralSerializationTest, EdgeCases) { Review Comment: This can also use parameterized test, right? ########## src/iceberg/util/conversions.cc: ########## @@ -0,0 +1,296 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/util/conversions.h" + +#include <array> +#include <cstring> +#include <span> +#include <string> + +#include "iceberg/util/endian.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +/// \brief Write a value in little-endian format to the buffer. +template <EndianConvertible T> +void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) { + value = ToLittleEndian(value); + const auto* bytes = reinterpret_cast<const uint8_t*>(&value); + buffer.insert(buffer.end(), bytes, bytes + sizeof(T)); +} + +/// \brief Read a value in little-endian format from the data. +template <EndianConvertible T> +Result<T> ReadLittleEndian(std::span<const uint8_t> data) { + if (data.size() < sizeof(T)) [[unlikely]] { + return InvalidArgument("Insufficient data to read {} bytes, got {}", sizeof(T), + data.size()); + } + + T value; + std::memcpy(&value, data.data(), sizeof(T)); + return FromLittleEndian(value); +} + +Result<std::vector<uint8_t>> Conversions::ToBytes(const PrimitiveType& type, + const Literal::Value& value) { + std::vector<uint8_t> result; + const auto type_id = type.type_id(); + + switch (type_id) { + case TypeId::kBoolean: { + // 0x00 for false, 0x01 for true Review Comment: Can we simply do the following? ```cpp // In literal.h or a new literal_traits.h template <TypeId type_id> struct LiteralTraits { using ValueType = void; }; template <> struct LiteralTraits<TypeId::kBoolean> { using ValueType = bool; }; template <> struct LiteralTraits<TypeId::kInt> { using ValueType = int32_t; }; template <> struct LiteralTraits<TypeId::kLong> { using ValueType = int64_t; }; // ... // In this file template <TypeId type_id> Result<std::vector<uint8_t>> ToBytes(const Literal::Value& value) { return WriteLittleEndian(std::get<LiteralTraits<type_id>>(value)); } #define DISPATCH_LITERAL_TO_BYTES(type_id) \ case type_id: \ return ToBytes<type_id>(value); Result<std::vector<uint8_t>> ToBytes(TypeId type_id, const Literal::Value& value) { switch (type_id) { DISPATCH_LITERAL_TO_BYTES(TypeId::kBoolean) DISPATCH_LITERAL_TO_BYTES(TypeId::kInt) DISPATCH_LITERAL_TO_BYTES(TypeId::kLong) DISPATCH_LITERAL_TO_BYTES(TypeId::kFloat) DISPATCH_LITERAL_TO_BYTES(TypeId::kDouble) DISPATCH_LITERAL_TO_BYTES(TypeId::kString) DISPATCH_LITERAL_TO_BYTES(TypeId::kBinary) // ... } } #undef DISPATCH_LITERAL_TO_BYTES ``` ########## test/literal_test.cc: ########## @@ -383,4 +383,142 @@ TEST(LiteralTest, DoubleZeroComparison) { EXPECT_EQ(neg_zero <=> pos_zero, std::partial_ordering::less); } +void CheckBinaryRoundTrip(const std::vector<uint8_t>& input_bytes, + const Literal& expected_literal, + std::shared_ptr<PrimitiveType> type) { + // Deserialize from bytes + auto literal_result = Literal::Deserialize(input_bytes, type); + ASSERT_TRUE(literal_result.has_value()); + + // Check type and value are correct + EXPECT_EQ(literal_result->type()->type_id(), expected_literal.type()->type_id()); + EXPECT_EQ(literal_result->ToString(), expected_literal.ToString()); + + // Serialize back to bytes + auto bytes_result = literal_result->Serialize(); + ASSERT_TRUE(bytes_result.has_value()); + EXPECT_EQ(*bytes_result, input_bytes); + + // Deserialize again to verify + auto final_literal = Literal::Deserialize(*bytes_result, type); + ASSERT_TRUE(final_literal.has_value()); + EXPECT_EQ(final_literal->type()->type_id(), expected_literal.type()->type_id()); + EXPECT_EQ(final_literal->ToString(), expected_literal.ToString()); +} + +// binary serialization tests +TEST(LiteralSerializationTest, BinaryBoolean) { + CheckBinaryRoundTrip({1}, Literal::Boolean(true), boolean()); + CheckBinaryRoundTrip({0}, Literal::Boolean(false), boolean()); +} + +TEST(LiteralSerializationTest, BinaryInt) { + CheckBinaryRoundTrip({32, 0, 0, 0}, Literal::Int(32), int32()); +} + +TEST(LiteralSerializationTest, BinaryLong) { + CheckBinaryRoundTrip({32, 0, 0, 0, 0, 0, 0, 0}, Literal::Long(32), int64()); +} + +TEST(LiteralSerializationTest, BinaryFloat) { + CheckBinaryRoundTrip({0, 0, 128, 63}, Literal::Float(1.0f), float32()); +} + +TEST(LiteralSerializationTest, BinaryDouble) { + CheckBinaryRoundTrip({0, 0, 0, 0, 0, 0, 240, 63}, Literal::Double(1.0), float64()); +} + +TEST(LiteralSerializationTest, BinaryString) { + CheckBinaryRoundTrip({105, 99, 101, 98, 101, 114, 103}, Literal::String("iceberg"), + string()); +} + +TEST(LiteralSerializationTest, BinaryData) { + std::vector<uint8_t> data = {0x01, 0x02, 0x03, 0xFF}; + CheckBinaryRoundTrip(data, Literal::Binary(data), binary()); +} + +// Type promotion tests +TEST(LiteralSerializationTest, TypePromotion) { + // 4-byte int data can be deserialized as long + std::vector<uint8_t> int_data = {32, 0, 0, 0}; + auto long_result = Literal::Deserialize(int_data, int64()); + ASSERT_TRUE(long_result.has_value()); + EXPECT_EQ(long_result->type()->type_id(), TypeId::kLong); + EXPECT_EQ(long_result->ToString(), "32"); + + auto long_bytes = long_result->Serialize(); + ASSERT_TRUE(long_bytes.has_value()); + EXPECT_EQ(long_bytes->size(), 8); + + // 4-byte float data can be deserialized as double + std::vector<uint8_t> float_data = {0, 0, 128, 63}; + auto double_result = Literal::Deserialize(float_data, float64()); + ASSERT_TRUE(double_result.has_value()); + EXPECT_EQ(double_result->type()->type_id(), TypeId::kDouble); + EXPECT_EQ(double_result->ToString(), "1.000000"); + + auto double_bytes = double_result->Serialize(); + ASSERT_TRUE(double_bytes.has_value()); + EXPECT_EQ(double_bytes->size(), 8); +} + +// Edge case serialization tests +TEST(LiteralSerializationTest, EdgeCases) { + // Negative integers + CheckBinaryRoundTrip({224, 255, 255, 255}, Literal::Int(-32), int32()); + CheckBinaryRoundTrip({224, 255, 255, 255, 255, 255, 255, 255}, Literal::Long(-32), + int64()); + + // Empty string special handling: empty string -> empty bytes -> null conversion + auto empty_string = Literal::String(""); + auto empty_bytes = empty_string.Serialize(); + ASSERT_TRUE(empty_bytes.has_value()); + EXPECT_TRUE(empty_bytes->empty()); + + // Empty bytes deserialize to null, not empty string + auto null_result = Literal::Deserialize(*empty_bytes, string()); + ASSERT_TRUE(null_result.has_value()); + EXPECT_TRUE(null_result->IsNull()); + + // Special floating point value serialization + auto nan_float = Literal::Float(std::numeric_limits<float>::quiet_NaN()); + auto nan_bytes = nan_float.Serialize(); + ASSERT_TRUE(nan_bytes.has_value()); + EXPECT_EQ(nan_bytes->size(), 4); + + auto inf_float = Literal::Float(std::numeric_limits<float>::infinity()); + auto inf_bytes = inf_float.Serialize(); + ASSERT_TRUE(inf_bytes.has_value()); + EXPECT_EQ(inf_bytes->size(), 4); +} + +// Error case serialization tests +TEST(LiteralSerializationTest, ErrorCases) { + // AboveMax/BelowMin values cannot be serialized + auto long_literal = + Literal::Long(static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1); + auto above_max_result = long_literal.CastTo(int32()); + ASSERT_TRUE(above_max_result.has_value()); + EXPECT_TRUE(above_max_result->IsAboveMax()); + + auto serialize_result = above_max_result->Serialize(); + EXPECT_FALSE(serialize_result.has_value()); Review Comment: Use ASSERT_THAT to verify the error message. Same apply for below. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
