wgtmac commented on code in PR #185:
URL: https://github.com/apache/iceberg-cpp/pull/185#discussion_r2295507915
##########
test/manifest_list_reader_test.cc:
##########
@@ -76,43 +77,40 @@ class ManifestListReaderV1Test : public
ManifestListReaderTestBase {
std::vector<int64_t> file_size = {6185, 6113};
std::vector<int64_t> snapshot_id = {7532614258660258098,
7532614258660258098};
- std::vector<std::vector<std::uint8_t>> lower_bounds = {
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x30, 0x32, 0x2D, 0x32, 0x32},
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x32}};
-
- std::vector<std::vector<std::uint8_t>> upper_bounds = {
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33},
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33}};
-
- return {{.manifest_path = paths[0],
- .manifest_length = file_size[0],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[0],
- .added_files_count = 4,
- .existing_files_count = 0,
- .deleted_files_count = 0,
- .added_rows_count = 6,
- .existing_rows_count = 0,
- .deleted_rows_count = 0,
- .partitions = {{.contains_null = false,
- .contains_nan = false,
- .lower_bound = lower_bounds[0],
- .upper_bound = upper_bounds[0]}}},
-
- {.manifest_path = paths[1],
- .manifest_length = file_size[1],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[1],
- .added_files_count = 0,
- .existing_files_count = 0,
- .deleted_files_count = 2,
- .added_rows_count = 0,
- .existing_rows_count = 0,
- .deleted_rows_count = 6,
- .partitions = {{.contains_null = false,
- .contains_nan = false,
- .lower_bound = lower_bounds[1],
- .upper_bound = upper_bounds[1]}}}};
+ return {
+ {.manifest_path = paths[0],
+ .manifest_length = file_size[0],
+ .partition_spec_id = 0,
+ .added_snapshot_id = snapshot_id[0],
+ .added_files_count = 4,
+ .existing_files_count = 0,
+ .deleted_files_count = 0,
+ .added_rows_count = 6,
+ .existing_rows_count = 0,
+ .deleted_rows_count = 0,
+ .partitions = {{.contains_null = false,
+ .contains_nan = false,
+ .lower_bound =
+
iceberg::Literal::String("2022-02-22").Serialize().value(),
+ .upper_bound =
+
iceberg::Literal::String("2022-2-23").Serialize().value()}}},
+
+ {.manifest_path = paths[1],
+ .manifest_length = file_size[1],
+ .partition_spec_id = 0,
+ .added_snapshot_id = snapshot_id[1],
+ .added_files_count = 0,
+ .existing_files_count = 0,
+ .deleted_files_count = 2,
+ .added_rows_count = 0,
+ .existing_rows_count = 0,
+ .deleted_rows_count = 6,
+ .partitions = {
+ {.contains_null = false,
+ .contains_nan = false,
+ .lower_bound =
iceberg::Literal::String("2022-2-22").Serialize().value(),
Review Comment:
```suggestion
.lower_bound =
Literal::String("2022-2-22").Serialize().value(),
```
##########
test/manifest_list_reader_test.cc:
##########
@@ -76,43 +77,40 @@ class ManifestListReaderV1Test : public
ManifestListReaderTestBase {
std::vector<int64_t> file_size = {6185, 6113};
std::vector<int64_t> snapshot_id = {7532614258660258098,
7532614258660258098};
- std::vector<std::vector<std::uint8_t>> lower_bounds = {
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x30, 0x32, 0x2D, 0x32, 0x32},
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x32}};
-
- std::vector<std::vector<std::uint8_t>> upper_bounds = {
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33},
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33}};
-
- return {{.manifest_path = paths[0],
- .manifest_length = file_size[0],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[0],
- .added_files_count = 4,
- .existing_files_count = 0,
- .deleted_files_count = 0,
- .added_rows_count = 6,
- .existing_rows_count = 0,
- .deleted_rows_count = 0,
- .partitions = {{.contains_null = false,
- .contains_nan = false,
- .lower_bound = lower_bounds[0],
- .upper_bound = upper_bounds[0]}}},
-
- {.manifest_path = paths[1],
- .manifest_length = file_size[1],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[1],
- .added_files_count = 0,
- .existing_files_count = 0,
- .deleted_files_count = 2,
- .added_rows_count = 0,
- .existing_rows_count = 0,
- .deleted_rows_count = 6,
- .partitions = {{.contains_null = false,
- .contains_nan = false,
- .lower_bound = lower_bounds[1],
- .upper_bound = upper_bounds[1]}}}};
+ return {
+ {.manifest_path = paths[0],
+ .manifest_length = file_size[0],
+ .partition_spec_id = 0,
+ .added_snapshot_id = snapshot_id[0],
+ .added_files_count = 4,
+ .existing_files_count = 0,
+ .deleted_files_count = 0,
+ .added_rows_count = 6,
+ .existing_rows_count = 0,
+ .deleted_rows_count = 0,
+ .partitions = {{.contains_null = false,
+ .contains_nan = false,
+ .lower_bound =
+
iceberg::Literal::String("2022-02-22").Serialize().value(),
+ .upper_bound =
+
iceberg::Literal::String("2022-2-23").Serialize().value()}}},
Review Comment:
```suggestion
Literal::String("2022-2-23").Serialize().value()}}},
```
##########
src/iceberg/expression/literal.cc:
##########
@@ -355,4 +457,305 @@ Result<Literal> LiteralCaster::CastTo(const Literal&
literal,
target_type->ToString());
}
+// LiteralSerializer implementation
+
+Result<std::vector<uint8_t>> LiteralSerializer::ToBytes(const Literal&
literal) {
+ // Cannot serialize special values
+ if (literal.IsAboveMax()) {
+ return InvalidArgument("Cannot serialize AboveMax literal");
+ }
+ if (literal.IsBelowMin()) {
+ return InvalidArgument("Cannot serialize BelowMin literal");
+ }
+
+ std::vector<uint8_t> result;
+
+ // Null values serialize to empty buffer
+ if (literal.IsNull()) {
+ return result;
+ }
+
+ const auto& value = literal.value();
+ const auto type_id = literal.type()->type_id();
+
+ switch (type_id) {
+ case TypeId::kBoolean: {
+ // 0x00 for false, 0x01 for true
+ result.push_back(std::get<bool>(value) ? 0x01 : 0x00);
+ return result;
+ }
+
+ case TypeId::kInt: {
+ // Stored as 4-byte little-endian
+ WriteLittleEndian(result, std::get<int32_t>(value));
+ return result;
+ }
+
+ case TypeId::kDate: {
+ // Stores days from 1970-01-01 in a 4-byte little-endian int
+ WriteLittleEndian(result, std::get<int32_t>(value));
+ return result;
+ }
+
+ case TypeId::kLong: {
+ // Stored as 8-byte little-endian
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTime: {
+ // Stores microseconds from midnight in an 8-byte little-endian long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTimestamp: {
+ // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte
little-endian
+ // long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTimestampTz: {
+ // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte
+ // little-endian long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kFloat: {
+ // Stored as 4-byte little-endian
+ WriteLittleEndian(result, std::get<float>(value));
+ return result;
+ }
+
+ case TypeId::kDouble: {
+ // Stored as 8-byte little-endian
+ WriteLittleEndian(result, std::get<double>(value));
+ return result;
+ }
+
+ case TypeId::kString: {
+ // UTF-8 bytes (without length)
+ const auto& str = std::get<std::string>(value);
+ result.insert(result.end(), str.begin(), str.end());
+ return result;
+ }
+
+ case TypeId::kBinary: {
+ // Binary value (without length)
+ const auto& binary_data = std::get<std::vector<uint8_t>>(value);
+ result = binary_data;
+ return result;
+ }
+
+ case TypeId::kFixed: {
+ // Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or
+ // std::vector<uint8_t>
+ if (std::holds_alternative<std::array<uint8_t, 16>>(value)) {
+ const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value);
+ result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end());
+ } else if (std::holds_alternative<std::vector<uint8_t>>(value)) {
+ result = std::get<std::vector<uint8_t>>(value);
+ } else {
+ return InvalidArgument("Invalid value type for Fixed literal");
+ }
+ return result;
+ }
+
+ case TypeId::kUuid: {
+ // 16-byte big-endian value
+ const auto& uuid_bytes = std::get<std::array<uint8_t, 16>>(value);
+ WriteBigEndian16(result, uuid_bytes);
+ return result;
+ }
+
+ default:
+ return NotImplemented("Serialization for type {} is not supported",
+ literal.type()->ToString());
+ }
+}
+
+Result<Literal> LiteralSerializer::FromBytes(std::span<const uint8_t> data,
+ std::shared_ptr<PrimitiveType>
type) {
+ if (!type) {
+ return InvalidArgument("Type cannot be null");
+ }
+
+ // Empty data represents null value
+ if (data.empty()) {
+ return Literal::Null(type);
+ }
+
+ const auto type_id = type->type_id();
+
+ switch (type_id) {
+ case TypeId::kBoolean: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<uint8_t>(data));
+ // 0x00 for false, non-zero byte for true
+ return Literal::Boolean(value != 0x00);
+ }
+
+ case TypeId::kInt: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data));
+ return Literal::Int(value);
+ }
+
+ case TypeId::kDate: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data));
+ return Literal::Date(value);
+ }
+
+ case TypeId::kLong:
+ case TypeId::kTime:
+ case TypeId::kTimestamp:
+ case TypeId::kTimestampTz: {
+ int64_t value;
+
+ if (data.size() == 4) {
+ // Type was promoted from int to long
+ ICEBERG_ASSIGN_OR_RAISE(auto int_value,
ReadLittleEndian<int32_t>(data));
+ value = static_cast<int64_t>(int_value);
+ } else if (data.size() == 8) {
+ // Standard 8-byte long
+ ICEBERG_ASSIGN_OR_RAISE(auto long_value,
ReadLittleEndian<int64_t>(data));
+ value = long_value;
+ } else {
+ const char* type_name = [type_id]() {
+ switch (type_id) {
+ case TypeId::kLong:
+ return "Long";
+ case TypeId::kTime:
+ return "Time";
+ case TypeId::kTimestamp:
+ return "Timestamp";
+ case TypeId::kTimestampTz:
+ return "TimestampTz";
+ default:
+ return "Unknown";
+ }
+ }();
+ return InvalidArgument("{} requires 4 or 8 bytes, got {}", type_name,
+ data.size());
+ }
+
+ return CreateLongTypeLiteral(value, type_id);
+ }
+
+ case TypeId::kFloat: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<float>(data));
+ return Literal::Float(value);
+ }
+
+ case TypeId::kDouble: {
+ if (data.size() == 4) {
+ // Type was promoted from float to double
+ ICEBERG_ASSIGN_OR_RAISE(auto float_value,
ReadLittleEndian<float>(data));
+ return Literal::Double(static_cast<double>(float_value));
+ } else if (data.size() == 8) {
+ // Standard 8-byte double
+ ICEBERG_ASSIGN_OR_RAISE(auto double_value,
ReadLittleEndian<double>(data));
+ return Literal::Double(double_value);
+ } else {
+ return InvalidArgument("Double requires 4 or 8 bytes, got {}",
data.size());
+ }
+ }
+
+ case TypeId::kString: {
+ return Literal::String(
+ std::string(reinterpret_cast<const char*>(data.data()),
data.size()));
+ }
+
+ case TypeId::kBinary: {
+ return Literal::Binary(std::vector<uint8_t>(data.begin(), data.end()));
+ }
+
+ case TypeId::kFixed: {
+ if (data.size() == 16) {
+ std::array<uint8_t, 16> fixed_bytes;
+ std::ranges::copy(data, fixed_bytes.begin());
+ return Literal(Literal::Value{fixed_bytes}, type);
+ } else {
+ return Literal(Literal::Value{std::vector<uint8_t>(data.begin(),
data.end())},
+ type);
+ }
+ }
+
+ case TypeId::kUuid: {
+ ICEBERG_ASSIGN_OR_RAISE(auto uuid_value, ReadBigEndian16(data));
+ return Literal(Literal::Value{uuid_value}, type);
+ }
+
+ case TypeId::kDecimal: {
+ if (data.size() > 16) {
+ return InvalidArgument(
+ "Decimal data too large, maximum 16 bytes supported, got {}",
data.size());
+ }
+
+ std::array<uint8_t, 16> decimal_bytes{};
+ // Copy data to the end of the array (big-endian format for decimals)
+ // This handles variable-length decimals by right-aligning them
+ std::ranges::copy(data, decimal_bytes.end() - data.size());
+ return Literal(Literal::Value{decimal_bytes}, type);
+ }
+
+ default:
+ return NotImplemented("Deserialization for type {} is not supported",
+ type->ToString());
Review Comment:
```suggestion
return NotSupported("Cannot deserialize type: {}", type->ToString());
```
##########
src/iceberg/expression/literal.h:
##########
@@ -144,6 +144,19 @@ class ICEBERG_EXPORT Literal {
Literal(Value value, std::shared_ptr<PrimitiveType> type);
friend class LiteralCaster;
+ friend class LiteralSerializer;
+
+ /// \brief Format a date value as a string.
+ std::string FormatDate(int32_t days_since_epoch) const;
Review Comment:
These `FormatXXX` functions are used only internally, right? We should
either move them to `literal.cc` or add a separate utility class for formatting
time values like `iceberg/util/time_utils.h`. I prefer the latter because we
can add test cases for them.
##########
test/manifest_list_reader_test.cc:
##########
@@ -76,43 +77,40 @@ class ManifestListReaderV1Test : public
ManifestListReaderTestBase {
std::vector<int64_t> file_size = {6185, 6113};
std::vector<int64_t> snapshot_id = {7532614258660258098,
7532614258660258098};
- std::vector<std::vector<std::uint8_t>> lower_bounds = {
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x30, 0x32, 0x2D, 0x32, 0x32},
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x32}};
-
- std::vector<std::vector<std::uint8_t>> upper_bounds = {
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33},
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33}};
-
- return {{.manifest_path = paths[0],
- .manifest_length = file_size[0],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[0],
- .added_files_count = 4,
- .existing_files_count = 0,
- .deleted_files_count = 0,
- .added_rows_count = 6,
- .existing_rows_count = 0,
- .deleted_rows_count = 0,
- .partitions = {{.contains_null = false,
- .contains_nan = false,
- .lower_bound = lower_bounds[0],
- .upper_bound = upper_bounds[0]}}},
-
- {.manifest_path = paths[1],
- .manifest_length = file_size[1],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[1],
- .added_files_count = 0,
- .existing_files_count = 0,
- .deleted_files_count = 2,
- .added_rows_count = 0,
- .existing_rows_count = 0,
- .deleted_rows_count = 6,
- .partitions = {{.contains_null = false,
- .contains_nan = false,
- .lower_bound = lower_bounds[1],
- .upper_bound = upper_bounds[1]}}}};
+ return {
+ {.manifest_path = paths[0],
+ .manifest_length = file_size[0],
+ .partition_spec_id = 0,
+ .added_snapshot_id = snapshot_id[0],
+ .added_files_count = 4,
+ .existing_files_count = 0,
+ .deleted_files_count = 0,
+ .added_rows_count = 6,
+ .existing_rows_count = 0,
+ .deleted_rows_count = 0,
+ .partitions = {{.contains_null = false,
+ .contains_nan = false,
+ .lower_bound =
+
iceberg::Literal::String("2022-02-22").Serialize().value(),
+ .upper_bound =
+
iceberg::Literal::String("2022-2-23").Serialize().value()}}},
+
+ {.manifest_path = paths[1],
+ .manifest_length = file_size[1],
+ .partition_spec_id = 0,
+ .added_snapshot_id = snapshot_id[1],
+ .added_files_count = 0,
+ .existing_files_count = 0,
+ .deleted_files_count = 2,
+ .added_rows_count = 0,
+ .existing_rows_count = 0,
+ .deleted_rows_count = 6,
+ .partitions = {
+ {.contains_null = false,
+ .contains_nan = false,
+ .lower_bound =
iceberg::Literal::String("2022-2-22").Serialize().value(),
+ .upper_bound =
+
iceberg::Literal::String("2022-2-23").Serialize().value()}}}};
Review Comment:
```suggestion
Literal::String("2022-2-23").Serialize().value()}}}};
```
##########
src/iceberg/expression/literal.cc:
##########
@@ -355,4 +457,305 @@ Result<Literal> LiteralCaster::CastTo(const Literal&
literal,
target_type->ToString());
}
+// LiteralSerializer implementation
+
+Result<std::vector<uint8_t>> LiteralSerializer::ToBytes(const Literal&
literal) {
+ // Cannot serialize special values
+ if (literal.IsAboveMax()) {
+ return InvalidArgument("Cannot serialize AboveMax literal");
+ }
+ if (literal.IsBelowMin()) {
+ return InvalidArgument("Cannot serialize BelowMin literal");
Review Comment:
```suggestion
return NotSupported("Cannot serialize BelowMin");
```
##########
src/iceberg/expression/literal.cc:
##########
@@ -19,13 +19,107 @@
#include "iceberg/expression/literal.h"
+#include <algorithm>
+#include <bit>
+#include <chrono>
#include <cmath>
#include <concepts>
+#include <cstring>
+#include <iomanip>
+#include <sstream>
#include "iceberg/exception.h"
+#include "iceberg/util/macros.h"
namespace iceberg {
+namespace {
+/// \brief Write a value in little-endian format to the buffer.
+template <typename T>
+void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) {
+ static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially
copyable");
Review Comment:
Can we use C++20 concepts instead of `static_assert` which is more
type-safe? We should only support limited set of types.
##########
test/manifest_list_reader_test.cc:
##########
@@ -76,43 +77,40 @@ class ManifestListReaderV1Test : public
ManifestListReaderTestBase {
std::vector<int64_t> file_size = {6185, 6113};
std::vector<int64_t> snapshot_id = {7532614258660258098,
7532614258660258098};
- std::vector<std::vector<std::uint8_t>> lower_bounds = {
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x30, 0x32, 0x2D, 0x32, 0x32},
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x32}};
-
- std::vector<std::vector<std::uint8_t>> upper_bounds = {
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33},
- {0x32, 0x30, 0x32, 0x32, 0x2D, 0x32, 0x2D, 0x32, 0x33}};
-
- return {{.manifest_path = paths[0],
- .manifest_length = file_size[0],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[0],
- .added_files_count = 4,
- .existing_files_count = 0,
- .deleted_files_count = 0,
- .added_rows_count = 6,
- .existing_rows_count = 0,
- .deleted_rows_count = 0,
- .partitions = {{.contains_null = false,
- .contains_nan = false,
- .lower_bound = lower_bounds[0],
- .upper_bound = upper_bounds[0]}}},
-
- {.manifest_path = paths[1],
- .manifest_length = file_size[1],
- .partition_spec_id = 0,
- .added_snapshot_id = snapshot_id[1],
- .added_files_count = 0,
- .existing_files_count = 0,
- .deleted_files_count = 2,
- .added_rows_count = 0,
- .existing_rows_count = 0,
- .deleted_rows_count = 6,
- .partitions = {{.contains_null = false,
- .contains_nan = false,
- .lower_bound = lower_bounds[1],
- .upper_bound = upper_bounds[1]}}}};
+ return {
+ {.manifest_path = paths[0],
+ .manifest_length = file_size[0],
+ .partition_spec_id = 0,
+ .added_snapshot_id = snapshot_id[0],
+ .added_files_count = 4,
+ .existing_files_count = 0,
+ .deleted_files_count = 0,
+ .added_rows_count = 6,
+ .existing_rows_count = 0,
+ .deleted_rows_count = 0,
+ .partitions = {{.contains_null = false,
+ .contains_nan = false,
+ .lower_bound =
+
iceberg::Literal::String("2022-02-22").Serialize().value(),
Review Comment:
```suggestion
Literal::String("2022-02-22").Serialize().value(),
```
##########
src/iceberg/expression/literal.cc:
##########
@@ -19,13 +19,107 @@
#include "iceberg/expression/literal.h"
+#include <algorithm>
+#include <bit>
+#include <chrono>
#include <cmath>
#include <concepts>
+#include <cstring>
+#include <iomanip>
+#include <sstream>
#include "iceberg/exception.h"
+#include "iceberg/util/macros.h"
namespace iceberg {
+namespace {
+/// \brief Write a value in little-endian format to the buffer.
+template <typename T>
+void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) {
+ static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially
copyable");
+
+ if constexpr (std::endian::native == std::endian::little) {
+ const auto* bytes = reinterpret_cast<const uint8_t*>(&value);
+ buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
+ } else {
+ if constexpr (sizeof(T) > 1) {
+ T le_value = std::byteswap(value);
+ const auto* bytes = reinterpret_cast<const uint8_t*>(&le_value);
+ buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
+ } else {
+ // For single byte types, no byteswap needed
+ buffer.push_back(static_cast<uint8_t>(value));
+ }
+ }
+}
+
+/// \brief Read a value in little-endian format from the data.
+template <typename T>
+Result<T> ReadLittleEndian(std::span<const uint8_t> data) {
+ static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially
copyable");
+
+ if (data.size() < sizeof(T)) {
+ return InvalidArgument("Insufficient data to read {} bytes, got {}",
sizeof(T),
+ data.size());
+ }
+
+ T value;
+ std::memcpy(&value, data.data(), sizeof(T));
+
+ if constexpr (std::endian::native != std::endian::little && sizeof(T) > 1) {
+ value = std::byteswap(value);
+ }
+ return value;
+}
+
+/// \brief Write a 16-byte value in big-endian format (for UUID and Decimal).
+void WriteBigEndian16(std::vector<uint8_t>& buffer,
+ const std::array<uint8_t, 16>& value) {
+ buffer.insert(buffer.end(), value.begin(), value.end());
+}
+
+/// \brief Read a 16-byte value in big-endian format (for UUID and Decimal).
+Result<std::array<uint8_t, 16>> ReadBigEndian16(std::span<const uint8_t> data)
{
Review Comment:
Same question as above.
##########
test/manifest_reader_test.cc:
##########
@@ -59,24 +59,33 @@ class ManifestReaderV1Test : public TempFileTestBase {
"order_ts_hour=2021-01-26-00/"
"00000-2-d5ae78b7-4449-45ec-adb7-c0e9c0bdb714-0-00004.parquet"};
std::vector<int64_t> partitions = {447696, 473976, 465192, 447672};
+
+ // TODO(Li Feiyang): The Decimal type and its serialization logic are not
yet fully
+ // implemented to support variable-length encoding as required by the
Iceberg
+ // specification. Using Literal::Binary as a temporary substitute to
represent the raw
+ // bytes for the decimal values.
std::vector<std::map<int32_t, std::vector<uint8_t>>> bounds = {
- {{1, {0xd2, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {2, {'.', 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {3, {0x12, 0xe2}},
- {4, {0xc0, 'y', 0xe7, 0x98, 0xd6, 0xb9, 0x05, 0x00}}},
- {{1, {0xd2, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {2, {'.', 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {3, {0x12, 0xe3}},
- {4, {0xc0, 0x19, '#', '=', 0xe2, 0x0f, 0x06, 0x00}}},
- {{1, {'{', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {2, {0xc8, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {3, {0x0e, '"'}},
- {4, {0xc0, 0xd9, '7', 0x93, 0x1f, 0xf3, 0x05, 0x00}}},
- {{1, {'{', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {2, {0xc8, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {3, {0x0e, '!'}},
- {4, {0xc0, 0x19, 0x10, '{', 0xc2, 0xb9, 0x05, 0x00}}},
+ {{1, iceberg::Literal::Long(1234).Serialize().value()},
Review Comment:
```suggestion
{{1, Literal::Long(1234).Serialize().value()},
```
##########
src/iceberg/expression/literal.cc:
##########
@@ -19,13 +19,107 @@
#include "iceberg/expression/literal.h"
+#include <algorithm>
+#include <bit>
+#include <chrono>
#include <cmath>
#include <concepts>
+#include <cstring>
+#include <iomanip>
+#include <sstream>
#include "iceberg/exception.h"
+#include "iceberg/util/macros.h"
namespace iceberg {
+namespace {
+/// \brief Write a value in little-endian format to the buffer.
+template <typename T>
+void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) {
+ static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially
copyable");
+
+ if constexpr (std::endian::native == std::endian::little) {
+ const auto* bytes = reinterpret_cast<const uint8_t*>(&value);
+ buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
+ } else {
+ if constexpr (sizeof(T) > 1) {
+ T le_value = std::byteswap(value);
+ const auto* bytes = reinterpret_cast<const uint8_t*>(&le_value);
+ buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
+ } else {
+ // For single byte types, no byteswap needed
+ buffer.push_back(static_cast<uint8_t>(value));
+ }
+ }
+}
+
+/// \brief Read a value in little-endian format from the data.
+template <typename T>
+Result<T> ReadLittleEndian(std::span<const uint8_t> data) {
+ static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially
copyable");
+
+ if (data.size() < sizeof(T)) {
+ return InvalidArgument("Insufficient data to read {} bytes, got {}",
sizeof(T),
+ data.size());
+ }
+
+ T value;
+ std::memcpy(&value, data.data(), sizeof(T));
+
+ if constexpr (std::endian::native != std::endian::little && sizeof(T) > 1) {
+ value = std::byteswap(value);
+ }
+ return value;
+}
+
+/// \brief Write a 16-byte value in big-endian format (for UUID and Decimal).
+void WriteBigEndian16(std::vector<uint8_t>& buffer,
Review Comment:
Is it possible to reuse the same template function signature as
`WriteBigEndian` with specialization for `std::array<uint8_t, 16>`?
##########
src/iceberg/expression/literal.cc:
##########
@@ -19,13 +19,107 @@
#include "iceberg/expression/literal.h"
+#include <algorithm>
+#include <bit>
+#include <chrono>
#include <cmath>
#include <concepts>
+#include <cstring>
+#include <iomanip>
+#include <sstream>
#include "iceberg/exception.h"
+#include "iceberg/util/macros.h"
namespace iceberg {
+namespace {
+/// \brief Write a value in little-endian format to the buffer.
+template <typename T>
+void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) {
Review Comment:
I would recommend to add `iceberg/util/endian.h` like
https://github.com/apache/arrow/blob/main/cpp/src/arrow/util/endian.h
##########
src/iceberg/expression/literal.cc:
##########
@@ -19,13 +19,107 @@
#include "iceberg/expression/literal.h"
+#include <algorithm>
+#include <bit>
+#include <chrono>
#include <cmath>
#include <concepts>
+#include <cstring>
+#include <iomanip>
+#include <sstream>
#include "iceberg/exception.h"
+#include "iceberg/util/macros.h"
namespace iceberg {
+namespace {
+/// \brief Write a value in little-endian format to the buffer.
+template <typename T>
+void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) {
+ static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially
copyable");
+
+ if constexpr (std::endian::native == std::endian::little) {
+ const auto* bytes = reinterpret_cast<const uint8_t*>(&value);
+ buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
+ } else {
+ if constexpr (sizeof(T) > 1) {
+ T le_value = std::byteswap(value);
+ const auto* bytes = reinterpret_cast<const uint8_t*>(&le_value);
+ buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
+ } else {
+ // For single byte types, no byteswap needed
+ buffer.push_back(static_cast<uint8_t>(value));
+ }
+ }
+}
+
+/// \brief Read a value in little-endian format from the data.
+template <typename T>
+Result<T> ReadLittleEndian(std::span<const uint8_t> data) {
+ static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially
copyable");
+
+ if (data.size() < sizeof(T)) {
+ return InvalidArgument("Insufficient data to read {} bytes, got {}",
sizeof(T),
+ data.size());
+ }
+
+ T value;
+ std::memcpy(&value, data.data(), sizeof(T));
+
+ if constexpr (std::endian::native != std::endian::little && sizeof(T) > 1) {
+ value = std::byteswap(value);
+ }
+ return value;
+}
+
+/// \brief Write a 16-byte value in big-endian format (for UUID and Decimal).
+void WriteBigEndian16(std::vector<uint8_t>& buffer,
+ const std::array<uint8_t, 16>& value) {
+ buffer.insert(buffer.end(), value.begin(), value.end());
+}
+
+/// \brief Read a 16-byte value in big-endian format (for UUID and Decimal).
+Result<std::array<uint8_t, 16>> ReadBigEndian16(std::span<const uint8_t> data)
{
+ if (data.size() < 16) {
+ return InvalidArgument("Insufficient data to read 16 bytes, got {}",
data.size());
+ }
+
+ std::array<uint8_t, 16> result;
+ std::copy(data.begin(), data.begin() + 16, result.begin());
+ return result;
+}
+
+/// \brief Create a literal from int64 value and long-type TypeId.
+Result<Literal> CreateLongTypeLiteral(int64_t value, TypeId type_id) {
+ switch (type_id) {
+ case TypeId::kLong:
+ return Literal::Long(value);
+ case TypeId::kTime:
+ return Literal::Time(value);
+ case TypeId::kTimestamp:
+ return Literal::Timestamp(value);
+ case TypeId::kTimestampTz:
+ return Literal::TimestampTz(value);
+ default:
+ return InvalidArgument("Unsupported long type: {}",
static_cast<int>(type_id));
+ }
+}
+
+} // namespace
+
+/// \brief LiteralSerializer handles serialization/deserialization operations
for Literal.
+/// This is an internal implementation class.
+class LiteralSerializer {
Review Comment:
Checking the java code, it seems that single-value serialization is not only
used by Literal. We need to create a separate utility file to support reuse.
For example, we can create `iceberg/util/conversions.h` and provide following
functions:
```cpp
class ICEBERG_EXPORT Conversions {
public:
static Result<std::vector<uint8_t>> ToBytes(const Primitive& type, const
Literal::Value& value);
static Result<std::vector<uint8_t>> ToBytes(const Literal& literal);
static Result<Literal::Value> FromBytes(const Primitive& type,
std::span<const uint8_t> data);
static Result<Literal> FromBytes(std::shared_ptr<Primitive> type,
std::span<const uint8_t> data);
};
```
##########
src/iceberg/expression/literal.cc:
##########
@@ -355,4 +457,305 @@ Result<Literal> LiteralCaster::CastTo(const Literal&
literal,
target_type->ToString());
}
+// LiteralSerializer implementation
+
+Result<std::vector<uint8_t>> LiteralSerializer::ToBytes(const Literal&
literal) {
+ // Cannot serialize special values
+ if (literal.IsAboveMax()) {
+ return InvalidArgument("Cannot serialize AboveMax literal");
Review Comment:
```suggestion
return NotSupported("Cannot serialize AboveMax");
```
##########
src/iceberg/expression/literal.cc:
##########
@@ -355,4 +457,305 @@ Result<Literal> LiteralCaster::CastTo(const Literal&
literal,
target_type->ToString());
}
+// LiteralSerializer implementation
+
+Result<std::vector<uint8_t>> LiteralSerializer::ToBytes(const Literal&
literal) {
+ // Cannot serialize special values
+ if (literal.IsAboveMax()) {
+ return InvalidArgument("Cannot serialize AboveMax literal");
+ }
+ if (literal.IsBelowMin()) {
+ return InvalidArgument("Cannot serialize BelowMin literal");
+ }
+
+ std::vector<uint8_t> result;
+
+ // Null values serialize to empty buffer
+ if (literal.IsNull()) {
+ return result;
Review Comment:
```suggestion
return NotSupported("Cannot serialize null");
```
##########
src/iceberg/expression/literal.cc:
##########
@@ -355,4 +457,305 @@ Result<Literal> LiteralCaster::CastTo(const Literal&
literal,
target_type->ToString());
}
+// LiteralSerializer implementation
+
+Result<std::vector<uint8_t>> LiteralSerializer::ToBytes(const Literal&
literal) {
+ // Cannot serialize special values
+ if (literal.IsAboveMax()) {
+ return InvalidArgument("Cannot serialize AboveMax literal");
+ }
+ if (literal.IsBelowMin()) {
+ return InvalidArgument("Cannot serialize BelowMin literal");
+ }
+
+ std::vector<uint8_t> result;
+
+ // Null values serialize to empty buffer
+ if (literal.IsNull()) {
+ return result;
+ }
+
+ const auto& value = literal.value();
+ const auto type_id = literal.type()->type_id();
+
+ switch (type_id) {
+ case TypeId::kBoolean: {
+ // 0x00 for false, 0x01 for true
+ result.push_back(std::get<bool>(value) ? 0x01 : 0x00);
+ return result;
+ }
+
+ case TypeId::kInt: {
+ // Stored as 4-byte little-endian
+ WriteLittleEndian(result, std::get<int32_t>(value));
+ return result;
+ }
+
+ case TypeId::kDate: {
+ // Stores days from 1970-01-01 in a 4-byte little-endian int
+ WriteLittleEndian(result, std::get<int32_t>(value));
+ return result;
+ }
+
+ case TypeId::kLong: {
+ // Stored as 8-byte little-endian
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTime: {
+ // Stores microseconds from midnight in an 8-byte little-endian long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTimestamp: {
+ // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte
little-endian
+ // long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTimestampTz: {
+ // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte
+ // little-endian long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kFloat: {
+ // Stored as 4-byte little-endian
+ WriteLittleEndian(result, std::get<float>(value));
+ return result;
+ }
+
+ case TypeId::kDouble: {
+ // Stored as 8-byte little-endian
+ WriteLittleEndian(result, std::get<double>(value));
+ return result;
+ }
+
+ case TypeId::kString: {
+ // UTF-8 bytes (without length)
+ const auto& str = std::get<std::string>(value);
+ result.insert(result.end(), str.begin(), str.end());
+ return result;
+ }
+
+ case TypeId::kBinary: {
+ // Binary value (without length)
+ const auto& binary_data = std::get<std::vector<uint8_t>>(value);
+ result = binary_data;
+ return result;
+ }
+
+ case TypeId::kFixed: {
+ // Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or
+ // std::vector<uint8_t>
+ if (std::holds_alternative<std::array<uint8_t, 16>>(value)) {
+ const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value);
+ result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end());
+ } else if (std::holds_alternative<std::vector<uint8_t>>(value)) {
+ result = std::get<std::vector<uint8_t>>(value);
+ } else {
+ return InvalidArgument("Invalid value type for Fixed literal");
+ }
+ return result;
+ }
+
+ case TypeId::kUuid: {
+ // 16-byte big-endian value
+ const auto& uuid_bytes = std::get<std::array<uint8_t, 16>>(value);
+ WriteBigEndian16(result, uuid_bytes);
+ return result;
+ }
+
+ default:
+ return NotImplemented("Serialization for type {} is not supported",
Review Comment:
```suggestion
return NotSupported("Serialization for type {} is not supported",
```
##########
src/iceberg/expression/literal.cc:
##########
@@ -355,4 +457,305 @@ Result<Literal> LiteralCaster::CastTo(const Literal&
literal,
target_type->ToString());
}
+// LiteralSerializer implementation
+
+Result<std::vector<uint8_t>> LiteralSerializer::ToBytes(const Literal&
literal) {
+ // Cannot serialize special values
+ if (literal.IsAboveMax()) {
+ return InvalidArgument("Cannot serialize AboveMax literal");
+ }
+ if (literal.IsBelowMin()) {
+ return InvalidArgument("Cannot serialize BelowMin literal");
+ }
+
+ std::vector<uint8_t> result;
+
+ // Null values serialize to empty buffer
+ if (literal.IsNull()) {
+ return result;
Review Comment:
It shouldn't serialize null.
##########
src/iceberg/expression/literal.cc:
##########
@@ -355,4 +457,305 @@ Result<Literal> LiteralCaster::CastTo(const Literal&
literal,
target_type->ToString());
}
+// LiteralSerializer implementation
+
+Result<std::vector<uint8_t>> LiteralSerializer::ToBytes(const Literal&
literal) {
+ // Cannot serialize special values
+ if (literal.IsAboveMax()) {
+ return InvalidArgument("Cannot serialize AboveMax literal");
+ }
+ if (literal.IsBelowMin()) {
+ return InvalidArgument("Cannot serialize BelowMin literal");
+ }
+
+ std::vector<uint8_t> result;
+
+ // Null values serialize to empty buffer
+ if (literal.IsNull()) {
+ return result;
+ }
+
+ const auto& value = literal.value();
+ const auto type_id = literal.type()->type_id();
+
+ switch (type_id) {
+ case TypeId::kBoolean: {
+ // 0x00 for false, 0x01 for true
+ result.push_back(std::get<bool>(value) ? 0x01 : 0x00);
+ return result;
+ }
+
+ case TypeId::kInt: {
+ // Stored as 4-byte little-endian
+ WriteLittleEndian(result, std::get<int32_t>(value));
+ return result;
+ }
+
+ case TypeId::kDate: {
+ // Stores days from 1970-01-01 in a 4-byte little-endian int
+ WriteLittleEndian(result, std::get<int32_t>(value));
+ return result;
+ }
+
+ case TypeId::kLong: {
+ // Stored as 8-byte little-endian
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTime: {
+ // Stores microseconds from midnight in an 8-byte little-endian long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTimestamp: {
+ // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte
little-endian
+ // long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTimestampTz: {
+ // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte
+ // little-endian long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kFloat: {
+ // Stored as 4-byte little-endian
+ WriteLittleEndian(result, std::get<float>(value));
+ return result;
+ }
+
+ case TypeId::kDouble: {
+ // Stored as 8-byte little-endian
+ WriteLittleEndian(result, std::get<double>(value));
+ return result;
+ }
+
+ case TypeId::kString: {
+ // UTF-8 bytes (without length)
+ const auto& str = std::get<std::string>(value);
+ result.insert(result.end(), str.begin(), str.end());
+ return result;
+ }
+
+ case TypeId::kBinary: {
+ // Binary value (without length)
+ const auto& binary_data = std::get<std::vector<uint8_t>>(value);
+ result = binary_data;
+ return result;
+ }
+
+ case TypeId::kFixed: {
+ // Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or
+ // std::vector<uint8_t>
+ if (std::holds_alternative<std::array<uint8_t, 16>>(value)) {
+ const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value);
+ result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end());
+ } else if (std::holds_alternative<std::vector<uint8_t>>(value)) {
+ result = std::get<std::vector<uint8_t>>(value);
+ } else {
+ return InvalidArgument("Invalid value type for Fixed literal");
+ }
+ return result;
+ }
+
+ case TypeId::kUuid: {
+ // 16-byte big-endian value
+ const auto& uuid_bytes = std::get<std::array<uint8_t, 16>>(value);
+ WriteBigEndian16(result, uuid_bytes);
+ return result;
+ }
+
+ default:
+ return NotImplemented("Serialization for type {} is not supported",
+ literal.type()->ToString());
+ }
+}
+
+Result<Literal> LiteralSerializer::FromBytes(std::span<const uint8_t> data,
+ std::shared_ptr<PrimitiveType>
type) {
+ if (!type) {
+ return InvalidArgument("Type cannot be null");
+ }
+
+ // Empty data represents null value
+ if (data.empty()) {
+ return Literal::Null(type);
+ }
+
+ const auto type_id = type->type_id();
+
+ switch (type_id) {
+ case TypeId::kBoolean: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<uint8_t>(data));
+ // 0x00 for false, non-zero byte for true
+ return Literal::Boolean(value != 0x00);
+ }
+
+ case TypeId::kInt: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data));
+ return Literal::Int(value);
+ }
+
+ case TypeId::kDate: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data));
+ return Literal::Date(value);
+ }
+
+ case TypeId::kLong:
+ case TypeId::kTime:
+ case TypeId::kTimestamp:
+ case TypeId::kTimestampTz: {
+ int64_t value;
+
+ if (data.size() == 4) {
+ // Type was promoted from int to long
+ ICEBERG_ASSIGN_OR_RAISE(auto int_value,
ReadLittleEndian<int32_t>(data));
+ value = static_cast<int64_t>(int_value);
+ } else if (data.size() == 8) {
+ // Standard 8-byte long
+ ICEBERG_ASSIGN_OR_RAISE(auto long_value,
ReadLittleEndian<int64_t>(data));
+ value = long_value;
+ } else {
+ const char* type_name = [type_id]() {
+ switch (type_id) {
+ case TypeId::kLong:
+ return "Long";
+ case TypeId::kTime:
+ return "Time";
+ case TypeId::kTimestamp:
+ return "Timestamp";
+ case TypeId::kTimestampTz:
+ return "TimestampTz";
+ default:
+ return "Unknown";
+ }
+ }();
+ return InvalidArgument("{} requires 4 or 8 bytes, got {}", type_name,
+ data.size());
+ }
+
+ return CreateLongTypeLiteral(value, type_id);
+ }
+
+ case TypeId::kFloat: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<float>(data));
+ return Literal::Float(value);
+ }
+
+ case TypeId::kDouble: {
+ if (data.size() == 4) {
+ // Type was promoted from float to double
+ ICEBERG_ASSIGN_OR_RAISE(auto float_value,
ReadLittleEndian<float>(data));
+ return Literal::Double(static_cast<double>(float_value));
+ } else if (data.size() == 8) {
+ // Standard 8-byte double
+ ICEBERG_ASSIGN_OR_RAISE(auto double_value,
ReadLittleEndian<double>(data));
+ return Literal::Double(double_value);
+ } else {
+ return InvalidArgument("Double requires 4 or 8 bytes, got {}",
data.size());
+ }
+ }
+
+ case TypeId::kString: {
+ return Literal::String(
+ std::string(reinterpret_cast<const char*>(data.data()),
data.size()));
+ }
+
+ case TypeId::kBinary: {
+ return Literal::Binary(std::vector<uint8_t>(data.begin(), data.end()));
+ }
+
+ case TypeId::kFixed: {
+ if (data.size() == 16) {
+ std::array<uint8_t, 16> fixed_bytes;
+ std::ranges::copy(data, fixed_bytes.begin());
+ return Literal(Literal::Value{fixed_bytes}, type);
+ } else {
+ return Literal(Literal::Value{std::vector<uint8_t>(data.begin(),
data.end())},
+ type);
+ }
+ }
+
+ case TypeId::kUuid: {
+ ICEBERG_ASSIGN_OR_RAISE(auto uuid_value, ReadBigEndian16(data));
+ return Literal(Literal::Value{uuid_value}, type);
+ }
+
+ case TypeId::kDecimal: {
+ if (data.size() > 16) {
+ return InvalidArgument(
+ "Decimal data too large, maximum 16 bytes supported, got {}",
data.size());
+ }
+
+ std::array<uint8_t, 16> decimal_bytes{};
+ // Copy data to the end of the array (big-endian format for decimals)
+ // This handles variable-length decimals by right-aligning them
+ std::ranges::copy(data, decimal_bytes.end() - data.size());
+ return Literal(Literal::Value{decimal_bytes}, type);
+ }
+
+ default:
+ return NotImplemented("Deserialization for type {} is not supported",
+ type->ToString());
+ }
+
+ // Should not reach here
+ return InvalidArgument("Unexpected error in deserialization");
Review Comment:
```suggestion
std::unreachable();
```
##########
src/iceberg/expression/literal.cc:
##########
@@ -355,4 +457,305 @@ Result<Literal> LiteralCaster::CastTo(const Literal&
literal,
target_type->ToString());
}
+// LiteralSerializer implementation
+
+Result<std::vector<uint8_t>> LiteralSerializer::ToBytes(const Literal&
literal) {
+ // Cannot serialize special values
+ if (literal.IsAboveMax()) {
+ return InvalidArgument("Cannot serialize AboveMax literal");
+ }
+ if (literal.IsBelowMin()) {
+ return InvalidArgument("Cannot serialize BelowMin literal");
+ }
+
+ std::vector<uint8_t> result;
+
+ // Null values serialize to empty buffer
+ if (literal.IsNull()) {
+ return result;
+ }
+
+ const auto& value = literal.value();
+ const auto type_id = literal.type()->type_id();
+
+ switch (type_id) {
+ case TypeId::kBoolean: {
+ // 0x00 for false, 0x01 for true
+ result.push_back(std::get<bool>(value) ? 0x01 : 0x00);
+ return result;
+ }
+
+ case TypeId::kInt: {
+ // Stored as 4-byte little-endian
+ WriteLittleEndian(result, std::get<int32_t>(value));
+ return result;
+ }
+
+ case TypeId::kDate: {
+ // Stores days from 1970-01-01 in a 4-byte little-endian int
+ WriteLittleEndian(result, std::get<int32_t>(value));
+ return result;
+ }
+
+ case TypeId::kLong: {
+ // Stored as 8-byte little-endian
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTime: {
+ // Stores microseconds from midnight in an 8-byte little-endian long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTimestamp: {
+ // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte
little-endian
+ // long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTimestampTz: {
+ // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte
+ // little-endian long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kFloat: {
+ // Stored as 4-byte little-endian
+ WriteLittleEndian(result, std::get<float>(value));
+ return result;
+ }
+
+ case TypeId::kDouble: {
+ // Stored as 8-byte little-endian
+ WriteLittleEndian(result, std::get<double>(value));
+ return result;
+ }
+
+ case TypeId::kString: {
+ // UTF-8 bytes (without length)
+ const auto& str = std::get<std::string>(value);
+ result.insert(result.end(), str.begin(), str.end());
+ return result;
+ }
+
+ case TypeId::kBinary: {
+ // Binary value (without length)
+ const auto& binary_data = std::get<std::vector<uint8_t>>(value);
+ result = binary_data;
+ return result;
+ }
+
+ case TypeId::kFixed: {
+ // Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or
+ // std::vector<uint8_t>
+ if (std::holds_alternative<std::array<uint8_t, 16>>(value)) {
+ const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value);
+ result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end());
+ } else if (std::holds_alternative<std::vector<uint8_t>>(value)) {
+ result = std::get<std::vector<uint8_t>>(value);
+ } else {
+ return InvalidArgument("Invalid value type for Fixed literal");
+ }
+ return result;
+ }
+
+ case TypeId::kUuid: {
+ // 16-byte big-endian value
+ const auto& uuid_bytes = std::get<std::array<uint8_t, 16>>(value);
+ WriteBigEndian16(result, uuid_bytes);
+ return result;
+ }
+
+ default:
+ return NotImplemented("Serialization for type {} is not supported",
+ literal.type()->ToString());
+ }
+}
+
+Result<Literal> LiteralSerializer::FromBytes(std::span<const uint8_t> data,
+ std::shared_ptr<PrimitiveType>
type) {
+ if (!type) {
+ return InvalidArgument("Type cannot be null");
+ }
+
+ // Empty data represents null value
+ if (data.empty()) {
+ return Literal::Null(type);
+ }
+
+ const auto type_id = type->type_id();
+
+ switch (type_id) {
+ case TypeId::kBoolean: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<uint8_t>(data));
+ // 0x00 for false, non-zero byte for true
+ return Literal::Boolean(value != 0x00);
+ }
+
+ case TypeId::kInt: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data));
+ return Literal::Int(value);
+ }
+
+ case TypeId::kDate: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data));
+ return Literal::Date(value);
+ }
+
+ case TypeId::kLong:
+ case TypeId::kTime:
+ case TypeId::kTimestamp:
+ case TypeId::kTimestampTz: {
+ int64_t value;
+
+ if (data.size() == 4) {
+ // Type was promoted from int to long
+ ICEBERG_ASSIGN_OR_RAISE(auto int_value,
ReadLittleEndian<int32_t>(data));
+ value = static_cast<int64_t>(int_value);
+ } else if (data.size() == 8) {
+ // Standard 8-byte long
+ ICEBERG_ASSIGN_OR_RAISE(auto long_value,
ReadLittleEndian<int64_t>(data));
+ value = long_value;
+ } else {
+ const char* type_name = [type_id]() {
+ switch (type_id) {
+ case TypeId::kLong:
+ return "Long";
+ case TypeId::kTime:
+ return "Time";
+ case TypeId::kTimestamp:
+ return "Timestamp";
+ case TypeId::kTimestampTz:
+ return "TimestampTz";
+ default:
+ return "Unknown";
+ }
+ }();
+ return InvalidArgument("{} requires 4 or 8 bytes, got {}", type_name,
+ data.size());
+ }
+
+ return CreateLongTypeLiteral(value, type_id);
Review Comment:
```suggestion
return {value, std::move(type)};
```
Why not directly use `Literal(Value value, std::shared_ptr<PrimitiveType>
type)` to create it? You don't need CreateLongTypeLiteral.
##########
src/iceberg/expression/literal.cc:
##########
@@ -355,4 +457,305 @@ Result<Literal> LiteralCaster::CastTo(const Literal&
literal,
target_type->ToString());
}
+// LiteralSerializer implementation
+
+Result<std::vector<uint8_t>> LiteralSerializer::ToBytes(const Literal&
literal) {
+ // Cannot serialize special values
+ if (literal.IsAboveMax()) {
+ return InvalidArgument("Cannot serialize AboveMax literal");
+ }
+ if (literal.IsBelowMin()) {
+ return InvalidArgument("Cannot serialize BelowMin literal");
+ }
+
+ std::vector<uint8_t> result;
+
+ // Null values serialize to empty buffer
+ if (literal.IsNull()) {
+ return result;
+ }
+
+ const auto& value = literal.value();
+ const auto type_id = literal.type()->type_id();
+
+ switch (type_id) {
+ case TypeId::kBoolean: {
+ // 0x00 for false, 0x01 for true
+ result.push_back(std::get<bool>(value) ? 0x01 : 0x00);
+ return result;
+ }
+
+ case TypeId::kInt: {
+ // Stored as 4-byte little-endian
+ WriteLittleEndian(result, std::get<int32_t>(value));
+ return result;
+ }
+
+ case TypeId::kDate: {
+ // Stores days from 1970-01-01 in a 4-byte little-endian int
+ WriteLittleEndian(result, std::get<int32_t>(value));
+ return result;
+ }
+
+ case TypeId::kLong: {
+ // Stored as 8-byte little-endian
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTime: {
+ // Stores microseconds from midnight in an 8-byte little-endian long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTimestamp: {
+ // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte
little-endian
+ // long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTimestampTz: {
+ // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte
+ // little-endian long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kFloat: {
+ // Stored as 4-byte little-endian
+ WriteLittleEndian(result, std::get<float>(value));
+ return result;
+ }
+
+ case TypeId::kDouble: {
+ // Stored as 8-byte little-endian
+ WriteLittleEndian(result, std::get<double>(value));
+ return result;
+ }
+
+ case TypeId::kString: {
+ // UTF-8 bytes (without length)
+ const auto& str = std::get<std::string>(value);
+ result.insert(result.end(), str.begin(), str.end());
+ return result;
+ }
+
+ case TypeId::kBinary: {
+ // Binary value (without length)
+ const auto& binary_data = std::get<std::vector<uint8_t>>(value);
+ result = binary_data;
+ return result;
+ }
+
+ case TypeId::kFixed: {
+ // Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or
+ // std::vector<uint8_t>
+ if (std::holds_alternative<std::array<uint8_t, 16>>(value)) {
+ const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value);
+ result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end());
+ } else if (std::holds_alternative<std::vector<uint8_t>>(value)) {
+ result = std::get<std::vector<uint8_t>>(value);
+ } else {
+ return InvalidArgument("Invalid value type for Fixed literal");
+ }
+ return result;
+ }
+
+ case TypeId::kUuid: {
+ // 16-byte big-endian value
+ const auto& uuid_bytes = std::get<std::array<uint8_t, 16>>(value);
+ WriteBigEndian16(result, uuid_bytes);
+ return result;
+ }
+
+ default:
+ return NotImplemented("Serialization for type {} is not supported",
+ literal.type()->ToString());
+ }
+}
+
+Result<Literal> LiteralSerializer::FromBytes(std::span<const uint8_t> data,
+ std::shared_ptr<PrimitiveType>
type) {
+ if (!type) {
+ return InvalidArgument("Type cannot be null");
+ }
+
+ // Empty data represents null value
+ if (data.empty()) {
+ return Literal::Null(type);
+ }
+
+ const auto type_id = type->type_id();
+
+ switch (type_id) {
+ case TypeId::kBoolean: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<uint8_t>(data));
+ // 0x00 for false, non-zero byte for true
+ return Literal::Boolean(value != 0x00);
+ }
+
+ case TypeId::kInt: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data));
+ return Literal::Int(value);
+ }
+
+ case TypeId::kDate: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data));
+ return Literal::Date(value);
+ }
+
+ case TypeId::kLong:
+ case TypeId::kTime:
+ case TypeId::kTimestamp:
+ case TypeId::kTimestampTz: {
+ int64_t value;
+
+ if (data.size() == 4) {
+ // Type was promoted from int to long
+ ICEBERG_ASSIGN_OR_RAISE(auto int_value,
ReadLittleEndian<int32_t>(data));
+ value = static_cast<int64_t>(int_value);
+ } else if (data.size() == 8) {
+ // Standard 8-byte long
+ ICEBERG_ASSIGN_OR_RAISE(auto long_value,
ReadLittleEndian<int64_t>(data));
+ value = long_value;
+ } else {
+ const char* type_name = [type_id]() {
Review Comment:
Add a `ICEBERG_EXPORT std::string_view ToString(TypeId id)` to
`iceberg/util/type.h`? Please also use lowercase form like what we did in
`Type::ToString()`.
##########
src/iceberg/expression/literal.cc:
##########
@@ -19,13 +19,107 @@
#include "iceberg/expression/literal.h"
+#include <algorithm>
+#include <bit>
+#include <chrono>
#include <cmath>
#include <concepts>
+#include <cstring>
+#include <iomanip>
+#include <sstream>
#include "iceberg/exception.h"
+#include "iceberg/util/macros.h"
namespace iceberg {
+namespace {
+/// \brief Write a value in little-endian format to the buffer.
+template <typename T>
+void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) {
+ static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially
copyable");
+
+ if constexpr (std::endian::native == std::endian::little) {
+ const auto* bytes = reinterpret_cast<const uint8_t*>(&value);
+ buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
+ } else {
+ if constexpr (sizeof(T) > 1) {
+ T le_value = std::byteswap(value);
+ const auto* bytes = reinterpret_cast<const uint8_t*>(&le_value);
+ buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
+ } else {
+ // For single byte types, no byteswap needed
+ buffer.push_back(static_cast<uint8_t>(value));
+ }
+ }
+}
+
+/// \brief Read a value in little-endian format from the data.
+template <typename T>
+Result<T> ReadLittleEndian(std::span<const uint8_t> data) {
+ static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially
copyable");
+
+ if (data.size() < sizeof(T)) {
Review Comment:
```suggestion
if (data.size() < sizeof(T)) [[unlikely]] {
```
##########
src/iceberg/expression/literal.cc:
##########
@@ -294,13 +388,21 @@ std::string Literal::ToString() const {
}
return result;
}
+ case TypeId::kDate: {
+ return FormatDate(std::get<int32_t>(value_));
+ }
+ case TypeId::kTime: {
+ return FormatTime(std::get<int64_t>(value_));
+ }
+ case TypeId::kTimestamp: {
+ return FormatTimestamp(std::get<int64_t>(value_));
+ }
+ case TypeId::kTimestampTz: {
+ return FormatTimestampTz(std::get<int64_t>(value_));
+ }
case TypeId::kDecimal:
case TypeId::kUuid:
- case TypeId::kFixed:
- case TypeId::kDate:
- case TypeId::kTime:
- case TypeId::kTimestamp:
- case TypeId::kTimestampTz: {
+ case TypeId::kFixed: {
Review Comment:
BTW, please help me check the `ToString` should follow these rules:
1. String literals should be printed as is.
2. Binary and fixed literals should be printed as base16-encoded texts and
quoted by `X''`.
3. Other values should be quoted in `""`.
I'd suggest to create a separate PR to fix this. WDYT?
##########
src/iceberg/expression/literal.cc:
##########
@@ -19,13 +19,107 @@
#include "iceberg/expression/literal.h"
+#include <algorithm>
+#include <bit>
+#include <chrono>
#include <cmath>
#include <concepts>
+#include <cstring>
+#include <iomanip>
+#include <sstream>
#include "iceberg/exception.h"
+#include "iceberg/util/macros.h"
namespace iceberg {
+namespace {
+/// \brief Write a value in little-endian format to the buffer.
+template <typename T>
+void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) {
+ static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially
copyable");
+
+ if constexpr (std::endian::native == std::endian::little) {
+ const auto* bytes = reinterpret_cast<const uint8_t*>(&value);
+ buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
+ } else {
+ if constexpr (sizeof(T) > 1) {
+ T le_value = std::byteswap(value);
+ const auto* bytes = reinterpret_cast<const uint8_t*>(&le_value);
+ buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
+ } else {
+ // For single byte types, no byteswap needed
+ buffer.push_back(static_cast<uint8_t>(value));
+ }
+ }
Review Comment:
```suggestion
} else if constexpr (sizeof(T) > 1) {
T le_value = std::byteswap(value);
const auto* bytes = reinterpret_cast<const uint8_t*>(&le_value);
buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
} else {
// For single byte, no byteswap needed
buffer.push_back(static_cast<uint8_t>(value));
}
```
##########
src/iceberg/expression/literal.cc:
##########
@@ -355,4 +457,305 @@ Result<Literal> LiteralCaster::CastTo(const Literal&
literal,
target_type->ToString());
}
+// LiteralSerializer implementation
+
+Result<std::vector<uint8_t>> LiteralSerializer::ToBytes(const Literal&
literal) {
+ // Cannot serialize special values
+ if (literal.IsAboveMax()) {
+ return InvalidArgument("Cannot serialize AboveMax literal");
+ }
+ if (literal.IsBelowMin()) {
+ return InvalidArgument("Cannot serialize BelowMin literal");
+ }
+
+ std::vector<uint8_t> result;
+
+ // Null values serialize to empty buffer
+ if (literal.IsNull()) {
+ return result;
+ }
+
+ const auto& value = literal.value();
+ const auto type_id = literal.type()->type_id();
+
+ switch (type_id) {
+ case TypeId::kBoolean: {
+ // 0x00 for false, 0x01 for true
+ result.push_back(std::get<bool>(value) ? 0x01 : 0x00);
+ return result;
+ }
+
+ case TypeId::kInt: {
+ // Stored as 4-byte little-endian
+ WriteLittleEndian(result, std::get<int32_t>(value));
+ return result;
+ }
+
+ case TypeId::kDate: {
+ // Stores days from 1970-01-01 in a 4-byte little-endian int
+ WriteLittleEndian(result, std::get<int32_t>(value));
+ return result;
+ }
+
+ case TypeId::kLong: {
+ // Stored as 8-byte little-endian
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTime: {
+ // Stores microseconds from midnight in an 8-byte little-endian long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTimestamp: {
+ // Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte
little-endian
+ // long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kTimestampTz: {
+ // Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte
+ // little-endian long
+ WriteLittleEndian(result, std::get<int64_t>(value));
+ return result;
+ }
+
+ case TypeId::kFloat: {
+ // Stored as 4-byte little-endian
+ WriteLittleEndian(result, std::get<float>(value));
+ return result;
+ }
+
+ case TypeId::kDouble: {
+ // Stored as 8-byte little-endian
+ WriteLittleEndian(result, std::get<double>(value));
+ return result;
+ }
+
+ case TypeId::kString: {
+ // UTF-8 bytes (without length)
+ const auto& str = std::get<std::string>(value);
+ result.insert(result.end(), str.begin(), str.end());
+ return result;
+ }
+
+ case TypeId::kBinary: {
+ // Binary value (without length)
+ const auto& binary_data = std::get<std::vector<uint8_t>>(value);
+ result = binary_data;
+ return result;
+ }
+
+ case TypeId::kFixed: {
+ // Fixed(L) - Binary value, could be stored in std::array<uint8_t, 16> or
+ // std::vector<uint8_t>
+ if (std::holds_alternative<std::array<uint8_t, 16>>(value)) {
+ const auto& fixed_bytes = std::get<std::array<uint8_t, 16>>(value);
+ result.insert(result.end(), fixed_bytes.begin(), fixed_bytes.end());
+ } else if (std::holds_alternative<std::vector<uint8_t>>(value)) {
+ result = std::get<std::vector<uint8_t>>(value);
+ } else {
+ return InvalidArgument("Invalid value type for Fixed literal");
+ }
+ return result;
+ }
+
+ case TypeId::kUuid: {
+ // 16-byte big-endian value
+ const auto& uuid_bytes = std::get<std::array<uint8_t, 16>>(value);
+ WriteBigEndian16(result, uuid_bytes);
+ return result;
+ }
+
+ default:
+ return NotImplemented("Serialization for type {} is not supported",
+ literal.type()->ToString());
+ }
+}
+
+Result<Literal> LiteralSerializer::FromBytes(std::span<const uint8_t> data,
+ std::shared_ptr<PrimitiveType>
type) {
+ if (!type) {
+ return InvalidArgument("Type cannot be null");
+ }
+
+ // Empty data represents null value
+ if (data.empty()) {
+ return Literal::Null(type);
+ }
+
+ const auto type_id = type->type_id();
+
+ switch (type_id) {
+ case TypeId::kBoolean: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<uint8_t>(data));
+ // 0x00 for false, non-zero byte for true
+ return Literal::Boolean(value != 0x00);
+ }
+
+ case TypeId::kInt: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data));
+ return Literal::Int(value);
+ }
+
+ case TypeId::kDate: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<int32_t>(data));
+ return Literal::Date(value);
+ }
+
+ case TypeId::kLong:
+ case TypeId::kTime:
+ case TypeId::kTimestamp:
+ case TypeId::kTimestampTz: {
+ int64_t value;
+
+ if (data.size() == 4) {
+ // Type was promoted from int to long
+ ICEBERG_ASSIGN_OR_RAISE(auto int_value,
ReadLittleEndian<int32_t>(data));
+ value = static_cast<int64_t>(int_value);
+ } else if (data.size() == 8) {
+ // Standard 8-byte long
+ ICEBERG_ASSIGN_OR_RAISE(auto long_value,
ReadLittleEndian<int64_t>(data));
+ value = long_value;
+ } else {
+ const char* type_name = [type_id]() {
+ switch (type_id) {
+ case TypeId::kLong:
+ return "Long";
+ case TypeId::kTime:
+ return "Time";
+ case TypeId::kTimestamp:
+ return "Timestamp";
+ case TypeId::kTimestampTz:
+ return "TimestampTz";
+ default:
+ return "Unknown";
+ }
+ }();
+ return InvalidArgument("{} requires 4 or 8 bytes, got {}", type_name,
+ data.size());
+ }
+
+ return CreateLongTypeLiteral(value, type_id);
+ }
+
+ case TypeId::kFloat: {
+ ICEBERG_ASSIGN_OR_RAISE(auto value, ReadLittleEndian<float>(data));
+ return Literal::Float(value);
+ }
+
+ case TypeId::kDouble: {
+ if (data.size() == 4) {
+ // Type was promoted from float to double
+ ICEBERG_ASSIGN_OR_RAISE(auto float_value,
ReadLittleEndian<float>(data));
+ return Literal::Double(static_cast<double>(float_value));
+ } else if (data.size() == 8) {
Review Comment:
Put `if (data.size() == 8)` before `if (data.size() == 4)`. The reason is
that `8` should be more common than `4` in real cases.
##########
test/manifest_reader_test.cc:
##########
@@ -59,24 +59,33 @@ class ManifestReaderV1Test : public TempFileTestBase {
"order_ts_hour=2021-01-26-00/"
"00000-2-d5ae78b7-4449-45ec-adb7-c0e9c0bdb714-0-00004.parquet"};
std::vector<int64_t> partitions = {447696, 473976, 465192, 447672};
+
+ // TODO(Li Feiyang): The Decimal type and its serialization logic are not
yet fully
+ // implemented to support variable-length encoding as required by the
Iceberg
+ // specification. Using Literal::Binary as a temporary substitute to
represent the raw
+ // bytes for the decimal values.
std::vector<std::map<int32_t, std::vector<uint8_t>>> bounds = {
- {{1, {0xd2, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {2, {'.', 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {3, {0x12, 0xe2}},
- {4, {0xc0, 'y', 0xe7, 0x98, 0xd6, 0xb9, 0x05, 0x00}}},
- {{1, {0xd2, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {2, {'.', 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {3, {0x12, 0xe3}},
- {4, {0xc0, 0x19, '#', '=', 0xe2, 0x0f, 0x06, 0x00}}},
- {{1, {'{', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {2, {0xc8, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {3, {0x0e, '"'}},
- {4, {0xc0, 0xd9, '7', 0x93, 0x1f, 0xf3, 0x05, 0x00}}},
- {{1, {'{', 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {2, {0xc8, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
- {3, {0x0e, '!'}},
- {4, {0xc0, 0x19, 0x10, '{', 0xc2, 0xb9, 0x05, 0x00}}},
+ {{1, iceberg::Literal::Long(1234).Serialize().value()},
Review Comment:
Same apply for below
##########
src/iceberg/expression/literal.cc:
##########
@@ -19,13 +19,107 @@
#include "iceberg/expression/literal.h"
+#include <algorithm>
+#include <bit>
+#include <chrono>
#include <cmath>
#include <concepts>
+#include <cstring>
+#include <iomanip>
+#include <sstream>
#include "iceberg/exception.h"
+#include "iceberg/util/macros.h"
namespace iceberg {
+namespace {
+/// \brief Write a value in little-endian format to the buffer.
+template <typename T>
+void WriteLittleEndian(std::vector<uint8_t>& buffer, T value) {
+ static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially
copyable");
+
+ if constexpr (std::endian::native == std::endian::little) {
+ const auto* bytes = reinterpret_cast<const uint8_t*>(&value);
+ buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
+ } else {
+ if constexpr (sizeof(T) > 1) {
+ T le_value = std::byteswap(value);
+ const auto* bytes = reinterpret_cast<const uint8_t*>(&le_value);
+ buffer.insert(buffer.end(), bytes, bytes + sizeof(T));
+ } else {
+ // For single byte types, no byteswap needed
+ buffer.push_back(static_cast<uint8_t>(value));
+ }
+ }
+}
+
+/// \brief Read a value in little-endian format from the data.
+template <typename T>
+Result<T> ReadLittleEndian(std::span<const uint8_t> data) {
+ static_assert(std::is_trivially_copyable_v<T>, "Type must be trivially
copyable");
Review Comment:
ditio, try to explore concepts.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]