evindj commented on code in PR #588: URL: https://github.com/apache/iceberg-cpp/pull/588#discussion_r2928796644
########## src/iceberg/puffin/blob.cc: ########## @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/puffin/blob.h" + +#include <format> + +#include "iceberg/util/formatter_internal.h" + +namespace iceberg::puffin { + +std::string ToString(const Blob& blob) { + std::string repr = "Blob["; + std::format_to(std::back_inserter(repr), "type='{}',inputFields={},", blob.type, + blob.input_fields); + std::format_to(std::back_inserter(repr), "snapshotId={},sequenceNumber={},", + blob.snapshot_id, blob.sequence_number); + std::format_to(std::back_inserter(repr), "dataSize={}", blob.data.size()); Review Comment: Nit: may be add a checksum if available for fast comparison(I do understand checksum is not part of the puffin spec). ########## src/iceberg/puffin/blob_metadata.h: ########## @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/puffin/blob_metadata.h +/// Blob metadata structure for Puffin files. + +#include <cstdint> +#include <optional> +#include <string> +#include <unordered_map> +#include <vector> + +#include "iceberg/iceberg_export.h" + +namespace iceberg::puffin { + +/// \brief Metadata about a blob stored in a Puffin file. +/// +/// This represents the metadata stored in the Puffin file footer, +/// including the blob's location within the file. +struct ICEBERG_EXPORT BlobMetadata { + /// Type of the blob. See StandardBlobTypes for known types. + std::string type; Review Comment: It is weird that the type is declared as string. it makes it harder to understand what type of blobs are accepted but harder to catch typos. ########## src/iceberg/puffin/blob_metadata.h: ########## @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/puffin/blob_metadata.h +/// Blob metadata structure for Puffin files. + +#include <cstdint> +#include <optional> +#include <string> +#include <unordered_map> +#include <vector> + +#include "iceberg/iceberg_export.h" + +namespace iceberg::puffin { + +/// \brief Metadata about a blob stored in a Puffin file. +/// +/// This represents the metadata stored in the Puffin file footer, +/// including the blob's location within the file. +struct ICEBERG_EXPORT BlobMetadata { + /// Type of the blob. See StandardBlobTypes for known types. + std::string type; + /// List of field IDs the blob was computed for. + std::vector<int32_t> input_fields; + /// ID of the Iceberg table's snapshot the blob was computed from. + int64_t snapshot_id; + /// Sequence number of the Iceberg table's snapshot the blob was computed from. + int64_t sequence_number; + /// Offset in the file where the blob data starts. + int64_t offset; + /// Length of the blob data in the file (after compression, if compressed). Review Comment: How to check if the blob is compressed, Is it if the compression_codec is set? ########## src/iceberg/puffin/types.h: ########## @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/puffin/types.h +/// Standard blob types and properties for Puffin files. + +#include <string_view> + +namespace iceberg::puffin { + +/// \brief Standard blob types defined by the Iceberg specification. +struct StandardBlobTypes { + /// A serialized form of a "compact" Theta sketch produced by the + /// Apache DataSketches library. + static constexpr std::string_view kApacheDatasketchesThetaV1 = + "apache-datasketches-theta-v1"; + + /// A serialized deletion vector according to the Iceberg spec. + static constexpr std::string_view kDeletionVectorV1 = "deletion-vector-v1"; +}; Review Comment: Should we do enum + toString ? ########## src/iceberg/puffin/blob_metadata.h: ########## @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/puffin/blob_metadata.h +/// Blob metadata structure for Puffin files. + +#include <cstdint> +#include <optional> +#include <string> +#include <unordered_map> +#include <vector> + +#include "iceberg/iceberg_export.h" + +namespace iceberg::puffin { + +/// \brief Metadata about a blob stored in a Puffin file. +/// +/// This represents the metadata stored in the Puffin file footer, +/// including the blob's location within the file. +struct ICEBERG_EXPORT BlobMetadata { + /// Type of the blob. See StandardBlobTypes for known types. + std::string type; + /// List of field IDs the blob was computed for. + std::vector<int32_t> input_fields; + /// ID of the Iceberg table's snapshot the blob was computed from. + int64_t snapshot_id; + /// Sequence number of the Iceberg table's snapshot the blob was computed from. + int64_t sequence_number; + /// Offset in the file where the blob data starts. + int64_t offset; + /// Length of the blob data in the file (after compression, if compressed). + int64_t length; + /// Compression codec name, or std::nullopt if uncompressed. + std::optional<std::string> compression_codec; Review Comment: ditto same pattern string or enum -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
