Fokko commented on code in PR #212: URL: https://github.com/apache/iceberg-cpp/pull/212#discussion_r2390069067
########## src/iceberg/table_properties.h: ########## @@ -0,0 +1,303 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include <limits> +#include <memory> +#include <string> +#include <unordered_map> +#include <unordered_set> + +#include "iceberg/iceberg_export.h" +#include "iceberg/util/config.h" + +namespace iceberg { + +/// \brief Table properties for Iceberg tables. +/// +/// This class provides configuration entries for various Iceberg table properties +/// including format settings, commit behavior, file formats, compression settings, +/// and other table-level configurations. +class ICEBERG_EXPORT TableProperties : public ConfigBase<TableProperties> { + public: + template <typename T> + using Entry = const ConfigBase<TableProperties>::Entry<T>; + + // Reserved table properties + + /// \brief Reserved table property for table format version. + /// + /// Iceberg will default a new table's format version to the latest stable and + /// recommended version. This reserved property keyword allows users to override the + /// Iceberg format version of the table metadata. + /// + /// If this table property exists when creating a table, the table will use the + /// specified format version. If a table updates this property, it will try to upgrade + /// to the specified format version. + /// + /// \note incomplete or unstable versions cannot be selected using this property. + inline static Entry<std::string> kFormatVersion{"format-version", ""}; + /// \brief Reserved table property for table UUID. + inline static Entry<std::string> kUuid{"uuid", ""}; + /// \brief Reserved table property for the total number of snapshots. + inline static Entry<std::string> kSnapshotCount{"snapshot-count", ""}; + /// \brief Reserved table property for current snapshot summary. + inline static Entry<std::string> kCurrentSnapshotSummary{"current-snapshot-summary", + ""}; + /// \brief Reserved table property for current snapshot id. + inline static Entry<std::string> kCurrentSnapshotId{"current-snapshot-id", ""}; + /// \brief Reserved table property for current snapshot timestamp. + inline static Entry<std::string> kCurrentSnapshotTimestamp{ + "current-snapshot-timestamp-ms", ""}; + /// \brief Reserved table property for the JSON representation of current schema. + inline static Entry<std::string> kCurrentSchema{"current-schema", ""}; + /// \brief Reserved table property for the JSON representation of current(default) + /// partition spec. + inline static Entry<std::string> kDefaultPartitionSpec{"default-partition-spec", ""}; + /// \brief Reserved table property for the JSON representation of current(default) sort + /// order. + inline static Entry<std::string> kDefaultSortOrder{"default-sort-order", ""}; + + // Commit properties + + inline static Entry<int32_t> kCommitNumRetries{"commit.retry.num-retries", 4}; + inline static Entry<int32_t> kCommitMinRetryWaitMs{"commit.retry.min-wait-ms", 100}; + inline static Entry<int32_t> kCommitMaxRetryWaitMs{"commit.retry.max-wait-ms", + 60 * 1000}; // 1 minute + inline static Entry<int32_t> kCommitTotalRetryTimeMs{"commit.retry.total-timeout-ms", + 30 * 60 * 1000}; // 30 minutes + inline static Entry<int32_t> kCommitNumStatusChecks{"commit.status-check.num-retries", + 3}; + inline static Entry<int64_t> kCommitStatusChecksMinWaitMs{ + "commit.status-check.min-wait-ms", int64_t{1000}}; // 1 second + inline static Entry<int64_t> kCommitStatusChecksMaxWaitMs{ + "commit.status-check.max-wait-ms", int64_t{60 * 1000}}; // 1 minute + inline static Entry<int64_t> kCommitStatusChecksTotalWaitMs{ + "commit.status-check.total-timeout-ms", int64_t{30 * 60 * 1000}}; // 30 minutes + + // Manifest properties + + inline static Entry<int64_t> kManifestTargetSizeBytes{ + "commit.manifest.target-size-bytes", int64_t{8 * 1024 * 1024}}; // 8 MB + inline static Entry<int32_t> kManifestMinMergeCount{ + "commit.manifest.min-count-to-merge", 100}; + inline static Entry<bool> kManifestMergeEnabled{"commit.manifest-merge.enabled", true}; + + // File format properties + + inline static Entry<std::string> kDefaultFileFormat{"write.format.default", "parquet"}; + inline static Entry<std::string> kDeleteDefaultFileFormat{"write.delete.format.default", + "parquet"}; + + // Parquet properties + + inline static Entry<int32_t> kParquetRowGroupSizeBytes{ + "write.parquet.row-group-size-bytes", 128 * 1024 * 1024}; // 128 MB + inline static Entry<int32_t> kDeleteParquetRowGroupSizeBytes{ + "write.delete.parquet.row-group-size-bytes", 128 * 1024 * 1024}; // 128 MB + inline static Entry<int32_t> kParquetPageSizeBytes{"write.parquet.page-size-bytes", + 1024 * 1024}; // 1 MB + inline static Entry<int32_t> kDeleteParquetPageSizeBytes{ + "write.delete.parquet.page-size-bytes", 1024 * 1024}; // 1 MB + inline static Entry<int32_t> kParquetPageRowLimit{"write.parquet.page-row-limit", + 20'000}; + inline static Entry<int32_t> kDeleteParquetPageRowLimit{ + "write.delete.parquet.page-row-limit", 20'000}; + inline static Entry<int32_t> kParquetDictSizeBytes{"write.parquet.dict-size-bytes", + 2 * 1024 * 1024}; // 2 MB + inline static Entry<int32_t> kDeleteParquetDictSizeBytes{ + "write.delete.parquet.dict-size-bytes", 2 * 1024 * 1024}; // 2 MB + inline static Entry<std::string> kParquetCompression{"write.parquet.compression-codec", + "zstd"}; + inline static Entry<std::string> kDeleteParquetCompression{ + "write.delete.parquet.compression-codec", "zstd"}; + inline static Entry<std::string> kParquetCompressionLevel{ + "write.parquet.compression-level", ""}; + inline static Entry<std::string> kDeleteParquetCompressionLevel{ + "write.delete.parquet.compression-level", ""}; + inline static Entry<int32_t> kParquetRowGroupCheckMinRecordCount{ + "write.parquet.row-group-check-min-record-count", 100}; + inline static Entry<int32_t> kDeleteParquetRowGroupCheckMinRecordCount{ + "write.delete.parquet.row-group-check-min-record-count", 100}; + inline static Entry<int32_t> kParquetRowGroupCheckMaxRecordCount{ + "write.parquet.row-group-check-max-record-count", 10'000}; + inline static Entry<int32_t> kDeleteParquetRowGroupCheckMaxRecordCount{ + "write.delete.parquet.row-group-check-max-record-count", 10'000}; + inline static Entry<int32_t> kParquetBloomFilterMaxBytes{ + "write.parquet.bloom-filter-max-bytes", 1024 * 1024}; // 1 MB + inline static std::string_view kParquetBloomFilterColumnFppPrefix{ + "write.parquet.bloom-filter-fpp.column."}; + inline static std::string_view kParquetBloomFilterColumnEnabledPrefix{ + "write.parquet.bloom-filter-enabled.column."}; + inline static std::string_view kParquetColumnStatsEnabledPrefix{ + "write.parquet.stats-enabled.column."}; + + // Avro properties + inline static Entry<std::string> kAvroCompression{"write.avro.compression-codec", + "gzip"}; + inline static Entry<std::string> kDeleteAvroCompression{ + "write.delete.avro.compression-codec", "gzip"}; + inline static Entry<std::string> kAvroCompressionLevel{"write.avro.compression-level", + ""}; + inline static Entry<std::string> kDeleteAvroCompressionLevel{ + "write.delete.avro.compression-level", ""}; + + // ORC properties + inline static Entry<int64_t> kOrcStripeSizeBytes{"write.orc.stripe-size-bytes", Review Comment: Why are we going with int64 here, and with int32 in Parquet? Int32 seems to be sufficient? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
