mxm commented on code in PR #13032:
URL: https://github.com/apache/iceberg/pull/13032#discussion_r2322018020
##########
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/TableMetadataCache.java:
##########
@@ -0,0 +1,261 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink.sink.dynamic;
+
+import com.github.benmanes.caffeine.cache.Cache;
+import com.github.benmanes.caffeine.cache.Caffeine;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Set;
+import org.apache.flink.annotation.Internal;
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.api.java.tuple.Tuple2;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.catalog.Catalog;
+import org.apache.iceberg.catalog.TableIdentifier;
+import org.apache.iceberg.exceptions.NoSuchTableException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * TableMetadataCache is responsible for caching table metadata to avoid hitting the catalog too
+ * frequently. We store table identifier, schema, partition spec, and a set of past schema
+ * comparison results of the active table schema against the last input schemas.
+ */
+@Internal
+class TableMetadataCache {
+
+  private static final Logger LOG = LoggerFactory.getLogger(TableMetadataCache.class);
+  private static final int MAX_SCHEMA_COMPARISON_RESULTS_TO_CACHE = 10;
+  private static final Tuple2<Boolean, Exception> EXISTS = Tuple2.of(true, null);
+  private static final Tuple2<Boolean, Exception> NOT_EXISTS = Tuple2.of(false, null);
+  static final Tuple2<Schema, CompareSchemasVisitor.Result> NOT_FOUND =
+      Tuple2.of(null, CompareSchemasVisitor.Result.SCHEMA_UPDATE_NEEDED);
+
+  private final Catalog catalog;
+  private final long refreshMs;
+  private final Cache<TableIdentifier, CacheItem> cache;
+
+  TableMetadataCache(Catalog catalog, int maximumSize, long refreshMs) {
+    this.catalog = catalog;
+    this.refreshMs = refreshMs;
+    this.cache = Caffeine.newBuilder().maximumSize(maximumSize).build();
+  }
+
+  Tuple2<Boolean, Exception> exists(TableIdentifier identifier) {
+    CacheItem cached = cache.getIfPresent(identifier);
+    if (cached != null && Boolean.TRUE.equals(cached.tableExists)) {
+      return EXISTS;
+    } else if (needsRefresh(cached, true)) {
+      return refreshTable(identifier);
+    } else {
+      return NOT_EXISTS;
+    }
+  }
+
+  String branch(TableIdentifier identifier, String branch) {
+    return branch(identifier, branch, true);
+  }
+
+  Tuple2<Schema, CompareSchemasVisitor.Result> schema(TableIdentifier identifier, Schema input) {
+    return schema(identifier, input, true);
+  }
+
+  PartitionSpec spec(TableIdentifier identifier, PartitionSpec spec) {
+    return spec(identifier, spec, true);
+  }
+
+  void update(TableIdentifier identifier, Table table) {
+    cache.put(
+        identifier,
+        new CacheItem(true, table.refs().keySet(), new SchemaInfo(table.schemas()), table.specs()));
+  }
+
+  private String branch(TableIdentifier identifier, String branch, boolean allowRefresh) {
+    CacheItem cached = cache.getIfPresent(identifier);
+    if (cached != null && cached.tableExists && cached.branches.contains(branch)) {
+      return branch;
+    }
+
+    if (needsRefresh(cached, allowRefresh)) {
+      refreshTable(identifier);
+      return branch(identifier, branch, false);
+    } else {
+      return null;
+    }
+  }
+
+  private Tuple2<Schema, CompareSchemasVisitor.Result> schema(
+      TableIdentifier identifier, Schema input, boolean allowRefresh) {
+    CacheItem cached = cache.getIfPresent(identifier);
+    Schema compatible = null;
+    if (cached != null && cached.tableExists) {
+      // This only works if the {@link Schema#equals(Object)} returns true for the old schema
+      // and a new schema. Performance is paramount as this code is on the hot path. Every other
+      // way for comparing 2 schemas were performing worse than the
+      // {@link CompareByNameVisitor#visit(Schema, Schema, boolean)}, so caching was useless.
+      Tuple2<Schema, CompareSchemasVisitor.Result> lastResult =
+          cached.schema.lastResults.get(input);
+      if (lastResult != null) {
+        return lastResult;
+      }
+
+      for (Map.Entry<Integer, Schema> tableSchema : cached.schema.schemas.entrySet()) {

Review Comment:
> @mxm @pvary, why do we need to loop through all previous table schemas here?

We need to loop through all existing schemas to avoid adding a new schema in case there is already a matching schema. If we do not find a match, we will modify the current table schema and thereby produce a new schema.

> Would it be more correct to always compare the incoming record schema to the latest table schema and evolve the table, or convert the incoming records to match the latest schema?

The logic is as follows:
1. If there is an exact match schema already (i.e. field types and field names match) => use that schema
2. If there is a compatible schema (e.g. extra optional field) => convert RowData to match that schema
3. Otherwise, evolve the current table schema to match the input schema
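For readers following along, here is a minimal, self-contained sketch of that resolution order. It deliberately models schemas as plain field-name/type maps instead of Iceberg `Schema` objects, and every name in it (`SchemaResolutionSketch`, `Resolution`, `resolve`) is hypothetical, not part of this PR:

```java
import java.util.List;
import java.util.Map;

/** Simplified illustration of the resolution order above; names are made up. */
public class SchemaResolutionSketch {

  enum Resolution {
    USE_EXISTING_SCHEMA, // 1. exact match: reuse the matching table schema as-is
    CONVERT_RECORD,      // 2. compatible match: adapt the incoming record to that schema
    EVOLVE_TABLE_SCHEMA  // 3. no match: evolve the current table schema to fit the input
  }

  static Resolution resolve(List<Map<String, String>> tableSchemas, Map<String, String> input) {
    // 1. Look for an exact match (same field names and types) among *all* known schemas.
    for (Map<String, String> tableSchema : tableSchemas) {
      if (tableSchema.equals(input)) {
        return Resolution.USE_EXISTING_SCHEMA;
      }
    }

    // 2. Look for a compatible schema: every input field exists with the same type, and the
    //    table schema merely has extra (optional) fields the record can leave unset.
    for (Map<String, String> tableSchema : tableSchemas) {
      if (tableSchema.entrySet().containsAll(input.entrySet())) {
        return Resolution.CONVERT_RECORD;
      }
    }

    // 3. Nothing usable: the table schema has to be evolved, producing a new schema id.
    return Resolution.EVOLVE_TABLE_SCHEMA;
  }

  public static void main(String[] args) {
    Map<String, String> v1 = Map.of("id", "long", "data", "string");
    Map<String, String> v2 = Map.of("id", "long", "data", "string", "ts", "timestamp");

    System.out.println(resolve(List.of(v1, v2), v1)); // USE_EXISTING_SCHEMA
    System.out.println(resolve(List.of(v2), v1));     // CONVERT_RECORD
    System.out.println(resolve(List.of(v1), Map.of("id", "long", "region", "string"))); // EVOLVE_TABLE_SCHEMA
  }
}
```

This also shows why the loop over all cached schemas matters: evolving the table is only the fallback once neither an exact nor a compatible existing schema is found.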
> The side effect of the current approach is that the dynamic sink will create multiple DynamicCommittable instances for each resolved table schema of cached.schema.schemas.entrySet() in the [dynamic commit aggregator](https://github.com/apache/iceberg/blob/be03c998d96d0d1fae13aa8c53d6c7c87e2d60ba/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicWriteResultAggregator.java#L101), [..]

You're right that the schema isn't used in the commit process; the data files already reference the schema id. Including the schema in the WriteTarget does create more DynamicCommittable instances than we would otherwise have, but semantically it doesn't change how the table gets modified in the end.

> [..] which will issue multiple commits to an Iceberg table per checkpoint - one commit per each table schema id

I'm not sure that is true. We commit once per table / branch.

> However, each of these commits will have a schema-id in its metadata snapshot pointing to the latest table schema, which seems wrong. See the [SnapshotProducer implementation](https://github.com/apache/iceberg/blob/be03c998d96d0d1fae13aa8c53d6c7c87e2d60ba/core/src/main/java/org/apache/iceberg/SnapshotProducer.java#L322).

I think it makes sense to use the latest schema, since we have to pick one schema and cannot include all of them.

> Is it valid in Iceberg to constantly alternate between multiple schemas? Or should we only evolve the latest schema and adjust incoming records to match it? What do you think?

We only ever evolve the current table schema. We adjust incoming records if there is no existing schema that we can use. But we need to support writing with old schemas, because it is a common use case that old data gets written. Since we don't allow breaking changes like removing fields, all old schemas remain valid and can be used safely, while staying compatible with newer schemas.
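To make that last point concrete: "evolving the current table schema" is an additive operation through the regular Iceberg API, which is why records shaped like an older schema stay writable. A hedged sketch follows; the class, method, and column names are illustrative and not part of this PR, and `table` is assumed to be an already-loaded `org.apache.iceberg.Table`:

```java
import org.apache.iceberg.Table;
import org.apache.iceberg.types.Types;

/** Illustration only; not part of this PR. */
public class AdditiveEvolutionSketch {

  static void addOptionalTimestamp(Table table) {
    // addColumn() introduces an optional column, so data files written against the
    // previous schema (which simply lack the new field) remain valid and readable.
    table.updateSchema()
        .addColumn("event_ts", Types.TimestampType.withZone())
        .commit();

    // Earlier schema versions stay available in the table metadata (Table#schemas()),
    // which is what lets the sink keep matching incoming records against them.
  }
}
```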
