junrao commented on code in PR #16969: URL: https://github.com/apache/kafka/pull/16969#discussion_r1739168653
########## core/src/main/java/kafka/server/share/DelayedShareFetch.java: ########## @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package kafka.server.share; + +import kafka.server.DelayedOperation; +import kafka.server.LogReadResult; +import kafka.server.QuotaFactory; +import kafka.server.ReplicaManager; + +import org.apache.kafka.common.TopicIdPartition; +import org.apache.kafka.common.requests.FetchRequest; +import org.apache.kafka.storage.internals.log.FetchPartitionData; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import scala.Option; +import scala.Tuple2; +import scala.collection.Seq; +import scala.jdk.javaapi.CollectionConverters; +import scala.runtime.BoxedUnit; + +/** + * A delayed share fetch operation has been introduced in case there is no share partition for which we can acquire records. We will try to wait + * for MaxWaitMs for records to be released else complete the share fetch request. + */ +public class DelayedShareFetch extends DelayedOperation { + private final SharePartitionManager.ShareFetchPartitionData shareFetchPartitionData; + private final ReplicaManager replicaManager; + private final Map<SharePartitionManager.SharePartitionKey, SharePartition> partitionCacheMap; + private final Map<TopicIdPartition, FetchRequest.PartitionData> topicPartitionDataFromTryComplete = new LinkedHashMap<>(); + + private static final Logger log = LoggerFactory.getLogger(DelayedShareFetch.class); + + DelayedShareFetch( + SharePartitionManager.ShareFetchPartitionData shareFetchPartitionData, + ReplicaManager replicaManager, + Map<SharePartitionManager.SharePartitionKey, SharePartition> partitionCacheMap) { + super(shareFetchPartitionData.fetchParams().maxWaitMs, Option.empty()); + this.shareFetchPartitionData = shareFetchPartitionData; + this.replicaManager = replicaManager; + this.partitionCacheMap = partitionCacheMap; + } + + @Override + public void onExpiration() { + } + + /** + * Complete the share fetch operation by fetching records for all partitions in the share fetch request irrespective + * of whether they have any acquired records. This is called when the fetch operation is forced to complete either Review Comment: > irrespective of whether they have any acquired records. This seems outdated? ########## server/src/main/java/org/apache/kafka/server/config/ShareGroupConfig.java: ########## @@ -63,14 +63,19 @@ public class ShareGroupConfig { public static final int SHARE_GROUP_MAX_RECORD_LOCK_DURATION_MS_DEFAULT = 60000; public static final String SHARE_GROUP_MAX_RECORD_LOCK_DURATION_MS_DOC = "The record acquisition lock maximum duration in milliseconds for share groups."; + public static final String SHARE_FETCH_PURGATORY_PURGE_INTERVAL_REQUESTS_CONFIG = "share.fetch.purgatory.purge.interval.requests"; Review Comment: This config is not in the KIP. So we need to update the KIP. ########## core/src/main/java/kafka/server/share/SharePartitionManager.java: ########## @@ -564,16 +592,29 @@ void maybeProcessFetchQueue() { ); } else { sharePartition.releaseFetchLock(); - log.info("Record lock partition limit exceeded for SharePartition with key {}, " + - "cannot acquire more records", sharePartitionKey); } } }); - if (topicPartitionData.isEmpty()) { - // No locks for share partitions could be acquired, so we complete the request and - // will re-fetch for the client in next poll. + if (shareFetchPartitionData.partitionMaxBytes.isEmpty()) { Review Comment: Should we move this check earlier? ########## core/src/main/java/kafka/server/share/SharePartitionManager.java: ########## @@ -564,16 +592,29 @@ void maybeProcessFetchQueue() { ); } else { sharePartition.releaseFetchLock(); - log.info("Record lock partition limit exceeded for SharePartition with key {}, " + - "cannot acquire more records", sharePartitionKey); } } }); - if (topicPartitionData.isEmpty()) { - // No locks for share partitions could be acquired, so we complete the request and - // will re-fetch for the client in next poll. + if (shareFetchPartitionData.partitionMaxBytes.isEmpty()) { + // If there are no partitions to fetch then complete the future with an empty map. shareFetchPartitionData.future.complete(Collections.emptyMap()); + // Release the lock so that other threads can process the queue. + releaseProcessFetchQueueLock(); + if (!fetchQueue.isEmpty()) + maybeProcessFetchQueue(); + return; + } + if (topicPartitionData.isEmpty()) { + // No locks for any of the share partitions in the fetch request could be acquired. + Set<Object> delayedShareFetchWatchKeys = new HashSet<>(); + shareFetchPartitionData.partitionMaxBytes.keySet().forEach( + topicIdPartition -> delayedShareFetchWatchKeys.add( + new DelayedShareFetchKey(topicIdPartition, shareFetchPartitionData.groupId))); + + // Add the share fetch to the delayed share fetch purgatory to process the fetch request. + addDelayedShareFetch(new DelayedShareFetch(shareFetchPartitionData, replicaManager, partitionCacheMap), Review Comment: `delayedShareFetchPurgatory.tryCompleteElseWatch()` calls `tryComplete`, which tries to acquire the partition level lock. Could we remove the logic for acquiring the partition lock in this method? ########## core/src/main/java/kafka/server/share/DelayedShareFetch.java: ########## @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package kafka.server.share; + +import kafka.server.DelayedOperation; +import kafka.server.LogReadResult; +import kafka.server.QuotaFactory; +import kafka.server.ReplicaManager; + +import org.apache.kafka.common.TopicIdPartition; +import org.apache.kafka.common.requests.FetchRequest; +import org.apache.kafka.storage.internals.log.FetchPartitionData; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import scala.Option; +import scala.Tuple2; +import scala.collection.Seq; +import scala.jdk.javaapi.CollectionConverters; +import scala.runtime.BoxedUnit; + +/** + * A delayed share fetch operation has been introduced in case there is no share partition for which we can acquire records. We will try to wait + * for MaxWaitMs for records to be released else complete the share fetch request. + */ +public class DelayedShareFetch extends DelayedOperation { + private final SharePartitionManager.ShareFetchPartitionData shareFetchPartitionData; + private final ReplicaManager replicaManager; + private final Map<SharePartitionManager.SharePartitionKey, SharePartition> partitionCacheMap; + private final Map<TopicIdPartition, FetchRequest.PartitionData> topicPartitionDataFromTryComplete = new LinkedHashMap<>(); + + private static final Logger log = LoggerFactory.getLogger(DelayedShareFetch.class); + + DelayedShareFetch( + SharePartitionManager.ShareFetchPartitionData shareFetchPartitionData, + ReplicaManager replicaManager, + Map<SharePartitionManager.SharePartitionKey, SharePartition> partitionCacheMap) { + super(shareFetchPartitionData.fetchParams().maxWaitMs, Option.empty()); + this.shareFetchPartitionData = shareFetchPartitionData; + this.replicaManager = replicaManager; + this.partitionCacheMap = partitionCacheMap; + } + + @Override + public void onExpiration() { + } + + /** + * Complete the share fetch operation by fetching records for all partitions in the share fetch request irrespective + * of whether they have any acquired records. This is called when the fetch operation is forced to complete either + * because records can be acquired for some partitions or due to MaxWaitMs timeout. + */ + @Override + public void onComplete() { + log.trace("Completing the delayed share fetch request for group {}, member {}, " + + "topic partitions {}", shareFetchPartitionData.groupId(), + shareFetchPartitionData.memberId(), shareFetchPartitionData.partitionMaxBytes().keySet()); + + if (shareFetchPartitionData.future().isDone()) + return; + + Map<TopicIdPartition, FetchRequest.PartitionData> topicPartitionData; + // tryComplete did not invoke forceComplete, so we need to check if we have any partitions to fetch. + if (topicPartitionDataFromTryComplete.isEmpty()) + topicPartitionData = topicPartitionDataForAcquirablePartitions(); + // tryComplete invoked forceComplete, so we can use the data from tryComplete. + else + topicPartitionData = topicPartitionDataFromTryComplete; + try { + if (topicPartitionData.isEmpty()) { + // No locks for share partitions could be acquired, so we complete the request with an empty response. + shareFetchPartitionData.future().complete(Collections.emptyMap()); + return; + } + log.trace("Fetchable share partitions data: {} with groupId: {} fetch params: {}", + topicPartitionData, shareFetchPartitionData.groupId(), shareFetchPartitionData.fetchParams()); + + Seq<Tuple2<TopicIdPartition, LogReadResult>> responseLogResult = replicaManager.readFromLog( + shareFetchPartitionData.fetchParams(), + CollectionConverters.asScala( + topicPartitionData.entrySet().stream().map(entry -> + new Tuple2<>(entry.getKey(), entry.getValue())).collect(Collectors.toList()) + ), + QuotaFactory.UnboundedQuota$.MODULE$, + true); + + List<Tuple2<TopicIdPartition, FetchPartitionData>> responseData = new ArrayList<>(); + responseLogResult.foreach(tpLogResult -> { + TopicIdPartition topicIdPartition = tpLogResult._1(); + LogReadResult logResult = tpLogResult._2(); + FetchPartitionData fetchPartitionData = logResult.toFetchPartitionData(false); + responseData.add(new Tuple2<>(topicIdPartition, fetchPartitionData)); + return BoxedUnit.UNIT; + }); + + log.trace("Data successfully retrieved by replica manager: {}", responseData); + ShareFetchUtils.processFetchResponse(shareFetchPartitionData, responseData, partitionCacheMap, replicaManager) + .whenComplete((result, throwable) -> { + if (throwable != null) { + log.error("Error processing fetch response for share partitions", throwable); + shareFetchPartitionData.future().completeExceptionally(throwable); + } else { + shareFetchPartitionData.future().complete(result); + } + // Releasing the lock to move ahead with the next request in queue. + releasePartitionsLock(shareFetchPartitionData.groupId(), topicPartitionData.keySet()); + }); + + } catch (Exception e) { + // Release the locks acquired for the partitions in the share fetch request in case there is an exception + log.error("Error processing delayed share fetch request", e); + shareFetchPartitionData.future().completeExceptionally(e); + releasePartitionsLock(shareFetchPartitionData.groupId(), topicPartitionData.keySet()); + } + } + + /** + * Try to complete the fetch operation if we can acquire records for any partition in the share fetch request. + */ + @Override + public boolean tryComplete() { + log.trace("Try to complete the delayed share fetch request for group {}, member {}, topic partitions {}", + shareFetchPartitionData.groupId(), shareFetchPartitionData.memberId(), + shareFetchPartitionData.partitionMaxBytes().keySet()); + + for (TopicIdPartition topicIdPartition: shareFetchPartitionData.partitionMaxBytes().keySet()) { Review Comment: Could we reuse `topicPartitionDataForAcquirablePartitions` for the following code? ########## core/src/main/java/kafka/server/share/SharePartitionManager.java: ########## @@ -281,6 +291,12 @@ public CompletableFuture<Map<TopicIdPartition, ShareAcknowledgeResponseData.Part }); return Errors.NONE; }); + + // If we have an acknowledgement completed for a topic-partition, then we should check if + // there is a pending share fetch request for the topic-partition and complete it. + DelayedShareFetchKey delayedShareFetchKey = new DelayedShareFetchKey(topicIdPartition, groupId); + delayedShareFetchPurgatory.checkAndComplete(delayedShareFetchKey); Review Comment: It seems that we should call `delayedShareFetchPurgatory` in some other places. 1. When the partition level lock is released. 2. When the HWM advances in a partition. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
