This is an automated email from the ASF dual-hosted git repository.

gavinchou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
     new 3d5575e05e1  [fix](cold hot separation) Fix the issue that files on the remote storage are not deleted after triggering cold data compaction. (#48109)
3d5575e05e1 is described below

commit 3d5575e05e11911683653df716ef93d9585fcc9c
Author: yagagagaga <zhangminkefromflyd...@gmail.com>
AuthorDate: Thu Feb 27 21:10:18 2025 +0800

    [fix](cold hot separation) Fix the issue that files on the remote storage are not deleted after triggering cold data compaction. (#48109)
---
 be/src/olap/olap_server.cpp                        |   2 +-
 .../org/apache/doris/regression/suite/Suite.groovy |  18 +++
 regression-test/pipeline/p0/conf/be.conf           |   4 +
 .../cold_data_compaction.groovy                    | 132 +++++++++++++++++++++
 4 files changed, 155 insertions(+), 1 deletion(-)

diff --git a/be/src/olap/olap_server.cpp b/be/src/olap/olap_server.cpp
index 19eea915723..0ab3c694292 100644
--- a/be/src/olap/olap_server.cpp
+++ b/be/src/olap/olap_server.cpp
@@ -1285,7 +1285,7 @@ void StorageEngine::do_remove_unused_remote_files() {
             }
             cooldown_meta_id = t->tablet_meta()->cooldown_meta_id();
         }
-        auto [cooldown_replica_id, cooldown_term] = t->cooldown_conf();
+        auto [cooldown_term, cooldown_replica_id] = t->cooldown_conf();
         if (cooldown_replica_id != t->replica_id()) {
             return;
         }
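The one-line change above is the entire backend fix. C++ structured bindings are purely positional: the names on the left-hand side do not reorder the values coming back from the call. Per this patch, cooldown_conf() returns the cooldown term first and the cooldown replica id second, so the old binding put the term into a variable named cooldown_replica_id. The guard that follows, if (cooldown_replica_id != t->replica_id()), then compared a term value against a replica id, two numbers that essentially never coincide, so do_remove_unused_remote_files() returned early and stale files stayed on remote storage. Below is a minimal, self-contained sketch of the pitfall; cooldown_conf() is stubbed here purely for illustration, with sample values, and the real Doris signature may differ:

    #include <cstdint>
    #include <iostream>
    #include <utility>

    // Hypothetical stand-in for Tablet::cooldown_conf(). Per the fix above, the
    // first element is the cooldown term and the second is the cooldown replica id.
    std::pair<int64_t, int64_t> cooldown_conf() {
        return {/*cooldown_term=*/3, /*cooldown_replica_id=*/10086};  // sample values
    }

    int main() {
        // Pre-fix order: the name says "replica id" but it receives the term (3).
        auto [replica_id_wrong, term_wrong] = cooldown_conf();
        // Post-fix order: the names line up with the positions of the returned values.
        auto [term, replica_id] = cooldown_conf();

        std::cout << "wrong: replica_id=" << replica_id_wrong << " term=" << term_wrong << "\n";  // replica_id=3 term=10086
        std::cout << "fixed: replica_id=" << replica_id << " term=" << term << "\n";              // replica_id=10086 term=3
        return 0;
    }

With the swapped binding, a check such as replica_id_wrong != 10086 would wrongly conclude that this replica does not own the cooldown job, which is why the remote-file cleanup was skipped.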
diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
index 957a13efe5e..f567cd57d8f 100644
--- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
+++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
@@ -17,6 +17,12 @@
 
 package org.apache.doris.regression.suite
 
+import com.amazonaws.auth.AWSStaticCredentialsProvider
+import com.amazonaws.auth.BasicAWSCredentials
+import com.amazonaws.client.builder.AwsClientBuilder
+import com.amazonaws.services.s3.AmazonS3
+import com.amazonaws.services.s3.AmazonS3ClientBuilder
+
 import static java.util.concurrent.TimeUnit.SECONDS
 
 import com.google.common.base.Strings
@@ -97,6 +103,8 @@ class Suite implements GroovyInterceptable {
     final List<Future> lazyCheckFutures = new Vector<>()
     static Boolean isTrinoConnectorDownloaded = false
 
+    private AmazonS3 s3Client = null
+
     Suite(String name, String group, SuiteContext context, SuiteCluster cluster) {
         this.name = name
         this.group = group
@@ -989,6 +997,16 @@ class Suite implements GroovyInterceptable {
         return s3Url
     }
 
+    synchronized AmazonS3 getS3Client() {
+        if (s3Client == null) {
+            def credentials = new BasicAWSCredentials(getS3AK(), getS3SK())
+            def endpointConfiguration = new AwsClientBuilder.EndpointConfiguration(getS3Endpoint(), getS3Region())
+            s3Client = AmazonS3ClientBuilder.standard().withEndpointConfiguration(endpointConfiguration)
+                    .withCredentials(new AWSStaticCredentialsProvider(credentials)).build()
+        }
+        return s3Client
+    }
+
     String getJdbcPassword() {
         String sk = context.config.otherConfigs.get("jdbcPassword");
         return sk
diff --git a/regression-test/pipeline/p0/conf/be.conf b/regression-test/pipeline/p0/conf/be.conf
index 8b240d9f069..958f901db59 100644
--- a/regression-test/pipeline/p0/conf/be.conf
+++ b/regression-test/pipeline/p0/conf/be.conf
@@ -78,5 +78,9 @@ enable_write_index_searcher_cache=true
 
 # enable download small files in batch, see apache/doris#45061 for details
 enable_batch_download = true
+remove_unused_remote_files_interval_sec=60
+cold_data_compaction_interval_sec=60
+
 # So feature has bug, so by default is false, only open it in pipeline to observe
 enable_parquet_page_index=true
+
diff --git a/regression-test/suites/cold_heat_separation/cold_data_compaction.groovy b/regression-test/suites/cold_heat_separation/cold_data_compaction.groovy
new file mode 100644
index 00000000000..bf9e33e7528
--- /dev/null
+++ b/regression-test/suites/cold_heat_separation/cold_data_compaction.groovy
@@ -0,0 +1,132 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import com.amazonaws.services.s3.model.ListObjectsRequest
+import java.util.function.Supplier
+
+suite("test_cold_data_compaction") {
+    def retryUntilTimeout = { int timeoutSecond, Supplier<Boolean> closure ->
+        long start = System.currentTimeMillis()
+        while (true) {
+            if (closure.get()) {
+                return
+            } else {
+                if (System.currentTimeMillis() - start > timeoutSecond * 1000) {
+                    throw new RuntimeException("" +
+                            "Operation timeout, maybe you need to check " +
+                            "remove_unused_remote_files_interval_sec and " +
+                            "cold_data_compaction_interval_sec in be.conf")
+                } else {
+                    sleep(10_000)
+                }
+            }
+        }
+    }
+
+    String suffix = UUID.randomUUID().hashCode().abs().toString()
+    String s3Prefix = "regression/cold_data_compaction"
+    multi_sql """
+        DROP TABLE IF EXISTS t_recycle_in_s3;
+        DROP STORAGE POLICY IF EXISTS test_policy_${suffix};
+        DROP RESOURCE IF EXISTS 'remote_s3_${suffix}';
+
+        CREATE RESOURCE "remote_s3_${suffix}"
+        PROPERTIES
+        (
+            "type" = "s3",
+            "s3.endpoint" = "${getS3Endpoint()}",
+            "s3.region" = "${getS3Region()}",
+            "s3.bucket" = "${getS3BucketName()}",
+            "s3.root.path" = "${s3Prefix}",
+            "s3.access_key" = "${getS3AK()}",
+            "s3.secret_key" = "${getS3SK()}",
+            "s3.connection.maximum" = "50",
+            "s3.connection.request.timeout" = "3000",
+            "s3.connection.timeout" = "1000"
+        );
+        CREATE STORAGE POLICY test_policy_${suffix}
+        PROPERTIES(
+            "storage_resource" = "remote_s3_${suffix}",
+            "cooldown_ttl" = "5"
+        );
+        CREATE TABLE IF NOT EXISTS t_recycle_in_s3
+        (
+            k1 BIGINT,
+            k2 LARGEINT,
+            v1 VARCHAR(2048)
+        )
+        DISTRIBUTED BY HASH (k1) BUCKETS 1
+        PROPERTIES(
+            "storage_policy" = "test_policy_${suffix}",
+            "disable_auto_compaction" = "true",
+            "replication_num" = "1"
+        );
+    """
+
+    // insert 5 RowSets
+    multi_sql """
+        insert into t_recycle_in_s3 values(1, 1, 'Tom');
+        insert into t_recycle_in_s3 values(2, 2, 'Jelly');
+        insert into t_recycle_in_s3 values(3, 3, 'Spike');
+        insert into t_recycle_in_s3 values(4, 4, 'Tyke');
+        insert into t_recycle_in_s3 values(5, 5, 'Tuffy');
+    """
+
+    // wait until files upload to S3
+    retryUntilTimeout(1800, {
+        def res = sql_return_maparray "show data from t_recycle_in_s3"
+        String size = ""
+        String remoteSize = ""
+        for (final def line in res) {
+            if ("t_recycle_in_s3".equals(line.TableName)) {
+                size = line.Size
+                remoteSize = line.RemoteSize
+                break
+            }
+        }
+        logger.info("waiting for data to be uploaded to S3: t_recycle_in_s3's local data size: ${size}, remote data size: ${remoteSize}")
+        return size.startsWith("0") && !remoteSize.startsWith("0")
+    })
+
+    String tabletId = sql_return_maparray("show tablets from t_recycle_in_s3")[0].TabletId
+    // check number of remote files
+    def filesBeforeCompaction = getS3Client().listObjects(
+            new ListObjectsRequest().withBucketName(getS3BucketName()).withPrefix(s3Prefix + "/data/${tabletId}")).getObjectSummaries()
+
+    // 5 RowSets + 1 meta
+    assertEquals(6, filesBeforeCompaction.size())
+
+    // trigger cold data compaction
+    sql """alter table t_recycle_in_s3 set ("disable_auto_compaction" = "false")"""
+
+    // wait until compaction finish
+    retryUntilTimeout(1800, {
+        def filesAfterCompaction = getS3Client().listObjects(
+                new ListObjectsRequest().withBucketName(getS3BucketName()).withPrefix(s3Prefix + "/data/${tabletId}")).getObjectSummaries()
+        logger.info("t_recycle_in_s3's remote file number is ${filesAfterCompaction.size()}")
+        // 1 RowSet + 1 meta
+        return filesAfterCompaction.size() == 2
+    })
+
+    sql "drop table t_recycle_in_s3 force"
+    retryUntilTimeout(1800, {
+        def filesAfterDrop = getS3Client().listObjects(
+                new ListObjectsRequest().withBucketName(getS3BucketName()).withPrefix(s3Prefix + "/data/${tabletId}")).getObjectSummaries()
+        logger.info("after drop t_recycle_in_s3, remote file number is ${filesAfterDrop.size()}")
+        return filesAfterDrop.size() == 0
+    })
+}

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org