This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 9e0a29b0fbab9920db77c4618d8596412b73eea3 Author: meiyi <myime...@gmail.com> AuthorDate: Thu Jun 13 19:38:14 2024 +0800 [fix](group commit) make group commit cancel in time (#36249) ## Proposed changes If the group commit time interval is larger than the load timeout, and there is no new client load to reuse the internal group commit load, the group commit cannot be cancelled in time because it is stuck in a wait: ``` #0 0x00007f33937a47aa in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00005651105dbd05 in __gthread_cond_timedwait(pthread_cond_t*, pthread_mutex_t*, timespec const*) () #2 0x000056511063f385 in std::__condvar::wait_until(std::mutex&, timespec&) () #3 0x000056511063dc2e in std::cv_status std::condition_variable::__wait_until_impl<std::chrono::duration<long, std::ratio<1l, 1000000000l> > >(std::unique_lock<std::mutex>&, std::chrono::time_point<std::chrono::_V2::system_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > > const&) () #4 0x000056511063cedf in std::cv_status std::condition_variable::wait_until<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >(std::unique_lock<std::mutex>&, std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > > const&) () #5 0x0000565110824f48 in std::cv_status std::condition_variable::wait_for<long, std::ratio<1l, 1000l> >(std::unique_lock<std::mutex>&, std::chrono::duration<long, std::ratio<1l, 1000l> > const&) () #6 0x0000565113b5612a in doris::LoadBlockQueue::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*, bool*) () #7 0x000056513f900941 in doris::pipeline::GroupCommitOperatorX::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) () #8 0x000056513c69c0b6 in doris::pipeline::ScanOperatorX<doris::pipeline::GroupCommitLocalState>::get_block_after_projects(doris::RuntimeState*, doris::vectorized::Block*, bool*) () #9 0x000056514009d5f1 in 
doris::pipeline::PipelineTask::execute(bool*) () #10 0x00005651400fb24a in doris::pipeline::TaskScheduler::_do_work(unsigned long) () ``` --- be/src/runtime/group_commit_mgr.cpp | 2 +- .../insert_p0/test_group_commit_timeout.groovy | 55 ++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/be/src/runtime/group_commit_mgr.cpp b/be/src/runtime/group_commit_mgr.cpp index 3faafa90b66..06cf494c842 100644 --- a/be/src/runtime/group_commit_mgr.cpp +++ b/be/src/runtime/group_commit_mgr.cpp @@ -149,7 +149,7 @@ Status LoadBlockQueue::get_block(RuntimeState* runtime_state, vectorized::Block* << ", runtime_state=" << runtime_state; } } - _get_cond.wait_for(l, std::chrono::milliseconds(left_milliseconds)); + _get_cond.wait_for(l, std::chrono::milliseconds(std::min(left_milliseconds, 10000L))); } if (runtime_state->is_cancelled()) { auto st = runtime_state->cancel_reason(); diff --git a/regression-test/suites/insert_p0/test_group_commit_timeout.groovy b/regression-test/suites/insert_p0/test_group_commit_timeout.groovy new file mode 100644 index 00000000000..7866a33df0e --- /dev/null +++ b/regression-test/suites/insert_p0/test_group_commit_timeout.groovy @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_group_commit_timeout", "nonConcurrent") { + def tableName = "test_group_commit_timeout" + sql """ + CREATE TABLE if not exists ${tableName} ( + `id` int(11) NOT NULL, + `name` varchar(100) NULL, + `score` int(11) NULL default "-1" + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "group_commit_interval_ms" = "300000" + ); + """ + + def query_timeout = sql """show variables where variable_name = 'query_timeout';""" + def insert_timeout = sql """show variables where variable_name = 'insert_timeout';""" + logger.info("query_timeout: ${query_timeout}, insert_timeout: ${insert_timeout}") + + long start = System.currentTimeMillis() + try { + sql "SET global query_timeout = 5" + sql "SET global insert_timeout = 5" + + sql "set group_commit = sync_mode" + sql "insert into ${tableName} values(1, 'a', 10)" + assertTrue(false) + } catch (Exception e) { + long end = System.currentTimeMillis() + logger.info("failed " + e.getMessage()) + assertTrue(e.getMessage().contains("FragmentMgr cancel worker going to cancel timeout instance")) + assertTrue(end - start <= 60000) + } finally { + sql "SET global query_timeout = ${query_timeout[0][1]}" + sql "SET global insert_timeout = ${insert_timeout[0][1]}" + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org