yiguolei commented on code in PR #30746: URL: https://github.com/apache/doris/pull/30746#discussion_r1477233912
########## be/src/vec/exec/scan/scanner_scheduler.cpp: ########## @@ -310,61 +244,43 @@ void ScannerScheduler::_scanner_scan(ScannerScheduler* scheduler, if (!scanner->is_init()) { status = scanner->init(); if (!status.ok()) { - ctx->set_status_on_error(status); eos = true; } } + if (!eos && !scanner->is_open()) { status = scanner->open(state); if (!status.ok()) { - ctx->set_status_on_error(status); eos = true; } scanner->set_opened(); } static_cast<void>(scanner->try_append_late_arrival_runtime_filter()); - // Because we use thread pool to scan data from storage. One scanner can't - // use this thread too long, this can starve other query's scanner. So, we - // need yield this thread when we do enough work. However, OlapStorage read - // data in pre-aggregate mode, then we can't use storage returned data to - // judge if we need to yield. So we record all raw data read in this round - // scan, if this exceeds row number or bytes threshold, we yield this thread. - std::vector<vectorized::BlockUPtr> blocks; - int64_t raw_bytes_read = 0; - int64_t raw_bytes_threshold = config::doris_scanner_row_bytes; - int num_rows_in_block = 0; - - // Only set to true when ctx->done() return true. - // Use this flag because we need distinguish eos from `should_stop`. - // If eos is true, we still need to return blocks, - // but is should_stop is true, no need to return blocks - bool should_stop = false; - // Has to wait at least one full block, or it will cause a lot of schedule task in priority - // queue, it will affect query latency and query concurrency for example ssb 3.3. - auto should_do_scan = [&, batch_size = state->batch_size(), - time = state->wait_full_block_schedule_times()]() { - if (raw_bytes_read < raw_bytes_threshold) { - return true; - } else if (num_rows_in_block < batch_size) { - return raw_bytes_read < raw_bytes_threshold * time; - } - return false; - }; - - while (!eos && should_do_scan()) { - // TODO llj task group should should_yield? 
+ bool first_read = true; + int last_read_rows = ctx->batch_size(); + while (!eos) { if (UNLIKELY(ctx->done())) { - // No need to set status on error here. - // Because done() maybe caused by "should_stop" - should_stop = true; + eos = true; break; } + BlockUPtr free_block = nullptr; + if (first_read) { + status = scanner->get_block_after_projects(state, running_scanner->current_block.get(), + &eos); + first_read = false; + if (running_scanner->current_block->rows() > 0) { + last_read_rows = running_scanner->current_block->rows(); + } + } else { + free_block = ctx->get_free_block(last_read_rows); Review Comment: I think the block's row count should always equal state.batch_size, so that the block can be reused. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org