This is an automated email from the ASF dual-hosted git repository. lihaopeng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new e140938d81 [Perfomance][export] Opt the export of CSV tranformer (#24003) e140938d81 is described below commit e140938d811fb1fab51c035a334170fd12e09c1c Author: HappenLee <happen...@hotmail.com> AuthorDate: Fri Sep 8 20:26:54 2023 +0800 [Perfomance][export] Opt the export of CSV tranformer (#24003) --- be/src/vec/runtime/vcsv_transformer.cpp | 80 +++++++++++++++++---------------- be/src/vec/runtime/vcsv_transformer.h | 5 +-- 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/be/src/vec/runtime/vcsv_transformer.cpp b/be/src/vec/runtime/vcsv_transformer.cpp index da5a697460..1db5440b9c 100644 --- a/be/src/vec/runtime/vcsv_transformer.cpp +++ b/be/src/vec/runtime/vcsv_transformer.cpp @@ -90,29 +90,34 @@ Status VCSVTransformer::write(const Block& block) { for (size_t col_id = 0; col_id < block.columns(); col_id++) { auto col = block.get_by_position(col_id); if (col.column->is_null_at(i)) { - _plain_text_outstream << NULL_IN_CSV; + fmt::format_to(_outstream_buffer, "{}", NULL_IN_CSV); } else { switch (_output_vexpr_ctxs[col_id]->root()->type().type) { case TYPE_BOOLEAN: case TYPE_TINYINT: - _plain_text_outstream << (int)*reinterpret_cast<const int8_t*>( - col.column->get_data_at(i).data); + fmt::format_to( + _outstream_buffer, "{}", + (int)*reinterpret_cast<const int8_t*>(col.column->get_data_at(i).data)); break; case TYPE_SMALLINT: - _plain_text_outstream - << *reinterpret_cast<const int16_t*>(col.column->get_data_at(i).data); + fmt::format_to( + _outstream_buffer, "{}", + *reinterpret_cast<const int16_t*>(col.column->get_data_at(i).data)); break; case TYPE_INT: - _plain_text_outstream - << *reinterpret_cast<const int32_t*>(col.column->get_data_at(i).data); + fmt::format_to( + _outstream_buffer, "{}", + *reinterpret_cast<const int32_t*>(col.column->get_data_at(i).data)); break; case TYPE_BIGINT: - _plain_text_outstream - << *reinterpret_cast<const int64_t*>(col.column->get_data_at(i).data); + fmt::format_to( + _outstream_buffer, "{}", + *reinterpret_cast<const int64_t*>(col.column->get_data_at(i).data)); break; case TYPE_LARGEINT: - _plain_text_outstream - << *reinterpret_cast<const __int128*>(col.column->get_data_at(i).data); + fmt::format_to( + _outstream_buffer, "{}", + *reinterpret_cast<const __int128*>(col.column->get_data_at(i).data)); break; case TYPE_FLOAT: { char buffer[MAX_FLOAT_STR_LENGTH + 2]; @@ -121,7 +126,7 @@ Status VCSVTransformer::write(const Block& block) { buffer[0] = '\0'; int length = FloatToBuffer(float_value, MAX_FLOAT_STR_LENGTH, buffer); DCHECK(length >= 0) << "gcvt float failed, float value=" << float_value; - _plain_text_outstream << buffer; + fmt::format_to(_outstream_buffer, "{}", buffer); break; } case TYPE_DOUBLE: { @@ -130,45 +135,45 @@ Status VCSVTransformer::write(const Block& block) { // For example: For a double value 27361919854.929001, // the direct output of using std::stringstream is 2.73619e+10, // and after conversion to a string, it outputs 27361919854.929001 - char buffer[MAX_DOUBLE_STR_LENGTH + 2]; + char buffer[MAX_DOUBLE_STR_LENGTH + 2] = "\0"; double double_value = *reinterpret_cast<const double*>(col.column->get_data_at(i).data); buffer[0] = '\0'; int length = DoubleToBuffer(double_value, MAX_DOUBLE_STR_LENGTH, buffer); DCHECK(length >= 0) << "gcvt double failed, double value=" << double_value; - _plain_text_outstream << buffer; + fmt::format_to(_outstream_buffer, "{}", buffer); break; } case TYPE_DATEV2: { - char buf[64]; + char buf[64] = "\0"; const DateV2Value<DateV2ValueType>* time_val = (const DateV2Value<DateV2ValueType>*)(col.column->get_data_at(i).data); time_val->to_string(buf); - _plain_text_outstream << buf; + fmt::format_to(_outstream_buffer, "{}", buf); break; } case TYPE_DATETIMEV2: { - char buf[64]; + char buf[64] = "\0"; const DateV2Value<DateTimeV2ValueType>* time_val = (const DateV2Value<DateTimeV2ValueType>*)(col.column->get_data_at(i) .data); time_val->to_string(buf, _output_vexpr_ctxs[col_id]->root()->type().scale); - _plain_text_outstream << buf; + fmt::format_to(_outstream_buffer, "{}", buf); break; } case TYPE_DATE: case TYPE_DATETIME: { - char buf[64]; + char buf[64] = "\0"; const VecDateTimeValue* time_val = (const VecDateTimeValue*)(col.column->get_data_at(i).data); time_val->to_string(buf); - _plain_text_outstream << buf; + fmt::format_to(_outstream_buffer, "{}", buf); break; } case TYPE_OBJECT: case TYPE_HLL: { if (!_output_object_data) { - _plain_text_outstream << NULL_IN_CSV; + fmt::format_to(_outstream_buffer, "{}", NULL_IN_CSV); break; } [[fallthrough]]; @@ -177,70 +182,67 @@ Status VCSVTransformer::write(const Block& block) { case TYPE_CHAR: case TYPE_STRING: { auto value = col.column->get_data_at(i); - _plain_text_outstream << value; + fmt::format_to(_outstream_buffer, "{}", value); break; } case TYPE_DECIMALV2: { const DecimalV2Value decimal_val( reinterpret_cast<const PackedInt128*>(col.column->get_data_at(i).data) ->value); - std::string decimal_str; - decimal_str = decimal_val.to_string(); - _plain_text_outstream << decimal_str; + fmt::format_to(_outstream_buffer, "{}", decimal_val.to_string()); break; } case TYPE_DECIMAL32: { - _plain_text_outstream << col.type->to_string(*col.column, i); + fmt::format_to(_outstream_buffer, "{}", col.type->to_string(*col.column, i)); break; } case TYPE_DECIMAL64: { - _plain_text_outstream << col.type->to_string(*col.column, i); + fmt::format_to(_outstream_buffer, "{}", col.type->to_string(*col.column, i)); break; } case TYPE_DECIMAL128I: { - _plain_text_outstream << col.type->to_string(*col.column, i); + fmt::format_to(_outstream_buffer, "{}", col.type->to_string(*col.column, i)); break; } case TYPE_ARRAY: { - _plain_text_outstream << col.type->to_string(*col.column, i); + fmt::format_to(_outstream_buffer, "{}", col.type->to_string(*col.column, i)); break; } case TYPE_MAP: { - _plain_text_outstream << col.type->to_string(*col.column, i); + fmt::format_to(_outstream_buffer, "{}", col.type->to_string(*col.column, i)); break; } case TYPE_STRUCT: { - _plain_text_outstream << col.type->to_string(*col.column, i); + fmt::format_to(_outstream_buffer, "{}", col.type->to_string(*col.column, i)); break; } default: { // not supported type, like BITMAP, just export null - _plain_text_outstream << NULL_IN_CSV; + fmt::format_to(_outstream_buffer, "{}", NULL_IN_CSV); } } } if (col_id < block.columns() - 1) { - _plain_text_outstream << _column_separator; + fmt::format_to(_outstream_buffer, "{}", _column_separator); } } - _plain_text_outstream << _line_delimiter; + fmt::format_to(_outstream_buffer, "{}", _line_delimiter); } return _flush_plain_text_outstream(); } Status VCSVTransformer::_flush_plain_text_outstream() { - size_t pos = _plain_text_outstream.tellp(); + size_t pos = _outstream_buffer.size(); if (pos == 0) { return Status::OK(); } - const std::string& buf = _plain_text_outstream.str(); - RETURN_IF_ERROR(_file_writer->append(buf)); + RETURN_IF_ERROR( + _file_writer->append(Slice(_outstream_buffer.data(), _outstream_buffer.size()))); // clear the stream - _plain_text_outstream.str(""); - _plain_text_outstream.clear(); + _outstream_buffer.clear(); return Status::OK(); } diff --git a/be/src/vec/runtime/vcsv_transformer.h b/be/src/vec/runtime/vcsv_transformer.h index fb3232ac93..f796ef52f5 100644 --- a/be/src/vec/runtime/vcsv_transformer.h +++ b/be/src/vec/runtime/vcsv_transformer.h @@ -63,13 +63,12 @@ private: doris::io::FileWriter* _file_writer; // Used to buffer the export data of plain text - // TODO(cmy): I simply use a stringstrteam to buffer the data, to avoid calling + // TODO(cmy): I simply use a fmt::memmory_buffer to buffer the data, to avoid calling // file writer's write() for every single row. // But this cannot solve the problem of a row of data that is too large. // For example: bitmap_to_string() may return large volume of data. // And the speed is relative low, in my test, is about 6.5MB/s. - std::stringstream _plain_text_outstream; - static const size_t OUTSTREAM_BUFFER_SIZE_BYTES; + fmt::memory_buffer _outstream_buffer; }; } // namespace doris::vectorized --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org