This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 9e21318834 [refactor](dynamic table) Make segment_writer unaware of dynamic schema, and ensure parsing is exception-safe. (#19594) 9e21318834 is described below commit 9e21318834d5fd886def3cf4a81641f5d55ff8fd Author: lihangyu <15605149...@163.com> AuthorDate: Thu Jun 1 10:25:04 2023 +0800 [refactor](dynamic table) Make segment_writer unaware of dynamic schema, and ensure parsing is exception-safe. (#19594) 1. make ColumnObject exception safe 2. introduce FlushContext and construct schema at memtable flush stage to make segment independent from dynamic schema 3. add more test cases --- be/src/olap/memtable.cpp | 80 +- be/src/olap/memtable.h | 4 +- be/src/olap/rowset/beta_rowset_writer.cpp | 23 +- be/src/olap/rowset/beta_rowset_writer.h | 12 +- be/src/olap/rowset/rowset_writer.h | 15 +- be/src/olap/rowset/segment_v2/segment_writer.cpp | 72 +- be/src/olap/rowset/segment_v2/segment_writer.h | 12 +- be/src/vec/columns/column_object.cpp | 125 +- be/src/vec/columns/column_object.h | 26 +- be/src/vec/common/schema_util.cpp | 18 +- be/src/vec/common/schema_util.h | 2 +- be/src/vec/data_types/convert_field_to_type.cpp | 50 +- be/src/vec/data_types/convert_field_to_type.h | 4 +- be/src/vec/data_types/get_least_supertype.cpp | 210 ++- be/src/vec/data_types/get_least_supertype.h | 8 +- be/src/vec/exec/format/json/new_json_reader.cpp | 35 +- be/src/vec/json/json_parser.cpp | 5 +- be/src/vec/json/parse2column.cpp | 40 +- be/src/vec/json/parse2column.h | 10 +- be/src/vec/olap/olap_data_convertor.cpp | 8 +- lowercase.json | 5 + .../data/dynamic_table_p0/floating_point.json | 4 + .../data/dynamic_table_p0/floating_point2.json | 4 + .../data/dynamic_table_p0/floating_point3.json | 4 + .../data/dynamic_table_p0/invalid_dimension.json | 4 + .../data/dynamic_table_p0/invalid_format.json | 3 + regression-test/data/dynamic_table_p0/load.out | 1939 ++++++++++++++++++++ .../data/dynamic_table_p0/lowercase.json | 4 + 
regression-test/data/dynamic_table_p0/sql/q05.out | 14 +- .../data/dynamic_table_p0/uppercase.json | 4 + .../suites/dynamic_table_p0/load.groovy | 77 + 31 files changed, 2481 insertions(+), 340 deletions(-) diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index 6bc82b052c..c398a22b15 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -45,15 +45,17 @@ #include "util/doris_metrics.h" #include "util/runtime_profile.h" #include "util/stopwatch.hpp" +#include "util/string_util.h" #include "vec/aggregate_functions/aggregate_function_reader.h" #include "vec/aggregate_functions/aggregate_function_simple_factory.h" #include "vec/columns/column.h" #include "vec/columns/column_object.h" #include "vec/columns/column_string.h" #include "vec/common/assert_cast.h" +#include "vec/common/schema_util.h" #include "vec/core/column_with_type_and_name.h" #include "vec/data_types/data_type.h" -#include "vec/data_types/serde/data_type_serde.h" +#include "vec/data_types/data_type_factory.hpp" #include "vec/json/path_in_data.h" #include "vec/jsonb/serialize.h" @@ -475,12 +477,14 @@ Status MemTable::_do_flush() { _aggregate<true>(); } vectorized::Block block = _output_mutable_block.to_block(); + FlushContext ctx; + ctx.block = &block; if (_tablet_schema->is_dynamic_schema()) { // Unfold variant column - unfold_variant_column(block); + RETURN_IF_ERROR(unfold_variant_column(block, &ctx)); } SCOPED_RAW_TIMER(&_stat.segment_writer_ns); - RETURN_IF_ERROR(_rowset_writer->flush_single_memtable(&block, &_flush_size)); + RETURN_IF_ERROR(_rowset_writer->flush_single_memtable(&block, &_flush_size, &ctx)); return Status::OK(); } @@ -488,27 +492,77 @@ Status MemTable::close() { return flush(); } -void MemTable::unfold_variant_column(vectorized::Block& block) { +Status MemTable::unfold_variant_column(vectorized::Block& block, FlushContext* ctx) { if (block.rows() == 0) { - return; + return Status::OK(); } + + // Sanitize block to match exactly from the same type of frontend
meta + vectorized::schema_util::FullBaseSchemaView schema_view; + schema_view.table_id = _tablet_schema->table_id(); vectorized::ColumnWithTypeAndName* variant_column = block.try_get_by_name(BeConsts::DYNAMIC_COLUMN_NAME); if (!variant_column) { - return; + return Status::OK(); } - // remove it + auto base_column = variant_column->column; vectorized::ColumnObject& object_column = - assert_cast<vectorized::ColumnObject&>(variant_column->column->assume_mutable_ref()); - // extend + assert_cast<vectorized::ColumnObject&>(base_column->assume_mutable_ref()); + if (object_column.empty()) { + block.erase(BeConsts::DYNAMIC_COLUMN_NAME); + return Status::OK(); + } + object_column.finalize(); + // Has extended columns + RETURN_IF_ERROR(vectorized::schema_util::send_fetch_full_base_schema_view_rpc(&schema_view)); + // Dynamic Block consists of two parts, dynamic part of columns and static part of columns + // static dynamic + // | ----- | ------- | + // The static ones are original _tablet_schame columns + TabletSchemaSPtr flush_schema = std::make_shared<TabletSchema>(*_tablet_schema); + vectorized::Block flush_block(std::move(block)); + // The dynamic ones are auto generated and extended, append them the the orig_block for (auto& entry : object_column.get_subcolumns()) { - if (entry->path.get_path() == vectorized::ColumnObject::COLUMN_NAME_DUMMY) { + const std::string& column_name = entry->path.get_path(); + auto column_iter = schema_view.column_name_to_column.find(column_name); + if (UNLIKELY(column_iter == schema_view.column_name_to_column.end())) { + // Column maybe dropped by light weight schema change DDL continue; } - block.insert({entry->data.get_finalized_column().get_ptr(), - entry->data.get_least_common_type(), entry->path.get_path()}); + TabletColumn column(column_iter->second); + auto data_type = vectorized::DataTypeFactory::instance().create_data_type( + column, column.is_nullable()); + // Dynamic generated columns does not appear in original tablet schema + if 
(_tablet_schema->field_index(column.name()) < 0) { + flush_schema->append_column(column); + flush_block.insert({data_type->create_column(), data_type, column.name()}); + } + } + + // Ensure column are all present at this schema version.Otherwise there will be some senario: + // Load1 -> version(10) with schema [a, b, c, d, e], d & e is new added columns and schema version became 10 + // Load2 -> version(10) with schema [a, b, c] and has no extended columns and fetched the schema at version 10 + // Load2 will persist meta with [a, b, c] but Load1 will persist meta with [a, b, c, d, e] + // So we should make sure that rowset at the same schema version alawys contain the same size of columns. + // so that all columns at schema_version is in either _tablet_schema or schema_change_recorder + for (const auto& [name, column] : schema_view.column_name_to_column) { + if (_tablet_schema->field_index(name) == -1) { + const auto& tcolumn = schema_view.column_name_to_column[name]; + TabletColumn new_column(tcolumn); + _rowset_writer->mutable_schema_change_recorder()->add_extended_columns( + column, schema_view.schema_version); + } } - block.erase(BeConsts::DYNAMIC_COLUMN_NAME); + + // Last schema alignment before flush to disk, due to the schema maybe variant before this procedure + // Eg. 
add columnA(INT) -> drop ColumnA -> add ColumnA(Double), then columnA could be type of `Double`, + // unfold will cast to Double type + RETURN_IF_ERROR(vectorized::schema_util::unfold_object( + flush_block.get_position_by_name(BeConsts::DYNAMIC_COLUMN_NAME), flush_block, true)); + flush_block.erase(BeConsts::DYNAMIC_COLUMN_NAME); + ctx->flush_schema = flush_schema; + block.swap(flush_block); + return Status::OK(); } void MemTable::serialize_block_to_row_column(vectorized::Block& block) { diff --git a/be/src/olap/memtable.h b/be/src/olap/memtable.h index b99127bbf6..95d41df62d 100644 --- a/be/src/olap/memtable.h +++ b/be/src/olap/memtable.h @@ -44,6 +44,7 @@ class SlotDescriptor; class TabletSchema; class TupleDescriptor; enum KeysType : int; +struct FlushContext; // row pos in _input_mutable_block struct RowInBlock { @@ -163,7 +164,8 @@ private: // Eg. [A | B | C | (D, E, F)] // After unfold block structure changed to -> [A | B | C | D | E | F] // The expanded D, E, F is dynamic part of the block - void unfold_variant_column(vectorized::Block& block); + // The flushed Block columns should match exactly from the same type of frontend meta + Status unfold_variant_column(vectorized::Block& block, FlushContext* ctx); private: TabletSharedPtr _tablet; diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index 76a2a2d4a6..5358732daa 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -41,6 +41,7 @@ #include "olap/olap_define.h" #include "olap/rowset/beta_rowset.h" #include "olap/rowset/rowset_factory.h" +#include "olap/rowset/rowset_writer.h" #include "olap/rowset/segment_v2/inverted_index_cache.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" #include "olap/rowset/segment_v2/segment.h" @@ -132,7 +133,9 @@ Status BetaRowsetWriter::add_block(const vectorized::Block* block) { return Status::OK(); } if (UNLIKELY(_segment_writer == nullptr)) { - 
RETURN_IF_ERROR(_create_segment_writer(&_segment_writer, block)); + FlushContext ctx; + ctx.block = block; + RETURN_IF_ERROR(_create_segment_writer(&_segment_writer, &ctx)); } return _add_block(block, &_segment_writer); } @@ -406,7 +409,8 @@ Status BetaRowsetWriter::_segcompaction_ramaining_if_necessary() { } Status BetaRowsetWriter::_add_block(const vectorized::Block* block, - std::unique_ptr<segment_v2::SegmentWriter>* segment_writer) { + std::unique_ptr<segment_v2::SegmentWriter>* segment_writer, + const FlushContext* flush_ctx) { size_t block_size_in_bytes = block->bytes(); size_t block_row_num = block->rows(); size_t row_avg_size_in_bytes = std::max((size_t)1, block_size_in_bytes / block_row_num); @@ -417,7 +421,7 @@ Status BetaRowsetWriter::_add_block(const vectorized::Block* block, if (UNLIKELY(max_row_add < 1)) { // no space for another single row, need flush now RETURN_IF_ERROR(_flush_segment_writer(segment_writer)); - RETURN_IF_ERROR(_create_segment_writer(segment_writer, block)); + RETURN_IF_ERROR(_create_segment_writer(segment_writer, flush_ctx)); max_row_add = (*segment_writer)->max_row_to_add(row_avg_size_in_bytes); DCHECK(max_row_add > 0); } @@ -462,13 +466,14 @@ Status BetaRowsetWriter::flush() { return Status::OK(); } -Status BetaRowsetWriter::flush_single_memtable(const vectorized::Block* block, int64* flush_size) { +Status BetaRowsetWriter::flush_single_memtable(const vectorized::Block* block, int64* flush_size, + const FlushContext* ctx) { if (block->rows() == 0) { return Status::OK(); } std::unique_ptr<segment_v2::SegmentWriter> writer; - RETURN_IF_ERROR(_create_segment_writer(&writer, block)); + RETURN_IF_ERROR(_create_segment_writer(&writer, ctx)); RETURN_IF_ERROR(_add_block(block, &writer)); RETURN_IF_ERROR(_flush_segment_writer(&writer, flush_size)); RETURN_IF_ERROR(_segcompaction_if_necessary()); @@ -666,7 +671,7 @@ RowsetSharedPtr BetaRowsetWriter::build_tmp() { Status BetaRowsetWriter::_do_create_segment_writer( 
std::unique_ptr<segment_v2::SegmentWriter>* writer, bool is_segcompaction, int64_t begin, - int64_t end, const vectorized::Block* block) { + int64_t end, const FlushContext* flush_ctx) { std::string path; int32_t segment_id = 0; if (is_segcompaction) { @@ -715,7 +720,7 @@ Status BetaRowsetWriter::_do_create_segment_writer( std::lock_guard<SpinLock> l(_lock); _file_writers.push_back(std::move(file_writer)); } - auto s = (*writer)->init(block); + auto s = (*writer)->init(flush_ctx); if (!s.ok()) { LOG(WARNING) << "failed to init segment writer: " << s.to_string(); writer->reset(nullptr); @@ -727,7 +732,7 @@ Status BetaRowsetWriter::_do_create_segment_writer( } Status BetaRowsetWriter::_create_segment_writer(std::unique_ptr<segment_v2::SegmentWriter>* writer, - const vectorized::Block* block) { + const FlushContext* flush_ctx) { size_t total_segment_num = _num_segment - _segcompacted_point + 1 + _num_segcompacted; if (UNLIKELY(total_segment_num > config::max_segment_num_per_rowset)) { LOG(WARNING) << "too many segments in rowset." @@ -738,7 +743,7 @@ Status BetaRowsetWriter::_create_segment_writer(std::unique_ptr<segment_v2::Segm << " _num_segcompacted:" << _num_segcompacted; return Status::Error<TOO_MANY_SEGMENTS>(); } else { - return _do_create_segment_writer(writer, false, -1, -1, block); + return _do_create_segment_writer(writer, false, -1, -1, flush_ctx); } } diff --git a/be/src/olap/rowset/beta_rowset_writer.h b/be/src/olap/rowset/beta_rowset_writer.h index b7d5b10ee6..389c1bbede 100644 --- a/be/src/olap/rowset/beta_rowset_writer.h +++ b/be/src/olap/rowset/beta_rowset_writer.h @@ -79,7 +79,8 @@ public: // Return the file size flushed to disk in "flush_size" // This method is thread-safe. 
- Status flush_single_memtable(const vectorized::Block* block, int64_t* flush_size) override; + Status flush_single_memtable(const vectorized::Block* block, int64_t* flush_size, + const FlushContext* ctx = nullptr) override; RowsetSharedPtr build() override; @@ -106,7 +107,7 @@ public: int32_t get_atomic_num_segment() const override { return _num_segment.load(); } // Maybe modified by local schema change - vectorized::schema_util::LocalSchemaChangeRecorder* mutable_schema_change_recorder() { + vectorized::schema_util::LocalSchemaChangeRecorder* mutable_schema_change_recorder() override { return _context.schema_change_recorder.get(); } @@ -122,13 +123,14 @@ public: private: Status _add_block(const vectorized::Block* block, - std::unique_ptr<segment_v2::SegmentWriter>* writer); + std::unique_ptr<segment_v2::SegmentWriter>* writer, + const FlushContext* flush_ctx = nullptr); Status _do_create_segment_writer(std::unique_ptr<segment_v2::SegmentWriter>* writer, bool is_segcompaction, int64_t begin, int64_t end, - const vectorized::Block* block = nullptr); + const FlushContext* ctx = nullptr); Status _create_segment_writer(std::unique_ptr<segment_v2::SegmentWriter>* writer, - const vectorized::Block* block = nullptr); + const FlushContext* ctx = nullptr); Status _flush_segment_writer(std::unique_ptr<segment_v2::SegmentWriter>* writer, int64_t* flush_size = nullptr); void _build_rowset_meta(std::shared_ptr<RowsetMeta> rowset_meta); diff --git a/be/src/olap/rowset/rowset_writer.h b/be/src/olap/rowset/rowset_writer.h index 1bdc4daa2d..6bfa9f6e9a 100644 --- a/be/src/olap/rowset/rowset_writer.h +++ b/be/src/olap/rowset/rowset_writer.h @@ -20,16 +20,25 @@ #include <gen_cpp/olap_file.pb.h> #include <gen_cpp/types.pb.h> +#include "common/factory_creator.h" #include "gutil/macros.h" #include "olap/column_mapping.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_writer_context.h" +#include "olap/tablet_schema.h" #include "vec/core/block.h" namespace doris { class 
MemTable; +// Context for single memtable flush +struct FlushContext { + ENABLE_FACTORY_CREATOR(FlushContext); + TabletSchemaSPtr flush_schema = nullptr; + const vectorized::Block* block = nullptr; +}; + class RowsetWriter { public: RowsetWriter() = default; @@ -59,7 +68,8 @@ public: } virtual Status final_flush() { return Status::Error<ErrorCode::NOT_IMPLEMENTED_ERROR>(); } - virtual Status flush_single_memtable(const vectorized::Block* block, int64_t* flush_size) { + virtual Status flush_single_memtable(const vectorized::Block* block, int64_t* flush_size, + const FlushContext* ctx = nullptr) { return Status::Error<ErrorCode::NOT_IMPLEMENTED_ERROR>(); } @@ -95,6 +105,9 @@ public: virtual void set_segment_start_id(int num_segment) { LOG(FATAL) << "not supported!"; } + virtual vectorized::schema_util::LocalSchemaChangeRecorder* + mutable_schema_change_recorder() = 0; + private: DISALLOW_COPY_AND_ASSIGN(RowsetWriter); }; diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 3feb778e4a..f2e52736db 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -35,12 +35,15 @@ #include "olap/key_coder.h" #include "olap/olap_common.h" #include "olap/primary_key_index.h" -#include "olap/row_cursor.h" // IWYU pragma: keep +#include "olap/row_cursor.h" // IWYU pragma: keep +#include "olap/row_cursor.h" // RowCursor +#include "olap/rowset/rowset_writer.h" #include "olap/rowset/rowset_writer_context.h" // RowsetWriterContext #include "olap/rowset/segment_v2/column_writer.h" // ColumnWriter #include "olap/rowset/segment_v2/page_io.h" #include "olap/rowset/segment_v2/page_pointer.h" #include "olap/short_key_index.h" +#include "olap/tablet_schema.h" #include "runtime/memory/mem_tracker.h" #include "service/point_query_executor.h" #include "util/coding.h" @@ -113,30 +116,20 @@ void SegmentWriter::init_column_meta(ColumnMetaPB* meta, uint32_t column_id, } } 
-Status SegmentWriter::init(const vectorized::Block* block) { +Status SegmentWriter::init(const FlushContext* flush_ctx) { std::vector<uint32_t> column_ids; int column_cnt = _tablet_schema->num_columns(); - if (block && !_tablet_schema->is_partial_update()) { - // partial update only contain several columns - column_cnt = block->columns(); + if (flush_ctx && flush_ctx->flush_schema) { + column_cnt = flush_ctx->flush_schema->num_columns(); } for (uint32_t i = 0; i < column_cnt; ++i) { column_ids.emplace_back(i); } - return init(column_ids, true, block); -} - -// Dynamic table with extended columns and directly write from delta writer -// Compaction/SchemaChange path will use the latest schema version of rowset -// as it's shcema, so it's block is not from dynamic table load procedure. -// If it is a dynamic table load procedure we should handle auto generated columns. -bool SegmentWriter::_should_create_writers_with_dynamic_block(size_t num_columns_in_block) { - return _tablet_schema->is_dynamic_schema() && _opts.is_direct_write && - num_columns_in_block > _tablet_schema->columns().size(); + return init(column_ids, true, flush_ctx); } Status SegmentWriter::init(const std::vector<uint32_t>& col_ids, bool has_key, - const vectorized::Block* block) { + const FlushContext* flush_ctx) { DCHECK(_column_writers.empty()); DCHECK(_column_ids.empty()); _has_key = has_key; @@ -144,7 +137,8 @@ Status SegmentWriter::init(const std::vector<uint32_t>& col_ids, bool has_key, _column_ids.insert(_column_ids.end(), col_ids.begin(), col_ids.end()); _olap_data_convertor = std::make_unique<vectorized::OlapBlockDataConvertor>(); _opts.compression_type = - (block == nullptr || block->bytes() > config::segment_compression_threshold_kb * 1024) + (flush_ctx == nullptr || flush_ctx->block == nullptr || + flush_ctx->block->bytes() > config::segment_compression_threshold_kb * 1024) ? 
_tablet_schema->compression_type() : NO_COMPRESSION; auto create_column_writer = [&](uint32_t cid, const auto& column) -> auto { @@ -240,10 +234,10 @@ Status SegmentWriter::init(const std::vector<uint32_t>& col_ids, bool has_key, return Status::OK(); }; - if (block && _should_create_writers_with_dynamic_block(block->columns())) { - RETURN_IF_ERROR(_create_writers_with_dynamic_block(block, create_column_writer)); + if (flush_ctx && flush_ctx->flush_schema) { + RETURN_IF_ERROR(_create_writers(*flush_ctx->flush_schema, col_ids, create_column_writer)); } else { - RETURN_IF_ERROR(_create_writers(create_column_writer)); + RETURN_IF_ERROR(_create_writers(*_tablet_schema, col_ids, create_column_writer)); } // we don't need the short key index for unique key merge on write table. @@ -266,41 +260,11 @@ Status SegmentWriter::init(const std::vector<uint32_t>& col_ids, bool has_key, } Status SegmentWriter::_create_writers( + const TabletSchema& tablet_schema, const std::vector<uint32_t>& col_ids, std::function<Status(uint32_t, const TabletColumn&)> create_column_writer) { - _olap_data_convertor->reserve(_column_ids.size()); - for (auto& cid : _column_ids) { - RETURN_IF_ERROR(create_column_writer(cid, _tablet_schema->column(cid))); - } - return Status::OK(); -} - -// Dynamic Block consists of two parts, dynamic part of columns and static part of columns -// static dynamic -// | ----- | ------- | -// the static ones are original _tablet_schame columns -// the dynamic ones are auto generated and extended from file scan -Status SegmentWriter::_create_writers_with_dynamic_block( - const vectorized::Block* block, - std::function<Status(uint32_t, const TabletColumn&)> create_column_writer) { - // generate writers from schema and extended schema info - _olap_data_convertor->reserve(block->columns()); - // new columns added, query column info from Master - vectorized::schema_util::FullBaseSchemaView schema_view; - CHECK(block->columns() > _tablet_schema->num_columns()); - 
schema_view.table_id = _tablet_schema->table_id(); - RETURN_IF_ERROR(vectorized::schema_util::send_fetch_full_base_schema_view_rpc(&schema_view)); - // create writers with static columns - for (size_t i = 0; i < _tablet_schema->columns().size(); ++i) { - create_column_writer(i, _tablet_schema->column(i)); - } - // create writers with auto generated columns - for (size_t i = _tablet_schema->columns().size(); i < block->columns(); ++i) { - const auto& column_type_name = block->get_by_position(i); - const auto& tcolumn = schema_view.column_name_to_column[column_type_name.name]; - TabletColumn new_column(tcolumn); - RETURN_IF_ERROR(create_column_writer(i, new_column)); - _opts.rowset_ctx->schema_change_recorder->add_extended_columns(new_column, - schema_view.schema_version); + _olap_data_convertor->reserve(col_ids.size()); + for (auto& cid : col_ids) { + RETURN_IF_ERROR(create_column_writer(cid, tablet_schema.column(cid))); } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h b/be/src/olap/rowset/segment_v2/segment_writer.h index ea97958825..09a3576cf5 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.h +++ b/be/src/olap/rowset/segment_v2/segment_writer.h @@ -35,6 +35,7 @@ #include "gutil/macros.h" #include "gutil/strings/substitute.h" #include "olap/olap_define.h" +#include "olap/rowset/rowset_writer.h" #include "olap/rowset/segment_v2/column_writer.h" #include "olap/tablet.h" #include "olap/tablet_schema.h" @@ -57,6 +58,7 @@ class ShortKeyIndexBuilder; class PrimaryKeyIndexBuilder; class KeyCoder; struct RowsetWriterContext; +struct FlushContext; namespace io { class FileWriter; @@ -89,11 +91,11 @@ public: std::shared_ptr<MowContext> mow_context); ~SegmentWriter(); - Status init(const vectorized::Block* block = nullptr); + Status init(const FlushContext* flush_ctx = nullptr); // for vertical compaction Status init(const std::vector<uint32_t>& col_ids, bool has_key, - const vectorized::Block* block = nullptr); + const 
FlushContext* flush_ctx = nullptr); template <typename RowType> Status append_row(const RowType& row); @@ -133,11 +135,9 @@ public: const std::vector<bool>& use_default_flag, bool has_default); private: - Status _create_writers_with_dynamic_block( - const vectorized::Block* block, - std::function<Status(uint32_t, const TabletColumn&)> writer_creator); - Status _create_writers(std::function<Status(uint32_t, const TabletColumn&)> writer_creator); DISALLOW_COPY_AND_ASSIGN(SegmentWriter); + Status _create_writers(const TabletSchema& tablet_schema, const std::vector<uint32_t>& col_ids, + std::function<Status(uint32_t, const TabletColumn&)> writer_creator); Status _write_data(); Status _write_ordinal_index(); Status _write_zone_map(); diff --git a/be/src/vec/columns/column_object.cpp b/be/src/vec/columns/column_object.cpp index 89aabbab94..7830e3fe8f 100644 --- a/be/src/vec/columns/column_object.cpp +++ b/be/src/vec/columns/column_object.cpp @@ -29,6 +29,8 @@ #include <map> #include <optional> +#include "common/exception.h" +#include "common/status.h" #include "vec/columns/column_array.h" #include "vec/columns/column_nullable.h" #include "vec/columns/columns_number.h" @@ -158,7 +160,6 @@ private: /// Returns 0 for scalar fields. 
class FieldVisitorToNumberOfDimensions : public StaticVisitor<size_t> { public: - explicit FieldVisitorToNumberOfDimensions(Status* st) : _st(st) {} size_t operator()(const Array& x) const { const size_t size = x.size(); std::optional<size_t> dimensions; @@ -172,8 +173,8 @@ public: if (!dimensions) { dimensions = current_dimensions; } else if (current_dimensions != *dimensions) { - *_st = Status::InvalidArgument( - "Number of dimensions mismatched among array elements"); + throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, + "Number of dimensions mismatched among array elements"); return 0; } } @@ -183,9 +184,6 @@ public: size_t operator()(const T&) const { return 0; } - -private: - mutable Status* _st; }; /// Visitor that allows to get type of scalar field @@ -217,9 +215,16 @@ public: return 0; } size_t operator()(const Int64& x) { - // Only Int64 at present + // // Only Int64 | Int32 at present + // field_types.insert(FieldType::Int64); + // type_indexes.insert(TypeIndex::Int64); + // return 0; field_types.insert(FieldType::Int64); - type_indexes.insert(TypeIndex::Int64); + if (x <= std::numeric_limits<Int32>::max() && x >= std::numeric_limits<Int32>::min()) { + type_indexes.insert(TypeIndex::Int32); + } else { + type_indexes.insert(TypeIndex::Int64); + } return 0; } size_t operator()(const Null&) { @@ -233,8 +238,8 @@ public: type_indexes.insert(TypeId<NearestFieldType<T>>::value); return 0; } - Status get_scalar_type(DataTypePtr* type) const { - return get_least_supertype(type_indexes, type, true /*compatible with string type*/); + void get_scalar_type(DataTypePtr* type) const { + get_least_supertype(type_indexes, type, true /*compatible with string type*/); } bool contain_nulls() const { return have_nulls; } bool need_convert_field() const { return field_types.size() > 1; } @@ -246,21 +251,18 @@ private: }; } // namespace -Status get_field_info(const Field& field, FieldInfo* info) { +void get_field_info(const Field& field, FieldInfo* info) { 
FieldVisitorToScalarType to_scalar_type_visitor; apply_visitor(to_scalar_type_visitor, field); DataTypePtr type = nullptr; - RETURN_IF_ERROR(to_scalar_type_visitor.get_scalar_type(&type)); + to_scalar_type_visitor.get_scalar_type(&type); // array item's dimension may missmatch, eg. [1, 2, [1, 2, 3]] - Status num_to_dimensions_status; *info = { type, to_scalar_type_visitor.contain_nulls(), to_scalar_type_visitor.need_convert_field(), - apply_visitor(FieldVisitorToNumberOfDimensions(&num_to_dimensions_status), field), + apply_visitor(FieldVisitorToNumberOfDimensions(), field), }; - RETURN_IF_ERROR(num_to_dimensions_status); - return Status::OK(); } ColumnObject::Subcolumn::Subcolumn(MutableColumnPtr&& data_, bool is_nullable_) @@ -297,10 +299,10 @@ size_t ColumnObject::Subcolumn::Subcolumn::allocatedBytes() const { return res; } -Status ColumnObject::Subcolumn::insert(Field field) { +void ColumnObject::Subcolumn::insert(Field field) { FieldInfo info; - RETURN_IF_ERROR(get_field_info(field, &info)); - return insert(std::move(field), std::move(info)); + get_field_info(field, &info); + insert(std::move(field), std::move(info)); } void ColumnObject::Subcolumn::add_new_column_part(DataTypePtr type) { @@ -308,11 +310,11 @@ void ColumnObject::Subcolumn::add_new_column_part(DataTypePtr type) { least_common_type = LeastCommonType {std::move(type)}; } -Status ColumnObject::Subcolumn::insert(Field field, FieldInfo info) { +void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) { auto base_type = std::move(info.scalar_type); if (is_nothing(base_type)) { insertDefault(); - return Status::OK(); + return; } auto column_dim = least_common_type.get_dimensions(); auto value_dim = info.num_dimensions; @@ -323,8 +325,10 @@ Status ColumnObject::Subcolumn::insert(Field field, FieldInfo info) { value_dim = column_dim; } if (value_dim != column_dim) { - return Status::InvalidArgument( - "Dimension of types mismatched between inserted value and column."); + throw 
doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, + "Dimension of types mismatched between inserted value and column, " + "expected:{}, but meet:{} for type:{}", + column_dim, value_dim, least_common_type.get()->get_name()); } if (is_nullable && !is_nothing(base_type)) { base_type = make_nullable(base_type); @@ -348,9 +352,8 @@ Status ColumnObject::Subcolumn::insert(Field field, FieldInfo info) { } else if (!least_common_base_type->equals(*base_type) && !is_nothing(base_type)) { if (!schema_util::is_conversion_required_between_integers(*base_type, *least_common_base_type)) { - RETURN_IF_ERROR( - get_least_supertype(DataTypes {std::move(base_type), least_common_base_type}, - &base_type, true /*compatible with string type*/)); + get_least_supertype(DataTypes {std::move(base_type), least_common_base_type}, + &base_type, true /*compatible with string type*/); type_changed = true; if (!least_common_base_type->equals(*base_type)) { add_new_column_part(create_array_of_type(std::move(base_type), value_dim)); @@ -360,15 +363,14 @@ Status ColumnObject::Subcolumn::insert(Field field, FieldInfo info) { if (type_changed || info.need_convert) { Field new_field; - RETURN_IF_ERROR(convert_field_to_type(field, *least_common_type.get(), &new_field)); + convert_field_to_type(field, *least_common_type.get(), &new_field); field = new_field; } data.back()->insert(field); - return Status::OK(); } -Status ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn& src, size_t start, size_t length) { +void ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn& src, size_t start, size_t length) { assert(src.is_finalized()); const auto& src_column = src.data.back(); const auto& src_type = src.least_common_type.get(); @@ -379,18 +381,20 @@ Status ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn& src, size_t sta data.back()->insert_range_from(*src_column, start, length); } else { DataTypePtr new_least_common_type = nullptr; - RETURN_IF_ERROR(get_least_supertype(DataTypes 
{least_common_type.get(), src_type}, - &new_least_common_type, - true /*compatible with string type*/)); + get_least_supertype(DataTypes {least_common_type.get(), src_type}, &new_least_common_type, + true /*compatible with string type*/); ColumnPtr casted_column; - RETURN_IF_ERROR(schema_util::cast_column({src_column, src_type, ""}, new_least_common_type, - &casted_column)); + Status st = schema_util::cast_column({src_column, src_type, ""}, new_least_common_type, + &casted_column); + if (!st.ok()) { + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, st.to_string() + ", real_code:{}", + st.code()); + } if (!least_common_type.get()->equals(*new_least_common_type)) { add_new_column_part(std::move(new_least_common_type)); } data.back()->insert_range_from(*casted_column, start, length); } - return Status::OK(); } bool ColumnObject::Subcolumn::is_finalized() const { @@ -400,7 +404,9 @@ bool ColumnObject::Subcolumn::is_finalized() const { template <typename Func> ColumnPtr ColumnObject::apply_for_subcolumns(Func&& func, std::string_view func_name) const { if (!is_finalized()) { - LOG(FATAL) << "Cannot " << func_name << " non-finalized ColumnObject"; + // LOG(FATAL) << "Cannot " << func_name << " non-finalized ColumnObject"; + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, + "Cannot {} non-finalized ColumnObject", func_name); } auto res = ColumnObject::create(is_nullable); for (const auto& subcolumn : subcolumns) { @@ -560,7 +566,11 @@ void ColumnObject::check_consistency() const { } for (const auto& leaf : subcolumns) { if (num_rows != leaf->data.size()) { - assert(false); + // LOG(FATAL) << "unmatched column:" << leaf->path.get_path() + // << ", expeted rows:" << num_rows << ", but meet:" << leaf->data.size(); + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, + "unmatched column: {}, expeted rows: {}, but meet: {}", + leaf->path.get_path(), num_rows, leaf->data.size()); } } } @@ -575,7 +585,8 @@ size_t ColumnObject::size() const { MutableColumnPtr 
ColumnObject::clone_resized(size_t new_size) const { /// cloneResized with new_size == 0 is used for cloneEmpty(). if (new_size != 0) { - LOG(FATAL) << "ColumnObject doesn't support resize to non-zero length"; + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, + "ColumnObject doesn't support resize to non-zero length"); } return ColumnObject::create(is_nullable); } @@ -605,11 +616,11 @@ void ColumnObject::for_each_subcolumn(ColumnCallback callback) { } } -Status ColumnObject::try_insert_from(const IColumn& src, size_t n) { +void ColumnObject::try_insert_from(const IColumn& src, size_t n) { return try_insert(src[n]); } -Status ColumnObject::try_insert(const Field& field) { +void ColumnObject::try_insert(const Field& field) { const auto& object = field.get<const VariantMap&>(); phmap::flat_hash_set<StringRef, StringRefHash> inserted; size_t old_size = size(); @@ -619,16 +630,16 @@ Status ColumnObject::try_insert(const Field& field) { if (!has_subcolumn(key)) { bool succ = add_sub_column(key, old_size); if (!succ) { - return Status::InvalidArgument( - fmt::format("Failed to add sub column {}", key.get_path())); + throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, + "Failed to add sub column {}", key.get_path()); } } auto* subcolumn = get_subcolumn(key); if (!subcolumn) { - return Status::InvalidArgument( - fmt::format("Failed to find sub column {}", key.get_path())); + doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, + fmt::format("Failed to find sub column {}", key.get_path())); } - RETURN_IF_ERROR(subcolumn->insert(value)); + subcolumn->insert(value); } for (auto& entry : subcolumns) { if (!inserted.contains(entry->path.get_path())) { @@ -636,7 +647,6 @@ Status ColumnObject::try_insert(const Field& field) { } } ++num_rows; - return Status::OK(); } void ColumnObject::insert_default() { @@ -674,26 +684,26 @@ Status ColumnObject::try_insert_indices_from(const IColumn& src, const int* indi if (*x == -1) { ColumnObject::insert_default(); } else { - 
RETURN_IF_ERROR(ColumnObject::try_insert_from(src, *x)); + ColumnObject::try_insert_from(src, *x); } } finalize(); return Status::OK(); } -Status ColumnObject::try_insert_range_from(const IColumn& src, size_t start, size_t length) { +void ColumnObject::try_insert_range_from(const IColumn& src, size_t start, size_t length) { const auto& src_object = assert_cast<const ColumnObject&>(src); if (UNLIKELY(src_object.empty())) { - return Status::OK(); + return; } for (auto& entry : subcolumns) { if (src_object.has_subcolumn(entry->path)) { auto* subcolumn = src_object.get_subcolumn(entry->path); if (!subcolumn) { - return Status::InvalidArgument( - fmt::format("Failed to find sub column {}", entry->path.get_path())); + throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, + "Failed to find sub column {}", entry->path.get_path()); } - RETURN_IF_ERROR(entry->data.insertRangeFrom(*subcolumn, start, length)); + entry->data.insertRangeFrom(*subcolumn, start, length); } else { entry->data.insertManyDefaults(length); } @@ -714,20 +724,19 @@ Status ColumnObject::try_insert_range_from(const IColumn& src, size_t start, siz succ = add_sub_column(entry->path, num_rows); } if (!succ) { - return Status::InvalidArgument( - fmt::format("Failed to add column {}", entry->path.get_path())); + throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, + "Failed to add column {}", entry->path.get_path()); } auto* subcolumn = get_subcolumn(entry->path); if (!subcolumn) { - return Status::InvalidArgument( - fmt::format("Failed to find sub column {}", entry->path.get_path())); + throw doris::Exception(doris::ErrorCode::INVALID_ARGUMENT, + "Failed to find sub column {}", entry->path.get_path()); } - RETURN_IF_ERROR(subcolumn->insertRangeFrom(entry->data, start, length)); + subcolumn->insertRangeFrom(entry->data, start, length); } } num_rows += length; finalize(); - return Status::OK(); } void ColumnObject::pop_back(size_t length) { diff --git a/be/src/vec/columns/column_object.h 
b/be/src/vec/columns/column_object.h index 7fec48a0c8..339db8c6b9 100644 --- a/be/src/vec/columns/column_object.h +++ b/be/src/vec/columns/column_object.h @@ -66,7 +66,7 @@ struct FieldInfo { /// Number of dimension in array. 0 if field is scalar. size_t num_dimensions; }; -Status get_field_info(const Field& field, FieldInfo* info); +void get_field_info(const Field& field, FieldInfo* info); /** A column that represents object with dynamic set of subcolumns. * Subcolumns are identified by paths in document and are stored in * a trie-like structure. ColumnObject is not suitable for writing into tables @@ -109,16 +109,16 @@ public: /// Inserts a field, which scalars can be arbitrary, but number of /// dimensions should be consistent with current common type. - /// return Status::InvalidArgument when meet conflict types - Status insert(Field field); + /// throws InvalidArgument when meet conflict types + void insert(Field field); - Status insert(Field field, FieldInfo info); + void insert(Field field, FieldInfo info); void insertDefault(); void insertManyDefaults(size_t length); - Status insertRangeFrom(const Subcolumn& src, size_t start, size_t length); + void insertRangeFrom(const Subcolumn& src, size_t start, size_t length); void pop_back(size_t n); @@ -279,12 +279,7 @@ public: void for_each_subcolumn(ColumnCallback callback) override; // Do nothing, call try_insert instead - void insert(const Field& field) override { - Status st = try_insert(field); - if (!st.ok()) { - LOG(FATAL) << "insert return ERROR status: " << st; - } - } + void insert(const Field& field) override { try_insert(field); } void insert_range_from(const IColumn& src, size_t start, size_t length) override; @@ -294,13 +289,12 @@ public: void insert_indices_from(const IColumn& src, const int* indices_begin, const int* indices_end) override; - // Only called in Block::add_row - Status try_insert(const Field& field); + // May throw execption + void try_insert(const Field& field); - Status 
try_insert_from(const IColumn& src, size_t n); + void try_insert_from(const IColumn& src, size_t n); - // Only called in Block::add_row - Status try_insert_range_from(const IColumn& src, size_t start, size_t length); + void try_insert_range_from(const IColumn& src, size_t start, size_t length); void insert_default() override; diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index 7bcfacb1a0..798722d1b0 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -35,6 +35,7 @@ #include <vector> #include "common/config.h" +#include "common/status.h" #include "olap/olap_common.h" #include "runtime/client_cache.h" #include "runtime/exec_env.h" @@ -273,11 +274,11 @@ Status send_add_columns_rpc(ColumnsWithTypeAndName column_type_names, return Status::OK(); } -void unfold_object(size_t dynamic_col_position, Block& block, bool cast_to_original_type) { +Status unfold_object(size_t dynamic_col_position, Block& block, bool cast_to_original_type) { auto dynamic_col = block.get_by_position(dynamic_col_position).column->assume_mutable(); auto* column_object_ptr = assert_cast<ColumnObject*>(dynamic_col.get()); if (column_object_ptr->empty()) { - return; + return Status::OK(); } size_t num_rows = column_object_ptr->size(); CHECK(block.rows() <= num_rows); @@ -308,7 +309,8 @@ void unfold_object(size_t dynamic_col_position, Block& block, bool cast_to_origi } if (cast_to_original_type && !dst_type->equals(*types[i])) { // Cast static columns to original slot type - schema_util::cast_column({subcolumns[i], types[i], ""}, dst_type, &column); + RETURN_IF_ERROR( + schema_util::cast_column({subcolumns[i], types[i], ""}, dst_type, &column)); } // replace original column column_type_name->column = column; @@ -326,6 +328,16 @@ void unfold_object(size_t dynamic_col_position, Block& block, bool cast_to_origi entry.column->assume_mutable()->insert_many_defaults(num_rows - entry.column->size()); } } +#ifndef NDEBUG + for (const 
auto& column_type_name : block) { + if (column_type_name.column->size() != num_rows) { + LOG(FATAL) << "unmatched column:" << column_type_name.name + << ", expeted rows:" << num_rows + << ", but meet:" << column_type_name.column->size(); + } + } +#endif + return Status::OK(); } void LocalSchemaChangeRecorder::add_extended_columns(const TabletColumn& new_column, diff --git a/be/src/vec/common/schema_util.h b/be/src/vec/common/schema_util.h index 978eaa19c7..fda6a69cf4 100644 --- a/be/src/vec/common/schema_util.h +++ b/be/src/vec/common/schema_util.h @@ -65,7 +65,7 @@ Status cast_column(const ColumnWithTypeAndName& arg, const DataTypePtr& type, Co // from object column and casted to the new type from slot_descs. // Also if column in block is empty, it will be filled // with num_rows of default values -void unfold_object(size_t dynamic_col_position, Block& block, bool cast_to_original_type); +Status unfold_object(size_t dynamic_col_position, Block& block, bool cast_to_original_type); /// If both of types are signed/unsigned integers and size of left field type /// is less than right type, we don't need to convert field, diff --git a/be/src/vec/data_types/convert_field_to_type.cpp b/be/src/vec/data_types/convert_field_to_type.cpp index bc7a821e64..a2a1377050 100644 --- a/be/src/vec/data_types/convert_field_to_type.cpp +++ b/be/src/vec/data_types/convert_field_to_type.cpp @@ -30,6 +30,8 @@ #include <type_traits> #include <vector> +#include "common/exception.h" +#include "common/status.h" #include "vec/common/field_visitors.h" #include "vec/common/typeid_cast.h" #include "vec/core/accurate_comparison.h" @@ -94,8 +96,9 @@ Field convert_numeric_type_impl(const Field& from) { } return result; } + template <typename To> -Status convert_numric_type(const Field& from, const IDataType& type, Field* to) { +void convert_numric_type(const Field& from, const IDataType& type, Field* to) { if (from.get_type() == Field::Types::UInt64) { *to = convert_numeric_type_impl<UInt64, 
To>(from); } else if (from.get_type() == Field::Types::Int64) { @@ -107,18 +110,17 @@ Status convert_numric_type(const Field& from, const IDataType& type, Field* to) } else if (from.get_type() == Field::Types::Int128) { *to = convert_numeric_type_impl<Int128, To>(from); } else { - return Status::InvalidArgument( - fmt::format("Type mismatch in IN or VALUES section. Expected: {}. Got: {}", - type.get_name(), from.get_type())); + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, + "Type mismatch in IN or VALUES section. Expected: {}. Got: {}", + type.get_name(), from.get_type()); } - return Status::OK(); } -Status convert_field_to_typeImpl(const Field& src, const IDataType& type, - const IDataType* from_type_hint, Field* to) { +void convert_field_to_typeImpl(const Field& src, const IDataType& type, + const IDataType* from_type_hint, Field* to) { if (from_type_hint && from_type_hint->equals(type)) { *to = src; - return Status::OK(); + return; } WhichDataType which_type(type); // TODO add more types @@ -163,16 +165,16 @@ Status convert_field_to_typeImpl(const Field& src, const IDataType& type, src.get_type() == Field::Types::UInt64) { /// We don't need any conversion UInt64 is under type of Date and DateTime *to = src; - return Status::OK(); + return; } } else if (which_type.is_string_or_fixed_string()) { if (src.get_type() == Field::Types::String) { *to = src; - return Status::OK(); + return; } // TODO this is a very simple translator, support more complex types *to = apply_visitor(FieldVisitorToStringSimple(), src); - return Status::OK(); + return; } else if (const DataTypeArray* type_array = typeid_cast<const DataTypeArray*>(&type)) { if (src.get_type() == Field::Types::Array) { const Array& src_arr = src.get<Array>(); @@ -180,14 +182,14 @@ Status convert_field_to_typeImpl(const Field& src, const IDataType& type, const auto& element_type = *(type_array->get_nested_type()); Array res(src_arr_size); for (size_t i = 0; i < src_arr_size; ++i) { - 
RETURN_IF_ERROR(convert_field_to_type(src_arr[i], element_type, &res[i])); + convert_field_to_type(src_arr[i], element_type, &res[i]); if (res[i].is_null() && !element_type.is_nullable()) { - return Status::InvalidArgument( - fmt::format("Cannot convert NULL to {}", element_type.get_name())); + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Cannot convert NULL to {}", + element_type.get_name()); } } *to = Field(res); - return Status::OK(); + return; } } // else if (const DataTypeTuple* type_tuple = typeid_cast<const DataTypeTuple*>(&type)) { @@ -229,31 +231,31 @@ Status convert_field_to_typeImpl(const Field& src, const IDataType& type, // return Status::OK(); // } // } - return Status::InvalidArgument( - fmt::format("Type mismatch in IN or VALUES section. Expected: {}. Got: {}", - type.get_name(), src.get_type())); + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, + "Type mismatch in IN or VALUES section. Expected: {}. Got: {}", + type.get_name(), src.get_type()); } } // namespace -Status convert_field_to_type(const Field& from_value, const IDataType& to_type, Field* to, - const IDataType* from_type_hint) { +void convert_field_to_type(const Field& from_value, const IDataType& to_type, Field* to, + const IDataType* from_type_hint) { if (from_value.is_null()) { *to = from_value; - return Status::OK(); + return; } if (from_type_hint && from_type_hint->equals(to_type)) { *to = from_value; - return Status::OK(); + return; } if (const auto* nullable_type = typeid_cast<const DataTypeNullable*>(&to_type)) { const IDataType& nested_type = *nullable_type->get_nested_type(); /// NULL remains NULL after any conversion. 
if (WhichDataType(nested_type).is_nothing()) { *to = {}; - return Status::OK(); + return; } if (from_type_hint && from_type_hint->equals(nested_type)) { *to = from_value; - return Status::OK(); + return; } return convert_field_to_typeImpl(from_value, nested_type, from_type_hint, to); } else { diff --git a/be/src/vec/data_types/convert_field_to_type.h b/be/src/vec/data_types/convert_field_to_type.h index 8d57994c31..07d48452a4 100644 --- a/be/src/vec/data_types/convert_field_to_type.h +++ b/be/src/vec/data_types/convert_field_to_type.h @@ -33,7 +33,7 @@ class IDataType; * If the value does not fall into the range - returns Null. */ -Status convert_field_to_type(const Field& from_value, const IDataType& to_type, Field* field, - const IDataType* from_type_hint = nullptr); +void convert_field_to_type(const Field& from_value, const IDataType& to_type, Field* field, + const IDataType* from_type_hint = nullptr); } // namespace doris::vectorized diff --git a/be/src/vec/data_types/get_least_supertype.cpp b/be/src/vec/data_types/get_least_supertype.cpp index 3d08a77443..be9dd5c05c 100644 --- a/be/src/vec/data_types/get_least_supertype.cpp +++ b/be/src/vec/data_types/get_least_supertype.cpp @@ -20,6 +20,7 @@ #include "vec/data_types/get_least_supertype.h" +#include <fmt/core.h> #include <fmt/format.h> #include <glog/logging.h> #include <stddef.h> @@ -31,6 +32,7 @@ #include <string> #include <vector> +#include "common/status.h" #include "vec/common/typeid_cast.h" #include "vec/core/types.h" #include "vec/data_types/data_type.h" @@ -45,23 +47,36 @@ namespace doris::vectorized { namespace { +String type_to_string(const DataTypePtr& type) { + return type->get_name(); +} +String type_to_string(const TypeIndex& type) { + return getTypeName(type); +} + +template <typename DataTypes> String get_exception_message_prefix(const DataTypes& types) { std::stringstream res; res << "There is no supertype for types "; - bool first = true; for (const auto& type : types) { if (!first) res << ", 
"; first = false; - - res << type->get_name(); + res << type_to_string(type); } - return res.str(); } + } // namespace -Status get_numeric_type(const TypeIndexSet& types, DataTypePtr* type) { +void get_numeric_type(const TypeIndexSet& types, DataTypePtr* type, bool compatible_with_string) { + auto throw_or_return = [&](std::string_view message, int error_code) { + if (compatible_with_string) { + *type = std::make_shared<DataTypeString>(); + return; + } + throw doris::Exception(error_code, message); + }; bool all_numbers = true; size_t max_bits_of_signed_integer = 0; @@ -104,9 +119,11 @@ Status get_numeric_type(const TypeIndexSet& types, DataTypePtr* type) { if (max_bits_of_signed_integer || max_bits_of_unsigned_integer || max_mantissa_bits_of_floating) { if (!all_numbers) { - LOG(INFO) << " because some of them are numbers and some of them are not"; *type = nullptr; - return Status::InvalidArgument("some of them are numbers and some of them are not"); + return throw_or_return( + get_exception_message_prefix(types) + + " because some of them are numbers and some of them are not", + doris::ErrorCode::INVALID_ARGUMENT); } /// If there are signed and unsigned types of same bit-width, the result must be signed number with at least one more bit. @@ -117,8 +134,9 @@ Status get_numeric_type(const TypeIndexSet& types, DataTypePtr* type) { /// If unsigned is not covered by signed. if (max_bits_of_signed_integer && - max_bits_of_unsigned_integer >= max_bits_of_signed_integer) + max_bits_of_unsigned_integer >= max_bits_of_signed_integer) { ++min_bit_width_of_integer; + } /// If the result must be floating. 
if (max_mantissa_bits_of_floating) { @@ -126,18 +144,21 @@ Status get_numeric_type(const TypeIndexSet& types, DataTypePtr* type) { std::max(min_bit_width_of_integer, max_mantissa_bits_of_floating); if (min_mantissa_bits <= 24) { *type = std::make_shared<DataTypeFloat32>(); - return Status::OK(); + return; } else if (min_mantissa_bits <= 53) { *type = std::make_shared<DataTypeFloat64>(); - return Status::OK(); + return; } else { LOG(INFO) << " because some of them are integers and some are floating point " "but there is no floating point type, that can exactly represent " "all required integers"; *type = nullptr; - return Status::InvalidArgument( - "there is no floating point type, that can exactly represent " - "all required integers"); + return throw_or_return( + get_exception_message_prefix(types) + + " because some of them are integers and some are floating point " + "but there is no floating point type, that can exactly represent " + "all required integers", + doris::ErrorCode::INVALID_ARGUMENT); } } @@ -145,23 +166,25 @@ Status get_numeric_type(const TypeIndexSet& types, DataTypePtr* type) { if (max_bits_of_signed_integer) { if (min_bit_width_of_integer <= 8) { *type = std::make_shared<DataTypeInt8>(); - return Status::OK(); + return; } else if (min_bit_width_of_integer <= 16) { *type = std::make_shared<DataTypeInt16>(); - return Status::OK(); + return; } else if (min_bit_width_of_integer <= 32) { *type = std::make_shared<DataTypeInt32>(); - return Status::OK(); + return; } else if (min_bit_width_of_integer <= 64) { *type = std::make_shared<DataTypeInt64>(); - return Status::OK(); + return; } else { LOG(INFO) << " because some of them are signed integers and some are unsigned " "integers, but there is no signed integer type, that can exactly " "represent all required unsigned integer values"; - return Status::InvalidArgument( - "there is no signed integer type, that can exactly " - "represent all required unsigned integer values"); + return throw_or_return( + 
" because some of them are signed integers and some are unsigned " + "integers, but there is no signed integer type, that can exactly " + "represent all required unsigned integer values", + doris::ErrorCode::INVALID_ARGUMENT); } } @@ -169,43 +192,50 @@ Status get_numeric_type(const TypeIndexSet& types, DataTypePtr* type) { { if (min_bit_width_of_integer <= 8) { *type = std::make_shared<DataTypeUInt8>(); - return Status::OK(); + return; } else if (min_bit_width_of_integer <= 16) { *type = std::make_shared<DataTypeUInt16>(); - return Status::OK(); + return; } else if (min_bit_width_of_integer <= 32) { *type = std::make_shared<DataTypeUInt32>(); - return Status::OK(); + return; } else if (min_bit_width_of_integer <= 64) { *type = std::make_shared<DataTypeUInt64>(); - return Status::OK(); + return; } else { - LOG(FATAL) << "Logical error: " - << "but as all data types are unsigned integers, we must have found " - "maximum unsigned integer type"; + LOG(WARNING) << "Logical error: " + << "but as all data types are unsigned integers, we must have found " + "maximum unsigned integer type"; *type = nullptr; - return Status::InvalidArgument( - "all data types are unsigned integers, we must have found " - "maximum unsigned integer type"); + return throw_or_return( + "Logical error: " + get_exception_message_prefix(types) + + " but as all data types are unsigned integers, we must have found " + "maximum unsigned integer type", + doris::ErrorCode::INVALID_ARGUMENT); } } } *type = nullptr; - return Status::OK(); } // TODO conflict type resolve -Status get_least_supertype(const DataTypes& types, DataTypePtr* type, bool compatible_with_string) { +void get_least_supertype(const DataTypes& types, DataTypePtr* type, bool compatible_with_string) { /// Trivial cases - + auto throw_or_return = [&](std::string_view message, int error_code) { + if (compatible_with_string) { + *type = std::make_shared<DataTypeString>(); + return; + } + throw doris::Exception(error_code, String(message)); + 
}; if (types.empty()) { *type = std::make_shared<DataTypeNothing>(); - return Status::OK(); + return; } if (types.size() == 1) { *type = types[0]; - return Status::OK(); + return; } /// All types are equal @@ -220,7 +250,7 @@ Status get_least_supertype(const DataTypes& types, DataTypePtr* type, bool compa if (all_equal) { *type = types[0]; - return Status::OK(); + return; } } @@ -231,12 +261,16 @@ Status get_least_supertype(const DataTypes& types, DataTypePtr* type, bool compa DataTypes non_nothing_types; non_nothing_types.reserve(types.size()); - for (const auto& type : types) - if (!typeid_cast<const DataTypeNothing*>(type.get())) + for (const auto& type : types) { + if (!typeid_cast<const DataTypeNothing*>(type.get())) { non_nothing_types.emplace_back(type); + } + } - if (non_nothing_types.size() < types.size()) - return get_least_supertype(non_nothing_types, type, compatible_with_string); + if (non_nothing_types.size() < types.size()) { + get_least_supertype(non_nothing_types, type, compatible_with_string); + return; + } } /// For Nullable @@ -251,27 +285,28 @@ Status get_least_supertype(const DataTypes& types, DataTypePtr* type, bool compa typeid_cast<const DataTypeNullable*>(type.get())) { have_nullable = true; - if (!type_nullable->only_null()) + if (!type_nullable->only_null()) { nested_types.emplace_back(type_nullable->get_nested_type()); - } else + } + } else { nested_types.emplace_back(type); + } } if (have_nullable) { DataTypePtr nested_type; - Status st = get_least_supertype(nested_types, &nested_type, compatible_with_string); - if (!st.ok()) { - return st; - } + get_least_supertype(nested_types, &nested_type, compatible_with_string); *type = std::make_shared<DataTypeNullable>(nested_type); - return st; + return; } } /// Non-recursive rules phmap::flat_hash_set<TypeIndex> type_ids; - for (const auto& type : types) type_ids.insert(type->get_type_id()); + for (const auto& type : types) { + type_ids.insert(type->get_type_id()); + } /// For String and 
FixedString, or for different FixedStrings, the common type is String. /// No other types are compatible with Strings. TODO Enums? @@ -285,12 +320,14 @@ Status get_least_supertype(const DataTypes& types, DataTypePtr* type, bool compa LOG(INFO) << get_exception_message_prefix(types) << " because some of them are String/FixedString and some of them are not"; - return Status::InvalidArgument( - "some of them are String/FixedString and some of them are not"); + return throw_or_return(get_exception_message_prefix(types) + + " because some of them are String/FixedString and " + "some of them are not", + doris::ErrorCode::INVALID_ARGUMENT); } *type = std::make_shared<DataTypeString>(); - return Status::OK(); + return; } } @@ -304,12 +341,14 @@ Status get_least_supertype(const DataTypes& types, DataTypePtr* type, bool compa if (!all_date_or_datetime) { LOG(INFO) << get_exception_message_prefix(types) << " because some of them are Date/DateTime and some of them are not"; - return Status::InvalidArgument( - "because some of them are Date/DateTime and some of them are not"); + return throw_or_return( + get_exception_message_prefix(types) + + " because some of them are Date/DateTime and some of them are not", + doris::ErrorCode::INVALID_ARGUMENT); } *type = std::make_shared<DataTypeDateTime>(); - return Status::OK(); + return; } } @@ -340,8 +379,10 @@ Status get_least_supertype(const DataTypes& types, DataTypePtr* type, bool compa if (num_supported != type_ids.size()) { LOG(INFO) << get_exception_message_prefix(types) << " because some of them have no lossless convertion to Decimal"; - return Status::InvalidArgument( - "some of them have no lossless convertion to Decimal"); + return throw_or_return( + get_exception_message_prefix(types) + + " because some of them have no lossless convertion to Decimal", + doris::ErrorCode::INVALID_ARGUMENT); } UInt32 max_scale = 0; @@ -364,50 +405,58 @@ Status get_least_supertype(const DataTypes& types, DataTypePtr* type, bool compa LOG(INFO) 
<< fmt::format("{} because the least supertype is Decimal({},{})", get_exception_message_prefix(types), min_precision, max_scale); - return Status::InvalidArgument( - fmt::format("{} because the least supertype is Decimal({},{})", - get_exception_message_prefix(types), min_precision, max_scale)); + return throw_or_return(get_exception_message_prefix(types) + + fmt::format(" because some of them have no lossless " + "convertion to Decimal({},{})", + min_precision, max_scale), + doris::ErrorCode::INVALID_ARGUMENT); } if (have_decimal128 || min_precision > DataTypeDecimal<Decimal64>::max_precision()) { *type = std::make_shared<DataTypeDecimal<Decimal128>>( DataTypeDecimal<Decimal128>::max_precision(), max_scale); - return Status::OK(); + return; } if (have_decimal128i || min_precision > DataTypeDecimal<Decimal64>::max_precision()) { *type = std::make_shared<DataTypeDecimal<Decimal128I>>( DataTypeDecimal<Decimal128I>::max_precision(), max_scale); - return Status::OK(); + return; } if (have_decimal64 || min_precision > DataTypeDecimal<Decimal32>::max_precision()) { *type = std::make_shared<DataTypeDecimal<Decimal64>>( DataTypeDecimal<Decimal64>::max_precision(), max_scale); - return Status::OK(); + return; } *type = std::make_shared<DataTypeDecimal<Decimal32>>( DataTypeDecimal<Decimal32>::max_precision(), max_scale); - return Status::OK(); + return; } } /// For numeric types, the most complicated part. { DataTypePtr numeric_type = nullptr; - Status st = get_numeric_type(type_ids, &numeric_type); + get_numeric_type(type_ids, &numeric_type, compatible_with_string); if (numeric_type) { - DCHECK(st.ok()); *type = numeric_type; - return Status::OK(); + return; } } /// All other data types (UUID, AggregateFunction, Enum...) are compatible only if they are the same (checked in trivial cases). 
*type = nullptr; - return Status::InvalidArgument(get_exception_message_prefix(types)); + return throw_or_return(get_exception_message_prefix(types), ErrorCode::INVALID_ARGUMENT); } -Status get_least_supertype(const TypeIndexSet& types, DataTypePtr* type, - bool compatible_with_string) { +void get_least_supertype(const TypeIndexSet& types, DataTypePtr* type, + bool compatible_with_string) { + auto throw_or_return = [&](std::string_view message, int error_code) { + if (compatible_with_string) { + *type = std::make_shared<DataTypeString>(); + return; + } + throw doris::Exception(error_code, String(message)); + }; TypeIndexSet types_set; for (const auto& t : types) { if (WhichDataType(t).is_nothing()) continue; @@ -416,8 +465,9 @@ Status get_least_supertype(const TypeIndexSet& types, DataTypePtr* type, LOG(INFO) << "Cannot get common type by type ids with parametric type" << getTypeName(t); *type = nullptr; - return Status::InvalidArgument( - "Cannot get common type by type ids with parametric type"); + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, + "Cannot get common type by type ids with parametric type {}", + type_to_string(t)); } types_set.insert(t); @@ -425,31 +475,33 @@ Status get_least_supertype(const TypeIndexSet& types, DataTypePtr* type, if (types_set.empty()) { *type = std::make_shared<DataTypeNothing>(); - return Status::OK(); + return; } if (types.count(TypeIndex::String)) { - if (types.size() != 1 && !compatible_with_string) { + if (types.size() != 1) { LOG(INFO) << " because some of them are String and some of them are not"; *type = nullptr; - return Status::InvalidArgument("some of them are String and some of them are not"); + return throw_or_return( + get_exception_message_prefix(types) + + " because some of them are String and some of them are not", + ErrorCode::INVALID_ARGUMENT); } *type = std::make_shared<DataTypeString>(); - return Status::OK(); + return; } /// For numeric types, the most complicated part. 
DataTypePtr numeric_type = nullptr; - Status st = get_numeric_type(types, &numeric_type); + get_numeric_type(types, &numeric_type, compatible_with_string); if (numeric_type) { - DCHECK(st.ok()); *type = numeric_type; - return Status::OK(); + return; } /// All other data types (UUID, AggregateFunction, Enum...) are compatible only if they are the same (checked in trivial cases). *type = nullptr; - return Status::InvalidArgument("unknown type"); + return throw_or_return(get_exception_message_prefix(types), ErrorCode::INVALID_ARGUMENT); } } // namespace doris::vectorized diff --git a/be/src/vec/data_types/get_least_supertype.h b/be/src/vec/data_types/get_least_supertype.h index ad52f553df..72a6fc31f2 100644 --- a/be/src/vec/data_types/get_least_supertype.h +++ b/be/src/vec/data_types/get_least_supertype.h @@ -42,10 +42,10 @@ namespace doris::vectorized { using TypeIndexSet = phmap::flat_hash_set<TypeIndex>; -Status get_least_supertype(const DataTypes& types, DataTypePtr* type, - bool compatible_with_string = false); +void get_least_supertype(const DataTypes& types, DataTypePtr* type, + bool compatible_with_string = false); -Status get_least_supertype(const TypeIndexSet& types, DataTypePtr* type, - bool compatible_with_string = false); +void get_least_supertype(const TypeIndexSet& types, DataTypePtr* type, + bool compatible_with_string = false); } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp index daa549776b..5925515200 100644 --- a/be/src/vec/exec/format/json/new_json_reader.cpp +++ b/be/src/vec/exec/format/json/new_json_reader.cpp @@ -39,6 +39,7 @@ // IWYU pragma: no_include <opentelemetry/common/threadlocal.h> #include "common/compiler_util.h" // IWYU pragma: keep +#include "common/status.h" #include "exprs/json_functions.h" #include "io/file_factory.h" #include "io/fs/buffered_reader.h" @@ -460,8 +461,7 @@ Status NewJsonReader::_parse_dynamic_json(bool* is_empty_row, 
bool* eof, Block& _bytes_read_counter += size; auto& dynamic_column = block.get_columns().back()->assume_mutable_ref(); auto& column_object = assert_cast<vectorized::ColumnObject&>(dynamic_column); - Defer __finalize_clousure([&] { - // Reached buffer size, unfold intermediate column object + auto finalize_column = [&]() -> Status { size_t batch_size = std::max(_state->batch_size(), (int)_MIN_BATCH_SIZE); if (column_object.size() >= batch_size || _reader_eof) { column_object.finalize(); @@ -470,35 +470,22 @@ Status NewJsonReader::_parse_dynamic_json(bool* is_empty_row, bool* eof, Block& } // Unfold object columns for the purpose of extracting static columns and // fill default values missing in static columns - schema_util::unfold_object(block.columns() - 1, block, - true /*cast to original column type*/); + RETURN_IF_ERROR(schema_util::unfold_object(block.columns() - 1, block, + true /*cast to original column type*/)); } - }); + return Status::OK(); + }; // read all data, then return if (size == 0 || *eof) { *is_empty_row = true; + RETURN_IF_ERROR(finalize_column()); return Status::OK(); } - Status st = doris::vectorized::parse_json_to_variant(column_object, StringRef {json_str, size}, - _json_parser.get()); - if (st.is<DATA_QUALITY_ERROR>()) { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, "Parse json data for JsonDoc failed. error info: {}", - st.to_string()); - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { return std::string((char*)json_str, size); }, - [&]() -> std::string { return fmt::to_string(error_msg); }, _scanner_eof)); - _counter->num_rows_filtered++; - if (*_scanner_eof) { - // Case A: if _scanner_eof is set to true in "append_error_msg_to_file", which means - // we meet enough invalid rows and the scanner should be stopped. - // So we set eof to true and return OK, the caller will stop the process as we meet the end of file. 
- *eof = true; - return Status::OK(); - } - return Status::DataQualityError(fmt::to_string(error_msg)); - } + RETURN_IF_CATCH_EXCEPTION(doris::vectorized::parse_json_to_variant( + column_object, StringRef {json_str, size}, _json_parser.get())); + // TODO correctly handle data quality error + RETURN_IF_ERROR(finalize_column()); return Status::OK(); } diff --git a/be/src/vec/json/json_parser.cpp b/be/src/vec/json/json_parser.cpp index 95dd0034c3..a05a1737b9 100644 --- a/be/src/vec/json/json_parser.cpp +++ b/be/src/vec/json/json_parser.cpp @@ -161,8 +161,9 @@ void JSONDataParser<ParserImpl>::traverseArrayElement(const Element& element, if (current_nested_sizes.size() == ctx.current_size) { current_nested_sizes.push_back(array_size); } else if (array_size != current_nested_sizes.back()) { - LOG(FATAL) << fmt::format("Array sizes mismatched ({} and {})", array_size, - current_nested_sizes.back()); + throw doris::Exception(doris::ErrorCode::INTERNAL_ERROR, + "Array sizes mismatched ({} and {})", array_size, + current_nested_sizes.back()); } } path_array.push_back(std::move(values[i])); diff --git a/be/src/vec/json/parse2column.cpp b/be/src/vec/json/parse2column.cpp index 7b12d3e7cd..1ae5d0d591 100644 --- a/be/src/vec/json/parse2column.cpp +++ b/be/src/vec/json/parse2column.cpp @@ -192,8 +192,8 @@ bool try_insert_default_from_nested(const std::shared_ptr<Node>& entry, } template <typename ParserImpl> -Status parse_json_to_variant(IColumn& column, const char* src, size_t length, - JSONDataParser<ParserImpl>* parser) { +void parse_json_to_variant(IColumn& column, const char* src, size_t length, + JSONDataParser<ParserImpl>* parser) { auto& column_object = assert_cast<ColumnObject&>(column); std::optional<ParseResult> result; /// Treat empty string as an empty object @@ -205,8 +205,8 @@ Status parse_json_to_variant(IColumn& column, const char* src, size_t length, } if (!result) { LOG(INFO) << "failed to parse " << std::string_view(src, length) << ", length= " << length; - 
return Status::DataQualityError( - fmt::format("Failed to parse object {}", std::string_view(src, length))); + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to parse object {}", + std::string_view(src, length)); } auto& [paths, values] = *result; assert(paths.size() == values.size()); @@ -214,18 +214,21 @@ Status parse_json_to_variant(IColumn& column, const char* src, size_t length, size_t num_rows = column_object.size(); for (size_t i = 0; i < paths.size(); ++i) { FieldInfo field_info; - RETURN_IF_ERROR(get_field_info(values[i], &field_info)); + get_field_info(values[i], &field_info); // TODO support multi dimensions array if (!config::enable_parse_multi_dimession_array && field_info.num_dimensions >= 2) { - return Status::DataQualityError( + throw doris::Exception( + ErrorCode::INVALID_ARGUMENT, "Sorry multi dimensions array is not supported now, we are working on it"); } if (is_nothing(field_info.scalar_type)) { continue; } if (!paths_set.insert(paths[i].get_path()).second) { - return Status::DataQualityError( - fmt::format("Object has ambiguous path {}, {}", paths[i].get_path())); + // return Status::DataQualityError( + // fmt::format("Object has ambiguous path {}, {}", paths[i].get_path())); + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Object has ambiguous path {}", + paths[i].get_path()); } if (!column_object.has_subcolumn(paths[i])) { @@ -237,16 +240,11 @@ Status parse_json_to_variant(IColumn& column, const char* src, size_t length, } auto* subcolumn = column_object.get_subcolumn(paths[i]); if (!subcolumn) { - return Status::DataQualityError( - fmt::format("Failed to find sub column {}", paths[i].get_path())); + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Failed to find sub column {}", + paths[i].get_path()); } assert(subcolumn->size() == num_rows); - Status st = subcolumn->insert(std::move(values[i]), std::move(field_info)); - if (st.is_invalid_argument()) { - return Status::DataQualityError( - fmt::format("Failed to insert 
field {}", st.to_string())); - } - RETURN_IF_ERROR(st); + subcolumn->insert(std::move(values[i]), std::move(field_info)); } // /// Insert default values to missed subcolumns. const auto& subcolumns = column_object.get_subcolumns(); @@ -259,7 +257,6 @@ Status parse_json_to_variant(IColumn& column, const char* src, size_t length, } } column_object.incr_num_rows(); - return Status::OK(); } bool extract_key(MutableColumns& columns, StringRef json, const std::vector<StringRef>& keys, @@ -268,17 +265,16 @@ bool extract_key(MutableColumns& columns, StringRef json, const std::vector<Stri } // exposed interfaces -Status parse_json_to_variant(IColumn& column, const StringRef& json, - JSONDataParser<SimdJSONParser>* parser) { +void parse_json_to_variant(IColumn& column, const StringRef& json, + JSONDataParser<SimdJSONParser>* parser) { return parse_json_to_variant(column, json.data, json.size, parser); } -Status parse_json_to_variant(IColumn& column, const std::vector<StringRef>& jsons) { +void parse_json_to_variant(IColumn& column, const std::vector<StringRef>& jsons) { auto parser = parsers_pool.get([] { return new JSONDataParser<SimdJSONParser>(); }); for (StringRef str : jsons) { - RETURN_IF_ERROR(parse_json_to_variant(column, str.data, str.size, parser.get())); + parse_json_to_variant(column, str.data, str.size, parser.get()); } - return Status::OK(); } bool extract_key(MutableColumns& columns, const std::vector<StringRef>& jsons, diff --git a/be/src/vec/json/parse2column.h b/be/src/vec/json/parse2column.h index 0bc13dd591..4e78d98164 100644 --- a/be/src/vec/json/parse2column.h +++ b/be/src/vec/json/parse2column.h @@ -35,12 +35,12 @@ class JSONDataParser; namespace doris::vectorized { -// parse a batch of json strings into column object -Status parse_json_to_variant(IColumn& column, const std::vector<StringRef>& jsons); +// parse a batch of json strings into column object, throws doris::Exception when failed +void parse_json_to_variant(IColumn& column, const
std::vector<StringRef>& jsons); -// parse a single json -Status parse_json_to_variant(IColumn& column, const StringRef& jsons, - JSONDataParser<SimdJSONParser>* parser); +// parse a single json, throws doris::Exception when failed +void parse_json_to_variant(IColumn& column, const StringRef& jsons, + JSONDataParser<SimdJSONParser>* parser); // extract keys columns from json strings into columns bool extract_key(MutableColumns& columns, const std::vector<StringRef>& jsons, diff --git a/be/src/vec/olap/olap_data_convertor.cpp b/be/src/vec/olap/olap_data_convertor.cpp index 3070975a3c..1deff5f6d2 100644 --- a/be/src/vec/olap/olap_data_convertor.cpp +++ b/be/src/vec/olap/olap_data_convertor.cpp @@ -816,7 +816,7 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorStruct::convert_to_olap() DataTypePtr sub_type = data_type_struct->get_element(i); ColumnWithTypeAndName sub_typed_column = {sub_column, sub_type, ""}; _sub_convertors[i]->set_source_column(sub_typed_column, _row_pos, _num_rows); - _sub_convertors[i]->convert_to_olap(); + RETURN_IF_ERROR(_sub_convertors[i]->convert_to_olap()); _results[data_cursor] = _sub_convertors[i]->get_data(); _results[null_map_cursor] = _sub_convertors[i]->get_nullmap(); data_cursor++; @@ -867,7 +867,7 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorArray::convert_to_olap( ColumnWithTypeAndName item_typed_column = { item_data, remove_nullable(data_type_array->get_nested_type()), ""}; _item_convertor->set_source_column(item_typed_column, start, size); - _item_convertor->convert_to_olap(); + RETURN_IF_ERROR(_item_convertor->convert_to_olap()); CollectionValue* collection_value = _values.data(); for (size_t i = 0; i < _num_rows; ++i, ++collection_value) { @@ -954,12 +954,12 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorMap::convert_to_olap( _base_offset += elem_size; ColumnWithTypeAndName key_typed_column = {key_data, data_type_map->get_key_type(), "map.key"}; _key_convertor->set_source_column(key_typed_column,
start_offset, elem_size); - _key_convertor->convert_to_olap(); + RETURN_IF_ERROR(_key_convertor->convert_to_olap()); ColumnWithTypeAndName value_typed_column = {value_data, data_type_map->get_value_type(), "map.value"}; _value_convertor->set_source_column(value_typed_column, start_offset, elem_size); - _value_convertor->convert_to_olap(); + RETURN_IF_ERROR(_value_convertor->convert_to_olap()); // todo (Amory). put this value into MapValue _results[0] = (void*)elem_size; diff --git a/lowercase.json b/lowercase.json new file mode 100644 index 0000000000..f2728cd0aa --- /dev/null +++ b/lowercase.json @@ -0,0 +1,5 @@ +{"xxxx": 1234} +{"xxxx": 12345678} +{"xxxx": "5679"} +{"XXXX": "5679"} +{"YYY": "5679"} diff --git a/regression-test/data/dynamic_table_p0/floating_point.json b/regression-test/data/dynamic_table_p0/floating_point.json new file mode 100644 index 0000000000..9d4e742a0e --- /dev/null +++ b/regression-test/data/dynamic_table_p0/floating_point.json @@ -0,0 +1,4 @@ +{"type":1} +{"c": 10000000000} +{"c":1.0} +{"ca":1} diff --git a/regression-test/data/dynamic_table_p0/floating_point2.json b/regression-test/data/dynamic_table_p0/floating_point2.json new file mode 100644 index 0000000000..5ad60cd4fc --- /dev/null +++ b/regression-test/data/dynamic_table_p0/floating_point2.json @@ -0,0 +1,4 @@ +{"type":1} +{"a": 10000000000} +{"a":1.0} +{"a":1} diff --git a/regression-test/data/dynamic_table_p0/floating_point3.json b/regression-test/data/dynamic_table_p0/floating_point3.json new file mode 100644 index 0000000000..eab9e4c757 --- /dev/null +++ b/regression-test/data/dynamic_table_p0/floating_point3.json @@ -0,0 +1,4 @@ +{"type":1} +{"c": "10110101"} +{"c":1991.0222} +{"c":100000} diff --git a/regression-test/data/dynamic_table_p0/invalid_dimension.json b/regression-test/data/dynamic_table_p0/invalid_dimension.json new file mode 100644 index 0000000000..23311c2692 --- /dev/null +++ b/regression-test/data/dynamic_table_p0/invalid_dimension.json @@ -0,0 +1,4 @@ 
+{"type":1} +{"b":[1, 2]} +{"b":""} +{"b":1} diff --git a/regression-test/data/dynamic_table_p0/invalid_format.json b/regression-test/data/dynamic_table_p0/invalid_format.json new file mode 100644 index 0000000000..d9eca95a2a --- /dev/null +++ b/regression-test/data/dynamic_table_p0/invalid_format.json @@ -0,0 +1,3 @@ +{"type":1} +{"a:[1, 2]} +{"a":1} diff --git a/regression-test/data/dynamic_table_p0/load.out b/regression-test/data/dynamic_table_p0/load.out new file mode 100644 index 0000000000..59e01e1dd4 --- /dev/null +++ b/regression-test/data/dynamic_table_p0/load.out @@ -0,0 +1,1939 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +42890 + +-- !sql -- +42890 + +-- !sql -- +42890 + +-- !sql -- +42890 + +-- !sql -- +42890 + +-- !sql -- +42890 + +-- !sql -- +42890 + +-- !sql -- +42890 + +-- !sql -- +42890 + +-- !sql -- +42890 + +-- !sql -- +42890 + +-- !sql -- +42890 + +-- !sql -- +11924 + +-- !sql -- +23413 + +-- !sql -- +23413 + +-- !sql -- +23413 + +-- !sql -- +27232 + +-- !sql -- +23413 + +-- !sql -- +23413 + +-- !sql -- +38488 + +-- !sql -- +38488 + +-- !sql -- +38488 + +-- !sql -- +38488 + +-- !sql -- +38488 + +-- !sql -- +38488 + +-- !sql -- +12051 + +-- !sql -- +12051 + +-- !sql -- +12051 + +-- !sql -- +12051 + +-- !sql -- +12051 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5369 + +-- !sql -- +5339 + +-- !sql -- +5592 + +-- !sql -- +5592 + +-- !sql -- 
+4768 + +-- !sql -- +4588 + +-- !sql -- +1035 + +-- !sql -- +1035 + +-- !sql -- +1035 + +-- !sql -- +1035 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +3407 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +4084 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1298 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +789 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- 
+1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +1319 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +175 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +70 + +-- !sql -- +181 + +-- !sql -- +181 + +-- !sql -- +175 + +-- !sql -- +1719 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2115 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 
+ +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2087 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +1952 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 
+ +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2142 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2094 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +1972 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 
+ +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +2151 + +-- !sql -- +1719 + +-- !sql -- +1719 + +-- !sql -- +1719 + +-- !sql -- +1719 + +-- !sql -- +1719 + +-- !sql -- +1719 + +-- !sql -- +1719 + +-- !sql -- +1719 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +524 + +-- !sql -- +1721 + +-- !sql -- +432 + +-- !sql -- +656 + +-- !sql -- +520 + +-- !sql -- +432 + +-- !sql -- +677 + +-- !sql -- +432 + +-- !sql -- +432 + +-- !sql -- +432 + +-- !sql -- +432 + +-- !sql -- +432 + +-- !sql -- +1305 + +-- !sql -- +1135 + +-- !sql -- +1167 + +-- !sql -- +2189 + +-- !sql -- +2189 + +-- !sql -- +2189 + +-- !sql -- +843 + +-- !sql -- +680 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +660 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +119 + +-- !sql -- +223 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +438 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- 
+515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +515 + +-- !sql -- +195 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +52 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +40 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +58 + +-- !sql -- +15 + +-- !sql -- +391 + +-- !sql -- +391 + +-- !sql -- +391 + +-- !sql -- +391 + +-- !sql -- +391 + +-- !sql -- +98 + +-- !sql -- +79 + +-- !sql -- +2 + +-- !sql -- +12 + +-- !sql -- +16 + +-- !sql -- +220 + +-- !sql -- +42 + diff --git a/regression-test/data/dynamic_table_p0/lowercase.json b/regression-test/data/dynamic_table_p0/lowercase.json new file mode 100644 index 0000000000..b5fed5f0a0 --- /dev/null +++ b/regression-test/data/dynamic_table_p0/lowercase.json @@ -0,0 +1,4 @@ +{"xxx": 1234} +{"xxx": "5679"} +{"XXX": "5679"} +{"XXX": "5679"} diff --git a/regression-test/data/dynamic_table_p0/sql/q05.out b/regression-test/data/dynamic_table_p0/sql/q05.out index e53460d268..583b8101a4 100644 --- a/regression-test/data/dynamic_table_p0/sql/q05.out +++ b/regression-test/data/dynamic_table_p0/sql/q05.out @@ -3,27 +3,27 @@ 0 -- !q05_2 -- -30417 +42890 -- 
!q05_3 -- +oliver006/elasticsearch-gmail 37 +prakhar1989/awesome-courses 28 getguesstimate/guesstimate-app 26 +cachethq/Cachet 17 ericelliott/essential-javascript-links 16 -FreeCodeCamp/FreeCodeCamp 14 -tj/frontend-boilerplate 14 -prakhar1989/awesome-courses 12 -- !q05_4 -- 3487211075 -- !q05_5 -- -94350289813772 +125406378528208 -- !q05_6 -- -518892546 +704293861 -- !q05_7 -- 0 -- !q05_8 -- -56947 +85450 diff --git a/regression-test/data/dynamic_table_p0/uppercase.json b/regression-test/data/dynamic_table_p0/uppercase.json new file mode 100644 index 0000000000..a1236a9ec9 --- /dev/null +++ b/regression-test/data/dynamic_table_p0/uppercase.json @@ -0,0 +1,4 @@ +{"A":1} +{"B":1} +{"C":1} +{"D":1} diff --git a/regression-test/suites/dynamic_table_p0/load.groovy b/regression-test/suites/dynamic_table_p0/load.groovy index 4837ca3cd1..6ac6836194 100644 --- a/regression-test/suites/dynamic_table_p0/load.groovy +++ b/regression-test/suites/dynamic_table_p0/load.groovy @@ -126,6 +126,28 @@ suite("regression_test_dynamic_table", "dynamic_table"){ sql """insert into test_ghdata_json_unique select * from test_ghdata_json""" sql """insert into test_btc_json_unique select * from test_btc_json""" + // abnormal cases + table_name = "abnormal_cases" + sql """ + DROP TABLE IF EXISTS ${table_name}; + """ + sql """ + CREATE TABLE IF NOT EXISTS ${table_name} ( + qid bigint, + XXXX bigint, + ... 
+ ) + DUPLICATE KEY(`qid`) + DISTRIBUTED BY HASH(`qid`) BUCKETS 5 + properties("replication_num" = "1"); + """ + load_json_data.call(table_name, 'true', 'json', 'true', "invalid_dimension.json", 'false') + load_json_data.call(table_name, 'true', 'json', 'true', "invalid_format.json", 'false') + load_json_data.call(table_name, 'true', 'json', 'true', "floating_point.json", 'true') + load_json_data.call(table_name, 'true', 'json', 'true', "floating_point2.json", 'true') + load_json_data.call(table_name, 'true', 'json', 'true', "floating_point3.json", 'true') + load_json_data.call(table_name, 'true', 'json', 'true', "uppercase.json", 'true') + // load more table_name = "gharchive"; sql "DROP TABLE IF EXISTS ${table_name}" @@ -145,6 +167,7 @@ suite("regression_test_dynamic_table", "dynamic_table"){ ); """ def paths = [ + """${getS3Url() + '/regression/gharchive/2015-01-01-22.json'}""", """${getS3Url() + '/regression/gharchive/2015-01-01-16.json'}""", """${getS3Url() + '/regression/gharchive/2016-01-01-16.json'}""", ] @@ -176,4 +199,58 @@ suite("regression_test_dynamic_table", "dynamic_table"){ } } } + + sql 'sync' + meta = sql_meta 'select * from gharchive limit 1' + def array_cols = [ + "payload.commits.url", + "payload.commits.sha", + "payload.commits.author.email", + "payload.commits.distinct", + "payload.commits.author.name", + "payload.commits.message", + "payload.issue.labels.name", + "payload.issue.labels.color", + "payload.issue.labels.url", + "payload.pages.title", + "payload.pages.html_url", + "payload.pages.sha", + "payload.pages.action", + "payload.pages.page_name", + "payload.release.assets.uploader.repos_url", + "payload.release.assets.uploader.id", + "payload.release.assets.uploader.organizations_url", + "payload.release.assets.uploader.received_events_url", + "payload.release.assets.uploader.site_admin", + "payload.release.assets.uploader.subscriptions_url", + "payload.release.assets.state", + "payload.release.assets.size", + 
"payload.release.assets.uploader.following_url", + "payload.release.assets.uploader.starred_url", + "payload.release.assets.download_count", + "payload.release.assets.created_at", + "payload.release.assets.updated_at", + "payload.release.assets.browser_download_url", + "payload.release.assets.url", + "payload.release.assets.uploader.gravatar_id", + "payload.release.assets.uploader.gists_url", + "payload.release.assets.uploader.url", + "payload.release.assets.content_type", + "payload.release.assets.name", + "payload.release.assets.uploader.login", + "payload.release.assets.uploader.avatar_url", + "payload.release.assets.uploader.html_url", + "payload.release.assets.uploader.followers_url", + "payload.release.assets.uploader.events_url", + "payload.release.assets.uploader.type", + "payload.release.assets.id", + "payload.release.assets.label" + ] + for (List<String> col_meta in meta) { + if (col_meta[0] in array_cols) { + qt_sql "select sum(array_size(`${col_meta[0]}`)) from gharchive" + } else { + qt_sql "select count(`${col_meta[0]}`) from gharchive" + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org