This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 40be6a0b05 [fix](hive) do not split compress data file and support lz4/snappy block codec (#23245) 40be6a0b05 is described below commit 40be6a0b059ede9bda5b29b66ea4ab955503aeaf Author: Mingyu Chen <morning...@163.com> AuthorDate: Sat Aug 26 12:59:05 2023 +0800 [fix](hive) do not split compress data file and support lz4/snappy block codec (#23245) 1. Do not split compressed data files. Some data files in Hive are compressed with gzip, deflate, etc. These kinds of files cannot be split. 2. Support the lz4 block codec. For the hive scan node, use the lz4 block codec instead of the lz4 frame codec. 3. Support the snappy block codec, for hadoop snappy. 4. Optimize the `count(*)` query on csv files. For a query like `select count(*) from tbl`, we only need to split the data into lines; there is no need to split each line into columns. Need to pick to branch-2.0 after this PR: #22304 --- be/src/exec/decompressor.cpp | 185 ++++++++++++++++++++- be/src/exec/decompressor.h | 39 ++++- be/src/service/internal_service.cpp | 2 + be/src/util/load_util.cpp | 9 +- be/src/vec/exec/format/csv/csv_reader.cpp | 76 +++++++-- be/src/vec/exec/format/csv/csv_reader.h | 6 + .../file_reader/new_plain_text_line_reader.cpp | 10 +- .../file_reader/new_plain_text_line_reader.h | 1 - be/src/vec/exec/scan/vfile_scanner.cpp | 36 ++-- .../java/org/apache/doris/common/util/Util.java | 6 + .../doris/datasource/hive/HiveMetaStoreCache.java | 2 + .../apache/doris/external/hive/util/HiveUtil.java | 3 +- .../doris/planner/external/FileScanNode.java | 17 +- .../doris/planner/external/HiveScanNode.java | 11 ++ .../apache/doris/planner/external/HiveSplit.java | 5 - gensrc/thrift/PlanNodes.thrift | 4 + .../external_table_p2/hive/test_compress_type.out | 47 ++++++ .../hive/test_compress_type.groovy | 61 +++++++ 18 files changed, 464 insertions(+), 56 deletions(-) diff --git a/be/src/exec/decompressor.cpp b/be/src/exec/decompressor.cpp index af69a896a2..964654132a 100644 ---
a/be/src/exec/decompressor.cpp +++ b/be/src/exec/decompressor.cpp @@ -42,6 +42,12 @@ Status Decompressor::create_decompressor(CompressType type, Decompressor** decom case CompressType::LZ4FRAME: *decompressor = new Lz4FrameDecompressor(); break; + case CompressType::LZ4BLOCK: + *decompressor = new Lz4BlockDecompressor(); + break; + case CompressType::SNAPPYBLOCK: + *decompressor = new SnappyBlockDecompressor(); + break; #ifdef DORIS_WITH_LZO case CompressType::LZOP: *decompressor = new LzopDecompressor(); @@ -59,6 +65,10 @@ Status Decompressor::create_decompressor(CompressType type, Decompressor** decom return st; } +uint32_t Decompressor::_read_int32(uint8_t* buf) { + return (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]; +} + std::string Decompressor::debug_info() { return "Decompressor"; } @@ -239,7 +249,7 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t size_t* decompressed_len, bool* stream_end, size_t* more_input_bytes, size_t* more_output_bytes) { uint8_t* src = input; - size_t src_size = input_len; + size_t remaining_input_size = input_len; size_t ret = 1; *input_bytes_read = 0; @@ -257,7 +267,7 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t } LZ4F_frameInfo_t info; - ret = LZ4F_getFrameInfo(_dctx, &info, (void*)src, &src_size); + ret = LZ4F_getFrameInfo(_dctx, &info, (void*)src, &remaining_input_size); if (LZ4F_isError(ret)) { return Status::InternalError("LZ4F_getFrameInfo error: {}", std::string(LZ4F_getErrorName(ret))); @@ -270,17 +280,17 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t std::string(LZ4F_getErrorName(ret))); } - *input_bytes_read = src_size; + *input_bytes_read = remaining_input_size; - src += src_size; - src_size = input_len - src_size; + src += remaining_input_size; + remaining_input_size = input_len - remaining_input_size; LOG(INFO) << "lz4 block size: " << _expect_dec_buf_size; } // decompress size_t output_len = 
output_max_len; - ret = LZ4F_decompress(_dctx, (void*)output, &output_len, (void*)src, &src_size, + ret = LZ4F_decompress(_dctx, (void*)output, &output_len, (void*)src, &remaining_input_size, /* LZ4F_decompressOptions_t */ nullptr); if (LZ4F_isError(ret)) { return Status::InternalError("Decompression error: {}", @@ -288,7 +298,7 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t } // update - *input_bytes_read += src_size; + *input_bytes_read += remaining_input_size; *decompressed_len = output_len; if (ret == 0) { *stream_end = true; @@ -324,4 +334,165 @@ size_t Lz4FrameDecompressor::get_block_size(const LZ4F_frameInfo_t* info) { } } +/// Lz4BlockDecompressor +Status Lz4BlockDecompressor::init() { + return Status::OK(); +} + +Status Lz4BlockDecompressor::decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, + uint8_t* output, size_t output_max_len, + size_t* decompressed_len, bool* stream_end, + size_t* more_input_bytes, size_t* more_output_bytes) { + uint8_t* src = input; + size_t remaining_input_size = input_len; + int64_t uncompressed_total_len = 0; + *input_bytes_read = 0; + + // The hadoop lz4 codec is as: + // <4 byte big endian uncompressed size> + // <4 byte big endian compressed size> + // <lz4 compressed block> + // .... 
+ // <4 byte big endian uncompressed size> + // <4 byte big endian compressed size> + // <lz4 compressed block> + // + // See: + // https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/Lz4Codec.cc + while (remaining_input_size > 0) { + // Read uncompressed size + uint32_t uncompressed_block_len = Decompressor::_read_int32(src); + int64_t remaining_output_size = output_max_len - uncompressed_total_len; + if (remaining_output_size < uncompressed_block_len) { + // Need more output buffer + *more_output_bytes = uncompressed_block_len - remaining_output_size; + break; + } + + // Read compressed size + size_t tmp_src_size = remaining_input_size - sizeof(uint32_t); + size_t compressed_len = Decompressor::_read_int32(src + sizeof(uint32_t)); + if (compressed_len == 0 || compressed_len > tmp_src_size) { + // Need more input data + *more_input_bytes = compressed_len - tmp_src_size; + break; + } + + src += 2 * sizeof(uint32_t); + remaining_input_size -= 2 * sizeof(uint32_t); + + // Decompress + int uncompressed_len = LZ4_decompress_safe(reinterpret_cast<const char*>(src), + reinterpret_cast<char*>(output), compressed_len, + remaining_output_size); + if (uncompressed_len < 0 || uncompressed_len != uncompressed_block_len) { + return Status::InternalError( + "lz4 block decompress failed. 
uncompressed_len: {}, expected: {}", + uncompressed_len, uncompressed_block_len); + } + + output += uncompressed_len; + src += compressed_len; + remaining_input_size -= compressed_len; + uncompressed_total_len += uncompressed_len; + } + + *input_bytes_read += (input_len - remaining_input_size); + *decompressed_len = uncompressed_total_len; + // If no more input and output need, means this is the end of a compressed block + *stream_end = (*more_input_bytes == 0 && *more_output_bytes == 0); + + return Status::OK(); +} + +std::string Lz4BlockDecompressor::debug_info() { + std::stringstream ss; + ss << "Lz4BlockDecompressor."; + return ss.str(); +} + +/// SnappyBlockDecompressor +Status SnappyBlockDecompressor::init() { + return Status::OK(); +} + +Status SnappyBlockDecompressor::decompress(uint8_t* input, size_t input_len, + size_t* input_bytes_read, uint8_t* output, + size_t output_max_len, size_t* decompressed_len, + bool* stream_end, size_t* more_input_bytes, + size_t* more_output_bytes) { + uint8_t* src = input; + size_t remaining_input_size = input_len; + int64_t uncompressed_total_len = 0; + *input_bytes_read = 0; + + // The hadoop snappy codec is as: + // <4 byte big endian uncompressed size> + // <4 byte big endian compressed size> + // <snappy compressed block> + // .... 
+ // <4 byte big endian uncompressed size> + // <4 byte big endian compressed size> + // <snappy compressed block> + // + // See: + // https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/SnappyCodec.cc + while (remaining_input_size > 0) { + // Read uncompressed size + uint32_t uncompressed_block_len = Decompressor::_read_int32(src); + int64_t remaining_output_size = output_max_len - uncompressed_total_len; + if (remaining_output_size < uncompressed_block_len) { + // Need more output buffer + *more_output_bytes = uncompressed_block_len - remaining_output_size; + break; + } + + // Read compressed size + size_t tmp_src_size = remaining_input_size - sizeof(uint32_t); + size_t compressed_len = _read_int32(src + sizeof(uint32_t)); + if (compressed_len == 0 || compressed_len > tmp_src_size) { + // Need more input data + *more_input_bytes = compressed_len - tmp_src_size; + break; + } + + src += 2 * sizeof(uint32_t); + remaining_input_size -= 2 * sizeof(uint32_t); + + // ATTN: the uncompressed len from GetUncompressedLength() is same as + // uncompressed_block_len, so I think it is unnecessary to get it again. + // Get uncompressed len from snappy + // size_t uncompressed_len; + // if (!snappy::GetUncompressedLength(reinterpret_cast<const char*>(src), + // compressed_len, &uncompressed_len)) { + // return Status::InternalError("snappy block decompress failed to get uncompressed len"); + // } + + // Decompress + if (!snappy::RawUncompress(reinterpret_cast<const char*>(src), compressed_len, + reinterpret_cast<char*>(output))) { + return Status::InternalError("snappy block decompress failed. 
uncompressed_len: {}", + uncompressed_block_len); + } + + output += uncompressed_block_len; + src += compressed_len; + remaining_input_size -= compressed_len; + uncompressed_total_len += uncompressed_block_len; + } + + *input_bytes_read += (input_len - remaining_input_size); + *decompressed_len = uncompressed_total_len; + // If no more input and output need, means this is the end of a compressed block + *stream_end = (*more_input_bytes == 0 && *more_output_bytes == 0); + + return Status::OK(); +} + +std::string SnappyBlockDecompressor::debug_info() { + std::stringstream ss; + ss << "SnappyBlockDecompressor."; + return ss.str(); +} + } // namespace doris diff --git a/be/src/exec/decompressor.h b/be/src/exec/decompressor.h index af37335f1f..2b07e71139 100644 --- a/be/src/exec/decompressor.h +++ b/be/src/exec/decompressor.h @@ -18,7 +18,10 @@ #pragma once #include <bzlib.h> +#include <lz4/lz4.h> #include <lz4/lz4frame.h> +#include <lz4/lz4hc.h> +#include <snappy.h> #include <stddef.h> #include <stdint.h> #include <zlib.h> @@ -34,7 +37,7 @@ namespace doris { -enum CompressType { UNCOMPRESSED, GZIP, DEFLATE, BZIP2, LZ4FRAME, LZOP }; +enum CompressType { UNCOMPRESSED, GZIP, DEFLATE, BZIP2, LZ4FRAME, LZOP, LZ4BLOCK, SNAPPYBLOCK }; class Decompressor { public: @@ -68,6 +71,8 @@ public: protected: virtual Status init() = 0; + static uint32_t _read_int32(uint8_t* buf); + Decompressor(CompressType ctype) : _ctype(ctype) {} CompressType _ctype; @@ -140,6 +145,38 @@ private: const static unsigned DORIS_LZ4F_VERSION; }; +class Lz4BlockDecompressor : public Decompressor { +public: + ~Lz4BlockDecompressor() override {} + + Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, uint8_t* output, + size_t output_max_len, size_t* decompressed_len, bool* stream_end, + size_t* more_input_bytes, size_t* more_output_bytes) override; + + std::string debug_info() override; + +private: + friend class Decompressor; + Lz4BlockDecompressor() : 
Decompressor(CompressType::LZ4FRAME) {} + Status init() override; +}; + +class SnappyBlockDecompressor : public Decompressor { +public: + ~SnappyBlockDecompressor() override {} + + Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, uint8_t* output, + size_t output_max_len, size_t* decompressed_len, bool* stream_end, + size_t* more_input_bytes, size_t* more_output_bytes) override; + + std::string debug_info() override; + +private: + friend class Decompressor; + SnappyBlockDecompressor() : Decompressor(CompressType::SNAPPYBLOCK) {} + Status init() override; +}; + #ifdef DORIS_WITH_LZO class LzopDecompressor : public Decompressor { public: diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 3838668577..b458af6336 100644 --- a/be/src/service/internal_service.cpp +++ b/be/src/service/internal_service.cpp @@ -632,6 +632,8 @@ void PInternalServiceImpl::fetch_table_schema(google::protobuf::RpcController* c case TFileFormatType::FORMAT_CSV_GZ: case TFileFormatType::FORMAT_CSV_BZ2: case TFileFormatType::FORMAT_CSV_LZ4FRAME: + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: + case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: case TFileFormatType::FORMAT_CSV_LZOP: case TFileFormatType::FORMAT_CSV_DEFLATE: { // file_slots is no use diff --git a/be/src/util/load_util.cpp b/be/src/util/load_util.cpp index 8736561db4..1277132378 100644 --- a/be/src/util/load_util.cpp +++ b/be/src/util/load_util.cpp @@ -46,9 +46,15 @@ void LoadUtil::parse_format(const std::string& format_str, const std::string& co } else if (iequal(compress_type_str, "LZ4")) { *format_type = TFileFormatType::FORMAT_CSV_LZ4FRAME; *compress_type = TFileCompressType::LZ4FRAME; + } else if (iequal(compress_type_str, "LZ4_BLOCK")) { + *format_type = TFileFormatType::FORMAT_CSV_LZ4BLOCK; + *compress_type = TFileCompressType::LZ4BLOCK; } else if (iequal(compress_type_str, "LZOP")) { *format_type = TFileFormatType::FORMAT_CSV_LZOP; *compress_type = 
TFileCompressType::LZO; + } else if (iequal(compress_type_str, "SNAPPY_BLOCK")) { + *format_type = TFileFormatType::FORMAT_CSV_SNAPPYBLOCK; + *compress_type = TFileCompressType::SNAPPYBLOCK; } else if (iequal(compress_type_str, "DEFLATE")) { *format_type = TFileFormatType::FORMAT_CSV_DEFLATE; *compress_type = TFileCompressType::DEFLATE; @@ -72,6 +78,7 @@ bool LoadUtil::is_format_support_streaming(TFileFormatType::type format) { case TFileFormatType::FORMAT_CSV_DEFLATE: case TFileFormatType::FORMAT_CSV_GZ: case TFileFormatType::FORMAT_CSV_LZ4FRAME: + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: case TFileFormatType::FORMAT_CSV_LZO: case TFileFormatType::FORMAT_CSV_LZOP: case TFileFormatType::FORMAT_JSON: @@ -81,4 +88,4 @@ bool LoadUtil::is_format_support_streaming(TFileFormatType::type format) { } return false; } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/vec/exec/format/csv/csv_reader.cpp b/be/src/vec/exec/format/csv/csv_reader.cpp index 28d6f484f7..bc41b66298 100644 --- a/be/src/vec/exec/format/csv/csv_reader.cpp +++ b/be/src/vec/exec/format/csv/csv_reader.cpp @@ -343,8 +343,12 @@ Status CsvReader::init_reader(bool is_load) { [[fallthrough]]; case TFileFormatType::FORMAT_CSV_LZ4FRAME: [[fallthrough]]; + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: + [[fallthrough]]; case TFileFormatType::FORMAT_CSV_LZOP: [[fallthrough]]; + case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: + [[fallthrough]]; case TFileFormatType::FORMAT_CSV_DEFLATE: _line_reader = NewPlainTextLineReader::create_unique(_profile, _file_reader, _decompressor.get(), @@ -400,21 +404,51 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) { const int batch_size = std::max(_state->batch_size(), (int)_MIN_BATCH_SIZE); size_t rows = 0; - auto columns = block->mutate_columns(); - while (rows < batch_size && !_line_reader_eof) { - const uint8_t* ptr = nullptr; - size_t size = 0; - RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, 
&_line_reader_eof, _io_ctx)); - if (_skip_lines > 0) { - _skip_lines--; - continue; + + bool success = false; + if (_push_down_agg_type == TPushAggOp::type::COUNT) { + while (rows < batch_size && !_line_reader_eof) { + const uint8_t* ptr = nullptr; + size_t size = 0; + RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); + if (_skip_lines > 0) { + _skip_lines--; + continue; + } + if (size == 0) { + // Read empty row, just continue + continue; + } + + RETURN_IF_ERROR(_validate_line(Slice(ptr, size), &success)); + ++rows; } - if (size == 0) { - // Read empty row, just continue - continue; + + for (auto& col : block->mutate_columns()) { + col->resize(rows); } - RETURN_IF_ERROR(_fill_dest_columns(Slice(ptr, size), block, columns, &rows)); + } else { + auto columns = block->mutate_columns(); + while (rows < batch_size && !_line_reader_eof) { + const uint8_t* ptr = nullptr; + size_t size = 0; + RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx)); + if (_skip_lines > 0) { + _skip_lines--; + continue; + } + if (size == 0) { + // Read empty row, just continue + continue; + } + + RETURN_IF_ERROR(_validate_line(Slice(ptr, size), &success)); + if (!success) { + continue; + } + RETURN_IF_ERROR(_fill_dest_columns(Slice(ptr, size), block, columns, &rows)); + } } *eof = (rows == 0); @@ -476,9 +510,15 @@ Status CsvReader::_create_decompressor() { case TFileCompressType::LZ4FRAME: compress_type = CompressType::LZ4FRAME; break; + case TFileCompressType::LZ4BLOCK: + compress_type = CompressType::LZ4BLOCK; + break; case TFileCompressType::DEFLATE: compress_type = CompressType::DEFLATE; break; + case TFileCompressType::SNAPPYBLOCK: + compress_type = CompressType::SNAPPYBLOCK; + break; default: return Status::InternalError("unknown compress type: {}", _file_compress_type); } @@ -498,12 +538,18 @@ Status CsvReader::_create_decompressor() { case TFileFormatType::FORMAT_CSV_LZ4FRAME: compress_type = CompressType::LZ4FRAME; break; + 
case TFileFormatType::FORMAT_CSV_LZ4BLOCK: + compress_type = CompressType::LZ4BLOCK; + break; case TFileFormatType::FORMAT_CSV_LZOP: compress_type = CompressType::LZOP; break; case TFileFormatType::FORMAT_CSV_DEFLATE: compress_type = CompressType::DEFLATE; break; + case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: + compress_type = CompressType::SNAPPYBLOCK; + break; default: return Status::InternalError("unknown format type: {}", _file_format_type); } @@ -557,7 +603,7 @@ Status CsvReader::_fill_dest_columns(const Slice& line, Block* block, return Status::OK(); } -Status CsvReader::_line_split_to_values(const Slice& line, bool* success) { +Status CsvReader::_validate_line(const Slice& line, bool* success) { if (!_is_proto_format && !validate_utf8(line.data, line.size)) { if (!_is_load) { return Status::InternalError("Only support csv data in utf8 codec"); @@ -575,7 +621,11 @@ Status CsvReader::_line_split_to_values(const Slice& line, bool* success) { return Status::OK(); } } + *success = true; + return Status::OK(); +} +Status CsvReader::_line_split_to_values(const Slice& line, bool* success) { _split_line(line); if (_is_load) { diff --git a/be/src/vec/exec/format/csv/csv_reader.h b/be/src/vec/exec/format/csv/csv_reader.h index 5721bbd929..2659703f8d 100644 --- a/be/src/vec/exec/format/csv/csv_reader.h +++ b/be/src/vec/exec/format/csv/csv_reader.h @@ -221,6 +221,12 @@ private: // TODO(ftw): parse type Status _parse_col_types(size_t col_nums, std::vector<TypeDescriptor>* col_types); + // check the utf8 encoding of a line. + // return error status to stop processing. + // If return Status::OK but "success" is false, which means this is load request + // and the line is skipped as unqualified row, and the process should continue. 
+ Status _validate_line(const Slice& line, bool* success); + RuntimeState* _state; RuntimeProfile* _profile; ScannerCounter* _counter; diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp index b59bbef1f1..c27aba354f 100644 --- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp +++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.cpp @@ -201,7 +201,6 @@ NewPlainTextLineReader::NewPlainTextLineReader(RuntimeProfile* profile, _output_buf_limit(0), _file_eof(false), _eof(false), - _stream_end(true), _more_input_bytes(0), _more_output_bytes(0), _current_offset(current_offset), @@ -324,6 +323,7 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool _line_reader_ctx->refresh(); int found_line_delimiter = 0; size_t offset = 0; + bool stream_end = true; while (!done()) { // find line delimiter in current decompressed data uint8_t* cur_ptr = _output_buf + _output_buf_pos; @@ -379,7 +379,7 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool COUNTER_UPDATE(_bytes_read_counter, read_len); } if (_file_eof || read_len == 0) { - if (!_stream_end) { + if (!stream_end) { return Status::InternalError( "Compressed file has been truncated, which is not allowed"); } else { @@ -392,7 +392,7 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool if (_decompressor == nullptr) { _output_buf_limit += read_len; - _stream_end = true; + stream_end = true; } else { // only update input limit. 
// input pos is set at MARK step @@ -418,10 +418,10 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool _input_buf_limit - _input_buf_pos, /* input_len */ &input_read_bytes, _output_buf + _output_buf_limit, /* output */ _output_buf_size - _output_buf_limit, /* output_max_len */ - &decompressed_len, &_stream_end, &_more_input_bytes, &_more_output_bytes)); + &decompressed_len, &stream_end, &_more_input_bytes, &_more_output_bytes)); // LOG(INFO) << "after decompress:" - // << " stream_end: " << _stream_end + // << " stream_end: " << stream_end // << " input_read_bytes: " << input_read_bytes // << " decompressed_len: " << decompressed_len // << " more_input_bytes: " << _more_input_bytes diff --git a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h index 7326812b92..9947259300 100644 --- a/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h +++ b/be/src/vec/exec/format/file_reader/new_plain_text_line_reader.h @@ -235,7 +235,6 @@ private: bool _file_eof; bool _eof; - bool _stream_end; size_t _more_input_bytes; size_t _more_output_bytes; diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 055ec224a3..505b6807b4 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -261,22 +261,26 @@ Status VFileScanner::_get_block_impl(RuntimeState* state, Block* block, bool* eo // use read_rows instead of _src_block_ptr->rows(), because the first column of _src_block_ptr // may not be filled after calling `get_next_block()`, so _src_block_ptr->rows() may return wrong result. if (read_rows > 0) { - // Convert the src block columns type to string in-place. - RETURN_IF_ERROR(_cast_to_input_block(block)); - // FileReader can fill partition and missing columns itself - if (!_cur_reader->fill_all_columns()) { - // Fill rows in src block with partition columns from path. (e.g. 
Hive partition columns) - RETURN_IF_ERROR(_fill_columns_from_path(read_rows)); - // Fill columns not exist in file with null or default value - RETURN_IF_ERROR(_fill_missing_columns(read_rows)); + // If the push_down_agg_type is COUNT, no need to do the rest, + // because we only save a number in block. + if (_parent->get_push_down_agg_type() != TPushAggOp::type::COUNT) { + // Convert the src block columns type to string in-place. + RETURN_IF_ERROR(_cast_to_input_block(block)); + // FileReader can fill partition and missing columns itself + if (!_cur_reader->fill_all_columns()) { + // Fill rows in src block with partition columns from path. (e.g. Hive partition columns) + RETURN_IF_ERROR(_fill_columns_from_path(read_rows)); + // Fill columns not exist in file with null or default value + RETURN_IF_ERROR(_fill_missing_columns(read_rows)); + } + // Apply _pre_conjunct_ctxs to filter src block. + RETURN_IF_ERROR(_pre_filter_src_block()); + // Convert src block to output block (dest block), string to dest data type and apply filters. + RETURN_IF_ERROR(_convert_to_output_block(block)); + // Truncate char columns or varchar columns if size is smaller than file columns + // or not found in the file column schema. + RETURN_IF_ERROR(_truncate_char_or_varchar_columns(block)); } - // Apply _pre_conjunct_ctxs to filter src block. - RETURN_IF_ERROR(_pre_filter_src_block()); - // Convert src block to output block (dest block), string to dest data type and apply filters. - RETURN_IF_ERROR(_convert_to_output_block(block)); - // Truncate char columns or varchar columns if size is smaller than file columns - // or not found in the file column schema. 
- RETURN_IF_ERROR(_truncate_char_or_varchar_columns(block)); break; } } while (true); @@ -755,8 +759,10 @@ Status VFileScanner::_get_next_reader() { case TFileFormatType::FORMAT_CSV_GZ: case TFileFormatType::FORMAT_CSV_BZ2: case TFileFormatType::FORMAT_CSV_LZ4FRAME: + case TFileFormatType::FORMAT_CSV_LZ4BLOCK: case TFileFormatType::FORMAT_CSV_LZOP: case TFileFormatType::FORMAT_CSV_DEFLATE: + case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK: case TFileFormatType::FORMAT_PROTO: { _cur_reader = CsvReader::create_unique(_state, _profile, &_counter, *_params, range, _file_slot_descs, _io_ctx.get()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/Util.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/Util.java index 8d9c2c6b0c..0dff4b6caa 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/Util.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/Util.java @@ -550,6 +550,8 @@ public class Util { return TFileFormatType.FORMAT_CSV_LZO; } else if (lowerCasePath.endsWith(".deflate")) { return TFileFormatType.FORMAT_CSV_DEFLATE; + } else if (lowerCasePath.endsWith(".snappy")) { + return TFileFormatType.FORMAT_CSV_SNAPPYBLOCK; } else { return TFileFormatType.FORMAT_CSV_PLAIN; } @@ -575,6 +577,8 @@ public class Util { return TFileCompressType.LZO; } else if (lowerCasePath.endsWith(".deflate")) { return TFileCompressType.DEFLATE; + } else if (lowerCasePath.endsWith(".snappy")) { + return TFileCompressType.SNAPPYBLOCK; } else { return TFileCompressType.PLAIN; } @@ -599,6 +603,8 @@ public class Util { || fileFormatType == TFileFormatType.FORMAT_CSV_DEFLATE || fileFormatType == TFileFormatType.FORMAT_CSV_GZ || fileFormatType == TFileFormatType.FORMAT_CSV_LZ4FRAME + || fileFormatType == TFileFormatType.FORMAT_CSV_LZ4BLOCK + || fileFormatType == TFileFormatType.FORMAT_CSV_SNAPPYBLOCK || fileFormatType == TFileFormatType.FORMAT_CSV_LZO || fileFormatType == TFileFormatType.FORMAT_CSV_LZOP || fileFormatType == 
TFileFormatType.FORMAT_CSV_PLAIN; diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java index 1ad1b12047..e54466ad86 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java @@ -1000,6 +1000,7 @@ public class HiveMetaStoreCache { // File Cache for self splitter. private final List<HiveFileStatus> files = Lists.newArrayList(); // File split cache for old splitter. This is a temp variable. + @Deprecated private final List<FileSplit> splits = Lists.newArrayList(); private boolean isSplittable; // The values of partitions. @@ -1021,6 +1022,7 @@ public class HiveMetaStoreCache { } } + @Deprecated public void addSplit(FileSplit split) { if (isFileVisible(split.getPath())) { splits.add(split); diff --git a/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java b/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java index 704d0fadf8..e1baea3652 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/external/hive/util/HiveUtil.java @@ -190,7 +190,8 @@ public final class HiveUtil { return true; } - // use reflection to get isSplittable method on FileInputFormat + // use reflection to get isSplitable method on FileInputFormat + // ATTN: the method name is actually "isSplitable", but the right spell is "isSplittable" Method method = null; for (Class<?> clazz = inputFormat.getClass(); clazz != null; clazz = clazz.getSuperclass()) { try { diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileScanNode.java index 662aa939ee..8e2c8ed3a4 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileScanNode.java @@ -25,6 +25,7 @@ import org.apache.doris.analysis.TupleDescriptor; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.TableIf; import org.apache.doris.common.UserException; +import org.apache.doris.common.util.Util; import org.apache.doris.planner.PlanNodeId; import org.apache.doris.planner.external.FileSplit.FileSplitCreator; import org.apache.doris.qe.ConnectContext; @@ -32,6 +33,7 @@ import org.apache.doris.spi.Split; import org.apache.doris.statistics.StatisticalType; import org.apache.doris.thrift.TExplainLevel; import org.apache.doris.thrift.TExpr; +import org.apache.doris.thrift.TFileCompressType; import org.apache.doris.thrift.TFileRangeDesc; import org.apache.doris.thrift.TFileScanNode; import org.apache.doris.thrift.TFileScanRangeParams; @@ -221,19 +223,20 @@ public abstract class FileScanNode extends ExternalScanNode { if (blockLocations == null) { blockLocations = new BlockLocation[0]; } - long splitSize = ConnectContext.get().getSessionVariable().getFileSplitSize(); - if (splitSize <= 0) { - splitSize = blockSize; - } - // Min split size is DEFAULT_SPLIT_SIZE(128MB). - splitSize = Math.max(splitSize, DEFAULT_SPLIT_SIZE); List<Split> result = Lists.newArrayList(); - if (!splittable) { + TFileCompressType compressType = Util.inferFileCompressTypeByPath(path.toString()); + if (!splittable || compressType != TFileCompressType.PLAIN) { LOG.debug("Path {} is not splittable.", path); String[] hosts = blockLocations.length == 0 ? null : blockLocations[0].getHosts(); result.add(splitCreator.create(path, 0, length, length, modificationTime, hosts, partitionValues)); return result; } + long splitSize = ConnectContext.get().getSessionVariable().getFileSplitSize(); + if (splitSize <= 0) { + splitSize = blockSize; + } + // Min split size is DEFAULT_SPLIT_SIZE(128MB). 
+ splitSize = Math.max(splitSize, DEFAULT_SPLIT_SIZE); long bytesRemaining; for (bytesRemaining = length; (double) bytesRemaining / (double) splitSize > 1.1D; bytesRemaining -= splitSize) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java index 8e71e02fac..1209be1cb9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java @@ -51,6 +51,7 @@ import org.apache.doris.qe.ConnectContext; import org.apache.doris.spi.Split; import org.apache.doris.statistics.StatisticalType; import org.apache.doris.thrift.TFileAttributes; +import org.apache.doris.thrift.TFileCompressType; import org.apache.doris.thrift.TFileFormatType; import org.apache.doris.thrift.TFileTextScanRangeParams; import org.apache.doris.thrift.TFileType; @@ -391,4 +392,14 @@ public class HiveScanNode extends FileQueryScanNode { public boolean pushDownAggNoGroupingCheckCol(FunctionCallExpr aggExpr, Column col) { return !col.isAllowNull(); } + + @Override + protected TFileCompressType getFileCompressType(FileSplit fileSplit) throws UserException { + TFileCompressType compressType = super.getFileCompressType(fileSplit); + // hadoop use lz4 blocked codec + if (compressType == TFileCompressType.LZ4FRAME) { + compressType = TFileCompressType.LZ4BLOCK; + } + return compressType; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveSplit.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveSplit.java index 0f230c85f4..0bc8442760 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveSplit.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveSplit.java @@ -32,11 +32,6 @@ public class HiveSplit extends FileSplit { this.acidInfo = acidInfo; } - public HiveSplit(Path path, long start, long length, long 
fileLength, String[] hosts, AcidInfo acidInfo) { - super(path, start, length, fileLength, hosts, null); - this.acidInfo = acidInfo; - } - @Override public Object getInfo() { return acidInfo; diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index e3c9bd9aa4..e75754c322 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -116,6 +116,8 @@ enum TFileFormatType { FORMAT_PROTO, FORMAT_JNI, FORMAT_AVRO, + FORMAT_CSV_LZ4BLOCK, + FORMAT_CSV_SNAPPYBLOCK, } // In previous versions, the data compression format and file format were stored together, as TFileFormatType, @@ -132,6 +134,8 @@ enum TFileCompressType { LZ4FRAME, DEFLATE, LZOP, + LZ4BLOCK, + SNAPPYBLOCK } struct THdfsConf { diff --git a/regression-test/data/external_table_p2/hive/test_compress_type.out b/regression-test/data/external_table_p2/hive/test_compress_type.out new file mode 100644 index 0000000000..a95bf1f0dd --- /dev/null +++ b/regression-test/data/external_table_p2/hive/test_compress_type.out @@ -0,0 +1,47 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !q21 -- +600005 + +-- !q22 -- +1510010 + +-- !q23 -- +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] 
+4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] 
+4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] 
+4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] 
+4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] + +-- !q31 -- +600005 + +-- !q32 -- +1510010 + +-- !q33 -- +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] 
+4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] 
+4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] 
+4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] +4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] 
+4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,2 [...] + diff --git a/regression-test/suites/external_table_p2/hive/test_compress_type.groovy b/regression-test/suites/external_table_p2/hive/test_compress_type.groovy new file mode 100644 index 0000000000..d02ff3fbd0 --- /dev/null +++ b/regression-test/suites/external_table_p2/hive/test_compress_type.groovy @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_compress_type", "p2,external,hive,external_remote,external_remote_hive") { + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort") + String catalog_name = "test_compress_type" + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' + ); + """ + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + + sql """ use multi_catalog """ + + // table test_compress_partitioned has 6 partitions with different compressed file: plain, gzip, bzip2, deflate + sql """set file_split_size=0""" + explain { + sql("select count(*) from test_compress_partitioned") + contains "inputSplitNum=16, totalFileSize=734675596, scanRanges=16" + contains "partition=8/8" + } + qt_q21 """select count(*) from test_compress_partitioned where dt="gzip" or dt="mix"""" + qt_q22 """select count(*) from test_compress_partitioned""" + order_qt_q23 """select * from test_compress_partitioned where watchid=4611870011201662970""" + + sql """set file_split_size=8388608""" + explain { + sql("select count(*) from test_compress_partitioned") + contains "inputSplitNum=82, totalFileSize=734675596, scanRanges=82" + contains "partition=8/8" + } + + qt_q31 """select count(*) from test_compress_partitioned where dt="gzip" or dt="mix"""" + qt_q32 """select count(*) from test_compress_partitioned""" + order_qt_q33 """select * from test_compress_partitioned where watchid=4611870011201662970""" + sql """set file_split_size=0""" + } +} --------------------------------------------------------------------- To 
unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org