This is an automated email from the ASF dual-hosted git repository.

zouxinyi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new db3a3ec49d2 [env](compile)open compile check in columns class (#44425)
db3a3ec49d2 is described below

commit db3a3ec49d20019b1cb871c11aa10d13b974ec22
Author: Xinyi Zou <zouxi...@selectdb.com>
AuthorDate: Wed Nov 27 22:13:44 2024 +0800

    [env](compile)open compile check in columns class (#44425)
    
    ### What problem does this PR solve?
    
    Problem Summary:
    
    Enable the compile check in the columns classes.
---
 be/src/util/hash_util.hpp              | 16 ++++-----
 be/src/vec/columns/column_const.cpp    |  5 ++-
 be/src/vec/columns/column_const.h      |  5 ++-
 be/src/vec/columns/column_decimal.h    |  4 ++-
 be/src/vec/columns/column_dictionary.h | 22 +++++++-----
 be/src/vec/columns/column_nullable.cpp |  5 +--
 be/src/vec/columns/column_nullable.h   |  7 ++--
 be/src/vec/columns/column_object.cpp   | 19 +++++++----
 be/src/vec/columns/column_object.h     |  6 ++--
 be/src/vec/columns/column_string.cpp   | 61 +++++++++++++++++++---------------
 be/src/vec/columns/column_string.h     | 29 +++++++++++-----
 be/src/vec/columns/column_vector.cpp   |  3 +-
 be/src/vec/columns/column_vector.h     |  9 ++---
 be/src/vec/core/field.h                | 11 ++++--
 14 files changed, 125 insertions(+), 77 deletions(-)

diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp
index e9ac72c5ccd..d444daa8c68 100644
--- a/be/src/util/hash_util.hpp
+++ b/be/src/util/hash_util.hpp
@@ -46,7 +46,7 @@ public:
         return std::hash<T>()(value);
     }
 
-    static uint32_t zlib_crc_hash(const void* data, int32_t bytes, uint32_t 
hash) {
+    static uint32_t zlib_crc_hash(const void* data, uint32_t bytes, uint32_t 
hash) {
         return crc32(hash, (const unsigned char*)data, bytes);
     }
 
@@ -66,7 +66,7 @@ public:
     // NOTE: Any changes made to this function need to be reflected in 
Codegen::GetHashFn.
     // TODO: crc32 hashes with different seeds do not result in different hash 
functions.
     // The resulting hashes are correlated.
-    static uint32_t crc_hash(const void* data, int32_t bytes, uint32_t hash) {
+    static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
         if (!CpuInfo::is_supported(CpuInfo::SSE4_2)) {
             return zlib_crc_hash(data, bytes, hash);
         }
@@ -93,7 +93,7 @@ public:
         return hash;
     }
 
-    static uint64_t crc_hash64(const void* data, int32_t bytes, uint64_t hash) 
{
+    static uint64_t crc_hash64(const void* data, uint32_t bytes, uint64_t 
hash) {
         uint32_t words = bytes / sizeof(uint32_t);
         bytes = bytes % sizeof(uint32_t);
 
@@ -125,7 +125,7 @@ public:
         return converter.u64;
     }
 #else
-    static uint32_t crc_hash(const void* data, int32_t bytes, uint32_t hash) {
+    static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) {
         return zlib_crc_hash(data, bytes, hash);
     }
 #endif
@@ -202,7 +202,7 @@ public:
     // For example, if the data is <1000, 2000, 3000, 4000, ..> and then the 
mod of 1000
     // is taken on the hash, all values will collide to the same bucket.
     // For string values, Fnv is slightly faster than boost.
-    static uint32_t fnv_hash(const void* data, int32_t bytes, uint32_t hash) {
+    static uint32_t fnv_hash(const void* data, uint32_t bytes, uint32_t hash) {
         const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
 
         while (bytes--) {
@@ -213,7 +213,7 @@ public:
         return hash;
     }
 
-    static uint64_t fnv_hash64(const void* data, int32_t bytes, uint64_t hash) 
{
+    static uint64_t fnv_hash64(const void* data, uint32_t bytes, uint64_t 
hash) {
         const uint8_t* ptr = reinterpret_cast<const uint8_t*>(data);
 
         while (bytes--) {
@@ -291,7 +291,7 @@ public:
     // depending on hardware capabilities.
     // Seed values for different steps of the query execution should use 
different seeds
     // to prevent accidental key collisions. (See IMPALA-219 for more details).
-    static uint32_t hash(const void* data, int32_t bytes, uint32_t seed) {
+    static uint32_t hash(const void* data, uint32_t bytes, uint32_t seed) {
 #ifdef __SSE4_2__
 
         if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) {
@@ -305,7 +305,7 @@ public:
 #endif
     }
 
-    static uint64_t hash64(const void* data, int32_t bytes, uint64_t seed) {
+    static uint64_t hash64(const void* data, uint32_t bytes, uint64_t seed) {
 #ifdef _SSE4_2_
         if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) {
             return crc_hash64(data, bytes, seed);
diff --git a/be/src/vec/columns/column_const.cpp 
b/be/src/vec/columns/column_const.cpp
index f751f1d8d3e..fd05127f6d7 100644
--- a/be/src/vec/columns/column_const.cpp
+++ b/be/src/vec/columns/column_const.cpp
@@ -35,6 +35,7 @@
 #include "vec/core/column_with_type_and_name.h"
 
 namespace doris::vectorized {
+#include "common/compile_check_begin.h"
 
 ColumnConst::ColumnConst(const ColumnPtr& data_, size_t s_) : data(data_), 
s(s_) {
     /// Squash Const of Const.
@@ -66,7 +67,9 @@ ColumnConst::ColumnConst(const ColumnPtr& data_, size_t s_, 
bool create_with_emp
 }
 
 ColumnPtr ColumnConst::convert_to_full_column() const {
-    return data->replicate(Offsets(1, s));
+    // Assuming the number of replicate rows will not exceed Offset(UInt32),
+    // currently Column::replicate only supports Uint32 Offsets
+    return data->replicate(Offsets(1, cast_set<Offset>(s)));
 }
 
 ColumnPtr ColumnConst::remove_low_cardinality() const {
diff --git a/be/src/vec/columns/column_const.h 
b/be/src/vec/columns/column_const.h
index ee3860f0635..0fa22ca23bf 100644
--- a/be/src/vec/columns/column_const.h
+++ b/be/src/vec/columns/column_const.h
@@ -48,6 +48,7 @@
 class SipHash;
 
 namespace doris::vectorized {
+#include "common/compile_check_begin.h"
 
 class Arena;
 class Block;
@@ -267,7 +268,8 @@ public:
 
     template <typename T>
     T get_value() const {
-        return get_field().safe_get<NearestFieldType<T>>();
+        // Here the cast is correct, relevant code is rather tricky.
+        return static_cast<T>(get_field().safe_get<NearestFieldType<T>>());
     }
 
     void replace_column_data(const IColumn& rhs, size_t row, size_t self_row = 
0) override {
@@ -276,3 +278,4 @@ public:
     }
 };
 } // namespace doris::vectorized
+#include "common/compile_check_end.h"
diff --git a/be/src/vec/columns/column_decimal.h 
b/be/src/vec/columns/column_decimal.h
index 4c2f69d5ef3..946b268436e 100644
--- a/be/src/vec/columns/column_decimal.h
+++ b/be/src/vec/columns/column_decimal.h
@@ -53,6 +53,7 @@ class ColumnSorter;
 } // namespace doris
 
 namespace doris::vectorized {
+#include "common/compile_check_begin.h"
 
 /// PaddedPODArray extended by Decimal scale
 template <typename T>
@@ -261,7 +262,7 @@ protected:
         for (U i = 0; i < s; ++i) res[i] = i;
 
         auto sort_end = res.end();
-        if (limit && limit < s / 8.0) {
+        if (limit && limit < static_cast<double>(s) / 8.0L) {
             sort_end = res.begin() + limit;
             if (reverse)
                 std::partial_sort(res.begin(), sort_end, res.end(),
@@ -305,3 +306,4 @@ template <typename T>
 using ColumnVectorOrDecimal = typename ColumnVectorOrDecimalT<T, 
IsDecimalNumber<T>>::Col;
 
 } // namespace doris::vectorized
+#include "common/compile_check_end.h"
diff --git a/be/src/vec/columns/column_dictionary.h 
b/be/src/vec/columns/column_dictionary.h
index ae7d001a31d..c4b1f3e27e0 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -29,6 +29,7 @@
 #include "vec/core/types.h"
 
 namespace doris::vectorized {
+#include "common/compile_check_begin.h"
 
 /**
  * For low cardinality string columns, using ColumnDictionary can reduce memory
@@ -265,9 +266,9 @@ public:
         }
     }
 
-    int32_t find_code(const StringRef& value) const { return 
_dict.find_code(value); }
+    T find_code(const StringRef& value) const { return _dict.find_code(value); 
}
 
-    int32_t find_code_by_bound(const StringRef& value, bool greater, bool eq) 
const {
+    T find_code_by_bound(const StringRef& value, bool greater, bool eq) const {
         return _dict.find_code_by_bound(value, greater, eq);
     }
 
@@ -346,8 +347,9 @@ public:
             _total_str_len += value.size;
         }
 
-        int32_t find_code(const StringRef& value) const {
-            for (size_t i = 0; i < _dict_data->size(); i++) {
+        T find_code(const StringRef& value) const {
+            // _dict_data->size will not exceed the range of T.
+            for (T i = 0; i < _dict_data->size(); i++) {
                 if ((*_dict_data)[i] == value) {
                     return i;
                 }
@@ -384,11 +386,11 @@ public:
 
                 // For dictionary data of char type, sv.size is the schema 
length,
                 // so use strnlen to remove the 0 at the end to get the actual 
length.
-                int32_t len = sv.size;
+                size_t len = sv.size;
                 if (type == FieldType::OLAP_FIELD_TYPE_CHAR) {
                     len = strnlen(sv.data, sv.size);
                 }
-                uint32_t hash_val = HashUtil::crc_hash(sv.data, len, 0);
+                uint32_t hash_val = HashUtil::crc_hash(sv.data, 
static_cast<uint32_t>(len), 0);
                 _hash_values[code] = hash_val;
                 _compute_hash_value_flags[code] = 1;
                 return _hash_values[code];
@@ -412,13 +414,14 @@ public:
         //  so upper_bound is the code 0 of b, then evaluate code < 0 and 
returns empty
         // If the predicate is col <= 'a' and upper_bound-1 is -1,
         //  then evaluate code <= -1 and returns empty
-        int32_t find_code_by_bound(const StringRef& value, bool greater, bool 
eq) const {
+        T find_code_by_bound(const StringRef& value, bool greater, bool eq) 
const {
             auto code = find_code(value);
             if (code >= 0) {
                 return code;
             }
-            auto bound = std::upper_bound(_dict_data->begin(), 
_dict_data->end(), value) -
-                         _dict_data->begin();
+            auto bound =
+                    static_cast<T>(std::upper_bound(_dict_data->begin(), 
_dict_data->end(), value) -
+                                   _dict_data->begin());
             return greater ? bound - greater + eq : bound - eq;
         }
 
@@ -536,3 +539,4 @@ template class ColumnDictionary<int32_t>;
 using ColumnDictI32 = vectorized::ColumnDictionary<doris::vectorized::Int32>;
 
 } // namespace doris::vectorized
+#include "common/compile_check_end.h"
diff --git a/be/src/vec/columns/column_nullable.cpp 
b/be/src/vec/columns/column_nullable.cpp
index 5e34ad4d8d4..c58c78f5611 100644
--- a/be/src/vec/columns/column_nullable.cpp
+++ b/be/src/vec/columns/column_nullable.cpp
@@ -29,6 +29,7 @@
 #include "vec/utils/util.hpp"
 
 namespace doris::vectorized {
+#include "common/compile_check_begin.h"
 
 ColumnNullable::ColumnNullable(MutableColumnPtr&& nested_column_, 
MutableColumnPtr&& null_map_)
         : NullMapProvider(std::move(null_map_)), 
nested_column(std::move(nested_column_)) {
@@ -62,7 +63,7 @@ void ColumnNullable::update_xxHash_with_value(size_t start, 
size_t end, uint64_t
     } else {
         const auto* __restrict real_null_data =
                 assert_cast<const 
ColumnUInt8&>(get_null_map_column()).get_data().data();
-        for (int i = start; i < end; ++i) {
+        for (size_t i = start; i < end; ++i) {
             if (real_null_data[i] != 0) {
                 hash = HashUtil::xxHash64NullWithSeed(hash);
             }
@@ -78,7 +79,7 @@ void ColumnNullable::update_crc_with_value(size_t start, 
size_t end, uint32_t& h
     } else {
         const auto* __restrict real_null_data =
                 assert_cast<const 
ColumnUInt8&>(get_null_map_column()).get_data().data();
-        for (int i = start; i < end; ++i) {
+        for (size_t i = start; i < end; ++i) {
             if (real_null_data[i] != 0) {
                 hash = HashUtil::zlib_crc_hash_null(hash);
             }
diff --git a/be/src/vec/columns/column_nullable.h 
b/be/src/vec/columns/column_nullable.h
index 252144fbc5f..83d5e6af35a 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -43,6 +43,7 @@
 class SipHash;
 
 namespace doris::vectorized {
+#include "common/compile_check_begin.h"
 class Arena;
 class ColumnSorter;
 
@@ -404,7 +405,8 @@ public:
         }
         static constexpr auto MAX_NUMBER_OF_ROWS_FOR_FULL_SEARCH = 1000;
         size_t num_rows = size();
-        size_t num_sampled_rows = std::min(static_cast<size_t>(num_rows * 
sample_ratio), num_rows);
+        size_t num_sampled_rows = std::min(
+                static_cast<size_t>(static_cast<double>(num_rows) * 
sample_ratio), num_rows);
         size_t num_checked_rows = 0;
         size_t res = 0;
         if (num_sampled_rows == num_rows || num_rows <= 
MAX_NUMBER_OF_ROWS_FOR_FULL_SEARCH) {
@@ -423,7 +425,7 @@ public:
         if (num_checked_rows == 0) {
             return 0.0;
         }
-        return static_cast<double>(res) / num_checked_rows;
+        return static_cast<double>(res) / 
static_cast<double>(num_checked_rows);
     }
 
     void convert_dict_codes_if_necessary() override {
@@ -460,3 +462,4 @@ private:
 ColumnPtr make_nullable(const ColumnPtr& column, bool is_nullable = false);
 ColumnPtr remove_nullable(const ColumnPtr& column);
 } // namespace doris::vectorized
+#include "common/compile_check_end.h"
diff --git a/be/src/vec/columns/column_object.cpp 
b/be/src/vec/columns/column_object.cpp
index d5e52d07bcf..3e8d3722305 100644
--- a/be/src/vec/columns/column_object.cpp
+++ b/be/src/vec/columns/column_object.cpp
@@ -82,6 +82,7 @@
 #endif
 
 namespace doris::vectorized {
+#include "common/compile_check_begin.h"
 namespace {
 
 DataTypePtr create_array_of_type(TypeIndex type, size_t num_dimensions, bool 
is_nullable) {
@@ -653,7 +654,7 @@ bool ColumnObject::Subcolumn::check_if_sparse_column(size_t 
num_rows) {
         defaults_ratio.push_back(data[i]->get_ratio_of_default_rows());
     }
     double default_ratio = std::accumulate(defaults_ratio.begin(), 
defaults_ratio.end(), 0.0) /
-                           defaults_ratio.size();
+                           static_cast<double>(defaults_ratio.size());
     return default_ratio >= config::variant_ratio_of_defaults_as_sparse_column;
 }
 
@@ -1294,7 +1295,11 @@ rapidjson::Value* 
find_leaf_node_by_path(rapidjson::Value& json, const PathInDat
     if (!json.IsObject()) {
         return nullptr;
     }
-    rapidjson::Value name(current_key.data(), current_key.size());
+    /*! RapidJSON uses 32-bit array/string indices even on 64-bit platforms,
+    instead of using \c size_t. Users may override the SizeType by defining
+    \ref RAPIDJSON_NO_SIZETYPEDEFINE.
+    */
+    rapidjson::Value name(current_key.data(), 
cast_set<unsigned>(current_key.size()));
     auto it = json.FindMember(name);
     if (it == json.MemberEnd()) {
         return nullptr;
@@ -1312,7 +1317,7 @@ rapidjson::Value* 
find_leaf_node_by_path(rapidjson::Value& json, const PathInDat
 // 3. empty root jsonb value(not null)
 // 4. type is nothing
 bool skip_empty_json(const ColumnNullable* nullable, const DataTypePtr& type,
-                     TypeIndex base_type_id, int row, const PathInData& path) {
+                     TypeIndex base_type_id, size_t row, const PathInData& 
path) {
     // skip nulls
     if (nullable && nullable->is_null_at(row)) {
         return true;
@@ -1348,7 +1353,7 @@ Status find_and_set_leave_value(const IColumn* column, 
const PathInData& path,
                                 const DataTypeSerDeSPtr& type_serde, const 
DataTypePtr& type,
                                 TypeIndex base_type_index, rapidjson::Value& 
root,
                                 rapidjson::Document::AllocatorType& allocator, 
Arena& mem_pool,
-                                int row) {
+                                size_t row) {
 #ifndef NDEBUG
     // sanitize type and column
     if (column->get_name() != type->create_column()->get_name()) {
@@ -1416,7 +1421,7 @@ void get_json_by_column_tree(rapidjson::Value& root, 
rapidjson::Document::Alloca
     }
 }
 
-Status ColumnObject::serialize_one_row_to_string(int64_t row, std::string* 
output) const {
+Status ColumnObject::serialize_one_row_to_string(size_t row, std::string* 
output) const {
     if (!is_finalized()) {
         const_cast<ColumnObject*>(this)->finalize(FinalizeMode::READ_MODE);
     }
@@ -1432,7 +1437,7 @@ Status ColumnObject::serialize_one_row_to_string(int64_t 
row, std::string* outpu
     return Status::OK();
 }
 
-Status ColumnObject::serialize_one_row_to_string(int64_t row, BufferWritable& 
output) const {
+Status ColumnObject::serialize_one_row_to_string(size_t row, BufferWritable& 
output) const {
     if (!is_finalized()) {
         const_cast<ColumnObject*>(this)->finalize(FinalizeMode::READ_MODE);
     }
@@ -1447,7 +1452,7 @@ Status ColumnObject::serialize_one_row_to_string(int64_t 
row, BufferWritable& ou
     return Status::OK();
 }
 
-Status ColumnObject::serialize_one_row_to_json_format(int64_t row, 
rapidjson::StringBuffer* output,
+Status ColumnObject::serialize_one_row_to_json_format(size_t row, 
rapidjson::StringBuffer* output,
                                                       bool* is_null) const {
     CHECK(is_finalized());
     if (subcolumns.empty()) {
diff --git a/be/src/vec/columns/column_object.h 
b/be/src/vec/columns/column_object.h
index 21bb4469115..e4127197a22 100644
--- a/be/src/vec/columns/column_object.h
+++ b/be/src/vec/columns/column_object.h
@@ -272,12 +272,12 @@ public:
         return 
subcolumns.get_mutable_root()->data.get_finalized_column_ptr()->assume_mutable();
     }
 
-    Status serialize_one_row_to_string(int64_t row, std::string* output) const;
+    Status serialize_one_row_to_string(size_t row, std::string* output) const;
 
-    Status serialize_one_row_to_string(int64_t row, BufferWritable& output) 
const;
+    Status serialize_one_row_to_string(size_t row, BufferWritable& output) 
const;
 
     // serialize one row to json format
-    Status serialize_one_row_to_json_format(int64_t row, 
rapidjson::StringBuffer* output,
+    Status serialize_one_row_to_json_format(size_t row, 
rapidjson::StringBuffer* output,
                                             bool* is_null) const;
 
     // merge multiple sub sparse columns into root
diff --git a/be/src/vec/columns/column_string.cpp 
b/be/src/vec/columns/column_string.cpp
index 3caa194551b..cb83a29bbad 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -34,6 +34,7 @@
 #include "vec/core/sort_block.h"
 
 namespace doris::vectorized {
+#include "common/compile_check_begin.h"
 
 template <typename T>
 void ColumnStr<T>::sanity_check() const {
@@ -74,8 +75,8 @@ MutableColumnPtr ColumnStr<T>::clone_resized(size_t to_size) 
const {
             res->offsets.assign(offsets.begin(), offsets.end());
             res->chars.assign(chars.begin(), chars.end());
         }
-
-        res->offsets.resize_fill(to_size, chars.size());
+        // If offset is uint32, size will not exceed, check the size when 
inserting data into ColumnStr<T>.
+        res->offsets.resize_fill(to_size, static_cast<T>(chars.size()));
     }
 
     return res;
@@ -92,14 +93,14 @@ void ColumnStr<T>::shrink_padding_chars() {
 
     // deal the 0-th element. no need to move.
     auto next_start = offset[0];
-    offset[0] = strnlen(data, size_at(0));
+    offset[0] = static_cast<T>(strnlen(data, size_at(0)));
     for (size_t i = 1; i < size; i++) {
         // get the i-th length and whole move it to cover the last's trailing 
void
         auto length = strnlen(data + next_start, offset[i] - next_start);
         memmove(data + offset[i - 1], data + next_start, length);
         // offset i will be changed. so save the old value for (i+1)-th to get 
its length.
         next_start = offset[i];
-        offset[i] = offset[i - 1] + length;
+        offset[i] = offset[i - 1] + static_cast<T>(length);
     }
     chars.resize_fill(offsets.back()); // just call it to shrink memory here. 
no possible to expand.
 }
@@ -125,8 +126,8 @@ void ColumnStr<T>::insert_range_from_ignore_overflow(const 
doris::vectorized::IC
                 "Parameter out of bound in IColumnStr<T>::insert_range_from 
method.");
     }
 
-    size_t nested_offset = src_concrete.offset_at(start);
-    size_t nested_length = src_concrete.offsets[start + length - 1] - 
nested_offset;
+    auto nested_offset = src_concrete.offset_at(start);
+    auto nested_length = src_concrete.offsets[start + length - 1] - 
nested_offset;
 
     size_t old_chars_size = chars.size();
     chars.resize(old_chars_size + nested_length);
@@ -136,7 +137,7 @@ void ColumnStr<T>::insert_range_from_ignore_overflow(const 
doris::vectorized::IC
         offsets.assign(src_concrete.offsets.begin(), 
src_concrete.offsets.begin() + length);
     } else {
         size_t old_size = offsets.size();
-        size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see 
PaddedPODArray
+        auto prev_max_offset = offsets.back(); /// -1th index is Ok, see 
PaddedPODArray
         offsets.resize(old_size + length);
 
         for (size_t i = 0; i < length; ++i) {
@@ -161,8 +162,8 @@ void ColumnStr<T>::insert_range_from(const IColumn& src, 
size_t start, size_t le
                     doris::ErrorCode::INTERNAL_ERROR,
                     "Parameter out of bound in 
IColumnStr<T>::insert_range_from method.");
         }
-        size_t nested_offset = src_offsets[static_cast<ssize_t>(start) - 1];
-        size_t nested_length = src_offsets[start + length - 1] - nested_offset;
+        auto nested_offset = src_offsets[static_cast<ssize_t>(start) - 1];
+        auto nested_length = src_offsets[start + length - 1] - nested_offset;
 
         size_t old_chars_size = chars.size();
         check_chars_length(old_chars_size + nested_length, offsets.size() + 
length);
@@ -174,11 +175,13 @@ void ColumnStr<T>::insert_range_from(const IColumn& src, 
size_t start, size_t le
             offsets.assign(src_offsets.begin(), src_offsets.begin() + length);
         } else {
             size_t old_size = offsets.size();
-            size_t prev_max_offset = offsets.back(); /// -1th index is Ok, see 
PaddedPODArray
+            auto prev_max_offset = offsets.back(); /// -1th index is Ok, see 
PaddedPODArray
             offsets.resize(old_size + length);
 
             for (size_t i = 0; i < length; ++i) {
-                offsets[old_size + i] = src_offsets[start + i] - nested_offset 
+ prev_max_offset;
+                // if Offsets is uint32, size will not exceed range of uint32, 
cast is OK.
+                offsets[old_size + i] =
+                        static_cast<T>(src_offsets[start + i] - nested_offset) 
+ prev_max_offset;
             }
         }
     };
@@ -208,7 +211,7 @@ void ColumnStr<T>::insert_many_from(const IColumn& src, 
size_t position, size_t
     auto prev_pos = old_chars_size;
     for (; start_pos < end_pos; ++start_pos) {
         memcpy(&chars[prev_pos], data_val, data_length);
-        offsets[start_pos] = prev_pos + data_length;
+        offsets[start_pos] = static_cast<T>(prev_pos + data_length);
         prev_pos = prev_pos + data_length;
     }
 }
@@ -229,7 +232,8 @@ void ColumnStr<T>::insert_indices_from(const IColumn& src, 
const uint32_t* indic
         for (const auto* x = indices_begin; x != indices_end; ++x) {
             int64_t src_offset = *x;
             total_chars_size += src_offset_data[src_offset] - 
src_offset_data[src_offset - 1];
-            dst_offsets_data[dst_offsets_pos++] = total_chars_size;
+            // if Offsets is uint32, size will not exceed range of uint32, 
cast is OK.
+            dst_offsets_data[dst_offsets_pos++] = 
static_cast<T>(total_chars_size);
         }
         check_chars_length(total_chars_size, offsets.size());
 
@@ -267,13 +271,16 @@ void ColumnStr<T>::update_crcs_with_value(uint32_t* 
__restrict hashes, doris::Pr
     if (null_data == nullptr) {
         for (size_t i = 0; i < s; i++) {
             auto data_ref = get_data_at(i);
-            hashes[i] = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, 
hashes[i]);
+            // If offset is uint32, size will not exceed, check the size when 
inserting data into ColumnStr<T>.
+            hashes[i] = HashUtil::zlib_crc_hash(data_ref.data, 
static_cast<uint32_t>(data_ref.size),
+                                                hashes[i]);
         }
     } else {
         for (size_t i = 0; i < s; i++) {
             if (null_data[i] == 0) {
                 auto data_ref = get_data_at(i);
-                hashes[i] = HashUtil::zlib_crc_hash(data_ref.data, 
data_ref.size, hashes[i]);
+                hashes[i] = HashUtil::zlib_crc_hash(
+                        data_ref.data, static_cast<uint32_t>(data_ref.size), 
hashes[i]);
             }
         }
     }
@@ -391,8 +398,9 @@ ColumnPtr ColumnStr<T>::permute(const IColumn::Permutation& 
perm, size_t limit)
 template <typename T>
 StringRef ColumnStr<T>::serialize_value_into_arena(size_t n, Arena& arena,
                                                    char const*& begin) const {
-    uint32_t string_size(size_at(n));
-    uint32_t offset(offset_at(n));
+    // Use uint32 instead of size_t to reduce agg key's length.
+    auto string_size(static_cast<uint32_t>(size_at(n)));
+    auto offset(static_cast<uint32_t>(offset_at(n)));
 
     StringRef res;
     res.size = sizeof(string_size) + string_size;
@@ -421,7 +429,7 @@ const char* 
ColumnStr<T>::deserialize_and_insert_from_arena(const char* pos) {
 
 template <typename T>
 size_t ColumnStr<T>::get_max_row_byte_size() const {
-    size_t max_size = 0;
+    T max_size = 0;
     size_t num_rows = offsets.size();
     for (size_t i = 0; i < num_rows; ++i) {
         max_size = std::max(max_size, size_at(i));
@@ -434,8 +442,9 @@ template <typename T>
 void ColumnStr<T>::serialize_vec(std::vector<StringRef>& keys, size_t num_rows,
                                  size_t max_row_byte_size) const {
     for (size_t i = 0; i < num_rows; ++i) {
-        uint32_t offset(offset_at(i));
-        uint32_t string_size(size_at(i));
+        // Use uint32 instead of size_t to reduce agg key's length.
+        auto offset(static_cast<uint32_t>(offset_at(i)));
+        auto string_size(static_cast<uint32_t>(size_at(i)));
 
         auto* ptr = const_cast<char*>(keys[i].data + keys[i].size);
         memcpy_fixed<uint32_t>(ptr, (char*)&string_size);
@@ -458,8 +467,8 @@ void 
ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz
             memcpy(dest, null_map + i, sizeof(uint8_t));
 
             if (null_map[i] == 0) {
-                UInt32 offset(offset_at(i));
-                UInt32 string_size(size_at(i));
+                auto offset(offset_at(i));
+                auto string_size(size_at(i));
 
                 memcpy_fixed<UInt32>(dest + 1, (char*)&string_size);
                 memcpy(dest + 1 + sizeof(string_size), &chars[offset], 
string_size);
@@ -475,8 +484,8 @@ void 
ColumnStr<T>::serialize_vec_with_null_map(std::vector<StringRef>& keys, siz
             // serialize null first
             memcpy(dest, null_map + i, sizeof(uint8_t));
 
-            UInt32 offset(offset_at(i));
-            UInt32 string_size(size_at(i));
+            auto offset(offset_at(i));
+            auto string_size(size_at(i));
 
             memcpy_fixed<UInt32>(dest + 1, (char*)&string_size);
             memcpy(dest + 1 + sizeof(string_size), &chars[offset], 
string_size);
@@ -559,8 +568,8 @@ ColumnPtr ColumnStr<T>::replicate(const IColumn::Offsets& 
replicate_offsets) con
     T current_new_offset = 0;
 
     for (size_t i = 0; i < col_size; ++i) {
-        size_t size_to_replicate = replicate_offsets[i] - 
prev_replicate_offset;
-        size_t string_size = offsets[i] - prev_string_offset;
+        T size_to_replicate = replicate_offsets[i] - prev_replicate_offset;
+        T string_size = offsets[i] - prev_string_offset;
 
         for (size_t j = 0; j < size_to_replicate; ++j) {
             current_new_offset += string_size;
diff --git a/be/src/vec/columns/column_string.h 
b/be/src/vec/columns/column_string.h
index f116d4ce1f1..1674fd90933 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -47,6 +47,7 @@
 #include "vec/core/types.h"
 
 namespace doris::vectorized {
+#include "common/compile_check_begin.h"
 class Arena;
 class ColumnSorter;
 
@@ -86,10 +87,10 @@ private:
     Chars chars;
 
     // Start position of i-th element.
-    size_t ALWAYS_INLINE offset_at(ssize_t i) const { return offsets[i - 1]; }
+    T ALWAYS_INLINE offset_at(ssize_t i) const { return offsets[i - 1]; }
 
     /// Size of i-th element, including terminating zero.
-    size_t ALWAYS_INLINE size_at(ssize_t i) const { return offsets[i] - 
offsets[i - 1]; }
+    T ALWAYS_INLINE size_at(ssize_t i) const { return offsets[i] - offsets[i - 
1]; }
 
     template <bool positive>
     struct less;
@@ -220,7 +221,7 @@ public:
 
         const char* ptr = strings[0].data;
         for (size_t i = 0; i != num; i++) {
-            uint32_t len = strings[i].size;
+            size_t len = strings[i].size;
             length += len;
             offset += len;
             offsets.push_back(offset);
@@ -282,7 +283,7 @@ public:
         Char* data = chars.data();
         size_t offset = old_size;
         for (size_t i = 0; i < num; i++) {
-            uint32_t len = strings[i].size;
+            size_t len = strings[i].size;
             if (len) {
                 memcpy(data + offset, strings[i].data, len);
                 offset += len;
@@ -305,7 +306,7 @@ public:
         Char* data = chars.data();
         size_t offset = old_size;
         for (size_t i = 0; i < num; i++) {
-            uint32_t len = strings[i].size;
+            size_t len = strings[i].size;
             if (len) {
                 memcpy(data + offset, strings[i].data, copy_length);
                 offset += len;
@@ -342,9 +343,15 @@ public:
         for (size_t i = 0; i < num; i++) {
             int32_t codeword = data_array[i + start_index];
             new_size += dict[codeword].size;
-            offsets[offset_size + i] = new_size;
+            offsets[offset_size + i] = static_cast<T>(new_size);
         }
 
+        if (new_size > std::numeric_limits<T>::max()) {
+            throw doris::Exception(ErrorCode::INTERNAL_ERROR,
+                                   "ColumnString insert size out of range type 
{} [{},{}]",
+                                   typeid(T).name(), 
std::numeric_limits<T>::min(),
+                                   std::numeric_limits<T>::max());
+        }
         check_chars_length(new_size, offsets.size());
         chars.resize(new_size);
 
@@ -406,13 +413,16 @@ public:
             for (size_t i = start; i < end; ++i) {
                 if (null_data[i] == 0) {
                     auto data_ref = get_data_at(i);
-                    hash = HashUtil::zlib_crc_hash(data_ref.data, 
data_ref.size, hash);
+                    // If offset is uint32, size will not exceed, check the 
size when inserting data into ColumnStr<T>.
+                    hash = HashUtil::zlib_crc_hash(data_ref.data,
+                                                   
static_cast<uint32_t>(data_ref.size), hash);
                 }
             }
         } else {
             for (size_t i = start; i < end; ++i) {
                 auto data_ref = get_data_at(i);
-                hash = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, 
hash);
+                hash = HashUtil::zlib_crc_hash(data_ref.data, 
static_cast<uint32_t>(data_ref.size),
+                                               hash);
             }
         }
     }
@@ -473,7 +483,7 @@ public:
     void insert_default() override { offsets.push_back(chars.size()); }
 
     void insert_many_defaults(size_t length) override {
-        offsets.resize_fill(offsets.size() + length, chars.size());
+        offsets.resize_fill(offsets.size() + length, static_cast<T>(chars.size()));
     }
 
     int compare_at(size_t n, size_t m, const IColumn& rhs_,
@@ -525,3 +535,4 @@ public:
 using ColumnString = ColumnStr<UInt32>;
 using ColumnString64 = ColumnStr<UInt64>;
 } // namespace doris::vectorized
+#include "common/compile_check_end.h"
diff --git a/be/src/vec/columns/column_vector.cpp b/be/src/vec/columns/column_vector.cpp
index 0e24446a5cd..f0f0bec8b99 100644
--- a/be/src/vec/columns/column_vector.cpp
+++ b/be/src/vec/columns/column_vector.cpp
@@ -42,6 +42,7 @@
 #include "vec/data_types/data_type.h"
 
 namespace doris::vectorized {
+#include "common/compile_check_begin.h"
 
 template <typename T>
 StringRef ColumnVector<T>::serialize_value_into_arena(size_t n, Arena& arena,
@@ -242,7 +243,7 @@ void ColumnVector<T>::get_permutation(bool reverse, size_t limit, int nan_direct
     if (s == 0) return;
 
     // std::partial_sort need limit << s can get performance benefit
-    if (limit > (s / 8.0)) limit = 0;
+    if (limit > (s / 8.0L)) limit = 0;
 
     if (limit) {
         for (size_t i = 0; i < s; ++i) res[i] = i;
diff --git a/be/src/vec/columns/column_vector.h b/be/src/vec/columns/column_vector.h
index 2cb320b6992..970997a9186 100644
--- a/be/src/vec/columns/column_vector.h
+++ b/be/src/vec/columns/column_vector.h
@@ -61,6 +61,7 @@ class ColumnSorter;
 } // namespace doris
 
 namespace doris::vectorized {
+#include "common/compile_check_begin.h"
 
 /** Stuff for comparing numbers.
   * Integer values are compared as usual.
@@ -178,10 +179,9 @@ public:
     void insert_range_of_integer(T begin, T end) {
         if constexpr (std::is_integral_v<T>) {
             auto old_size = data.size();
-            data.resize(old_size + (end - begin));
-            for (int i = 0; i < end - begin; i++) {
-                data[old_size + i] = begin + i;
-            }
+            auto new_size = old_size + static_cast<size_t>(end - begin);
+            data.resize(new_size);
+            std::iota(data.begin() + old_size, data.begin() + new_size, begin);
         } else {
             throw doris::Exception(ErrorCode::INTERNAL_ERROR,
                                    "double column not support insert_range_of_integer");
@@ -409,3 +409,4 @@ protected:
 };
 
 } // namespace doris::vectorized
+#include "common/compile_check_end.h"
diff --git a/be/src/vec/core/field.h b/be/src/vec/core/field.h
index 8113dc602fb..341f65e075e 100644
--- a/be/src/vec/core/field.h
+++ b/be/src/vec/core/field.h
@@ -165,7 +165,7 @@ class JsonbField {
 public:
     JsonbField() = default;
 
-    JsonbField(const char* ptr, uint32_t len) : size(len) {
+    JsonbField(const char* ptr, size_t len) : size(len) {
         data = new char[size];
         if (!data) {
             LOG(FATAL) << "new data buffer failed, size: " << size;
@@ -213,7 +213,7 @@ public:
     }
 
     const char* get_value() const { return data; }
-    uint32_t get_size() const { return size; }
+    size_t get_size() const { return size; }
 
     bool operator<(const JsonbField& r) const {
         LOG(FATAL) << "comparing between JsonbField is not supported";
@@ -252,7 +252,7 @@ public:
 
 private:
     char* data = nullptr;
-    uint32_t size = 0;
+    size_t size = 0;
 };
 
 template <typename T>
@@ -498,6 +498,9 @@ public:
 
     bool is_null() const { return which == Types::Null; }
 
+    // The template parameter T needs to be consistent with `which`.
+    // If not, use NearestFieldType<> externally.
+    // Maybe modify this in the future, reference: https://github.com/ClickHouse/ClickHouse/pull/22003
     template <typename T>
     T& get() {
         using TWithoutRef = std::remove_reference_t<T>;
@@ -520,6 +523,8 @@ public:
         return true;
     }
 
+    // The template parameter T needs to be consistent with `which`.
+    // If not, use NearestFieldType<> externally.
     template <typename T>
     bool try_get(T& result) const {
         const Types::Which requested = TypeToEnum<std::decay_t<T>>::value;


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org


Reply via email to