Copilot commented on code in PR #59426:
URL: https://github.com/apache/doris/pull/59426#discussion_r2650135951
##########
be/src/util/simd/bits.h:
##########
@@ -81,6 +85,38 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t*
data) {
_mm_loadu_si128(reinterpret_cast<const __m128i*>(data +
16)), zero16)))
<< 16) &
0xffff0000);
+#elif defined(__ARM_FEATURE_SVE)
+ /*
+ * Assume the input data is [1 (repeated 16 times), 0 (repeated 16 times)].
+ *
+ * 1. Comparison produces a predicate mask: is_one = [1 (repeated 16
times), 0 (repeated 16 times)].
+ *
+ * 2. Construct a new vector: masked = [1, 2, 4, 8, 16, 32, 64, 128,1, 2,
4, 8, 16, 32, 64, 128, 0 (repeated 16 times)].
+ * For each position, if the corresponding element in is_one is 1, take the
value from the mask array (mask_arr); otherwise, take 0.
+ *
+ * 3. Rearorder the elements: res = [1, 1, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0,
...], where the index mapping is: j = (i % 4) * 8 + (i / 4).
+ *
+ * 4. Perform a horizontal sum to obtain the final result:
+ * mask = sum_{i=0}^{7} ( res[i*4] + (res[i*4 + 1] << 8) + (res[i*4 + 2]
<< 16) + (res[i*4 + 3] << 24)).
+ * This works because we use svreinterpret_u32_u8, treating every group
of four u8 elements as a single u32.
+ */
+ static constexpr uint8_t mask_arr[32] = {1, 2, 4, 8, 16, 32, 64,
128, 1, 2, 4,
+ 8, 16, 32, 64, 128, 1, 2, 4,
8, 16, 32,
+ 64, 128, 1, 2, 4, 8, 16, 32,
64, 128};
+ svbool_t pg = svwhilelt_b8(0, 32);
+ svuint8_t mask_vec = svld1_u8(pg, mask_arr);
+ svbool_t is_one = svcmpeq_n_u8(pg, svld1_u8(pg, data), 1);
+ svuint8_t masked = svsel_u8(is_one, mask_vec, svdup_u8(0));
+
+ // j = i % 4 * 8 + i / 4
Review Comment:
The inline comment "j = i % 4 * 8 + i / 4" relies on implicit operator
precedence. Consider adding parentheses: "j = (i % 4) * 8 + i / 4", so the
comment matches the computation in the code.
```suggestion
// j = (i % 4) * 8 + i / 4
```
##########
be/benchmark/benchmark_bit_simd.hpp:
##########
@@ -0,0 +1,114 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <benchmark/benchmark.h>
+#include <vector>
+#include <random>
+
+#include "util/simd/bits.h"
+
+namespace doris {
+
+static void create_test_data(std::vector<uint8_t> &data, size_t size) {
+ std::mt19937 rng(123);
+ for (int i = 0; i < size; ++i) {
+ data[i] = rng() & 1;
+ }
+}
+
+inline uint32_t bitmask_8(const uint8_t* data) {
+ static constexpr uint8_t mask_arr[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1,
2, 4, 8, 16, 32, 64, 128};
+ svbool_t pg = svwhilelt_b8(0, 16);
+ svuint8_t mask_vec = svld1_u8(pg, mask_arr);
+
+ svbool_t nonzero = svcmpne_n_u8(pg, svld1_u8(pg, data), 0);
+ svuint8_t masked = svsel_u8(nonzero, mask_vec, svdup_u8(0));
+
+ svuint8_t idx = svindex_u8(0, 1);
+ svuint8_t rev_idx = svsub_u8_x(pg, svdup_u8(15), idx);
+ svuint8_t res_hi = svtbl_u8(masked, rev_idx);
+
+ uint32_t mask = static_cast<uint32_t>(svaddv_u16(pg,
svreinterpret_u16_u8(svzip1_u8(masked, res_hi))));
+ return mask;
+}
+
Review Comment:
The function bytes32_mask_to_bits32_mask in the benchmark file has a
different implementation from the one in be/src/util/simd/bits.h, which may
lead to confusion and inconsistent benchmark results. The benchmark should
either use the actual production function,
simd::bytes32_mask_to_bits32_mask, or clearly document why an alternative
implementation is being compared against it.
```suggestion
// Alternative benchmark-only implementation of converting 32 bytes of
// masked data into a 32-bit mask. The production implementation is
// simd::bytes32_mask_to_bits32_mask (see util/simd/bits.h). This version
// is intentionally different and is used in BM_bytes32_mask_to_bits32_mask_2
// to compare its performance against the production implementation.
```
##########
be/src/util/simd/bits.h:
##########
@@ -81,6 +85,38 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t*
data) {
_mm_loadu_si128(reinterpret_cast<const __m128i*>(data +
16)), zero16)))
<< 16) &
0xffff0000);
+#elif defined(__ARM_FEATURE_SVE)
+ /*
+ * Assume the input data is [1 (repeated 16 times), 0 (repeated 16 times)].
+ *
+ * 1. Comparison produces a predicate mask: is_one = [1 (repeated 16
times), 0 (repeated 16 times)].
+ *
+ * 2. Construct a new vector: masked = [1, 2, 4, 8, 16, 32, 64, 128,1, 2,
4, 8, 16, 32, 64, 128, 0 (repeated 16 times)].
+ * For each position, if the corresponding element in is_one is 1, take the
value from the mask array (mask_arr); otherwise, take 0.
+ *
+ * 3. Rearorder the elements: res = [1, 1, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0,
...], where the index mapping is: j = (i % 4) * 8 + (i / 4).
Review Comment:
The comment in step 3 contains the mathematical expression "j = (i % 4) * 8 +
(i / 4)", in which the grouping of operations is not fully explicit. Writing it
as "j = ((i % 4) * 8) + (i / 4)" makes clear that the modulo and multiplication
are evaluated before the addition, matching the actual code implementation.
##########
be/src/util/simd/bits.h:
##########
@@ -81,6 +85,38 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t*
data) {
_mm_loadu_si128(reinterpret_cast<const __m128i*>(data +
16)), zero16)))
<< 16) &
0xffff0000);
+#elif defined(__ARM_FEATURE_SVE)
+ /*
+ * Assume the input data is [1 (repeated 16 times), 0 (repeated 16 times)].
+ *
+ * 1. Comparison produces a predicate mask: is_one = [1 (repeated 16
times), 0 (repeated 16 times)].
+ *
+ * 2. Construct a new vector: masked = [1, 2, 4, 8, 16, 32, 64, 128,1, 2,
4, 8, 16, 32, 64, 128, 0 (repeated 16 times)].
+ * For each position, if the corresponding element in is_one is 1, take the
value from the mask array (mask_arr); otherwise, take 0.
+ *
+ * 3. Rearorder the elements: res = [1, 1, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0,
...], where the index mapping is: j = (i % 4) * 8 + (i / 4).
Review Comment:
Typo in the comment: "Rearorder" should be "Reorder".
```suggestion
* 3. Reorder the elements: res = [1, 1, 0, 0, 2, 2, 0, 0, 4, 4, 0, 0,
...], where the index mapping is: j = (i % 4) * 8 + (i / 4).
```
##########
be/benchmark/benchmark_bit_simd.hpp:
##########
@@ -0,0 +1,114 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <benchmark/benchmark.h>
+#include <vector>
+#include <random>
+
+#include "util/simd/bits.h"
+
+namespace doris {
+
+static void create_test_data(std::vector<uint8_t> &data, size_t size) {
+ std::mt19937 rng(123);
+ for (int i = 0; i < size; ++i) {
+ data[i] = rng() & 1;
+ }
+}
+
+inline uint32_t bitmask_8(const uint8_t* data) {
+ static constexpr uint8_t mask_arr[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1,
2, 4, 8, 16, 32, 64, 128};
+ svbool_t pg = svwhilelt_b8(0, 16);
+ svuint8_t mask_vec = svld1_u8(pg, mask_arr);
+
+ svbool_t nonzero = svcmpne_n_u8(pg, svld1_u8(pg, data), 0);
+ svuint8_t masked = svsel_u8(nonzero, mask_vec, svdup_u8(0));
+
+ svuint8_t idx = svindex_u8(0, 1);
+ svuint8_t rev_idx = svsub_u8_x(pg, svdup_u8(15), idx);
+ svuint8_t res_hi = svtbl_u8(masked, rev_idx);
+
+ uint32_t mask = static_cast<uint32_t>(svaddv_u16(pg,
svreinterpret_u16_u8(svzip1_u8(masked, res_hi))));
+ return mask;
+}
+
+uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) {
+ return bitmask_8(data) | (bitmask_8(data + 16) << 16);
+}
Review Comment:
The benchmark file uses SVE intrinsics (svwhilelt_b8, svld1_u8,
svcmpne_n_u8, etc.) without including the required header arm_sve.h and without
guarding the code with __ARM_FEATURE_SVE preprocessor directive. This will
cause compilation failures on systems without SVE support. The SVE-specific
code should be wrapped in #if defined(__ARM_FEATURE_SVE) and the header should
be conditionally included.
##########
be/src/util/simd/bits.h:
##########
@@ -81,6 +85,38 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t*
data) {
_mm_loadu_si128(reinterpret_cast<const __m128i*>(data +
16)), zero16)))
<< 16) &
0xffff0000);
+#elif defined(__ARM_FEATURE_SVE)
+ /*
+ * Assume the input data is [1 (repeated 16 times), 0 (repeated 16 times)].
+ *
+ * 1. Comparison produces a predicate mask: is_one = [1 (repeated 16
times), 0 (repeated 16 times)].
+ *
+ * 2. Construct a new vector: masked = [1, 2, 4, 8, 16, 32, 64, 128,1, 2,
4, 8, 16, 32, 64, 128, 0 (repeated 16 times)].
Review Comment:
There's a spacing issue in the comment. Missing space after "128," before
"1" in the sequence. Should be "128, 1, 2, 4..." not "128,1, 2, 4..."
```suggestion
* 2. Construct a new vector: masked = [1, 2, 4, 8, 16, 32, 64, 128, 1,
2, 4, 8, 16, 32, 64, 128, 0 (repeated 16 times)].
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]