https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109605

Wojciech Mula <wojciech_mula at poczta dot onet.pl> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |wojciech_mula at poczta dot
                   |                            |onet.pl

--- Comment #3 from Wojciech Mula <wojciech_mula at poczta dot onet.pl> ---
This is somewhat related. I needed to generate one particular procedure without
any vector instructions (the surrounding code is free to use RVV instructions).

But when the code uses the builtin function `memcpy`, GCC still emits some vector
instructions. The cure is passing `-fno-builtin` to the compiler, because the
pragma does not accept that option.

The attached sample code comes from the simdutf project (src/scalar/utf.f); here
is a godbolt link for convenience: https://godbolt.org/z/Ya91he99v.

---no-vector.cpp--
#include <cstdlib>
#include <cstdint>
#include <cstring>

// Request that GCC compile the code below with the tree vectorizers
// (general, loop and SLP) switched off.
#pragma GCC optimize ("no-tree-vectorize")
#pragma GCC optimize ("no-tree-loop-vectorize")
#pragma GCC optimize ("no-tree-slp-vectorize")
// Attempt to also switch off builtin expansion; this one is rejected.
#pragma GCC optimize ("no-builtin") // not accepted by the compiler
/// Scalar UTF-8 validation of a byte buffer.
///
/// @param buf pointer to the first byte; must be readable for `len` bytes.
/// @param len number of bytes to validate.
/// @return true when the whole buffer is well-formed UTF-8, false on the first
///         truncated, overlong, surrogate, out-of-range or stray byte.
///
/// The two std::memcpy calls in the ASCII fast path are kept on purpose: this
/// function is a reproducer showing how GCC expands that builtin.
bool validate(const char *buf, size_t len) noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  uint64_t pos = 0;
  uint32_t code_point = 0;
  while (pos < len) {
    // Fast path: check if the next 16 bytes are ASCII.
    uint64_t next_pos = pos + 16;
    if (next_pos <= len) {
      // It is safe to read 16 more bytes; check that they are all ASCII.
      uint64_t v1;
      std::memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      uint64_t v{v1 | v2};
      // A set high bit in any of the 16 bytes means a non-ASCII byte.
      if ((v & 0x8080808080808080) == 0) {
        pos = next_pos;
        continue;
      }
    }
    unsigned char byte = data[pos];

    // Skip ASCII bytes one at a time until a lead byte or the end of input.
    while (byte < 0b10000000) {
      if (++pos == len) {
        return true;
      }
      byte = data[pos];
    }

    if ((byte & 0b11100000) == 0b11000000) {
      // Two-byte sequence: 110xxxxx 10xxxxxx.
      next_pos = pos + 2;
      if (next_pos > len) {
        return false; // truncated sequence
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false; // not a continuation byte
      }
      // range check (rejects overlong encodings)
      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
      if ((code_point < 0x80) || (0x7ff < code_point)) {
        return false;
      }
    } else if ((byte & 0b11110000) == 0b11100000) {
      // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
      next_pos = pos + 3;
      if (next_pos > len) {
        return false; // truncated sequence
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return false;
      }
      // range check (rejects overlong encodings and surrogates)
      code_point = (byte & 0b00001111) << 12 |
                   (data[pos + 1] & 0b00111111) << 6 |
                   (data[pos + 2] & 0b00111111);
      if ((code_point < 0x800) || (0xffff < code_point) ||
          (0xd7ff < code_point && code_point < 0xe000)) {
        return false;
      }
    } else if ((byte & 0b11111000) == 0b11110000) {
      // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      next_pos = pos + 4;
      if (next_pos > len) {
        return false; // truncated sequence
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
        return false;
      }
      // range check (rejects overlong encodings and values above U+10FFFF)
      code_point =
          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
      if (code_point <= 0xffff || 0x10ffff < code_point) {
        return false;
      }
    } else {
      // Stray continuation byte or an invalid lead byte.
      return false;
    }
    pos = next_pos;
  }
  return true;
}
---eof---

The head of generated asm:

---
validate(char const*, unsigned long):
        beq     a1,zero,.L32
        li      a4,2139062272
        addi    a4,a4,-129
        slli    a2,a4,32
        addi    sp,sp,-16
        add     a2,a2,a4
        li      a5,0
        xori    a2,a2,-1
        addi    a7,sp,8
        vsetivli        zero,8,e8,mf2,ta,ma ###### here
.L2:
        addi    a3,a5,16
        add     t1,a0,a5
        bltu    a1,a3,.L36
        vle8.v  v1,0(t1) #####
        addi    a4,a5,8
        add     a4,a0,a4
        vse8.v  v1,0(sp) #####
        vle8.v  v1,0(a4) #####
        ld      a4,0(sp)
        vse8.v  v1,0(a7) #####
        ld      a6,8(sp)
        or      a4,a4,a6
        and     a4,a4,a2
        bne     a4,zero,.L36
        mv      a5,a3
.L6:
---
  • [Bug driver/109605] -fno-... wojciech_mula at poczta dot onet.pl via Gcc-bugs

Reply via email to