https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119253

            Bug ID: 119253
           Summary: RISC-V GCC auto-vectorizes unaligned memory access
                    even if mvector-strict-align is enabled
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: dusan.stojko...@rt-rk.com
  Target Milestone: ---

The following code produces a bus error on the Banana Pi BPI F3 regardless of
whether -mvector-strict-align and/or -mstrict-align are passed to the compiler:


```
/* Compiled with:
 *   /path/to/toolchain/riscv64-unknown-linux-gnu-gcc -O3 -march=rv64gcv
 *     -mvector-strict-align -mstrict-align vector_test.c -o vector_test.o
 */

#include <stdint.h>
#include <stdio.h>

// This function replicates the unaligned 32-bit read causing the bus error.
//
// For each of the ctb_size rows it loads 32 bits starting at byte offset
// (bit_pos >> 3) inside flag_buf, shifts the loaded value right by
// (bit_pos & 7) and stores the result, implicitly narrowed to uint16_t, in
// no_filter_flags[row]. bit_pos is advanced by stride after every row.
//
//   flag_buf        - byte buffer the 32-bit words are read from
//   bit_pos         - bit offset of the first row within flag_buf
//   stride          - number of bits added to bit_pos per row
//   no_filter_flags - output array; entries [0, ctb_size) are written
//   ctb_size        - number of rows to process
void replicate_bus_error(
    uint8_t  *flag_buf,
    int32_t   bit_pos,
    int32_t   stride,
    uint16_t *no_filter_flags,
    int32_t   ctb_size)
{
    for(int32_t row = 0; row < ctb_size; row++)
    {
        // The byte pointer is cast to uint32_t * with no alignment check, so
        // the dereference can be a misaligned 32-bit load whenever the byte
        // offset (bit_pos >> 3) does not land on a 4-byte boundary.
        no_filter_flags[row] = 
            (*(uint32_t *)(flag_buf + (bit_pos >> 3))) >> (bit_pos & 7);
        bit_pos += stride;
    }
}

// Alignment-safe counterpart of the 32-bit flag read.
//
// For each of the ctb_size rows, four consecutive bytes starting at byte
// offset (bit_pos >> 3) of flag_buf are combined into a 32-bit value with the
// first byte in the least significant position. That value is shifted right
// by (bit_pos & 7) and its low 16 bits are stored in no_filter_flags[row].
// bit_pos is advanced by stride after every row.
//
//   flag_buf        - byte buffer the flags are read from
//   bit_pos         - bit offset of the first row within flag_buf
//   stride          - number of bits added to bit_pos per row
//   no_filter_flags - output array; entries [0, ctb_size) are written
//   ctb_size        - number of rows to process
void fixed_bus_error(
    uint8_t  *flag_buf,
    int32_t   bit_pos,
    int32_t   stride,
    uint16_t *no_filter_flags,
    int32_t   ctb_size)
{
    int32_t row = 0;

    while (row < ctb_size)
    {
        const uint8_t *src = &flag_buf[bit_pos >> 3];
        uint32_t word = 0;

        // Fold the bytes in from the highest index down so that src[0]
        // ends up in bits 0..7 and src[3] in bits 24..31.
        for (int k = 3; k >= 0; k--)
        {
            word = (word << 8) | src[k];
        }

        no_filter_flags[row] = (uint16_t)(word >> (bit_pos & 7));

        bit_pos += stride;
        row++;
    }
}

// Test driver: fills a 32-byte buffer with the values 0..31, runs
// replicate_bus_error over it starting at bit 5 with a stride of 8 bits, and
// prints every flag word that was produced.
int main(void)
{
    uint8_t buffer[32];
    for(int i = 0; i < 32; i++)
    {
        buffer[i] = (uint8_t)i; // Fill with some test data
    }

    // One output slot per processed row. The row count stays at 9; the array
    // is sized to match so the callee never writes past the end of out_flags
    // (an 8-element array with 9 rows overflows by one element).
    uint16_t out_flags[9] = {0};
    int32_t start_bit_pos = 5;
    int32_t stride   = 8;
    int32_t ctb_size = (int32_t)(sizeof out_flags / sizeof out_flags[0]);

    replicate_bus_error(buffer, start_bit_pos, stride, out_flags, ctb_size);

    for(int i = 0; i < ctb_size; i++)
    {
        // Cast so the argument type matches the %u conversion exactly.
        printf("out_flags[%d] = %u\n", i, (unsigned)out_flags[i]);
    }

    return 0;
}
```

The example is from a video decoding library, which I attempted to simplify to
narrow down the issue. It also includes a second version of the function that
does practically the same thing but is written in such a way that it is not
vectorized, thus ensuring valid memory accesses.

Interestingly, -mvector-strict-align doesn't change the resulting assembly.
Also, clang seems to vectorize this example and is able to produce valid
assembly code (when tested on the Banana Pi BPI F3):
https://godbolt.org/z/d56ofvqz5

I tested with gcc-trunk configured in the following way:
Target: riscv64-unknown-linux-gnu
Configured with: /home/dstojkovic/Projects/riscv-gnu-toolchain/gcc/configure
--target=riscv64-unknown-linux-gnu
--prefix=/home/dstojkovic/opt/riscv-autovec-12032025
--with-sysroot=/home/dstojkovic/opt/riscv-autovec-12032025/sysroot
--with-pkgversion=g90f5dabddbd --with-system-zlib --enable-shared --enable-tls
--enable-languages=c,c++,fortran --disable-libmudflap --disable-libssp
--disable-libquadmath --disable-libsanitizer --disable-nls --disable-bootstrap
--src=.././gcc --disable-default-pie --disable-multilib --with-abi=lp64d
--with-arch=rv64gcv --with-tune=rocket --with-isa-spec=20191213
'CFLAGS_FOR_TARGET=-O2    -mcmodel=medlow' 'CXXFLAGS_FOR_TARGET=-O2   
-mcmodel=medlow'
Thread model: posix
Supported LTO compression algorithms: zlib zstd
gcc version 15.0.1 20250312 (experimental) (g90f5dabddbd)

Reply via email to