https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110309

            Bug ID: 110309
           Summary: Wrong code for masked load expansion
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: rguenth at gcc dot gnu.org
  Target Milestone: ---

For

/* Reproducer: store b[i] + 42 into a[i] for the first six elements.
   With the options given below, the load from b is vectorized as a
   .MASK_LOAD over an 8-lane int vector in which only the first six
   lanes are active, so the last two lanes must not be accessed.  */
void foo (int * __restrict a, int *b)
{
  for (int i = 0; i < 6; ++i)
    a[i] = b[i] + 42;
}

with -O3 --param vect-partial-vector-usage=1 -march=znver4
-mprefer-vector-width=256 we get

foo:
.LFB0:
        .cfi_startproc
        movl    $42, %eax
        vpxor   %xmm0, %xmm0, %xmm0
        vpblendd        $63, (%rsi), %ymm0, %ymm0
        kmovb   .LC1(%rip), %k1
        vpbroadcastd    %eax, %ymm1
        vpaddd  %ymm1, %ymm0, %ymm0
        vmovdqu32       %ymm0, (%rdi){%k1}
        vzeroupper
        ret

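Note how the .MASK_LOAD (b_10(D), 32B, { -1, -1, -1, -1, -1, -1, 0, 0 }) is
expanded as vpblendd without a mask, performing a full 32-byte load from
(%rsi).  That address is only known to be 32-bit aligned, so the unmasked
load can extend past the six valid elements into an unmapped page and thus
can trap.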

;; vect__4.6_8 = .MASK_LOAD (b_10(D), 32B, { -1, -1, -1, -1, -1, -1, 0, 0 });

(insn 7 6 8 (set (reg:QI 86)
        (const_int 63 [0x3f])) "t2.c":4:13 -1
     (nil))

(insn 8 7 0 (set (reg:V8SI 82 [ vect__4.6 ])
        (vec_merge:V8SI (mem:V8SI (reg/v/f:DI 85 [ b ]) [1 MEM <vector(8) int>
[(int *)b_10(D)]+0 S32 A32])
            (reg:V8SI 82 [ vect__4.6 ])
            (reg:QI 86))) "t2.c":4:13 -1
     (nil))

Reply via email to