https://gcc.gnu.org/bugzilla/show_bug.cgi?id=48701

--- Comment #4 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
Better testcase without inline-asm (because sometimes inline-asm gets in the
way of other optimizations):
```
#include <x86intrin.h>

__m256i blackhole;

/* Reproducer for the store half of PR48701: build a 256-bit value from two
 * 128-bit halves via stores through a casted pointer, then store the whole
 * thing.  Ideally this compiles to a vinserti128/vinsertf128 + one 256-bit
 * store, but (per the asm quoted below) GCC spills to the stack instead.
 * NOTE(review): the cast from __m256i* to __m128i* is OK aliasing-wise
 * because GCC's intrinsic vector types are declared may_alias — confirm
 * against <x86intrin.h> if reusing this pattern elsewhere.  */
void testStore(__m128i xmm0, __m128i xmm1)
{
    __m256i ymm;
    __m128i *xmm = (__m128i *)&ymm;  /* view ymm as two 128-bit lanes */
    xmm[0] = xmm0;                   /* low 128 bits */
    xmm[1] = xmm1;                   /* high 128 bits */
    blackhole = ymm;                 /* global store keeps ymm live */
}
void f(__m128i, __m128i);

/* Reproducer for the load half of PR48701: read a 256-bit global, then pull
 * out its two 128-bit halves through a casted pointer and pass them on.
 * Per the asm quoted below, GCC already optimizes this into two direct
 * 128-bit loads from `blackhole` plus a tail call to f — no stack spill.  */
void testLoad()
{
    __m256i ymm = blackhole;
    __m128i *xmm = (__m128i *)&ymm;  /* view ymm as two 128-bit lanes */
    f(xmm[0], xmm[1]);               /* low half, then high half */
}
```
testStore still needs some help:
  ymm_6 = BIT_INSERT_EXPR <ymm_5(D), xmm0_2(D), 0>;
  ymm_7 = BIT_INSERT_EXPR <ymm_6, xmm1_3(D), 128>;
  blackhole = ymm_7;

Which gives:
```
...
        vpxor   %xmm2, %xmm2, %xmm2
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-32, %rsp
        vmovdqa %ymm2, -32(%rsp)
        vmovdqa %xmm0, -32(%rsp)
        vmovdqa %xmm1, -16(%rsp)
        vmovdqa -32(%rsp), %ymm4
        vmovdqa %ymm4, blackhole(%rip)
...
```

while testLoad is already optimized correctly:
testLoad:
        vmovdqa blackhole+16(%rip), %xmm1
        vmovdqa blackhole(%rip), %xmm0
        jmp     f

Reply via email to