https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92803
--- Comment #10 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Another testcase to consider (-O2 -mavx2):

typedef double v4df __attribute__((vector_size (32)));
typedef double v2df __attribute__((vector_size (16)));
typedef short v16hi __attribute__((vector_size (32)));
typedef short v8hi __attribute__((vector_size (16)));

v2df
foo (v4df x, double *p)
{
  return (v2df) { x[1], *p };
}

v2df
bar (v4df x, double *p)
{
  return (v2df) { x[0], *p };
}

v2df
baz (v2df x, double *p)
{
  return (v2df) { x[1], *p };
}

v2df
qux (v2df x, double *p)
{
  return (v2df) { x[0], *p };
}

Comparing gcc-9 with trunk + my patch, the differences are:

foo:
-	vunpckhpd	%xmm0, %xmm0, %xmm0
-	vmovhpd	(%rdi), %xmm0, %xmm0
+	vbroadcastsd	(%rdi), %ymm1
+	vinsertf128	$1, %xmm1, %ymm0, %ymm0
+	vpermpd	$77, %ymm0, %ymm0

bar:
-	vmovhpd	(%rdi), %xmm0, %xmm0
+	vbroadcastsd	(%rdi), %ymm1
+	vinsertf128	$1, %xmm1, %ymm0, %ymm0
+	vpermpd	$76, %ymm0, %ymm0

baz:
-	vunpckhpd	%xmm0, %xmm0, %xmm0
-	vmovhpd	(%rdi), %xmm0, %xmm0
+	vmovddup	(%rdi), %xmm1
+	vunpckhpd	%xmm1, %xmm0, %xmm0

qux:
-	vmovhpd	(%rdi), %xmm0, %xmm0
+	vmovapd	%xmm0, %xmm1
+	vmovddup	(%rdi), %xmm0
+	vmovsd	%xmm1, %xmm0, %xmm0