Consider the following testcase:
#include <immintrin.h>
/*
 * Adds two 256-bit integer vectors using two 128-bit additions
 * (_mm_add_epi32), one for the lower and one for the upper half, and
 * reassembles the two partial sums into a single 256-bit result.
 *
 * a0, b0: the 256-bit operands.
 * Returns the 256-bit vector built from both 128-bit sums.
 *
 * The function is forced inline so that consecutive calls expose the
 * insert-then-extract sequence this testcase is about.
 */
static inline __m256i __attribute__((always_inline))
my_add(__m256i a0, __m256i b0)
{
/* Pull the upper 128-bit half (index 1) out of each operand. */
__m128i a1 = _mm256_extractf128_si256(a0, 1);
__m128i b1 = _mm256_extractf128_si256(b0, 1);
/* Add the lower 128-bit halves and cast the sum back to a 256-bit vector. */
__m256i r =
_mm256_castsi128_si256(_mm_add_epi32(_mm256_castsi256_si128(a0),
_mm256_castsi256_si128(b0)));
/* Add the upper halves and insert that sum as the upper half (index 1) of r. */
r = _mm256_insertf128_si256(r, _mm_add_epi32(a1, b1), 1);
return r;
}
/* Input/output array; only declared here, no definition is visible in this testcase. */
extern int DATA[];
/*
 * Driver for the testcase: loads two 256-bit vectors from DATA, applies
 * my_add twice with the same second operand, and stores the result back
 * to DATA.
 */
void use_insert_extract()
{
/* Unaligned 256-bit loads; y starts one int after x, so the two ranges overlap. */
__m256i x = _mm256_loadu_si256((__m256i*)&DATA[0]);
__m256i y = _mm256_loadu_si256((__m256i*)&DATA[1]);
/*
 * Two back-to-back applications: the result assembled by the insert at the
 * end of the first call is immediately split again by the extract at the
 * start of the second call.
 */
x = my_add(x, y);
x = my_add(x, y);
/* Unaligned 256-bit store of the final sum over the start of DATA. */
_mm256_storeu_si256((__m256i*)&DATA[0], x);
}
/*
 * Program entry point: returns DATA[1] as the exit status.
 * It does not call use_insert_extract.
 */
int main()
{
return DATA[1];
}
Compiled with "g++ -mavx -O3 -Wall -S" one gets the following output:
vmovdqu DATA(%rip), %ymm1
pushq %rbp
vmovdqu DATA+4(%rip), %ymm0
vextractf128 $0x1, %ymm1, %xmm3
vmovdqa %xmm1, %xmm2
movq %rsp, %rbp
vmovdqa %xmm0, %xmm1
vextractf128 $0x1, %ymm0, %xmm0
vpaddd %xmm1, %xmm2, %xmm2
vpaddd %xmm0, %xmm3, %xmm3
vinsertf128 $0x1, %xmm3, %ymm2, %ymm2
vextractf128 $0x1, %ymm2, %xmm3
vpaddd %xmm1, %xmm2, %xmm1
vpaddd %xmm0, %xmm3, %xmm0
vinsertf128 $0x1, %xmm0, %ymm1, %ymm0
vmovdqu %ymm0, DATA(%rip)
ICC 11.1 compiles the same source ("-xavx -O3 -Wall -S") to:
vmovdqu DATA(%rip), %ymm1
vmovdqu 4+DATA(%rip), %ymm0
vextractf128 $1, %ymm1, %xmm2
vextractf128 $1, %ymm0, %xmm6
vpaddd %xmm0, %xmm1, %xmm3
vpaddd %xmm6, %xmm2, %xmm5
vpaddd %xmm0, %xmm3, %xmm4
vpaddd %xmm6, %xmm5, %xmm7
vinsertf128 $1, %xmm7, %ymm4, %ymm8
vmovdqu %ymm8, DATA(%rip)
Note especially, in the GCC output, the vextractf128 directly after the
vinsertf128, which results from the double application of my_add. Eliminating
such a redundant insert/extract pair (an optimization ICC is able to apply
here) is important because AVX introduces 256-bit vector registers, but
arithmetic/logic/comparison operations on integers remain the 128-bit SSE
variants. Thus, if you want to handle integers in YMM registers, you will find
a lot of vinsertf128 and vextractf128 operations.
--
Summary: [missed optimization] AVX vextractf128 after vinsertf128
Product: gcc
Version: 4.5.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: kretz at kde dot org
GCC build triplet: x86_64-unknown-linux-gnu
GCC host triplet: x86_64-unknown-linux-gnu
GCC target triplet: x86_64-unknown-linux-gnu
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=44551