Consider the following testcase:

#include <immintrin.h>

static inline __m256i __attribute__((always_inline))
my_add(__m256i a0, __m256i b0)
{
    /* Split off the high 128-bit lanes; integer adds exist only at SSE width. */
    __m128i a1 = _mm256_extractf128_si256(a0, 1);
    __m128i b1 = _mm256_extractf128_si256(b0, 1);
    /* Add the low lanes and widen the result back to 256 bits. */
    __m256i r  = _mm256_castsi128_si256(
        _mm_add_epi32(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0)));
    /* Add the high lanes and insert the sum into the upper half. */
    r = _mm256_insertf128_si256(r, _mm_add_epi32(a1, b1), 1);
    return r;
}

extern int DATA[];

void use_insert_extract()
{
    __m256i x = _mm256_loadu_si256((__m256i*)&DATA[0]);
    __m256i y = _mm256_loadu_si256((__m256i*)&DATA[1]);
    x = my_add(x, y);
    x = my_add(x, y);
    _mm256_storeu_si256((__m256i*)&DATA[0], x);
}

int main()
{
    return DATA[1];
}

Compiling with "g++ -mavx -O3 -Wall -S" yields the following output:
        vmovdqu DATA(%rip), %ymm1
        pushq   %rbp
        vmovdqu DATA+4(%rip), %ymm0
        vextractf128    $0x1, %ymm1, %xmm3
        vmovdqa %xmm1, %xmm2
        movq    %rsp, %rbp
        vmovdqa %xmm0, %xmm1
        vextractf128    $0x1, %ymm0, %xmm0
        vpaddd  %xmm1, %xmm2, %xmm2
        vpaddd  %xmm0, %xmm3, %xmm3
        vinsertf128     $0x1, %xmm3, %ymm2, %ymm2
        vextractf128    $0x1, %ymm2, %xmm3
        vpaddd  %xmm1, %xmm2, %xmm1
        vpaddd  %xmm0, %xmm3, %xmm0
        vinsertf128     $0x1, %xmm0, %ymm1, %ymm0
        vmovdqu %ymm0, DATA(%rip)

ICC 11.1 compiles the same source ("-xavx -O3 -Wall -S") to:
        vmovdqu   DATA(%rip), %ymm1
        vmovdqu   4+DATA(%rip), %ymm0
        vextractf128 $1, %ymm1, %xmm2
        vextractf128 $1, %ymm0, %xmm6
        vpaddd    %xmm0, %xmm1, %xmm3
        vpaddd    %xmm6, %xmm2, %xmm5
        vpaddd    %xmm0, %xmm3, %xmm4
        vpaddd    %xmm6, %xmm5, %xmm7
        vinsertf128 $1, %xmm7, %ymm4, %ymm8
        vmovdqu   %ymm8, DATA(%rip)

Note especially the vextractf128 immediately following vinsertf128 in the GCC
output, which results from the double application of my_add: the lane inserted
by the first call is extracted right back out by the second. ICC folds this
redundant insert/extract pair away and reuses the still-live 128-bit halves.
This kind of optimization is important because AVX introduces 256-bit vector
registers, but arithmetic/logic/comparison operations on integers remain the
128-bit SSE variants. Thus code that handles integers in YMM registers is full
of vinsertf128 and vextractf128 operations, as sketched below.
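
For illustration, here is roughly what the two chained my_add calls look like
after inlining, written out by hand, next to the folded form ICC effectively
generates. This is a sketch, not compiler output; the function and variable
names (add_twice_naive, lo_sum, and so on) are made up:

#include <immintrin.h>

/* Two chained my_add calls after inlining.  The insert/extract pair
 * marked below is redundant: the lane inserted at index 1 is
 * extracted again at index 1 before anything else touches it. */
static __m256i add_twice_naive(__m256i x, __m256i y)
{
    __m128i y_lo = _mm256_castsi256_si128(y);
    __m128i y_hi = _mm256_extractf128_si256(y, 1);

    __m128i lo_sum = _mm_add_epi32(_mm256_castsi256_si128(x), y_lo);
    __m128i hi_sum = _mm_add_epi32(_mm256_extractf128_si256(x, 1), y_hi);
    __m256i t = _mm256_insertf128_si256(_mm256_castsi128_si256(lo_sum),
                                        hi_sum, 1);   /* insert ...  */
    __m128i lo2 = _mm_add_epi32(_mm256_castsi256_si128(t), y_lo);
    __m128i hi2 = _mm_add_epi32(_mm256_extractf128_si256(t, 1),
                                y_hi);                /* ... extract */
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo2), hi2, 1);
}

/* The folded form: the redundant pair is gone and the 128-bit sums
 * from the first addition feed the second directly, matching ICC's
 * chained vpaddd sequence. */
static __m256i add_twice_folded(__m256i x, __m256i y)
{
    __m128i y_lo = _mm256_castsi256_si128(y);
    __m128i y_hi = _mm256_extractf128_si256(y, 1);
    __m128i lo = _mm_add_epi32(
        _mm_add_epi32(_mm256_castsi256_si128(x), y_lo), y_lo);
    __m128i hi = _mm_add_epi32(
        _mm_add_epi32(_mm256_extractf128_si256(x, 1), y_hi), y_hi);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}

Both functions compute the same result; the folded form simply forwards the
inserted lane to its extraction site, which is the transformation GCC misses
here.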


-- 
           Summary: [missed optimization] AVX vextractf128 after vinsertf128
           Product: gcc
           Version: 4.5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: kretz at kde dot org
 GCC build triplet: x86_64-unknown-linux-gnu
  GCC host triplet: x86_64-unknown-linux-gnu
GCC target triplet: x86_64-unknown-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=44551
