We can vectorize the testcase to: movq pix1(%rip), %xmm0 movq pix2(%rip), %xmm1 psadbw %xmm1, %xmm0 movd %xmm0, %edx * pextrq $1, %xmm0, %eax * addl %edx, %eax ret
The instructions marked with (*) are generated due to a middle-end limitation that generates an SImode vector (V2SImode) temporary for the int summing variable. This is not the case with psadbw, which natively sums to V1DImode. The code is still way better than the un-vectorized loop. 2019-08-01 Uroš Bizjak <ubiz...@gmail.com> PR target/85693 * config/i386/mmx.md (usadv8qi): New expander. testsuite/ChangeLog: 2019-08-01 Uroš Bizjak <ubiz...@gmail.com> PR target/85693 * gcc.target/i386/pr85693-1.c: New test. Bootstrapped and regression tested on x86_64-linux-gnu. Committed to mainline SVN. Uros.
Index: config/i386/mmx.md =================================================================== --- config/i386/mmx.md (revision 273979) +++ config/i386/mmx.md (working copy) @@ -1970,6 +1970,21 @@ DONE; }) +(define_expand "usadv8qi" + [(match_operand:V2SI 0 "register_operand") + (match_operand:V8QI 1 "register_operand") + (match_operand:V8QI 2 "vector_operand") + (match_operand:V2SI 3 "vector_operand")] + "TARGET_MMX_WITH_SSE" +{ + rtx t1 = gen_reg_rtx (V1DImode); + rtx t2 = gen_reg_rtx (V2SImode); + emit_insn (gen_mmx_psadbw (t1, operands[1], operands[2])); + convert_move (t2, t1, 0); + emit_insn (gen_addv2si3 (operands[0], t2, operands[3])); + DONE; +}) + (define_insn_and_split "mmx_pmovmskb" [(set (match_operand:SI 0 "register_operand" "=r,r") (unspec:SI [(match_operand:V8QI 1 "register_operand" "y,x")] Index: testsuite/gcc.target/i386/pr85693-1.c =================================================================== --- testsuite/gcc.target/i386/pr85693-1.c (nonexistent) +++ testsuite/gcc.target/i386/pr85693-1.c (working copy) @@ -0,0 +1,21 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-msse2 -O2 -ftree-vectorize" } */ + +#define N 8 + +int abs (int); + +unsigned char pix1[N], pix2[N]; + +int foo (void) +{ + int i_sum = 0; + int i; + + for (i = 0; i < N; i++) + i_sum += abs (pix1[i] - pix2[i]); + + return i_sum; +} + +/* { dg-final { scan-assembler "psadbw" } } */