https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105135

--- Comment #3 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Microbenchmarking this in a tight loop on an i9-7960X suggests that cmov is
probably better in this case (though cmov is really a lottery on x86):
$ cat pr105135.c
__attribute__((noipa)) char to_lower_1(const char c)
{ return c + ((c >= 'A' && c <= 'Z') * 32); }
__attribute__((noipa)) char to_lower_2(const char c)
{ return c + (((c >= 'A') & (c <= 'Z')) * 32); }
__attribute__((noipa)) char to_lower_3(const char c)
{ if (c >= 'A' && c <= 'Z') return c + 32; return c; }
$ cat pr105135-2.c
__attribute__((noipa)) char to_lower_1(const char c);
__attribute__((noipa)) char to_lower_2(const char c);
__attribute__((noipa)) char to_lower_3(const char c);
#define N 1000000000

int
main ()
{
  unsigned long long r = 0;
#ifdef Aa
  for (long long i = 0; i < N; i++)
    r += to_lower ((i & 1) ? 'A' : 'a');
#else
  for (long long i = 0; i < N; i++)
    r += to_lower ('A');
#endif
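  /* Consume r so the whole loop is not optimized away.  */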
  asm volatile ("" : : "r" (r));
}
$ for i in "./cc1 -quiet" "gcc -S"; do
>   for j in 1 2 3; do
>     for k in "" -DAa; do
>       eval $i -O3 pr105135.c
>       gcc -Dto_lower=to_lower_$j $k -O3 -o pr105135{,.s} pr105135-2.c
>       echo $i $j $k
>       time ./pr105135
>     done
>   done
> done
./cc1 -quiet 1

real    0m1.230s
user    0m1.228s
sys     0m0.001s
./cc1 -quiet 1 -DAa

real    0m1.706s
user    0m1.703s
sys     0m0.001s
./cc1 -quiet 2

real    0m1.222s
user    0m1.221s
sys     0m0.000s
./cc1 -quiet 2 -DAa

real    0m1.686s
user    0m1.683s
sys     0m0.001s
./cc1 -quiet 3

real    0m1.232s
user    0m1.230s
sys     0m0.000s
./cc1 -quiet 3 -DAa

real    0m1.450s
user    0m1.447s
sys     0m0.001s
gcc -S 1

real    0m1.232s
user    0m1.229s
sys     0m0.001s
gcc -S 1 -DAa

real    0m1.391s
user    0m1.389s
sys     0m0.001s
gcc -S 2

real    0m1.233s
user    0m1.230s
sys     0m0.001s
gcc -S 2 -DAa

real    0m1.398s
user    0m1.397s
sys     0m0.000s
gcc -S 3

real    0m1.232s
user    0m1.229s
sys     0m0.001s
gcc -S 3 -DAa

real    0m1.430s
user    0m1.428s
sys     0m0.000s
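In summary (real times from the runs above):

                     constant 'A'    alternating 'A'/'a' (-DAa)
./cc1 -quiet  1      1.230s          1.706s
./cc1 -quiet  2      1.222s          1.686s
./cc1 -quiet  3      1.232s          1.450s
gcc -S        1      1.232s          1.391s
gcc -S        2      1.233s          1.398s
gcc -S        3      1.232s          1.430s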
Here gcc is GCC 10.x and ./cc1 is current trunk.
For the constant 'A' case it is essentially a wash, but with alternating
'A'/'a' cmov is better.
For the first two functions clang emits code very similar to gcc's; the only
difference is that the left shift and the addition are performed with 8-bit
rather than 32-bit instructions:
        leal    -65(%rdi), %eax
        cmpb    $26, %al
        setb    %al
        shlb    $5, %al
        addb    %dil, %al
and that seems to perform better.
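(In C, the branchless lowering both compilers use corresponds roughly to the
following sketch -- my reconstruction from the assembly above, not code from
the PR:)

__attribute__((noipa)) char
to_lower_sketch (const char c)
{
  /* lea -65 is c - 'A'; cmp $26 + setb is the unsigned range check
     covering 'A'..'Z'; shl $5 is * 32; add is the final c + ....  */
  unsigned char in_range = (unsigned char) (c - 'A') < 26;
  return c + (in_range << 5);
}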
I have tried to use
        leal    -65(%rdi), %ecx
        xorl    %eax, %eax
        cmpb    $25, %cl
        setbe   %al
        sall    $5, %eax
        addl    %edi, %eax
to perform manually what our peephole2 tries to do for setXX instructions but
in this case fails to do, because %eax is live in the comparison before it.
That helped a little, but not as much as the 8-bit instructions do.
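(A hypothetical harness to pin that exact sequence down for measurement -- the
wrapper and its name are my construction, not from the PR -- is extended asm;
the xorl pre-zeroes the destination so setbe's byte-only write has no
dependency on the register's previous value, which is what the peephole2 would
normally arrange:)

__attribute__((noipa)) char
to_lower_setbe (const char c)
{
  int r;
  __asm__ ("leal -65(%k1), %%ecx\n\t"
           "xorl %k0, %k0\n\t"
           "cmpb $25, %%cl\n\t"
           "setbe %b0\n\t"
           "sall $5, %k0\n\t"
           "addl %k1, %k0"
           : "=&q" (r)   /* early clobber: written before %1's last use */
           : "q" (c)
           : "ecx", "cc");
  return r;
}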
But when I disable the
 ;; Avoid redundant prefixes by splitting HImode arithmetic to SImode.
 ;; Do not split instructions with mask registers.
 (define_split
   [(set (match_operand 0 "general_reg_operand")
        (match_operator 3 "promotable_binary_operator"
           [(match_operand 1 "general_reg_operand")
            (match_operand 2 "aligned_operand")]))
    (clobber (reg:CC FLAGS_REG))]
   "! TARGET_PARTIAL_REG_STALL && reload_completed
    && ((GET_MODE (operands[0]) == HImode
        && ((optimize_function_for_speed_p (cfun) && !TARGET_FAST_PREFIX)
             /* ??? next two lines just !satisfies_constraint_K (...) */
            || !CONST_INT_P (operands[2])
            || satisfies_constraint_K (operands[2])))
        || (GET_MODE (operands[0]) == QImode
-          && (TARGET_PROMOTE_QImode || optimize_function_for_size_p (cfun))))"
+          && (0 || optimize_function_for_size_p (cfun))))"
   [(parallel [(set (match_dup 0)
                   (match_op_dup 3 [(match_dup 1) (match_dup 2)]))
              (clobber (reg:CC FLAGS_REG))])]
 {
   operands[0] = gen_lowpart (SImode, operands[0]);
   operands[1] = gen_lowpart (SImode, operands[1]);
   if (GET_CODE (operands[3]) != ASHIFT)
     operands[2] = gen_lowpart (SImode, operands[2]);
   operands[3] = shallow_copy_rtx (operands[3]);
   PUT_MODE (operands[3], SImode);
 })
splitter, so that the generated code is basically the same as clang's, it is
still slower than the clang version, which is just weird.
Anyway, the GIMPLE optimization is IMNSHO sound; it is all about how exactly
the backend handles it.
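(Restated in C -- my paraphrase, not GIMPLE dump output: all three spellings
are the same function, so canonicalizing them to one form at the GIMPLE level
is correct, and the differences measured above come entirely from which
expansion the backend then picks -- setcc+shift+add, cmov, or a branch:)

/* The one function behind to_lower_1/to_lower_2/to_lower_3; which of the
   three spellings the user wrote should not matter, only the backend's
   expansion strategy does.  */
char
to_lower_semantics (const char c)
{
  return (c >= 'A' && c <= 'Z') ? c + 32 : c;
}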