Hi Uros, Many thanks for the review, feedback and suggestions. Here's a revised patch incorporating all of the requested changes. Bootstrapped and regression tested on x86_64-pc-linux-gnu, both -m64 and -m32, with no new failures. Ok for mainline?
2022-04-21 Roger Sayle <ro...@nextmovesoftware.com> Uroš Bizjak <ubiz...@gmail.com> gcc/ChangeLog PR target/105135 * config/i386/i386.md (*xor_cmov<mode>): Transform setcc, negate then and into mov $0, followed by a cmov. (*lea_cmov<mode>): Transform setcc, ashift const then plus into lea followed by cmov. gcc/testsuite/ChangeLog PR target/105135 * gcc.target/i386/cmov10.c: New test case. * gcc.target/i386/cmov11.c: New test case. * gcc.target/i386/pr105135.c: New test case. Cheers, Roger -- > -----Original Message----- > From: Uros Bizjak <ubiz...@gmail.com> > Sent: 20 April 2022 08:41 > To: Roger Sayle <ro...@nextmovesoftware.com> > Cc: gcc-patches@gcc.gnu.org > Subject: Re: [x86_64 PATCH] PR middle-end/105135: Catch more cmov idioms in > combine. > > On Tue, Apr 19, 2022 at 1:58 PM Roger Sayle <ro...@nextmovesoftware.com> > wrote: > > > > > > This patch addresses PR middle-end/105135, a missed-optimization > > regression affecting mainline. I agree with Jakub's comment that the > > middle-end optimizations are sound, reducing basic blocks and > > conditional expressions at the tree-level, but requiring backend's to > > recognize conditional move instructions/idioms if/when beneficial. > > This patch introduces two new define_insn_and_split in i386.md to recognize > two additional cmove idioms. > > > > The first recognizes (PR105135's): > > > > int foo(int x, int y, int z) > > { > > return ((x < y) << 5) + z; > > } > > > > and transforms (the 6 insns, 13 bytes): > > > > xorl %eax, %eax ;; 2 bytes > > cmpl %esi, %edi ;; 2 bytes > > setl %al ;; 3 bytes > > sall $5, %eax ;; 3 bytes > > addl %edx, %eax ;; 2 bytes > > ret ;; 1 byte > > > > into (the 4 insns, 9 bytes): > > > > cmpl %esi, %edi ;; 2 bytes > > leal 32(%rdx), %eax ;; 3 bytes > > cmovge %edx, %eax ;; 3 bytes > > ret ;; 1 byte > > > > > > The second catches the very closely related (from PR 98865): > > > > int bar(int x, int y, int z) > > { > > return -(x < y) & z; > > } > > > > and transforms the (6 insns, 12 bytes): > > xorl %eax, %eax ;; 2 bytes > > cmpl %esi, %edi ;; 2 bytes > > setl %al ;; 3 bytes > > negl %eax ;; 2 bytes > > andl %edx, %eax ;; 2 bytes > > ret ;; 1 byte > > > > into (4 insns, 8 bytes): > > xorl %eax, %eax ;; 2 bytes > > cmpl %esi, %edi ;; 2 bytes > > cmovl %edx, %eax ;; 3 bytes > > ret ;; 1 byte > > > > They both have in common that they recognize a setcc followed by two > > instructions, and replace them with one instruction and a cmov, which > > is typically a performance win, but always a size win. Fine tuning > > these decisions based on microarchitecture is much easier in the > > backend, than the middle-end. > > > > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap > > and make -k check, both with and without --target_board=unix{-m32}, > > with no new failures. Ok for mainline? > > > > > > 2022-04-19 Roger Sayle <ro...@nextmovesoftware.com> > > > > gcc/ChangeLog > > PR target/105135 > > * config/i386/i386.md (*xor_cmov<mode>): Transform setcc, negate > > then and into mov $0, followed by a cmov. > > (*lea_cmov<mode>): Transform setcc, ashift const then plus into > > lea followed by cmov. > > > > gcc/testsuite/ChangeLog > > PR target/105135 > > * gcc.target/i386/cmov10.c: New test case. > > * gcc.target/i386/cmov11.c: New test case. > > * gcc.target/i386/pr105135.c: New test case. > > > > > > Thanks in advance, > > Roger > > > +;; Transform setcc;negate;and into mov_zero;cmov (define_insn_and_split > +"*xor_cmov<mode>" > + [(set (match_operand:SWI248 0 "register_operand") > + (and:SWI248 > + (neg:SWI248 (match_operator:SWI248 1 "ix86_comparison_operator" > + [(match_operand 2 "flags_reg_operand") > + (const_int 0)])) > + (match_operand:SWI248 3 "register_operand"))) > + (clobber (reg:CC FLAGS_REG))] > + "TARGET_CMOVE && can_create_pseudo_p ()" > > Please use ix86_pre_reload_split instead of can_create_pseudo_p () here. > > + "#" > + "&& 1" > + [(set (match_dup 4) (const_int 0)) > + (set (match_dup 0) > + (if_then_else:SWI248 (match_op_dup 1 [(match_dup 2) (const_int 0)]) > + (match_dup 3) (match_dup 4)))] { > + operands[4] = gen_reg_rtx (<MODE>mode); > +}) > > Single line preparation statements should use double quotes instead of curly > braces. See many examples in i386 .md files. > > +;; Transform setcc;ashift_const;plus into lea_const;cmov > +(define_insn_and_split "*lea_cmov<mode>" > + [(set (match_operand:SWI 0 "register_operand") > + (plus:SWI (ashift:SWI (match_operator:SWI 1 "ix86_comparison_operator" > + [(match_operand 2 "flags_reg_operand") > + (const_int 0)]) > + (match_operand:SWI 3 "const_int_operand")) > + (match_operand:SWI 4 "register_operand"))) > + (clobber (reg:CC FLAGS_REG))] > + "TARGET_CMOVE && can_create_pseudo_p ()" > > Same here, ix86_pre_reload_split should be used for define_insn_and_split > (FYI, > can_create_pseudo_p is still good for define_split where no instruction is > defined). > > + "#" > + "&& 1" > + [(set (match_dup 5) (plus:<LEAMODE> (match_dup 4) (match_dup 6))) > + (set (match_dup 0) > + (if_then_else:<LEAMODE> (match_op_dup 1 [(match_dup 2) (const_int 0)]) > + (match_dup 5) (match_dup 4)))] { > + operands[5] = gen_reg_rtx (<LEAMODE>mode); > + operands[6] = GEN_INT (1 << INTVAL (operands[3])); > + if (<MODE>mode != <LEAMODE>mode) > + { > + operands[0] = gen_lowpart (<LEAMODE>mode, operands[0]); > + operands[4] = gen_lowpart (<LEAMODE>mode, operands[4]); > > gen_lowpart is dangerous to use before reload. It can choke when integer mode > SUBREG of e.g. FP mode register is passed here. So you have to either > guarantee > there are no unsupported subregs (but please note that the compiler is > extremely creative in this area) or you have to force register to a pseudo > (which > can possibly defeat your optimization by generating unwanted moves). > > Uros. > > + } > +}) > >
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index c74edd1..e82f037 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -20751,6 +20751,54 @@ operands[9] = replace_rtx (operands[6], operands[0], operands[1], true); }) +;; Transform setcc;negate;and into mov_zero;cmov +(define_insn_and_split "*xor_cmov<mode>" + [(set (match_operand:SWI248 0 "register_operand") + (and:SWI248 + (neg:SWI248 (match_operator:SWI248 1 "ix86_comparison_operator" + [(match_operand 2 "flags_reg_operand") + (const_int 0)])) + (match_operand:SWI248 3 "register_operand"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_CMOVE && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 4) (const_int 0)) + (set (match_dup 0) + (if_then_else:SWI248 (match_op_dup 1 [(match_dup 2) (const_int 0)]) + (match_dup 3) (match_dup 4)))] + "operands[4] = gen_reg_rtx (<MODE>mode);") + +;; Transform setcc;ashift_const;plus into lea_const;cmov +(define_insn_and_split "*lea_cmov<mode>" + [(set (match_operand:SWI 0 "register_operand") + (plus:SWI (ashift:SWI (match_operator:SWI 1 "ix86_comparison_operator" + [(match_operand 2 "flags_reg_operand") + (const_int 0)]) + (match_operand:SWI 3 "const_int_operand")) + (match_operand:SWI 4 "register_operand"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_CMOVE + && ix86_pre_reload_split () + && (!SUBREG_P (operands[0]) + || SCALAR_INT_MODE_P (GET_MODE (SUBREG_REG (operands[0]))))" + "#" + "&& 1" + [(set (match_dup 5) (plus:<LEAMODE> (match_dup 4) (match_dup 6))) + (set (match_dup 0) + (if_then_else:<LEAMODE> (match_op_dup 1 [(match_dup 2) (const_int 0)]) + (match_dup 5) (match_dup 4)))] +{ + operands[5] = gen_reg_rtx (<LEAMODE>mode); + operands[6] = GEN_INT (1 << INTVAL (operands[3])); + if (<MODE>mode != <LEAMODE>mode) + { + operands[4] = force_reg (<MODE>mode, operands[4]); + operands[0] = gen_lowpart (<LEAMODE>mode, operands[0]); + operands[4] = gen_lowpart (<LEAMODE>mode, operands[4]); + } +}) + (define_insn "movhf_mask" [(set (match_operand:HF 0 "nonimmediate_operand" "=v,m,v") (unspec:HF diff --git a/gcc/testsuite/gcc.target/i386/cmov10.c b/gcc/testsuite/gcc.target/i386/cmov10.c new file mode 100644 index 0000000..c04fdd8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cmov10.c @@ -0,0 +1,9 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2" } */ + +int foo(int x, int y, int z) +{ + return ((x < y) << 5) + z; +} + +/* { dg-final { scan-assembler "cmovge" } } */ diff --git a/gcc/testsuite/gcc.target/i386/cmov11.c b/gcc/testsuite/gcc.target/i386/cmov11.c new file mode 100644 index 0000000..65f2bfc --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/cmov11.c @@ -0,0 +1,9 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2" } */ + +int foo(int x, int y, int z) +{ + return -(x < y) & z; +} + +/* { dg-final { scan-assembler "cmovl" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr105135.c b/gcc/testsuite/gcc.target/i386/pr105135.c new file mode 100644 index 0000000..3ed3c9e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr105135.c @@ -0,0 +1,16 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2" } */ + +char to_lower_1(const char c) { return c + ((c >= 'A' && c <= 'Z') * 32); } + +char to_lower_2(const char c) { return c + (((c >= 'A') & (c <= 'Z')) * 32); } + +char to_lower_3(const char c) { + if (c >= 'A' && c <= 'Z') { + return c + 32; + } + return c; +} + +/* { dg-final { scan-assembler-not "setbe" } } */ +/* { dg-final { scan-assembler-not "sall" } } */