Hi Uros,

Many thanks for the review, feedback and suggestions.
Here's a revised patch incorporating all of the requested
changes.  Bootstrapped  and regression tested on
x86_64-pc-linux-gnu, both -m64 and -m32, with no
new failures.  Ok for mainline?


2022-04-21  Roger Sayle  <ro...@nextmovesoftware.com>
            Uroš Bizjak  <ubiz...@gmail.com>

gcc/ChangeLog
        PR target/105135
        * config/i386/i386.md (*xor_cmov<mode>): Transform setcc, negate
        then and into mov $0, followed by a cmov.
        (*lea_cmov<mode>): Transform setcc, ashift const then plus into
        lea followed by cmov.

gcc/testsuite/ChangeLog
        PR target/105135
        * gcc.target/i386/cmov10.c: New test case.
        * gcc.target/i386/cmov11.c: New test case.
        * gcc.target/i386/pr105135.c: New test case.

Cheers,
Roger
--

> -----Original Message-----
> From: Uros Bizjak <ubiz...@gmail.com>
> Sent: 20 April 2022 08:41
> To: Roger Sayle <ro...@nextmovesoftware.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [x86_64 PATCH] PR middle-end/105135: Catch more cmov idioms in
> combine.
> 
> On Tue, Apr 19, 2022 at 1:58 PM Roger Sayle <ro...@nextmovesoftware.com>
> wrote:
> >
> >
> > This patch addresses PR middle-end/105135, a missed-optimization
> > regression affecting mainline.  I agree with Jakub's comment that the
> > middle-end optimizations are sound, reducing basic blocks and
> > conditional expressions at the tree-level, but requiring backend's to
> > recognize conditional move instructions/idioms if/when beneficial.
> > This patch introduces two new define_insn_and_split in i386.md to recognize
> two additional cmove idioms.
> >
> > The first recognizes (PR105135's):
> >
> > int foo(int x, int y, int z)
> > {
> >   return ((x < y) << 5) + z;
> > }
> >
> > and transforms (the 6 insns, 13 bytes):
> >
> >         xorl    %eax, %eax      ;; 2 bytes
> >         cmpl    %esi, %edi      ;; 2 bytes
> >         setl    %al             ;; 3 bytes
> >         sall    $5, %eax        ;; 3 bytes
> >         addl    %edx, %eax      ;; 2 bytes
> >         ret                     ;; 1 byte
> >
> > into (the 4 insns, 9 bytes):
> >
> >         cmpl    %esi, %edi      ;; 2 bytes
> >         leal    32(%rdx), %eax  ;; 3 bytes
> >         cmovge  %edx, %eax      ;; 3 bytes
> >         ret                     ;; 1 byte
> >
> >
> > The second catches the very closely related (from PR 98865):
> >
> > int bar(int x, int y, int z)
> > {
> >   return -(x < y) & z;
> > }
> >
> > and transforms the (6 insns, 12 bytes):
> >         xorl    %eax, %eax      ;; 2 bytes
> >         cmpl    %esi, %edi      ;; 2 bytes
> >         setl    %al             ;; 3 bytes
> >         negl    %eax            ;; 2 bytes
> >         andl    %edx, %eax      ;; 2 bytes
> >         ret                     ;; 1 byte
> >
> > into (4 insns, 8 bytes):
> >         xorl    %eax, %eax      ;; 2 bytes
> >         cmpl    %esi, %edi      ;; 2 bytes
> >         cmovl   %edx, %eax      ;; 3 bytes
> >         ret                     ;; 1 byte
> >
> > They both have in common that they recognize a setcc followed by two
> > instructions, and replace them with one instruction and a cmov, which
> > is typically a performance win, but always a size win.  Fine tuning
> > these decisions based on microarchitecture is much easier in the
> > backend, than the middle-end.
> >
> > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > and make -k check, both with and without --target_board=unix{-m32},
> > with no new failures.  Ok for mainline?
> >
> >
> > 2022-04-19  Roger Sayle  <ro...@nextmovesoftware.com>
> >
> > gcc/ChangeLog
> >         PR target/105135
> >         * config/i386/i386.md (*xor_cmov<mode>): Transform setcc, negate
> >         then and into mov $0, followed by a cmov.
> >         (*lea_cmov<mode>): Transform setcc, ashift const then plus into
> >         lea followed by cmov.
> >
> > gcc/testsuite/ChangeLog
> >         PR target/105135
> >         * gcc.target/i386/cmov10.c: New test case.
> >         * gcc.target/i386/cmov11.c: New test case.
> >         * gcc.target/i386/pr105135.c: New test case.
> >
> >
> > Thanks in advance,
> > Roger
> 
> 
> +;; Transform setcc;negate;and into mov_zero;cmov (define_insn_and_split
> +"*xor_cmov<mode>"
> +  [(set (match_operand:SWI248 0 "register_operand")
> +    (and:SWI248
> +      (neg:SWI248 (match_operator:SWI248 1 "ix86_comparison_operator"
> +            [(match_operand 2 "flags_reg_operand")
> +             (const_int 0)]))
> +      (match_operand:SWI248 3 "register_operand")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_CMOVE && can_create_pseudo_p ()"
> 
> Please use ix86_pre_reload_split instead of can_create_pseudo_p () here.
> 
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 4) (const_int 0))
> +   (set (match_dup 0)
> +    (if_then_else:SWI248 (match_op_dup 1 [(match_dup 2) (const_int 0)])
> +                 (match_dup 3) (match_dup 4)))] {
> +  operands[4] = gen_reg_rtx (<MODE>mode);
> +})
> 
> Single line preparation statements should use double quotes instead of curly
> braces. See many examples in i386 .md files.
> 
> +;; Transform setcc;ashift_const;plus into lea_const;cmov
> +(define_insn_and_split "*lea_cmov<mode>"
> +  [(set (match_operand:SWI 0 "register_operand")
> +    (plus:SWI (ashift:SWI (match_operator:SWI 1 "ix86_comparison_operator"
> +                [(match_operand 2 "flags_reg_operand")
> +                 (const_int 0)])
> +                  (match_operand:SWI 3 "const_int_operand"))
> +          (match_operand:SWI 4 "register_operand")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_CMOVE && can_create_pseudo_p ()"
> 
> Same here, ix86_pre_reload_split should be used for define_insn_and_split 
> (FYI,
> can_create_pseudo_p is still good for define_split where no instruction is
> defined).
> 
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 5) (plus:<LEAMODE> (match_dup 4) (match_dup 6)))
> +   (set (match_dup 0)
> +    (if_then_else:<LEAMODE> (match_op_dup 1 [(match_dup 2) (const_int 0)])
> +                (match_dup 5) (match_dup 4)))] {
> +  operands[5] = gen_reg_rtx (<LEAMODE>mode);
> +  operands[6] = GEN_INT (1 << INTVAL (operands[3]));
> +  if (<MODE>mode != <LEAMODE>mode)
> +    {
> +      operands[0] = gen_lowpart (<LEAMODE>mode, operands[0]);
> +      operands[4] = gen_lowpart (<LEAMODE>mode, operands[4]);
> 
> gen_lowpart is dangerous to use before reload. It can choke when integer mode
> SUBREG of e.g. FP mode register is passed here. So you have to either 
> guarantee
> there are no unsupported subregs (but please note that the compiler is
> extremely creative in this area) or you have to force register to a pseudo 
> (which
> can possibly defeat your optimization by generating unwanted moves).
> 
> Uros.
> 
> +    }
> +})
> >
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index c74edd1..e82f037 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -20751,6 +20751,54 @@
   operands[9] = replace_rtx (operands[6], operands[0], operands[1], true);
 })
 
+;; Transform setcc;negate;and into mov_zero;cmov
+(define_insn_and_split "*xor_cmov<mode>"
+  [(set (match_operand:SWI248 0 "register_operand")
+       (and:SWI248
+         (neg:SWI248 (match_operator:SWI248 1 "ix86_comparison_operator"
+                       [(match_operand 2 "flags_reg_operand")
+                        (const_int 0)]))
+         (match_operand:SWI248 3 "register_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_CMOVE && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 4) (const_int 0))
+   (set (match_dup 0)
+       (if_then_else:SWI248 (match_op_dup 1 [(match_dup 2) (const_int 0)])
+                            (match_dup 3) (match_dup 4)))]
+  "operands[4] = gen_reg_rtx (<MODE>mode);")
+
+;; Transform setcc;ashift_const;plus into lea_const;cmov
+(define_insn_and_split "*lea_cmov<mode>"
+  [(set (match_operand:SWI 0 "register_operand")
+       (plus:SWI (ashift:SWI (match_operator:SWI 1 "ix86_comparison_operator"
+                               [(match_operand 2 "flags_reg_operand")
+                                (const_int 0)])
+                             (match_operand:SWI 3 "const_int_operand"))
+                 (match_operand:SWI 4 "register_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_CMOVE
+   && ix86_pre_reload_split ()
+   && (!SUBREG_P (operands[0])
+       || SCALAR_INT_MODE_P (GET_MODE (SUBREG_REG (operands[0]))))"
+  "#"
+  "&& 1"
+  [(set (match_dup 5) (plus:<LEAMODE> (match_dup 4) (match_dup 6)))
+   (set (match_dup 0)
+       (if_then_else:<LEAMODE> (match_op_dup 1 [(match_dup 2) (const_int 0)])
+                               (match_dup 5) (match_dup 4)))]
+{
+  operands[5] = gen_reg_rtx (<LEAMODE>mode);
+  operands[6] = GEN_INT (1 << INTVAL (operands[3]));
+  if (<MODE>mode != <LEAMODE>mode)
+    {
+      operands[4] = force_reg (<MODE>mode, operands[4]);
+      operands[0] = gen_lowpart (<LEAMODE>mode, operands[0]);
+      operands[4] = gen_lowpart (<LEAMODE>mode, operands[4]);
+    }
+})
+
 (define_insn "movhf_mask"
   [(set (match_operand:HF 0 "nonimmediate_operand" "=v,m,v")
        (unspec:HF
diff --git a/gcc/testsuite/gcc.target/i386/cmov10.c 
b/gcc/testsuite/gcc.target/i386/cmov10.c
new file mode 100644
index 0000000..c04fdd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/cmov10.c
@@ -0,0 +1,9 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+
+int foo(int x, int y, int z)
+{
+  return ((x < y) << 5) + z;
+}
+
+/* { dg-final { scan-assembler "cmovge" } } */
diff --git a/gcc/testsuite/gcc.target/i386/cmov11.c 
b/gcc/testsuite/gcc.target/i386/cmov11.c
new file mode 100644
index 0000000..65f2bfc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/cmov11.c
@@ -0,0 +1,9 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+
+int foo(int x, int y, int z)
+{
+  return -(x < y) & z;
+}
+
+/* { dg-final { scan-assembler "cmovl" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr105135.c 
b/gcc/testsuite/gcc.target/i386/pr105135.c
new file mode 100644
index 0000000..3ed3c9e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105135.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+
+char to_lower_1(const char c) { return c + ((c >= 'A' && c <= 'Z') * 32); }
+
+char to_lower_2(const char c) { return c + (((c >= 'A') & (c <= 'Z')) * 32); }
+
+char to_lower_3(const char c) {
+    if (c >= 'A' && c <= 'Z') {
+        return c + 32;
+    }
+    return c;
+}
+
+/* { dg-final { scan-assembler-not "setbe" } } */
+/* { dg-final { scan-assembler-not "sall" } } */

Reply via email to