From: Philipp Tomsich <p...@gnu.org>

The function
    long f(long a)
    {
        return (a & 0xFFFFFFFFull) << 3;
    }
is folded into
    _1 = a_2(D) << 3;
    _3 = _1 & 34359738360;
whereas the construction
    return (a & 0xFFFFFFFFull) * 8;
results in
    _1 = a_2(D) & 4294967295;
    _3 = _1 * 8;

This leads to suboptimal code-generation for RISC-V (march=rv64g), as
the shifted constant needs to be expanded into 3 RTX and 2 RTX (one
each for the LSHIFT_EXPR and the BIT_AND_EXPR) which will overwhelm
the combine pass (sequences of 5 RTX are not considered):
        li      a5,1            # tmp78,        # 23    [c=4 l=4]  *movdi_64bit/1
        slli    a5,a5,35        #, tmp79, tmp78 # 24    [c=4 l=4]  ashldi3
        addi    a5,a5,-8        #, tmp77, tmp79 # 9     [c=4 l=4]  adddi3/1
        slli    a0,a0,3         #, tmp76, tmp80 # 6     [c=4 l=4]  ashldi3
        and     a0,a0,a5        # tmp77,, tmp76 # 15    [c=4 l=4]  anddi3/0
        ret                     # 28    [c=0 l=4]  simple_return
instead of:
        slli    a0,a0,32        #, tmp76, tmp79 # 26    [c=4 l=4]  ashldi3
        srli    a0,a0,29        #,, tmp76       # 27    [c=4 l=4]  lshrdi3
        ret                                     # 24    [c=0 l=4]  simple_return

We address this by adding a simplification for
   (a << s) & M, where ((M >> s) << s) == M
to
   (a & M_unshifted) << s, where M_unshifted := (M >> s)
which undistributes the LSHIFT.

Signed-off-by: Philipp Tomsich <p...@gnu.org>
---
 gcc/match.pd                            | 11 +++++++++--
 gcc/testsuite/gcc.target/riscv/zextws.c | 18 ++++++++++++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/zextws.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 349eab6..6bb9535 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3079,6 +3079,12 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
             }
         }
      }
+    (if (GIMPLE && (((mask >> shiftc) << shiftc) == mask)
+               && (exact_log2((mask >> shiftc) + 1) >= 0)
+               && (shift == LSHIFT_EXPR))
+        (with
+         { tree newmaskt = build_int_cst_type(TREE_TYPE (@2), mask >> shiftc); }
+         (shift (convert (bit_and:shift_type (convert @0) { newmaskt; })) @1))
      /* ((X << 16) & 0xff00) is (X, 0).  */
      (if ((mask & zerobits) == mask)
       { build_int_cst (type, 0); }
@@ -3100,7 +3106,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
           (if (!tree_int_cst_equal (newmaskt, @2))
            (if (shift_type != TREE_TYPE (@3))
            (bit_and (convert (shift:shift_type (convert @3) @1)) { newmaskt; })
-            (bit_and @4 { newmaskt; })))))))))))))
+            (bit_and @4 { newmaskt; }))))))))))))))
 
 /* Fold (X {&,^,|} C2) << C1 into (X << C1) {&,^,|} (C2 << C1)
    (X {&,^,|} C2) >> C1 into (X >> C1) & (C2 >> C1).  */
@@ -3108,7 +3114,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (for bit_op (bit_and bit_xor bit_ior)
   (simplify
    (shift (convert?:s (bit_op:s @0 INTEGER_CST@2)) INTEGER_CST@1)
-   (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+   (if (tree_nop_conversion_p (type, TREE_TYPE (@0))
+        && !wi::exact_log2(wi::to_wide(@2) + 1))
     (with { tree mask = int_const_binop (shift, fold_convert (type, @2), @1); }
      (bit_op (shift (convert @0) @1) { mask; }))))))
 
diff --git a/gcc/testsuite/gcc.target/riscv/zextws.c b/gcc/testsuite/gcc.target/riscv/zextws.c
new file mode 100644
index 0000000..8ac93f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zextws.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64g -mabi=lp64 -O2" } */
+
+/* Test for
+     (a << s) & M, where ((M >> s) << s) == M
+   being undistributed into
+     (a & M_unshifted) << s, where M_unshifted := (M >> s)
+   to produce the sequence (or similar)
+     slli      a0,a0,32
+     srli      a0,a0,29
+*/
+long
+zextws_mask (long i)
+{
+  return (i & 0xffffffffULL) << 3;
+}
+/* { dg-final { scan-assembler "slli" } } */
+/* { dg-final { scan-assembler "srli" } } */
-- 
1.8.3.1

Reply via email to