This patch adds some more optimisations to the wi:: comparison functions.
It uses the:

  #define CONSTANT(X) (__builtin_constant_p (X) && (X))

idiom that was mentioned before, except that I thought CONSTANT would be
too easily confused with CONSTANT_P, so I went for CAN_TELL instead.
Better names welcome.
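For anyone who hasn't seen the idiom before, here is a minimal standalone
sketch of how it behaves; the helper name is made up and nothing below is
part of the patch.  __builtin_constant_p (X) is nonzero only when the
compiler can prove that X is a compile-time constant, so the guard either
folds away completely or selects the fast path without adding a run-time
test:

  /* Standalone sketch only; example_eq_p is a made-up name.  */
  #define CAN_TELL(X) (__builtin_constant_p (X) && (X))

  static inline bool
  example_eq_p (long x, long y)
  {
    /* Folds to true only for calls like example_eq_p (x, 0), in which
       case the whole function reduces to a single test against zero.  */
    if (CAN_TELL (y == 0))
      return x == 0;
    /* General path for run-time values of y.  */
    return x == y;
  }

Either branch computes the same result, so the guard only ever changes
which code the compiler generates, never the semantics.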
The changes are:

- Add a fast path to eq_p for when one of the inputs isn't sign-extended.
  This includes code to handle compile-time 0 specially.

- Add the opposite optimisation to Mike's lts_p change, if we can tell
  at compile time that it applies.

- Add fast paths to ltu_p for constants.

E.g.:

bool
f1 (const_tree x)
{
  return wi::eq_p (x, 0);
}

now gives:

        xorl    %eax, %eax
        cmpw    $1, 4(%rdi)
        je      .L5
        rep ret
        .p2align 4,,10
        .p2align 3
.L5:
        cmpq    $0, 16(%rdi)
        sete    %al
        ret

bool
f2 (const_tree x, HOST_WIDE_INT y)
{
  return wi::eq_p (x, y);
}

gives:

        movq    8(%rdi), %rax
        movzwl  52(%rax), %edx
        xorl    %eax, %eax
        andw    $1023, %dx
        cmpw    $1, 4(%rdi)
        je      .L10
        rep ret
        .p2align 4,,10
        .p2align 3
.L10:
        xorq    16(%rdi), %rsi
        movzwl  %dx, %edx
        movl    $64, %ecx
        subl    %edx, %ecx
        movq    %rsi, %rax
        salq    %cl, %rax
        testl   %ecx, %ecx
        cmovg   %rax, %rsi
        testq   %rsi, %rsi
        sete    %al
        ret

bool
f3 (HOST_WIDE_INT x, const_tree y)
{
  return wi::lts_p (x, y);
}

is similarly ugly because of the way that it ignores TYPE_SIGN and so has
to explicitly sign-extend "small-prec" cases:

        movq    8(%rsi), %rax
        movzwl  4(%rsi), %ecx
        movzwl  52(%rax), %edx
        andl    $1023, %edx
        cmpl    $1, %ecx
        je      .L16
        leal    -1(%rcx), %eax
        sall    $6, %ecx
        subl    %edx, %ecx
        movq    16(%rsi,%rax,8), %rax
        movq    %rax, %rdx
        salq    %cl, %rdx
        testl   %ecx, %ecx
        cmovg   %rdx, %rax
        sarq    $63, %rax
        addl    $1, %eax
        ret
        .p2align 4,,10
        .p2align 3
.L16:
        cmpl    $63, %edx
        movq    16(%rsi), %rax
        ja      .L13
        movb    $64, %cl
        subl    %edx, %ecx
        salq    %cl, %rax
        sarq    %cl, %rax
.L13:
        cmpq    %rdi, %rax
        setg    %al
        ret

but:

bool
f4 (HOST_WIDE_INT x, const_tree y)
{
  return wi::lts_p (x, wi::to_widest (y));
}

is a bit more respectable:

        movzwl  6(%rsi), %eax
        cmpl    $1, %eax
        je      .L20
        subl    $1, %eax
        movq    16(%rsi,%rax,8), %rax
        sarq    $63, %rax
        addl    $1, %eax
        ret
        .p2align 4,,10
        .p2align 3
.L20:
        cmpq    %rdi, 16(%rsi)
        setg    %al
        ret

For similar reasons:

bool
f5 (const_tree x)
{
  return wi::ltu_p (x, 100);
}

gives:

        movq    8(%rdi), %rax
        movzwl  52(%rax), %ecx
        xorl    %eax, %eax
        andw    $1023, %cx
        cmpw    $1, 4(%rdi)
        je      .L26
        rep ret
        .p2align 4,,10
        .p2align 3
.L26:
        cmpw    $63, %cx
        ja      .L23
        movl    $1, %eax
        salq    %cl, %rax
        subq    $1, %rax
        andq    16(%rdi), %rax
.L24:
        cmpq    $99, %rax
        setbe   %al
        ret
        .p2align 4,,10
        .p2align 3
.L23:
        movq    16(%rdi), %rax
        jmp     .L24

but:

bool
f6 (const_tree x)
{
  return wi::ltu_p (wi::to_widest (x), 100);
}

gives:

        xorl    %eax, %eax
        cmpw    $1, 6(%rdi)
        je      .L30
        rep ret
        .p2align 4,,10
        .p2align 3
.L30:
        cmpq    $99, 16(%rdi)
        setbe   %al
        ret

Tested on powerpc64-linux-gnu and x86_64-linux-gnu.  OK for wide-int?

Thanks,
Richard
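As an aside, the least obvious part of the eq_p change below is the flush
of excess bits for values that are narrower than a HWI.  A standalone
sketch of just that step, outside the wide-int classes and with a made-up
function name, would look something like this (it assumes GCC's
HOST_WIDE_INT definitions from hwint.h):

  /* Illustrative sketch only, not part of the patch.  X and Y are stored
     in full HWIs but are conceptually only PRECISION bits wide, so the
     bits above PRECISION are not significant and must be shifted out
     before the equality test.  */
  static inline bool
  narrow_hwi_eq_p (unsigned HOST_WIDE_INT x, unsigned HOST_WIDE_INT y,
                   unsigned int precision)
  {
    unsigned HOST_WIDE_INT diff = x ^ y;
    int excess = HOST_BITS_PER_WIDE_INT - (int) precision;
    if (excess > 0)
      /* E.g. for precision 16, only the low 16 bits of DIFF are kept.  */
      diff <<= excess;
    return diff == 0;
  }

The new wi::eq_p code does the same thing, except that it first
short-circuits the CAN_TELL (yi.val[0] == 0) case, where no flushing is
needed because the excess bits are already signs or zeros.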
Index: gcc/system.h
===================================================================
--- gcc/system.h	2013-10-27 14:25:19.144723977 +0000
+++ gcc/system.h	2013-10-27 14:25:20.716738045 +0000
@@ -711,6 +711,12 @@ #define gcc_unreachable() __builtin_unre
 #define gcc_unreachable() (fancy_abort (__FILE__, __LINE__, __FUNCTION__))
 #endif
 
+#if GCC_VERSION >= 3001
+#define CAN_TELL(X) (__builtin_constant_p (X) && (X))
+#else
+#define CAN_TELL(X) (false && (X))
+#endif
+
 /* Until we can use STATIC_ASSERT.  */
 #define STATIC_ASSERT(X) \
   typedef int assertion1[(X) ? 1 : -1] ATTRIBUTE_UNUSED
Index: gcc/wide-int.h
===================================================================
--- gcc/wide-int.h	2013-10-27 14:25:19.144723977 +0000
+++ gcc/wide-int.h	2013-10-27 14:37:34.834443832 +0000
@@ -1495,6 +1495,7 @@ wi::eq_p (const T1 &x, const T2 &y)
   WIDE_INT_REF_FOR (T2) yi (y, precision);
   if (xi.is_sign_extended && yi.is_sign_extended)
     {
+      /* This case reduces to array equality.  */
       if (xi.len != yi.len)
 	return false;
       unsigned int i = 0;
@@ -1504,10 +1505,21 @@ wi::eq_p (const T1 &x, const T2 &y)
       while (++i != xi.len);
       return true;
     }
-  if (precision <= HOST_BITS_PER_WIDE_INT)
+  if (yi.len == 1)
     {
-      unsigned HOST_WIDE_INT diff = xi.ulow () ^ yi.ulow ();
-      return (diff << (-precision % HOST_BITS_PER_WIDE_INT)) == 0;
+      /* XI is only equal to YI if it too has a single HWI.  */
+      if (xi.len != 1)
+	return false;
+      /* Excess bits in xi.val[0] will be signs or zeros, so comparisons
+	 with 0 are simple.  */
+      if (CAN_TELL (yi.val[0] == 0))
+	return xi.val[0] == 0;
+      /* Otherwise flush out any excess bits first.  */
+      unsigned HOST_WIDE_INT diff = xi.val[0] ^ yi.val[0];
+      int excess = HOST_BITS_PER_WIDE_INT - precision;
+      if (excess > 0)
+	diff <<= excess;
+      return diff == 0;
     }
   return eq_p_large (xi.val, xi.len, yi.val, yi.len, precision);
 }
@@ -1528,20 +1540,28 @@ wi::lts_p (const T1 &x, const T2 &y)
   unsigned int precision = get_binary_precision (x, y);
   WIDE_INT_REF_FOR (T1) xi (x, precision);
   WIDE_INT_REF_FOR (T2) yi (y, precision);
-  // We optimize x < y, where y is 64 or fewer bits.
+  /* We optimize x < y, where y is 64 or fewer bits.  */
   if (wi::fits_shwi_p (yi))
     {
-      // If x fits directly into a shwi, we can compare directly.
+      /* Make lts_p (x, 0) as efficient as wi::neg_p (x).  */
+      if (CAN_TELL (yi.val[0] == 0))
+	return neg_p (xi);
+      /* If x fits directly into a shwi, we can compare directly.  */
       if (wi::fits_shwi_p (xi))
 	return xi.to_shwi () < yi.to_shwi ();
-      // If x doesn't fit and is negative, then it must be more
-      // negative than any value in y, and hence smaller than y.
-      if (neg_p (xi, SIGNED))
+      /* If x doesn't fit and is negative, then it must be more
+	 negative than any value in y, and hence smaller than y.  */
+      if (neg_p (xi))
 	return true;
-      // If x is positive, then it must be larger than any value in y,
-      // and hence greater than y.
+      /* If x is positive, then it must be larger than any value in y,
+	 and hence greater than y.  */
      return false;
     }
+  /* Optimize the opposite case, if it can be detected at compile time.  */
+  if (CAN_TELL (xi.len == 1))
+    /* If YI is negative it is lower than the least HWI.
+       If YI is positive it is greater than the greatest HWI.  */
+    return !neg_p (yi);
   return lts_p_large (xi.val, xi.len, precision, yi.val, yi.len);
 }
 
@@ -1553,6 +1573,12 @@ wi::ltu_p (const T1 &x, const T2 &y)
   unsigned int precision = get_binary_precision (x, y);
   WIDE_INT_REF_FOR (T1) xi (x, precision);
   WIDE_INT_REF_FOR (T2) yi (y, precision);
+  /* Optimize comparisons with constants and with sub-HWI unsigned
+     integers.  */
+  if (CAN_TELL (yi.len == 1 && yi.val[0] >= 0))
+    return xi.len == 1 && xi.to_uhwi () < (unsigned HOST_WIDE_INT) yi.val[0];
+  if (CAN_TELL (xi.len == 1 && xi.val[0] >= 0))
+    return yi.len != 1 || yi.to_uhwi () > (unsigned HOST_WIDE_INT) xi.val[0];
   if (precision <= HOST_BITS_PER_WIDE_INT)
     {
       unsigned HOST_WIDE_INT xl = xi.to_uhwi ();