On Sun, 27 Oct 2013, Richard Sandiford wrote:
> This patch adds some more optimisations to the wi:: comparison functions.
> It uses the:
>
> #define CONSTANT(X) (__builtin_constant_p (X) && (X))
>
> idiom that was mentioned before, except that I thought CONSTANT would be
> too easily confused with CONSTANT_P, so I went for CAN_TELL instead.
> Better names welcome.
How about STATIC_CONSTANT_P, similar to static_assert?
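I.e. something like the following (a minimal standalone sketch, helper and
caller names made up here, not taken from the patch).  The point is that
__builtin_constant_p propagates through inlining when optimising, so the
guarded branch only survives at call sites that pass a literal constant:

#define STATIC_CONSTANT_P(X) (__builtin_constant_p (X) && (X))

static inline int
eq_maybe_zero (long val, long cmp)
{
  /* With optimisation and inlining, __builtin_constant_p sees the
     inlined argument, so this branch exists only at call sites that
     pass a literal 0.  */
  if (STATIC_CONSTANT_P (cmp == 0))
    return val == 0;
  /* General path for everything else.  */
  return val == cmp;
}

int use_literal (long v) { return eq_maybe_zero (v, 0); }
int use_variable (long v, long c) { return eq_maybe_zero (v, c); }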
> The changes are:
>
> - Add a fast path to eq_p for when one of the inputs isn't sign-extended.
> This includes code to handle compile-time 0 specially.
>
> - Add the opposite optimisation to Mike's lts_p change, if we can tell at
> compile time that it applies.
I think the cases Mike added should only be enabled when we can figure
them out at compile-time, too.
Ok with that change.
Thanks,
Richard.
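Concretely, for lts_p I'd expect the whole shwi block to end up guarded,
roughly like this (untested sketch against the patch above, using CAN_TELL
or whatever the macro ends up being called, just to illustrate what I mean):

  /* Only enable Mike's fast path when the compiler can prove that
     Y fits in a shwi.  */
  if (CAN_TELL (wi::fits_shwi_p (yi)))
    {
      /* As in the patch: a compile-time 0 reduces to a sign test.  */
      if (CAN_TELL (yi.val[0] == 0))
	return neg_p (xi);
      /* If x fits directly into a shwi, we can compare directly.  */
      if (wi::fits_shwi_p (xi))
	return xi.to_shwi () < yi.to_shwi ();
      /* Otherwise the sign of X alone decides the comparison.  */
      return neg_p (xi);
    }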
> - Add fast paths to ltu_p for constants.
>
> E.g.:
>
> bool
> f1 (const_tree x)
> {
> return wi::eq_p (x, 0);
> }
>
> now gives:
>
> xorl %eax, %eax
> cmpw $1, 4(%rdi)
> je .L5
> rep ret
> .p2align 4,,10
> .p2align 3
> .L5:
> cmpq $0, 16(%rdi)
> sete %al
> ret
>
> bool
> f2 (const_tree x, HOST_WIDE_INT y)
> {
> return wi::eq_p (x, y);
> }
>
> gives:
>
> movq 8(%rdi), %rax
> movzwl 52(%rax), %edx
> xorl %eax, %eax
> andw $1023, %dx
> cmpw $1, 4(%rdi)
> je .L10
> rep ret
> .p2align 4,,10
> .p2align 3
> .L10:
> xorq 16(%rdi), %rsi
> movzwl %dx, %edx
> movl $64, %ecx
> subl %edx, %ecx
> movq %rsi, %rax
> salq %cl, %rax
> testl %ecx, %ecx
> cmovg %rax, %rsi
> testq %rsi, %rsi
> sete %al
> ret
>
> bool
> f3 (HOST_WIDE_INT x, const_tree y)
> {
> return wi::lts_p (x, y);
> }
>
> is similarly ugly because of the way that it ignores TYPE_SIGN and so has to
> explicitly sign-extend "small-prec" cases:
>
> movq 8(%rsi), %rax
> movzwl 4(%rsi), %ecx
> movzwl 52(%rax), %edx
> andl $1023, %edx
> cmpl $1, %ecx
> je .L16
> leal -1(%rcx), %eax
> sall $6, %ecx
> subl %edx, %ecx
> movq 16(%rsi,%rax,8), %rax
> movq %rax, %rdx
> salq %cl, %rdx
> testl %ecx, %ecx
> cmovg %rdx, %rax
> sarq $63, %rax
> addl $1, %eax
> ret
> .p2align 4,,10
> .p2align 3
> .L16:
> cmpl $63, %edx
> movq 16(%rsi), %rax
> ja .L13
> movb $64, %cl
> subl %edx, %ecx
> salq %cl, %rax
> sarq %cl, %rax
> .L13:
> cmpq %rdi, %rax
> setg %al
> ret
>
> but:
>
> bool
> f4 (HOST_WIDE_INT x, const_tree y)
> {
> return wi::lts_p (x, wi::to_widest (y));
> }
>
> is a bit more respectable:
>
> movzwl 6(%rsi), %eax
> cmpl $1, %eax
> je .L20
> subl $1, %eax
> movq 16(%rsi,%rax,8), %rax
> sarq $63, %rax
> addl $1, %eax
> ret
> .p2align 4,,10
> .p2align 3
> .L20:
> cmpq %rdi, 16(%rsi)
> setg %al
> ret
>
> For similar reasons:
>
> bool
> f5 (const_tree x)
> {
> return wi::ltu_p (x, 100);
> }
>
> gives:
>
> movq 8(%rdi), %rax
> movzwl 52(%rax), %ecx
> xorl %eax, %eax
> andw $1023, %cx
> cmpw $1, 4(%rdi)
> je .L26
> rep ret
> .p2align 4,,10
> .p2align 3
> .L26:
> cmpw $63, %cx
> ja .L23
> movl $1, %eax
> salq %cl, %rax
> subq $1, %rax
> andq 16(%rdi), %rax
> .L24:
> cmpq $99, %rax
> setbe %al
> ret
> .p2align 4,,10
> .p2align 3
> .L23:
> movq 16(%rdi), %rax
> jmp .L24
>
> but:
>
> bool
> f6 (const_tree x)
> {
> return wi::ltu_p (wi::to_widest (x), 100);
> }
>
> gives:
>
> xorl %eax, %eax
> cmpw $1, 6(%rdi)
> je .L30
> rep ret
> .p2align 4,,10
> .p2align 3
> .L30:
> cmpq $99, 16(%rdi)
> setbe %al
> ret
>
> Tested on powerpc64-linux-gnu and x86_64-linux-gnu. OK for wide-int?
>
> Thanks,
> Richard
>
>
> Index: gcc/system.h
> ===================================================================
> --- gcc/system.h 2013-10-27 14:25:19.144723977 +0000
> +++ gcc/system.h 2013-10-27 14:25:20.716738045 +0000
> @@ -711,6 +711,12 @@ #define gcc_unreachable() __builtin_unre
> #define gcc_unreachable() (fancy_abort (__FILE__, __LINE__, __FUNCTION__))
> #endif
>
> +#if GCC_VERSION >= 3001
> +#define CAN_TELL(X) (__builtin_constant_p (X) && (X))
> +#else
> +#define CAN_TELL(X) (false && (X))
> +#endif
> +
> /* Until we can use STATIC_ASSERT. */
> #define STATIC_ASSERT(X) \
> typedef int assertion1[(X) ? 1 : -1] ATTRIBUTE_UNUSED
> Index: gcc/wide-int.h
> ===================================================================
> --- gcc/wide-int.h 2013-10-27 14:25:19.144723977 +0000
> +++ gcc/wide-int.h 2013-10-27 14:37:34.834443832 +0000
> @@ -1495,6 +1495,7 @@ wi::eq_p (const T1 &x, const T2 &y)
> WIDE_INT_REF_FOR (T2) yi (y, precision);
> if (xi.is_sign_extended && yi.is_sign_extended)
> {
> + /* This case reduces to array equality. */
> if (xi.len != yi.len)
> return false;
> unsigned int i = 0;
> @@ -1504,10 +1505,21 @@ wi::eq_p (const T1 &x, const T2 &y)
> while (++i != xi.len);
> return true;
> }
> - if (precision <= HOST_BITS_PER_WIDE_INT)
> + if (yi.len == 1)
> {
> - unsigned HOST_WIDE_INT diff = xi.ulow () ^ yi.ulow ();
> - return (diff << (-precision % HOST_BITS_PER_WIDE_INT)) == 0;
> + /* XI is only equal to YI if it too has a single HWI. */
> + if (xi.len != 1)
> + return false;
> + /* Excess bits in xi.val[0] will be signs or zeros, so comparisons
> + with 0 are simple. */
> + if (CAN_TELL (yi.val[0] == 0))
> + return xi.val[0] == 0;
> + /* Otherwise flush out any excess bits first. */
> + unsigned HOST_WIDE_INT diff = xi.val[0] ^ yi.val[0];
> + int excess = HOST_BITS_PER_WIDE_INT - precision;
> + if (excess > 0)
> + diff <<= excess;
> + return diff == 0;
> }
> return eq_p_large (xi.val, xi.len, yi.val, yi.len, precision);
> }
> @@ -1528,20 +1540,28 @@ wi::lts_p (const T1 &x, const T2 &y)
> unsigned int precision = get_binary_precision (x, y);
> WIDE_INT_REF_FOR (T1) xi (x, precision);
> WIDE_INT_REF_FOR (T2) yi (y, precision);
> - // We optimize x < y, where y is 64 or fewer bits.
> + /* We optimize x < y, where y is 64 or fewer bits. */
> if (wi::fits_shwi_p (yi))
> {
> - // If x fits directly into a shwi, we can compare directly.
> + /* Make lts_p (x, 0) as efficient as wi::neg_p (x). */
> + if (CAN_TELL (yi.val[0] == 0))
> + return neg_p (xi);
> + /* If x fits directly into a shwi, we can compare directly. */
> if (wi::fits_shwi_p (xi))
> return xi.to_shwi () < yi.to_shwi ();
> - // If x doesn't fit and is negative, then it must be more
> - // negative than any value in y, and hence smaller than y.
> - if (neg_p (xi, SIGNED))
> + /* If x doesn't fit and is negative, then it must be more
> + negative than any value in y, and hence smaller than y. */
> + if (neg_p (xi))
> return true;
> - // If x is positive, then it must be larger than any value in y,
> - // and hence greater than y.
> + /* If x is positive, then it must be larger than any value in y,
> + and hence greater than y. */
> return false;
> }
> + /* Optimize the opposite case, if it can be detected at compile time. */
> + if (CAN_TELL (xi.len == 1))
> + /* If YI is negative it is lower than the least HWI.
> + If YI is positive it is greater than the greatest HWI. */
> + return !neg_p (yi);
> return lts_p_large (xi.val, xi.len, precision, yi.val, yi.len);
> }
>
> @@ -1553,6 +1573,12 @@ wi::ltu_p (const T1 &x, const T2 &y)
> unsigned int precision = get_binary_precision (x, y);
> WIDE_INT_REF_FOR (T1) xi (x, precision);
> WIDE_INT_REF_FOR (T2) yi (y, precision);
> + /* Optimize comparisons with constants and with sub-HWI unsigned
> + integers. */
> + if (CAN_TELL (yi.len == 1 && yi.val[0] >= 0))
> + return xi.len == 1 && xi.to_uhwi () < (unsigned HOST_WIDE_INT) yi.val[0];
> + if (CAN_TELL (xi.len == 1 && xi.val[0] >= 0))
> + return yi.len != 1 || yi.to_uhwi () > (unsigned HOST_WIDE_INT) xi.val[0];
> if (precision <= HOST_BITS_PER_WIDE_INT)
> {
> unsigned HOST_WIDE_INT xl = xi.to_uhwi ();
>
>
--
Richard Biener <[email protected]>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer