[PATCH, rs6000] refactor/cleanup in rs6000-string.c

2018-07-31 Thread Aaron Sawdey
Just teasing things apart a bit more in this function so I can add
vec/vsx code generation without making it enormous and
incomprehensible.

Bootstrap/regtest passes on powerpc64le, ok for trunk?

Thanks,
Aaron


2018-07-31  Aaron Sawdey  

* config/rs6000/rs6000-string.c (select_block_compare_mode): Move test
for word_mode_ok here instead of passing as argument.
(expand_block_compare): Change select_block_compare_mode() call.
(expand_strncmp_gpr_sequence): New function.
(expand_strn_compare): Make use of expand_strncmp_gpr_sequence.

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 263039)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -238,13 +238,11 @@
 
OFFSET is the current read offset from the beginning of the block.
BYTES is the number of bytes remaining to be read.
-   ALIGN is the minimum alignment of the memory blocks being compared in bytes.
-   WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
-   the largest allowable mode.  */
+   ALIGN is the minimum alignment of the memory blocks being compared in bytes.  */
 static machine_mode
 select_block_compare_mode (unsigned HOST_WIDE_INT offset,
   unsigned HOST_WIDE_INT bytes,
-  unsigned HOST_WIDE_INT align, bool word_mode_ok)
+  unsigned HOST_WIDE_INT align)
 {
   /* First see if we can do a whole load unit
  as that will be more efficient than a larger load + shift.  */
@@ -257,6 +255,11 @@
   /* The most we can read without potential page crossing.  */
   unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);
 
+  /* If we have an LE target without ldbrx and word_mode is DImode,
+     then we must avoid using word_mode.  */
+  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
+                       && word_mode == DImode);
+
   if (word_mode_ok && bytes >= UNITS_PER_WORD)
 return word_mode;
   else if (bytes == GET_MODE_SIZE (SImode))
@@ -1382,16 +1385,11 @@
   else
 cond = gen_reg_rtx (CCmode);
 
-  /* If we have an LE target without ldbrx and word_mode is DImode,
-     then we must avoid using word_mode.  */
-  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
-                       && word_mode == DImode);
-
   /* Strategy phase.  How many ops will this take and should we expand it?  */
 
   unsigned HOST_WIDE_INT offset = 0;
   machine_mode load_mode =
-    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
+    select_block_compare_mode (offset, bytes, base_align);
   unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
 
   /* We don't want to generate too much code.  The loop code can take
@@ -1445,8 +1443,7 @@
   while (bytes > 0)
 {
   unsigned int align = compute_current_alignment (base_align, offset);
-      load_mode = select_block_compare_mode (offset, bytes,
-                                             align, word_mode_ok);
+      load_mode = select_block_compare_mode (offset, bytes, align);
   load_mode_size = GET_MODE_SIZE (load_mode);
   if (bytes >= load_mode_size)
cmp_bytes = load_mode_size;
@@ -1698,6 +1695,189 @@
   LABEL_NUSES (strncmp_label) += 1;
 }
 
+/* Generate the sequence of compares for strcmp/strncmp using gpr instructions.
+   BYTES_TO_COMPARE is the number of bytes to be compared.
+   BASE_ALIGN is the smaller of the alignment of the two strings.
+   ORIG_SRC1 is the unmodified rtx for the first string.
+   ORIG_SRC2 is the unmodified rtx for the second string.
+   TMP_REG_SRC1 is the register for loading the first string.
+   TMP_REG_SRC2 is the register for loading the second string.
+   RESULT_REG is the rtx for the result register.
+   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
+   to strcmp/strncmp if we have equality at the end of the inline comparison.
+   CLEANUP_LABEL is rtx for a label we generate if we need code to clean up
+   and generate the final comparison result.
+   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just 
+   set the final result.  */
+static void
+expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
+   unsigned int base_align,
+   rtx orig_src1, rtx orig_src2,
+   rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
+   bool equality_compare_rest, rtx &cleanup_label,
+   rtx final_move_label)
+{
+  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
+  machine_mode load_mode;
+  unsigned int load_mode_size;
+  unsigned HOST_WIDE_INT cmp_bytes = 0;
+  unsigned HOST_WIDE_INT offset = 0;
+  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
+  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
+
+ 

Re: [Patch,optimization]: Optimized changes in the estimate register pressure cost.

2015-09-28 Thread Aaron Sawdey
On Sat, 2015-09-26 at 04:51 +, Ajit Kumar Agarwal wrote:
> I have made the following changes in the estimate_reg_pressure_cost
> function used by the loop invariant and IVOPTS.
> 
> Earlier the estimate_reg_pressure cost uses the cost of the n_new
> variables that are generated by the Loop Invariant and IVOPTS. These
> are not sufficient for register pressure calculation. The register
> pressure cost calculation should use the n_new + n_old (numbers) to
> consider the cost. n_old is the registers used inside the loops, and
> the effect of the n_new new variables generated by loop invariant and
> IVOPTS on register pressure is based on how the new variables impact
> the registers used inside the loops. The increase or decrease in
> register pressure is due to the impact of new variables on the
> registers used inside the loops. The register-register move cost or
> the spill cost should consider the cost associated with the registers
> used and the new variables generated. The movement of new variables
> increases or decreases the register pressure, which is based on the
> overall cost of n_new + n_old variables.
> 
> The increase and decrease in register pressure is based on the overall
> cost of n_new + n_old, as the changes in register pressure caused by
> the new variables are based on how those changes behave with respect
> to the registers used in the loops.
> 
> Thus the register pressure caused by the new variables is based on the
> new variables and their impact on the registers used inside the loops,
> and so the overall cost of n_new + n_old should be considered.
> 
> Bootstrap for i386 and regtest on i386 with the change is fine.
> 
> SPEC CPU 2000 benchmarks were run and there is the following impact on
> performance and code size.
> 
> ratio with the optimization vs ratio without optimization for INT
> benchmarks (3807.632 vs 3804.661)
> 
> ratio with the optimization vs ratio without optimization for FP
> benchmarks (4668.743 vs 4778.741)
> 
> Code size reduction with respect to FP SPEC CPU 2000 benchmarks:
> 
> Number of instructions with optimization = 1094117
> Number of instructions without optimization = 1094659
> 
> Reduction in number of instructions with the optimization = 542
> instructions.
> 
> [Patch,optimization]: Optimized changes in the estimate register
> pressure cost.
> 
> Earlier the estimate_reg_pressure cost uses the cost of n_new variables
> that are generated by the Loop Invariant and IVOPTS. These are not
> sufficient for register pressure calculation. The register pressure
> cost calculation should use the n_new + n_old (numbers) to consider
> the cost. n_old is the registers used inside the loops and the effect
> of n_new new variables generated by loop invariant and IVOPTS on
> register pressure is based on how the new variables impact the
> registers used inside the loops.
> 
> ChangeLog:
> 2015-09-26  Ajit Agarwal  
> 
>   * cfgloopanal.c (estimate_reg_pressure_cost): Add changes
>   to consider the n_new plus n_old in the register pressure
>   cost.
> 
> Signed-off-by:Ajit Agarwal ajit...@xilinx.com

Ajit,
  It looks to me like your change doesn't do anything at all inside the
loop-invariant.c code. There it computes the difference between two
estimate_reg_pressure_cost calls, so adding n_old (regs_used) to both
cancels out:

  size_cost = (estimate_reg_pressure_cost (new_regs[0] + regs_needed[0],
                                           regs_used, speed, call_p)
               - estimate_reg_pressure_cost (new_regs[0],
                                             regs_used, speed, call_p));
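
To spell out the cancellation (my reading of the change, written with a
hypothetical cost function f that both calls effectively become):

  size_cost = f (new_regs[0] + regs_needed[0] + regs_used)
              - f (new_regs[0] + regs_used);

Any term that depends only on regs_used shifts both operands of the
subtraction equally, so to first order it drops out of the difference.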

I'm not quite sure I understand the "why" of the heuristic you've added
here -- can you explain your reasoning further?

> 
> Thanks & Regards
> Ajit
> 

Thanks,
Aaron

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH] Use movmem optab to attempt inline expansion of __builtin_memmove()

2019-09-27 Thread Aaron Sawdey
This is the third piece of my effort to improve inline expansion of memmove. The
first two parts I posted back in June fixed the names of the optab entries
involved so that optab cpymem is used for memcpy() and optab movmem is used for
memmove(). This piece adds support for actually attempting to invoke the movmem
optab to do inline expansion of __builtin_memmove().

Because what needs to be done for memmove() is very similar to memcpy(), I have
just added a bool parm "might_overlap" to several of the functions involved so
the same functions can handle both. The name might_overlap comes from the fact
that if we still have a memmove() call at expand, this means
gimple_fold_builtin_memory_op() was not able to prove that the source and
destination do not overlap.

There are a few places where might_overlap gets used to keep us from trying to
use the by-pieces infrastructure or generate a copy loop, as neither of those
things will work correctly if source and destination overlap.
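
To make the might_overlap case concrete, here is a hypothetical caller
(my illustration, not code from the patch) where nothing proves the
arguments don't alias, so the memmove() survives to expand:

/* p and q may point into the same object, so
   gimple_fold_builtin_memory_op() cannot rewrite this call to memcpy,
   and the expander is entered with might_overlap == true.  */
void
shift_bytes (char *p, char *q, unsigned long n)
{
  __builtin_memmove (p, q, n);
}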

I've restructured things slightly in emit_block_move_hints() so that we can
try the pattern first if we already know that by-pieces won't work. This way
we can bail out immediately in the might_overlap case.

Bootstrap/regtest passed on ppc64le, in progress on x86_64. If everything
passes, is this ok for trunk?


2019-09-27  Aaron Sawdey 

* builtins.c (expand_builtin_memory_copy_args): Add might_overlap parm.
(expand_builtin_memcpy): Use might_overlap parm.
(expand_builtin_mempcpy_args): Use might_overlap parm.
(expand_builtin_memmove): Call expand_builtin_memory_copy_args.
(expand_builtin_memory_copy_args): Add might_overlap parm.
* expr.c (emit_block_move_via_cpymem): Rename to
emit_block_move_via_pattern, add might_overlap parm, use cpymem
or movmem optab as appropriate.
(emit_block_move_hints): Add might_overlap parm, do the right
thing for might_overlap==true.
* expr.h (emit_block_move_hints): Update prototype.




Index: gcc/builtins.c
===
--- gcc/builtins.c  (revision 276131)
+++ gcc/builtins.c  (working copy)
@@ -127,7 +127,8 @@
 static rtx expand_builtin_memcpy (tree, rtx);
 static rtx expand_builtin_memory_copy_args (tree dest, tree src, tree len,
rtx target, tree exp,
-   memop_ret retmode);
+   memop_ret retmode,
+   bool might_overlap);
 static rtx expand_builtin_memmove (tree, rtx);
 static rtx expand_builtin_mempcpy (tree, rtx);
static rtx expand_builtin_mempcpy_args (tree, tree, tree, rtx, tree,
                                        memop_ret);
@@ -3790,7 +3791,7 @@
   check_memop_access (exp, dest, src, len);

   return expand_builtin_memory_copy_args (dest, src, len, target, exp,
- /*retmode=*/ RETURN_BEGIN);
+ /*retmode=*/ RETURN_BEGIN, false);
 }

 /* Check a call EXP to the memmove built-in for validity.
@@ -3797,7 +3798,7 @@
Return NULL_RTX on both success and failure.  */

 static rtx
-expand_builtin_memmove (tree exp, rtx)
+expand_builtin_memmove (tree exp, rtx target)
 {
   if (!validate_arglist (exp,
 POINTER_TYPE, POINTER_TYPE, INTEGER_TYPE, VOID_TYPE))
@@ -3809,7 +3810,8 @@

   check_memop_access (exp, dest, src, len);

-  return NULL_RTX;
+  return expand_builtin_memory_copy_args (dest, src, len, target, exp,
+ /*retmode=*/ RETURN_BEGIN, true);
 }

 /* Expand a call EXP to the mempcpy builtin.
@@ -3858,7 +3860,8 @@

 static rtx
 expand_builtin_memory_copy_args (tree dest, tree src, tree len,
-rtx target, tree exp, memop_ret retmode)
+rtx target, tree exp, memop_ret retmode,
+bool might_overlap)
 {
   const char *src_str;
   unsigned int src_align = get_pointer_alignment (src);
@@ -3894,10 +3897,11 @@
&probable_max_size);
   src_str = c_getstr (src);

-  /* If SRC is a string constant and block move would be done
- by pieces, we can avoid loading the string from memory
- and only stored the computed constants.  */
-  if (src_str
+  /* If SRC is a string constant and block move would be done by
+ pieces, we can avoid loading the string from memory and only
+ stored the computed constants.  I'm not sure if the by pieces
+ method works if src/dest are overlapping, so avoid that case.  */
+  if (src_str && !might_overlap
   && CONST_INT_P (len_rtx)
   && (unsigned HOST_WIDE_INT) INTVAL (len_rtx) <= strlen (src_str) + 1
   && can_store_by_pieces (INTVAL (len_rtx), builtin_memcpy_read_str,
@@ -3922,7 +3926,7 @@
   && (retmode == RETURN_BEGIN || target =

[PATCH, RS6000] Add movmemsi pattern for inline expansion of memmove()

2019-09-30 Thread Aaron Sawdey
This patch uses the support added in the patch I posted last week for
actually doing inline expansion of memmove(). I've added a might_overlap
parameter to expand_block_move() to tell it when it must make sure to
handle overlapping moves. I changed the code to save up the generated rtx
for both loads and stores instead of just stores. In the
might_overlap==true case, if we get to MAX_MOVE_REG and the move is not
done yet, then we bail out and return false. So what this can now do is
inline expand any memmove() that can be done in 4 loads followed by 4
stores. It will use lxv/stxv if size/alignment allows, otherwise it will
use unaligned integer loads/stores. So it can expand most memmove() up to
32 bytes, and some that are 33-64 bytes if the arguments are 16-byte
aligned.
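
For example (my illustration, not a test from the patch), a fixed-size
overlapping move like this one fits in the MAX_MOVE_REG window and can
now be expanded inline:

/* Assumes buf has at least 33 valid bytes.  A 32-byte overlapping
   move: 2 loads + 2 stores if 16-byte lxv/stxv can be used, otherwise
   4 unaligned integer loads + 4 stores, so the expansion succeeds
   even though might_overlap is true.  */
void
scoot (char *buf)
{
  __builtin_memmove (buf + 1, buf, 32);
}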

I've also removed the code from expand_block_move() for dealing with
mode==BLKmode because I don't believe that can happen. The big if construct that
figures out which size we are going to use has a plain else on it, and every
clause in it sets mode to something other than BLKmode. So I removed that code
to simplify things and just left a gcc_assert(mode != BLKmode).

Regtest in progress on ppc64le (power9), if tests are ok, is this ok for trunk
after the movmem optab patch posted last week is approved?

Thanks!
   Aaron

2019-09-30  Aaron Sawdey 

* config/rs6000/rs6000-protos.h (expand_block_move): Change prototype.
* config/rs6000/rs6000-string.c (expand_block_move): Add might_overlap parm.
* config/rs6000/rs6000.md (movmemsi): Add new pattern.
(cpymemsi): Add might_overlap parm to expand_block_move() call.



Index: gcc/config/rs6000/rs6000-protos.h
===
--- gcc/config/rs6000/rs6000-protos.h   (revision 276131)
+++ gcc/config/rs6000/rs6000-protos.h   (working copy)
@@ -69,7 +69,7 @@
 extern void rs6000_generate_float2_double_code (rtx, rtx, rtx);
 extern void rs6000_generate_vsigned2_code (bool, rtx, rtx, rtx);
 extern int expand_block_clear (rtx[]);
-extern int expand_block_move (rtx[]);
+extern int expand_block_move (rtx[], bool);
 extern bool expand_block_compare (rtx[]);
 extern bool expand_strn_compare (rtx[], int);
 extern bool rs6000_is_valid_mask (rtx, int *, int *, machine_mode);
Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 276131)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -2719,7 +2719,7 @@
 #define MAX_MOVE_REG 4

 int
-expand_block_move (rtx operands[])
+expand_block_move (rtx operands[], bool might_overlap)
 {
   rtx orig_dest = operands[0];
   rtx orig_src = operands[1];
@@ -2730,6 +2730,7 @@
   int bytes;
   int offset;
   int move_bytes;
+  rtx loads[MAX_MOVE_REG];
   rtx stores[MAX_MOVE_REG];
   int num_reg = 0;

@@ -2817,47 +2818,35 @@
  gen_func.mov = gen_movqi;
}

+  /* Mode is always set to something other than BLKmode by one of the
+cases of the if statement above.  */
+  gcc_assert (mode != BLKmode);
+
   src = adjust_address (orig_src, mode, offset);
   dest = adjust_address (orig_dest, mode, offset);

-  if (mode != BLKmode)
-   {
- rtx tmp_reg = gen_reg_rtx (mode);
+  rtx tmp_reg = gen_reg_rtx (mode);
+
+  loads[num_reg] = (*gen_func.mov) (tmp_reg, src);
+  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);

- emit_insn ((*gen_func.mov) (tmp_reg, src));
- stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
-   }
+  /* If we didn't succeed in doing it in one pass, we can't do it in the
+might_overlap case.  Bail out and return failure.  */
+  if (might_overlap && num_reg >= MAX_MOVE_REG
+ && bytes > move_bytes)
+   return 0;

-  if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
+  /* Emit loads and stores saved up.  */
+  if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)
{
  int i;
  for (i = 0; i < num_reg; i++)
+   emit_insn (loads[i]);
+ for (i = 0; i < num_reg; i++)
emit_insn (stores[i]);
  num_reg = 0;
}
-
-  if (mode == BLKmode)
-   {
- /* Move the address into scratch registers.  The movmemsi
-patterns require zero offset.  */
- if (!REG_P (XEXP (src, 0)))
-   {
- rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
- src = replace_equiv_address (src, src_reg);
-   }
- set_mem_size (src, move_bytes);
-
- if (!REG_P (XEXP (dest, 0)))
-   {
- rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
- dest = replace_equiv_address (dest, dest_reg);
-   }
- set_mem_size (dest, move_bytes);
-
- emit_insn ((*gen_func.movmemsi) (dest, src,
-   

Re: [PATCH] Use movmem optab to attempt inline expansion of __builtin_memmove()

2019-10-02 Thread Aaron Sawdey
On 10/1/19 4:45 PM, Jeff Law wrote:
> On 9/27/19 12:23 PM, Aaron Sawdey wrote:
>> This is the third piece of my effort to improve inline expansion of
>> memmove. The first two parts I posted back in June fixed the names of
>> the optab entries involved so that optab cpymem is used for memcpy()
>> and optab movmem is used for memmove(). This piece adds support for
>> actually attempting to invoke the movmem optab to do inline expansion
>> of __builtin_memmove().
>>
>> Because what needs to be done for memmove() is very similar to
>> memcpy(), I have just added a bool parm "might_overlap" to several of
>> the functions involved so the same functions can handle both. The name
>> might_overlap comes from the fact that if we still have a memmove()
>> call at expand, this means gimple_fold_builtin_memory_op() was not
>> able to prove that the source and destination do not overlap.
>>
>> There are a few places where might_overlap gets used to keep us from
>> trying to use the by-pieces infrastructure or generate a copy loop, as
>> neither of those things will work correctly if source and destination
>> overlap.
>>
>> I've restructured things slightly in emit_block_move_hints() so that
>> we can try the pattern first if we already know that by-pieces won't
>> work. This way we can bail out immediately in the might_overlap case.
>>
>> Bootstrap/regtest passed on ppc64le, in progress on x86_64. If
>> everything passes, is this ok for trunk?
>>
>>
>> 2019-09-27  Aaron Sawdey 
>>
>>  * builtins.c (expand_builtin_memory_copy_args): Add might_overlap parm.
>>  (expand_builtin_memcpy): Use might_overlap parm.
>>  (expand_builtin_mempcpy_args): Use might_overlap parm.
>>  (expand_builtin_memmove): Call expand_builtin_memory_copy_args.
>>  (expand_builtin_memory_copy_args): Add might_overlap parm.
>>  * expr.c (emit_block_move_via_cpymem): Rename to
>>  emit_block_move_via_pattern, add might_overlap parm, use cpymem
>>  or movmem optab as appropriate.
>>  (emit_block_move_hints): Add might_overlap parm, do the right
>>  thing for might_overlap==true.
>>  * expr.h (emit_block_move_hints): Update prototype.
>>
>>
>>
>>
>> Index: gcc/builtins.c
>> ===
>> --- gcc/builtins.c   (revision 276131)
>> +++ gcc/builtins.c   (working copy)
>> @@ -3894,10 +3897,11 @@
>>  &probable_max_size);
>>src_str = c_getstr (src);
>>
>> -  /* If SRC is a string constant and block move would be done
>> - by pieces, we can avoid loading the string from memory
>> - and only stored the computed constants.  */
>> -  if (src_str
>> +  /* If SRC is a string constant and block move would be done by
>> + pieces, we can avoid loading the string from memory and only
>> + stored the computed constants.  I'm not sure if the by pieces
>> + method works if src/dest are overlapping, so avoid that case.  */
>> +  if (src_str && !might_overlap
> I don't think you need the check here.  c_getstr, when it returns
> something useful, is going to be returning a string constant.  Think
> read-only literals here.  I'm pretty sure overlap isn't going to be
> possible.

After some digging, I agree -- c_getstr() returns a string constant and
that is then used to generate stores of constants.
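
In other words (an illustrative case, not one from the patch), that
code path only fires for literal sources, e.g.:

/* c_getstr() returns non-NULL only when the source is a string
   constant; the literal lives in read-only storage, so it cannot
   overlap the writable destination.  */
char dst[6];
void
init (void)
{
  __builtin_memmove (dst, "hello", 6);
}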

I've fixed the other issues and also fixed emit_block_move_via_pattern() to
make use of pieces_ok instead of calling can_move_by_pieces() a second
time. The patch I'm actually committing is below.

Thanks for the review!

  Aaron

Index: gcc/builtins.c
===
--- gcc/builtins.c  (revision 276131)
+++ gcc/builtins.c  (working copy)
@@ -127,7 +127,8 @@
 static rtx expand_builtin_memcpy (tree, rtx);
 static rtx expand_builtin_memory_copy_args (tree dest, tree src, tree len,
rtx target, tree exp,
-   memop_ret retmode);
+   memop_ret retmode,
+   bool might_overlap);
 static rtx expand_builtin_memmove (tree, rtx);
 static rtx expand_builtin_mempcpy (tree, rtx);
static rtx expand_builtin_mempcpy_args (tree, tree, tree, rtx, tree,
                                        memop_ret);
@@ -3790,7 +3791,7 @@
   check_memop_access (exp, dest, src, len);

   return expand_builtin_memory_copy_args (dest, src, len, target, exp,
-   

Re: [PATCH] Use movmem optab to attempt inline expansion of __builtin_memmove()

2019-10-02 Thread Aaron Sawdey
On 10/2/19 5:35 PM, Jakub Jelinek wrote:
> On Wed, Oct 02, 2019 at 09:21:23AM -0500, Aaron Sawdey wrote:
>>>> 2019-09-27  Aaron Sawdey 
>>>>
>>>>* builtins.c (expand_builtin_memory_copy_args): Add might_overlap parm.
>>>>(expand_builtin_memcpy): Use might_overlap parm.
>>>>(expand_builtin_mempcpy_args): Use might_overlap parm.
>>>>(expand_builtin_memmove): Call expand_builtin_memory_copy_args.
>>>>(expand_builtin_memory_copy_args): Add might_overlap parm.
>>>>* expr.c (emit_block_move_via_cpymem): Rename to
>>>>emit_block_move_via_pattern, add might_overlap parm, use cpymem
>>>>or movmem optab as appropriate.
>>>>(emit_block_move_hints): Add might_overlap parm, do the right
>>>>thing for might_overlap==true.
>>>>* expr.h (emit_block_move_hints): Update prototype.
> 
>> @@ -1622,13 +1624,30 @@
>>    set_mem_size (y, const_size);
>>  }
>>
>> -  if (CONST_INT_P (size) && can_move_by_pieces (INTVAL (size), align))
>> +  bool pieces_ok = can_move_by_pieces (INTVAL (size), align);
>> +  bool pattern_ok = false;
>> +
>> +  if (!CONST_INT_P (size) || !pieces_ok || might_overlap)
> ...
> 
> This change broke rtl checking bootstrap.
> You can't use INTVAL on size that isn't CONST_INT_P.
> 
> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux,
> committed to trunk as obvious:

Jakub,
  Sorry about that! Now that you point it out, it's obvious. But what it
means for me is that I need to be in the habit of bootstrapping with
--enable-checking=rtl when I make these changes.

  Aaron

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain


Re: [PATCH] Use movmem optab to attempt inline expansion of __builtin_memmove()

2019-10-03 Thread Aaron Sawdey
On 10/2/19 5:44 PM, Aaron Sawdey wrote:
> On 10/2/19 5:35 PM, Jakub Jelinek wrote:
>> On Wed, Oct 02, 2019 at 09:21:23AM -0500, Aaron Sawdey wrote:
>>>>> 2019-09-27  Aaron Sawdey 
>>>>>
>>>>>   * builtins.c (expand_builtin_memory_copy_args): Add might_overlap parm.
>>>>>   (expand_builtin_memcpy): Use might_overlap parm.
>>>>>   (expand_builtin_mempcpy_args): Use might_overlap parm.
>>>>>   (expand_builtin_memmove): Call expand_builtin_memory_copy_args.
>>>>>   (expand_builtin_memory_copy_args): Add might_overlap parm.
>>>>>   * expr.c (emit_block_move_via_cpymem): Rename to
>>>>>   emit_block_move_via_pattern, add might_overlap parm, use cpymem
>>>>>   or movmem optab as appropriate.
>>>>>   (emit_block_move_hints): Add might_overlap parm, do the right
>>>>>   thing for might_overlap==true.
>>>>>   * expr.h (emit_block_move_hints): Update prototype.
>>
>>> @@ -1622,13 +1624,30 @@
>>>    set_mem_size (y, const_size);
>>>  }
>>>
>>> -  if (CONST_INT_P (size) && can_move_by_pieces (INTVAL (size), align))
>>> +  bool pieces_ok = can_move_by_pieces (INTVAL (size), align);
>>> +  bool pattern_ok = false;
>>> +
>>> +  if (!CONST_INT_P (size) || !pieces_ok || might_overlap)
>> ...
>>
>> This change broke rtl checking bootstrap.
>> You can't use INTVAL on size that isn't CONST_INT_P.
>>
>> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux,
>> committed to trunk as obvious:
> 
> Jakub,
>   Sorry about that! Now that you point it out, it's obvious. But what it means
> for me is that I need to be in the habit of bootstrapping with 
> --enable-checking=rtl
> when I make these changes.

I stared at this for a bit and came up with a slightly cleaner fix that
is one less line:

2019-10-03  Aaron Sawdey 

* expr.c (emit_block_move_hints): Slightly cleaner fix to
can_move_by_pieces issue.

Index: gcc/expr.c
===
--- gcc/expr.c  (revision 276516)
+++ gcc/expr.c  (working copy)
@@ -1624,9 +1624,8 @@
   set_mem_size (y, const_size);
 }

-  bool pieces_ok = false;
-  if (CONST_INT_P (size))
-    pieces_ok = can_move_by_pieces (INTVAL (size), align);
+  bool pieces_ok = CONST_INT_P (size)
+    && can_move_by_pieces (INTVAL (size), align);
   bool pattern_ok = false;

   if (!pieces_ok || might_overlap)

Bootstrap/regtest (with --enable-checking=yes,rtl,tree) ok on ppc64le (power9),
committed as obvious.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain


[PATCH, rs6000] inline expansion of str[n]cmp using vec/vsx instructions

2018-08-22 Thread Aaron Sawdey
This patch teaches rs6000 inline expansion of strcmp/strncmp how to
generate vector/vsx code for power8/power9 processors. Compares of 16
bytes and longer are generated using the vector code, which is
considerably faster than the gpr-based code.
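
For instance (my example, not one from the testsuite), a compare like
the following is at least 16 bytes long and so would now get the
vector sequence on power8/power9:

/* 32-byte compare: expanded inline with 16-byte vector loads and a
   vcmpnezb_p-style compare rather than a chain of gpr compares.  */
int
cmp32 (const char *a, const char *b)
{
  return __builtin_strncmp (a, b, 32);
}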

Bootstrap/regtest passes on ppc64 (power8) and ppc64le (power8 and
power9). Ok for trunk?

Thanks!
Aaron


2018-08-22  Aaron Sawdey  

* config/rs6000/altivec.md (altivec_eq<mode>): Remove star.
* config/rs6000/rs6000-string.c (do_load_for_compare): Support
vector load modes.
(expand_strncmp_vec_sequence): New function.
(emit_final_str_compare_vec): New function.
(expand_strn_compare): Support for vector strncmp.
* config/rs6000/rs6000.opt (-mstring-compare-inline-limit): Change
length specification to bytes.
* config/rs6000/vsx.md (vsx_ld_elemrev_v16qi_internal): Remove star.
(vcmpnezb_p): New pattern.
* doc/invoke.texi (RS/6000 and PowerPC Options): Update documentation
for option -mstring-compare-inline-limit.

Index: gcc/config/rs6000/altivec.md
===
--- gcc/config/rs6000/altivec.md(revision 263753)
+++ gcc/config/rs6000/altivec.md(working copy)
@@ -603,7 +603,7 @@
   "vcmpbfp %0,%1,%2"
   [(set_attr "type" "veccmp")])
 
-(define_insn "*altivec_eq<mode>"
+(define_insn "altivec_eq<mode>"
   [(set (match_operand:VI2 0 "altivec_register_operand" "=v")
(eq:VI2 (match_operand:VI2 1 "altivec_register_operand" "v")
(match_operand:VI2 2 "altivec_register_operand" "v")))]
@@ -2304,7 +2304,7 @@
 
 ;; Compare vectors producing a vector result and a predicate, setting CR6 to
 ;; indicate a combined status
-(define_insn "*altivec_vcmpequ<mode>_p"
+(define_insn "altivec_vcmpequ<mode>_p"
   [(set (reg:CC CR6_REGNO)
(unspec:CC [(eq:CC (match_operand:VI2 1 "register_operand" "v")
   (match_operand:VI2 2 "register_operand" "v"))]
Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 263753)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -157,6 +157,29 @@
 {
   switch (GET_MODE (reg))
 {
+case E_V16QImode:
+  switch (mode) {
+  case E_V16QImode:
+   if (!BYTES_BIG_ENDIAN) 
+ if (TARGET_P9_VECTOR)
+   emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem));
+ else
+   {
+ rtx reg_v2di = simplify_gen_subreg (V2DImode, reg, V16QImode, 0);
+ gcc_assert (MEM_P (mem));
+ rtx addr = XEXP (mem, 0);
+ rtx mem_v2di = gen_rtx_MEM (V2DImode, addr);
+ MEM_COPY_ATTRIBUTES (mem_v2di, mem);
+ set_mem_size (mem, GET_MODE_SIZE (V2DImode));
+ emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di));
+   }
+   else
+ emit_insn (gen_vsx_movv2di_64bit (reg, mem));
+   break;
+  default:
+   gcc_unreachable ();
+  }
+  break;
 case E_DImode:
   switch (mode)
{
@@ -227,7 +250,22 @@
  gcc_unreachable ();
}
   break;
+
+case E_QImode:
+  switch (mode)
+   {
+   case E_QImode:
+ emit_move_insn (reg, mem);
+ break;
+   default:
+ debug_rtx (reg);
+ gcc_unreachable ();
+ break;
+   }
+  break;
+  
 default:
+  debug_rtx (reg);
   gcc_unreachable ();
   break;
 }
@@ -1878,6 +1916,175 @@
 
 }
 
+/* Generate the sequence of compares for strcmp/strncmp using vec/vsx 
+   instructions.
+
+   BYTES_TO_COMPARE is the number of bytes to be compared.
+   ORIG_SRC1 is the unmodified rtx for the first string.
+   ORIG_SRC2 is the unmodified rtx for the second string.
+   S1ADDR is the register to use for the base address of the first string.
+   S2ADDR is the register to use for the base address of the second string.
+   OFF_REG is the register to use for the string offset for loads.
+   S1DATA is the register for loading the first string.
+   S2DATA is the register for loading the second string.
+   VEC_RESULT is the rtx for the vector result indicating the byte difference.
+   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
+   to strcmp/strncmp if we have equality at the end of the inline comparison.
+   CLEANUP_LABEL is rtx for a label we generate if we need code to clean up
+   and generate the final comparison result.
+   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just 
+   set the final result.  */
+static void
+expand_strncmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
+   rtx orig_src1, rtx orig_src2,
+

[PATCH] reorganize block/string move/compare expansions out of rs6000.c

2017-06-22 Thread Aaron Sawdey
This patch moves about 1400 lines of code for various block and string
compare/move/zero expansions out of rs6000.c into a new file,
rs6000-string.c. Segher had asked me to do this before I go adding new
code here.

Bootstrap passes on ppc64le, regtest in progress. OK for trunk if that
passes?



2017-06-22  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_block_clear,
do_load_for_compare, select_block_compare_mode,
compute_current_alignment, expand_block_compare,
expand_strncmp_align_check, expand_strn_compare,
expand_block_move, rs6000_output_load_multiple):
Move functions related to string/block move/compare
to a separate file.
* config/rs6000/rs6000.c: Move above functions to rs6000-string.c.
* config/rs6000/rs6000-protos.h (rs6000_emit_dot_insn): Add prototype
for this function which is now used in two files.
* config/rs6000/t-rs6000: Add rule to compile rs6000-string.o.
* config.gcc: Add rs6000-string.o to extra_objs for
targets powerpc*-*-* and rs6000*-*-*.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: config.gcc
===
--- config.gcc	(revision 249565)
+++ config.gcc	(working copy)
@@ -454,6 +454,7 @@
 	;;
 powerpc*-*-*)
 	cpu_type=rs6000
+	extra_objs="rs6000-string.o"
 	extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
 	extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h x86intrin.h"
 	extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h"
@@ -471,6 +472,7 @@
 	;;
 rs6000*-*-*)
 	extra_options="${extra_options} g.opt fused-madd.opt rs6000/rs6000-tables.opt"
+	extra_objs="rs6000-string.o"
 	;;
 sparc*-*-*)
 	cpu_type=sparc
Index: config/rs6000/rs6000-protos.h
===
--- config/rs6000/rs6000-protos.h	(revision 249565)
+++ config/rs6000/rs6000-protos.h	(working copy)
@@ -134,6 +134,7 @@
 extern void rs6000_emit_cbranch (machine_mode, rtx[]);
 extern char * output_cbranch (rtx, const char *, int, rtx_insn *);
 extern const char * output_probe_stack_range (rtx, rtx);
+extern void rs6000_emit_dot_insn (rtx dst, rtx src, int dot, rtx ccreg);
 extern bool rs6000_emit_set_const (rtx, rtx);
 extern int rs6000_emit_cmove (rtx, rtx, rtx, rtx);
 extern int rs6000_emit_vector_cond_expr (rtx, rtx, rtx, rtx, rtx, rtx);
Index: config/rs6000/rs6000-string.c
===
--- config/rs6000/rs6000-string.c	(revision 0)
+++ config/rs6000/rs6000-string.c	(working copy)
@@ -0,0 +1,1468 @@
+/* Subroutines used for code generation on IBM RS/6000.
+   Copyright (C) 1991-2017 Free Software Foundation, Inc.
+   Contributed by Richard Kenner (ken...@vlsi1.ultra.nyu.edu)
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "memmodel.h"
+#include "tm_p.h"
+#include "ira.h"
+#include "print-tree.h"
+#include "varasm.h"
+#include "explow.h"
+#include "expr.h"
+#include "output.h"
+
+#define min(A,B)	((A) < (B) ? (A) : (B))
+#define max(A,B)	((A) > (B) ? (A) : (B))
+
+/* Expand a block clear operation, and return 1 if successful.  Return 0
+   if we should let the compiler generate normal code.
+
+   operands[0] is the destination
+   operands[1] is the length
+   operands[3] is the alignment */
+
+int
+expand_block_clear (rtx operands[])
+{
+  rtx orig_dest = operands[0];
+  rtx bytes_rtx	= operands[1];
+  rtx align_rtx = operands[3];
+  bool constp	= (GET_CODE (bytes_rtx) == CONST_INT);
+  HOST_WIDE_INT align;
+  HOST_WIDE_INT bytes;
+  int offset;
+  int clear_bytes;
+  int clear_step;
+
+  /* If this is not a fixed size move, just call memcpy */
+  if (! constp)
+return 0;
+
+  /* This must be a fixed size alignment  */
+  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
+  align = INTVAL (align_rtx) * BITS_PER_UNIT;
+
+  /* Anything to cl

[PATCH rs6000] remove implicit static var outputs of toc_relative_expr_p

2017-06-27 Thread Aaron Sawdey
So, this is to set things up so I can in a future patch separate out
the code that deals with optimizing byte swapping for LE on P8 into a
separate file.

The function toc_relative_expr_p implicitly sets two static vars
(tocrel_base and tocrel_offset) that are declared in rs6000.c. The real
purpose of this is to communicate between
print_operand/print_operand_address and rs6000_output_addr_const_extra,
which is called through the asm_out hook vector by something in the
call tree under output_addr_const.

This patch changes toc_relative_expr_p to make tocrel_base and
tocrel_offset be explicit const_rtx * args. All of the calls other than
print_operand/print_operand_address are changed to have local const_rtx
vars that are passed in. The statics in rs6000.c are now called
tocrel_base_oac and tocrel_offset_oac to reflect their use to
communicate across output_addr_const, and that is now the only thing
they are used for.
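
To make the flow concrete, this is the path the renamed statics serve,
as described above (a sketch using names from the patch, not literal
code):

/* print_operand ()                        -- target code
     -> toc_relative_expr_p (op, strict, &base, &offset)
     -> tocrel_base_oac = base; tocrel_offset_oac = offset;
     -> output_addr_const ()               -- target-independent
          -> rs6000_output_addr_const_extra ()
             -- reads tocrel_base_oac / tocrel_offset_oac  */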

Bootstrap and regtest passes in trunk 249639 (to avoid the bootstrap
fail), ok for trunk?


2017-06-27  Aaron Sawdey  

* config/rs6000/rs6000.c (toc_relative_expr_p): Make tocrel_base
and tocrel_offset be pointer args rather than implicitly using
static versions.
(legitimate_constant_pool_address_p, rs6000_emit_move,
const_load_sequence_p, adjust_vperm): Add local tocrel_base and
tocrel_offset and use in toc_relative_expr_p call.
(print_operand, print_operand_address): Use static tocrel_base_oac
and tocrel_offset_oac.
(rs6000_output_addr_const_extra): Use static tocrel_base_oac and
tocrel_offset_oac.


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000-protos.h
===
--- gcc/config/rs6000/rs6000-protos.h	(revision 249639)
+++ gcc/config/rs6000/rs6000-protos.h	(working copy)
@@ -40,7 +40,7 @@
 extern int small_data_operand (rtx, machine_mode);
 extern bool mem_operand_gpr (rtx, machine_mode);
 extern bool mem_operand_ds_form (rtx, machine_mode);
-extern bool toc_relative_expr_p (const_rtx, bool);
+extern bool toc_relative_expr_p (const_rtx, bool, const_rtx *, const_rtx *);
 extern void validate_condition_mode (enum rtx_code, machine_mode);
 extern bool legitimate_constant_pool_address_p (const_rtx, machine_mode,
 		bool);
Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c	(revision 249639)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -8628,18 +8628,25 @@
 	  && ASM_OUTPUT_SPECIAL_POOL_ENTRY_P (get_pool_constant (base), Pmode));
 }
 
-static const_rtx tocrel_base, tocrel_offset;
+/* These are only used to pass through from print_operand/print_operand_address
+ * to rs6000_output_addr_const_extra over the intervening function 
+ * output_addr_const which is not target code.  */
+static const_rtx tocrel_base_oac, tocrel_offset_oac;
 
 /* Return true if OP is a toc pointer relative address (the output
of create_TOC_reference).  If STRICT, do not match non-split
-   -mcmodel=large/medium toc pointer relative addresses.  */
+   -mcmodel=large/medium toc pointer relative addresses.  Places base 
+   and offset pieces in TOCREL_BASE and TOCREL_OFFSET respectively.  */
 
 bool
-toc_relative_expr_p (const_rtx op, bool strict)
+toc_relative_expr_p (const_rtx op, bool strict, const_rtx *tocrel_base,
+		 const_rtx *tocrel_offset)
 {
   if (!TARGET_TOC)
 return false;
 
+  gcc_assert (tocrel_base != NULL && tocrel_offset != NULL);
+
   if (TARGET_CMODEL != CMODEL_SMALL)
 {
   /* When strict ensure we have everything tidy.  */
@@ -8655,16 +8662,16 @@
 	op = XEXP (op, 1);
 }
 
-  tocrel_base = op;
-  tocrel_offset = const0_rtx;
+  *tocrel_base = op;
+  *tocrel_offset = const0_rtx;
   if (GET_CODE (op) == PLUS && add_cint_operand (XEXP (op, 1), GET_MODE (op)))
 {
-  tocrel_base = XEXP (op, 0);
-  tocrel_offset = XEXP (op, 1);
+  *tocrel_base = XEXP (op, 0);
+  *tocrel_offset = XEXP (op, 1);
 }
 
-  return (GET_CODE (tocrel_base) == UNSPEC
-	  && XINT (tocrel_base, 1) == UNSPEC_TOCREL);
+  return (GET_CODE (*tocrel_base) == UNSPEC
+	  && XINT (*tocrel_base, 1) == UNSPEC_TOCREL);
 }
 
 /* Return true if X is a constant pool address, and also for cmodel=medium
@@ -8674,7 +8681,8 @@
 legitimate_constant_pool_address_p (const_rtx x, machine_mode mode,
 bool strict)
 {
-  return (toc_relative_expr_p (x, strict)
+  const_rtx tocrel_base, tocrel_offset;
+  return (toc_relative_expr_p (x, strict, &tocrel_base, &tocrel_offset)
 	  && (TARGET_CMODEL != CMODEL_MEDIUM
 	  || constant_pool_expr_p (XVECEXP (tocrel_base, 0, 0))
 	  || mode == QImode
@@ -11055,6 +11063,7 @@
   /* If this is a SYMBOL_REF that refers to a constant pool entry,
 	 and we hav

Re: [PATCH rs6000] remove implicit static var outputs of toc_relative_expr_p

2017-06-28 Thread Aaron Sawdey
Hi Segher,

On Tue, 2017-06-27 at 18:35 -0500, Segher Boessenkool wrote:
> Hi Aaron,
> 
> On Tue, Jun 27, 2017 at 11:43:57AM -0500, Aaron Sawdey wrote:
> > The function toc_relative_expr_p implicitly sets two static vars
> > (tocrel_base and tocrel_offset) that are declared in rs6000.c. The
> > real purpose of this is to communicate between
> > print_operand/print_operand_address and rs6000_output_addr_const_extra,
> > which is called through the asm_out hook vector by something in the
> > call tree under output_addr_const.
> > 
> > This patch changes toc_relative_expr_p to make tocrel_base and
> > tocrel_offset be explicit const_rtx * args. All of the calls other
> > than print_operand/print_operand_address are changed to have local
> > const_rtx vars that are passed in.
> 
> If those locals aren't used, can you arrange to call
> toc_relative_expr_p with NULL instead?  Or are they always used?

There are calls where neither is used or only tocrel_base is used.

> 
> > The statics in rs6000.c are now called tocrel_base_oac and
> > tocrel_offset_oac to reflect their use to communicate across
> > output_addr_const, and that is now the only thing they are used for.
> 
> Can't say I like those names, very cryptical.  Not that I know
> something better, the short names as they were weren't very nice
> either.
> 
> > --- gcc/config/rs6000/rs6000.c  (revision 249639)
> > +++ gcc/config/rs6000/rs6000.c  (working copy)
> > @@ -8628,18 +8628,25 @@
> >  	  && ASM_OUTPUT_SPECIAL_POOL_ENTRY_P (get_pool_constant (base), Pmode));
> >  }
> >  
> > -static const_rtx tocrel_base, tocrel_offset;
> > +/* These are only used to pass through from print_operand/print_operand_address
> > + * to rs6000_output_addr_const_extra over the intervening function 
> > + * output_addr_const which is not target code.  */
> 
> No leading * in a block comment please.  (And you have a trailing
> space).
> 
> > +static const_rtx tocrel_base_oac, tocrel_offset_oac;
> >  
> >  /* Return true if OP is a toc pointer relative address (the output
> >     of create_TOC_reference).  If STRICT, do not match non-split
> > -   -mcmodel=large/medium toc pointer relative addresses.  */
> > +   -mcmodel=large/medium toc pointer relative addresses.  Places base 
> > +   and offset pieces in TOCREL_BASE and TOCREL_OFFSET respectively.  */
> 
> s/Places/Place/ (and another trailing space).
> 
> > -  tocrel_base = op;
> > -  tocrel_offset = const0_rtx;
> > +  *tocrel_base = op;
> > +  *tocrel_offset = const0_rtx;
> >    if (GET_CODE (op) == PLUS && add_cint_operand (XEXP (op, 1), GET_MODE (op)))
> >      {
> > -  tocrel_base = XEXP (op, 0);
> > -  tocrel_offset = XEXP (op, 1);
> > +  *tocrel_base = XEXP (op, 0);
> > +  *tocrel_offset = XEXP (op, 1);
> >  }
> 
> Maybe write this as
> 
>   if (GET_CODE (op) == PLUS && add_cint_operand (XEXP (op, 1), GET_MODE (op)))
>     {
>       *tocrel_base = XEXP (op, 0);
>       *tocrel_offset = XEXP (op, 1);
>     }
>   else
>     {
>       *tocrel_base = op;
>       *tocrel_offset = const0_rtx;
>     }
> 
> or, if you allow NULL pointers,
> 
>   bool with_offset = GET_CODE (op) == PLUS
>                      && add_cint_operand (XEXP (op, 1), GET_MODE (op));
>   if (tocrel_base)
>     *tocrel_base = with_offset ? XEXP (op, 0) : op;
>   if (tocrel_offset)
>     *tocrel_offset = with_offset ? XEXP (op, 1) : const0_rtx;
> 
> or such.
> 
> > -  return (GET_CODE (tocrel_base) == UNSPEC
> > -     && XINT (tocrel_base, 1) == UNSPEC_TOCREL);
> > +  return (GET_CODE (*tocrel_base) == UNSPEC
> > +     && XINT (*tocrel_base, 1) == UNSPEC_TOCREL);
> 
> Well, and then you have this, so you need to assign tocrel_base to a
> local as well.
> 
> >  legitimate_constant_pool_address_p (const_rtx x, machine_mode mode,
> >  				    bool strict)
> >  {
> > -  return (toc_relative_expr_p (x, strict)
> > +  const_rtx tocrel_base, tocrel_offset;
> > +  return (toc_relative_expr_p (x, strict, &tocrel_base, &tocrel_offset)
> 
> For example here it seems nothing uses tocrel_base?
> 
> It is probably nicer to have a separate function for toc_relative_expr_p
> and one to pull the base/offset out.  And maybe don't keep it cached for
> the output function either?  It has all info it needs, ri

Re: [PATCH rs6000] remove implicit static var outputs of toc_relative_expr_p

2017-06-29 Thread Aaron Sawdey
On Wed, 2017-06-28 at 18:19 -0500, Segher Boessenkool wrote:
> On Wed, Jun 28, 2017 at 03:21:49PM -0500, Aaron Sawdey wrote:
> > -toc_relative_expr_p (const_rtx op, bool strict)
> > +toc_relative_expr_p (const_rtx op, bool strict, const_rtx *tocrel_base_ret,
> > +                     const_rtx *tocrel_offset_ret)
> >  {
> >    if (!TARGET_TOC)
> >      return false;
> > -  tocrel_base = op;
> > -  tocrel_offset = const0_rtx;
> > +  const_rtx tocrel_base = op;
> > +  const_rtx tocrel_offset = const0_rtx;
> > +
> >    if (GET_CODE (op) == PLUS && add_cint_operand (XEXP (op, 1), GET_MODE (op)))
> >      {
> >        tocrel_base = XEXP (op, 0);
> > -      tocrel_offset = XEXP (op, 1);
> > +      if (tocrel_offset_ret)
> > +        tocrel_offset = XEXP (op, 1);
> 
> Lose the "if"?  Or do you get a compiler warning then?

I was just trying to avoid unnecessary work in the case where the
pointer is NULL. In that case tocrel_offset isn't actually used for
anything. Probably I should just let the compiler figure that one out,
I will delete the if for clarity.

> > @@ -8674,7 +8686,8 @@
> >  legitimate_constant_pool_address_p (const_rtx x, machine_mode mode,
> >  				    bool strict)
> >  {
> > -  return (toc_relative_expr_p (x, strict)
> > +  const_rtx tocrel_base, tocrel_offset;
> > +  return (toc_relative_expr_p (x, strict, &tocrel_base, &tocrel_offset)
> >  	  && (TARGET_CMODEL != CMODEL_MEDIUM
> >  	      || constant_pool_expr_p (XVECEXP (tocrel_base, 0, 0))
> >  	      || mode == QImode
> 
> Use NULL for the args here, instead?

The diff didn't include all the context. Both tocrel_base and
tocrel_offset are used in the function:

bool
legitimate_constant_pool_address_p (const_rtx x, machine_mode mode,
                                    bool strict)
{
  const_rtx tocrel_base, tocrel_offset;
  return (toc_relative_expr_p (x, strict, &tocrel_base, &tocrel_offset)
          && (TARGET_CMODEL != CMODEL_MEDIUM
              || constant_pool_expr_p (XVECEXP (tocrel_base, 0, 0))
              || mode == QImode
              || offsettable_ok_by_alignment (XVECEXP (tocrel_base, 0, 0),
                                              INTVAL (tocrel_offset), mode)));
}


> 
> The patch is okay for trunk with those things taken care of.  Thanks,
> 
> 
> Segher
> 
-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH] make canonicalize_condition keep its promise

2017-11-15 Thread Aaron Sawdey
So, the story of this very small patch starts with me adding patterns
for ppc instructions bdz[tf] and bdnz[tf] such as this:

  [(set (pc)
        (if_then_else
          (and
            (ne (match_operand:P 1 "register_operand" "c,*b,*b,*b")
                (const_int 1))
            (match_operator 3 "branch_comparison_operator"
              [(match_operand 4 "cc_reg_operand" "y,y,y,y")
               (const_int 0)]))
          (label_ref (match_operand 0))
          (pc)))
   (set (match_operand:P 2 "nonimmediate_operand" "=1,*r,m,*d*wi*c*l")
        (plus:P (match_dup 1)
                (const_int -1)))
   (clobber (match_scratch:P 5 "=X,X,&r,r"))
   (clobber (match_scratch:CC 6 "=X,&y,&y,&y"))
   (clobber (match_scratch:CCEQ 7 "=X,&y,&y,&y"))]

However when this gets to the loop_doloop pass, we get an assert fail
in iv_number_of_iterations():

  gcc_assert (COMPARISON_P (condition));

This is happening because this branch insn tests two things ANDed
together so the and is at the top of the expression, not a comparison.
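
Concretely, the condition that comes back for this pattern has roughly
this shape (illustrative RTL, not actual compiler dump output):

  (and (ne (reg:DI ctr) (const_int 1))
       (eq (reg:CC cc) (const_int 0)))

The root is an AND rather than a comparison, so COMPARISON_P fails on
it.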

This condition is extracted from the insn by get_condition() which is
pretty straightforward, and which calls canonicalize_condition() before
returning it. Now, one could put a test for a jump condition that is
not a conditional test in here but the comment for
canonicalize_condition() says:

   (1) The code will always be a comparison operation (EQ, NE, GT, etc.).

So, this patch adds a test at the end that just returns 0 if the return
rtx is not a comparison. As it happens, doloop conversion is not needed
here because I'm already generating rtl for a branch-decrement counter
based loop.

If there is a better way to go about this please let me know and I'll
revise/retest.

Bootstrap and regtest pass on ppc64le and x86_64. Ok for trunk?

Thanks,
Aaron


2017-11-15  Aaron Sawdey  

* rtlanal.c (canonicalize_condition): Return 0 if final rtx
does not have a conditional at the top.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC ToolchainIndex: gcc/rtlanal.c
===
--- gcc/rtlanal.c   (revision 254553)
+++ gcc/rtlanal.c   (working copy)
@@ -5623,7 +5623,11 @@
   if (CC0_P (op0))
     return 0;
 
-  return gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
+  /* We promised to return a comparison.  */
+  rtx ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
+  if (COMPARISON_P (ret))
+    return ret;
+  return 0;
 }
 
 /* Given a jump insn JUMP, return the condition that will cause it to branch


Re: [PATCH] make canonicalize_condition keep its promise

2017-11-20 Thread Aaron Sawdey
On Sun, 2017-11-19 at 16:44 -0700, Jeff Law wrote:
> On 11/15/2017 08:40 AM, Aaron Sawdey wrote:
> > So, the story of this very small patch starts with me adding patterns
> > for ppc instructions bdz[tf] and bdnz[tf] such as this:
> > 
> >   [(set (pc)
> >         (if_then_else
> >           (and
> >             (ne (match_operand:P 1 "register_operand" "c,*b,*b,*b")
> >                 (const_int 1))
> >             (match_operator 3 "branch_comparison_operator"
> >               [(match_operand 4 "cc_reg_operand" "y,y,y,y")
> >                (const_int 0)]))
> >           (label_ref (match_operand 0))
> >           (pc)))
> >    (set (match_operand:P 2 "nonimmediate_operand" "=1,*r,m,*d*wi*c*l")
> >         (plus:P (match_dup 1)
> >                 (const_int -1)))
> >    (clobber (match_scratch:P 5 "=X,X,&r,r"))
> >    (clobber (match_scratch:CC 6 "=X,&y,&y,&y"))
> >    (clobber (match_scratch:CCEQ 7 "=X,&y,&y,&y"))]
> > 
> > However when this gets to the loop_doloop pass, we get an assert fail
> > in iv_number_of_iterations():
> > 
> >   gcc_assert (COMPARISON_P (condition));
> > 
> > This is happening because this branch insn tests two things ANDed
> > together so the and is at the top of the expression, not a comparison.
> 
> Is this something you've created for an existing loop?  Presumably an
> existing loop that previously was a simple loop?

The rtl to use this instruction is generated by new code I'm working on
to do a builtin expansion of memcmp using a loop. I call
gen_bdnztf_di() to generate rtl for the insn. It would be nice to be
able to generate this instruction from doloop conversion but that is
beyond the scope of what I'm working on presently.

> > 
> > This condition is extracted from the insn by get_condition() which is
> > pretty straightforward, and which calls canonicalize_condition()
> > before returning it. Now, one could put a test for a jump condition
> > that is not a conditional test in here but the comment for
> > canonicalize_condition() says:
> > 
> >    (1) The code will always be a comparison operation (EQ, NE, GT,
> >    etc.).
> > 
> > So, this patch adds a test at the end that just returns 0 if the
> > return rtx is not a comparison. As it happens, doloop conversion is
> > not needed here because I'm already generating rtl for a
> > branch-decrement counter based loop.
> 
> I'm not at all familiar with this code, but I would probably guess that
> COND is supposed to appear within INSN.  Canonicalize comparison is
> supposed to modify the COND expression that appears within INSN.  The
> overall structure of INSN doesn't much matter as long as COND refers to
> a comparison code with two operands.
> 
> My gut tells me the problem is upstream or downstream in the call chain
> or that you've taken a loop that was considered a simple loop and
> modified the exit test in a way that other code is not prepared to
> handle.

Yes, get_condition() is the other place that this could be fixed. It
expects rtl of the form:

(set (pc) (if_then_else (cond ...

and here it's seeing something of the form:

(set (pc) (if_then_else (and (cond ... ) (cond ...)

It seems legitimate to add a check for this in get_condition() as well:

  cond = XEXP (SET_SRC (set), 0);
  if (!COMPARISON_P (cond))
return NULL_RTX;

There are a couple of uses of canonicalize_condition() in ifcvt.c and
it looks like the code there may expect the return to be a conditional.
This suggests it might be a good idea to do both checks.

Thanks,
   Aaron

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH] make canonicalize_condition keep its promise

2017-11-21 Thread Aaron Sawdey
On Tue, 2017-11-21 at 10:06 -0700, Jeff Law wrote:
> On 11/20/2017 06:41 AM, Aaron Sawdey wrote:
> > On Sun, 2017-11-19 at 16:44 -0700, Jeff Law wrote:
> > > On 11/15/2017 08:40 AM, Aaron Sawdey wrote:
> > > > So, the story of this very small patch starts with me adding
> > > > patterns for ppc instructions bdz[tf] and bdnz[tf] such as this:
> > > > 
> > > >   [(set (pc)
> > > >         (if_then_else
> > > >           (and
> > > >             (ne (match_operand:P 1 "register_operand" "c,*b,*b,*b")
> > > >                 (const_int 1))
> > > >             (match_operator 3 "branch_comparison_operator"
> > > >               [(match_operand 4 "cc_reg_operand" "y,y,y,y")
> > > >                (const_int 0)]))
> > > >           (label_ref (match_operand 0))
> > > >           (pc)))
> > > >    (set (match_operand:P 2 "nonimmediate_operand" "=1,*r,m,*d*wi*c*l")
> > > >         (plus:P (match_dup 1)
> > > >                 (const_int -1)))
> > > >    (clobber (match_scratch:P 5 "=X,X,&r,r"))
> > > >    (clobber (match_scratch:CC 6 "=X,&y,&y,&y"))
> > > >    (clobber (match_scratch:CCEQ 7 "=X,&y,&y,&y"))]
> > > > 
> > > > However when this gets to the loop_doloop pass, we get an assert
> > > > fail in iv_number_of_iterations():
> > > > 
> > > >   gcc_assert (COMPARISON_P (condition));
> > > > 
> > > > This is happening because this branch insn tests two things ANDed
> > > > together so the and is at the top of the expression, not a
> > > > comparison.
> > > 
> > > Is this something you've created for an existing loop?  Presumably
> > > an existing loop that previously was a simple loop?
> > 
> > The rtl to use this instruction is generated by new code I'm working
> > on to do a builtin expansion of memcmp using a loop. I call
> > gen_bdnztf_di() to generate rtl for the insn. It would be nice to be
> > able to generate this instruction from doloop conversion but that is
> > beyond the scope of what I'm working on presently.
> 
> Understood.
> 
> So what I think (and I'm hoping you can confirm one way or the other)
> is that by generating this instruction you're taking a loop which
> previously was considered a simple loop by the IV code and turning it
> into something the IV bits no longer think is a simple loop.
> 
> I think that's problematical as when the loop is thought to be a
> simple loop, it has to have a small number of forms for its loop
> back/exit loop tests, and whether or not a loop is a simple loop is
> cached in the loop structure.
> 
> I think we need to dig into that first.  If my suspicion is correct
> then this patch is really just papering over that deeper problem.  So
> I think you need to dig a bit deeper into why you're getting into the
> code in question (canonicalize_condition) and whether or not the call
> chain makes any sense given the changes you've made to the loop.
> 

Jeff,
  There is no existing loop structure. This starts with a memcmp() call
and then goes down through the builtin expansion mechanism, which is
ultimately expanding the pattern cmpmemsi which is where my code is
generating a loop that finishes with bdnzt. The code that's ultimately
generated looks like this:

srdi 9,10,4
li 6,0
mtctr 9
li 4,8
.L9:
ldbrx 8,11,6
ldbrx 9,7,6
ldbrx 5,11,4
ldbrx 3,7,4
addi 6,6,16
addi 4,4,16
subfc. 9,9,8
bne 0,.L4
subfc. 9,3,5
bdnzt 2,.L9

So it is a loop with a branch out, and then the branch decrement nz
true back to the top. The iv's here (regs 4 and 6) were generated by my
expansion code. 

I really think the ultimate problem here is that both
canonicalize_condition and get_condition promise in their documenting
comments that they will return something that has a cond at the root of
the rtx, or 0 if they don't understand what they're given. In this case
they do not understand the rtx of bdnzt and are returning rtx rooted
with an and, not a cond. This may seem like papering over the problem,
but I think it is legitimate for these functions to return 0 when the
branch insn in question does not have a simple cond at the heart of it.
And bootstrap/regtest did pass with my patch on ppc64le and x86_64.
Ultimately, yes something better ought to be done here.

Thanks,
   Aaron

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH] make canonicalize_condition keep its promise

2017-11-29 Thread Aaron Sawdey
On Tue, 2017-11-21 at 11:45 -0600, Aaron Sawdey wrote:
> On Tue, 2017-11-21 at 10:06 -0700, Jeff Law wrote:
> > On 11/20/2017 06:41 AM, Aaron Sawdey wrote:
> > > On Sun, 2017-11-19 at 16:44 -0700, Jeff Law wrote:
> > > > On 11/15/2017 08:40 AM, Aaron Sawdey wrote:
> > > > > So, the story of this very small patch starts with me adding
> > > > > patterns
> > > > > for ppc instructions bdz[tf] and bdnz[tf] such as this:
> > > > > 
> > > > >   [(set (pc)
> > > > >   (if_then_else
> > > > >     (and
> > > > >    (ne (match_operand:P 1 "register_operand"
> > > > > "c,*b,*b,*b")
> > > > >    (const_int 1))
> > > > >    (match_operator 3 "branch_comparison_operator"
> > > > >     [(match_operand 4 "cc_reg_operand"
> > > > > "y,y,y,y")
> > > > >      (const_int 0)]))
> > > > >     (label_ref (match_operand 0))
> > > > >     (pc)))
> > > > >    (set (match_operand:P 2 "nonimmediate_operand"
> > > > > "=1,*r,m,*d*wi*c*l")
> > > > >   (plus:P (match_dup 1)
> > > > >   (const_int -1)))
> > > > >    (clobber (match_scratch:P 5 "=X,X,&r,r"))
> > > > >    (clobber (match_scratch:CC 6 "=X,&y,&y,&y"))
> > > > >    (clobber (match_scratch:CCEQ 7 "=X,&y,&y,&y"))]
> > > > > 
> > > > > However when this gets to the loop_doloop pass, we get an
> > > > > assert
> > > > > fail
> > > > > in iv_number_of_iterations():
> > > > > 
> > > > >   gcc_assert (COMPARISON_P (condition));
> > > > > 
> > > > > This is happening because this branch insn tests two things
> > > > > ANDed
> > > > > together so the and is at the top of the expression, not a
> > > > > comparison.
> > > > 
> > > > Is this something you've created for an existing
> > > > loop?  Presumably an
> > > > existing loop that previously was a simple loop?
> > > 
> > > The rtl to use this instruction is generated by new code I'm
> > > working on
> > > to do a builtin expansion of memcmp using a loop. I call
> > > gen_bdnztf_di() to generate rtl for the insn. It would be nice to
> > > be
> > > able to generate this instruction from doloop conversion but that
> > > is
> > > beyond the scope of what I'm working on presently.
> > 
> > Understood.
> > 
> > So what I think (and I'm hoping you can confirm one way or the
> > other)
> > is
> > that by generating this instruction you're taking a loop which
> > previously was considered a simple loop by the IV code and turning
> > it
> > into something the IV bits no longer think is a simple loop.
> > 
> > I think that's problematical as when the loop is thought to be a
> > simple
> > loop, it has to have a small number of forms for its loop back/exit
> > loop
> > tests and whether or not a loop is a simple loop is cached in the
> > loop
> > structure.
> > 
> > I think we need to dig into that first.  If my suspicion is correct
> > then
> > this patch is really just papering over that deeper problem.  So I
> > think
> > you need to dig a bit deeper into why you're getting into the code
> > in
> > question (canonicalize_condition) and whether or not the call chain
> > makes any sense given the changes you've made to the loop.
> > 
> 
> Jeff,
>   There is no existing loop structure. This starts with a memcmp()
> call
> and then goes down through the builtin expansion mechanism, which is
> ultimately expanding the pattern cmpmemsi which is where my code is
> generating a loop that finishes with bdnzt. The code that's
> ultimately
> generated looks like this:
> 
> srdi 9,10,4
> li 6,0
> mtctr 9
> li 4,8
> .L9:
> ldbrx 8,11,6
> ldbrx 9,7,6
> ldbrx 5,11,4
> ldbrx 3,7,4
> addi 6,6,16
> addi 4,4,16
> subfc. 9,9,8
> bne 0,.L4
> subfc. 9,3,5
> bdnzt 2,.L9
> 
> So it is a loop with a branch out, and then the branch decrement nz
> true

[PATCH] rs6000: Cleanup bdz/bdnz insn/splitter, add new insn/splitter for bdzt/bdzf/bdnzt/bdnzf

2017-11-30 Thread Aaron Sawdey
This does some cleanup/consolidation so that bdz/bdnz are supported by
a single insn and splitter, and adds a new insn and splitter to support
the conditional form of those (bdzt/bdzf/bdnzt/bdnzf).

This is going to be used for the memcmp() builtin expansion patch which
is next. That also will require the change to canonicalize_condition I
posted before Thanksgiving to prevent doloop from being confused by
bdnzt et al.

Bootstrap/regtest passes on ppc64le. OK for trunk?


2017-11-30  Aaron Sawdey  

* config/rs6000/rs6000.md (cceq_ior_compare): Remove * so I can use it
to generate rtl.
(cceq_ior_compare_complement): Give it a name so I can use it, and
change boolean_or_operator predicate to boolean_operator so it can
be used to generate a crand.
Define new code iterator eqne, and new code_attrs bd/bd_neg.
(<bd>_<mode>) New name for ctr_internal[12] now combined into
a single define_insn. There is now just a single splitter for this
that looks at whether the decremented ctr result is going to a register
or memory and generates the appropriate rtl.
(<bd>tf_<mode>) A new insn pattern for the conditional form branch
decrement (bdnzt/bdnzf/bdzt/bdzf). This also has a splitter similar
to the one for <bd>_<mode>.
* config/rs6000/rs6000.c (rs6000_legitimate_combined_insn): Updated
with the new names of the branch decrement patterns, and added the
names of the branch decrement conditional patterns.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c	(revision 254553)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -9056,10 +9056,14 @@
  for the difficult case.  It's better to not create problems
  in the first place.  */
   if (icode != CODE_FOR_nothing
-  && (icode == CODE_FOR_ctrsi_internal1
-	  || icode == CODE_FOR_ctrdi_internal1
-	  || icode == CODE_FOR_ctrsi_internal2
-	  || icode == CODE_FOR_ctrdi_internal2))
+  && (icode == CODE_FOR_bdz_si
+	  || icode == CODE_FOR_bdz_di
+	  || icode == CODE_FOR_bdnz_si
+	  || icode == CODE_FOR_bdnz_di
+	  || icode == CODE_FOR_bdztf_si
+	  || icode == CODE_FOR_bdnztf_si
+	  || icode == CODE_FOR_bdztf_di
+	  || icode == CODE_FOR_bdnztf_di))
 return false;
 
   return true;
Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md	(revision 254553)
+++ gcc/config/rs6000/rs6000.md	(working copy)
@@ -12744,7 +12744,7 @@
 ; which are generated by the branch logic.
 ; Prefer destructive operations where BT = BB (for crXX BT,BA,BB)
 
-(define_insn "*cceq_ior_compare"
+(define_insn "cceq_ior_compare"
   [(set (match_operand:CCEQ 0 "cc_reg_operand" "=y,?y")
 (compare:CCEQ (match_operator:SI 1 "boolean_operator"
 	[(match_operator:SI 2
@@ -12764,9 +12764,9 @@
 
 ; Why is the constant -1 here, but 1 in the previous pattern?
 ; Because ~1 has all but the low bit set.
-(define_insn ""
+(define_insn "cceq_ior_compare_complement"
   [(set (match_operand:CCEQ 0 "cc_reg_operand" "=y,?y")
-(compare:CCEQ (match_operator:SI 1 "boolean_or_operator"
+(compare:CCEQ (match_operator:SI 1 "boolean_operator"
 	[(not:SI (match_operator:SI 2
   "branch_positive_comparison_operator"
   [(match_operand 3
@@ -12983,34 +12983,13 @@
 ;; rs6000_legitimate_combined_insn prevents combine creating any of
 ;; the ctr insns.
 
-(define_insn "ctr_internal1"
-  [(set (pc)
-	(if_then_else (ne (match_operand:P 1 "register_operand" "c,*b,*b,*b")
-			  (const_int 1))
-		  (label_ref (match_operand 0))
-		  (pc)))
-   (set (match_operand:P 2 "nonimmediate_operand" "=1,*r,m,*d*wi*c*l")
-	(plus:P (match_dup 1)
-		(const_int -1)))
-   (clobber (match_scratch:CC 3 "=X,&x,&x,&x"))
-   (clobber (match_scratch:P 4 "=X,X,&r,r"))]
-  ""
-{
-  if (which_alternative != 0)
-return "#";
-  else if (get_attr_length (insn) == 4)
-return "bdnz %l0";
-  else
-return "bdz $+8\;b %l0";
-}
-  [(set_attr "type" "branch")
-   (set_attr "length" "*,16,20,20")])
+(define_code_iterator eqne [eq ne])
+(define_code_attr bd [(ne "bdnz") (eq "bdz")])
+(define_code_attr bd_neg [(ne "bdz") (eq "bdnz")])
 
-;; Similar but use EQ
-
-(define_insn "ctr_internal2"
+(define_insn "_"
   [(set (pc)
-	(if_then_else (eq (match_operand:P 1 "register_operand" "c,*b,*b,*b")
+	(

[PATCH, rs6000] generate loop code for memcmp inline expansion

2017-12-11 Thread Aaron Sawdey
This patch builds on two previously posted patches:

https://gcc.gnu.org/ml/gcc-patches/2017-11/msg01216.html
https://gcc.gnu.org/ml/gcc-patches/2017-11/msg02599.html

The previous patches allow the use of bdnzt.

This patch allows the cmpstrsi pattern expansion to handle longer blocks without
increasing code size by using a loop:

.L13:
ldbrx 7,3,6
ldbrx 9,10,6
ldbrx 0,3,5
ldbrx 4,10,5
addi 6,6,16
addi 5,5,16
subfc. 9,9,7
bne 0,.L10
subfc. 9,4,0
bdnzt 2,.L13

Performance testing on P8 showed that unroll x2 with 2 IVs was the way to go,
and it gives the same performance as the previous non-loop inline code for a
32-byte compare. Also on P8 the performance crossover with calling glibc memcmp
happens around a length of 192 bytes. The loop expansion code can also be
generated when the length is not known at compile time.

The gcc.dg/memcmp-1.c and gcc.dg/strncmp-2.c test cases are updated by the
patch to check the additional memcmp expansion for known and unknown lengths.

The undocumented option -mblock-compare-inline-limit was changed slightly to
mean how many bytes will be compared by the non-loop inline code; the default
is 31 bytes. As previously, -mblock-compare-inline-limit=0 will disable all
inline expansion of memcmp(). The new -mblock-compare-inline-loop-limit option
sets the upper limit; for lengths above that, no expansion is done if the
length is known at compile time. If the length is unknown, the generated code
calls glibc memcmp() after comparing that many bytes.
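
To make the knobs concrete, here is a hypothetical illustration (the sizes
are chosen relative to the defaults described above; they are not taken from
the patch itself):

  /* Assuming -mblock-compare-inline-limit=31 and a loop limit >= 128.  */
  int small (const char *x, const char *y)
  {
    return __builtin_memcmp (x, y, 16);   /* non-loop straight-line code */
  }
  int medium (const char *x, const char *y)
  {
    return __builtin_memcmp (x, y, 128);  /* inline compare loop */
  }
  int unknown (const char *x, const char *y, unsigned long n)
  {
    return __builtin_memcmp (x, y, n);    /* inline loop, then a call to
                                             glibc memcmp() for any
                                             remainder past the limit */
  }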

Bootstrap and regtest pass on P8 LE and P9 LE, testing in progress for
BE (default, P7, and P8). If tests pass, OK for trunk when the first of the
previously posted patches is approved?

Thanks,
Aaron


2017-12-11  Aaron Sawdey  

* config/rs6000/rs6000-string.c (do_load_for_compare_from_addr): New
function.
(do_ifelse): New function.
(do_isel): New function.
(do_sub3): New function.
(do_add3): New function.
(do_load_mask_compare): New function.
(do_overlap_load_compare): New function.
(expand_compare_loop): New function.
(expand_block_compare): Call expand_compare_loop() when appropriate.
* config/rs6000/rs6000.opt (-mblock-compare-inline-limit): Change
option description.
(-mblock-compare-inline-loop-limit): New option.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c	(revision 255516)
+++ gcc/config/rs6000/rs6000-string.c	(working copy)
@@ -301,6 +301,923 @@
   return MIN (base_align, offset & -offset);
 }
 
+/* Prepare address and then do a load.
+
+   MODE is the mode to use for the load.
+   DEST is the destination register for the data.
+   ADDR is the address to be loaded.
+   ORIG_ADDR is the original address expression.  */
+static void
+do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
+			   rtx orig_addr)
+{
+  rtx mem = gen_rtx_MEM (mode, addr);
+  MEM_COPY_ATTRIBUTES (mem, orig_addr);
+  set_mem_size (mem, GET_MODE_SIZE (mode));
+  do_load_for_compare (dest, mem, mode);
+  return;
+}
+
+/* Do a branch for an if/else decision.
+
+   CMPMODE is the mode to use for the comparison.
+   COMPARISON is the rtx code for the compare needed.
+   A is the first thing to be compared.
+   B is the second thing to be compared.
+   CR is the condition code reg input, or NULL_RTX.
+   TRUE_LABEL is the label to branch to if the condition is true.
+
+   The return value is the CR used for the comparison.
+   If CR is null_rtx, then a new register of CMPMODE is generated.
+   If A and B are both null_rtx, then CR must not be null, and the
+   compare is not generated so you can use this with a dot form insn.  */
+
+static rtx
+do_ifelse (machine_mode cmpmode, rtx_code comparison,
+	   rtx a, rtx b, rtx cr, rtx true_label)
+{
+  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
+	  || (a != NULL_RTX && b != NULL_RTX));
+
+  if (cr != NULL_RTX)
+gcc_assert (GET_MODE (cr) == cmpmode);
+  else
+cr = gen_reg_rtx (cmpmode);
+
+  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);
+
+  if (a != NULL_RTX)
+emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));
+
+  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);
+
+  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
+  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+  JUMP_LABEL (j) = true_label;
+  LABEL_NUSES (true_label) += 1;
+
+  return cr;
+}
+
+/* Emit an isel of the proper mode for DEST.
+
+   DEST is the isel destination register.
+   SRC1 is the isel source if CR is true.
+   SRC2 is the isel source if CR is false.
+   CR is the conditi

[PATCH][rs6000][PR target/82190] fix mem size info in rtl generated by memcmp and strncmp/strcmp builtin expansion

2017-12-12 Thread Aaron Sawdey
In the code I put in gcc7 for expanding memcmp and strcmp/strncmp as
builtins, I used set_mem_size to set the size of loads to only the
bytes I was actually going to compare. However this is really incorrect
and the test case for 82190 was failing because alias analysis believed
the bogus size and thought there was no conflict between an 8-byte load
used for comparing 6 bytes and a later store to the 7th byte. As a
result it eliminated that load from the 7-byte compare that followed
later.

This patch changes the set_mem_size calls in expand_block_move and
expand_strn_compare to set the size to the size of the load being done
regardless of how many bytes are being used.
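
In miniature, the bad interaction looks like this (a sketch of the pattern,
not the test case itself):

  /* The 6-byte compare expansion did an 8-byte load from dst but tagged
     the MEM with size 6, so alias analysis concluded the later store to
     dst[6] could not conflict, and the stale 8-byte value was reused.  */
  __builtin_memcpy (dst, src, 6);
  r1 = __builtin_memcmp (dst, src, 6);  /* 8-byte load, tagged as 6 bytes */
  __builtin_memcpy (dst, src, 7);       /* writes dst[6] */
  r2 = __builtin_memcmp (dst, src, 7);  /* may reuse the stale load */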

OK for trunk if bootstrap/regtest passes on ppc64le?

2017-12-12  Aaron Sawdey  

PR target/82190
* config/rs6000/rs6000-string.c (expand_block_move,
expand_strn_compare): fix set_mem_size() calls.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c	(revision 255585)
+++ gcc/config/rs6000/rs6000-string.c	(working copy)
@@ -1247,6 +1247,9 @@
   if (bytes > rs6000_block_move_inline_limit)
 return 0;
 
+  bool isP8 = (rs6000_cpu == PROCESSOR_POWER8);
+  bool isP9 = (rs6000_cpu == PROCESSOR_POWER9);
+
   for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
 {
   union {
@@ -1258,7 +1261,7 @@
 
   /* Altivec first, since it will be faster than a string move
 	 when it applies, and usually not significantly larger.  */
-  if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
+  if (TARGET_ALTIVEC && bytes >= 16 && (isP8 || isP9 || align >= 128))
 	{
 	  move_bytes = 16;
 	  mode = V4SImode;
Index: gcc/testsuite/gcc.dg/pr82190.c
===
--- gcc/testsuite/gcc.dg/pr82190.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/pr82190.c	(working copy)
@@ -0,0 +1,22 @@
+/* PR target/82190 */
+/* { dg-do run } */
+/* { dg-options "-O2 -fno-optimize-strlen -fweb" } */
+
+char src[64] __attribute__ ((aligned)) = "aaa";
+char dst[64] __attribute__ ((aligned));
+
+int
+main ()
+{
+  __builtin_memcpy (dst, src, 6);
+  if (__builtin_memcmp (dst, src, 6))
+__builtin_abort ();
+
+  __builtin_memcpy (dst, src, 7);
+  if (__builtin_memcmp (dst, src, 7))
+__builtin_abort ();
+
+  return 0;
+}
+
+


Re: [PATCH][rs6000][PR target/82190] fix mem size info in rtl generated by memcmp and strncmp/strcmp builtin expansion

2017-12-12 Thread Aaron Sawdey
On Tue, 2017-12-12 at 20:50 +0100, Jakub Jelinek wrote:
> On Tue, Dec 12, 2017 at 01:40:41PM -0600, Aaron Sawdey wrote:
> > 2017-12-12  Aaron Sawdey  
> > 
> > PR target/82190
> > * config/rs6000/rs6000-string.c (expand_block_move,
> > expand_strn_compare): fix set_mem_size() calls.
> 
> That should be capitalized: Fix instead of fix

[wrong patch deleted]

> Is this the patch you meant to attach?  First of all, it only changes
> expand_block_move, not expand_strn_compare, and the change seems more
> like an optimization of P8/P9 rather than an actual fix (otherwise,
> wouldn't
> it fail on say P7?).
> 
> > +  return 0;
> > +}
> > +
> > +
> 
> Please avoid unnecessary trailing whitespace.
> 

Jakub,
  Yes that is a different patch unrelated to the 82190 fix. I've
attached the correct patch.

Thanks!
   Aaron

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 255585)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -459,7 +459,7 @@
  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
  src1 = replace_equiv_address (src1, src1_reg);
}
-  set_mem_size (src1, cmp_bytes);
+  set_mem_size (src1, load_mode_size);
 
   if (!REG_P (XEXP (src2, 0)))
{
@@ -466,7 +466,7 @@
  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
  src2 = replace_equiv_address (src2, src2_reg);
}
-  set_mem_size (src2, cmp_bytes);
+  set_mem_size (src2, load_mode_size);
 
   do_load_for_compare (tmp_reg_src1, src1, load_mode);
   do_load_for_compare (tmp_reg_src2, src2, load_mode);
@@ -937,7 +937,7 @@
  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
  src1 = replace_equiv_address (src1, src1_reg);
}
-  set_mem_size (src1, cmp_bytes);
+  set_mem_size (src1, load_mode_size);
 
   if (!REG_P (XEXP (src2, 0)))
{
@@ -944,7 +944,7 @@
  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
  src2 = replace_equiv_address (src2, src2_reg);
}
-  set_mem_size (src2, cmp_bytes);
+  set_mem_size (src2, load_mode_size);
 
   do_load_for_compare (tmp_reg_src1, src1, load_mode);
   do_load_for_compare (tmp_reg_src2, src2, load_mode);
@@ -1096,7 +1096,7 @@
  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0)); 
  src1 = replace_equiv_address (src1, src1_reg);
}
-  set_mem_size (src1, cmp_bytes);
+  set_mem_size (src1, load_mode_size); 
 
   if (!REG_P (XEXP (src2, 0)))
{
@@ -1103,7 +1103,7 @@
  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
  src2 = replace_equiv_address (src2, src2_reg);
}
-  set_mem_size (src2, cmp_bytes);
+  set_mem_size (src2, load_mode_size);
 
   /* Construct call to strcmp/strncmp to compare the rest of the string.  */
   if (no_length)
Index: gcc/testsuite/gcc.dg/pr82190.c
===
--- gcc/testsuite/gcc.dg/pr82190.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/pr82190.c	(working copy)
@@ -0,0 +1,20 @@
+/* PR target/82190 */
+/* { dg-do run } */
+/* { dg-options "-O2 -fno-optimize-strlen -fweb" } */
+
+char src[64] __attribute__ ((aligned)) = "aaa";
+char dst[64] __attribute__ ((aligned));
+
+int
+main ()
+{
+  __builtin_memcpy (dst, src, 6);
+  if (__builtin_memcmp (dst, src, 6))
+__builtin_abort ();
+
+  __builtin_memcpy (dst, src, 7);
+  if (__builtin_memcmp (dst, src, 7))
+__builtin_abort ();
+
+  return 0;
+}


[PATCH, rs6000] Allow memmov/memset builtin expansion to use unaligned vsx on p8/p9

2017-12-13 Thread Aaron Sawdey
This patch allows the use of unaligned vsx loads/stores for builtin
expansion of memset and memcpy/memmove on p8/p9. Performance of unaligned vsx
instructions is good on these processors. 

OK for trunk if bootstrap/regtest on ppc64le passes?

2017-12-13  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_block_move): Allow the use of
unaligned VSX load/store on P8/P9.
(expand_block_clear): Allow the use of unaligned VSX load/store on 
P8/P9.


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c	(revision 255585)
+++ gcc/config/rs6000/rs6000-string.c	(working copy)
@@ -67,11 +67,14 @@
   if (bytes <= 0)
 return 1;
 
+  bool isP8 = (rs6000_cpu == PROCESSOR_POWER8);
+  bool isP9 = (rs6000_cpu == PROCESSOR_POWER9);
+
   /* Use the builtin memset after a point, to avoid huge code bloat.
  When optimize_size, avoid any significant code bloat; calling
  memset is about 4 instructions, so allow for one instruction to
  load zero and three to do clearing.  */
-  if (TARGET_ALTIVEC && align >= 128)
+  if (TARGET_ALTIVEC && (align >= 128 || isP8 || isP9))
 clear_step = 16;
   else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
 clear_step = 8;
@@ -88,7 +91,7 @@
   machine_mode mode = BLKmode;
   rtx dest;
 
-  if (bytes >= 16 && TARGET_ALTIVEC && align >= 128)
+  if (bytes >= 16 && TARGET_ALTIVEC && (align >= 128 || isP8 || isP9))
 	{
 	  clear_bytes = 16;
 	  mode = V4SImode;
@@ -1247,6 +1250,9 @@
   if (bytes > rs6000_block_move_inline_limit)
 return 0;
 
+  bool isP8 = (rs6000_cpu == PROCESSOR_POWER8);
+  bool isP9 = (rs6000_cpu == PROCESSOR_POWER9);
+
   for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
 {
   union {
@@ -1258,7 +1264,7 @@
 
   /* Altivec first, since it will be faster than a string move
 	 when it applies, and usually not significantly larger.  */
-  if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
+  if (TARGET_ALTIVEC && bytes >= 16 && (isP8 || isP9 || align >= 128))
 	{
 	  move_bytes = 16;
 	  mode = V4SImode;


Re: [PATCH] make canonicalize_condition keep its promise

2017-12-14 Thread Aaron Sawdey
On Thu, 2017-12-14 at 13:43 -0700, Jeff Law wrote:
> On 11/21/2017 10:45 AM, Aaron Sawdey wrote:
> > On Tue, 2017-11-21 at 10:06 -0700, Jeff Law wrote:
> > > On 11/20/2017 06:41 AM, Aaron Sawdey wrote:
> > > > On Sun, 2017-11-19 at 16:44 -0700, Jeff Law wrote:
> > > > > On 11/15/2017 08:40 AM, Aaron Sawdey wrote:
> > > > > > So, the story of this very small patch starts with me
> > > > > > adding
> > > > > > patterns
> > > > > > for ppc instructions bdz[tf] and bdnz[tf] such as this:
> > > > > > 
> > > > > >   [(set (pc)
> > > > > > (if_then_else
> > > > > >   (and
> > > > > >  (ne (match_operand:P 1 "register_operand"
> > > > > > "c,*b,*b,*b")
> > > > > >  (const_int 1))
> > > > > >  (match_operator 3 "branch_comparison_operator"
> > > > > >   [(match_operand 4 "cc_reg_operand"
> > > > > > "y,y,y,y")
> > > > > >    (const_int 0)]))
> > > > > >   (label_ref (match_operand 0))
> > > > > >   (pc)))
> > > > > >    (set (match_operand:P 2 "nonimmediate_operand"
> > > > > > "=1,*r,m,*d*wi*c*l")
> > > > > > (plus:P (match_dup 1)
> > > > > > (const_int -1)))
> > > > > >    (clobber (match_scratch:P 5 "=X,X,&r,r"))
> > > > > >    (clobber (match_scratch:CC 6 "=X,&y,&y,&y"))
> > > > > >    (clobber (match_scratch:CCEQ 7 "=X,&y,&y,&y"))]
> > > > > > 
> > > > > > However when this gets to the loop_doloop pass, we get an
> > > > > > assert
> > > > > > fail
> > > > > > in iv_number_of_iterations():
> > > > > > 
> > > > > >   gcc_assert (COMPARISON_P (condition));
> > > > > > 
> > > > > > This is happening because this branch insn tests two things
> > > > > > ANDed
> > > > > > together so the and is at the top of the expression, not a
> > > > > > comparison.
> > > > > 
> > > > > Is this something you've created for an existing
> > > > > loop?  Presumably an
> > > > > existing loop that previously was a simple loop?
> > > > 
> > > > The rtl to use this instruction is generated by new code I'm
> > > > working on
> > > > to do a builtin expansion of memcmp using a loop. I call
> > > > gen_bdnztf_di() to generate rtl for the insn. It would be nice
> > > > to
> > > > be
> > > > able to generate this instruction from doloop conversion but
> > > > that
> > > > is
> > > > beyond the scope of what I'm working on presently.
> > > 
> > > Understood.
> > > 
> > > So what I think (and I'm hoping you can confirm one way or the
> > > other)
> > > is
> > > that by generating this instruction you're taking a loop which
> > > previously was considered a simple loop by the IV code and
> > > turning it
> > > into something the IV bits no longer think is a simple loop.
> > > 
> > > I think that's problematical as when the loop is thought to be a
> > > simple
> > > loop, it has to have a small number of forms for its loop
> > > back/exit
> > > loop
> > > tests and whether or not a loop is a simple loop is cached in the
> > > loop
> > > structure.
> > > 
> > > I think we need to dig into that first.  If my suspicion is
> > > correct
> > > then
> > > this patch is really just papering over that deeper problem.  So
> > > I
> > > think
> > > you need to dig a bit deeper into why you're getting into the
> > > code in
> > > question (canonicalize_condition) and whether or not the call
> > > chain
> > > makes any sense given the changes you've made to the loop.
> > > 
> > 
> > Jeff,
> >   There is no existing loop structure. This starts with a memcmp()
> > call
> > and then goes down through the builtin expansion mechanism, which
> > is
> > ultimately expanding the pattern c

Re: [PATCH] make canonicalize_condition keep its promise

2018-01-02 Thread Aaron Sawdey
On Tue, 2017-12-19 at 16:56 -0700, Jeff Law wrote:
> So the path to get_condition is reasonable.  That's really all I
> needed
> to have verified one way or the other.
> 
> With that in mind your patch is fine.
> 
> I will note that I find it highly confusing that we attach a simple
> loop
> descriptor to a loop that is not a simple loop.  But clearly you
> didn't
> introduce that oddball behavior.

Jeff,
  Thanks for sticking with this and reviewing. I have re-checked that
regstrap still passes and committed as 256079.

  Aaron

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000][PR target/82190] fix mem size info in rtl generated by memcmp and strncmp/strcmp builtin expansion

2018-01-02 Thread Aaron Sawdey
On Tue, 2017-12-12 at 14:12 -0600, Segher Boessenkool wrote:
> That looks better :-)  Okay for trunk, thanks!

As we discussed on IRC before Christmas, I've simplified this to use
TARGET_EFFICIENT_UNALIGNED_VSX instead of checking explicitly for P8/P9
processors. Has the same effect but is cleaner/clearer. Committed in
256112.

  Aaron

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 256110)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -73,7 +73,7 @@
  When optimize_size, avoid any significant code bloat; calling
  memset is about 4 instructions, so allow for one instruction to
  load zero and three to do clearing.  */
-  if (TARGET_ALTIVEC && align >= 128)
+  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
 clear_step = 16;
   else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
 clear_step = 8;
@@ -90,7 +90,7 @@
   machine_mode mode = BLKmode;
   rtx dest;
-  if (bytes >= 16 && TARGET_ALTIVEC && align >= 128)
+  if (bytes >= 16 && TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
{
  clear_bytes = 16;
  mode = V4SImode;
@@ -1260,7 +1260,7 @@
 /* Altivec first, since it will be faster than a string move
 when it applies, and usually not significantly larger.  */
-  if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
+  if (TARGET_ALTIVEC && bytes >= 16 && (TARGET_EFFICIENT_UNALIGNED_VSX || align >= 128))
{
  move_bytes = 16;
  mode = V4SImode;


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH] rs6000: Cleanup bdz/bdnz insn/splitter, add new insn/splitter for bdzt/bdzf/bdnzt/bdnzf

2018-01-08 Thread Aaron Sawdey
On Fri, 2017-12-01 at 16:45 -0600, Segher Boessenkool wrote:
> Looks good otherwise.  I'll ok it when there is a user (or a
> testcase).
> It shouldn't go in before the canonicalize_condition patch, of
> course.

The canonicalize_condition patch is in, so I have checked in this
cleanup and addition to the patterns and splitters for the branch
decrement instructions as 256344.

2018-01-08  Aaron Sawdey  

* config/rs6000/rs6000.md (cceq_ior_compare): Remove * so I can use it
to generate rtl.
(cceq_ior_compare_complement): Give it a name so I can use it, and
change boolean_or_operator predicate to boolean_operator so it can
be used to generate a crand.
(eqne): New code iterator.
(bd/bd_neg): New code_attrs.
(<bd>_<mode>): New name for ctr_internal[12] now combined into
a single define_insn.
(<bd>tf_<mode>): A new insn pattern for the conditional form branch
decrement (bdnzt/bdnzf/bdzt/bdzf).
* config/rs6000/rs6000.c (rs6000_legitimate_combined_insn): Updated
with the new names of the branch decrement patterns, and added the
names of the branch decrement conditional patterns.


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md	(revision 256216)
+++ gcc/config/rs6000/rs6000.md	(working copy)
@@ -12797,7 +12797,7 @@
 ; which are generated by the branch logic.
 ; Prefer destructive operations where BT = BB (for crXX BT,BA,BB)
 
-(define_insn "*cceq_ior_compare"
+(define_insn "cceq_ior_compare"
   [(set (match_operand:CCEQ 0 "cc_reg_operand" "=y,?y")
 (compare:CCEQ (match_operator:SI 1 "boolean_operator"
 	[(match_operator:SI 2
@@ -12817,9 +12817,9 @@
 
 ; Why is the constant -1 here, but 1 in the previous pattern?
 ; Because ~1 has all but the low bit set.
-(define_insn ""
+(define_insn "cceq_ior_compare_complement"
   [(set (match_operand:CCEQ 0 "cc_reg_operand" "=y,?y")
-(compare:CCEQ (match_operator:SI 1 "boolean_or_operator"
+(compare:CCEQ (match_operator:SI 1 "boolean_operator"
 	[(not:SI (match_operator:SI 2
   "branch_positive_comparison_operator"
   [(match_operand 3
@@ -13036,34 +13036,13 @@
 ;; rs6000_legitimate_combined_insn prevents combine creating any of
 ;; the ctr insns.
 
-(define_insn "ctr_internal1"
-  [(set (pc)
-	(if_then_else (ne (match_operand:P 1 "register_operand" "c,*b,*b,*b")
-			  (const_int 1))
-		  (label_ref (match_operand 0))
-		  (pc)))
-   (set (match_operand:P 2 "nonimmediate_operand" "=1,*r,m,*d*wi*c*l")
-	(plus:P (match_dup 1)
-		(const_int -1)))
-   (clobber (match_scratch:CC 3 "=X,&x,&x,&x"))
-   (clobber (match_scratch:P 4 "=X,X,&r,r"))]
-  ""
-{
-  if (which_alternative != 0)
-return "#";
-  else if (get_attr_length (insn) == 4)
-return "bdnz %l0";
-  else
-return "bdz $+8\;b %l0";
-}
-  [(set_attr "type" "branch")
-   (set_attr "length" "*,16,20,20")])
+(define_code_iterator eqne [eq ne])
+(define_code_attr bd [(eq "bdz") (ne "bdnz")])
+(define_code_attr bd_neg [(eq "bdnz") (ne "bdz")])
 
-;; Similar but use EQ
-
-(define_insn "ctr_internal2"
+(define_insn "_"
   [(set (pc)
-	(if_then_else (eq (match_operand:P 1 "register_operand" "c,*b,*b,*b")
+	(if_then_else (eqne (match_operand:P 1 "register_operand" "c,*b,*b,*b")
 			  (const_int 1))
 		  (label_ref (match_operand 0))
 		  (pc)))
@@ -13077,15 +13056,14 @@
   if (which_alternative != 0)
 return "#";
   else if (get_attr_length (insn) == 4)
-return "bdz %l0";
+return " %l0";
   else
-return "bdnz $+8\;b %l0";
+return " $+8\;b %l0";
 }
   [(set_attr "type" "branch")
(set_attr "length" "*,16,20,20")])
 
-;; Now the splitters if we could not allocate the CTR register
-
+;; Now the splitter if we could not allocate the CTR register
 (define_split
   [(set (pc)
 	(if_then_else (match_operator 2 "comparison_operator"
@@ -13093,19 +13071,13 @@
    (const_int 1)])
 		  (match_operand 5)
 		  (match_operand 6)))
-   (set (match_operand:P 0 "int_reg_operand")
+   (set (match_operand:P 0 "nonimmediate_operand")
 	(plus:P (match_dup 1)
 		(const_int -1)))
(clobber (match_scratch:CC 3))
(clobber (match_scratch:P 4))]
   "reload_completed"
-  [(

Re: [PATCH, rs6000] generate loop code for memcmp inline expansion

2018-01-08 Thread Aaron Sawdey
On Tue, 2017-12-12 at 10:13 -0600, Segher Boessenkool wrote:
> Please fix those trivialities, and it's okay for trunk (after the
> rtlanal patch is approved too).  Thanks!

Here's the final version of this, which is committed as 256351.


2018-01-08  Aaron Sawdey  

* config/rs6000/rs6000-string.c (do_load_for_compare_from_addr): New
function.
(do_ifelse): New function.
(do_isel): New function.
(do_sub3): New function.
(do_add3): New function.
(do_load_mask_compare): New function.
(do_overlap_load_compare): New function.
(expand_compare_loop): New function.
(expand_block_compare): Call expand_compare_loop() when appropriate.
* config/rs6000/rs6000.opt (-mblock-compare-inline-limit): Change
option description.
(-mblock-compare-inline-loop-limit): New option.


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c	(revision 256350)
+++ gcc/config/rs6000/rs6000-string.c	(working copy)
@@ -303,6 +303,959 @@
   return MIN (base_align, offset & -offset);
 }
 
+/* Prepare address and then do a load.
+
+   MODE is the mode to use for the load.
+   DEST is the destination register for the data.
+   ADDR is the address to be loaded.
+   ORIG_ADDR is the original address expression.  */
+static void
+do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
+			   rtx orig_addr)
+{
+  rtx mem = gen_rtx_MEM (mode, addr);
+  MEM_COPY_ATTRIBUTES (mem, orig_addr);
+  set_mem_size (mem, GET_MODE_SIZE (mode));
+  do_load_for_compare (dest, mem, mode);
+  return;
+}
+
+/* Do a branch for an if/else decision.
+
+   CMPMODE is the mode to use for the comparison.
+   COMPARISON is the rtx code for the compare needed.
+   A is the first thing to be compared.
+   B is the second thing to be compared.
+   CR is the condition code reg input, or NULL_RTX.
+   TRUE_LABEL is the label to branch to if the condition is true.
+
+   The return value is the CR used for the comparison.
+   If CR is null_rtx, then a new register of CMPMODE is generated.
+   If A and B are both null_rtx, then CR must not be null, and the
+   compare is not generated so you can use this with a dot form insn.  */
+
+static void
+do_ifelse (machine_mode cmpmode, rtx_code comparison,
+	   rtx a, rtx b, rtx cr, rtx true_label)
+{
+  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
+	  || (a != NULL_RTX && b != NULL_RTX));
+
+  if (cr != NULL_RTX)
+gcc_assert (GET_MODE (cr) == cmpmode);
+  else
+cr = gen_reg_rtx (cmpmode);
+
+  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);
+
+  if (a != NULL_RTX)
+emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));
+
+  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);
+
+  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
+  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+  JUMP_LABEL (j) = true_label;
+  LABEL_NUSES (true_label) += 1;
+}
+
+/* Emit an isel of the proper mode for DEST.
+
+   DEST is the isel destination register.
+   SRC1 is the isel source if CR is true.
+   SRC2 is the isel source if CR is false.
+   CR is the condition for the isel.  */
+static void
+do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
+  else
+emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
+}
+
+/* Emit a subtract of the proper mode for DEST.
+
+   DEST is the destination register for the subtract.
+   SRC1 is the first subtract input.
+   SRC2 is the second subtract input.
+
+   Computes DEST = SRC1-SRC2.  */
+static void
+do_sub3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_subdi3 (dest, src1, src2));
+  else
+emit_insn (gen_subsi3 (dest, src1, src2));
+}
+
+/* Emit an add of the proper mode for DEST.
+
+   DEST is the destination register for the add.
+   SRC1 is the first add input.
+   SRC2 is the second add input.
+
+   Computes DEST = SRC1+SRC2.  */
+static void
+do_add3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_adddi3 (dest, src1, src2));
+  else
+emit_insn (gen_addsi3 (dest, src1, src2));
+}
+
+/* Generate rtl for a load, shift, and compare of less than a full word.
+
+   LOAD_MODE is the machine mode for the loads.
+   DIFF is the reg for the difference.
+   CMP_REM is the reg containing the remaining bytes to compare.
+   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
+   SRC1_ADDR is the first source address.
+   SRC2_ADDR is the second source address.
+   ORIG_SRC1 is the original first sour

[PATCH, rs6000] cleanup/refactor in rs6000-string.c

2018-06-14 Thread Aaron Sawdey
This patch cleans up and refactors some stuff in rs6000-string.c
before I start working on adding vec/vsx support to str[n]cmp inline
expansion. Also removes the * from vsx_mov<mode>_64bit in vsx.md
because I'll be using that pattern to generate lxvd2x.

Bootstrap/regtest passes on ppc64le power8 -- ok for trunk?

Thanks!
   Aaron

2018-06-14  Aaron Sawdey  

* config/rs6000/rs6000-string.c (select_block_compare_mode): Check
TARGET_EFFICIENT_OVERLAPPING_UNALIGNED here instead of in caller.
(do_and3, do_and3_mask, do_compb3, do_rotl3): New functions.
(expand_block_compare): Change select_block_compare_mode call.
(expand_strncmp_align_check): Use new functions, fix comment.
(emit_final_str_compare_gpr): New function.
(expand_strn_compare): Refactor and clean up code.
* config/rs6000/vsx.md (vsx_mov<mode>_64bit): Remove *.


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: rs6000-string.c
===
--- rs6000-string.c	(revision 261573)
+++ rs6000-string.c	(working copy)
@@ -264,6 +264,7 @@
   else if (bytes == GET_MODE_SIZE (QImode))
 return QImode;
   else if (bytes < GET_MODE_SIZE (SImode)
+	   && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
 	   && offset >= GET_MODE_SIZE (SImode) - bytes)
 /* This matches the case were we have SImode and 3 bytes
and offset >= 1 and permits us to move back one and overlap
@@ -271,6 +272,7 @@
unwanted bytes off of the input.  */
 return SImode;
   else if (word_mode_ok && bytes < UNITS_PER_WORD
+	   && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
 	   && offset >= UNITS_PER_WORD-bytes)
 /* Similarly, if we can use DImode it will get matched here and
can do an overlapping read that ends at the end of the block.  */
@@ -406,6 +408,70 @@
 emit_insn (gen_addsi3 (dest, src1, src2));
 }
 
+/* Emit an and of the proper mode for DEST.
+
+   DEST is the destination register for the and.
+   SRC1 is the first and input.
+   SRC2 is the second and input.
+
+   Computes DEST = SRC1&SRC2.  */
+static void
+do_and3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_anddi3 (dest, src1, src2));
+  else
+emit_insn (gen_andsi3 (dest, src1, src2));
+}
+
+/* Emit an and-mask of the proper mode for DEST.
+
+   DEST is the destination register for the and.
+   SRC1 is the first and input.
+   SRC2 is the mask input.
+
+   Computes DEST = SRC1&SRC2.  */
+static void
+do_and3_mask (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_anddi3_mask (dest, src1, src2));
+  else
+emit_insn (gen_andsi3_mask (dest, src1, src2));
+}
+
+/* Emit an cmpb of the proper mode for DEST.
+
+   DEST is the destination register for the cmpb.
+   SRC1 is the first input.
+   SRC2 is the second input.
+
+   Computes cmpb of SRC1, SRC2.  */
+static void
+do_cmpb3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_cmpbdi3 (dest, src1, src2));
+  else
+emit_insn (gen_cmpbsi3 (dest, src1, src2));
+}
+
+/* Emit a rotl of the proper mode for DEST.
+
+   DEST is the destination register for the and.
+   SRC1 is the first and input.
+   SRC2 is the second and input.
+
+   Computes DEST = SRC1 rotated left by SRC2.  */
+static void
+do_rotl3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_rotldi3 (dest, src1, src2));
+  else
+emit_insn (gen_rotlsi3 (dest, src1, src2));
+}
+
 /* Generate rtl for a load, shift, and compare of less than a full word.
 
LOAD_MODE is the machine mode for the loads.
@@ -1393,11 +1459,8 @@
   while (bytes > 0)
 {
   unsigned int align = compute_current_alignment (base_align, offset);
-  if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
-	load_mode = select_block_compare_mode (offset, bytes, align,
-	   word_mode_ok);
-  else
-	load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
+  load_mode = select_block_compare_mode (offset, bytes,
+	 align, word_mode_ok);
   load_mode_size = GET_MODE_SIZE (load_mode);
   if (bytes >= load_mode_size)
 	cmp_bytes = load_mode_size;
@@ -1625,22 +1688,19 @@
   return true;
 }
 
-/* Generate alignment check and branch code to set up for
+/* Generate page crossing check and branch code to set up for
strncmp when we don't have DI alignment.
STRNCMP_LABEL is the label to branch if there is a page crossing.
-   SRC is the string pointer to be examined.
+   SRC_ADDR is the string address to be examined.
BYTES is the max number of bytes to compare.  */
 static void
-expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes)
+expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes)
 {

[PATCH, rs6000] PR target/86222 fix truncation issue with constants when compiling -m32

2018-06-21 Thread Aaron Sawdey
expand_strn_compare was not using gen_int_mode or trunc_int_for_mode to
properly truncate constants to Pmode when creating the generated rtx.
This led to an improper constant and the ICE in PR 86222.
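
The distinction, in a sketch (illustrative, not the patch text):

  /* On -m32, Pmode is SImode; a HOST_WIDE_INT constant must be made
     canonical for that mode before being used in generated rtl.  */
  rtx bad  = GEN_INT (bytes - compare_length);              /* mode-unaware */
  rtx good = gen_int_mode (bytes - compare_length, Pmode);  /* truncated or
                                                               extended for
                                                               Pmode */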

Testing on ppc64 with -m32, -m32 -mpowerpc64 and -m64. If regstrap
passes, ok for trunk and backport to 8?

Thanks, 
   Aaron


2018-06-19  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_strn_compare): Handle -m32
correctly.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c	(revision 261850)
+++ gcc/config/rs6000/rs6000-string.c	(working copy)
@@ -1925,20 +1925,15 @@
 	  /* -m32 -mpowerpc64 results in word_mode being DImode even
 	 though otherwise it is 32-bit. The length arg to strncmp
 	 is a size_t which will be the same size as pointers.  */
-	  rtx len_rtx;
-	  if (TARGET_64BIT)
-	len_rtx = gen_reg_rtx (DImode);
-	  else
-	len_rtx = gen_reg_rtx (SImode);
+	  rtx len_rtx = gen_reg_rtx (Pmode);
+	  emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));
 
-	  emit_move_insn (len_rtx, bytes_rtx);
-
 	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
 	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
    target, LCT_NORMAL, GET_MODE (target),
    force_reg (Pmode, src1_addr), Pmode,
    force_reg (Pmode, src2_addr), Pmode,
-   len_rtx, GET_MODE (len_rtx));
+   len_rtx, Pmode);
 	}
 
   rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
@@ -2126,18 +2121,12 @@
 	}
   else
 	{
-	  rtx len_rtx;
-	  if (TARGET_64BIT)
-	len_rtx = gen_reg_rtx (DImode);
-	  else
-	len_rtx = gen_reg_rtx (SImode);
-
-	  emit_move_insn (len_rtx, GEN_INT (bytes - compare_length));
+	  rtx len_rtx = gen_reg_rtx (Pmode);
+	  emit_move_insn (len_rtx, gen_int_mode (bytes-compare_length, Pmode));
 	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
 	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
    target, LCT_NORMAL, GET_MODE (target),
-   src1, Pmode, src2, Pmode,
-   len_rtx, GET_MODE (len_rtx));
+   src1, Pmode, src2, Pmode, len_rtx, Pmode);
 	}
 
   rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);


[PATCH, rs6000] don't use unaligned vsx for memset of less than 32 bytes

2018-06-25 Thread Aaron Sawdey
In gcc 8 I added support for unaligned vsx in the builtin expansion of
memset(x,0,y). Turns out that for memset of less than 32 bytes, this
doesn't really help much, and it also runs into an egregious load-hit-
store case in CPU2006 components gcc and hmmer.

This patch reverts to the previous (gcc 7) behavior for memset of 16-31
bytes, which is to use vsx stores only if the target is 16-byte
aligned. For 32 bytes or more, unaligned vsx stores will still be used.
Performance testing of the memset expansion shows that not much is
given up by using scalar stores for 16-31 bytes, and CPU2006 runs show
the performance regression is fixed.
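
Restated as a hypothetical helper (the flag argument stands in for
TARGET_EFFICIENT_UNALIGNED_VSX, and the TARGET_ALTIVEC check is omitted
for brevity):

  /* Sketch: 16-byte VSX stores only when 16-byte aligned, or when the
     block is at least 32 bytes on a target with fast unaligned VSX.  */
  static bool
  use_vsx_for_clear (unsigned HOST_WIDE_INT bytes, unsigned int align_bits,
                     bool efficient_unaligned_vsx)
  {
    return (bytes >= 16 && align_bits >= 128)
           || (bytes >= 32 && efficient_unaligned_vsx);
  }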

Regstrap passes on powerpc64le, ok for trunk and backport to 8?

Thanks,
   Aaron

2018-06-25  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_block_clear): Don't use
unaligned vsx for 16B memset.


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c	(revision 261808)
+++ gcc/config/rs6000/rs6000-string.c	(working copy)
@@ -90,7 +90,9 @@
   machine_mode mode = BLKmode;
   rtx dest;
 
-  if (bytes >= 16 && TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
+  if (TARGET_ALTIVEC
+	  && ((bytes >= 16 && align >= 128)
+	  || (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX)))
 	{
 	  clear_bytes = 16;
 	  mode = V4SImode;


Re: [PATCH, rs6000] generate loop code for memcmp inline expansion

2018-01-10 Thread Aaron Sawdey
I'll check the runtime of that --- I added some test cases to memcmp-1.c
and probably it is now taking too long. I will revise it so it's no
longer than it was before.

  Aaron

On Wed, 2018-01-10 at 14:25 +, Szabolcs Nagy wrote:
> On 08/01/18 19:37, Aaron Sawdey wrote:
> > On Tue, 2017-12-12 at 10:13 -0600, Segher Boessenkool wrote:
> > > > Please fix those trivialities, and it's okay for trunk (after
> > > > the
> > > > rtlanal patch is approved too).  Thanks!
> > 
> > Here's the final version of this, which is committed as 256351.
> > 
> > 
> > 2018-01-08  Aaron Sawdey  
> > 
> > * config/rs6000/rs6000-string.c
> > (do_load_for_compare_from_addr): New
> > function.
> > (do_ifelse): New function.
> > (do_isel): New function.
> > (do_sub3): New function.
> > (do_add3): New function.
> > (do_load_mask_compare): New function.
> > (do_overlap_load_compare): New function.
> > (expand_compare_loop): New function.
> > (expand_block_compare): Call expand_compare_loop() when
> > appropriate.
> > * config/rs6000/rs6000.opt (-mblock-compare-inline-limit):
> > Change
> > option description.
> > (-mblock-compare-inline-loop-limit): New option.
> > 
> 
> ...
> > Index: gcc/testsuite/gcc.dg/memcmp-1.c
> > ===
> > --- gcc/testsuite/gcc.dg/memcmp-1.c (revision 256350)
> > +++ gcc/testsuite/gcc.dg/memcmp-1.c (working copy)
> > @@ -14,11 +14,80 @@
> >  #ifndef NRAND
> >  #define NRAND 1
> >  #endif
> > -#define MAX_SZ 200
> > +#define MAX_SZ 600
> >  
> 
> i see timeouts when running aarch64-none-elf tests in some
> emulator environments:
> 
> WARNING: program timed out.
> FAIL: gcc.dg/memcmp-1.c execution test
> 
> if there is a way to reduce the iteration count or the
> tested variants that would help slow targets.
> 
> > +#define DEF_RS(ALIGN)                                                  \
> > +static void test_memcmp_runtime_size_ ## ALIGN (const char *str1,      \
> > +                                                const char *str2,      \
> > +                                                size_t sz, int expect) \
> > +{                                                                      \
> > +  char three[8192] __attribute__ ((aligned (4096)));                   \
> > +  char four[8192] __attribute__ ((aligned (4096)));                    \
> > +  char *a, *b;                                                         \
> > +  int i,j,a1,a2,r;                                                     \
> > +  for (j = 0; j < 2; j++)                                              \
> > +    {                                                                  \
> > +      for (i = 0; i < 2; i++)                                          \
> > +        {                                                              \
> > +          a = three+i*ALIGN+j*(4096-2*i*ALIGN);                        \
> > +          b = four+i*ALIGN+j*(4096-2*i*ALIGN);                         \
> > +          memcpy(a,str1,sz);                                           \
> > +          memcpy(b,str2,sz);                                           \
> > +          asm(" ");                                                    \
> > +          r = memcmp(a,b,sz);                                          \
> > +          asm(" ");                                                    \
> > +          if ( r < 0 && !(expect < 0) ) abort();                       \
> > +          if ( r > 0 && !(expect > 0) ) abort();                       \
> > +          if ( r == 0 && !(expect == 0) ) abort();                     \
> > +        }                                                              \
> > +    }                                                                  \
> > +}
> > +
> > +DEF_RS(1)
> > +DEF_RS(2)
> > +DEF_RS(4)
> > +DEF_RS(8)
> > +DEF_RS(16)
> > +
> > +static void test_memcmp_runtime_size (const char *str1, const char *str2,
> > +                                      size_t sz, int expect)
> > +{
> > +  char three[8192] __attribute__ ((aligned (4096)));
> > +  char four[819

[PATCH] reduce runtime of gcc.dg/memcmp-1.c test

2018-01-10 Thread Aaron Sawdey
This brings it back to not quite where it was, but it is a lot more
reasonable than what I put in with 256351.

2018-01-10  Aaron Sawdey  

* gcc.dg/memcmp-1.c: Reduce runtime to something reasonable.

OK for trunk?

Thanks, 
   Aaron


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: /home/sawdey/src/gcc/trunk/trunk/gcc/testsuite/gcc.dg/memcmp-1.c
===
--- /home/sawdey/src/gcc/trunk/trunk/gcc/testsuite/gcc.dg/memcmp-1.c	(revision 256437)
+++ /home/sawdey/src/gcc/trunk/trunk/gcc/testsuite/gcc.dg/memcmp-1.c	(working copy)
@@ -12,8 +12,20 @@
 int lib_strncmp(const char *a, const char *b, size_t n) asm("strncmp");
 
 #ifndef NRAND
+#ifdef TEST_ALL
 #define NRAND 1
+#else
+#define NRAND 500
 #endif
+#endif
+#ifndef TZONE
+#ifdef TEST_ALL
+#define TZONE 16
+#else
+#define TZONE 8
+#endif
+#endif
+
 #define MAX_SZ 600
 
 #define DEF_RS(ALIGN)  \
@@ -33,9 +45,7 @@
 	  b = four+i*ALIGN+j*(4096-2*i*ALIGN);   \
 	  memcpy(a,str1,sz);		   \
 	  memcpy(b,str2,sz);		   \
-	  asm(" ");			   \
 	  r = memcmp(a,b,sz);		   \
-	  asm(" ");			   \
 	  if ( r < 0 && !(expect < 0) ) abort();			   \
 	  if ( r > 0 && !(expect > 0) )	abort();			   \
 	  if ( r == 0 && !(expect == 0) ) abort();			   \
@@ -67,15 +77,13 @@
 	{
 	  for (a1=0; a1 < 2*sizeof(void *); a1++)
 	{
+	  a = three+i*a1+j*(4096-2*i*a1);
+	  memcpy(a,str1,sz);
 	  for (a2=0; a2 < 2*sizeof(void *); a2++)
 		{
-		  a = three+i*a1+j*(4096-2*i*a1);
 		  b = four+i*a2+j*(4096-2*i*a2);
-		  memcpy(a,str1,sz);
 		  memcpy(b,str2,sz);
-		  asm(" ");
 		  r = memcmp(a,b,sz);
-		  asm(" ");
 		  if ( r < 0 && !(expect < 0) ) abort();
 		  if ( r > 0 && !(expect > 0) )	abort();
 		  if ( r == 0 && !(expect == 0) ) abort();
@@ -89,7 +97,7 @@
 void (test_strncmp)(const char *, const char *, int),
   size_t sz, int align)
 {
-  char buf1[MAX_SZ*2+10],buf2[MAX_SZ*2+10];
+  char buf1[MAX_SZ*2+TZONE],buf2[MAX_SZ*2+TZONE];
   size_t test_sz = (sz>MAX_SZ)?MAX_SZ:sz;
-  for(diff_pos = ((test_sz>10)?(test_sz-10):0); diff_pos < test_sz+10; diff_pos++)
-for(zero_pos = ((test_sz>10)?(test_sz-10):0); zero_pos < test_sz+10; zero_pos++)
+  for(diff_pos = ((test_sz>TZONE)?(test_sz-TZONE):0); diff_pos < test_sz+TZONE; diff_pos++)
+for(zero_pos = ((test_sz>TZONE)?(test_sz-TZONE):0); zero_pos < test_sz+TZONE; zero_pos++)
   {
 	memset(buf1, 'A', 2*test_sz);
 	memset(buf2, 'A', 2*test_sz);
@@ -125,7 +133,6 @@
 	(*test_memcmp)(buf2,buf2,0);
 	test_memcmp_runtime_size (buf1, buf2, sz, e);
 	test_memcmp_runtime_size (buf2, buf1, sz, -e);
-	test_memcmp_runtime_size (buf2, buf2, sz, 0);
 	e = lib_strncmp(buf1,buf2,sz);
 	(*test_strncmp)(buf1,buf2,e);
 	(*test_strncmp)(buf2,buf1,-e);
@@ -470,10 +477,8 @@
 DEF_TEST(9,1)
 DEF_TEST(16,1)
 DEF_TEST(32,1)
-DEF_TEST(100,1)
-DEF_TEST(100,8)
-DEF_TEST(180,1)
-DEF_TEST(180,8)
+DEF_TEST(33,8)
+DEF_TEST(49,1)
 #endif
 
 int
@@ -753,9 +758,7 @@
 RUN_TEST(9,1)
 RUN_TEST(16,1)
 RUN_TEST(32,1)
-RUN_TEST(100,1)
-RUN_TEST(100,8)
-RUN_TEST(180,1)
-RUN_TEST(180,8)
+RUN_TEST(33,8)
+RUN_TEST(49,1)
 #endif
 }


[PATCH][PR debug/83758] look more carefully for internal_arg_pointer in vt_add_function_parameter()

2018-01-29 Thread Aaron Sawdey
This bug appears to revolve around whether there is a canonical rtx for
internal_arg_pointer in var-tracking. In vt_add_function_parameter() we
currently have:

static void
vt_add_function_parameter (tree parm)
{
  rtx decl_rtl = DECL_RTL_IF_SET (parm);
  rtx incoming = DECL_INCOMING_RTL (parm);
  tree decl;
  machine_mode mode;
  poly_int64 offset;
  dataflow_set *out;
  decl_or_value dv;

  if (TREE_CODE (parm) != PARM_DECL)
return;

  if (!decl_rtl || !incoming)
return;

  if (GET_MODE (decl_rtl) == BLKmode || GET_MODE (incoming) == BLKmode)
return;

  /* If there is a DRAP register or a pseudo in internal_arg_pointer,
 rewrite the incoming location of parameters passed on the stack
 into MEMs based on the argument pointer, so that incoming doesn't
 depend on a pseudo.  */
  if (MEM_P (incoming)
  && (XEXP (incoming, 0) == crtl->args.internal_arg_pointer
  || (GET_CODE (XEXP (incoming, 0)) == PLUS
  && XEXP (XEXP (incoming, 0), 0)
 == crtl->args.internal_arg_pointer
      && CONST_INT_P (XEXP (XEXP (incoming, 0), 1)))))
{
  HOST_WIDE_INT off = -FIRST_PARM_OFFSET (current_function_decl);
  if (GET_CODE (XEXP (incoming, 0)) == PLUS)
off += INTVAL (XEXP (XEXP (incoming, 0), 1));
  incoming
= replace_equiv_address_nv (incoming,
plus_constant (Pmode,
   arg_pointer_rtx, off));
}


This is looking for crtl->args.internal_arg_pointer within rtx
incoming. The problem I am seeing is that an equivalent rtx is there,
just not as a pointer to the identical rtx object, so it is not found by
the == comparison in the current code. Hence my patch below switches
from == to rtx_equal_p(). If the expression is not rewritten, then the
pseudo created for the stack pointer is not preserved and later we run
into the assert near the beginning of vt_expand_var_loc_chain().
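
The crux, in miniature (a sketch, not part of the patch):

  /* With -fsplit-stack on rs6000, internal_arg_pointer is a PLUS, which
     copy_rtx duplicates; the copy fails pointer equality but still
     satisfies structural equality.  */
  rtx orig = crtl->args.internal_arg_pointer;  /* (plus (reg ...) (const_int ...)) */
  rtx copy = copy_rtx (orig);
  gcc_assert (copy != orig);                   /* distinct objects */
  gcc_assert (rtx_equal_p (copy, orig));       /* structurally equal */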

Bootstrap now passes for languages=c,c++,go on ppc64le. If
bootstrap/regtest is ok on ppc64le and x86_64, ok for trunk?


2018-01-29  Aaron Sawdey  

* var-tracking.c (vt_add_function_parameter): Fix comparison of rtx.


Index: gcc/var-tracking.c
===
--- gcc/var-tracking.c  (revision 257159)
+++ gcc/var-tracking.c  (working copy)
@@ -9668,10 +9668,10 @@
  into MEMs based on the argument pointer, so that incoming doesn't
  depend on a pseudo.  */
   if (MEM_P (incoming)
-  && (XEXP (incoming, 0) == crtl->args.internal_arg_pointer
+      && (rtx_equal_p (XEXP (incoming, 0), crtl->args.internal_arg_pointer)
  || (GET_CODE (XEXP (incoming, 0)) == PLUS
- && XEXP (XEXP (incoming, 0), 0)
-== crtl->args.internal_arg_pointer
+ && rtx_equal_p (XEXP (XEXP (incoming, 0), 0),
+ crtl->args.internal_arg_pointer)
      && CONST_INT_P (XEXP (XEXP (incoming, 0), 1)))))
 {
   HOST_WIDE_INT off = -FIRST_PARM_OFFSET (current_function_decl);


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][PR debug/83758] look more carefully for internal_arg_pointer in vt_add_function_parameter()

2018-01-30 Thread Aaron Sawdey
On Tue, 2018-01-30 at 11:13 +0100, Jakub Jelinek wrote:
> On Tue, Jan 30, 2018 at 03:55:58AM -0600, Segher Boessenkool wrote:
> > > But in that case, what does the copying?
> > 
> > I don't know.  Aaron will look at it, but timezones etc. :-)

Indeed I did see unshare_all_rtl() copying internal_arg_pointer. But
also several places in function.c:

assign_parm_adjust_entry_rtl:
  move_block_from_reg (REGNO (entry_parm),
   validize_mem (copy_rtx (stack_parm)),
   data->partial / UNITS_PER_WORD);

assign_parm_setup_reg:
  /* Copy the value into the register, thus bridging between
 assign_parm_find_data_types and expand_expr_real_1.  */

  equiv_stack_parm = data->stack_parm;
  validated_mem = validize_mem (copy_rtx (data->entry_parm));

assign_parm_setup_block:
  mem = validize_mem (copy_rtx (stack_parm));

in expr.c:

expand_expr_real_1:
  /* DECL_MODE might change when TYPE_MODE depends on attribute target
 settings for VECTOR_TYPE_P that might switch for the function.  */
  if (currently_expanding_to_rtl
  && code == VAR_DECL && MEM_P (decl_rtl)
  && VECTOR_TYPE_P (type) && exp && DECL_MODE (exp) != mode)
decl_rtl = change_address (decl_rtl, TYPE_MODE (type), 0);
  else
decl_rtl = copy_rtx (decl_rtl);


> > 
> > > That's what seems strange.  I can see why we'd have two nested
> > > pluses with the inner plus being pointer-equal to
> > > internal_arg_ptr.
> > > And I can see why we'd have a single canonical plus (which IMO
> > > would
> > > be better, but I agree it's not stage 4 material).  It's having
> > > the two
> > > nested pluses without maintaining pointer equality that seems
> > > strange.
> > 
> > The inner plus is *not* pointer-equal, that is the
> > problem.  Something
> > did copy_rtx (or such) on it, many things do.  We can tell you what
> > exactly later today.
> 
> Most likely unshare_all_rtl, which does:
>   for (tree decl = DECL_ARGUMENTS (cfun->decl); decl; decl =
> DECL_CHAIN (decl))
> {
>   if (DECL_RTL_SET_P (decl))
> SET_DECL_RTL (decl, copy_rtx_if_shared (DECL_RTL (decl)));
>   DECL_INCOMING_RTL (decl) = copy_rtx_if_shared
> (DECL_INCOMING_RTL (decl));
> }
> 
> Anyway, my preference would be to change that gen_rtx_PLUS into
>   stack_parm = crtl->args.internal_arg_pointer;
>   if (!CONST_INT_P (offset_rtx))
> stack_parm = gen_rtx_PLUS (Pmode, stack_parm, offset_rtx);
>   else if (offset_rtx != const0_rtx)
> stack_parm = plus_constant (Pmode, stack_parm, INTVAL
> (offset_rtx));
>   stack_parm = gen_rtx_MEM (data->promoted_mode, stack_parm);
> and deal specially with GET_CODE (crtl->args.internal_arg_pointer)
> in var-tracking.c.
> rs6000/powerpcspe with -fsplit-stack are the only cases where
> crtl->args.internal_arg_pointer is not a REG, so just running libgo
> testsuite on powerpc{,64,64le} should cover it all.

I'll give this a try today when I get to the office.

Thanks,
Aaron


> 
>   Jakub
> 
-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH, rs6000][PR debug/83758] v2 rs6000_internal_arg_pointer should only return a register

2018-01-30 Thread Aaron Sawdey
On Tue, 2018-01-30 at 14:04 +0100, Jakub Jelinek wrote:
> On IRC when discussing it with Segher this morning we've come to the
> conclusion that it would be best if rs6000 just followed what all
> other
> ports to, i.e. return a pseudo from the target hook, like:
> 
> --- gcc/config/rs6000/rs6000.c	2018-01-30 12:30:27.416360076 +0100
> +++ gcc/config/rs6000/rs6000.c	2018-01-30 13:59:07.360639803 +0100
> @@ -29602,8 +29602,9 @@ rs6000_internal_arg_pointer (void)
> 	  emit_insn_after (pat, get_insns ());
> 	  pop_topmost_sequence ();
> 	}
> -  return plus_constant (Pmode, cfun->machine->split_stack_arg_pointer,
> -			FIRST_PARM_OFFSET (current_function_decl));
> +  rtx ret = plus_constant (Pmode, cfun->machine->split_stack_arg_pointer,
> +			   FIRST_PARM_OFFSET (current_function_decl));
> +  return copy_to_reg (ret);
>  }
>    return virtual_incoming_args_rtx;
>  }
> 
> copy_to_reg is what e.g. the generic or pa target hook conditionally
> uses.

This fix looks good, passes bootstrap, go tests run. 

Segher is currently regtesting on ppc64le power9. OK for trunk if tests
pass?

2018-01-30  Aaron Sawdey  

    * config/rs6000/rs6000.c (rs6000_internal_arg_pointer): Only return
a reg rtx.
-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c	(revision 257188)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -29602,8 +29602,9 @@
 	  emit_insn_after (pat, get_insns ());
 	  pop_topmost_sequence ();
 	}
-  return plus_constant (Pmode, cfun->machine->split_stack_arg_pointer,
-			FIRST_PARM_OFFSET (current_function_decl));
+  rtx ret = plus_constant (Pmode, cfun->machine->split_stack_arg_pointer,
+			   FIRST_PARM_OFFSET (current_function_decl));
+  return copy_to_reg (ret);
 }
   return virtual_incoming_args_rtx;
 }


Re: [PATCH][AArch64] PR84114: Avoid reassociating FMA

2018-02-27 Thread Aaron Sawdey
On Tue, 2018-02-27 at 14:21 +, Wilco Dijkstra wrote:
> Richard Biener 
> 
> > It happens that on some targets doing two FMAs in parallel and one
> > non-FMA operation merging them is faster than chaining three
> > FMAs...
> 
> Like I mentioned in the PR, long chains should be broken, but for
> that we need a new parameter to state how long a chain may be before
> it is split. The issue today is that it splits even very short
> chains, removing beneficial FMAs.
> 
> > But yes, somewhere I suggested that FMA detection should/could be
> > integrated with reassociation.

I'd also like to see some work here. 

Doing two FMA in parallel and then a non-FMA merge is faster on ppc,
but it would be nice if the target had some more control of exactly how
this happens.

Also doing parallel reassociation increases register pressure so it
would be nice to be able to avoid causing issues as a result of that.
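
As a hedged illustration (plain C, not from any of these patches) of the
two shapes being discussed, for a sum of four products the compiler can
either chain the FMAs or split them into two independent halves merged
by one non-FMA add:

  #include <math.h>

  /* Chained: three dependent fma calls, serial latency.  */
  double dot4_chained (double a, double b, double c, double d,
                       double e, double f, double g, double h)
  {
    return fma (a, b, fma (c, d, fma (e, f, g * h)));
  }

  /* Parallel: two independent FMAs plus a non-FMA merge add -- the
     shape described above as faster on ppc, at the cost of an extra
     live register.  */
  double dot4_parallel (double a, double b, double c, double d,
                        double e, double f, double g, double h)
  {
    return fma (a, b, c * d) + fma (e, f, g * h);
  }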

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



PR target/84743 adjust reassociation widths for power8/power9

2018-03-12 Thread Aaron Sawdey
Looking at CPU2017 results for different reassociation widths, things
have shifted since I last looked at this with CPU2006 in early gcc7
timeframe. Best thing to do seems to be to set reassociation width to 1
for all integer modes, which is what the attached patch does.

I also tried setting width to 1 for float modes PLUS_EXPR as this patch
did for aarch64 but this does not seem to be helpful for power8.
https://gcc.gnu.org/ml/gcc-patches/2018-02/msg01271.html


Results below are % performance improvement on power8 comparing trunk
with the attached patch vs trunk with --param tree-reassoc-width=1 to
disable parallel reassociation for everything (first column of results)
and trunk unmodified (second column of results). 

CPU2017 component      vs width=1   vs trunk
500.perlbench_r          -0.36%      -0.15%
502.gcc_r                 0.06%       0.04%
505.mcf_r                 0.32%       0.24%
520.omnetpp_r             0.57%      -0.95%
523.xalancbmk_r           1.45%       1.04%
525.x264_r               -0.05%       0.09%
531.deepsjeng_r           0.04%       0.09%
541.leela_r               0.10%       0.72%
548.exchange2_r           0.08%       0.73%
557.xz_r                  0.09%       2.12%
CPU2017 int geo mean      0.23%       0.40%
503.bwaves_r              0.00%       0.01%
507.cactuBSSN_r           0.05%      -0.02%
508.namd_r                0.00%       0.00%
510.parest_r             -0.01%       0.20%
511.povray_r              0.03%      -0.24%
519.lbm_r                -0.04%      -0.16%
521.wrf_r                -0.01%      -0.56%
526.blender_r            -0.82%      -0.47%
527.cam4_r               -0.18%       0.06%
538.imagick_r            -0.02%       0.01%
544.nab_r                 0.00%       0.23%
549.fotonik3d_r           0.24%       0.54%
554.roms_r               -0.05%       0.03%
CPU2017 fp geo mean      -0.06%      -0.03%

Bottom line is net improvement for CPU2017 int compared with either
current trunk, or disabling parallel reassociation. For CPU2017 fp,
very small overall degradation. 

Currently doing regstrap on ppc64le, ok for trunk if results look good?

Thanks!
   Aaron

2018-03-12  Aaron Sawdey  

PR target/84743
* config/rs6000/rs6000.c (rs6000_reassociation_width): Disable parallel
reassociation for int modes.


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c	(revision 258101)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -10006,7 +10006,7 @@
   if (VECTOR_MODE_P (mode))
 	return 4;
   if (INTEGRAL_MODE_P (mode)) 
-	return opc == MULT_EXPR ? 4 : 6;
+	return 1;
   if (FLOAT_MODE_P (mode))
 	return 4;
   break;
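
For readers unfamiliar with the parameter, a hedged illustration of what
the returned width changes (schematic only; variable names are made up):
for s = a + b + c + d in an integer mode,

  /* width 1: serial chain       width >= 2: parallel halves  */
  t1 = a + b;                    t1 = a + b;
  t2 = t1 + c;                   t2 = c + d;
  s  = t2 + d;                   s  = t1 + t2;

so returning 1 for integral modes keeps the serial form and avoids the
extra live register the parallel form needs.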


[PATCH, rs6000] PR target/83822 fix redundant conditions

2018-03-29 Thread Aaron Sawdey
I've fixed the redundant conditions in the expressions pointed out by
83822. Bootstrap/regtest passes on ppc64le, ok for trunk?

Aaron


2018-03-29  Aaron Sawdey  

PR target/83822
* config/rs6000/rs6000-string.c (expand_compare_loop): Fix redundant
condition.
* config/rs6000/rs6000-c.c (rs6000_cpu_cpp_builtins): Fix redundant
condition.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000-c.c
===
--- gcc/config/rs6000/rs6000-c.c	(revision 258900)
+++ gcc/config/rs6000/rs6000-c.c	(working copy)
@@ -642,8 +642,7 @@
 	  cpp_get_callbacks (pfile)->macro_to_expand = rs6000_macro_to_expand;
 	}
 }
-  if (!TARGET_HARD_FLOAT
-  || (TARGET_HARD_FLOAT && !TARGET_DOUBLE_FLOAT))
+  if (!TARGET_HARD_FLOAT || !TARGET_DOUBLE_FLOAT)
 builtin_define ("_SOFT_DOUBLE");
   /* Used by lwarx/stwcx. errata work-around.  */
   if (rs6000_cpu == PROCESSOR_PPC405)
Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c	(revision 258900)
+++ gcc/config/rs6000/rs6000-string.c	(working copy)
@@ -966,8 +966,7 @@
   rtx final_cleanup = gen_label_rtx ();
   rtx cmp_rem_before = gen_reg_rtx (word_mode);
   /* Compare one more word_mode chunk if needed.  */
-  if (!bytes_is_const
-	  || (bytes_is_const && bytes_remaining >= load_mode_size))
+  if (!bytes_is_const || bytes_remaining >= load_mode_size)
 	{
 	  /* If remainder length < word length, branch to final
 	 cleanup compare.  */
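
For reference, both hunks apply the same absorption identity,
!A || (A && !B) == !A || !B, which a quick exhaustive check confirms
(standalone C, added here just for verification):

  #include <assert.h>

  int main (void)
  {
    for (int a = 0; a <= 1; a++)
      for (int b = 0; b <= 1; b++)
        assert ((!a || (a && !b)) == (!a || !b));
    return 0;
  }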


[PATCH, rs6000] document options (PR85321)

2018-04-10 Thread Aaron Sawdey
This updates invoke.texi to document -mblock-compare-inline-limit,
-mblock-compare-inline-loop-limit, and -mstring-compare-inline-limit.

Tested with "make pdf", ok for trunk?

2018-04-10  Aaron Sawdey  

PR target/85321
* doc/invoke.texi (RS/6000 and PowerPC Options): Document options
-mblock-compare-inline-limit, -mblock-compare-inline-loop-limit,
and -mstring-compare-inline-limit.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/doc/invoke.texi
===
--- gcc/doc/invoke.texi	(revision 259295)
+++ gcc/doc/invoke.texi	(working copy)
@@ -1080,6 +1080,9 @@
 -maix-struct-return  -msvr4-struct-return @gol
 -mabi=@var{abi-type}  -msecure-plt  -mbss-plt @gol
 -mblock-move-inline-limit=@var{num} @gol
+-mblock-compare-inline-limit=@var{num} @gol
+-mblock-compare-inline-loop-limit=@var{num} @gol
+-mstring-compare-inline-limit=@var{num} @gol
 -misel  -mno-isel @gol
 -misel=yes  -misel=no @gol
 -mpaired @gol
@@ -24142,6 +24145,31 @@
 @var{num} is 32 bytes on 32-bit targets and 64 bytes on 64-bit
 targets.  The default value is target-specific.
 
+@item -mblock-compare-inline-limit=@var{num}
+@opindex mblock-compare-inline-limit
+Generate non-looping inline code for all block compares (such as calls
+to @code{memcmp} or structure compares) less than or equal to @var{num}
+bytes. If @var{num} is 0, all inline expansion (non-loop and loop) of
+block compare is disabled. The default value is target-specific.
+
+@item -mblock-compare-inline-loop-limit=@var{num}
+@opindex mblock-compare-inline-loop-limit
+Generate an inline expansion using loop code for all block compares that
+are less than or equal to @var{num} bytes, but greater than the limit
+for non-loop inline block compare expansion. If the block length is not
+constant, at most @var{num} bytes will be compared before @code{memcmp}
+is called to compare the remainder of the block. The default value is
+target-specific.
+
+@item -mstring-compare-inline-limit=@var{num}
+@opindex mstring-compare-inline-limit
+Generate at most @var{num} pairs of load instructions to compare the
+string inline. If the difference or end of string is not found at the
+end of the inline compare a call to @code{strcmp} or @code{strncmp} will
+take care of the rest of the comparison. The default is 8 pairs of
+loads, which will compare 64 bytes on a 64-bit target and 32 bytes on a
+32-bit target.
+
 @item -G @var{num}
 @opindex G
 @cindex smaller data references (PowerPC)
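
As a usage illustration (the values here are made up, not recommended
defaults), the three limits compose on the command line like this:

  # Inline memcmp/struct compares up to 48 bytes without a loop, use an
  # inline loop up to 256 bytes, and emit at most 8 load pairs for
  # inline strcmp/strncmp before calling the library routine.
  gcc -O2 -mblock-compare-inline-limit=48 \
          -mblock-compare-inline-loop-limit=256 \
          -mstring-compare-inline-limit=8 test.c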


[PATCH, committed] Update my MAINTAINERS entry

2018-04-10 Thread Aaron Sawdey
Update to my new email address. Committed as 259301.

2018-04-10  Aaron Sawdey  

* MAINTAINERS: Update my email address.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: MAINTAINERS
===
--- MAINTAINERS	(revision 259295)
+++ MAINTAINERS	(working copy)
@@ -568,7 +568,7 @@
 Duncan Sands	
 Sujoy Saraswati	
 Trevor Saunders	
-Aaron Sawdey	
+Aaron Sawdey	
 Roger Sayle	
 Will Schmidt	
 Tilo Schwarz	


[PATCH, rs6000] PR85321 improve documentation of -mcall and -mtraceback=

2018-04-10 Thread Aaron Sawdey
Another update to document -mcall- and -mtraceback= options. Cleanup to
remove -mabi={no-,}spe from the RS/6000 and PowerPC section. And a trim
to the help text for -mblock-compare-* and -mstring-compare-inline-limit
so they are not excessively long. The complete description for those is
now in invoke.texi. This is the last piece for 85321.

Testing in progress on linux-ppc64le, ok for trunk if tests are ok?

Thanks,
   Aaron

2018-04-10  Aaron Sawdey  

PR target/85321
* doc/invoke.texi (RS/6000 and PowerPC Options): Document options
-mcall= and -mtraceback. Remove options -mabi=spe and -mabi=no-spe
from PowerPC section.
* config/rs6000/sysv4.opt (mcall): Improve help text.
* config/rs6000/rs6000.opt (mblock-compare-inline-limit=): Trim
help text that is too long.
* config/rs6000/rs6000.opt (mblock-compare-inline-loop-limit=): Trim
help text that is too long.
* config/rs6000/rs6000.opt (mstring-compare-inline-limit=): Trim
help text that is too long.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: doc/invoke.texi
===
--- doc/invoke.texi	(revision 259302)
+++ doc/invoke.texi	(working copy)
@@ -1076,7 +1076,10 @@
 -mprioritize-restricted-insns=@var{priority} @gol
 -msched-costly-dep=@var{dependence_type} @gol
 -minsert-sched-nops=@var{scheme} @gol
--mcall-sysv  -mcall-netbsd @gol
+-mcall-aixdesc  -mcall-eabi  -mcall-freebsd  @gol
+-mcall-linux  -mcall-netbsd  -mcall-openbsd  @gol
+-mcall-sysv  -mcall-sysv-eabi  -mcall-sysv-noeabi @gol
+-mtraceback=@var{traceback_type} @gol
 -maix-struct-return  -msvr4-struct-return @gol
 -mabi=@var{abi-type}  -msecure-plt  -mbss-plt @gol
 -mblock-move-inline-limit=@var{num} @gol
@@ -23957,6 +23960,12 @@
 On System V.4 and embedded PowerPC systems compile code for the
 OpenBSD operating system.
 
+@item -mtraceback=@var{traceback_type}
+@opindex mtraceback
+Select the type of traceback table. Valid values for @var{traceback_type}
+are @samp{full}, @samp{part},
+and @samp{no}.
+
 @item -maix-struct-return
 @opindex maix-struct-return
 Return all structures in memory (as specified by the AIX ABI)@.
@@ -23973,16 +23982,6 @@
 @samp{no-spe}, @samp{ibmlongdouble}, @samp{ieeelongdouble},
 @samp{elfv1}, @samp{elfv2}@.
 
-@item -mabi=spe
-@opindex mabi=spe
-Extend the current ABI with SPE ABI extensions.  This does not change
-the default ABI, instead it adds the SPE ABI extensions to the current
-ABI@.
-
-@item -mabi=no-spe
-@opindex mabi=no-spe
-Disable Book-E SPE ABI extensions for the current ABI@.
-
 @item -mabi=ibmlongdouble
 @opindex mabi=ibmlongdouble
 Change the current ABI to use IBM extended-precision long double.
Index: config/rs6000/sysv4.opt
===
--- config/rs6000/sysv4.opt	(revision 259301)
+++ config/rs6000/sysv4.opt	(working copy)
@@ -21,7 +21,7 @@
 
 mcall-
 Target RejectNegative Joined Var(rs6000_abi_name)
-Select ABI calling convention.
+-mcall=ABI	Select ABI calling convention.
 
 msdata=
 Target RejectNegative Joined Var(rs6000_sdata_name)
Index: config/rs6000/rs6000.opt
===
--- config/rs6000/rs6000.opt	(revision 259301)
+++ config/rs6000/rs6000.opt	(working copy)
@@ -335,15 +335,15 @@
 
 mblock-compare-inline-limit=
 Target Report Var(rs6000_block_compare_inline_limit) Init(31) RejectNegative Joined UInteger Save
-Specify the maximum number of bytes to compare inline with non-looping code. If this is set to 0, all inline expansion (non-loop and loop) of memcmp is disabled.
+Specify the maximum number of bytes to compare inline with non-looping code.
 
 mblock-compare-inline-loop-limit=
 Target Report Var(rs6000_block_compare_inline_loop_limit) Init(-1) RejectNegative Joined UInteger Save
-Specify the maximum number of bytes to compare inline with loop code generation.  If the length is not known at compile time, memcmp will be called after this many bytes are compared. By default, a length will be picked depending on the tuning target.
+Specify the maximum number of bytes to compare inline with loop code generation.
 
 mstring-compare-inline-limit=
 Target Report Var(rs6000_string_compare_inline_limit) Init(8) RejectNegative Joined UInteger Save
-Specify the maximum number pairs of load instructions that should be generated inline for the compare.  If the number needed exceeds the limit, a call to strncmp will be generated instead.
+Specify the maximum number of pairs of load instructions that should be generated for inline compares.
 
 misel
 Target Report Mask(ISEL) Var(rs6000_isa_flags)
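
For illustration (the cross-compiler triple and the chosen values are
assumptions, not from the patch), the newly documented options are used
like:

  powerpc-linux-gnu-gcc -O2 -mcall-linux -mtraceback=part main.c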


[PATCH] rs6000 PR83660 fix ICE with vec_extract

2018-04-13 Thread Aaron Sawdey
Per the discussion on PR 83660, I've come to a minimal patch to
prevent this. Basically marking the vec_extract tree as having side
effects later makes sure that it gets all the cleanup points it needs
so that gimplify_cleanup_point_expr () is happy.  Also because
vec_insert puts a MODIFY_EXPR in there, it has side effects and this
problem will not occur.

Doing bootstrap/regtest on ppc64le with -mcpu=power7 since that is
where this issue arises. OK for trunk if everything passes?

Thanks,
   Aaron


2018-04-13  Aaron Sawdey  

PR target/83660
* config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): Mark
vec_extract expression as having side effects to make sure it gets
a cleanup point.

2018-04-13  Aaron Sawdey  

PR target/83660
* gcc.target/powerpc/pr83660.C: New test.

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: config/rs6000/rs6000-c.c
===
--- config/rs6000/rs6000-c.c	(revision 259353)
+++ config/rs6000/rs6000-c.c	(working copy)
@@ -6705,6 +6705,15 @@
   stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1);
   stmt = build_indirect_ref (loc, stmt, RO_NULL);
 
+  /* PR83660: We mark this as having side effects so that
+     downstream in fold_build_cleanup_point_expr () it will get a
+     CLEANUP_POINT_EXPR.  If it does not we can run into an ICE
+     later in gimplify_cleanup_point_expr ().  Potentially this
+     causes a missed optimization because there actually is no
+     side effect.  */
+  if (c_dialect_cxx ())
+	TREE_SIDE_EFFECTS (stmt) = 1;
+
   return stmt;
 }
 
Index: testsuite/gcc.target/powerpc/pr83660.C
===
--- testsuite/gcc.target/powerpc/pr83660.C	(nonexistent)
+++ testsuite/gcc.target/powerpc/pr83660.C	(working copy)
@@ -0,0 +1,14 @@
+/* PR target/83660 */
+/* { dg-do compile } */
+/* { dg-options "-mcpu=power7" } */
+
+#include <altivec.h>
+
+typedef __vector unsigned int  uvec32_t  __attribute__((__aligned__(16)));
+
+unsigned get_word(uvec32_t v)
+{
+return ({const unsigned _B1 = 32;
+vec_extract((uvec32_t)v, 2);});
+}
+


[PATCH][rs6000][PR target/87474] fix strncmp expansion with -mno-power8-vector

2018-10-01 Thread Aaron Sawdey
PR/87474 happens because I didn't check that both vector and VSX instructions
were enabled, so insns that are disabled get generated with -mno-power8-vector.

Regstrap passes on ppc64le, ok for trunk?

Thanks!
  Aaron



2018-10-01  Aaron Sawdey  

PR target/87474
* config/rs6000/rs6000-string.c (expand_strn_compare): Check that both
vector and VSX are enabled.


Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 264760)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -2205,6 +2205,7 @@
 }
   else
 {
+  /* Implies TARGET_P8_VECTOR here. */
   rtx diffix = gen_reg_rtx (DImode);
   rtx result_gbbd = gen_reg_rtx (V16QImode);
   /* Since each byte of the input is either 00 or FF, the bytes in
@@ -2313,9 +2314,12 @@
   /* Is it OK to use vec/vsx for this. TARGET_VSX means we have at
  least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
  at least POWER8.  That way we can rely on overlapping compares to
- do the final comparison of less than 16 bytes.  Also I do not want
- to deal with making this work for 32 bits.  */
-  int use_vec = (bytes >= 16 && !TARGET_32BIT && TARGET_EFFICIENT_UNALIGNED_VSX);
+ do the final comparison of less than 16 bytes.  Also I do not
+ want to deal with making this work for 32 bits.  In addition, we
+ have to make sure that we have at least P8_VECTOR (we don't allow
+ P9_VECTOR without P8_VECTOR).  */
+  int use_vec = (bytes >= 16 && !TARGET_32BIT
+&& TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);

   if (use_vec)
 required_align = 16;


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000][PR target/87474] fix strncmp expansion with -mno-power8-vector

2018-10-02 Thread Aaron Sawdey



On 10/2/18 3:38 AM, Segher Boessenkool wrote:
> On Mon, Oct 01, 2018 at 11:09:44PM -0500, Aaron Sawdey wrote:
>> PR/87474 happens because I didn't check that both vector and VSX instructions
>> were enabled, so insns that are disabled get generated with 
>> -mno-power8-vector.
> 
>>  PR target/87474
>>  * config/rs6000/rs6000-string.c (expand_strn_compare): Check that both
>>  vector and VSX are enabled.
> 
> You mean "P8 vector" or similar, I think?

True, it should say TARGET_P[89]_VECTOR.

> 
> 
>> --- gcc/config/rs6000/rs6000-string.c(revision 264760)
>> +++ gcc/config/rs6000/rs6000-string.c(working copy)
>> @@ -2205,6 +2205,7 @@
>>  }
>>else
>>  {
>> +  /* Implies TARGET_P8_VECTOR here. */
> 
> That isn't true as far as I see.

We can only enter emit_final_str_compare_vec() if TARGET_P8_VECTOR is set.
That's the additional check this patch adds. So in this function you can have
both P8 and P9 vector, or just p8 vector. rs6000_option_override_internal()
enforces that P8 vector must be set if P9 vector is set. So in the else here
we know that only p8 vector is set.

> 
> 
> Okay for trunk with improved changelog and that stray line removed.
> Thanks!
> 
> 
> Segher
> 

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH] rs6000 PR83660 fix ICE with vec_extract

2018-04-23 Thread Aaron Sawdey
This also affects gcc 7 and is fixed by the same patch. I've tested the
backport to 7 on ppc64le and it causes no new fails. OK for backport to
7 (and 6 if it's also needed there)?

Thanks,
   Aaron


On Fri, 2018-04-13 at 15:37 -0500, Aaron Sawdey wrote:
> Per the discussion on the 83660, I've come to a minimal patch to
> prevent this. Basically marking the vec_extract tree as having side
> effects later makes sure that it gets all the cleanup points it needs
> so that gimplify_cleanup_point_expr () is happy.  Also because
> vec_insert puts a MODIFY_EXPR in there, it has side effects and this
> problem will not occur.
> 
> Doing bootstrap/regtest on ppc64le with -mcpu=power7 since that is
> where this issue arises. OK for trunk if everything passes?
> 
> Thanks,
>Aaron
> 
> 
> 2018-04-13  Aaron Sawdey  
> 
>   PR target/83660
>   * config/rs6000/rs6000-c.c
> (altivec_resolve_overloaded_builtin): Mark
>   vec_extract expression as having side effects to make sure it
> gets
>   a cleanup point.
> 
> 2018-04-13  Aaron Sawdey  
> 
>   PR target/83660
>   * gcc.target/powerpc/pr83660.C: New test.
> 
-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH][rs6000] avoid using unaligned vsx or lxvd2x/stxvd2x for memcpy/memmove inline expansion

2018-12-19 Thread Aaron Sawdey
Because of POWER9 dd2.1 issues with certain unaligned vsx instructions
to cache inhibited memory, here is a patch that keeps memmove (and memcpy)
inline expansion from doing unaligned vector or using vector load/store
other than lvx/stvx. More description of the issue is here:

https://patchwork.ozlabs.org/patch/814059/

OK for trunk if bootstrap/regtest ok?

Thanks!
   Aaron

2018-12-19  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_block_move): Don't use
unaligned vsx and avoid lxvd2x/stxvd2x.
(gen_lvx_v4si_move): New function.


Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 267055)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -2669,6 +2669,35 @@
   return true;
 }

+/* Generate loads and stores for a move of v4si mode using lvx/stvx.
+   This uses altivec_{l,st}vx__internal which use unspecs to
+   keep combine from changing what instruction gets used.
+
+   DEST is the destination for the data.
+   SRC is the source of the data for the move.  */
+
+static rtx
+gen_lvx_v4si_move (rtx dest, rtx src)
+{
+  rtx rv = NULL;
+  if (MEM_P (dest))
+{
+  gcc_assert (!MEM_P (src));
+  gcc_assert (GET_MODE (src) == V4SImode);
+  rv = gen_altivec_stvx_v4si_internal (dest, src);
+}
+  else if (MEM_P (src))
+{
+  gcc_assert (!MEM_P (dest));
+  gcc_assert (GET_MODE (dest) == V4SImode);
+  rv = gen_altivec_lvx_v4si_internal (dest, src);
+}
+  else
+gcc_unreachable ();
+
+  return rv;
+}
+
 /* Expand a block move operation, and return 1 if successful.  Return 0
if we should let the compiler generate normal code.

@@ -2721,11 +2750,11 @@

   /* Altivec first, since it will be faster than a string move
 when it applies, and usually not significantly larger.  */
-  if (TARGET_ALTIVEC && bytes >= 16 && (TARGET_EFFICIENT_UNALIGNED_VSX || align >= 128))
+  if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
{
  move_bytes = 16;
  mode = V4SImode;
- gen_func.mov = gen_movv4si;
+ gen_func.mov = gen_lvx_v4si_move;
}
   else if (bytes >= 8 && TARGET_POWERPC64
       && (align >= 64 || !STRICT_ALIGNMENT))



-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH 2/2] v2: C++: improvements to binary operator diagnostics (PR c++/87504)

2018-12-19 Thread Aaron Sawdey
h-location.h and gcc-rich-location.c.
>> (build_binary_op): Use struct op_location_t and
>> class binary_op_rich_location.
>>
>> gcc/cp/ChangeLog:
>> PR c++/87504
>> * call.c (op_error): Convert 1st param from location_t to
>> const op_location_t &.  Use binary_op_rich_location for binary
>> ops.
>> (build_conditional_expr_1): Convert 1st param from location_t to
>> const op_location_t &.
>> (build_conditional_expr): Likewise.
>> (build_new_op_1): Likewise.
>> (build_new_op): Likewise.
>> * cp-tree.h (build_conditional_expr): Likewise.
>> (build_new_op): Likewise.
>> (build_x_binary_op): Likewise.
>> (cp_build_binary_op): Likewise.
>> * parser.c (cp_parser_primary_expression): Build a location
>> for id-expression nodes.
>> (cp_parser_binary_expression): Use an op_location_t when
>> calling build_x_binary_op.
>> (cp_parser_operator): Build a location for user-defined literals.
>> * typeck.c (build_x_binary_op): Convert 1st param from location_t
>> to const op_location_t &.
>> (cp_build_binary_op): Likewise.  Use binary_op_rich_location.
>>
>> gcc/ChangeLog:
>> PR c++/87504
>> * gcc-rich-location.c
>> (maybe_range_label_for_tree_type_mismatch::get_text): Move here from
>> c/c-typeck.c.
>> (binary_op_rich_location::binary_op_rich_location): New ctor.
>> (binary_op_rich_location::use_operator_loc_p): New function.
>> * gcc-rich-location.h
>> (class maybe_range_label_for_tree_type_mismatch)): Move here from
>> c/c-typeck.c.
>> (struct op_location_t): New forward decl.
>> (class binary_op_rich_location): New class.
>> * tree.h (struct op_location_t): New struct.
>>
>> gcc/testsuite/ChangeLog:
>> * c-c++-common/Wtautological-compare-ranges.c: New test.
>> * g++.dg/cpp0x/pr51420.C: Add -fdiagnostics-show-caret and update
>> expected output.
>> * g++.dg/diagnostic/bad-binary-ops.C: Update expected output from
>> 1-location form to 3-location form, with labelling of ranges with
>> types.  Add examples of id-expression nodes with namespaces.
>> * g++.dg/diagnostic/param-type-mismatch-2.C: Likewise.
>>
>> This is the 2nd commit message:
>>
>> FIXME: column and multiline fixes to * g++.dg/cpp0x/pr51420.C
>> ---
>>   gcc/c-family/c-common.h    |  3 +-
>>   gcc/c-family/c-warn.c  | 57 +++---
>>   gcc/c/c-typeck.c   | 41 +-
>>   gcc/cp/call.c  | 28 ---
>>   gcc/cp/cp-tree.h   | 10 ++-
>>   gcc/cp/parser.c    | 32 ++--
>>   gcc/cp/typeck.c    | 14 ++--
>>   gcc/gcc-rich-location.c    | 89 
>> ++
>>   gcc/gcc-rich-location.h    | 57 ++
>>   .../c-c++-common/Wtautological-compare-ranges.c    | 42 ++
>>   gcc/testsuite/g++.dg/cpp0x/pr51420.C   | 10 +++
>>   gcc/testsuite/g++.dg/diagnostic/bad-binary-ops.C   | 57 +-
>>   .../g++.dg/diagnostic/param-type-mismatch-2.C  |  4 +-
>>   gcc/tree.h | 49 
>>   14 files changed, 417 insertions(+), 76 deletions(-)
>>   create mode 100644 
>> gcc/testsuite/c-c++-common/Wtautological-compare-ranges.c
>>
>> diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h
>> index 4187343..0b9ddf6 100644
>> --- a/gcc/c-family/c-common.h
>> +++ b/gcc/c-family/c-common.h
>> @@ -1268,7 +1268,8 @@ extern void constant_expression_error (tree);
>>   extern void overflow_warning (location_t, tree, tree = NULL_TREE);
>>   extern void warn_logical_operator (location_t, enum tree_code, tree,
>>  enum tree_code, tree, enum tree_code, tree);
>> -extern void warn_tautological_cmp (location_t, enum tree_code, tree, tree);
>> +extern void warn_tautological_cmp (const op_location_t &, enum tree_code,
>> +   tree, tree);
>>   extern void warn_logical_not_parentheses (location_t, enum tree_code, tree,
>>     tree);
>>   extern bool warn_if_unused_value (const_tree, location_t);
>> diff --git a/gcc/c-family/c-warn.c b/gcc/c-family/c-warn.c
>> index fc7f87c..fce9d84 100644
>> --- a/gcc/c-family/c-warn.c
>> +++ b/gcc/c-family/c-warn.c
>> @@ -322,7 +322,8 @@ find_array_ref_with_const_idx_r (tree *expr_p, int *, 
>> void *)
>>       if ((TREE_CODE (expr) == ARRAY_REF
>>  || TREE_CODE (expr) == ARRAY_RANGE_REF)
>> -  && TREE_CODE (TREE_OPERAND (expr, 1)) == INTEGER_CST)
>> +  && (TREE_CODE (tree_strip_any_location_wrapper (TREE_OPERAND (expr, 
>> 1)))
>> +  == INTEGER_CST))
>>   return integer_type_node;
> 
> I think we want fold_for_warn here.  OK with that change (assuming it passes).
> 
> Jason
> 

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH] -Wtautological-compare: fix comparison of macro expansions

2018-12-20 Thread Aaron Sawdey
On 12/20/18 8:25 AM, David Malcolm wrote:
> According to comments within PR c++/87504, the patch fixes the
> bootstrap on aarch64, and fixes a similar issue on Solaris/SPARC.
> 
> It also passed bootstrap & regtesting on x86_64-pc-linux-gnu.
> 
> Given that, I've committed it to trunk as r267299.
> 
> Aaron, does this fix the issue you saw?
> 
> Thanks, and sorry again about the breakage.
> Dave
> 

Dave,
  Thanks for the quick response, the build issue is fixed with r267299.

  Aaron

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000] avoid using unaligned vsx or lxvd2x/stxvd2x for memcpy/memmove inline expansion

2018-12-20 Thread Aaron Sawdey
On 12/20/18 3:51 AM, Segher Boessenkool wrote:
> On Wed, Dec 19, 2018 at 01:53:05PM -0600, Aaron Sawdey wrote:
>> Because of POWER9 dd2.1 issues with certain unaligned vsx instructions
>> to cache inhibited memory, here is a patch that keeps memmove (and memcpy)
>> inline expansion from doing unaligned vector or using vector load/store
>> other than lvx/stvx. More description of the issue is here:
>>
>> https://patchwork.ozlabs.org/patch/814059/
>>
>> OK for trunk if bootstrap/regtest ok?
> 
> Okay, but see below.
> 
[snip]
> 
> This is extraordinarily clumsy :-)  Maybe something like:
> 
> static rtx
> gen_lvx_v4si_move (rtx dest, rtx src)
> {
>   gcc_assert (!(MEM_P (dest) && MEM_P (src));
>   gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
>   if (MEM_P (dest))
> return gen_altivec_stvx_v4si_internal (dest, src);
>   else if (MEM_P (src))
> return gen_altivec_lvx_v4si_internal (dest, src);
>   else
> gcc_unreachable ();
> }
> 
> (Or do you allow VOIDmode for src as well?)  Anyway, at least get rid of
> the useless extra variable.

I think this should be better:

static rtx
gen_lvx_v4si_move (rtx dest, rtx src)
{
  gcc_assert ((MEM_P (dest) && !MEM_P (src)) || (MEM_P (src) && !MEM_P(dest)));
  gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
  if (MEM_P (dest))
  return gen_altivec_stvx_v4si_internal (dest, src);
  else if (MEM_P (src))
  return gen_altivec_lvx_v4si_internal (dest, src);
  gcc_unreachable ();
}

I'll commit after I re-regstrap.

Thanks!
   Aaron

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000] avoid using unaligned vsx or lxvd2x/stxvd2x for memcpy/memmove inline expansion

2018-12-20 Thread Aaron Sawdey
On 12/20/18 5:44 PM, Segher Boessenkool wrote:
> On Thu, Dec 20, 2018 at 05:34:54PM -0600, Aaron Sawdey wrote:
>> On 12/20/18 3:51 AM, Segher Boessenkool wrote:
>>> On Wed, Dec 19, 2018 at 01:53:05PM -0600, Aaron Sawdey wrote:
>>>> Because of POWER9 dd2.1 issues with certain unaligned vsx instructions
>>>> to cache inhibited memory, here is a patch that keeps memmove (and memcpy)
>>>> inline expansion from doing unaligned vector or using vector load/store
>>>> other than lvx/stvx. More description of the issue is here:
>>>>
>>>> https://patchwork.ozlabs.org/patch/814059/
>>>>
>>>> OK for trunk if bootstrap/regtest ok?
>>>
>>> Okay, but see below.
>>>
>> [snip]
>>>
>>> This is extraordinarily clumsy :-)  Maybe something like:
>>>
>>> static rtx
>>> gen_lvx_v4si_move (rtx dest, rtx src)
>>> {
>>>   gcc_assert (!(MEM_P (dest) && MEM_P (src));
>>>   gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
>>>   if (MEM_P (dest))
>>> return gen_altivec_stvx_v4si_internal (dest, src);
>>>   else if (MEM_P (src))
>>> return gen_altivec_lvx_v4si_internal (dest, src);
>>>   else
>>> gcc_unreachable ();
>>> }
>>>
>>> (Or do you allow VOIDmode for src as well?)  Anyway, at least get rid of
>>> the useless extra variable.
>>
>> I think this should be better:
> 
> The gcc_unreachable at the end catches the non-mem to non-mem case.
> 
>> static rtx
>> gen_lvx_v4si_move (rtx dest, rtx src)
>> {
>>   gcc_assert ((MEM_P (dest) && !MEM_P (src)) || (MEM_P (src) && 
>> !MEM_P(dest)));
> 
> But if you prefer this, how about
> 
> {
>   gcc_assert (MEM_P (dest) ^ MEM_P (src));
>   gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
> 
>   if (MEM_P (dest))
> return gen_altivec_stvx_v4si_internal (dest, src);
>   else
> return gen_altivec_lvx_v4si_internal (dest, src);
> }
> 
> :-)
> 
> 
> Segher
> 

I like that even better, thanks!

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH][rs6000] improve gpr inline expansion of str[n]cmp

2018-10-25 Thread Aaron Sawdey
This patch changes the sequence that gcc generates for inline expansion of
strcmp/strncmp using scalar (gpr) instructions. The new sequence is one
instruction shorter and uses cmpb/cmpb/orc./bne which I also have been
told that valgrind should be able to understand as the defined/undefined
info can be propagated and should show that the branch is not based on
any undefined data past the end of the string.

Performance is mostly a wash. The new sequence is faster if there is a
difference in the string, the old sequence is slightly faster for short strings
that do not differ. The new sequence is faster for long strings that do not
differ, but that isn't important because if vsx is enabled, the gpr
sequence is only used for 15 bytes or less.

Bootstrap/regtest passes on ppc64le (power8, power9), ppc64 (power8)
and ppc32 (power8). Ok for trunk?

Thanks,
Aaron


2018-10-25  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_strncmp_gpr_sequence): Change to
a shorter sequence with fewer branches.
(emit_final_str_compare_gpr): Ditto.


Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 265393)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -259,7 +259,7 @@
   gcc_assert (mode == E_QImode);
   emit_move_insn (reg, mem);
   break;
-
+
 default:
   gcc_unreachable ();
   break;
@@ -726,7 +726,7 @@
 {
   if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
/* Do not expect length longer than word_mode.  */
-   return false;
+   return false;
   else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
{
  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
@@ -770,7 +770,7 @@
   rtx j;

   /* Example of generated code for 35 bytes aligned 1 byte.
-
+
 mtctr 8
 li 6,0
 li 5,8
@@ -798,7 +798,7 @@
 popcntd 9,9
 subfe 10,10,10
 or 9,9,10
-
+
  Compiled with -fno-reorder-blocks for clarity.  */

   /* Structure of what we're going to do:
@@ -1041,7 +1041,7 @@
   if (!bytes_is_const)
{
  /* If we're dealing with runtime length, we have to check if
-it's zero after the loop. When length is known at compile
+it's zero after the loop.  When length is known at compile
 time the no-remainder condition is dealt with above.  By
 doing this after cleanup_label, we also deal with the
 case where length is 0 at the start and we bypass the
@@ -1411,7 +1411,7 @@
   rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
   rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
   /* P7/P8 code uses cond for subfc. but P9 uses
- it for cmpld which needs CCUNSmode. */
+ it for cmpld which needs CCUNSmode.  */
   rtx cond;
   if (TARGET_P9_MISC)
 cond = gen_reg_rtx (CCUNSmode);
@@ -1655,7 +1655,7 @@
emit_label (convert_label);

   /* We need to produce DI result from sub, then convert to target SI
-while maintaining <0 / ==0 / >0 properties. This sequence works:
+while maintaining <0 / ==0 / >0 properties.  This sequence works:
 subfc L,A,B
 subfe H,H,H
 popcntd L,L
@@ -1740,7 +1740,7 @@
to strcmp/strncmp if we have equality at the end of the inline comparison.
P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
to clean up and generate the final comparison result.
-   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
+   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
set the final result.  */
 static void
 expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
@@ -1763,12 +1763,9 @@
   while (bytes_to_compare > 0)
 {
   /* GPR compare sequence:
- check each 8B with: ld/ld cmpd bne
-If equal, use rldicr/cmpb to check for zero byte.
+ check each 8B with: ld/ld/cmpb/cmpb/orc./bne
+
  cleanup code at end:
- cmpb  get byte that differs
- cmpb  look for zero byte
- orc   combine
  cntlzdget bit of first zero/diff byte
  subficconvert for rldcl use
  rldcl rldcl   extract diff/zero byte
@@ -1776,7 +1773,7 @@

  The last compare can branch around the cleanup code if the
  result is zero because the strings are exactly equal.  */
-
+
   unsigned int align = compute_current_alignment (base_align, offset);
   load_mode = select_block_compare_mode (offset, bytes_to_compare, align);
   load_mode_size = GET_MODE_SIZE (load_mode);
@@ -1801,34 +1798,49 @@
   rid of the extra bytes.  */
cmp_bytes = bytes_to_compare;

-  rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset

[PATCH][rs6000] use index form addresses more often for ldbrx/stdbrx

2018-10-27 Thread Aaron Sawdey
At Segher's suggestion, I looked into changing the predicates on
bswapdi2_{load,store} from memory_operand to indexed_or_indirect_operand
and putting some code into bswapdi2 to make the address indirect if it
wasn't already.

The motivating case for this was the code I was seeing for the gpr
expansion of strncmp. Before I would typically see something like this:

addi 9,3,8
ldbrx 10,0,9
addi 9,4,8
ldbrx 8,0,9
subf. 9,8,10
bne 0,.L13
cmpb 10,10,9
cmpdi 0,10,0
bne 0,.L9
addi 9,3,16
ldbrx 10,0,9
addi 9,4,16
ldbrx 8,0,9
subf. 9,8,10
bne 0,.L13
cmpb 10,10,9
cmpdi 0,10,0
bne 0,.L9

For each comparison block it is doing the add separately and using 0 for
one input of the ldbrx.

After this change, it is more like this:

ldbrx 8,3,27
ldbrx 7,4,27
cmpb 9,8,9
cmpb 10,8,7
orc. 9,9,10
bne 0,.L13
ldbrx 8,3,24
ldbrx 7,4,24
cmpb 10,8,9
cmpb 9,8,7
orc. 9,10,9
bne 0,.L13


Here it has created temps with constants and hoisted them out of a loop,
but I have other cases where it will update them if there is more
register pressure. In either case the code is more compact and makes
full use of the indexed addressing of ldbrx.

Bootstrap/regtest passed on ppc64le targeting power7/power8/power9, ok
for trunk?

Thanks!
Aaron

2018-10-27  Aaron Sawdey  

* config/rs6000/rs6000.md (bswapdi2): Force address into register
if not in one already.
(bswapdi2_load): Change predicate to indexed_or_indirect_operand.
(bswapdi2_store): Ditto.

Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md (revision 265393)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -2512,9 +2512,27 @@
   if (TARGET_POWERPC64 && TARGET_LDBRX)
 {
   if (MEM_P (src))
-   emit_insn (gen_bswapdi2_load (dest, src));
+{
+  rtx addr = XEXP (src, 0);
+  if (!legitimate_indirect_address_p (addr, reload_completed)
+  && !legitimate_indexed_address_p (addr, reload_completed))
+{
+  addr = force_reg (Pmode, addr);
+  src = replace_equiv_address_nv (src, addr);
+}
+ emit_insn (gen_bswapdi2_load (dest, src));
+}
   else if (MEM_P (dest))
-   emit_insn (gen_bswapdi2_store (dest, src));
+{
+  rtx addr = XEXP (dest, 0);
+  if (!legitimate_indirect_address_p (addr, reload_completed)
+  && !legitimate_indexed_address_p (addr, reload_completed))
+{
+  addr = force_reg (Pmode, addr);
+  dest = replace_equiv_address_nv (dest, addr);
+}
+ emit_insn (gen_bswapdi2_store (dest, src));
+}
   else if (TARGET_P9_VECTOR)
emit_insn (gen_bswapdi2_xxbrd (dest, src));
   else
@@ -2535,13 +2553,13 @@
 ;; Power7/cell has ldbrx/stdbrx, so use it directly
 (define_insn "bswapdi2_load"
   [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
-   (bswap:DI (match_operand:DI 1 "memory_operand" "Z")))]
+   (bswap:DI (match_operand:DI 1 "indexed_or_indirect_operand" "Z")))]
   "TARGET_POWERPC64 && TARGET_LDBRX"
   "ldbrx %0,%y1"
   [(set_attr "type" "load")])

 (define_insn "bswapdi2_store"
-  [(set (match_operand:DI 0 "memory_operand" "=Z")
+  [(set (match_operand:DI 0 "indexed_or_indirect_operand" "=Z")
(bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r")))]
   "TARGET_POWERPC64 && TARGET_LDBRX"
   "stdbrx %1,%y0"




-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000] use index form addresses more often for ldbrx/stdbrx

2018-10-29 Thread Aaron Sawdey
On 10/27/18 12:52 PM, Segher Boessenkool wrote:
> Hi Aaron,
> 
> On Sat, Oct 27, 2018 at 11:20:01AM -0500, Aaron Sawdey wrote:
>> --- gcc/config/rs6000/rs6000.md  (revision 265393)
>> +++ gcc/config/rs6000/rs6000.md  (working copy)
>> @@ -2512,9 +2512,27 @@
>>if (TARGET_POWERPC64 && TARGET_LDBRX)
>>  {
>>if (MEM_P (src))
>> -emit_insn (gen_bswapdi2_load (dest, src));
>> +{
>> +  rtx addr = XEXP (src, 0);
>> +  if (!legitimate_indirect_address_p (addr, reload_completed)
>> +  && !legitimate_indexed_address_p (addr, reload_completed))
> 
> Should you use indexed_or_indirect operand instead here?
> 
>> +{
>> +  addr = force_reg (Pmode, addr);
>> +  src = replace_equiv_address_nv (src, addr);
>> +}
>> +  emit_insn (gen_bswapdi2_load (dest, src));
>> +}
> 
> You could maybe make this a utility routine as well (in rs6000.c)...
> Something like force_indexed_or_indirect_mem.  So this code will be just
> 
>   if (MEM_P (src))
>   force_indexed_or_indirect_mem (src);
> 
> then.
> 
> Could you try those things please?
> 
> 
> Segher
> 

Segher,
  Here's a patch restructured in that way.
OK for trunk if bootstrap/regtest passes?

Thanks!
   Aaron

2018-10-29  Aaron Sawdey  

* config/rs6000/rs6000.md (bswapdi2): Force address into register
if not in one already.
(bswapdi2_load): Change predicate to indexed_or_indirect_operand.
(bswapdi2_store): Ditto.
* config/rs6000/rs6000.c (rs6000_force_indexed_or_indirect_mem): New
helper function.
* config/rs6000/rs6000-protos.h (rs6000_force_indexed_or_indirect_mem):
Prototype for helper function.



Index: gcc/config/rs6000/rs6000-protos.h
===
--- gcc/config/rs6000/rs6000-protos.h   (revision 265588)
+++ gcc/config/rs6000/rs6000-protos.h   (working copy)
@@ -47,6 +47,7 @@
 extern bool legitimate_indirect_address_p (rtx, int);
 extern bool legitimate_indexed_address_p (rtx, int);
 extern bool avoiding_indexed_address_p (machine_mode);
+extern void rs6000_force_indexed_or_indirect_mem (rtx x);

 extern rtx rs6000_got_register (rtx);
 extern rtx find_addr_reg (rtx);
Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c  (revision 265588)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -8423,7 +8423,22 @@
   return false;
 }

+/* Helper function for making sure we will make full
+   use of indexed addressing.  */

+void
+rs6000_force_indexed_or_indirect_mem (rtx x)
+{
+  rtx addr = XEXP (x, 0);
+  machine_mode m = GET_MODE (x);
+  if (!indexed_or_indirect_operand (x, m))
+{
+  addr = force_reg (Pmode, addr);
+  x = replace_equiv_address_nv (x, addr);
+}
+}
+
+
 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook.  */

 static bool
Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md (revision 265588)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -2512,9 +2512,15 @@
   if (TARGET_POWERPC64 && TARGET_LDBRX)
 {
   if (MEM_P (src))
-   emit_insn (gen_bswapdi2_load (dest, src));
+{
+ rs6000_force_indexed_or_indirect_mem (src);
+ emit_insn (gen_bswapdi2_load (dest, src));
+}
   else if (MEM_P (dest))
-   emit_insn (gen_bswapdi2_store (dest, src));
+{
+ rs6000_force_indexed_or_indirect_mem (dest);
+ emit_insn (gen_bswapdi2_store (dest, src));
+}
   else if (TARGET_P9_VECTOR)
emit_insn (gen_bswapdi2_xxbrd (dest, src));
   else
@@ -2535,13 +2541,13 @@
 ;; Power7/cell has ldbrx/stdbrx, so use it directly
 (define_insn "bswapdi2_load"
   [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
-   (bswap:DI (match_operand:DI 1 "memory_operand" "Z")))]
+   (bswap:DI (match_operand:DI 1 "indexed_or_indirect_operand" "Z")))]
   "TARGET_POWERPC64 && TARGET_LDBRX"
   "ldbrx %0,%y1"
   [(set_attr "type" "load")])

 (define_insn "bswapdi2_store"
-  [(set (match_operand:DI 0 "memory_operand" "=Z")
+  [(set (match_operand:DI 0 "indexed_or_indirect_operand" "=Z")
(bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r")))]
   "TARGET_POWERPC64 && TARGET_LDBRX"
   "stdbrx %1,%y0"



-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000] use index form addresses more often for ldbrx/stdbrx

2018-10-30 Thread Aaron Sawdey
I had to make one more change to make this actually work. In
rs6000_force_indexed_or_indirect_mem() it was necessary to
return the updated rtx.

Bootstrap/regtest passes on ppc64le (power7, power9), ok for trunk?

Thanks!
   Aaron

2018-10-30  Aaron Sawdey  

* config/rs6000/rs6000.md (bswapdi2): Force address into register
if not in indexed or indirect form.
(bswapdi2_load): Change predicate to indexed_or_indirect_operand.
(bswapdi2_store): Ditto.
* config/rs6000/rs6000.c (rs6000_force_indexed_or_indirect_mem): New
helper function.
* config/rs6000/rs6000-protos.h (rs6000_force_indexed_or_indirect_mem):
Prototype for helper function.


Index: gcc/config/rs6000/rs6000-protos.h
===
--- gcc/config/rs6000/rs6000-protos.h   (revision 265588)
+++ gcc/config/rs6000/rs6000-protos.h   (working copy)
@@ -47,6 +47,7 @@
 extern bool legitimate_indirect_address_p (rtx, int);
 extern bool legitimate_indexed_address_p (rtx, int);
 extern bool avoiding_indexed_address_p (machine_mode);
+extern rtx rs6000_force_indexed_or_indirect_mem (rtx x);

 extern rtx rs6000_got_register (rtx);
 extern rtx find_addr_reg (rtx);
Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c  (revision 265588)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -8423,7 +8423,23 @@
   return false;
 }

+/* Helper function for making sure we will make full
+   use of indexed addressing.  */

+rtx
+rs6000_force_indexed_or_indirect_mem (rtx x)
+{
+  machine_mode m = GET_MODE (x);
+  if (!indexed_or_indirect_operand (x, m))
+{
+  rtx addr = XEXP (x, 0);
+  addr = force_reg (Pmode, addr);
+  x = replace_equiv_address_nv (x, addr);
+}
+  return x;
+}
+
+
 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook.  */

 static bool
Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md (revision 265588)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -2512,9 +2512,15 @@
   if (TARGET_POWERPC64 && TARGET_LDBRX)
 {
   if (MEM_P (src))
-   emit_insn (gen_bswapdi2_load (dest, src));
+{
+ src = rs6000_force_indexed_or_indirect_mem (src);
+ emit_insn (gen_bswapdi2_load (dest, src));
+}
   else if (MEM_P (dest))
-   emit_insn (gen_bswapdi2_store (dest, src));
+{
+ dest = rs6000_force_indexed_or_indirect_mem (dest);
+ emit_insn (gen_bswapdi2_store (dest, src));
+}
   else if (TARGET_P9_VECTOR)
emit_insn (gen_bswapdi2_xxbrd (dest, src));
   else
@@ -2535,13 +2541,13 @@
 ;; Power7/cell has ldbrx/stdbrx, so use it directly
 (define_insn "bswapdi2_load"
   [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
-   (bswap:DI (match_operand:DI 1 "memory_operand" "Z")))]
+   (bswap:DI (match_operand:DI 1 "indexed_or_indirect_operand" "Z")))]
   "TARGET_POWERPC64 && TARGET_LDBRX"
   "ldbrx %0,%y1"
   [(set_attr "type" "load")])

 (define_insn "bswapdi2_store"
-  [(set (match_operand:DI 0 "memory_operand" "=Z")
+  [(set (match_operand:DI 0 "indexed_or_indirect_operand" "=Z")
(bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r")))]
   "TARGET_POWERPC64 && TARGET_LDBRX"
   "stdbrx %1,%y0"



-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH][rs6000] cleanup and rename rs6000_address_for_fpconvert

2018-11-01 Thread Aaron Sawdey
This patch combines the duties of rs6000_address_for_fpconvert into
rs6000_force_indexed_or_indirect_mem which I recently added, and changes
all calls to use the latter. The new function name is more descriptive of what
is actually going on. This now uses indexed_or_indirect_operand() to test
the incoming rtx which matches what the insns this is used to prepare for
are using as their predicate.

Bootstrap/regtest passes on ppc64le (power7, power9), ok for trunk?



2018-11-01  Aaron Sawdey  

* config/rs6000/rs6000-protos.h (rs6000_address_for_fpconvert): Remove
prototype.
* config/rs6000/rs6000.c (rs6000_force_indexed_or_indirect_mem):
Combine with rs6000_address_for_fpconvert.
(rs6000_address_for_fpconvert) Combine with
rs6000_force_indexed_or_indirect_mem.
(rs6000_expand_vector_init): Change function call from
rs6000_address_for_fpconvert to rs6000_force_indexed_or_indirect_mem.
* config/rs6000/rs6000.md (floatsi2_lfiwax): Change call from
rs6000_address_for_fpconvert to rs6000_force_indexed_or_indirect_mem.
(floatsi2_lfiwax_mem): Ditto.
(floatunssi2_lfiwzx): Ditto.
(floatunssi2_lfiwzx_mem): Ditto.
(float2): Ditto.
(floatuns2): Ditto.
(fix_truncsi2_stfiwx): Ditto.
(fixuns_truncsi2_stfiwx): Ditto.
(float_si2_hw): Ditto.
(floatuns_si2_hw): Ditto.
* config/rs6000/vsx.md (*vsx_extract_si): Ditto.
(vsx_splat_): Ditto.




Index: gcc/config/rs6000/rs6000-protos.h
===
--- gcc/config/rs6000/rs6000-protos.h   (revision 265637)
+++ gcc/config/rs6000/rs6000-protos.h   (working copy)
@@ -153,7 +153,6 @@

 extern rtx rs6000_machopic_legitimize_pic_address (rtx, machine_mode,
   rtx);
-extern rtx rs6000_address_for_fpconvert (rtx);
 extern rtx rs6000_allocate_stack_temp (machine_mode, bool, bool);
 extern align_flags rs6000_loop_align (rtx);
 extern void rs6000_split_logical (rtx [], enum rtx_code, bool, bool, bool);
Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c  (revision 265637)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -6560,7 +6560,7 @@
{
  rtx element0 = XVECEXP (vals, 0, 0);
  if (MEM_P (element0))
-   element0 = rs6000_address_for_fpconvert (element0);
+   element0 = rs6000_force_indexed_or_indirect_mem (element0);
  else
element0 = force_reg (SImode, element0);

@@ -6601,7 +6601,7 @@
  if (TARGET_P9_VECTOR)
{
  if (MEM_P (element0))
-   element0 = rs6000_address_for_fpconvert (element0);
+   element0 = rs6000_force_indexed_or_indirect_mem (element0);

  emit_insn (gen_vsx_splat_v4sf (target, element0));
}
@@ -8423,23 +8423,6 @@
   return false;
 }

-/* Helper function for making sure we will make full
-   use of indexed addressing.  */
-
-rtx
-rs6000_force_indexed_or_indirect_mem (rtx x)
-{
-  machine_mode m = GET_MODE (x);
-  if (!indexed_or_indirect_operand (x, m))
-{
-  rtx addr = XEXP (x, 0);
-  addr = force_reg (Pmode, addr);
-  x = replace_equiv_address_nv (x, addr);
-}
-  return x;
-}
-
-
 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook.  */

 static bool
@@ -37312,21 +37295,19 @@
   return stack;
 }

-/* Given a memory reference, if it is not a reg or reg+reg addressing, convert
-   to such a form to deal with memory reference instructions like STFIWX that
-   only take reg+reg addressing.  */
+/* Given a memory reference, if it is not a reg or reg+reg addressing,
+   convert to such a form to deal with memory reference instructions
+   like STFIWX and LDBRX that only take reg+reg addressing.  */

 rtx
-rs6000_address_for_fpconvert (rtx x)
+rs6000_force_indexed_or_indirect_mem (rtx x)
 {
-  rtx addr;
+  machine_mode m = GET_MODE (x);

   gcc_assert (MEM_P (x));
-  addr = XEXP (x, 0);
-  if (can_create_pseudo_p ()
-  && ! legitimate_indirect_address_p (addr, reload_completed)
-  && ! legitimate_indexed_address_p (addr, reload_completed))
+  if (can_create_pseudo_p () && !indexed_or_indirect_operand (x, m))
 {
+  rtx addr = XEXP (x, 0);
   if (GET_CODE (addr) == PRE_INC || GET_CODE (addr) == PRE_DEC)
{
  rtx reg = XEXP (addr, 0);
@@ -37346,7 +37327,7 @@
  addr = reg;
}

-  x = replace_equiv_address (x, copy_addr_to_reg (addr));
+  x = replace_equiv_address (x, force_reg (Pmode, addr));
 }

   return x;
Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md (revision 265637)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -5225,7 +5225,7 @@
tmp = gen_reg_rtx (DImode);
   if (MEM_P (src))

[PATCH][rs6000] fix ICE for strncmp expansion on power6

2018-11-02 Thread Aaron Sawdey
This patch addresses an ICE for a missing instruction when targeting
power6. The issue is that we shouldn't generate x-form load rtx if
TARGET_AVOID_XFORM is true because it won't end up being matched. More
generally, on big endian we do not need to use ldbrx et al., which are
index loads, but can just use ld and other normal d-form loads. So this
is going to generate better code for BE in general, which is why I have
changed it to do this for big endian or TARGET_AVOID_XFORM.
Bootstrap/regtest passes on ppc32 and ppc64 (power 6/7/8), ok for trunk?

Thanks!
   Aaron


2018-11-02  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_strncmp_gpr_sequence): Pay
attention to TARGET_AVOID_XFORM.

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 265733)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -1798,12 +1798,18 @@
   rid of the extra bytes.  */
cmp_bytes = bytes_to_compare;

-  rtx offset_reg = gen_reg_rtx (Pmode);
-  emit_move_insn (offset_reg, GEN_INT (offset));
-
-  rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_reg);
+  rtx offset_rtx;
+  if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM)
+   offset_rtx = GEN_INT (offset);
+  else
+   {
+ offset_rtx = gen_reg_rtx (Pmode);
+ emit_move_insn (offset_rtx, GEN_INT (offset));
+   }
+  rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx);
+  rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx);
+   
   do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1);
-  rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_reg);
   do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2);

   /* We must always left-align the data we read, and
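
For intuition, a hand-written sketch (not actual compiler output) of the
two addressing forms when loading a chunk at offset 8 from a string base
in r3:

  # LE keeps the x-form pair (ldbrx has no d-form encoding):
  li    9,8
  ldbrx 10,3,9
  # BE / TARGET_AVOID_XFORM can use a plain d-form load instead:
  ld    10,8(3)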


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH][rs6000] use index form addresses more often for l[wh]brx/st[wh]brx

2018-11-05 Thread Aaron Sawdey
This does the same thing for bswap2 that I previously did for bswapdi2.
The predicates for bswap2_{load,store} are now 
indexed_or_indirect_operand,
and bswap2 uses rs6000_force_indexed_or_indirect_mem to make sure the
address is appropriate for that predicate.

Bootstrap/regtest passes on ppc64le power8/power9, ok for trunk?

Thanks!
Aaron



2018-11-05  Aaron Sawdey  

* config/rs6000/rs6000.md (bswap2): Force address into register
if not in indexed or indirect form.
(bswap2_load): Change predicate to indexed_or_indirect_operand.
(bswap2_store): Ditto.


Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md (revision 265753)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -2411,9 +2411,15 @@
 src = force_reg (mode, src);

   if (MEM_P (src))
-emit_insn (gen_bswap2_load (dest, src));
+{
+   src = rs6000_force_indexed_or_indirect_mem (src);
+   emit_insn (gen_bswap2_load (dest, src));
+}
   else if (MEM_P (dest))
-emit_insn (gen_bswap2_store (dest, src));
+{
+   dest = rs6000_force_indexed_or_indirect_mem (dest);
+   emit_insn (gen_bswap2_store (dest, src));
+}
   else
 emit_insn (gen_bswap2_reg (dest, src));
   DONE;
@@ -2421,13 +2427,13 @@

 (define_insn "bswap2_load"
   [(set (match_operand:HSI 0 "gpc_reg_operand" "=r")
-   (bswap:HSI (match_operand:HSI 1 "memory_operand" "Z")))]
+   (bswap:HSI (match_operand:HSI 1 "indexed_or_indirect_operand" "Z")))]
   ""
   "lbrx %0,%y1"
   [(set_attr "type" "load")])

 (define_insn "bswap2_store"
-  [(set (match_operand:HSI 0 "memory_operand" "=Z")
+  [(set (match_operand:HSI 0 "indexed_or_indirect_operand" "=Z")
(bswap:HSI (match_operand:HSI 1 "gpc_reg_operand" "r")))]
   ""
   "stbrx %1,%y0"




-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH][rs6000] inline expansion of memcmp using vsx

2018-11-14 Thread Aaron Sawdey
This patch generalizes some of the functions added earlier to do vsx expansion
of strncmp so that they can also generate the code needed for memcmp. I
reorganized expand_block_compare() a little to be able to make use of this
there. The vsx code is more compact, so I've changed the default block compare
inline limit to 63 bytes. The vsx code is only used if there are at least 16
bytes to compare, as this means we don't have to do complex code to compare
less than one chunk. If vsx is not available the limit is cut in half. The
performance is good: vsx memcmp is considerably faster than the gpr inline code
if the strings are equal, and is comparable if the strings have a 10% chance of
being equal (spread across the string).

Currently regtesting, ok for trunk if tests pass?

Thanks!
   Aaron

2018-11-14  Aaron Sawdey  

* config/rs6000/rs6000-string.c (emit_vsx_zero_reg): New function.
(expand_cmp_vec_sequence): Rename and modify
expand_strncmp_vec_sequence.
(emit_final_compare_vec): Rename and modify emit_final_str_compare_vec.
(generate_6432_conversion): New function.
(expand_block_compare): Add support for vsx.
(expand_block_compare_gpr): New function.
* config/rs6000/rs6000.opt (rs6000_block_compare_inline_limit): Increase
default limit to 63 because of more compact vsx code.




Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 266034)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -615,6 +615,283 @@
 }
 }

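+/* Generate and return a V16QImode pseudo containing all zero bytes.  It
+   is used as the zero input for the end-of-string checks when vcmpnezb
+   (ISA 3.0) is not available.  */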
+static rtx
+emit_vsx_zero_reg()
+{
+  unsigned int i;
+  rtx zr[16];
+  for (i = 0; i < 16; i++)
+zr[i] = GEN_INT (0);
+  rtvec zv = gen_rtvec_v (16, zr);
+  rtx zero_reg = gen_reg_rtx (V16QImode);
+  rs6000_expand_vector_init (zero_reg, gen_rtx_PARALLEL (V16QImode, zv));
+  return zero_reg;
+}
+
+/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
+   instructions.
+
+   BYTES_TO_COMPARE is the number of bytes to be compared.
+   ORIG_SRC1 is the unmodified rtx for the first string.
+   ORIG_SRC2 is the unmodified rtx for the second string.
+   S1ADDR is the register to use for the base address of the first string.
+   S2ADDR is the register to use for the base address of the second string.
+   OFF_REG is the register to use for the string offset for loads.
+   S1DATA is the register for loading the first string.
+   S2DATA is the register for loading the second string.
+   VEC_RESULT is the rtx for the vector result indicating the byte difference.
+   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
+   to strcmp/strncmp if we have equality at the end of the inline comparison.
+   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
+   to clean up and generate the final comparison result.
+   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
+   set the final result.
+   CHECKZERO indicates whether the sequence should check for zero bytes
+   for use doing strncmp, or not (for use doing memcmp).  */
+static void
+expand_cmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
+rtx orig_src1, rtx orig_src2,
+rtx s1addr, rtx s2addr, rtx off_reg,
+rtx s1data, rtx s2data, rtx vec_result,
+bool equality_compare_rest, rtx *p_cleanup_label,
+rtx final_move_label, bool checkzero)
+{
+  machine_mode load_mode;
+  unsigned int load_mode_size;
+  unsigned HOST_WIDE_INT cmp_bytes = 0;
+  unsigned HOST_WIDE_INT offset = 0;
+  rtx zero_reg = NULL;
+
+  gcc_assert (p_cleanup_label != NULL);
+  rtx cleanup_label = *p_cleanup_label;
+
+  emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0)));
+  emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0)));
+
+  if (checkzero && !TARGET_P9_VECTOR)
+zero_reg = emit_vsx_zero_reg();
+
+  while (bytes_to_compare > 0)
+{
+  /* VEC/VSX compare sequence for P8:
+check each 16B with:
+lxvd2x 32,28,8
+lxvd2x 33,29,8
+vcmpequb 2,0,1  # compare strings
+vcmpequb 4,0,3  # compare w/ 0
+xxlorc 37,36,34   # first FF byte is either mismatch or end of string
+vcmpequb. 7,5,3  # reg 7 contains 0
+bnl 6,.Lmismatch
+
+For the P8 LE case, we use lxvd2x and compare full 16 bytes
+but then use vgbbd and a shift to get two bytes with the
+information we need in the correct order.
+
+VEC/VSX compare sequence if TARGET_P9_VECTOR:
+lxvb16x/lxvb16x # load 16B of each string
+vcmpnezb.   # produces difference location or zero byte location
+bne 6,.Lmismatch
+
+Use the overlapping compare trick for the last block if it is
+less than 16 bytes.
+  */
+
+  load_mode = V16QImod

Re: [PATCH][rs6000] inline expansion of memcmp using vsx

2018-11-15 Thread Aaron Sawdey
On 11/15/18 4:02 AM, Richard Biener wrote:
> On Wed, Nov 14, 2018 at 5:43 PM Aaron Sawdey  wrote:
>>
>> This patch generalizes some the functions added earlier to do vsx expansion 
>> of strncmp
>> so that the can also generate the code needed for memcmp. I reorganized
>> expand_block_compare() a little to be able to make use of this there. The 
>> vsx code is more
>> compact so I've changed the default block compare inline limit to 63 bytes. 
>> The vsx
>> code is only used if there is at least 16 bytes to compare as this means we 
>> don't have to
>> do complex code to compare less than one chunk. If vsx is not available the 
>> limit is cut
>> in half. The performance is good, vsx memcmp is considerably faster than the 
>> gpr inline code
>> if the strings are equal and is comparable if the strings have a 10% chance 
>> of being
>> equal (spread across the string).
> 
> How is performance affected if there are close earlier char-size
> stores to one of the string/memory?
> Can power still do store forwarding in this case?

Store forwarding between scalar and vector is not great, but it's
better than having to make a plt call to memcmp() which may well use
vsx anyway. I had set the crossover between scalar and vsx at 16 bytes
because the vsx code is more compact. The performance is similar for
16-32 byte sizes. But you could make an argument for switching at 33
bytes. This way builtin memcmp of 33-64 bytes would now use inline vsx
code instead of a memcmp() call. At 33 bytes the vsx inline code is 3x
faster than a memcmp() call, so it would likely remain faster even if
there was an ugly vector-load-hit-scalar-store. Also, small structures
of 32 bytes and less being compared would use scalar code, the same as
gcc 8, and would avoid this issue.

  Aaron

> 
>> Currently regtesting, ok for trunk if tests pass?
>>
>> Thanks!
>>Aaron
>>
>> 2018-11-14  Aaron Sawdey  
>>
>> * config/rs6000/rs6000-string.c (emit_vsx_zero_reg): New function.
>> (expand_cmp_vec_sequence): Rename and modify
>> expand_strncmp_vec_sequence.
>> (emit_final_compare_vec): Rename and modify 
>> emit_final_str_compare_vec.
>> (generate_6432_conversion): New function.
>> (expand_block_compare): Add support for vsx.
>> (expand_block_compare_gpr): New function.
>> * config/rs6000/rs6000.opt (rs6000_block_compare_inline_limit): 
>> Increase
>> default limit to 63 because of more compact vsx code.
>>
>>
>>
>>
>> Index: gcc/config/rs6000/rs6000-string.c
>> ===
>> --- gcc/config/rs6000/rs6000-string.c   (revision 266034)
>> +++ gcc/config/rs6000/rs6000-string.c   (working copy)
>> @@ -615,6 +615,283 @@
>>  }
>>  }
>>
>> +static rtx
>> +emit_vsx_zero_reg()
>> +{
>> +  unsigned int i;
>> +  rtx zr[16];
>> +  for (i = 0; i < 16; i++)
>> +zr[i] = GEN_INT (0);
>> +  rtvec zv = gen_rtvec_v (16, zr);
>> +  rtx zero_reg = gen_reg_rtx (V16QImode);
>> +  rs6000_expand_vector_init (zero_reg, gen_rtx_PARALLEL (V16QImode, zv));
>> +  return zero_reg;
>> +}
>> +
>> +/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
>> +   instructions.
>> +
>> +   BYTES_TO_COMPARE is the number of bytes to be compared.
>> +   ORIG_SRC1 is the unmodified rtx for the first string.
>> +   ORIG_SRC2 is the unmodified rtx for the second string.
>> +   S1ADDR is the register to use for the base address of the first string.
>> +   S2ADDR is the register to use for the base address of the second string.
>> +   OFF_REG is the register to use for the string offset for loads.
>> +   S1DATA is the register for loading the first string.
>> +   S2DATA is the register for loading the second string.
>> +   VEC_RESULT is the rtx for the vector result indicating the byte 
>> difference.
>> +   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup 
>> call
>> +   to strcmp/strncmp if we have equality at the end of the inline 
>> comparison.
>> +   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need 
>> code
>> +   to clean up and generate the final comparison result.
>> +   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
>> +   set the final result.
>> +   CHECKZERO indicates whether the sequence should check for zero bytes
>> +   for use doing strncmp, or not (for use doing memcmp).  */
>> +static void
>> +expand_cmp_vec_sequence (unsig

[PATCH][rs6000] better use of unaligned vsx in memset() expansion

2018-11-26 Thread Aaron Sawdey
When I previously added the use of unaligned vsx stores to inline expansion
of memset, I didn't do a good job of managing boundary conditions. The intention
was to only use unaligned vsx if the block being cleared was more than 32 bytes.
What it actually did was to prevent the use of unaligned vsx for the last 32
bytes of any block being cleared. So this change puts the test up front so it
is not affected by the decrement of bytes.

OK for trunk if regstrap passes?

Thanks!
   Aaron



2018-11-26  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_block_clear): Change how
we determine if unaligned vsx is ok.


Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 266219)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -85,14 +85,14 @@
   if (! optimize_size && bytes > 8 * clear_step)
 return 0;

+  bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);
+
   for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
 {
   machine_mode mode = BLKmode;
   rtx dest;

-  if (TARGET_ALTIVEC
- && ((bytes >= 16 && align >= 128)
- || (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX)))
+  if (TARGET_ALTIVEC && ((bytes >= 16 && align >= 128) || 
unaligned_vsx_ok))
{
  clear_bytes = 16;
  mode = V4SImode;

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000][8 backport] improve gpr inline expansion of str[n]cmp

2018-11-26 Thread Aaron Sawdey
Just so there is some record of what I did here -- in order to backport the
gpr strncmp expansion improvement patch to gcc 8 I had to pull in some pieces
of an earlier cleanup patch from June of this year.

I'll get this committed to gcc-8-branch when I'm done with the bootstrap/regtest
on a couple different ppc64 architectures (unless anyone has any objections).

Thanks,
   Aaron



2018-11-26  Aaron Sawdey  

Backport from mainline
2018-10-25  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_strncmp_gpr_sequence): Change to
a shorter sequence with fewer branches.
(emit_final_str_compare_gpr): Ditto.

Backport from mainline to allow the above code to go in:
2018-06-14  Aaron Sawdey  

* config/rs6000/rs6000-string.c (do_and3, do_and3_mask,


Index: rs6000-string.c
===
--- rs6000-string.c (revision 266483)
+++ rs6000-string.c (working copy)
@@ -408,6 +408,54 @@
 emit_insn (gen_addsi3 (dest, src1, src2));
 }

+/* Emit an and of the proper mode for DEST.
+
+   DEST is the destination register for the and.
+   SRC1 is the first and input.
+   SRC2 is the second and input.
+
+   Computes DEST = SRC1&SRC2.  */
+static void
+do_and3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_anddi3 (dest, src1, src2));
+  else
+emit_insn (gen_andsi3 (dest, src1, src2));
+}
+
+/* Emit a cmpb of the proper mode for DEST.
+
+   DEST is the destination register for the cmpb.
+   SRC1 is the first input.
+   SRC2 is the second input.
+
+   Computes cmpb of SRC1, SRC2.  */
+static void
+do_cmpb3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_cmpbdi3 (dest, src1, src2));
+  else
+emit_insn (gen_cmpbsi3 (dest, src1, src2));
+}
+
+/* Emit a rotl of the proper mode for DEST.
+
+   DEST is the destination register for the rotate.
+   SRC1 is the value to be rotated.
+   SRC2 is the rotate count.
+
+   Computes DEST = SRC1 rotated left by SRC2.  */
+static void
+do_rotl3 (rtx dest, rtx src1, rtx src2)
+{
+  if (GET_MODE (dest) == DImode)
+emit_insn (gen_rotldi3 (dest, src1, src2));
+  else
+emit_insn (gen_rotlsi3 (dest, src1, src2));
+}
+
 /* Generate rtl for a load, shift, and compare of less than a full word.

LOAD_MODE is the machine mode for the loads.
@@ -640,7 +688,7 @@
 {
   if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
/* Do not expect length longer than word_mode.  */
-   return false;
+   return false;
   else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE 
(word_mode))
{
  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
@@ -684,7 +732,7 @@
   rtx j;

   /* Example of generated code for 35 bytes aligned 1 byte.
-
+
 mtctr 8
 li 6,0
 li 5,8
@@ -712,7 +760,7 @@
 popcntd 9,9
 subfe 10,10,10
 or 9,9,10
-
+
  Compiled with -fno-reorder-blocks for clarity.  */

   /* Structure of what we're going to do:
@@ -955,7 +1003,7 @@
   if (!bytes_is_const)
{
  /* If we're dealing with runtime length, we have to check if
-it's zero after the loop. When length is known at compile
+it's zero after the loop.  When length is known at compile
 time the no-remainder condition is dealt with above.  By
 doing this after cleanup_label, we also deal with the
 case where length is 0 at the start and we bypass the
@@ -1325,7 +1373,7 @@
   rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
   rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
   /* P7/P8 code uses cond for subfc. but P9 uses
- it for cmpld which needs CCUNSmode. */
+ it for cmpld which needs CCUNSmode.  */
   rtx cond;
   if (TARGET_P9_MISC)
 cond = gen_reg_rtx (CCUNSmode);
@@ -1578,7 +1626,7 @@
emit_label (convert_label);

   /* We need to produce DI result from sub, then convert to target SI
-while maintaining <0 / ==0 / >0 properties. This sequence works:
+while maintaining <0 / ==0 / >0 properties.  This sequence works:
 subfc L,A,B
 subfe H,H,H
 popcntd L,L
@@ -1847,6 +1895,9 @@
   rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
   rtx tmp_reg_src2 = gen_reg_rtx (word_mode);

+  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
+  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
+
   /* Generate sequence of ld/ldbrx, cmpb to compare out
  to the length specified.  */
   unsigned HOST_WIDE_INT bytes_to_compare = compare_length;
@@ -1853,12 +1904,9 @@
   while (bytes_to_compare > 0)
 {
   /* Compare sequence:
- check each 8B with: ld/ld cmpd bne
-If equal, use rldicr/cmpb to check for zero byte.
+ check each 8B with: ld/ld/cmpb/cmpb/orc./bne
+
  cle

Re: [PATCH][rs6000] better use of unaligned vsx in memset() expansion

2018-11-28 Thread Aaron Sawdey
The first version of this had a big bug and cleared past the requested bytes.
This version passes regstrap on ppc64le(power7/8/9), ppc64be(power6/7/8),
and ppc32(power8).
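In outline, the corrected shape is as follows (sketch mine, simplified from
expand_block_clear, not the committed hunk): the unaligned-vsx decision is
latched from the original size before the loop starts decrementing bytes, and
the per-iteration test still requires at least 16 bytes so the final vector
store cannot run past the block.

/* Sketch of the fixed control flow.  */
bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);

for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
  {
    if (TARGET_ALTIVEC && bytes >= 16 && (align >= 128 || unaligned_vsx_ok))
      clear_bytes = 16;	/* one 16-byte vector store */
    else
      clear_bytes = 8;	/* simplified: scalar stores for the remainder */
    /* ... emit the store for this chunk ... */
  }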

OK for trunk (and 8 backport after a week)?

Thanks!
   Aaron

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 266524)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -85,6 +85,8 @@
   if (! optimize_size && bytes > 8 * clear_step)
 return 0;

+  bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);
+
   for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
 {
   machine_mode mode = BLKmode;
@@ -91,8 +93,7 @@
   rtx dest;

   if (TARGET_ALTIVEC
- && ((bytes >= 16 && align >= 128)
- || (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX)))
+ && (bytes >= 16 && ( align >= 128 || unaligned_vsx_ok)))
{
  clear_bytes = 16;
  mode = V4SImode;


On 11/26/18 4:29 PM, Segher Boessenkool wrote:
> On Mon, Nov 26, 2018 at 03:08:32PM -0600, Aaron Sawdey wrote:
>> When I previously added the use of unaligned vsx stores to inline expansion
>> of memset, I didn't do a good job of managing boundary conditions. The 
>> intention
>> was to only use unaligned vsx if the block being cleared was more than 32 
>> bytes.
>> What it actually did was to prevent the use of unaligned vsx for the last 32
>> bytes of any block being cleared. So this change puts the test up front so it
>> is not affected by the decrement of bytes.
> 
> Oh wow.  Yes, that isn't so great.  Okay for trunk (and whatever backports).
> Thanks,
> 
> 
> Segher
> 
> 
>> 2018-11-26  Aaron Sawdey  
>>
>>  * config/rs6000/rs6000-string.c (expand_block_clear): Change how
>>  we determine if unaligned vsx is ok.
> 

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH 31/30] Update documentation for movmem to cpymem change

2019-06-26 Thread Aaron Sawdey
On 6/25/19 4:43 PM, Jeff Law wrote:
> On 6/25/19 2:22 PM, acsaw...@linux.ibm.com wrote:
>> From: Aaron Sawdey 
>>
>>  * builtins.c (get_memory_rtx): Fix comment.
>>  * optabs.def (movmem_optab): Change to cpymem_optab.
>>  * expr.c (emit_block_move_via_cpymem): Change movmem to cpymem.
>>  (emit_block_move_hints): Change movmem to cpymem.
>>  * defaults.h: Change movmem to cpymem.
>>  * targhooks.c (get_move_ratio): Change movmem to cpymem.
>>  (default_use_by_pieces_infrastructure_p): Ditto.
> So I think you're missing an update to the RTL/MD documentation.  This
> is also likely to cause problems for any out-of-tree ports, so it's
> probably worth a mention in the gcc-10 changes, which will need to be
> created (in CVS no less, ugh).
> 
> I think the stuff posted to-date is fine, but it shouldn't go in without
> the corresponding docs and gcc-10 changes updates.
This would be my proposed patch to update the documentation. I'll also work
out the entry for the gcc-10 changes page and post that for review before
this all goes in.

OK for trunk along with the other 30 patches?

Thanks,
Aaron



* doc/md.texi: Change movmem to cpymem and update description to match.
* doc/rtl.texi: Change movmem to cpymem.
* target.def (use_by_pieces_infrastructure_p): Change movmem to cpymem.
* doc/tm.texi: Regenerate.
---
 gcc/doc/md.texi  | 26 ++
 gcc/doc/rtl.texi |  2 +-
 gcc/doc/tm.texi  |  4 ++--
 gcc/target.def   |  4 ++--
 4 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index b45b4be..3f9d545 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6200,13 +6200,13 @@ This pattern is not allowed to @code{FAIL}.
 @item @samp{one_cmpl@var{m}2}
 Store the bitwise-complement of operand 1 into operand 0.

-@cindex @code{movmem@var{m}} instruction pattern
-@item @samp{movmem@var{m}}
-Block move instruction.  The destination and source blocks of memory
+@cindex @code{cpymem@var{m}} instruction pattern
+@item @samp{cpymem@var{m}}
+Block copy instruction.  The destination and source blocks of memory
 are the first two operands, and both are @code{mem:BLK}s with an
 address in mode @code{Pmode}.

-The number of bytes to move is the third operand, in mode @var{m}.
+The number of bytes to copy is the third operand, in mode @var{m}.
 Usually, you specify @code{Pmode} for @var{m}.  However, if you can
 generate better code knowing the range of valid lengths is smaller than
 those representable in a full Pmode pointer, you should provide
@@ -6226,14 +6226,16 @@ in a way that the blocks are not required to be aligned according to it in
 all cases. This expected alignment is also in bytes, just like operand 4.
 Expected size, when unknown, is set to @code{(const_int -1)}.

-Descriptions of multiple @code{movmem@var{m}} patterns can only be
+Descriptions of multiple @code{cpymem@var{m}} patterns can only be
 beneficial if the patterns for smaller modes have fewer restrictions
 on their first, second and fourth operands.  Note that the mode @var{m}
-in @code{movmem@var{m}} does not impose any restriction on the mode of
-individually moved data units in the block.
+in @code{cpymem@var{m}} does not impose any restriction on the mode of
+individually copied data units in the block.

-These patterns need not give special consideration to the possibility
-that the source and destination strings might overlap.
+The @code{cpymem@var{m}} patterns need not give special consideration
+to the possibility that the source and destination strings might
+overlap. These patterns are used to do inline expansion of
+@code{__builtin_memcpy}.

 @cindex @code{movstr} instruction pattern
 @item @samp{movstr}
@@ -6254,7 +6256,7 @@ given as a @code{mem:BLK} whose address is in mode @code{Pmode}.  The
 number of bytes to set is the second operand, in mode @var{m}.  The value to
 initialize the memory with is the third operand. Targets that only support the
 clearing of memory should reject any value that is not the constant 0.  See
-@samp{movmem@var{m}} for a discussion of the choice of mode.
+@samp{cpymem@var{m}} for a discussion of the choice of mode.

 The fourth operand is the known alignment of the destination, in the form
 of a @code{const_int} rtx.  Thus, if the compiler knows that the
@@ -6272,13 +6274,13 @@ Operand 9 is the probable maximal size (i.e.@: we cannot rely on it for
 correctness, but it can be used for choosing proper code sequence for a
 given size).

-The use for multiple @code{setmem@var{m}} is as for @code{movmem@var{m}}.
+The use for multiple @code{setmem@var{m}} is as for @code{cpymem@var{m}}.

 @cindex @code{cmpstrn@var{m}} instruction pattern
 @item @samp{cmpstrn@var{m}}
 String compare instruction, with five operands.  Operand 0 is the output;
 it has mode @var{m}.  The remaining four operands

Re: [PATCH 32/30] Document movmem/cpymem changes in gcc-10/changes.html

2019-06-27 Thread Aaron Sawdey
On 6/25/19 4:43 PM, Jeff Law wrote:
> On 6/25/19 2:22 PM, acsaw...@linux.ibm.com wrote:
>> From: Aaron Sawdey 
>>
>>  * builtins.c (get_memory_rtx): Fix comment.
>>  * optabs.def (movmem_optab): Change to cpymem_optab.
>>  * expr.c (emit_block_move_via_cpymem): Change movmem to cpymem.
>>  (emit_block_move_hints): Change movmem to cpymem.
>>  * defaults.h: Change movmem to cpymem.
>>  * targhooks.c (get_move_ratio): Change movmem to cpymem.
>>  (default_use_by_pieces_infrastructure_p): Ditto.
> So I think you're missing an update to the RTL/MD documentation.  This
> is also likely to cause problems for any out-of-tree ports, so it's
> probably worth a mention in the gcc-10 changes, which will need to be
> created (in CVS no less, ugh).
> 
> I think the stuff posted to-date is fine, but it shouldn't go in without
> the corresponding docs and gcc-10 changes updates.

Here is the corresponding documentation change for gcc-10/changes.html.

OK for trunk?

Thanks,
Aaron




Index: changes.html
===
RCS file: /cvs/gcc/wwwdocs/htdocs/gcc-10/changes.html,v
retrieving revision 1.4
diff -r1.4 changes.html
139c139,149
< 
---
> <h2>Other significant improvements</h2>
> <ul>
>   <li>
>     To allow inline expansion of both <code>memcpy</code>
>     and <code>memmove</code>, the existing <code>movmem</code> instruction
>     patterns used for non-overlapping memory copies have been renamed to
>     <code>cpymem</code>. The <code>movmem</code> name is now used
>     for overlapping memory moves, consistent with the
>     library functions <code>memcpy</code> and <code>memmove</code>.
>   </li>
> </ul>

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain


[PATCH] Add movmem optab entry back in for overlapping moves

2019-07-02 Thread Aaron Sawdey
This is the second piece for allowing inline expansion of memmove. Now that
the old movmem patterns have all been renamed to cpymem, the movmem optab can
be added back.

Next piece will be: add support for __builtin_memmove() to use the movmem
optab and associated patterns.
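As a usage example (mine), this is the kind of overlapping move that can only
go through the movmem path, since cpymem is allowed to assume the regions are
disjoint.

#include <string.h>

/* Overlapping regions: this must have memmove semantics, so inline
   expansion has to come from the movmem pattern, never cpymem.  */
void shift_left (char *buf, size_t n)
{
  memmove (buf, buf + 1, n - 1);
}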

This patch passes bootstrap/regtest on ppc64le and x86_64. Ok for trunk?

2019-07-02  Aaron Sawdey  

* optabs.def (movmem_optab): Add movmem back for memmove().
* doc/md.texi: Add description of movmem pattern for overlapping move.


Index: gcc/doc/md.texi
===
--- gcc/doc/md.texi (revision 272762)
+++ gcc/doc/md.texi (working copy)
@@ -6237,6 +6237,42 @@
 overlap. These patterns are used to do inline expansion of
 @code{__builtin_memcpy}.

+@cindex @code{movmem@var{m}} instruction pattern
+@item @samp{movmem@var{m}}
+Block move instruction.  The destination and source blocks of memory
+are the first two operands, and both are @code{mem:BLK}s with an
+address in mode @code{Pmode}.
+
+The number of bytes to copy is the third operand, in mode @var{m}.
+Usually, you specify @code{Pmode} for @var{m}.  However, if you can
+generate better code knowing the range of valid lengths is smaller than
+those representable in a full Pmode pointer, you should provide
+a pattern with a
+mode corresponding to the range of values you can handle efficiently
+(e.g., @code{QImode} for values in the range 0--127; note we avoid numbers
+that appear negative) and also a pattern with @code{Pmode}.
+
+The fourth operand is the known shared alignment of the source and
+destination, in the form of a @code{const_int} rtx.  Thus, if the
+compiler knows that both source and destination are word-aligned,
+it may provide the value 4 for this operand.
+
+Optional operands 5 and 6 specify expected alignment and size of block
+respectively.  The expected alignment differs from alignment in operand 4
+in a way that the blocks are not required to be aligned according to it in
+all cases. This expected alignment is also in bytes, just like operand 4.
+Expected size, when unknown, is set to @code{(const_int -1)}.
+
+Descriptions of multiple @code{movmem@var{m}} patterns can only be
+beneficial if the patterns for smaller modes have fewer restrictions
+on their first, second and fourth operands.  Note that the mode @var{m}
+in @code{movmem@var{m}} does not impose any restriction on the mode of
+individually copied data units in the block.
+
+The @code{movmem@var{m}} patterns must correctly handle the case where
+the source and destination strings overlap. These patterns are used to
+do inline expansion of @code{__builtin_memmove}.
+
 @cindex @code{movstr} instruction pattern
 @item @samp{movstr}
 String copy instruction, with @code{stpcpy} semantics.  Operand 0 is
Index: gcc/optabs.def
===
--- gcc/optabs.def  (revision 272762)
+++ gcc/optabs.def  (working copy)
@@ -257,6 +257,7 @@
 OPTAB_D (cmpstr_optab, "cmpstr$a")
 OPTAB_D (cmpstrn_optab, "cmpstrn$a")
 OPTAB_D (cpymem_optab, "cpymem$a")
+OPTAB_D (movmem_optab, "movmem$a")
 OPTAB_D (setmem_optab, "setmem$a")
 OPTAB_D (strlen_optab, "strlen$a")


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH] sched-ebb.c: avoid moving table jumps (PR rtl-optimization/88423)

2019-02-18 Thread Aaron Sawdey
The code in emit_case_dispatch_table() will very clearly always emit
branch/label/jumptable_data/barrier, so this does need to be handled. So, yes,
a tablejump always looks like this, and also yes, it seems to be ripe ground
for logic bugs; we have 88308, 88347, and 88423 all related to it.
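For context, that emitted sequence looks like this (sketch):

/* Insn stream produced for a dispatch table (sketch):

     jump_insn         <- the tablejump itself
     code_label        <- label addressing the table
     jump_table_data   <- the addr_vec / addr_diff_vec
     barrier           <- two insns past the jump, not adjacent to it

   Code that assumes the barrier immediately follows the jump misreads
   this layout.  */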

In the long term it might be nice to use a general mechanism (SCHED_GROUP_P?)
for handling the label and jump table data that follow a case branch using a
jump table.

But for now in stage 4, I think the right way to fix this is with the patch 
that Segher posted earlier.
If regtest passes (x86_64 and ppc64/ppc32), ok for trunk?

2019-02-18  Aaron Sawdey  

PR rtl-optimization/88347
* sched-ebb.c (begin_move_insn): Apply Segher's patch to handle
a jump table before the barrier.


On 1/24/19 9:43 AM, Alexander Monakov wrote:
> On Wed, 23 Jan 2019, Alexander Monakov wrote:
> 
>> It appears that sched-deps tries to take notice of a barrier after a jump, 
>> but
>> similarly to sched-ebb doesn't anticipate that for a tablejump the barrier 
>> will
>> appear after two more insns (a code_label and a jump_table_data).
>>
>> If so, it needs a fixup just like the posted change for the assert. I'll 
>> fire up
>> a bootstrap/regtest.
> 
> Updated patch below (now taking into account that NEXT_INSN may give NULL)
> passes bootstrap/regtest on x86_64, also with -fsched2-use-superblocks.
> 
> I'm surprised to learn that a tablejump may be not the final insn in its
> containing basic block.  It certainly seems like a ripe ground for logic
> bugs like this one.  Is it really intentional?
> 
> OK for trunk?
> 
> Thanks.
> Alexander
> 
>   PR rtl-optimization/88347
>   PR rtl-optimization/88423
>   * sched-deps.c (sched_analyze_insn): Take into account that for
>   tablejumps the barrier appears after a label and a jump_table_data.
> 
> --- a/gcc/sched-deps.c
> +++ b/gcc/sched-deps.c
> @@ -3005,6 +3005,11 @@ sched_analyze_insn (struct deps_desc *deps, rtx x, 
> rtx_insn *insn)
>if (JUMP_P (insn))
>  {
>rtx_insn *next = next_nonnote_nondebug_insn (insn);
> +  /* ??? For tablejumps, the barrier may appear not immediately after
> + the jump, but after a label and a jump_table_data insn.  */
> +  if (next && LABEL_P (next) && NEXT_INSN (next)
> +   && JUMP_TABLE_DATA_P (NEXT_INSN (next)))
> + next = NEXT_INSN (NEXT_INSN (next));
>if (next && BARRIER_P (next))
>   reg_pending_barrier = MOVE_BARRIER;
>else
> 

-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH] sched-ebb.c: avoid moving table jumps (PR rtl-optimization/88423)

2019-02-18 Thread Aaron Sawdey


On 2/18/19 10:41 AM, Alexander Monakov wrote:
> On Mon, 18 Feb 2019, Aaron Sawdey wrote:
> 
>> The code in emit_case_dispatch_table() will very clearly always emit 
>> branch/label/jumptable_data/barrier
>> so this does need to be handled. So, yes tablejump always looks like this, 
>> and also yes it seems to be
>> ripe ground for logic bugs, we have 88308, 88347, 88423 all related to it.
>>
>> In the long term it might be nice to use a general mechanism 
>> (SCHED_GROUP_P?) for handling the label and jump
>> table data that follow a case branch using jump table.
>>
>> But for now in stage 4, I think the right way to fix this is with the patch 
>> that Segher posted earlier.
>> If regtest passes (x86_64 and ppc64/ppc32), ok for trunk?
> 
> How making an assert more permissive is "the right way" here?
> As already mentioned, without the assert we'd move a USE of the register with
> function return value to an unreachable block, which would be incorrect.
> 
> Do you anticipate issues with the sched-deps patch?

Alexander,
 I see you are allowing it to see the barrier as if it were right after the
tablejump.

Are you saying that the motion of the tablejump is happening because the
scheduler does not see the barrier (because it does not follow immediately
after) and thus decides it can move instructions to the other side of the
tablejump? I agree that is incorrect and is asking for other hidden problems.

It would be nice if the tablejump, jump table label, jump table data, and
barrier were all one indivisible unit somehow.

In the meantime, can someone approve Alexander's patch?

Thanks,
   Aaron



-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH][rs6000] avoid using unaligned vsx or lxvd2x/stxvd2x for memcpy/memmove inline expansion

2019-01-14 Thread Aaron Sawdey
The patch for this was committed to trunk as 267562 (see below). Is this also 
ok for backport to 8?

Thanks,
   Aaron

On 12/20/18 5:44 PM, Segher Boessenkool wrote:
> On Thu, Dec 20, 2018 at 05:34:54PM -0600, Aaron Sawdey wrote:
>> On 12/20/18 3:51 AM, Segher Boessenkool wrote:
>>> On Wed, Dec 19, 2018 at 01:53:05PM -0600, Aaron Sawdey wrote:
>>>> Because of POWER9 dd2.1 issues with certain unaligned vsx instructions
>>>> to cache inhibited memory, here is a patch that keeps memmove (and memcpy)
>>>> inline expansion from doing unaligned vector or using vector load/store
>>>> other than lvx/stvx. More description of the issue is here:
>>>>
>>>> https://patchwork.ozlabs.org/patch/814059/
>>>>
>>>> OK for trunk if bootstrap/regtest ok?
>>>
>>> Okay, but see below.
>>>
>> [snip]
>>>
>>> This is extraordinarily clumsy :-)  Maybe something like:
>>>
>>> static rtx
>>> gen_lvx_v4si_move (rtx dest, rtx src)
>>> {
>>>   gcc_assert (!(MEM_P (dest) && MEM_P (src));
>>>   gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
>>>   if (MEM_P (dest))
>>> return gen_altivec_stvx_v4si_internal (dest, src);
>>>   else if (MEM_P (src))
>>> return gen_altivec_lvx_v4si_internal (dest, src);
>>>   else
>>> gcc_unreachable ();
>>> }
>>>
>>> (Or do you allow VOIDmode for src as well?)  Anyway, at least get rid of
>>> the useless extra variable.
>>
>> I think this should be better:
> 
> The gcc_unreachable at the end catches the non-mem to non-mem case.
> 
>> static rtx
>> gen_lvx_v4si_move (rtx dest, rtx src)
>> {
>>   gcc_assert ((MEM_P (dest) && !MEM_P (src)) || (MEM_P (src) && 
>> !MEM_P(dest)));
> 
> But if you prefer this, how about
> 
> {
>   gcc_assert (MEM_P (dest) ^ MEM_P (src));
>   gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
> 
>   if (MEM_P (dest))
> return gen_altivec_stvx_v4si_internal (dest, src);
>   else
> return gen_altivec_lvx_v4si_internal (dest, src);
> }
> 
> :-)
> 
> 
> Segher
> 

2019-01-03  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_block_move): Don't use
unaligned vsx and avoid lxvd2x/stxvd2x.
(gen_lvx_v4si_move): New function.


Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 267299)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -2669,6 +2669,25 @@
   return true;
 }

+/* Generate loads and stores for a move of v4si mode using lvx/stvx.
+   This uses altivec_{l,st}vx_<mode>_internal which use unspecs to
+   keep combine from changing what instruction gets used.
+
+   DEST is the destination for the data.
+   SRC is the source of the data for the move.  */
+
+static rtx
+gen_lvx_v4si_move (rtx dest, rtx src)
+{
+  gcc_assert (MEM_P (dest) ^ MEM_P (src));
+  gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
+
+  if (MEM_P (dest))
+return gen_altivec_stvx_v4si_internal (dest, src);
+  else
+return gen_altivec_lvx_v4si_internal (dest, src);
+}
+
 /* Expand a block move operation, and return 1 if successful.  Return 0
if we should let the compiler generate normal code.

@@ -2721,11 +2740,11 @@

   /* Altivec first, since it will be faster than a string move
 when it applies, and usually not significantly larger.  */
-  if (TARGET_ALTIVEC && bytes >= 16 && (TARGET_EFFICIENT_UNALIGNED_VSX || 
align >= 128))
+  if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
{
  move_bytes = 16;
  mode = V4SImode;
- gen_func.mov = gen_movv4si;
+ gen_func.mov = gen_lvx_v4si_move;
}
   else if (bytes >= 8 && TARGET_POWERPC64
   && (align >= 64 || !STRICT_ALIGNMENT))



-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH, rs6000] PR target/89112 [8/9 Regression] fix bdnzt pattern for long branch case

2019-02-02 Thread Aaron Sawdey
I needed to introduce a local label in this pattern because output_cbranch put
out a second instruction in the long branch case. This fixes the issue, but
there are a couple of ways this could be improved:

* output_cbranch() is passed the original insn and assumes from that that the
branch is a long branch. However this is incorrect, because we are just
branching to a local label we know is only a few instructions away. If there
is a way to fix this, an unnecessary branch could be eliminated.

* While the long branch case of this pattern needs to work, the real problem
is that part of the code emitted by the memcmp expansion is being treated as
cold code and moved to the end of the function. Ideally all of this code
should stay together. I suspect I need to make some kind of branch frequency
notation for this to happen.

Regstrap passes on ppc64le power7/8/9, ok for trunk and backport to 8?

Thanks!

2019-02-02  Aaron Sawdey  

* config/rs6000/rs6000.md (<bd>tf_<mode>): Generate a local label
for the long branch case.

Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md (revision 268403)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -12639,8 +12639,8 @@
   else
 {
   static char seq[96];
-  char *bcs = output_cbranch (operands[3], "$+8", 1, insn);
-  sprintf(seq, " $+12\;%s;b %%l0", bcs);
+  char *bcs = output_cbranch (operands[3], ".L%=", 1, insn);
+  sprintf(seq, " .L%%=\;%s\;b %%l0\;.L%%=:", bcs);
   return seq;
 }
 }



-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



[PATCH, rs6000] PR target/89112 put branch probabilities on branches generated by inline expansion

2019-02-04 Thread Aaron Sawdey
This is the second part of the fix for 89112, fixing the conditions that
caused it to happen. This patch adds REG_BR_PROB notes to the branches
generated by inline expansion of memcmp and strncmp. This prevents any of the
code from being marked as cold and moved to the end of the function, which is
what caused the long branches in 89112. With this patch, the test case for
89112 does not have any long branches within the expansion of memcmp, and the
code for each memcmp is contiguous.
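For illustration (example mine, not the PR testcase), every branch in the
inline expansion of a compare like this now carries a probability note, so
bb-reorder keeps the whole sequence in the hot partition:

/* None of the generated compare/branch sequence is split off as cold.  */
int blocks_equal (const void *a, const void *b)
{
  return __builtin_memcmp (a, b, 16) == 0;
}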

OK for trunk and 8 backport if bootstrap/regtest passes?

Thanks!

   Aaron

2019-02-04  Aaron Sawdey  

* config/rs6000/rs6000-string.c (do_ifelse, expand_cmp_vec_sequence,
expand_compare_loop, expand_block_compare_gpr,
expand_strncmp_align_check, expand_strncmp_gpr_sequence): Add branch
probability.


Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 268522)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -35,6 +35,8 @@
 #include "expr.h"
 #include "output.h"
 #include "target.h"
+#include "profile-count.h"
+#include "predict.h"

 /* Expand a block clear operation, and return 1 if successful.  Return 0
if we should let the compiler generate normal code.
@@ -369,6 +371,7 @@
B is the second thing to be compared.
CR is the condition code reg input, or NULL_RTX.
TRUE_LABEL is the label to branch to if the condition is true.
+   P is the estimated branch probability for the branch.

The return value is the CR used for the comparison.
If CR is null_rtx, then a new register of CMPMODE is generated.
@@ -377,7 +380,7 @@

 static void
 do_ifelse (machine_mode cmpmode, rtx_code comparison,
-  rtx a, rtx b, rtx cr, rtx true_label)
+  rtx a, rtx b, rtx cr, rtx true_label, profile_probability p)
 {
   gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
  || (a != NULL_RTX && b != NULL_RTX));
@@ -395,7 +398,8 @@
   rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);

   rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
-  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+  add_reg_br_prob_note (j, p);
   JUMP_LABEL (j) = true_label;
   LABEL_NUSES (true_label) += 1;
 }
@@ -781,7 +785,8 @@
   rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
   rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
 lab_ref, pc_rtx);
-  rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+  rtx_insn *j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+  add_reg_br_prob_note (j2, profile_probability::likely ());
   JUMP_LABEL (j2) = dst_label;
   LABEL_NUSES (dst_label) += 1;

@@ -1036,7 +1041,7 @@

   /* Difference found is stored here before jump to diff_label.  */
   rtx diff = gen_reg_rtx (word_mode);
-  rtx j;
+  rtx_insn *j;

   /* Example of generated code for 35 bytes aligned 1 byte.

@@ -1120,11 +1125,11 @@
   /* Check for > max_bytes bytes.  We want to bail out as quickly as
 possible if we have to go over to memcmp.  */
   do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
-NULL_RTX, library_call_label);
+NULL_RTX, library_call_label, profile_probability::even ());

   /* Check for < loop_bytes bytes.  */
   do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
-NULL_RTX, cleanup_label);
+NULL_RTX, cleanup_label, profile_probability::even ());

   /* Loop compare bytes and iterations if bytes>max_bytes.  */
   rtx mb_reg = gen_reg_rtx (word_mode);
@@ -1165,7 +1170,7 @@
{
  rtx lab_after = gen_label_rtx ();
  do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
-NULL_RTX, lab_after);
+NULL_RTX, lab_after, profile_probability::even ());
  emit_move_insn (loop_cmp, mb_reg);
  emit_move_insn (iter, mi_reg);
  emit_label (lab_after);
@@ -1236,7 +1241,7 @@
}

   do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
-dcond, diff_label);
+dcond, diff_label, profile_probability::unlikely ());

   if (TARGET_P9_MISC)
{
@@ -1260,6 +1265,7 @@
   else
j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
   eqrtx, dcond));
+  add_reg_br_prob_note (j, profile_probability::likely ());
   JUMP_LABEL (j) = loop_top_label;
   LABEL_NUSES (loop_top_label) += 1;
 }
@@ -1272,9 +1278,11 @@
  code.  If we exit here with a nonzero diff, it is
  because the second word differed.  */
   if (TARGET_P9_MISC)
-do_ifelse (CCUNSmode, NE, NULL_RTX, NUL

Re: [PATCH, rs6000] PR target/89112 put branch probabilities on branches generated by inline expansion

2019-02-08 Thread Aaron Sawdey
Missed two more conditional branches created by inline expansion that should
have had branch probability notes.

2019-02-08  Aaron Sawdey  

* config/rs6000/rs6000-string.c (expand_compare_loop,
expand_block_compare): Insert REG_BR_PROB notes in inline expansion of
memcmp/strncmp.

Index: gcc/config/rs6000/rs6000-string.c
===
--- gcc/config/rs6000/rs6000-string.c   (revision 268547)
+++ gcc/config/rs6000/rs6000-string.c   (working copy)
@@ -1525,6 +1525,7 @@
  else
j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
   eqrtx, cond));
+ add_reg_br_prob_note (j, profile_probability::likely ());
  JUMP_LABEL (j) = fc_loop;
  LABEL_NUSES (fc_loop) += 1;

@@ -1897,6 +1898,7 @@
  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
 cvt_ref, pc_rtx);
  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
+ add_reg_br_prob_note (j, profile_probability::likely ());
  JUMP_LABEL (j) = convert_label;
  LABEL_NUSES (convert_label) += 1;
}

Pre-approved by Segher for trunk and backport to 8, will commit after regtest
completes.

  Aaron

On 2/4/19 1:06 PM, Aaron Sawdey wrote:
> This is the second part of the fix for 89112, fixing the conditions that 
> caused it to happen.
> This patch adds REG_BR_PROB notes to the branches generated by inline 
> expansion of memcmp
> and strncmp. This prevents any of the code from being marked as cold and 
> moved to the end
> of the function, which is what caused the long branches in 89112. With this 
> patch, the test
> case for 89112 does not have any long branches within the expansion of 
> memcmp, and the code
> for each memcmp is contiguous.
> 
> OK for trunk and 8 backport if bootstrap/regtest passes?
> 
> Thanks!
> 
>Aaron
> 
> 2019-02-04  Aaron Sawdey  
> 
>   * config/rs6000/rs6000-string.c (do_ifelse, expand_cmp_vec_sequence,
>   expand_compare_loop, expand_block_compare_gpr,
>   expand_strncmp_align_check, expand_strncmp_gpr_sequence): add branch
>   probability.
> 
> 
> Index: gcc/config/rs6000/rs6000-string.c
> ===
> --- gcc/config/rs6000/rs6000-string.c (revision 268522)
> +++ gcc/config/rs6000/rs6000-string.c (working copy)
> @@ -35,6 +35,8 @@
>  #include "expr.h"
>  #include "output.h"
>  #include "target.h"
> +#include "profile-count.h"
> +#include "predict.h"
> 
>  /* Expand a block clear operation, and return 1 if successful.  Return 0
> if we should let the compiler generate normal code.
> @@ -369,6 +371,7 @@
> B is the second thing to be compared.
> CR is the condition code reg input, or NULL_RTX.
> TRUE_LABEL is the label to branch to if the condition is true.
> +   P is the estimated branch probability for the branch.
> 
> The return value is the CR used for the comparison.
> If CR is null_rtx, then a new register of CMPMODE is generated.
> @@ -377,7 +380,7 @@
> 
>  static void
>  do_ifelse (machine_mode cmpmode, rtx_code comparison,
> -rtx a, rtx b, rtx cr, rtx true_label)
> +rtx a, rtx b, rtx cr, rtx true_label, profile_probability p)
>  {
>gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
> || (a != NULL_RTX && b != NULL_RTX));
> @@ -395,7 +398,8 @@
>rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);
> 
>rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
> -  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
> +  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
> +  add_reg_br_prob_note (j, p);
>JUMP_LABEL (j) = true_label;
>LABEL_NUSES (true_label) += 1;
>  }
> @@ -781,7 +785,8 @@
>rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
>rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
>lab_ref, pc_rtx);
> -  rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
> +  rtx_insn *j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
> +  add_reg_br_prob_note (j2, profile_probability::likely ());
>JUMP_LABEL (j2) = dst_label;
>LABEL_NUSES (dst_label) += 1;
> 
> @@ -1036,7 +1041,7 @@
> 
>/* Difference found is stored here before jump to diff_label.  */
>rtx diff = gen_reg_rtx (word_mode);
> -  rtx j;
> +  rtx_insn *j;
> 
>/* Example of generated code for 35 bytes aligned 1 byte.
> 
> @@ -1120,11 +1125,11 @@
> 

[PATCH] PR rtl-optimization/88308 Update LABEL_NUSES in move_insn_for_shrink_wrap

2019-02-13 Thread Aaron Sawdey
I've tracked pr/88308 down to move_insn_for_shrink_wrap(). This function moves
an insn from one BB to another by copying it and deleting the old one.
Unfortunately this breaks the LABEL_NUSES count on any labels referenced,
because deleting the old instruction decrements the count and nothing in this
function increments it.

It just happens that on rs6000 with -m64, force_const_mem() gets called on the
address, and that sets LABEL_PRESERVE_P on the label, which prevents it from
being deleted. For whatever reason this doesn't happen in a -m32 compilation,
and the label and its associated jump table data are deleted. This later
causes the ICE when the dwarf code tries to look at the label.

Segher and I came up with 3 possible solutions to this:

1) Don't let move_insn_for_shrink_wrap try to move insns with label_ref in them.
2) Call mark_jump_label() on the copied instruction to fix up the ref counts.
3) Make the function actually move the insn instead of copying/deleting it.

It seemed like option 2 was the best thing for stage 4, as it is not
inhibiting anything and is just doing a fixup of the ref count.
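In miniature, option 2 looks like this (sketch of the relevant lines, not the
full function):

/* emit_insn_after copies only the PATTERN, so label refs inside it are
   not re-registered; mark_jump_label restores the LABEL_NUSES counts
   that delete_insn is about to decrement on the original.  */
rtx_insn *insn_copy = emit_insn_after (PATTERN (insn), bb_note (bb));
mark_jump_label (PATTERN (insn), insn_copy, 0);
delete_insn (insn);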

OK for trunk after regtesting on ppc64be (32/64) and x86_64?

Thanks!
   Aaron


2019-02-13  Aaron Sawdey  

* shrink-wrap.c (move_insn_for_shrink_wrap): Fix LABEL_NUSES counts
on copied instruction.


Index: gcc/shrink-wrap.c
===
--- gcc/shrink-wrap.c   (revision 268783)
+++ gcc/shrink-wrap.c   (working copy)
@@ -414,7 +414,12 @@
   dead_debug_insert_temp (debug, DF_REF_REGNO (def), insn,
  DEBUG_TEMP_BEFORE_WITH_VALUE);

-  emit_insn_after (PATTERN (insn), bb_note (bb));
+  rtx_insn *insn_copy = emit_insn_after (PATTERN (insn), bb_note (bb));
+  /* Update the LABEL_NUSES count on any referenced labels. The ideal
+ solution here would be to actually move the instruction instead
+ of copying/deleting it as this loses some notations on the
+ insn.  */
+  mark_jump_label (PATTERN (insn), insn_copy, 0);
   delete_insn (insn);
   return true;
 }


-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [TESTSUITE]Use strncpy instead of strcpy in testsuite/gcc.dg/memcmp-1.c

2017-08-30 Thread Aaron Sawdey
On Wed, 2017-08-30 at 10:16 +0100, Renlin Li wrote:
> Hi,
> 
> In test_driver_memcmp function, I found buf1 and buf2 is not properly
> terminated with null character.
> 
> In lib_strncmp, strcpy will be called with buf1 and buf2.
> The normal implementation of strcpy function has a loop to copy
> character from source
> to destination one by one until a null character is encountered.
> 
> If the string is not properly terminated, this will cause the strcpy
> read/write
> memory beyond the boundary.
> 
> Here I changed the strcpy into strncpy to constraint the function to
> visit
> legal memory only.

Hi,
  Renlin you are correct that it shouldn't be using strcpy, because the
string may not be null terminated. However I would suggest we use
memcpy instead of strncpy. The reason is that the cases with a null
char in the middle of the string test whether strncmp properly ignores
what comes after it. So how about using this:

  memcpy(a,str1,SZ);\
  memcpy(b,str2,SZ);\

as in the test_memcmp_ part of the macro?
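A standalone illustration of the point (sketch, not the testsuite macro):

#include <string.h>
#include <assert.h>

int main (void)
{
  char a[9], b[9];
  /* memcpy preserves the bytes after the embedded NUL; strncpy would
     zero-pad past the NUL and hide the difference in the tails.  */
  memcpy (a, "AB\0CDEFGX", 9);
  memcpy (b, "AB\0CDEFGY", 9);
  /* strncmp must stop at the NUL, so these compare equal even though
     the tails differ -- exactly what the embedded-NUL cases test.  */
  assert (strncmp (a, b, 9) == 0);
  return 0;
}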

  Aaron

> 
> Test Okay without any problem. Okay to commit?
> 
> Regard,
> Renlin
> 
> 
> gcc/testsuite/ChangeLog:
> 
> 2017-08-30  Renlin Li  
> 
>   * gcc.dg/memcmp-1.c (test_strncmp): Use strncpy instead of
> strcpy.
-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



Re: [PATCH] Add testcases to test builtin-expansion of memcmp and strncmp

2017-01-06 Thread Aaron Sawdey
Jeff,
  Thanks for the review. Committed as 244177 with requested changes.

2017-01-06  Aaron Sawdey  
* gcc.dg/memcmp-1.c: New.
* gcc.dg/strncmp-1.c: New.

Aaron


[PATCH, bugfix] builtin expansion of strcmp for rs6000

2017-01-11 Thread Aaron Sawdey
This expands on the previous patch. For strcmp, and for strncmp with N
larger than 64, the first 64 bytes of the comparison are expanded inline
and then a call to strcmp or strncmp is emitted to compare the remainder
if the strings are found to be equal at that point.

-mstring-compare-inline-limit=N determines how many block comparisons
are emitted. With the default 8, and 64-bit code, you get 64 bytes.

Performance testing on a power8 system shows that the code is anywhere
from 2-8 times faster than RHEL7.2 glibc strcmp/strncmp depending on
alignment and length.

In the process of adding this I discovered that the expansion being
generated for strncmp had a bug in that it was not testing for a zero
byte to terminate the comparison. As a result inputs like
strncmp("AB\0CDEFGX", "AB\0CDEFGY", 9) would be compared not equal. The
initial comparison of a doubleword would be equal so a second one would
be fetched and compared, ignoring the zero byte that should have
terminated the comparison. The fix is to use a cmpb to check for zero
bytes in the equality case before comparing the next chunk. I updated
the strncmp-1.c test case to check for this, and also added a new
strcmp-1.c test case to check strcmp expansion. Both now also have
length-100 tests to check the transition from the inline comparison to
the library call for the remainder.
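Since cmpb carries that zero-byte check, here is a C model of its semantics
(illustrative only, not the expansion code):

/* Model of the PowerPC cmpb instruction: each result byte is 0xff where
   the corresponding bytes of A and B are equal, 0x00 where they differ.
   cmpb against zero therefore flags the zero bytes of a chunk, which is
   how the expansion spots a terminating NUL before loading more.  */
static unsigned long long
cmpb_model (unsigned long long a, unsigned long long b)
{
  unsigned long long r = 0;
  for (int i = 0; i < 8; i++)
    {
      unsigned long long mask = 0xffULL << (8 * i);
      if ((a & mask) == (b & mask))
	r |= mask;
    }
  return r;
}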

ChangeLog
2017-01-11  Aaron Sawdey  
* config/rs6000/rs6000-protos.h (expand_strn_compare): Add arg.
* config/rs6000/rs6000.c (expand_strn_compare): Add ability to expand
strcmp. Fix bug where comparison didn't stop with zero byte.
* config/rs6000/rs6000.md (cmpstrnsi): Args to expand_strn_compare.
(cmpstrsi): Add pattern.

gcc.dg/ChangeLog
2017-01-11  Aaron Sawdey  
* gcc.dg/strcmp-1.c: New.
* gcc.dg/strncmp-1.c: Add test for a bug that escaped.




-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC ToolchainIndex: gcc/config/rs6000/rs6000-protos.h
===
--- gcc/config/rs6000/rs6000-protos.h	(revision 244322)
+++ gcc/config/rs6000/rs6000-protos.h	(working copy)
@@ -78,7 +78,7 @@
 extern int expand_block_clear (rtx[]);
 extern int expand_block_move (rtx[]);
 extern bool expand_block_compare (rtx[]);
-extern bool expand_strn_compare (rtx[]);
+extern bool expand_strn_compare (rtx[], int);
 extern const char * rs6000_output_load_multiple (rtx[]);
 extern bool rs6000_is_valid_mask (rtx, int *, int *, machine_mode);
 extern bool rs6000_is_valid_and_mask (rtx, machine_mode);
Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c	(revision 244322)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -19635,22 +19635,36 @@
OPERANDS[0] is the target (result).
OPERANDS[1] is the first source.
OPERANDS[2] is the second source.
+   If NO_LENGTH is zero, then:
OPERANDS[3] is the length.
-   OPERANDS[4] is the alignment in bytes.  */
+   OPERANDS[4] is the alignment in bytes.
+   If NO_LENGTH is nonzero, then:
+   OPERANDS[3] is the alignment in bytes.  */
 bool
-expand_strn_compare (rtx operands[])
+expand_strn_compare (rtx operands[], int no_length)
 {
   rtx target = operands[0];
   rtx orig_src1 = operands[1];
   rtx orig_src2 = operands[2];
-  rtx bytes_rtx = operands[3];
-  rtx align_rtx = operands[4];
+  rtx bytes_rtx, align_rtx;
+  if (no_length)
+{
+  bytes_rtx = NULL;
+  align_rtx = operands[3];
+}
+  else
+{
+  bytes_rtx = operands[3];
+  align_rtx = operands[4];
+}
   HOST_WIDE_INT cmp_bytes = 0;
   rtx src1 = orig_src1;
   rtx src2 = orig_src2;
 
-  /* If this is not a fixed size compare, just call strncmp.  */
-  if (!CONST_INT_P (bytes_rtx))
+  /* If we have a length, it must be constant. This simplifies things
+ a bit as we don't have to generate code to check if we've exceeded
+ the length. Later this could be expanded to handle this case.  */
+  if (!no_length && !CONST_INT_P (bytes_rtx))
 return false;
 
   /* This must be a fixed size alignment.  */
@@ -19668,8 +19682,6 @@
 
   gcc_assert (GET_MODE (target) == SImode);
 
-  HOST_WIDE_INT bytes = INTVAL (bytes_rtx);
-
   /* If we have an LE target without ldbrx and word_mode is DImode,
  then we must avoid using word_mode.  */
   int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
@@ -19678,16 +19690,37 @@
   int word_mode_size = GET_MODE_SIZE (word_mode);
 
   int offset = 0;
+  HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
+  HOST_WIDE_INT compare_length; /* How much we are going to compare inline.  */
+  if (no_length)
+/* Use this as a standin to determine the mode to use.  */
+bytes = rs6000_string_compare_inline_limit * word_mode_size;
+  else
+bytes = INTVAL (bytes_rtx);
+
   machine_mode load_m

Re: [PATCH, bugfix] builtin expansion of strcmp for rs6000

2017-01-16 Thread Aaron Sawdey
Here is an updated version of this patch. 

Tulio noted that glibc's strncmp test was failing. This turned out to
be the use of signed HOST_WIDE_INT for handling strncmp length. The
glibc test calls strncmp with length 2^64-1, presumably to provoke
exactly this type of bug. Fixing the issue required changing
select_block_compare_mode() and expand_block_compare() as well.
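The failure mode in miniature (illustrative, not the glibc test itself):

/* With a signed 64-bit length, 2^64-1 arrives as -1, and early-out
   checks like "bytes <= 0" wrongly conclude there is nothing to do.  */
unsigned long long n = ~0ULL;              /* strncmp length 2^64-1 */
long long          sbytes = (long long) n; /* -1: "sbytes <= 0" bails out */
unsigned long long ubytes = n;             /* huge positive count: correct */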

The other change is that if we must emit a runtime check for the 4k
crossing, then we might as well set base_align to 8 and emit the best
possible code.

OK for trunk if bootstrap/regtest passes on ppc64/ppc64le?

Thanks,
  Aaron

On Wed, 2017-01-11 at 11:26 -0600, Aaron Sawdey wrote:
> This expands on the previous patch. For strcmp and for strncmp with N
> larger than 64, the first 64 bytes of comparison is expanded inline and
> then a call to strcmp or strncmp is emitted to compare the remainder if
> the strings are found to be equal at that point.
> 
> -mstring-compare-inline-limit=N determines how many block comparisons
> are emitted. With the default 8, and 64-bit code, you get 64 bytes.
> 
> Performance testing on a power8 system shows that the code is anywhere
> from 2-8 times faster than RHEL7.2 glibc strcmp/strncmp depending on
> alignment and length.
> 
> In the process of adding this I discovered that the expansion being
> generated for strncmp had a bug in that it was not testing for a zero
> byte to terminate the comparison. As a result inputs like
> strncmp("AB\0CDEFGX", "AB\0CDEFGY", 9) would be compared not equal. The
> initial comparison of a doubleword would be equal so a second one would
> be fetched and compared, ignoring the zero byte that should have
> terminated comparison. The fix is to use a cmpb to check for zero bytes
> in the equality case before comparing the next chunk. I updated the
> strncmp-1.c test case to check for this, and also added a new
> strcmp-1.c test case to check strcmp expansion. Also both now have
> length-100 tests to check the transition from the inline comparison to
> the library call for the remainder.
> 
> ChangeLog
> 2017-01-11  Aaron Sawdey  
>   * config/rs6000/rs6000-protos.h (expand_strn_compare): Add arg.
>   * config/rs6000/rs6000.c (expand_strn_compare): Add ability to
>   expand strcmp. Fix bug where comparison didn't stop with zero byte.
>   * config/rs6000/rs6000.md (cmpstrnsi): Args to expand_strn_compare.
>   (cmpstrsi): Add pattern.
> 
> gcc.dg/ChangeLog
> 2017-01-11  Aaron Sawdey  
>   * gcc.dg/strcmp-1.c: New.
>   * gcc.dg/strncmp-1.c: Add test for a bug that escaped.
> 
> 
> 
> 
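A minimal reproducer for the missed-zero-byte case described above (a
hypothetical standalone test, not the committed testcase):

#include <assert.h>
#include <string.h>

int main (void)
{
  /* Equal through the embedded NUL, differing after it: strncmp
     must return 0 even though byte 8 differs.  */
  assert (strncmp ("AB\0CDEFGX", "AB\0CDEFGY", 9) == 0);
  return 0;
}
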
-- 
Aaron Sawdey, Ph.D.  acsaw...@linux.vnet.ibm.com
050-2/C113  (507) 253-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain

Index: gcc/config/rs6000/rs6000-protos.h
===
--- gcc/config/rs6000/rs6000-protos.h	(revision 244503)
+++ gcc/config/rs6000/rs6000-protos.h	(working copy)
@@ -78,7 +78,7 @@
 extern int expand_block_clear (rtx[]);
 extern int expand_block_move (rtx[]);
 extern bool expand_block_compare (rtx[]);
-extern bool expand_strn_compare (rtx[]);
+extern bool expand_strn_compare (rtx[], int);
 extern const char * rs6000_output_load_multiple (rtx[]);
 extern bool rs6000_is_valid_mask (rtx, int *, int *, machine_mode);
 extern bool rs6000_is_valid_and_mask (rtx, machine_mode);
Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c	(revision 244503)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -19310,7 +19310,8 @@
WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
the largest allowable mode.  */
 static machine_mode
-select_block_compare_mode (HOST_WIDE_INT offset, HOST_WIDE_INT bytes,
+select_block_compare_mode (unsigned HOST_WIDE_INT offset,
+			   unsigned HOST_WIDE_INT bytes,
 			   HOST_WIDE_INT align, bool word_mode_ok)
 {
   /* First see if we can do a whole load unit
@@ -19322,7 +19323,7 @@
  Do largest chunk possible without violating alignment rules.  */
 
   /* The most we can read without potential page crossing.  */
-  HOST_WIDE_INT maxread = ROUND_UP (bytes, align);
+  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);
 
   if (word_mode_ok && bytes >= UNITS_PER_WORD)
 return word_mode;
@@ -19410,8 +19411,8 @@
   gcc_assert (GET_MODE (target) == SImode);
 
   /* Anything to move? */
-  HOST_WIDE_INT bytes = INTVAL (bytes_rtx);
-  if (bytes <= 0)
+  unsigned HOST_WIDE_INT bytes = INTVAL (bytes_rtx);
+  if (bytes == 0)
 return true;
 
   /* The code generated for p7 and older is not faster than glibc
@@ -19432,14 +19433,14 @@
 
   /* Strategy phase.  How many ops will this take and should we expand it?  */
 
-  int offset = 0;
+  unsigned 

Re: [PATCH, bugfix] builtin expansion of strcmp for rs6000

2017-01-17 Thread Aaron Sawdey
On Tue, 2017-01-17 at 08:30 -0600, Peter Bergner wrote:
> On 1/16/17 3:09 PM, Aaron Sawdey wrote:
> > Here is an updated version of this patch.
> > 
> > Tulio noted that glibc's strncmp test was failing. This turned out to
> > be the use of signed HOST_WIDE_INT for handling strncmp length. The
> > glibc test calls strncmp with length 2^64-1, presumably to provoke
> > exactly this type of bug. Fixing the issue required changing
> > select_block_compare_mode() and expand_block_compare() as well.
> 
> If glibc's testsuite exposed a bug, then we should also add a similar
> bug to our testsuite.  I scanned the patch and I'm not sure I see
> that exact test scenario.  Is it there and I'm just not seeing it?
> 
> Peter
> 

Nope, you didn't miss it, Peter. I will add such a test as a separate
patch, this one has dragged on for a long time. I have another more
comprehensive test case for strcmp/strncmp I want to add anyway.

Aaron




[PATCH] testcase for builtin expansion of strncmp and strcmp

2017-01-17 Thread Aaron Sawdey
This patch adds test gcc.dg/strncmp-2.c for builtin expansion of
strncmp and strcmp. This tests a couple more things such as differences
that occur after the zero byte, and something that glibc does which is
to call strncmp with SIZE_MAX for the length which looks for overflow
issues.
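
For reference, the overflow-hunting call shape the test exercises looks
like this (illustrative sketch; cmp_to_end is a hypothetical helper, not
from the testcase):

#include <stdint.h>
#include <string.h>

int cmp_to_end (const char *a, const char *b)
{
  return strncmp (a, b, SIZE_MAX);   /* bounded only by the terminator */
}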

I've included interested parties from targets that have a strncmp
builtin.

The test passes on x86_64 and on ppc64le with -mcpu=power6. It will not
pass on ppc64/ppc64le -mcpu=power[78] until I check in my patch that
segher ack'd yesterday and is currently regtesting. OK for trunk?

Index: gcc/testsuite/gcc.dg/strncmp-2.c
===
--- gcc/testsuite/gcc.dg/strncmp-2.c	(revision 0)
+++ gcc/testsuite/gcc.dg/strncmp-2.c	(working copy)
@@ -0,0 +1,715 @@
+/* Test strcmp/strncmp builtin expansion for compilation and proper execution.  */
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target ptr32plus } */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+
+#define MAX_SZ 200
+
+static void test_driver_strcmp (void (test_func)(const char *, const char *, int), int align)
+{
+  char buf1[2*MAX_SZ+10],buf2[2*MAX_SZ+10];
+  int sz, diff_pos, zero_pos;
+  int e;
+  for(sz = 1; sz < MAX_SZ; sz++)
+for(diff_pos = ((sz>10)?(sz-10):0); diff_pos < sz+10; diff_pos++)
+  for(zero_pos = ((sz>10)?(sz-10):0); zero_pos < sz+10; zero_pos++)
+	{
+	  memset(buf1, 'A', sizeof(buf1));
+	  memset(buf2, 'A', sizeof(buf2));
+	  buf2[diff_pos] = 'B';
+	  buf1[zero_pos] = 0;
+	  buf2[zero_pos] = 0;
+	  e = -1;
+	  if ( zero_pos == 0 || zero_pos <= diff_pos ) e = 0;
+	  (*test_func)(buf1,buf2,e);
+	  (*test_func)(buf2,buf1,-e);
+	  (*test_func)(buf2,buf2,0);
+	  /* differing length: */
+	  buf2[diff_pos] = 0;
+	  (*test_func)(buf1,buf2,-e);
+	  memset(buf2+diff_pos,'B',sizeof(buf2)-diff_pos);
+	  buf2[zero_pos] = 0;
+	  (*test_func)(buf1,buf2,e);
+	  (*test_func)(buf2,buf1,-e);
+	}
+}
+
+static void test_driver_strncmp (void (test_func)(const char *, const char *, int), size_t sz, int align)
+{
+  char buf1[MAX_SZ*2+10],buf2[MAX_SZ*2+10];
+  size_t test_sz = (sz<MAX_SZ)?sz:MAX_SZ;
+  size_t diff_pos, zero_pos;
+  int e;
+  for(diff_pos = ((test_sz>10)?(test_sz-10):0);
+      diff_pos < test_sz+10; diff_pos++)
+for(zero_pos = ((test_sz>10)?(test_sz-10):0);
+	zero_pos < test_sz+10; zero_pos++)
+  {
+	memset(buf1, 'A', 2*test_sz);
+	memset(buf2, 'A', 2*test_sz);
+	buf2[diff_pos] = 'B';
+	buf1[zero_pos] = 0;
+	buf2[zero_pos] = 0;
+	e = -1;
+	if ( diff_pos >= sz ) e = 0;
+	if ( zero_pos <= diff_pos ) e = 0;
+	(*test_func)(buf1,buf2,e);
+	(*test_func)(buf2,buf1,-e);
+	(*test_func)(buf2,buf2,0);
+	/* differing length: */
+	buf2[diff_pos] = 0;
+	(*test_func)(buf1,buf2,-e);
+	memset(buf2+diff_pos,'B',sizeof(buf2)-diff_pos);
+	buf2[zero_pos] = 0;
+	(*test_func)(buf1,buf2,e);
+	(*test_func)(buf2,buf1,-e);
+  }
+}
+
+#define RUN_TESTN(SZ, ALIGN) test_driver_strncmp (test_strncmp_ ## SZ ## _ ## ALIGN, SZ, ALIGN);
+#define RUN_TEST(ALIGN)  test_driver_strcmp (test_strcmp_ ## ALIGN, ALIGN);
+
+#define DEF_TESTN(SZ, ALIGN)	 \
+static void test_strncmp_ ## SZ ## _ ## ALIGN (const char *str1, const char *str2, int expect)	 \
+{ \
+  char three[8192] __attribute__ ((aligned (4096)));		 \
+  char four[8192] __attribute__ ((aligned (4096)));		 \
+  char *a, *b;			 \
+  int i,j,r;			 \
+  for (j = 0; j < 2; j++)	 \
+{ \
+  for (i = 0; i < 2; i++)	 \
+	{			 \
+	  a = three+i*ALIGN+j*(4096-2*i*ALIGN);			 \
+	  b = four+i*ALIGN+j*(4096-2*i*ALIGN);			 \
+	  strcpy(a,str1);	 \
+	  strcpy(b,str2);	 \
+	  r = strncmp(a,b,SZ);	 \
+	  if ( r < 0 && !(expect < 0) ) \
+	{ abort(); }	 \
+	  if ( r > 0 && !(expect > 0) ) \
+	{ abort(); }	 \
+	  if ( r == 0 && !(expect == 0) )			 \
+	{ abort(); }	 \
+	}			 \
+} \
+}
+#define DEF_TEST(ALIGN)		 \
+static void test_strcmp_ ## ALIGN (const char *str1, const char *str2, int expect)		 \
+{ \
+  char three[8192] __attribute__ ((aligned (4096)));		 \
+  char four[8192] __attribute__ ((aligned (4096)));		 \
+  char *a, *b;			 \
+  int i,j,r;			 \
+  for (j = 0; j < 2; j++)	 \
+{ \
+  for (i = 0; i < 2; i++)	 \
+	{			 \
+	  a = three+i*ALIGN+j*(4096-2*i*ALIGN);			 \
+	  b = four+i*ALIGN+j*(4096-2*i*ALIGN);			 \
+	  strcpy(a,str1);	 \
+	  strcpy(b,str2);	 \
+	  r = strcmp(a,b);	 \
+	  if ( r < 0 && !(expect < 0) ) \
+	{ abort(); }	 \
+	  if ( r > 0 &&

[PATCH, pr63256] update powerpc dg options

2017-01-19 Thread Aaron Sawdey
SMS does process the loop in sms-8.c on powerpc now so I have updated
the options to reflect that.

Test now passes on powerpc -m64/-m32/-m32 -mpowerpc64. Ok for trunk?

testsuite/ChangeLog
2017-01-19  Aaron Sawdey  
* gcc.dg/sms-8.c: Update options for powerpc*-*-*.
Index: gcc/testsuite/gcc.dg/sms-8.c
===
--- gcc/testsuite/gcc.dg/sms-8.c	(revision 244540)
+++ gcc/testsuite/gcc.dg/sms-8.c	(working copy)
@@ -4,7 +4,6 @@
 
  /* { dg-do run } */
  /* { dg-options "-O2 -fmodulo-sched -fmodulo-sched-allow-regmoves -fdump-rtl-sms --param sms-min-sc=1" } */
- /* { dg-options "-O2 -fmodulo-sched -fmodulo-sched-allow-regmoves -fdump-rtl-sms" { target powerpc*-*-* } } */
 
 extern void abort (void);
 
@@ -36,6 +35,6 @@
   return 0;
 }
 
-/* { dg-final { scan-rtl-dump-times "SMS succeeded" 0 "sms" { target powerpc*-*-* } } } */
+/* { dg-final { scan-rtl-dump-times "SMS succeeded" 1 "sms" { target powerpc*-*-* } } } */
 
 


[PATCH][PR target/80083][7 regression] fix power9 vsx-small-integer issue caused by wrong constraints

2017-03-20 Thread Aaron Sawdey
Test libgomp doacross2.f90 failed only at -O1 because an incorrect
constraint on movsi_internal1 (for vspltisw) led to confusion between
vsx and float registers (fix credit to Meissner). In subsequent
discussion David Edelsohn pointed out that there was an additional
error on the constraint for xxspltib -1 that is also fixed now.

Bootstrap/regtest reveals no errors on either power8 or power9. Ok for
trunk?

2017-03-20  Aaron Sawdey  

PR target/80083
* config/rs6000/rs6000.md (*movsi_internal1): Fix incorrect constraints
for alternatives 14/15.
 



Re: [PATCH][PR target/80083][7 regression] fix power9 vsx-small-integer issue caused by wrong constraints

2017-03-20 Thread Aaron Sawdey
On Mon, 2017-03-20 at 11:11 -0500, Aaron Sawdey wrote:
> Test libgomp doacross2.f90 failed only at -O1 because an incorrect
> constraint on movsi_internal1 (for vspltisw) led to confusion between
> vsx and float registers (fix credit to Meissner). In subsequent
> discussion David Edelsohn pointed out that there was an additional
> error on the constraint for xxspltib -1 that is also fixed now.
> 
> Bootstrap/regtest reveals no errors on either power8 or power9. Ok for
> trunk?
> 
> 2017-03-20  Aaron Sawdey  
> 
>   PR target/80083
> 	* config/rs6000/rs6000.md (*movsi_internal1): Fix incorrect
> 	constraints for alternatives 14/15.
>  

Forgot the patch! 

Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md	(revision 246224)
+++ gcc/config/rs6000/rs6000.md	(working copy)
@@ -6727,7 +6727,7 @@
   [(set (match_operand:SI 0 "rs6000_nonimmediate_operand"
 		"=r, r,   r,   ?*wI,?*wH,
 		 m,  ?Z,  ?Z,  r,   r,
-		 r,  ?*wIwH,  ?*wJwK,  ?*wK,?*wJwK,
+		 r,  ?*wIwH,  ?*wJwK,  ?*wJwK,  ?*wu,
 		 ?*wJwK, ?*wH,?*wK,?*wIwH,  ?r,
 		 r,  *c*l,*h,  *h")
 


[PATCH][PR target/80123][7 regression] new constraint wA to prevent r0 use in mtvsrdd

2017-03-21 Thread Aaron Sawdey
Both the fails in PR 80123 are a situation where vsx_splat_<mode> for
V2DI generates rtl for a mtvsrdd, but constraint wr doesn't prevent
allocation of r0 for the input (r0 in a base-register position is read
as a literal 0, not GPR 0). So the new constraint wA combines the
attributes of wr and b -- it is BASE_REGS if 64-bit and NO_REGS
otherwise.

Currently doing bootstrap/regtest on 64-bit LE and BE, and also BE 32-
bit. OK for trunk if everything passes?

2017-03-21  Aaron Sawdey  

PR target/80123
* doc/md.texi (Constraints): Document wA constraint.
* config/rs6000/constraints.md (wA): New.
* config/rs6000/rs6000.c (rs6000_debug_reg_global): Add wA reg_class.
(rs6000_init_hard_regno_mode_ok): Init wA constraint.
* config/rs6000/rs6000.h (RS6000_CONSTRAINT_wA): New.
* config/rs6000/vsx.md (vsx_splat_<mode>): Use wA constraint.


Index: gcc/config/rs6000/constraints.md
===
--- gcc/config/rs6000/constraints.md	(revision 246295)
+++ gcc/config/rs6000/constraints.md	(working copy)
@@ -133,6 +133,9 @@
 (define_register_constraint "wz" "rs6000_constraints[RS6000_CONSTRAINT_wz]"
   "Floating point register if the LFIWZX instruction is enabled or NO_REGS.")
 
+(define_register_constraint "wA" "rs6000_constraints[RS6000_CONSTRAINT_wA]"
+  "BASE_REGS if 64-bit instructions are enabled or NO_REGS.")
+
 ;; wB needs ISA 2.07 VUPKHSW
 (define_constraint "wB"
   "Signed 5-bit constant integer that can be loaded into an altivec register."
Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c	(revision 246295)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -2468,6 +2468,7 @@
 	   "wx reg_class = %s\n"
 	   "wy reg_class = %s\n"
 	   "wz reg_class = %s\n"
+	   "wA reg_class = %s\n"
 	   "wH reg_class = %s\n"
 	   "wI reg_class = %s\n"
 	   "wJ reg_class = %s\n"
@@ -2500,6 +2501,7 @@
 	   reg_class_names[rs6000_constraints[RS6000_CONSTRAINT_wx]],
 	   reg_class_names[rs6000_constraints[RS6000_CONSTRAINT_wy]],
 	   reg_class_names[rs6000_constraints[RS6000_CONSTRAINT_wz]],
+	   reg_class_names[rs6000_constraints[RS6000_CONSTRAINT_wA]],
 	   reg_class_names[rs6000_constraints[RS6000_CONSTRAINT_wH]],
 	   reg_class_names[rs6000_constraints[RS6000_CONSTRAINT_wI]],
 	   reg_class_names[rs6000_constraints[RS6000_CONSTRAINT_wJ]],
@@ -3210,7 +3212,10 @@
 }
 
   if (TARGET_POWERPC64)
-rs6000_constraints[RS6000_CONSTRAINT_wr] = GENERAL_REGS;
+{
+  rs6000_constraints[RS6000_CONSTRAINT_wr] = GENERAL_REGS;
+  rs6000_constraints[RS6000_CONSTRAINT_wA] = BASE_REGS;
+}
 
   if (TARGET_P8_VECTOR && TARGET_UPPER_REGS_SF)			/* SFmode  */
 {
Index: gcc/config/rs6000/rs6000.h
===
--- gcc/config/rs6000/rs6000.h	(revision 246295)
+++ gcc/config/rs6000/rs6000.h	(working copy)
@@ -1612,6 +1612,7 @@
   RS6000_CONSTRAINT_wx,		/* FPR register for STFIWX */
   RS6000_CONSTRAINT_wy,		/* VSX register for SF */
   RS6000_CONSTRAINT_wz,		/* FPR register for LFIWZX */
+  RS6000_CONSTRAINT_wA,		/* BASE_REGS if 64-bit.  */
   RS6000_CONSTRAINT_wH,		/* Altivec register for 32-bit integers.  */
   RS6000_CONSTRAINT_wI,		/* VSX register for 32-bit integers.  */
   RS6000_CONSTRAINT_wJ,		/* VSX register for 8/16-bit integers.  */
Index: gcc/config/rs6000/vsx.md
===
--- gcc/config/rs6000/vsx.md	(revision 246295)
+++ gcc/config/rs6000/vsx.md	(working copy)
@@ -3072,7 +3072,7 @@
 	"=,,we,")
 	(vec_duplicate:VSX_D
 	 (match_operand: 1 "splat_input_operand"
-	",Z,b, wr")))]
+	",Z,b, wA")))]
   "VECTOR_MEM_VSX_P (mode)"
   "@
xxpermdi %x0,%x1,%x1,0
Index: gcc/doc/md.texi
===
--- gcc/doc/md.texi	(revision 246295)
+++ gcc/doc/md.texi	(working copy)
@@ -3122,6 +3122,9 @@
 @item wz
 Floating point register if the LFIWZX instruction is enabled or NO_REGS.
 
+@item wA
+Address base register if 64-bit instructions are enabled or NO_REGS.
+
 @item wB
 Signed 5-bit constant integer that can be loaded into an altivec register.
 


[PATCH][PR target/80358][7 regression] Fix boundary check error in expand_block_compare

2017-04-07 Thread Aaron Sawdey
Turns out we get passed const -1 for the length arg from this code.
ROUND_UP adds load_mode_size to that, resulting in a small positive
number; hilarity ensues. Fixed by computing a sensible limit and using
IN_RANGE instead, which won't overflow in this way.
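
The wrap is easy to demonstrate (illustrative macro mirroring gcc's
ROUND_UP, not the GCC source itself):

#include <stdio.h>
#include <stdint.h>

#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((uint64_t) (a) - 1))

int main (void)
{
  uint64_t bytes = UINT64_MAX;	/* the const -1 length */
  /* -1 + 7 wraps to 6, so the rounded-up size is 0 and the old
     "is this too much code?" test was satisfied.  */
  printf ("%llu\n", (unsigned long long) ROUND_UP (bytes, 8));
  return 0;
}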

OK for trunk if bootstrap/regtest in progress passes?

2017-04-07  Aaron Sawdey  

PR target/80358
* config/rs6000/rs6000.c (expand_block_compare): Fix boundary check.

Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c  (revision 246771)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -19672,8 +19672,9 @@
   unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
 
   /* We don't want to generate too much code.  */
-  if (ROUND_UP (bytes, load_mode_size) / load_mode_size
-  > (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_limit)
+  unsigned HOST_WIDE_INT max_bytes =
+    load_mode_size * (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_limit;
+  if (!IN_RANGE (bytes, 1, max_bytes))
 return false;
 
   bool generate_6432_conversion = false;



Re: [PATCH] Add -static-pie to GCC driver to create static PIE

2017-09-12 Thread Aaron Sawdey
On Tue, 2017-09-12 at 16:20 +, Joseph Myers wrote:
> On Mon, 28 Aug 2017, H.J. Lu wrote:
> 
> > Here is the updated patch.   OK for trunk?
> 
> OK.

This seems to be causing an issue for me on powerpc:

/home/sawdey/src/gcc/trunk/build/./prev-gcc/xg++ 
-B/home/sawdey/src/gcc/trunk/build/./prev-gcc/ 
-B/home/sawdey/install/gcc/trunk_str/powerpc64le-unknown-linux-gnu/bin/ 
-nostdinc++ 
-B/home/sawdey/src/gcc/trunk/build/prev-powerpc64le-unknown-linux-gnu/libstdc++-v3/src/.libs
 
-B/home/sawdey/src/gcc/trunk/build/prev-powerpc64le-unknown-linux-gnu/libstdc++-v3/libsupc++/.libs
  
-I/home/sawdey/src/gcc/trunk/build/prev-powerpc64le-unknown-linux-gnu/libstdc++-v3/include/powerpc64le-unknown-linux-gnu
  
-I/home/sawdey/src/gcc/trunk/build/prev-powerpc64le-unknown-linux-gnu/libstdc++-v3/include
  -I/home/sawdey/src/gcc/trunk/trunk/libstdc++-v3/libsupc++ 
-L/home/sawdey/src/gcc/trunk/build/prev-powerpc64le-unknown-linux-gnu/libstdc++-v3/src/.libs
 
-L/home/sawdey/src/gcc/trunk/build/prev-powerpc64le-unknown-linux-gnu/libstdc++-v3/libsupc++/.libs
 -c   -g -O2 -gtoggle -DIN_GCC -fno-exceptions -fno-rtti 
-fasynchronous-unwind-tables -W -Wall -Wno-narrowing -Wwrite-strings 
-Wcast-qual -Wmissing-format-attribute -Woverloaded-virtual -pedantic 
-Wno-long-long -Wno-variadic-macros -Wno-overlength-strings -Werror -fno-common 
 -DHAVE_CONFIG_H -DGENERATOR_FILE -fno-PIE -I. -Ibuild -I../../trunk/gcc 
-I../../trunk/gcc/build -I../../trunk/gcc/../include  
-I../../trunk/gcc/../libcpp/include  \
-o build/gencheck.o ../../trunk/gcc/gencheck.c
In file included from ./tm.h:30:0,
 from ../../trunk/gcc/gencheck.c:23:
../../trunk/gcc/config/rs6000/sysv4.h:819:0: error: "LINK_EH_SPEC" redefined [-Werror]
 # define LINK_EH_SPEC "%{!static:--eh-frame-hdr} "
 
In file included from ./tm.h:28:0,
 from ../../trunk/gcc/gencheck.c:23:
../../trunk/gcc/config/gnu-user.h:135:0: note: this is the location of the previous definition
 #define LINK_EH_SPEC "%{!static|static-pie:--eh-frame-hdr} "

I don't see the problem at r252033.

Thanks,
   Aaron




[PATCH][PR target/79170] fix memcmp builtin expansion sequence for rs6000 target.

2017-01-27 Thread Aaron Sawdey
This patch changes the generated sequence for builtin expansion of
memcmp to one that is actually correct. The old sequence did not work
right if the subf overflowed subtracting two 64-bit chunks, or if extsw
overflowed after subtraction of two 32-bit chunks. Included is an
expansion of the memcmp-1.c test case so that it catches these issues.

The new sequence looks like this

ld  A
ld  B
subfc.  R, B, A
bne
subfe   C, C, C
popcntd P, R
or  R, R, C
extsw   R, R

where ld/ld/subfc./bne is the repeated sequence to compare 64-bit
chunks, and subfe/popcntd/or/extsw is the code used to convert a 32-bit 
or larger difference to a signed 32-bit result.

Additionally for power9 subfc. is replaced by cmpld and
subfe/popcntd/or/extsw is replaced by setb.
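
For readers following along, the semantics of that conversion as a C
sketch (illustrative, not the emitted RTL; one big-endian chunk):

#include <stdint.h>
#include <stdio.h>

static int
chunk_result (uint64_t a, uint64_t b)
{
  uint64_t diff = a - b;			/* subfc.       */
  int64_t borrow = (a < b) ? -1 : 0;		/* subfe C,C,C  */
  int64_t pop = __builtin_popcountll (diff);	/* popcntd      */
  return (int) (pop | borrow);			/* or + extsw   */
}

int main (void)
{
  /* The naive sign of a - b gets this one wrong; the borrow does not.  */
  printf ("%d\n", chunk_result (0xffffffffffffffffULL, 0) > 0);  /* 1 */
  return 0;
}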

The updated memcmp-1 testcase passes on ppc64le (p8/p9), ppc64 (p8/p9),
 ppc32 (p8), and x86_64. Bootstrap was successful on ppc64/ppc64le.
Assuming regtest on ppc64/ppc64le passes, ok for trunk?

2017-01-27  Aaron Sawdey  

PR target/79170
* gcc.dg/memcmp-1.c: Improved to catch failures seen in PR 79170.

2017-01-27  Aaron Sawdey  

PR target/79170
* config/rs6000/altivec.md (*setb_internal): Rename to setb_signed.
(setb_unsigned) New pattern for setb with CCUNS.
* config/rs6000/rs6000.c (expand_block_compare): Use a different
subfc./subfe sequence to avoid overflow problems. Generate a
shorter sequence with cmpld/setb for power9.
* config/rs6000/rs6000.md (subf3_carry_dot2): Add a new pattern
for generating subfc. instruction.
(cmpstrsi): Add TARGET_POPCNTD predicate as the generate sequence
now uses this instruction.
Index: gcc/config/rs6000/altivec.md
===
--- gcc/config/rs6000/altivec.md	(revision 244952)
+++ gcc/config/rs6000/altivec.md	(working copy)
@@ -3838,7 +3838,7 @@
 ;; Otherwise, set operand 0 to 0.  Note that the result stored into
 ;; register operand 0 is non-zero iff either the LT or GT bits are on
 ;; within condition register operand 1.
-(define_insn "*setb_internal"
+(define_insn "setb_signed"
[(set (match_operand:SI 0 "gpc_reg_operand" "=r")
 	 (if_then_else:SI (lt (match_operand:CC 1 "cc_reg_operand" "y")
 			  (const_int 0))
@@ -3851,6 +3851,19 @@
   "setb %0,%1"
   [(set_attr "type" "logical")])
 
+(define_insn "setb_unsigned"
+   [(set (match_operand:SI 0 "gpc_reg_operand" "=r")
+	 (if_then_else:SI (ltu (match_operand:CCUNS 1 "cc_reg_operand" "y")
+			  (const_int 0))
+			  (const_int -1)
+			  (if_then_else (gtu (match_dup 1)
+					     (const_int 0))
+			    (const_int 1)
+			    (const_int 0))))]
+  "TARGET_P9_MISC"
+  "setb %0,%1"
+  [(set_attr "type" "logical")])
+
 ;; Test byte within two ranges.
 ;;
 ;; The bytes of operand 1 are organized as xx:xx:xx:vv, where xx
Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c	(revision 244952)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -17292,7 +17292,7 @@
   TYPE_NAME (V16QI_type_node) = tdecl;
 
   tdecl = add_builtin_type ("__vector __bool char", bool_V16QI_type_node);
-  TYPE_NAME ( bool_V16QI_type_node) = tdecl;
+  TYPE_NAME (bool_V16QI_type_node) = tdecl;
 
   tdecl = add_builtin_type ("__vector unsigned short", unsigned_V8HI_type_node);
   TYPE_NAME (unsigned_V8HI_type_node) = tdecl;
@@ -19458,24 +19458,31 @@
   rtx src1 = orig_src1;
   rtx src2 = orig_src2;
 
-  /* If this is not a fixed size compare, just call memcmp */
+  /* This case is complicated to handle because the subtract
+ with carry instructions do not generate the 64-bit
+ carry and so we must emit code to calculate it ourselves.
+ We choose not to implement this yet.  */
+  if (TARGET_32BIT && TARGET_POWERPC64)
+return false;
+
+  /* If this is not a fixed size compare, just call memcmp.  */
   if (!CONST_INT_P (bytes_rtx))
 return false;
 
-  /* This must be a fixed size alignment */
+  /* This must be a fixed size alignment.  */
   if (!CONST_INT_P (align_rtx))
 return false;
 
   unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;
 
-  /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff */
+  /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff.  */
   if (SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src1))
   || SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src2)))
 return false;
 
   gcc_assert (GET_MODE (target) == SImode);
 
-  /* Anything to move? */
+  /* Anything to move?  */
   unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
   if (bytes =

Re: [PATCH, pr63256] update powerpc dg options

2017-02-01 Thread Aaron Sawdey
On Thu, 2017-01-19 at 17:00 -0600, Aaron Sawdey wrote:
> SMS does process the loop in sms-8.c on powerpc now so I have updated
> the options to reflect that.
> 
> Test now passes on powerpc -m64/-m32/-m32 -mpowerpc64. Ok for trunk?
> 
> testsuite/ChangeLog
> 2017-01-19  Aaron Sawdey  
>   * gcc.dg/sms-8.c: Update options for powerpc*-*-*.

Ping.




[PATCH][PR target/79449][7 regression] fix ppc strncmp builtin expansion runtime boundary crossing check

2017-02-09 Thread Aaron Sawdey
Basic problem is that the runtime check for a 4k boundary crossing was
checking that it didn't happen in the N bytes strncmp was being asked
to look at. But, the code generation after the check assumed it could
generate an 8-byte read and so it could step over the boundary. 

The fix is to make sure that the minimum distance to the boundary being
checked and the maximum read size being generated for these short
comparisons are in sync.
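
A worked example with illustrative numbers (sketch, not the expander):

#include <stdio.h>

int main (void)
{
  unsigned int compare_length = 5;	/* bytes strncmp will examine */
  unsigned int align_test = 1;
  while (align_test < compare_length)	/* 1 << ceil_log2 (5) == 8 */
    align_test <<= 1;
  /* The inline code still issues one 8-byte load for a 5-byte compare,
     so the runtime check must cover 8 bytes, not 5.  */
  printf ("test %u bytes before the 4k boundary, not %u\n",
	  align_test, compare_length);
  return 0;
}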

The added test case tests for this condition against both memcmp and
strncmp builtin expansion.

Assuming bootstrap/regtest passes on ppc variants and the new test case
passes on x86_64 as well, ok for trunk?


2017-02-09  Aaron Sawdey  

PR target/79449
* gcc.dg/strncmp-2.c: New.  Test strncmp and memcmp builtin
expansion for reading beyond a 4k boundary.

2017-02-09  Aaron Sawdey  

PR target/79449
* config/rs6000/rs6000.c (expand_block_compare): Make sure runtime
boundary crossing check and subsequent code generation agree.

Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c	(revision 245294)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -19931,15 +19931,26 @@
 	 cmpldi	cr7,r8,4096-16
 	 bgt	cr7,L(pagecross) */
 
+  /* Make sure that the length we use for the alignment test and
+ the subsequent code generation are in agreement so we do not
+ go past the length we tested for a 4k boundary crossing.  */
+  unsigned HOST_WIDE_INT align_test = compare_length;
+  if (align_test < 8)
+{
+  align_test = 1 << ceil_log2 (align_test);
+  base_align = align_test;
+}
+  else
+{
+  align_test = ROUND_UP (align_test, 8);
+  base_align = 8;
+}
+
   if (align1 < 8)
-	expand_strncmp_align_check (strncmp_label, src1, compare_length);
+	expand_strncmp_align_check (strncmp_label, src1, align_test);
   if (align2 < 8)
-	expand_strncmp_align_check (strncmp_label, src2, compare_length);
+	expand_strncmp_align_check (strncmp_label, src2, align_test);
 
-  /* After the runtime alignment checks, we can use any alignment we
-	 like as we know there is no 4k boundary crossing.  */
-  base_align = 8;
-
   /* Now generate the following sequence:
 	 - branch to begin_compare
 	 - strncmp_label
Index: gcc/testsuite/gcc.dg/strncmp-2.c
===
--- gcc/testsuite/gcc.dg/strncmp-2.c	(revision 0)
+++ gcc/testsuite/gcc.dg/strncmp-2.c	(working copy)
@@ -0,0 +1,99 @@
+/* Test strncmp builtin expansion for compilation and proper execution.  */
+/* { dg-do run { target *-*-linux* *-*-gnu* } } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target ptr32plus } */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+int lib_memcmp(const void *a, const void *b, size_t n) asm("memcmp");
+int lib_strncmp(const char *a, const char *b, size_t n) asm("strncmp");
+
+static void test_driver_strncmp (void (test_strncmp)(const char *, const char *, int),
+ void (test_memcmp)(const void *, const void *, int),
+ size_t sz)
+{
+  long pgsz = sysconf(_SC_PAGESIZE);
+  char buf1[sz+1];
+  char *buf2 = aligned_alloc(pgsz,2*pgsz);
+  char *p2;
+  int r,i,e;
+
+  r = mprotect (buf2+pgsz,pgsz,PROT_NONE);
+  if (r < 0) abort();
+  
+  memset(buf1,'A',sz);
+  for(i=10; i>=0; i--) {
+p2 = buf2+pgsz-sz-i;
+memset(p2,'A',sz);
+e = lib_strncmp(buf1,p2,sz);
+(*test_strncmp)(buf1,p2,e);
+e = lib_memcmp(buf1,p2,sz);
+(*test_memcmp)(buf1,p2,e);
+  }
+}
+
+#define RUN_TEST(SZ) test_driver_strncmp (test_strncmp_ ## SZ, test_memcmp_ ## SZ, SZ);
+
+#define DEF_TEST(SZ) \
+__attribute__((noinline))		  \
+void test_strncmp_ ## SZ (const char *str1, const char *str2, int expect) \
+{	  \
+  int r;  \
+  r = strncmp(str1,str2,SZ);		  \
+  if ( r < 0 && !(expect < 0) ) abort();  \
+  if ( r > 0 && !(expect > 0) )	abort();  \
+  if ( r == 0 && !(expect == 0) ) abort();  \
+} \
+__attribute__((noinline))		  \
+void test_memcmp_ ## SZ (const void *p1, const void *p2, int expect)  \
+{	  \
+  int r;  \
+  r = memcmp(p1,p2,SZ);		  \
+  if ( r < 0 && !(expect < 0) ) abort();  \
+  if ( r > 0 && !(expect > 0) )	abort();  \
+  if ( r == 0 && !(expect == 0) ) abort();  \
+}
+
+DEF_TEST(1)
+DEF_TEST(2)
+DEF_TEST(3)
+DEF_TEST(4)
+DEF_TEST(5)
+DEF_TEST(6)
+DEF_TEST(7)
+DEF_TEST(8)
+DEF_TEST(9)
+DEF_TEST(10)
+DEF_TEST(11)
+DEF_TEST(12)
+DEF_TEST(1

[PATCH][PR target/79295][7 regression] fix ppc bcdadd insn pattern

2017-02-09 Thread Aaron Sawdey
The bcdadd pattern has the wrong constraints. The change Meissner
supplied in PR79295 fixes the issue.

Successfully bootstrapped on ppc64le, ok for trunk if regtest also
passes?

2017-02-09  Aaron Sawdey  

PR target/79295
* config/rs6000/altivec.md (bcd<bcd_add_sub>): Fix constraints.

Index: gcc/config/rs6000/altivec.md
===
--- gcc/config/rs6000/altivec.md	(revision 245221)
+++ gcc/config/rs6000/altivec.md	(working copy)
@@ -3710,10 +3710,10 @@
 (define_code_iterator BCD_TEST [eq lt gt unordered])
 
 (define_insn "bcd<bcd_add_sub>"
-  [(set (match_operand:V1TI 0 "register_operand" "")
-	(unspec:V1TI [(match_operand:V1TI 1 "register_operand" "")
-		  (match_operand:V1TI 2 "register_operand" "")
-		  (match_operand:QI 3 "const_0_to_1_operand" "")]
+  [(set (match_operand:V1TI 0 "gpc_reg_operand" "=v")
+	(unspec:V1TI [(match_operand:V1TI 1 "gpc_reg_operand" "v")
+		  (match_operand:V1TI 2 "gpc_reg_operand" "v")
+		  (match_operand:QI 3 "const_0_to_1_operand" "n")]
 		 UNSPEC_BCD_ADD_SUB))
(clobber (reg:CCFP CR6_REGNO))]
   "TARGET_P8_VECTOR"


[PATCH] portability fix for gcc.dg/strncmp-2.c testcase

2017-02-14 Thread Aaron Sawdey
This testcase I added failed to compile on AIX or older linux due to
the use of aligned_alloc(). Now fixed to use posix_memalign if
available, and valloc otherwise.

Now it compiles and passes on x86_64 (fedora 25), ppc64 (RHEL6.8), and
AIX. OK for trunk?

2017-02-14  Aaron Sawdey  

* gcc.dg/strncmp-2.c: Portability fixes.

Index: strncmp-2.c
===
--- strncmp-2.c	(revision 245439)
+++ strncmp-2.c	(working copy)
@@ -19,7 +19,12 @@
 {
   long pgsz = sysconf(_SC_PAGESIZE);
   char buf1[sz+1];
-  char *buf2 = aligned_alloc(pgsz,2*pgsz);
+  char *buf2;
+#if _POSIX_C_SOURCE >= 200112L
+  if ( posix_memalign ((void **)&buf2, pgsz, 2*pgsz) ) abort ();
+#else
+  if ( !(buf2 = valloc(2*pgsz))) abort ();
+#endif
   char *p2;
   int r,i,e;
 
@@ -35,6 +40,7 @@
 e = lib_memcmp(buf1,p2,sz);
 (*test_memcmp)(buf1,p2,e);
   }
+  free(buf2);
 }
 
 #define RUN_TEST(SZ) test_driver_strncmp (test_strncmp_ ## SZ, test_memcmp_ ## SZ, SZ);


Re: [PATCH] portability fix for gcc.dg/strncmp-2.c testcase

2017-02-14 Thread Aaron Sawdey
On Tue, 2017-02-14 at 13:09 -0600, Segher Boessenkool wrote:
> On Tue, Feb 14, 2017 at 11:56:50AM -0600, Aaron Sawdey wrote:
> > This testcase I added failed to compile on AIX or older linux due to
> > the use of aligned_alloc(). Now fixed to use posix_memalign if
> > available, and valloc otherwise.
> > 
> > Now it compiles and passes on x86_64 (fedora 25), ppc64 (RHEL6.8), and
> > AIX. OK for trunk?
> 
> Is valloc preferable to aligned_alloc on all systems where
> posix_memalign does not exist?  Okay for trunk if so.  Thanks,
> 
> 
> Segher

My reasoning here was to use the modern function (posix_memalign) if
available and otherwise fall back to valloc which is in glibc dating
back to 1996 and openbsd's man page says it was added in BSD 3.0 so
pretty much anything should have it.

Thanks,
Aaron


> 
> 
> > 2017-02-14  Aaron Sawdey  
> > 
> > * gcc.dg/strncmp-2.c: Portability fixes.
> 
> 



Re: [PATCH, testsuite]: Use posix_memalign instead of aligned_alloc in gcc.dg/strncmp-2.c

2017-02-20 Thread Aaron Sawdey
On Fri, 2017-02-17 at 10:50 +0100, Uros Bizjak wrote:
> posix_memalign is portable to older, non-c11 runtimes.
> 
> 2017-02-17  Uros Bizjak  
> 
> * gcc.dg/strncmp-2.c (test_driver_strncmp): Use posix_memalign
> instead of aligned_alloc.
> 
> Tested on x86_64-linux-gnu, CentOS 5.11.
> 
> OK for mainline?

Uros,
  I posted something very similar last Tuesday:

https://gcc.gnu.org/ml/gcc-patches/2017-02/msg00937.html

I didn't get around to applying it until this morning in 245608.
Apologies for wasting your time tracking down the same issue again.

  Aaron




[PATCH][PR target/79752] fix rs6000 power9 peephole2 for udiv/umod

2017-02-28 Thread Aaron Sawdey
This showed up in power9 code for __divkf3 software float support and
caused a divd to be emitted where we needed a divdu. OK for trunk if
bootstrap/regtest passes?
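
A small illustration (not the __divkf3 case itself) of why the
signed/unsigned distinction matters here:

#include <stdio.h>
#include <stdint.h>

int main (void)
{
  uint64_t a = 0x8000000000000000ULL, b = 3;
  /* With the high bit set, signed and unsigned quotients differ,
     so emitting divd where divdu is needed miscompiles.  */
  printf ("divdu: %llu\n", (unsigned long long) (a / b));
  printf ("divd:  %lld\n", (long long) ((int64_t) a / (int64_t) b));
  return 0;
}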

Index: ../trunk/gcc/config/rs6000/rs6000.md
===
--- ../trunk/gcc/config/rs6000/rs6000.md(revision 245787)
+++ ../trunk/gcc/config/rs6000/rs6000.md(working copy)
@@ -3161,7 +3161,7 @@
&& ! reg_mentioned_p (operands[3], operands[1])
&& ! reg_mentioned_p (operands[3], operands[2])"
   [(set (match_dup 0)
-   (div:GPR (match_dup 1)
+   (udiv:GPR (match_dup 1)
 (match_dup 2)))
(set (match_dup 3)
(mult:GPR (match_dup 0)

2017-02-28  Aaron Sawdey  

PR target/79752
* config/rs6000/rs6000.md (peephole2 for udiv/umod): Should emit
udiv rather than div since input pattern is unsigned.




[PATCH][PR target/79752] fix rs6000 power9 peephole2 for udiv/umod -- backported to gcc-6-branch

2017-03-14 Thread Aaron Sawdey
This showed up in power9 code for __divkf3 software float support and
caused a divd to be emitted where we needed a divdu.

backported/bootstrapped/regtested to gcc-6-branch

Index: gcc/config/rs6000/rs6000.md
===
--- gcc/config/rs6000/rs6000.md (revision 246123)
+++ gcc/config/rs6000/rs6000.md (working copy)
@@ -3063,8 +3063,8 @@
&& ! reg_mentioned_p (operands[3], operands[1])
&& ! reg_mentioned_p (operands[3], operands[2])"
   [(set (match_dup 0)
-   (div:GPR (match_dup 1)
-(match_dup 2)))
+   (udiv:GPR (match_dup 1)
+ (match_dup 2)))
(set (match_dup 3)
(mult:GPR (match_dup 0)
  (match_dup 2)))

2017-03-14  Aaron Sawdey  

Backport from mainline
    2017-02-28  Aaron Sawdey  

PR target/79752
* config/rs6000/rs6000.md (peephole2 for udiv/umod): Should emit
udiv rather than div since input pattern is unsigned.



[PATCH] builtin expansion of memcmp for powerpc

2016-09-22 Thread Aaron Sawdey
The powerpc target had a movmemsi pattern which supports memcpy() but
did not have anything for memcmp(). This adds support for builtin
expansion of memcmp() into inline code for modest constant lengths.
Performance on power8 is in the range of 3-7x faster than calling
memcmp() for lengths under 40 bytes.

Bootstrap on powerpc64le, regtest in progress, OK for trunk if no new
regressions?

2016-09-22  Aaron Sawdey  

* config/rs6000/rs6000.md (cmpmemsi): New define_expand.
* config/rs6000/rs6000.c (expand_block_compare): New function used by
cmpmemsi pattern to do builtin expansion of memcmp().
(compute_current_alignment): Add helper function for
expand_block_compare used to compute alignment as the compare proceeds.
(select_block_compare_mode): Used by expand_block_compare to select
the mode used for reading the next chunk of bytes in the compare.
(do_load_for_compare): Used by expand_block_compare to emit the load
insns for the compare.
(rs6000_emit_dot_insn): Moved this function to avoid a forward
reference from expand_block_compare().
* config/rs6000/rs6000-protos.h (expand_block_compare): Add a
prototype for this function.
* config/rs6000/rs6000.opt (mblock-compare-inline-limit): Add a new
target option for controlling how much code inline expansion of
memcmp() will be allowed to generate.
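
As an illustration, the kind of call the new cmpmemsi expansion applies
to (hypothetical example, not from the patch):

#include <string.h>

int same_header (const void *a, const void *b)
{
  /* Constant 16-byte length: small enough to be expanded inline under
     the default -mblock-compare-inline-limit.  */
  return memcmp (a, b, 16) == 0;
}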

Index: gcc/config/rs6000/rs6000-protos.h
===
--- gcc/config/rs6000/rs6000-protos.h	(revision 240286)
+++ gcc/config/rs6000/rs6000-protos.h	(working copy)
@@ -77,6 +77,7 @@
 extern void rs6000_scale_v2df (rtx, rtx, int);
 extern int expand_block_clear (rtx[]);
 extern int expand_block_move (rtx[]);
+extern bool expand_block_compare (rtx[]);
 extern const char * rs6000_output_load_multiple (rtx[]);
 extern bool rs6000_is_valid_mask (rtx, int *, int *, machine_mode);
 extern bool rs6000_is_valid_and_mask (rtx, machine_mode);
Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c	(revision 240286)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -18423,7 +18423,462 @@
   return 1;
 }
 
+/* Emit a potentially record-form instruction, setting DST from SRC.
+   If DOT is 0, that is all; otherwise, set CCREG to the result of the
+   signed comparison of DST with zero.  If DOT is 1, the generated RTL
+   doesn't care about the DST result; if DOT is 2, it does.  If CCREG
+   is CR0 do a single dot insn (as a PARALLEL); otherwise, do a SET and
+   a separate COMPARE.  */
+
+static void
+rs6000_emit_dot_insn (rtx dst, rtx src, int dot, rtx ccreg)
+{
+  if (dot == 0)
+{
+  emit_move_insn (dst, src);
+  return;
+}
+
+  if (cc_reg_not_cr0_operand (ccreg, CCmode))
+{
+  emit_move_insn (dst, src);
+  emit_move_insn (ccreg, gen_rtx_COMPARE (CCmode, dst, const0_rtx));
+  return;
+}
+
+  rtx ccset = gen_rtx_SET (ccreg, gen_rtx_COMPARE (CCmode, src, const0_rtx));
+  if (dot == 1)
+{
+  rtx clobber = gen_rtx_CLOBBER (VOIDmode, dst);
+  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, ccset, clobber)));
+}
+  else
+{
+  rtx set = gen_rtx_SET (dst, src);
+  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, ccset, set)));
+}
+}
+
 
+/* Figure out the correct instructions to generate to load data for
+   block compare.  MODE is used for the read from memory, and
+   data is zero extended if REG is wider than MODE.  If LE code
+   is being generated, bswap loads are used.
+
+   REG is the destination register to move the data into.
+   MEM is the memory block being read.
+   MODE is the mode of memory to use for the read.  */
+static void
+do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
+{
+  switch (GET_MODE (reg))
+{
+case DImode:
+  switch (mode)
+	{
+	case QImode:
+	  emit_insn (gen_zero_extendqidi2 (reg, mem));
+	  break;
+	case HImode:
+	  {
+	rtx src = mem;
+	if (TARGET_LITTLE_ENDIAN)
+	  {
+		src = gen_reg_rtx (HImode);
+		emit_insn (gen_bswaphi2 (src, mem));
+	  }
+	emit_insn (gen_zero_extendhidi2 (reg, src));
+	break;
+	  }
+	case SImode:
+	  {
+	rtx src = mem;
+	if (TARGET_LITTLE_ENDIAN)
+	  {
+		src = gen_reg_rtx (SImode);
+		emit_insn (gen_bswapsi2 (src, mem));
+	  }
+	emit_insn (gen_zero_extendsidi2 (reg, src));
+	  }
+	  break;
+	case DImode:
+	  if (TARGET_LITTLE_ENDIAN)
+	emit_insn (gen_bswapdi2 (reg, mem));
+	  else
+	emit_insn (gen_movdi (reg, mem));
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+  break;
+
+case SImode:
+  switch (mode)
+	{
+	case QImode:
+	  emit_insn (gen_zero_extendqisi2 (reg, mem));
+	  break;
+	case HImode:
+	  {
+	rtx src 

Re: [PATCH] builtin expansion of memcmp for powerpc

2016-09-24 Thread Aaron Sawdey
On Sat, 2016-09-24 at 09:32 -0400, David Edelsohn wrote:
> This patch broke bootstrap on AIX.
> 
> The ChangeLog entry was not committed to the gcc/ChangeLog file.

ChangeLog entry added. Forgot to write that emacs buffer before commit.

> TARGET_LITTLE_ENDIAN only is defined for PPC64 Linux.  This patch
> should be using !BYTES_BIG_ENDIAN.

Change made, I will commit as obvious once I bootstrap to double check
my work on ppc64le.

Sorry for the mess ...

  Aaron




[PATCH PR77718]

2016-09-28 Thread Aaron Sawdey
This patch that Bernd put in PR77718 seems to be fine. Bootstrap and
regtest done on powerpc64le, no new failures. Ok for trunk?
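
A small illustration of why the unconditional swap was wrong (sketch;
only equality uses survive a swap):

#include <stdio.h>
#include <string.h>

int main (void)
{
  printf ("%d\n", memcmp ("abc", "abd", 3) < 0);   /* 1 */
  printf ("%d\n", memcmp ("abd", "abc", 3) < 0);   /* 0: sign flipped */
  /* Only ==/!= 0 results are insensitive to the argument order.  */
  return 0;
}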

2016-09-28  Bernd Schmidt  

* builtins.c (expand_builtin_memcmp): Don't swap args unless
result is only being compared with zero.

Index: builtins.c
===
--- builtins.c  (revision 240511)
+++ builtins.c  (working copy)
@@ -3707,11 +3707,13 @@ expand_builtin_memcmp (tree exp, rtx tar
 
   by_pieces_constfn constfn = NULL;
 
-  const char *src_str = c_getstr (arg1);
-  if (src_str == NULL)
-src_str = c_getstr (arg2);
-  else
-std::swap (arg1_rtx, arg2_rtx);
+  const char *src_str = c_getstr (arg2);
+  if (result_eq && src_str == NULL)
+{
+  src_str = c_getstr (arg1);
+  if (src_str != NULL)
+   std::swap (arg1_rtx, arg2_rtx);
+}
 
   /* If SRC is a string constant and block move would be done
  by pieces, we can avoid loading the string from memory




[PATCH, RS6000] improve builtin expansion of memcmp for p7

2016-10-06 Thread Aaron Sawdey
I've improved the builtin memcmp expansion so it avoids a couple of 
things that p7 and previous processors don't like. Performance on
p7 is now never worse than glibc memcmp(). Bootstrap/regtest in progress
on power7 ppc64 BE. 
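
For context, the overlapping-load trick the patch comments below refer
to, as a C sketch (illustrative, assumes length >= 8):

#include <stdint.h>
#include <string.h>

/* Returns 1 if the two buffers are equal; assumes n >= 8.  */
static int
chunks_equal (const char *a, const char *b, size_t n)
{
  uint64_t x, y;
  size_t off = 0;
  while (off < n)
    {
      if (off + 8 > n)
	off = n - 8;	/* move the final load back so it overlaps */
      memcpy (&x, a + off, 8);
      memcpy (&y, b + off, 8);
      if (x != y)
	return 0;
      off += 8;
    }
  return 1;
}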

OK for trunk if testing passes?


gcc/ChangeLog:

2016-10-06  Aaron Sawdey  

* config/rs6000/rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
Add macro to say we can efficiently handle overlapping unaligned
loads.
* config/rs6000/rs6000.c (expand_block_compare): Avoid generating
poor code for processors older than p8.

Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c	(revision 240816)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -18687,6 +18687,14 @@
   if (bytes <= 0)
 return true;
 
+  /* The code generated for p7 and older is not faster than glibc
+ memcmp if alignment is small and length is not short, so bail
+ out to avoid those conditions.  */
+  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
+  && ((base_align == 1 && bytes > 16)
+	  || (base_align == 2 && bytes > 32)))
+return false;
+
   rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
   rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
 
@@ -18736,13 +18744,18 @@
   while (bytes > 0)
 {
   int align = compute_current_alignment (base_align, offset);
-  load_mode = select_block_compare_mode(offset, bytes, align, word_mode_ok);
+  if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
+	load_mode = select_block_compare_mode(offset, bytes, align,
+	  word_mode_ok);
+  else
+	load_mode = select_block_compare_mode(0, bytes, align, word_mode_ok);
   load_mode_size = GET_MODE_SIZE (load_mode);
   if (bytes >= load_mode_size)
 	cmp_bytes = load_mode_size;
-  else
+  else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
 	{
-	  /* Move this load back so it doesn't go past the end.  */
+	  /* Move this load back so it doesn't go past the end.
+	 P8/P9 can do this efficiently.  */
 	  int extra_bytes = load_mode_size - bytes;
 	  cmp_bytes = bytes;
 	  if (extra_bytes < offset)
@@ -18752,7 +18765,12 @@
 	  bytes = cmp_bytes;
 	}
 	}
-
+  else
+	/* P7 and earlier can't do the overlapping load trick fast,
+	   so this forces a non-overlapping load and a shift to get
+	   rid of the extra bytes.  */
+	cmp_bytes = bytes;
+  
   src1 = adjust_address (orig_src1, load_mode, offset);
   src2 = adjust_address (orig_src2, load_mode, offset);
 
Index: gcc/config/rs6000/rs6000.h
===
--- gcc/config/rs6000/rs6000.h	(revision 240816)
+++ gcc/config/rs6000/rs6000.h	(working copy)
@@ -603,6 +603,9 @@
  && TARGET_POWERPC64)
 #define TARGET_VEXTRACTUB	(TARGET_P9_VECTOR && TARGET_DIRECT_MOVE \
  && TARGET_UPPER_REGS_DI && TARGET_POWERPC64)
+/* This wants to be set for p8 and newer.  On p7, overlapping unaligned
+   loads are slow. */
+#define TARGET_EFFICIENT_OVERLAPPING_UNALIGNED TARGET_EFFICIENT_UNALIGNED_VSX
 
 /* Byte/char syncs were added as phased in for ISA 2.06B, but are not present
in power7, so conditionalize them on p8 features.  TImode syncs need quad


[PATCH, RS6000, Committed] increase buf size in rs6000_elf_asm_out_{constructor,destructor}

2016-10-06 Thread Aaron Sawdey
It seems we now have analysis that concludes these buffers may possibly
overflow. This broke bootstrap on ppc64 BE. Bootstrap passed on ppc64 BE
power7. Committing as pre-approved by Segher.


2016-10-06  Aaron Sawdey  

* config/rs6000/rs6000.c (rs6000_elf_asm_out_constructor)
(rs6000_elf_asm_out_destructor): increase size of buf to avoid
possible overflow.


Index: gcc/config/rs6000/rs6000.c
===
--- gcc/config/rs6000/rs6000.c  (revision 240846)
+++ gcc/config/rs6000/rs6000.c  (working copy)
@@ -34166,7 +34166,7 @@
 rs6000_elf_asm_out_constructor (rtx symbol, int priority)
 {
   const char *section = ".ctors";
-  char buf[16];
+  char buf[18];

   if (priority != DEFAULT_INIT_PRIORITY)
 {
@@ -34197,7 +34197,7 @@
 rs6000_elf_asm_out_destructor (rtx symbol, int priority)
 {
   const char *section = ".dtors";
-  char buf[16];
+  char buf[18];

   if (priority != DEFAULT_INIT_PRIORITY)
     {



[PATCH, RS6000, PR77934] mtvsrdd needs b (base register) constraint on first input

2016-10-11 Thread Aaron Sawdey
GCC 7 trunk was generating incorrect code for SPEC CPU2006 403.gcc due to
this constraint issue. OK for trunk after bootstrap/regtest passes?

2016-10-06  Aaron Sawdey  

PR target/77934
* config/rs6000/vsx.md (vsx_concat_<mode>): The mtvsrdd instruction
needs a base register for arg 1.


Index: gcc/config/rs6000/vsx.md
===
--- gcc/config/rs6000/vsx.md(revision 240994)
+++ gcc/config/rs6000/vsx.md(working copy)
@@ -1938,7 +1938,7 @@
 (define_insn "vsx_concat_"
   [(set (match_operand:VSX_D 0 "gpc_reg_operand" "=,we")
(vec_concat:VSX_D
-(match_operand: 1 "gpc_reg_operand" ",r")
+(match_operand: 1 "gpc_reg_operand" ",b")
 (match_operand: 2 "gpc_reg_operand" ",r")))]
   "VECTOR_MEM_VSX_P (mode)"
 {




Re: PING! Re: [PATCH, Fortran] Extension: COTAN and degree-valued trig intrinsics with -fdec-math

2016-10-11 Thread Aaron Sawdey
On Tue, 2016-10-11 at 07:26 -0400, Fritz Reese wrote:
> On Mon, Oct 10, 2016 at 3:56 PM, Steve Kargl
>  wrote:
> ...
> > 
> > There are a few small clean-up that can be
> > done.  For example,
> > 
> > +static gfc_expr *
> > +get_radians (gfc_expr *deg)
> > +{
> > +  mpfr_t tmp;
> ...
> > 
> > the tmp variable is unneeded in the above.  Converting the double
> > precision 180.0 to mpfr_t and then dividing is probably slower
> > than just dividing by 180.
> > 
> > +  /* Set factor = pi / 180.  */
> > +  factor = gfc_get_constant_expr (deg->ts.type, deg->ts.kind,
> > +  &deg->where);
> > +  mpfr_const_pi (factor->value.real, GFC_RND_MODE);
> > +  mpfr_div_ui (factor->value.real, factor->value.real, 180,
> > GFC_RND_MODE);
> > 
> ...
> 
> Good catch, fixed and committed r240989. Many thanks to you and
> Jerry.
> 
> ---
> Fritz Reese
> 

I think the first part of that cleanup didn't get applied as I am
seeing this:

../../gcc/gcc/fortran/iresolve.c: In function 'gfc_expr* get_degrees(gfc_expr*)':
../../gcc/gcc/fortran/iresolve.c:2728:14: error: 'tmp' was not declared in this scope

and also this:

../../gcc/gcc/fortran/simplify.c: In function 'void radians_f(__mpfr_struct*, mpfr_rnd_t)':
../../gcc/gcc/fortran/simplify.c:1775:5: error: 'mpfr_fmod_d' was not declared in this scope
 mpfr_fmod_d (tmp, x, 360.0, rnd_mode);
 ^~~~~~~~~~~




[RFC PATCH] expand_strn_compare should attempt expansion even if neither string is constant

2016-10-26 Thread Aaron Sawdey
I'm currently working on a builtin expansion of strncmp for powerpc
similar to the one for memcmp I checked in recently. One thing I
encountered is that the code in expand_strn_compare will not attempt to
expand the cmpstrnsi pattern at all if neither string parameter is a
constant string. This doesn't make a lot of sense in light of the fact
that expand_str_compare starts with an attempt to expand cmpstrsi and
then if that does not work, looks at the string args to see if one is
constant so it can use cmpstrnsi with the known length.

The attached patch is my attempt to add this fallback path to
expand_strn_compare, i.e. if neither length is known, just attempt
expansion of cmpstrnsi using the given 3 arguments.

It looks like in addition to rs6000, there are 3 other targets that
have cmpstrnsi patterns: i386, sh, and rx. 

Is this a reasonable approach to take with this? If so I'll
bootstrap/regtest on i386 as rs6000 does not as yet have an expansion
for cmpstrsi or cmpstrnsi.

Thanks,
   Aaron

Index: builtins.c
===
--- builtins.c	(revision 241593)
+++ builtins.c	(working copy)
@@ -3932,46 +3932,53 @@
 len1 = c_strlen (arg1, 1);
 len2 = c_strlen (arg2, 1);
 
-if (len1)
-  len1 = size_binop_loc (loc, PLUS_EXPR, ssize_int (1), len1);
-if (len2)
-  len2 = size_binop_loc (loc, PLUS_EXPR, ssize_int (1), len2);
+if (!len1 && !len2)
+  {
+	len = arg3;
+  }
+else
+  {
+	if (len1)
+	  len1 = size_binop_loc (loc, PLUS_EXPR, ssize_int (1), len1);
+	if (len2)
+	  len2 = size_binop_loc (loc, PLUS_EXPR, ssize_int (1), len2);
 
-/* If we don't have a constant length for the first, use the length
-   of the second, if we know it.  We don't require a constant for
-   this case; some cost analysis could be done if both are available
-   but neither is constant.  For now, assume they're equally cheap,
-   unless one has side effects.  If both strings have constant lengths,
-   use the smaller.  */
+	/* If we don't have a constant length for the first, use the length
+	   of the second, if we know it.  We don't require a constant for
+	   this case; some cost analysis could be done if both are available
+	   but neither is constant.  For now, assume they're equally cheap,
+	   unless one has side effects.  If both strings have constant lengths,
+	   use the smaller.  */
 
-if (!len1)
-  len = len2;
-else if (!len2)
-  len = len1;
-else if (TREE_SIDE_EFFECTS (len1))
-  len = len2;
-else if (TREE_SIDE_EFFECTS (len2))
-  len = len1;
-else if (TREE_CODE (len1) != INTEGER_CST)
-  len = len2;
-else if (TREE_CODE (len2) != INTEGER_CST)
-  len = len1;
-else if (tree_int_cst_lt (len1, len2))
-  len = len1;
-else
-  len = len2;
+	if (!len1)
+	  len = len2;
+	else if (!len2)
+	  len = len1;
+	else if (TREE_SIDE_EFFECTS (len1))
+	  len = len2;
+	else if (TREE_SIDE_EFFECTS (len2))
+	  len = len1;
+	else if (TREE_CODE (len1) != INTEGER_CST)
+	  len = len2;
+	else if (TREE_CODE (len2) != INTEGER_CST)
+	  len = len1;
+	else if (tree_int_cst_lt (len1, len2))
+	  len = len1;
+	else
+	  len = len2;
 
-/* If both arguments have side effects, we cannot optimize.  */
-if (!len || TREE_SIDE_EFFECTS (len))
-  return NULL_RTX;
+	/* If both arguments have side effects, we cannot optimize.  */
+	if (!len || TREE_SIDE_EFFECTS (len))
+	  return NULL_RTX;
 
-/* The actual new length parameter is MIN(len,arg3).  */
-len = fold_build2_loc (loc, MIN_EXPR, TREE_TYPE (len), len,
-		   fold_convert_loc (loc, TREE_TYPE (len), arg3));
+	/* The actual new length parameter is MIN(len,arg3).  */
+	len = fold_build2_loc (loc, MIN_EXPR, TREE_TYPE (len), len,
+			   fold_convert_loc (loc, TREE_TYPE (len), arg3));
 
-/* If we don't have POINTER_TYPE, call the function.  */
-if (arg1_align == 0 || arg2_align == 0)
-  return NULL_RTX;
+	/* If we don't have POINTER_TYPE, call the function.  */
+	if (arg1_align == 0 || arg2_align == 0)
+	  return NULL_RTX;
+  }
 
 /* Stabilize the arguments in case gen_cmpstrnsi fails.  */
 arg1 = builtin_save_expr (arg1);

