On Fri, 29 Nov 2019 at 15:41, Richard Biener <richard.guent...@gmail.com> wrote:
>
> On Fri, Nov 22, 2019 at 12:40 PM Prathamesh Kulkarni
> <prathamesh.kulka...@linaro.org> wrote:
> >
> > On Wed, 20 Nov 2019 at 16:54, Richard Biener <rguent...@suse.de> wrote:
> > >
> > > On Wed, 20 Nov 2019, Richard Sandiford wrote:
> > >
> > > > Hi,
> > > >
> > > > Thanks for doing this.  Adding Richard on cc:, since the SVE subject
> > > > tag might have put him off.  There's not really anything SVE-specific
> > > > here apart from the testcase.
> > >
> > > Ah.
> > >
> > > > > 2019-11-19  Prathamesh Kulkarni  <prathamesh.kulka...@linaro.org>
> > > > >
> > > > > 	PR tree-optimization/89007
> > > > > 	* tree-vect-patterns.c (vect_recog_average_pattern): If there is no
> > > > > 	target support available, generate code to distribute rshift over plus
> > > > > 	and add one depending upon floor or ceil rounding.
> > > > >
> > > > > 	testsuite/
> > > > > 	* gcc.target/aarch64/sve/pr89007.c: New test.
> > > > >
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr89007-1.c b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-1.c
> > > > > new file mode 100644
> > > > > index 00000000000..32095c63c61
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-1.c
> > > > > @@ -0,0 +1,29 @@
> > > > > +/* { dg-do assemble { target aarch64_asm_sve_ok } } */
> > > > > +/* { dg-options "-O -ftree-vectorize -march=armv8.2-a+sve --save-temps" } */
> > > > > +/* { dg-final { check-function-bodies "**" "" } } */
> > > > > +
> > > > > +#define N 1024
> > > > > +unsigned char dst[N];
> > > > > +unsigned char in1[N];
> > > > > +unsigned char in2[N];
> > > > > +
> > > > > +/*
> > > > > +** foo:
> > > > > +**	...
> > > > > +**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
> > > > > +**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
> > > > > +**	add	(z[0-9]+\.b), \1, \2
> > > > > +**	orr	(z[0-9]+)\.d, z[0-9]+\.d, z[0-9]+\.d
> > > > > +**	and	(z[0-9]+\.b), \4\.b, #0x1
> > > > > +**	add	z0.b, \3, \5
> > > >
> > > > It'd probably be more future-proof to allow (\1, \2|\2, \1) and
> > > > (\3, \5|\5, \3).  Same for the other testcase.
> > > >
> > > > > +**	...
> > > > > +*/
> > > > > +void
> > > > > +foo ()
> > > > > +{
> > > > > +  for( int x = 0; x < N; x++ )
> > > > > +    dst[x] = (in1[x] + in2[x] + 1) >> 1;
> > > > > +}
> > > > > +
> > > > > +/* { dg-final { scan-assembler-not {\tuunpklo\t} } } */
> > > > > +/* { dg-final { scan-assembler-not {\tuunpkhi\t} } } */
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr89007-2.c b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-2.c
> > > > > new file mode 100644
> > > > > index 00000000000..cc40f45046b
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-2.c
> > > > > @@ -0,0 +1,29 @@
> > > > > +/* { dg-do assemble { target aarch64_asm_sve_ok } } */
> > > > > +/* { dg-options "-O -ftree-vectorize -march=armv8.2-a+sve --save-temps" } */
> > > > > +/* { dg-final { check-function-bodies "**" "" } } */
> > > > > +
> > > > > +#define N 1024
> > > > > +unsigned char dst[N];
> > > > > +unsigned char in1[N];
> > > > > +unsigned char in2[N];
> > > > > +
> > > > > +/*
> > > > > +** foo:
> > > > > +**	...
> > > > > +**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
> > > > > +**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
> > > > > +**	add	(z[0-9]+\.b), \1, \2
> > > > > +**	and	(z[0-9]+)\.d, z[0-9]+\.d, z[0-9]+\.d
> > > > > +**	and	(z[0-9]+\.b), \4\.b, #0x1
> > > > > +**	add	z0.b, \3, \5
> > > > > +**	...
> > > > > +*/
> > > > > +void
> > > > > +foo ()
> > > > > +{
> > > > > +  for( int x = 0; x < N; x++ )
> > > > > +    dst[x] = (in1[x] + in2[x]) >> 1;
> > > > > +}
> > > > > +
> > > > > +/* { dg-final { scan-assembler-not {\tuunpklo\t} } } */
> > > > > +/* { dg-final { scan-assembler-not {\tuunpkhi\t} } } */
> > > > > diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
> > > > > index 8ebbcd76b64..7025a3b4dc2 100644
> > > > > --- a/gcc/tree-vect-patterns.c
> > > > > +++ b/gcc/tree-vect-patterns.c
> > > > > @@ -2019,22 +2019,59 @@ vect_recog_average_pattern (stmt_vec_info last_stmt_info, tree *type_out)
> > > > >
> > > > >    /* Check for target support.  */
> > > > >    tree new_vectype = get_vectype_for_scalar_type (vinfo, new_type);
> > > > > -  if (!new_vectype
> > > > > -      || !direct_internal_fn_supported_p (ifn, new_vectype,
> > > > > -					  OPTIMIZE_FOR_SPEED))
> > > > > +
> > > > > +  if (!new_vectype)
> > > > >      return NULL;
> > > > >
> > > > > +  bool ifn_supported
> > > > > +    = direct_internal_fn_supported_p (ifn, new_vectype, OPTIMIZE_FOR_SPEED);
> > > > > +
> > > > >    /* The IR requires a valid vector type for the cast result, even though
> > > > >       it's likely to be discarded.  */
> > > > >    *type_out = get_vectype_for_scalar_type (vinfo, type);
> > > > >    if (!*type_out)
> > > > >      return NULL;
> > > > >
> > > > > -  /* Generate the IFN_AVG* call.  */
> > > > >    tree new_var = vect_recog_temp_ssa_var (new_type, NULL);
> > > > >    tree new_ops[2];
> > > > >    vect_convert_inputs (last_stmt_info, 2, new_ops, new_type,
> > > > > 		       unprom, new_vectype);
> > > > > +
> > > > > +  if (!ifn_supported)
> > > >
> > > > I guess this is personal preference, but I'm not sure there's much
> > > > benefit to splitting this out of the "if" into a separate variable.
> > > >
> > > > > +    {
> > > > > +      /* If there is no target support available, generate code
> > > > > +	 to distribute rshift over plus and add one depending
> > > > > +	 upon floor or ceil rounding.  */
> > > >
> > > > Maybe "and add a carry"?  It'd be good to write out the expansion in
> > > > pseudo-code too.
> > > >
> > > > The transform is only valid on unsigned types.  We still need to
> > > > bail out for signed types because the carry would act in the wrong
> > > > direction for negative results.  E.g.:
> > > >
> > > >   (-3 + 1) >> 1 == -2
> > > >   (-3 >> 1) == -1,  (1 >> 1) == 0,  (-1 & 1) == 1  total == 0
> > > >
> > > > Handling that case could be future work.
> > > >
> > > > > +      tree one_cst = build_one_cst (new_type);
> > > > > +
> > > > > +      tree tmp1 = vect_recog_temp_ssa_var (new_type, NULL);
> > > > > +      gassign *g1 = gimple_build_assign (tmp1, RSHIFT_EXPR, new_ops[0], one_cst);
> > > > > +
> > > > > +      tree tmp2 = vect_recog_temp_ssa_var (new_type, NULL);
> > > > > +      gassign *g2 = gimple_build_assign (tmp2, RSHIFT_EXPR, new_ops[1], one_cst);
> > > > > +
> > > > > +      tree tmp3 = vect_recog_temp_ssa_var (new_type, NULL);
> > > > > +      gassign *g3 = gimple_build_assign (tmp3, PLUS_EXPR, tmp1, tmp2);
> > > > > +
> > > > > +      tree tmp4 = vect_recog_temp_ssa_var (new_type, NULL);
> > > > > +      tree_code c = (ifn == IFN_AVG_CEIL) ? BIT_IOR_EXPR : BIT_AND_EXPR;
> > > > > +      gassign *g4 = gimple_build_assign (tmp4, c, new_ops[0], new_ops[1]);
> > > > > +
> > > > > +      tree tmp5 = vect_recog_temp_ssa_var (new_type, NULL);
> > > > > +      gassign *g5 = gimple_build_assign (tmp5, BIT_AND_EXPR, tmp4, one_cst);
> > > > > +
> > > > > +      gassign *g6 = gimple_build_assign (new_var, PLUS_EXPR, tmp3, tmp5);
> > > > > +
> > > > > +      append_pattern_def_seq (last_stmt_info, g1, new_vectype);
> > > > > +      append_pattern_def_seq (last_stmt_info, g2, new_vectype);
> > > > > +      append_pattern_def_seq (last_stmt_info, g3, new_vectype);
> > > > > +      append_pattern_def_seq (last_stmt_info, g4, new_vectype);
> > > > > +      append_pattern_def_seq (last_stmt_info, g5, new_vectype);
> > >
> > > I mainly wonder why the new sequence is any better than the
> > > original - that is, where did we verify that the non-pattern sequence
> > > cannot be vectorized?  It is
> > >
> > >  _1 = in1[x_17];
> > >  _2 = (int) _1;
> > >  _3 = in2[x_17];
> > >  _4 = (int) _3;
> > >  _5 = _2 + _4;
> > >  _6 = _5 + 1;
> > >  _7 = _6 >> 1;
> > >  _8 = (unsigned char) _7;
> > >  dst[x_17] = _8;
> > >
> > > so it seems it was just convenient to handle the "un-widening"
> > > pattern here instead of more generally doing the "carry" trick
> > > for supporting >> 1 in the un-widening path(s) (not exactly knowing
> > > if we have more than a few very special cases covering this)?
> > >
> > > The overall function comment would also need to be updated to
> > > outline the scheme.
> > >
> > > You don't verify the target supports shifting of the smaller
> > > mode vectors nor do you verify it supports bitwise ops or
> > > plus [of smaller mode vectors] (AVX512BW is needed for AVX512
> > > vectorization of byte/short for example!  Yeah, stupid AVX...,
> > > also SSE has pavg, not sure about AVX512 w/o BW here).
> > >
> > > Since the vectorizer has no way to go back to the non-pattern
> > > stmts when analyzing later you do have to verify the pattern
> > > is handled if there exists the remote chance that the original
> > > stmt sequence could.
> > Hi,
> > I have tried to address the suggestions in the attached patch.
> > Does it look OK?
>
>   tree new_ops[2];
>   vect_convert_inputs (last_stmt_info, 2, new_ops, new_type,
> 		       unprom, new_vectype);
> +
> +  if (!direct_internal_fn_supported_p (ifn, new_vectype, OPTIMIZE_FOR_SPEED)
> +      && TYPE_UNSIGNED (new_type)
> +      && optab_for_tree_code (RSHIFT_EXPR, new_vectype, optab_scalar)
> +      && optab_for_tree_code (PLUS_EXPR, new_vectype, optab_default)
> +      && optab_for_tree_code (BIT_IOR_EXPR, new_vectype, optab_default)
> +      && optab_for_tree_code (BIT_AND_EXPR, new_vectype, optab_default))
> +    {
> +      /* As a fallback, generate code for followi
>
> you have elided the ifn supported call when only one of the optabs isn't
> available and will end up generating the IFN call anyway.  Likewise you
> already have promoted the inputs.  I suggest to refactor these checks
> so you compute ifn_supported_p and general_supported_p (if !ifn_supported_p)
> and bail out early if neither.
>
> OK with those changes.

Hi Richard,
Sorry for the late response.  I tried to refactor the patch to move the
checks before promoting inputs.  Does it look OK?
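For reference, the fallback relies on the usual unsigned carry identities,
which give the widened average without needing the wider type.  E.g. for
unsigned char operands a = 250, b = 7:

  floor: (250 + 7) >> 1 = 128,  and (250 >> 1) + (7 >> 1) + ((250 & 7) & 1) = 125 + 3 + 0 = 128
  ceil:  (250 + 7 + 1) >> 1 = 129,  and (250 >> 1) + (7 >> 1) + ((250 | 7) & 1) = 125 + 3 + 1 = 129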
Thanks,
Prathamesh
>
> Richard.
>
> > Thanks,
> > Prathamesh
> > >
> > > Richard.
> > >
> > > > This is all again personal preference, but IMO it'd be clearer to give
> > > > the temporaries more meaningful names.  E.g.:
> > > >
> > > >   tmp1 -> shifted_op0
> > > >   tmp2 -> shifted_op1
> > > >   tmp3 -> sum_of_shifted
> > > >   tmp4 -> unmasked_carry
> > > >   tmp5 -> carry
> > > >
> > > > But maybe that's less necessary if we have the pseudo-code in a comment.
> > > >
> > > > I think it'd also be better to add statements as we go rather than
> > > > save them all till the end.  That way it's easier to add conditional
> > > > paths later (e.g. to handle the signed case).
> > > >
> > > > Thanks,
> > > > Richard
> > > >
> > >
> > > --
> > > Richard Biener <rguent...@suse.de>
> > > SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
> > > Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
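As a quick sanity check of the identities the fallback relies on (illustrative
only, not part of the patch), the following throwaway program exercises all
unsigned char operand pairs and passes:

#include <assert.h>

int
main (void)
{
  for (unsigned a = 0; a < 256; a++)
    for (unsigned b = 0; b < 256; b++)
      {
	/* Floor average: (a + b) >> 1 distributed as shifts plus carry.  */
	assert (((a + b) >> 1) == (a >> 1) + (b >> 1) + ((a & b) & 1));
	/* Ceil average: (a + b + 1) >> 1 distributed as shifts plus carry.  */
	assert (((a + b + 1) >> 1) == (a >> 1) + (b >> 1) + ((a | b) & 1));
      }
  return 0;
}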
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr89007-1.c b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-1.c
new file mode 100644
index 00000000000..af4aff4ec6d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-1.c
@@ -0,0 +1,29 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -ftree-vectorize -march=armv8.2-a+sve --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define N 1024
+unsigned char dst[N];
+unsigned char in1[N];
+unsigned char in2[N];
+
+/*
+** foo:
+**	...
+**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
+**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
+**	add	(z[0-9]+\.b), (\1, \2|\2, \1)
+**	orr	(z[0-9]+)\.d, z[0-9]+\.d, z[0-9]+\.d
+**	and	(z[0-9]+\.b), \5\.b, #0x1
+**	add	z0\.b, (\3, \6|\6, \3)
+**	...
+*/
+void
+foo ()
+{
+  for( int x = 0; x < N; x++ )
+    dst[x] = (in1[x] + in2[x] + 1) >> 1;
+}
+
+/* { dg-final { scan-assembler-not {\tuunpklo\t} } } */
+/* { dg-final { scan-assembler-not {\tuunpkhi\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr89007-2.c b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-2.c
new file mode 100644
index 00000000000..2ccdd0d353e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-2.c
@@ -0,0 +1,29 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -ftree-vectorize -march=armv8.2-a+sve --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define N 1024
+unsigned char dst[N];
+unsigned char in1[N];
+unsigned char in2[N];
+
+/*
+** foo:
+**	...
+**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
+**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
+**	add	(z[0-9]+\.b), (\1, \2|\2, \1)
+**	and	(z[0-9]+)\.d, z[0-9]+\.d, z[0-9]+\.d
+**	and	(z[0-9]+\.b), \5\.b, #0x1
+**	add	z0\.b, (\3, \6|\6, \3)
+**	...
+*/
+void
+foo ()
+{
+  for( int x = 0; x < N; x++ )
+    dst[x] = (in1[x] + in2[x]) >> 1;
+}
+
+/* { dg-final { scan-assembler-not {\tuunpklo\t} } } */
+/* { dg-final { scan-assembler-not {\tuunpkhi\t} } } */
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 8ebbcd76b64..4543990453b 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -1915,7 +1915,10 @@ vect_recog_mulhs_pattern (stmt_vec_info last_stmt_info, tree *type_out)
 	    TYPE avg = (TYPE) avg';
 
    where NTYPE is no wider than half of TYPE.  Since only the bottom half
-   of avg is used, all or part of the cast of avg' should become redundant.  */
+   of avg is used, all or part of the cast of avg' should become redundant.
+
+   If there is no target support available, generate code to distribute rshift
+   over plus and add a carry.  */
 
 static gimple *
 vect_recog_average_pattern (stmt_vec_info last_stmt_info, tree *type_out)
@@ -2019,9 +2022,20 @@ vect_recog_average_pattern (stmt_vec_info last_stmt_info, tree *type_out)
 
   /* Check for target support.  */
   tree new_vectype = get_vectype_for_scalar_type (vinfo, new_type);
-  if (!new_vectype
-      || !direct_internal_fn_supported_p (ifn, new_vectype,
-					  OPTIMIZE_FOR_SPEED))
+  if (!new_vectype)
+    return NULL;
+
+  bool fallback_p = false;
+
+  if (direct_internal_fn_supported_p (ifn, new_vectype, OPTIMIZE_FOR_SPEED))
+    ;
+  else if (TYPE_UNSIGNED (new_type)
+	   && optab_for_tree_code (RSHIFT_EXPR, new_vectype, optab_scalar)
+	   && optab_for_tree_code (PLUS_EXPR, new_vectype, optab_default)
+	   && optab_for_tree_code (BIT_IOR_EXPR, new_vectype, optab_default)
+	   && optab_for_tree_code (BIT_AND_EXPR, new_vectype, optab_default))
+    fallback_p = true;
+  else
     return NULL;
 
   /* The IR requires a valid vector type for the cast result, even though
@@ -2030,11 +2044,53 @@ vect_recog_average_pattern (stmt_vec_info last_stmt_info, tree *type_out)
   if (!*type_out)
     return NULL;
 
-  /* Generate the IFN_AVG* call.  */
   tree new_var = vect_recog_temp_ssa_var (new_type, NULL);
   tree new_ops[2];
   vect_convert_inputs (last_stmt_info, 2, new_ops, new_type,
 		       unprom, new_vectype);
+
+  if (fallback_p)
+    {
+      /* As a fallback, generate code for the following sequence:
+
+	 shifted_op0 = new_ops[0] >> 1;
+	 shifted_op1 = new_ops[1] >> 1;
+	 sum_of_shifted = shifted_op0 + shifted_op1;
+	 unmasked_carry = new_ops[0] and/or new_ops[1];
+	 carry = unmasked_carry & 1;
+	 new_var = sum_of_shifted + carry;
+      */
+
+      tree one_cst = build_one_cst (new_type);
+      gassign *g;
+
+      tree shifted_op0 = vect_recog_temp_ssa_var (new_type, NULL);
+      g = gimple_build_assign (shifted_op0, RSHIFT_EXPR, new_ops[0], one_cst);
+      append_pattern_def_seq (last_stmt_info, g, new_vectype);
+
+      tree shifted_op1 = vect_recog_temp_ssa_var (new_type, NULL);
+      g = gimple_build_assign (shifted_op1, RSHIFT_EXPR, new_ops[1], one_cst);
+      append_pattern_def_seq (last_stmt_info, g, new_vectype);
+
+      tree sum_of_shifted = vect_recog_temp_ssa_var (new_type, NULL);
+      g = gimple_build_assign (sum_of_shifted, PLUS_EXPR,
+			       shifted_op0, shifted_op1);
+      append_pattern_def_seq (last_stmt_info, g, new_vectype);
+
+      tree unmasked_carry = vect_recog_temp_ssa_var (new_type, NULL);
+      tree_code c = (ifn == IFN_AVG_CEIL) ? BIT_IOR_EXPR : BIT_AND_EXPR;
+      g = gimple_build_assign (unmasked_carry, c, new_ops[0], new_ops[1]);
+      append_pattern_def_seq (last_stmt_info, g, new_vectype);
+
+      tree carry = vect_recog_temp_ssa_var (new_type, NULL);
+      g = gimple_build_assign (carry, BIT_AND_EXPR, unmasked_carry, one_cst);
+      append_pattern_def_seq (last_stmt_info, g, new_vectype);
+
+      g = gimple_build_assign (new_var, PLUS_EXPR, sum_of_shifted, carry);
+      return vect_convert_output (last_stmt_info, type, g, new_vectype);
+    }
+
+  /* Generate the IFN_AVG* call.  */
   gcall *average_stmt = gimple_build_call_internal (ifn, 2, new_ops[0],
 						    new_ops[1]);
   gimple_call_set_lhs (average_stmt, new_var);