On Fri, 29 Nov 2019 at 15:41, Richard Biener <richard.guent...@gmail.com> wrote:
>
> On Fri, Nov 22, 2019 at 12:40 PM Prathamesh Kulkarni
> <prathamesh.kulka...@linaro.org> wrote:
> >
> > On Wed, 20 Nov 2019 at 16:54, Richard Biener <rguent...@suse.de> wrote:
> > >
> > > On Wed, 20 Nov 2019, Richard Sandiford wrote:
> > >
> > > > Hi,
> > > >
> > > > Thanks for doing this.  Adding Richard on cc:, since the SVE subject
> > > > tag might have put him off.  There's not really anything SVE-specific
> > > > here apart from the testcase.
> > >
> > > Ah.
> > >
> > > > > 2019-11-19  Prathamesh Kulkarni  <prathamesh.kulka...@linaro.org>
> > > > >
> > > > > 	PR tree-optimization/89007
> > > > > 	* tree-vect-patterns.c (vect_recog_average_pattern): If there is no
> > > > > 	target support available, generate code to distribute rshift over plus
> > > > > 	and add one depending upon floor or ceil rounding.
> > > > >
> > > > > 	testsuite/
> > > > > 	* gcc.target/aarch64/sve/pr89007.c: New test.
> > > > >
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr89007-1.c b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-1.c
> > > > > new file mode 100644
> > > > > index 00000000000..32095c63c61
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-1.c
> > > > > @@ -0,0 +1,29 @@
> > > > > +/* { dg-do assemble { target aarch64_asm_sve_ok } } */
> > > > > +/* { dg-options "-O -ftree-vectorize -march=armv8.2-a+sve --save-temps" } */
> > > > > +/* { dg-final { check-function-bodies "**" "" } } */
> > > > > +
> > > > > +#define N 1024
> > > > > +unsigned char dst[N];
> > > > > +unsigned char in1[N];
> > > > > +unsigned char in2[N];
> > > > > +
> > > > > +/*
> > > > > +** foo:
> > > > > +**	...
> > > > > +**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
> > > > > +**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
> > > > > +**	add	(z[0-9]+\.b), \1, \2
> > > > > +**	orr	(z[0-9]+)\.d, z[0-9]+\.d, z[0-9]+\.d
> > > > > +**	and	(z[0-9]+\.b), \4\.b, #0x1
> > > > > +**	add	z0.b, \3, \5
> > > >
> > > > It'd probably be more future-proof to allow (\1, \2|\2, \1) and
> > > > (\3, \5|\5, \3).  Same for the other testcase.
> > > >
> > > > > +**	...
> > > > > +*/
> > > > > +void
> > > > > +foo ()
> > > > > +{
> > > > > +  for( int x = 0; x < N; x++ )
> > > > > +    dst[x] = (in1[x] + in2[x] + 1) >> 1;
> > > > > +}
> > > > > +
> > > > > +/* { dg-final { scan-assembler-not {\tuunpklo\t} } } */
> > > > > +/* { dg-final { scan-assembler-not {\tuunpkhi\t} } } */
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr89007-2.c b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-2.c
> > > > > new file mode 100644
> > > > > index 00000000000..cc40f45046b
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-2.c
> > > > > @@ -0,0 +1,29 @@
> > > > > +/* { dg-do assemble { target aarch64_asm_sve_ok } } */
> > > > > +/* { dg-options "-O -ftree-vectorize -march=armv8.2-a+sve --save-temps" } */
> > > > > +/* { dg-final { check-function-bodies "**" "" } } */
> > > > > +
> > > > > +#define N 1024
> > > > > +unsigned char dst[N];
> > > > > +unsigned char in1[N];
> > > > > +unsigned char in2[N];
> > > > > +
> > > > > +/*
> > > > > +** foo:
> > > > > +**	...
> > > > > +**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
> > > > > +**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
> > > > > +**	add	(z[0-9]+\.b), \1, \2
> > > > > +**	and	(z[0-9]+)\.d, z[0-9]+\.d, z[0-9]+\.d
> > > > > +**	and	(z[0-9]+\.b), \4\.b, #0x1
> > > > > +**	add	z0.b, \3, \5
> > > > > +**	...
> > > > > +*/
> > > > > +void
> > > > > +foo ()
> > > > > +{
> > > > > +  for( int x = 0; x < N; x++ )
> > > > > +    dst[x] = (in1[x] + in2[x]) >> 1;
> > > > > +}
> > > > > +
> > > > > +/* { dg-final { scan-assembler-not {\tuunpklo\t} } } */
> > > > > +/* { dg-final { scan-assembler-not {\tuunpkhi\t} } } */
> > > > > diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
> > > > > index 8ebbcd76b64..7025a3b4dc2 100644
> > > > > --- a/gcc/tree-vect-patterns.c
> > > > > +++ b/gcc/tree-vect-patterns.c
> > > > > @@ -2019,22 +2019,59 @@ vect_recog_average_pattern (stmt_vec_info last_stmt_info, tree *type_out)
> > > > >
> > > > >    /* Check for target support.  */
> > > > >    tree new_vectype = get_vectype_for_scalar_type (vinfo, new_type);
> > > > > -  if (!new_vectype
> > > > > -      || !direct_internal_fn_supported_p (ifn, new_vectype,
> > > > > -					  OPTIMIZE_FOR_SPEED))
> > > > > +
> > > > > +  if (!new_vectype)
> > > > >      return NULL;
> > > > >
> > > > > +  bool ifn_supported
> > > > > +    = direct_internal_fn_supported_p (ifn, new_vectype, OPTIMIZE_FOR_SPEED);
> > > > > +
> > > > >    /* The IR requires a valid vector type for the cast result, even though
> > > > >       it's likely to be discarded.  */
> > > > >    *type_out = get_vectype_for_scalar_type (vinfo, type);
> > > > >    if (!*type_out)
> > > > >      return NULL;
> > > > >
> > > > > -  /* Generate the IFN_AVG* call.  */
> > > > >    tree new_var = vect_recog_temp_ssa_var (new_type, NULL);
> > > > >    tree new_ops[2];
> > > > >    vect_convert_inputs (last_stmt_info, 2, new_ops, new_type,
> > > > > 		       unprom, new_vectype);
> > > > > +
> > > > > +  if (!ifn_supported)
> > > >
> > > > I guess this is personal preference, but I'm not sure there's much
> > > > benefit to splitting this out of the "if" into a separate variable.
> > > >
> > > > > +    {
> > > > > +      /* If there is no target support available, generate code
> > > > > +	 to distribute rshift over plus and add one depending
> > > > > +	 upon floor or ceil rounding.  */
> > > >
> > > > Maybe "and add a carry"?  It'd be good to write out the expansion in
> > > > pseudo-code too.
> > > >
> > > > The transform is only valid on unsigned types.  We still need to
> > > > bail out for signed types because the carry would act in the wrong
> > > > direction for negative results.  E.g.:
> > > >
> > > >   (-3 + 1) >> 1 == -2
> > > >   (-3 >> 1) == -1,  (1 >> 1) == 0,  (-1 & 1) == 1  total == 0
> > > >
> > > > Handling that case could be future work.
> > > >
> > > > > +      tree one_cst = build_one_cst (new_type);
> > > > > +
> > > > > +      tree tmp1 = vect_recog_temp_ssa_var (new_type, NULL);
> > > > > +      gassign *g1 = gimple_build_assign (tmp1, RSHIFT_EXPR, new_ops[0], one_cst);
> > > > > +
> > > > > +      tree tmp2 = vect_recog_temp_ssa_var (new_type, NULL);
> > > > > +      gassign *g2 = gimple_build_assign (tmp2, RSHIFT_EXPR, new_ops[1], one_cst);
> > > > > +
> > > > > +      tree tmp3 = vect_recog_temp_ssa_var (new_type, NULL);
> > > > > +      gassign *g3 = gimple_build_assign (tmp3, PLUS_EXPR, tmp1, tmp2);
> > > > > +
> > > > > +      tree tmp4 = vect_recog_temp_ssa_var (new_type, NULL);
> > > > > +      tree_code c = (ifn == IFN_AVG_CEIL) ? BIT_IOR_EXPR : BIT_AND_EXPR;
> > > > > +      gassign *g4 = gimple_build_assign (tmp4, c, new_ops[0], new_ops[1]);
> > > > > +
> > > > > +      tree tmp5 = vect_recog_temp_ssa_var (new_type, NULL);
> > > > > +      gassign *g5 = gimple_build_assign (tmp5, BIT_AND_EXPR, tmp4, one_cst);
> > > > > +
> > > > > +      gassign *g6 = gimple_build_assign (new_var, PLUS_EXPR, tmp3, tmp5);
> > > > > +
> > > > > +      append_pattern_def_seq (last_stmt_info, g1, new_vectype);
> > > > > +      append_pattern_def_seq (last_stmt_info, g2, new_vectype);
> > > > > +      append_pattern_def_seq (last_stmt_info, g3, new_vectype);
> > > > > +      append_pattern_def_seq (last_stmt_info, g4, new_vectype);
> > > > > +      append_pattern_def_seq (last_stmt_info, g5, new_vectype);
> > >
> > > I mainly wonder why the new sequence is any better than the
> > > original - that is, where did we verify that the non-pattern sequence
> > > cannot be vectorized?  It is
> > >
> > >  _1 = in1[x_17];
> > >  _2 = (int) _1;
> > >  _3 = in2[x_17];
> > >  _4 = (int) _3;
> > >  _5 = _2 + _4;
> > >  _6 = _5 + 1;
> > >  _7 = _6 >> 1;
> > >  _8 = (unsigned char) _7;
> > >  dst[x_17] = _8;
> > >
> > > so it seems it was just convenient to handle the "un-widening"
> > > pattern here instead of more generally doing the "carry" trick
> > > for supporting >> 1 in the un-widening path(s) (not exactly knowing
> > > if we have more than a few very special cases covering this)?
> > >
> > > The overall function comment would also need to be updated to
> > > outline the scheme.
> > >
> > > You don't verify the target supports shifting of the smaller
> > > mode vectors nor do you verify it supports bitwise ops or
> > > plus [of smaller mode vectors] (AVX512BW is needed for AVX512
> > > vectorization of byte/short for example!  Yeah, stupid AVX...,
> > > also SSE has pavg, not sure about AVX512 w/o BW here).
> > >
> > > Since the vectorizer has no way to go back to the non-pattern
> > > stmts when analyzing later you do have to verify the pattern
> > > is handled if there exists the remote chance that the original
> > > stmt sequence could.
> > Hi,
> > I have tried to address the suggestions in the attached patch.
> > Does it look OK?
>
>   tree new_ops[2];
>   vect_convert_inputs (last_stmt_info, 2, new_ops, new_type,
> 		       unprom, new_vectype);
> +
> +  if (!direct_internal_fn_supported_p (ifn, new_vectype, OPTIMIZE_FOR_SPEED)
> +      && TYPE_UNSIGNED (new_type)
> +      && optab_for_tree_code (RSHIFT_EXPR, new_vectype, optab_scalar)
> +      && optab_for_tree_code (PLUS_EXPR, new_vectype, optab_default)
> +      && optab_for_tree_code (BIT_IOR_EXPR, new_vectype, optab_default)
> +      && optab_for_tree_code (BIT_AND_EXPR, new_vectype, optab_default))
> +    {
> +      /* As a fallback, generate code for followi
>
> you have elided the ifn supported call when only one of the optabs isn't
> available and will end up generating the IFN call anyway.  Likewise you
> already have promoted the inputs.  I suggest to refactor these checks
> so you compute ifn_supported_p and general_supported_p (if !ifn_supported_p)
> and bail out early if neither.
>
> OK with those changes.

Hi Richard,
Sorry for the late response.  I tried to refactor the patch to move the
checks before promoting inputs.  Does it look OK?
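For reference, the fallback relies on the usual unsigned carry identities,
which give the widened average without needing the wider type.  E.g. for
unsigned char operands a = 250, b = 7:

  floor: (250 + 7) >> 1 = 128,  and (250 >> 1) + (7 >> 1) + ((250 & 7) & 1) = 125 + 3 + 0 = 128
  ceil:  (250 + 7 + 1) >> 1 = 129,  and (250 >> 1) + (7 >> 1) + ((250 | 7) & 1) = 125 + 3 + 1 = 129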
Thanks,
Prathamesh
>
> Richard.
>
> > Thanks,
> > Prathamesh
> > >
> > > Richard.
> > >
> > > > This is all again personal preference, but IMO it'd be clearer to give
> > > > the temporaries more meaningful names.  E.g.:
> > > >
> > > >   tmp1 -> shifted_op0
> > > >   tmp2 -> shifted_op1
> > > >   tmp3 -> sum_of_shifted
> > > >   tmp4 -> unmasked_carry
> > > >   tmp5 -> carry
> > > >
> > > > But maybe that's less necessary if we have the pseudo-code in a comment.
> > > >
> > > > I think it'd also be better to add statements as we go rather than
> > > > save them all till the end.  That way it's easier to add conditional
> > > > paths later (e.g. to handle the signed case).
> > > >
> > > > Thanks,
> > > > Richard
> > > >
> > >
> > > --
> > > Richard Biener <rguent...@suse.de>
> > > SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
> > > Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
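As a quick sanity check of the identities the fallback relies on (illustrative
only, not part of the patch), the following throwaway program exercises all
unsigned char operand pairs and passes:

#include <assert.h>

int
main (void)
{
  for (unsigned a = 0; a < 256; a++)
    for (unsigned b = 0; b < 256; b++)
      {
	/* Floor average: (a + b) >> 1 distributed as shifts plus carry.  */
	assert (((a + b) >> 1) == (a >> 1) + (b >> 1) + ((a & b) & 1));
	/* Ceil average: (a + b + 1) >> 1 distributed as shifts plus carry.  */
	assert (((a + b + 1) >> 1) == (a >> 1) + (b >> 1) + ((a | b) & 1));
      }
  return 0;
}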
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr89007-1.c b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-1.c
new file mode 100644
index 00000000000..af4aff4ec6d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-1.c
@@ -0,0 +1,29 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -ftree-vectorize -march=armv8.2-a+sve --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define N 1024
+unsigned char dst[N];
+unsigned char in1[N];
+unsigned char in2[N];
+
+/*
+** foo:
+**	...
+**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
+**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
+**	add	(z[0-9]+\.b), (\1, \2|\2, \1)
+**	orr	(z[0-9]+)\.d, z[0-9]+\.d, z[0-9]+\.d
+**	and	(z[0-9]+\.b), \5\.b, #0x1
+**	add	z0\.b, (\3, \6|\6, \3)
+**	...
+*/
+void
+foo ()
+{
+  for( int x = 0; x < N; x++ )
+    dst[x] = (in1[x] + in2[x] + 1) >> 1;
+}
+
+/* { dg-final { scan-assembler-not {\tuunpklo\t} } } */
+/* { dg-final { scan-assembler-not {\tuunpkhi\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr89007-2.c b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-2.c
new file mode 100644
index 00000000000..2ccdd0d353e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr89007-2.c
@@ -0,0 +1,29 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -ftree-vectorize -march=armv8.2-a+sve --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define N 1024
+unsigned char dst[N];
+unsigned char in1[N];
+unsigned char in2[N];
+
+/*
+** foo:
+**	...
+**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
+**	lsr	(z[0-9]+\.b), z[0-9]+\.b, #1
+**	add	(z[0-9]+\.b), (\1, \2|\2, \1)
+**	and	(z[0-9]+)\.d, z[0-9]+\.d, z[0-9]+\.d
+**	and	(z[0-9]+\.b), \5\.b, #0x1
+**	add	z0\.b, (\3, \6|\6, \3)
+**	...
+*/
+void
+foo ()
+{
+  for( int x = 0; x < N; x++ )
+    dst[x] = (in1[x] + in2[x]) >> 1;
+}
+
+/* { dg-final { scan-assembler-not {\tuunpklo\t} } } */
+/* { dg-final { scan-assembler-not {\tuunpkhi\t} } } */
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 8ebbcd76b64..4543990453b 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -1915,7 +1915,10 @@ vect_recog_mulhs_pattern (stmt_vec_info last_stmt_info, tree *type_out)
 	    TYPE avg = (TYPE) avg';
 
    where NTYPE is no wider than half of TYPE.  Since only the bottom half
-   of avg is used, all or part of the cast of avg' should become redundant.  */
+   of avg is used, all or part of the cast of avg' should become redundant.
+
+   If there is no target support available, generate code to distribute rshift
+   over plus and add a carry.  */
 
 static gimple *
 vect_recog_average_pattern (stmt_vec_info last_stmt_info, tree *type_out)
@@ -2019,9 +2022,20 @@ vect_recog_average_pattern (stmt_vec_info last_stmt_info, tree *type_out)
 
   /* Check for target support.  */
   tree new_vectype = get_vectype_for_scalar_type (vinfo, new_type);
-  if (!new_vectype
-      || !direct_internal_fn_supported_p (ifn, new_vectype,
-					  OPTIMIZE_FOR_SPEED))
+  if (!new_vectype)
+    return NULL;
+
+  bool fallback_p = false;
+
+  if (direct_internal_fn_supported_p (ifn, new_vectype, OPTIMIZE_FOR_SPEED))
+    ;
+  else if (TYPE_UNSIGNED (new_type)
+	   && optab_for_tree_code (RSHIFT_EXPR, new_vectype, optab_scalar)
+	   && optab_for_tree_code (PLUS_EXPR, new_vectype, optab_default)
+	   && optab_for_tree_code (BIT_IOR_EXPR, new_vectype, optab_default)
+	   && optab_for_tree_code (BIT_AND_EXPR, new_vectype, optab_default))
+    fallback_p = true;
+  else
     return NULL;
 
   /* The IR requires a valid vector type for the cast result, even though
@@ -2030,11 +2044,53 @@ vect_recog_average_pattern (stmt_vec_info last_stmt_info, tree *type_out)
   if (!*type_out)
     return NULL;
 
-  /* Generate the IFN_AVG* call.  */
   tree new_var = vect_recog_temp_ssa_var (new_type, NULL);
   tree new_ops[2];
   vect_convert_inputs (last_stmt_info, 2, new_ops, new_type,
 		       unprom, new_vectype);
+
+  if (fallback_p)
+    {
+      /* As a fallback, generate code for the following sequence:
+
+	 shifted_op0 = new_ops[0] >> 1;
+	 shifted_op1 = new_ops[1] >> 1;
+	 sum_of_shifted = shifted_op0 + shifted_op1;
+	 unmasked_carry = new_ops[0] and/or new_ops[1];
+	 carry = unmasked_carry & 1;
+	 new_var = sum_of_shifted + carry;
+      */
+
+      tree one_cst = build_one_cst (new_type);
+      gassign *g;
+
+      tree shifted_op0 = vect_recog_temp_ssa_var (new_type, NULL);
+      g = gimple_build_assign (shifted_op0, RSHIFT_EXPR, new_ops[0], one_cst);
+      append_pattern_def_seq (last_stmt_info, g, new_vectype);
+
+      tree shifted_op1 = vect_recog_temp_ssa_var (new_type, NULL);
+      g = gimple_build_assign (shifted_op1, RSHIFT_EXPR, new_ops[1], one_cst);
+      append_pattern_def_seq (last_stmt_info, g, new_vectype);
+
+      tree sum_of_shifted = vect_recog_temp_ssa_var (new_type, NULL);
+      g = gimple_build_assign (sum_of_shifted, PLUS_EXPR,
+			       shifted_op0, shifted_op1);
+      append_pattern_def_seq (last_stmt_info, g, new_vectype);
+
+      tree unmasked_carry = vect_recog_temp_ssa_var (new_type, NULL);
+      tree_code c = (ifn == IFN_AVG_CEIL) ? BIT_IOR_EXPR : BIT_AND_EXPR;
+      g = gimple_build_assign (unmasked_carry, c, new_ops[0], new_ops[1]);
+      append_pattern_def_seq (last_stmt_info, g, new_vectype);
+
+      tree carry = vect_recog_temp_ssa_var (new_type, NULL);
+      g = gimple_build_assign (carry, BIT_AND_EXPR, unmasked_carry, one_cst);
+      append_pattern_def_seq (last_stmt_info, g, new_vectype);
+
+      g = gimple_build_assign (new_var, PLUS_EXPR, sum_of_shifted, carry);
+      return vect_convert_output (last_stmt_info, type, g, new_vectype);
+    }
+
+  /* Generate the IFN_AVG* call.  */
   gcall *average_stmt = gimple_build_call_internal (ifn, 2, new_ops[0],
 						    new_ops[1]);
   gimple_call_set_lhs (average_stmt, new_var);