On Thu, Jun 18, 2015 at 5:04 AM, Benedikt Huber <benedikt.hu...@theobroma-systems.com> wrote: A few comments about the patch itself besides the cost model issue and iteration issue mentioned in the other email thread.
> * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf. > * config/aarch64/aarch64-protos.h: Declare. > * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and > frsqrts. > * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code > in fast math mode. > * config/aarch64/aarch64.md: Added enum entry. > * testsuite/gcc.target/aarch64/rsqrt.c: Tests for single and double. > --- > gcc/ChangeLog | 9 +++ > gcc/config/aarch64/aarch64-builtins.c | 60 ++++++++++++++++ > gcc/config/aarch64/aarch64-protos.h | 2 + > gcc/config/aarch64/aarch64-simd.md | 27 ++++++++ > gcc/config/aarch64/aarch64.c | 63 +++++++++++++++++ > gcc/config/aarch64/aarch64.md | 3 + > gcc/testsuite/gcc.target/aarch64/rsqrt.c | 113 > +++++++++++++++++++++++++++++++ > 7 files changed, 277 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c > > diff --git a/gcc/ChangeLog b/gcc/ChangeLog > index c9b156f..690ebba 100644 > --- a/gcc/ChangeLog > +++ b/gcc/ChangeLog > @@ -1,3 +1,12 @@ > +2015-06-15 Benedikt Huber <benedikt.hu...@theobroma-systems.com> > + > + * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf. > + * config/aarch64/aarch64-protos.h: Declare. > + * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte > and frsqrts. > + * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code > in fast math mode. > + * config/aarch64/aarch64.md: Added enum entry. > + * testsuite/gcc.target/aarch64/rsqrt.c: Tests for single and double. > + > 2015-06-14 Richard Sandiford <richard.sandif...@arm.com> > > * rtl.h (classify_insn): Declare. > diff --git a/gcc/config/aarch64/aarch64-builtins.c > b/gcc/config/aarch64/aarch64-builtins.c > index f7a39ec..484bb84 100644 > --- a/gcc/config/aarch64/aarch64-builtins.c > +++ b/gcc/config/aarch64/aarch64-builtins.c > @@ -342,6 +342,8 @@ enum aarch64_builtins > AARCH64_BUILTIN_GET_FPSR, > AARCH64_BUILTIN_SET_FPSR, > > + AARCH64_BUILTIN_RSQRT, > + AARCH64_BUILTIN_RSQRTF, > AARCH64_SIMD_BUILTIN_BASE, > AARCH64_SIMD_BUILTIN_LANE_CHECK, > #include "aarch64-simd-builtins.def" > @@ -831,6 +833,32 @@ aarch64_init_crc32_builtins () > } > > void > +aarch64_add_builtin_rsqrt (void) > +{ > + tree fndecl = NULL; > + tree ftype = NULL; > + ftype = build_function_type_list (double_type_node, double_type_node, > NULL_TREE); > + > + fndecl = add_builtin_function ("__builtin_aarch64_rsqrt", > + ftype, > + AARCH64_BUILTIN_RSQRT, > + BUILT_IN_MD, > + NULL, > + NULL_TREE); > + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT] = fndecl; > + > + tree ftypef = NULL; > + ftypef = build_function_type_list (float_type_node, float_type_node, > NULL_TREE); > + fndecl = add_builtin_function ("__builtin_aarch64_rsqrtf", > + ftypef, > + AARCH64_BUILTIN_RSQRTF, > + BUILT_IN_MD, > + NULL, > + NULL_TREE); > + aarch64_builtin_decls[AARCH64_BUILTIN_RSQRTF] = fndecl; > +} > + > +void > aarch64_init_builtins (void) > { > tree ftype_set_fpr > @@ -855,6 +883,7 @@ aarch64_init_builtins (void) > aarch64_init_simd_builtins (); > if (TARGET_CRC32) > aarch64_init_crc32_builtins (); > + aarch64_add_builtin_rsqrt (); > } > > tree > @@ -1099,6 +1128,23 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx > target) > return target; > } > > +static rtx > +aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target) > +{ > + rtx pat; > + tree arg0 = CALL_EXPR_ARG (exp, 0); > + rtx op0 = expand_normal (arg0); > + > + enum insn_code c = CODE_FOR_rsqrtdf; > + if (fcode == AARCH64_BUILTIN_RSQRTF) > + c = CODE_FOR_rsqrtsf; > + > + pat = GEN_FCN (c) (target, op0); > + emit_insn (pat); > + > + return target; > +} > + > /* Expand an expression EXP that calls a built-in function, > with result going to TARGET if that's convenient. */ > rtx > @@ -1146,6 +1192,11 @@ aarch64_expand_builtin (tree exp, > else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= > AARCH64_CRC32_BUILTIN_MAX) > return aarch64_crc32_expand_builtin (fcode, exp, target); > > + if (fcode == AARCH64_BUILTIN_RSQRT || > + fcode == AARCH64_BUILTIN_RSQRTF) > + return aarch64_expand_builtin_rsqrt (fcode, exp, target); > + > + return NULL_RTX; > gcc_unreachable (); > } > > @@ -1303,6 +1354,15 @@ aarch64_builtin_vectorized_function (tree fndecl, tree > type_out, tree type_in) > return NULL_TREE; > } > > +tree > +aarch64_builtin_rsqrt (bool is_float) > +{ > + if (is_float) > + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRTF]; > + else > + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT]; > +} > + > #undef VAR1 > #define VAR1(T, N, MAP, A) \ > case AARCH64_SIMD_BUILTIN_##T##_##N##A: > diff --git a/gcc/config/aarch64/aarch64-protos.h > b/gcc/config/aarch64/aarch64-protos.h > index 965a11b..4f1c8ce 100644 > --- a/gcc/config/aarch64/aarch64-protos.h > +++ b/gcc/config/aarch64/aarch64-protos.h > @@ -270,6 +270,8 @@ void aarch64_print_operand (FILE *, rtx, char); > void aarch64_print_operand_address (FILE *, rtx); > void aarch64_emit_call_insn (rtx); > > +void aarch64_emit_swrsqrt (rtx, rtx); > + > /* Initialize builtins for SIMD intrinsics. */ > void init_aarch64_simd_builtins (void); > > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index b90f938..266800a 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -353,6 +353,33 @@ > [(set_attr "type" "neon_fp_mul_d_scalar_q")] > ) > > +(define_insn "*rsqrte_simd" > + [(set (match_operand:VALLF 0 "register_operand" "=w") > + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")] > + UNSPEC_RSQRTE))] > + "TARGET_SIMD" > + "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>" > + [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")]) I think it might be useful to have the mode on the pattern name. > + > +(define_insn "*rsqrts_simd" Likewise. > + [(set (match_operand:VALLF 0 "register_operand" "=w") > + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w") > + (match_operand:VALLF 2 "register_operand" "w")] > + UNSPEC_RSQRTS))] > + "TARGET_SIMD" > + "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>" > + [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")]) > + > +(define_expand "rsqrt<mode>" > + [(set (match_operand:GPF 0 "register_operand" "=w") > + (unspec:GPF [(match_operand:GPF 1 "register_operand" "w")] > + UNSPEC_RSQRT))] > + "TARGET_FLOAT" > +{ > + aarch64_emit_swrsqrt (operands[0], operands[1]); > + DONE; > +}) > + > (define_insn "*aarch64_mul3_elt_to_64v2df" > [(set (match_operand:DF 0 "register_operand" "=w") > (mult:DF > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index c3c2795..281f0b2 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -6816,6 +6816,66 @@ aarch64_memory_move_cost (machine_mode mode > ATTRIBUTE_UNUSED, > return aarch64_tune_params->memmov_cost; > } > > +extern tree aarch64_builtin_rsqrt (bool is_float); > + > +static tree > +aarch64_builtin_reciprocal (unsigned int fn, > + bool md_fn ATTRIBUTE_UNUSED, > + bool sqrt ATTRIBUTE_UNUSED) You don't need ATTRIBUTE_UNUSED any more if you are not using the variable at all. Just remove the variable name, this is C++ code after all :). > +{ > + if (!fast_math_flags_set_p (&global_options)) > + return NULL_TREE; Is this correct check here? I suspect you want !(flag_finite_math_only && !flag_trapping_math && flag_unsafe_math_optimizations) instead so someone can do -ffast-math -fno-finite-math-only. is there a reason why you don't check md_fn here at all because if md_fn is true, and you get the same value for fn as BUILT_IN_SQRTF/BUILT_IN_SQRT, we would get incorrect code. Also I think you should check md_fn and support the vectorized functions here. > + > + if (fn == BUILT_IN_SQRTF) > + return aarch64_builtin_rsqrt (true); > + else if (fn == BUILT_IN_SQRT) > + return aarch64_builtin_rsqrt (false); > + else > + return NULL_TREE; > +} > + > +void > +aarch64_emit_swrsqrt (rtx dst, rtx src) > +{ > + enum machine_mode mode = GET_MODE (src); > + rtx xsrc = gen_reg_rtx (mode); > + emit_set_insn (xsrc, src); Why do you use emit_set_insn here instead of emit_move_insn? > + > + rtx x0 = gen_reg_rtx (mode); > + emit_insn (gen_rtx_SET (x0, > + gen_rtx_UNSPEC (mode, gen_rtvec (1, xsrc), > + UNSPEC_RSQRTE))); And not emit_set_insn here? > + > + bool double_mode = (DFmode == mode || > + V1DFmode == mode || > + V2DFmode == mode || > + V4DFmode == mode || > + V6DFmode == mode || > + V8DFmode == mode); Why not use: mode == DFmode || (VECTOR_MODE_P (mode) && GET_MODE_INNER (mode) == DFmode); At least it becomes more obvious what is done here. Or just: mode == DFmode || mode == V2DFmode || mode == V4DFmode Also it might make sense to have an assert on the modes we accept at the beginning of the function. > + > + int iterations = 2; > + if (double_mode) > + iterations = 3; > + > + for (int i = 0; i < iterations; ++i) > + { > + rtx x1 = gen_reg_rtx (mode); > + rtx x2 = gen_reg_rtx (mode); > + rtx x3 = gen_reg_rtx (mode); > + emit_insn (gen_rtx_SET (x2, > + gen_rtx_MULT (mode, x0, x0))); > + emit_insn (gen_rtx_SET (x3, > + gen_rtx_UNSPEC (mode, gen_rtvec (2, xsrc, x2), > + UNSPEC_RSQRTS))); > + emit_insn (gen_rtx_SET (x1, > + gen_rtx_MULT (mode, x0, x3))); More of emit_set_insn usage. > + x0 = x1; > + } > + > + emit_move_insn (dst, x0); > + return; > +} > + > /* Return the number of instructions that can be issued per cycle. */ > static int > aarch64_sched_issue_rate (void) > @@ -11747,6 +11807,9 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load, > #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P > #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p > > +#undef TARGET_BUILTIN_RECIPROCAL > +#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal > + > #undef TARGET_VECTOR_MODE_SUPPORTED_P > #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > index 11123d6..7272d05 100644 > --- a/gcc/config/aarch64/aarch64.md > +++ b/gcc/config/aarch64/aarch64.md > @@ -120,6 +120,9 @@ > UNSPEC_VSTRUCTDUMMY > UNSPEC_SP_SET > UNSPEC_SP_TEST > + UNSPEC_RSQRT > + UNSPEC_RSQRTE > + UNSPEC_RSQRTS > ]) > > (define_c_enum "unspecv" [ > diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt.c > b/gcc/testsuite/gcc.target/aarch64/rsqrt.c > new file mode 100644 > index 0000000..6607483 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt.c > @@ -0,0 +1,113 @@ > +/* { dg-do run } */ > +/* { dg-options "-O3 -fno-inline --save-temps -ffast-math" } */ Maybe instead of -fno-inline, you can add some volatile to some variables. We really should have separate tests for the assembly scan vs the runtime tests too. > + > +#include <math.h> > +#include <stdio.h> > + > +#include <values.h> Is values.h a standard header file. I suspect you should be using float.h instead which is the standard header file. > + > +#define PI 3.141592653589793 > +#define SQRT2 1.4142135623730951 > + > +#define PI_4 0.7853981633974483 > +#define SQRT1_2 0.7071067811865475 > + > +/* 2^25+1, float has 24 significand bits > + * according to Single-precision floating-point format. */ > +#define TESTA8_FLT 33554433 > +/* 2^54+1, double has 53 significand bits > + * according to Double-precision floating-point format. */ > +#define TESTA8_DBL 18014398509481985 > + > +#define SD(a, b) t_double ((#a), (a), (b)); > +#define SF(a, b) t_float ((#a), (a), (b)); > + > +#define EPSILON_double __DBL_EPSILON__ > +#define EPSILON_float __FLT_EPSILON__ > +#define ABS_double fabs > +#define ABS_float fabsf > +#define SQRT_double sqrt > +#define SQRT_float sqrtf I suspect you should have those using the builtin versions of the above functions instead of the named ones. Thanks, Andrew Pinski > + > +extern void abort (void); > + > +#define TESTTYPE(TYPE) \ > +TYPE rsqrt_##TYPE (TYPE a) \ > +{ \ > + return 1.0/SQRT_##TYPE(a); \ > +} \ > + \ > +int equals_##TYPE (TYPE a, TYPE b) \ > +{ \ > + return (a == b || \ > + (isnan (a) && isnan (b)) || \ > + (ABS_##TYPE (a - b) < EPSILON_##TYPE)); \ > +} \ > + \ > +void t_##TYPE (const char *s, TYPE a, TYPE result) \ > +{ \ > + TYPE r = rsqrt_##TYPE (a); \ > + if (!equals_##TYPE (r, result)) \ > + { \ > + abort (); \ > + } \ > +} \ > + > +// printf ("Problem in %20s: %30.18A should be %30.18A\n", s, r, result); \ > + > +TESTTYPE(double) > +TESTTYPE(float) > + > +/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } > */ > +/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, > d\[0-9\]+" 3 } } */ > + > +/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } > */ > +/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, > s\[0-9\]+" 2 } } */ > + > +int main () > +{ > + SD( 1.0/256, 0X1.00000000000000P+4 ); > + SD( 1.0, 0X1.00000000000000P+0 ); > + SD( -1.0, NAN); > + SD( 11.0, 0X1.34BF63D1568260P-2 ); > + SD( 0.0, INFINITY); > + SD( INFINITY, 0X0.00000000000000P+0 ); > + SD( NAN, NAN); > + SD( -NAN, -NAN); > + SD( DBL_MAX, 0X1.00000000000010P-512); > + SD( DBL_MIN, 0X1.00000000000000P+511); > + SD( PI, 0X1.20DD750429B6D0P-1 ); > + SD( PI_4, 0X1.20DD750429B6D0P+0 ); > + SD( SQRT2, 0X1.AE89F995AD3AE0P-1 ); > + SD( SQRT1_2, 0X1.306FE0A31B7150P+0 ); > + SD( -PI, NAN); > + SD( -SQRT2, NAN); > + SD( TESTA8_DBL, 0X1.00000000000000P-27 ); > + > + SF( 1.0/256, 0X1.00000000000000P+4 ); > + SF( 1.0, 0X1.00000000000000P+0 ); > + SF( -1.0, NAN); > + SF( 11.0, 0X1.34BF6400000000P-2 ); > + SF( 0.0, INFINITY); > + SF( INFINITY, 0X0.00000000000000P+0 ); > + SF( NAN, NAN); > + SF( -NAN, -NAN); > + SF( FLT_MAX, 0X1.00000200000000P-64 ); > + SF( FLT_MIN, 0X1.00000000000000P+63 ); > + SF( PI, 0X1.20DD7400000000P-1 ); > + SF( PI_4, 0X1.20DD7400000000P+0 ); > + SF( SQRT2, 0X1.AE89FA00000000P-1 ); > + SF( SQRT1_2, 0X1.306FE000000000P+0 ); > + SF( -PI, NAN); > + SF( -SQRT2, NAN); > + SF( TESTA8_FLT, 0X1.6A09E600000000P-13 ); > + > +// With -ffast-math these return positive INF. > +// SD( -0.0, -INFINITY); > +// SF( -0.0, -INFINITY); > +// The reason here is that -ffast-math flushes to zero. > +// SD(DBL_MIN/256, 0X1.00000000000000P+515); > +// SF(FLT_MIN/256, 0X1.00000000000000P+67 ); > + > + return 0; > +} > -- > 1.9.1 >