* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
* config/aarch64/aarch64-protos.h: Declare.
* config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
  frsqrts.
* config/aarch64/aarch64.c: New functions.  Emit rsqrt estimation code
  in fast math mode.
* config/aarch64/aarch64.md: Added enum entry.
* testsuite/gcc.target/aarch64/rsqrt.c: Tests for single and double.
---
 gcc/ChangeLog                            |  11 ++++
 gcc/config/aarch64/aarch64-builtins.c    |  67 +++++++++++++++++++
 gcc/config/aarch64/aarch64-protos.h      |   3 +
 gcc/config/aarch64/aarch64-simd.md       |  31 +++++++++
 gcc/config/aarch64/aarch64.c             |  65 ++++++++++++++++++
 gcc/config/aarch64/aarch64.md            |   3 +
 gcc/testsuite/gcc.target/aarch64/rsqrt.c | 109 +++++++++++++++++++++++++++++++
 7 files changed, 289 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index c9b156f..690ebba 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,14 @@
+2015-06-15  Benedikt Huber  <benedikt.hu...@theobroma-systems.com>
+
+	* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
+	* config/aarch64/aarch64-protos.h: Declare.
+	* config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
+	frsqrts.
+	* config/aarch64/aarch64.c: New functions.  Emit rsqrt estimation code
+	in fast math mode.
+	* config/aarch64/aarch64.md: Added enum entry.
+	* testsuite/gcc.target/aarch64/rsqrt.c: Tests for single and double.
+
 2015-06-14  Richard Sandiford  <richard.sandif...@arm.com>
 
 	* rtl.h (classify_insn): Declare.
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index f7a39ec..484bb84 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -342,6 +342,8 @@ enum aarch64_builtins
   AARCH64_BUILTIN_GET_FPSR,
   AARCH64_BUILTIN_SET_FPSR,
 
+  AARCH64_BUILTIN_RSQRT,
+  AARCH64_BUILTIN_RSQRTF,
   AARCH64_SIMD_BUILTIN_BASE,
   AARCH64_SIMD_BUILTIN_LANE_CHECK,
 #include "aarch64-simd-builtins.def"
@@ -831,6 +833,29 @@ aarch64_init_crc32_builtins ()
 }
 
+/* Register __builtin_aarch64_rsqrt (double) and __builtin_aarch64_rsqrtf
+   (float) and record their decls in aarch64_builtin_decls.  */
+
 void
+aarch64_add_builtin_rsqrt (void)
+{
+  tree ftype
+    = build_function_type_list (double_type_node, double_type_node, NULL_TREE);
+  tree fndecl
+    = add_builtin_function ("__builtin_aarch64_rsqrt", ftype,
+			    AARCH64_BUILTIN_RSQRT, BUILT_IN_MD, NULL,
+			    NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT] = fndecl;
+
+  tree ftypef
+    = build_function_type_list (float_type_node, float_type_node, NULL_TREE);
+  fndecl
+    = add_builtin_function ("__builtin_aarch64_rsqrtf", ftypef,
+			    AARCH64_BUILTIN_RSQRTF, BUILT_IN_MD, NULL,
+			    NULL_TREE);
+  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRTF] = fndecl;
+}
+
+void
 aarch64_init_builtins (void)
 {
   tree ftype_set_fpr
@@ -855,6 +880,7 @@ aarch64_init_builtins (void)
     aarch64_init_simd_builtins ();
   if (TARGET_CRC32)
     aarch64_init_crc32_builtins ();
+  aarch64_add_builtin_rsqrt ();
 }
 
 tree
@@ -1099,6 +1125,32 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
   return target;
 }
 
+/* Expand a call EXP to __builtin_aarch64_rsqrt or __builtin_aarch64_rsqrtf,
+   selected by FCODE.  TARGET is where the caller would like the result; the
+   rtx that actually holds it is returned.  */
+
+static rtx
+aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
+{
+  enum insn_code icode = (fcode == AARCH64_BUILTIN_RSQRTF
+			  ? CODE_FOR_rsqrtsf : CODE_FOR_rsqrtdf);
+  machine_mode mode = insn_data[icode].operand[0].mode;
+  rtx op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
+
+  /* TARGET is only a hint: it may be missing or unsuitable.  */
+  if (!target
+      || GET_MODE (target) != mode
+      || !(*insn_data[icode].operand[0].predicate) (target, mode))
+    target = gen_reg_rtx (mode);
+
+  if (!(*insn_data[icode].operand[1].predicate) (op0, mode))
+    op0 = copy_to_mode_reg (mode, op0);
+
+  emit_insn (GEN_FCN (icode) (target, op0));
+
+  return target;
+}
+
 /* Expand an expression EXP that calls a built-in function,
    with result going to TARGET if that's convenient.  */
 rtx
@@ -1146,6 +1198,9 @@ aarch64_expand_builtin (tree exp,
   else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
     return aarch64_crc32_expand_builtin (fcode, exp, target);
 
+  if (fcode == AARCH64_BUILTIN_RSQRT || fcode == AARCH64_BUILTIN_RSQRTF)
+    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
+
   gcc_unreachable ();
 }
 
@@ -1303,6 +1358,18 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
   return NULL_TREE;
 }
 
+/* Return the decl of the reciprocal square root builtin: the float variant
+   if IS_FLOAT, the double variant otherwise.  */
+
+tree
+aarch64_builtin_rsqrt (bool is_float)
+{
+  if (is_float)
+    return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRTF];
+
+  return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT];
+}
+
 #undef VAR1
 #define VAR1(T, N, MAP, A) \
   case AARCH64_SIMD_BUILTIN_##T##_##N##A:
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 965a11b..4f1c8ce 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -270,6 +270,9 @@ void aarch64_print_operand (FILE *, rtx, char);
 void aarch64_print_operand_address (FILE *, rtx);
 void aarch64_emit_call_insn (rtx);
 
+void aarch64_emit_swrsqrt (rtx, rtx);
+tree aarch64_builtin_rsqrt (bool);
+
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index b90f938..266800a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -353,6 +353,37 @@
   [(set_attr "type" "neon_fp_mul_d_scalar_q")]
 )
 
+;; Reciprocal square root estimate.
+(define_insn "*rsqrte_simd"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+		      UNSPEC_RSQRTE))]
+  "TARGET_SIMD"
+  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+
+;; Reciprocal square root step.
+(define_insn "*rsqrts_simd"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+		       (match_operand:VALLF 2 "register_operand" "w")]
+		      UNSPEC_RSQRTS))]
+  "TARGET_SIMD"
+  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
+
+;; Scalar reciprocal square root, expanded in software from the estimate
+;; and step patterns above, which are only available with TARGET_SIMD.
+(define_expand "rsqrt<mode>"
+  [(set (match_operand:GPF 0 "register_operand")
+	(unspec:GPF [(match_operand:GPF 1 "register_operand")]
+		    UNSPEC_RSQRT))]
+  "TARGET_SIMD"
+{
+  aarch64_emit_swrsqrt (operands[0], operands[1]);
+  DONE;
+})
+
 (define_insn "*aarch64_mul3_elt_to_64v2df"
   [(set (match_operand:DF 0 "register_operand" "=w")
      (mult:DF
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index c3c2795..281f0b2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6816,6 +6816,68 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
   return aarch64_tune_params->memmov_cost;
 }
 
+/* Implement TARGET_BUILTIN_RECIPROCAL.  Return the decl of the builtin that
+   computes the reciprocal square root for the sqrt builtin FN, or NULL_TREE
+   if FN has no such counterpart.  The result is only an approximation, so
+   it is offered under -ffast-math only, and the expansion relies on the
+   FRSQRTE/FRSQRTS patterns, which are guarded by TARGET_SIMD.  */
+
+static tree
+aarch64_builtin_reciprocal (unsigned int fn,
+			    bool md_fn,
+			    bool sqrt ATTRIBUTE_UNUSED)
+{
+  /* FN is a BUILT_IN_* code only when MD_FN is false.  */
+  if (md_fn)
+    return NULL_TREE;
+
+  if (!TARGET_SIMD || !fast_math_flags_set_p (&global_options))
+    return NULL_TREE;
+
+  if (fn == BUILT_IN_SQRTF)
+    return aarch64_builtin_rsqrt (true);
+  if (fn == BUILT_IN_SQRT)
+    return aarch64_builtin_rsqrt (false);
+
+  return NULL_TREE;
+}
+
+/* Emit code that stores an approximation of 1/sqrt (SRC) in DST.  Start
+   from the FRSQRTE estimate X0 and refine it with Newton-Raphson steps
+   X(n+1) = X(n) * FRSQRTS (SRC, X(n) * X(n)).  */
+
+void
+aarch64_emit_swrsqrt (rtx dst, rtx src)
+{
+  machine_mode mode = GET_MODE (src);
+  rtx xsrc = gen_reg_rtx (mode);
+  emit_set_insn (xsrc, src);
+
+  rtx x0 = gen_reg_rtx (mode);
+  emit_set_insn (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, xsrc),
+				     UNSPEC_RSQRTE));
+
+  /* Double precision needs one more refinement step than single
+     precision.  GET_MODE_INNER handles the vector modes.  */
+  bool double_mode = (mode == DFmode || GET_MODE_INNER (mode) == DFmode);
+  int iterations = double_mode ? 3 : 2;
+
+  for (int i = 0; i < iterations; ++i)
+    {
+      rtx x1 = gen_reg_rtx (mode);
+      rtx x2 = gen_reg_rtx (mode);
+      rtx x3 = gen_reg_rtx (mode);
+
+      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+      emit_set_insn (x3, gen_rtx_UNSPEC (mode, gen_rtvec (2, xsrc, x2),
+					 UNSPEC_RSQRTS));
+      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
+      x0 = x1;
+    }
+
+  emit_move_insn (dst, x0);
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)
@@ -11747,6 +11809,9 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
 
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
 #undef TARGET_VECTOR_MODE_SUPPORTED_P
 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 11123d6..7272d05 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -120,6 +120,9 @@
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
     UNSPEC_SP_TEST
+    UNSPEC_RSQRT
+    UNSPEC_RSQRTE
+    UNSPEC_RSQRTS
 ])
 
 (define_c_enum "unspecv" [
diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt.c b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
new file mode 100644
index 0000000..6607483
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
@@ -0,0 +1,109 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-inline --save-temps -ffast-math" } */
+
+#include <float.h>
+#include <math.h>
+
+#define PI 3.141592653589793
+#define SQRT2 1.4142135623730951
+
+#define PI_4 0.7853981633974483
+#define SQRT1_2 0.7071067811865475
+
+/* 2^25+1; float has 24 significand bits.  */
+#define TESTA8_FLT 33554433
+/* 2^54+1; double has 53 significand bits.  */
+#define TESTA8_DBL 18014398509481985
+
+#define SD(a, b) t_double ((#a), (a), (b));
+#define SF(a, b) t_float ((#a), (a), (b));
+
+#define EPSILON_double __DBL_EPSILON__
+#define EPSILON_float __FLT_EPSILON__
+#define ABS_double fabs
+#define ABS_float fabsf
+#define SQRT_double sqrt
+#define SQRT_float sqrtf
+
+extern void abort (void);
+
+/* Define rsqrt_TYPE (the code under test), equals_TYPE (comparison that
+   treats two NaNs as equal) and t_TYPE (abort unless rsqrt_TYPE (A) matches
+   RESULT).  S names the test case; it is only there to aid debugging.  */
+#define TESTTYPE(TYPE) \
+TYPE rsqrt_##TYPE (TYPE a) \
+{ \
+  return (TYPE) 1.0 / SQRT_##TYPE (a); \
+} \
+\
+int equals_##TYPE (TYPE a, TYPE b) \
+{ \
+  return (a == b \
+	  || (isnan (a) && isnan (b)) \
+	  || (ABS_##TYPE (a - b) < EPSILON_##TYPE)); \
+} \
+\
+void t_##TYPE (const char *s, TYPE a, TYPE result) \
+{ \
+  TYPE r = rsqrt_##TYPE (a); \
+  if (!equals_##TYPE (r, result)) \
+    abort (); \
+}
+
+TESTTYPE(double)
+TESTTYPE(float)
+
+/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, d\[0-9\]+" 3 } } */
+
+/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, s\[0-9\]+" 2 } } */
+
+int main (void)
+{
+  SD( 1.0/256, 0X1.00000000000000P+4 );
+  SD( 1.0, 0X1.00000000000000P+0 );
+  SD( -1.0, NAN);
+  SD( 11.0, 0X1.34BF63D1568260P-2 );
+  SD( 0.0, INFINITY);
+  SD( INFINITY, 0X0.00000000000000P+0 );
+  SD( NAN, NAN);
+  SD( -NAN, -NAN);
+  SD( DBL_MAX, 0X1.00000000000010P-512);
+  SD( DBL_MIN, 0X1.00000000000000P+511);
+  SD( PI, 0X1.20DD750429B6D0P-1 );
+  SD( PI_4, 0X1.20DD750429B6D0P+0 );
+  SD( SQRT2, 0X1.AE89F995AD3AE0P-1 );
+  SD( SQRT1_2, 0X1.306FE0A31B7150P+0 );
+  SD( -PI, NAN);
+  SD( -SQRT2, NAN);
+  SD( TESTA8_DBL, 0X1.00000000000000P-27 );
+
+  SF( 1.0/256, 0X1.00000000000000P+4 );
+  SF( 1.0, 0X1.00000000000000P+0 );
+  SF( -1.0, NAN);
+  SF( 11.0, 0X1.34BF6400000000P-2 );
+  SF( 0.0, INFINITY);
+  SF( INFINITY, 0X0.00000000000000P+0 );
+  SF( NAN, NAN);
+  SF( -NAN, -NAN);
+  SF( FLT_MAX, 0X1.00000200000000P-64 );
+  SF( FLT_MIN, 0X1.00000000000000P+63 );
+  SF( PI, 0X1.20DD7400000000P-1 );
+  SF( PI_4, 0X1.20DD7400000000P+0 );
+  SF( SQRT2, 0X1.AE89FA00000000P-1 );
+  SF( SQRT1_2, 0X1.306FE000000000P+0 );
+  SF( -PI, NAN);
+  SF( -SQRT2, NAN);
+  SF( TESTA8_FLT, 0X1.6A09E600000000P-13 );
+
+  /* Cases that are deliberately not checked.
+     With -ffast-math these return positive INF:
+       SD( -0.0, -INFINITY);
+       SF( -0.0, -INFINITY);
+     The reason here is that -ffast-math flushes to zero:
+       SD(DBL_MIN/256, 0X1.00000000000000P+515);
+       SF(FLT_MIN/256, 0X1.00000000000000P+67 );  */
+
+  return 0;
+}
-- 
1.9.1