Hi Benedikt, On 29/07/15 18:47, Benedikt Huber wrote:
* config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf. * config/aarch64/aarch64-opts.h: -mrecip has a default value depending on the core. * config/aarch64/aarch64-protos.h: Declare. * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and frsqrts. * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code in fast math mode. * config/aarch64/aarch64.md: Added enum entries. * config/aarch64/aarch64.opt: Added options -mrecip and -mlow-precision-recip-sqrt. * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans for frsqrte and frsqrts * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.
<snip>
+ /* Expand an expression EXP that calls a built-in function, with result going to TARGET if that's convenient. */ rtx @@ -1139,6 +1213,13 @@ aarch64_expand_builtin (tree exp, else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX) return aarch64_crc32_expand_builtin (fcode, exp, target); + if (fcode == AARCH64_BUILTIN_RSQRT_DF || + fcode == AARCH64_BUILTIN_RSQRT_SF || + fcode == AARCH64_BUILTIN_RSQRT_V2DF || + fcode == AARCH64_BUILTIN_RSQRT_V2SF || + fcode == AARCH64_BUILTIN_RSQRT_V4SF) + return aarch64_expand_builtin_rsqrt (fcode, exp, target); +
The style guidelines are to put the '||' at the start of the continuation line, like so:

  if (fcode == AARCH64_BUILTIN_RSQRT_DF
      || fcode == AARCH64_BUILTIN_RSQRT_SF
      || ...)
gcc_unreachable (); } @@ -1296,6 +1377,28 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in) return NULL_TREE; } +tree +aarch64_builtin_rsqrt (unsigned int fn, bool md_fn) +{ + if (md_fn) + { + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF]; + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF]; + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF]; + } + else + { + if (fn == BUILT_IN_SQRT) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF]; + if (fn == BUILT_IN_SQRTF) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF]; + } + return NULL_TREE; +} + #undef VAR1 #define VAR1(T, N, MAP, A) \ case AARCH64_SIMD_BUILTIN_##T##_##N##A: diff --git a/gcc/config/aarch64/aarch64-opts.h b/gcc/config/aarch64/aarch64-opts.h index 24bfd9f..75e9c67 100644 --- a/gcc/config/aarch64/aarch64-opts.h +++ b/gcc/config/aarch64/aarch64-opts.h @@ -64,4 +64,11 @@ enum aarch64_code_model { AARCH64_CMODEL_LARGE }; +/* Each core can have -mrecip enabled or disabled by default. */ +enum aarch64_mrecip { + AARCH64_MRECIP_OFF = 0, + AARCH64_MRECIP_ON, + AARCH64_MRECIP_DEFAULT, +}; + #endif diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 4062c27..dabe345 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -197,6 +197,7 @@ struct tune_params int min_div_recip_mul_sf; int min_div_recip_mul_df; unsigned int extra_tuning_flags; + bool mrecip_default_enabled; };
Please create a new field to put into the extra_tuning_flags. This involves adding an entry aarch64-tuning-flags.def. This way we avoid overpopulating the tuning structure with a multitude of on/off boolean options.
#define AARCH64_FUSION_PAIR(x, name, index) \ @@ -321,6 +322,8 @@ void aarch64_print_operand (FILE *, rtx, char); void aarch64_print_operand_address (FILE *, rtx); void aarch64_emit_call_insn (rtx); +void aarch64_emit_swrsqrt (rtx, rtx); + /* Initialize builtins for SIMD intrinsics. */ void init_aarch64_simd_builtins (void); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index b90f938..ac8e12d 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -353,6 +353,33 @@ [(set_attr "type" "neon_fp_mul_d_scalar_q")] ) +(define_insn "*rsqrte_<mode>" + [(set (match_operand:VALLF 0 "register_operand" "=w") + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")] + UNSPEC_RSQRTE))] + "TARGET_SIMD" + "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>" + [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")]) + +(define_insn "*rsqrts_<mode>" + [(set (match_operand:VALLF 0 "register_operand" "=w") + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w") + (match_operand:VALLF 2 "register_operand" "w")] + UNSPEC_RSQRTS))] + "TARGET_SIMD" + "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>" + [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")]) + +(define_expand "rsqrt_<mode>" + [(set (match_operand:VALLF 0 "register_operand" "=w") + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")] + UNSPEC_RSQRT))] + "TARGET_SIMD" +{ + aarch64_emit_swrsqrt (operands[0], operands[1]); + DONE; +})
Please add the number of operands to the pattern names, for example: "rsqrte_<mode>2". This follows the convention for standard/named patterns, where the trailing digit encodes the operand count (cf. "add<mode>3").
+ (define_insn "*aarch64_mul3_elt_to_64v2df" [(set (match_operand:DF 0 "register_operand" "=w") (mult:DF
<snip>
/* Support for fine-grained override of the tuning structures. */ @@ -6961,6 +6967,66 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED, return aarch64_tune_params.memmov_cost; } +extern tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn); + +static tree +aarch64_builtin_reciprocal (unsigned int fn, + bool md_fn, + bool) +{ + if (!flag_finite_math_only || + flag_trapping_math || + !flag_unsafe_math_optimizations || + optimize_size || + flag_mrecip == AARCH64_MRECIP_OFF || + (flag_mrecip == AARCH64_MRECIP_DEFAULT && + !aarch64_tune_params.mrecip_default_enabled))
Same comment as above on the placement of the '||': it should start the continuation line rather than end the previous one.
+ { + return NULL_TREE; + } + + return aarch64_builtin_rsqrt (fn, md_fn); +} + +void +aarch64_emit_swrsqrt (rtx dst, rtx src) +{ + enum machine_mode mode = GET_MODE (src); + gcc_assert ( + mode == SFmode || mode == V2SFmode || mode == V4SFmode || + mode == DFmode || mode == V2DFmode); + + rtx xsrc = gen_reg_rtx (mode); + emit_move_insn (xsrc, src); + rtx x0 = gen_reg_rtx (mode); + emit_set_insn (x0, + gen_rtx_UNSPEC (mode, gen_rtvec (1, xsrc), UNSPEC_RSQRTE)); + + bool double_mode = (mode == DFmode || mode == V2DFmode); + + int iterations = 2; + if (double_mode) + iterations = 3; + + if (flag_mrecip_low_precision_sqrt) + iterations--; + + for (int i = 0; i < iterations; ++i) + { + rtx x1 = gen_reg_rtx (mode); + rtx x2 = gen_reg_rtx (mode); + rtx x3 = gen_reg_rtx (mode); + emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0)); + emit_set_insn (x3, gen_rtx_UNSPEC ( + mode, gen_rtvec (2, xsrc, x2), UNSPEC_RSQRTS)); + emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3)); + x0 = x1;
You can probably avoid manually creating these RTXes and risking mis-constructing them (or someone modifying them by mistake in the future) by just calling the gen_* functions for the patterns you want to generate. For example, instead of doing: + emit_set_insn (x3, gen_rtx_UNSPEC ( + mode, gen_rtvec (2, xsrc, x2), UNSPEC_RSQRTS)); You can just do: emit_insn (gen_rsqrts_<mode>3 (x3, xsrc, x2)); (note the pattern takes two source operands, xsrc and x2, plus the destination). This will require you to rename the patterns in aarch64-simd.md to not have a '*' in their name, since '*'-prefixed patterns do not get gen_* functions.
+ + emit_move_insn (dst, x0); + return; +} + /* Return the number of instructions that can be issued per cycle. */ static int aarch64_sched_issue_rate (void) @@ -12099,6 +12165,9 @@ aarch64_unspec_may_trap_p (const_rtx x, unsigned flags) #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p +#undef TARGET_BUILTIN_RECIPROCAL +#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal + #undef TARGET_VECTOR_MODE_SUPPORTED_P #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 1e343fa..d7944b2 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -122,6 +122,9 @@ UNSPEC_VSTRUCTDUMMY UNSPEC_SP_SET UNSPEC_SP_TEST + UNSPEC_RSQRT + UNSPEC_RSQRTE + UNSPEC_RSQRTS ]) (define_c_enum "unspecv" [ diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt index 98ef9f6..581dc21 100644 --- a/gcc/config/aarch64/aarch64.opt +++ b/gcc/config/aarch64/aarch64.opt @@ -124,3 +124,11 @@ Enum(aarch64_abi) String(ilp32) Value(AARCH64_ABI_ILP32) EnumValue Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64) + +mrecip +Common Report Var(flag_mrecip) Optimization Init(AARCH64_MRECIP_DEFAULT) +Generate software reciprocal square root for better throughput. Default with fast-math.
s/fast-math/-ffast-math/ in the option documentation string. Also, this is not unconditionally the default with -ffast-math: with the AARCH64_MRECIP_DEFAULT setting the behaviour now depends on the tuning core, right? The documentation should say so. Thanks, Kyrill