Re: [PATCH] 2015-07-31 Benedikt Huber Philipp Tomsich

pinskia Thu, 03 Sep 2015 09:17:19 -0700


> On Sep 3, 2015, at 11:58 PM, Sebastian Pop <seb...@gmail.com> wrote:
> 
> On Wed, Aug 26, 2015 at 11:58 AM, Benedikt Huber
> <benedikt.hu...@theobroma-systems.com> wrote:
>> ping
>> 
>> [PATCH v4][aarch64] Implemented reciprocal square root (rsqrt) estimation in 
>> -ffast-math
>> 
>> https://gcc.gnu.org/ml/gcc-patches/2015-07/msg02698.html
>> 
>>> On 31 Jul 2015, at 19:05, Benedikt Huber 
>>> <benedikt.hu...@theobroma-systems.com> wrote:
>>> 
>>>      * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
>>>      rsqrtf.
>>>      * config/aarch64/aarch64-opts.h: -mrecip has a default value
>>>      depending on the core.
>>>      * config/aarch64/aarch64-protos.h: Declare.
>>>      * config/aarch64/aarch64-simd.md: Matching expressions for
>>>      frsqrte and frsqrts.
>>>      * config/aarch64/aarch64-tuning-flags.def: Added
>>>      MRECIP_DEFAULT_ENABLED.
>>>      * config/aarch64/aarch64.c: New functions. Emit rsqrt
>>>      estimation code in fast math mode.
>>>      * config/aarch64/aarch64.md: Added enum entries.
>>>      * config/aarch64/aarch64.opt: Added options -mrecip and
>>>      -mlow-precision-recip-sqrt.
>>>      * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
>>>      for frsqrte and frsqrts
>>>      * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.
> 
> The patch looks good to me.
> You will need an ARM/AArch64 maintainer to approve your patch: +Ramana

Except it is missing comments before almost all new functions.  Yes aarch64 
backend does not follow that rule but that does not mean you should not either. 

Thanks,
Andrew

> 
> Thanks,
> Sebastian
> 
>>> 
>>> Signed-off-by: Philipp Tomsich <philipp.toms...@theobroma-systems.com>
>>> ---
>>> gcc/ChangeLog                                      |  21 ++++
>>> gcc/config/aarch64/aarch64-builtins.c              | 104 
>>> ++++++++++++++++++++
>>> gcc/config/aarch64/aarch64-opts.h                  |   7 ++
>>> gcc/config/aarch64/aarch64-protos.h                |   2 +
>>> gcc/config/aarch64/aarch64-simd.md                 |  27 ++++++
>>> gcc/config/aarch64/aarch64-tuning-flags.def        |   1 +
>>> gcc/config/aarch64/aarch64.c                       | 106 
>>> +++++++++++++++++++-
>>> gcc/config/aarch64/aarch64.md                      |   3 +
>>> gcc/config/aarch64/aarch64.opt                     |   8 ++
>>> gcc/doc/invoke.texi                                |  19 ++++
>>> gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c |  63 ++++++++++++
>>> gcc/testsuite/gcc.target/aarch64/rsqrt.c           | 107 
>>> +++++++++++++++++++++
>>> 12 files changed, 463 insertions(+), 5 deletions(-)
>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/rsqrt.c
>>> 
>>> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
>>> index 3432adb..3bf3098 100644
>>> --- a/gcc/ChangeLog
>>> +++ b/gcc/ChangeLog
>>> @@ -1,3 +1,24 @@
>>> +2015-07-31  Benedikt Huber  <benedikt.hu...@theobroma-systems.com>
>>> +         Philipp Tomsich  <philipp.toms...@theobroma-systems.com>
>>> +
>>> +     * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and
>>> +     rsqrtf.
>>> +     * config/aarch64/aarch64-opts.h: -mrecip has a default value
>>> +     depending on the core.
>>> +     * config/aarch64/aarch64-protos.h: Declare.
>>> +     * config/aarch64/aarch64-simd.md: Matching expressions for
>>> +     frsqrte and frsqrts.
>>> +     * config/aarch64/aarch64-tuning-flags.def: Added
>>> +     MRECIP_DEFAULT_ENABLED.
>>> +     * config/aarch64/aarch64.c: New functions. Emit rsqrt
>>> +     estimation code in fast math mode.
>>> +     * config/aarch64/aarch64.md: Added enum entries.
>>> +     * config/aarch64/aarch64.opt: Added options -mrecip and
>>> +     -mlow-precision-recip-sqrt.
>>> +     * testsuite/gcc.target/aarch64/rsqrt-asm-check.c: Assembly scans
>>> +     for frsqrte and frsqrts
>>> +     * testsuite/gcc.target/aarch64/rsqrt.c: Functional tests for rsqrt.
>>> +
>>> 2015-07-08  Jiong Wang  <jiong.w...@arm.com>
>>> 
>>>      * config/aarch64/aarch64.c (aarch64_unspec_may_trap_p): New function.
>>> diff --git a/gcc/config/aarch64/aarch64-builtins.c 
>>> b/gcc/config/aarch64/aarch64-builtins.c
>>> index b6c89b9..b4f443c 100644
>>> --- a/gcc/config/aarch64/aarch64-builtins.c
>>> +++ b/gcc/config/aarch64/aarch64-builtins.c
>>> @@ -335,6 +335,11 @@ enum aarch64_builtins
>>>  AARCH64_BUILTIN_GET_FPSR,
>>>  AARCH64_BUILTIN_SET_FPSR,
>>> 
>>> +  AARCH64_BUILTIN_RSQRT_DF,
>>> +  AARCH64_BUILTIN_RSQRT_SF,
>>> +  AARCH64_BUILTIN_RSQRT_V2DF,
>>> +  AARCH64_BUILTIN_RSQRT_V2SF,
>>> +  AARCH64_BUILTIN_RSQRT_V4SF,
>>>  AARCH64_SIMD_BUILTIN_BASE,
>>>  AARCH64_SIMD_BUILTIN_LANE_CHECK,
>>> #include "aarch64-simd-builtins.def"
>>> @@ -824,6 +829,42 @@ aarch64_init_crc32_builtins ()
>>> }
>>> 
>>> void
>>> +aarch64_add_builtin_rsqrt (void)

Here. 

>>> +{
>>> +  tree fndecl = NULL;
>>> +  tree ftype = NULL;
>>> +
>>> +  tree V2SF_type_node = build_vector_type (float_type_node, 2);
>>> +  tree V2DF_type_node = build_vector_type (double_type_node, 2);
>>> +  tree V4SF_type_node = build_vector_type (float_type_node, 4);
>>> +
>>> +  ftype = build_function_type_list (double_type_node, double_type_node, 
>>> NULL_TREE);
>>> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_df",
>>> +    ftype, AARCH64_BUILTIN_RSQRT_DF, BUILT_IN_MD, NULL, NULL_TREE);
>>> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF] = fndecl;
>>> +
>>> +  ftype = build_function_type_list (float_type_node, float_type_node, 
>>> NULL_TREE);
>>> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_sf",
>>> +    ftype, AARCH64_BUILTIN_RSQRT_SF, BUILT_IN_MD, NULL, NULL_TREE);
>>> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF] = fndecl;
>>> +
>>> +  ftype = build_function_type_list (V2DF_type_node, V2DF_type_node, 
>>> NULL_TREE);
>>> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2df",
>>> +    ftype, AARCH64_BUILTIN_RSQRT_V2DF, BUILT_IN_MD, NULL, NULL_TREE);
>>> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF] = fndecl;
>>> +
>>> +  ftype = build_function_type_list (V2SF_type_node, V2SF_type_node, 
>>> NULL_TREE);
>>> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v2sf",
>>> +    ftype, AARCH64_BUILTIN_RSQRT_V2SF, BUILT_IN_MD, NULL, NULL_TREE);
>>> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF] = fndecl;
>>> +
>>> +  ftype = build_function_type_list (V4SF_type_node, V4SF_type_node, 
>>> NULL_TREE);
>>> +  fndecl = add_builtin_function ("__builtin_aarch64_rsqrt_v4sf",
>>> +    ftype, AARCH64_BUILTIN_RSQRT_V4SF, BUILT_IN_MD, NULL, NULL_TREE);
>>> +  aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF] = fndecl;
>>> +}
>>> +
>>> +void
>>> aarch64_init_builtins (void)
>>> {
>>>  tree ftype_set_fpr
>>> @@ -848,6 +889,7 @@ aarch64_init_builtins (void)
>>>    aarch64_init_simd_builtins ();
>>>  if (TARGET_CRC32)
>>>    aarch64_init_crc32_builtins ();
>>> +  aarch64_add_builtin_rsqrt ();
>>> }
>>> 
>>> tree
>>> @@ -1092,6 +1134,39 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, 
>>> rtx target)
>>>  return target;
>>> }
>>> 
>>> +static rtx
>>> +aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
>>> +{

Likewise. 


>>> +  rtx pat;
>>> +  tree arg0 = CALL_EXPR_ARG (exp, 0);
>>> +  rtx op0 = expand_normal (arg0);
>>> +
>>> +  enum insn_code c;
>>> +
>>> +  switch (fcode)
>>> +    {
>>> +      case AARCH64_BUILTIN_RSQRT_DF:
>>> +        c = CODE_FOR_rsqrt_df2; break;
>>> +      case AARCH64_BUILTIN_RSQRT_SF:
>>> +        c = CODE_FOR_rsqrt_sf2; break;
>>> +      case AARCH64_BUILTIN_RSQRT_V2DF:
>>> +        c = CODE_FOR_rsqrt_v2df2; break;
>>> +      case AARCH64_BUILTIN_RSQRT_V2SF:
>>> +        c = CODE_FOR_rsqrt_v2sf2; break;
>>> +      case AARCH64_BUILTIN_RSQRT_V4SF:
>>> +        c = CODE_FOR_rsqrt_v4sf2; break;
>>> +       default: gcc_unreachable ();
>>> +    }
>>> +
>>> +  if (!target)
>>> +    target = gen_reg_rtx (GET_MODE (op0));
>>> +
>>> +  pat = GEN_FCN (c) (target, op0);
>>> +  emit_insn (pat);
>>> +
>>> +  return target;
>>> +}
>>> +
>>> /* Expand an expression EXP that calls a built-in function,
>>>   with result going to TARGET if that's convenient.  */
>>> rtx
>>> @@ -1139,6 +1214,13 @@ aarch64_expand_builtin (tree exp,
>>>  else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= 
>>> AARCH64_CRC32_BUILTIN_MAX)
>>>    return aarch64_crc32_expand_builtin (fcode, exp, target);
>>> 
>>> +  if (fcode == AARCH64_BUILTIN_RSQRT_DF
>>> +      || fcode == AARCH64_BUILTIN_RSQRT_SF
>>> +      || fcode == AARCH64_BUILTIN_RSQRT_V2DF
>>> +      || fcode == AARCH64_BUILTIN_RSQRT_V2SF
>>> +      || fcode == AARCH64_BUILTIN_RSQRT_V4SF)
>>> +    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
>>> +
>>>  gcc_unreachable ();
>>> }
>>> 
>>> @@ -1296,6 +1378,28 @@ aarch64_builtin_vectorized_function (tree fndecl, 
>>> tree type_out, tree type_in)
>>>  return NULL_TREE;
>>> }
>>> 
>>> +tree
>>> +aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
>>> +{

And here. 

>>> +  if (md_fn)
>>> +    {
>>> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
>>> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
>>> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
>>> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
>>> +      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
>>> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
>>> +    }
>>> +  else
>>> +    {
>>> +      if (fn == BUILT_IN_SQRT)
>>> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
>>> +      if (fn == BUILT_IN_SQRTF)
>>> +        return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
>>> +    }
>>> +  return NULL_TREE;
>>> +}
>>> +
>>> #undef VAR1
>>> #define VAR1(T, N, MAP, A) \
>>>  case AARCH64_SIMD_BUILTIN_##T##_##N##A:
>>> diff --git a/gcc/config/aarch64/aarch64-opts.h 
>>> b/gcc/config/aarch64/aarch64-opts.h
>>> index 24bfd9f..75e9c67 100644
>>> --- a/gcc/config/aarch64/aarch64-opts.h
>>> +++ b/gcc/config/aarch64/aarch64-opts.h
>>> @@ -64,4 +64,11 @@ enum aarch64_code_model {
>>>  AARCH64_CMODEL_LARGE
>>> };
>>> 
>>> +/* Each core can have -mrecip enabled or disabled by default. */
>>> +enum aarch64_mrecip {
>>> +  AARCH64_MRECIP_OFF = 0,
>>> +  AARCH64_MRECIP_ON,
>>> +  AARCH64_MRECIP_DEFAULT,
>>> +};
>>> +
>>> #endif
>>> diff --git a/gcc/config/aarch64/aarch64-protos.h 
>>> b/gcc/config/aarch64/aarch64-protos.h
>>> index 4062c27..8b8a389 100644
>>> --- a/gcc/config/aarch64/aarch64-protos.h
>>> +++ b/gcc/config/aarch64/aarch64-protos.h
>>> @@ -321,6 +321,8 @@ void aarch64_print_operand (FILE *, rtx, char);
>>> void aarch64_print_operand_address (FILE *, rtx);
>>> void aarch64_emit_call_insn (rtx);
>>> 
>>> +void aarch64_emit_swrsqrt (rtx, rtx);
>>> +
>>> /* Initialize builtins for SIMD intrinsics.  */
>>> void init_aarch64_simd_builtins (void);
>>> 
>>> diff --git a/gcc/config/aarch64/aarch64-simd.md 
>>> b/gcc/config/aarch64/aarch64-simd.md
>>> index b90f938..ae81731 100644
>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>> @@ -353,6 +353,33 @@
>>>  [(set_attr "type" "neon_fp_mul_d_scalar_q")]
>>> )
>>> 
>>> +(define_insn "rsqrte_<mode>2"
>>> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
>>> +     (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
>>> +                  UNSPEC_RSQRTE))]
>>> +  "TARGET_SIMD"
>>> +  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
>>> +  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
>>> +
>>> +(define_insn "rsqrts_<mode>3"
>>> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
>>> +     (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
>>> +               (match_operand:VALLF 2 "register_operand" "w")]
>>> +                  UNSPEC_RSQRTS))]
>>> +  "TARGET_SIMD"
>>> +  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
>>> +  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
>>> +
>>> +(define_expand "rsqrt_<mode>2"
>>> +  [(set (match_operand:VALLF 0 "register_operand" "=w")
>>> +     (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
>>> +                  UNSPEC_RSQRT))]
>>> +  "TARGET_SIMD"
>>> +{
>>> +  aarch64_emit_swrsqrt (operands[0], operands[1]);
>>> +  DONE;
>>> +})
>>> +
>>> (define_insn "*aarch64_mul3_elt_to_64v2df"
>>>  [(set (match_operand:DF 0 "register_operand" "=w")
>>>     (mult:DF
>>> diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def 
>>> b/gcc/config/aarch64/aarch64-tuning-flags.def
>>> index 01aaca8..97dbf00 100644
>>> --- a/gcc/config/aarch64/aarch64-tuning-flags.def
>>> +++ b/gcc/config/aarch64/aarch64-tuning-flags.def
>>> @@ -31,4 +31,5 @@
>>>     flags.  */
>>> 
>>> AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS, 0)
>>> +AARCH64_EXTRA_TUNING_OPTION ("mrecip_default_enabled", 
>>> MRECIP_DEFAULT_ENABLED, 1)
>>> 
>>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>>> index 6c13a078..76c0eee 100644
>>> --- a/gcc/config/aarch64/aarch64.c
>>> +++ b/gcc/config/aarch64/aarch64.c
>>> @@ -364,7 +364,7 @@ static const struct tune_params generic_tunings =
>>>  1,  /* vec_reassoc_width.  */
>>>  2,  /* min_div_recip_mul_sf.  */
>>>  2,  /* min_div_recip_mul_df.  */
>>> -  (AARCH64_EXTRA_TUNE_NONE)  /* tune_flags.  */
>>> +  (AARCH64_EXTRA_TUNE_MRECIP_DEFAULT_ENABLED)        /* tune_flags.  */
>>> };
>>> 
>>> static const struct tune_params cortexa53_tunings =
>>> @@ -386,7 +386,7 @@ static const struct tune_params cortexa53_tunings =
>>>  1,  /* vec_reassoc_width.  */
>>>  2,  /* min_div_recip_mul_sf.  */
>>>  2,  /* min_div_recip_mul_df.  */
>>> -  (AARCH64_EXTRA_TUNE_NONE)  /* tune_flags.  */
>>> +  (AARCH64_EXTRA_TUNE_MRECIP_DEFAULT_ENABLED)        /* tune_flags.  */
>>> };
>>> 
>>> static const struct tune_params cortexa57_tunings =
>>> @@ -408,7 +408,8 @@ static const struct tune_params cortexa57_tunings =
>>>  1,  /* vec_reassoc_width.  */
>>>  2,  /* min_div_recip_mul_sf.  */
>>>  2,  /* min_div_recip_mul_df.  */
>>> -  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)       /* tune_flags.  */
>>> +  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS        /* tune_flags.  */
>>> +   | AARCH64_EXTRA_TUNE_MRECIP_DEFAULT_ENABLED)
>>> };
>>> 
>>> static const struct tune_params cortexa72_tunings =
>>> @@ -430,7 +431,7 @@ static const struct tune_params cortexa72_tunings =
>>>  1,  /* vec_reassoc_width.  */
>>>  2,  /* min_div_recip_mul_sf.  */
>>>  2,  /* min_div_recip_mul_df.  */
>>> -  (AARCH64_EXTRA_TUNE_NONE)  /* tune_flags.  */
>>> +  (AARCH64_EXTRA_TUNE_MRECIP_DEFAULT_ENABLED)        /* tune_flags.  */
>>> };
>>> 
>>> static const struct tune_params thunderx_tunings =
>>> @@ -472,7 +473,7 @@ static const struct tune_params xgene1_tunings =
>>>  1,  /* vec_reassoc_width.  */
>>>  2,  /* min_div_recip_mul_sf.  */
>>>  2,  /* min_div_recip_mul_df.  */
>>> -  (AARCH64_EXTRA_TUNE_NONE)  /* tune_flags.  */
>>> +  (AARCH64_EXTRA_TUNE_MRECIP_DEFAULT_ENABLED)        /* tune_flags.  */
>>> };
>>> 
>>> /* Support for fine-grained override of the tuning structures.  */
>>> @@ -6961,6 +6962,98 @@ aarch64_memory_move_cost (machine_mode mode 
>>> ATTRIBUTE_UNUSED,
>>>  return aarch64_tune_params.memmov_cost;
>>> }
>>> 
>>> +extern tree aarch64_builtin_rsqrt (unsigned int fn, bool md_fn);

Why isn't that in a header file instead of here?

>>> +
>>> +static tree
>>> +aarch64_builtin_reciprocal (unsigned int fn,
>>> +                            bool md_fn,
>>> +                            bool)

And here. 

>>> +{
>>> +  if (!flag_finite_math_only
>>> +      || flag_trapping_math
>>> +      || !flag_unsafe_math_optimizations
>>> +      || optimize_size
>>> +      || flag_mrecip == AARCH64_MRECIP_OFF
>>> +      || (flag_mrecip == AARCH64_MRECIP_DEFAULT
>>> +          && !(aarch64_tune_params.extra_tuning_flags
>>> +               & AARCH64_EXTRA_TUNE_MRECIP_DEFAULT_ENABLED)))
>>> +  {
>>> +    return NULL_TREE;
>>> +  }
>>> +
>>> +  return aarch64_builtin_rsqrt (fn, md_fn);
>>> +}
>>> +
>>> +typedef rtx (*rsqrte_type) (rtx, rtx);
>>> +
>>> +rsqrte_type get_rsqrte_type (enum machine_mode mode)
>>> +{

And another one. 


>>> +  switch (mode)
>>> +  {
>>> +    case DFmode:   return gen_rsqrte_df2;
>>> +    case SFmode:   return gen_rsqrte_sf2;
>>> +    case V2DFmode: return gen_rsqrte_v2df2;
>>> +    case V2SFmode: return gen_rsqrte_v2sf2;
>>> +    case V4SFmode: return gen_rsqrte_v4sf2;
>>> +    default: gcc_unreachable ();
>>> +  }
>>> +}
>>> +
>>> +typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
>>> +
>>> +rsqrts_type get_rsqrts_type (enum machine_mode mode)

And another one. 

>>> +{
>>> +  switch (mode)
>>> +  {
>>> +    case DFmode:   return gen_rsqrts_df3;
>>> +    case SFmode:   return gen_rsqrts_sf3;
>>> +    case V2DFmode: return gen_rsqrts_v2df3;
>>> +    case V2SFmode: return gen_rsqrts_v2sf3;
>>> +    case V4SFmode: return gen_rsqrts_v4sf3;
>>> +    default: gcc_unreachable ();
>>> +  }
>>> +}
>>> +
>>> +void
>>> +aarch64_emit_swrsqrt (rtx dst, rtx src)
>>> +{

One more. 

>>> +  enum machine_mode mode = GET_MODE (src);
>>> +  gcc_assert (
>>> +    mode == SFmode || mode == V2SFmode || mode == V4SFmode ||
>>> +    mode == DFmode || mode == V2DFmode);
>>> +
>>> +  rtx xsrc = gen_reg_rtx (mode);
>>> +  emit_move_insn (xsrc, src);
>>> +  rtx x0 = gen_reg_rtx (mode);
>>> +
>>> +  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
>>> +
>>> +  bool double_mode = (mode == DFmode || mode == V2DFmode);
>>> +
>>> +  int iterations = 2;
>>> +  if (double_mode)
>>> +    iterations = 3;
>>> +
>>> +  if (flag_mrecip_low_precision_sqrt)
>>> +    iterations--;
>>> +
>>> +  for (int i = 0; i < iterations; ++i)
>>> +    {
>>> +      rtx x1 = gen_reg_rtx (mode);
>>> +      rtx x2 = gen_reg_rtx (mode);
>>> +      rtx x3 = gen_reg_rtx (mode);
>>> +      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
>>> +
>>> +      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
>>> +
>>> +      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
>>> +      x0 = x1;
>>> +    }
>>> +
>>> +  emit_move_insn (dst, x0);
>>> +  return;
>>> +}
>>> +
>>> /* Return the number of instructions that can be issued per cycle.  */
>>> static int
>>> aarch64_sched_issue_rate (void)
>>> @@ -12099,6 +12192,9 @@ aarch64_unspec_may_trap_p (const_rtx x, unsigned 
>>> flags)
>>> #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
>>> #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
>>> 
>>> +#undef TARGET_BUILTIN_RECIPROCAL
>>> +#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
>>> +
>>> #undef TARGET_VECTOR_MODE_SUPPORTED_P
>>> #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
>>> 
>>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>>> index 1e343fa..d7944b2 100644
>>> --- a/gcc/config/aarch64/aarch64.md
>>> +++ b/gcc/config/aarch64/aarch64.md
>>> @@ -122,6 +122,9 @@
>>>    UNSPEC_VSTRUCTDUMMY
>>>    UNSPEC_SP_SET
>>>    UNSPEC_SP_TEST
>>> +    UNSPEC_RSQRT
>>> +    UNSPEC_RSQRTE
>>> +    UNSPEC_RSQRTS
>>> ])
>>> 
>>> (define_c_enum "unspecv" [
>>> diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
>>> index 98ef9f6..7921b85 100644
>>> --- a/gcc/config/aarch64/aarch64.opt
>>> +++ b/gcc/config/aarch64/aarch64.opt
>>> @@ -124,3 +124,11 @@ Enum(aarch64_abi) String(ilp32) 
>>> Value(AARCH64_ABI_ILP32)
>>> 
>>> EnumValue
>>> Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
>>> +
>>> +mrecip
>>> +Common Report Var(flag_mrecip) Optimization Init(AARCH64_MRECIP_DEFAULT)
>>> +Generate software reciprocal square root for better throughput.
>>> +
>>> +mlow-precision-recip-sqrt
>>> +Common Var(flag_mrecip_low_precision_sqrt) Optimization
>>> +Run fewer approximation steps to reduce latency and precision.
>>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>>> index b28e5d6..bd922a3 100644
>>> --- a/gcc/doc/invoke.texi
>>> +++ b/gcc/doc/invoke.texi
>>> @@ -515,6 +515,8 @@ Objective-C and Objective-C++ Dialects}.
>>> -mtls-dialect=desc  -mtls-dialect=traditional @gol
>>> -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
>>> -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
>>> +-mrecip -mno-recip @gol
>>> +-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
>>> -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
>>> 
>>> @emph{Adapteva Epiphany Options}
>>> @@ -12426,6 +12428,23 @@ Enable or disable the workaround for the ARM 
>>> Cortex-A53 erratum number 843419.
>>> This erratum workaround is made at link time and this will only pass the
>>> corresponding flag to the linker.
>>> 
>>> +@item -mrecip
>>> +@item -mno-recip
>>> +@opindex mrecip
>>> +@opindex mno-recip
>>> +This option enables use of the
>>> +reciprocal square root estimate instructions with additional
>>> +Newton-Raphson steps to increase precision instead of doing a square root 
>>> and
>>> +divide for floating-point arguments.
>>> +It can only be used together with @option{-ffast-math}.
>>> +
>>> +@item -mlow-precision-recip-sqrt
>>> +@item -mno-low-precision-recip-sqrt
>>> +@opindex -mlow-precision-recip-sqrt
>>> +@opindex -mno-low-precision-recip-sqrt
>>> +The square root estimate uses two steps instead of three for 
>>> double-precision,
>>> +and one step instead of two for single-precision. Thus reducing latency 
>>> and precision.
>>> +
>>> @item -march=@var{name}
>>> @opindex march
>>> Specify the name of the target architecture, optionally suffixed by one or
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c 
>>> b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
>>> new file mode 100644
>>> index 0000000..d6cfe11
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt-asm-check.c
>>> @@ -0,0 +1,63 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O3 --save-temps -fverbose-asm -ffast-math -mrecip" } */
>>> +
>>> +#include <math.h>
>>> +
>>> +#define sqrt_float   __builtin_sqrtf
>>> +#define sqrt_double  __builtin_sqrt
>>> +
>>> +#define TESTTYPE(TYPE)                                          \
>>> +typedef struct {                                                \
>>> +  TYPE a;                                                       \
>>> +  TYPE b;                                                       \
>>> +  TYPE c;                                                       \
>>> +  TYPE d;                                                       \
>>> +} s4_##TYPE;                                                    \
>>> +                                                                \
>>> +typedef struct {                                                \
>>> +  TYPE a;                                                       \
>>> +  TYPE b;                                                       \
>>> +} s2_##TYPE;                                                    \
>>> +                                                                \
>>> +s4_##TYPE rsqrtv4_##TYPE (s4_##TYPE i)                          \
>>> +{                                                               \
>>> +  s4_##TYPE o;                                                  \
>>> +  o.a = 1.0 / sqrt_##TYPE (i.a);                                \
>>> +  o.b = 1.0 / sqrt_##TYPE (i.b);                                \
>>> +  o.c = 1.0 / sqrt_##TYPE (i.c);                                \
>>> +  o.d = 1.0 / sqrt_##TYPE (i.d);                                \
>>> +  return o;                                                     \
>>> +}                                                               \
>>> +                                                                \
>>> +s2_##TYPE rsqrtv2_##TYPE (s2_##TYPE i)                          \
>>> +{                                                               \
>>> +  s2_##TYPE o;                                                  \
>>> +  o.a = 1.0 / sqrt_##TYPE (i.a);                                \
>>> +  o.b = 1.0 / sqrt_##TYPE (i.b);                                \
>>> +  return o;                                                     \
>>> +}                                                               \
>>> +                                                                \
>>> +TYPE rsqrt_##TYPE (TYPE i)                                      \
>>> +{                                                               \
>>> +  return 1.0 / sqrt_##TYPE (i);                                 \
>>> +}                                                               \
>>> +                                                                \
>>> +
>>> +TESTTYPE(double)
>>> +TESTTYPE(float)
>>> +
>>> +/* { dg-final { scan-assembler-times "frsqrte\\td\[0-9\]+, d\[0-9\]+" 1 } 
>>> } */
>>> +/* { dg-final { scan-assembler-times "frsqrts\\td\[0-9\]+, d\[0-9\]+, 
>>> d\[0-9\]+" 3 } } */
>>> +
>>> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2d, 
>>> v\[0-9\]+.2d" 3 } } */
>>> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2d, 
>>> v\[0-9\]+.2d, v\[0-9\]+.2d" 9 } } */
>>> +
>>> +
>>> +/* { dg-final { scan-assembler-times "frsqrte\\ts\[0-9\]+, s\[0-9\]+" 1 } 
>>> } */
>>> +/* { dg-final { scan-assembler-times "frsqrts\\ts\[0-9\]+, s\[0-9\]+, 
>>> s\[0-9\]+" 2 } } */
>>> +
>>> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.4s, 
>>> v\[0-9\]+.4s" 1 } } */
>>> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.4s, 
>>> v\[0-9\]+.4s, v\[0-9\]+.4s" 2 } } */
>>> +
>>> +/* { dg-final { scan-assembler-times "frsqrte\\tv\[0-9\]+.2s, 
>>> v\[0-9\]+.2s" 1 } } */
>>> +/* { dg-final { scan-assembler-times "frsqrts\\tv\[0-9\]+.2s, 
>>> v\[0-9\]+.2s, v\[0-9\]+.2s" 2 } } */
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/rsqrt.c 
>>> b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
>>> new file mode 100644
>>> index 0000000..4a5c008
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/rsqrt.c
>>> @@ -0,0 +1,107 @@
>>> +/* { dg-do run } */
>>> +/* { dg-options "-O3 --save-temps -fverbose-asm -ffast-math -mrecip" } */
>>> +
>>> +#include <math.h>
>>> +#include <stdio.h>
>>> +
>>> +#include <float.h>
>>> +
>>> +#define PI    3.141592653589793
>>> +#define SQRT2 1.4142135623730951
>>> +
>>> +#define PI_4 0.7853981633974483
>>> +#define SQRT1_2 0.7071067811865475
>>> +
>>> +/* 2^25+1, float has 24 significand bits
>>> + *       according to Single-precision floating-point format.  */
>>> +#define TESTA8_FLT 33554433
>>> +/* 2^54+1, double has 53 significand bits
>>> + *       according to Double-precision floating-point format.  */
>>> +#define TESTA8_DBL 18014398509481985
>>> +
>>> +#define SD(a, b) t_double ((#a), (a), (b));
>>> +#define SF(a, b) t_float ((#a), (a), (b));
>>> +
>>> +#define EPSILON_double __DBL_EPSILON__
>>> +#define EPSILON_float __FLT_EPSILON__
>>> +#define ABS_double __builtin_fabs
>>> +#define ABS_float __builtin_fabsf
>>> +#define SQRT_double __builtin_sqrt
>>> +#define SQRT_float __builtin_sqrtf
>>> +
>>> +extern void abort (void);
>>> +
>>> +#define TESTTYPE(TYPE)                                                     
>>> \
>>> +TYPE rsqrt_##TYPE (TYPE a)                                                 
>>> \
>>> +{                                                                          
>>> \
>>> +  return 1.0/SQRT_##TYPE(a);                                               
>>> \
>>> +}                                                                          
>>> \
>>> +                                                                           
>>> \
>>> +int equals_##TYPE (TYPE a, TYPE b)                                         
>>> \
>>> +{                                                                          
>>> \
>>> +  return (a == b ||                                                        
>>> \
>>> +          (isnan (a) && isnan (b)) ||                                      
>>> \
>>> +          (ABS_##TYPE (a - b) < EPSILON_##TYPE));                          
>>> \
>>> +}                                                                          
>>> \
>>> +                                                                           
>>> \
>>> +void t_##TYPE (const char *s, TYPE a, TYPE result)                         
>>> \
>>> +{                                                                          
>>> \
>>> +  TYPE r = rsqrt_##TYPE (a);                                               
>>> \
>>> +  if (!equals_##TYPE (r, result))                                          
>>> \
>>> +  {                                                                        
>>> \
>>> +    abort ();                                                              
>>> \
>>> +  }                                                                        
>>> \
>>> +}                                                                          
>>> \
>>> +
>>> +//  printf ("Problem in %20s: %30.18A should be %30.18A\n", s, r, result); 
>>> \
>>> +
>>> +TESTTYPE(double)
>>> +TESTTYPE(float)
>>> +
>>> +int main ()
>>> +{
>>> +  SD(    1.0/256, 0X1.00000000000000P+4  );
>>> +  SD(        1.0, 0X1.00000000000000P+0  );
>>> +  SD(       -1.0,                     NAN);
>>> +  SD(       11.0, 0X1.34BF63D1568260P-2  );
>>> +  SD(        0.0,                INFINITY);
>>> +  SD(   INFINITY, 0X0.00000000000000P+0  );
>>> +  SD(        NAN,                     NAN);
>>> +  SD(       -NAN,                    -NAN);
>>> +  SD(    DBL_MAX, 0X1.00000000000010P-512);
>>> +  SD(    DBL_MIN, 0X1.00000000000000P+511);
>>> +  SD(         PI, 0X1.20DD750429B6D0P-1  );
>>> +  SD(       PI_4, 0X1.20DD750429B6D0P+0  );
>>> +  SD(      SQRT2, 0X1.AE89F995AD3AE0P-1  );
>>> +  SD(    SQRT1_2, 0X1.306FE0A31B7150P+0  );
>>> +  SD(        -PI,                     NAN);
>>> +  SD(     -SQRT2,                     NAN);
>>> +  SD( TESTA8_DBL, 0X1.00000000000000P-27 );
>>> +
>>> +  SF(    1.0/256, 0X1.00000000000000P+4  );
>>> +  SF(        1.0, 0X1.00000000000000P+0  );
>>> +  SF(       -1.0,                     NAN);
>>> +  SF(       11.0, 0X1.34BF6400000000P-2  );
>>> +  SF(        0.0,                INFINITY);
>>> +  SF(   INFINITY, 0X0.00000000000000P+0  );
>>> +  SF(        NAN,                     NAN);
>>> +  SF(       -NAN,                    -NAN);
>>> +  SF(    FLT_MAX, 0X1.00000200000000P-64 );
>>> +  SF(    FLT_MIN, 0X1.00000000000000P+63 );
>>> +  SF(         PI, 0X1.20DD7400000000P-1  );
>>> +  SF(       PI_4, 0X1.20DD7400000000P+0  );
>>> +  SF(      SQRT2, 0X1.AE89FA00000000P-1  );
>>> +  SF(    SQRT1_2, 0X1.306FE000000000P+0  );
>>> +  SF(        -PI,                     NAN);
>>> +  SF(     -SQRT2,                     NAN);
>>> +  SF( TESTA8_FLT, 0X1.6A09E600000000P-13 );
>>> +
>>> +//   With -ffast-math these return positive INF.
>>> +//   SD(       -0.0,               -INFINITY);
>>> +//   SF(       -0.0,               -INFINITY);
>>> +//   The reason here is that -ffast-math flushes to zero.
>>> +//   SD(DBL_MIN/256, 0X1.00000000000000P+515);
>>> +//   SF(FLT_MIN/256, 0X1.00000000000000P+67 );
>>> +
>>> +  return 0;
>>> +}
>>> --
>>> 1.9.1
>>> 
>>
Re: [PATCH] 2015-07-31 Benedikt Huber Philipp Tomsich

Reply via email to