On Mon, Apr 21, 2025 at 5:43 AM liuhongt <hongtao....@intel.com> wrote:
>
> From: "hongtao.liu" <hongtao....@intel.com>
>
> When FMA is available, the N-R step can be rewritten as
>
> a / b = (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a
>
> which generates 2 FMA instructions.[1]
>
> [1] https://bugs.llvm.org/show_bug.cgi?id=21385
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
>
> gcc/ChangeLog:
>
>         * config/i386/i386-expand.cc (ix86_emit_swdivsf): Generate 2
>         FMA instructions when TARGET_FMA.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/recip-vec-divf-fma.c: New test.

OK, with a small nit below.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386-expand.cc                | 44 ++++++++++++++-----
>  .../gcc.target/i386/recip-vec-divf-fma.c      | 12 +++++
>  2 files changed, 44 insertions(+), 12 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index cdfd94d3c73..4fffbfdd574 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -19256,8 +19256,6 @@ ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
>    e1 = gen_reg_rtx (mode);
>    x1 = gen_reg_rtx (mode);
>
> -  /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
> -
>    b = force_reg (mode, b);
>
>    /* x0 = rcp(b) estimate */
> @@ -19270,20 +19268,42 @@ ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
>      emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
>                                                 UNSPEC_RCP)));
>
> -  /* e0 = x0 * b */
> -  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
> +  unsigned vector_size = GET_MODE_SIZE (mode);
>
> -  /* e0 = x0 * e0 */
> -  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
> +  /* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a
> +     N-R step with 2 fma implementation.  */
> +  if (TARGET_FMA
> +      || (TARGET_AVX512F && vector_size == 64)
> +      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
> +    {
> +      /* e0 = x0 * a  */
> +      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
> +      /* e1 = e0 * b - a  */
> +      emit_insn (gen_rtx_SET (e1, gen_rtx_FMA (mode, e0, b,
> +                                              gen_rtx_NEG (mode, a))));
> +      /* res = - e1 * x0 + e0  */
> +      emit_insn (gen_rtx_SET (res, gen_rtx_FMA (mode,
> +                                              gen_rtx_NEG (mode, e1),
> +                                              x0, e0)));
> +    }
> +    /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
> +  else

Please put the above comment here, as it applies to the "else" branch.

> +    {
> +      /* e0 = x0 * b */
> +      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
>
> -  /* e1 = x0 + x0 */
> -  emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
> +      /* e1 = x0 + x0 */
> +      emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
>
> -  /* x1 = e1 - e0 */
> -  emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
> +      /* e0 = x0 * e0 */
> +      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
>
> -  /* res = a * x1 */
> -  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
> +      /* x1 = e1 - e0 */
> +      emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
> +
> +      /* res = a * x1 */
> +      emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
> +    }
>  }
>
>  /* Output code to perform a Newton-Rhapson approximation of a
> diff --git a/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c b/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c
> new file mode 100644
> index 00000000000..ad9e07b1eb6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -mfma -mavx2" } */
> +/* { dg-final { scan-assembler-times {(?n)vfn?m(add|sub)[1-3]*ps} 2 } } */
> +
> +typedef float v4sf __attribute__((vector_size(16)));
> +/* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a  */
> +
> +v4sf
> +foo (v4sf a, v4sf b)
> +{
> +    return a / b;
> +}
> --
> 2.34.1
>

Reply via email to