On Mon, Apr 21, 2025 at 5:43 AM liuhongt <hongtao....@intel.com> wrote: > > From: "hongtao.liu" <hongtao....@intel.com> > > When FMA is available, the N-R step can be rewritten with > > a / b = (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a > > which generates 2 FMA instructions.[1] > > [1] https://bugs.llvm.org/show_bug.cgi?id=21385 > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ok for trunk? > > > gcc/ChangeLog: > > * config/i386/i386-expand.cc (ix86_emit_swdivsf): Generate 2 > FMA instructions when TARGET_FMA. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/recip-vec-divf-fma.c: New test.
OK, with a small nit below. Thanks, Uros. > --- > gcc/config/i386/i386-expand.cc | 44 ++++++++++++++----- > .../gcc.target/i386/recip-vec-divf-fma.c | 12 +++++ > 2 files changed, 44 insertions(+), 12 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > index cdfd94d3c73..4fffbfdd574 100644 > --- a/gcc/config/i386/i386-expand.cc > +++ b/gcc/config/i386/i386-expand.cc > @@ -19256,8 +19256,6 @@ ix86_emit_swdivsf (rtx res, rtx a, rtx b, > machine_mode mode) > e1 = gen_reg_rtx (mode); > x1 = gen_reg_rtx (mode); > > - /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */ > - > b = force_reg (mode, b); > > /* x0 = rcp(b) estimate */ > @@ -19270,20 +19268,42 @@ ix86_emit_swdivsf (rtx res, rtx a, rtx b, > machine_mode mode) > emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), > UNSPEC_RCP))); > > - /* e0 = x0 * b */ > - emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b))); > + unsigned vector_size = GET_MODE_SIZE (mode); > > - /* e0 = x0 * e0 */ > - emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0))); > + /* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a > + N-R step with 2 fma implementation. */ > + if (TARGET_FMA > + || (TARGET_AVX512F && vector_size == 64) > + || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16))) > + { > + /* e0 = x0 * a */ > + emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a))); > + /* e1 = e0 * b - a */ > + emit_insn (gen_rtx_SET (e1, gen_rtx_FMA (mode, e0, b, > + gen_rtx_NEG (mode, a)))); > + /* res = - e1 * x0 + e0 */ > + emit_insn (gen_rtx_SET (res, gen_rtx_FMA (mode, > + gen_rtx_NEG (mode, e1), > + x0, e0))); > + } > + /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */ > + else Please put the above comment here, as it applies to the "else" branch. 
> + { > + /* e0 = x0 * b */ > + emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b))); > > - /* e1 = x0 + x0 */ > - emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0))); > + /* e1 = x0 + x0 */ > + emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0))); > > - /* x1 = e1 - e0 */ > - emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0))); > + /* e0 = x0 * e0 */ > + emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0))); > > - /* res = a * x1 */ > - emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1))); > + /* x1 = e1 - e0 */ > + emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0))); > + > + /* res = a * x1 */ > + emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1))); > + } > } > > /* Output code to perform a Newton-Rhapson approximation of a > diff --git a/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c > b/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c > new file mode 100644 > index 00000000000..ad9e07b1eb6 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/recip-vec-divf-fma.c > @@ -0,0 +1,12 @@ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -mfma -mavx2" } */ > +/* { dg-final { scan-assembler-times {(?n)vfn?m(add|sub)[1-3]*ps} 2 } } */ > + > +typedef float v4sf __attribute__((vector_size(16))); > +/* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a */ > + > +v4sf > +foo (v4sf a, v4sf b) > +{ > + return a / b; > +} > -- > 2.34.1 >