From ba7b2badff17dccaec200fea195fbca84e74dc23 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <ktkachov@nvidia.com>
Date: Fri, 2 Aug 2024 06:48:47 -0700
Subject: [PATCH] aarch64: Reduce FP reassociation width for Neoverse V2 and
 set AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA

The FP reassociation width for Neoverse V2 has been set to 6 since its
introduction, and I guess it was empirically tuned.  But since
AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA was added, the tree reassociation
pass seems to be more deliberate in forming FMAs, and when that flag is
used it seems to evaluate the FMA vs non-FMA reassociation widths more
accurately.
According to the Neoverse V2 SWOG the core has a throughput of 4 for
most FP operations, so the value 6 is not accurate anyway.
Also, the SWOG does state that FMADD operations are pipelined and the
results can be forwarded from FP multiplies to the accumulation operands
of FMADD instructions, which seems to be what
AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA expresses.

This patch sets the fp_reassoc_width field to 4 and enables
AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA for -mcpu=neoverse-v2.

On SPEC2017 fprate I see the following changes on a Grace system:
503.bwaves_r	0.16%
507.cactuBSSN_r	-0.32%
508.namd_r	3.04%
510.parest_r	0.00%
511.povray_r	0.78%
519.lbm_r 	0.35%
521.wrf_r	0.69%
526.blender_r	-0.53%
527.cam4_r	0.84%
538.imagick_r	0.00%
544.nab_r	-0.97%
549.fotonik3d_r	-0.45%
554.roms_r	0.97%
Geomean	        0.35%

with -Ofast -mcpu=grace -flto.

So this is a slight overall improvement, with a meaningful improvement in
508.namd_r.

I think other tunings in aarch64 should look into
AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA as well, but I'll leave the
benchmarking to someone else.

Signed-off-by: Kyrylo Tkachov <ktkachov@nvidia.com>

gcc/ChangeLog:

	* config/aarch64/tuning_models/neoversev2.h (fp_reassoc_width):
	Set to 4.
	(tune_flags): Add AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA.
---
 gcc/config/aarch64/tuning_models/neoversev2.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
index bd259a37e9c..fcd36773b1d 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -229,7 +229,7 @@ static const struct tune_params neoversev2_tunings =
   "4",		/* jump_align.  */
   "32:16",	/* loop_align.  */
   3,	/* int_reassoc_width.  */
-  6,	/* fp_reassoc_width.  */
+  4,	/* fp_reassoc_width.  */
   4,	/* fma_reassoc_width.  */
   3,	/* vec_reassoc_width.  */
   2,	/* min_div_recip_mul_sf.  */
@@ -240,10 +240,11 @@ static const struct tune_params neoversev2_tunings =
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
    | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),	/* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
+   | AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),	/* tune_flags.  */
   &generic_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS	   /* stp_policy_model.  */
 };
 
-#endif /* GCC_AARCH64_H_NEOVERSEV2.  */
\ No newline at end of file
+#endif /* GCC_AARCH64_H_NEOVERSEV2.  */
-- 
2.43.2

