max intrinsics

Kyrill Tkachov Wed, 18 Jul 2018 07:03:33 -0700


On 18/07/18 14:26, Thomas König wrote:

Hi Kyrlll,

Am 18.07.2018 um 13:17 schrieb Kyrill Tkachov <kyrylo.tkac...@foss.arm.com>:

Thomas, Janne, would this relaxation of NaN handling be acceptable given the 
benefits
mentioned above? If so, what would be the recommended adjustment to the 
nan_1.f90 test?

I would be a bit careful about changing behavior in such a major way. What 
would the results with NaN and infinity then be, with or without optimization? 
Would the results be consistent with min(nan,num) vs min(num,nan)? Would they 
be consistent with the new IEEE standard?

In general, I think that min(nan,num) should be nan and that our current 
behavior is not the best.

Does anybody have dats points on how this is handled by other compilers?

Oh, and if anything is changed, then compile and runtime behavior should always 
be the same.


Thanks, that makes it clearer what behaviour is accceptable.

So this v3 patch follows Richard Sandiford's suggested approach of emitting 
IFN_FMIN/FMAX
when dealing with floating-point values and NaN handling is important and the 
target
supports the IFN_FMIN/FMAX. Otherwise the current explicit comparison sequence 
is emitted.
For integer types and -ffast-math floating-point it will emit MIN/MAX_EXPR.

With this patch the nan_1.f90 behaviour is preserved on all targets, we get the 
optimal
sequence on aarch64 and on x86_64 we avoid the function call, with no changes 
in code generation.

This gives the performance improvement on 521.wrf on aarch64 and leaves it 
unchanged on x86_64.

I'm hoping this addresses all the concerns raised in this thread:
* The NaN-handling behaviour is unchanged on all platforms.
* The fast inline sequence is emitted where it is available.
* No calls to library fmin*/fmax* are emitted where there were none.
* MIN/MAX_EXPR sequence are emitted where possible.

Is this acceptable?

Thanks,
Kyrill

2018-07-18  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    * trans-intrinsic.c: (gfc_conv_intrinsic_minmax): Emit MIN_MAX_EXPR
    or IFN_FMIN/FMAX sequence to calculate the min/max when possible.

2018-07-18  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>

    * gfortran.dg/max_fmaxl_aarch64.f90: New test.
    * gfortran.dg/min_fminl_aarch64.f90: Likewise.
    * gfortran.dg/minmax_integer.f90: Likewise.

diff --git a/gcc/fortran/trans-intrinsic.c b/gcc/fortran/trans-intrinsic.c
index d306e3a5a6209c1621d91f99ffc366acecd9c3d0..6f5700f2a421d2a735d77c4c4ec0c4c9c058e727 100644
--- a/gcc/fortran/trans-intrinsic.c
+++ b/gcc/fortran/trans-intrinsic.c
@@ -31,6 +31,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "trans.h"
 #include "stringpool.h"
 #include "fold-const.h"
+#include "internal-fn.h"
 #include "tree-nested.h"
 #include "stor-layout.h"
 #include "toplev.h"	/* For rest_of_decl_compilation.  */
@@ -3874,14 +3875,15 @@ gfc_conv_intrinsic_ttynam (gfc_se * se, gfc_expr * expr)
     minmax (a1, a2, a3, ...)
     {
       mvar = a1;
-      if (a2 .op. mvar || isnan (mvar))
-        mvar = a2;
-      if (a3 .op. mvar || isnan (mvar))
-        mvar = a3;
+      mvar = COMP (mvar, a2)
+      mvar = COMP (mvar, a3)
       ...
-      return mvar
+      return mvar;
     }
- */
+    Where COMP is MIN/MAX_EXPR for integral types or when we don't
+    care about NaNs, or IFN_FMIN/MAX when the target has support for
+    fast NaN-honouring min/max.  When neither holds expand a sequence
+    of explicit comparisons.  */
 
 /* TODO: Mismatching types can occur when specific names are used.
    These should be handled during resolution.  */
@@ -3891,7 +3893,6 @@ gfc_conv_intrinsic_minmax (gfc_se * se, gfc_expr * expr, enum tree_code op)
   tree tmp;
   tree mvar;
   tree val;
-  tree thencase;
   tree *args;
   tree type;
   gfc_actual_arglist *argexpr;
@@ -3912,55 +3913,77 @@ gfc_conv_intrinsic_minmax (gfc_se * se, gfc_expr * expr, enum tree_code op)
 
   mvar = gfc_create_var (type, "M");
   gfc_add_modify (&se->pre, mvar, args[0]);
-  for (i = 1, argexpr = argexpr->next; i < nargs; i++)
-    {
-      tree cond, isnan;
 
+  internal_fn ifn = op == GT_EXPR ? IFN_FMAX : IFN_FMIN;
+
+  for (i = 1, argexpr = argexpr->next; i < nargs; i++, argexpr = argexpr->next)
+    {
+      tree cond = NULL_TREE;
       val = args[i];
 
       /* Handle absent optional arguments by ignoring the comparison.  */
       if (argexpr->expr->expr_type == EXPR_VARIABLE
 	  && argexpr->expr->symtree->n.sym->attr.optional
 	  && TREE_CODE (val) == INDIRECT_REF)
-	cond = fold_build2_loc (input_location,
+	{
+	  cond = fold_build2_loc (input_location,
 				NE_EXPR, logical_type_node,
 				TREE_OPERAND (val, 0),
 			build_int_cst (TREE_TYPE (TREE_OPERAND (val, 0)), 0));
-      else
-      {
-	cond = NULL_TREE;
-
+	}
+      else if (!VAR_P (val) && !TREE_CONSTANT (val))
 	/* Only evaluate the argument once.  */
-	if (!VAR_P (val) && !TREE_CONSTANT (val))
-	  val = gfc_evaluate_now (val, &se->pre);
-      }
+	val = gfc_evaluate_now (val, &se->pre);
 
-      thencase = build2_v (MODIFY_EXPR, mvar, convert (type, val));
+      tree calc;
+      /* If we dealing with integral types or we don't care about NaNs
+	 just do a MIN/MAX_EXPR.  */
+      if (!HONOR_NANS (type) && !HONOR_SIGNED_ZEROS (type))
+	{
+
+	  tree_code code = op == GT_EXPR ? MAX_EXPR : MIN_EXPR;
+	  calc = fold_build2_loc (input_location, code, type,
+				  convert (type, val), mvar);
+	  tmp = build2_v (MODIFY_EXPR, mvar, calc);
 
-      tmp = fold_build2_loc (input_location, op, logical_type_node,
-			     convert (type, val), mvar);
+	}
+      /* If we care about NaNs and we have internal functions available for
+	 fmin/fmax to perform the comparison, use those.  */
+      else if (SCALAR_FLOAT_TYPE_P (type)
+	      && direct_internal_fn_supported_p (ifn, type, OPTIMIZE_FOR_SPEED))
+	{
+	  calc = build_call_expr_internal_loc (input_location, ifn, type,
+				      2, mvar, convert (type, val));
+	  tmp = build2_v (MODIFY_EXPR, mvar, calc);
 
-      /* FIXME: When the IEEE_ARITHMETIC module is implemented, the call to
-	 __builtin_isnan might be made dependent on that module being loaded,
-	 to help performance of programs that don't rely on IEEE semantics.  */
-      if (FLOAT_TYPE_P (TREE_TYPE (mvar)))
+	}
+      /* Otherwise expand to:
+	mvar = a1;
+	if (a2 .op. mvar || isnan (mvar))
+	  mvar = a2;
+	if (a3 .op. mvar || isnan (mvar))
+	  mvar = a3;
+	...  */
+      else
 	{
-	  isnan = build_call_expr_loc (input_location,
-				       builtin_decl_explicit (BUILT_IN_ISNAN),
-				       1, mvar);
+	  tree isnan = build_call_expr_loc (input_location,
+					builtin_decl_explicit (BUILT_IN_ISNAN),
+					1, mvar);
+	  tmp = fold_build2_loc (input_location, op, logical_type_node,
+				 convert (type, val), mvar);
+
 	  tmp = fold_build2_loc (input_location, TRUTH_OR_EXPR,
-				 logical_type_node, tmp,
-				 fold_convert (logical_type_node, isnan));
+				  logical_type_node, tmp,
+				  fold_convert (logical_type_node, isnan));
+	  tmp = build3_v (COND_EXPR, tmp,
+			  build2_v (MODIFY_EXPR, mvar, convert (type, val)),
+			  build_empty_stmt (input_location));
 	}
-      tmp = build3_v (COND_EXPR, tmp, thencase,
-		      build_empty_stmt (input_location));
 
       if (cond != NULL_TREE)
 	tmp = build3_v (COND_EXPR, cond, tmp,
 			build_empty_stmt (input_location));
-
       gfc_add_expr_to_block (&se->pre, tmp);
-      argexpr = argexpr->next;
     }
   se->expr = mvar;
 }
diff --git a/gcc/testsuite/gfortran.dg/max_fmaxl_aarch64.f90 b/gcc/testsuite/gfortran.dg/max_fmaxl_aarch64.f90
new file mode 100644
index 0000000000000000000000000000000000000000..8c8ea063e5d0718dc829c1f5574c5b46040e6786
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/max_fmaxl_aarch64.f90
@@ -0,0 +1,9 @@
+! { dg-do compile { target aarch64*-*-* } }
+! { dg-options "-O2 -fdump-tree-optimized" }
+
+subroutine fool (a, b, c, d, e, f, g, h)
+  real (kind=16) :: a, b, c, d, e, f, g, h
+  a = max (a, b, c, d, e, f, g, h)
+end subroutine
+
+! { dg-final { scan-tree-dump-times "__builtin_fmaxl " 7 "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/min_fminl_aarch64.f90 b/gcc/testsuite/gfortran.dg/min_fminl_aarch64.f90
new file mode 100644
index 0000000000000000000000000000000000000000..92368917fb48e0c468a16d080ab3a9ac842e01a7
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/min_fminl_aarch64.f90
@@ -0,0 +1,9 @@
+! { dg-do compile { target aarch64*-*-* } }
+! { dg-options "-O2 -fdump-tree-optimized" }
+
+subroutine fool (a, b, c, d, e, f, g, h)
+  real (kind=16) :: a, b, c, d, e, f, g, h
+  a = min (a, b, c, d, e, f, g, h)
+end subroutine
+
+! { dg-final { scan-tree-dump-times "__builtin_fminl " 7 "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/minmax_integer.f90 b/gcc/testsuite/gfortran.dg/minmax_integer.f90
new file mode 100644
index 0000000000000000000000000000000000000000..5b6be38c7055ce4e8620cf75ec7d8a182436b24f
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/minmax_integer.f90
@@ -0,0 +1,15 @@
+! { dg-do compile }
+! { dg-options "-O2 -fdump-tree-optimized" }
+
+subroutine foo (a, b, c, d, e, f, g, h)
+  integer (kind=4) :: a, b, c, d, e, f, g, h
+  a = min (a, b, c, d, e, f, g, h)
+end subroutine
+
+subroutine foof (a, b, c, d, e, f, g, h)
+  integer (kind=4) :: a, b, c, d, e, f, g, h
+  a = max (a, b, c, d, e, f, g, h)
+end subroutine
+
+! { dg-final { scan-tree-dump-times "MIN_EXPR" 7 "optimized" } }
+! { dg-final { scan-tree-dump-times "MAX_EXPR" 7 "optimized" } }

Re: [PATCH][Fortran][v2] Use MIN/MAX_EXPR for min/max intrinsics

Reply via email to