From 388efd0216fae95f645fdf4e14c180a539282416 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <spop@nvidia.com>
Date: Fri, 25 Jul 2025 17:55:03 +0200
Subject: [PATCH] tree-parloops: Enable runtime thread detection with
 -ftree-parallelize-loops

This patch adds runtime thread count detection to auto-parallelization.
The bare -ftree-parallelize-loops option (without =n) generates
parallelized loops without a fixed thread count, deferring that decision
to program execution time, where it is controlled by the OMP_NUM_THREADS
environment variable.
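
A minimal usage sketch (the loop mirrors the new test case; the file and
function names and the thread count below are illustrative):

    /* example.c */
    #define N 1000
    int a[N], b[N], c[N];

    void
    add_arrays (void)
    {
      /* Independent iterations: eligible for auto-parallelization.  */
      for (int i = 0; i < N; i++)
        a[i] = b[i] + c[i];
    }

    int
    main (void)
    {
      add_arrays ();
      return 0;
    }

Build it and choose the thread count at execution time; libgomp and
-pthread are linked in automatically:

    $ gcc -O2 -ftree-parallelize-loops example.c
    $ OMP_NUM_THREADS=8 ./a.out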

Bootstrapped and regression tested on aarch64-linux.  Compiled SPEC HPC pot3d
https://www.spec.org/hpc2021/docs/benchmarks/628.pot3d_s.html with
-ftree-parallelize-loops and tested it both with OMP_NUM_THREADS unset in the
environment and with OMP_NUM_THREADS set to different values.

gcc/ChangeLog:

	* doc/invoke.texi (ftree-parallelize-loops): Update.
	* common.opt (ftree-parallelize-loops): Add an alias that maps to
	the special value INT_MAX for runtime thread detection.
	* builtins.def (DEF_GOMP_BUILTIN): Enable OpenMP builtins
	when flag_tree_parallelize_loops == INT_MAX.
	* tree-parloops.cc (create_parallel_loop): Use INT_MAX for runtime
	detection.  Call gimple_build_omp_parallel without building an
	OMP_CLAUSE_NUM_THREADS clause.
	(gen_parallel_loop): For auto-detection, use a conservative
	estimate of 2 threads.
	(parallelize_loops): Same.
	* gcc.cc (LINK_SPEC, GOMP_SELF_SPECS): Add automatic libgomp
	and pthread linking for -ftree-parallelize-loops.

gcc/testsuite/ChangeLog:

	* gcc.dg/autopar/runtime-auto.c: New test.

Signed-off-by: Sebastian Pop <spop@nvidia.com>
---
 gcc/common.opt                              |  4 ++
 gcc/doc/invoke.texi                         | 18 +++++--
 gcc/gcc.cc                                  |  4 +-
 gcc/testsuite/gcc.dg/autopar/runtime-auto.c | 53 +++++++++++++++++++++
 gcc/tree-parloops.cc                        | 36 ++++++++------
 5 files changed, 96 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/autopar/runtime-auto.c

diff --git a/gcc/common.opt b/gcc/common.opt
index f6d93dc05fb..6b3c8481d48 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3303,6 +3303,10 @@ ftree-parallelize-loops=
 Common Joined RejectNegative UInteger Var(flag_tree_parallelize_loops) Init(1) Optimization
 -ftree-parallelize-loops=<number>	Enable automatic parallelization of loops.
 
+ftree-parallelize-loops
+Common Alias(ftree-parallelize-loops=,2147483647,1)
+Enable automatic parallelization of loops.
+
 ftree-phiprop
 Common Var(flag_tree_phiprop) Init(1) Optimization
 Enable hoisting loads from conditional pointers.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 492ca291432..d7c8b5cff0b 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -659,7 +659,7 @@ Objective-C and Objective-C++ Dialects}.
 -ftree-phiprop  -ftree-loop-distribution  -ftree-loop-distribute-patterns
 -ftree-loop-ivcanon  -ftree-loop-linear  -ftree-loop-optimize
 -ftree-loop-vectorize
--ftree-parallelize-loops=@var{n}  -ftree-pre  -ftree-partial-pre  -ftree-pta
+-ftree-parallelize-loops[=@var{n}]  -ftree-pre  -ftree-partial-pre  -ftree-pta
 -ftree-reassoc  -ftree-scev-cprop  -ftree-sink  -ftree-slsr  -ftree-sra
 -ftree-switch-conversion  -ftree-tail-merge
 -ftree-ter  -ftree-vectorize  -ftree-vrp  -ftrivial-auto-var-init
@@ -14663,8 +14663,9 @@ variable merging and induction variable elimination) on trees.
 Enabled by default at @option{-O1} and higher.
 
 @opindex ftree-parallelize-loops
-@item -ftree-parallelize-loops=n
-Parallelize loops, i.e., split their iteration space to run in n threads.
+@item -ftree-parallelize-loops
+@itemx -ftree-parallelize-loops=@var{n}
+Parallelize loops, i.e., split their iteration space to run in multiple threads.
 This is only possible for loops whose iterations are independent
 and can be arbitrarily reordered.  The optimization is only
 profitable on multiprocessor machines, for loops that are CPU-intensive,
@@ -14672,6 +14673,17 @@ rather than constrained e.g.@: by memory bandwidth.  This option
 implies @option{-pthread}, and thus is only supported on targets
 that have support for @option{-pthread}.
 
+When a positive value @var{n} is specified, the number of threads is fixed
+at compile time and cannot be changed afterwards.  The compiler generates
+@code{#pragma omp parallel num_threads(@var{n})}.
+
+When used without @code{=@var{n}} (i.e.@: @option{-ftree-parallelize-loops}
+alone), the number of threads is determined at program execution time via
+the @env{OMP_NUM_THREADS} environment variable.  If @env{OMP_NUM_THREADS} is
+not set, the OpenMP runtime automatically detects the number of available
+processors and uses that value.  This makes it possible to build binaries
+that adapt to different hardware configurations without recompilation.
+
 @opindex ftree-pta
 @item -ftree-pta
 Perform function-local points-to analysis on trees.  This flag is
diff --git a/gcc/gcc.cc b/gcc/gcc.cc
index 8da821e92ac..b66ae73a10a 100644
--- a/gcc/gcc.cc
+++ b/gcc/gcc.cc
@@ -1168,7 +1168,7 @@ proper position among the other output files.  */
     %{s} %{t} %{u*} %{z} %{Z} %{!nostdlib:%{!r:%{!nostartfiles:%S}}} \
     %{static|no-pie|static-pie:} %@{L*} %(link_libgcc) " \
     VTABLE_VERIFICATION_SPEC " " SANITIZER_EARLY_SPEC " %o "" \
-    %{fopenacc|fopenmp|%:gt(%{ftree-parallelize-loops=*:%*} 1):\
+    %{fopenacc|fopenmp|%:gt(%{ftree-parallelize-loops=*:%*} 1)|ftree-parallelize-loops:\
 	%:include(libgomp.spec)%(link_gomp)}\
     %{fgnu-tm:%:include(libitm.spec)%(link_itm)}\
     " STACK_SPLIT_SPEC "\
@@ -1349,7 +1349,7 @@ static const char *const multilib_defaults_raw[] = MULTILIB_DEFAULTS;
    for targets that use different start files and suchlike.  */
 #ifndef GOMP_SELF_SPECS
 #define GOMP_SELF_SPECS \
-  "%{fopenacc|fopenmp|%:gt(%{ftree-parallelize-loops=*:%*} 1): " \
+  "%{fopenacc|fopenmp|%:gt(%{ftree-parallelize-loops=*:%*} 1)|ftree-parallelize-loops: " \
   "-pthread}"
 #endif
 
diff --git a/gcc/testsuite/gcc.dg/autopar/runtime-auto.c b/gcc/testsuite/gcc.dg/autopar/runtime-auto.c
new file mode 100644
index 00000000000..c1a3131634d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/autopar/runtime-auto.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-parallelize-loops -fdump-tree-parloops2-details" } */
+
+void abort (void);
+
+#define N 1000
+
+int a[N], b[N], c[N];
+
+void
+test_parallel_loop (void)
+{
+  int i;
+
+  /* This loop should be auto-parallelized when -ftree-parallelize-loops
+     (without =number) is used for runtime thread detection via OMP_NUM_THREADS.  */
+  for (i = 0; i < N; i++)
+    a[i] = b[i] + c[i];
+}
+
+int
+main (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      b[i] = i;
+      c[i] = i * 2;
+    }
+
+  test_parallel_loop ();
+
+  for (i = 0; i < N; i++)
+    {
+      if (a[i] != b[i] + c[i])
+	abort ();
+    }
+
+  return 0;
+}
+
+/* Check that the loop is parallelized with runtime thread detection.  */
+/* { dg-final { scan-tree-dump "parallelizing" "parloops2" } } */
+
+/* Check that "#pragma omp parallel" is generated.  */
+/* { dg-final { scan-tree-dump "pragma omp parallel" "parloops2" } } */
+
+/* Check that instead of generating a num_threads(x) clause, the compiler
+   calls "__builtin_omp_get_num_threads" so that the number of threads is
+   determined at program execution time.  */
+/* { dg-final { scan-tree-dump "__builtin_omp_get_num_threads" "parloops2" } } */
+
diff --git a/gcc/tree-parloops.cc b/gcc/tree-parloops.cc
index 666c6a1f376..736182868dc 100644
--- a/gcc/tree-parloops.cc
+++ b/gcc/tree-parloops.cc
@@ -2601,10 +2601,19 @@ create_parallel_loop (class loop *loop, tree loop_fn, tree data,
       gsi = gsi_last_bb (paral_bb);
 
       gcc_checking_assert (n_threads != 0);
-      t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
-      OMP_CLAUSE_NUM_THREADS_EXPR (t)
-	= build_int_cst (integer_type_node, n_threads);
-      omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+      if (n_threads == INT_MAX)
+	/* No hardcoded thread count; let the OpenMP runtime decide.  */
+	omp_par_stmt = gimple_build_omp_parallel (NULL, NULL_TREE, loop_fn,
+						  data);
+      else
+	{
+	  /* Build the OMP_CLAUSE_NUM_THREADS clause only if we have a fixed
+	     thread count.  */
+	  t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
+	  OMP_CLAUSE_NUM_THREADS_EXPR (t)
+	    = build_int_cst (integer_type_node, n_threads);
+	  omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+	}
       gimple_set_location (omp_par_stmt, loc);
 
       gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
@@ -2812,7 +2821,6 @@ gen_parallel_loop (class loop *loop,
   struct clsn_data clsn_data;
   location_t loc;
   gimple *cond_stmt;
-  unsigned int m_p_thread=2;
 
   /* From
 
@@ -2885,15 +2893,14 @@ gen_parallel_loop (class loop *loop,
 
   if (!oacc_kernels_p)
     {
-      if (loop->inner)
-	m_p_thread=2;
-      else
-	m_p_thread=MIN_PER_THREAD;
-
       gcc_checking_assert (n_threads != 0);
+      /* For runtime thread detection, use a conservative estimate of 2 threads
+	 for the many iterations condition check.  */
+      unsigned threads = (n_threads == INT_MAX) ? 2 : n_threads;
+      unsigned m_p_thread = loop->inner ? 2 : MIN_PER_THREAD;
       many_iterations_cond =
 	fold_build2 (GE_EXPR, boolean_type_node,
-		     nit, build_int_cst (type, m_p_thread * n_threads - 1));
+		     nit, build_int_cst (type, m_p_thread * threads - 1));
 
       many_iterations_cond
 	= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
@@ -3905,14 +3912,15 @@ parallelize_loops (bool oacc_kernels_p)
       estimated = estimated_loop_iterations_int (loop);
       if (estimated == -1)
 	estimated = get_likely_max_loop_iterations_int (loop);
+      /* For runtime thread detection, use an estimate of 2 threads.  */
+      unsigned threads = (n_threads == INT_MAX) ? 2 : n_threads;
+      unsigned m_p_thread = loop->inner ? 2 : MIN_PER_THREAD;
       /* FIXME: Bypass this check as graphite doesn't update the
 	 count and frequency correctly now.  */
       if (!flag_loop_parallelize_all
 	  && !oacc_kernels_p
 	  && ((estimated != -1
-	       && (estimated
-		   < ((HOST_WIDE_INT) n_threads
-		      * (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
+	       && (estimated < ((HOST_WIDE_INT) threads * m_p_thread - 1)))
 	      /* Do not bother with loops in cold areas.  */
 	      || optimize_loop_nest_for_size_p (loop)))
 	continue;
-- 
2.45.2

