From 9db03c0ed521ed8c5182e67b435636d08181ddeb Mon Sep 17 00:00:00 2001
From: Sebastian Pop <spop@nvidia.com>
Date: Fri, 25 Jul 2025 17:55:03 +0200
Subject: [PATCH] tree-parloops: Enable runtime thread detection with
 -ftree-parallelize-loops=0

This patch adds runtime thread count detection to auto-parallelization.
The -ftree-parallelize-loops=0 option generates parallelized loops without
specifying a fixed thread count, deferring this decision to program execution
time, where it is controlled by the OMP_NUM_THREADS environment variable.

The patch changes:

1. Flag semantics:
   - Default (-1): auto-parallelization disabled.
   - 0: runtime thread detection via OMP_NUM_THREADS.
   - 1: auto-parallelization disabled (no change to previous behavior).
   - N>1: fixed thread count (no change to previous behavior).

2. Gate condition: allow pass execution for flag == 0 || flag > 1.

3. OpenMP builtin enablement: enable for flag >= 0 instead of > 1.

4. Thread count handling: when flag == 0, set n_threads=0 and omit
   num_threads clause, letting OpenMP runtime determine thread count.

5. Profitability checks: when n_threads=0, skip the estimated-iteration check
   in parallelize_loops and assume 2 threads for the many-iterations check in
   gen_parallel_loop.

6. Driver integration: automatically link libgomp and enable pthread
   support when -ftree-parallelize-loops=0 is used.

Bootstrap and regression tested on aarch64-linux.  Compiled SPEC HPC pot3d
https://www.spec.org/hpc2021/docs/benchmarks/628.pot3d_s.html with
-ftree-parallelize-loops=0 and tested without having OMP_NUM_THREADS set in the
environment and with OMP_NUM_THREADS set to different values.

gcc/ChangeLog:

	* builtins.def (DEF_GOMP_BUILTIN): Enable OpenMP builtins for
	flag_tree_parallelize_loops >= 0.
	* common.opt (ftree-parallelize-loops): Change initial value to -1.
	* doc/invoke.texi (ftree-parallelize-loops=n): Document possible
	values for variable n.
	* gcc.cc (LINK_COMMAND_SPEC): Add automatic libgomp linking for
	-ftree-parallelize-loops=0.
	(GOMP_SELF_SPECS): Add automatic pthread linking for
	-ftree-parallelize-loops=0.
	* tree-parloops.cc (create_parallel_loop): Generate a "#pragma omp
	parallel" without num_threads(x) clause when n_threads is zero.
	(gen_parallel_loop): Use a conservative value of 2 threads in the
	many-iterations check when the thread count is only known at run time.
	(parallelize_loops): Handle flag_tree_parallelize_loops == 0 as
	n_threads = 0.
	(gate): Execute the pass when flag_tree_parallelize_loops is 0 or
	greater than 1.

gcc/testsuite/ChangeLog:

	* gcc.dg/autopar/runtime-threads-1.c: New test.

Signed-off-by: Sebastian Pop <spop@nvidia.com>
---
 gcc/builtins.def                              |  2 +-
 gcc/common.opt                                |  2 +-
 gcc/doc/invoke.texi                           | 14 ++++-
 gcc/gcc.cc                                    |  4 +-
 .../gcc.dg/autopar/runtime-threads-1.c        | 61 +++++++++++++++++++
 gcc/tree-parloops.cc                          | 34 ++++++++---
 6 files changed, 101 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/autopar/runtime-threads-1.c

diff --git a/gcc/builtins.def b/gcc/builtins.def
index f6f3e104f6a..c4d86654aeb 100644
--- a/gcc/builtins.def
+++ b/gcc/builtins.def
@@ -223,7 +223,7 @@ along with GCC; see the file COPYING3.  If not see
                false, true, true, ATTRS, false, \
 	       (flag_openacc \
 		|| flag_openmp \
-		|| flag_tree_parallelize_loops > 1))
+		|| flag_tree_parallelize_loops >= 0))
 
 /* Builtin used by the implementation of GNU TM.  These
    functions are mapped to the actual implementation of the STM library. */
diff --git a/gcc/common.opt b/gcc/common.opt
index ea39f87ae71..f9589b44cc7 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -3178,7 +3178,7 @@ Common Var(flag_tree_loop_optimize) Init(1) Optimization
 Enable loop optimizations on tree level.
 
 ftree-parallelize-loops=
-Common Joined RejectNegative UInteger Var(flag_tree_parallelize_loops) Init(1) Optimization
+Common Joined RejectNegative UInteger Var(flag_tree_parallelize_loops) Init(-1) Optimization
 -ftree-parallelize-loops=<number>	Enable automatic parallelization of loops.
 
 ftree-phiprop
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index e0a641213ae..2d239e69faa 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14102,14 +14102,22 @@ Perform induction variable optimizations (strength reduction, induction
 variable merging and induction variable elimination) on trees.
 
 @opindex ftree-parallelize-loops
-@item -ftree-parallelize-loops=n
-Parallelize loops, i.e., split their iteration space to run in n threads.
+@item -ftree-parallelize-loops=@var{n}
+Parallelize loops, i.e., split their iteration space to run in @var{n} threads.
 This is only possible for loops whose iterations are independent
 and can be arbitrarily reordered.  The optimization is only
 profitable on multiprocessor machines, for loops that are CPU-intensive,
 rather than constrained e.g.@: by memory bandwidth.  This option
 implies @option{-pthread}, and thus is only supported on targets
-that have support for @option{-pthread}.
+that have support for @option{-pthread}.  A value of @var{n} greater
+than 1 is the number of threads to be created; it is fixed at compile
+time by emitting a @code{num_threads(@var{n})} clause on the generated
+@code{#pragma omp parallel} and cannot be changed after compilation.
+When @var{n} is zero, the environment variable @env{OMP_NUM_THREADS}
+is used to set the number of threads to be created at program
+execution time.  If @env{OMP_NUM_THREADS} is not set, the OpenMP
+runtime determines the number of processors on the system and uses
+that number for the number of threads to be created.
 
 @opindex ftree-pta
 @item -ftree-pta
diff --git a/gcc/gcc.cc b/gcc/gcc.cc
index 00f93d00f96..34b41ff6d62 100644
--- a/gcc/gcc.cc
+++ b/gcc/gcc.cc
@@ -1161,7 +1161,7 @@ proper position among the other output files.  */
     %{s} %{t} %{u*} %{z} %{Z} %{!nostdlib:%{!r:%{!nostartfiles:%S}}} \
     %{static|no-pie|static-pie:} %@{L*} %(link_libgcc) " \
     VTABLE_VERIFICATION_SPEC " " SANITIZER_EARLY_SPEC " %o "" \
-    %{fopenacc|fopenmp|%:gt(%{ftree-parallelize-loops=*:%*} 1):\
+    %{fopenacc|fopenmp|%:gt(%{ftree-parallelize-loops=*:%*} 1)|ftree-parallelize-loops=0:\
 	%:include(libgomp.spec)%(link_gomp)}\
     %{fgnu-tm:%:include(libitm.spec)%(link_itm)}\
     " STACK_SPLIT_SPEC "\
@@ -1342,7 +1342,7 @@ static const char *const multilib_defaults_raw[] = MULTILIB_DEFAULTS;
    for targets that use different start files and suchlike.  */
 #ifndef GOMP_SELF_SPECS
 #define GOMP_SELF_SPECS \
-  "%{fopenacc|fopenmp|%:gt(%{ftree-parallelize-loops=*:%*} 1): " \
+  "%{fopenacc|fopenmp|%:gt(%{ftree-parallelize-loops=*:%*} 1)|ftree-parallelize-loops=0: " \
   "-pthread}"
 #endif
 
diff --git a/gcc/testsuite/gcc.dg/autopar/runtime-threads-1.c b/gcc/testsuite/gcc.dg/autopar/runtime-threads-1.c
new file mode 100644
index 00000000000..d8f7277ce7c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/autopar/runtime-threads-1.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-parallelize-loops=0 -fdump-tree-parloops2-details" } */
+
+#include <stdlib.h>
+
+#define N 1000
+
+int a[N], b[N], c[N];
+
+void
+test_parallel_loop (void)
+{
+  int i;
+
+  /* This loop should be auto-parallelized: its iterations are independent.
+
+     With -ftree-parallelize-loops=n and n greater than 1, the compiler fixes
+     the number of threads with a num_threads clause on the "#pragma omp
+     parallel".  With n equal to zero, as in this test, no clause is emitted
+     and the environment variable OMP_NUM_THREADS sets the number of threads
+     at program execution time.  If OMP_NUM_THREADS is not set, the OpenMP
+     runtime determines the number of available processors on the system and
+     uses that number for the number of threads.  */
+#pragma GCC unroll 0
+  for (i = 0; i < N; i++)
+    a[i] = b[i] + c[i];
+}
+
+int
+main (void)
+{
+  int i;
+
+  /* Initialize the input arrays: b[i] = i and c[i] = 2 * i.  */
+  for (i = 0; i < N; i++)
+    {
+      b[i] = i;
+      c[i] = i * 2;
+    }
+
+  test_parallel_loop ();
+
+  /* Verify that every a[i] holds b[i] + c[i]; abort on the first mismatch.  */
+  for (i = 0; i < N; i++)
+    {
+      if (a[i] != b[i] + c[i])
+	abort ();
+    }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "parallelizing" "parloops2" } } */
+
+/* Check that "#pragma omp parallel" is generated.  */
+/* { dg-final { scan-tree-dump "pragma omp parallel" "parloops2" } } */
+
+/* With -ftree-parallelize-loops=0 no num_threads(x) clause is emitted; check
+   that the generated code calls "__builtin_omp_get_num_threads" to query the
+   number of threads chosen by the OpenMP runtime at program execution time.  */
+/* { dg-final { scan-tree-dump "__builtin_omp_get_num_threads" "parloops2" } } */
diff --git a/gcc/tree-parloops.cc b/gcc/tree-parloops.cc
index 888a834faf9..d02f9447db0 100644
--- a/gcc/tree-parloops.cc
+++ b/gcc/tree-parloops.cc
@@ -2768,11 +2768,21 @@ create_parallel_loop (class loop *loop, tree loop_fn, tree data,
       basic_block paral_bb = single_pred (bb);
       gsi = gsi_last_bb (paral_bb);
 
-      gcc_checking_assert (n_threads != 0);
-      t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
-      OMP_CLAUSE_NUM_THREADS_EXPR (t)
-	= build_int_cst (integer_type_node, n_threads);
-      omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+      /* Build the OMP_CLAUSE_NUM_THREADS clause only if we have a fixed
+	 thread count.  If n_threads is 0, let OpenMP runtime determine
+	 the thread count from OMP_NUM_THREADS environment variable.  */
+      if (n_threads > 0)
+	{
+	  t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
+	  OMP_CLAUSE_NUM_THREADS_EXPR (t)
+	    = build_int_cst (integer_type_node, n_threads);
+	  omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+	}
+      else
+	{
+	  /* No hardcoded thread count, let OpenMP runtime decide.  */
+	  omp_par_stmt = gimple_build_omp_parallel (NULL, NULL_TREE, loop_fn, data);
+	}
       gimple_set_location (omp_par_stmt, loc);
 
       gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
@@ -3058,10 +3068,12 @@ gen_parallel_loop (class loop *loop,
       else
 	m_p_thread=MIN_PER_THREAD;
 
-      gcc_checking_assert (n_threads != 0);
+      /* For runtime thread detection (n_threads == 0), use a conservative
+	 estimate of 2 threads for the many iterations condition check.  */
+      unsigned threads_for_check = (n_threads > 0) ? n_threads : 2;
       many_iterations_cond =
 	fold_build2 (GE_EXPR, boolean_type_node,
-		     nit, build_int_cst (type, m_p_thread * n_threads - 1));
+		     nit, build_int_cst (type, m_p_thread * threads_for_check - 1));
 
       many_iterations_cond
 	= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
@@ -4017,9 +4029,12 @@ parallelize_loops (bool oacc_kernels_p)
     return false;
 
   /* For OpenACC kernels, n_threads will be determined later; otherwise, it's
-     the argument to -ftree-parallelize-loops.  */
+     the argument to -ftree-parallelize-loops.  When flag_tree_parallelize_loops
+     is 0, use runtime thread detection (let OpenMP runtime read OMP_NUM_THREADS). */
   if (oacc_kernels_p)
     n_threads = 0;
+  else if (flag_tree_parallelize_loops == 0)
+    n_threads = 0;  /* Runtime thread detection via OpenMP */
   else
     n_threads = flag_tree_parallelize_loops;
 
@@ -4095,6 +4110,7 @@ parallelize_loops (bool oacc_kernels_p)
       if (!flag_loop_parallelize_all
 	  && !oacc_kernels_p
 	  && ((estimated != -1
+	       && n_threads > 0
 	       && (estimated
 		   < ((HOST_WIDE_INT) n_threads
 		      * (loop->inner ? 2 : MIN_PER_THREAD) - 1)))
@@ -4186,7 +4202,7 @@ public:
     if (oacc_kernels_p)
       return flag_openacc;
     else
-      return flag_tree_parallelize_loops > 1;
+      return flag_tree_parallelize_loops == 0 || flag_tree_parallelize_loops > 1;
   }
   unsigned int execute (function *) final override;
   opt_pass * clone () final override
-- 
2.45.2

