On 18/08/2025 22:30, Bruno Haible wrote:
Pádraig Brady wrote:
V2 attached with comments.

Thanks. I can't spot any obvious mistake. But nevertheless:

   - The line
             cpu_quota = MAX (1, (long)((double)quota / period + 0.5));
     can be simplified to
             cpu_quota = MAX (1, (long)(ncpus + 0.5));

   - When cleaning up variables, it's more systematic to free them in reverse
     allocation order, that is:

     free (quota_str);
     free (mount);
     free (cgroup_str);

   - In the
       while (cgroup && *cgroup)
     loop, it would be useful to have a comment regarding what the
     slash-separated components of a cgroup are.

I updated the patch (attached) to do all of the above,
and also added testing docs in tests/test-nproc.c.
   - What is the point of testing HAVE_SCHED_GETAFFINITY_LIKE_GLIBC?
     Linux has had sched_getscheduler() for ages.

Well, it was to align with the include ifdef for sched.h,
and to piggyback on the existing m4 check.
The associated comment summarizes this as:
/* We've a new enough sched.h  */

I'll push the attached later today.

thanks!
Padraig
From ed951e7a1a55bc22a7081266f0f26adf8999cf95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Mon, 18 Aug 2025 15:34:59 +0100
Subject: [PATCH] nproc: honor cgroupv2 CPU quotas

cgroupv1 CPU quotas are not considered,
as those are now legacy (RHEL7 era),
and are more complex/inefficient to parse.

Tested in coreutils on Fedora 42
as detailed in tests/test-nproc.c

* lib/nproc.c (cgroup2_mount): Identify the cgroup2 mount point,
first at the common location for efficiency,
resorting to searching mount points otherwise.
(get_cgroup2_cpu_quota): Walk cgroup2 quotas for the current process,
returning the lowest integer number of CPUs configured.
(cpu_quota): On Linux return the cgroupv2 CPU quota if the
current scheduler honors it.  Otherwise return ULONG_MAX.
(num_processors): Clamp the return to <= quota.
* modules/nproc: Depend on mntent-h.
* tests/test-nproc.c: Document how cgroup CPU quotas were tested.
---
 ChangeLog          |  13 +++
 lib/nproc.c        | 195 +++++++++++++++++++++++++++++++++++++++++----
 m4/nproc.m4        |   2 +-
 tests/test-nproc.c |  65 +++++++++++++++
 4 files changed, 260 insertions(+), 15 deletions(-)
 create mode 100644 tests/test-nproc.c

diff --git a/ChangeLog b/ChangeLog
index 29897dc8c5..fb3c905b8f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,16 @@
+2025-08-18  Pádraig Brady  <[email protected]>
+
+	nproc: honor cgroupv2 CPU quotas
+	* lib/nproc.c (cgroup2_mount): Identify the cgroup2 mount point,
+	first at the common location for efficiency,
+	resorting to searching mount points otherwise.
+	(get_cgroup2_cpu_quota): Walk cgroup2 quotas for the current process,
+	returning the lowest integer number of CPUs configured.
+	(cpu_quota): On Linux return the cgroupv2 CPU quota if the
+	current scheduler honors it.  Otherwise return ULONG_MAX.
+	(num_processors): Clamp the return to <= quota.
+	* modules/nproc: Depend on mntent-h.
+
 2025-08-15  Bruno Haible  <[email protected]>
 
 	Reduce risk of compilation errors within include files.
diff --git a/lib/nproc.c b/lib/nproc.c
index cecf60bc6e..7c5ae3acf9 100644
--- a/lib/nproc.c
+++ b/lib/nproc.c
@@ -22,7 +22,12 @@
 
 #include <errno.h>
 #include <limits.h>
+#if HAVE_MNTENT_H
+# include <mntent.h>
+#endif
 #include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
 #include <unistd.h>
 
 #if HAVE_PTHREAD_GETAFFINITY_NP && 0
@@ -62,6 +67,8 @@
 
 #define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
 
+#define NPROC_MINIMUM 1
+
 /* Return the number of processors available to the current process, based
    on a modern system call that returns the "affinity" between the current
    process and each CPU.  Return 0 if unknown or if such a system call does
@@ -244,7 +251,7 @@ num_processors_via_affinity_mask (void)
 /* Return the total number of processors.  Here QUERY must be one of
    NPROC_ALL, NPROC_CURRENT.  The result is guaranteed to be at least 1.  */
 static unsigned long int
-num_processors_ignoring_omp (enum nproc_query query)
+num_processors_available (enum nproc_query query)
 {
   /* On systems with a modern affinity mask system call, we have
          sysconf (_SC_NPROCESSORS_CONF)
@@ -377,7 +384,159 @@ num_processors_ignoring_omp (enum nproc_query query)
   }
 #endif
 
-  return 1;
+  return NPROC_MINIMUM;
+}
+
+#if defined __linux__ || defined __ANDROID__
+/* Identify the cgroup2 mount point,
+   initially at the usual location for efficiency,
+   resorting to searching mount points otherwise.
+   Return NULL if the mount point is not found.
+   The returned string can be freed.  */
+static char *
+cgroup2_mount (void)
+{
+  FILE *fp;
+  char *ret = NULL;
+
+  /* Check the usual location first.  */
+  if (access ("/sys/fs/cgroup/cgroup.controllers", F_OK) == 0)
+    return strdup ("/sys/fs/cgroup");
+
+#if HAVE_MNTENT_H
+  /* Otherwise look for the mount point.  */
+  struct mntent *mnt;
+  if (! (fp = setmntent ("/proc/mounts", "r")))
+    return NULL;
+  while ((mnt = getmntent (fp)) != NULL)
+    {
+      if (strcmp (mnt->mnt_type, "cgroup2") == 0)
+        {
+          ret = strdup (mnt->mnt_dir);
+          break;
+        }
+    }
+  endmntent (fp);
+#endif
+
+  return ret;
+}
+
+/* Return the minimum configured cgroupv2 CPU quota for the current process.
+   Return ULONG_MAX if quota can't be read.
+   Returned value will be >= 1.  */
+static unsigned long int
+get_cgroup2_cpu_quota (void)
+{
+  unsigned long int cpu_quota = ULONG_MAX;
+  FILE *fp;
+
+  fp = fopen ("/proc/self/cgroup", "r");
+  if (! fp)
+    return cpu_quota;
+
+  /* Get our cgroupv2 (unified) hierarchy.  */
+  char *cgroup = NULL;
+  char *cgroup_str = NULL;
+  size_t cgroup_size = 0;
+  ssize_t read;
+  while ((read = getline (&cgroup_str, &cgroup_size, fp)) != -1)
+    {
+      if (strncmp (cgroup_str, "0::/", 4) == 0)
+        {
+          char *end = cgroup_str + read - 1;
+          if (*end == '\n')
+            *end = '\0';
+          cgroup = cgroup_str + 3;
+          break;
+        }
+    }
+  fclose (fp);
+
+  char *mount = NULL;
+  if (cgroup && ! (mount = cgroup2_mount ()))
+    cgroup = NULL;
+
+  /* Find the lowest quota in the hierarchy.  */
+  char *quota_str = NULL;
+  size_t quota_size = 0;
+  while (cgroup && *cgroup)
+    {
+      /* Walk back up the nested cgroup hierarchy
+         to find the lowest cpu quota as defined in a cpu.max file.
+         Note this file may not be present if the cpu controller
+         is not enabled for that part of the hierarchy.  */
+
+      char cpu_max_file[PATH_MAX];
+      snprintf (cpu_max_file, sizeof (cpu_max_file),
+                "%s%s/cpu.max", mount, cgroup);
+
+      if ((fp = fopen (cpu_max_file, "r"))
+          && getline (&quota_str, &quota_size, fp) != -1
+          && strncmp (quota_str, "max", 3) != 0)
+        {
+          long quota, period;
+          if (sscanf (quota_str, "%ld %ld", &quota, &period) == 2 && period)
+            {
+              double ncpus = (double)quota / period;
+              if (cpu_quota == ULONG_MAX || ncpus < cpu_quota)
+                {
+                  cpu_quota = MAX (1, (long)(ncpus + 0.5));
+                  /* nproc will return 1 minimum, so no point going lower */
+                  if (cpu_quota == 1)
+                    *cgroup = '\0';
+                }
+            }
+        }
+
+      if (fp)
+        fclose (fp);
+
+      char *last_sep = strrchr (cgroup, '/');
+      if (! last_sep)
+        break;
+      if (last_sep == cgroup && *(cgroup + 1))
+        *(cgroup + 1) = '\0';  /* Iterate on "/" also.  */
+      else
+        *last_sep = '\0';
+    }
+
+  free (quota_str);
+  free (mount);
+  free (cgroup_str);
+
+  return cpu_quota;
+}
+#endif
+
+
+/* Return the cgroupv2 CPU quota if the current scheduler honors it.
+   Otherwise return ULONG_MAX.
+   Returned value will be >= 1.  */
+static unsigned long int
+cpu_quota (void)
+{
+  unsigned long int quota = ULONG_MAX;
+
+#if defined __linux__ || defined __ANDROID__
+# if HAVE_SCHED_GETAFFINITY_LIKE_GLIBC && defined SCHED_DEADLINE
+  /* We've a new enough sched.h  */
+  switch (sched_getscheduler (0))
+    {
+      case -1:
+      case SCHED_FIFO:
+      case SCHED_RR:
+      case SCHED_DEADLINE:
+        quota = ULONG_MAX;
+        break;
+      default:
+        quota = get_cgroup2_cpu_quota ();
+        break;
+    }
+# endif
+#endif
+
+  return quota;
 }
 
 /* Parse OMP environment variables without dependence on OMP.
@@ -416,13 +575,13 @@ parse_omp_threads (char const* threads)
 unsigned long int
 num_processors (enum nproc_query query)
 {
-  unsigned long int omp_env_limit = ULONG_MAX;
+  unsigned long int nproc_limit = ULONG_MAX;
 
+  /* Honor the OpenMP environment variables, recognized also by all
+     programs that are based on OpenMP.  */
   if (query == NPROC_CURRENT_OVERRIDABLE)
     {
-      unsigned long int omp_env_threads;
-      /* Honor the OpenMP environment variables, recognized also by all
-         programs that are based on OpenMP.  */
+      unsigned long int omp_env_threads, omp_env_limit;
       omp_env_threads = parse_omp_threads (getenv ("OMP_NUM_THREADS"));
       omp_env_limit = parse_omp_threads (getenv ("OMP_THREAD_LIMIT"));
       if (! omp_env_limit)
@@ -431,14 +590,22 @@ num_processors (enum nproc_query query)
       if (omp_env_threads)
         return MIN (omp_env_threads, omp_env_limit);
 
+      nproc_limit = omp_env_limit;
       query = NPROC_CURRENT;
     }
-  /* Here query is one of NPROC_ALL, NPROC_CURRENT.  */
-  if (omp_env_limit == 1)
-    /* No need to even call num_processors_ignoring_omp (query).  */
-    return 1;
-  {
-    unsigned long nprocs = num_processors_ignoring_omp (query);
-    return MIN (nprocs, omp_env_limit);
-  }
+
+  /* Honor any CPU quotas.  */
+  if (query == NPROC_CURRENT && nproc_limit > NPROC_MINIMUM)
+    {
+      unsigned long int quota = cpu_quota ();
+      nproc_limit = MIN (quota, nproc_limit);
+    }
+
+  if (nproc_limit > NPROC_MINIMUM)
+    {
+      unsigned long nprocs = num_processors_available (query);
+      nproc_limit = MIN (nprocs, nproc_limit);
+    }
+
+  return nproc_limit;
 }
diff --git a/m4/nproc.m4 b/m4/nproc.m4
index 48c239be06..5e96afc93a 100644
--- a/m4/nproc.m4
+++ b/m4/nproc.m4
@@ -17,7 +17,7 @@ AC_DEFUN([gl_PREREQ_NPROC],
   dnl Persuade glibc <sched.h> to declare CPU_SETSIZE, CPU_ISSET etc.
   AC_REQUIRE([AC_USE_SYSTEM_EXTENSIONS])
 
-  AC_CHECK_HEADERS([sys/pstat.h sys/sysmp.h sys/param.h],,,
+  AC_CHECK_HEADERS([mntent.h sys/pstat.h sys/sysmp.h sys/param.h],,,
     [AC_INCLUDES_DEFAULT])
   dnl <sys/sysctl.h> requires <sys/param.h> on OpenBSD 4.0.
   AC_CHECK_HEADERS([sys/sysctl.h],,,
diff --git a/tests/test-nproc.c b/tests/test-nproc.c
new file mode 100644
index 0000000000..2c7406f95d
--- /dev/null
+++ b/tests/test-nproc.c
@@ -0,0 +1,65 @@
+/*
+nproc honors cgroup v2 CPU quotas
+and was tested in coreutils on a Fedora 42 system as follows:
+
+# Note we honor a limit anywhere in /proc/self/cgroup hierarchy
+# so apply settings below in the parent cgroup of the current process
+$ nested_cgroup=/sys/fs/cgroup/$(dirname $(cut -d/ -f2- /proc/self/cgroup))
+$ echo $nested_cgroup
+/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/app.slice
+
+# This test system has 4 CPUs
+$ src/nproc
+4
+
+# Behave like MAX (1, (int)round(quota/period))
+$ echo "100000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+1
+$ echo "90000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+1
+$ echo "140000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+1
+$ echo "150000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+2
+
+# Ensure NPROC_ALL takes precedence
+$ echo "100000 100000" > $nested_cgroup/cpu.max
+$ src/nproc --all
+4
+
+# Ensure OMP env vars have appropriate precedence
+$ echo "200000 100000" > $nested_cgroup/cpu.max
+$ OMP_NUM_THREADS=10 src/nproc
+10
+$ OMP_THREAD_LIMIT=10 src/nproc
+2
+
+# Ensure quota only reduces
+$ echo "500000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+4
+
+# Restore system to unlimited
+$ echo "max 100000" > $nested_cgroup/cpu.max
+
+# Test quota in root hierarchy
+$ podman run --cpus=2 -i --rm fedora:latest /tmp/nproc
+2
+$ podman run --cpus=1.5 -i --rm fedora:latest /tmp/nproc
+2
+$ podman run --cpus=1.4 -i --rm fedora:latest /tmp/nproc
+1
+$ podman run --cpus=100 -i --rm fedora:latest /tmp/nproc
+4
+
+# Docker is similar to podman, but explicitly limits max allowable
+$ docker run --cpus=1.4 -i --rm fedora:latest /tmp/nproc
+1
+$ docker run --cpus=100 -i --rm fedora:latest /tmp/nproc
+docker: Error response from daemon:
+range of CPUs is from 0.01 to 4.00, as there are only 4 CPUs
+*/
-- 
2.50.1

Reply via email to