On 18/08/2025 20:52, Collin Funk wrote:
Bruno Haible via Gnulib discussion list <[email protected]> writes:

Proposed patch is attached.

I guess you want the patch to be reviewed?

Well no pressure.
I'll apply it in a day or so after giving folks some time to comment.

Reviewing a patch, for me, includes validating that the body of a
function implements its specification comment. But all three new
functions lack such a comment; therefore I can't really review anything.

I was going to take a look as well. I agree that comments would be nice
though, since I (and likely others) do not fully understand Linux-isms
like sysfs, procfs, and cgroups.

I see four lines that need whitespace fixes:
     return strdup ("/sys/fs/cgroup");
   snprintf(cpu_max_file, sizeof (cpu_max_file),
#if HAVE_MNTENT_H
#endif

Some other spacing nits, they all need a space before parentheses:

     endmntent(fp);
     switch (sched_getscheduler(0))
     quota = get_cgroup2_cpu_quota();

V2 attached with comments.

cheers,
Padraig
From f935d969cd882da259171e469302645503837b6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Mon, 18 Aug 2025 15:34:59 +0100
Subject: [PATCH] nproc: honor cgroupv2 CPU quotas

cgroupv1 CPU quotas are not considered,
as those are now legacy (RHEL7 era),
and are more complex/inefficient to parse.

Tested in coreutils on Fedora 42 like:

  # Honor limit anywhere in /proc/self/cgroup hierarchy
  $ systemd_nested_cgroup=\
/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/app.slice/

  # The number of processors on this system
  $ src/nproc
  4

  # Behave like MAX (1, (int)round(quota/period))
  $ echo "100000 100000" > $systemd_nested_cgroup/cpu.max
  $ src/nproc
  1
  $ echo "90000 100000" > $systemd_nested_cgroup/cpu.max
  $ src/nproc
  1
  $ echo "140000 100000" > $systemd_nested_cgroup/cpu.max
  $ src/nproc
  1
  $ echo "150000 100000" > $systemd_nested_cgroup/cpu.max
  $ src/nproc
  2

  # Ensure NPROC_ALL takes precedence
  $ echo "100000 100000" > $systemd_nested_cgroup/cpu.max
  $ src/nproc --all
  4

  # Ensure OMP env vars have appropriate precedence
  $ echo "200000 100000" > $systemd_nested_cgroup/cpu.max
  $ OMP_NUM_THREADS=10 src/nproc
  10
  $ OMP_THREAD_LIMIT=10 src/nproc
  2

  # Ensure quota only reduces
  $ echo "500000 100000" > $systemd_nested_cgroup/cpu.max
  $ src/nproc
  4

  # Reset system to unlimited
  $ echo "max 100000" > $systemd_nested_cgroup/cpu.max

  # Test quota in root hierarchy
  $ podman run --cpus=2 -i --rm fedora:latest /tmp/nproc
  2
  $ podman run --cpus=1.5 -i --rm fedora:latest /tmp/nproc
  2
  $ podman run --cpus=1.4 -i --rm fedora:latest /tmp/nproc
  1
  $ podman run --cpus=100 -i --rm fedora:latest /tmp/nproc
  4

  # Docker is similar to podman, but explicitly limits max allowable
  $ docker run --cpus=1.4 -i --rm fedora:latest /tmp/nproc
  1
  $ docker run --cpus=100 -i --rm fedora:latest /tmp/nproc
  docker: Error response from daemon:
  range of CPUs is from 0.01 to 4.00, as there are only 4 CPUs

* lib/nproc.c (cgroup2_mount): Identify the cgroup2 mount point,
first at the common location for efficiency,
resorting to searching mount points otherwise.
(get_cgroup2_cpu_quota): Walk cgroup2 quotas for the current process,
returning the lowest integer number of CPUs configured.
(cpu_quota): On Linux return the cgroupv2 CPU quota if the
current scheduler honors it.  Otherwise return ULONG_MAX.
(num_processors): Clamp the return to <= quota.
* modules/nproc: Depend on mntent-h.
---
 ChangeLog     |  13 ++++
 lib/nproc.c   | 188 ++++++++++++++++++++++++++++++++++++++++++++++----
 modules/nproc |   1 +
 3 files changed, 188 insertions(+), 14 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 29897dc8c5..fb3c905b8f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,16 @@
+2025-08-18  Pádraig Brady  <[email protected]>
+
+	nproc: honor cgroupv2 CPU quotas
+	* lib/nproc.c (cgroup2_mount): Identify the cgroup2 mount point,
+	first at the common location for efficiency,
+	resorting to searching mount points otherwise.
+	(get_cgroup2_cpu_quota): Walk cgroup2 quotas for the current process,
+	returning the lowest integer number of CPUs configured.
+	(cpu_quota): On Linux return the cgroupv2 CPU quota if the
+	current scheduler honors it.  Otherwise return ULONG_MAX.
+	(num_processors): Clamp the return to <= quota.
+	* modules/nproc: Depend on mntent-h.
+
 2025-08-15  Bruno Haible  <[email protected]>
 
 	Reduce risk of compilation errors within include files.
diff --git a/lib/nproc.c b/lib/nproc.c
index cecf60bc6e..2906900896 100644
--- a/lib/nproc.c
+++ b/lib/nproc.c
@@ -22,7 +22,12 @@
 
 #include <errno.h>
 #include <limits.h>
+#if HAVE_MNTENT_H
+# include <mntent.h>
+#endif
 #include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
 #include <unistd.h>
 
 #if HAVE_PTHREAD_GETAFFINITY_NP && 0
@@ -62,6 +67,8 @@
 
 #define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
 
+#define NPROC_MINIMUM 1
+
 /* Return the number of processors available to the current process, based
    on a modern system call that returns the "affinity" between the current
    process and each CPU.  Return 0 if unknown or if such a system call does
@@ -244,7 +251,7 @@ num_processors_via_affinity_mask (void)
 /* Return the total number of processors.  Here QUERY must be one of
    NPROC_ALL, NPROC_CURRENT.  The result is guaranteed to be at least 1.  */
 static unsigned long int
-num_processors_ignoring_omp (enum nproc_query query)
+num_processors_available (enum nproc_query query)
 {
   /* On systems with a modern affinity mask system call, we have
          sysconf (_SC_NPROCESSORS_CONF)
@@ -377,7 +384,152 @@ num_processors_ignoring_omp (enum nproc_query query)
   }
 #endif
 
-  return 1;
+  return NPROC_MINIMUM;
+}
+
+#if defined __linux__ || defined __ANDROID__
+/* Identify the cgroup2 mount point,
+   initially at the usual location for efficiency,
+   resorting to searching mount points otherwise.
+   Return NULL if the mount point is not found.
+   The returned string can be freed.  */
+static char *
+cgroup2_mount (void)
+{
+  FILE *fp;
+  char *ret = NULL;
+
+  /* Check the usual location first.  */
+  if (access ("/sys/fs/cgroup/cgroup.controllers", F_OK) == 0)
+    return strdup ("/sys/fs/cgroup");
+
+#if HAVE_MNTENT_H
+  /* Otherwise look for the mount point.  */
+  struct mntent *mnt;
+  if (! (fp = setmntent ("/proc/mounts", "r")))
+    return NULL;
+  while ((mnt = getmntent (fp)) != NULL)
+    {
+      if (strcmp (mnt->mnt_type, "cgroup2") == 0)
+        {
+          ret = strdup (mnt->mnt_dir);
+          break;
+        }
+    }
+  endmntent (fp);
+#endif
+
+  return ret;
+}
+
+/* Return the minimum configured cgroupv2 CPU quota for the current process.
+   Return ULONG_MAX if quota can't be read.
+   Returned value will be >= 1.  */
+static unsigned long int
+get_cgroup2_cpu_quota (void)
+{
+  unsigned long int cpu_quota = ULONG_MAX;
+  FILE *fp;
+
+  fp = fopen ("/proc/self/cgroup", "r");
+  if (! fp)
+    return cpu_quota;
+
+  /* Get our cgroupv2 (unified) hierarchy.  */
+  char *cgroup = NULL;
+  char *cgroup_str = NULL;
+  size_t cgroup_size = 0;
+  ssize_t read;
+  while ((read = getline (&cgroup_str, &cgroup_size, fp)) != -1)
+    {
+      if (strncmp (cgroup_str, "0::/", 4) == 0)
+        {
+          char *end = cgroup_str + read - 1;
+          if (*end == '\n')
+            *end = '\0';
+          cgroup = cgroup_str + 3;
+          break;
+        }
+    }
+  fclose (fp);
+
+  char *mount = NULL;
+  if (cgroup && ! (mount = cgroup2_mount ()))
+    cgroup = NULL;
+
+  /* Find the lowest quota in the hierarchy.  */
+  char *quota_str = NULL;
+  size_t quota_size = 0;
+  while (cgroup && *cgroup)
+    {
+      char cpu_max_file[PATH_MAX];
+      snprintf (cpu_max_file, sizeof (cpu_max_file),
+                "%s%s/cpu.max", mount, cgroup);
+
+      if ((fp = fopen (cpu_max_file, "r"))
+          && getline (&quota_str, &quota_size, fp) != -1
+          && strncmp (quota_str, "max", 3) != 0)
+        {
+          long quota, period;
+          if (sscanf (quota_str, "%ld %ld", &quota, &period) == 2 && period)
+            {
+              double ncpus = (double)quota / period;
+              if (cpu_quota == ULONG_MAX || ncpus < cpu_quota)
+                {
+                  cpu_quota = MAX (1, (long)((double)quota / period + 0.5));
+                  /* nproc will return 1 minimum, so no point going lower */
+                  if (cpu_quota == 1)
+                    *cgroup = '\0';
+                }
+            }
+        }
+
+      if (fp)
+        fclose (fp);
+
+      char *last_sep = strrchr (cgroup, '/');
+      if (! last_sep)
+        break;
+      if (last_sep == cgroup && *(cgroup + 1))
+        *(cgroup + 1) = '\0';  /* Iterate on "/" also.  */
+      else
+        *last_sep = '\0';
+    }
+
+  free (quota_str);
+  free (cgroup_str);
+  free (mount);
+  return cpu_quota;
+}
+#endif
+
+
+/* Return the cgroupv2 CPU quota if the current scheduler honors it.
+   Otherwise return ULONG_MAX.
+   Returned value will be >= 1.  */
+static unsigned long int
+cpu_quota (void)
+{
+  unsigned long int quota = ULONG_MAX;
+
+#if defined __linux__ || defined __ANDROID__
+# if HAVE_SCHED_GETAFFINITY_LIKE_GLIBC && defined SCHED_DEADLINE
+  /* We've a new enough sched.h  */
+  switch (sched_getscheduler (0))
+    {
+      case SCHED_FIFO:
+      case SCHED_RR:
+      case SCHED_DEADLINE:
+        quota = ULONG_MAX;
+        break;
+      default:
+        quota = get_cgroup2_cpu_quota ();
+        break;
+    }
+# endif
+#endif
+
+  return quota;
 }
 
 /* Parse OMP environment variables without dependence on OMP.
@@ -416,13 +568,13 @@ parse_omp_threads (char const* threads)
 unsigned long int
 num_processors (enum nproc_query query)
 {
-  unsigned long int omp_env_limit = ULONG_MAX;
+  unsigned long int nproc_limit = ULONG_MAX;
 
+  /* Honor the OpenMP environment variables, recognized also by all
+     programs that are based on OpenMP.  */
   if (query == NPROC_CURRENT_OVERRIDABLE)
     {
-      unsigned long int omp_env_threads;
-      /* Honor the OpenMP environment variables, recognized also by all
-         programs that are based on OpenMP.  */
+      unsigned long int omp_env_threads, omp_env_limit;
       omp_env_threads = parse_omp_threads (getenv ("OMP_NUM_THREADS"));
       omp_env_limit = parse_omp_threads (getenv ("OMP_THREAD_LIMIT"));
       if (! omp_env_limit)
@@ -431,14 +583,22 @@ num_processors (enum nproc_query query)
       if (omp_env_threads)
         return MIN (omp_env_threads, omp_env_limit);
 
+      nproc_limit = omp_env_limit;
       query = NPROC_CURRENT;
     }
-  /* Here query is one of NPROC_ALL, NPROC_CURRENT.  */
-  if (omp_env_limit == 1)
-    /* No need to even call num_processors_ignoring_omp (query).  */
-    return 1;
-  {
-    unsigned long nprocs = num_processors_ignoring_omp (query);
-    return MIN (nprocs, omp_env_limit);
-  }
+
+  /* Honor any CPU quotas.  */
+  if (query == NPROC_CURRENT && nproc_limit > NPROC_MINIMUM)
+    {
+      unsigned long int quota = cpu_quota ();
+      nproc_limit = MIN (quota, nproc_limit);
+    }
+
+  if (nproc_limit > NPROC_MINIMUM)
+    {
+      unsigned long nprocs = num_processors_available (query);
+      nproc_limit = MIN (nprocs, nproc_limit);
+    }
+
+  return nproc_limit;
 }
diff --git a/modules/nproc b/modules/nproc
index 1081f7b778..de8cb1ef82 100644
--- a/modules/nproc
+++ b/modules/nproc
@@ -10,6 +10,7 @@ Depends-on:
 c-ctype
 extensions
 minmax
+mntent-h
 unistd-h
 
 configure.ac:
-- 
2.50.1

Reply via email to