On 01/08/2025 18:23, Pádraig Brady wrote:
cgroup constraints are a popular mechanism on linux.
I was thinking of augmenting the nproc routines
to incorporate cgroup cpu constraints for NPROC_CURRENT_OVERRIDABLE
so that we'd return MIN(cgroup constraint, current value);

The cgroup constraint would be inferred from:
    cat $(findmnt -n -t cgroup2 -o target)/$(cut -d: -f3- < /proc/self/cgroup)/cpu.max
though the C will be a bit more involved of course :)

I'll look at doing this soon, unless there are other opinions.

Proposed patch is attached.
I've tested it in coreutils on old linux, new linux, non linux.

cheers,
Padraig
From 524e3755de77094b0194af62e01885b4155cbed0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Mon, 18 Aug 2025 15:34:59 +0100
Subject: [PATCH] nproc: honor cgroupv2 CPU quotas

cgroupv1 CPU quotas are not considered,
as those are now legacy (RHEL7 era),
and are more complex/inefficient to parse.

Tested in coreutils on Fedora 42 like:

  # Honor limit anywhere in /proc/self/cgroup hierarchy
  $ systemd_nested_cgroup=\
/sys/fs/cgroup/user.slice/user-1001.slice/[email protected]/app.slice/

  # The number of processors on this system
  $ src/nproc
  4

  # Behave like MAX (1, (int)round(quota/period))
  $ echo "100000 100000" > $systemd_nested_cgroup/cpu.max
  $ src/nproc
  1
  $ echo "90000 100000" > $systemd_nested_cgroup/cpu.max
  $ src/nproc
  1
  $ echo "140000 100000" > $systemd_nested_cgroup/cpu.max
  $ src/nproc
  1
  $ echo "150000 100000" > $systemd_nested_cgroup/cpu.max
  $ src/nproc
  2

  # Ensure NPROC_ALL takes precedence
  $ echo "100000 100000" > $systemd_nested_cgroup/cpu.max
  $ src/nproc --all
  4

  # Ensure OMP env vars have appropriate precedence
  $ echo "200000 100000" > $systemd_nested_cgroup/cpu.max
  $ OMP_NUM_THREADS=10 src/nproc
  10
  $ OMP_THREAD_LIMIT=10 src/nproc
  2

  # Ensure quota only reduces
  $ echo "500000 100000" > $systemd_nested_cgroup/cpu.max
  $ src/nproc
  4

  # Reset system to unlimited
  $ echo "max 100000" > $systemd_nested_cgroup/cpu.max

  # Test quota in root hierarchy
  $ podman run --cpus=2 -i --rm fedora:latest /tmp/nproc
  2
  $ podman run --cpus=1.5 -i --rm fedora:latest /tmp/nproc
  2
  $ podman run --cpus=1.4 -i --rm fedora:latest /tmp/nproc
  1
  $ podman run --cpus=100 -i --rm fedora:latest /tmp/nproc
  4

  # Docker is similar to podman, but explicitly limits max allowable
  $ docker run --cpus=1.4 -i --rm fedora:latest /tmp/nproc
  1
  $ docker run --cpus=100 -i --rm fedora:latest /tmp/nproc
  docker: Error response from daemon:
  range of CPUs is from 0.01 to 4.00, as there are only 4 CPUs

* lib/nproc.c (cgroup2_mount): Identify the cgroup2 mount point,
first at the common location for efficiency,
resorting to searching mount points otherwise.
(get_cgroup2_cpu_quota): Walk cgroup2 quotas for the current process,
returning the lowest integer number of CPUs configured.
(cpu_quota): On Linux return the cgroupv2 CPU quota if the
current scheduler honors it.  Otherwise return ULONG_MAX.
(num_processors): Clamp the return to <= quota.
* modules/nproc: Depend on mntent-h.
---
 ChangeLog     |  13 ++++
 lib/nproc.c   | 176 ++++++++++++++++++++++++++++++++++++++++++++++----
 modules/nproc |   1 +
 3 files changed, 176 insertions(+), 14 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 29897dc8c5..245b0c2880 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,16 @@
+2025-08-18  Pádraig Brady  <[email protected]>
+
+	nproc: honor cgroupv2 CPU quotas
+	* lib/nproc.c (cgroup2_mount): Identify the cgroup2 mount point,
+	first at the common location for efficiency,
+	resorting to searching mount points otherwise.
+	(get_cgroup2_cpu_quota): Walk cgroup2 quotas for the current process,
+	returning the lowest integer number of CPUs configured.
+	(cpu_quota): On Linux return the cgroupv2 CPU quota if the
+	current scheduler honors it.  Otherwise return ULONG_MAX.
+	(num_processors): Clamp the return to <= quota.
+	* modules/nproc: Depend on mntent-h.
+
 2025-08-15  Bruno Haible  <[email protected]>
 
 	Reduce risk of compilation errors within include files.
diff --git a/lib/nproc.c b/lib/nproc.c
index cecf60bc6e..4bc805ee4e 100644
--- a/lib/nproc.c
+++ b/lib/nproc.c
@@ -22,7 +22,12 @@
 
 #include <errno.h>
 #include <limits.h>
+#if HAVE_MNTENT_H
+#include <mntent.h>
+#endif
 #include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
 #include <unistd.h>
 
 #if HAVE_PTHREAD_GETAFFINITY_NP && 0
@@ -62,6 +67,8 @@
 
 #define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
 
+#define NPROC_MINIMUM 1
+
 /* Return the number of processors available to the current process, based
    on a modern system call that returns the "affinity" between the current
    process and each CPU.  Return 0 if unknown or if such a system call does
@@ -244,7 +251,7 @@ num_processors_via_affinity_mask (void)
 /* Return the total number of processors.  Here QUERY must be one of
    NPROC_ALL, NPROC_CURRENT.  The result is guaranteed to be at least 1.  */
 static unsigned long int
-num_processors_ignoring_omp (enum nproc_query query)
+num_processors_available (enum nproc_query query)
 {
   /* On systems with a modern affinity mask system call, we have
          sysconf (_SC_NPROCESSORS_CONF)
@@ -377,7 +384,140 @@ num_processors_ignoring_omp (enum nproc_query query)
   }
 #endif
 
-  return 1;
+  return NPROC_MINIMUM;
+}
+
+#if defined __linux__ || defined __ANDROID__
+/* Return the cgroup2 mount point as a malloc'd string that the caller
+   must free, or NULL if none was found or strdup failed.  */
+static char *
+cgroup2_mount (void)
+{
+  /* Check the usual location first.  */
+  if (access ("/sys/fs/cgroup/cgroup.controllers", F_OK) == 0)
+    return strdup ("/sys/fs/cgroup");
+
+  char *ret = NULL;
+#if HAVE_MNTENT_H
+  /* Otherwise look for the mount point.  FP is declared here so that
+     it is not an unused variable when <mntent.h> is unavailable.  */
+  FILE *fp = setmntent ("/proc/mounts", "r");
+  if (! fp)
+    return NULL;
+  struct mntent *mnt;
+  while ((mnt = getmntent (fp)) != NULL)
+    if (strcmp (mnt->mnt_type, "cgroup2") == 0)
+      {
+        ret = strdup (mnt->mnt_dir);
+        break;
+      }
+  endmntent (fp);
+#endif
+
+  return ret;
+}
+
+/* Return the lowest CPU quota set in cpu.max anywhere in the current
+   process's cgroupv2 hierarchy, rounded to a whole number of CPUs (at
+   least 1).  Return ULONG_MAX if no quota is set or can be determined.  */
+static unsigned long int
+get_cgroup2_cpu_quota (void)
+{
+  unsigned long int cpu_quota = ULONG_MAX;
+  FILE *fp = fopen ("/proc/self/cgroup", "r");
+  if (! fp)
+    return cpu_quota;
+
+  /* Get our cgroupv2 (unified) hierarchy, from the "0::/..." line.  */
+  char *cgroup = NULL;
+  char *cgroup_str = NULL;
+  size_t cgroup_size = 0;
+  ssize_t len;  /* Not named "read", to avoid shadowing read().  */
+  while ((len = getline (&cgroup_str, &cgroup_size, fp)) != -1)
+    if (strncmp (cgroup_str, "0::/", 4) == 0)
+      {
+        if (cgroup_str[len - 1] == '\n')
+          cgroup_str[len - 1] = '\0';
+        cgroup = cgroup_str + 3;  /* Skip "0::" but keep the leading '/'.  */
+        break;
+      }
+  fclose (fp);
+
+  char *mount = NULL;
+  if (cgroup && ! (mount = cgroup2_mount ()))
+    cgroup = NULL;
+
+  /* Find the lowest quota in the hierarchy, walking from leaf to root.  */
+  char *quota_str = NULL;
+  size_t quota_size = 0;
+  while (cgroup && *cgroup)
+    {
+      char cpu_max_file[PATH_MAX];
+      int n = snprintf (cpu_max_file, sizeof cpu_max_file,
+                        "%s%s/cpu.max", mount, cgroup);
+      fp = NULL;
+      if (0 <= n && (size_t) n < sizeof cpu_max_file  /* Not truncated.  */
+          && (fp = fopen (cpu_max_file, "r"))
+          && getline (&quota_str, &quota_size, fp) != -1
+          && strncmp (quota_str, "max", 3) != 0)
+        {
+          long quota, period;
+          /* Ignore malformed values, and never divide by zero.  */
+          if (sscanf (quota_str, "%ld %ld", &quota, &period) == 2
+              && 0 <= quota && 0 < period)
+            {
+              double ncpus = (double) quota / period;
+              if (cpu_quota == ULONG_MAX || ncpus < cpu_quota)
+                {
+                  cpu_quota = MAX (1, (long) (ncpus + 0.5));
+                  /* nproc will return 1 minimum, so no point going lower.  */
+                  if (cpu_quota == 1)
+                    *cgroup = '\0';
+                }
+            }
+        }
+      if (fp)
+        fclose (fp);
+
+      char *last_sep = strrchr (cgroup, '/');
+      if (! last_sep)
+        break;
+      if (last_sep == cgroup && *(cgroup + 1))
+        *(cgroup + 1) = '\0';  /* Iterate on "/" also.  */
+      else
+        *last_sep = '\0';
+    }
+
+  free (quota_str);
+  free (cgroup_str);
+  free (mount);
+  return cpu_quota;
+}
+#endif
+
+/* Return the CPU quota to apply to the current process, as a number of
+   CPUs.  Return ULONG_MAX if no quota applies, none can be determined,
+   or quotas are not supported on this platform.  */
+static unsigned long int
+cpu_quota (void)
+{
+  unsigned long int limit = ULONG_MAX;
+
+#if defined __linux__ || defined __ANDROID__
+# if HAVE_SCHED_GETAFFINITY_LIKE_GLIBC && defined SCHED_DEADLINE
+  /* We've a new enough sched.h.  Only consult the cgroupv2 quota when
+     the current scheduling policy is none of those listed below;
+     for those policies the limit stays at ULONG_MAX.  */
+  int policy = sched_getscheduler (0);
+  int quota_exempt = (policy == SCHED_FIFO
+                      || policy == SCHED_RR
+                      || policy == SCHED_DEADLINE);
+  if (! quota_exempt)
+    limit = get_cgroup2_cpu_quota ();
+# endif
+#endif
+
+  return limit;
 }
 
 /* Parse OMP environment variables without dependence on OMP.
@@ -416,13 +556,13 @@ parse_omp_threads (char const* threads)
 unsigned long int
 num_processors (enum nproc_query query)
 {
-  unsigned long int omp_env_limit = ULONG_MAX;
+  unsigned long int nproc_limit = ULONG_MAX;
 
+  /* Honor the OpenMP environment variables, recognized also by all
+     programs that are based on OpenMP.  */
   if (query == NPROC_CURRENT_OVERRIDABLE)
     {
-      unsigned long int omp_env_threads;
-      /* Honor the OpenMP environment variables, recognized also by all
-         programs that are based on OpenMP.  */
+      unsigned long int omp_env_threads, omp_env_limit;
       omp_env_threads = parse_omp_threads (getenv ("OMP_NUM_THREADS"));
       omp_env_limit = parse_omp_threads (getenv ("OMP_THREAD_LIMIT"));
       if (! omp_env_limit)
@@ -431,14 +571,22 @@ num_processors (enum nproc_query query)
       if (omp_env_threads)
         return MIN (omp_env_threads, omp_env_limit);
 
+      nproc_limit = omp_env_limit;
       query = NPROC_CURRENT;
     }
-  /* Here query is one of NPROC_ALL, NPROC_CURRENT.  */
-  if (omp_env_limit == 1)
-    /* No need to even call num_processors_ignoring_omp (query).  */
-    return 1;
-  {
-    unsigned long nprocs = num_processors_ignoring_omp (query);
-    return MIN (nprocs, omp_env_limit);
-  }
+
+  /* Honor any CPU quotas.  */
+  if (query == NPROC_CURRENT && nproc_limit > NPROC_MINIMUM)
+    {
+      unsigned long int quota = cpu_quota ();
+      nproc_limit = MIN (quota, nproc_limit);
+    }
+
+  if (nproc_limit > NPROC_MINIMUM)
+    {
+      unsigned long nprocs = num_processors_available (query);
+      nproc_limit = MIN (nprocs, nproc_limit);
+    }
+
+  return nproc_limit;
 }
diff --git a/modules/nproc b/modules/nproc
index 1081f7b778..de8cb1ef82 100644
--- a/modules/nproc
+++ b/modules/nproc
@@ -10,6 +10,7 @@ Depends-on:
 c-ctype
 extensions
 minmax
+mntent-h
 unistd-h
 
 configure.ac:
-- 
2.50.1

Reply via email to