On 18/08/2025 22:30, Bruno Haible wrote:
Pádraig Brady wrote:
V2 attached with comments.
Thanks. I can't spot any obvious mistake. But nevertheless:
- The line
cpu_quota = MAX (1, (long)((double)quota / period + 0.5));
can be simplified to
cpu_quota = MAX (1, (long)(ncpus + 0.5));
- When cleaning up variables, it's more systematic to free them in reverse
allocation order, that is:
free (quota_str);
free (mount);
free (cgroup_str);
- In the
while (cgroup && *cgroup)
loop, it would be useful to have a comment regarding what the
slash-separated
components of a cgroup are.
I updated (attached) to do all the above
and also add testing docs in tests/test-nproc.c
- What is the point of testing HAVE_SCHED_GETAFFINITY_LIKE_GLIBC?
Linux has had sched_getscheduler() for ages.
Well, it was to align with the include #ifdef for sched.h,
and to piggyback on the existing m4 check.
The associated comment summarizes this as:
/* We've a new enough sched.h */
I'll push the attached later today.
thanks!
Padraig
From ed951e7a1a55bc22a7081266f0f26adf8999cf95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Mon, 18 Aug 2025 15:34:59 +0100
Subject: [PATCH] nproc: honor cgroupv2 CPU quotas
cgroupv1 CPU quotas are not considered,
as those are now legacy (RHEL7 era),
and are more complex/inefficient to parse.
Tested in coreutils on Fedora 42
as detailed in tests/test-nproc.c
* lib/nproc.c (cgroup2_mount): Identify the cgroup2 mount point,
first at the common location for efficiency,
resorting to searching mount points otherwise.
(get_cgroup2_cpu_quota): Walk cgroup2 quotas for the current process,
returning the lowest integer number of CPUs configured.
(cpu_quota): On Linux return the cgroupv2 CPU quota if the
current scheduler honors it. Otherwise return ULONG_MAX.
(num_processors): Clamp the return to <= quota.
* modules/nproc: Depend on mntent-h.
* tests/test-nproc.c: Document how cgroup CPU quotas were tested.
---
ChangeLog | 13 +++
lib/nproc.c | 195 +++++++++++++++++++++++++++++++++++++++++----
m4/nproc.m4 | 2 +-
tests/test-nproc.c | 65 +++++++++++++++
4 files changed, 260 insertions(+), 15 deletions(-)
create mode 100644 tests/test-nproc.c
diff --git a/ChangeLog b/ChangeLog
index 29897dc8c5..fb3c905b8f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,16 @@
+2025-08-18 Pádraig Brady <[email protected]>
+
+ nproc: honor cgroupv2 CPU quotas
+ * lib/nproc.c (cgroup2_mount): Identify the cgroup2 mount point,
+ first at the common location for efficiency,
+ resorting to searching mount points otherwise.
+ (get_cgroup2_cpu_quota): Walk cgroup2 quotas for the current process,
+ returning the lowest integer number of CPUs configured.
+ (cpu_quota): On Linux return the cgroupv2 CPU quota if the
+ current scheduler honors it. Otherwise return ULONG_MAX.
+ (num_processors): Clamp the return to <= quota.
+ * modules/nproc: Depend on mntent-h.
+
2025-08-15 Bruno Haible <[email protected]>
Reduce risk of compilation errors within include files.
diff --git a/lib/nproc.c b/lib/nproc.c
index cecf60bc6e..7c5ae3acf9 100644
--- a/lib/nproc.c
+++ b/lib/nproc.c
@@ -22,7 +22,12 @@
#include <errno.h>
#include <limits.h>
+#if HAVE_MNTENT_H
+# include <mntent.h>
+#endif
#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
#include <unistd.h>
#if HAVE_PTHREAD_GETAFFINITY_NP && 0
@@ -62,6 +67,8 @@
#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
+#define NPROC_MINIMUM 1
+
/* Return the number of processors available to the current process, based
on a modern system call that returns the "affinity" between the current
process and each CPU. Return 0 if unknown or if such a system call does
@@ -244,7 +251,7 @@ num_processors_via_affinity_mask (void)
/* Return the total number of processors. Here QUERY must be one of
NPROC_ALL, NPROC_CURRENT. The result is guaranteed to be at least 1. */
static unsigned long int
-num_processors_ignoring_omp (enum nproc_query query)
+num_processors_available (enum nproc_query query)
{
/* On systems with a modern affinity mask system call, we have
sysconf (_SC_NPROCESSORS_CONF)
@@ -377,7 +384,159 @@ num_processors_ignoring_omp (enum nproc_query query)
}
#endif
- return 1;
+ return NPROC_MINIMUM;
+}
+
+#if defined __linux__ || defined __ANDROID__
+/* Identify the cgroup2 mount point,
+   initially at the usual location for efficiency,
+   resorting to searching /proc/mounts otherwise.
+   Return NULL if the mount point is not found or strdup fails.
+   The returned string is malloc()ed and should be freed by the caller.  */
+static char *
+cgroup2_mount (void)
+{
+  char *ret = NULL;
+
+  /* Check the usual location first.  */
+  if (access ("/sys/fs/cgroup/cgroup.controllers", F_OK) == 0)
+    return strdup ("/sys/fs/cgroup");
+
+#if HAVE_MNTENT_H
+  /* Otherwise search the mount table; FP exists only in this branch.  */
+  FILE *fp = setmntent ("/proc/mounts", "r");
+  if (! fp)
+    return NULL;
+  struct mntent *mnt;
+  while ((mnt = getmntent (fp)) != NULL)
+    {
+      if (strcmp (mnt->mnt_type, "cgroup2") == 0)
+        {
+          ret = strdup (mnt->mnt_dir);
+          break;
+        }
+    }
+  endmntent (fp);
+#endif
+
+  return ret;
+}
+
+/* Return the minimum configured cgroupv2 CPU quota for the current process,
+   rounded to the nearest integer number of CPUs but never less than 1.
+   Return ULONG_MAX if no quota is configured or it can't be read.  */
+static unsigned long int
+get_cgroup2_cpu_quota (void)
+{
+  unsigned long int cpu_quota = ULONG_MAX;
+  FILE *fp;
+
+  fp = fopen ("/proc/self/cgroup", "r");
+  if (! fp)
+    return cpu_quota;
+
+  /* Get our cgroupv2 (unified) hierarchy, i.e. the "0::/..." entry.  */
+  char *cgroup = NULL;
+  char *cgroup_str = NULL;
+  size_t cgroup_size = 0;
+  ssize_t len;  /* Not named "read", to avoid shadowing read().  */
+  while ((len = getline (&cgroup_str, &cgroup_size, fp)) != -1)
+    {
+      if (strncmp (cgroup_str, "0::/", 4) == 0)
+        {
+          char *end = cgroup_str + len - 1;
+          if (*end == '\n')
+            *end = '\0';
+          cgroup = cgroup_str + 3;  /* Skip "0::", keep the leading '/'.  */
+          break;
+        }
+    }
+  fclose (fp);
+
+  char *mount = NULL;
+  if (cgroup && ! (mount = cgroup2_mount ()))
+    cgroup = NULL;
+
+  /* Find the lowest quota in the hierarchy.  */
+  char *quota_str = NULL;
+  size_t quota_size = 0;
+  while (cgroup && *cgroup)
+    {
+      /* Walk back up the nested cgroup hierarchy
+         to find the lowest cpu quota as defined in a cpu.max file.
+         Note this file may not be present if the cpu controller
+         is not enabled for that part of the hierarchy.  */
+
+      char cpu_max_file[PATH_MAX];
+      snprintf (cpu_max_file, sizeof (cpu_max_file),
+                "%s%s/cpu.max", mount, cgroup);
+
+      if ((fp = fopen (cpu_max_file, "r"))
+          && getline (&quota_str, &quota_size, fp) != -1
+          && strncmp (quota_str, "max", 3) != 0)
+        {
+          long quota, period;
+          if (sscanf (quota_str, "%ld %ld", &quota, &period) == 2 && period)
+            {
+              double ncpus = (double)quota / period;
+              if (cpu_quota == ULONG_MAX || ncpus < cpu_quota)
+                {
+                  cpu_quota = MAX (1, (long)(ncpus + 0.5));
+                  /* nproc will return 1 minimum, so no point going lower */
+                  if (cpu_quota == 1)
+                    *cgroup = '\0';
+                }
+            }
+        }
+
+      if (fp)
+        fclose (fp);
+
+      char *last_sep = strrchr (cgroup, '/');
+      if (! last_sep)
+        break;
+      if (last_sep == cgroup && *(cgroup + 1))
+        *(cgroup + 1) = '\0';  /* Iterate on "/" also.  */
+      else
+        *last_sep = '\0';
+    }
+
+  free (quota_str);
+  free (mount);
+  free (cgroup_str);
+
+  return cpu_quota;
+}
+#endif
+
+
+/* Report the cgroupv2 CPU quota, but only when the current scheduler
+   honors such quotas; otherwise report ULONG_MAX.
+   The reported value is always >= 1.  */
+static unsigned long int
+cpu_quota (void)
+{
+  unsigned long int limit = ULONG_MAX;
+
+#if defined __linux__ || defined __ANDROID__
+# if HAVE_SCHED_GETAFFINITY_LIKE_GLIBC && defined SCHED_DEADLINE
+  /* Reaching here means <sched.h> is new enough to define SCHED_DEADLINE.  */
+  int policy = sched_getscheduler (0);
+
+  /* LIMIT stays at ULONG_MAX when the policy query fails (-1),
+     or when the policy is one of SCHED_FIFO, SCHED_RR
+     or SCHED_DEADLINE.
+     Under every other policy, the cgroup2 quota
+     of the current process is looked up.  */
+  if (policy != -1
+      && policy != SCHED_FIFO
+      && policy != SCHED_RR
+      && policy != SCHED_DEADLINE)
+    limit = get_cgroup2_cpu_quota ();
+# endif
+#endif
+
+  return limit;
}
/* Parse OMP environment variables without dependence on OMP.
@@ -416,13 +575,13 @@ parse_omp_threads (char const* threads)
unsigned long int
num_processors (enum nproc_query query)
{
- unsigned long int omp_env_limit = ULONG_MAX;
+ unsigned long int nproc_limit = ULONG_MAX;
+ /* Honor the OpenMP environment variables, recognized also by all
+ programs that are based on OpenMP. */
if (query == NPROC_CURRENT_OVERRIDABLE)
{
- unsigned long int omp_env_threads;
- /* Honor the OpenMP environment variables, recognized also by all
- programs that are based on OpenMP. */
+ unsigned long int omp_env_threads, omp_env_limit;
omp_env_threads = parse_omp_threads (getenv ("OMP_NUM_THREADS"));
omp_env_limit = parse_omp_threads (getenv ("OMP_THREAD_LIMIT"));
if (! omp_env_limit)
@@ -431,14 +590,22 @@ num_processors (enum nproc_query query)
if (omp_env_threads)
return MIN (omp_env_threads, omp_env_limit);
+ nproc_limit = omp_env_limit;
query = NPROC_CURRENT;
}
- /* Here query is one of NPROC_ALL, NPROC_CURRENT. */
- if (omp_env_limit == 1)
- /* No need to even call num_processors_ignoring_omp (query). */
- return 1;
- {
- unsigned long nprocs = num_processors_ignoring_omp (query);
- return MIN (nprocs, omp_env_limit);
- }
+
+ /* Honor any CPU quotas. */
+ if (query == NPROC_CURRENT && nproc_limit > NPROC_MINIMUM)
+ {
+ unsigned long int quota = cpu_quota ();
+ nproc_limit = MIN (quota, nproc_limit);
+ }
+
+ if (nproc_limit > NPROC_MINIMUM)
+ {
+ unsigned long nprocs = num_processors_available (query);
+ nproc_limit = MIN (nprocs, nproc_limit);
+ }
+
+ return nproc_limit;
}
diff --git a/m4/nproc.m4 b/m4/nproc.m4
index 48c239be06..5e96afc93a 100644
--- a/m4/nproc.m4
+++ b/m4/nproc.m4
@@ -17,7 +17,7 @@ AC_DEFUN([gl_PREREQ_NPROC],
dnl Persuade glibc <sched.h> to declare CPU_SETSIZE, CPU_ISSET etc.
AC_REQUIRE([AC_USE_SYSTEM_EXTENSIONS])
- AC_CHECK_HEADERS([sys/pstat.h sys/sysmp.h sys/param.h],,,
+ AC_CHECK_HEADERS([mntent.h sys/pstat.h sys/sysmp.h sys/param.h],,,
[AC_INCLUDES_DEFAULT])
dnl <sys/sysctl.h> requires <sys/param.h> on OpenBSD 4.0.
AC_CHECK_HEADERS([sys/sysctl.h],,,
diff --git a/tests/test-nproc.c b/tests/test-nproc.c
new file mode 100644
index 0000000000..2c7406f95d
--- /dev/null
+++ b/tests/test-nproc.c
@@ -0,0 +1,65 @@
+/*
+nproc honors cgroup v2 CPU quotas
+and was tested in coreutils on a Fedora 42 system as follows:
+
+# Note we honor a limit anywhere in /proc/self/cgroup hierarchy
+# so apply settings below in the parent cgroup of the current process
+$ nested_cgroup=/sys/fs/cgroup/$(dirname $(cut -d/ -f2- /proc/self/cgroup))
+$ echo $nested_cgroup
+/sys/fs/cgroup/user.slice/user-1001.slice/[email protected]/app.slice
+
+# This test system has 4 CPUs
+$ src/nproc
+4
+
+# Behave like MAX (1, (int)round(quota/period))
+$ echo "100000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+1
+$ echo "90000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+1
+$ echo "140000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+1
+$ echo "150000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+2
+
+# Ensure NPROC_ALL takes precedence
+$ echo "100000 100000" > $nested_cgroup/cpu.max
+$ src/nproc --all
+4
+
+# Ensure OMP env vars have appropriate precedence
+$ echo "200000 100000" > $nested_cgroup/cpu.max
+$ OMP_NUM_THREADS=10 src/nproc
+10
+$ OMP_THREAD_LIMIT=10 src/nproc
+2
+
+# Ensure quota only reduces
+$ echo "500000 100000" > $nested_cgroup/cpu.max
+$ src/nproc
+4
+
+# Restore system to unlimited
+$ echo "max 100000" > $nested_cgroup/cpu.max
+
+# Test quota in root hierarchy
+$ podman run --cpus=2 -i --rm fedora:latest /tmp/nproc
+2
+$ podman run --cpus=1.5 -i --rm fedora:latest /tmp/nproc
+2
+$ podman run --cpus=1.4 -i --rm fedora:latest /tmp/nproc
+1
+$ podman run --cpus=100 -i --rm fedora:latest /tmp/nproc
+4
+
+# Docker is similar to podman, but explicitly limits max allowable
+$ docker run --cpus=1.4 -i --rm fedora:latest /tmp/nproc
+1
+$ docker run --cpus=100 -i --rm fedora:latest /tmp/nproc
+docker: Error response from daemon:
+range of CPUs is from 0.01 to 4.00, as there are only 4 CPUs
+*/
--
2.50.1