commit:     448b7748adb4494b8a7dbfb5293437c1100c8977
Author:     Zac Medico <zmedico <AT> gentoo <DOT> org>
AuthorDate: Thu Oct 30 03:30:25 2025 +0000
Commit:     Zac Medico <zmedico <AT> gentoo <DOT> org>
CommitDate: Thu Oct 30 03:53:25 2025 +0000
URL:        https://gitweb.gentoo.org/proj/portage.git/commit/?id=448b7748

egencache: retry manifest process with unexpected returncode

Retry for an intermittent unexpected returncode which
occurs in CI runs with forkserver (bug 965132). In CI
the unexpected returncode tends to be 255 which indicates
that the forkserver exited unexpectedly.

Bug: https://bugs.gentoo.org/965132
Signed-off-by: Zac Medico <zmedico <AT> gentoo.org>

 bin/egencache                                      | 27 +++++++----
 .../ebuild/_parallel_manifest/ManifestScheduler.py | 54 +++++++++++++++++++++-
 2 files changed, 70 insertions(+), 11 deletions(-)

diff --git a/bin/egencache b/bin/egencache
index d36ae5d32e..6b131360d5 100755
--- a/bin/egencache
+++ b/bin/egencache
@@ -61,7 +61,7 @@ try:
     from portage.dep import _repo_separator
     from portage.output import colorize, EOutput
     from portage.package.ebuild._parallel_manifest.ManifestScheduler import (
-        ManifestScheduler,
+        manifest_scheduler_retry,
     )
     from portage.util import cmp_sort_key, writemsg_level, no_color
     from portage.util._async.AsyncFunction import AsyncFunction
@@ -1351,21 +1351,28 @@ try:
                 cp_iter = iter(atoms)
 
             event_loop = global_event_loop()
-            scheduler = ManifestScheduler(
-                portdb,
-                cp_iter=cp_iter,
-                gpg_cmd=gpg_cmd,
-                gpg_vars=gpg_vars,
-                force_sign_key=force_sign_key,
-                max_jobs=options.jobs,
-                max_load=options.load_average,
-                event_loop=event_loop,
+            manifest_task = AsyncTaskFuture(
+                future=manifest_scheduler_retry(
+                    portdb,
+                    cp_iter=cp_iter,
+                    gpg_cmd=gpg_cmd,
+                    gpg_vars=gpg_vars,
+                    force_sign_key=force_sign_key,
+                    max_jobs=options.jobs,
+                    max_load=options.load_average,
+                    event_loop=event_loop,
+                )
             )
 
+            scheduler = TaskScheduler(iter([manifest_task]), event_loop=event_loop)
+
             signum = run_main_scheduler(scheduler)
             if signum is not None:
                 sys.exit(128 + signum)
 
+            # Raise an unexpected exception if one occurred.
+            manifest_task.future.result()
+
             if options.tolerant:
                 ret.append(os.EX_OK)
             else:

diff --git a/lib/portage/package/ebuild/_parallel_manifest/ManifestScheduler.py b/lib/portage/package/ebuild/_parallel_manifest/ManifestScheduler.py
index 36372d228d..d773d04da9 100644
--- a/lib/portage/package/ebuild/_parallel_manifest/ManifestScheduler.py
+++ b/lib/portage/package/ebuild/_parallel_manifest/ManifestScheduler.py
@@ -1,6 +1,8 @@
 # Copyright 2012-2025 Gentoo Authors
 # Distributed under the terms of the GNU General Public License v2
 
+import asyncio
+
 import portage
 from portage import os
 from portage.dbapi.porttree import _async_manifest_fetchlist
@@ -10,6 +12,46 @@ from portage.util._async.AsyncScheduler import AsyncScheduler
 from .ManifestTask import ManifestTask
 
 
+async def manifest_scheduler_retry(*args, max_tries=3, **kwargs) -> int:
+    """
+    Since ManifestScheduler is not well suited for internal retry, create
+    a new ManifestScheduler instance for each retry. Returns the returncode
+    from the last ManifestScheduler instance, which will only be non-zero
+    if all retries failed.
+    """
+    tries = max_tries
+    scheduler = ManifestScheduler(*args, **kwargs)
+    scheduler.start()
+    try:
+        await scheduler.async_wait()
+    except asyncio.CancelledError:
+        scheduler.terminate()
+        await scheduler.async_wait()
+        raise
+    cp_failed = scheduler.cp_failed.copy()
+    tries -= 1
+    while scheduler.cp_retry and tries > 0:
+        kwargs["cp_iter"] = iter(scheduler.cp_retry)
+        scheduler = ManifestScheduler(*args, **kwargs)
+        scheduler.start()
+        try:
+            await scheduler.async_wait()
+        except asyncio.CancelledError:
+            scheduler.terminate()
+            await scheduler.async_wait()
+            raise
+        cp_failed.update(scheduler.cp_failed)
+        cp_failed.difference_update(scheduler.cp_successful)
+        tries -= 1
+
+    # Account for failures from all ManifestScheduler instances, since we
+    # only retry when the returncode is unexpected.
+    if cp_failed:
+        scheduler.returncode |= 1
+
+    return scheduler.returncode
+
+
 class ManifestScheduler(AsyncScheduler):
     def __init__(
         self,
@@ -31,6 +73,9 @@ class ManifestScheduler(AsyncScheduler):
         self._gpg_vars = gpg_vars
         self._force_sign_key = force_sign_key
         self._task_iter = self._iter_tasks()
+        self.cp_retry = set()
+        self.cp_failed = set()
+        self.cp_successful = set()
 
     def _next_task(self):
         return next(self._task_iter)
@@ -92,7 +137,14 @@ class ManifestScheduler(AsyncScheduler):
                 )
 
     def _task_exit(self, task):
-        if task.returncode != os.EX_OK:
+        if task.returncode == os.EX_OK:
+            self.cp_successful.add(task.cp)
+        else:
+            self.cp_failed.add(task.cp)
+            if task.returncode != 1:
+                # Retry if the returncode was unexpected.
+                self.cp_retry.add(task.cp)
+
             if not self._terminated_tasks:
                 portage.writemsg(
                     "Error processing %s%s%s with returncode %s, continuing...\n"

Reply via email to