commit:     9c63c4541ffb891dcb6cbf94c8f5ad8d643c684f
Author:     Zac Medico <zmedico <AT> gentoo <DOT> org>
AuthorDate: Mon Oct 27 22:28:38 2025 +0000
Commit:     Zac Medico <zmedico <AT> gentoo <DOT> org>
CommitDate: Mon Oct 27 23:25:29 2025 +0000
URL:        https://gitweb.gentoo.org/proj/portage.git/commit/?id=9c63c454

egencache: retry metadata process with unexpected returncode

Since MetadataRegen is not well suited for internal retry, create
a new MetadataRegen instance for each retry. Returns the returncode
from the last MetadataRegen instance, which will only be non-zero
if all retries failed.

Since we only observe bug 965132 in the python3.14 forkserver tests,
it seems like this must be some kind of forkserver bug that causes
our bash process to occasionally silently fail with returncode 255.

It may be very difficult to find the root cause of this forkserver
issue, and we can't predict how soon it will be fixed. Meanwhile,
I think we will appreciate the retry if it eliminates our
intermittent CI failures.

Another possible solution would be to force the multiprocessing
start method to spawn inside the egencache process (like
_EbuildFetcherProcess).

Bug: https://bugs.gentoo.org/965132
Signed-off-by: Zac Medico <zmedico <AT> gentoo.org>

 bin/egencache                | 29 +++++++++++++++++--------
 lib/_emerge/MetadataRegen.py | 51 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/bin/egencache b/bin/egencache
index 6ed455dd4c..d36ae5d32e 100755
--- a/bin/egencache
+++ b/bin/egencache
@@ -65,6 +65,7 @@ try:
     )
     from portage.util import cmp_sort_key, writemsg_level, no_color
     from portage.util._async.AsyncFunction import AsyncFunction
+    from portage.util._async.AsyncTaskFuture import AsyncTaskFuture
     from portage.util._async.run_main_scheduler import run_main_scheduler
     from portage.util._async.TaskScheduler import TaskScheduler
     from portage.util._eventloop.global_event_loop import global_event_loop
@@ -72,7 +73,7 @@ try:
     from portage import cpv_getkey
     from portage.dep import Atom, isjustname
     from portage.versions import vercmp
-    from _emerge.MetadataRegen import MetadataRegen
+    from _emerge.MetadataRegen import metadata_regen_retry
 
     try:
         from xml.etree import ElementTree
@@ -320,15 +321,22 @@ try:
             write_auxdb = (
                 external_cache_only or "metadata-transfer" in portdb.settings.features
             )
-            self._regen = MetadataRegen(
-                portdb,
-                cp_iter=cp_iter,
-                consumer=self._metadata_callback,
-                max_jobs=max_jobs,
-                max_load=max_load,
-                write_auxdb=write_auxdb,
-                main=True,
+            self._regen_task = AsyncTaskFuture(
+                future=metadata_regen_retry(
+                    portdb,
+                    cp_iter=cp_iter,
+                    consumer=self._metadata_callback,
+                    max_jobs=max_jobs,
+                    max_load=max_load,
+                    write_auxdb=write_auxdb,
+                    main=True,
+                )
             )
+            self._regen = TaskScheduler(
+                iter([self._regen_task]),
+                event_loop=global_event_loop(),
+            )
+
             self.returncode = os.EX_OK
             conf = portdb.repositories.get_repo_for_location(tree)
             if external_cache_only:
@@ -446,6 +454,9 @@ try:
 
             self.returncode |= self._regen.returncode
 
+            # Raise an unexpected exception if one occurred.
+            self._regen_task.future.result()
+
             for trg_cache in self._trg_caches:
                 self._cleanse_cache(trg_cache)
 

diff --git a/lib/_emerge/MetadataRegen.py b/lib/_emerge/MetadataRegen.py
index 496e6295c9..55eb690e78 100644
--- a/lib/_emerge/MetadataRegen.py
+++ b/lib/_emerge/MetadataRegen.py
@@ -10,6 +10,46 @@ from portage.dep import _repo_separator
 from portage.util._async.AsyncScheduler import AsyncScheduler
 
 
+async def metadata_regen_retry(*args, max_tries=3, **kwargs) -> int:
+    """
+    Since MetadataRegen is not well suited for internal retry, create
+    a new MetadataRegen instance for each retry. Returns the returncode
+    from the last MetadataRegen instance, which will only be non-zero
+    if all retries failed.
+    """
+    tries = max_tries
+    scheduler = MetadataRegen(*args, **kwargs)
+    scheduler.start()
+    try:
+        await scheduler.async_wait()
+    except asyncio.CancelledError:
+        scheduler.terminate()
+        await scheduler.async_wait()
+        raise
+    cpv_failed = scheduler.cpv_failed.copy()
+    tries -= 1
+    while scheduler.cp_retry and tries > 0:
+        kwargs["cp_iter"] = iter(scheduler.cp_retry)
+        scheduler = MetadataRegen(*args, **kwargs)
+        scheduler.start()
+        try:
+            await scheduler.async_wait()
+        except asyncio.CancelledError:
+            scheduler.terminate()
+            await scheduler.async_wait()
+            raise
+        cpv_failed.update(scheduler.cpv_failed)
+        cpv_failed.difference_update(scheduler.cpv_successful)
+        tries -= 1
+
+    # Account for failures from all MetadataRegen instances, since we
+    # only retry when the returncode is unexpected.
+    if cpv_failed:
+        scheduler.returncode |= 1
+
+    return scheduler.returncode
+
+
 class MetadataRegen(AsyncScheduler):
     def __init__(self, portdb, cp_iter=None, consumer=None, write_auxdb=True, **kwargs):
         AsyncScheduler.__init__(self, **kwargs)
@@ -28,6 +68,9 @@ class MetadataRegen(AsyncScheduler):
         self._cp_set = set()
         self._process_iter = self._iter_metadata_processes()
         self._running_tasks = set()
+        self.cp_retry = set()
+        self.cpv_failed = set()
+        self.cpv_successful = set()
 
     def _next_task(self):
         return next(self._process_iter)
@@ -152,7 +195,13 @@ class MetadataRegen(AsyncScheduler):
         portdb.flush_cache()
 
     def _task_exit(self, metadata_process):
-        if metadata_process.returncode != os.EX_OK:
+        if metadata_process.returncode == os.EX_OK:
+            self.cpv_successful.add(metadata_process.cpv)
+        else:
+            self.cpv_failed.add(metadata_process.cpv)
+            if metadata_process.returncode != 1:
+                # Retry if the returncode was unexpected.
+                self.cp_retry.add(metadata_process.cpv.cp)
             self._valid_pkgs.discard(metadata_process.cpv)
             if not self._terminated_tasks:
                 portage.writemsg(

Reply via email to