commit: 9c63c4541ffb891dcb6cbf94c8f5ad8d643c684f Author: Zac Medico <zmedico <AT> gentoo <DOT> org> AuthorDate: Mon Oct 27 22:28:38 2025 +0000 Commit: Zac Medico <zmedico <AT> gentoo <DOT> org> CommitDate: Mon Oct 27 23:25:29 2025 +0000 URL: https://gitweb.gentoo.org/proj/portage.git/commit/?id=9c63c454
egencache: retry metadata process with unexpected returncode Since MetadataRegen is not well suited for internal retry, create a new MetadataRegen instance for each retry. Returns the returncode from the last MetadataRegen instance, which will only be non-zero if all retries failed. Since we only observe bug 965132 in the python3.14 forkserver tests, it seems like this must be some kind of forkserver bug that causes our bash process to occasionally silently fail with returncode 255. It may be very difficult to find the root cause of this forkserver issue, and we can't predict how soon it will be fixed. Meanwhile, I think we will appreciate the retry if it eliminates our intermittent CI failures. Another possible solution would be to force the multiprocessing start method to spawn inside the egencache process (like _EbuildFetcherProcess). Bug: https://bugs.gentoo.org/965132 Signed-off-by: Zac Medico <zmedico <AT> gentoo.org> bin/egencache | 29 +++++++++++++++++-------- lib/_emerge/MetadataRegen.py | 51 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 70 insertions(+), 10 deletions(-) diff --git a/bin/egencache b/bin/egencache index 6ed455dd4c..d36ae5d32e 100755 --- a/bin/egencache +++ b/bin/egencache @@ -65,6 +65,7 @@ try: ) from portage.util import cmp_sort_key, writemsg_level, no_color from portage.util._async.AsyncFunction import AsyncFunction + from portage.util._async.AsyncTaskFuture import AsyncTaskFuture from portage.util._async.run_main_scheduler import run_main_scheduler from portage.util._async.TaskScheduler import TaskScheduler from portage.util._eventloop.global_event_loop import global_event_loop @@ -72,7 +73,7 @@ try: from portage import cpv_getkey from portage.dep import Atom, isjustname from portage.versions import vercmp - from _emerge.MetadataRegen import MetadataRegen + from _emerge.MetadataRegen import metadata_regen_retry try: from xml.etree import ElementTree @@ -320,15 +321,22 @@ try: write_auxdb = ( external_cache_only or 
"metadata-transfer" in portdb.settings.features ) - self._regen = MetadataRegen( - portdb, - cp_iter=cp_iter, - consumer=self._metadata_callback, - max_jobs=max_jobs, - max_load=max_load, - write_auxdb=write_auxdb, - main=True, + self._regen_task = AsyncTaskFuture( + future=metadata_regen_retry( + portdb, + cp_iter=cp_iter, + consumer=self._metadata_callback, + max_jobs=max_jobs, + max_load=max_load, + write_auxdb=write_auxdb, + main=True, + ) ) + self._regen = TaskScheduler( + iter([self._regen_task]), + event_loop=global_event_loop(), + ) + self.returncode = os.EX_OK conf = portdb.repositories.get_repo_for_location(tree) if external_cache_only: @@ -446,6 +454,9 @@ try: self.returncode |= self._regen.returncode + # Raise an unexpected exception if one occurred. + self._regen_task.future.result() + for trg_cache in self._trg_caches: self._cleanse_cache(trg_cache) diff --git a/lib/_emerge/MetadataRegen.py b/lib/_emerge/MetadataRegen.py index 496e6295c9..55eb690e78 100644 --- a/lib/_emerge/MetadataRegen.py +++ b/lib/_emerge/MetadataRegen.py @@ -10,6 +10,46 @@ from portage.dep import _repo_separator from portage.util._async.AsyncScheduler import AsyncScheduler +async def metadata_regen_retry(*args, max_tries=3, **kwargs) -> int: + """ + Since MetadataRegen is not well suited for internal retry, create + a new MetadataRegen instance for each retry. Returns the returncode + from the last MetadataRegen instance, which will only be non-zero + if all retries failed. 
+ """ + tries = max_tries + scheduler = MetadataRegen(*args, **kwargs) + scheduler.start() + try: + await scheduler.async_wait() + except asyncio.CancelledError: + scheduler.terminate() + await scheduler.async_wait() + raise + cpv_failed = scheduler.cpv_failed.copy() + tries -= 1 + while scheduler.cp_retry and tries > 0: + kwargs["cp_iter"] = iter(scheduler.cp_retry) + scheduler = MetadataRegen(*args, **kwargs) + scheduler.start() + try: + await scheduler.async_wait() + except asyncio.CancelledError: + scheduler.terminate() + await scheduler.async_wait() + raise + cpv_failed.update(scheduler.cpv_failed) + cpv_failed.difference_update(scheduler.cpv_successful) + tries -= 1 + + # Account for failures from all MetadataRegen instances, since we + # only retry when the returncode is unexpected. + if cpv_failed: + scheduler.returncode |= 1 + + return scheduler.returncode + + class MetadataRegen(AsyncScheduler): def __init__(self, portdb, cp_iter=None, consumer=None, write_auxdb=True, **kwargs): AsyncScheduler.__init__(self, **kwargs) @@ -28,6 +68,9 @@ class MetadataRegen(AsyncScheduler): self._cp_set = set() self._process_iter = self._iter_metadata_processes() self._running_tasks = set() + self.cp_retry = set() + self.cpv_failed = set() + self.cpv_successful = set() def _next_task(self): return next(self._process_iter) @@ -152,7 +195,13 @@ class MetadataRegen(AsyncScheduler): portdb.flush_cache() def _task_exit(self, metadata_process): - if metadata_process.returncode != os.EX_OK: + if metadata_process.returncode == os.EX_OK: + self.cpv_successful.add(metadata_process.cpv) + else: + self.cpv_failed.add(metadata_process.cpv) + if metadata_process.returncode != 1: + # Retry if the returncode was unexpected. + self.cp_retry.add(metadata_process.cpv.cp) self._valid_pkgs.discard(metadata_process.cpv) if not self._terminated_tasks: portage.writemsg(
