Hi Bruno, A while ago I said that I would have a look at optimizing the --extract-recursive-dependencies option you implemented in gnulib-tool. Then I forgot about it.
I've written and pushed the change finally. Here was my testing: $ time gnulib-tool --extract-recursive-dependents c99 > extract-recursive-dependents-c99-old real 3m53.437s user 0m41.646s sys 3m24.750s $ time gnulib-tool --extract-recursive-dependents c99 > extract-recursive-dependents-c99-new real 0m7.485s user 0m7.078s sys 0m0.312s $ cmp extract-recursive-dependents-c99-old extract-recursive-dependents-c99-new $ echo $? 0 $ time gnulib-tool --extract-recursive-dependents stdlib > extract-recursive-dependents-stdlib-old real 1m53.326s user 0m19.842s sys 1m40.334s $ time gnulib-tool --extract-recursive-dependents stdlib > extract-recursive-dependents-stdlib-new real 0m3.205s user 0m2.917s sys 0m0.260s $ cmp extract-recursive-dependents-stdlib-old extract-recursive-dependents-stdlib-new $ echo $? 0 Collin
>From 1e0fff5e558ce14e3dc0f7361ac78fd4b2a2d477 Mon Sep 17 00:00:00 2001 From: Collin Funk <collin.fu...@gmail.com> Date: Tue, 3 Dec 2024 19:17:52 -0800 Subject: [PATCH] gnulib-tool.py: Optimize --extract-recursive-dependencies. * pygnulib/GLModuleSystem.py (GLModuleSystem.list): Add optional argument to include test modules. (GLModule._getDependents) New function. (GLModule.getDependents): Use it. (GLModule.getDependentsRecursively): Likewise. (GLModule._getAllModules): New function. --- ChangeLog | 10 +++++ pygnulib/GLModuleSystem.py | 91 ++++++++++++++++---------------------- 2 files changed, 48 insertions(+), 53 deletions(-) diff --git a/ChangeLog b/ChangeLog index 74e5a96323..48e2a3299d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +2024-12-03 Collin Funk <collin.fu...@gmail.com> + + gnulib-tool.py: Optimize --extract-recursive-dependencies. + * pygnulib/GLModuleSystem.py (GLModuleSystem.list): Add optional + argument to include test modules. + (GLModule._getDependents) New function. + (GLModule.getDependents): Use it. + (GLModule.getDependentsRecursively): Likewise. + (GLModule._getAllModules): New function. + 2024-12-03 Bruno Haible <br...@clisp.org> strerror_r-posix: Silence gcc 14 warning. diff --git a/pygnulib/GLModuleSystem.py b/pygnulib/GLModuleSystem.py index 4185765ab2..7d68b712e0 100644 --- a/pygnulib/GLModuleSystem.py +++ b/pygnulib/GLModuleSystem.py @@ -137,10 +137,12 @@ def file_is_module(self, filename: str) -> bool: or filename.endswith('.rej') or filename.endswith('~')) - def list(self) -> list[str]: + def list(self, with_tests: bool | None = False) -> list[str]: '''Return the available module names as tuple. We could use a combination of os.walk() function and re module. However, it takes too much time to - complete, so this version uses subprocess to run shell commands.''' + complete, so this version uses subprocess to run shell commands. + Arguments are: + - with_tests: Include test module names, optional argument.''' result = '' localpath = self.config['localpath'] find_args = ['find', 'modules', '-type', 'f', '-print'] @@ -166,7 +168,12 @@ def list(self) -> list[str]: # Filter out undesired file names. listing = [ line for line in listing - if self.file_is_module(line) and not _isTestsModuleName(line) ] + if self.file_is_module(line) ] + # Check if test modules should be filtered out. + if not with_tests: + listing = [ line + for line in listing + if not _isTestsModuleName(line) ] modules = sorted(set(listing)) return modules @@ -557,60 +564,36 @@ def getDependenciesRecursively(self) -> set[GLModule]: return outmodules - def getDependents(self) -> list[GLModule]: - '''Return list of dependents (a.k.a. "reverse dependencies"), - as a list of GLModule objects. - GLConfig: localpath.''' + def _getAllModules(self) -> list[GLModule]: + '''Return a list of all modules as a list of GLModule objects.''' + module_names = self.modulesystem.list(True) + modules = [ self.modulesystem.find(module) + for module in module_names + if module != ''] + modules = [ module + for module in modules + if module is not None ] + return modules + + def _getDependents(self, modules: list[GLModule] | None = None) -> list[GLModule]: + '''Internal function for getDependents and getDependentsRecursively + accepting an optional argument modules.''' if 'dependents' not in self.cache: - localpath = self.config['localpath'] - # Find a set of module candidates quickly. - # TODO: Optimize. This approach is fine for a single getDependents - # invocation, but not for 100 or 1000 of them. - # Convert the module name to a POSIX basic regex. - # Needs to handle . [ \ * ^ $. - regex = self.name.replace('\\', '\\\\').replace('[', '\\[').replace('^', '\\^') - regex = re.compile(r'([.*$])').sub(r'[\1]', regex) - line_regex = '^' + regex - # We can't add a '$' to line_regex, because that would fail to match - # lines that denote conditional dependencies. We could invoke grep - # twice, once to search for line_regex + '$' and once to search - # for line_regex + [ <TAB>] but that would be twice as slow. - # Read module candidates from gnulib root directory. - command = "find modules -type f -print | xargs -n 100 grep -l %s /dev/null | sed -e 's,^modules/,,'" % shlex.quote(line_regex) - with sp.Popen(command, shell=True, cwd=DIRS['root'], stdout=sp.PIPE) as proc: - result = proc.stdout.read().decode('UTF-8') - # Read module candidates from local directories. - if localpath != None and len(localpath) > 0: - command = "find modules -type f -print | xargs -n 100 grep -l %s /dev/null | sed -e 's,^modules/,,' -e 's,\\.diff$,,'" % shlex.quote(line_regex) - for localdir in localpath: - with sp.Popen(command, shell=True, cwd=localdir, stdout=sp.PIPE) as proc: - result += proc.stdout.read().decode('UTF-8') - listing = [ line - for line in result.split('\n') - if line.strip() ] - # Remove modules/ prefix from each file name. - pattern = re.compile(r'^modules/') - listing = [ pattern.sub('', line) - for line in listing ] - # Filter out undesired file names. - listing = [ line - for line in listing - if self.modulesystem.file_is_module(line) ] - # ${module}-tests implicitly depends on ${module}, if both exist. - if self.isNonTests(): - implicit_dependent = self.name+'-tests' - if self.modulesystem.exists(implicit_dependent): - listing.append(implicit_dependent) - candidates = sorted(set(listing)) result = [] - for name in candidates: - module = self.modulesystem.find(name) - if module: # Ignore module candidates that don't actually exist. - if self in module.getDependenciesWithoutConditions(): - result.append(module) + if modules is None: + modules = self._getAllModules() + for module in modules: + if self in set(module.getDependenciesWithoutConditions()): + result.append(module) self.cache['dependents'] = result return self.cache['dependents'] + def getDependents(self) -> list[GLModule]: + '''Return list of dependents (a.k.a. "reverse dependencies"), + as a list of GLModule objects. + GLConfig: localpath.''' + return self._getDependents() + def getDependentsRecursively(self) -> set[GLModule]: '''Return a list of recursive dependents of this module, as a set of GLModule objects.''' @@ -618,6 +601,8 @@ def getDependentsRecursively(self) -> set[GLModule]: inmodules = set() outmodules = set() + modules = self._getAllModules() + # In order to process every module only once (for speed), process an "input # list" of modules, producing an "output list" of modules. During each round, # more modules can be queued in the input list. Once a module on the input @@ -629,7 +614,7 @@ def getDependentsRecursively(self) -> set[GLModule]: inmodules = set() # Accumulator, queue for next round for module in inmodules_this_round: outmodules.add(module) - inmodules = inmodules.union(module.getDependents()) + inmodules = inmodules.union(module._getDependents(modules)) handledmodules = handledmodules.union(inmodules_this_round) # Remove handledmodules from inmodules. inmodules = inmodules.difference(handledmodules) -- 2.47.1