Hi Collin, > I believe the slowness in --extract-dependents caused by my previous > patch was caused by the parsing of all files in modules/* done during > GLModules.__init__().
Yes. > Your implementation was faster for the non-recursive case, so I've added > it back with very minor changes and used an optional top_level argument. Thanks. Reviewing the new code now, I see a couple of details that can be improved: * The top_level argument is not needed. When I had suggested it, I had not seen that you already had split the function into two functions (_getDependents and getDependents). Patch 0001 eliminates it. * getDependents is storing its result in a cache, but the cache is never read. This makes no sense. Either cache or not cache. I choose to cache it (just in case main.py needs to make the same invocation twice). Done in patch 0001 as well. * _getAllModules does not use the 'self' argument. This indicates that the method is on the wrong class. It's better done as a method on GLModuleSystem. Done in patch 0002. * _getAllModules does not need to eliminate empty module names. They don't occur; I verified that. Also done in patch 0002. * _getDependents has an incomplete documentation. Just saying "Internal function for getDependentsRecursively" is redundant. Fixed in patch 0003. * Since the result of _getDependents depends on the modules list passed as argument, it is wrong to cache the result in self.cache[...]. Only functions without additional arguments are allowed to cache their values. Fortunately this caching is not needed, since the loop in getDependentsRecursively makes sure to not make repeated calls. Also done in patch 0003. * The optional argument to _getDependents is overkill; it can be made a required argument. Also done in patch 0003. Bruno
>From d72b2930e61c2d858254457b634142f43e3f617a Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Thu, 5 Dec 2024 09:14:49 +0100 Subject: [PATCH 1/3] gnulib-tool.py: Remove dead code. * pygnulib/GLModuleSystem.py (GLModule.getDependents): Remove top_level parameter. Use the cached value if present. * pygnulib/main.py: Update accordingly. --- ChangeLog | 7 +++ pygnulib/GLModuleSystem.py | 98 +++++++++++++++++++------------------- pygnulib/main.py | 2 +- 3 files changed, 57 insertions(+), 50 deletions(-) diff --git a/ChangeLog b/ChangeLog index c783488d21..771c225c5e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2024-12-05 Bruno Haible <br...@clisp.org> + + gnulib-tool.py: Remove dead code. + * pygnulib/GLModuleSystem.py (GLModule.getDependents): Remove top_level + parameter. Use the cached value if present. + * pygnulib/main.py: Update accordingly. + 2024-12-04 Collin Funk <collin.fu...@gmail.com> gnulib-tool.py: Make --extract-dependents quick again. diff --git a/pygnulib/GLModuleSystem.py b/pygnulib/GLModuleSystem.py index dda1cc3693..5182d9ac99 100644 --- a/pygnulib/GLModuleSystem.py +++ b/pygnulib/GLModuleSystem.py @@ -576,7 +576,7 @@ def _getAllModules(self) -> list[GLModule]: return modules def _getDependents(self, modules: list[GLModule] | None = None) -> list[GLModule]: - '''Internal function for getDependents and getDependentsRecursively + '''Internal function for getDependentsRecursively accepting an optional argument modules.''' if 'dependents' not in self.cache: result = [] @@ -588,56 +588,56 @@ def _getDependents(self, modules: list[GLModule] | None = None) -> list[GLModule self.cache['dependents'] = result return self.cache['dependents'] - def getDependents(self, top_level: bool | None = True) -> list[GLModule]: + def getDependents(self) -> list[GLModule]: '''Return list of dependents (a.k.a. "reverse dependencies"), as a list of GLModule objects. - GLConfig: localpath. Arguments are: - - top_level: Optional argument, to use an optimized version from main().''' - # Only use optimized version, below if called from the main(). - if not top_level: - return self._getDependents() - localpath = self.config['localpath'] - # Find a set of module candidates quickly. - # Convert the module name to a POSIX basic regex. - # Needs to handle . [ \ * ^ $. - regex = self.name.replace('\\', '\\\\').replace('[', '\\[').replace('^', '\\^') - regex = re.compile(r'([.*$])').sub(r'[\1]', regex) - line_regex = '^' + regex - # We can't add a '$' to line_regex, because that would fail to match - # lines that denote conditional dependencies. We could invoke grep - # twice, once to search for line_regex + '$' and once to search - # for line_regex + [ <TAB>] but that would be twice as slow. - # Read module candidates from gnulib root directory. - command = "find modules -type f -print | xargs -n 100 grep -l %s /dev/null | sed -e 's,^modules/,,'" % shlex.quote(line_regex) - result = sp.run(command, shell=True, cwd=DIRS['root'], capture_output=True).stdout.decode('utf-8') - if localpath != None and len(localpath) > 0: - command = "find modules -type f -print | xargs -n 100 grep -l %s /dev/null | sed -e 's,^modules/,,' -e 's,\\.diff$,,'" % shlex.quote(line_regex) - for localdir in localpath: - result += sp.run(command, shell=True, cwd=localdir, capture_output=True).stdout.decode('utf-8') - listing = [ line - for line in result.split('\n') - if line.strip() ] - # Remove modules/ prefix from each file name. - pattern = re.compile(r'^modules/') - listing = [ pattern.sub('', line) - for line in listing ] - # Filter out undesired file names. - listing = [ line - for line in listing - if self.modulesystem.file_is_module(line) ] - # ${module}-tests implicitly depends on ${module}, if both exist. - if self.isNonTests(): - implicit_dependent = self.name+'-tests' - if self.modulesystem.exists(implicit_dependent): - listing.append(implicit_dependent) - candidates = sorted(set(listing)) - result = [] - for name in candidates: - module = self.modulesystem.find(name) - if module: # Ignore module candidates that don't actually exist. - if self in module.getDependenciesWithoutConditions(): - result.append(module) - self.cache['dependents'] = result + This implementation is optimized for the case of a single invocation + of getDependents. If you need multiple invocations, better use _getDependents. + GLConfig: localpath.''' + if 'dependents' not in self.cache: + localpath = self.config['localpath'] + # Find a set of module candidates quickly. + # Convert the module name to a POSIX basic regex. + # Needs to handle . [ \ * ^ $. + regex = self.name.replace('\\', '\\\\').replace('[', '\\[').replace('^', '\\^') + regex = re.compile(r'([.*$])').sub(r'[\1]', regex) + line_regex = '^' + regex + # We can't add a '$' to line_regex, because that would fail to match + # lines that denote conditional dependencies. We could invoke grep + # twice, once to search for line_regex + '$' and once to search + # for line_regex + [ <TAB>] but that would be twice as slow. + # Read module candidates from gnulib root directory. + command = "find modules -type f -print | xargs -n 100 grep -l %s /dev/null | sed -e 's,^modules/,,'" % shlex.quote(line_regex) + result = sp.run(command, shell=True, cwd=DIRS['root'], capture_output=True).stdout.decode('utf-8') + # Read module candidates from local directories. + if localpath != None and len(localpath) > 0: + command = "find modules -type f -print | xargs -n 100 grep -l %s /dev/null | sed -e 's,^modules/,,' -e 's,\\.diff$,,'" % shlex.quote(line_regex) + for localdir in localpath: + result += sp.run(command, shell=True, cwd=localdir, capture_output=True).stdout.decode('utf-8') + listing = [ line + for line in result.split('\n') + if line.strip() ] + # Remove modules/ prefix from each file name. + pattern = re.compile(r'^modules/') + listing = [ pattern.sub('', line) + for line in listing ] + # Filter out undesired file names. + listing = [ line + for line in listing + if self.modulesystem.file_is_module(line) ] + # ${module}-tests implicitly depends on ${module}, if both exist. + if self.isNonTests(): + implicit_dependent = self.name+'-tests' + if self.modulesystem.exists(implicit_dependent): + listing.append(implicit_dependent) + candidates = sorted(set(listing)) + result = [] + for name in candidates: + module = self.modulesystem.find(name) + if module: # Ignore module candidates that don't actually exist. + if self in module.getDependenciesWithoutConditions(): + result.append(module) + self.cache['dependents'] = result return self.cache['dependents'] diff --git a/pygnulib/main.py b/pygnulib/main.py index d70b566269..f6aa9d47ea 100644 --- a/pygnulib/main.py +++ b/pygnulib/main.py @@ -1289,7 +1289,7 @@ def main(temp_directory: str) -> None: for name in modules: module = modulesystem.find(name) if module: - dependents = module.getDependents(top_level=True) + dependents = module.getDependents() dependents_names = sorted([ m.name for m in dependents ]) sys.stdout.write(lines_to_multiline(dependents_names)) -- 2.34.1
>From 5efa7c42982786b3bac4c8495c9d6baa5bd1a721 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Thu, 5 Dec 2024 09:28:13 +0100 Subject: [PATCH 2/3] gnulib-tool.py: Refactor. * pygnulib/GLModuleSystem.py (GLModuleSystem.getAllModules): Moved here from GLModule._getAllModules. No need to eliminate empty module names. (GLModule._getDependents, GLModule.getDependentsRecursively): Update. --- ChangeLog | 5 +++++ pygnulib/GLModuleSystem.py | 25 ++++++++++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/ChangeLog b/ChangeLog index 771c225c5e..892ead004f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ 2024-12-05 Bruno Haible <br...@clisp.org> + gnulib-tool.py: Refactor. + * pygnulib/GLModuleSystem.py (GLModuleSystem.getAllModules): Moved here + from GLModule._getAllModules. No need to eliminate empty module names. + (GLModule._getDependents, GLModule.getDependentsRecursively): Update. + gnulib-tool.py: Remove dead code. * pygnulib/GLModuleSystem.py (GLModule.getDependents): Remove top_level parameter. Use the cached value if present. diff --git a/pygnulib/GLModuleSystem.py b/pygnulib/GLModuleSystem.py index 5182d9ac99..1a5c93b0d8 100644 --- a/pygnulib/GLModuleSystem.py +++ b/pygnulib/GLModuleSystem.py @@ -177,6 +177,16 @@ def list(self, with_tests: bool | None = False) -> list[str]: modules = sorted(set(listing)) return modules + def getAllModules(self) -> list[GLModule]: + '''Return a list of all modules as a list of GLModule objects.''' + module_names = self.list(True) + modules = [ self.find(module) + for module in module_names ] + modules = [ module + for module in modules + if module is not None ] + return modules + #=============================================================================== # Define GLModule class @@ -564,24 +574,13 @@ def getDependenciesRecursively(self) -> set[GLModule]: return outmodules - def _getAllModules(self) -> list[GLModule]: - '''Return a list of all modules as a list of GLModule objects.''' - module_names = self.modulesystem.list(True) - modules = [ self.modulesystem.find(module) - for module in module_names - if module != ''] - modules = [ module - for module in modules - if module is not None ] - return modules - def _getDependents(self, modules: list[GLModule] | None = None) -> list[GLModule]: '''Internal function for getDependentsRecursively accepting an optional argument modules.''' if 'dependents' not in self.cache: result = [] if modules is None: - modules = self._getAllModules() + modules = self.modulesystem.getAllModules() for module in modules: if self in set(module.getDependenciesWithoutConditions()): result.append(module) @@ -648,7 +647,7 @@ def getDependentsRecursively(self) -> set[GLModule]: inmodules = set() outmodules = set() - modules = self._getAllModules() + modules = self.modulesystem.getAllModules() # In order to process every module only once (for speed), process an "input # list" of modules, producing an "output list" of modules. During each round, -- 2.34.1
>From ab4cc5f2b5c616b29f750df4347966ef83416b23 Mon Sep 17 00:00:00 2001 From: Bruno Haible <br...@clisp.org> Date: Thu, 5 Dec 2024 09:43:14 +0100 Subject: [PATCH 3/3] gnulib-tool.py: Fix invalid use of cache. * pygnulib/GLModuleSystem.py (GLModule._getDependents): Make the modules argument mandatory. Don't store the result in self.cache. --- ChangeLog | 4 ++++ pygnulib/GLModuleSystem.py | 21 ++++++++------------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/ChangeLog b/ChangeLog index 892ead004f..800faf5fe0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ 2024-12-05 Bruno Haible <br...@clisp.org> + gnulib-tool.py: Fix invalid use of cache. + * pygnulib/GLModuleSystem.py (GLModule._getDependents): Make the modules + argument mandatory. Don't store the result in self.cache. + gnulib-tool.py: Refactor. * pygnulib/GLModuleSystem.py (GLModuleSystem.getAllModules): Moved here from GLModule._getAllModules. No need to eliminate empty module names. diff --git a/pygnulib/GLModuleSystem.py b/pygnulib/GLModuleSystem.py index 1a5c93b0d8..c99cfe25ad 100644 --- a/pygnulib/GLModuleSystem.py +++ b/pygnulib/GLModuleSystem.py @@ -574,19 +574,6 @@ def getDependenciesRecursively(self) -> set[GLModule]: return outmodules - def _getDependents(self, modules: list[GLModule] | None = None) -> list[GLModule]: - '''Internal function for getDependentsRecursively - accepting an optional argument modules.''' - if 'dependents' not in self.cache: - result = [] - if modules is None: - modules = self.modulesystem.getAllModules() - for module in modules: - if self in set(module.getDependenciesWithoutConditions()): - result.append(module) - self.cache['dependents'] = result - return self.cache['dependents'] - def getDependents(self) -> list[GLModule]: '''Return list of dependents (a.k.a. "reverse dependencies"), as a list of GLModule objects. @@ -639,6 +626,14 @@ def getDependents(self) -> list[GLModule]: self.cache['dependents'] = result return self.cache['dependents'] + def _getDependents(self, modules: list[GLModule]) -> list[GLModule]: + '''Return the list of dependents of this module, + among the given set of modules.''' + result = [] + for module in modules: + if self in set(module.getDependenciesWithoutConditions()): + result.append(module) + return result def getDependentsRecursively(self) -> set[GLModule]: '''Return a list of recursive dependents of this module, -- 2.34.1