Hi Bruno,

A while ago I said that I would have a look at optimizing the
--extract-recursive-dependencies option you implemented in gnulib-tool.
Then I forgot about it.

I've written and pushed the change finally.

Here was my testing:

    $ time gnulib-tool --extract-recursive-dependents c99 > 
extract-recursive-dependents-c99-old

    real        3m53.437s
    user        0m41.646s
    sys 3m24.750s
    $ time gnulib-tool --extract-recursive-dependents c99 > 
extract-recursive-dependents-c99-new

    real        0m7.485s
    user        0m7.078s
    sys 0m0.312s
    $ cmp extract-recursive-dependents-c99-old 
extract-recursive-dependents-c99-new 
    $ echo $?
    0
    $ time gnulib-tool --extract-recursive-dependents stdlib > 
extract-recursive-dependents-stdlib-old

    real        1m53.326s
    user        0m19.842s
    sys 1m40.334s
    $ time gnulib-tool --extract-recursive-dependents stdlib > 
extract-recursive-dependents-stdlib-new

    real        0m3.205s
    user        0m2.917s
    sys 0m0.260s
    $ cmp extract-recursive-dependents-stdlib-old 
extract-recursive-dependents-stdlib-new 
    $ echo $?
    0

Collin

>From 1e0fff5e558ce14e3dc0f7361ac78fd4b2a2d477 Mon Sep 17 00:00:00 2001
From: Collin Funk <collin.fu...@gmail.com>
Date: Tue, 3 Dec 2024 19:17:52 -0800
Subject: [PATCH] gnulib-tool.py: Optimize --extract-recursive-dependencies.

* pygnulib/GLModuleSystem.py (GLModuleSystem.list): Add optional
argument to include test modules.
(GLModule._getDependents) New function.
(GLModule.getDependents): Use it.
(GLModule.getDependentsRecursively): Likewise.
(GLModule._getAllModules): New function.
---
 ChangeLog                  | 10 +++++
 pygnulib/GLModuleSystem.py | 91 ++++++++++++++++----------------------
 2 files changed, 48 insertions(+), 53 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 74e5a96323..48e2a3299d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2024-12-03  Collin Funk  <collin.fu...@gmail.com>
+
+	gnulib-tool.py: Optimize --extract-recursive-dependencies.
+	* pygnulib/GLModuleSystem.py (GLModuleSystem.list): Add optional
+	argument to include test modules.
+	(GLModule._getDependents) New function.
+	(GLModule.getDependents): Use it.
+	(GLModule.getDependentsRecursively): Likewise.
+	(GLModule._getAllModules): New function.
+
 2024-12-03  Bruno Haible  <br...@clisp.org>
 
 	strerror_r-posix: Silence gcc 14 warning.
diff --git a/pygnulib/GLModuleSystem.py b/pygnulib/GLModuleSystem.py
index 4185765ab2..7d68b712e0 100644
--- a/pygnulib/GLModuleSystem.py
+++ b/pygnulib/GLModuleSystem.py
@@ -137,10 +137,12 @@ def file_is_module(self, filename: str) -> bool:
                     or filename.endswith('.rej')
                     or filename.endswith('~'))
 
-    def list(self) -> list[str]:
+    def list(self, with_tests: bool | None = False) -> list[str]:
         '''Return the available module names as tuple. We could use a combination
         of os.walk() function and re module. However, it takes too much time to
-        complete, so this version uses subprocess to run shell commands.'''
+        complete, so this version uses subprocess to run shell commands.
+        Arguments are:
+        - with_tests: Include test module names, optional argument.'''
         result = ''
         localpath = self.config['localpath']
         find_args = ['find', 'modules', '-type', 'f', '-print']
@@ -166,7 +168,12 @@ def list(self) -> list[str]:
         # Filter out undesired file names.
         listing = [ line
                     for line in listing
-                    if self.file_is_module(line) and not _isTestsModuleName(line) ]
+                    if self.file_is_module(line) ]
+        # Check if test modules should be filtered out.
+        if not with_tests:
+            listing = [ line
+                        for line in listing
+                        if not _isTestsModuleName(line) ]
         modules = sorted(set(listing))
         return modules
 
@@ -557,60 +564,36 @@ def getDependenciesRecursively(self) -> set[GLModule]:
 
         return outmodules
 
-    def getDependents(self) -> list[GLModule]:
-        '''Return list of dependents (a.k.a. "reverse dependencies"),
-        as a list of GLModule objects.
-        GLConfig: localpath.'''
+    def _getAllModules(self) -> list[GLModule]:
+        '''Return a list of all modules as a list of GLModule objects.'''
+        module_names = self.modulesystem.list(True)
+        modules = [ self.modulesystem.find(module)
+                    for module in module_names
+                    if module != '']
+        modules = [ module
+                    for module in modules
+                    if module is not None ]
+        return modules
+
+    def _getDependents(self, modules: list[GLModule] | None = None) -> list[GLModule]:
+        '''Internal function for getDependents and getDependentsRecursively
+        accepting an optional argument modules.'''
         if 'dependents' not in self.cache:
-            localpath = self.config['localpath']
-            # Find a set of module candidates quickly.
-            # TODO: Optimize. This approach is fine for a single getDependents
-            #       invocation, but not for 100 or 1000 of them.
-            # Convert the module name to a POSIX basic regex.
-            # Needs to handle . [ \ * ^ $.
-            regex = self.name.replace('\\', '\\\\').replace('[', '\\[').replace('^', '\\^')
-            regex = re.compile(r'([.*$])').sub(r'[\1]', regex)
-            line_regex = '^' + regex
-            # We can't add a '$' to line_regex, because that would fail to match
-            # lines that denote conditional dependencies. We could invoke grep
-            # twice, once to search for  line_regex + '$'  and once to search
-            # for   line_regex + [ <TAB>]  but that would be twice as slow.
-            # Read module candidates from gnulib root directory.
-            command = "find modules -type f -print | xargs -n 100 grep -l %s /dev/null | sed -e 's,^modules/,,'" % shlex.quote(line_regex)
-            with sp.Popen(command, shell=True, cwd=DIRS['root'], stdout=sp.PIPE) as proc:
-                result = proc.stdout.read().decode('UTF-8')
-            # Read module candidates from local directories.
-            if localpath != None and len(localpath) > 0:
-                command = "find modules -type f -print | xargs -n 100 grep -l %s /dev/null | sed -e 's,^modules/,,' -e 's,\\.diff$,,'" % shlex.quote(line_regex)
-                for localdir in localpath:
-                    with sp.Popen(command, shell=True, cwd=localdir, stdout=sp.PIPE) as proc:
-                        result += proc.stdout.read().decode('UTF-8')
-            listing = [ line
-                        for line in result.split('\n')
-                        if line.strip() ]
-            # Remove modules/ prefix from each file name.
-            pattern = re.compile(r'^modules/')
-            listing = [ pattern.sub('', line)
-                        for line in listing ]
-            # Filter out undesired file names.
-            listing = [ line
-                        for line in listing
-                        if self.modulesystem.file_is_module(line) ]
-            # ${module}-tests implicitly depends on ${module}, if both exist.
-            if self.isNonTests():
-                implicit_dependent = self.name+'-tests'
-                if self.modulesystem.exists(implicit_dependent):
-                    listing.append(implicit_dependent)
-            candidates = sorted(set(listing))
             result = []
-            for name in candidates:
-                module = self.modulesystem.find(name)
-                if module:  # Ignore module candidates that don't actually exist.
-                    if self in module.getDependenciesWithoutConditions():
-                        result.append(module)
+            if modules is None:
+                modules = self._getAllModules()
+            for module in modules:
+                if self in set(module.getDependenciesWithoutConditions()):
+                    result.append(module)
             self.cache['dependents'] = result
         return self.cache['dependents']
 
+    def getDependents(self) -> list[GLModule]:
+        '''Return list of dependents (a.k.a. "reverse dependencies"),
+        as a list of GLModule objects.
+        GLConfig: localpath.'''
+        return self._getDependents()
+
     def getDependentsRecursively(self) -> set[GLModule]:
         '''Return a list of recursive dependents of this module,
         as a set of GLModule objects.'''
@@ -618,6 +601,8 @@ def getDependentsRecursively(self) -> set[GLModule]:
         inmodules = set()
         outmodules = set()
 
+        modules = self._getAllModules()
+
         # In order to process every module only once (for speed), process an "input
         # list" of modules, producing an "output list" of modules. During each round,
         # more modules can be queued in the input list. Once a module on the input
@@ -629,7 +614,7 @@ def getDependentsRecursively(self) -> set[GLModule]:
             inmodules = set()  # Accumulator, queue for next round
             for module in inmodules_this_round:
                 outmodules.add(module)
-                inmodules = inmodules.union(module.getDependents())
+                inmodules = inmodules.union(module._getDependents(modules))
             handledmodules = handledmodules.union(inmodules_this_round)
             # Remove handledmodules from inmodules.
             inmodules = inmodules.difference(handledmodules)
-- 
2.47.1

Reply via email to