https://github.com/python/cpython/commit/9b22261a86b54f198225426e86390ef8dd85e091
commit: 9b22261a86b54f198225426e86390ef8dd85e091
branch: main
author: Petr Viktorin <[email protected]>
committer: encukou <[email protected]>
date: 2026-02-25T13:37:59+01:00
summary:
GH-145000: Add a tool to record/check removed HTML IDs (#145001)
files:
A Doc/tools/check-html-ids.py
M Doc/.ruff.toml
M Doc/Makefile
diff --git a/Doc/.ruff.toml b/Doc/.ruff.toml
index 3e676e13c3f41a..6b573fd58d089b 100644
--- a/Doc/.ruff.toml
+++ b/Doc/.ruff.toml
@@ -32,6 +32,9 @@ ignore = [
"E501", # Ignore line length errors (we use auto-formatting)
]
+[lint.per-file-ignores]
+"tools/check-html-ids.py" = ["I001"] # Unsorted imports
+
[format]
preview = true
quote-style = "preserve"
diff --git a/Doc/Makefile b/Doc/Makefile
index 4d605980a62904..d39c2fe3c3f22a 100644
--- a/Doc/Makefile
+++ b/Doc/Makefile
@@ -336,3 +336,9 @@ autobuild-stable-html:
exit 1;; \
esac
@$(MAKE) autobuild-dev-html
+
+# Collect HTML IDs to a JSON document
+.PHONY: html-ids
+html-ids:
+ $(PYTHON) tools/check-html-ids.py collect build/html \
+ -o build/html/html-ids.json.gz
diff --git a/Doc/tools/check-html-ids.py b/Doc/tools/check-html-ids.py
new file mode 100644
index 00000000000000..8e8e0a581df72d
--- /dev/null
+++ b/Doc/tools/check-html-ids.py
@@ -0,0 +1,181 @@
+from compression import gzip
+import concurrent.futures
+from pathlib import Path
+import html.parser
+import functools
+import argparse
+import json
+import sys
+import re
+
+
# IDs that look auto-generated by the doc toolchain (e.g. "index-12",
# "id3", "footnote_4") and may be renumbered between builds -- presumably
# not stable link targets, so they are neither recorded nor reported.
IGNORED_ID_RE = re.compile(
    r"""
    index-\d+
    | id\d+
    | [_a-z]+_\d+
""",
    re.VERBOSE,
)
+
+
class IDGatherer(html.parser.HTMLParser):
    """HTML parser that collects ``id`` attribute values into a shared set.

    The set is supplied by the caller; auto-generated IDs matching
    IGNORED_ID_RE are skipped.
    """

    def __init__(self, ids):
        super().__init__()
        self.__collected = ids

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the current tag.
        for attr_name, attr_value in attrs:
            if attr_name != 'id':
                continue
            if IGNORED_ID_RE.fullmatch(attr_value):
                continue
            self.__collected.add(attr_value)
+
+
def get_ids_from_file(path):
    """Parse one HTML file and return the set of its (non-ignored) IDs."""
    found = set()
    parser = IDGatherer(found)
    with path.open(encoding='utf-8') as stream:
        # Feed in fixed-size chunks so a large page is never held in
        # memory at once; HTMLParser buffers incomplete tags between
        # feed() calls.
        for chunk in iter(lambda: stream.read(4096), ''):
            parser.feed(chunk)
    return found
+
+
def gather_ids(htmldir, *, verbose_print):
    """Collect HTML IDs from a Sphinx HTML output directory.

    Returns a dict mapping page path (str, relative to *htmldir*) to a
    sorted list of the IDs on that page, with IDs that appear on every
    page (e.g. from the shared page template) filtered out.
    Pages under ``_static`` and ``whatsnew`` are skipped.

    Raises ValueError if *htmldir* does not look like Sphinx output
    (no ``objects.inv`` present).
    """
    if not htmldir.joinpath('objects.inv').exists():
        raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory')

    # Parsing is CPU-bound: with the GIL, only processes give real
    # parallelism; on free-threaded builds, threads are cheaper.
    # BUG FIX: sys._is_gil_enabled is a *function*; the bare attribute
    # is always truthy, which made the thread-pool branch unreachable.
    if sys._is_gil_enabled():
        pool = concurrent.futures.ProcessPoolExecutor()
    else:
        pool = concurrent.futures.ThreadPoolExecutor()
    # Use the executor as a context manager so its workers are always
    # shut down, even if a worker raises.
    with pool:
        tasks = {}
        for path in htmldir.glob('**/*.html'):
            relative_path = path.relative_to(htmldir)
            if '_static' in relative_path.parts:
                continue
            if 'whatsnew' in relative_path.parts:
                continue
            tasks[relative_path] = pool.submit(get_ids_from_file, path=path)

        ids_by_page = {}
        for relative_path, future in tasks.items():
            verbose_print(relative_path)
            ids = future.result()
            ids_by_page[str(relative_path)] = ids
            verbose_print(f' - {len(ids)} ids found')

    if not ids_by_page:
        # set.intersection() needs at least one argument; with no pages
        # found there is nothing to filter or report.
        return {}
    common = set.intersection(*ids_by_page.values())
    verbose_print(f'Filtering out {len(common)} common ids')
    for key, page_ids in ids_by_page.items():
        ids_by_page[key] = sorted(page_ids - common)

    return ids_by_page
+
+
def do_check(baseline, checked, excluded, *, verbose_print):
    """Report IDs present in *baseline* but missing from *checked*.

    *baseline* and *checked* map page names to iterables of IDs;
    *excluded* is a set of (page, id) pairs to ignore.  Each offense is
    printed to stdout.  Returns True when nothing was removed.
    (*verbose_print* is accepted for interface symmetry; it is unused.)
    """
    ok = True
    for page, wanted_ids in sorted(baseline.items()):
        if page not in checked:
            ok = False
            print(f'{page}: (page missing)')
            print()
            continue
        removed = set(wanted_ids) - set(checked[page])
        # Drop auto-generated and explicitly excluded IDs before
        # deciding whether this page has a problem.
        removed = {
            an_id
            for an_id in removed
            if not IGNORED_ID_RE.fullmatch(an_id)
            and (page, an_id) not in excluded
        }
        if removed:
            ok = False
            for removed_id in sorted(removed):
                print(f'{page}: {removed_id}')
            print()
    return ok
+
+
def main(argv):
    """Command-line entry point.

    Subcommands:
      collect  -- gather IDs from an HTML tree into a gzipped JSON file
      check    -- compare two such archives and report removed IDs

    Exits with status 1 when ``check`` finds removed IDs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='print out more information',
    )
    subparsers = parser.add_subparsers(dest='command', required=True)

    collect = subparsers.add_parser(
        'collect', help='collect IDs from a set of HTML files'
    )
    collect.add_argument(
        'htmldir', type=Path, help='directory with HTML documentation'
    )
    collect.add_argument(
        '-o',
        '--outfile',
        help='File to save the result in; default <htmldir>/html-ids.json.gz',
    )

    check = subparsers.add_parser('check', help='check two archives of IDs')
    check.add_argument(
        'baseline_file', type=Path, help='file with baseline IDs'
    )
    check.add_argument('checked_file', type=Path, help='file with checked IDs')
    check.add_argument(
        '-x',
        '--exclude-file',
        type=Path,
        help='file with IDs to exclude from the check',
    )

    args = parser.parse_args(argv[1:])

    # Progress output goes to stderr so it never mixes with the report
    # (which is printed to stdout by do_check).
    if args.verbose:
        verbose_print = functools.partial(print, file=sys.stderr)
    else:

        def verbose_print(*args, **kwargs):
            """do nothing"""

    if args.command == 'collect':
        ids = gather_ids(args.htmldir, verbose_print=verbose_print)
        if args.outfile is None:
            args.outfile = args.htmldir / 'html-ids.json.gz'
        with gzip.open(args.outfile, 'wt', encoding='utf-8') as zfile:
            json.dump({'ids_by_page': ids}, zfile)

    if args.command == 'check':
        with gzip.open(args.baseline_file) as zfile:
            baseline = json.load(zfile)['ids_by_page']
        with gzip.open(args.checked_file) as zfile:
            checked = json.load(zfile)['ids_by_page']
        excluded = set()
        if args.exclude_file:
            # Exclusion file format: one "page.html: some-id" per line;
            # blank lines and '#' comment lines are skipped.
            with open(args.exclude_file, encoding='utf-8') as file:
                for line in file:
                    line = line.strip()
                    if line and not line.startswith('#'):
                        name, sep, excluded_id = line.partition(':')
                        if sep:
                            excluded.add((name.strip(), excluded_id.strip()))
        if do_check(baseline, checked, excluded, verbose_print=verbose_print):
            verbose_print('All OK')
        else:
            sys.stdout.flush()
            print(
                'ERROR: Removed IDs found',
                'The above HTML IDs were removed from the documentation, '
                + 'resulting in broken links. Please add them back.',
                sep='\n',
                file=sys.stderr,
            )
            if args.exclude_file:
                # BUG FIX: this hint previously went to stdout while the
                # rest of the error report went to stderr.
                print(
                    f'Alternatively, add them to {args.exclude_file}.',
                    file=sys.stderr,
                )
            # BUG FIX: exit nonzero so CI fails when IDs were removed;
            # previously a failed check still exited with status 0.
            sys.exit(1)


if __name__ == '__main__':
    main(sys.argv)
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]