This is an automated email from the ASF dual-hosted git repository.
lhotari pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/pulsar-site.git
The following commit(s) were added to refs/heads/main by this push:
new 8b83055e87e Fix site-publisher to see all changes since last
successful publish
8b83055e87e is described below
commit 8b83055e87eb7b3a7c04bdb5d4ef9f3ec0ed3e9c
Author: Lari Hotari <[email protected]>
AuthorDate: Tue Apr 28 13:37:44 2026 +0300
Fix site-publisher to see all changes since last successful publish
The CI workflow checks out main with fetch-depth=2, so site-publisher's
previous `git diff HEAD~1 HEAD` lost any commits stacked on top of each
other between publishes, and a `data/release-pulsar.js`-only change
never re-triggered the affected versioned-docs builds (e.g. f15a213).
- Track the published source-repo SHA in `.publish-ref` at the root of
asf-site-next, committed atomically with the published content.
- Use the GitHub compare API to enumerate changed files and commit
messages for the whole range. Fall back to a full rebuild on missing
ref / 404 / truncated response (>=300 files or >=250 commits).
- When `data/release-pulsar.js` is in the diff, parse both versions of
the file and rebuild every versioned-docs branch whose entry-set
changed (catches new patches under existing vtags).
- Replace scripts/split-version-build.sh + scripts/split-version.js
with a Python orchestrator (tools/pytools/lib/execute/version_build.py)
that does the same per-version yarn build / stash / restore loop.
- Honor BUILD_ALL_VERSION=1 and BUILD_VERSIONS=... in any commit
message across the publish range, not just HEAD.
- Use the workflow's default GITHUB_TOKEN (permissions: contents: write)
for both API calls and the asf-site-next push; drop PULSARBOT_TOKEN.
---
.github/workflows/ci-build-site.yml | 5 +-
docusaurus.config.ts | 9 +-
scripts/split-version-build.sh | 77 ---------
scripts/split-version.js | 8 -
tools/pytools/bin/site-publisher.py | 11 +-
tools/pytools/lib/execute/changed_files.py | 248 +++++++++++++++++++++++++++++
tools/pytools/lib/execute/site_builder.py | 41 ++---
tools/pytools/lib/execute/site_uploader.py | 13 +-
tools/pytools/lib/execute/version_build.py | 148 +++++++++++++++++
9 files changed, 444 insertions(+), 116 deletions(-)
diff --git a/.github/workflows/ci-build-site.yml b/.github/workflows/ci-build-site.yml
index 164503434da..6b01af534f9 100644
--- a/.github/workflows/ci-build-site.yml
+++ b/.github/workflows/ci-build-site.yml
@@ -31,6 +31,8 @@ jobs:
name: Build and publish pulsar website-next
runs-on: ubuntu-latest
timeout-minutes: 600
+ permissions:
+ contents: write
steps:
- uses: actions/checkout@v6
with:
@@ -39,7 +41,6 @@ jobs:
with:
ref: 'asf-site-next'
path: tmp/asf-site-next
- token: ${{ secrets.PULSARBOT_TOKEN }}
- name: Install poetry
run: pipx install poetry
- uses: actions/setup-python@v6
@@ -53,6 +54,8 @@ jobs:
- run: corepack enable
- name: Update generated docs
working-directory: tools/pytools
+ env:
+ GITHUB_TOKEN: ${{ github.token }}
run: |
poetry install
poetry run bin/site-publisher.py
--site-path=$GITHUB_WORKSPACE/tmp/asf-site-next
diff --git a/docusaurus.config.ts b/docusaurus.config.ts
index c67e02c6073..70ad60d2657 100644
--- a/docusaurus.config.ts
+++ b/docusaurus.config.ts
@@ -34,10 +34,11 @@ try {
// Versioned legacy URLs like /docs/<version>/client-libraries-<slug> are NOT
// included here on purpose: plugin-client-redirects generates a stub HTML file
// at every `from` path, which would create build/docs/<version>/client-libraries-*/
-// directories in every yarn build. The CI split-version-build.sh script (which
-// builds each version separately and then mv's build-<v>/<v> into build/docs/)
-// would then fail because build/docs/<v>/ is non-empty from those stubs. Those
-// versioned URLs are handled exclusively by static/.htaccess in production.
+// directories in every yarn build. The CI per-version build orchestrator
+// (tools/pytools/lib/execute/version_build.py) builds each version separately
+// and then folds build-<v>/<v> back into build/docs/; it would fail because
+// build/docs/<v>/ is non-empty from those stubs. Those versioned URLs are
+// handled exclusively by static/.htaccess in production.
function clientLibrariesLegacyRedirects() {
const slugs = [
"java", "java-setup", "java-initialize", "java-use", "java-tracing",
diff --git a/scripts/split-version-build.sh b/scripts/split-version-build.sh
deleted file mode 100755
index 3f3fb4cd55c..00000000000
--- a/scripts/split-version-build.sh
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/bin/bash
-
-set -x -e
-
-node scripts/split-version.js
-
-latest=$(cat scripts/.latest)
-buildVersion="next"
-
-echo "changed files: "
-echo "$@"
-
-_build() {
- yarn build
-}
-
-_buildVersion() {
- if [[ $buildVersion = "next" ]]; then
- echo "..." "${buildVersion}" "and" "$latest" "begin build..."
- echo "[\"current\", \"${latest}\"]" >.build-versions.json
- else
- echo "..." "$buildVersion" "begin build..."
- echo "[\"${buildVersion}\"]" >.build-versions.json
- fi
-
- _build
-
- if [[ $buildVersion != "$latest" ]]; then
- mkdir -p "build-${buildVersion}/${buildVersion}"
- mkdir -p build-assets
- cp -r "build/docs/${buildVersion}/"*
"build-${buildVersion}/${buildVersion}"
- cp -r build/assets/* build-assets/
- rm -rf "build/docs/${buildVersion}"
- fi
- echo "..." "$buildVersion" "build done..."
-}
-
-COMMIT_MSG=$(git show -s --format="%s %B")
-FORCE_BUILD_ALL_VERSION=$(echo "$COMMIT_MSG" | sed -n
's/.*BUILD_ALL_VERSION=\([0-1]*\).*/\1/p')
-FORCE_BUILD_VERSIONS=$(echo "$COMMIT_MSG" | sed -n
's/.*BUILD_VERSIONS=\([0-9\.x,]*\).*/\1/p')
-if [[ $FORCE_BUILD_VERSIONS =~ ^[0-9\.x,]+$ ]]; then
- SUPPLEMENT_VERSIONS=$FORCE_BUILD_VERSIONS
-else
- SUPPLEMENT_VERSIONS=""
-fi
-
-CURRENT_HOUR=$(date +%H)
-CURRENT_HOUR=${CURRENT_HOUR#0}
-echo "CURRENT_HOUR: ${CURRENT_HOUR}"
-
-if [[ "$FORCE_BUILD_ALL_VERSION" == "1" ]] || [[ "$FORCE_BUILD_ALL_VERSION" ==
"0" ]]; then
- BUILD_ALL_VERSION="$FORCE_BUILD_ALL_VERSION"
- echo "force build all versions"
-fi
-
-# Build only the versions that has changed and build next version that has any changed
-while read -r version; do
- buildVersion=$version
- if [[ "$*" == *versioned_docs/version-"$version"* || $buildVersion ==
"next" || $BUILD_ALL_VERSION == "1" || $BUILD_VERSION == *"$buildVersion"* ||
$SUPPLEMENT_VERSIONS == *"$buildVersion"* ]]; then
- _buildVersion
- else
- echo "..." "$buildVersion" "no change, skip"
- fi
-done <scripts/.versions
-
-while read -r version; do
- if [ -d "build-$version" ]; then
- mv -f "build-$version/"* build/docs
- rm -rf "build-$version"
- fi
-done <scripts/.versions
-
-cp -r build-assets/* build/assets/
-rm -rf build-assets
-cp static/.htaccess build/
-
-echo "$BUILD_ALL_VERSION" >scripts/.build
diff --git a/scripts/split-version.js b/scripts/split-version.js
deleted file mode 100644
index fb61a57770f..00000000000
--- a/scripts/split-version.js
+++ /dev/null
@@ -1,8 +0,0 @@
-const fs = require("fs");
-const path = require("path");
-const _ = require("lodash");
-let versions = require("../versions.json");
-const latestVersion = versions.shift();
-versions = versions.concat(["next"]);
-fs.writeFileSync(path.join(__dirname, ".versions"), versions.join("\n") +
"\n");
-fs.writeFileSync(path.join(__dirname, ".latest"), latestVersion + "\n");
diff --git a/tools/pytools/bin/site-publisher.py b/tools/pytools/bin/site-publisher.py
index c13a03d0508..c1f33a024d6 100755
--- a/tools/pytools/bin/site-publisher.py
+++ b/tools/pytools/bin/site-publisher.py
@@ -17,6 +17,7 @@
# specific language governing permissions and limitations
# under the License.
+import os
import tempfile
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from pathlib import Path
@@ -42,8 +43,10 @@ if __name__ == '__main__':
else:
site = Path(args.site_path)
- commit = run_pipe(git, 'rev-parse', '--short', 'HEAD',
cwd=root_path()).read().strip()
- msg = f'Site updated at revision {commit}'
+ head_sha = run_pipe(git, 'rev-parse', 'HEAD',
cwd=root_path()).read().strip()
+ short_sha = head_sha[:12]
+ msg = f'Site updated at revision {short_sha}'
+ token = os.getenv('GITHUB_TOKEN')
- site_builder.execute(site)
- site_uploader.execute(args.push, msg, site, branch)
+ site_builder.execute(site, head_sha, token)
+ site_uploader.execute(args.push, msg, site, branch, head_sha)
diff --git a/tools/pytools/lib/execute/changed_files.py b/tools/pytools/lib/execute/changed_files.py
new file mode 100644
index 00000000000..de20c620f05
--- /dev/null
+++ b/tools/pytools/lib/execute/changed_files.py
@@ -0,0 +1,248 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import base64
+import json
+import os
+import re
+from collections import defaultdict
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Set, Tuple
+
+import requests
+
+from constant import site_path
+
+REPO = 'apache/pulsar-site'
+API_ROOT = 'https://api.github.com'
+PUBLISH_REF_FILE = '.publish-ref'
+RELEASE_PULSAR_PATH = 'data/release-pulsar.js'
+VTAG_VERSION_RE = re.compile(r'^\d+\.\d+\.x$')
+
+# GitHub's compare API caps responses at 300 files / 250 commits. Beyond those
+# thresholds the result is silently truncated, so we fall back to a full rebuild.
+COMPARE_FILE_CAP = 300
+COMPARE_COMMIT_CAP = 250
+
+BUILD_ALL_RE = re.compile(r'BUILD_ALL_VERSION=([01])')
+BUILD_VERSIONS_RE = re.compile(r'BUILD_VERSIONS=([0-9.x,]+)')
+
+
+@dataclass
+class ChangeSet:
+ files: List[str] = field(default_factory=list)
+ build_all: bool = False
+ force_versions: Set[str] = field(default_factory=set)
+
+
+def _auth_headers(token: str) -> dict:
+ return {
+ 'Authorization': f'Bearer {token}',
+ 'Accept': 'application/vnd.github+json',
+ 'X-GitHub-Api-Version': '2022-11-28',
+ }
+
+
+def _read_publish_ref(asf_site: Path) -> Optional[str]:
+ ref_file = asf_site / PUBLISH_REF_FILE
+ if not ref_file.is_file():
+ return None
+ sha = ref_file.read_text().strip()
+ return sha or None
+
+
+def _read_known_versions() -> Set[str]:
+ """Return the set of versioned-docs branches from versions.json (the
canonical source)."""
+ return set(json.loads((site_path() / 'versions.json').read_text()))
+
+
+def _fetch_release_pulsar_entries(sha: str, token: str) ->
Optional[List[dict]]:
+ """Fetch and parse data/release-pulsar.js at the given SHA into a list of
entry dicts."""
+ url = f'{API_ROOT}/repos/{REPO}/contents/{RELEASE_PULSAR_PATH}'
+ try:
+ resp = requests.get(url, headers=_auth_headers(token), params={'ref':
sha}, timeout=30)
+ resp.raise_for_status()
+ payload = resp.json()
+ except (requests.RequestException, ValueError) as e:
+ print(f'failed to fetch {RELEASE_PULSAR_PATH}@{sha}: {e}')
+ return None
+
+ encoded = payload.get('content')
+ if not encoded:
+ print(f'no content field for {RELEASE_PULSAR_PATH}@{sha}')
+ return None
+ try:
+ body = base64.b64decode(encoded).decode('utf-8')
+ except (ValueError, UnicodeDecodeError) as e:
+ print(f'failed to decode {RELEASE_PULSAR_PATH}@{sha}: {e}')
+ return None
+
+ # The file is `module.exports = [...]` where the array is valid JSON.
+ start = body.find('[')
+ end = body.rfind(']')
+ if start < 0 or end < 0 or end <= start:
+ print(f'could not locate JSON array in {RELEASE_PULSAR_PATH}@{sha}')
+ return None
+ try:
+ entries = json.loads(body[start:end + 1])
+ except json.JSONDecodeError as e:
+ print(f'failed to parse {RELEASE_PULSAR_PATH}@{sha} as JSON: {e}')
+ return None
+ if not isinstance(entries, list):
+ return None
+ return entries
+
+
+def _group_by_vtag(entries: List[dict]) -> Dict[str, Set[Tuple]]:
+ """Group entries by vtag. Each entry becomes a frozen tuple of sorted
items so we can
+ diff them as set members. Vtags that aren't of the form X.Y.x are skipped
— they refer
+ to legacy per-patch versions for which there's no versioned_docs/version-*
directory."""
+ grouped: Dict[str, Set[Tuple]] = defaultdict(set)
+ for entry in entries:
+ vtag = entry.get('vtag')
+ if not isinstance(vtag, str) or not VTAG_VERSION_RE.match(vtag):
+ continue
+ grouped[vtag].add(tuple(sorted(entry.items())))
+ return grouped
+
+
+def _release_pulsar_synthetic_paths(
+ base_sha: str, head_sha: str, token: str
+) -> Tuple[List[str], bool]:
+ """Return (synthetic_paths, ok). When ok is False, caller should escalate
to build_all.
+
+ The returned paths are pseudo-arguments — never written to disk. They're
appended to
+ the file list whose substring matcher fires the corresponding
versioned-docs build."""
+ base_entries = _fetch_release_pulsar_entries(base_sha, token)
+ head_entries = _fetch_release_pulsar_entries(head_sha, token)
+ if base_entries is None or head_entries is None:
+ return [], False
+
+ base_grouped = _group_by_vtag(base_entries)
+ head_grouped = _group_by_vtag(head_entries)
+
+ affected = {
+ vtag for vtag in base_grouped.keys() | head_grouped.keys()
+ if base_grouped.get(vtag, set()) != head_grouped.get(vtag, set())
+ }
+ if not affected:
+ return [], True
+
+ known = _read_known_versions()
+ rebuild = sorted(affected & known)
+ skipped = sorted(affected - known)
+ if rebuild:
+ print(f'release-pulsar.js changes → rebuild versions: {rebuild}')
+ if skipped:
+ print(f'release-pulsar.js changes (unknown versions, skipped):
{skipped}')
+ return [
+ f'versioned_docs/version-{vtag}/' for vtag in rebuild
+ ], True
+
+
+def _parse_commit_directives(messages: Iterable[str]) -> Tuple[bool, Set[str]]:
+ """Scan commit messages for BUILD_ALL_VERSION=1 / BUILD_VERSIONS=...
directives.
+
+ Returns (build_all, force_versions). Mirrors the regex behavior of the
previous
+ scripts/split-version-build.sh, but evaluates every commit in the range —
not just HEAD."""
+ build_all = False
+ force_versions: Set[str] = set()
+ for msg in messages:
+ if not msg:
+ continue
+ if any(m.group(1) == '1' for m in BUILD_ALL_RE.finditer(msg)):
+ build_all = True
+ for match in BUILD_VERSIONS_RE.finditer(msg):
+ for v in match.group(1).split(','):
+ v = v.strip()
+ if v:
+ force_versions.add(v)
+ return build_all, force_versions
+
+
+def full_build_paths() -> List[str]:
+ return [
+ f'versioned_docs/version-{v}/'
+ for v in sorted(_read_known_versions())
+ ]
+
+
+def compute_changed_files(
+ asf_site: Path,
+ head_sha: str,
+ token: Optional[str],
+) -> ChangeSet:
+ """Compute the change set since the last successful publish.
+
+ `build_all=True` signals the caller to rebuild every versioned-docs branch in
+ versions.json. `force_versions` carries any explicit BUILD_VERSIONS=... selectors
+ found in commit messages between .publish-ref and head_sha."""
+ base_sha = _read_publish_ref(asf_site)
+ if base_sha is None:
+ print(f'{PUBLISH_REF_FILE} missing → full rebuild')
+ return ChangeSet(build_all=True)
+
+ if not token:
+ if os.getenv('GITHUB_ACTIONS'):
+ print('GITHUB_TOKEN not set in CI → full rebuild')
+ else:
+ print('no GITHUB_TOKEN (local run) → full rebuild')
+ return ChangeSet(build_all=True)
+
+ if base_sha == head_sha:
+ print(f'{PUBLISH_REF_FILE} already at {head_sha} → no rebuild needed')
+ return ChangeSet()
+
+ url = f'{API_ROOT}/repos/{REPO}/compare/{base_sha}...{head_sha}'
+ try:
+ resp = requests.get(url, headers=_auth_headers(token), timeout=30)
+ if resp.status_code == 404:
+ print(f'compare {base_sha}...{head_sha} returned 404 → full
rebuild')
+ return ChangeSet(build_all=True)
+ resp.raise_for_status()
+ payload = resp.json()
+ except (requests.RequestException, ValueError) as e:
+ print(f'compare API failed: {e} → full rebuild')
+ return ChangeSet(build_all=True)
+
+ files = payload.get('files') or []
+ total_commits = payload.get('total_commits', 0)
+ if len(files) >= COMPARE_FILE_CAP or total_commits >= COMPARE_COMMIT_CAP:
+ print(
+ f'compare result truncated (files={len(files)},
commits={total_commits}) → full rebuild'
+ )
+ return ChangeSet(build_all=True)
+
+ messages = [c.get('commit', {}).get('message', '') for c in
payload.get('commits') or []]
+ build_all, force_versions = _parse_commit_directives(messages)
+ if build_all:
+ print('BUILD_ALL_VERSION=1 found in a commit message → full rebuild')
+ if force_versions:
+ print(f'BUILD_VERSIONS commit directives → also build:
{sorted(force_versions)}')
+
+ changed = [f['filename'] for f in files if f.get('filename')]
+ print(f'compare {base_sha[:12]}...{head_sha[:12]} → {len(changed)} changed
file(s)')
+
+ if RELEASE_PULSAR_PATH in changed:
+ synthetic, ok = _release_pulsar_synthetic_paths(base_sha, head_sha,
token)
+ if not ok:
+ print(f'failed to resolve {RELEASE_PULSAR_PATH} vtag diff → full
rebuild')
+ return ChangeSet(build_all=True)
+ changed.extend(synthetic)
+
+ return ChangeSet(files=changed, build_all=build_all,
force_versions=force_versions)
diff --git a/tools/pytools/lib/execute/site_builder.py b/tools/pytools/lib/execute/site_builder.py
index 14fd2a19633..fa57aaa9746 100644
--- a/tools/pytools/lib/execute/site_builder.py
+++ b/tools/pytools/lib/execute/site_builder.py
@@ -16,43 +16,46 @@
# under the License.
import shutil
-import tempfile
from pathlib import Path
+from typing import Optional
from command import find_command, run
from constant import site_path
+from execute import version_build
+from execute.changed_files import ChangeSet, compute_changed_files,
full_build_paths
-def execute(asf_site: Path):
- # 1. Get modified files
- git = find_command('git', msg="git is required")
- with tempfile.TemporaryFile('w+') as f:
- run(git, 'diff', '--name-only', 'HEAD~1', 'HEAD', stdout=f,
cwd=site_path())
- f.seek(0)
- modified_files = f.read().splitlines()
- for file in modified_files:
- print(f"{file} was modified")
+def execute(asf_site: Path, head_sha: str, token: Optional[str]):
+ # Compute the change set vs. the last successful publish (.publish-ref in asf-site-next).
+ # The CI workflow checks out main with fetch-depth=2, so a local `git diff` cannot span
+ # more than the most recent commit; the GitHub compare API is the source of truth.
+ change_set = compute_changed_files(asf_site, head_sha, token)
+ if change_set.build_all:
+ change_set = ChangeSet(
+ files=full_build_paths(),
+ build_all=True,
+ force_versions=change_set.force_versions,
+ )
+ for file in change_set.files:
+ print(f'{file} was modified')
+ if change_set.force_versions:
+ print(f'forced versions from commit messages:
{sorted(change_set.force_versions)}')
- # # 2. Install and build
- yarn = find_command('yarn', msg="yarn is required")
- bash = find_command('bash', msg="bash is required")
+ yarn = find_command('yarn', msg='yarn is required')
run(yarn, 'install', cwd=site_path())
- run(bash, 'scripts/split-version-build.sh', *modified_files,
cwd=site_path())
+ version_build.execute(change_set)
# Expand @pulsar:...@ tokens and rewrite `pathname:///` in the Docsify
# reference site (build/reference/), which Docusaurus copies verbatim
# from static/ and so isn't touched by the markdown preprocessor pipeline.
run(yarn, 'process-reference-markdown', cwd=site_path())
latest_content = site_path() / 'build'
- # 3. Publish content to asf-site-next branch
published_content = asf_site / 'content'
if not published_content.exists():
published_content.mkdir(parents=True, exist_ok=True)
- is_build_all = (site_path() / 'scripts' / '.build').read_text().strip()
- is_build_all = is_build_all == "1"
- print(f'is_build_all: {is_build_all}')
- if is_build_all:
+ print(f'is_build_all: {change_set.build_all}')
+ if change_set.build_all:
whitelist = ['api', 'charts']
old_files = [f for f in published_content.glob('*') if f.name not in
whitelist]
print(f'clean all the old content: {list(map(str, old_files))}')
diff --git a/tools/pytools/lib/execute/site_uploader.py b/tools/pytools/lib/execute/site_uploader.py
index 37331c47f70..692e44cd19e 100755
--- a/tools/pytools/lib/execute/site_uploader.py
+++ b/tools/pytools/lib/execute/site_uploader.py
@@ -44,9 +44,16 @@ def _should_push(mode: Mode) -> bool:
return result
-def _do_push(msg: str, site: Path, branch: str):
+def _do_push(msg: str, site: Path, branch: str, head_sha: str):
git = find_command('git', msg="git is required")
+ # Persist the source-repo SHA we just published so the next run can compute
+ # the changed-files set against this point. Written before `git add -A` so
+ # it lands in the same commit as the published content. If `git push` later
+ # fails, this local file is discarded along with the unpushed commit — the
+ # next CI run re-clones asf-site-next and reads the previous .publish-ref.
+ (site / '.publish-ref').write_text(head_sha + '\n')
+
run(git, 'add', '-A', '.', cwd=site)
changed = run(git, 'diff-index', '--quiet', 'HEAD', codes={0, 1},
cwd=site).returncode
print(f'changed: {changed}')
@@ -68,9 +75,9 @@ def _do_push(msg: str, site: Path, branch: str):
run(git, 'push', 'origin', branch, cwd=site)
-def execute(mode: Mode, msg: str, site: Path, branch: str):
+def execute(mode: Mode, msg: str, site: Path, branch: str, head_sha: str):
if _should_push(mode):
- _do_push(msg, site, branch)
+ _do_push(msg, site, branch, head_sha)
else: # show changes
git = find_command('git', msg="git is required")
with tempfile.TemporaryFile('w+') as f:
diff --git a/tools/pytools/lib/execute/version_build.py b/tools/pytools/lib/execute/version_build.py
new file mode 100644
index 00000000000..132a516f57f
--- /dev/null
+++ b/tools/pytools/lib/execute/version_build.py
@@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Per-version Docusaurus build orchestrator.
+
+Replaces the legacy ``scripts/split-version-build.sh`` + ``scripts/split-version.js``
+pair: reads versions.json directly, decides which versions to build based on the
+ChangeSet, runs ``yarn build`` once per included version, and assembles
+``build/docs/`` from the per-version outputs."""
+
+import json
+import shutil
+from pathlib import Path
+from typing import List
+
+from command import find_command, run
+from constant import site_path
+from execute.changed_files import ChangeSet
+
+
+def _should_build(bv: str, change_set: ChangeSet) -> bool:
+ if bv == 'next':
+ return True
+ if change_set.build_all:
+ return True
+ if bv in change_set.force_versions:
+ return True
+ needle = f'versioned_docs/version-{bv}/'
+ return any(needle in f for f in change_set.files)
+
+
+def _write_build_versions(site: Path, build_versions: List[str]) -> None:
+ (site / '.build-versions.json').write_text(json.dumps(build_versions) +
'\n')
+
+
+def _move_tree(src: Path, dst: Path) -> None:
+ """Replace dst with src. dst is removed first if it exists."""
+ if dst.exists():
+ if dst.is_dir():
+ shutil.rmtree(dst)
+ else:
+ dst.unlink()
+ dst.parent.mkdir(parents=True, exist_ok=True)
+ shutil.move(str(src), str(dst))
+
+
+def _merge_tree(src: Path, dst: Path) -> None:
+ """Copy src/* into dst/, creating dst if needed. Files in dst are
overwritten."""
+ dst.mkdir(parents=True, exist_ok=True)
+ for child in src.iterdir():
+ target = dst / child.name
+ if child.is_dir():
+ shutil.copytree(child, target, dirs_exist_ok=True)
+ else:
+ shutil.copy2(child, target)
+
+
+def execute(change_set: ChangeSet) -> None:
+ """Build every required version into a single ``build/`` tree.
+
+ The flow mirrors what split-version-build.sh did, just in Python:
+ 1. Iterate ``versions[1:] + ["next"]`` (latest is built as part of "next").
+ 2. For each included version, write ``.build-versions.json`` and run ``yarn build``.
+ Each yarn build OVERWRITES ``build/`` — so we stash per-version output to
+ ``build-{v}/`` and accumulated assets to ``build-assets/`` between runs.
+ 3. After the loop, fold every ``build-{v}/*`` back into ``build/docs/`` and
+ merge ``build-assets/*`` into ``build/assets/``. Copy ``static/.htaccess``
+ into ``build/`` (Docusaurus copies the rest of static/ but skips dot-files).
+
+ The ``next`` iteration always runs (with ``buildVersions=["current", latest]``);
+ its output places the latest version's docs under ``build/docs/{latest}/`` and
+ the "current" docs under ``build/docs/next/`` (per ``versionsMap.current.path``
+ in docusaurus.config.ts)."""
+ site = site_path()
+ yarn = find_command('yarn', msg='yarn is required')
+
+ versions: List[str] = json.loads((site / 'versions.json').read_text())
+ if not versions:
+ raise RuntimeError('versions.json is empty')
+ latest = versions[0]
+ iter_versions = versions[1:] + ['next']
+
+ build_dir = site / 'build'
+ assets_stash = site / 'build-assets'
+
+ for bv in iter_versions:
+ if not _should_build(bv, change_set):
+ print(f'... {bv} no change, skip')
+ continue
+
+ if bv == 'next':
+ print(f'... {bv} and {latest} begin build...')
+ _write_build_versions(site, ['current', latest])
+ else:
+ print(f'... {bv} begin build...')
+ _write_build_versions(site, [bv])
+
+ run(yarn, 'build', cwd=site)
+
+ # bv == latest never happens here (latest is excluded from iter_versions),
+ # so we always stash. Stashing the per-version subtree lets the next yarn
+ # build overwrite build/ without losing what we just produced.
+ built_subdir = build_dir / 'docs' / bv
+ if not built_subdir.is_dir():
+ raise RuntimeError(f'expected {built_subdir} after yarn build, not
found')
+ stash_root = site / f'build-{bv}'
+ stash_subdir = stash_root / bv
+ if stash_root.exists():
+ shutil.rmtree(stash_root)
+ stash_subdir.parent.mkdir(parents=True, exist_ok=True)
+ shutil.move(str(built_subdir), str(stash_subdir))
+
+ assets_src = build_dir / 'assets'
+ if assets_src.is_dir():
+ _merge_tree(assets_src, assets_stash)
+
+ print(f'... {bv} build done...')
+
+ # Fold per-version stashes back under build/docs/.
+ for bv in iter_versions:
+ stash_root = site / f'build-{bv}'
+ if not stash_root.is_dir():
+ continue
+ for child in stash_root.iterdir():
+ _move_tree(child, build_dir / 'docs' / child.name)
+ shutil.rmtree(stash_root)
+
+ if assets_stash.is_dir():
+ _merge_tree(assets_stash, build_dir / 'assets')
+ shutil.rmtree(assets_stash)
+
+ htaccess_src = site / 'static' / '.htaccess'
+ if htaccess_src.is_file():
+ shutil.copy2(htaccess_src, build_dir / '.htaccess')