MonkeyCanCode commented on code in PR #3553: URL: https://github.com/apache/polaris/pull/3553#discussion_r2868056416
########## site/it/site_checks/docker.py: ########## @@ -0,0 +1,112 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +import json +import subprocess +import time + +from .tee import Tee + +DASH_LINE = "------------------------------------------------------------------------------------------------" Review Comment: NIT: maybe use `DASH_LINE = "-" * 96` instead ########## .github/workflows/site-guides.yml: ########## @@ -0,0 +1,79 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +name: Site Guides + +on: + push: + branches: [ "main" ] + paths: + - 'site/content/guides/**' + - 'site/it/**' + pull_request: + branches: [ "main" ] + paths: + - 'site/content/guides/**' + workflow_dispatch: + # Allow manual execution + +jobs: + test-site-guides: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6 + - name: Set up JDK 21 + uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + with: + java-version: '21' + distribution: 'temurin' + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 + with: + python-version: '3.14' + - name: Setup test environment + uses: ./.github/actions/setup-test-env + + - name: Test site testing + working-directory: site/it + run: | + echo "::group::Setup" + python3 -m pip install --upgrade pip Review Comment: Should we just call `make client-install-dependencies` instead? ########## site/it/site_checks/docker.py: ########## @@ -0,0 +1,112 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +import json +import subprocess +import time + +from .tee import Tee + +DASH_LINE = "------------------------------------------------------------------------------------------------" + + +def cleanup_docker(tee: Tee) -> float: + """ + Clean up (remove) Docker containers, networks, and volumes. + + :param tee: Log output. + :return: Elapsed time (in seconds) spent running this cleanup. + """ + start = time.monotonic() + try: + tee.printf("::group::Cleanup Docker containers, networks, volumes") + tee.printf("Purging Docker containers...") + try: + result = subprocess.run( + ["docker", "ps", "-a", "-q"], + capture_output=True, + text=True, + check=False, + ) + container_ids = [line for line in result.stdout.splitlines() if line.strip()] + if container_ids: + tee.run(["docker", "container", "rm", "-f", *container_ids]) + except FileNotFoundError: + pass + tee.printf("Purging Docker networks...") + try: + tee.run(["docker", "network", "prune", "-f"]) + except FileNotFoundError: + pass + tee.printf("Purging Docker volumes...") + try: + tee.run(["docker", "volume", "prune", "-f"]) + except FileNotFoundError: + pass + tee.printf("::endgroup::") + finally: + return time.monotonic() - start + + +def docker_compose_info(tee: Tee, md_base_name: str, md_dir_name: str) -> float: + """ + Emit docker compose status/logs for any known compose projects. + + :param tee: Log output. + :param md_base_name: Markdown file base name (for logging/group naming). + :param md_dir_name: Markdown file directory name (for logging/group naming). + :return: Elapsed time (in seconds) spent collecting docker compose information. 
+ """ + start = time.monotonic() + tee.printf(f"::group::Docker Compose information for {md_base_name} in {md_dir_name}") + tee.printf(DASH_LINE) + try: + result = subprocess.run( + ["docker", "compose", "ls", "--all", "--format", "json"], + capture_output=True, + text=True, + check=False, + ) + except FileNotFoundError: + tee.printf(DASH_LINE) + tee.printf("::endgroup::") + return time.monotonic() - start + + try: + compose_entries = json.loads(result.stdout) if result.stdout.strip() else [] + except json.JSONDecodeError: + compose_entries = [] + + for entry in compose_entries: + docker_compose_file = entry.get("ConfigFiles") Review Comment: nit: currently we are all using one docker-compose file per getting-started example, will there ever be consideration to use a base docker-compose file later then (e.g. to avoid repetitive of common services such as bucket-setup and polaris service) instead of a standalone one which has everything? if yes, we will need to parse this as a list then use `-f` for each of them. ########## site/it/site_checks/docker.py: ########## @@ -0,0 +1,112 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +import json +import subprocess +import time + +from .tee import Tee + +DASH_LINE = "------------------------------------------------------------------------------------------------" + + +def cleanup_docker(tee: Tee) -> float: + """ + Clean up (remove) Docker containers, networks, and volumes. + + :param tee: Log output. + :return: Elapsed time (in seconds) spent running this cleanup. + """ + start = time.monotonic() + try: + tee.printf("::group::Cleanup Docker containers, networks, volumes") + tee.printf("Purging Docker containers...") + try: + result = subprocess.run( Review Comment: nit: it may be more read-able by using shlex such as following: ``` result = subprocess.run( shlex.split("docker ps -a -q"), capture_output=True, text=True, check=False, ) ``` ########## site/it/site_checks/docker.py: ########## @@ -0,0 +1,112 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +import json +import subprocess +import time + +from .tee import Tee + +DASH_LINE = "------------------------------------------------------------------------------------------------" + + +def cleanup_docker(tee: Tee) -> float: + """ + Clean up (remove) Docker containers, networks, and volumes. + + :param tee: Log output. 
+ :return: Elapsed time (in seconds) spent running this cleanup. + """ + start = time.monotonic() + try: + tee.printf("::group::Cleanup Docker containers, networks, volumes") + tee.printf("Purging Docker containers...") + try: + result = subprocess.run( + ["docker", "ps", "-a", "-q"], + capture_output=True, + text=True, + check=False, Review Comment: NIT: default value for `check` is `False`: ``` subprocess.run(args, *, stdin=None, input=None, stdout=None, stderr=None, capture_output=False, shell=False, cwd=None, timeout=None, check=False, encoding=None, errors=None, text=None, env=None, universal_newlines=None, **other_popen_kwargs) ``` ########## site/it/site_checks/docker.py: ########## @@ -0,0 +1,112 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +import json +import subprocess +import time + +from .tee import Tee + +DASH_LINE = "------------------------------------------------------------------------------------------------" + + +def cleanup_docker(tee: Tee) -> float: + """ + Clean up (remove) Docker containers, networks, and volumes. + + :param tee: Log output. + :return: Elapsed time (in seconds) spent running this cleanup. 
+ """ + start = time.monotonic() + try: + tee.printf("::group::Cleanup Docker containers, networks, volumes") + tee.printf("Purging Docker containers...") + try: + result = subprocess.run( + ["docker", "ps", "-a", "-q"], + capture_output=True, + text=True, + check=False, + ) + container_ids = [line for line in result.stdout.splitlines() if line.strip()] + if container_ids: + tee.run(["docker", "container", "rm", "-f", *container_ids]) + except FileNotFoundError: Review Comment: So the `FileNotFoundError` here is to check if `docker` binary is present? Instead of using `pass` to ignore silently, should we do warning instead? Also, should we put all run in a try block instead of diff try blocks as if one is missing, the remaining one will fail as well. ########## site/it/site_checks/markdown_testing.py: ########## @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +from __future__ import annotations + +import getopt +import os +import sys +import time +from pathlib import Path +from textwrap import dedent +from typing import List + +from .code_block import CodeBlocksGlobal +from .docker import cleanup_docker, docker_compose_info +from .gen_test_script import generate_markdown_test_script +from .spark import SPARK_TGZ_URL, ensure_spark +from .tee import Tee + + +def _format_duration(seconds: float) -> str: Review Comment: nits: maybe use `divmod` to split seconds into minutes and seconds, such as the following: ``` def _format_duration(seconds: float) -> str: seconds = max(0.0, float(seconds)) if seconds < 60: return f"{seconds:.2f}s" minutes, rem_seconds = divmod(int(round(seconds)), 60) return f"{minutes}m {rem_seconds:02d}s" ``` ########## site/it/site_checks/markdown_testing.py: ########## @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +from __future__ import annotations + +import getopt +import os +import sys +import time +from pathlib import Path +from textwrap import dedent +from typing import List + +from .code_block import CodeBlocksGlobal +from .docker import cleanup_docker, docker_compose_info +from .gen_test_script import generate_markdown_test_script +from .spark import SPARK_TGZ_URL, ensure_spark +from .tee import Tee + + +def _format_duration(seconds: float) -> str: + seconds = max(0.0, float(seconds)) + if seconds < 60: + return f"{seconds:.2f}s" + minutes = int(seconds // 60) + rem_seconds = int(round(seconds - (minutes * 60))) + if rem_seconds == 60: + minutes += 1 + rem_seconds = 0 + return f"{minutes}m {rem_seconds:02d}s" + + +def run_test( + md_file: Path, + site_dir: Path, + workspace_dir: Path, + build_tests_dir: Path, + test_summary_file: Path, + env: dict[str, str], + code_blocks_global: CodeBlocksGlobal +) -> bool: + """ + Exercises testing of a single Markdown file. + + :param md_file: The Markdown file to test. + :param site_dir: The `site/` directory of the workspace. + :param workspace_dir: Root of the Polaris worktree directory. + :param build_tests_dir: Directory to place test scripts and logs. + :param test_summary_file: Summary file for the test results. + :param env: Environment variables to pass to the test script. + :return: `True` if the test ran successfully, `False` otherwise. 
+ """ + if not md_file.is_file(): + raise Exception(f"Error: {md_file} does not exist or is not a file") + if md_file.suffix != ".md": + raise Exception(f"Error: {md_file} is not a markdown file") + + try: + md_rel_path = md_file.resolve().relative_to(site_dir.resolve()) + except ValueError: + raise Exception(f"Error: {md_file} must be inside {site_dir}") + + md_rel_name = md_rel_path.as_posix() + md_base_name = md_file.name + md_dir_name = str(md_rel_path.parent) + test_name = md_rel_name.replace("/", "_-_") + test_script = build_tests_dir / f"{test_name}.sh" + log_file = build_tests_dir / f"{test_name}.log" + + with log_file.open("w", encoding="utf-8") as log_handle: + tee = Tee(sys.stdout, log_handle) + tee.printf(">") + tee.printf(f"> Testing {md_base_name} in {md_dir_name} ...") + + generate_markdown_test_script(md_file, test_script, code_blocks_global) + + cleanup_docker(tee) + env = dict(env) + env["SITE_TEST_GUIDE_DIR"] = str(site_dir / md_dir_name) + test_start = time.monotonic() + failed = tee.run([str(test_script)], cwd=workspace_dir, env=env) != 0 + test_duration = time.monotonic() - test_start + # Unconditionally emit an `::endgroup::` in case the generated script failed to emit its own. + tee.printf("::endgroup::") + + duration_str = _format_duration(test_duration) + if failed: + summary = f"❌ Test failed for {md_file} ({duration_str})" + else: + summary = f"✅ Test passed for {md_file} ({duration_str})" + + tee.printf(f"{summary}") + with test_summary_file.open("a", encoding="utf-8") as summary_handle: + summary_handle.write(f"{summary}\n") + if env.get("GITHUB_STEP_SUMMARY"): + with open(env["GITHUB_STEP_SUMMARY"], "a", encoding="utf-8") as summary_out: + summary_out.write(f"{summary}\n") + tee.printf(">") + + docker_info_duration = docker_compose_info(tee, md_base_name, md_dir_name) + cleanup_duration = cleanup_docker(tee) + + tee.printf(dedent(f""" + Test steps durations for for {md_file} ({'❌ FAILED' if failed else '✅ passed'}): + .. 
Test: {duration_str} + .. Docker Compose Info: {_format_duration(docker_info_duration)} + .. Docker Cleanup: {_format_duration(cleanup_duration)} + """)) + tee.printf("************************************************************************************************") + + return not failed + + +def usage() -> None: + sys.stderr.write( + dedent( + f""" + Usage: markdown-testing.sh [options] [<markdown-file> ...] + + markdown-files MUST be inside the site/ directory. + If no markdown-files are specified on the command line, + all markdown-files in site/content/guides are tested. + + Options: + -s <spark-tarball> + --spark-tarball=<spark-tarball> + URL to download the Spark tarball. + Default: {SPARK_TGZ_URL}. + -S + --download-spark-tarball + Always download the Spark tarball. + -h + --help + Show this help text. + """ + ) + ) + + +def markdown_testing(args: List[str]) -> bool: + """ + Main entry point for Markdown code block testing. + + :param args: Command line arguments. + :return: `False` indicates an error occurred, `True` otherwise. 
+ """ + try: + opts, args = getopt.getopt(args, "hs:S", ["help", "spark-tarball=", "download-spark-tarball"]) + except getopt.GetoptError as exc: + sys.stderr.write(f"Unknown option: -{exc.opt}\n") + usage() + return False + + spark_tarball_url = SPARK_TGZ_URL + spark_download_requested = False + + for opt, val in opts: + if opt in ("-h", "--help"): + usage() + return True + elif opt in ("-s", "--spark-tarball"): + spark_tarball_url = val + elif opt in ("-S", "--download-spark-tarball"): + spark_download_requested = True + + script_dir = Path(__file__).resolve().parent.parent + site_dir = script_dir.parent + + build_dir = site_dir / "it" / "build" + build_tests_dir = build_dir / "tests" + spark_dir = build_dir / "spark" + build_tests_dir.mkdir(parents=True, exist_ok=True) + spark_dir.mkdir(parents=True, exist_ok=True) + + spark_sql_bin = spark_dir / "bin" / "spark-sql" + ensure_spark( + spark_sql_bin=spark_sql_bin, + spark_dir=spark_dir, + spark_tarball_url=spark_tarball_url, + spark_download_requested=spark_download_requested, + ) + + workspace_dir = (site_dir / "..").resolve() + + env = dict(os.environ) + env["SITE_TEST_WORKSPACE_DIR"] = str(workspace_dir) + env["SPARK_SQL_BIN"] = str(spark_sql_bin) + env["BUILD_TESTS_DIR"] = str(build_tests_dir) + + if args: + raw_files = list(Path(arg) for arg in args) + else: + raw_files = (site_dir / "content" / "guides").glob("*/*.md") + md_files = sorted(p.absolute().relative_to(site_dir) for p in raw_files) + + test_summary_file = build_tests_dir / "test-summary.txt" + if test_summary_file.exists(): + test_summary_file.unlink() + test_summary_file.touch() + + code_blocks_global = CodeBlocksGlobal() + + os.chdir(site_dir) + all_successful = True + for md_file in md_files: + all_successful &= run_test( + md_file=md_file, + site_dir=site_dir, + workspace_dir=workspace_dir, + build_tests_dir=build_tests_dir, + test_summary_file=test_summary_file, + env=env, + code_blocks_global=code_blocks_global, + ) + + print(">") + + 
print("************************************************************************************************") Review Comment: nit: `print('*' * 96)` ########## site/it/site_checks/spark.py: ########## @@ -0,0 +1,65 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +import subprocess +from pathlib import Path + +SPARK_TGZ_URL = "https://downloads.apache.org/spark/spark-3.5.8/spark-3.5.8-bin-hadoop3.tgz" Review Comment: As suggested by [dongjoon-hyun](https://github.com/dongjoon-hyun) from https://github.com/apache/polaris/pull/2038, it is better to use `closer.lua` instead. Also, as this is just for testing, should we consider just install pyspark via pip instead? We can add it as a test dependency in `pyproject.yaml`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
