This is an automated email from the ASF dual-hosted git repository. potiuk pushed a commit to branch 3.2.0-docs in repository https://gitbox.apache.org/repos/asf/airflow.git
commit 7204dff4263a617f29eb4d63db2d87c9e0a94fd8 Author: Jarek Potiuk <[email protected]> AuthorDate: Mon Apr 6 16:49:28 2026 +0200 Expand sensitive vars to full list with component mapping and auto-update Update security_model.rst sensitive config variables section: - List ALL sensitive vars from config.yml and provider.yaml files - Core vars organized in a table with "Needed by" column mapping each var to the components that require it (API Server, Scheduler, Workers, Dag File Processor, Triggerer) - Provider vars in a separate table noting they should only be set where the provider functionality is needed - Tables are auto-generated between AUTOGENERATED markers Update prek hook to auto-update the sensitive var tables: - Reads config.yml and all provider.yaml files - Generates RST list-table content for core and provider sensitive vars - Replaces content between markers on each run - Warns when new sensitive vars need component mapping added to the hook - Validates [section] option and AIRFLOW__X__Y references against config - Skips autogenerated sections when checking env var references --- airflow-core/docs/security/security_model.rst | 116 ++++++++- scripts/ci/prek/check_security_doc_constants.py | 313 +++++++++++++++++------- 2 files changed, 325 insertions(+), 104 deletions(-) diff --git a/airflow-core/docs/security/security_model.rst b/airflow-core/docs/security/security_model.rst index cb1ade8e4f8..0b76b95a3f3 100644 --- a/airflow-core/docs/security/security_model.rst +++ b/airflow-core/docs/security/security_model.rst @@ -402,19 +402,109 @@ model — Airflow does not enforce these natively. the same Unix user. Environment variables can also be scoped to individual processes or containers, making it easier to restrict which components have access to which secrets. 
- The following is a non-exhaustive list of security-sensitive configuration variables that should - be carefully restricted: - - * ``AIRFLOW__API_AUTH__JWT_SECRET`` — JWT signing key (symmetric mode). - * ``AIRFLOW__API_AUTH__JWT_PRIVATE_KEY_PATH`` — Path to JWT private key (asymmetric mode). - * ``AIRFLOW__DATABASE__SQL_ALCHEMY_CONN`` — Metadata database connection string. - * ``AIRFLOW__CELERY__RESULT_BACKEND`` — Celery result backend connection string. - * ``AIRFLOW__CELERY__BROKER_URL`` — Celery broker URL. - * ``AIRFLOW__CORE__FERNET_KEY`` — Fernet encryption key for connections and variables at rest. - * ``AIRFLOW__SECRETS__BACKEND_KWARGS`` — Secrets backend credentials. - - This is not a complete list. Deployment Managers should review the full configuration reference - and identify all parameters that contain credentials or secrets relevant to their deployment. + The following tables list all security-sensitive configuration variables (marked ``sensitive: true`` + in Airflow's configuration). Deployment Managers should review each variable and ensure it is only + provided to the components that need it. The "Needed by" column indicates which components + typically require the variable — but actual needs depend on the specific deployment topology and + features in use. + + .. START AUTOGENERATED CORE SENSITIVE VARS + + **Core Airflow sensitive configuration variables:** + + .. 
list-table:: + :header-rows: 1 + :widths: 40 30 30 + + * - Environment variable + - Description + - Needed by + * - ``AIRFLOW__API_AUTH__JWT_SECRET`` + - JWT signing key (symmetric mode) + - API Server, Scheduler + * - ``AIRFLOW__API__SECRET_KEY`` + - API secret key for log token signing + - API Server, Scheduler, Workers, Triggerer + * - ``AIRFLOW__CORE__ASSET_MANAGER_KWARGS`` + - Asset manager credentials + - Dag File Processor + * - ``AIRFLOW__CORE__FERNET_KEY`` + - Fernet encryption key for connections/variables at rest + - API Server, Scheduler, Workers, Dag File Processor, Triggerer + * - ``AIRFLOW__DATABASE__SQL_ALCHEMY_CONN`` + - Metadata database connection string + - API Server, Scheduler, Dag File Processor, Triggerer + * - ``AIRFLOW__DATABASE__SQL_ALCHEMY_CONN_ASYNC`` + - Async metadata database connection string + - API Server, Scheduler, Dag File Processor, Triggerer + * - ``AIRFLOW__DATABASE__SQL_ALCHEMY_ENGINE_ARGS`` + - SQLAlchemy engine parameters (may contain credentials) + - API Server, Scheduler, Dag File Processor, Triggerer + * - ``AIRFLOW__LOGGING__REMOTE_TASK_HANDLER_KWARGS`` + - Remote logging handler credentials + - Scheduler, Workers, Triggerer + * - ``AIRFLOW__SECRETS__BACKEND_KWARGS`` + - Secrets backend credentials (non-worker mode) + - Scheduler, Dag File Processor, Triggerer + * - ``AIRFLOW__SENTRY__SENTRY_DSN`` + - Sentry error reporting endpoint + - Scheduler, Triggerer + * - ``AIRFLOW__WORKERS__SECRETS_BACKEND_KWARGS`` + - Worker-specific secrets backend credentials + - Workers + + .. END AUTOGENERATED CORE SENSITIVE VARS + + Note that ``AIRFLOW__API_AUTH__JWT_PRIVATE_KEY_PATH`` (path to the JWT private key for asymmetric + signing) is not marked as ``sensitive`` in config.yml because it is a file path, not a secret + value itself. However, access to the file it points to should be restricted to the Scheduler + (which generates tokens) and the API Server (which validates them). + + .. 
START AUTOGENERATED PROVIDER SENSITIVE VARS + + **Provider-specific sensitive configuration variables:** + + The following variables are defined by Airflow providers and should only be set on components where + the corresponding provider functionality is needed. The decision of which components require these + variables depends on the Deployment Manager's choices about which providers and features are + enabled in each component. + + .. list-table:: + :header-rows: 1 + :widths: 40 30 30 + + * - Environment variable + - Provider + - Description + * - ``AIRFLOW__CELERY_BROKER_TRANSPORT_OPTIONS__SENTINEL_KWARGS`` + - celery + - Sentinel kwargs + * - ``AIRFLOW__CELERY_RESULT_BACKEND_TRANSPORT_OPTIONS__SENTINEL_KWARGS`` + - celery + - Sentinel kwargs + * - ``AIRFLOW__CELERY__BROKER_URL`` + - celery + - Broker url + * - ``AIRFLOW__CELERY__FLOWER_BASIC_AUTH`` + - celery + - Flower basic auth + * - ``AIRFLOW__CELERY__RESULT_BACKEND`` + - celery + - Result backend + * - ``AIRFLOW__KEYCLOAK_AUTH_MANAGER__CLIENT_SECRET`` + - keycloak + - Client secret + * - ``AIRFLOW__OPENSEARCH__PASSWORD`` + - opensearch + - Password + * - ``AIRFLOW__OPENSEARCH__USERNAME`` + - opensearch + - Username + + .. END AUTOGENERATED PROVIDER SENSITIVE VARS + + Deployment Managers should review the full configuration reference and identify any additional + parameters that contain credentials or secrets relevant to their specific deployment. **Use asymmetric keys for JWT signing** Using asymmetric keys (``[api_auth] jwt_private_key_path`` with a JWKS endpoint) provides better diff --git a/scripts/ci/prek/check_security_doc_constants.py b/scripts/ci/prek/check_security_doc_constants.py index 41c26db0a6f..ef4f31fde9a 100755 --- a/scripts/ci/prek/check_security_doc_constants.py +++ b/scripts/ci/prek/check_security_doc_constants.py @@ -23,15 +23,14 @@ # ] # /// """ -Validate that security documentation references to config.yml options stay in sync. 
+Validate and auto-update security documentation against config.yml. Checks performed: 1. Every ``[section] option`` reference in the security RST files corresponds to an - actual option in config.yml. + actual option in config.yml or provider.yaml. 2. Default values quoted in the docs match the defaults in config.yml. - 3. The sensitive-variable list in security_model.rst includes all config options - marked ``sensitive: true`` in config.yml that have an ``AIRFLOW__`` env-var form. - 4. ``[section] option`` references use correct section names. + 3. Auto-updates the sensitive-variable tables in security_model.rst between + AUTOGENERATED markers to stay in sync with config.yml and provider.yaml. """ from __future__ import annotations @@ -51,72 +50,243 @@ console = Console(color_system="standard", width=200) CONFIG_YML = AIRFLOW_ROOT_PATH / "airflow-core" / "src" / "airflow" / "config_templates" / "config.yml" PROVIDERS_ROOT = AIRFLOW_ROOT_PATH / "providers" +SECURITY_MODEL_RST = AIRFLOW_ROOT_PATH / "airflow-core" / "docs" / "security" / "security_model.rst" SECURITY_DOCS = [ AIRFLOW_ROOT_PATH / "airflow-core" / "docs" / "security" / "jwt_token_authentication.rst", - AIRFLOW_ROOT_PATH / "airflow-core" / "docs" / "security" / "security_model.rst", + SECURITY_MODEL_RST, ] # Pattern to match ``[section] option_name`` references in RST SECTION_OPTION_RE = re.compile(r"``\[(\w+)\]\s+(\w+)``") -# Pattern to match default value claims like "default: VALUE" or "Default VALUE" or "(default VALUE)" -# in table rows like "- 86400 (24h)" preceded by a config option -DEFAULT_IN_TABLE_RE = re.compile( - r"``\[(\w+)\]\s+(\w+)``.*?(?:default[:\s]+|Default[:\s]+)(\S+)", re.IGNORECASE -) - # Pattern to match AIRFLOW__SECTION__OPTION env var references ENV_VAR_RE = re.compile(r"``(AIRFLOW__\w+)``") # Map section+option to the AIRFLOW__ env var form SECTION_OPT_TO_ENV = re.compile(r"AIRFLOW__([A-Z_]+)__([A-Z_]+)") +# Markers for autogenerated sections +CORE_START = " .. 
START AUTOGENERATED CORE SENSITIVE VARS" +CORE_END = " .. END AUTOGENERATED CORE SENSITIVE VARS" +PROVIDER_START = " .. START AUTOGENERATED PROVIDER SENSITIVE VARS" +PROVIDER_END = " .. END AUTOGENERATED PROVIDER SENSITIVE VARS" + +# Which components need which core config sections/options. +# Maps (section, option) -> comma-separated string of component names. +# This is the source of truth for the "Needed by" column. +CORE_COMPONENT_MAP: dict[tuple[str, str], str] = { + ("api", "secret_key"): "API Server, Scheduler, Workers, Triggerer", + ("api_auth", "jwt_secret"): "API Server, Scheduler", + ("core", "asset_manager_kwargs"): "Dag File Processor", + ("core", "fernet_key"): "API Server, Scheduler, Workers, Dag File Processor, Triggerer", + ("database", "sql_alchemy_conn"): "API Server, Scheduler, Dag File Processor, Triggerer", + ("database", "sql_alchemy_conn_async"): "API Server, Scheduler, Dag File Processor, Triggerer", + ("database", "sql_alchemy_engine_args"): "API Server, Scheduler, Dag File Processor, Triggerer", + ("logging", "remote_task_handler_kwargs"): "Scheduler, Workers, Triggerer", + ("secrets", "backend_kwargs"): "Scheduler, Dag File Processor, Triggerer", + ("sentry", "sentry_dsn"): "Scheduler, Triggerer", + ("workers", "secrets_backend_kwargs"): "Workers", +} + +# Human-readable descriptions for core sensitive vars +CORE_DESCRIPTIONS: dict[tuple[str, str], str] = { + ("api", "secret_key"): "API secret key for log token signing", + ("api_auth", "jwt_secret"): "JWT signing key (symmetric mode)", + ("core", "asset_manager_kwargs"): "Asset manager credentials", + ("core", "fernet_key"): "Fernet encryption key for connections/variables at rest", + ("database", "sql_alchemy_conn"): "Metadata database connection string", + ("database", "sql_alchemy_conn_async"): "Async metadata database connection string", + ("database", "sql_alchemy_engine_args"): "SQLAlchemy engine parameters (may contain credentials)", + ("logging", "remote_task_handler_kwargs"): "Remote logging 
handler credentials", + ("secrets", "backend_kwargs"): "Secrets backend credentials (non-worker mode)", + ("sentry", "sentry_dsn"): "Sentry error reporting endpoint", + ("workers", "secrets_backend_kwargs"): "Worker-specific secrets backend credentials", +} + def option_to_env_var(section: str, option: str) -> str: """Convert a config section+option to its AIRFLOW__ env var form.""" return f"AIRFLOW__{section.upper()}__{option.upper()}" -def load_config() -> dict: - """Load config.yml and provider.yaml files to get all config sections/options.""" +def load_core_config() -> dict: + """Load the core config.yml.""" with open(CONFIG_YML) as f: - config = yaml.safe_load(f) + return yaml.safe_load(f) - # Also load provider.yaml files which define config sections under "config:" key - # (e.g., [celery], [sentry], [workers]) - for provider_yaml in PROVIDERS_ROOT.glob("*/provider.yaml"): - with open(provider_yaml) as f: - provider_data = yaml.safe_load(f) - if provider_data and "config" in provider_data: - for section_name, section_data in provider_data["config"].items(): - if isinstance(section_data, dict) and section_name not in config: - config[section_name] = section_data - return config +def load_provider_configs() -> dict[str, dict]: + """Load provider.yaml files. 
Returns {provider_name: config_sections}.""" + result = {} + for provider_yaml in sorted(PROVIDERS_ROOT.glob("*/provider.yaml")): + with open(provider_yaml) as f: + data = yaml.safe_load(f) + if data and "config" in data: + provider_name = provider_yaml.parent.name + result[provider_name] = data["config"] + return result -def get_all_options(config: dict) -> dict[tuple[str, str], dict]: +def get_all_options(core_config: dict, provider_configs: dict[str, dict]) -> dict[tuple[str, str], dict]: """Return a dict of (section, option) -> option_config for all config options.""" result = {} - for section_name, section_data in config.items(): + for section_name, section_data in core_config.items(): if not isinstance(section_data, dict) or "options" not in section_data: continue for option_name, option_config in section_data["options"].items(): if isinstance(option_config, dict): result[(section_name, option_name)] = option_config - return result + for _provider_name, sections in provider_configs.items(): + for section_name, section_data in sections.items(): + if not isinstance(section_data, dict) or "options" not in section_data: + continue + for option_name, option_config in section_data["options"].items(): + if isinstance(option_config, dict): + result[(section_name, option_name)] = option_config -def get_sensitive_env_vars(all_options: dict[tuple[str, str], dict]) -> set[str]: - """Return set of AIRFLOW__X__Y env vars for all sensitive config options.""" - result = set() - for (section, option), config in all_options.items(): - if config.get("sensitive"): - result.add(option_to_env_var(section, option)) return result +def get_core_sensitive_vars(core_config: dict) -> list[tuple[str, str]]: + """Return sorted list of (section, option) for core sensitive config options.""" + result = [] + for section_name, section_data in core_config.items(): + if not isinstance(section_data, dict) or "options" not in section_data: + continue + for option_name, option_config in 
section_data["options"].items(): + if isinstance(option_config, dict) and option_config.get("sensitive"): + result.append((section_name, option_name)) + return sorted(result, key=lambda x: option_to_env_var(*x)) + + +def get_provider_sensitive_vars( + provider_configs: dict[str, dict], +) -> list[tuple[str, str, str]]: + """Return sorted list of (provider, section, option) for provider sensitive config options.""" + result = [] + for provider_name, sections in provider_configs.items(): + for section_name, section_data in sections.items(): + if not isinstance(section_data, dict) or "options" not in section_data: + continue + for option_name, option_config in section_data["options"].items(): + if isinstance(option_config, dict) and option_config.get("sensitive"): + result.append((provider_name, section_name, option_name)) + return sorted(result, key=lambda x: option_to_env_var(x[1], x[2])) + + +def generate_core_table(core_sensitive: list[tuple[str, str]]) -> list[str]: + """Generate RST list-table lines for core sensitive vars.""" + lines = [ + "", + " **Core Airflow sensitive configuration variables:**", + "", + " .. 
list-table::", + " :header-rows: 1", + " :widths: 40 30 30", + "", + " * - Environment variable", + " - Description", + " - Needed by", + ] + for section, option in core_sensitive: + env_var = option_to_env_var(section, option) + desc = CORE_DESCRIPTIONS.get((section, option), f"[{section}] {option}") + needed_by = CORE_COMPONENT_MAP.get((section, option), "Review per deployment") + lines.append(f" * - ``{env_var}``") + lines.append(f" - {desc}") + lines.append(f" - {needed_by}") + + # Check for unmapped vars and warn + for section, option in core_sensitive: + if (section, option) not in CORE_COMPONENT_MAP: + console.print( + f" [yellow]⚠[/] New core sensitive var [{section}] {option} — " + f"add it to CORE_COMPONENT_MAP in check_security_doc_constants.py" + ) + if (section, option) not in CORE_DESCRIPTIONS: + console.print( + f" [yellow]⚠[/] New core sensitive var [{section}] {option} — " + f"add a description to CORE_DESCRIPTIONS in check_security_doc_constants.py" + ) + + return lines + + +def generate_provider_table(provider_sensitive: list[tuple[str, str, str]]) -> list[str]: + """Generate RST list-table lines for provider sensitive vars.""" + lines = [ + "", + " **Provider-specific sensitive configuration variables:**", + "", + " The following variables are defined by Airflow providers and should only be set on components where", + " the corresponding provider functionality is needed. The decision of which components require these", + " variables depends on the Deployment Manager's choices about which providers and features are", + " enabled in each component.", + "", + " .. 
list-table::", + " :header-rows: 1", + " :widths: 40 30 30", + "", + " * - Environment variable", + " - Provider", + " - Description", + ] + for provider, section, option in provider_sensitive: + env_var = option_to_env_var(section, option) + # Generate a reasonable description from the option name + desc = option.replace("_", " ").capitalize() + lines.append(f" * - ``{env_var}``") + lines.append(f" - {provider}") + lines.append(f" - {desc}") + + return lines + + +def update_autogenerated_section( + content: str, start_marker: str, end_marker: str, new_lines: list[str] +) -> str: + """Replace content between markers with new content.""" + lines = content.splitlines() + start_idx = None + end_idx = None + + for i, line in enumerate(lines): + if start_marker in line: + start_idx = i + elif end_marker in line: + end_idx = i + break + + if start_idx is None or end_idx is None: + console.print(f" [red]✗[/] Could not find markers {start_marker!r} / {end_marker!r}") + return content + + result = lines[: start_idx + 1] + new_lines + [""] + lines[end_idx:] + return "\n".join(result) + "\n" + + +def update_sensitive_var_tables( + core_sensitive: list[tuple[str, str]], + provider_sensitive: list[tuple[str, str, str]], +) -> bool: + """Update the autogenerated tables in security_model.rst. 
Returns True if changed.""" + content = SECURITY_MODEL_RST.read_text() + original = content + + core_lines = generate_core_table(core_sensitive) + content = update_autogenerated_section(content, CORE_START, CORE_END, core_lines) + + provider_lines = generate_provider_table(provider_sensitive) + content = update_autogenerated_section(content, PROVIDER_START, PROVIDER_END, provider_lines) + + if content != original: + SECURITY_MODEL_RST.write_text(content) + return True + return False + + def check_option_references(doc_path: Path, all_options: dict[tuple[str, str], dict]) -> list[str]: """Check that all [section] option references in the doc exist in config.yml.""" errors = [] @@ -127,7 +297,6 @@ def check_option_references(doc_path: Path, all_options: dict[tuple[str, str], d section = match.group(1) option = match.group(2) if (section, option) not in all_options: - # Check if the section exists at all section_exists = any(s == section for s, _ in all_options) if section_exists: errors.append( @@ -146,6 +315,9 @@ def check_env_var_references(doc_path: Path, all_options: dict[tuple[str, str], content = doc_path.read_text() for line_num, line in enumerate(content.splitlines(), 1): + # Skip lines inside autogenerated sections — those are managed by the update logic + if "AUTOGENERATED" in line: + continue for match in ENV_VAR_RE.finditer(line): env_var = match.group(1) m = SECTION_OPT_TO_ENV.match(env_var) @@ -154,7 +326,6 @@ def check_env_var_references(doc_path: Path, all_options: dict[tuple[str, str], section = m.group(1).lower() option = m.group(2).lower() if (section, option) not in all_options: - # Check if the section exists section_exists = any(s == section for s, _ in all_options) if section_exists: errors.append( @@ -169,52 +340,12 @@ def check_env_var_references(doc_path: Path, all_options: dict[tuple[str, str], return errors -def check_sensitive_vars_listed(security_model_path: Path, sensitive_env_vars: set[str]) -> list[str]: - """ - Check that 
security_model.rst lists all sensitive config vars (as env vars). - - Returns warnings (printed but not counted as errors) since the doc explicitly - states the list is non-exhaustive. - """ - content = security_model_path.read_text() - - # Find the env vars that ARE listed in the doc - listed_env_vars = set() - for match in ENV_VAR_RE.finditer(content): - env_var = match.group(1) - if env_var.startswith("AIRFLOW__"): - listed_env_vars.add(env_var) - - # Print warnings for missing sensitive vars (not errors — the list is non-exhaustive) - missing = sorted(sensitive_env_vars - listed_env_vars) - if missing: - console.print() - console.print( - f" [yellow]⚠[/] {security_model_path.name}: The following sensitive config variables " - f"are not mentioned in the deployment hardening section (the list is documented as " - f"non-exhaustive, so these are warnings, not errors):" - ) - for env_var in missing: - console.print(f" [yellow]- {env_var}[/]") - - # No errors returned — these are warnings only - return [] - - def check_defaults_in_tables(doc_path: Path, all_options: dict[tuple[str, str], dict]) -> list[str]: - """ - Check default values in RST table rows match config.yml. - - Looks for patterns like: - * - ``[section] option`` - - 86400 (24h) - in list-table blocks. 
- """ + """Check default values in RST table rows match config.yml.""" errors = [] content = doc_path.read_text() lines = content.splitlines() - # Simple heuristic: find table rows with a config reference followed by a default value row i = 0 while i < len(lines): line = lines[i] @@ -222,7 +353,6 @@ def check_defaults_in_tables(doc_path: Path, all_options: dict[tuple[str, str], if match and "* -" in line: section = match.group(1) option = match.group(2) - # Next non-empty line starting with "- " or " -" is the value j = i + 1 while j < len(lines) and not lines[j].strip(): j += 1 @@ -231,18 +361,14 @@ def check_defaults_in_tables(doc_path: Path, all_options: dict[tuple[str, str], key = (section, option) if key in all_options: config_default = str(all_options[key].get("default", "~")) - # Extract numeric part from doc value like "86400 (24h)" -> "86400" doc_value = value_line.split()[0] if value_line else "" - # Strip surrounding backticks doc_value = doc_value.strip("`") - # Normalize: remove quotes from config default config_default_clean = config_default.strip('"').strip("'") if ( doc_value and config_default_clean and config_default_clean not in ("~", "None", "none", "") and doc_value != config_default_clean - # Don't flag if the doc value is a human-readable form and not doc_value.startswith("Auto") and not doc_value.startswith("None") and doc_value != "``GUESS``" @@ -257,26 +383,31 @@ def check_defaults_in_tables(doc_path: Path, all_options: dict[tuple[str, str], def main() -> int: - config = load_config() - all_options = get_all_options(config) - sensitive_env_vars = get_sensitive_env_vars(all_options) + core_config = load_core_config() + provider_configs = load_provider_configs() + all_options = get_all_options(core_config, provider_configs) + # Step 1: Auto-update the sensitive var tables + core_sensitive = get_core_sensitive_vars(core_config) + provider_sensitive = get_provider_sensitive_vars(provider_configs) + + if 
update_sensitive_var_tables(core_sensitive, provider_sensitive): + console.print( + " [yellow]⚠[/] security_model.rst sensitive variable tables were out of date and have been updated." + ) + console.print(" [yellow] Please review and commit the changes.[/]") + + # Step 2: Validate references (re-read after potential update) all_errors: list[str] = [] for doc_path in SECURITY_DOCS: if not doc_path.exists(): console.print(f" [yellow]⚠[/] {doc_path.name} not found, skipping") continue - all_errors.extend(check_option_references(doc_path, all_options)) all_errors.extend(check_env_var_references(doc_path, all_options)) all_errors.extend(check_defaults_in_tables(doc_path, all_options)) - # Check sensitive vars are listed in security_model.rst - security_model = AIRFLOW_ROOT_PATH / "airflow-core" / "docs" / "security" / "security_model.rst" - if security_model.exists(): - all_errors.extend(check_sensitive_vars_listed(security_model, sensitive_env_vars)) - if all_errors: console.print() for error in all_errors:
