Copilot commented on code in PR #64843:
URL: https://github.com/apache/airflow/pull/64843#discussion_r3066476391
##########
providers/openlineage/src/airflow/providers/openlineage/sqlparser.py:
##########
@@ -473,13 +473,24 @@ def _get_tables_hierarchy(
def get_openlineage_facets_with_sql(
- hook: DbApiHook, sql: str | list[str], conn_id: str, database: str | None,
use_connection: bool = True
+ hook: DbApiHook,
+ sql: str | list[str],
+ conn_id: str,
+ database: str | None,
+ use_connection: bool = True,
+ connection=None,
+ database_info=None,
) -> OperatorLineage | None:
Review Comment:
The new `connection`/`database_info` parameters are untyped and
positional-capable, which makes incorrect positional calls easier and reduces
readability for API consumers. Consider making these keyword-only (e.g., `*,
connection=..., database_info=...`) and adding concrete types (even if broad,
e.g., `connection: Any | None`, `database_info: Any | None`) to clarify intent
and improve static analysis.
##########
providers/openlineage/src/airflow/providers/openlineage/sqlparser.py:
##########
@@ -473,13 +473,24 @@ def _get_tables_hierarchy(
def get_openlineage_facets_with_sql(
- hook: DbApiHook, sql: str | list[str], conn_id: str, database: str | None,
use_connection: bool = True
+ hook: DbApiHook,
+ sql: str | list[str],
+ conn_id: str,
+ database: str | None,
+ use_connection: bool = True,
+ connection=None,
+ database_info=None,
) -> OperatorLineage | None:
- connection = hook.get_connection(conn_id)
- try:
- database_info = hook.get_openlineage_database_info(connection)
- except AttributeError:
- database_info = None
+ # Accept pre-fetched connection and database_info to avoid redundant
hook.get_connection()
+ # calls when processing multiple SQL extras from the same hook. Each
get_connection() call
+ # hits SecretsManager (miss) then the Airflow API server — passing these
in caches that cost.
+ if connection is None or database_info is None:
+ connection = hook.get_connection(conn_id)
+ if database_info is None:
+ try:
+ database_info = hook.get_openlineage_database_info(connection)
+ except AttributeError:
+ database_info = None
if database_info is None:
Review Comment:
When a caller passes `connection` but not `database_info`, this branch still
re-fetches the connection and overwrites the provided one. This defeats the
purpose of accepting a pre-fetched connection and can reintroduce the expensive
`get_connection()` call. Split the logic so `hook.get_connection(conn_id)` is
only called when `connection is None`, and only fetch `database_info` when it
is missing (using the existing `connection`).
```suggestion
if connection is None:
connection = hook.get_connection(conn_id)
if database_info is None:
try:
database_info = hook.get_openlineage_database_info(connection)
except AttributeError:
database_info = None
if database_info is None:
```
##########
providers/openlineage/src/airflow/providers/openlineage/sqlparser.py:
##########
@@ -473,13 +473,24 @@ def _get_tables_hierarchy(
def get_openlineage_facets_with_sql(
- hook: DbApiHook, sql: str | list[str], conn_id: str, database: str | None,
use_connection: bool = True
+ hook: DbApiHook,
+ sql: str | list[str],
+ conn_id: str,
+ database: str | None,
+ use_connection: bool = True,
+ connection=None,
+ database_info=None,
) -> OperatorLineage | None:
- connection = hook.get_connection(conn_id)
- try:
- database_info = hook.get_openlineage_database_info(connection)
- except AttributeError:
- database_info = None
+ # Accept pre-fetched connection and database_info to avoid redundant
hook.get_connection()
+ # calls when processing multiple SQL extras from the same hook. Each
get_connection() call
+ # hits SecretsManager (miss) then the Airflow API server — passing these
in caches that cost.
Review Comment:
The last sentence is grammatically unclear ('passing these in caches that
cost'). Consider rewording to something like: '... — passing these in avoids
that cost.' This comment explains an important performance motivation, so
clarity helps future maintainers.
```suggestion
# hits SecretsManager (miss) then the Airflow API server — passing these
in avoids that cost.
```
##########
providers/openlineage/src/airflow/providers/openlineage/utils/sql_hook_lineage.py:
##########
@@ -70,6 +71,34 @@ def emit_lineage_from_sql_extras(task_instance, sql_extras:
list, is_successful:
events: list[RunEvent] = []
query_count = 0
+ # Build hook identity -> (hook, conn_id) mapping before iterating.
+ # Using id(hook) as cache key instead of conn_id ensures distinct hook
instances
+ # with the same conn_id but different params are cached separately.
+ _hook_info: dict[int, tuple[object, str | None]] = {}
Review Comment:
The PR description says the caches are keyed by `conn_id` via a `conn_id →
hook` mapping, but the implementation explicitly keys caches by `id(hook)` and
caches separately per hook instance. Either update the PR description to match
the implemented behavior (and rationale), or switch the implementation to
`conn_id` keying if that was the original intent.
##########
providers/openlineage/src/airflow/providers/openlineage/sqlparser.py:
##########
@@ -473,13 +473,24 @@ def _get_tables_hierarchy(
def get_openlineage_facets_with_sql(
- hook: DbApiHook, sql: str | list[str], conn_id: str, database: str | None,
use_connection: bool = True
+ hook: DbApiHook,
+ sql: str | list[str],
+ conn_id: str,
+ database: str | None,
+ use_connection: bool = True,
+ connection=None,
+ database_info=None,
) -> OperatorLineage | None:
- connection = hook.get_connection(conn_id)
- try:
- database_info = hook.get_openlineage_database_info(connection)
- except AttributeError:
- database_info = None
+ # Accept pre-fetched connection and database_info to avoid redundant
hook.get_connection()
+ # calls when processing multiple SQL extras from the same hook. Each
get_connection() call
+ # hits SecretsManager (miss) then the Airflow API server — passing these
in caches that cost.
+ if connection is None or database_info is None:
+ connection = hook.get_connection(conn_id)
+ if database_info is None:
+ try:
+ database_info = hook.get_openlineage_database_info(connection)
+ except AttributeError:
+ database_info = None
Review Comment:
The new `connection`/`database_info` override behavior isn’t directly
covered by unit tests. Add tests that assert: (1) when `connection` is provided
and `database_info` is missing, `hook.get_connection` is not called, and (2)
when `database_info` is provided and `connection` is missing, `get_connection`
is called exactly once and `get_openlineage_database_info` is not called.
```suggestion
if connection is None:
connection = hook.get_connection(conn_id)
if database_info is None:
try:
database_info = hook.get_openlineage_database_info(connection)
except AttributeError:
database_info = None
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]