pierrejeambrun commented on code in PR #44332: URL: https://github.com/apache/airflow/pull/44332#discussion_r1867941494
########## airflow/api_fastapi/core_api/routes/ui/grid.py: ########## @@ -0,0 +1,324 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import collections +import itertools +import operator +from functools import cache + +from fastapi import HTTPException, Request, status +from sqlalchemy import func, select +from sqlalchemy.sql.operators import ColumnOperators +from typing_extensions import Any + +from airflow import DAG +from airflow.api_fastapi.common.db.common import SessionDep, paginated_select +from airflow.api_fastapi.common.parameters import ( + OptionalDateTimeQuery, + QueryDagRunRunTypesFilter, + QueryDagRunStateFilter, + QueryLimit, + QueryOffset, + SortParam, +) +from airflow.api_fastapi.common.router import AirflowRouter +from airflow.api_fastapi.core_api.datamodels.ui.grid import ( + GridDAGRunwithTIs, + GridResponse, + GridTaskInstanceSummary, +) +from airflow.api_fastapi.core_api.openapi.exceptions import create_openapi_http_exception_doc +from airflow.configuration import conf +from airflow.exceptions import AirflowConfigException +from airflow.models import DagRun, MappedOperator, TaskInstance +from airflow.models.baseoperator import BaseOperator +from airflow.models.taskmap 
import TaskMap +from airflow.utils import timezone +from airflow.utils.state import TaskInstanceState +from airflow.utils.task_group import MappedTaskGroup, TaskGroup + +grid_router = AirflowRouter(prefix="/grid", tags=["Grid"]) + + +@grid_router.get( + "/{dag_id}", + include_in_schema=False, + responses=create_openapi_http_exception_doc([status.HTTP_400_BAD_REQUEST, status.HTTP_404_NOT_FOUND]), +) +def grid_data( + dag_id: str, + run_types: QueryDagRunRunTypesFilter, + run_states: QueryDagRunStateFilter, + session: SessionDep, + offset: QueryOffset, + request: Request, + num_runs: QueryLimit, + base_date: OptionalDateTimeQuery = None, + root: str | None = None, + filter_upstream: bool = False, + filter_downstream: bool = False, Review Comment: I renamed those for the `graph_data` to `include_upstream`, and `include_downstream` which I think are more intuitive. Here `filter_upstream=False` will actually `include_upstream=False` => no upstream. (Where we asked for not filtering upstream). ########## airflow/api_fastapi/core_api/routes/ui/grid.py: ########## @@ -0,0 +1,324 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import annotations + +import collections +import itertools +import operator +from functools import cache + +from fastapi import HTTPException, Request, status +from sqlalchemy import func, select +from sqlalchemy.sql.operators import ColumnOperators +from typing_extensions import Any + +from airflow import DAG +from airflow.api_fastapi.common.db.common import SessionDep, paginated_select +from airflow.api_fastapi.common.parameters import ( + OptionalDateTimeQuery, + QueryDagRunRunTypesFilter, + QueryDagRunStateFilter, + QueryLimit, + QueryOffset, + SortParam, +) +from airflow.api_fastapi.common.router import AirflowRouter +from airflow.api_fastapi.core_api.datamodels.ui.grid import ( + GridDAGRunwithTIs, + GridResponse, + GridTaskInstanceSummary, +) +from airflow.api_fastapi.core_api.openapi.exceptions import create_openapi_http_exception_doc +from airflow.configuration import conf +from airflow.exceptions import AirflowConfigException +from airflow.models import DagRun, MappedOperator, TaskInstance +from airflow.models.baseoperator import BaseOperator +from airflow.models.taskmap import TaskMap +from airflow.utils import timezone +from airflow.utils.state import TaskInstanceState +from airflow.utils.task_group import MappedTaskGroup, TaskGroup + +grid_router = AirflowRouter(prefix="/grid", tags=["Grid"]) + + +@grid_router.get( + "/{dag_id}", + include_in_schema=False, + responses=create_openapi_http_exception_doc([status.HTTP_400_BAD_REQUEST, status.HTTP_404_NOT_FOUND]), +) +def grid_data( + dag_id: str, + run_types: QueryDagRunRunTypesFilter, + run_states: QueryDagRunStateFilter, + session: SessionDep, + offset: QueryOffset, + request: Request, + num_runs: QueryLimit, + base_date: OptionalDateTimeQuery = None, + root: str | None = None, + filter_upstream: bool = False, + filter_downstream: bool = False, +) -> GridResponse: + """Return grid data.""" + ## Database calls to retrieve the DAG Runs and Task Instances and validate the data Review 
Comment: I'm not sure this comment is relevant. We are reading from the dagbag here, not directly from the db. (sync from dagbag to db is separate as I understand). ########## airflow/api_fastapi/core_api/routes/ui/grid.py: ########## @@ -0,0 +1,324 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+ +from __future__ import annotations + +import collections +import itertools +import operator +from functools import cache + +from fastapi import HTTPException, Request, status +from sqlalchemy import func, select +from sqlalchemy.sql.operators import ColumnOperators +from typing_extensions import Any + +from airflow import DAG +from airflow.api_fastapi.common.db.common import SessionDep, paginated_select +from airflow.api_fastapi.common.parameters import ( + OptionalDateTimeQuery, + QueryDagRunRunTypesFilter, + QueryDagRunStateFilter, + QueryLimit, + QueryOffset, + SortParam, +) +from airflow.api_fastapi.common.router import AirflowRouter +from airflow.api_fastapi.core_api.datamodels.ui.grid import ( + GridDAGRunwithTIs, + GridResponse, + GridTaskInstanceSummary, +) +from airflow.api_fastapi.core_api.openapi.exceptions import create_openapi_http_exception_doc +from airflow.configuration import conf +from airflow.exceptions import AirflowConfigException +from airflow.models import DagRun, MappedOperator, TaskInstance +from airflow.models.baseoperator import BaseOperator +from airflow.models.taskmap import TaskMap +from airflow.utils import timezone +from airflow.utils.state import TaskInstanceState +from airflow.utils.task_group import MappedTaskGroup, TaskGroup + +grid_router = AirflowRouter(prefix="/grid", tags=["Grid"]) + + +@grid_router.get( + "/{dag_id}", + include_in_schema=False, + responses=create_openapi_http_exception_doc([status.HTTP_400_BAD_REQUEST, status.HTTP_404_NOT_FOUND]), +) +def grid_data( + dag_id: str, + run_types: QueryDagRunRunTypesFilter, + run_states: QueryDagRunStateFilter, + session: SessionDep, + offset: QueryOffset, + request: Request, + num_runs: QueryLimit, + base_date: OptionalDateTimeQuery = None, + root: str | None = None, + filter_upstream: bool = False, + filter_downstream: bool = False, +) -> GridResponse: + """Return grid data.""" + ## Database calls to retrieve the DAG Runs and Task Instances and validate the data + dag: DAG 
= request.app.state.dag_bag.get_dag(dag_id) + if not dag: + raise HTTPException(status.HTTP_404_NOT_FOUND, f"Dag with id {dag_id} was not found") + + if root: + dag = dag.partial_subset( + task_ids_or_regex=root, include_upstream=filter_upstream, include_downstream=filter_downstream + ) + + current_time = timezone.utcnow() + # Retrieve, sort and encode the previous DAG Runs + base_query = ( + select( + DagRun.run_id, + DagRun.queued_at, + DagRun.start_date, + DagRun.end_date, + DagRun.state, + DagRun.run_type, + DagRun.data_interval_start, + DagRun.data_interval_end, + DagRun.dag_version_id.label("version_number"), + ) + .select_from(DagRun) + .where(DagRun.dag_id == dag.dag_id, DagRun.logical_date <= func.coalesce(base_date, current_time)) + .order_by(DagRun.id.desc()) + ) + + def get_dag_run_sort_param(): + """Get the Sort Param for the DAG Run.""" + + def _get_run_ordering_expr(name: str) -> ColumnOperators: + """Get the Run Ordering Expression.""" + expr = DagRun.__mapper__.columns[name] + # Data interval columns are NULL for runs created before 2.3, but SQL's + # NULL-sorting logic would make those old runs always appear first. In a + # perfect world we'd want to sort by ``get_run_data_interval()``, but that's + # not efficient, so instead the columns are coalesced into logical_date, + # which is good enough in most cases. 
+ if name in ("data_interval_start", "data_interval_end"): + expr = func.coalesce(expr, DagRun.logical_date) + return expr.desc() + + ordering_expression = (_get_run_ordering_expr(name) for name in dag.timetable.run_ordering) + # create SortParam with ordering_expression and DagRun.id.desc() + return ordering_expression + Review Comment: I think all that can be replaced by creating an appropriate `SortParam` and giving it to the following `paginated_select` ########## airflow/api_fastapi/core_api/routes/ui/grid.py: ########## @@ -0,0 +1,324 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+ +from __future__ import annotations + +import collections +import itertools +import operator +from functools import cache + +from fastapi import HTTPException, Request, status +from sqlalchemy import func, select +from sqlalchemy.sql.operators import ColumnOperators +from typing_extensions import Any + +from airflow import DAG +from airflow.api_fastapi.common.db.common import SessionDep, paginated_select +from airflow.api_fastapi.common.parameters import ( + OptionalDateTimeQuery, + QueryDagRunRunTypesFilter, + QueryDagRunStateFilter, + QueryLimit, + QueryOffset, + SortParam, +) +from airflow.api_fastapi.common.router import AirflowRouter +from airflow.api_fastapi.core_api.datamodels.ui.grid import ( + GridDAGRunwithTIs, + GridResponse, + GridTaskInstanceSummary, +) +from airflow.api_fastapi.core_api.openapi.exceptions import create_openapi_http_exception_doc +from airflow.configuration import conf +from airflow.exceptions import AirflowConfigException +from airflow.models import DagRun, MappedOperator, TaskInstance +from airflow.models.baseoperator import BaseOperator +from airflow.models.taskmap import TaskMap +from airflow.utils import timezone +from airflow.utils.state import TaskInstanceState +from airflow.utils.task_group import MappedTaskGroup, TaskGroup + +grid_router = AirflowRouter(prefix="/grid", tags=["Grid"]) + + +@grid_router.get( + "/{dag_id}", + include_in_schema=False, + responses=create_openapi_http_exception_doc([status.HTTP_400_BAD_REQUEST, status.HTTP_404_NOT_FOUND]), +) +def grid_data( + dag_id: str, + run_types: QueryDagRunRunTypesFilter, + run_states: QueryDagRunStateFilter, + session: SessionDep, + offset: QueryOffset, + request: Request, + num_runs: QueryLimit, + base_date: OptionalDateTimeQuery = None, + root: str | None = None, + filter_upstream: bool = False, + filter_downstream: bool = False, +) -> GridResponse: + """Return grid data.""" + ## Database calls to retrieve the DAG Runs and Task Instances and validate the data + dag: DAG 
= request.app.state.dag_bag.get_dag(dag_id) + if not dag: + raise HTTPException(status.HTTP_404_NOT_FOUND, f"Dag with id {dag_id} was not found") + + if root: + dag = dag.partial_subset( + task_ids_or_regex=root, include_upstream=filter_upstream, include_downstream=filter_downstream + ) + + current_time = timezone.utcnow() + # Retrieve, sort and encode the previous DAG Runs + base_query = ( + select( + DagRun.run_id, + DagRun.queued_at, + DagRun.start_date, + DagRun.end_date, + DagRun.state, + DagRun.run_type, + DagRun.data_interval_start, + DagRun.data_interval_end, + DagRun.dag_version_id.label("version_number"), + ) + .select_from(DagRun) + .where(DagRun.dag_id == dag.dag_id, DagRun.logical_date <= func.coalesce(base_date, current_time)) + .order_by(DagRun.id.desc()) + ) + + def get_dag_run_sort_param(): + """Get the Sort Param for the DAG Run.""" + + def _get_run_ordering_expr(name: str) -> ColumnOperators: + """Get the Run Ordering Expression.""" + expr = DagRun.__mapper__.columns[name] + # Data interval columns are NULL for runs created before 2.3, but SQL's + # NULL-sorting logic would make those old runs always appear first. In a + # perfect world we'd want to sort by ``get_run_data_interval()``, but that's + # not efficient, so instead the columns are coalesced into logical_date, + # which is good enough in most cases. + if name in ("data_interval_start", "data_interval_end"): + expr = func.coalesce(expr, DagRun.logical_date) + return expr.desc() + + ordering_expression = (_get_run_ordering_expr(name) for name in dag.timetable.run_ordering) + # create SortParam with ordering_expression and DagRun.id.desc() + return ordering_expression + + dag_runs_select_filter, _ = paginated_select( + statement=base_query.order_by(*get_dag_run_sort_param(), DagRun.id.desc()), Review Comment: Because manually handling the sorting will yield things like that. By default things are sorted `asc` but then you need to manually add the secondary sort on the primary key.
And we do not handle desc and asc sorting. Only forced `desc` ########## airflow/api_fastapi/core_api/routes/ui/grid.py: ########## @@ -0,0 +1,324 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import collections +import itertools +import operator +from functools import cache + +from fastapi import HTTPException, Request, status +from sqlalchemy import func, select +from sqlalchemy.sql.operators import ColumnOperators +from typing_extensions import Any + +from airflow import DAG +from airflow.api_fastapi.common.db.common import SessionDep, paginated_select +from airflow.api_fastapi.common.parameters import ( + OptionalDateTimeQuery, + QueryDagRunRunTypesFilter, + QueryDagRunStateFilter, + QueryLimit, + QueryOffset, + SortParam, +) +from airflow.api_fastapi.common.router import AirflowRouter +from airflow.api_fastapi.core_api.datamodels.ui.grid import ( + GridDAGRunwithTIs, + GridResponse, + GridTaskInstanceSummary, +) +from airflow.api_fastapi.core_api.openapi.exceptions import create_openapi_http_exception_doc +from airflow.configuration import conf +from airflow.exceptions import AirflowConfigException +from airflow.models import DagRun, MappedOperator, TaskInstance +from 
airflow.models.baseoperator import BaseOperator +from airflow.models.taskmap import TaskMap +from airflow.utils import timezone +from airflow.utils.state import TaskInstanceState +from airflow.utils.task_group import MappedTaskGroup, TaskGroup + +grid_router = AirflowRouter(prefix="/grid", tags=["Grid"]) + + +@grid_router.get( + "/{dag_id}", + include_in_schema=False, + responses=create_openapi_http_exception_doc([status.HTTP_400_BAD_REQUEST, status.HTTP_404_NOT_FOUND]), +) +def grid_data( + dag_id: str, + run_types: QueryDagRunRunTypesFilter, + run_states: QueryDagRunStateFilter, + session: SessionDep, + offset: QueryOffset, + request: Request, + num_runs: QueryLimit, + base_date: OptionalDateTimeQuery = None, + root: str | None = None, + filter_upstream: bool = False, + filter_downstream: bool = False, +) -> GridResponse: + """Return grid data.""" + ## Database calls to retrieve the DAG Runs and Task Instances and validate the data + dag: DAG = request.app.state.dag_bag.get_dag(dag_id) + if not dag: + raise HTTPException(status.HTTP_404_NOT_FOUND, f"Dag with id {dag_id} was not found") + + if root: + dag = dag.partial_subset( + task_ids_or_regex=root, include_upstream=filter_upstream, include_downstream=filter_downstream + ) + + current_time = timezone.utcnow() + # Retrieve, sort and encode the previous DAG Runs + base_query = ( + select( + DagRun.run_id, + DagRun.queued_at, + DagRun.start_date, + DagRun.end_date, + DagRun.state, + DagRun.run_type, + DagRun.data_interval_start, + DagRun.data_interval_end, + DagRun.dag_version_id.label("version_number"), + ) + .select_from(DagRun) + .where(DagRun.dag_id == dag.dag_id, DagRun.logical_date <= func.coalesce(base_date, current_time)) + .order_by(DagRun.id.desc()) + ) + + def get_dag_run_sort_param(): + """Get the Sort Param for the DAG Run.""" + + def _get_run_ordering_expr(name: str) -> ColumnOperators: + """Get the Run Ordering Expression.""" + expr = DagRun.__mapper__.columns[name] + # Data interval columns are 
NULL for runs created before 2.3, but SQL's + # NULL-sorting logic would make those old runs always appear first. In a + # perfect world we'd want to sort by ``get_run_data_interval()``, but that's + # not efficient, so instead the columns are coalesced into logical_date, + # which is good enough in most cases. + if name in ("data_interval_start", "data_interval_end"): + expr = func.coalesce(expr, DagRun.logical_date) + return expr.desc() + + ordering_expression = (_get_run_ordering_expr(name) for name in dag.timetable.run_ordering) + # create SortParam with ordering_expression and DagRun.id.desc() + return ordering_expression + + dag_runs_select_filter, _ = paginated_select( + statement=base_query.order_by(*get_dag_run_sort_param(), DagRun.id.desc()), + filters=[ + run_types, + run_states, + ], + order_by=None, + offset=offset, + limit=num_runs, + ) + + dag_runs = session.execute(dag_runs_select_filter) + + # Check if there are any DAG Runs with given criteria to eliminate unnecessary queries/errors + if not dag_runs: + return GridResponse(dag_runs=[]) + + # Retrieve, sort and encode the Task Instances + tis_of_dag_runs, _ = paginated_select( + statement=select( + TaskInstance.run_id, + TaskInstance.task_id, + TaskInstance.try_number, + TaskInstance.state, + TaskInstance.start_date, + TaskInstance.end_date, + TaskInstance.queued_dttm.label("queued_dttm"), + ) + .join(TaskInstance.task_instance_note, isouter=True) + .where(TaskInstance.dag_id == dag.dag_id), + filters=[], + order_by=SortParam(allowed_attrs=["task_id", "run_id"], model=TaskInstance).dynamic_depends( + "task_id" + )(), + offset=offset, + limit=None, + ) + + task_instances = session.execute(tis_of_dag_runs) + + @cache + def get_task_group_children_getter() -> operator.methodcaller: Review Comment: These utility functions shouldn't live in the endpoint definition, but outside, ideally in a `service/grid.py`. That's the first endpoint I believe that holds a lot of custom logic.
And an extra layer, re-usable, outside could make the code cleaner. ########## tests/api_fastapi/core_api/routes/ui/test_grid.py: ########## @@ -0,0 +1,414 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +from datetime import timedelta + +import pendulum +import pytest + +from airflow.decorators import task_group +from airflow.models import DagBag +from airflow.operators.empty import EmptyOperator +from airflow.utils import timezone +from airflow.utils.session import provide_session +from airflow.utils.state import DagRunState, TaskInstanceState +from airflow.utils.task_group import TaskGroup +from airflow.utils.types import DagRunType + +from tests_common.test_utils.compat import AIRFLOW_V_3_0_PLUS +from tests_common.test_utils.db import clear_db_assets, clear_db_dags, clear_db_runs, clear_db_serialized_dags +from tests_common.test_utils.mock_operators import MockOperator + +if AIRFLOW_V_3_0_PLUS: + from airflow.utils.types import DagRunTriggeredByType + +pytestmark = pytest.mark.db_test + +DAG_ID = "test_dag" +DAG_ID_2 = "test_dag_2" +TASK_ID = "task" +TASK_ID_2 = "task2" + + [email protected](autouse=True, scope="module") +def examples_dag_bag(): + # Speed up: We don't want example dags 
for this module + return DagBag(include_examples=False, read_dags_from_db=True) + + +@pytest.fixture(autouse=True) +@provide_session +def setup(dag_maker, session=None): + clear_db_runs() + clear_db_dags() + clear_db_serialized_dags() + + with dag_maker(dag_id=DAG_ID, serialized=True, session=session) as dag: + EmptyOperator(task_id=TASK_ID) + + @task_group + def mapped_task_group(arg1): + return MockOperator(task_id="subtask", arg1=arg1) + + mapped_task_group.expand(arg1=["a", "b", "c"]) + with TaskGroup(group_id="task_group"): + MockOperator.partial(task_id="mapped_task").expand(arg1=["a", "b", "c", "d"]) + + triggered_by_kwargs = {"triggered_by": DagRunTriggeredByType.TEST} if AIRFLOW_V_3_0_PLUS else {} + logical_date = timezone.datetime(2024, 11, 30) Review Comment: We should be able to remove the `AIRFLOW_V_3_0_PLUS`, this development is targeted for airflow 3 and will not be released for airflow 2.x ########## airflow/api_fastapi/core_api/routes/ui/grid.py: ########## @@ -0,0 +1,324 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+ +from __future__ import annotations + +import collections +import itertools +import operator +from functools import cache + +from fastapi import HTTPException, Request, status +from sqlalchemy import func, select +from sqlalchemy.sql.operators import ColumnOperators +from typing_extensions import Any + +from airflow import DAG +from airflow.api_fastapi.common.db.common import SessionDep, paginated_select +from airflow.api_fastapi.common.parameters import ( + OptionalDateTimeQuery, + QueryDagRunRunTypesFilter, + QueryDagRunStateFilter, + QueryLimit, + QueryOffset, + SortParam, +) +from airflow.api_fastapi.common.router import AirflowRouter +from airflow.api_fastapi.core_api.datamodels.ui.grid import ( + GridDAGRunwithTIs, + GridResponse, + GridTaskInstanceSummary, +) +from airflow.api_fastapi.core_api.openapi.exceptions import create_openapi_http_exception_doc +from airflow.configuration import conf +from airflow.exceptions import AirflowConfigException +from airflow.models import DagRun, MappedOperator, TaskInstance +from airflow.models.baseoperator import BaseOperator +from airflow.models.taskmap import TaskMap +from airflow.utils import timezone +from airflow.utils.state import TaskInstanceState +from airflow.utils.task_group import MappedTaskGroup, TaskGroup + +grid_router = AirflowRouter(prefix="/grid", tags=["Grid"]) + + +@grid_router.get( + "/{dag_id}", + include_in_schema=False, + responses=create_openapi_http_exception_doc([status.HTTP_400_BAD_REQUEST, status.HTTP_404_NOT_FOUND]), +) +def grid_data( + dag_id: str, + run_types: QueryDagRunRunTypesFilter, + run_states: QueryDagRunStateFilter, + session: SessionDep, + offset: QueryOffset, + request: Request, + num_runs: QueryLimit, + base_date: OptionalDateTimeQuery = None, + root: str | None = None, + filter_upstream: bool = False, + filter_downstream: bool = False, Review Comment: Also we could move those to a `ui.parameters.py` file. Because they are shared across multiple ui endpoints. (same for root).
And default value is the same. ########## airflow/api_fastapi/core_api/routes/ui/grid.py: ########## @@ -0,0 +1,324 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import collections +import itertools +import operator +from functools import cache + +from fastapi import HTTPException, Request, status +from sqlalchemy import func, select +from sqlalchemy.sql.operators import ColumnOperators +from typing_extensions import Any + +from airflow import DAG +from airflow.api_fastapi.common.db.common import SessionDep, paginated_select +from airflow.api_fastapi.common.parameters import ( + OptionalDateTimeQuery, + QueryDagRunRunTypesFilter, + QueryDagRunStateFilter, + QueryLimit, + QueryOffset, + SortParam, +) +from airflow.api_fastapi.common.router import AirflowRouter +from airflow.api_fastapi.core_api.datamodels.ui.grid import ( + GridDAGRunwithTIs, + GridResponse, + GridTaskInstanceSummary, +) +from airflow.api_fastapi.core_api.openapi.exceptions import create_openapi_http_exception_doc +from airflow.configuration import conf +from airflow.exceptions import AirflowConfigException +from airflow.models import DagRun, MappedOperator, TaskInstance +from airflow.models.baseoperator import 
BaseOperator +from airflow.models.taskmap import TaskMap +from airflow.utils import timezone +from airflow.utils.state import TaskInstanceState +from airflow.utils.task_group import MappedTaskGroup, TaskGroup + +grid_router = AirflowRouter(prefix="/grid", tags=["Grid"]) + + +@grid_router.get( + "/{dag_id}", + include_in_schema=False, + responses=create_openapi_http_exception_doc([status.HTTP_400_BAD_REQUEST, status.HTTP_404_NOT_FOUND]), +) +def grid_data( + dag_id: str, + run_types: QueryDagRunRunTypesFilter, + run_states: QueryDagRunStateFilter, + session: SessionDep, + offset: QueryOffset, + request: Request, + num_runs: QueryLimit, + base_date: OptionalDateTimeQuery = None, + root: str | None = None, + filter_upstream: bool = False, + filter_downstream: bool = False, +) -> GridResponse: + """Return grid data.""" + ## Database calls to retrieve the DAG Runs and Task Instances and validate the data + dag: DAG = request.app.state.dag_bag.get_dag(dag_id) + if not dag: + raise HTTPException(status.HTTP_404_NOT_FOUND, f"Dag with id {dag_id} was not found") + + if root: + dag = dag.partial_subset( + task_ids_or_regex=root, include_upstream=filter_upstream, include_downstream=filter_downstream + ) + + current_time = timezone.utcnow() + # Retrieve, sort and encode the previous DAG Runs + base_query = ( + select( + DagRun.run_id, + DagRun.queued_at, + DagRun.start_date, + DagRun.end_date, + DagRun.state, + DagRun.run_type, + DagRun.data_interval_start, + DagRun.data_interval_end, + DagRun.dag_version_id.label("version_number"), + ) + .select_from(DagRun) + .where(DagRun.dag_id == dag.dag_id, DagRun.logical_date <= func.coalesce(base_date, current_time)) + .order_by(DagRun.id.desc()) + ) + + def get_dag_run_sort_param(): + """Get the Sort Param for the DAG Run.""" + + def _get_run_ordering_expr(name: str) -> ColumnOperators: + """Get the Run Ordering Expression.""" + expr = DagRun.__mapper__.columns[name] + # Data interval columns are NULL for runs created before 2.3, but 
SQL's + # NULL-sorting logic would make those old runs always appear first. In a + # perfect world we'd want to sort by ``get_run_data_interval()``, but that's + # not efficient, so instead the columns are coalesced into logical_date, + # which is good enough in most cases. + if name in ("data_interval_start", "data_interval_end"): + expr = func.coalesce(expr, DagRun.logical_date) + return expr.desc() + + ordering_expression = (_get_run_ordering_expr(name) for name in dag.timetable.run_ordering) + # create SortParam with ordering_expression and DagRun.id.desc() + return ordering_expression + + dag_runs_select_filter, _ = paginated_select( + statement=base_query.order_by(*get_dag_run_sort_param(), DagRun.id.desc()), + filters=[ + run_types, + run_states, + ], + order_by=None, + offset=offset, + limit=num_runs, + ) + + dag_runs = session.execute(dag_runs_select_filter) + + # Check if there are any DAG Runs with given criteria to eliminate unnecessary queries/errors + if not dag_runs: + return GridResponse(dag_runs=[]) + + # Retrieve, sort and encode the Task Instances + tis_of_dag_runs, _ = paginated_select( + statement=select( + TaskInstance.run_id, + TaskInstance.task_id, + TaskInstance.try_number, + TaskInstance.state, + TaskInstance.start_date, + TaskInstance.end_date, + TaskInstance.queued_dttm.label("queued_dttm"), + ) + .join(TaskInstance.task_instance_note, isouter=True) + .where(TaskInstance.dag_id == dag.dag_id), + filters=[], + order_by=SortParam(allowed_attrs=["task_id", "run_id"], model=TaskInstance).dynamic_depends( + "task_id" + )(), Review Comment: `dynamic_depends` is only useful for the FastAPI dependency injection system. When we manually want an instance of it, you can just: ```python SortParam(allowed_attrs=["task_id", "run_id"], model=TaskInstance).set_value('your_value'). ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
