Copilot commented on code in PR #64770:
URL: https://github.com/apache/airflow/pull/64770#discussion_r3067064733
##########
providers/amazon/src/airflow/providers/amazon/aws/triggers/emr.py:
##########
@@ -190,10 +197,134 @@ def __init__(
waiter_max_attempts=waiter_max_attempts,
aws_conn_id=aws_conn_id,
)
+ self.virtual_cluster_id = virtual_cluster_id
+ self.job_id = job_id
+ self.cancel_on_kill = cancel_on_kill
def hook(self) -> AwsGenericHook:
return EmrContainerHook(aws_conn_id=self.aws_conn_id)
+ if not AIRFLOW_V_3_0_PLUS:
+
+ @provide_session
+ def get_task_instance(self, session: Session) -> TaskInstance:
+ """Get the task instance for the current trigger (Airflow 2.x
compatibility)."""
+ from sqlalchemy import select
+
+ query = select(TaskInstance).where(
+ TaskInstance.dag_id == self.task_instance.dag_id,
+ TaskInstance.task_id == self.task_instance.task_id,
+ TaskInstance.run_id == self.task_instance.run_id,
+ TaskInstance.map_index == self.task_instance.map_index,
+ )
+ task_instance = session.scalars(query).one_or_none()
+ if task_instance is None:
+ raise ValueError(
+ f"TaskInstance with dag_id: {self.task_instance.dag_id}, "
+ f"task_id: {self.task_instance.task_id}, "
+ f"run_id: {self.task_instance.run_id} and "
+ f"map_index: {self.task_instance.map_index} is not found"
+ )
+ return task_instance
+
+ async def get_task_state(self):
+ """Get the current state of the task instance (Airflow 3.x)."""
+ from airflow.sdk.execution_time.task_runner import RuntimeTaskInstance
+
+ task_states_response = await
sync_to_async(RuntimeTaskInstance.get_task_states)(
+ dag_id=self.task_instance.dag_id,
+ task_ids=[self.task_instance.task_id],
+ run_ids=[self.task_instance.run_id],
+ map_index=self.task_instance.map_index,
+ )
+ try:
+ task_state =
task_states_response[self.task_instance.run_id][self.task_instance.task_id]
+ except (KeyError, TypeError) as e:
+ raise ValueError(
+ f"TaskInstance with dag_id: {self.task_instance.dag_id}, "
+ f"task_id: {self.task_instance.task_id}, "
+ f"run_id: {self.task_instance.run_id} and "
+ f"map_index: {self.task_instance.map_index} is not found"
+ ) from e
+ return task_state
+
+ async def safe_to_cancel(self) -> bool:
+ """
+ Whether it is safe to cancel the EMR container job.
+
+ Returns True if task is NOT DEFERRED (user-initiated cancellation).
+ Returns False if task is DEFERRED (triggerer restart - don't cancel
job).
+ """
+ if AIRFLOW_V_3_0_PLUS:
+ task_state = await self.get_task_state()
+ else:
+ task_instance = await sync_to_async(self.get_task_instance)() #
type: ignore[call-arg]
+ task_state = task_instance.state
+ return task_state != TaskInstanceState.DEFERRED
+
+ async def run(self) -> AsyncIterator[TriggerEvent]:
+ """
+ Run the trigger and wait for the job to complete.
+
+ If the task is cancelled while waiting, attempt to cancel the EMR
container job
+ if cancel_on_kill is enabled and it's safe to do so.
+ """
+ hook: EmrContainerHook = self.hook() # type: ignore[assignment]
+ try:
+ async with await hook.get_async_conn() as client:
+ waiter = hook.get_waiter(
+ self.waiter_name,
Review Comment:
`EmrContainerHook.stop_query()` requires `virtual_cluster_id` (it calls
`cancel_job_run(virtualClusterId=self.virtual_cluster_id, ...)`), but this
trigger builds the hook via `self.hook()` which currently passes only
`aws_conn_id`. As a result, the new cancel-on-kill path will call the EMR
Containers API with `virtualClusterId=None` and fail to cancel the job. Update
`hook()` (or the instantiation here) to pass
`virtual_cluster_id=self.virtual_cluster_id` so cancellation works.
##########
providers/amazon/src/airflow/providers/amazon/aws/operators/emr.py:
##########
@@ -604,10 +610,18 @@ def check_failure(self, query_status):
def execute_complete(self, context: Context, event: dict[str, Any] | None
= None) -> str:
validated_event = validate_execute_complete_event(event)
- if validated_event["status"] != "success":
- raise AirflowException(f"Error while running job:
{validated_event}")
-
- return validated_event["job_id"]
+ if validated_event["status"] == "success":
+ return validated_event["job_id"]
+ if self.job_id:
+ self.log.info("Cancelling EMR container job %s", self.job_id)
+ try:
+ self.hook.stop_query(self.job_id)
+ except Exception:
+ self.log.exception(
+ "Failed to cancel EMR container job %s. The job may still
be running.",
+ self.job_id,
Review Comment:
In deferrable mode the operator instance is reconstructed on resume, so
runtime fields like `self.job_id` may be `None` in `execute_complete`. This new
cancellation logic uses `self.job_id`, which can skip cancellation even though
the trigger event contains the real `job_id`. Use `validated_event["job_id"]`
(and/or persist `job_id` explicitly) when cancelling so deferral
timeout/failure reliably cancels the correct EMR job.
```suggestion
job_id = validated_event.get("job_id") or self.job_id
if job_id:
self.log.info("Cancelling EMR container job %s", job_id)
try:
self.hook.stop_query(job_id)
except Exception:
self.log.exception(
"Failed to cancel EMR container job %s. The job may
still be running.",
job_id,
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]