apache · potiuk · Sep 2, 2024 · Jul 29, 2024 · Aug 2, 2024
diff --git a/airflow/providers/google/cloud/hooks/dataflow.py b/airflow/providers/google/cloud/hooks/dataflow.py
@@ -186,9 +186,9 @@ class DataflowJobType:
 
 class _DataflowJobsController(LoggingMixin):
     """
-    Interface for communication with Google API.
+    Interface for communication with Google Cloud Dataflow API.
 
-    It's not use Apache Beam, but only Google Dataflow API.
+    Does not use Apache Beam API.
 
     :param dataflow: Discovery resource
     :param project_number: The Google Cloud Project ID.
@@ -271,12 +271,12 @@ def _get_current_jobs(self) -> list[dict]:
         else:
             raise ValueError("Missing both dataflow job ID and name.")
 
-    def fetch_job_by_id(self, job_id: str) -> dict:
+    def fetch_job_by_id(self, job_id: str) -> dict[str, str]:
         """
         Fetch the job with the specified Job ID.
 
-        :param job_id: Job ID to get.
-        :return: the Job
+        :param job_id: ID of the job that needs to be fetched.
+        :return: Dictionary containing the Job's data
         """
         return (
             self._dataflow.projects()
@@ -444,7 +444,6 @@ def _check_dataflow_job_state(self, job) -> bool:
                 "Google Cloud Dataflow job's expected terminal state cannot be "
                 "JOB_STATE_DRAINED while it is a batch job"
             )
-
         if current_state == current_expected_state:
             if current_expected_state == DataflowJobStatus.JOB_STATE_RUNNING:
                 return not self._wait_until_finished
@@ -938,6 +937,90 @@ def launch_job_with_flex_template(
         response: dict = request.execute(num_retries=self.num_retries)
         return response["job"]
 
+    @GoogleBaseHook.fallback_to_default_project_id
+    def launch_beam_yaml_job(
+        self,
+        *,
+        job_name: str,
+        yaml_pipeline_file: str,
+        append_job_name: bool,
+        jinja_variables: dict[str, str] | None,
+        options: dict[str, Any] | None,
+        project_id: str,
+        location: str = DEFAULT_DATAFLOW_LOCATION,
+    ) -> str:
+        """
+        Launch a Dataflow YAML job via ``gcloud`` and return the ID of the created job.
+
+        :param job_name: The unique name to assign to the Cloud Dataflow job.
+        :param yaml_pipeline_file: Path to a file defining the YAML pipeline to run.
+            Must be a local file or a URL beginning with 'gs://'.
+        :param append_job_name: Set to True if a unique suffix has to be appended to the `job_name`.
+        :param jinja_variables: A dictionary of Jinja2 variables to be used in reifying the yaml pipeline file.
+        :param options: Additional gcloud or Beam job parameters.
+            It must be a dictionary with the keys matching the optional flag names in gcloud.
+            The list of supported flags can be found at: `https://cloud.google.com/sdk/gcloud/reference/dataflow/yaml/run`.
+            Note that if a flag does not require a value, then its dictionary value must be either True or None.
+            For example, the `--log-http` flag can be passed as {'log-http': True}.
+        :param project_id: The ID of the GCP project that owns the job.
+        :param location: Region ID of the job's regional endpoint.
+            Defaults to 'us-central1'.
+        :return: Job ID.
+        """
+        gcp_flags = {
+            "yaml-pipeline-file": yaml_pipeline_file,
+            "project": project_id,
+            "format": "value(job.id)",
+            "region": location,
+        }
+
+        if jinja_variables:
+            gcp_flags["jinja-variables"] = json.dumps(jinja_variables)
+
+        if options:
+            gcp_flags.update(options)
+
+        job_name = self.build_dataflow_job_name(job_name, append_job_name)
+        cmd = self._build_gcloud_command(
+            command=["gcloud", "dataflow", "yaml", "run", job_name], parameters=gcp_flags
+        )
+        job_id = self._create_dataflow_job_with_gcloud(cmd=cmd)
+        return job_id
+
+    def _build_gcloud_command(self, command: list[str], parameters: dict[str, str]) -> list[str]:
+        _parameters = deepcopy(parameters)
+        if self.impersonation_chain:
+            if isinstance(self.impersonation_chain, str):
+                impersonation_account = self.impersonation_chain
+            elif len(self.impersonation_chain) == 1:
+                impersonation_account = self.impersonation_chain[0]
+            else:
+                raise AirflowException(
+                    "Chained list of accounts is not supported, please specify only one service account."
+                )
+            _parameters["impersonate-service-account"] = impersonation_account
+        return [*command, *(beam_options_to_args(_parameters))]
+
+    def _create_dataflow_job_with_gcloud(self, cmd: list[str]) -> str:
+        """Create a Dataflow job with a gcloud command and return the job's ID."""
+        self.log.info("Executing command: %s", " ".join(shlex.quote(c) for c in cmd))
+        success_code = 0
+
+        with self.provide_authorized_gcloud():
+            proc = subprocess.run(cmd, capture_output=True)
+
+        if proc.returncode != success_code:
+            stderr_last_20_lines = "\n".join(proc.stderr.decode().strip().splitlines()[-20:])
+            raise AirflowException(
+                f"Process exit with non-zero exit code. Exit code: {proc.returncode}. Error Details : "
+                f"{stderr_last_20_lines}"
+            )
+
+        job_id = proc.stdout.decode().strip()
+        self.log.info("Created job's ID: %s", job_id)
+
+        return job_id
+
     @staticmethod
     def extract_job_id(job: dict) -> str:
         try:
@@ -1139,33 +1222,15 @@ def start_sql_job(
         :param on_new_job_callback: Callback called when the job is known.
         :return: the new job object
         """
-        gcp_options = [
-            f"--project={project_id}",
-            "--format=value(job.id)",
-            f"--job-name={job_name}",
-            f"--region={location}",
-        ]
-
-        if self.impersonation_chain:
-            if isinstance(self.impersonation_chain, str):
-                impersonation_account = self.impersonation_chain
-            elif len(self.impersonation_chain) == 1:
-                impersonation_account = self.impersonation_chain[0]
-            else:
-                raise AirflowException(
-                    "Chained list of accounts is not supported, please specify only one service account"
-                )
-            gcp_options.append(f"--impersonate-service-account={impersonation_account}")
-
-        cmd = [
-            "gcloud",
-            "dataflow",
-            "sql",
-            "query",
-            query,
-            *gcp_options,
-            *(beam_options_to_args(options)),
-        ]
+        gcp_options = {
+            "project": project_id,
+            "format": "value(job.id)",
+            "job-name": job_name,
+            "region": location,
+        }
+        cmd = self._build_gcloud_command(
+            command=["gcloud", "dataflow", "sql", "query", query], parameters={**gcp_options, **options}
+        )
         self.log.info("Executing command: %s", " ".join(shlex.quote(c) for c in cmd))
         with self.provide_authorized_gcloud():
             proc = subprocess.run(cmd, capture_output=True)

diff --git a/airflow/providers/google/cloud/operators/dataflow.py b/airflow/providers/google/cloud/operators/dataflow.py
@@ -40,7 +40,10 @@
 from airflow.providers.google.cloud.hooks.gcs import GCSHook
 from airflow.providers.google.cloud.links.dataflow import DataflowJobLink, DataflowPipelineLink
 from airflow.providers.google.cloud.operators.cloud_base import GoogleCloudBaseOperator
-from airflow.providers.google.cloud.triggers.dataflow import TemplateJobStartTrigger
+from airflow.providers.google.cloud.triggers.dataflow import (
+    DataflowStartYamlJobTrigger,
+    TemplateJobStartTrigger,
+)
 from airflow.providers.google.common.consts import GOOGLE_DEFAULT_DEFERRABLE_METHOD_NAME
 from airflow.providers.google.common.deprecated import deprecated
 from airflow.providers.google.common.hooks.base_google import PROVIDE_PROJECT_ID
@@ -946,6 +949,11 @@ def on_kill(self) -> None:
             )
 
 
+@deprecated(
+    planned_removal_date="January 31, 2025",
+    use_instead="DataflowStartYamlJobOperator",
+    category=AirflowProviderDeprecationWarning,
+)
 class DataflowStartSqlJobOperator(GoogleCloudBaseOperator):
     """
     Starts Dataflow SQL query.
@@ -1051,6 +1059,178 @@ def on_kill(self) -> None:
             )
 
 
+class DataflowStartYamlJobOperator(GoogleCloudBaseOperator):
+    """
+    Launch a Dataflow YAML job and return the result.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:DataflowStartYamlJobOperator`
+
+    .. warning::
+        This operator requires the ``gcloud`` command (Google Cloud SDK) to be installed on the Airflow worker:
+        `Google Cloud SDK installation guide <https://cloud.google.com/sdk/docs/install>`__
+
+    :param job_name: Required. The unique name to assign to the Cloud Dataflow job.
+    :param yaml_pipeline_file: Required. Path to a file defining the YAML pipeline to run.
+        Must be a local file or a URL beginning with 'gs://'.
+    :param region: Optional. Region ID of the job's regional endpoint. Defaults to 'us-central1'.
+    :param project_id: Optional. The ID of the GCP project that owns the job.
+        If set to ``None`` or missing, the default project_id from the GCP connection is used.
+    :param gcp_conn_id: Optional. The connection ID used to connect to GCP.
+    :param append_job_name: Optional. Set to True if a unique suffix has to be appended to the `job_name`.
+        Defaults to True.
+    :param drain_pipeline: Optional. Set to True if you want to stop a streaming pipeline job by draining it
+        instead of canceling when killing the task instance. Note that this does not work for batch pipeline jobs
+        or in the deferrable mode. Defaults to False.
+        For more info see: https://cloud.google.com/dataflow/docs/guides/stopping-a-pipeline
+    :param deferrable: Optional. Run operator in the deferrable mode.
+    :param expected_terminal_state: Optional. The expected terminal state of the Dataflow job at which the
+        operator task is set to succeed. Defaults to 'JOB_STATE_DONE' for the batch jobs and 'JOB_STATE_RUNNING'
+        for the streaming jobs.
+    :param poll_sleep: Optional. The time in seconds to sleep between polling Google Cloud Platform for the Dataflow job status.
+        Used both for the sync and deferrable mode.
+    :param cancel_timeout: Optional. How long (in seconds) operator should wait for the pipeline to be
+        successfully canceled when the task is being killed.
+    :param jinja_variables: Optional. A dictionary of Jinja2 variables to be used in reifying the yaml pipeline file.
+    :param options: Optional. Additional gcloud or Beam job parameters.
+        It must be a dictionary with the keys matching the optional flag names in gcloud.
+        The list of supported flags can be found at: `https://cloud.google.com/sdk/gcloud/reference/dataflow/yaml/run`.
+        Note that if a flag does not require a value, then its dictionary value must be either True or None.
+        For example, the `--log-http` flag can be passed as {'log-http': True}.
+    :param impersonation_chain: Optional service account to impersonate using short-term
+        credentials, or chained list of accounts required to get the access_token
+        of the last account in the list, which will be impersonated in the request.
+        If set as a string, the account must grant the originating account
+        the Service Account Token Creator IAM role.
+        If set as a sequence, the identities from the list must grant
+        Service Account Token Creator IAM role to the directly preceding identity, with first
+        account from the list granting this role to the originating account (templated).
+    :return: Dictionary containing the job's data.
+    """
+
+    template_fields: Sequence[str] = (
+        "job_name",
+        "yaml_pipeline_file",
+        "jinja_variables",
+        "options",
+        "region",
+        "project_id",
+        "gcp_conn_id",
+    )
+    template_fields_renderers = {
+        "jinja_variables": "json",
+    }
+    operator_extra_links = (DataflowJobLink(),)
+
+    def __init__(
+        self,
+        *,
+        job_name: str,
+        yaml_pipeline_file: str,
+        region: str = DEFAULT_DATAFLOW_LOCATION,
+        project_id: str = PROVIDE_PROJECT_ID,
+        gcp_conn_id: str = "google_cloud_default",
+        append_job_name: bool = True,
+        drain_pipeline: bool = False,
+        deferrable: bool = conf.getboolean("operators", "default_deferrable", fallback=False),
+        poll_sleep: int = 10,
+        cancel_timeout: int | None = 5 * 60,
+        expected_terminal_state: str | None = None,
+        jinja_variables: dict[str, str] | None = None,
+        options: dict[str, Any] | None = None,
+        impersonation_chain: str | Sequence[str] | None = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.job_name = job_name
+        self.yaml_pipeline_file = yaml_pipeline_file
+        self.region = region
+        self.project_id = project_id
+        self.gcp_conn_id = gcp_conn_id
+        self.append_job_name = append_job_name
+        self.drain_pipeline = drain_pipeline
+        self.deferrable = deferrable
+        self.poll_sleep = poll_sleep
+        self.cancel_timeout = cancel_timeout
+        self.expected_terminal_state = expected_terminal_state
+        self.options = options
+        self.jinja_variables = jinja_variables
+        self.impersonation_chain = impersonation_chain
+        self.job_id: str | None = None
+
+    def execute(self, context: Context) -> dict[str, Any]:
+        self.job_id = self.hook.launch_beam_yaml_job(
+            job_name=self.job_name,
+            yaml_pipeline_file=self.yaml_pipeline_file,
+            append_job_name=self.append_job_name,
+            options=self.options,
+            jinja_variables=self.jinja_variables,
+            project_id=self.project_id,
+            location=self.region,
+        )
+
+        DataflowJobLink.persist(self, context, self.project_id, self.region, self.job_id)
+
+        if self.deferrable:
+            self.defer(
+                trigger=DataflowStartYamlJobTrigger(
+                    job_id=self.job_id,
+                    project_id=self.project_id,
+                    location=self.region,
+                    gcp_conn_id=self.gcp_conn_id,
+                    poll_sleep=self.poll_sleep,
+                    cancel_timeout=self.cancel_timeout,
+                    expected_terminal_state=self.expected_terminal_state,
+                    impersonation_chain=self.impersonation_chain,
+                ),
+                method_name=GOOGLE_DEFAULT_DEFERRABLE_METHOD_NAME,
+            )
+
+        self.hook.wait_for_done(
+            job_name=self.job_name, location=self.region, project_id=self.project_id, job_id=self.job_id
+        )
+        job = self.hook.get_job(job_id=self.job_id, location=self.region, project_id=self.project_id)
+        return job
+
+    def execute_complete(self, context: Context, event: dict) -> dict[str, Any]:
+        """Execute after the trigger returns an event."""
+        if event["status"] in ("error", "stopped"):
+            self.log.info("status: %s, msg: %s", event["status"], event["message"])
+            raise AirflowException(event["message"])
+        job = event["job"]
+        self.log.info("Job %s completed with response %s", job["id"], event["message"])
+        self.xcom_push(context, key="job_id", value=job["id"])
+
+        return job
+
+    def on_kill(self):
+        """
+        Cancel the dataflow job if a task instance gets killed.
+
+        This method will not be called if a task instance is killed in a deferred
+        state.
+        """
+        self.log.info("On kill called.")
+        if self.job_id:
+            self.hook.cancel_job(
+                job_id=self.job_id,
+                project_id=self.project_id,
+                location=self.region,
+            )
+
+    @cached_property
+    def hook(self) -> DataflowHook:
+        return DataflowHook(
+            gcp_conn_id=self.gcp_conn_id,
+            poll_sleep=self.poll_sleep,
+            impersonation_chain=self.impersonation_chain,
+            drain_pipeline=self.drain_pipeline,
+            cancel_timeout=self.cancel_timeout,
+            expected_terminal_state=self.expected_terminal_state,
+        )
+
+
 # TODO: Remove one day
 @deprecated(
     planned_removal_date="November 01, 2024",