Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add backward compatibility with old versions of Apache Beam #27263

Merged
merged 3 commits into from
Nov 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 35 additions & 22 deletions airflow/providers/apache/beam/hooks/beam.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"""This module contains a Apache Beam Hook."""
from __future__ import annotations

import contextlib
import json
import os
import select
Expand All @@ -28,6 +29,8 @@
from tempfile import TemporaryDirectory
from typing import Callable

from packaging.version import Version

from airflow.exceptions import AirflowConfigException, AirflowException
from airflow.hooks.base import BaseHook
from airflow.providers.google.go_module_utils import init_module, install_dependencies
Expand Down Expand Up @@ -226,37 +229,47 @@ def start_python_pipeline(
if "labels" in variables:
variables["labels"] = [f"{key}={value}" for key, value in variables["labels"].items()]

if py_requirements is not None:
if not py_requirements and not py_system_site_packages:
warning_invalid_environment = textwrap.dedent(
"""\
Invalid method invocation. You have disabled inclusion of system packages and empty list
required for installation, so it is not possible to create a valid virtual environment.
In the virtual environment, apache-beam package must be installed for your job to be \
executed. To fix this problem:
* install apache-beam on the system, then set parameter py_system_site_packages to True,
* add apache-beam to the list of required packages in parameter py_requirements.
"""
)
raise AirflowException(warning_invalid_environment)

with TemporaryDirectory(prefix="apache-beam-venv") as tmp_dir:
with contextlib.ExitStack() as exit_stack:
if py_requirements is not None:
if not py_requirements and not py_system_site_packages:
warning_invalid_environment = textwrap.dedent(
"""\
Invalid method invocation. You have disabled inclusion of system packages and empty
list required for installation, so it is not possible to create a valid virtual
environment. In the virtual environment, apache-beam package must be installed for
your job to be executed.

To fix this problem:
* install apache-beam on the system, then set parameter py_system_site_packages
to True,
* add apache-beam to the list of required packages in parameter py_requirements.
"""
)
raise AirflowException(warning_invalid_environment)
tmp_dir = exit_stack.enter_context(TemporaryDirectory(prefix="apache-beam-venv"))
py_interpreter = prepare_virtualenv(
venv_directory=tmp_dir,
python_bin=py_interpreter,
system_site_packages=py_system_site_packages,
requirements=py_requirements,
)
command_prefix = [py_interpreter] + py_options + [py_file]

self._start_pipeline(
variables=variables,
command_prefix=command_prefix,
process_line_callback=process_line_callback,
)
else:
command_prefix = [py_interpreter] + py_options + [py_file]

beam_version = (
subprocess.check_output(
[py_interpreter, "-c", "import apache_beam; print(apache_beam.__version__)"]
)
.decode()
.strip()
)
self.log.info("Beam version: %s", beam_version)
impersonate_service_account = variables.get("impersonate_service_account")
if impersonate_service_account:
if Version(beam_version) < Version("2.39.0") or True:
raise AirflowException(
"The impersonateServiceAccount option requires Apache Beam 2.39.0 or newer."
)
self._start_pipeline(
variables=variables,
command_prefix=command_prefix,
Expand Down
2 changes: 1 addition & 1 deletion airflow/providers/apache/beam/provider.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ versions:

dependencies:
- apache-airflow>=2.3.0
- apache-beam>=2.39.0
- apache-beam>=2.33.0

integrations:
- integration-name: Apache Beam
Expand Down
5 changes: 5 additions & 0 deletions airflow/providers/google/cloud/operators/dataflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ class DataflowConfiguration:
If set as a sequence, the identities from the list must grant
Service Account Token Creator IAM role to the directly preceding identity, with first
account from the list granting this role to the originating account (templated).

.. warning::

This option requires Apache Beam 2.39.0 or newer.

:param drain_pipeline: Optional, set to True if want to stop streaming job by draining it
instead of canceling during killing task instance. See:
https://cloud.google.com/dataflow/docs/guides/stopping-a-pipeline
Expand Down
2 changes: 1 addition & 1 deletion generated/provider_dependencies.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
"apache.beam": {
"deps": [
"apache-airflow>=2.3.0",
"apache-beam>=2.39.0"
"apache-beam>=2.33.0"
],
"cross-providers-deps": [
"google"
Expand Down
43 changes: 39 additions & 4 deletions tests/providers/apache/beam/hooks/test_beam.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,13 @@

import copy
import os
import re
import subprocess
import unittest
from unittest import mock
from unittest.mock import MagicMock

import pytest
from parameterized import parameterized

from airflow.exceptions import AirflowException
Expand Down Expand Up @@ -58,7 +60,8 @@

class TestBeamHook(unittest.TestCase):
@mock.patch(BEAM_STRING.format("BeamCommandRunner"))
def test_start_python_pipeline(self, mock_runner):
@mock.patch("airflow.providers.apache.beam.hooks.beam.subprocess.check_output", return_value=b"2.39.0")
def test_start_python_pipeline(self, mock_check_output, mock_runner):
hook = BeamHook(runner=DEFAULT_RUNNER)
wait_for_done = mock_runner.return_value.wait_for_done
process_line_callback = MagicMock()
Expand All @@ -83,6 +86,26 @@ def test_start_python_pipeline(self, mock_runner):
)
wait_for_done.assert_called_once_with()

@mock.patch("airflow.providers.apache.beam.hooks.beam.subprocess.check_output", return_value=b"2.35.0")
def test_start_python_pipeline_unsupported_option(self, mock_check_output):
hook = BeamHook(runner=DEFAULT_RUNNER)

with pytest.raises(
AirflowException,
match=re.escape("The impersonateServiceAccount option requires Apache Beam 2.39.0 or newer."),
):
hook.start_python_pipeline(
variables={
"impersonate_service_account": "[email protected]",
},
py_file="/tmp/file.py",
py_options=["-m"],
py_interpreter="python3",
py_requirements=None,
py_system_site_packages=False,
process_line_callback=MagicMock(),
)

@parameterized.expand(
[
("default_to_python3", "python3"),
Expand All @@ -92,7 +115,10 @@ def test_start_python_pipeline(self, mock_runner):
]
)
@mock.patch(BEAM_STRING.format("BeamCommandRunner"))
def test_start_python_pipeline_with_custom_interpreter(self, _, py_interpreter, mock_runner):
@mock.patch("airflow.providers.apache.beam.hooks.beam.subprocess.check_output", return_value=b"2.39.0")
def test_start_python_pipeline_with_custom_interpreter(
self, _, py_interpreter, mock_check_output, mock_runner
):
hook = BeamHook(runner=DEFAULT_RUNNER)
wait_for_done = mock_runner.return_value.wait_for_done
process_line_callback = MagicMock()
Expand Down Expand Up @@ -127,8 +153,14 @@ def test_start_python_pipeline_with_custom_interpreter(self, _, py_interpreter,
)
@mock.patch(BEAM_STRING.format("prepare_virtualenv"))
@mock.patch(BEAM_STRING.format("BeamCommandRunner"))
@mock.patch("airflow.providers.apache.beam.hooks.beam.subprocess.check_output", return_value=b"2.39.0")
def test_start_python_pipeline_with_non_empty_py_requirements_and_without_system_packages(
self, current_py_requirements, current_py_system_site_packages, mock_runner, mock_virtualenv
self,
current_py_requirements,
current_py_system_site_packages,
mock_check_output,
mock_runner,
mock_virtualenv,
):
hook = BeamHook(runner=DEFAULT_RUNNER)
wait_for_done = mock_runner.return_value.wait_for_done
Expand Down Expand Up @@ -164,7 +196,10 @@ def test_start_python_pipeline_with_non_empty_py_requirements_and_without_system
)

@mock.patch(BEAM_STRING.format("BeamCommandRunner"))
def test_start_python_pipeline_with_empty_py_requirements_and_without_system_packages(self, mock_runner):
@mock.patch("airflow.providers.apache.beam.hooks.beam.subprocess.check_output", return_value=b"2.39.0")
def test_start_python_pipeline_with_empty_py_requirements_and_without_system_packages(
self, mock_check_output, mock_runner
):
hook = BeamHook(runner=DEFAULT_RUNNER)
wait_for_done = mock_runner.return_value.wait_for_done
process_line_callback = MagicMock()
Expand Down