Skip to content

Commit

Permalink
Add automated retries on retryable condition for building images in CI (
Browse files Browse the repository at this point in the history
#24006)

Pushing cache images to ghcr.io is flaky, so we add automated retries
for image builds that fail intermittently on a retryable condition.

The root cause of the problem is tracked in containerd:
containerd/containerd#5978

(cherry picked from commit 7cf0e43)
  • Loading branch information
potiuk committed May 29, 2022
1 parent 0b627db commit 5a9699a
Show file tree
Hide file tree
Showing 9 changed files with 283 additions and 175 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1688,6 +1688,7 @@ ${{ hashFiles('.pre-commit-config.yaml') }}"
run: >
breeze build-image
--prepare-buildx-cache
--max-retries 3
--platform linux/amd64,linux/arm64
env:
PYTHON_MAJOR_MINOR_VERSION: ${{ matrix.python-version }}
Expand Down Expand Up @@ -1721,6 +1722,7 @@ ${{ hashFiles('.pre-commit-config.yaml') }}"
--install-packages-from-context
--prepare-buildx-cache
--disable-airflow-repo-cache
--max-retries 3
--platform linux/amd64,linux/arm64
env:
PYTHON_MAJOR_MINOR_VERSION: ${{ matrix.python-version }}
Expand Down
40 changes: 33 additions & 7 deletions dev/breeze/src/airflow_breeze/commands/ci_image_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import os
import sys
from pathlib import Path
from subprocess import CompletedProcess
from typing import List, Optional, Tuple, Union

import click
Expand Down Expand Up @@ -51,6 +52,7 @@
option_image_name,
option_image_tag,
option_install_providers_from_sources,
option_max_retries,
option_parallelism,
option_platform,
option_prepare_buildx_cache,
Expand Down Expand Up @@ -88,6 +90,7 @@
instruct_build_image,
is_repo_rebased,
run_command,
run_result_contains,
)

CI_IMAGE_TOOLS_COMMANDS = {
Expand All @@ -111,6 +114,7 @@
"--tag-as-latest",
"--docker-cache",
"--force-build",
"--max-retries",
],
},
{
Expand Down Expand Up @@ -203,6 +207,7 @@
@option_docker_cache
@option_image_tag
@option_prepare_buildx_cache
@option_max_retries
@option_push_image
@option_empty_image
@option_install_providers_from_sources
Expand Down Expand Up @@ -480,15 +485,34 @@ def build_ci_image(verbose: bool, dry_run: bool, ci_image_params: BuildCiParams)
)
else:
get_console().print(f"\n[info]Building CI Image for Python {ci_image_params.python}\n")
build_command_result = run_command(
cmd, verbose=verbose, dry_run=dry_run, cwd=AIRFLOW_SOURCES_ROOT, text=True, check=False
)
if build_command_result.returncode == 0:
if ci_image_params.prepare_buildx_cache:
num_tries = 1 if ci_image_params.max_retries is None else ci_image_params.max_retries
build_command_result = CompletedProcess(args=[], returncode=1, stdout="This should never happen.")
while num_tries > 0:
build_command_result = run_command(
cmd,
verbose=verbose,
dry_run=dry_run,
cwd=AIRFLOW_SOURCES_ROOT,
check=False,
text=True,
capture_output=True,
)
if ci_image_params.prepare_buildx_cache and build_command_result.returncode == 0:
build_command_result = build_cache(
image_params=ci_image_params, dry_run=dry_run, verbose=verbose
)

if build_command_result.returncode == 0:
break
num_tries -= 1
if run_result_contains(build_command_result, "cannot reuse body, request must be retried"):
if num_tries > 0:
get_console().print(
"[info]Retrying failed command on retryable condition. "
f"There are {num_tries} left[/]"
)
continue
else:
break
if not ci_image_params.prepare_buildx_cache:
if not dry_run:
if build_command_result.returncode == 0:
Expand All @@ -507,7 +531,9 @@ def build_ci_image(verbose: bool, dry_run: bool, ci_image_params: BuildCiParams)
f"Image build: {ci_image_params.python}",
)
else:
get_console().print("[info]Not updating build cache because we are in `dry_run` mode.[/]")
get_console().print(
"[info]Not tagging/marking image as refreshed because we are in `dry_run` mode.[/]"
)
return build_command_result.returncode, f"Image build: {ci_image_params.python}"


Expand Down
53 changes: 41 additions & 12 deletions dev/breeze/src/airflow_breeze/commands/production_image_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import contextlib
import os
import sys
from subprocess import CompletedProcess
from typing import Optional, Tuple

import click
Expand Down Expand Up @@ -49,6 +50,7 @@
option_image_name,
option_image_tag,
option_install_providers_from_sources,
option_max_retries,
option_parallelism,
option_platform,
option_prepare_buildx_cache,
Expand Down Expand Up @@ -77,7 +79,12 @@
from airflow_breeze.utils.python_versions import get_python_version_list
from airflow_breeze.utils.registry import login_to_github_docker_registry
from airflow_breeze.utils.run_tests import verify_an_image
from airflow_breeze.utils.run_utils import filter_out_none, fix_group_permissions, run_command
from airflow_breeze.utils.run_utils import (
filter_out_none,
fix_group_permissions,
run_command,
run_result_contains,
)

PRODUCTION_IMAGE_TOOLS_COMMANDS = {
"name": "Production Image tools",
Expand All @@ -99,6 +106,7 @@
"--image-tag",
"--tag-as-latest",
"--docker-cache",
"--max-retries",
],
},
{
Expand Down Expand Up @@ -206,6 +214,7 @@
@option_docker_cache
@option_image_tag
@option_prepare_buildx_cache
@option_max_retries
@option_push_image
@option_empty_image
@option_airflow_constraints_mode_prod
Expand Down Expand Up @@ -517,16 +526,36 @@ def build_production_image(
image_params=prod_image_params,
verbose=verbose,
)
build_command_result = run_command(
cmd, verbose=verbose, dry_run=dry_run, cwd=AIRFLOW_SOURCES_ROOT, check=False, text=True
)
if build_command_result.returncode == 0:
if prod_image_params.prepare_buildx_cache:
build_command_result = build_cache(
image_params=prod_image_params, dry_run=dry_run, verbose=verbose
)
num_tries = 1 if prod_image_params.max_retries is None else prod_image_params.max_retries
build_command_result = CompletedProcess(args=[], returncode=1, stdout="This should never happen.")
while num_tries > 0:
build_command_result = run_command(
cmd,
verbose=verbose,
dry_run=dry_run,
cwd=AIRFLOW_SOURCES_ROOT,
check=False,
text=True,
capture_output=True,
)
if build_command_result.returncode == 0:
if prod_image_params.prepare_buildx_cache:
build_command_result = build_cache(
image_params=prod_image_params, dry_run=dry_run, verbose=verbose
)
else:
if prod_image_params.tag_as_latest:
build_command_result = tag_image_as_latest(prod_image_params, dry_run, verbose)
if build_command_result.returncode == 0:
break
num_tries -= 1
if run_result_contains(build_command_result, "cannot reuse body, request must be retried"):
if num_tries > 0:
get_console().print(
"[info]Retrying failed command on retryable condition. "
f"There are {num_tries} left[/]"
)
continue
else:
if prod_image_params.tag_as_latest:
build_command_result = tag_image_as_latest(prod_image_params, dry_run, verbose)

break
return build_command_result.returncode, f"Image build: {prod_image_params.python}"
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class _CommonBuildParams:
github_token: str = os.environ.get('GITHUB_TOKEN', "")
github_username: str = ""
image_tag: Optional[str] = None
max_retries: Optional[int] = None
install_providers_from_sources: bool = False
platform: str = f"linux/{os.uname().machine}"
prepare_buildx_cache: bool = False
Expand Down
6 changes: 6 additions & 0 deletions dev/breeze/src/airflow_breeze/utils/common_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,12 @@
is_flag=True,
envvar='PREPARE_BUILDX_CACHE',
)
# Reusable `--max-retries` click option. The value can also be supplied through the
# MAX_RETRIES environment variable and is validated as an integer >= 2 (IntRange(min=2)).
# No default is given here, so commands receive None when the option is not passed.
option_max_retries = click.option(
'--max-retries',
help='Maximum number of retries for the operation for "retryable" intermittent problems.',
type=click.IntRange(min=2),
envvar='MAX_RETRIES',
)
option_push_image = click.option(
'--push-image',
help='Push image after building it.',
Expand Down
38 changes: 38 additions & 0 deletions dev/breeze/src/airflow_breeze/utils/run_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from re import match
from typing import Dict, List, Mapping, Optional, Union

from airflow_breeze.branch_defaults import AIRFLOW_BRANCH
from airflow_breeze.params._common_build_params import _CommonBuildParams
from airflow_breeze.utils.ci_group import ci_group
from airflow_breeze.utils.console import get_console
Expand Down Expand Up @@ -375,3 +376,40 @@ def filter_out_none(**kwargs) -> dict:
if kwargs[key] is None:
kwargs.pop(key)
return kwargs


def fail_if_image_missing(image: str, verbose: bool, dry_run: bool, instruction: str) -> None:
    """Exit the process unless ``docker inspect`` succeeds for the given image.

    When the ``SKIP_IMAGE_PRE_COMMITS`` environment variable starts with "t"
    (case-insensitive, e.g. "true"), the check is skipped and the process exits
    with code 0. Otherwise ``docker inspect <image>`` is run, and a non-zero
    return code prints an error together with ``instruction`` and exits with
    code 1.

    :param image: name of the docker image to inspect
    :param verbose: passed through to ``run_command``
    :param dry_run: passed through to ``run_command``
    :param instruction: command shown to the user as the way to obtain the image
    """
    skip_image_pre_commits = os.environ.get('SKIP_IMAGE_PRE_COMMITS', "false")
    # startswith() rather than indexing [0]: the variable may be set to an empty
    # string, which must mean "do not skip" instead of raising IndexError.
    if skip_image_pre_commits.lower().startswith("t"):
        get_console().print(
            f"[info]Skipping image check as SKIP_IMAGE_PRE_COMMITS is set to {skip_image_pre_commits}[/]"
        )
        sys.exit(0)
    cmd_result = run_command(
        ["docker", "inspect", image], stdout=subprocess.DEVNULL, check=False, verbose=verbose, dry_run=dry_run
    )
    if cmd_result.returncode != 0:
        # The messages carry console markup ([red]...[/]), so they go through the
        # console object; plain print() would emit the tags literally.
        get_console().print(f'[red]The image {image} is not available.[/]\n')
        get_console().print(f"\n[yellow]Please run at the earliest convenience:[/]\n\n{instruction}\n\n")
        sys.exit(1)


def get_runnable_ci_image(verbose: bool, dry_run: bool) -> str:
    """Return the name of the Python 3.7 CI image after checking that it is available.

    The image name is built from the ``GITHUB_REPOSITORY`` environment variable
    (default ``apache/airflow``) and ``AIRFLOW_BRANCH``. The availability check is
    delegated to ``fail_if_image_missing``, which is given the ``breeze build-image``
    command as the instruction to show.

    :param verbose: passed through to ``fail_if_image_missing``
    :param dry_run: passed through to ``fail_if_image_missing``
    :return: the CI image name
    """
    python_version = "3.7"
    repository = os.environ.get('GITHUB_REPOSITORY', "apache/airflow")
    ci_image_name = "ghcr.io/{}/{}/ci/python{}".format(repository, AIRFLOW_BRANCH, python_version)
    build_instruction = f"breeze build-image --python {python_version}"
    fail_if_image_missing(
        image=ci_image_name,
        verbose=verbose,
        dry_run=dry_run,
        instruction=build_instruction,
    )
    return ci_image_name


def run_result_contains(result: RunCommandResult, message: str) -> bool:
    """Tell whether ``message`` occurs in the stdout or stderr held by ``result``.

    :param result: command result exposing ``stdout`` and ``stderr`` attributes
    :param message: text to look for
    :return: True if either stream is non-empty and contains ``message``, else False
    """
    for stream_name in ("stdout", "stderr"):
        captured = getattr(result, stream_name)
        # A missing (None) or empty stream cannot contain the message.
        if captured and message in captured:
            return True
    return False
Loading

0 comments on commit 5a9699a

Please sign in to comment.