Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Builder: additional workflow steps #351

Merged
merged 6 commits into from
Apr 5, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def do_build(args: CensusBuildArgs, skip_completed_steps: bool = False) -> int:
do_create_reports,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should also sync reports, and I don't think that's currently handled.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this was not in the build process the last time I checked, but is easily added.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It wasn't, I just realized the omission today while doing the release follow-up tasks. Ty!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added

do_data_copy,
do_the_release,
do_report_copy,
do_old_release_cleanup,
]
try:
Expand Down Expand Up @@ -137,7 +138,7 @@ def do_validate_soma(args: CensusBuildArgs) -> bool:
def do_create_reports(args: CensusBuildArgs) -> bool:
from .census_summary import display_diff, display_summary

reports_dir = args.working_dir / "reports"
reports_dir = args.working_dir / args.config.reports_dir
reports_dir.mkdir(parents=True, exist_ok=True)

logging.info("Creating summary report")
Expand All @@ -152,7 +153,7 @@ def do_create_reports(args: CensusBuildArgs) -> bool:


def do_data_copy(args: CensusBuildArgs) -> bool:
"""Copy data to S3"""
"""Copy data to S3, in preparation for a release"""
from .data_copy import sync_to_S3

sync_to_S3(
Expand All @@ -178,7 +179,21 @@ def do_the_release(args: CensusBuildArgs) -> bool:
"s3_region": "us-west-2",
},
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose this release declaration could be moved into make_a_release, which would allow it to be unit tested.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pushing all of the data layout assumptions down to the module feels like it pollutes the abstraction. I'll add some tests for the main.do_* methods instead.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added tests for all of the new workflow steps, called from the top-level

make_a_release(args.config.cellxgene_census_S3_path, args.build_tag, release, make_latest=True)
make_a_release(
args.config.cellxgene_census_S3_path, args.build_tag, release, make_latest=True, dryrun=args.config.dryrun
)
return True


def do_report_copy(args: CensusBuildArgs) -> bool:
"""Copy build summary reports to S3 for posterity."""
from .data_copy import sync_to_S3

sync_to_S3(
args.working_dir / args.config.reports_dir,
urlcat(args.config.logs_S3_path, args.build_tag, args.config.reports_dir),
dryrun=args.config.dryrun,
)
return True


Expand All @@ -199,8 +214,8 @@ def do_log_copy(args: CensusBuildArgs) -> bool:
from .data_copy import sync_to_S3

sync_to_S3(
args.working_dir / "logs",
urlcat(args.config.logs_S3_path, args.build_tag),
args.working_dir / args.config.log_dir,
urlcat(args.config.logs_S3_path, args.build_tag, args.config.log_dir),
dryrun=args.config.dryrun,
)
return True
Expand All @@ -218,4 +233,5 @@ def create_args_parser() -> argparse.ArgumentParser:
return parser


sys.exit(main())
if __name__ == "__main__":
sys.exit(main())
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@
"verbose": 1,
"log_dir": "logs",
"log_file": "build.log",
"reports_dir": "reports",
"consolidate": True,
"disable_dirty_git_check": True,
"dryrun": False, # if True, will disable copy of data/logs/etc to S3 buckets
"dryrun": False, # if True, will disable copy of data/logs/reports/release.json to S3 buckets. Will NOT disable local build, etc.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

#
# Paths and census version name determined by spec.
"cellxgene_census_S3_path": "s3://cellxgene-data-public/cell-census",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def sync_to_S3(from_path: Union[str, pathlib.PosixPath], to_path: str, dryrun: b
try:
_log_it(f"Starting copy {from_path.as_posix()} -> {to_path}", dryrun)
with subprocess.Popen(cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE, text=True) as proc:
print(proc.returncode)
if proc.stdout is not None:
for line in proc.stdout:
logging.info(line)
Expand Down Expand Up @@ -63,7 +64,7 @@ def main() -> int:
"--dryrun",
action=argparse.BooleanOptionalAction,
default=True,
help="Display, but do NOT perform actions. Useful for previewing actions. Default: True",
help="Skips S3 data copies. Useful for previewing actions. Default: True",
)
parser.add_argument("-v", "--verbose", action="count", default=1, help="Increase logging verbosity")
args = parser.parse_args()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def _update_release_manifest(
latest_tag = new_manifest["latest"]
_log_it(f"Commiting updated release.json with latest={latest_tag}", dryrun)
if not dryrun:
commit_release_manifest(census_base_url, new_manifest)
commit_release_manifest(census_base_url, new_manifest, dryrun=dryrun)


def _perform_recursive_delete(rls_tag: CensusVersionName, uri: str, dryrun: bool) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,16 @@ def get_release_manifest(census_base_url: str, s3_anon: bool = False) -> CensusR
return cast(CensusReleaseManifest, json.loads(f.read()))


def commit_release_manifest(census_base_url: str, release_manifest: CensusReleaseManifest) -> None:
def commit_release_manifest(
census_base_url: str, release_manifest: CensusReleaseManifest, dryrun: bool = False
) -> None:
"""
Write a new release manifest to the Census.
"""
# Out of an abundance of caution, validate the contents
validate_release_manifest(census_base_url, release_manifest)
_overwrite_release_manifest(census_base_url, release_manifest)
if not dryrun:
_overwrite_release_manifest(census_base_url, release_manifest)


def _overwrite_release_manifest(census_base_url: str, release_manifest: CensusReleaseManifest) -> None:
Expand Down Expand Up @@ -131,7 +134,11 @@ def _validate_exists(rls_info: CensusVersionDescription, s3_anon: bool) -> None:


def make_a_release(
census_base_url: str, rls_tag: CensusVersionName, rls_info: CensusVersionDescription, make_latest: bool
census_base_url: str,
rls_tag: CensusVersionName,
rls_info: CensusVersionDescription,
make_latest: bool,
dryrun: bool = False,
) -> None:
"""
Make a release and optionally alias release as `latest`
Expand All @@ -146,4 +153,4 @@ def make_a_release(
manifest["latest"] = rls_tag

# Will validate, and raise on anything suspicious
commit_release_manifest(census_base_url, manifest)
commit_release_manifest(census_base_url, manifest, dryrun=dryrun)
12 changes: 9 additions & 3 deletions tools/cellxgene_census_builder/tests/test_release_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,22 +199,28 @@ def test_validate_release_manifest_errors(
),
],
)
@pytest.mark.parametrize("dryrun", [True, False])
def test_make_a_release(
release_manifest: CensusReleaseManifest,
rls_tag: CensusVersionName,
rls_info: CensusVersionDescription,
expected_new_manifest: CensusReleaseManifest,
make_latest: bool,
dryrun: bool,
) -> None:
with (
mock.patch(
"cellxgene_census_builder.release_manifest.get_release_manifest", return_value=release_manifest
"cellxgene_census_builder.release_manifest.get_release_manifest", return_value=release_manifest.copy()
) as get_release_manifest_patch,
mock.patch("s3fs.S3FileSystem.isdir", return_value=True),
mock.patch(
"cellxgene_census_builder.release_manifest._overwrite_release_manifest"
) as commit_release_manifest_patch,
):
make_a_release(TEST_CENSUS_BASE_URL, rls_tag, rls_info, make_latest)
make_a_release(TEST_CENSUS_BASE_URL, rls_tag, rls_info, make_latest, dryrun=dryrun)
assert get_release_manifest_patch.called
assert commit_release_manifest_patch.call_args == ((TEST_CENSUS_BASE_URL, expected_new_manifest),)
if dryrun:
assert commit_release_manifest_patch.call_count == 0
else:
assert commit_release_manifest_patch.call_count == 1
assert commit_release_manifest_patch.call_args == ((TEST_CENSUS_BASE_URL, expected_new_manifest),)
95 changes: 95 additions & 0 deletions tools/cellxgene_census_builder/tests/test_workflow_steps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import pathlib
from typing import Callable
from unittest import mock

import pytest
from cellxgene_census_builder.__main__ import do_data_copy, do_log_copy, do_report_copy, do_the_release
from cellxgene_census_builder.build_state import CensusBuildArgs, CensusBuildConfig
from cellxgene_census_builder.release_manifest import CensusVersionDescription

TEST_BUCKET_PATH = "s3://bucket/path"


@pytest.mark.parametrize("dryrun", [True, False])
@pytest.mark.parametrize("build_tag", ["2020-10-20", "1999-12-31"])
def test_do_data_copy(tmp_path: pathlib.Path, build_tag: str, dryrun: bool) -> None:
args = CensusBuildArgs(
working_dir=tmp_path,
config=CensusBuildConfig(
build_tag=build_tag,
dryrun=dryrun,
cellxgene_census_S3_path=TEST_BUCKET_PATH,
),
)
from_path = tmp_path / build_tag
from_path.mkdir(exist_ok=True, parents=True)
to_path = f"{TEST_BUCKET_PATH}/{build_tag}"

with mock.patch("subprocess.Popen") as popen_patch:
popen_patch.return_value.__enter__.return_value.returncode = 0
popen_patch.return_value.__enter__.return_value.stdout = None
do_data_copy(args)

assert popen_patch.call_count == 1
expect = ["aws", "s3", "sync", from_path.as_posix(), to_path, "--no-progress"]
if dryrun:
expect += ["--dryrun"]
assert popen_patch.call_args[0][0] == expect


@pytest.mark.parametrize("step_func,dir_name", [(do_report_copy, "reports"), (do_log_copy, "logs")])
@pytest.mark.parametrize("dryrun", [True, False])
@pytest.mark.parametrize("build_tag", ["2020-10-20", "1999-12-31"])
def test_do_other_copy(
tmp_path: pathlib.Path, build_tag: str, dryrun: bool, step_func: Callable[[CensusBuildArgs], None], dir_name: str
) -> None:
args = CensusBuildArgs(
working_dir=tmp_path,
config=CensusBuildConfig(
build_tag=build_tag,
dryrun=dryrun,
logs_S3_path=TEST_BUCKET_PATH,
),
)
from_path = tmp_path / dir_name
from_path.mkdir(exist_ok=True, parents=True)
to_path = f"{TEST_BUCKET_PATH}/{build_tag}/{dir_name}"

with mock.patch("subprocess.Popen") as popen_patch:
popen_patch.return_value.__enter__.return_value.returncode = 0
popen_patch.return_value.__enter__.return_value.stdout = None
step_func(args)

assert popen_patch.call_count == 1
expect = ["aws", "s3", "sync", from_path.as_posix(), to_path, "--no-progress"]
if dryrun:
expect += ["--dryrun"]
assert popen_patch.call_args[0][0] == expect


@pytest.mark.parametrize("dryrun", [True, False])
def test_do_the_release(tmp_path: pathlib.Path, dryrun: bool) -> None:
build_tag = "2020-02-03"
args = CensusBuildArgs(
working_dir=tmp_path,
config=CensusBuildConfig(
build_tag=build_tag,
cellxgene_census_S3_path=TEST_BUCKET_PATH,
dryrun=dryrun,
),
)

with mock.patch("cellxgene_census_builder.release_manifest.make_a_release") as make_a_release_patch:
do_the_release(args)

expected_rls_description: CensusVersionDescription = {
"release_build": build_tag,
"release_date": None,
"soma": {"uri": f"{TEST_BUCKET_PATH}/{build_tag}/soma/", "s3_region": "us-west-2"},
"h5ads": {"uri": f"{TEST_BUCKET_PATH}/{build_tag}/h5ads/", "s3_region": "us-west-2"},
}
assert make_a_release_patch.call_count == 1
assert make_a_release_patch.call_args == (
(TEST_BUCKET_PATH, build_tag, expected_rls_description),
{"make_latest": True, "dryrun": dryrun},
)