Skip to content

Commit

Permalink
WX-1443 Adopt gcloud storage for localization only (#7359)
Browse files Browse the repository at this point in the history
  • Loading branch information
aednichols authored and salonishah11 committed Feb 14, 2024
1 parent e246826 commit 5059513
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 12 deletions.
21 changes: 21 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,26 @@
# Cromwell Change Log

## 87 Release Notes

### Replacement of `gsutil` with `gcloud storage`

In this release, all **localization** functionality on the GCP backend migrates to use the more modern and performant `gcloud storage`. With sufficiently powerful worker VMs, Cromwell can now localize at up to 1200 MB/s [0][1][2].

In a future release, **delocalization** will also migrate to `gcloud storage`. As part of that upcoming change, we are considering turning on [parallel composite uploads](https://cromwell.readthedocs.io/en/stable/backends/Google/#parallel-composite-uploads) by default to maximize performance. Delocalized composite objects will no longer have an md5 checksum in their metadata; refer to the matrix below [3]. If you have compatibility concerns for your workflow, please [submit an issue](https://github.com/broadinstitute/cromwell/issues).

| Delocalization Strategy | Performance | crc32c | md5 |
|-------------------------|---------------|--------|-----|
| Classic | Baseline/slow | ✅ | ✅ |
| Parallel Composite | Fast | ✅ | ❌ |

[0] Tested on the Intel Ice Lake CPU platform with 16 vCPUs, 32 GB RAM, and a 2500 GB SSD.

[1] [Throughput scales with vCPU count](https://cloud.google.com/compute/docs/disks/performance#n2_vms) with a plateau at 16 vCPUs.

[2] [Throughput scales with disk size and type](https://cloud.google.com/compute/docs/disks/performance#throughput_limits_for_zonal) with a plateau at 2.5 TB SSD. Worked example: 1200 MB/s ÷ 0.48 MB/s per GB = 2500 GB.

[3] Cromwell itself uses crc32c hashes for call caching, so it is not affected by the missing md5 checksum.

## 86 Release Notes

### GCP Batch
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,17 +163,18 @@ localize_files() {
fi

# We need to determine requester pays status of the first file attempting at most `max_attempts` times.
NO_REQUESTER_PAYS_COMMAND="mkdir -p '$container_parent' && rm -f "$HOME/.config/gcloud/gce" && gsutil -o 'GSUtil:parallel_thread_count=1' -o 'GSUtil:sliced_object_download_max_components=${num_cpus}' cp '$first_cloud_file' '$container_parent'"
REQUESTER_PAYS_COMMAND="rm -f "$HOME/.config/gcloud/gce" && gsutil -o 'GSUtil:parallel_thread_count=1' -o 'GSUtil:sliced_object_download_max_components=${num_cpus}' -u $project cp '$first_cloud_file' '$container_parent'"
NO_REQUESTER_PAYS_COMMAND="mkdir -p '$container_parent' && rm -f "$HOME/.config/gcloud/gce" && gcloud storage cp '$first_cloud_file' '$container_parent'"
REQUESTER_PAYS_COMMAND="rm -f "$HOME/.config/gcloud/gce" && gcloud storage cp --billing-project $project '$first_cloud_file' '$container_parent'"

basefile=$(basename "$first_cloud_file")
private::localize_message "$first_cloud_file" "${container_parent}${basefile}"
private::determine_requester_pays ${max_attempts}

if [[ ${USE_REQUESTER_PAYS} = true ]]; then
rpflag="-u $project"
# https://cloud.google.com/storage/docs/using-requester-pays#using
gcloud_storage_rpflag="--billing-project $project"
else
rpflag=""
gcloud_storage_rpflag=""
fi


Expand All @@ -192,7 +193,7 @@ localize_files() {
while [[ ${attempt} -le ${max_attempts} ]]; do
# parallel transfer the remaining files
rm -f "$HOME/.config/gcloud/gce"
if cat files_to_localize.txt | gsutil -o "GSUtil:parallel_thread_count=1" -o "GSUtil:sliced_object_download_max_components=${num_cpus}" -m ${rpflag} cp -I "$container_parent"; then
if cat files_to_localize.txt | gcloud storage cp ${gcloud_storage_rpflag} --read-paths-from-stdin "$container_parent"; then
break
else
attempt=$((attempt + 1))
Expand All @@ -209,13 +210,13 @@ private::localize_directory() {
local cloud="$1"
local container="$2"
local max_attempts="$3"
local rpflag="$4"
local gcloud_storage_rpflag="$4"

local attempt=1
private::localize_message "$cloud" "$container"
while [[ ${attempt} -lt ${max_attempts} ]]; do
# Do not quote rpflag, when that is set it will be -u project which should be two distinct arguments.
if mkdir -p "${container}" && rm -f "$HOME/.config/gcloud/gce" && gsutil ${rpflag} -m rsync -r "${cloud}" "${container}" > /dev/null 2>&1; then
# Do not quote gcloud_storage_rpflag, when that is set it will be `--billing-project project` which should be two distinct arguments.
if mkdir -p "${container}" && rm -f "$HOME/.config/gcloud/gce" && gcloud storage ${gcloud_storage_rpflag} rsync -r "${cloud}" "${container}" > /dev/null 2>&1; then
break
else
attempt=$(($attempt + 1))
Expand All @@ -242,21 +243,21 @@ localize_directories() {

BASE_COMMAND="private::localize_directory '${cloud_directory}' '${container_directory}' '${max_attempts}'"
NO_REQUESTER_PAYS_COMMAND="${BASE_COMMAND} ''"
REQUESTER_PAYS_COMMAND="${BASE_COMMAND} '-u $project'"
REQUESTER_PAYS_COMMAND="${BASE_COMMAND} '--billing-project $project'"

private::determine_requester_pays ${max_attempts}

if [[ ${USE_REQUESTER_PAYS} = true ]]; then
rpflag="-u $project"
gcloud_storage_rpflag="--billing-project $project"
else
rpflag=""
gcloud_storage_rpflag=""
fi

while [[ $# -gt 0 ]]; do
cloud_directory="$1"
container_directory="$2"
shift 2
private::localize_directory "$cloud_directory" "$container_directory" "$max_attempts" "$rpflag"
private::localize_directory "$cloud_directory" "$container_directory" "$max_attempts" "$gcloud_storage_rpflag"
done
}

Expand Down

0 comments on commit 5059513

Please sign in to comment.