Commit

first stab
rsasch committed Sep 19, 2024
1 parent 6c68384 commit 652e565
Showing 3 changed files with 7 additions and 13 deletions.
2 changes: 1 addition & 1 deletion .dockstore.yml
@@ -184,7 +184,7 @@ workflows:
       branches:
         - master
         - ah_var_store
-        - vs_1456_status_writes_bug
+        - rsa_vs_1218
       tags:
         - /.*/
   - name: GvsPrepareRangesCallset
10 changes: 2 additions & 8 deletions scripts/variantstore/wdl/GvsBulkIngestGenomes.wdl
@@ -39,9 +39,7 @@ workflow GvsBulkIngestGenomes {
         # set to "NONE" to ingest all the reference data into GVS for VDS (instead of VCF) output
         String drop_state = "NONE"

-        # The larger the `load_data_batch_size` the greater the probability of preemptions and non-retryable BigQuery errors,
-        # so if specifying `load_data_batch_size`, adjust preemptible and maxretries accordingly. Or just take the defaults, as those should work fine in most cases.
-        Int? load_data_batch_size
+        Int? load_data_scatter_width
         Int? load_data_preemptible_override
         Int? load_data_maxretries_override
         String? billing_project_id
@@ -131,11 +129,7 @@ workflow GvsBulkIngestGenomes {
             input_vcfs = SplitBulkImportFofn.vcf_file_name_fofn,
             input_vcf_indexes = SplitBulkImportFofn.vcf_index_file_name_fofn,
             interval_list = interval_list,
-
-            # The larger the `load_data_batch_size` the greater the probability of preemptions and non-retryable
-            # BigQuery errors so if specifying this adjust preemptible and maxretries accordingly. Or just take the defaults,
-            # those should work fine in most cases.
-            load_data_batch_size = load_data_batch_size,
+            load_data_scatter_width = load_data_scatter_width,
             load_data_maxretries_override = load_data_maxretries_override,
             load_data_preemptible_override = load_data_preemptible_override,
             basic_docker = effective_basic_docker,
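Taken together, these two hunks replace the explicit `load_data_batch_size` input with an optional `load_data_scatter_width` that is simply threaded through to the GvsImportGenomes call. As a rough illustration only, a pared-down pass-through of an optional input might look like the sketch below; the `OuterIngest` and `InnerImport` names are hypothetical and not part of the GVS codebase.

version 1.0

# Sketch of the pass-through pattern: the outer workflow exposes an optional
# scatter-width knob and forwards it unchanged to the inner step that does the
# batching; unset means "use the downstream defaults".
workflow OuterIngest {
    input {
        Int num_samples
        Int? load_data_scatter_width
    }

    call InnerImport {
        input:
            num_samples = num_samples,
            load_data_scatter_width = load_data_scatter_width
    }
}

task InnerImport {
    input {
        Int num_samples
        Int? load_data_scatter_width
    }
    command <<<
        # 0 stands in for "no width requested, use the default batching"
        echo "importing ~{num_samples} samples, requested scatter width: ~{select_first([load_data_scatter_width, 0])}"
    >>>
    runtime {
        docker: "ubuntu:22.04"
    }
}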
8 changes: 4 additions & 4 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -29,7 +29,7 @@ workflow GvsImportGenomes {
         # without going over
         Int beta_customer_max_scatter = 200
         File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
-        Int? load_data_batch_size
+        Int? load_data_scatter_width
         Int? load_data_preemptible_override
         Int? load_data_maxretries_override
         # At least one of these "load" inputs must be true
@@ -76,17 +76,17 @@
         }
     }

-    if ((num_samples > max_auto_batch_size) && !(defined(load_data_batch_size))) {
+    if ((num_samples > max_auto_batch_size) && !(defined(load_data_scatter_width))) {
         call Utils.TerminateWorkflow as DieDueToTooManySamplesWithoutExplicitLoadDataBatchSize {
             input:
-                message = "Importing " + num_samples + " samples but 'load_data_batch_size' is not explicitly specified; the limit for auto batch-sizing is " + max_auto_batch_size + " for " + genome_type + " samples.",
+                message = "Importing " + num_samples + " samples but 'load_data_scatter_width' is not explicitly specified; the limit for auto batch-sizing is " + max_auto_batch_size + " for " + genome_type + " samples.",
                 basic_docker = effective_basic_docker,
         }
     }

     # At least 1, per limits above not more than 20.
     # But if it's a beta customer, use the number computed above
-    Int effective_load_data_batch_size = if (defined(load_data_batch_size)) then select_first([load_data_batch_size])
+    Int effective_load_data_batch_size = if (defined(load_data_scatter_width)) then select_first([num_samples / load_data_scatter_width])
                                          else if num_samples < max_scatter_for_user then 1
                                          else if is_wgs then num_samples / max_scatter_for_user
                                          else if num_samples < 5001 then (num_samples / (max_scatter_for_user * 2))
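The substantive change is the first branch of `effective_load_data_batch_size`: rather than taking a batch size directly, the workflow now derives one by dividing `num_samples` by the requested scatter width. A minimal standalone sketch of that derivation follows; it is not the committed code, and it assumes the optional Int is unwrapped with select_first before the division and that the result is clamped to at least 1.

version 1.0

# Sketch only: derive a per-shard batch size from an optional scatter width.
workflow BatchSizeFromScatterWidth {
    input {
        Int num_samples
        Int? load_data_scatter_width
        Int default_batch_size = 1
    }

    # Unwrap the optional before doing integer arithmetic on it; when it is
    # unset, the fallback width is irrelevant because the default is used below.
    Int width = select_first([load_data_scatter_width, num_samples])
    Int derived = if (num_samples / width) > 0 then num_samples / width else 1

    output {
        Int effective_load_data_batch_size = if defined(load_data_scatter_width) then derived else default_batch_size
    }
}

For example, with num_samples = 1000 and load_data_scatter_width = 40, this sketch yields a batch size of 25; with the width unset, it falls back to default_batch_size.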
