diff --git a/.gitignore b/.gitignore
index b552d8eeb2d..9d47c41ba71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,7 +24,4 @@ client_secret.json
servicekey.json
#Test Generated File
-I_SHOULD_HAVE_BEEN_DELETED
-
-/scripts/cnv_wdl/somatic/cnv_common_tasks.wdl
-/scripts/cnv_wdl/germline/cnv_common_tasks.wdl
+I_SHOULD_HAVE_BEEN_DELETED
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
index ca272a21522..40d46276874 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,6 +13,7 @@ env:
- TEST_TYPE=integration TEST_DOCKER=true TEST_VERBOSITY=minimal
- TEST_TYPE=unit TEST_DOCKER=true TEST_VERBOSITY=minimal
- TEST_TYPE=python TEST_DOCKER=true TEST_VERBOSITY=minimal
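+ # Exercises the new somatic CNV WDLs (see scripts/cnv_cromwell_tests/somatic/run_cnv_somatic_workflows.sh)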
+ - RUN_CNV_SOMATIC_WDL=true
- RUN_CNV_SOMATIC_LEGACY_WDL=true
- RUN_M2_WDL=true
global:
@@ -87,7 +88,7 @@ before_install:
sudo Rscript scripts/docker/gatkbase/install_R_packages.R;
fi
# Download Cromwell jar
-- if [[ $RUN_CNV_SOMATIC_LEGACY_WDL == true || $RUN_M2_WDL == true || $RUN_CNV_GERMLINE_WDL == true ]]; then
+- if [[ $RUN_CNV_SOMATIC_LEGACY_WDL == true || $RUN_CNV_SOMATIC_WDL == true || $RUN_CNV_GERMLINE_WDL == true || $RUN_M2_WDL == true ]]; then
wget -O ~/cromwell-0.28.jar https://github.com/broadinstitute/cromwell/releases/download/28/cromwell-28.jar;
fi
# Download Picard jar
@@ -107,7 +108,7 @@ install:
else
./gradlew assemble;
./gradlew installDist;
- if [[ $RUN_CNV_SOMATIC_LEGACY_WDL == true || $RUN_M2_WDL == true || $RUN_CNV_GERMLINE_WDL == true ]]; then
+ if [[ $RUN_CNV_SOMATIC_LEGACY_WDL == true || $RUN_CNV_SOMATIC_WDL == true || $RUN_CNV_GERMLINE_WDL == true || $RUN_M2_WDL == true ]]; then
echo "building a shadow jar for the wdl";
./gradlew shadowJar;
elif [[ $TEST_TYPE == cloud ]]; then
@@ -122,12 +123,12 @@ script:
echo "Not running any tests for nightly builds";
elif [[ $TRAVIS_SECURE_ENV_VARS == false && $TEST_TYPE == cloud ]]; then
echo "Can't run cloud tests without keys so don't run tests";
+ elif [[ $RUN_CNV_SOMATIC_WDL == true ]]; then
+ echo "Running CNV somatic workflows";
+ bash scripts/cnv_cromwell_tests/somatic/run_cnv_somatic_workflows.sh;
elif [[ $RUN_CNV_SOMATIC_LEGACY_WDL == true ]]; then
echo "Running legacy CNV somatic workflows";
bash scripts/cnv_cromwell_tests/somatic_legacy/run_cnv_somatic_workflows.sh;
- elif [[ $RUN_CNV_GERMLINE_WDL == true ]]; then
- echo "Running CNV germline workflows";
- bash scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh;
elif [[ $RUN_M2_WDL == true ]]; then
echo "Deleting some unused files before running M2 WDL...";
rm -Rf src/test/resources/large/VQSR;
diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_workflow_wes.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_workflow_wes.json
deleted file mode 100755
index 80b8423fcb1..00000000000
--- a/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_workflow_wes.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
- "CNVGermlineCohortWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv",
- "CNVGermlineCohortWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta.fai",
- "CNVGermlineCohortWorkflow.num_latents": "2",
- "CNVGermlineCohortWorkflow.sex_genotypes": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/sex_genotypes.tsv",
- "CNVGermlineCohortWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta",
- "CNVGermlineCohortWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.dict",
- "CNVGermlineCohortWorkflow.gatk_jar": "/root/gatk.jar",
- "CNVGermlineCohortWorkflow.targets": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/ice_targets_chr20xy.tsv",
- "CNVGermlineCohortWorkflow.contig_ploidy_annotations": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/contig_annots.tsv",
- "CNVGermlineCohortWorkflow.transition_prior_table": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_priors.tsv",
- "CNVGermlineCohortWorkflow.copy_number_transition_prior_files": [ "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv" ],
- "CNVGermlineCohortWorkflow.model_path": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/wes_pon/model_final/",
- "CNVGermlineCohortWorkflow.output_path": "output",
- "CNVGermlineCohortWorkflow.gatk_docker": "__GATK_DOCKER__"
-}
diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_workflow_wgs.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_workflow_wgs.json
deleted file mode 100755
index e12b8a4caf3..00000000000
--- a/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_workflow_wgs.json
+++ /dev/null
@@ -1,19 +0,0 @@
-{
- "CNVGermlineCohortWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv",
- "CNVGermlineCohortWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta.fai",
- "CNVGermlineCohortWorkflow.num_latents": "1",
- "CNVGermlineCohortWorkflow.sex_genotypes": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/sex_genotypes.tsv",
- "CNVGermlineCohortWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta",
- "CNVGermlineCohortWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.dict",
- "CNVGermlineCohortWorkflow.gatk_jar": "/root/gatk.jar",
- "CNVGermlineCohortWorkflow.contig_ploidy_annotations": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/contig_annots.tsv",
- "CNVGermlineCohortWorkflow.transition_prior_table": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_priors.tsv",
- "CNVGermlineCohortWorkflow.copy_number_transition_prior_files": [ "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv" ],
- "CNVGermlineCohortWorkflow.model_path": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/wgs_pon/model_final/",
- "CNVGermlineCohortWorkflow.output_path": "output",
- "CNVGermlineCohortWorkflow.gatk_docker": "__GATK_DOCKER__"
-}
diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_panel_workflow_wes.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_panel_workflow_wes.json
deleted file mode 100755
index 0d51652eb94..00000000000
--- a/scripts/cnv_cromwell_tests/germline/cnv_germline_panel_workflow_wes.json
+++ /dev/null
@@ -1,19 +0,0 @@
-{
- "CNVGermlinePanelWorkflow.gatk_jar": "/root/gatk.jar",
- "CNVGermlinePanelWorkflow.sex_genotypes": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/sex_genotypes.tsv",
- "CNVGermlinePanelWorkflow.contig_ploidy_annotations": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/contig_annots.tsv",
- "CNVGermlinePanelWorkflow.transition_prior_table": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_priors.tsv",
- "CNVGermlinePanelWorkflow.copy_number_transition_prior_files": [ "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv" ],
- "CNVGermlinePanelWorkflow.num_latents": "2",
- "CNVGermlinePanelWorkflow.targets": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/ice_targets_chr20xy.tsv",
- "CNVGermlinePanelWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv",
- "CNVGermlinePanelWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta.fai",
- "CNVGermlinePanelWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.dict",
- "CNVGermlinePanelWorkflow.pon_output_path": "test_pon",
- "CNVGermlinePanelWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta",
- "CNVGermlinePanelWorkflow.gatk_docker": "__GATK_DOCKER__"
-}
diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_panel_workflow_wgs.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_panel_workflow_wgs.json
deleted file mode 100755
index be4c75e545b..00000000000
--- a/scripts/cnv_cromwell_tests/germline/cnv_germline_panel_workflow_wgs.json
+++ /dev/null
@@ -1,18 +0,0 @@
-{
- "CNVGermlinePanelWorkflow.gatk_jar": "/root/gatk.jar",
- "CNVGermlinePanelWorkflow.sex_genotypes": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/sex_genotypes.tsv",
- "CNVGermlinePanelWorkflow.contig_ploidy_annotations": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/contig_annots.tsv",
- "CNVGermlinePanelWorkflow.transition_prior_table": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_priors.tsv",
- "CNVGermlinePanelWorkflow.copy_number_transition_prior_files": [ "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv" ],
- "CNVGermlinePanelWorkflow.num_latents": "1",
- "CNVGermlinePanelWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv",
- "CNVGermlinePanelWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta.fai",
- "CNVGermlinePanelWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.dict",
- "CNVGermlinePanelWorkflow.pon_output_path": "test_pon",
- "CNVGermlinePanelWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta",
- "CNVGermlinePanelWorkflow.gatk_docker": "__GATK_DOCKER__"
-}
diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wes.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wes.json
deleted file mode 100755
index 138e3685dee..00000000000
--- a/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wes.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
- "CNVGermlineSingleSampleWorkflow.normal_bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74NEG_20xy-downsampled.bam",
- "CNVGermlineSingleSampleWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta.fai",
- "CNVGermlineSingleSampleWorkflow.normal_bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74NEG_20xy-downsampled.bam.bai",
- "CNVGermlineSingleSampleWorkflow.num_latents": "2",
- "CNVGermlineSingleSampleWorkflow.sex_genotypes": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/sex_genotypes.tsv",
- "CNVGermlineSingleSampleWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta",
- "CNVGermlineSingleSampleWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.dict",
- "CNVGermlineSingleSampleWorkflow.gatk_jar": "/root/gatk.jar",
- "CNVGermlineSingleSampleWorkflow.targets": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/ice_targets_chr20xy.tsv",
- "CNVGermlineSingleSampleWorkflow.contig_ploidy_annotations": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/contig_annots.tsv",
- "CNVGermlineSingleSampleWorkflow.transition_prior_table": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_priors.tsv",
- "CNVGermlineSingleSampleWorkflow.copy_number_transition_prior_files": [ "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv" ],
- "CNVGermlineSingleSampleWorkflow.model_path": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/wes_pon/model_final/",
- "CNVGermlineSingleSampleWorkflow.output_path": "output",
- "CNVGermlineSingleSampleWorkflow.gatk_docker": "__GATK_DOCKER__"
-}
diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wgs.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wgs.json
deleted file mode 100755
index 15d1b8f8a2b..00000000000
--- a/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wgs.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
- "CNVGermlineSingleSampleWorkflow.normal_bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74NEG_20xy-downsampled.bam",
- "CNVGermlineSingleSampleWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta.fai",
- "CNVGermlineSingleSampleWorkflow.normal_bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74NEG_20xy-downsampled.bam.bai",
- "CNVGermlineSingleSampleWorkflow.num_latents": "1",
- "CNVGermlineSingleSampleWorkflow.sex_genotypes": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/sex_genotypes.tsv",
- "CNVGermlineSingleSampleWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta",
- "CNVGermlineSingleSampleWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.dict",
- "CNVGermlineSingleSampleWorkflow.gatk_jar": "/root/gatk.jar",
- "CNVGermlineSingleSampleWorkflow.contig_ploidy_annotations": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/contig_annots.tsv",
- "CNVGermlineSingleSampleWorkflow.transition_prior_table": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_priors.tsv",
- "CNVGermlineSingleSampleWorkflow.copy_number_transition_prior_files": [ "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv",
- "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv" ],
- "CNVGermlineSingleSampleWorkflow.model_path": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/wgs_pon/model_final/",
- "CNVGermlineSingleSampleWorkflow.output_path": "output",
- "CNVGermlineSingleSampleWorkflow.gatk_docker": "__GATK_DOCKER__"
-}
diff --git a/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv b/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv
deleted file mode 100755
index eae8ed2f969..00000000000
--- a/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv
+++ /dev/null
@@ -1,3 +0,0 @@
-/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74NEG_20xy-downsampled.bam /home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74NEG_20xy-downsampled.bam.bai
-/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74P2T_20xy-downsampled.bam /home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74P2T_20xy-downsampled.bam.bai
-/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74P35_20xy-downsampled.bam /home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74P35_20xy-downsampled.bam.bai
\ No newline at end of file
diff --git a/scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh b/scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh
deleted file mode 100644
index 2dd6b804618..00000000000
--- a/scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash -l
-set -e
-#cd in the directory of the script in order to use relative paths
-script_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P )
-cd "$script_path"
-
-ln -fs /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/cnv_common_tasks.wdl
-ln -fs /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_panel_workflow.wdl
-ln -fs /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow.wdl
-ln -fs /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
-
-WORKING_DIR=/home/travis/build/broadinstitute
-
-pushd .
-echo "Building docker without running unit tests... ========="
-cd $WORKING_DIR/gatk
-# IMPORTANT: This code is duplicated in the M2 WDL test.
-if [ ${TRAVIS_PULL_REQUEST} != false ]; then
- HASH_TO_USE=FETCH_HEAD
- sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/ -t ${TRAVIS_PULL_REQUEST};
-else
- HASH_TO_USE=${TRAVIS_COMMIT}
- sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/;
-fi
-echo "Docker build done =========="
-popd
-
-echo "Inserting docker image into json ========"
-CNV_CROMWELL_TEST_DIR="${WORKING_DIR}/gatk/scripts/cnv_cromwell_tests/germline/"
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_cohort_workflow_wes.json >cnv_germline_cohort_workflow_wes_mod.json
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_cohort_workflow_wgs.json >cnv_germline_cohort_workflow_wgs_mod.json
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_panel_workflow_wes.json >cnv_germline_panel_workflow_wes_mod.json
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_panel_workflow_wgs.json >cnv_germline_panel_workflow_wgs_mod.json
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_single_sample_workflow_wes.json >cnv_germline_single_sample_workflow_wes_mod.json
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_single_sample_workflow_wgs.json >cnv_germline_single_sample_workflow_wgs_mod.json
-
-echo "Running ========"
-
-CROMWELL_JAR="cromwell-0.28.jar"
-
-# Panel WES
-java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_panel_workflow.wdl cnv_germline_panel_workflow_wes_mod.json
-# Panel WGS
-java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_panel_workflow.wdl cnv_germline_panel_workflow_wgs_mod.json
-
-# Single sample WES calling
-java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow.wdl cnv_germline_single_sample_workflow_wes_mod.json
-# Single sample WGS calling
-java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow.wdl cnv_germline_single_sample_workflow_wgs_mod.json
-
-# Cohort WES calling
-java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl cnv_germline_cohort_workflow_wes_mod.json
-# Cohort WGS calling
-java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl cnv_germline_cohort_workflow_wgs_mod.json
\ No newline at end of file
diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_pair_wes_workflow.json b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_pair_wes_workflow.json
new file mode 100644
index 00000000000..3c63db7937a
--- /dev/null
+++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_pair_wes_workflow.json
@@ -0,0 +1,13 @@
+{
+ "CNVSomaticPairWorkflow.common_sites": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/common_snps_sample-chr20.interval_list",
+ "CNVSomaticPairWorkflow.gatk_docker": "__GATK_DOCKER__",
+ "CNVSomaticPairWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/ice_targets_sample-chr20.interval_list",
+ "CNVSomaticPairWorkflow.normal_bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74NEG-v1-chr20-downsampled.deduplicated.bam",
+ "CNVSomaticPairWorkflow.normal_bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74NEG-v1-chr20-downsampled.deduplicated.bam.bai",
+ "CNVSomaticPairWorkflow.read_count_pon": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/wes-no-gc.pon.hdf5",
+ "CNVSomaticPairWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.dict",
+ "CNVSomaticPairWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta.fai",
+ "CNVSomaticPairWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta",
+ "CNVSomaticPairWorkflow.tumor_bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74P4M-v1-chr20-downsampled.deduplicated.bam",
+ "CNVSomaticPairWorkflow.tumor_bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74P4M-v1-chr20-downsampled.deduplicated.bam.bai"
+}
diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_pair_wgs_workflow.json b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_pair_wgs_workflow.json
new file mode 100644
index 00000000000..b34d6bb3874
--- /dev/null
+++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_pair_wgs_workflow.json
@@ -0,0 +1,14 @@
+{
+ "CNVSomaticPairWorkflow.common_sites": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/common_snps_sample-chr20.interval_list",
+ "CNVSomaticPairWorkflow.gatk_docker": "__GATK_DOCKER__",
+ "CNVSomaticPairWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/chr20.interval_list",
+ "CNVSomaticPairWorkflow.normal_bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143_BL-n1-chr20-downsampled.deduplicated.bam",
+ "CNVSomaticPairWorkflow.normal_bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143_BL-n1-chr20-downsampled.deduplicated.bam.bai",
+ "CNVSomaticPairWorkflow.PreprocessIntervals.bin_length": "10000",
+ "CNVSomaticPairWorkflow.read_count_pon": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/wgs-no-gc.pon.hdf5",
+ "CNVSomaticPairWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.dict",
+ "CNVSomaticPairWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta.fai",
+ "CNVSomaticPairWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta",
+ "CNVSomaticPairWorkflow.tumor_bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143-t1-chr20-downsampled.deduplicated.bam",
+ "CNVSomaticPairWorkflow.tumor_bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143-t1-chr20-downsampled.deduplicated.bam.bai"
+}
\ No newline at end of file
diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_do-gc_workflow.json b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_do-gc_workflow.json
new file mode 100644
index 00000000000..90c7ec34562
--- /dev/null
+++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_do-gc_workflow.json
@@ -0,0 +1,10 @@
+{
+ "CNVSomaticPanelWorkflow.do_explicit_gc_correction": "true",
+ "CNVSomaticPanelWorkflow.gatk_docker": "__GATK_DOCKER__",
+ "CNVSomaticPanelWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/ice_targets_sample-chr20.interval_list",
+ "CNVSomaticPanelWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_input.tsv",
+ "CNVSomaticPanelWorkflow.pon_entity_id": "test",
+ "CNVSomaticPanelWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.dict",
+ "CNVSomaticPanelWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta.fai",
+ "CNVSomaticPanelWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta"
+}
diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_input.tsv b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_input.tsv
new file mode 100644
index 00000000000..c0cdb12c282
--- /dev/null
+++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_input.tsv
@@ -0,0 +1,2 @@
+/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74NEG-v1-chr20-downsampled.deduplicated.bam /home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74NEG-v1-chr20-downsampled.deduplicated.bam.bai
+/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74P4M-v1-chr20-downsampled.deduplicated.bam /home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74P4M-v1-chr20-downsampled.deduplicated.bam.bai
diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_workflow.json b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_workflow.json
new file mode 100644
index 00000000000..c227b993414
--- /dev/null
+++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_workflow.json
@@ -0,0 +1,9 @@
+{
+ "CNVSomaticPanelWorkflow.gatk_docker": "__GATK_DOCKER__",
+ "CNVSomaticPanelWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/ice_targets_sample-chr20.interval_list",
+ "CNVSomaticPanelWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_input.tsv",
+ "CNVSomaticPanelWorkflow.pon_entity_id": "test",
+ "CNVSomaticPanelWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.dict",
+ "CNVSomaticPanelWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta.fai",
+ "CNVSomaticPanelWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta"
+}
\ No newline at end of file
diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_do-gc_workflow.json b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_do-gc_workflow.json
new file mode 100644
index 00000000000..26df09b9695
--- /dev/null
+++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_do-gc_workflow.json
@@ -0,0 +1,11 @@
+{
+ "CNVSomaticPanelWorkflow.do_explicit_gc_correction": "true",
+ "CNVSomaticPanelWorkflow.gatk_docker": "__GATK_DOCKER__",
+ "CNVSomaticPanelWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/chr20.interval_list",
+ "CNVSomaticPanelWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_input.tsv",
+ "CNVSomaticPanelWorkflow.pon_entity_id": "test",
+ "CNVSomaticPanelWorkflow.PreprocessIntervals.bin_length": "10000",
+ "CNVSomaticPanelWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.dict",
+ "CNVSomaticPanelWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta.fai",
+ "CNVSomaticPanelWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta"
+}
diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_input.tsv b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_input.tsv
new file mode 100644
index 00000000000..269d8a013b9
--- /dev/null
+++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_input.tsv
@@ -0,0 +1,2 @@
+/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143_BL-n1-chr20-downsampled.deduplicated.bam /home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143_BL-n1-chr20-downsampled.deduplicated.bam.bai
+/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143-t1-chr20-downsampled.deduplicated.bam /home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143-t1-chr20-downsampled.deduplicated.bam.bai
diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_workflow.json b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_workflow.json
new file mode 100644
index 00000000000..3e2ab86ac06
--- /dev/null
+++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_workflow.json
@@ -0,0 +1,10 @@
+{
+ "CNVSomaticPanelWorkflow.gatk_docker": "__GATK_DOCKER__",
+ "CNVSomaticPanelWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/chr20.interval_list",
+ "CNVSomaticPanelWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_input.tsv",
+ "CNVSomaticPanelWorkflow.pon_entity_id": "test",
+ "CNVSomaticPanelWorkflow.PreprocessIntervals.bin_length": "10000",
+ "CNVSomaticPanelWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.dict",
+ "CNVSomaticPanelWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta.fai",
+ "CNVSomaticPanelWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta"
+}
diff --git a/scripts/cnv_cromwell_tests/somatic/run_cnv_somatic_workflows.sh b/scripts/cnv_cromwell_tests/somatic/run_cnv_somatic_workflows.sh
new file mode 100644
index 00000000000..891fbe680f6
--- /dev/null
+++ b/scripts/cnv_cromwell_tests/somatic/run_cnv_somatic_workflows.sh
@@ -0,0 +1,50 @@
+#!/bin/bash -l
+set -e
+# cd into the script's directory so that relative paths resolve
+script_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P )
+cd "$script_path"
+
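+# Link the shared tasks WDL here so that relative imports of cnv_common_tasks.wdl from the workflow WDLs resolve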
+ln -fs /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/cnv_common_tasks.wdl
+
+WORKING_DIR=/home/travis/build/broadinstitute
+
+pushd .
+echo "Building docker without running unit tests... ========="
+cd $WORKING_DIR/gatk
+# IMPORTANT: This code is duplicated in the M2 WDL test.
+if [ ${TRAVIS_PULL_REQUEST} != false ]; then
+ HASH_TO_USE=FETCH_HEAD
+ sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/ -t ${TRAVIS_PULL_REQUEST};
+else
+ HASH_TO_USE=${TRAVIS_COMMIT}
+ sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/;
+fi
+echo "Docker build done =========="
+
+popd
+
+echo "Inserting docker image into json ========"
+CNV_CROMWELL_TEST_DIR="${WORKING_DIR}/gatk/scripts/cnv_cromwell_tests/somatic/"
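+# Substitute the freshly built broadinstitute/gatk:$HASH_TO_USE image for the __GATK_DOCKER__ placeholder in each inputs json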
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_panel_wes_workflow.json >cnv_somatic_panel_wes_workflow_mod.json
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_panel_wgs_workflow.json >cnv_somatic_panel_wgs_workflow_mod.json
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_panel_wes_do-gc_workflow.json >cnv_somatic_panel_wes_do-gc_workflow_mod.json
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_panel_wgs_do-gc_workflow.json >cnv_somatic_panel_wgs_do-gc_workflow_mod.json
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_pair_wes_workflow.json >cnv_somatic_pair_wes_workflow_mod.json
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_pair_wgs_workflow.json >cnv_somatic_pair_wgs_workflow_mod.json
+
+echo "Running ========"
+CROMWELL_JAR="cromwell-0.28.jar"
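+# ~/cromwell-0.28.jar is downloaded in the before_install step of .travis.yml;
+# each "run" invocation below executes a single workflow to completion, and set -e aborts on the first failure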
+
+# Panel WES
+java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl cnv_somatic_panel_wes_workflow_mod.json
+# Panel WGS
+java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl cnv_somatic_panel_wgs_workflow_mod.json
+# Panel WES w/ explicit GC correction
+java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl cnv_somatic_panel_wes_do-gc_workflow_mod.json
+# Panel WGS w/ explicit GC correction
+java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl cnv_somatic_panel_wgs_do-gc_workflow_mod.json
+
+# Pair WES
+java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl cnv_somatic_pair_wes_workflow_mod.json
+# Pair WGS
+java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl cnv_somatic_pair_wgs_workflow_mod.json
\ No newline at end of file
diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl
index f35717da41e..55a27ab5605 100755
--- a/scripts/cnv_wdl/cnv_common_tasks.wdl
+++ b/scripts/cnv_wdl/cnv_common_tasks.wdl
@@ -1,48 +1,9 @@
-# Tasks common to both the CNV somatic panel and case workflows.
-#
-#############
-
-# Pad targets in the target file by the specified amount (this was found to improve sensitivity and specificity)
-task PadTargets {
- File targets
- Int? padding
- String gatk_jar
-
- # Runtime parameters
- Int? mem
- String gatk_docker
- Int? preemptible_attempts
- Int? disk_space_gb
-
- # Determine output filename
- String filename = select_first([targets, ""])
- String base_filename = basename(filename, ".tsv")
-
- command {
- java -Xmx${default="1" mem}g -jar ${gatk_jar} PadTargets \
- --targets ${targets} \
- --padding ${default="250" padding} \
- --output ${base_filename}.padded.tsv
- }
-
- runtime {
- docker: "${gatk_docker}"
- memory: select_first([mem, 2]) + " GB"
- disks: "local-disk " + select_first([disk_space_gb, 40]) + " HDD"
- preemptible: select_first([preemptible_attempts, 2])
- }
-
- output {
- File padded_targets = "${base_filename}.padded.tsv"
- }
-}
-
task PreprocessIntervals {
File? intervals
File ref_fasta_dict
Int? padding
Int? bin_length
- String gatk_jar
+ File? gatk4_jar_override
# Runtime parameters
Int? mem
@@ -54,21 +15,24 @@ task PreprocessIntervals {
String filename = select_first([intervals, "wgs"])
String base_filename = basename(filename, ".interval_list")
- command {
- java -Xmx${default="2" mem}g -jar ${gatk_jar} PreprocessIntervals \
+ command <<<
+ set -e
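+ # Use the jar bundled in the GATK docker image unless an override jar is supplied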
+ GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+ java -Xmx${default="2" mem}g -jar $GATK_JAR PreprocessIntervals \
${"-L " + intervals} \
- -sequenceDictionary ${ref_fasta_dict} \
+ --sequence-dictionary ${ref_fasta_dict} \
--padding ${default="250" padding} \
--binLength ${default="1000" bin_length} \
- --interval_merging_rule OVERLAPPING_ONLY \
+ --interval-merging-rule OVERLAPPING_ONLY \
--output ${base_filename}.preprocessed.interval_list
- }
+ >>>
runtime {
docker: "${gatk_docker}"
memory: select_first([mem, 2]) + " GB"
disks: "local-disk " + select_first([disk_space_gb, 40]) + " HDD"
- preemptible: select_first([preemptible_attempts, 2])
+ preemptible: select_first([preemptible_attempts, 5])
}
output {
@@ -76,47 +40,12 @@ task PreprocessIntervals {
}
}
-# Create a target file with GC annotations
-task AnnotateTargets {
- String entity_id
- File intervals
- File ref_fasta
- File ref_fasta_fai
- File ref_fasta_dict
- String gatk_jar
-
- # Runtime parameters
- Int? mem
- String gatk_docker
- Int? preemptible_attempts
- Int? disk_space_gb
-
- command {
- java -Xmx${default="4" mem}g -jar ${gatk_jar} AnnotateTargets \
- --targets ${intervals} \
- --reference ${ref_fasta} \
- --interval_merging_rule OVERLAPPING_ONLY \
- --output ${entity_id}.annotated.tsv
- }
-
- runtime {
- docker: "${gatk_docker}"
- memory: select_first([mem, 5]) + " GB"
- disks: "local-disk " + select_first([disk_space_gb, ceil(size(ref_fasta, "GB")) + 50]) + " HDD"
- preemptible: select_first([preemptible_attempts, 2])
- }
-
- output {
- File annotated_intervals = "${entity_id}.annotated.tsv"
- }
-}
-
task AnnotateIntervals {
File intervals
File ref_fasta
File ref_fasta_fai
File ref_fasta_dict
- String gatk_jar
+ File? gatk4_jar_override
# Runtime parameters
Int? mem
@@ -124,19 +53,22 @@ task AnnotateIntervals {
Int? preemptible_attempts
Int? disk_space_gb
- command {
- java -Xmx${default="4" mem}g -jar ${gatk_jar} AnnotateIntervals \
+ command <<<
+ set -e
+ GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+ java -Xmx${default="4" mem}g -jar $GATK_JAR AnnotateIntervals \
-L ${intervals} \
--reference ${ref_fasta} \
- --interval_merging_rule OVERLAPPING_ONLY \
+ --interval-merging-rule OVERLAPPING_ONLY \
--output annotated_intervals.tsv
- }
+ >>>
runtime {
docker: "${gatk_docker}"
memory: select_first([mem, 5]) + " GB"
disks: "local-disk " + select_first([disk_space_gb, ceil(size(ref_fasta, "GB")) + 50]) + " HDD"
- preemptible: select_first([preemptible_attempts, 2])
+ preemptible: select_first([preemptible_attempts, 5])
}
output {
@@ -144,91 +76,12 @@ task AnnotateIntervals {
}
}
-# Collect read counts for germline workflow (TSV output in target format)
-task CollectReadCounts {
- File? padded_targets
- File bam
- File bam_idx
- File ref_fasta
- File ref_fasta_fai
- File ref_fasta_dict
- Int? wgs_bin_length
- Boolean? keep_non_autosomes
- Boolean? disable_all_read_filters
- Boolean? disable_sequence_dictionary_validation
- Boolean? keep_duplicate_reads
- String gatk_jar
-
- # Runtime parameters
- Int? mem
- String gatk_docker
- Int? preemptible_attempts
- Int? disk_space_gb
-
- # If no padded target file is input, then do WGS workflow
- Boolean is_wgs = !defined(padded_targets)
-
- # Sample name is derived from the bam filename
- String base_filename = basename(bam, ".bam")
-
- String read_counts_tsv_filename = "${base_filename}.readCounts.tsv"
- String read_counts_hdf5_filename = if is_wgs then "${base_filename}.readCounts.hdf5" else ""
- String intervals_filename = if is_wgs then "${base_filename}.readCounts.intervals.tsv" else select_first([padded_targets, ""])
-
- command <<<
- if [ ${is_wgs} = true ]
- then
- java -Xmx${default="8" mem}g -jar ${gatk_jar} SparkGenomeReadCounts \
- --input ${bam} \
- --reference ${ref_fasta} \
- --binLength ${default="1000" wgs_bin_length} \
- --keepXYMT ${default="false" keep_non_autosomes} \
- --disable-tool-default-read-filters ${default="false" disable_all_read_filters} \
- --disable-sequence-dictionary-validation ${default="true" disable_sequence_dictionary_validation} \
- $(if [ ${default="true" keep_duplicate_reads} = true ]; then echo " --disable-read-filter NotDuplicateReadFilter "; else echo ""; fi) \
- --output ${read_counts_tsv_filename} \
- --writeHdf5
- else
- java -Xmx${default="4" mem}g -jar ${gatk_jar} CalculateTargetCoverage \
- --input ${bam} \
- --reference ${ref_fasta} \
- --targets ${padded_targets} \
- --groupBy SAMPLE \
- --transform RAW \
- --targetInformationColumns FULL \
- --interval-set-rule UNION \
- --interval-merging-rule OVERLAPPING_ONLY \
- --interval-padding 0 \
- --seconds-between-progress-updates 10.0 \
- --disable-tool-default-read-filters ${default="false" disable_all_read_filters} \
- --disable-sequence-dictionary-validation ${default="true" disable_sequence_dictionary_validation} \
- $(if [ ${default="true" keep_duplicate_reads} = true ]; then echo " --disable-read-filter NotDuplicateReadFilter "; else echo ""; fi) \
- --output ${read_counts_tsv_filename}
- fi
- >>>
-
- runtime {
- docker: "${gatk_docker}"
- memory: select_first([mem, 5]) + " GB"
- disks: "local-disk " + select_first([disk_space_gb, ceil(size(bam, "GB")) + 50]) + " HDD"
- preemptible: select_first([preemptible_attempts, 2])
- }
-
- output {
- String entity_id = base_filename
- File read_counts = read_counts_tsv_filename
- File read_counts_hdf5 = read_counts_hdf5_filename #"" if is_wgs = false
- File intervals = intervals_filename #padded_targets if is_wgs = false
- }
-}
-
-# Collect counts for ModelSegments workflow
task CollectCounts {
File intervals
File bam
File bam_idx
String? output_format
- String gatk_jar
+ File? gatk4_jar_override
# Runtime parameters
Int? mem
@@ -240,20 +93,23 @@ task CollectCounts {
String base_filename = basename(bam, ".bam")
String counts_filename = if !defined(output_format) then "${base_filename}.counts.hdf5" else "${base_filename}.counts.tsv"
- command {
- java -Xmx${default="8" mem}g -jar ${gatk_jar} CollectFragmentCounts \
+ command <<<
+ set -e
+ GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+ java -Xmx${default="8" mem}g -jar $GATK_JAR CollectFragmentCounts \
--input ${bam} \
-L ${intervals} \
--outputFormat ${default="HDF5" output_format} \
--interval-merging-rule OVERLAPPING_ONLY \
--output ${counts_filename}
- }
+ >>>
runtime {
docker: "${gatk_docker}"
memory: select_first([mem, 8]) + " GB"
disks: "local-disk " + select_first([disk_space_gb, ceil(size(bam, "GB")) + 50]) + " HDD"
- preemptible: select_first([preemptible_attempts, 2])
+ preemptible: select_first([preemptible_attempts, 5])
}
output {
@@ -262,7 +118,6 @@ task CollectCounts {
}
}
-# Collect allelic counts
task CollectAllelicCounts {
File common_sites
File bam
@@ -271,7 +126,7 @@ task CollectAllelicCounts {
File ref_fasta_fai
File ref_fasta_dict
Int? minimum_base_quality
- String gatk_jar
+ File? gatk4_jar_override
# Runtime parameters
Int? mem
@@ -279,61 +134,36 @@ task CollectAllelicCounts {
Int? preemptible_attempts
Int? disk_space_gb
+ # mem is given in GB, but the command and runtime memory values below are in MB
+ Int machine_mem = if defined(mem) then mem * 1000 else 13000
+ Int command_mem = machine_mem - 1000
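+ # e.g. mem = 8 gives machine_mem = 8000 and command_mem = 7000; with mem unset, 13000 and 12000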
+
# Sample name is derived from the bam filename
String base_filename = basename(bam, ".bam")
String allelic_counts_filename = "${base_filename}.allelicCounts.tsv"
- command {
- java -Xmx${default="8" mem}g -jar ${gatk_jar} CollectAllelicCounts \
+ command <<<
+ set -e
+ GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+ java -Xmx${command_mem}m -jar $GATK_JAR CollectAllelicCounts \
-L ${common_sites} \
--input ${bam} \
--reference ${ref_fasta} \
--minimumBaseQuality ${default="20" minimum_base_quality} \
--output ${allelic_counts_filename}
- }
+ >>>
runtime {
docker: "${gatk_docker}"
- memory: select_first([mem, 5]) + " GB"
+ memory: machine_mem + " MB"
disks: "local-disk " + select_first([disk_space_gb, ceil(size(bam, "GB")) + 50]) + " HDD"
- preemptible: select_first([preemptible_attempts, 2])
+ preemptible: select_first([preemptible_attempts, 5])
}
output {
String entity_id = base_filename
File allelic_counts = allelic_counts_filename
}
-}
-
-# Correct coverage profile(s) for sample-specific GC bias
-task CorrectGCBias {
- String entity_id
- File coverage # This can be either single-sample or multi-sample
- File annotated_intervals
- String gatk_jar
-
- # Runtime parameters
- Int? mem
- String gatk_docker
- Int? preemptible_attempts
- Int? disk_space_gb
-
- command {
- java -Xmx${default=4 mem}g -jar ${gatk_jar} CorrectGCBias \
- --input ${coverage} \
- --targets ${annotated_intervals} \
- --output ${entity_id}.gc_corrected.tsv
- }
-
- runtime {
- docker: "${gatk_docker}"
- memory: select_first([mem, 5]) + " GB"
- disks: "local-disk " + select_first([disk_space_gb, ceil(size(coverage, "GB"))+50]) + " HDD"
- preemptible: select_first([preemptible_attempts, 2])
- }
-
- output {
- File corrected_coverage = "${entity_id}.gc_corrected.tsv"
- }
}
\ No newline at end of file
diff --git a/scripts/cnv_wdl/germline/README.md b/scripts/cnv_wdl/germline/README.md
deleted file mode 100644
index 4a4f052233c..00000000000
--- a/scripts/cnv_wdl/germline/README.md
+++ /dev/null
@@ -1,86 +0,0 @@
-## Running the Germline CNV WDL
-
-### Which WDL should you use?
-- Building a panel of normals (PoN): ``cnv_germline_panel_workflow.wdl``
-- Calling events on a single normal sample: ``cnv_germline_single_sample_workflow.wdl``
-- Calling events on a cohort of normal samples: ``cnv_germline_cohort_workflow.wdl``
-
-#### Setting up parameter json file for a run
-
-To get started, copy the relevant ``*_template.json`` for the workflow you wish to run and adjust parameters accordingly.
-You can find all required resource inputs needed to run the workflows in the ``/resources`` directory. These inputs could be run out-of-the-box.
-
-*Please note that there are task-level parameters that do not appear in the template files. These are set to reasonable values by default, but can also be adjusted if desired.
-
-#### Fields of germline CNV panel of normals creation workflow
-
- ``CNVGermlinePanelWorkflow.sex_genotypes`` -- path to table of per-sample sex genotypes
- ``CNVGermlinePanelWorkflow.contig_ploidy_annotations`` -- path to the germline contig ploidy annotations table; located in ``/resources`` directory
- ``CNVGermlinePanelWorkflow.transition_prior_table`` -- path to copy number transition priors table; located in ``/resources`` directory
- ``CNVGermlinePanelWorkflow.transition_matrix_XY_Y`` -- path to copy number transition prior for Y contig for XY-genotyped samples; located in ``/resources`` directory
- ``CNVGermlinePanelWorkflow.transition_matrix_XX_X`` -- path to copy number transition prior for X contig for XX-genotyped samples; located in ``/resources`` directory
- ``CNVGermlinePanelWorkflow.transition_matrix_XY_X`` -- path to copy number transition prior for X contig for XY-genotyped samples; located in ``/resources`` directory
- ``CNVGermlinePanelWorkflow.transition_matrix_XX_Y`` -- path to copy number transition prior for Y contig for XX-genotyped samples; located in ``/resources`` directory
- ``CNVGermlinePanelWorkflow.transition_matrix_autosomal`` -- path to transition prior on autosomal loci; located in ``/resources`` directory,
- ``CNVGermlinePanelWorkflow.normal_bams_list`` -- TSV file consisting of corresponding bam and corresponding index files as described in cnv_germline_panel_workflow.wdl
- ``CNVGermlinePanelWorkflow.pon_output_path`` -- name of the final output directory
- ``CNVGermlinePanelWorkflow.num_latents`` -- (advanced) maximum number of principal components. Must be strictly less than the number of samples. The recommended value is 20 ~ 30 for large cohorts. For smaller cohorts, use 0.5 * number of samples. Unnecessary principal components are automatically pruned during PoN creation
- ``CNVGermlinePanelWorkflow.ref_fasta`` -- path to reference fasta file
- ``CNVGermlinePanelWorkflow.ref_fasta_dict`` -- path to reference dict file
- ``CNVGermlinePanelWorkflow.ref_fasta_fai`` -- path to reference fasta fai file
- ``CNVGermlinePanelWorkflow.gatk_jar`` -- absolute path to gatk.jar
- ``CNVGermlinePanelWorkflow.targets`` -- (optional) Target file (NOT in BED format) corresponding to the genomic loci of enriched targets in WES sample (e.g. Agilent, Illumina, etc). Please run ConvertBedToTargetFile to convert a BED file to a target file. If provided, then WES workflow will be run; otherwise, WGS workflow will be run
-
- In addition, there are several task-level parameters that may be set by advanced users; for example:
-
- - ``CNVGermlinePanelWorkflow.CollectReadCounts.wgs_bin_length`` -- Size of bins (in bp) for WGS coverage collection. *This must be the same value used for all samples.* Ignored if not running WGS.
- - ``CNVGermlinePanelWorkflow.PadTargets.padding`` -- Amount of padding (in bp) to add to both sides of targets for WES coverage collection. *This must be the same value used for all samples.* Ignored if not running WES.
-
- Further explanation of these task-level parameters may be found by invoking the ``--help`` documentation available in the gatk.jar for each tool.
-
-
-#### Fields of germline CNV single sample calling workflow
-
-The reference used must be the same between PoN and case samples.
-
- ``CNVGermlineSingleSampleWorkflow.sex_genotypes`` -- path to table of per-sample sex genotypes
- ``CNVGermlineSingleSampleWorkflow.contig_ploidy_annotations`` -- path to the germline contig ploidy annotations table; located in ``/resources`` directory
- ``CNVGermlineSingleSampleWorkflow.transition_prior_table`` -- path to copy number transition priors table; located in ``/resources`` directory
- ``CNVGermlineSingleSampleWorkflow.transition_matrix_XY_Y`` -- path to copy number transition prior for Y contig for XY-genotyped samples; located in ``/resources`` directory
- ``CNVGermlineSingleSampleWorkflow.transition_matrix_XX_X`` -- path to copy number transition prior for X contig for XX-genotyped samples; located in ``/resources`` directory
- ``CNVGermlineSingleSampleWorkflow.transition_matrix_XY_X`` -- path to copy number transition prior for X contig for XY-genotyped samples; located in ``/resources`` directory
- ``CNVGermlineSingleSampleWorkflow.transition_matrix_XX_Y`` -- path to copy number transition prior for Y contig for XX-genotyped samples; located in ``/resources`` directory
- ``CNVGermlineSingleSampleWorkflow.transition_matrix_autosomal`` -- path to transition prior on autosomal loci; located in ``/resources`` directory
- ``CNVGermlineSingleSampleWorkflow.output_path`` -- name of the final output directory
- ``CNVGermlineSingleSampleWorkflow.num_latents`` -- (advanced) maximum number of principal components. Must be strictly less than the number of samples. The recommended value is 20 ~ 30 for large cohorts. For smaller cohorts, use 0.5 * number of samples. Unnecessary principal components are automatically pruned during PoN creation
- ``CNVGermlineSingleSampleWorkflow.model_path`` -- absolute path of the PoN model (posterior_finals directory of the panel creation output)
- ``CNVGermlineSingleSampleWorkflow.normal_bam`` -- path to the normal bam file
- ``CNVGermlineSingleSampleWorkflow.normal_bam_idx`` -- path to the corresponding bam index file
- ``CNVGermlineSingleSampleWorkflow.ref_fasta`` -- path to reference fasta file
- ``CNVGermlineSingleSampleWorkflow.ref_fasta_dict`` -- path to reference dict file
- ``CNVGermlineSingleSampleWorkflow.ref_fasta_fai`` -- path to reference fasta fai file
- ``CNVGermlineSingleSampleWorkflow.gatk_jar`` -- absolute path to gatk.jar
- ``CNVGermlineSingleSampleWorkflow.targets`` -- (optional) Target file (NOT in BED format) corresponding to the genomic loci of enriched targets in WES sample (e.g. Agilent, Illumina, etc). Please run ConvertBedToTargetFile to convert a BED file to a target file. If provided, then WES workflow will be run; otherwise, WGS workflow will be run
-
-
-#### Fields of germline CNV cohort calling workflow
-
-The reference used must be the same between PoN and case samples.
-
- ``CNVGermlineCohortWorkflow.sex_genotypes`` -- path to table of per-sample sex genotypes
- ``CNVGermlineCohortWorkflow.contig_ploidy_annotations`` -- path to the germline contig ploidy annotations table; located in ``/resources`` directory
- ``CNVGermlineCohortWorkflow.transition_prior_table`` -- path to copy number transition priors table; located in ``/resources`` directory
- ``CNVGermlineCohortWorkflow.transition_matrix_XY_Y`` -- path to copy number transition prior for Y contig for XY-genotyped samples; located in ``/resources`` directory
- ``CNVGermlineCohortWorkflow.transition_matrix_XX_X`` -- path to copy number transition prior for X contig for XX-genotyped samples; located in ``/resources`` directory
- ``CNVGermlineCohortWorkflow.transition_matrix_XY_X`` -- path to copy number transition prior for X contig for XY-genotyped samples; located in ``/resources`` directory
- ``CNVGermlineCohortWorkflow.transition_matrix_XX_Y`` -- path to copy number transition prior for Y contig for XX-genotyped samples; located in ``/resources`` directory
- ``CNVGermlineCohortWorkflow.transition_matrix_autosomal`` -- path to transition prior on autosomal loci; located in ``/resources`` directory
- ``CNVGermlineCohortWorkflow.output_path`` -- name of the final output directory
- ``CNVGermlineCohortWorkflow.num_latents`` -- (advanced) maximum number of principal components. Must be strictly less than the number of samples. The recommended value is 20 ~ 30 for large cohorts. For smaller cohorts, use 0.5 * number of samples. Unnecessary principal components are automatically pruned during PoN creation
- ``CNVGermlineCohortWorkflow.model_path`` -- absolute path of the PoN model (posterior_finals directory of the panel creation output)
- ``CNVGermlineCohortWorkflow.normal_bams_list`` -- TSV file of bam files and their corresponding index files, as described in cnv_germline_cohort_workflow.wdl
- ``CNVGermlineCohortWorkflow.ref_fasta`` -- path to reference fasta file
- ``CNVGermlineCohortWorkflow.ref_fasta_dict`` -- path to reference dict file
- ``CNVGermlineCohortWorkflow.ref_fasta_fai`` -- path to reference fasta fai file
- ``CNVGermlineCohortWorkflow.gatk_jar`` -- absolute path to gatk.jar
- ``CNVGermlineCohortWorkflow.targets`` -- (optional) Target file (NOT in BED format) corresponding to the genomic loci of enriched targets in WES sample (e.g. Agilent, Illumina, etc). Please run ConvertBedToTargetFile to convert a BED file to a target file. If provided, then WES workflow will be run; otherwise, WGS workflow will be run
\ No newline at end of file
diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
deleted file mode 100755
index dafd517161f..00000000000
--- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
+++ /dev/null
@@ -1,89 +0,0 @@
-# This workflow is used for running germline CNV on a cohort of germline samples
-# Notes:
-#
-# -Basic sex genotype tab-separated table for homo sapiens must be formatted as follows (Refer to the Javadoc of SexGenotypeTableReader for full description):
-# SAMPLE_NAME SEX_GENOTYPE
-# sample_name_1 SEX_XX
-# sample_name_2 SEX_XY
-# sample_name_3 SEX_XY
-# sample_name_4 SEX_XX
-# Sex genotype identifiers (SEX_XX and SEX_XY in the above example) must match those in the tab-separated germline contig ploidy annotation table.
-# The latter is formatted as follows:
-# CONTIG CLASS SEX_XX SEX_XY
-# 1 AUTOSOMAL 2 2
-# 2 AUTOSOMAL 2 2
-# ... ... ... ...
-# X ALLOSOMAL 2 0
-# Y ALLOSOMAL 1 1
-#
-# - Input file (normal_bams_list) must contain file paths to bam and bam index files separated by tabs in the following format:
-# normal_bam_1 bam_idx_1
-# normal_bam_2 bam_idx_2
-#
-# - The target file (targets) is required for the WES workflow and should be a tab-separated file with the column headers:
-# contig start stop name
-# These targets will be padded on both sides by the amount specified by PadTargets.padding (default 250).
-#
-# - If a target file is not provided, then the WGS workflow will be run instead and the specified value of
-# wgs_bin_length (default 1000) will be used.
-#
-# - Example invocation:
-# java -jar cromwell.jar run cnv_germline_cohort_workflow.wdl myParameters.json
-# We recommend taking cnv_germline_cohort_workflow.json as a template json file and modifying it accordingly (please save
-# your modified version with a different filename and do not commit to the gatk repository).
-################
-
-
-import "cnv_germline_single_sample_workflow.wdl" as CNVGermlineSingleSampleWorkflow
-
-workflow CNVGermlineCohortWorkflow {
- # Workflow input files
- File? targets
- File normal_bams_list
- Array[Array[String]]+ normal_bams = read_tsv(normal_bams_list)
- File ref_fasta
- File ref_fasta_dict
- File ref_fasta_fai
- File sex_genotypes
- File contig_ploidy_annotations
- String gatk_jar
- String gatk_docker
-
- # Transition prior table files
- File transition_prior_table
- Array[File] copy_number_transition_prior_files
-
- # Model directory and parameters
- File model_path
- Int num_latents
-
- # Output path
- String output_path
-
- scatter (normal_bam in normal_bams) {
- call CNVGermlineSingleSampleWorkflow.CNVGermlineSingleSampleWorkflow as SingleSampleWorkflow {
- input:
- targets = targets,
- normal_bam = normal_bam[0],
- normal_bam_idx = normal_bam[1],
- ref_fasta = ref_fasta,
- ref_fasta_dict = ref_fasta_dict,
- ref_fasta_fai = ref_fasta_fai,
- sex_genotypes = sex_genotypes,
- contig_ploidy_annotations = contig_ploidy_annotations,
- gatk_jar = gatk_jar,
- gatk_docker = gatk_docker,
- transition_prior_table = transition_prior_table,
- copy_number_transition_prior_files = copy_number_transition_prior_files,
- model_path = model_path,
- output_path = output_path,
- num_latents = num_latents
- }
- }
-
- output {
- Array[Array[File]] posterior_files = SingleSampleWorkflow.posteriors
- Array[Array[File]] segment_files = SingleSampleWorkflow.segments
- }
-
-}
diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow_template.json b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow_template.json
deleted file mode 100755
index 152f4e62125..00000000000
--- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow_template.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
- "CNVGermlineCohortWorkflow.sex_genotypes": "File",
- "CNVGermlineCohortWorkflow.contig_ploidy_annotations": "File",
- "CNVGermlineCohortWorkflow.transition_prior_table": "File",
- "CNVGermlineCohortWorkflow.copy_number_transition_prior_files": "Array[File]",
- "CNVGermlineCohortWorkflow.output_path": "String",
- "CNVGermlineCohortWorkflow.num_latents": "Int",
- "CNVGermlineCohortWorkflow.model_path": "String",
- "CNVGermlineCohortWorkflow.normal_bams_list": "File",
- "CNVGermlineCohortWorkflow.ref_fasta": "File",
- "CNVGermlineCohortWorkflow.ref_fasta_dict": "File",
- "CNVGermlineCohortWorkflow.ref_fasta_fai": "File",
- "CNVGermlineCohortWorkflow.gatk_jar": "String",
- "CNVGermlineCohortWorkflow.targets": "(optional) File?"
-}
diff --git a/scripts/cnv_wdl/germline/cnv_germline_panel_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_panel_workflow.wdl
deleted file mode 100755
index 73c7d24f373..00000000000
--- a/scripts/cnv_wdl/germline/cnv_germline_panel_workflow.wdl
+++ /dev/null
@@ -1,211 +0,0 @@
-# Workflow for creating a panel of normals for germline CNV pipeline
-# Notes:
-#
-# -Basic sex genotype tab-separated table for homo sapiens must be formatted as follows (Refer to the Javadoc of SexGenotypeTableReader for full description):
-# SAMPLE_NAME SEX_GENOTYPE
-# sample_name_1 SEX_XX
-# sample_name_2 SEX_XY
-# sample_name_3 SEX_XY
-# sample_name_4 SEX_XX
-# Sex genotype identifiers (SEX_XX and SEX_XY in the above example) must match those in the tab-separated germline contig ploidy annotation table.
-# The latter is formatted as follows:
-# CONTIG CLASS SEX_XX SEX_XY
-# 1 AUTOSOMAL 2 2
-# 2 AUTOSOMAL 2 2
-# ... ... ... ...
-# X ALLOSOMAL 2 0
-# Y ALLOSOMAL 1 1
-#
-# - Input file (normal_bams_list) must contain file paths to bam and bam index files separated by tabs in the following format:
-# normal_bam_1 bam_idx_1
-# normal_bam_2 bam_idx_2
-#
-# - The target file (targets) is required for the WES workflow and should be a tab-separated file with the column headers:
-# contig start stop name
-# These targets will be padded on both sides by the amount specified by PadTargets.padding (default 250).
-#
-# - If a target file is not provided, then the WGS workflow will be run instead and the specified value of
-# wgs_bin_length (default 1000) will be used.
-#
-# - Example invocation:
-# java -jar cromwell.jar run cnv_germline_panel_workflow.wdl myParameters.json
-# We recommend taking cnv_germline_cohort_workflow.json as a template json file and modifying it accordingly (please save
-# your modified version with a different filename and do not commit to the gatk repository).
-##################
-
-import "cnv_common_tasks.wdl" as CNVTasks
-
-workflow CNVGermlinePanelWorkflow {
- # Workflow input files
- File? targets
- File normal_bams_list
- Array[Array[String]]+ normal_bams = read_tsv(normal_bams_list)
- File sex_genotypes
- File contig_ploidy_annotations
- File transition_prior_table
- Array[File] copy_number_transition_prior_files
- File ref_fasta
- File ref_fasta_dict
- File ref_fasta_fai
- String gatk_jar
- String gatk_docker
-
- # Model parameters
- Int num_latents
- # CombineReadCounts name
- String combined_entity_id = "combined_coverage"
- # Sex genotypes file name
- String sex_genotypes_entity_id = "sex_genotypes"
- # PoN output path
- String pon_output_path
- # If no target file is input, then do WGS workflow
- Boolean is_wgs = !defined(targets)
-
- if (!is_wgs) {
- call CNVTasks.PadTargets {
- input:
- # This is a bit of a hack. The task will fail if targets is not defined when it gets here.
- targets = select_first([targets, ""]),
- gatk_jar = gatk_jar,
- gatk_docker = gatk_docker
- }
- }
-
- scatter (normal_bam in normal_bams) {
- call CNVTasks.CollectReadCounts {
- input:
- padded_targets = PadTargets.padded_targets,
- keep_non_autosomes = true,
- bam = normal_bam[0],
- bam_idx = normal_bam[1],
- ref_fasta = ref_fasta,
- ref_fasta_fai = ref_fasta_fai,
- ref_fasta_dict = ref_fasta_dict,
- gatk_jar = gatk_jar,
- gatk_docker = gatk_docker
- }
- }
-
- call CombineReadCounts {
- input:
- combined_entity_id = combined_entity_id,
- coverage_file_list = CollectReadCounts.read_counts,
- gatk_jar = gatk_jar,
- gatk_docker = gatk_docker
- }
-
- call CNVTasks.AnnotateTargets {
- input:
- entity_id = combined_entity_id,
- intervals = CollectReadCounts.intervals[0],
- ref_fasta = ref_fasta,
- ref_fasta_fai = ref_fasta_fai,
- ref_fasta_dict = ref_fasta_dict,
- gatk_jar = gatk_jar,
- gatk_docker = gatk_docker
- }
-
- call CNVTasks.CorrectGCBias {
- input:
- entity_id = combined_entity_id,
- coverage = CombineReadCounts.combined_coverage,
- annotated_intervals = AnnotateTargets.annotated_intervals,
- gatk_jar = gatk_jar,
- gatk_docker = gatk_docker
- }
-
- call GermlineCNVCaller {
- input:
- coverage = CorrectGCBias.corrected_coverage,
- contig_ploidy_annotations = contig_ploidy_annotations,
- sex_genotypes = sex_genotypes,
- transition_prior_table = transition_prior_table,
- copy_number_transition_prior_files = copy_number_transition_prior_files,
- pon_output_path = pon_output_path,
- num_latents = num_latents,
- gatk_jar = gatk_jar,
- gatk_docker = gatk_docker
- }
-
- output {
- Array[File] posteriors = GermlineCNVCaller.posteriors
- Array[File] model = GermlineCNVCaller.model
- Array[File] segments = GermlineCNVCaller.segments
- }
-}
-
-# Combine sample-level coverage files into a single file
-task CombineReadCounts {
- String combined_entity_id
- Array[File]+ coverage_file_list
- Int? max_open_files
- String gatk_jar
-
- # Runtime parameters
- Int? mem
- String gatk_docker
- Int? preemptible_attempts
- Int? disk_space_gb
-
- command {
- java -Xmx${default=4 mem}g -jar ${gatk_jar} CombineReadCounts \
- --input ${sep=" --input " coverage_file_list} \
- --maxOpenFiles ${default=100 max_open_files} \
- --output ${combined_entity_id}.tsv
- }
-
- runtime {
- docker: "${gatk_docker}"
- memory: select_first([mem, 5]) + " GB"
- disks: "local-disk " + select_first([disk_space_gb, 150]) + " HDD"
- preemptible: select_first([preemptible_attempts, 2])
- }
-
- output {
- File combined_coverage = "${combined_entity_id}.tsv"
- }
-}
-
-# Learn the coverage model
-task GermlineCNVCaller {
- File coverage
- File contig_ploidy_annotations
- File sex_genotypes
- File transition_prior_table
- Array[File] copy_number_transition_prior_files
- String pon_output_path
- Int num_latents
- String gatk_jar
-
- # Runtime parameters
- Int? mem
- String gatk_docker
- Int? preemptible_attempts
- Int? disk_space_gb
-
- command {
- java -Xmx${default=4 mem}g -Ddtype=double -jar ${gatk_jar} GermlineCNVCaller \
- --input ${coverage} \
- --contigAnnotationsTable ${contig_ploidy_annotations} \
- --sexGenotypeTable ${sex_genotypes} \
- --copyNumberTransitionPriorTable ${transition_prior_table} \
- --outputPath ${pon_output_path} \
- --jobType LEARN_AND_CALL \
- --numLatents ${default=5 num_latents} \
- --rddCheckpointing false \
- --disableSpark true
- }
-
- runtime {
- docker: "${gatk_docker}"
- memory: select_first([mem, 5]) + " GB"
- disks: "local-disk " + select_first([disk_space_gb, 200]) + " HDD"
- preemptible: select_first([preemptible_attempts, 2])
- }
-
- output {
- Array[File] posteriors = glob("./${pon_output_path}/posteriors_final/*")
- Array[File] model = glob("./${pon_output_path}/model_final/*")
- Array[File] segments = glob("./${pon_output_path}/posteriors_final/segments/*")
- }
-}
diff --git a/scripts/cnv_wdl/germline/cnv_germline_panel_workflow_template.json b/scripts/cnv_wdl/germline/cnv_germline_panel_workflow_template.json
deleted file mode 100755
index 53ac97a4c5c..00000000000
--- a/scripts/cnv_wdl/germline/cnv_germline_panel_workflow_template.json
+++ /dev/null
@@ -1,14 +0,0 @@
-{
- "CNVGermlinePanelWorkflow.sex_genotypes": "File",
- "CNVGermlinePanelWorkflow.gatk_jar": "File",
- "CNVGermlinePanelWorkflow.contig_ploidy_annotations": "File",
- "CNVGermlinePanelWorkflow.targets": "(optional) File?",
- "CNVGermlinePanelWorkflow.normal_bams_list": "File",
- "CNVGermlinePanelWorkflow.num_latents": "Int",
- "CNVGermlinePanelWorkflow.pon_output_path": "String",
- "CNVGermlinePanelWorkflow.ref_fasta": "File",
- "CNVGermlinePanelWorkflow.ref_fasta_dict": "File",
- "CNVGermlinePanelWorkflow.ref_fasta_fai": "File",
- "CNVGermlinePanelWorkflow.transition_prior_table": "File",
- "CNVGermlinePanelWorkflow.copy_number_transition_prior_files": "Array[File]"
-}
diff --git a/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow.wdl
deleted file mode 100755
index a2d663d4623..00000000000
--- a/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow.wdl
+++ /dev/null
@@ -1,165 +0,0 @@
-# Subworkflow for running GATK germline CNV on a single BAM. Supports both WGS and WES samples.
-# Notes:
-#
-# -Basic sex genotype tab-separated table for homo sapiens must be formatted as follows (Refer to the Javadoc of SexGenotypeTableReader for full description):
-# SAMPLE_NAME SEX_GENOTYPE
-# sample_name_1 SEX_XX
-# sample_name_2 SEX_XY
-# sample_name_3 SEX_XY
-# sample_name_4 SEX_XX
-# Sex genotype identifiers (SEX_XX and SEX_XY in the above example) must match those in the tab-separated germline contig ploidy annotation table.
-# The latter is formatted as follows:
-# CONTIG CLASS SEX_XX SEX_XY
-# 1 AUTOSOMAL 2 2
-# 2 AUTOSOMAL 2 2
-# ... ... ... ...
-# X ALLOSOMAL 2 0
-# Y ALLOSOMAL 1 1
-#
-# - The target file (targets) is required for the WES workflow and should be a tab-separated file with the column headers:
-# contig start stop name
-# These targets will be padded on both sides by the amount specified by PadTargets.padding (default 250).
-#
-# - If a target file is not provided, then the WGS workflow will be run instead and the specified value of
-# wgs_bin_length (default 1000) will be used.
-#
-# - Example invocation:
-# java -jar cromwell.jar run cnv_germline_single_sample_workflow.wdl myParameters.json
-# We recommend taking cnv_germline_cohort_workflow.json as a template json file and modifying it accordingly (please save
-# your modified version with a different filename and do not commit to the gatk repository).
-################
-
-import "cnv_common_tasks.wdl" as CNVTasks
-
-workflow CNVGermlineSingleSampleWorkflow {
- # Workflow input files
- File? targets
- File normal_bam
- File normal_bam_idx
- File ref_fasta
- File ref_fasta_dict
- File ref_fasta_fai
- File sex_genotypes
- File contig_ploidy_annotations
- String gatk_jar
- String gatk_docker
-
-    # Transition prior table files
- File transition_prior_table
- Array[File] copy_number_transition_prior_files
-
- # Model directory and parameters
- File model_path
- Int num_latents
-
- # Output path
- String output_path
-
- # If no target file is input, then do WGS workflow
- Boolean is_wgs = !defined(targets)
-
- if (!is_wgs) {
- call CNVTasks.PadTargets {
- input:
- # The task will fail if targets is not defined when it gets here, but that should not be allowed to happen.
- targets = select_first([targets, ""]),
- gatk_jar = gatk_jar,
- gatk_docker = gatk_docker
- }
- }
-
- call CNVTasks.CollectReadCounts {
- input:
- padded_targets = PadTargets.padded_targets,
- bam = normal_bam,
- bam_idx = normal_bam_idx,
- ref_fasta = ref_fasta,
- ref_fasta_fai = ref_fasta_fai,
- ref_fasta_dict = ref_fasta_dict,
- gatk_jar = gatk_jar,
- gatk_docker = gatk_docker
- }
-
- call CNVTasks.AnnotateTargets {
- input:
- entity_id = CollectReadCounts.entity_id,
- intervals = CollectReadCounts.intervals,
- ref_fasta = ref_fasta,
- ref_fasta_fai = ref_fasta_fai,
- ref_fasta_dict = ref_fasta_dict,
- gatk_jar = gatk_jar,
- gatk_docker = gatk_docker
- }
-
- call CNVTasks.CorrectGCBias {
- input:
- entity_id = CollectReadCounts.entity_id,
- coverage = CollectReadCounts.read_counts,
- annotated_intervals = AnnotateTargets.annotated_intervals,
- gatk_jar = gatk_jar,
- gatk_docker = gatk_docker
- }
-
- call GermlineCNVCaller {
- input:
- coverage = CorrectGCBias.corrected_coverage,
- contig_ploidy_annotations = contig_ploidy_annotations,
- sex_genotypes = sex_genotypes,
- transition_prior_table = transition_prior_table,
- copy_number_transition_prior_files = copy_number_transition_prior_files,
- model_path = model_path,
- num_latents = num_latents,
- output_path = output_path,
- gatk_jar = gatk_jar,
- gatk_docker = gatk_docker
- }
-
- output {
- Array[File] posteriors = GermlineCNVCaller.posteriors
- Array[File] segments = GermlineCNVCaller.segments
- }
-}
-
-task GermlineCNVCaller {
- File coverage
- File contig_ploidy_annotations
- File sex_genotypes
- File transition_prior_table
- Array[File] copy_number_transition_prior_files
- String output_path
- File model_path
- Int num_latents
- String gatk_jar
-
- # Runtime parameters
- Int? mem
- String gatk_docker
- Int? preemptible_attempts
- Int? disk_space_gb
-
- command {
- java -Xmx${default=4 mem}g -Ddtype=double -jar ${gatk_jar} GermlineCNVCaller \
- --input ${coverage} \
- --inputModelPath ${model_path} \
- --contigAnnotationsTable ${contig_ploidy_annotations} \
- --sexGenotypeTable ${sex_genotypes} \
- --copyNumberTransitionPriorTable ${transition_prior_table} \
- --outputPath ${output_path} \
- --numLatents ${default=5 num_latents} \
- --jobType CALL_ONLY \
- --rddCheckpointing false \
- --disableSpark true
- }
-
- runtime {
- docker: "${gatk_docker}"
- memory: select_first([mem, 5]) + " GB"
- disks: "local-disk " + select_first([disk_space_gb, 200]) + " HDD"
- preemptible: select_first([preemptible_attempts, 2])
- }
-
- output {
- Array[File] posteriors = glob("./${output_path}/posteriors_final/*")
- Array[File] segments = glob("./${output_path}/posteriors_final/segments/*")
- }
-}
diff --git a/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow_template.json b/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow_template.json
deleted file mode 100755
index fe582d98ca0..00000000000
--- a/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow_template.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
- "CNVGermlineSingleSampleWorkflow.normal_bam": "File",
- "CNVGermlineSingleSampleWorkflow.normal_bam_idx": "File",
- "CNVGermlineSingleSampleWorkflow.num_latents": "Int",
- "CNVGermlineSingleSampleWorkflow.sex_genotypes": "File",
- "CNVGermlineSingleSampleWorkflow.ref_fasta": "File",
- "CNVGermlineSingleSampleWorkflow.ref_fasta_dict": "File",
- "CNVGermlineSingleSampleWorkflow.ref_fasta_fai": "File",
- "CNVGermlineSingleSampleWorkflow.model_path": "String",
- "CNVGermlineSingleSampleWorkflow.gatk_jar": "String",
- "CNVGermlineSingleSampleWorkflow.targets": "(optional) File?",
- "CNVGermlineSingleSampleWorkflow.contig_ploidy_annotations": "File",
- "CNVGermlineSingleSampleWorkflow.transition_prior_table": "File",
- "CNVGermlineSingleSampleWorkflow.copy_number_transition_prior_files": "Array[File]",
- "CNVGermlineSingleSampleWorkflow.output_path": "String",
-}
diff --git a/scripts/cnv_wdl/germline/resources/contig_annots.tsv b/scripts/cnv_wdl/germline/resources/contig_annots.tsv
deleted file mode 100755
index 406629d7c90..00000000000
--- a/scripts/cnv_wdl/germline/resources/contig_annots.tsv
+++ /dev/null
@@ -1,25 +0,0 @@
-CONTIG CLASS SEX_XX SEX_XY
-1 AUTOSOMAL 2 2
-2 AUTOSOMAL 2 2
-3 AUTOSOMAL 2 2
-4 AUTOSOMAL 2 2
-5 AUTOSOMAL 2 2
-6 AUTOSOMAL 2 2
-7 AUTOSOMAL 2 2
-8 AUTOSOMAL 2 2
-9 AUTOSOMAL 2 2
-10 AUTOSOMAL 2 2
-11 AUTOSOMAL 2 2
-12 AUTOSOMAL 2 2
-13 AUTOSOMAL 2 2
-14 AUTOSOMAL 2 2
-15 AUTOSOMAL 2 2
-16 AUTOSOMAL 2 2
-17 AUTOSOMAL 2 2
-18 AUTOSOMAL 2 2
-19 AUTOSOMAL 2 2
-20 AUTOSOMAL 2 2
-21 AUTOSOMAL 2 2
-22 AUTOSOMAL 2 2
-X ALLOSOMAL 2 1
-Y ALLOSOMAL 0 1
diff --git a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_priors.tsv b/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_priors.tsv
deleted file mode 100755
index 8f56c282bf2..00000000000
--- a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_priors.tsv
+++ /dev/null
@@ -1,25 +0,0 @@
-CONTIG SEX_XX SEX_XY
-1 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-2 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-3 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-4 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-5 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-6 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-7 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-8 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-9 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-10 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-11 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-12 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-13 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-14 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-15 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-16 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-17 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-18 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-19 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-20 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-21 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-22 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
-X homo_sapiens_germline_CN_transition_matrix_XX_X.tsv homo_sapiens_germline_CN_transition_matrix_XY_X.tsv
-Y homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv
diff --git a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv b/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv
deleted file mode 100755
index ea18c070fb0..00000000000
--- a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv
+++ /dev/null
@@ -1,7 +0,0 @@
-#The following germline copy number transition matrix is obtained from analyzing Genome STRiP calls on a cohort of 170 blood normal TCGA samples
-T_MATRIX_XX_X FROM_0 FROM_1 FROM_2 FROM_3 FROM_4
-TO_0 0.99966751443861601 0.0 4.6641935242897276e-08 0.0 0.0
-TO_1 0.0 0.9997899779920747 9.3423238818193651e-08 0.0 0.0
-TO_2 0.00033248556138398773 0.00021002200792527603 0.99999985473174158 4.541929579905158e-05 7.8833267638943636e-05
-TO_3 0.0 0.0 5.0172599663674365e-09 0.99995458070420096 0.0
-TO_4 0.0 0.0 1.8582444319879394e-10 0.0 0.99992116673236109
diff --git a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv b/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv
deleted file mode 100755
index f8d0228761f..00000000000
--- a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv
+++ /dev/null
@@ -1,3 +0,0 @@
-#A trivial transition matrix for enforcing zero ploidy on Y contig in XX samples
-T_MATRIX_XX_Y TO_0
-FROM_0 1.0
diff --git a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv b/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv
deleted file mode 100755
index 992f224460f..00000000000
--- a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv
+++ /dev/null
@@ -1,6 +0,0 @@
-#The following germline copy number transition matrix is obtained from analyzing Genome STRiP calls on a cohort of 170 blood normal TCGA samples
-T_MATRIX_XY_X FROM_0 FROM_1 FROM_2 FROM_3
-TO_0 0.99971173098797461 1.0067714836234777e-07 0.0 0.0
-TO_1 0.00028826901202540391 0.99999989259309574 7.456796468461193e-05 4.0420371867421184e-05
-TO_2 0.0 6.5615120089096975e-09 0.99992504963549644 8.0840743734842364e-06
-TO_3 0.0 1.6824389766435122e-10 3.8239981889544576e-07 0.99995149555375906
diff --git a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv b/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv
deleted file mode 100755
index 8c80b9eaa16..00000000000
--- a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv
+++ /dev/null
@@ -1,7 +0,0 @@
-#The following germline copy number transition matrix is obtained from analyzing Genome STRiP calls on a cohort of 170 blood normal TCGA samples
-T_MATRIX_XY_Y FROM_0 FROM_1 FROM_2 FROM_3 FROM_4
-TO_0 0.99966851990709416 5.9399783434370542e-08 0.0 0.0 0.0
-TO_1 0.00033148009290586881 0.99999937404917871 0.00016831138093714932 0.00035529148884256304 0.00027047913446676971
-TO_2 0.0 5.2251326738303193e-07 0.99983149068329535 6.4209305212511401e-06 0.0
-TO_3 0.0 3.4001255345191416e-08 1.9793576746822735e-07 0.99963614727046246 5.519982336056525e-06
-TO_4 0.0 1.0036515132014333e-08 0.0 2.1403101737503797e-06 0.99972400088319713
diff --git a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv b/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
deleted file mode 100755
index 5b0ef7c5cd6..00000000000
--- a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv
+++ /dev/null
@@ -1,13 +0,0 @@
-#The following germline copy number transition matrix is obtained from analyzing Genome STRiP calls on a cohort of 170 blood normal TCGA samples
-T_MATRIX_AUTOSOMAL FROM_0 FROM_1 FROM_2 FROM_3 FROM_4 FROM_5 FROM_6 FROM_7 FROM_8 FROM_9 FROM_10
-TO_0 0.9997389770672177 2.1467075095351557e-07 5.9100196666398515e-08 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
-TO_1 7.032645704368021e-07 0.99981467801052126 1.376696014985822e-07 1.1958483005083788e-08 0.0 0.0 0.0 0.0 0.0 0.0 0.0
-TO_2 0.00026031966821188487 0.00018510163208542175 0.99999972823891037 9.7745650462803608e-05 0.00010292959512329779 8.0138884782014782e-05 9.0232386606850211e-05 7.9777767777145916e-05 7.5782475345150855e-05 8.8396952830754563e-05 8.9130531663621367e-05
-TO_3 0.0 5.6866424093646516e-09 3.4570453537193609e-08 0.99990217064015618 6.1564638523662337e-08 1.1699107267447413e-07 4.469162288600803e-08 0.0 0.0 0.0 0.0
-TO_4 0.0 0.0 3.3588164155451664e-08 5.6802794274147987e-08 0.99989682090607845 1.1699107267447413e-07 2.4133476358444333e-06 0.0 0.0 0.0 0.0
-TO_5 0.0 0.0 2.8971721269891569e-09 1.1958483005083788e-08 1.2960976531297335e-08 0.99991953938976808 1.3407486865802407e-07 0.0 0.0 0.0 0.0
-TO_6 0.0 0.0 2.1348140599967544e-09 2.989620751270947e-09 1.7497318317251403e-07 8.7743304505855591e-08 0.99990713080764293 1.0766230469250462e-07 0.0 0.0 0.0
-TO_7 0.0 0.0 7.8350530879524278e-10 0.0 0.0 0.0 4.469162288600803e-08 0.99992011456991814 0.0 0.0 0.0
-TO_8 0.0 0.0 9.4105226022640503e-10 0.0 0.0 0.0 0.0 0.0 0.99992421752465488 0.0 0.0
-TO_9 0.0 0.0 5.9212277047953571e-11 0.0 0.0 0.0 0.0 0.0 0.0 0.9999116030471692 0.0
-TO_10 0.0 0.0 4.2294483605681122e-12 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.99991086946833641
diff --git a/scripts/cnv_wdl/somatic/README.md b/scripts/cnv_wdl/somatic/README.md
new file mode 100644
index 00000000000..61bffbb3474
--- /dev/null
+++ b/scripts/cnv_wdl/somatic/README.md
@@ -0,0 +1,52 @@
+## Running the Somatic CNV WDL
+
+### Which WDL should you use?
+
+- Building a panel of normals (PoN): ``cnv_somatic_panel_workflow.wdl``
+- Running a matched pair: ``cnv_somatic_pair_workflow.wdl``
+
+#### Setting up parameter json file for a run
+
+To get started, create the json template (using ``java -jar wdltool.jar inputs``) for the workflow you wish to run and adjust parameters accordingly.
+
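+For example, assuming ``wdltool.jar`` and ``cromwell.jar`` sit in the working directory (a sketch, not a required layout), the panel workflow template could be generated and the filled-in workflow later run as follows:
+
+```bash
+# Generate a skeleton inputs file for the panel workflow.
+java -jar wdltool.jar inputs cnv_somatic_panel_workflow.wdl > cnv_somatic_panel_workflow.json
+
+# After filling in the json, run the workflow through Cromwell.
+java -jar cromwell.jar run cnv_somatic_panel_workflow.wdl cnv_somatic_panel_workflow.json
+```
+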
+*Please note that there are optional workflow-level and task-level parameters that do not appear in the template file. These are set to reasonable values by default, but can also be adjusted if desired.*
+
+#### Required parameters in the somatic panel workflow
+
+The reference used must be the same between PoN and case samples.
+
+- ``CNVSomaticPanelWorkflow.gatk_docker`` -- GATK Docker image (e.g., ``broadinstitute/gatk:latest``).
+- ``CNVSomaticPanelWorkflow.intervals`` -- Picard or GATK-style interval list. For WGS, this should typically only include the autosomal chromosomes.
+- ``CNVSomaticPanelWorkflow.normal_bams_list`` -- TSV file of bam files and their corresponding index files, as described in cnv_somatic_panel_workflow.wdl.
+- ``CNVSomaticPanelWorkflow.pon_entity_id`` -- Name of the final PoN file.
+- ``CNVSomaticPanelWorkflow.ref_fasta_dict`` -- Path to reference dict file.
+- ``CNVSomaticPanelWorkflow.ref_fasta_fai`` -- Path to reference fasta fai file.
+- ``CNVSomaticPanelWorkflow.ref_fasta`` -- Path to reference fasta file.
+
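+As a minimal sketch, a filled-in json for the panel workflow might look like the following; every path and the PoN name below are hypothetical placeholders:
+
+```json
+{
+  "CNVSomaticPanelWorkflow.gatk_docker": "broadinstitute/gatk:latest",
+  "CNVSomaticPanelWorkflow.intervals": "/path/to/targets.interval_list",
+  "CNVSomaticPanelWorkflow.normal_bams_list": "/path/to/normal_bams.tsv",
+  "CNVSomaticPanelWorkflow.pon_entity_id": "my_wes_pon",
+  "CNVSomaticPanelWorkflow.ref_fasta": "/path/to/reference.fasta",
+  "CNVSomaticPanelWorkflow.ref_fasta_dict": "/path/to/reference.dict",
+  "CNVSomaticPanelWorkflow.ref_fasta_fai": "/path/to/reference.fasta.fai"
+}
+```
+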
+In addition, there are optional workflow-level and task-level parameters that may be set by advanced users; for example:
+
+- ``CNVSomaticPanelWorkflow.do_explicit_gc_correction`` -- (optional) If true, perform explicit GC-bias correction when creating the PoN and in subsequent denoising of case samples. If false, rely on PCA-based denoising to correct for GC bias.
+- ``CNVSomaticPanelWorkflow.PreprocessIntervals.bin_length`` -- Size of bins (in bp) for coverage collection. *This must be the same value used for all case samples.*
+- ``CNVSomaticPanelWorkflow.PreprocessIntervals.padding`` -- Amount of padding (in bp) to add to both sides of targets for WES coverage collection. *This must be the same value used for all case samples.*
+
+Further explanation of other task-level parameters may be found by invoking the ``--help`` documentation available in the gatk.jar for each tool.
+
+#### Required parameters in the somatic pair workflow
+
+The reference (and bins, if specified) used must be the same between PoN and case samples.
+
+- ``CNVSomaticPairWorkflow.common_sites`` -- Picard or GATK-style interval list of common sites to use for collecting allelic counts.
+- ``CNVSomaticPairWorkflow.gatk_docker`` -- GATK Docker image (e.g., ``broadinstitute/gatk:latest``).
+- ``CNVSomaticPairWorkflow.intervals`` -- Picard or GATK-style interval list. For WGS, this should typically only include the autosomal chromosomes.
+- ``CNVSomaticPairWorkflow.normal_bam`` -- File path or storage location (depending on backend) of the normal BAM file.
+- ``CNVSomaticPairWorkflow.normal_bam_idx`` -- File path or storage location (depending on backend) of the normal BAM file index.
+- ``CNVSomaticPairWorkflow.read_count_pon`` -- Path to read-count PoN created by the panel workflow.
+- ``CNVSomaticPairWorkflow.ref_fasta_dict`` -- Path to reference dict file.
+- ``CNVSomaticPairWorkflow.ref_fasta_fai`` -- Path to reference fasta fai file.
+- ``CNVSomaticPairWorkflow.ref_fasta`` -- Path to reference fasta file.
+- ``CNVSomaticPairWorkflow.tumor_bam`` -- File path or storage location (depending on backend) of the tumor BAM file.
+- ``CNVSomaticPairWorkflow.tumor_bam_idx`` -- File path or storage location (depending on backend) of the tumor BAM file index.
+
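+As with the panel workflow, a hedged sketch of a pair-workflow json covering these required parameters (all values are placeholders) might look like:
+
+```json
+{
+  "CNVSomaticPairWorkflow.common_sites": "/path/to/common_sites.interval_list",
+  "CNVSomaticPairWorkflow.gatk_docker": "broadinstitute/gatk:latest",
+  "CNVSomaticPairWorkflow.intervals": "/path/to/targets.interval_list",
+  "CNVSomaticPairWorkflow.normal_bam": "/path/to/normal.bam",
+  "CNVSomaticPairWorkflow.normal_bam_idx": "/path/to/normal.bai",
+  "CNVSomaticPairWorkflow.read_count_pon": "/path/to/pon.hdf5",
+  "CNVSomaticPairWorkflow.ref_fasta": "/path/to/reference.fasta",
+  "CNVSomaticPairWorkflow.ref_fasta_dict": "/path/to/reference.dict",
+  "CNVSomaticPairWorkflow.ref_fasta_fai": "/path/to/reference.fasta.fai",
+  "CNVSomaticPairWorkflow.tumor_bam": "/path/to/tumor.bam",
+  "CNVSomaticPairWorkflow.tumor_bam_idx": "/path/to/tumor.bai"
+}
+```
+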
+In addition, there are several task-level parameters that may be set by advanced users, as above.
+
+Further explanation of these task-level parameters may be found by invoking the ``--help`` documentation available in the gatk.jar for each tool.
\ No newline at end of file
diff --git a/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl b/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl
new file mode 100644
index 00000000000..35e153d24cc
--- /dev/null
+++ b/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl
@@ -0,0 +1,506 @@
+# Workflow for running the GATK CNV pipeline on a matched pair. Supports both WGS and WES.
+#
+# Notes:
+#
+# - The interval-list file is required for both WGS and WES workflows and should be a Picard or GATK-style interval list.
+# These intervals will be padded on both sides by the amount specified by PreprocessIntervals.padding (default 250)
+# and split into bins of length specified by PreprocessIntervals.bin_length (default 1000; specify 0 to skip binning).
+# For WGS, the intervals should simply cover the autosomal chromosomes (sex chromosomes may be included, but care
+# should be taken to 1) avoid creating panels of mixed sex, and 2) denoise case samples only with panels containing
+# individuals of the same sex as the case samples).
+#
+# - The sites file (common_sites) should be a Picard or GATK-style interval list. This is a list of sites
+# of known variation at which allelic counts will be collected for use in modeling minor-allele fractions.
+#
+# - Example invocation:
+# java -jar cromwell.jar run cnv_somatic_pair_workflow.wdl myParameters.json
+# See cnv_somatic_pair_workflow_template.json for a template json file to modify with your own parameters (please save
+# your modified version with a different filename and do not commit to the gatk repository).
+#
+#############
+
+import "cnv_common_tasks.wdl" as CNVTasks
+
+workflow CNVSomaticPairWorkflow {
+ File common_sites
+ File intervals
+ File tumor_bam
+ File tumor_bam_idx
+ File normal_bam
+ File normal_bam_idx
+ File read_count_pon
+ File ref_fasta_dict
+ File ref_fasta_fai
+ File ref_fasta
+ String gatk_docker
+ File? gatk4_jar_override
+
+    # Use as a last resort to increase the disk given to every task in case of ill-behaved data
+ Int? emergency_extra_disk
+
+ Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_fasta_dict, "GB") + size(ref_fasta_fai, "GB"))
+ Int read_count_pon_size = ceil(size(read_count_pon, "GB"))
+ Int tumor_bam_size = ceil(size(tumor_bam, "GB") + size(tumor_bam_idx, "GB"))
+ Int normal_bam_size = ceil(size(normal_bam, "GB") + size(normal_bam_idx, "GB"))
+
+ Int gatk4_override_size = if defined(gatk4_jar_override) then ceil(size(gatk4_jar_override, "GB")) else 0
+    # This is added to every task as padding; increase it if you systematically need more disk for every call
+ Int disk_pad = 20 + ceil(size(intervals, "GB")) + ceil(size(common_sites, "GB")) + gatk4_override_size + select_first([emergency_extra_disk,0])
+
+ Int process_disk = ref_size + disk_pad
+ call CNVTasks.PreprocessIntervals {
+ input:
+ intervals = intervals,
+ ref_fasta_dict = ref_fasta_dict,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = process_disk
+ }
+
+ Int collect_counts_tumor_disk = tumor_bam_size + ceil(size(PreprocessIntervals.preprocessed_intervals, "GB")) + disk_pad
+ call CNVTasks.CollectCounts as CollectCountsTumor {
+ input:
+ intervals = PreprocessIntervals.preprocessed_intervals,
+ bam = tumor_bam,
+ bam_idx = tumor_bam_idx,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = collect_counts_tumor_disk
+ }
+
+ Int collect_counts_normal_disk = normal_bam_size + ceil(size(PreprocessIntervals.preprocessed_intervals, "GB")) + disk_pad
+ call CNVTasks.CollectCounts as CollectCountsNormal {
+ input:
+ intervals = PreprocessIntervals.preprocessed_intervals,
+ bam = normal_bam,
+ bam_idx = normal_bam_idx,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = collect_counts_normal_disk
+ }
+
+ Int collect_allelic_counts_tumor_disk = tumor_bam_size + ref_size + disk_pad
+ call CNVTasks.CollectAllelicCounts as CollectAllelicCountsTumor {
+ input:
+ common_sites = common_sites,
+ bam = tumor_bam,
+ bam_idx = tumor_bam_idx,
+ ref_fasta = ref_fasta,
+ ref_fasta_dict = ref_fasta_dict,
+ ref_fasta_fai = ref_fasta_fai,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = collect_allelic_counts_tumor_disk
+ }
+
+ Int collect_allelic_counts_normal_disk = normal_bam_size + ref_size + disk_pad
+ call CNVTasks.CollectAllelicCounts as CollectAllelicCountsNormal {
+ input:
+ common_sites = common_sites,
+ bam = normal_bam,
+ bam_idx = normal_bam_idx,
+ ref_fasta = ref_fasta,
+ ref_fasta_dict = ref_fasta_dict,
+ ref_fasta_fai = ref_fasta_fai,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = collect_allelic_counts_normal_disk
+ }
+
+ Int denoise_read_counts_tumor_disk = read_count_pon_size + ceil(size(CollectCountsTumor.counts, "GB")) + disk_pad
+ call DenoiseReadCounts as DenoiseReadCountsTumor {
+ input:
+ entity_id = CollectCountsTumor.entity_id,
+ read_counts = CollectCountsTumor.counts,
+ read_count_pon = read_count_pon,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = denoise_read_counts_tumor_disk
+ }
+
+ Int denoise_read_counts_normal_disk = read_count_pon_size + ceil(size(CollectCountsNormal.counts, "GB")) + disk_pad
+ call DenoiseReadCounts as DenoiseReadCountsNormal {
+ input:
+ entity_id = CollectCountsNormal.entity_id,
+ read_counts = CollectCountsNormal.counts,
+ read_count_pon = read_count_pon,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = denoise_read_counts_normal_disk
+ }
+
+ Int model_segments_disk = ceil(size(DenoiseReadCountsTumor.denoised_copy_ratios, "GB")) + ceil(size(CollectAllelicCountsTumor.allelic_counts, "GB")) + ceil(size(CollectAllelicCountsNormal.allelic_counts, "GB")) + disk_pad
+ call ModelSegments as ModelSegmentsTumor {
+ input:
+ entity_id = CollectCountsTumor.entity_id,
+ denoised_copy_ratios = DenoiseReadCountsTumor.denoised_copy_ratios,
+ allelic_counts = CollectAllelicCountsTumor.allelic_counts,
+ normal_allelic_counts = CollectAllelicCountsNormal.allelic_counts,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = model_segments_disk
+ }
+
+ call ModelSegments as ModelSegmentsNormal {
+ input:
+ entity_id = CollectCountsNormal.entity_id,
+ denoised_copy_ratios = DenoiseReadCountsNormal.denoised_copy_ratios,
+ allelic_counts = CollectAllelicCountsNormal.allelic_counts,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = model_segments_disk
+ }
+
+ Int copy_ratio_segments_tumor_disk = ceil(size(DenoiseReadCountsTumor.denoised_copy_ratios, "GB")) + ceil(size(ModelSegmentsTumor.copy_ratio_only_segments, "GB")) + disk_pad
+ call CallCopyRatioSegments as CallCopyRatioSegmentsTumor {
+ input:
+ entity_id = CollectCountsTumor.entity_id,
+ copy_ratio_segments = ModelSegmentsTumor.copy_ratio_only_segments,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = copy_ratio_segments_tumor_disk
+ }
+
+ Int copy_ratio_segments_normal_disk = ceil(size(DenoiseReadCountsNormal.denoised_copy_ratios, "GB")) + ceil(size(ModelSegmentsNormal.copy_ratio_only_segments, "GB")) + disk_pad
+ call CallCopyRatioSegments as CallCopyRatioSegmentsNormal {
+ input:
+ entity_id = CollectCountsNormal.entity_id,
+ copy_ratio_segments = ModelSegmentsNormal.copy_ratio_only_segments,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = copy_ratio_segments_normal_disk
+ }
+
+    # The files from other tasks are small enough to just combine into one disk variable and pass to the tumor plotting tasks
+ Int plot_tumor_disk = ref_size + ceil(size(DenoiseReadCountsTumor.standardized_copy_ratios, "GB")) + ceil(size(DenoiseReadCountsTumor.denoised_copy_ratios, "GB")) + ceil(size(ModelSegmentsTumor.het_allelic_counts, "GB")) + ceil(size(ModelSegmentsTumor.modeled_segments, "GB")) + disk_pad
+ call PlotDenoisedCopyRatios as PlotDenoisedCopyRatiosTumor {
+ input:
+ entity_id = CollectCountsTumor.entity_id,
+ standardized_copy_ratios = DenoiseReadCountsTumor.standardized_copy_ratios,
+ denoised_copy_ratios = DenoiseReadCountsTumor.denoised_copy_ratios,
+ ref_fasta_dict = ref_fasta_dict,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = plot_tumor_disk
+ }
+ # The files from other tasks are small enough to just combine into one disk variable and pass to the normal plotting tasks
+ Int plot_normal_disk = ref_size + ceil(size(DenoiseReadCountsNormal.standardized_copy_ratios, "GB")) + ceil(size(DenoiseReadCountsNormal.denoised_copy_ratios, "GB")) + ceil(size(ModelSegmentsNormal.het_allelic_counts, "GB")) + ceil(size(ModelSegmentsNormal.modeled_segments, "GB")) + disk_pad
+ call PlotDenoisedCopyRatios as PlotDenoisedCopyRatiosNormal {
+ input:
+ entity_id = CollectCountsNormal.entity_id,
+ standardized_copy_ratios = DenoiseReadCountsNormal.standardized_copy_ratios,
+ denoised_copy_ratios = DenoiseReadCountsNormal.denoised_copy_ratios,
+ ref_fasta_dict = ref_fasta_dict,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = plot_normal_disk
+ }
+
+ call PlotModeledSegments as PlotModeledSegmentsTumor {
+ input:
+ entity_id = CollectCountsTumor.entity_id,
+ denoised_copy_ratios = DenoiseReadCountsTumor.denoised_copy_ratios,
+ het_allelic_counts = ModelSegmentsTumor.het_allelic_counts,
+ modeled_segments = ModelSegmentsTumor.modeled_segments,
+ ref_fasta_dict = ref_fasta_dict,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = plot_tumor_disk
+ }
+
+ call PlotModeledSegments as PlotModeledSegmentsNormal {
+ input:
+ entity_id = CollectCountsNormal.entity_id,
+ denoised_copy_ratios = DenoiseReadCountsNormal.denoised_copy_ratios,
+ het_allelic_counts = ModelSegmentsNormal.het_allelic_counts,
+ modeled_segments = ModelSegmentsNormal.modeled_segments,
+ ref_fasta_dict = ref_fasta_dict,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ disk_space_gb = plot_normal_disk
+ }
+}
+
+task DenoiseReadCounts {
+ String entity_id
+ File read_counts
+ File read_count_pon
+ Int? number_of_eigensamples #use all eigensamples in panel by default
+ File? gatk4_jar_override
+
+ # Runtime parameters
+ Int? mem
+ String gatk_docker
+ Int? preemptible_attempts
+ Int disk_space_gb
+
+ # Mem is in units of GB but our command and memory runtime values are in MB
+ Int machine_mem = if defined(mem) then mem * 1000 else 13000
+ Int command_mem = machine_mem - 1000
+
+ command <<<
+ set -e
+ GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+ java -Xmx${command_mem}m -jar $GATK_JAR DenoiseReadCounts \
+ --input ${read_counts} \
+ --readCountPanelOfNormals ${read_count_pon} \
+ ${"--numberOfEigensamples " + number_of_eigensamples} \
+ --standardizedCopyRatios ${entity_id}.standardizedCR.tsv \
+ --denoisedCopyRatios ${entity_id}.denoisedCR.tsv
+ >>>
+
+ runtime {
+ docker: "${gatk_docker}"
+ memory: machine_mem + " MB"
+ disks: "local-disk " + disk_space_gb + " HDD"
+ preemptible: select_first([preemptible_attempts, 5])
+ }
+
+ output {
+ File standardized_copy_ratios = "${entity_id}.standardizedCR.tsv"
+ File denoised_copy_ratios = "${entity_id}.denoisedCR.tsv"
+ }
+}
+
+task ModelSegments {
+ String entity_id
+ File denoised_copy_ratios
+ File allelic_counts
+ File? normal_allelic_counts
+ Int? max_num_segments_per_chromosome
+ Int? min_total_allele_count
+ Float? genotyping_homozygous_log_ratio_threshold
+ Float? genotyping_base_error_rate
+ Float? kernel_variance_copy_ratio
+ Float? kernel_variance_allele_fraction
+ Float? kernel_scaling_allele_fraction
+ Int? kernel_approximation_dimension
+ Array[Int]+? window_sizes = [8, 16, 32, 64, 128, 256]
+ Float? num_changepoints_penalty_factor
+ Float? minor_allele_fraction_prior_alpha
+ Int? num_samples_copy_ratio
+ Int? num_burn_in_copy_ratio
+ Int? num_samples_allele_fraction
+ Int? num_burn_in_allele_fraction
+ Float? smoothing_threshold_copy_ratio
+ Float? smoothing_threshold_allele_fraction
+ Int? max_num_smoothing_iterations
+ Int? num_smoothing_iterations_per_fit
+ String? output_dir
+ File? gatk4_jar_override
+
+ # Runtime parameters
+ Int? mem
+ String gatk_docker
+ Int? preemptible_attempts
+ Int disk_space_gb
+
+ # Mem is in units of GB but our command and memory runtime values are in MB
+ Int machine_mem = if defined(mem) then mem * 1000 else 13000
+ # ModelSegments seems to need at least 3GB of overhead to run
+ Int command_mem = machine_mem - 3000
+
+ # If optional output_dir not specified, use "out"
+ String output_dir_ = select_first([output_dir, "out"])
+
+ command <<<
+ set -e
+ mkdir ${output_dir_}
+ GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+ java -Xmx${command_mem}m -jar $GATK_JAR ModelSegments \
+ --denoisedCopyRatios ${denoised_copy_ratios} \
+ --allelicCounts ${allelic_counts} \
+ ${"--normalAllelicCounts " + normal_allelic_counts} \
+ --maxNumSegmentsPerChromosome ${default="500" max_num_segments_per_chromosome} \
+ --minTotalAlleleCount ${default="30" min_total_allele_count} \
+ --genotypingHomozygousLogRatioThreshold ${default="-10.0" genotyping_homozygous_log_ratio_threshold} \
+ --genotypingBaseErrorRate ${default="0.05" genotyping_base_error_rate} \
+ --kernelVarianceCopyRatio ${default="0.0" kernel_variance_copy_ratio} \
+ --kernelVarianceAlleleFraction ${default="0.025" kernel_variance_allele_fraction} \
+ --kernelScalingAlleleFraction ${default="1.0" kernel_scaling_allele_fraction} \
+ --kernelApproximationDimension ${default="100" kernel_approximation_dimension} \
+ --windowSize ${sep= " --windowSize " window_sizes} \
+ --numChangepointsPenaltyFactor ${default="1.0" num_changepoints_penalty_factor} \
+ --minorAlleleFractionPriorAlpha ${default="25.0" minor_allele_fraction_prior_alpha} \
+ --numSamplesCopyRatio ${default=100 num_samples_copy_ratio} \
+ --numBurnInCopyRatio ${default=50 num_burn_in_copy_ratio} \
+ --numSamplesAlleleFraction ${default=100 num_samples_allele_fraction} \
+ --numBurnInAlleleFraction ${default=50 num_burn_in_allele_fraction} \
+ --smoothingThresholdCopyRatio ${default="2.0" smoothing_threshold_copy_ratio} \
+ --smoothingThresholdAlleleFraction ${default="2.0" smoothing_threshold_allele_fraction} \
+ --maxNumSmoothingIterations ${default=10 max_num_smoothing_iterations} \
+ --numSmoothingIterationsPerFit ${default=0 num_smoothing_iterations_per_fit} \
+ --output ${output_dir_} \
+ --outputPrefix ${entity_id}
+
+        # We need to create the file even if the above command doesn't, so that we have something to delocalize.
+        # If the command does not create the file, an empty file will be copied out.
+ touch ${output_dir_}/${entity_id}.hets.normal.tsv
+ >>>
+
+ runtime {
+ docker: "${gatk_docker}"
+ memory: machine_mem + " MB"
+ disks: "local-disk " + disk_space_gb + " HDD"
+ preemptible: select_first([preemptible_attempts, 5])
+ }
+
+ output {
+ File het_allelic_counts = "${output_dir_}/${entity_id}.hets.tsv"
+ File normal_het_allelic_counts = "${output_dir_}/${entity_id}.hets.normal.tsv"
+ File copy_ratio_only_segments = "${output_dir_}/${entity_id}.cr.seg"
+ File modeled_segments_begin = "${output_dir_}/${entity_id}.modelBegin.seg"
+ File copy_ratio_parameters_begin = "${output_dir_}/${entity_id}.modelBegin.cr.param"
+ File allele_fraction_parameters_begin = "${output_dir_}/${entity_id}.modelBegin.af.param"
+ File modeled_segments = "${output_dir_}/${entity_id}.modelFinal.seg"
+ File copy_ratio_parameters = "${output_dir_}/${entity_id}.modelFinal.cr.param"
+ File allele_fraction_parameters = "${output_dir_}/${entity_id}.modelFinal.af.param"
+ }
+}
+
+task CallCopyRatioSegments {
+ String entity_id
+ File copy_ratio_segments
+ Float? neutral_segment_copy_ratio_threshold
+ Float? outlier_neutral_segment_copy_ratio_z_score_threshold
+ Float? calling_copy_ratio_z_score_threshold
+ File? gatk4_jar_override
+
+ # Runtime parameters
+ Int? mem
+ String gatk_docker
+ Int? preemptible_attempts
+ Int disk_space_gb
+
+ # Mem is in units of GB but our command and memory runtime values are in MB
+ Int machine_mem = if defined(mem) then mem * 1000 else 7000
+ Int command_mem = machine_mem - 1000
+
+ command <<<
+ set -e
+ GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+ java -Xmx${command_mem}m -jar $GATK_JAR CallCopyRatioSegments \
+ --input ${copy_ratio_segments} \
+ --neutralSegmentCopyRatioThreshold ${default="0.1" neutral_segment_copy_ratio_threshold} \
+ --outlierNeutralSegmentCopyRatioZScoreThreshold ${default="2.0" outlier_neutral_segment_copy_ratio_z_score_threshold} \
+ --callingCopyRatioZScoreThreshold ${default="2.0" calling_copy_ratio_z_score_threshold} \
+ --output ${entity_id}.called.seg
+ >>>
+
+ runtime {
+ docker: "${gatk_docker}"
+ memory: machine_mem + " MB"
+ disks: "local-disk " + disk_space_gb + " HDD"
+ preemptible: select_first([preemptible_attempts, 5])
+ }
+
+ output {
+ File called_copy_ratio_segments = "${entity_id}.called.seg"
+ }
+}
+
+task PlotDenoisedCopyRatios {
+ String entity_id
+ File standardized_copy_ratios
+ File denoised_copy_ratios
+ File ref_fasta_dict
+ Int? minimum_contig_length
+ String? output_dir
+ File? gatk4_jar_override
+
+ # Runtime parameters
+ Int? mem
+ String gatk_docker
+ Int? preemptible_attempts
+ Int disk_space_gb
+
+ # Mem is in units of GB but our command and memory runtime values are in MB
+ Int machine_mem = if defined(mem) then mem * 1000 else 7000
+ Int command_mem = machine_mem - 1000
+
+ # If optional output_dir not specified, use "out"
+ String output_dir_ = select_first([output_dir, "out"])
+
+ command <<<
+ set -e
+ mkdir ${output_dir_}
+ GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+ java -Xmx${command_mem}m -jar $GATK_JAR PlotDenoisedCopyRatios \
+ --standardizedCopyRatios ${standardized_copy_ratios} \
+ --denoisedCopyRatios ${denoised_copy_ratios} \
+ -SD ${ref_fasta_dict} \
+ --minimumContigLength ${default="1000000" minimum_contig_length} \
+ --output ${output_dir_} \
+ --outputPrefix ${entity_id}
+ >>>
+
+ runtime {
+ docker: "${gatk_docker}"
+ memory: machine_mem + " MB"
+ disks: "local-disk " + disk_space_gb + " HDD"
+ preemptible: select_first([preemptible_attempts, 5])
+ }
+
+ output {
+ File denoised_copy_ratios_plot = "${output_dir_}/${entity_id}.denoised.png"
+ File denoised_copy_ratios_lim_4_plot = "${output_dir_}/${entity_id}.denoisedLimit4.png"
+ File standardized_MAD = "${output_dir_}/${entity_id}.standardizedMAD.txt"
+ File denoised_MAD = "${output_dir_}/${entity_id}.denoisedMAD.txt"
+ File delta_MAD = "${output_dir_}/${entity_id}.deltaMAD.txt"
+ File scaled_delta_MAD = "${output_dir_}/${entity_id}.scaledDeltaMAD.txt"
+ }
+}
+
+task PlotModeledSegments {
+ String entity_id
+ File denoised_copy_ratios
+ File het_allelic_counts
+ File modeled_segments
+ File ref_fasta_dict
+ Int? minimum_contig_length
+ String? output_dir
+ File? gatk4_jar_override
+
+ # Runtime parameters
+ Int? mem
+ String gatk_docker
+ Int? preemptible_attempts
+ Int disk_space_gb
+
+ # Mem is in units of GB but our command and memory runtime values are in MB
+ Int machine_mem = if defined(mem) then mem * 1000 else 7000
+ Int command_mem = machine_mem - 1000
+
+ # If optional output_dir not specified, use "out"
+ String output_dir_ = select_first([output_dir, "out"])
+
+ command <<<
+ set -e
+ mkdir ${output_dir_}
+ GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+ java -Xmx${command_mem}m -jar $GATK_JAR PlotModeledSegments \
+ --denoisedCopyRatios ${denoised_copy_ratios} \
+ --allelicCounts ${het_allelic_counts} \
+ --segments ${modeled_segments} \
+ -SD ${ref_fasta_dict} \
+ --minimumContigLength ${default="1000000" minimum_contig_length} \
+ --output ${output_dir_} \
+ --outputPrefix ${entity_id}
+ >>>
+
+ runtime {
+ docker: "${gatk_docker}"
+ memory: machine_mem + " MB"
+ disks: "local-disk " + disk_space_gb + " HDD"
+ preemptible: select_first([preemptible_attempts, 5])
+ }
+
+ output {
+ File modeled_segments_plot = "${output_dir_}/${entity_id}.modeled.png"
+ }
+}
diff --git a/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl b/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl
new file mode 100644
index 00000000000..ec83f4a75d2
--- /dev/null
+++ b/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl
@@ -0,0 +1,137 @@
+# Workflow for creating a GATK CNV Panel of Normals given a list of normal samples. Supports both WGS and WES.
+#
+# Notes:
+#
+# - Input file (normal_bams_list) must contain file paths to bam and bam index files separated by tabs in the following format:
+# normal_bam_1 bam_idx_1
+# normal_bam_2 bam_idx_2
+# ...
+#
+# - The interval-list file is required for both WGS and WES workflows and should be a Picard or GATK-style interval list.
+# These intervals will be padded on both sides by the amount specified by PreprocessIntervals.padding (default 250)
+# and split into bins of length specified by PreprocessIntervals.bin_length (default 1000; specify 0 to skip binning).
+# For WGS, the intervals should simply cover the autosomal chromosomes. Sex chromosomes may be included, but care
+# should be taken to 1) avoid creating panels of mixed sex and 2) denoise case samples only with panels containing
+# individuals of the same sex as the case samples.
+#
+# - Example invocation:
+# java -jar cromwell.jar run cnv_somatic_panel_workflow.wdl myParameters.json
+# See cnv_somatic_panel_workflow_template.json for a template json file to modify with your own parameters (please save
+# your modified version with a different filename and do not commit it to the gatk repository).
+#
+#############
+
+import "cnv_common_tasks.wdl" as CNVTasks
+
+workflow CNVSomaticPanelWorkflow {
+ File intervals
+ File normal_bams_list
+ Array[Array[String]]+ normal_bams = read_tsv(normal_bams_list)
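+ # read_tsv yields one row per line of normal_bams_list; the trailing + marks the array type as non-empty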
+ String pon_entity_id
+ File ref_fasta_dict
+ File ref_fasta_fai
+ File ref_fasta
+ String gatk_docker
+ File? gatk4_jar_override
+ Int? mem_for_create_read_count_pon
+
+ # If true, AnnotateIntervals will be run to create GC annotations and explicit GC correction
+ # will be performed by the PoN generated by CreateReadCountPanelOfNormals before PCA is performed on subsequent cases
+ Boolean? do_explicit_gc_correction
+
+ call CNVTasks.PreprocessIntervals {
+ input:
+ intervals = intervals,
+ ref_fasta_dict = ref_fasta_dict,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker
+ }
+
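+ # select_first defaults the optional Boolean to false, so AnnotateIntervals runs only when explicitly requested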
+ if (select_first([do_explicit_gc_correction, false])) {
+ call CNVTasks.AnnotateIntervals {
+ input:
+ intervals = PreprocessIntervals.preprocessed_intervals,
+ ref_fasta = ref_fasta,
+ ref_fasta_fai = ref_fasta_fai,
+ ref_fasta_dict = ref_fasta_dict,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker
+ }
+ }
+
+ scatter (normal_bam in normal_bams) {
+ call CNVTasks.CollectCounts {
+ input:
+ intervals = PreprocessIntervals.preprocessed_intervals,
+ bam = normal_bam[0],
+ bam_idx = normal_bam[1],
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker
+ }
+ }
+
+ call CreateReadCountPanelOfNormals {
+ input:
+ pon_entity_id = pon_entity_id,
+ read_count_files = CollectCounts.counts,
+ annotated_intervals = AnnotateIntervals.annotated_intervals,
+ gatk4_jar_override = gatk4_jar_override,
+ gatk_docker = gatk_docker,
+ mem = mem_for_create_read_count_pon
+ }
+
+ output {
+ File read_count_pon = CreateReadCountPanelOfNormals.read_count_pon
+ }
+}
+
+task CreateReadCountPanelOfNormals {
+ String pon_entity_id
+ Array[File] read_count_files
+ Float? minimum_interval_median_percentile
+ Float? maximum_zeros_in_sample_percentage
+ Float? maximum_zeros_in_interval_percentage
+ Float? extreme_sample_median_percentile
+ Boolean? do_impute_zeros
+ Float? extreme_outlier_truncation_percentile
+ Int? number_of_eigensamples
+ File? annotated_intervals #do not perform explicit GC correction by default
+ File? gatk4_jar_override
+
+ # Runtime parameters
+ Int? mem
+ String gatk_docker
+ Int? preemptible_attempts
+ Int? disk_space_gb
+
+ # Mem is in units of GB but our command and memory runtime values are in MB
+ Int machine_mem = if defined(mem) then select_first([mem]) * 1000 else 8000
+ Int command_mem = machine_mem - 500
+
+ command <<<
+ set -e
+ GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
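+ # the sep placeholder on the --input line expands read_count_files so that each file gets its own --input flag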
+ java -Xmx${command_mem}m -jar $GATK_JAR CreateReadCountPanelOfNormals \
+ --input ${sep=" --input " read_count_files} \
+ --minimumIntervalMedianPercentile ${default="10.0" minimum_interval_median_percentile} \
+ --maximumZerosInSamplePercentage ${default="5.0" maximum_zeros_in_sample_percentage} \
+ --maximumZerosInIntervalPercentage ${default="5.0" maximum_zeros_in_interval_percentage} \
+ --extremeSampleMedianPercentile ${default="2.5" extreme_sample_median_percentile} \
+ --doImputeZeros ${default="true" do_impute_zeros} \
+ --extremeOutlierTruncationPercentile ${default="0.1" extreme_outlier_truncation_percentile} \
+ --numberOfEigensamples ${default="20" number_of_eigensamples} \
+ ${"--annotatedIntervals " + annotated_intervals} \
+ --output ${pon_entity_id}.pon.hdf5
+ >>>
+
+ runtime {
+ docker: "${gatk_docker}"
+ memory: machine_mem + " MB"
+ disks: "local-disk " + select_first([disk_space_gb, 150]) + " HDD"
+ preemptible: select_first([preemptible_attempts, 2])
+ }
+
+ output {
+ File read_count_pon = "${pon_entity_id}.pon.hdf5"
+ }
+}
\ No newline at end of file
diff --git a/scripts/cnv_wdl/somatic_legacy/cnv_common_tasks.wdl b/scripts/cnv_wdl/somatic_legacy/cnv_common_tasks.wdl
index 520a0324ac9..2041bd221f0 100755
--- a/scripts/cnv_wdl/somatic_legacy/cnv_common_tasks.wdl
+++ b/scripts/cnv_wdl/somatic_legacy/cnv_common_tasks.wdl
@@ -18,12 +18,12 @@ task PadTargets {
String filename = select_first([targets, ""])
String base_filename = basename(filename, ".tsv")
- command {
+ command <<<
java -Xmx${default="1" mem}g -jar ${gatk_jar} PadTargets \
--targets ${targets} \
--padding ${default="250" padding} \
--output ${base_filename}.padded.tsv
- }
+ >>>
runtime {
docker: "${gatk_docker}"
@@ -126,12 +126,12 @@ task AnnotateTargets {
Int? preemptible_attempts
Int? disk_space_gb
- command {
+ command <<<
java -Xmx${default=4 mem}g -jar ${gatk_jar} AnnotateTargets \
--targets ${targets} \
--reference ${ref_fasta} \
--output ${entity_id}.annotated.tsv
- }
+ >>>
runtime {
docker: "${gatk_docker}"
@@ -158,12 +158,12 @@ task CorrectGCBias {
Int? preemptible_attempts
Int? disk_space_gb
- command {
+ command <<<
java -Xmx${default=4 mem}g -jar ${gatk_jar} CorrectGCBias \
--input ${coverage} \
--targets ${annotated_targets} \
--output ${entity_id}.gc_corrected.tsv
- }
+ >>>
runtime {
docker: "${gatk_docker}"
diff --git a/scripts/cnv_wdl/somatic_legacy/cnv_somatic_allele_fraction_pair_workflow.wdl b/scripts/cnv_wdl/somatic_legacy/cnv_somatic_allele_fraction_pair_workflow.wdl
index 3223b108fb4..4dfb7e421fd 100644
--- a/scripts/cnv_wdl/somatic_legacy/cnv_somatic_allele_fraction_pair_workflow.wdl
+++ b/scripts/cnv_wdl/somatic_legacy/cnv_somatic_allele_fraction_pair_workflow.wdl
@@ -248,7 +248,7 @@ task PlotACNVResults {
# If optional output_dir not specified, use "."
String output_dir_ = select_first([output_dir, "."])
- command {
+ command <<<
mkdir -p ${output_dir_}; \
java -Xmx${default=4 mem}g -jar ${gatk_jar} PlotACNVResults \
--hets ${hets} \
@@ -257,7 +257,7 @@ task PlotACNVResults {
-SD ${ref_fasta_dict} \
--output ${output_dir_} \
--outputPrefix ${entity_id}
- }
+ >>>
runtime {
docker: "${gatk_docker}"
@@ -289,14 +289,14 @@ task ConvertACNVResults {
# If optional output_dir not specified, use "."
String output_dir_ = select_first([output_dir, "."])
- command {
+ command <<<
mkdir -p ${output_dir_}; \
java -Xmx${default=4 mem}g -jar ${gatk_jar} ConvertACNVResults \
--tumorHets ${hets} \
--tangentNormalized ${tn_coverage} \
--segments ${acnv_segments} \
--outputDir ${output_dir_}
- }
+ >>>
runtime {
docker: "${gatk_docker}"
diff --git a/scripts/cnv_wdl/somatic_legacy/cnv_somatic_copy_ratio_bam_workflow.wdl b/scripts/cnv_wdl/somatic_legacy/cnv_somatic_copy_ratio_bam_workflow.wdl
index 09416567afd..af928c1cad9 100644
--- a/scripts/cnv_wdl/somatic_legacy/cnv_somatic_copy_ratio_bam_workflow.wdl
+++ b/scripts/cnv_wdl/somatic_legacy/cnv_somatic_copy_ratio_bam_workflow.wdl
@@ -119,7 +119,7 @@ task NormalizeSomaticReadCounts {
Int? preemptible_attempts
Int? disk_space_gb
- command {
+ command <<<
java -Xmx${default=4 mem}g -jar ${gatk_jar} NormalizeSomaticReadCounts \
--input ${coverage} \
--targets ${padded_targets} \
@@ -128,7 +128,7 @@ task NormalizeSomaticReadCounts {
--factorNormalizedOutput ${entity_id}.fnt.tsv \
--preTangentNormalized ${entity_id}.preTN.tsv \
--betaHatsOutput ${entity_id}.betaHats.tsv
- }
+ >>>
runtime {
docker: "${gatk_docker}"
@@ -231,13 +231,13 @@ task CallSegments {
Int? preemptible_attempts
Int? disk_space_gb
- command {
+ command <<<
java -Xmx${default=4 mem}g -jar ${gatk_jar} CallSegments \
--tangentNormalized ${tn_coverage} \
--segments ${segments} \
--legacy false \
--output ${entity_id}.called
- }
+ >>>
runtime {
docker: "${gatk_docker}"
@@ -269,7 +269,7 @@ task PlotSegmentedCopyRatio {
# If optional output_dir not specified, use "."
String output_dir_ = select_first([output_dir, "."])
- command {
+ command <<<
mkdir -p ${output_dir_}; \
java -Xmx${default=4 mem}g -jar ${gatk_jar} PlotSegmentedCopyRatio \
--tangentNormalized ${tn_coverage} \
@@ -278,7 +278,7 @@ task PlotSegmentedCopyRatio {
-SD ${ref_fasta_dict} \
--output ${output_dir_} \
--outputPrefix ${entity_id}
- }
+ >>>
runtime {
docker: "${gatk_docker}"
diff --git a/scripts/cnv_wdl/somatic_legacy/cnv_somatic_panel_workflow.wdl b/scripts/cnv_wdl/somatic_legacy/cnv_somatic_panel_workflow.wdl
index 75226576532..a1310909901 100644
--- a/scripts/cnv_wdl/somatic_legacy/cnv_somatic_panel_workflow.wdl
+++ b/scripts/cnv_wdl/somatic_legacy/cnv_somatic_panel_workflow.wdl
@@ -120,12 +120,12 @@ task CombineReadCounts {
Int? preemptible_attempts
Int? disk_space_gb
- command {
+ command <<<
java -Xmx${default=4 mem}g -jar ${gatk_jar} CombineReadCounts \
--input ${sep=" --input " coverage_file_list} \
--maxOpenFiles ${default=100 max_open_files} \
--output ${combined_entity_id}.tsv
- }
+ >>>
runtime {
docker: "${gatk_docker}"
@@ -152,7 +152,7 @@ task CreatePanelOfNormals {
Int? preemptible_attempts
Int? disk_space_gb
- command {
+ command <<<
# If there are no removed samples the output file still needs to be created
touch "${pon_entity_id}.pon.removed_samples.txt" ; \
java -Xmx${default=4 mem}g -jar ${gatk_jar} CreatePanelOfNormals \
@@ -161,7 +161,7 @@ task CreatePanelOfNormals {
--truncatePercentileThreshold 0.1 \
--noQC ${default="false" no_qc} \
--output ${pon_entity_id}.pon
- }
+ >>>
runtime {
docker: "${gatk_docker}"
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervals.java
index 2a3cdd9f4d9..623c973671d 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervals.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervals.java
@@ -8,10 +8,10 @@
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.engine.*;
-import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotatedInterval;
-import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotatedIntervalCollection;
-import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotationSet;
import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberArgumentValidationUtils;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotationSet;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.SimpleInterval;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegments.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegments.java
index 09537ff243f..b9124f0a1e6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegments.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegments.java
@@ -7,12 +7,9 @@
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.caller.CalledCopyRatioSegmentCollection;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.caller.ReCapSegCaller;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation.CopyRatioSegmentCollection;
-import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument;
-import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.tools.copynumber.caller.SimpleCopyRatioCaller;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CalledCopyRatioSegmentCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioSegmentCollection;
import java.io.File;
@@ -20,16 +17,16 @@
- * Calls segments as amplified, deleted or copy number neutral given files containing denoised copy ratios
- * and a list of segments.
+ * Calls segments as amplified, deleted, or copy number neutral given a file containing copy-ratio segments
+ * (the .cr.seg output of ModelSegments).
*
- * @author David Benjamin
- *
* Examples
*
*
* gatk-launch --javaOptions "-Xmx4g" CallCopyRatioSegments \
- * --denoisedCopyRatios tumor.denoisedCR.tsv \
- * --segments tumor.cr.seg \
+ * --input tumor.cr.seg \
* --output tumor.called
*
+ *
+ * @author David Benjamin <davidben@broadinstitute.org>
+ * @author Samuel Lee <slee@broadinstitute.org>
*/
@CommandLineProgramProperties(
summary = "Call copy-ratio segments as amplified, deleted, or copy number neutral.",
@@ -39,18 +36,19 @@
@DocumentedFeature
@BetaFeature
public final class CallCopyRatioSegments extends CommandLineProgram {
+ public static final String NEUTRAL_SEGMENT_COPY_RATIO_THRESHOLD_LONG_NAME = "neutralSegmentCopyRatioThreshold";
+ public static final String NEUTRAL_SEGMENT_COPY_RATIO_THRESHOLD_SHORT_NAME = "neutralTh";
- @Argument(
- doc = "Input file containing denoised copy-ratio profile (output of DenoiseReadCounts).",
- fullName = CopyNumberStandardArgument.DENOISED_COPY_RATIOS_FILE_LONG_NAME,
- shortName = CopyNumberStandardArgument.DENOISED_COPY_RATIOS_FILE_SHORT_NAME
- )
- private File inputDenoisedCopyRatiosFile;
+ public static final String OUTLIER_NEUTRAL_SEGMENT_COPY_RATIO_Z_SCORE_THRESHOLD_LONG_NAME = "outlierNeutralSegmentCopyRatioZScoreThreshold";
+ public static final String OUTLIER_NEUTRAL_SEGMENT_COPY_RATIO_Z_SCORE_THRESHOLD_SHORT_NAME = "outlierTh";
+
+ public static final String CALLING_COPY_RATIO_Z_SCORE_THRESHOLD_LONG_NAME = "callingCopyRatioZScoreThreshold";
+ public static final String CALLING_COPY_RATIO_Z_SCORE_THRESHOLD_SHORT_NAME = "callingTh";
@Argument(
doc = "Input file containing copy-ratio segments (.cr.seg output of ModelSegments).",
- fullName = CopyNumberStandardArgument.SEGMENTS_FILE_LONG_NAME,
- shortName = CopyNumberStandardArgument.SEGMENTS_FILE_SHORT_NAME
+ fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
+ shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME
)
private File segmentsFile;
@@ -61,15 +59,43 @@ public final class CallCopyRatioSegments extends CommandLineProgram {
)
private File outFile;
+ @Argument(
+ doc = "Threshold on non-log2 copy ratio used for determining copy-neutral segments. " +
+ "If non-log2 copy ratio is within 1 +/- this threshold, a segment is considered copy-neutral.",
+ fullName = NEUTRAL_SEGMENT_COPY_RATIO_THRESHOLD_LONG_NAME,
+ shortName = NEUTRAL_SEGMENT_COPY_RATIO_THRESHOLD_SHORT_NAME,
+ optional = true
+ )
+ private double neutralSegmentCopyRatioThreshold = 0.1;
+
+ @Argument(
+ doc = "Threshold on z-score of non-log2 copy ratio used for determining outlier copy-neutral segments. " +
+ "If non-log2 copy ratio z-score is above this threshold for a copy-neutral segment, " +
+ "it is considered an outlier and not used in the calculation of the length-weighted mean and standard deviation " +
+ "used for calling.",
+ fullName = OUTLIER_NEUTRAL_SEGMENT_COPY_RATIO_Z_SCORE_THRESHOLD_LONG_NAME,
+ shortName = OUTLIER_NEUTRAL_SEGMENT_COPY_RATIO_Z_SCORE_THRESHOLD_SHORT_NAME,
+ optional = true,
+ minValue = 0.
+ )
+ private double outlierNeutralSegmentCopyRatioZScoreThreshold = 2.;
+
+ @Argument(
+ doc = "Threshold on z-score of non-log2 copy ratio used for calling segments.",
+ fullName = CALLING_COPY_RATIO_Z_SCORE_THRESHOLD_LONG_NAME,
+ shortName = CALLING_COPY_RATIO_Z_SCORE_THRESHOLD_SHORT_NAME,
+ optional = true,
+ minValue = 0.
+ )
+ private double callingCopyRatioZScoreThreshold = 2.;
+
@Override
protected Object doWork() {
- final CopyRatioCollection denoisedCopyRatios = new CopyRatioCollection(inputDenoisedCopyRatiosFile);
final CopyRatioSegmentCollection copyRatioSegments = new CopyRatioSegmentCollection(segmentsFile);
- Utils.validateArg(denoisedCopyRatios.getSampleName().equals(copyRatioSegments.getSampleName()),
- "Denoised copy ratios and copy-ratio segments do not have the same sample name.");
-
final CalledCopyRatioSegmentCollection calledCopyRatioSegments =
- new ReCapSegCaller(denoisedCopyRatios, copyRatioSegments).makeCalls();
+ new SimpleCopyRatioCaller(copyRatioSegments,
+ neutralSegmentCopyRatioThreshold, outlierNeutralSegmentCopyRatioZScoreThreshold, callingCopyRatioZScoreThreshold)
+ .makeCalls();
calledCopyRatioSegments.write(outFile);
return "SUCCESS";
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCounts.java
index 1f877728325..aad65cd3c22 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCounts.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCounts.java
@@ -14,7 +14,7 @@
import org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
-import org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount.AllelicCountCollector;
+import org.broadinstitute.hellbender.tools.copynumber.datacollection.AllelicCountCollector;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleNameUtils;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCounts.java
index ba9b1e5324d..bd66bcc29df 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCounts.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCounts.java
@@ -19,12 +19,12 @@
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.exceptions.GATKException;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCount;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberArgumentValidationUtils;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleNameUtils;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java
index 019c78b49d0..b20db1143a5 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java
@@ -12,10 +12,10 @@
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.engine.spark.SparkCommandLineProgram;
import org.broadinstitute.hellbender.exceptions.UserException;
-import org.broadinstitute.hellbender.tools.copynumber.annotation.GCBiasCorrector;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.HDF5SVDReadCountPanelOfNormals;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.denoising.GCBiasCorrector;
+import org.broadinstitute.hellbender.tools.copynumber.denoising.HDF5SVDReadCountPanelOfNormals;
import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCounts.java
index a2de829ffde..deef2ef33ec 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCounts.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCounts.java
@@ -10,13 +10,9 @@
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.exceptions.UserException;
-import org.broadinstitute.hellbender.tools.copynumber.annotation.GCBiasCorrector;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.HDF5SVDReadCountPanelOfNormals;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.SVDDenoisedCopyRatioResult;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.SVDDenoisingUtils;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.SVDReadCountPanelOfNormals;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.denoising.*;
import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/ModelSegments.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/ModelSegments.java
new file mode 100644
index 00000000000..3007cb20b70
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/ModelSegments.java
@@ -0,0 +1,573 @@
+package org.broadinstitute.hellbender.tools.copynumber;
+
+import com.google.common.collect.ImmutableSet;
+import htsjdk.samtools.util.OverlapDetector;
+import org.apache.commons.math3.special.Beta;
+import org.apache.commons.math3.util.FastMath;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.BetaFeature;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.*;
+import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatioSegment;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.MultidimensionalSegment;
+import org.broadinstitute.hellbender.tools.copynumber.models.AlleleFractionPrior;
+import org.broadinstitute.hellbender.tools.copynumber.models.MultidimensionalModeller;
+import org.broadinstitute.hellbender.tools.copynumber.segmentation.AlleleFractionKernelSegmenter;
+import org.broadinstitute.hellbender.tools.copynumber.segmentation.CopyRatioKernelSegmenter;
+import org.broadinstitute.hellbender.tools.copynumber.segmentation.MultidimensionalKernelSegmenter;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.io.IOUtils;
+
+import java.io.File;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+@CommandLineProgramProperties(
+ summary = "Model segmented copy ratio from denoised read counts and segmented minor-allele fraction from allelic counts.",
+ oneLineSummary = "Model segmented copy ratio from denoised read counts and segmented minor-allele fraction from allelic counts.",
+ programGroup = CopyNumberProgramGroup.class
+)
+@DocumentedFeature
+@BetaFeature
+public final class ModelSegments extends CommandLineProgram {
+ //filename tags for output
+ public static final String HET_ALLELIC_COUNTS_FILE_SUFFIX = ".hets.tsv";
+ public static final String NORMAL_HET_ALLELIC_COUNTS_FILE_SUFFIX = ".hets.normal.tsv";
+ public static final String SEGMENTS_FILE_SUFFIX = ".seg";
+ public static final String BEGIN_FIT_FILE_TAG = ".modelBegin";
+ public static final String FINAL_FIT_FILE_TAG = ".modelFinal";
+ public static final String COPY_RATIO_MODEL_PARAMETER_FILE_SUFFIX = ".cr.param";
+ public static final String ALLELE_FRACTION_MODEL_PARAMETER_FILE_SUFFIX = ".af.param";
+ public static final String COPY_RATIO_SEGMENTS_FOR_CALLER_FILE = ".cr" + SEGMENTS_FILE_SUFFIX;
+
+ public static final String MAXIMUM_NUMBER_OF_SEGMENTS_PER_CHROMOSOME_LONG_NAME = "maxNumSegmentsPerChromosome";
+ public static final String MAXIMUM_NUMBER_OF_SEGMENTS_PER_CHROMOSOME_SHORT_NAME = "maxNumSegsPerChr";
+
+ public static final String MINIMUM_TOTAL_ALLELE_COUNT_LONG_NAME = "minTotalAlleleCount";
+ public static final String MINIMUM_TOTAL_ALLELE_COUNT_SHORT_NAME = "minAC";
+
+ public static final String GENOTYPING_HOMOZYGOUS_LOG_RATIO_THRESHOLD_LONG_NAME = "genotypingHomozygousLogRatioThreshold";
+ public static final String GENOTYPING_HOMOZYGOUS_LOG_RATIO_THRESHOLD_SHORT_NAME = "homLRT";
+
+ public static final String GENOTYPING_BASE_ERROR_RATE_LONG_NAME = "genotypingBaseErrorRate";
+ public static final String GENOTYPING_BASE_ERROR_RATE_SHORT_NAME = "baseErrRate";
+
+ public static final String KERNEL_VARIANCE_COPY_RATIO_LONG_NAME = "kernelVarianceCopyRatio";
+ public static final String KERNEL_VARIANCE_COPY_RATIO_SHORT_NAME = "kernVarCR";
+
+ public static final String KERNEL_VARIANCE_ALLELE_FRACTION_LONG_NAME = "kernelVarianceAlleleFraction";
+ public static final String KERNEL_VARIANCE_ALLELE_FRACTION_SHORT_NAME = "kernVarAF";
+
+ public static final String KERNEL_SCALING_ALLELE_FRACTION_LONG_NAME = "kernelScalingAlleleFraction";
+ public static final String KERNEL_SCALING_ALLELE_FRACTION_SHORT_NAME = "kernSclAF";
+
+ public static final String KERNEL_APPROXIMATION_DIMENSION_LONG_NAME = "kernelApproximationDimension";
+ public static final String KERNEL_APPROXIMATION_DIMENSION_SHORT_NAME = "kernApproxDim";
+
+ public static final String WINDOW_SIZE_LONG_NAME = "windowSize";
+ public static final String WINDOW_SIZE_SHORT_NAME = "winSize";
+
+ public static final String NUM_CHANGEPOINTS_PENALTY_FACTOR_LONG_NAME = "numChangepointsPenaltyFactor";
+ public static final String NUM_CHANGEPOINTS_PENALTY_FACTOR_SHORT_NAME = "numChangeptsPen";
+
+ public static final String MINOR_ALLELE_FRACTION_PRIOR_ALPHA_LONG_NAME = "minorAlleleFractionPriorAlpha";
+ public static final String MINOR_ALLELE_FRACTION_PRIOR_ALPHA_SHORT_NAME = "alphaAF";
+
+ public static final String NUM_SAMPLES_COPY_RATIO_LONG_NAME = "numSamplesCopyRatio";
+ public static final String NUM_SAMPLES_COPY_RATIO_SHORT_NAME = "numSampCR";
+
+ public static final String NUM_BURN_IN_COPY_RATIO_LONG_NAME = "numBurnInCopyRatio";
+ public static final String NUM_BURN_IN_COPY_RATIO_SHORT_NAME = "numBurnCR";
+
+ public static final String NUM_SAMPLES_ALLELE_FRACTION_LONG_NAME = "numSamplesAlleleFraction";
+ public static final String NUM_SAMPLES_ALLELE_FRACTION_SHORT_NAME = "numSampAF";
+
+ public static final String NUM_BURN_IN_ALLELE_FRACTION_LONG_NAME = "numBurnInAlleleFraction";
+ public static final String NUM_BURN_IN_ALLELE_FRACTION_SHORT_NAME = "numBurnAF";
+
+ public static final String SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_COPY_RATIO_LONG_NAME = "smoothingThresholdCopyRatio";
+ public static final String SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_COPY_RATIO_SHORT_NAME = "smoothThCR";
+
+ public static final String SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_ALLELE_FRACTION_LONG_NAME = "smoothingThresholdAlleleFraction";
+ public static final String SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_ALLELE_FRACTION_SHORT_NAME = "smoothThAF";
+
+ public static final String MAX_NUM_SMOOTHING_ITERATIONS_LONG_NAME = "maxNumSmoothingIterations";
+ public static final String MAX_NUM_SMOOTHING_ITERATIONS_SHORT_NAME = "maxNumSmoothIter";
+
+ public static final String NUM_SMOOTHING_ITERATIONS_PER_FIT_LONG_NAME = "numSmoothingIterationsPerFit";
+ public static final String NUM_SMOOTHING_ITERATIONS_PER_FIT_SHORT_NAME = "numSmoothIterPerFit";
+
+ @Argument(
+ doc = "Input file containing denoised copy-ratio profile (output of DenoiseReadCounts).",
+ fullName = CopyNumberStandardArgument.DENOISED_COPY_RATIOS_FILE_LONG_NAME,
+ shortName = CopyNumberStandardArgument.DENOISED_COPY_RATIOS_FILE_SHORT_NAME,
+ optional = true
+ )
+ private File inputDenoisedCopyRatiosFile = null;
+
+ @Argument(
+ doc = "Input file containing allelic counts (output of CollectAllelicCounts).",
+ fullName = CopyNumberStandardArgument.ALLELIC_COUNTS_FILE_LONG_NAME,
+ shortName = CopyNumberStandardArgument.ALLELIC_COUNTS_FILE_SHORT_NAME,
+ optional = true
+ )
+ private File inputAllelicCountsFile = null;
+
+ @Argument(
+ doc = "Input file containing allelic counts for a matched normal (output of CollectAllelicCounts).",
+ fullName = CopyNumberStandardArgument.NORMAL_ALLELIC_COUNTS_FILE_LONG_NAME,
+ shortName = CopyNumberStandardArgument.NORMAL_ALLELIC_COUNTS_FILE_SHORT_NAME,
+ optional = true
+ )
+ private File inputNormalAllelicCountsFile = null;
+
+ @Argument(
+ doc = "Prefix for output files.",
+ fullName = CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME,
+ shortName = CopyNumberStandardArgument.OUTPUT_PREFIX_SHORT_NAME
+ )
+ private String outputPrefix;
+
+ @Argument(
+ doc = "Output directory.",
+ fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+ shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
+ )
+ private String outputDir;
+
+ @Argument(
+ doc = "Maximum number of segments allowed per chromosome.",
+ fullName = MAXIMUM_NUMBER_OF_SEGMENTS_PER_CHROMOSOME_LONG_NAME,
+ shortName = MAXIMUM_NUMBER_OF_SEGMENTS_PER_CHROMOSOME_SHORT_NAME,
+ minValue = 1,
+ optional = true
+ )
+ private int maxNumSegmentsPerChromosome = 1000;
+
+ @Argument(
+ doc = "Minimum total count for filtering allelic counts, if available.",
+ fullName = MINIMUM_TOTAL_ALLELE_COUNT_LONG_NAME,
+ shortName = MINIMUM_TOTAL_ALLELE_COUNT_SHORT_NAME,
+ minValue = 0,
+ optional = true
+ )
+ private int minTotalAlleleCount = 30;
+
+ @Argument(
+ doc = "Log-ratio threshold for genotyping and filtering homozygous allelic counts, if available. " +
+ "Increasing this value will increase the number of sites assumed to be heterozygous for modeling.",
+ fullName = GENOTYPING_HOMOZYGOUS_LOG_RATIO_THRESHOLD_LONG_NAME,
+ shortName = GENOTYPING_HOMOZYGOUS_LOG_RATIO_THRESHOLD_SHORT_NAME,
+ optional = true
+ )
+ private double genotypingHomozygousLogRatioThreshold = -10.;
+
+ @Argument(
+ doc = "Maximum base-error rate for genotyping and filtering homozygous allelic counts, if available. " +
+ "The likelihood for an allelic count to be generated from a homozygous site will be integrated " +
+ "from zero base-error rate up to this value. Decreasing this value will increase " +
+ "the number of sites assumed to be heterozygous for modeling.",
+ fullName = GENOTYPING_BASE_ERROR_RATE_LONG_NAME,
+ shortName = GENOTYPING_BASE_ERROR_RATE_SHORT_NAME,
+ optional = true
+ )
+ private double genotypingBaseErrorRate = 5E-2;
+
+ @Argument(
+ doc = "Variance of Gaussian kernel for copy-ratio segmentation, if performed. If zero, a linear kernel will be used.",
+ fullName = KERNEL_VARIANCE_COPY_RATIO_LONG_NAME,
+ shortName = KERNEL_VARIANCE_COPY_RATIO_SHORT_NAME,
+ minValue = 0.,
+ optional = true
+ )
+ private double kernelVarianceCopyRatio = 0.;
+
+ @Argument(
+ doc = "Variance of Gaussian kernel for allele-fraction segmentation, if performed. If zero, a linear kernel will be used.",
+ fullName = KERNEL_VARIANCE_ALLELE_FRACTION_LONG_NAME,
+ shortName = KERNEL_VARIANCE_ALLELE_FRACTION_SHORT_NAME,
+ minValue = 0.,
+ optional = true
+ )
+ private double kernelVarianceAlleleFraction = 0.025;
+
+ @Argument(
+ doc = "Relative scaling S of the kernel K_AF for allele-fraction segmentation to the kernel K_CR for copy-ratio segmentation. " +
+ "If multidimensional segmentation is performed, the total kernel used will be K_CR + S * K_AF.",
+ fullName = KERNEL_SCALING_ALLELE_FRACTION_LONG_NAME,
+ shortName = KERNEL_SCALING_ALLELE_FRACTION_SHORT_NAME,
+ minValue = 0.,
+ optional = true
+ )
+ private double kernelScalingAlleleFraction = 1.0;
+
+ @Argument(
+ doc = "Dimension of the kernel approximation. A subsample containing this number of data points " +
+ "will be used to construct the approximation for each chromosome. " +
+ "If the total number of data points in a chromosome is greater " +
+ "than this number, then all data points in the chromosome will be used. " +
+ "Time complexity scales quadratically and space complexity scales linearly with this parameter.",
+ fullName = KERNEL_APPROXIMATION_DIMENSION_LONG_NAME,
+ shortName = KERNEL_APPROXIMATION_DIMENSION_SHORT_NAME,
+ minValue = 1,
+ optional = true
+ )
+ private int kernelApproximationDimension = 100;
+
+ @Argument(
+ doc = "Window sizes to use for calculating local changepoint costs. " +
+ "For each window size, the cost for each data point to be a changepoint will be calculated " +
+ "assuming that it demarcates two adjacent segments of that size. " +
+ "Including small (large) window sizes will increase sensitivity to small (large) events. " +
+ "Duplicate values will be ignored.",
+ fullName = WINDOW_SIZE_LONG_NAME,
+ shortName = WINDOW_SIZE_SHORT_NAME,
+ minValue = 1,
+ optional = true
+ )
+ private List<Integer> windowSizes = new ArrayList<>(Arrays.asList(8, 16, 32, 64, 128, 256));
+
+ @Argument(
+ doc = "Factor A for the penalty on the number of changepoints per chromosome for segmentation. " +
+ "Adds a penalty of the form A * C * [1 + log (N / C)], " +
+ "where C is the number of changepoints in the chromosome, " +
+ "to the cost function for each chromosome. " +
+ "Must be non-negative.",
+ fullName = NUM_CHANGEPOINTS_PENALTY_FACTOR_LONG_NAME,
+ shortName = NUM_CHANGEPOINTS_PENALTY_FACTOR_SHORT_NAME,
+ minValue = 0.,
+ optional = true
+ )
+ private double numChangepointsPenaltyFactor = 1.;
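+ // e.g. with A = 1 and C = 10 changepoints in a chromosome of N = 1000 points (taking N to be
+ // the number of data points and log to be the natural log), the penalty is 10 * (1 + ln 100) ~= 56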
+
+ @Argument(
+ doc = "Alpha hyperparameter for the 4-parameter beta-distribution prior on segment minor-allele fraction. " +
+ "The prior for the minor-allele fraction f in each segment is assumed to be Beta(alpha, 1, 0, 1/2). " +
+ "Increasing this hyperparameter will reduce the effect of reference bias at the expense of sensitivity.",
+ fullName = MINOR_ALLELE_FRACTION_PRIOR_ALPHA_LONG_NAME,
+ shortName = MINOR_ALLELE_FRACTION_PRIOR_ALPHA_SHORT_NAME,
+ optional = true,
+ minValue = 1
+ )
+ private double minorAlleleFractionPriorAlpha = 25.;
+
+ @Argument(
+ doc = "Total number of MCMC samples for copy-ratio model.",
+ fullName = NUM_SAMPLES_COPY_RATIO_LONG_NAME,
+ shortName = NUM_SAMPLES_COPY_RATIO_SHORT_NAME,
+ optional = true,
+ minValue = 1
+ )
+ private int numSamplesCopyRatio = 100;
+
+ @Argument(
+ doc = "Number of burn-in samples to discard for copy-ratio model.",
+ fullName = NUM_BURN_IN_COPY_RATIO_LONG_NAME,
+ shortName = NUM_BURN_IN_COPY_RATIO_SHORT_NAME,
+ optional = true,
+ minValue = 0
+ )
+ private int numBurnInCopyRatio = 50;
+
+ @Argument(
+ doc = "Total number of MCMC samples for allele-fraction model.",
+ fullName = NUM_SAMPLES_ALLELE_FRACTION_LONG_NAME,
+ shortName = NUM_SAMPLES_ALLELE_FRACTION_SHORT_NAME,
+ optional = true,
+ minValue = 1
+ )
+ private int numSamplesAlleleFraction = 100;
+
+ @Argument(
+ doc = "Number of burn-in samples to discard for allele-fraction model.",
+ fullName = NUM_BURN_IN_ALLELE_FRACTION_LONG_NAME,
+ shortName = NUM_BURN_IN_ALLELE_FRACTION_SHORT_NAME,
+ optional = true,
+ minValue = 0
+ )
+ private int numBurnInAlleleFraction = 50;
+
+ @Argument(
+ doc = "Number of 10% equal-tailed credible-interval widths to use for copy-ratio segmentation smoothing.",
+ fullName = SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_COPY_RATIO_LONG_NAME,
+ shortName = SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_COPY_RATIO_SHORT_NAME,
+ optional = true,
+ minValue = 0.
+ )
+ private double smoothingCredibleIntervalThresholdCopyRatio = 2.;
+
+ @Argument(
+ doc = "Number of 10% equal-tailed credible-interval widths to use for allele-fraction segmentation smoothing.",
+ fullName = SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_ALLELE_FRACTION_LONG_NAME,
+ shortName = SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_ALLELE_FRACTION_SHORT_NAME,
+ optional = true,
+ minValue = 0.
+ )
+ private double smoothingCredibleIntervalThresholdAlleleFraction = 2.;
+
+ @Argument(
+ doc = "Maximum number of iterations allowed for segmentation smoothing.",
+ fullName = MAX_NUM_SMOOTHING_ITERATIONS_LONG_NAME,
+ shortName = MAX_NUM_SMOOTHING_ITERATIONS_SHORT_NAME,
+ optional = true,
+ minValue = 0
+ )
+ private int maxNumSmoothingIterations = 25;
+
+ @Argument(
+ doc = "Number of segmentation-smoothing iterations per MCMC model refit. " +
+ "(Increasing this will decrease runtime, but the final number of segments may be higher. " +
+ "Setting this to 0 will completely disable model refitting between iterations.)",
+ fullName = NUM_SMOOTHING_ITERATIONS_PER_FIT_LONG_NAME,
+ shortName = NUM_SMOOTHING_ITERATIONS_PER_FIT_SHORT_NAME,
+ optional = true,
+ minValue = 0
+ )
+ private int numSmoothingIterationsPerFit = 0;
+
+ //initialize data variables, some of which may be optional
+ private CopyRatioCollection denoisedCopyRatios = null;
+ private AllelicCountCollection hetAllelicCounts = null;
+
+ @Override
+ protected Object doWork() {
+ validateArguments();
+
+ //perform one-dimensional or multidimensional segmentation as appropriate and write to file
+ //(for use by CallCopyRatioSegments, if copy ratios are available)
+ final MultidimensionalSegmentCollection multidimensionalSegments;
+ if (inputDenoisedCopyRatiosFile != null && inputAllelicCountsFile == null) {
+ readDenoisedCopyRatios();
+ final CopyRatioSegmentCollection copyRatioSegments = performCopyRatioSegmentation();
+ multidimensionalSegments = new MultidimensionalSegmentCollection(
+ copyRatioSegments.getSampleMetadata(),
+ copyRatioSegments.getRecords().stream()
+ .map(s -> new MultidimensionalSegment(s.getInterval(), s.getNumPoints(), 0, s.getMeanLog2CopyRatio()))
+ .collect(Collectors.toList()));
+ hetAllelicCounts = new AllelicCountCollection(denoisedCopyRatios.getSampleMetadata(), Collections.emptyList()); //create an empty collection with the appropriate name
+ } else if (inputDenoisedCopyRatiosFile == null && inputAllelicCountsFile != null) {
+ readAndFilterAllelicCounts();
+ final AlleleFractionSegmentCollection alleleFractionSegments = performAlleleFractionSegmentation();
+ multidimensionalSegments = new MultidimensionalSegmentCollection(
+ alleleFractionSegments.getSampleMetadata(),
+ alleleFractionSegments.getRecords().stream()
+ .map(s -> new MultidimensionalSegment(s.getInterval(), 0, s.getNumPoints(), Double.NaN))
+ .collect(Collectors.toList()));
+ denoisedCopyRatios = new CopyRatioCollection(hetAllelicCounts.getSampleMetadata(), Collections.emptyList()); //create an empty collection with the appropriate name
+ } else {
+ readDenoisedCopyRatios();
+ readAndFilterAllelicCounts();
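+ // joint segmentation uses the combined kernel K_CR + S * K_AF described for kernelScalingAlleleFraction above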
+ multidimensionalSegments = new MultidimensionalKernelSegmenter(denoisedCopyRatios, hetAllelicCounts)
+ .findSegmentation(maxNumSegmentsPerChromosome,
+ kernelVarianceCopyRatio, kernelVarianceAlleleFraction, kernelScalingAlleleFraction, kernelApproximationDimension,
+ ImmutableSet.copyOf(windowSizes).asList(),
+ numChangepointsPenaltyFactor, numChangepointsPenaltyFactor);
+ }
+
+ logger.info("Modeling available denoised copy ratios and heterozygous allelic counts...");
+ //initial MCMC model fitting performed by MultidimensionalModeller constructor
+ final AlleleFractionPrior alleleFractionPrior = new AlleleFractionPrior(minorAlleleFractionPriorAlpha);
+ final MultidimensionalModeller modeller = new MultidimensionalModeller(
+ multidimensionalSegments, denoisedCopyRatios, hetAllelicCounts, alleleFractionPrior,
+ numSamplesCopyRatio, numBurnInCopyRatio,
+ numSamplesAlleleFraction, numBurnInAlleleFraction);
+
+ //write initial segments and parameters to file
+ writeModeledSegmentsAndParameterFiles(modeller, BEGIN_FIT_FILE_TAG);
+
+ //segmentation smoothing
+ modeller.smoothSegments(
+ maxNumSmoothingIterations, numSmoothingIterationsPerFit,
+ smoothingCredibleIntervalThresholdCopyRatio, smoothingCredibleIntervalThresholdAlleleFraction);
+
+ //write final segments and parameters to file
+ writeModeledSegmentsAndParameterFiles(modeller, FINAL_FIT_FILE_TAG);
+
+ //write final segments for copy-ratio caller (TODO remove this and MEAN_LOG2_COPY_RATIO column when new caller is available)
+ final OverlapDetector<CopyRatio> copyRatioMidpointOverlapDetector = denoisedCopyRatios.getMidpointOverlapDetector();
+ final CopyRatioSegmentCollection copyRatioSegmentsFinal = new CopyRatioSegmentCollection(
+ modeller.getModeledSegments().getSampleMetadata(),
+ modeller.getModeledSegments().getIntervals().stream()
+ .map(s -> new CopyRatioSegment(s, new ArrayList<>(copyRatioMidpointOverlapDetector.getOverlaps(s))))
+ .collect(Collectors.toList()));
+ writeSegments(copyRatioSegmentsFinal, COPY_RATIO_SEGMENTS_FOR_CALLER_FILE);
+
+ logger.info("SUCCESS: ModelSegments run complete.");
+
+ return "SUCCESS";
+ }
+
+ private void validateArguments() {
+ Utils.nonNull(outputPrefix);
+ Utils.validateArg(!(inputDenoisedCopyRatiosFile == null && inputAllelicCountsFile == null),
+ "Must provide at least a denoised copy-ratio profile file or an allelic-counts file.");
+ Utils.validateArg(!(inputAllelicCountsFile == null && inputNormalAllelicCountsFile != null),
+ "Must provide an allelic-counts file for the case sample to run in matched-normal mode.");
+ if (inputDenoisedCopyRatiosFile != null) {
+ IOUtils.canReadFile(inputDenoisedCopyRatiosFile);
+ }
+ if (inputAllelicCountsFile != null) {
+ IOUtils.canReadFile(inputAllelicCountsFile);
+ }
+ if (inputNormalAllelicCountsFile != null) {
+ IOUtils.canReadFile(inputNormalAllelicCountsFile);
+ }
+ if (!new File(outputDir).exists()) {
+ throw new UserException(String.format("Output directory %s does not exist.", outputDir));
+ }
+ }
+
+ private void readDenoisedCopyRatios() {
+ logger.info(String.format("Reading denoised copy-ratio profile file (%s)...", inputDenoisedCopyRatiosFile));
+ denoisedCopyRatios = new CopyRatioCollection(inputDenoisedCopyRatiosFile);
+ }
+
+ private CopyRatioSegmentCollection performCopyRatioSegmentation() {
+ logger.info("Starting segmentation of denoised copy ratios...");
+ final int maxNumChangepointsPerChromosome = maxNumSegmentsPerChromosome - 1;
+ return new CopyRatioKernelSegmenter(denoisedCopyRatios)
+ .findSegmentation(maxNumChangepointsPerChromosome, kernelVarianceCopyRatio, kernelApproximationDimension,
+ ImmutableSet.copyOf(windowSizes).asList(),
+ numChangepointsPenaltyFactor, numChangepointsPenaltyFactor);
+ }
+
+ private void readAndFilterAllelicCounts() {
+ //read in case sample
+ logger.info(String.format("Reading allelic-counts file (%s)...", inputAllelicCountsFile));
+ final AllelicCountCollection unfilteredAllelicCounts = new AllelicCountCollection(inputAllelicCountsFile);
+ final SampleMetadata sampleMetadata = unfilteredAllelicCounts.getSampleMetadata();
+
+ //filter on total count in case sample
+ logger.info(String.format("Filtering allelic counts with total count less than %d...", minTotalAlleleCount));
+ AllelicCountCollection filteredAllelicCounts = new AllelicCountCollection(
+ sampleMetadata,
+ unfilteredAllelicCounts.getRecords().stream()
+ .filter(ac -> ac.getTotalReadCount() >= minTotalAlleleCount)
+ .collect(Collectors.toList()));
+ logger.info(String.format("Retained %d / %d sites after filtering on total count...",
+ filteredAllelicCounts.getRecords().size(), unfilteredAllelicCounts.getRecords().size()));
+
+ //filter on overlap with copy-ratio intervals, if available
+ if (denoisedCopyRatios != null) {
+ logger.info("Filtering allelic-count sites not overlapping with copy-ratio intervals...");
+ final OverlapDetector<CopyRatio> copyRatioOverlapDetector = denoisedCopyRatios.getOverlapDetector();
+ filteredAllelicCounts = new AllelicCountCollection(
+ sampleMetadata,
+ filteredAllelicCounts.getRecords().stream()
+ .filter(copyRatioOverlapDetector::overlapsAny)
+ .collect(Collectors.toList()));
+ logger.info(String.format("Retained %d / %d sites after filtering on overlap with copy-ratio intervals...",
+ filteredAllelicCounts.getRecords().size(), unfilteredAllelicCounts.getRecords().size()));
+ }
+
+ if (inputNormalAllelicCountsFile == null) {
+ //filter on homozygosity in case sample
+ logger.info("No matched normal was provided, not running in matched-normal mode...");
+ logger.info("Performing binomial testing and filtering homozygous allelic counts...");
+ hetAllelicCounts = new AllelicCountCollection(
+ sampleMetadata,
+ filteredAllelicCounts.getRecords().stream()
+ .filter(ac -> calculateHomozygousLogRatio(ac, genotypingBaseErrorRate) < genotypingHomozygousLogRatioThreshold)
+ .collect(Collectors.toList()));
+ final File hetAllelicCountsFile = new File(outputDir, outputPrefix + HET_ALLELIC_COUNTS_FILE_SUFFIX);
+ hetAllelicCounts.write(hetAllelicCountsFile);
+ logger.info(String.format("Retained %d / %d sites after testing for heterozygosity...",
+ hetAllelicCounts.getRecords().size(), unfilteredAllelicCounts.getRecords().size()));
+ logger.info(String.format("Heterozygous allelic counts written to %s.", hetAllelicCountsFile));
+ } else {
+ //read in matched normal
+ logger.info("Matched normal was provided, running in matched-normal mode...");
+ logger.info("Performing binomial testing and filtering homozygous allelic counts in matched normal...");
+ final AllelicCountCollection unfilteredNormalAllelicCounts = new AllelicCountCollection(inputNormalAllelicCountsFile);
+ if (!unfilteredNormalAllelicCounts.getIntervals().equals(unfilteredAllelicCounts.getIntervals())) {
+ throw new UserException.BadInput("Allelic-count sites in case sample and matched normal do not match. " +
+ "Run CollectAllelicCounts using the same interval list of sites for both samples.");
+ }
+ final SampleMetadata normalSampleMetadata = unfilteredNormalAllelicCounts.getSampleMetadata();
+
+ //filter on total count in matched normal
+ logger.info(String.format("Filtering allelic counts in matched normal with total count less than %d...", minTotalAlleleCount));
+ final AllelicCountCollection filteredNormalAllelicCounts = new AllelicCountCollection(
+ normalSampleMetadata,
+ unfilteredNormalAllelicCounts.getRecords().stream()
+ .filter(ac -> ac.getTotalReadCount() >= minTotalAlleleCount)
+ .collect(Collectors.toList()));
+ logger.info(String.format("Retained %d / %d sites in matched normal after filtering on total count...",
+ filteredNormalAllelicCounts.getRecords().size(), unfilteredNormalAllelicCounts.getRecords().size()));
+
+ //filter on homozygosity in matched normal
+ final AllelicCountCollection hetNormalAllelicCounts = new AllelicCountCollection(
+ normalSampleMetadata,
+ filteredNormalAllelicCounts.getRecords().stream()
+ .filter(ac -> calculateHomozygousLogRatio(ac, genotypingBaseErrorRate) < genotypingHomozygousLogRatioThreshold)
+ .collect(Collectors.toList()));
+ final File hetNormalAllelicCountsFile = new File(outputDir, outputPrefix + NORMAL_HET_ALLELIC_COUNTS_FILE_SUFFIX);
+ hetNormalAllelicCounts.write(hetNormalAllelicCountsFile);
+ logger.info(String.format("Retained %d / %d sites in matched normal after testing for heterozygosity...",
+ hetNormalAllelicCounts.getRecords().size(), unfilteredNormalAllelicCounts.getRecords().size()));
+ logger.info(String.format("Heterozygous allelic counts for matched normal written to %s.", hetNormalAllelicCountsFile));
+
+ //retrieve sites in case sample
+ logger.info("Retrieving allelic counts at these sites in case sample...");
+ final Set<SimpleInterval> hetNormalAllelicCountSites = new HashSet<>(hetNormalAllelicCounts.getIntervals());
+ hetAllelicCounts = new AllelicCountCollection(
+ sampleMetadata,
+ filteredAllelicCounts.getRecords().stream()
+ .filter(ac -> hetNormalAllelicCountSites.contains(ac.getInterval()))
+ .collect(Collectors.toList()));
+ final File hetAllelicCountsFile = new File(outputDir, outputPrefix + HET_ALLELIC_COUNTS_FILE_SUFFIX);
+ hetAllelicCounts.write(hetAllelicCountsFile);
+ logger.info(String.format("Allelic counts for case sample at heterozygous sites in matched normal written to %s.", hetAllelicCountsFile));
+ }
+ }
+
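+ /**
+ * A sketch of the underlying math, assuming a flat prior on the ref-allele fraction p:
+ * given r ref reads out of n, the posterior of p is Beta(r + 1, n - r + 1), and
+ * regularizedBeta(x, r + 1, n - r + 1) is its CDF I_x. Then betaHom = I_e + (1 - I_{1-e})
+ * is the posterior mass with p within the base-error rate e of 0 or 1 (i.e., consistent
+ * with a homozygous site), betaHet = I_{1-e} - I_e is the remaining heterozygous mass,
+ * and the return value log(betaHom) - log(betaHet) is the log-odds of homozygosity.
+ * Sites with log-odds below genotypingHomozygousLogRatioThreshold are retained as hets.
+ */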
+ private static double calculateHomozygousLogRatio(final AllelicCount allelicCount,
+ final double genotypingBaseErrorRate) {
+ final int r = allelicCount.getRefReadCount();
+ final int n = allelicCount.getTotalReadCount();
+ final double betaAll = Beta.regularizedBeta(1, r + 1, n - r + 1);
+ final double betaError = Beta.regularizedBeta(genotypingBaseErrorRate, r + 1, n - r + 1);
+ final double betaOneMinusError = Beta.regularizedBeta(1 - genotypingBaseErrorRate, r + 1, n - r + 1);
+ final double betaHom = betaError + betaAll - betaOneMinusError;
+ final double betaHet = betaOneMinusError - betaError;
+ return FastMath.log(betaHom) - FastMath.log(betaHet);
+ }
+
+ private AlleleFractionSegmentCollection performAlleleFractionSegmentation() {
+ logger.info("Starting segmentation of heterozygous allelic counts...");
+ final int maxNumChangepointsPerChromosome = maxNumSegmentsPerChromosome - 1;
+ return new AlleleFractionKernelSegmenter(hetAllelicCounts)
+ .findSegmentation(maxNumChangepointsPerChromosome, kernelVarianceAlleleFraction, kernelApproximationDimension,
+ ImmutableSet.copyOf(windowSizes).asList(),
+ numChangepointsPenaltyFactor, numChangepointsPenaltyFactor);
+ }
+
+ private void writeModeledSegmentsAndParameterFiles(final MultidimensionalModeller modeller,
+ final String fileTag) {
+ final ModeledSegmentCollection modeledSegments = modeller.getModeledSegments();
+ writeSegments(modeledSegments, fileTag + SEGMENTS_FILE_SUFFIX);
+ final File copyRatioParameterFile = new File(outputDir, outputPrefix + fileTag + COPY_RATIO_MODEL_PARAMETER_FILE_SUFFIX);
+ final File alleleFractionParameterFile = new File(outputDir, outputPrefix + fileTag + ALLELE_FRACTION_MODEL_PARAMETER_FILE_SUFFIX);
+ modeller.writeModelParameterFiles(copyRatioParameterFile, alleleFractionParameterFile);
+ }
+
+ private void writeSegments(final SampleLocatableCollection<?> segments,
+ final String fileSuffix) {
+ final File segmentsFile = new File(outputDir, outputPrefix + fileSuffix);
+ segments.write(segmentsFile);
+ logger.info(String.format("Segments written to %s", segmentsFile));
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervals.java
index b75a63e127c..10c365fbdea 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervals.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervals.java
@@ -30,7 +30,8 @@
* IntervalArgumentCollection. However, we encourage using only the -P flag.
*
* The user can also specify the length of the bins (in bp) using the -BL option. If this is not commensurate with
- * the length of the padded intervals, then the last bin will be of different length than the others.
+ * the length of the padded intervals, then the last bin in each interval will be shorter than the others. If zero is
+ * specified, then no binning will be performed.
*
* The -O argument specifies a filename for the output bins, stored as a Picard interval list.
*
@@ -66,11 +67,11 @@ public final class PreprocessIntervals extends GATKTool {
public static final String PADDING_SHORT_NAME = "P";
@Argument(
- doc = "Length (in bp) of the bins.",
+ doc = "Length (in bp) of the bins. If zero, no binning will be performed.",
fullName = BIN_LENGTH_LONG_NAME,
shortName = BIN_LENGTH_SHORT_NAME,
optional = true,
- minValue = 1
+ minValue = 0
)
private int binLength = 1000;
@@ -120,6 +121,9 @@ private static IntervalList padAndMergeIntervals(final List inpu
}
private static IntervalList generateBins(final IntervalList preparedIntervalList, final int binLength, final SAMSequenceDictionary sequenceDictionary) {
+ if (binLength == 0) {
+ return IntervalList.copyOf(preparedIntervalList);
+ }
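+ // e.g. with binLength = 1000, a padded interval spanning 1-2500 yields bins 1-1000,
+ // 1001-2000, and 2001-2500 (the last bin is truncated at the interval end)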
final IntervalList bins = new IntervalList(sequenceDictionary);
for (final Interval interval : preparedIntervalList) {
for (int binStart = interval.getStart(); binStart <= interval.getEnd(); binStart += binLength) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/caller/SimpleCopyRatioCaller.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/caller/SimpleCopyRatioCaller.java
new file mode 100644
index 00000000000..f6427e6acc7
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/caller/SimpleCopyRatioCaller.java
@@ -0,0 +1,135 @@
+package org.broadinstitute.hellbender.tools.copynumber.caller;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CalledCopyRatioSegmentCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioSegmentCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CalledCopyRatioSegment;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatioSegment;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.param.ParamUtils;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * This caller is loosely based on the legacy ReCapSeg caller that was originally implemented in ReCapSeg v1.4.5.0,
+ * but introduces major changes. The method is as follows:
+ * 1) use the non-log2 mean copy ratio to determine copy-neutral segments,
+ * 2) weight segments by length for determining the mean and standard deviation of the non-log2 copy ratio in copy-neutral segments,
+ * 3) filter outlier copy-neutral segments by non-log2 copy ratio z-score,
+ * 4) use the filtered copy-neutral segments to determine a length-weighted mean and standard deviation,
+ * 5) call segments using z-score based on this mean and standard deviation.
+ *
+ * @author David Benjamin <davidben@broadinstitute.org>
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+public final class SimpleCopyRatioCaller {
+ private static final Logger logger = LogManager.getLogger(SimpleCopyRatioCaller.class);
+
+ private final double neutralSegmentCopyRatioThreshold;
+ private final double outlierNeutralSegmentCopyRatioZScoreThreshold;
+ private final double callingCopyRatioZScoreThreshold;
+ private final Statistics callingStatistics;
+
+ private final CopyRatioSegmentCollection copyRatioSegments;
+
+ /**
+ * @param neutralSegmentCopyRatioThreshold non-log2 copy ratio must be within 1 +/- this threshold for a segment to be copy neutral
+ * @param outlierNeutralSegmentCopyRatioZScoreThreshold z-score on non-log2 copy ratio above which a copy-neutral segment is assumed to be an outlier
+ * and not included in the calculation of the length-weighted standard deviation of
+ * non-log2 copy ratio in copy-neutral segments
+ * @param callingCopyRatioZScoreThreshold z-score with respect to length-weighted standard deviation of non-log2 copy ratio
+ * in non-outlier copy-neutral segments used for calling segments
+ */
+ public SimpleCopyRatioCaller(final CopyRatioSegmentCollection copyRatioSegments,
+ final double neutralSegmentCopyRatioThreshold,
+ final double outlierNeutralSegmentCopyRatioZScoreThreshold,
+ final double callingCopyRatioZScoreThreshold) {
+ ParamUtils.isPositive(neutralSegmentCopyRatioThreshold, "Copy-neutral threshold must be positive.");
+ ParamUtils.isPositive(outlierNeutralSegmentCopyRatioZScoreThreshold, "Outlier z-score threshold must be positive.");
+ ParamUtils.isPositive(callingCopyRatioZScoreThreshold, "Calling z-score threshold must be positive.");
+ this.copyRatioSegments = Utils.nonNull(copyRatioSegments);
+ this.neutralSegmentCopyRatioThreshold = neutralSegmentCopyRatioThreshold;
+ this.outlierNeutralSegmentCopyRatioZScoreThreshold = outlierNeutralSegmentCopyRatioZScoreThreshold;
+ this.callingCopyRatioZScoreThreshold = callingCopyRatioZScoreThreshold;
+ callingStatistics = calculateCallingStatistics();
+ }
+
+ public CalledCopyRatioSegmentCollection makeCalls() {
+        final List<CopyRatioSegment> segments = copyRatioSegments.getRecords();
+        final List<CalledCopyRatioSegment> calledSegments = new ArrayList<>(segments.size());
+ for (final CopyRatioSegment segment : segments) {
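+            //convert the segment mean from log2 copy-ratio space to (non-log2) copy-ratio space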
+ final double copyRatioMean = Math.pow(2., segment.getMeanLog2CopyRatio());
+ if (Math.abs(1. - copyRatioMean) < neutralSegmentCopyRatioThreshold) {
+ calledSegments.add(new CalledCopyRatioSegment(segment, CalledCopyRatioSegment.Call.NEUTRAL));
+ } else {
+ final double copyRatioDeviation = copyRatioMean - callingStatistics.mean;
+ if (copyRatioDeviation < -callingStatistics.standardDeviation * callingCopyRatioZScoreThreshold) {
+ calledSegments.add(new CalledCopyRatioSegment(segment, CalledCopyRatioSegment.Call.DELETION));
+ } else if (copyRatioDeviation > callingStatistics.standardDeviation * callingCopyRatioZScoreThreshold) {
+ calledSegments.add(new CalledCopyRatioSegment(segment, CalledCopyRatioSegment.Call.AMPLIFICATION));
+ } else {
+ calledSegments.add(new CalledCopyRatioSegment(segment, CalledCopyRatioSegment.Call.NEUTRAL));
+ }
+ }
+ }
+ return new CalledCopyRatioSegmentCollection(copyRatioSegments.getSampleMetadata(), calledSegments);
+ }
+
+ private Statistics calculateCallingStatistics() {
+ //get the segments that fall within the copy-neutral region
+        final List<CopyRatioSegment> copyNeutralSegments = copyRatioSegments.getRecords().stream()
+ .filter(s -> Math.abs(1. - Math.pow(2., s.getMeanLog2CopyRatio())) < neutralSegmentCopyRatioThreshold)
+ .collect(Collectors.toList());
+ logger.info(String.format("%d segments in copy-neutral region [%.4f, %.4f]...", copyNeutralSegments.size(),
+ 1. - neutralSegmentCopyRatioThreshold, 1. + neutralSegmentCopyRatioThreshold));
+
+ //calculate length-weighted statistics of unfiltered copy-neutral segments
+ final Statistics unfilteredStatistics = calculateLengthWeightedStatistics(copyNeutralSegments);
+ logger.info(String.format("Length-weighted mean of segments in copy-neutral region (CR space): %.4f", unfilteredStatistics.mean));
+        logger.info(String.format("Length-weighted standard deviation of segments in copy-neutral region (CR space): %.4f", unfilteredStatistics.standardDeviation));
+
+ //filter outlier segments by only including those within 2 standard deviations
+        final List<CopyRatioSegment> filteredCopyNeutralSegments = copyNeutralSegments.stream()
+ .filter(s -> Math.abs(Math.pow(2., s.getMeanLog2CopyRatio()) - unfilteredStatistics.mean)
+ <= unfilteredStatistics.standardDeviation * outlierNeutralSegmentCopyRatioZScoreThreshold)
+ .collect(Collectors.toList());
+ logger.info(String.format("%d / %d segments in copy-neutral region remain after outliers filtered using z-score threshold (%.4f)...",
+ filteredCopyNeutralSegments.size(), copyNeutralSegments.size(), outlierNeutralSegmentCopyRatioZScoreThreshold));
+
+ final Statistics statistics = calculateLengthWeightedStatistics(filteredCopyNeutralSegments);
+ logger.info(String.format("Length-weighted mean for z-score calling (CR space): %.4f", statistics.mean));
+ logger.info(String.format("Length-weighted standard deviation for z-score calling (CR space): %.4f", statistics.standardDeviation));
+
+ return statistics;
+ }
+
+    private static Statistics calculateLengthWeightedStatistics(final List<CopyRatioSegment> copyRatioSegments) {
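+        //length-weighted mean: sum_i(length_i * cr_i) / sum_i(length_i);
+        //length-weighted standard deviation: sqrt(sum_i(length_i * (cr_i - mean)^2) / (((n - 1) / n) * sum_i(length_i))),
+        //which reduces to the Bessel-corrected sample standard deviation when all segment lengths are equal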
+        final List<Integer> segmentLengths = copyRatioSegments.stream()
+ .map(c -> c.getInterval().getLengthOnReference())
+ .collect(Collectors.toList());
+ final double totalLength = segmentLengths.stream().mapToDouble(Integer::doubleValue).sum();
+ final int numSegments = segmentLengths.size();
+ final double lengthWeightedCopyRatioMean = IntStream.range(0, numSegments)
+ .mapToDouble(i -> segmentLengths.get(i) * Math.pow(2., copyRatioSegments.get(i).getMeanLog2CopyRatio()))
+ .sum() / totalLength;
+ final double lengthWeightedCopyRatioStandardDeviation = Math.sqrt(IntStream.range(0, numSegments)
+ .mapToDouble(i -> segmentLengths.get(i) * Math.pow(Math.pow(2., copyRatioSegments.get(i).getMeanLog2CopyRatio()) - lengthWeightedCopyRatioMean, 2))
+ .sum() / (((double) (numSegments - 1) / numSegments) * totalLength));
+ return new Statistics(lengthWeightedCopyRatioMean, lengthWeightedCopyRatioStandardDeviation);
+ }
+
+ private static final class Statistics {
+ private final double mean;
+ private final double standardDeviation;
+
+ private Statistics(final double mean,
+ final double standardDeviation) {
+ this.mean = mean;
+ this.standardDeviation = standardDeviation;
+ }
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/ReCapSegCaller.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/ReCapSegCaller.java
deleted file mode 100644
index 363c292b081..00000000000
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/ReCapSegCaller.java
+++ /dev/null
@@ -1,107 +0,0 @@
-package org.broadinstitute.hellbender.tools.copynumber.coverage.caller;
-
-import htsjdk.samtools.util.OverlapDetector;
-import org.apache.commons.math3.stat.descriptive.moment.Mean;
-import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;
-import org.apache.logging.log4j.LogManager;
-import org.apache.logging.log4j.Logger;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatio;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation.CopyRatioSegment;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation.CopyRatioSegmentCollection;
-import org.broadinstitute.hellbender.utils.Utils;
-
-import java.util.*;
-import java.util.stream.Collectors;
-
-/**
- * This caller mimics the legacy ReCapSeg Caller that was originally implemented in ReCapSeg v1.4.5.0.
- *
- * There is a small difference. The python code was using the same algorithm as intersectBed, which was causing it to drop
- * the first interval of each segment in calculations of the copy-neutral intervals. The code here does
- * not do this. This difference in the two codebases can cause a slight difference in the T calculation. Hence, the
- * results of this code and the python code will not be exactly the same, but will be
- * very close. A fix (to make this code match the python) has been deemed unworthy of our time.
- */
-public final class ReCapSegCaller {
- private static final Logger logger = LogManager.getLogger(ReCapSegCaller.class);
-
- //bounds on log_2 coverage for high-confidence neutral segments
- private static final double COPY_NEUTRAL_CUTOFF = 0.1;
- // Number of standard deviations before assuming that an interval was an outlier in a segment
- private static final double Z_THRESHOLD = 2;
-
- private final CopyRatioSegmentCollection copyRatioSegments;
-    private final LinkedHashMap<CopyRatioSegment, Set<CopyRatio>> segmentToCopyRatiosMap;
-
- /**
- * @param denoisedCopyRatios in log2 space
- */
- public ReCapSegCaller(final CopyRatioCollection denoisedCopyRatios,
- final CopyRatioSegmentCollection copyRatioSegments) {
- this.copyRatioSegments = Utils.nonNull(copyRatioSegments);
- Utils.validateArg(denoisedCopyRatios.getSampleName().equals(copyRatioSegments.getSampleName()),
- "Denoised copy ratios and copy-ratio segments do not have the same sample name.");
- segmentToCopyRatiosMap = constructSegmentToCopyRatiosMap(denoisedCopyRatios, copyRatioSegments);
- }
-
-    private static LinkedHashMap<CopyRatioSegment, Set<CopyRatio>> constructSegmentToCopyRatiosMap(final CopyRatioCollection denoisedCopyRatios,
- final CopyRatioSegmentCollection copyRatioSegments) {
-        final LinkedHashMap<CopyRatioSegment, Set<CopyRatio>> segmentToCopyRatiosMap = new LinkedHashMap<>();
-        final OverlapDetector<CopyRatio> copyRatioMidpointOverlapDetector = denoisedCopyRatios.getMidpointOverlapDetector();
- for (final CopyRatioSegment segment : copyRatioSegments.getRecords()) {
- final int numPointsExpected = segment.getNumPoints();
-            final Set<CopyRatio> copyRatiosInSegment = copyRatioMidpointOverlapDetector.getOverlaps(segment);
- if (copyRatiosInSegment.size() != numPointsExpected) {
- throw new IllegalArgumentException("Denoised copy ratios and copy-ratio segments are not consistent.");
- }
- segmentToCopyRatiosMap.put(segment, copyRatiosInSegment);
- }
- return segmentToCopyRatiosMap;
- }
-
- private double calculateT() {
- //Get the segments that are likely copy neutral.
- //Math.abs removed to mimic python...
-        final List<CopyRatioSegment> copyNeutralSegments = segmentToCopyRatiosMap.keySet().stream()
- .filter(s -> s.getMeanLog2CopyRatio() < COPY_NEUTRAL_CUTOFF).collect(Collectors.toList());
-
- //Get the intervals that correspond to the copyNeutralSegments... note that individual intervals, due to noise,
- //can be far away from copy neutral
- final double[] copyNeutralIntervals = copyNeutralSegments.stream()
- .flatMap(s -> segmentToCopyRatiosMap.get(s).stream())
- .mapToDouble(CopyRatio::getLog2CopyRatioValue).toArray();
-
- final double meanCopyNeutralIntervals = new Mean().evaluate(copyNeutralIntervals);
- final double sigmaCopyNeutralIntervals = new StandardDeviation().evaluate(copyNeutralIntervals);
-
- // Now we filter outliers by only including those w/in 2 standard deviations.
- final double [] filteredCopyNeutralIntervals = Arrays.stream(copyNeutralIntervals)
- .filter(c -> Math.abs(c - meanCopyNeutralIntervals) < sigmaCopyNeutralIntervals * Z_THRESHOLD).toArray();
-
- return new StandardDeviation().evaluate(filteredCopyNeutralIntervals);
- }
-
- public CalledCopyRatioSegmentCollection makeCalls() {
- final double t = calculateT();
-
- logger.info("Running caller that mimics the ReCapSeg 1.4.5.0 (python) caller.");
- // Log some information about thresholds chosen for the segments.
- logger.info(String.format("Copy neutral (log2CR space) [%.4f, %.4f]", -t, t));
- logger.info(String.format("Copy neutral (CR space) [%.4f, %.4f]", Math.pow(2, -t), Math.pow(2, t)));
-
-        final Set<CopyRatioSegment> segments = segmentToCopyRatiosMap.keySet();
-        final List<CalledCopyRatioSegment> calledSegments = new ArrayList<>(segments.size());
- for (final CopyRatioSegment segment : segments) {
- if (segment.getMeanLog2CopyRatio() < -t) {
- calledSegments.add(new CalledCopyRatioSegment(segment, CalledCopyRatioSegment.Call.DELETION));
- } else if (segment.getMeanLog2CopyRatio() > t) {
- calledSegments.add(new CalledCopyRatioSegment(segment, CalledCopyRatioSegment.Call.AMPLIFICATION));
- } else {
- calledSegments.add(new CalledCopyRatioSegment(segment, CalledCopyRatioSegment.Call.NEUTRAL));
- }
- }
-
- return new CalledCopyRatioSegmentCollection(copyRatioSegments.getSampleMetadata(), calledSegments);
- }
-}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollector.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/datacollection/AllelicCountCollector.java
similarity index 94%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollector.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/datacollection/AllelicCountCollector.java
index c0c4847011f..b129911e346 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollector.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/datacollection/AllelicCountCollector.java
@@ -1,9 +1,11 @@
-package org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount;
+package org.broadinstitute.hellbender.tools.copynumber.datacollection;
import htsjdk.samtools.util.Locatable;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/GCBiasCorrector.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/GCBiasCorrector.java
similarity index 98%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/GCBiasCorrector.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/GCBiasCorrector.java
index fefe9ae2337..6be8504971f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/GCBiasCorrector.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/GCBiasCorrector.java
@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.tools.copynumber.annotation;
+package org.broadinstitute.hellbender.tools.copynumber.denoising;
import org.apache.commons.math3.linear.ArrayRealVector;
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
@@ -7,6 +7,7 @@
import org.apache.commons.math3.stat.descriptive.rank.Median;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.param.ParamUtils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/HDF5SVDReadCountPanelOfNormals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/HDF5SVDReadCountPanelOfNormals.java
similarity index 99%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/HDF5SVDReadCountPanelOfNormals.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/HDF5SVDReadCountPanelOfNormals.java
index 7500e2214b0..ab1087c1b12 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/HDF5SVDReadCountPanelOfNormals.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/HDF5SVDReadCountPanelOfNormals.java
@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd;
+package org.broadinstitute.hellbender.tools.copynumber.denoising;
import htsjdk.samtools.util.Lazy;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDDenoisedCopyRatioResult.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisedCopyRatioResult.java
similarity index 92%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDDenoisedCopyRatioResult.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisedCopyRatioResult.java
index 402127f210f..fa3a70ee228 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDDenoisedCopyRatioResult.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisedCopyRatioResult.java
@@ -1,9 +1,9 @@
-package org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd;
+package org.broadinstitute.hellbender.tools.copynumber.denoising;
import org.apache.commons.math3.linear.RealMatrix;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatio;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDDenoisingUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisingUtils.java
similarity index 99%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDDenoisingUtils.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisingUtils.java
index 9039c72ebfa..4497481ae4e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDDenoisingUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisingUtils.java
@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd;
+package org.broadinstitute.hellbender.tools.copynumber.denoising;
import com.google.common.primitives.Doubles;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
@@ -10,8 +10,7 @@
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.CreateReadCountPanelOfNormals;
-import org.broadinstitute.hellbender.tools.copynumber.annotation.GCBiasCorrector;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata;
import org.broadinstitute.hellbender.utils.GATKProtectedMathUtils;
import org.broadinstitute.hellbender.utils.MatrixSummaryUtils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDReadCountPanelOfNormals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDReadCountPanelOfNormals.java
similarity index 94%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDReadCountPanelOfNormals.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDReadCountPanelOfNormals.java
index 356956e553b..29f37769a17 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDReadCountPanelOfNormals.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDReadCountPanelOfNormals.java
@@ -1,6 +1,6 @@
-package org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd;
+package org.broadinstitute.hellbender.tools.copynumber.denoising;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import java.util.List;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AlleleFractionSegmentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AlleleFractionSegmentCollection.java
new file mode 100644
index 00000000000..8b2b0e48860
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AlleleFractionSegmentCollection.java
@@ -0,0 +1,52 @@
+package org.broadinstitute.hellbender.tools.copynumber.formats.collections;
+
+import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AlleleFractionSegment;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.tsv.DataLine;
+import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
+
+import java.io.File;
+import java.util.List;
+import java.util.function.BiConsumer;
+import java.util.function.Function;
+
+/**
+ * Represents an allele-fraction segmentation.
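+ *
+ * Records are decoded from and encoded to TSV rows using the lambdas below; a sketch of the
+ * expected columns (header names taken from the column enum, values purely illustrative):
+ *
+ *     CONTIG    START    END      NUM_POINTS_ALLELE_FRACTION
+ *     1         1001     20000    42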
+ *
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+public final class AlleleFractionSegmentCollection extends SampleLocatableCollection<AlleleFractionSegment> {
+ enum AlleleFractionSegmentTableColumn {
+ CONTIG,
+ START,
+ END,
+ NUM_POINTS_ALLELE_FRACTION;
+
+ static final TableColumnCollection COLUMNS = new TableColumnCollection((Object[]) values());
+ }
+
+    private static final Function<DataLine, AlleleFractionSegment> ALLELE_FRACTION_SEGMENT_DATA_LINE_TO_RECORD_FUNCTION = dataLine -> {
+ final String contig = dataLine.get(AlleleFractionSegmentTableColumn.CONTIG);
+ final int start = dataLine.getInt(AlleleFractionSegmentTableColumn.START);
+ final int end = dataLine.getInt(AlleleFractionSegmentTableColumn.END);
+ final int numPoints = dataLine.getInt(AlleleFractionSegmentTableColumn.NUM_POINTS_ALLELE_FRACTION);
+ final SimpleInterval interval = new SimpleInterval(contig, start, end);
+ return new AlleleFractionSegment(interval, numPoints);
+ };
+
+    private static final BiConsumer<AlleleFractionSegment, DataLine> ALLELE_FRACTION_SEGMENT_RECORD_AND_DATA_LINE_BI_CONSUMER = (alleleFractionSegment, dataLine) ->
+ dataLine.append(alleleFractionSegment.getContig())
+ .append(alleleFractionSegment.getStart())
+ .append(alleleFractionSegment.getEnd())
+ .append(alleleFractionSegment.getNumPoints());
+
+ public AlleleFractionSegmentCollection(final File inputFile) {
+ super(inputFile, AlleleFractionSegmentTableColumn.COLUMNS, ALLELE_FRACTION_SEGMENT_DATA_LINE_TO_RECORD_FUNCTION, ALLELE_FRACTION_SEGMENT_RECORD_AND_DATA_LINE_BI_CONSUMER);
+ }
+
+ public AlleleFractionSegmentCollection(final SampleMetadata sampleMetadata,
+                                           final List<AlleleFractionSegment> alleleFractionSegments) {
+        super(sampleMetadata, alleleFractionSegments, AlleleFractionSegmentTableColumn.COLUMNS, ALLELE_FRACTION_SEGMENT_DATA_LINE_TO_RECORD_FUNCTION, ALLELE_FRACTION_SEGMENT_RECORD_AND_DATA_LINE_BI_CONSUMER);
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AllelicCountCollection.java
similarity index 94%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollection.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AllelicCountCollection.java
index 627d9761adf..2b2d20bcd32 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AllelicCountCollection.java
@@ -1,7 +1,7 @@
-package org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount;
+package org.broadinstitute.hellbender.tools.copynumber.formats.collections;
-import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.tsv.DataLine;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotatedIntervalCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AnnotatedIntervalCollection.java
similarity index 90%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotatedIntervalCollection.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AnnotatedIntervalCollection.java
index 8b5baccab64..4fc80398244 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotatedIntervalCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AnnotatedIntervalCollection.java
@@ -1,6 +1,7 @@
-package org.broadinstitute.hellbender.tools.copynumber.annotation;
+package org.broadinstitute.hellbender.tools.copynumber.formats.collections;
-import org.broadinstitute.hellbender.tools.copynumber.formats.collections.LocatableCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotationSet;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.tsv.DataLine;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/CalledCopyRatioSegmentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CalledCopyRatioSegmentCollection.java
similarity index 92%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/CalledCopyRatioSegmentCollection.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CalledCopyRatioSegmentCollection.java
index 43606d172ee..50caf45a610 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/CalledCopyRatioSegmentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CalledCopyRatioSegmentCollection.java
@@ -1,9 +1,9 @@
-package org.broadinstitute.hellbender.tools.copynumber.coverage.caller;
+package org.broadinstitute.hellbender.tools.copynumber.formats.collections;
import org.broadinstitute.hellbender.exceptions.UserException;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation.CopyRatioSegment;
-import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CalledCopyRatioSegment;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatioSegment;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.tsv.DataLine;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/copyratio/CopyRatioCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CopyRatioCollection.java
similarity index 94%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/copyratio/CopyRatioCollection.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CopyRatioCollection.java
index fdb36df96f7..3e18aeda89d 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/copyratio/CopyRatioCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CopyRatioCollection.java
@@ -1,8 +1,8 @@
-package org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio;
+package org.broadinstitute.hellbender.tools.copynumber.formats.collections;
import htsjdk.samtools.util.OverlapDetector;
-import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.tsv.DataLine;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/segmentation/CopyRatioSegmentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CopyRatioSegmentCollection.java
similarity index 93%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/segmentation/CopyRatioSegmentCollection.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CopyRatioSegmentCollection.java
index a6eb23cd105..eedc2042e96 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/segmentation/CopyRatioSegmentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CopyRatioSegmentCollection.java
@@ -1,7 +1,7 @@
-package org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation;
+package org.broadinstitute.hellbender.tools.copynumber.formats.collections;
-import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatioSegment;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.tsv.DataLine;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/HDF5SimpleCountCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/HDF5SimpleCountCollection.java
similarity index 95%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/HDF5SimpleCountCollection.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/HDF5SimpleCountCollection.java
index f7a25cc6cd2..56fc46774d4 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/HDF5SimpleCountCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/HDF5SimpleCountCollection.java
@@ -1,10 +1,9 @@
-package org.broadinstitute.hellbender.tools.copynumber.coverage.readcount;
+package org.broadinstitute.hellbender.tools.copynumber.formats.collections;
import htsjdk.samtools.util.Lazy;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.RealMatrix;
import org.broadinstitute.hdf5.HDF5File;
-import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata;
import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/LocatableCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/LocatableCollection.java
index 20e76949c97..372cf141c74 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/LocatableCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/LocatableCollection.java
@@ -34,10 +34,10 @@ public abstract class LocatableCollection extends Reco
/**
* Records are sorted using {@code LEXICOGRAPHICAL_ORDER_COMPARATOR}.
*/
-    protected LocatableCollection(final List<RECORD> records,
-                                  final TableColumnCollection mandatoryColumns,
-                                  final Function<DataLine, RECORD> recordFromDataLineDecoder,
-                                  final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
+    LocatableCollection(final List<RECORD> records,
+                        final TableColumnCollection mandatoryColumns,
+                        final Function<DataLine, RECORD> recordFromDataLineDecoder,
+                        final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
super(
Utils.nonNull(records).stream().sorted(LEXICOGRAPHICAL_ORDER_COMPARATOR).collect(Collectors.toList()),
mandatoryColumns,
@@ -49,10 +49,10 @@ protected LocatableCollection(final List<RECORD> records,
/**
* @throws IllegalArgumentException if records are not sorted using {@code LEXICOGRAPHICAL_ORDER_COMPARATOR}
*/
-    protected LocatableCollection(final File inputFile,
-                                  final TableColumnCollection mandatoryColumns,
-                                  final Function<DataLine, RECORD> recordFromDataLineDecoder,
-                                  final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
+    LocatableCollection(final File inputFile,
+                        final TableColumnCollection mandatoryColumns,
+                        final Function<DataLine, RECORD> recordFromDataLineDecoder,
+                        final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
super(inputFile, mandatoryColumns, recordFromDataLineDecoder, recordToDataLineEncoder);
validateIntervals(getRecords());
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/multidimensional/model/ModeledSegmentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/ModeledSegmentCollection.java
similarity index 93%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/multidimensional/model/ModeledSegmentCollection.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/ModeledSegmentCollection.java
index cbfe844c88c..78a01ea36cc 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/multidimensional/model/ModeledSegmentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/ModeledSegmentCollection.java
@@ -1,7 +1,8 @@
-package org.broadinstitute.hellbender.tools.copynumber.multidimensional.model;
+package org.broadinstitute.hellbender.tools.copynumber.formats.collections;
-import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.ModeledSegment;
+import org.broadinstitute.hellbender.tools.copynumber.models.MultidimensionalModeller;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.tsv.DataLine;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
@@ -15,7 +16,7 @@
* @author Samuel Lee <slee@broadinstitute.org>
*/
 public final class ModeledSegmentCollection extends SampleLocatableCollection<ModeledSegment> {
- private static final String DOUBLE_FORMAT = "%6.6f"; //TODO replace this with MultidimensionalModeller.DOUBLE_FORMAT from sl_wgs_acnv branch
+ private static final String DOUBLE_FORMAT = MultidimensionalModeller.DOUBLE_FORMAT;
enum ModeledSegmentTableColumn {
CONTIG,
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/MultidimensionalSegmentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/MultidimensionalSegmentCollection.java
new file mode 100644
index 00000000000..807b359a0bb
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/MultidimensionalSegmentCollection.java
@@ -0,0 +1,56 @@
+package org.broadinstitute.hellbender.tools.copynumber.formats.collections;
+
+import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.MultidimensionalSegment;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.tsv.DataLine;
+import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
+
+import java.io.File;
+import java.util.List;
+import java.util.function.BiConsumer;
+import java.util.function.Function;
+
+/**
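+ * Represents a multidimensional segmentation, i.e., segments annotated with the number of
+ * copy-ratio points, the number of allele-fraction points, and the mean log2 copy ratio.
+ *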
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+public final class MultidimensionalSegmentCollection extends SampleLocatableCollection<MultidimensionalSegment> {
+ enum MultidimensionalSegmentTableColumn {
+ CONTIG,
+ START,
+ END,
+ NUM_POINTS_COPY_RATIO,
+ NUM_POINTS_ALLELE_FRACTION,
+ MEAN_LOG2_COPY_RATIO;
+
+ static final TableColumnCollection COLUMNS = new TableColumnCollection((Object[]) values());
+ }
+
+    private static final Function<DataLine, MultidimensionalSegment> MULTIDIMENSIONAL_SEGMENT_DATA_LINE_TO_RECORD_FUNCTION = dataLine -> {
+ final String contig = dataLine.get(MultidimensionalSegmentTableColumn.CONTIG);
+ final int start = dataLine.getInt(MultidimensionalSegmentTableColumn.START);
+ final int end = dataLine.getInt(MultidimensionalSegmentTableColumn.END);
+ final int numPointsCopyRatio = dataLine.getInt(MultidimensionalSegmentTableColumn.NUM_POINTS_COPY_RATIO);
+ final int numPointsAlleleFraction = dataLine.getInt(MultidimensionalSegmentTableColumn.NUM_POINTS_ALLELE_FRACTION);
+ final double meanLog2CopyRatio = dataLine.getDouble(MultidimensionalSegmentTableColumn.MEAN_LOG2_COPY_RATIO);
+ final SimpleInterval interval = new SimpleInterval(contig, start, end);
+ return new MultidimensionalSegment(interval, numPointsCopyRatio, numPointsAlleleFraction, meanLog2CopyRatio);
+ };
+
+    private static final BiConsumer<MultidimensionalSegment, DataLine> MULTIDIMENSIONAL_SEGMENT_RECORD_AND_DATA_LINE_BI_CONSUMER = (multidimensionalSegment, dataLine) ->
+            dataLine.append(multidimensionalSegment.getContig())
+                    .append(multidimensionalSegment.getStart())
+                    .append(multidimensionalSegment.getEnd())
+                    .append(multidimensionalSegment.getNumPointsCopyRatio())
+                    .append(multidimensionalSegment.getNumPointsAlleleFraction())
+                    .append(multidimensionalSegment.getMeanLog2CopyRatio());
+
+ public MultidimensionalSegmentCollection(final File inputFile) {
+ super(inputFile, MultidimensionalSegmentTableColumn.COLUMNS, MULTIDIMENSIONAL_SEGMENT_DATA_LINE_TO_RECORD_FUNCTION, MULTIDIMENSIONAL_SEGMENT_RECORD_AND_DATA_LINE_BI_CONSUMER);
+ }
+
+ public MultidimensionalSegmentCollection(final SampleMetadata sampleMetadata,
+                                            final List<MultidimensionalSegment> multidimensionalSegments) {
+ super(sampleMetadata, multidimensionalSegments, MultidimensionalSegmentTableColumn.COLUMNS, MULTIDIMENSIONAL_SEGMENT_DATA_LINE_TO_RECORD_FUNCTION, MULTIDIMENSIONAL_SEGMENT_RECORD_AND_DATA_LINE_BI_CONSUMER);
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/ParameterDecileCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/ParameterDecileCollection.java
new file mode 100644
index 00000000000..7a8fc4eb537
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/ParameterDecileCollection.java
@@ -0,0 +1,112 @@
+package org.broadinstitute.hellbender.tools.copynumber.formats.collections;
+
+import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.mcmc.Decile;
+import org.broadinstitute.hellbender.utils.mcmc.DecileCollection;
+import org.broadinstitute.hellbender.utils.mcmc.ParameterEnum;
+import org.broadinstitute.hellbender.utils.tsv.DataLine;
+import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
+
+import java.io.File;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
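+ * Represents deciles of the posterior distribution of each parameter in a {@link ParameterEnum},
+ * along with sample metadata; collections can be read from and written to a TSV file with one
+ * row per parameter, using the columns given by the enum below.
+ *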
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+public final class ParameterDecileCollection<T extends Enum<T> & ParameterEnum> extends SampleRecordCollection<Map.Entry<T, DecileCollection>> {
+ enum ParameterTableColumn {
+ PARAMETER_NAME,
+ POSTERIOR_10,
+ POSTERIOR_20,
+ POSTERIOR_30,
+ POSTERIOR_40,
+ POSTERIOR_50,
+ POSTERIOR_60,
+ POSTERIOR_70,
+ POSTERIOR_80,
+ POSTERIOR_90;
+
+ static final TableColumnCollection COLUMNS = new TableColumnCollection((Object[]) values());
+ }
+
+ private static DecileCollection parseDecilesFromDataLine(final DataLine dataLine) {
+ return new DecileCollection(Arrays.asList(
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_10),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_20),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_30),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_40),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_50),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_60),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_70),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_80),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_90)));
+ }
+
+ private static void appendDecilesToDataLine(final DataLine dataLine,
+ final DecileCollection deciles,
+ final String doubleFormat) {
+ dataLine.append(String.format(doubleFormat, deciles.get(Decile.DECILE_10)))
+ .append(String.format(doubleFormat, deciles.get(Decile.DECILE_20)))
+ .append(String.format(doubleFormat, deciles.get(Decile.DECILE_30)))
+ .append(String.format(doubleFormat, deciles.get(Decile.DECILE_40)))
+ .append(String.format(doubleFormat, deciles.get(Decile.DECILE_50)))
+ .append(String.format(doubleFormat, deciles.get(Decile.DECILE_60)))
+ .append(String.format(doubleFormat, deciles.get(Decile.DECILE_70)))
+ .append(String.format(doubleFormat, deciles.get(Decile.DECILE_80)))
+ .append(String.format(doubleFormat, deciles.get(Decile.DECILE_90)));
+ }
+
+    private final Map<T, DecileCollection> parameterToDecileCollectionMap;
+
+ public ParameterDecileCollection(final SampleMetadata sampleMetadata,
+                                     final Map<T, DecileCollection> parameterToDecileCollectionMap,
+                                     final Class<T> parameterClass,
+ final String doubleFormat) {
+ super(
+ Utils.nonNull(sampleMetadata),
+ new ArrayList<>(parameterToDecileCollectionMap.entrySet()),
+ ParameterTableColumn.COLUMNS,
+ dataLine -> {
+ final String parameterName = dataLine.get(ParameterTableColumn.PARAMETER_NAME);
+ final T parameter = Enum.valueOf(Utils.nonNull(parameterClass), parameterName);
+ final DecileCollection deciles = parseDecilesFromDataLine(dataLine);
+ return new AbstractMap.SimpleEntry<>(parameter, deciles);},
+ (record, dataLine) -> {
+ final T parameter = record.getKey();
+ final DecileCollection deciles = record.getValue();
+ appendDecilesToDataLine(dataLine.append(parameter.toString()), deciles, doubleFormat);
+ }
+ );
+ this.parameterToDecileCollectionMap = parameterToDecileCollectionMap;
+ }
+
+ public ParameterDecileCollection(final File file,
+                                     final Class<T> parameterClass,
+ final String doubleFormat) {
+ super(
+ Utils.nonNull(file),
+ ParameterTableColumn.COLUMNS,
+ dataLine -> {
+ final String parameterName = dataLine.get(ParameterTableColumn.PARAMETER_NAME);
+ final T parameter = Enum.valueOf(Utils.nonNull(parameterClass), parameterName);
+ final DecileCollection deciles = parseDecilesFromDataLine(dataLine);
+ return new AbstractMap.SimpleEntry<>(parameter, deciles);},
+ (record, dataLine) -> {
+ final T parameter = record.getKey();
+ final DecileCollection deciles = record.getValue();
+ dataLine.append(parameter.toString());
+ appendDecilesToDataLine(dataLine, deciles, doubleFormat);
+ }
+ );
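+        //reconstruct the parameter-to-deciles map from the records read from the file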
+ parameterToDecileCollectionMap = getRecords().stream().collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
+ }
+
+ public DecileCollection getDeciles(final T parameter) {
+ return parameterToDecileCollectionMap.get(parameter);
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/RecordCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/RecordCollection.java
index 0f9e2af1fe9..c9ae2837fb0 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/RecordCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/RecordCollection.java
@@ -36,10 +36,10 @@ public abstract class RecordCollection<RECORD> {
* @param recordFromDataLineDecoder lambda for decoding a record from a {@link DataLine} when reading from a TSV file
* @param recordToDataLineEncoder lambda for encoding a record to a {@link DataLine} when writing to a TSV file
*/
-    protected RecordCollection(final List<RECORD> records,
-                               final TableColumnCollection mandatoryColumns,
-                               final Function<DataLine, RECORD> recordFromDataLineDecoder,
-                               final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
+    RecordCollection(final List<RECORD> records,
+                     final TableColumnCollection mandatoryColumns,
+                     final Function<DataLine, RECORD> recordFromDataLineDecoder,
+                     final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
this.records = ImmutableList.copyOf(Utils.nonNull(records));
this.mandatoryColumns = Utils.nonNull(mandatoryColumns);
this.recordFromDataLineDecoder = Utils.nonNull(recordFromDataLineDecoder);
@@ -56,10 +56,10 @@ protected RecordCollection(final List<RECORD> records,
* @param recordFromDataLineDecoder lambda for decoding a record from a {@link DataLine} when reading from a TSV file
* @param recordToDataLineEncoder lambda for encoding a record to a {@link DataLine} when writing to a TSV file
*/
-    protected RecordCollection(final File inputFile,
-                               final TableColumnCollection mandatoryColumns,
-                               final Function<DataLine, RECORD> recordFromDataLineDecoder,
-                               final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
+    RecordCollection(final File inputFile,
+                     final TableColumnCollection mandatoryColumns,
+                     final Function<DataLine, RECORD> recordFromDataLineDecoder,
+                     final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
IOUtils.canReadFile(inputFile);
this.mandatoryColumns = Utils.nonNull(mandatoryColumns);
this.recordFromDataLineDecoder = Utils.nonNull(recordFromDataLineDecoder);
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleLocatableCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleLocatableCollection.java
index 2263130a9d9..925ac1de213 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleLocatableCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleLocatableCollection.java
@@ -35,11 +35,11 @@ public abstract class SampleLocatableCollection extend
/**
* Records are sorted using {@code LEXICOGRAPHICAL_ORDER_COMPARATOR}.
*/
-    protected SampleLocatableCollection(final SampleMetadata sampleMetadata,
-                                        final List<RECORD> records,
-                                        final TableColumnCollection mandatoryColumns,
-                                        final Function<DataLine, RECORD> recordFromDataLineDecoder,
-                                        final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
+    SampleLocatableCollection(final SampleMetadata sampleMetadata,
+                              final List<RECORD> records,
+                              final TableColumnCollection mandatoryColumns,
+                              final Function<DataLine, RECORD> recordFromDataLineDecoder,
+                              final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
super(
sampleMetadata,
Utils.nonNull(records).stream().sorted(LEXICOGRAPHICAL_ORDER_COMPARATOR).collect(Collectors.toList()),
@@ -52,10 +52,10 @@ protected SampleLocatableCollection(final SampleMetadata sampleMetadata,
/**
* @throws IllegalArgumentException if records are not sorted using {@code LEXICOGRAPHICAL_ORDER_COMPARATOR}
*/
-    protected SampleLocatableCollection(final File inputFile,
-                                        final TableColumnCollection mandatoryColumns,
-                                        final Function<DataLine, RECORD> recordFromDataLineDecoder,
-                                        final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
+    SampleLocatableCollection(final File inputFile,
+                              final TableColumnCollection mandatoryColumns,
+                              final Function<DataLine, RECORD> recordFromDataLineDecoder,
+                              final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
super(inputFile, mandatoryColumns, recordFromDataLineDecoder, recordToDataLineEncoder);
validateIntervals(getSampleName(), getRecords());
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleRecordCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleRecordCollection.java
index 9eecc6d8b6d..e02bd1db06e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleRecordCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleRecordCollection.java
@@ -41,11 +41,11 @@ public abstract class SampleRecordCollection<RECORD> implements SampleMetadata {
* @param recordFromDataLineDecoder lambda for decoding a record from a {@link DataLine} when reading from a TSV file
* @param recordToDataLineEncoder lambda for encoding a record to a {@link DataLine} when writing to a TSV file
*/
-    protected SampleRecordCollection(final SampleMetadata sampleMetadata,
-                                     final List<RECORD> records,
-                                     final TableColumnCollection mandatoryColumns,
-                                     final Function<DataLine, RECORD> recordFromDataLineDecoder,
-                                     final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
+    SampleRecordCollection(final SampleMetadata sampleMetadata,
+                           final List<RECORD> records,
+                           final TableColumnCollection mandatoryColumns,
+                           final Function<DataLine, RECORD> recordFromDataLineDecoder,
+                           final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
this.sampleMetadata = Utils.nonNull(sampleMetadata);
this.records = ImmutableList.copyOf(Utils.nonNull(records));
this.mandatoryColumns = Utils.nonNull(mandatoryColumns);
@@ -64,10 +64,10 @@ protected SampleRecordCollection(final SampleMetadata sampleMetadata,
* @param recordFromDataLineDecoder lambda for decoding a record from a {@link DataLine} when reading from a TSV file
* @param recordToDataLineEncoder lambda for encoding a record to a {@link DataLine} when writing to a TSV file
*/
-    protected SampleRecordCollection(final File inputFile,
-                                     final TableColumnCollection mandatoryColumns,
-                                     final Function<DataLine, RECORD> recordFromDataLineDecoder,
-                                     final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
+    SampleRecordCollection(final File inputFile,
+                           final TableColumnCollection mandatoryColumns,
+                           final Function<DataLine, RECORD> recordFromDataLineDecoder,
+                           final BiConsumer<RECORD, DataLine> recordToDataLineEncoder) {
IOUtils.canReadFile(inputFile);
this.mandatoryColumns = Utils.nonNull(mandatoryColumns);
this.recordFromDataLineDecoder = Utils.nonNull(recordFromDataLineDecoder);
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCountCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SimpleCountCollection.java
similarity index 95%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCountCollection.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SimpleCountCollection.java
index 97b8f178149..dbc7928879c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCountCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SimpleCountCollection.java
@@ -1,9 +1,9 @@
-package org.broadinstitute.hellbender.tools.copynumber.coverage.readcount;
+package org.broadinstitute.hellbender.tools.copynumber.formats.collections;
import org.broadinstitute.hdf5.HDF5File;
import org.broadinstitute.hdf5.HDF5LibException;
-import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AlleleFractionSegment.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AlleleFractionSegment.java
new file mode 100644
index 00000000000..621e2ddf5dc
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AlleleFractionSegment.java
@@ -0,0 +1,82 @@
+package org.broadinstitute.hellbender.tools.copynumber.formats.records;
+
+import htsjdk.samtools.util.Locatable;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.param.ParamUtils;
+
+import java.util.List;
+
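+/**
+ * Represents a segment of the genome and the number of allele-fraction points it contains.
+ */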
+public class AlleleFractionSegment implements Locatable {
+ private final SimpleInterval interval;
+ private final int numPoints;
+
+ public AlleleFractionSegment(final SimpleInterval interval,
+ final int numPoints) {
+ Utils.nonNull(interval);
+ ParamUtils.isPositiveOrZero(numPoints, "Number of points must be non-negative.");
+ this.interval = interval;
+ this.numPoints = numPoints;
+ }
+
+ public AlleleFractionSegment(final SimpleInterval interval,
+                                 final List<AllelicCount> allelicCounts) {
+ Utils.nonNull(interval);
+ Utils.nonNull(allelicCounts);
+ this.interval = interval;
+ numPoints = allelicCounts.size();
+ }
+
+ @Override
+ public String getContig() {
+ return interval.getContig();
+ }
+
+ @Override
+ public int getStart() {
+ return interval.getStart();
+ }
+
+ @Override
+ public int getEnd() {
+ return interval.getEnd();
+ }
+
+ public SimpleInterval getInterval() {
+ return interval;
+ }
+
+ public int getNumPoints() {
+ return numPoints;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+ final AlleleFractionSegment that = (AlleleFractionSegment) o;
+ return numPoints == that.numPoints &&
+ interval.equals(that.interval);
+ }
+
+ @Override
+ public int hashCode() {
+ int result;
+ result = interval.hashCode();
+ result = 31 * result + numPoints;
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ return "AlleleFractionSegment{" +
+ "interval=" + interval +
+ ", numPoints=" + numPoints +
+ '}';
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCount.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AllelicCount.java
similarity index 98%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCount.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AllelicCount.java
index 6bcb21d9093..c5db911c6db 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCount.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AllelicCount.java
@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount;
+package org.broadinstitute.hellbender.tools.copynumber.formats.records;
import htsjdk.samtools.util.Locatable;
import org.broadinstitute.hellbender.utils.Nucleotide;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotatedInterval.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AnnotatedInterval.java
similarity index 96%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotatedInterval.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AnnotatedInterval.java
index c06fcee370c..1bc35f57ed1 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotatedInterval.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AnnotatedInterval.java
@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.tools.copynumber.annotation;
+package org.broadinstitute.hellbender.tools.copynumber.formats.records;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.Feature;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotationSet.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AnnotationSet.java
similarity index 86%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotationSet.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AnnotationSet.java
index cba098ab86b..dd3a55beddb 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotationSet.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AnnotationSet.java
@@ -1,5 +1,6 @@
-package org.broadinstitute.hellbender.tools.copynumber.annotation;
+package org.broadinstitute.hellbender.tools.copynumber.formats.records;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.utils.Utils;
/**
@@ -7,7 +8,7 @@
*
* @author Samuel Lee <slee@broadinstitute.org>
*/
-public class AnnotationSet {
+public final class AnnotationSet {
/**
* If additional annotation fields are added here, then {@link AnnotatedIntervalCollection}
* should be updated accordingly.
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/CalledCopyRatioSegment.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CalledCopyRatioSegment.java
similarity index 90%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/CalledCopyRatioSegment.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CalledCopyRatioSegment.java
index 9f23254a4b4..178dab834a0 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/CalledCopyRatioSegment.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CalledCopyRatioSegment.java
@@ -1,6 +1,5 @@
-package org.broadinstitute.hellbender.tools.copynumber.coverage.caller;
+package org.broadinstitute.hellbender.tools.copynumber.formats.records;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation.CopyRatioSegment;
import org.broadinstitute.hellbender.utils.Utils;
public class CalledCopyRatioSegment extends CopyRatioSegment {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/copyratio/CopyRatio.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CopyRatio.java
similarity index 96%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/copyratio/CopyRatio.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CopyRatio.java
index 2e3a4cc4591..42dc271016e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/copyratio/CopyRatio.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CopyRatio.java
@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio;
+package org.broadinstitute.hellbender.tools.copynumber.formats.records;
import htsjdk.samtools.util.Locatable;
import org.broadinstitute.hellbender.utils.SimpleInterval;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/segmentation/CopyRatioSegment.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CopyRatioSegment.java
similarity index 85%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/segmentation/CopyRatioSegment.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CopyRatioSegment.java
index 94cce16acd9..fb55015449d 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/segmentation/CopyRatioSegment.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CopyRatioSegment.java
@@ -1,7 +1,6 @@
-package org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation;
+package org.broadinstitute.hellbender.tools.copynumber.formats.records;
import htsjdk.samtools.util.Locatable;
-import org.broadinstitute.hellbender.utils.GATKProtectedMathUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.param.ParamUtils;
@@ -24,13 +23,12 @@ public CopyRatioSegment(final SimpleInterval interval,
}
public CopyRatioSegment(final SimpleInterval interval,
- final List<Double> denoisedLog2CopyRatios) {
+ final List<CopyRatio> denoisedLog2CopyRatios) {
Utils.nonNull(interval);
Utils.nonNull(denoisedLog2CopyRatios);
this.interval = interval;
numPoints = denoisedLog2CopyRatios.size();
- final double meanCopyRatio = denoisedLog2CopyRatios.stream().mapToDouble(log2CR -> Math.pow(2., log2CR)).average().orElse(Double.NaN);
- meanLog2CopyRatio = Math.log(meanCopyRatio) * GATKProtectedMathUtils.INV_LOG_2;
+ meanLog2CopyRatio = denoisedLog2CopyRatios.stream().mapToDouble(CopyRatio::getLog2CopyRatioValue).average().orElse(Double.NaN);
}
@Override
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/multidimensional/model/ModeledSegment.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/ModeledSegment.java
similarity index 97%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/multidimensional/model/ModeledSegment.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/ModeledSegment.java
index abfad139019..f3e36851d40 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/multidimensional/model/ModeledSegment.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/ModeledSegment.java
@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.tools.copynumber.multidimensional.model;
+package org.broadinstitute.hellbender.tools.copynumber.formats.records;
import htsjdk.samtools.util.Locatable;
import org.broadinstitute.hellbender.utils.SimpleInterval;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/MultidimensionalSegment.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/MultidimensionalSegment.java
new file mode 100644
index 00000000000..c0cd8ac822c
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/MultidimensionalSegment.java
@@ -0,0 +1,125 @@
+package org.broadinstitute.hellbender.tools.copynumber.formats.records;
+
+import htsjdk.samtools.util.Locatable;
+import htsjdk.samtools.util.OverlapDetector;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+
+import java.util.List;
+import java.util.stream.Collectors;
+
+public class MultidimensionalSegment implements Locatable {
+ private final SimpleInterval interval;
+ private final int numPointsCopyRatio;
+ private final int numPointsAlleleFraction;
+ private final double meanLog2CopyRatio;
+
+ public MultidimensionalSegment(final SimpleInterval interval,
+ final int numPointsCopyRatio,
+ final int numPointsAlleleFraction,
+ final double meanLog2CopyRatio) {
+ Utils.nonNull(interval);
+ Utils.validateArg(numPointsCopyRatio > 0 || numPointsAlleleFraction > 0,
+ String.format("Number of copy-ratio points or number of allele-fraction points must be positive: %s", interval));
+ this.interval = interval;
+ this.numPointsCopyRatio = numPointsCopyRatio;
+ this.numPointsAlleleFraction = numPointsAlleleFraction;
+ this.meanLog2CopyRatio = meanLog2CopyRatio;
+ }
+
+ public MultidimensionalSegment(final SimpleInterval interval,
+ final List<CopyRatio> denoisedLog2CopyRatios,
+ final List<AllelicCount> allelicCounts) {
+ Utils.nonNull(interval);
+ Utils.nonNull(denoisedLog2CopyRatios);
+ Utils.nonNull(allelicCounts);
+ this.interval = interval;
+ numPointsCopyRatio = denoisedLog2CopyRatios.size();
+ numPointsAlleleFraction = allelicCounts.size();
+ meanLog2CopyRatio = new CopyRatioSegment(interval, denoisedLog2CopyRatios).getMeanLog2CopyRatio();
+ }
+
+ public MultidimensionalSegment(final SimpleInterval interval,
+ final OverlapDetector<CopyRatio> copyRatioMidpointOverlapDetector,
+ final OverlapDetector<AllelicCount> allelicCountOverlapDetector) {
+ this(
+ interval,
+ copyRatioMidpointOverlapDetector.getOverlaps(interval).stream()
+ .sorted(SampleLocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR)
+ .collect(Collectors.toList()),
+ allelicCountOverlapDetector.getOverlaps(interval).stream()
+ .sorted(SampleLocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR)
+ .collect(Collectors.toList()));
+ }
+
+ @Override
+ public String getContig() {
+ return interval.getContig();
+ }
+
+ @Override
+ public int getStart() {
+ return interval.getStart();
+ }
+
+ @Override
+ public int getEnd() {
+ return interval.getEnd();
+ }
+
+ public SimpleInterval getInterval() {
+ return interval;
+ }
+
+ public int getNumPointsCopyRatio() {
+ return numPointsCopyRatio;
+ }
+
+ public int getNumPointsAlleleFraction() {
+ return numPointsAlleleFraction;
+ }
+
+ public double getMeanLog2CopyRatio() {
+ return meanLog2CopyRatio;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ final MultidimensionalSegment that = (MultidimensionalSegment) o;
+
+ return numPointsCopyRatio == that.numPointsCopyRatio &&
+ numPointsAlleleFraction == that.numPointsAlleleFraction &&
+ Double.compare(that.meanLog2CopyRatio, meanLog2CopyRatio) == 0 &&
+ interval.equals(that.interval);
+ }
+
+ @Override
+ public int hashCode() {
+ int result;
+ long temp;
+ result = interval.hashCode();
+ result = 31 * result + numPointsCopyRatio;
+ result = 31 * result + numPointsAlleleFraction;
+ temp = Double.doubleToLongBits(meanLog2CopyRatio);
+ result = 31 * result + (int) (temp ^ (temp >>> 32));
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ return "MultidimensionalSegment{" +
+ "interval=" + interval +
+ ", numPointsCopyRatio=" + numPointsCopyRatio +
+ ", numPointsAlleleFraction=" + numPointsAlleleFraction +
+ ", meanLog2CopyRatio=" + meanLog2CopyRatio +
+ '}';
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCount.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/SimpleCount.java
similarity index 95%
rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCount.java
rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/SimpleCount.java
index a74b04dc5f8..7e99d2b1e9e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCount.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/SimpleCount.java
@@ -1,4 +1,4 @@
-package org.broadinstitute.hellbender.tools.copynumber.coverage.readcount;
+package org.broadinstitute.hellbender.tools.copynumber.formats.records;
import htsjdk.samtools.util.Locatable;
import org.broadinstitute.hellbender.utils.SimpleInterval;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionGlobalParameters.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionGlobalParameters.java
new file mode 100644
index 00000000000..806fcdcb19b
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionGlobalParameters.java
@@ -0,0 +1,66 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+/**
+ * Encapsulates the global parameters of the allele fraction model: the mean and variance of the common prior on
+ * allelic biases and the outlier probability.
+ *
+ * @author David Benjamin <davidben@broadinstitute.org>
+ */
+final class AlleleFractionGlobalParameters {
+ static final String DOUBLE_FORMAT = MultidimensionalModeller.DOUBLE_FORMAT;
+
+ private final double meanBias;
+ private final double biasVariance;
+ private final double outlierProbability;
+
+ AlleleFractionGlobalParameters(final double meanBias,
+ final double biasVariance,
+ final double outlierProbability) {
+ this.meanBias = meanBias;
+ this.biasVariance = biasVariance;
+ this.outlierProbability = outlierProbability;
+ }
+
+ double getMeanBias() {
+ return meanBias;
+ }
+
+ double getBiasVariance() {
+ return biasVariance;
+ }
+
+ double getOutlierProbability() {
+ return outlierProbability;
+ }
+
+ //get the gamma distribution alpha parameter
+ double getAlpha() {
+ return meanBias * meanBias / biasVariance;
+ }
+
+ //get the gamma distribution beta parameter
+ double getBeta() {
+ return meanBias / biasVariance;
+ }
+
+ AlleleFractionGlobalParameters copyWithNewMeanBias(final double newMeanBias) {
+ return new AlleleFractionGlobalParameters(newMeanBias, biasVariance, outlierProbability);
+ }
+
+ AlleleFractionGlobalParameters copyWithNewBiasVariance(final double newBiasVariance) {
+ return new AlleleFractionGlobalParameters(meanBias, newBiasVariance, outlierProbability);
+ }
+
+ AlleleFractionGlobalParameters copyWithNewOutlierProbability(final double newOutlierProbability) {
+ return new AlleleFractionGlobalParameters(meanBias, biasVariance, newOutlierProbability);
+ }
+
+ @Override
+ public String toString() {
+ return String.format("AlleleFractionGlobalParameters{" +
+ "meanBias=" + DOUBLE_FORMAT +
+ ", biasVariance=" + DOUBLE_FORMAT +
+ ", outlierProbability=" + DOUBLE_FORMAT +
+ '}', meanBias, biasVariance, outlierProbability);
+ }
+}
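
Aside: the getAlpha()/getBeta() accessors above re-express the (mean, variance) parameterization of the gamma prior on allelic bias as its natural (shape, rate) parameters, alpha = mu^2 / sigma^2 and beta = mu / sigma^2. A minimal standalone sanity check of that round trip (a sketch; the class name is hypothetical and not part of this change):

```java
// Sketch: a gamma distribution with shape alpha and rate beta has
// mean = alpha / beta and variance = alpha / beta^2, so the mapping used by
// AlleleFractionGlobalParameters should invert cleanly.
public final class GammaParameterizationCheck {
    public static void main(final String[] args) {
        final double meanBias = 1.1;        // mu, the prior mean of the allelic bias
        final double biasVariance = 0.05;   // sigma^2, the prior variance

        final double alpha = meanBias * meanBias / biasVariance;   // shape, as in getAlpha()
        final double beta = meanBias / biasVariance;               // rate, as in getBeta()

        System.out.printf("recovered mean = %.6f (expected %.6f)%n", alpha / beta, meanBias);
        System.out.printf("recovered variance = %.6f (expected %.6f)%n", alpha / (beta * beta), biasVariance);
    }
}
```
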
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionInitializer.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionInitializer.java
new file mode 100644
index 00000000000..287f1ad97f4
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionInitializer.java
@@ -0,0 +1,185 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+import org.apache.commons.math3.exception.MaxCountExceededException;
+import org.apache.commons.math3.special.Beta;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount;
+import org.broadinstitute.hellbender.utils.OptimizationUtils;
+import org.broadinstitute.hellbender.utils.Utils;
+
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * The allele-fraction model (after marginalizing latent parameters as described in docs/CNVs/CNV-methods.pdf)
+ * contains the following parameters:
+ * 1. minor-allele fractions for each segment
+ * 2. a global outlier probability
+ * 3. the mean allelic bias
+ * 4. the rate (mean / variance) of the allelic bias
+ *
+ * Note that 3 and 4 are hyperparameters specifying a gamma distribution prior on allelic bias -- the latent variables
+ * for bias at each het site have been marginalized but the hyperparameters have not.
+ *
+ * The allele-fraction model samples the distribution of these parameters using Markov chain Monte Carlo and in principle
+ * an initialization step is not necessary. However, in practice this initialization finds the mode of the posterior
+ * distributions in only a few iterations, whereas sampling would require many more. Thus we greatly reduce the
+ * number of burn-in samples that we must discard.
+ *
+ * The initialization is straightforward: first we set the minor fractions to reasonable guesses based on alt and ref
+ * counts, assuming no allelic bias. Then we numerically maximize the likelihood with respect to each parameter until
+ * the likelihood converges to a maximum. In practice this is the unique global maximum.
+ *
+ * @author David Benjamin <davidben@broadinstitute.org>
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+final class AlleleFractionInitializer {
+ private static final Logger logger = LogManager.getLogger(AlleleFractionInitializer.class);
+
+ private static final double INITIAL_OUTLIER_PROBABILITY = 0.01;
+ private static final double INITIAL_MEAN_BIAS = 1.0;
+ private static final double INITIAL_BIAS_VARIANCE = 0.05; //this is an overestimate, but starting small makes it slow for
+ //mean bias to escape a bad initial guess
+ private static final AlleleFractionGlobalParameters INITIAL_GLOBAL_PARAMETERS =
+ new AlleleFractionGlobalParameters(INITIAL_MEAN_BIAS, INITIAL_BIAS_VARIANCE, INITIAL_OUTLIER_PROBABILITY);
+
+ private static final double LOG_LIKELIHOOD_CONVERGENCE_THRESHOLD = 0.5;
+ private static final int MAX_ITERATIONS = 50;
+
+ //define maxima of search intervals for maximum likelihood -- parameter values above these would be ridiculous
+ static final double MAX_REASONABLE_OUTLIER_PROBABILITY = 0.15;
+ static final double MAX_REASONABLE_MEAN_BIAS = 5.0;
+ static final double MAX_REASONABLE_BIAS_VARIANCE = 0.5;
+ private static final double EPSILON_FOR_NEAR_MAX_WARNING = 1E-2;
+
+ //the minor-allele fraction of a segment must be less than one half by definition
+ private static final double MAX_MINOR_ALLELE_FRACTION = 0.5;
+
+ private final AlleleFractionSegmentedData data;
+ private AlleleFractionGlobalParameters globalParameters;
+ private AlleleFractionState.MinorFractions minorFractions;
+
+ /**
+ * This constructor performs the initialization.
+ */
+ AlleleFractionInitializer(final AlleleFractionSegmentedData data) {
+ this.data = Utils.nonNull(data);
+ globalParameters = INITIAL_GLOBAL_PARAMETERS;
+ minorFractions = calculateInitialMinorFractions(data);
+ double previousIterationLogLikelihood;
+ double nextIterationLogLikelihood = Double.NEGATIVE_INFINITY;
+ logger.info(String.format("Initializing allele-fraction model, iterating until log likelihood converges to within %.3f...",
+ LOG_LIKELIHOOD_CONVERGENCE_THRESHOLD));
+ int iteration = 1;
+ do {
+ previousIterationLogLikelihood = nextIterationLogLikelihood;
+ globalParameters = new AlleleFractionGlobalParameters(
+ estimateMeanBias(), estimateBiasVariance(), estimateOutlierProbability());
+ minorFractions = estimateMinorFractions();
+
+ nextIterationLogLikelihood = AlleleFractionLikelihoods.logLikelihood(globalParameters, minorFractions, data);
+ logger.info(String.format("Iteration %d, model log likelihood = %.3f...", iteration, nextIterationLogLikelihood));
+ logger.info(globalParameters);
+ iteration++;
+ } while (iteration < MAX_ITERATIONS &&
+ nextIterationLogLikelihood - previousIterationLogLikelihood > LOG_LIKELIHOOD_CONVERGENCE_THRESHOLD);
+ warnIfNearMax(AlleleFractionParameter.MEAN_BIAS.name, globalParameters.getMeanBias(), MAX_REASONABLE_MEAN_BIAS, EPSILON_FOR_NEAR_MAX_WARNING);
+ warnIfNearMax(AlleleFractionParameter.BIAS_VARIANCE.name, globalParameters.getBiasVariance(), MAX_REASONABLE_BIAS_VARIANCE, EPSILON_FOR_NEAR_MAX_WARNING);
+ warnIfNearMax(AlleleFractionParameter.OUTLIER_PROBABILITY.name, globalParameters.getOutlierProbability(), MAX_REASONABLE_OUTLIER_PROBABILITY, EPSILON_FOR_NEAR_MAX_WARNING);
+ }
+
+ private static void warnIfNearMax(final String parameterName,
+ final double value,
+ final double maxValue,
+ final double epsilon) {
+ if (maxValue - value < epsilon) {
+ logger.warn(String.format("The maximum-likelihood estimate for the global parameter %s (%s) was near its boundary (%s), " +
+ "the model is likely not a good fit to the data! Consider changing parameters for filtering homozygous sites.",
+ parameterName,
+ String.format(AlleleFractionGlobalParameters.DOUBLE_FORMAT, value),
+ String.format(AlleleFractionGlobalParameters.DOUBLE_FORMAT, maxValue)));
+ }
+ }
+
+ AlleleFractionState getInitializedState() {
+ return new AlleleFractionState(
+ globalParameters.getMeanBias(), globalParameters.getBiasVariance(), globalParameters.getOutlierProbability(), minorFractions);
+ }
+
+ /**
+ * Initialize minor fractions assuming no allelic bias.
+ *
+ * We integrate over f to get posterior probabilities (responsibilities) of alt / ref minor;
+ * that is, the responsibility of alt minor is int_{0 to 1/2} f^a (1 - f)^r df
+ * and the responsibility of ref minor is int_{0 to 1/2} f^r (1 - f)^a df.
+ * These are proportional to I(1/2, a + 1, r + 1) and I(1/2, r + 1, a + 1),
+ * respectively, where I is the (incomplete) regularized Beta function.
+ * By definition, these responsibilities sum to 1, i.e., they are already normalized.
+ *
+ * Finally, we set each minor fraction to the responsibility-weighted total count of
+ * reads in the minor allele divided by the total count of reads, ignoring outliers.
+ */
+ private AlleleFractionState.MinorFractions calculateInitialMinorFractions(final AlleleFractionSegmentedData data) {
+ final int numSegments = data.getNumSegments();
+ final AlleleFractionState.MinorFractions result = new AlleleFractionState.MinorFractions(numSegments);
+ for (int segment = 0; segment < numSegments; segment++) {
+ double responsibilityWeightedMinorAlleleReadCount = 0.0;
+ double responsibilityWeightedTotalReadCount = 0.0;
+ for (final AllelicCount count : data.getIndexedAllelicCountsInSegment(segment)) {
+ final int a = count.getAltReadCount();
+ final int r = count.getRefReadCount();
+ double altMinorResponsibility;
+ try {
+ altMinorResponsibility = Beta.regularizedBeta(0.5, a + 1, r + 1);
+ } catch (final MaxCountExceededException e) {
+ altMinorResponsibility = a < r ? 1.0 : 0.0; //if the special function can't be computed, give an all-or-nothing responsibility
+ }
+ responsibilityWeightedMinorAlleleReadCount += altMinorResponsibility * a + (1 - altMinorResponsibility) * r;
+ responsibilityWeightedTotalReadCount += a + r;
+ }
+
+ // we achieve a flat prior via a single pseudocount for minor and non-minor reads, hence the +1 and +2
+ result.add((responsibilityWeightedMinorAlleleReadCount + 1)/(responsibilityWeightedTotalReadCount + 2));
+ }
+ return result;
+ }
+
+ private double estimateOutlierProbability() {
+ final Function<Double, Double> objective = outlierProbability ->
+ AlleleFractionLikelihoods.logLikelihood(globalParameters.copyWithNewOutlierProbability(outlierProbability), minorFractions, data);
+ return OptimizationUtils.argmax(objective, 0.0, MAX_REASONABLE_OUTLIER_PROBABILITY, globalParameters.getOutlierProbability());
+ }
+
+ private double estimateMeanBias() {
+ final Function<Double, Double> objective = meanBias ->
+ AlleleFractionLikelihoods.logLikelihood(globalParameters.copyWithNewMeanBias(meanBias), minorFractions, data);
+ return OptimizationUtils.argmax(objective, 0.0, MAX_REASONABLE_MEAN_BIAS, globalParameters.getMeanBias());
+ }
+
+ private double estimateBiasVariance() {
+ final Function<Double, Double> objective = biasVariance ->
+ AlleleFractionLikelihoods.logLikelihood(globalParameters.copyWithNewBiasVariance(biasVariance), minorFractions, data);
+ return OptimizationUtils.argmax(objective, 0.0, MAX_REASONABLE_BIAS_VARIANCE, globalParameters.getBiasVariance());
+ }
+
+ private double estimateMinorFraction(final int segment) {
+ final Function<Double, Double> objective = minorFraction ->
+ AlleleFractionLikelihoods.segmentLogLikelihood(globalParameters, minorFraction, data.getIndexedAllelicCountsInSegment(segment));
+ return OptimizationUtils.argmax(objective, 0.0, MAX_MINOR_ALLELE_FRACTION, minorFractions.get(segment));
+ }
+
+ private AlleleFractionState.MinorFractions estimateMinorFractions() {
+ return new AlleleFractionState.MinorFractions(
+ IntStream.range(0, data.getNumSegments()).boxed()
+ .map(this::estimateMinorFraction)
+ .collect(Collectors.toList()));
+ }
+}
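
To make the initialization above concrete, here is a standalone sketch of the calculateInitialMinorFractions logic applied to one toy segment (the counts are invented for illustration; only org.apache.commons.math3, which this file already depends on, is assumed). The responsibility that the alt allele is the minor allele at a het with a alt and r ref reads is the regularized incomplete beta function I(1/2, a + 1, r + 1):

```java
import org.apache.commons.math3.special.Beta;

// Sketch of the initializer's responsibility-weighted minor-fraction guess
// for a single segment with four toy het sites.
public final class InitialMinorFractionSketch {
    public static void main(final String[] args) {
        final int[][] altRefCounts = {{10, 90}, {11, 93}, {88, 12}, {90, 10}};
        double responsibilityWeightedMinorCount = 0.;
        double totalCount = 0.;
        for (final int[] counts : altRefCounts) {
            final int a = counts[0];
            final int r = counts[1];
            // responsibility that alt is the minor allele, assuming no allelic bias
            final double altMinorResponsibility = Beta.regularizedBeta(0.5, a + 1, r + 1);
            responsibilityWeightedMinorCount += altMinorResponsibility * a + (1 - altMinorResponsibility) * r;
            totalCount += a + r;
        }
        // single pseudocount for a flat prior, as in calculateInitialMinorFractions
        final double initialMinorFraction = (responsibilityWeightedMinorCount + 1) / (totalCount + 2);
        System.out.printf("initial minor-allele fraction = %.4f%n", initialMinorFraction);   // ~0.11
    }
}
```
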
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionLikelihoods.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionLikelihoods.java
new file mode 100644
index 00000000000..40abdc00ff4
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionLikelihoods.java
@@ -0,0 +1,188 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+import org.apache.commons.math3.special.Gamma;
+import org.apache.commons.math3.util.FastMath;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount;
+import org.broadinstitute.hellbender.utils.GATKProtectedMathUtils;
+
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.stream.IntStream;
+
+import static org.apache.commons.math3.util.FastMath.sqrt;
+import static org.broadinstitute.hellbender.utils.MathUtils.log10Factorial;
+import static org.broadinstitute.hellbender.utils.MathUtils.log10ToLog;
+
+/**
+ * Contains likelihood methods for the allele-fraction model.
+ * See docs/CNVs/CNV-methods.pdf for a thorough description of the model.
+ *
+ * We can compute the log-likelihood of a alt reads and r ref reads given minor fraction f and gamma hyperparameters
+ * (specifying the distribution on allelic biases) mu (mean) and beta (rate = mean/variance) and given
+ * an alt minor, ref minor, or outlier indicator state. Note that this is a partially collapsed log-likelihood in that the
+ * latent variable corresponding to the allelic bias at this site has been marginalized out but the indicator
+ * variable has not been marginalized out.
+ *
+ * See docs/CNVs/CNV-methods.pdf for derivation.
+ *
+ * Finally, note that this is a static method and does not get mu, beta, and minorFraction from an AlleleFractionState object.
+ * We need such functionality because MCMC evaluates the likelihood under proposed parameter changes.
+ *
+ * if indicator == ALT_MINOR:
+ *
+ * log { [beta^alpha / Gamma(alpha)][(1 - pi) / 2] * int_{0 to infty} f^a * (1 - f)^r * lambda^(alpha + r - 1) * exp(-beta * lambda)/(f + (1 - f) * lambda)^n d lambda }
+ *
+ * if indicator == REF_MINOR, same as ALT_MINOR but with f <--> 1 - f
+ *
+ * if indicator == OUTLIER log {pi * a!r!/(n+1)!}
+ *
+ * where alpha = mu*beta and n = a + r.
+ *
+ * @author David Benjamin <davidben@broadinstitute.org>
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+final class AlleleFractionLikelihoods {
+ private static final double EPSILON = 1E-10;
+
+ private static final FunctionCache logGammaCache = new FunctionCache(Gamma::logGamma);
+ private static final FunctionCache logCache = new FunctionCache(FastMath::log);
+
+ private static final class FunctionCache extends LinkedHashMap<Double, Double> {
+ private static final long serialVersionUID = 19841647L;
+ private static final int MAX_SIZE = 100_000;
+
+ private final Function<Double, Double> mappingFunction;
+
+ FunctionCache(final Function<Double, Double> mappingFunction) {
+ this.mappingFunction = mappingFunction;
+ }
+
+ Double computeIfAbsent(final Double key) {
+ return super.computeIfAbsent(key, mappingFunction);
+ }
+
+ @Override
+ protected boolean removeEldestEntry(final Map.Entry<Double, Double> eldest) {
+ return size() >= MAX_SIZE;
+ }
+ }
+
+ private AlleleFractionLikelihoods() {}
+
+ static double segmentLogLikelihood(final AlleleFractionGlobalParameters parameters,
+ final double minorFraction,
+ final List<AllelicCount> allelicCountsInSegment) {
+ final double alpha = parameters.getAlpha();
+ final double beta = parameters.getBeta();
+ final double pi = parameters.getOutlierProbability();
+
+ //we compute some quantities that will be reused
+ final double logPi = logCache.computeIfAbsent(pi);
+ final double logNotPi = logCache.computeIfAbsent((1 - pi) / 2);
+ final double logcCommon = alpha * logCache.computeIfAbsent(beta) - logGammaCache.computeIfAbsent(alpha);
+ final double majorFraction = 1 - minorFraction;
+ final double logMinorFraction = log(minorFraction);
+ final double logMajorFraction = log(majorFraction);
+
+ double logLikelihood = 0.;
+ for (final AllelicCount allelicCount : allelicCountsInSegment) {
+ final int a = allelicCount.getAltReadCount();
+ final int r = allelicCount.getRefReadCount();
+ final int n = a + r;
+
+ //alt-minor calculation
+ final double lambda0AltMinor = biasPosteriorMode(alpha, beta, minorFraction, a, r);
+ final double kappaAltMinor = biasPosteriorCurvature(alpha, minorFraction, r, n, lambda0AltMinor);
+ final double rhoAltMinor = biasPosteriorEffectiveAlpha(lambda0AltMinor, kappaAltMinor);
+ final double tauAltMinor = biasPosteriorEffectiveBeta(lambda0AltMinor, kappaAltMinor);
+ final double logcAltMinor = logcCommon + a * logMinorFraction + r * logMajorFraction
+ + (r + alpha - rhoAltMinor) * log(lambda0AltMinor) + (tauAltMinor - beta) * lambda0AltMinor
+ - n * log(minorFraction + majorFraction * lambda0AltMinor);
+ final double altMinorLogLikelihood = logNotPi + logcAltMinor + Gamma.logGamma(rhoAltMinor) - rhoAltMinor * log(tauAltMinor);
+
+ //ref-minor calculation
+ final double lambda0RefMinor = biasPosteriorMode(alpha, beta, majorFraction, a, r);
+ final double kappaRefMinor = biasPosteriorCurvature(alpha, majorFraction, r, n, lambda0RefMinor);
+ final double rhoRefMinor = biasPosteriorEffectiveAlpha(lambda0RefMinor, kappaRefMinor);
+ final double tauRefMinor = biasPosteriorEffectiveBeta(lambda0RefMinor, kappaRefMinor);
+ final double logcRefMinor = logcCommon + a * logMajorFraction + r * logMinorFraction
+ + (r + alpha - rhoRefMinor) * log(lambda0RefMinor) + (tauRefMinor - beta) * lambda0RefMinor
+ - n * log(majorFraction + minorFraction * lambda0RefMinor);
+ final double refMinorLogLikelihood = logNotPi + logcRefMinor + Gamma.logGamma(rhoRefMinor) - rhoRefMinor * log(tauRefMinor);
+
+ final double outlierLogLikelihood = logPi + log10ToLog(log10Factorial(a) + log10Factorial(r) - log10Factorial(a + r + 1));
+
+ logLikelihood += GATKProtectedMathUtils.logSumExp(altMinorLogLikelihood, refMinorLogLikelihood, outlierLogLikelihood);
+ }
+ return logLikelihood;
+ }
+
+ /**
+ * The total log likelihood of all segments.
+ */
+ static double logLikelihood(final AlleleFractionGlobalParameters parameters,
+ final AlleleFractionState.MinorFractions minorFractions,
+ final AlleleFractionSegmentedData data) {
+ return IntStream.range(0, data.getNumSegments())
+ .mapToDouble(segment -> segmentLogLikelihood(parameters, minorFractions.get(segment), data.getIndexedAllelicCountsInSegment(segment)))
+ .sum();
+ }
+
+ /**
+ * Calculates the mode of the exact allelic-bias posterior at given values of the hyperparameters for the
+ * allelic-bias Gamma-distribution prior, the minor-allele fraction parameter, and the observed
+ * counts at a site. See docs/CNVs/CNV-methods.pdf (where this quantity is referred to as lambda_0) for details.
+ * @param alpha alpha hyperparameter for allelic-bias Gamma-distribution prior
+ * @param beta beta hyperparameter for allelic-bias Gamma-distribution prior
+ * @param f minor-allele fraction
+ * @param a alt counts
+ * @param r ref counts
+ */
+ private static double biasPosteriorMode(final double alpha, final double beta, final double f, final int a, final int r) {
+ final double w = (1 - f) * (a - alpha + 1) + beta * f;
+ return Math.max((sqrt(w * w + 4 * beta * f * (1 - f) * (r + alpha - 1)) - w) / (2 * beta * (1 - f)), EPSILON);
+ }
+
+ /**
+ * Calculates the curvature (second derivative at the mode) of the exact allelic-bias log posterior
+ * at given values of the hyperparameters for the allelic-bias Gamma-distribution prior,
+ * the minor-allele fraction parameter, and the observed counts at a site.
+ * See docs/CNVs/CNV-methods.pdf (where this quantity is referred to as kappa) for details.
+ * @param alpha alpha hyperparameter for allelic-bias Gamma-distribution prior
+ * @param f minor-allele fraction
+ * @param r ref counts
+ * @param n total counts
+ * @param lambda0 mode of allelic-bias posterior
+ */
+ private static double biasPosteriorCurvature(final double alpha, final double f, final int r, final int n, final double lambda0) {
+ final double y = (1 - f) / (f + (1 - f) * lambda0);
+ return n * y * y - (r + alpha - 1) / (lambda0 * lambda0);
+ }
+
+ /**
+ * Calculates the effective alpha hyperparameter for the Gamma-distribution approximation of the exact allelic-bias posterior.
+ * See docs/CNVs/CNV-methods.pdf (where this quantity is referred to as rho) for details.
+ * @param lambda0 mode of allelic-bias posterior
+ * @param kappa curvature of allelic-bias posterior
+ */
+ private static double biasPosteriorEffectiveAlpha(final double lambda0, final double kappa) {
+ return Math.max(1 - kappa * lambda0 * lambda0, EPSILON);
+ }
+
+ /**
+ * Calculates the effective beta hyperparameter for the Gamma-distribution approximation of the exact allelic-bias posterior.
+ * See docs/CNVs/CNV-methods.pdf (where this quantity is referred to as tau) for details.
+ * @param lambda0 mode of allelic-bias posterior
+ * @param kappa curvature of allelic-bias posterior
+ */
+ private static double biasPosteriorEffectiveBeta(final double lambda0, final double kappa) {
+ return Math.max(-kappa * lambda0, EPSILON);
+ }
+
+ private static double log(final double x) {
+ return FastMath.log(Math.max(EPSILON, x));
+ }
+}
\ No newline at end of file
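
For readers without docs/CNVs/CNV-methods.pdf at hand, the ALT_MINOR term spelled out in the class javadoc above can be typeset as follows (a transcription of the javadoc, with n = a + r and alpha = mu * beta):

```latex
\log\left[\frac{\beta^{\alpha}}{\Gamma(\alpha)}\,\frac{1-\pi}{2}
  \int_{0}^{\infty}
    \frac{f^{a}\,(1-f)^{r}\,\lambda^{\alpha+r-1}\,e^{-\beta\lambda}}
         {\left(f+(1-f)\lambda\right)^{n}}
  \,d\lambda\right]
```

The REF_MINOR term is identical with f and 1 - f exchanged, and the OUTLIER term is log[pi * a! r! / (n + 1)!]; segmentLogLikelihood combines the three per-site terms with logSumExp.
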
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionModeller.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionModeller.java
new file mode 100644
index 00000000000..d097d427392
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionModeller.java
@@ -0,0 +1,193 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ParameterDecileCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.ModeledSegment;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.mcmc.DecileCollection;
+import org.broadinstitute.hellbender.utils.mcmc.GibbsSampler;
+import org.broadinstitute.hellbender.utils.mcmc.ParameterSampler;
+import org.broadinstitute.hellbender.utils.mcmc.ParameterizedModel;
+
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * Given segments and counts of alt and ref reads over a list of het sites,
+ * infers the minor-allele fraction of each segment. For example, a segment
+ * with (alt,ref) counts (10,90), (11,93), (88,12), (90,10) probably has a minor-allele fraction
+ * somewhere around 0.1. The model takes into account allelic bias due to mapping etc. by learning
+ * a global gamma distribution on allelic bias ratios.
+ *
+ * We define the bias ratio of each het locus to be the expected ratio of
+ * mapped ref reads to mapped alt reads given equal amounts of DNA (that is, given
+ * a germline het). The model learns a common gamma distribution:
+ * bias ratio ~ Gamma(alpha = mu^2 / sigma^2, beta = mu / sigma^2)
+ * where mu and sigma^2 are the global mean and variance of bias ratios, and
+ * alpha, beta are the natural parameters of the gamma distribution.
+ *
+ *
+ * Each segment has a minor-allele fraction f, and for each het within the locus
+ * the number of alt reads is drawn from a binomial distribution with total count
+ * n = #alt reads + #ref reads and alt probability f / (f + (1 - f) * bias ratio) if the
+ * locus is alt minor and (1 - f) / (1 - f + f * bias ratio) if the locus is ref minor.
+ *
+ *
+ * Conceptually, the model contains latent variables corresponding to the bias ratio
+ * and indicators for alt minor/ref minor at each het locus. However, we integrate them
+ * out and the MCMC model below only contains the minor-allele fractions and
+ * the three hyperparameters of the model: the two parameters of the gamma distribution
+ * along with the global outlier probability.
+ *
+ * See docs/CNVs/CNV-methods.pdf for a thorough description of the model.
+ *
+ * @author David Benjamin <davidben@broadinstitute.org>
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+public final class AlleleFractionModeller {
+ private static final String DOUBLE_FORMAT = MultidimensionalModeller.DOUBLE_FORMAT;
+
+ private static final double MAX_REASONABLE_MEAN_BIAS = AlleleFractionInitializer.MAX_REASONABLE_MEAN_BIAS;
+ private static final double MAX_REASONABLE_BIAS_VARIANCE = AlleleFractionInitializer.MAX_REASONABLE_BIAS_VARIANCE;
+ private static final double MAX_REASONABLE_OUTLIER_PROBABILITY = AlleleFractionInitializer.MAX_REASONABLE_OUTLIER_PROBABILITY;
+ private static final double MIN_MINOR_FRACTION_SAMPLING_WIDTH = 1E-3;
+
+ private final SampleMetadata sampleMetadata;
+ private final ParameterizedModel<AlleleFractionParameter, AlleleFractionState, AlleleFractionSegmentedData> model;
+
+ private final List<Double> meanBiasSamples = new ArrayList<>();
+ private final List<Double> biasVarianceSamples = new ArrayList<>();
+ private final List<Double> outlierProbabilitySamples = new ArrayList<>();
+ private final List<AlleleFractionState.MinorFractions> minorFractionsSamples = new ArrayList<>();
+
+ /**
+ * Constructs an allele-fraction model given allelic counts and segments.
+ * {@link AlleleFractionInitializer} is used for initialization and slice-sampling widths are estimated.
+ */
+ AlleleFractionModeller(final AllelicCountCollection allelicCounts,
+ final List<SimpleInterval> segments,
+ final AlleleFractionPrior prior) {
+ Utils.nonNull(allelicCounts);
+ Utils.nonEmpty(segments);
+ Utils.nonNull(prior);
+
+ sampleMetadata = allelicCounts.getSampleMetadata();
+ final AlleleFractionSegmentedData data = new AlleleFractionSegmentedData(allelicCounts, segments);
+
+ //initialization gets us to the mode of the likelihood
+ final AlleleFractionState initialState = new AlleleFractionInitializer(data).getInitializedState();
+ final AlleleFractionGlobalParameters initialParameters = initialState.globalParameters();
+ final AlleleFractionState.MinorFractions initialMinorFractions = initialState.minorFractions();
+
+ //if we approximate conditionals as normal, we can guess the width from the curvature at the mode and use as the slice-sampling widths
+ final double meanBiasSamplingWidths = approximatePosteriorWidthAtMode(meanBias ->
+ AlleleFractionLikelihoods.logLikelihood(initialParameters.copyWithNewMeanBias(meanBias), initialMinorFractions, data), initialParameters.getMeanBias());
+ final double biasVarianceSamplingWidths = approximatePosteriorWidthAtMode(biasVariance ->
+ AlleleFractionLikelihoods.logLikelihood(initialParameters.copyWithNewBiasVariance(biasVariance), initialMinorFractions, data), initialParameters.getBiasVariance());
+ final double outlierProbabilitySamplingWidths = approximatePosteriorWidthAtMode(outlierProbability ->
+ AlleleFractionLikelihoods.logLikelihood(initialParameters.copyWithNewOutlierProbability(outlierProbability), initialMinorFractions, data), initialParameters.getOutlierProbability());
+
+ final List<Double> minorFractionsSliceSamplingWidths = IntStream.range(0, data.getNumSegments()).boxed()
+ .map(segment -> approximatePosteriorWidthAtMode(
+ f -> AlleleFractionLikelihoods.segmentLogLikelihood(initialParameters, f, data.getIndexedAllelicCountsInSegment(segment)), initialMinorFractions.get(segment)))
+ .map(w -> Math.max(w, MIN_MINOR_FRACTION_SAMPLING_WIDTH))
+ .collect(Collectors.toList());
+
+ final ParameterSampler<Double, AlleleFractionParameter, AlleleFractionState, AlleleFractionSegmentedData> meanBiasSampler =
+ new AlleleFractionSamplers.MeanBiasSampler(MAX_REASONABLE_MEAN_BIAS, meanBiasSamplingWidths);
+ final ParameterSampler<Double, AlleleFractionParameter, AlleleFractionState, AlleleFractionSegmentedData> biasVarianceSampler =
+ new AlleleFractionSamplers.BiasVarianceSampler(MAX_REASONABLE_BIAS_VARIANCE, biasVarianceSamplingWidths);
+ final ParameterSampler<Double, AlleleFractionParameter, AlleleFractionState, AlleleFractionSegmentedData> outlierProbabilitySampler =
+ new AlleleFractionSamplers.OutlierProbabilitySampler(MAX_REASONABLE_OUTLIER_PROBABILITY, outlierProbabilitySamplingWidths);
+ final ParameterSampler<AlleleFractionState.MinorFractions, AlleleFractionParameter, AlleleFractionState, AlleleFractionSegmentedData> minorFractionsSampler =
+ new AlleleFractionSamplers.MinorFractionsSampler(prior, minorFractionsSliceSamplingWidths);
+
+ model = new ParameterizedModel.GibbsBuilder<>(initialState, data)
+ .addParameterSampler(AlleleFractionParameter.MEAN_BIAS, meanBiasSampler, Double.class)
+ .addParameterSampler(AlleleFractionParameter.BIAS_VARIANCE, biasVarianceSampler, Double.class)
+ .addParameterSampler(AlleleFractionParameter.OUTLIER_PROBABILITY, outlierProbabilitySampler, Double.class)
+ .addParameterSampler(AlleleFractionParameter.MINOR_ALLELE_FRACTIONS, minorFractionsSampler, AlleleFractionState.MinorFractions.class)
+ .build();
+ }
+
+ /**
+ * Adds {@code numSamples - numBurnIn} Markov-Chain Monte-Carlo samples of the parameter posteriors (generated using
+ * Gibbs sampling) to the collections held internally. The current {@link AlleleFractionState} held internally is used
+ * to initialize the Markov Chain.
+ * @param numSamples total number of samples per posterior
+ * @param numBurnIn number of burn-in samples to discard
+ */
+ void fitMCMC(final int numSamples, final int numBurnIn) {
+ //run MCMC
+ final GibbsSampler<AlleleFractionParameter, AlleleFractionState, AlleleFractionSegmentedData> gibbsSampler = new GibbsSampler<>(numSamples, model);
+ gibbsSampler.runMCMC();
+
+ //update posterior samples
+ meanBiasSamples.addAll(gibbsSampler.getSamples(AlleleFractionParameter.MEAN_BIAS, Double.class, numBurnIn));
+ biasVarianceSamples.addAll(gibbsSampler.getSamples(AlleleFractionParameter.BIAS_VARIANCE, Double.class, numBurnIn));
+ outlierProbabilitySamples.addAll(gibbsSampler.getSamples(AlleleFractionParameter.OUTLIER_PROBABILITY, Double.class, numBurnIn));
+ minorFractionsSamples.addAll(gibbsSampler.getSamples(AlleleFractionParameter.MINOR_ALLELE_FRACTIONS, AlleleFractionState.MinorFractions.class, numBurnIn));
+ }
+
+ List<Double> getMeanBiasSamples() {
+ return Collections.unmodifiableList(meanBiasSamples);
+ }
+
+ List<Double> getBiasVarianceSamples() {
+ return Collections.unmodifiableList(biasVarianceSamples);
+ }
+
+ List<Double> getOutlierProbabilitySamples() {
+ return Collections.unmodifiableList(outlierProbabilitySamples);
+ }
+
+ List<AlleleFractionState.MinorFractions> getMinorFractionsSamples() {
+ return Collections.unmodifiableList(minorFractionsSamples);
+ }
+
+ /**
+ * Should only be called after {@link #fitMCMC} has been called.
+ */
+ List<ModeledSegment.SimplePosteriorSummary> getMinorAlleleFractionsPosteriorSummaries() {
+ if (minorFractionsSamples.isEmpty()) {
+ throw new IllegalStateException("Attempted to get posterior summaries for minor-allele fractions before MCMC was performed.");
+ }
+ final int numSegments = minorFractionsSamples.get(0).size();
+ final List<ModeledSegment.SimplePosteriorSummary> posteriorSummaries = new ArrayList<>(numSegments);
+ for (int segment = 0; segment < numSegments; segment++) {
+ final int j = segment;
+ final List<Double> minorFractionSamples =
+ minorFractionsSamples.stream().map(s -> s.get(j)).collect(Collectors.toList());
+ posteriorSummaries.add(new ModeledSegment.SimplePosteriorSummary(minorFractionSamples));
+ }
+ return posteriorSummaries;
+ }
+
+ /**
+ * Should only be called after {@link #fitMCMC} has been called.
+ */
+ ParameterDecileCollection<AlleleFractionParameter> getGlobalParameterDeciles() {
+ if (meanBiasSamples.isEmpty()) {
+ throw new IllegalStateException("Attempted to get posterior summaries for global parameters before MCMC was performed.");
+ }
+ final Map<AlleleFractionParameter, DecileCollection> parameterToDecilesMap = new LinkedHashMap<>();
+ parameterToDecilesMap.put(AlleleFractionParameter.MEAN_BIAS, new DecileCollection(meanBiasSamples));
+ parameterToDecilesMap.put(AlleleFractionParameter.BIAS_VARIANCE, new DecileCollection(biasVarianceSamples));
+ parameterToDecilesMap.put(AlleleFractionParameter.OUTLIER_PROBABILITY, new DecileCollection(outlierProbabilitySamples));
+ return new ParameterDecileCollection<>(sampleMetadata, parameterToDecilesMap, AlleleFractionParameter.class, DOUBLE_FORMAT);
+ }
+
+ //use width of a probability distribution given the position of its mode (estimated from Gaussian approximation) as step size
+ private static double approximatePosteriorWidthAtMode(final Function<Double, Double> logPDF,
+ final double mode) {
+ final double absMode = Math.abs(mode);
+ final double epsilon = Math.min(1E-6, absMode / 2); //adjust scale if mode is very near zero
+ final double defaultWidth = absMode / 10; //if "mode" is not close to true mode of logPDF, approximation may not apply; just use 1 / 10 of absMode in this case
+ final double secondDerivative = (logPDF.apply(mode + epsilon) - 2 * logPDF.apply(mode) + logPDF.apply(mode - epsilon)) / (epsilon * epsilon);
+ return secondDerivative < 0 ? Math.sqrt(-1.0 / secondDerivative) : defaultWidth;
+ }
+}
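
approximatePosteriorWidthAtMode above is a Laplace-style approximation: it estimates the second derivative of the log posterior at the mode by central finite differences and returns sqrt(-1 / f''(mode)) as the slice-sampling width. A standalone sanity check (a sketch; the class name is hypothetical): for a Gaussian log density with standard deviation sigma, the estimate should recover sigma exactly.

```java
import java.util.function.Function;

// Sketch: for logPDF(x) = -(x - mode)^2 / (2 * sigma^2), the second derivative
// is -1 / sigma^2 everywhere, so sqrt(-1 / f'') = sigma.
public final class PosteriorWidthSketch {
    public static void main(final String[] args) {
        final double mode = 1.;
        final double sigma = 0.25;
        final Function<Double, Double> logPDF =
                x -> -(x - mode) * (x - mode) / (2 * sigma * sigma);   // additive constant omitted

        final double epsilon = 1E-6;
        final double secondDerivative =
                (logPDF.apply(mode + epsilon) - 2 * logPDF.apply(mode) + logPDF.apply(mode - epsilon))
                        / (epsilon * epsilon);
        final double width = Math.sqrt(-1. / secondDerivative);
        System.out.printf("estimated width = %.6f (expected %.6f)%n", width, sigma);
    }
}
```
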
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionParameter.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionParameter.java
new file mode 100644
index 00000000000..cff195ea12a
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionParameter.java
@@ -0,0 +1,21 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+import org.broadinstitute.hellbender.utils.mcmc.ParameterEnum;
+
+/**
+ * Enumerates the parameters for {@link AlleleFractionState}.
+ *
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+enum AlleleFractionParameter implements ParameterEnum {
+ MEAN_BIAS("AF_reference_bias_mean"),
+ BIAS_VARIANCE("AF_reference_bias_variance"),
+ OUTLIER_PROBABILITY("AF_outlier_probability"),
+ MINOR_ALLELE_FRACTIONS("AF_minor_allele_fractions");
+
+ final String name;
+
+ AlleleFractionParameter(final String name) {
+ this.name = name;
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionPrior.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionPrior.java
new file mode 100644
index 00000000000..db7ac3132f6
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionPrior.java
@@ -0,0 +1,23 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+import org.broadinstitute.hellbender.utils.Utils;
+
+/**
+ * Represents priors for the allele-fraction model.
+ *
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+public final class AlleleFractionPrior {
+ private final double minorAlleleFractionPriorAlpha;
+
+ public AlleleFractionPrior(final double minorAlleleFractionPriorAlpha) {
+ Utils.validateArg(minorAlleleFractionPriorAlpha >= 1,
+ "Alpha hyperparameter for the 4-parameter beta-distribution prior on " +
+ "segment minor-allele fraction must be greater than or equal to one.");
+ this.minorAlleleFractionPriorAlpha = minorAlleleFractionPriorAlpha;
+ }
+
+ double getMinorAlleleFractionPriorAlpha() {
+ return minorAlleleFractionPriorAlpha;
+ }
+}
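
As applied in AlleleFractionSamplers.MinorFractionsSampler below (with the beta hyperparameter fixed at 1), this alpha corresponds to a prior density on the segment minor-allele fraction f of roughly the following form; this is a reading of the sampler code, not a formula stated elsewhere in this change:

```latex
p(f) \;\propto\; \mathrm{Beta}(2f \mid \alpha, 1) \;=\; \alpha\,(2f)^{\alpha-1},
\qquad 0 \le f \le \tfrac{1}{2}
```

so alpha = 1 gives a flat prior and larger alpha increasingly favors balanced segments with f near 1/2.
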
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSamplers.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSamplers.java
new file mode 100644
index 00000000000..6b474e20aad
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSamplers.java
@@ -0,0 +1,184 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+import org.apache.commons.math3.distribution.BetaDistribution;
+import org.apache.commons.math3.random.RandomGenerator;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.utils.mcmc.ParameterSampler;
+import org.broadinstitute.hellbender.utils.mcmc.SliceSampler;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * Sampler classes for the allele-fraction model.
+ *
+ * @author David Benjamin <davidben@broadinstitute.org>
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+final class AlleleFractionSamplers {
+ private static final Logger logger = LogManager.getLogger(AlleleFractionSamplers.class);
+
+ private static final int NUM_POINTS_GLOBAL_SUBSAMPLE_THRESHOLD = 10000;
+ private static final int NUM_POINTS_SEGMENT_SUBSAMPLE_THRESHOLD = 1000;
+
+ private AlleleFractionSamplers() {}
+
+ static final class MeanBiasSampler implements ParameterSampler<Double, AlleleFractionParameter, AlleleFractionState, AlleleFractionSegmentedData> {
+ private static final double MIN_MEAN_BIAS = 0.;
+
+ private final double maxMeanBias;
+ private final double meanBiasSliceSamplingWidth;
+
+ MeanBiasSampler(final double maxMeanBias,
+ final double meanBiasSliceSamplingWidth) {
+ this.maxMeanBias = maxMeanBias;
+ this.meanBiasSliceSamplingWidth = meanBiasSliceSamplingWidth;
+ }
+
+ @Override
+ public Double sample(final RandomGenerator rng,
+ final AlleleFractionState state,
+ final AlleleFractionSegmentedData data) {
+ logger.debug("Sampling mean bias...");
+ final Function<AlleleFractionGlobalParameters, Double> logLikelihoodEstimate = logLikelihoodFromSubsample(
+ rng, state.minorFractions(), data, NUM_POINTS_GLOBAL_SUBSAMPLE_THRESHOLD);
+ return new SliceSampler(rng,
+ x -> logLikelihoodEstimate.apply(state.globalParameters().copyWithNewMeanBias(x)),
+ MIN_MEAN_BIAS, maxMeanBias, meanBiasSliceSamplingWidth)
+ .sample(state.meanBias());
+ }
+ }
+
+ static final class BiasVarianceSampler implements ParameterSampler<Double, AlleleFractionParameter, AlleleFractionState, AlleleFractionSegmentedData> {
+ private static final double MIN_BIAS_VARIANCE = 1E-10;
+
+ private final double maxBiasVariance;
+ private final double biasVarianceSliceSamplingWidth;
+
+ BiasVarianceSampler(final double maxBiasVariance,
+ final double biasVarianceSliceSamplingWidth) {
+ this.maxBiasVariance = maxBiasVariance;
+ this.biasVarianceSliceSamplingWidth = biasVarianceSliceSamplingWidth;
+ }
+
+ @Override
+ public Double sample(final RandomGenerator rng,
+ final AlleleFractionState state,
+ final AlleleFractionSegmentedData data) {
+ logger.debug("Sampling bias variance...");
+ final Function<AlleleFractionGlobalParameters, Double> logLikelihoodEstimate = logLikelihoodFromSubsample(
+ rng, state.minorFractions(), data, NUM_POINTS_GLOBAL_SUBSAMPLE_THRESHOLD);
+ return new SliceSampler(rng,
+ x -> logLikelihoodEstimate.apply(state.globalParameters().copyWithNewBiasVariance(x)),
+ MIN_BIAS_VARIANCE, maxBiasVariance, biasVarianceSliceSamplingWidth)
+ .sample(state.biasVariance());
+ }
+ }
+
+ static final class OutlierProbabilitySampler implements ParameterSampler<Double, AlleleFractionParameter, AlleleFractionState, AlleleFractionSegmentedData> {
+ private static final double MIN_OUTLIER_PROBABILITY = 0.;
+
+ private final double maxOutlierProbability;
+ private final double outlierProbabilitySliceSamplingWidth;
+
+ OutlierProbabilitySampler(final double maxOutlierProbability,
+ final double outlierProbabilitySliceSamplingWidth) {
+ this.maxOutlierProbability = maxOutlierProbability;
+ this.outlierProbabilitySliceSamplingWidth = outlierProbabilitySliceSamplingWidth;
+ }
+
+ @Override
+ public Double sample(final RandomGenerator rng,
+ final AlleleFractionState state,
+ final AlleleFractionSegmentedData data) {
+ logger.debug("Sampling outlier probability...");
+ final Function<AlleleFractionGlobalParameters, Double> logLikelihoodEstimate = logLikelihoodFromSubsample(
+ rng, state.minorFractions(), data, NUM_POINTS_GLOBAL_SUBSAMPLE_THRESHOLD);
+ return new SliceSampler(rng,
+ x -> logLikelihoodEstimate.apply(state.globalParameters().copyWithNewOutlierProbability(x)),
+ MIN_OUTLIER_PROBABILITY, maxOutlierProbability, outlierProbabilitySliceSamplingWidth)
+ .sample(state.outlierProbability());
+ }
+ }
+
+ // sample minor fractions of all segments
+ static final class MinorFractionsSampler implements ParameterSampler<AlleleFractionState.MinorFractions, AlleleFractionParameter, AlleleFractionState, AlleleFractionSegmentedData> {
+ private static final double MIN_MINOR_FRACTION = 0.;
+ private static final double MAX_MINOR_FRACTION = 0.5;
+ private static final double PRIOR_BETA = 1.;
+
+ private final Function<Double, Double> logPrior;
+ private final List<Double> sliceSamplingWidths;
+
+ MinorFractionsSampler(final AlleleFractionPrior prior,
+ final List<Double> sliceSamplingWidths) {
+ logPrior = f -> new BetaDistribution(null, prior.getMinorAlleleFractionPriorAlpha(), PRIOR_BETA).logDensity(2 * f);
+ this.sliceSamplingWidths = sliceSamplingWidths;
+ }
+
+ @Override
+ public AlleleFractionState.MinorFractions sample(final RandomGenerator rng, final AlleleFractionState state, final AlleleFractionSegmentedData data) {
+ final List<Double> minorFractions = new ArrayList<>(data.getNumSegments());
+ for (int segment = 0; segment < data.getNumSegments(); segment++) {
+ logger.debug(String.format("Sampling minor fraction for segment %d...", segment));
+ final List<AlleleFractionSegmentedData.IndexedAllelicCount> allelicCountsInSegment =
+ data.getIndexedAllelicCountsInSegment(segment);
+ if (allelicCountsInSegment.isEmpty()) {
+ minorFractions.add(Double.NaN);
+ } else {
+ final Function<Double, Double> segmentLogLikelihoodEstimate = segmentLogLikelihoodFromSubsample(
+ rng, state.globalParameters(), allelicCountsInSegment, NUM_POINTS_SEGMENT_SUBSAMPLE_THRESHOLD);
+ final SliceSampler sampler = new SliceSampler(rng,
+ f -> logPrior.apply(f) + segmentLogLikelihoodEstimate.apply(f),
+ MIN_MINOR_FRACTION, MAX_MINOR_FRACTION, sliceSamplingWidths.get(segment));
+ minorFractions.add(sampler.sample(state.segmentMinorFraction(segment)));
+ }
+ }
+ return new AlleleFractionState.MinorFractions(minorFractions);
+ }
+ }
+
+ private static List<AlleleFractionSegmentedData.IndexedAllelicCount> subsample(final RandomGenerator rng,
+ final List<AlleleFractionSegmentedData.IndexedAllelicCount> allelicCounts,
+ final int numPointsSubsampleThreshold) {
+ //subsample the data if we are above the threshold
+ return allelicCounts.size() > numPointsSubsampleThreshold
+ ? IntStream.range(0, numPointsSubsampleThreshold).boxed().map(i -> rng.nextInt(allelicCounts.size())).map(allelicCounts::get).collect(Collectors.toList())
+ : allelicCounts;
+ }
+
+ private static Function<AlleleFractionGlobalParameters, Double> logLikelihoodFromSubsample(final RandomGenerator rng,
+ final AlleleFractionState.MinorFractions minorFractions,
+ final AlleleFractionSegmentedData data,
+ final int numPointsSubsampleThreshold) {
+ final List<AlleleFractionSegmentedData.IndexedAllelicCount> subsampledAllelicCounts =
+ subsample(rng, data.getIndexedAllelicCounts(), numPointsSubsampleThreshold);
+ final double scalingFactor = (double) data.getNumPoints() / subsampledAllelicCounts.size();
+ final Map<Integer, List<AlleleFractionSegmentedData.IndexedAllelicCount>> segmentIndexToSubsampledAllelicCountsInSegmentMap =
+ subsampledAllelicCounts.stream()
+ .collect(Collectors.groupingBy(AlleleFractionSegmentedData.IndexedAllelicCount::getSegmentIndex, Collectors.toList()));
+ return parameters -> {
+ double logLikelihood = 0.;
+ for (final int segmentIndex : segmentIndexToSubsampledAllelicCountsInSegmentMap.keySet()) {
+ logLikelihood += AlleleFractionLikelihoods.segmentLogLikelihood(
+ parameters, minorFractions.get(segmentIndex), segmentIndexToSubsampledAllelicCountsInSegmentMap.get(segmentIndex));
+ }
+ return scalingFactor * logLikelihood;
+ };
+ }
+
+ private static Function<Double, Double> segmentLogLikelihoodFromSubsample(final RandomGenerator rng,
+ final AlleleFractionGlobalParameters parameters,
+ final List<AlleleFractionSegmentedData.IndexedAllelicCount> allelicCountsInSegment,
+ final int numPointsSubsampleThreshold) {
+ final List<AlleleFractionSegmentedData.IndexedAllelicCount> subsampledAllelicCountsInSegment =
+ subsample(rng, allelicCountsInSegment, numPointsSubsampleThreshold);
+ final double scalingFactor = (double) allelicCountsInSegment.size() / subsampledAllelicCountsInSegment.size();
+ return minorFraction -> scalingFactor * AlleleFractionLikelihoods.segmentLogLikelihood(parameters, minorFraction, subsampledAllelicCountsInSegment);
+ }
+}
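
The subsampling helpers above rely on the log-likelihood being a sum of independent per-point terms: summing over a random subsample of n points and scaling by N / n estimates the full N-point sum, which keeps slice sampling cheap on large segments. A standalone illustration with synthetic per-point terms (a sketch; the class name and the toy data are hypothetical):

```java
import java.util.List;
import java.util.Random;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

// Sketch: the scaled subsample sum approximates the exact sum of per-point
// log-likelihood terms, mirroring logLikelihoodFromSubsample above.
public final class SubsampleScalingSketch {
    public static void main(final String[] args) {
        final Random rng = new Random(19);
        final List<Double> perPointTerms = IntStream.range(0, 100_000)
                .mapToObj(i -> -rng.nextDouble())
                .collect(Collectors.toList());

        final int numPointsSubsample = 1_000;
        final double subsampleSum = IntStream.range(0, numPointsSubsample)
                .mapToDouble(i -> perPointTerms.get(rng.nextInt(perPointTerms.size())))
                .sum();
        final double scalingFactor = (double) perPointTerms.size() / numPointsSubsample;

        final double exactSum = perPointTerms.stream().mapToDouble(Double::doubleValue).sum();
        System.out.printf("estimate = %.1f, exact = %.1f%n", scalingFactor * subsampleSum, exactSum);
    }
}
```
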
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSegmentedData.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSegmentedData.java
new file mode 100644
index 00000000000..afd35c6e28e
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSegmentedData.java
@@ -0,0 +1,100 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+import htsjdk.samtools.util.OverlapDetector;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount;
+import org.broadinstitute.hellbender.utils.IndexRange;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.mcmc.DataCollection;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * {@link DataCollection} for the allele-fraction model containing the het alt and ref counts grouped by segment.
+ *
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+final class AlleleFractionSegmentedData implements DataCollection {
+ private final AllelicCountCollection allelicCounts;
+ private final List<SimpleInterval> segments;
+
+ private final List<IndexedAllelicCount> indexedAllelicCounts;
+ private final List<IndexRange> indexRangesPerSegment;
+
+ AlleleFractionSegmentedData(final AllelicCountCollection allelicCounts,
+ final List<SimpleInterval> segments) {
+ this.allelicCounts = Utils.nonNull(allelicCounts);
+ this.segments = Utils.nonEmpty(segments).stream().sorted(SampleLocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR).collect(Collectors.toList());
+
+ indexedAllelicCounts = new ArrayList<>(allelicCounts.size());
+ indexRangesPerSegment = new ArrayList<>(segments.size());
+
+ final OverlapDetector<AllelicCount> allelicCountOverlapDetector = allelicCounts.getOverlapDetector();
+ int startIndex = 0;
+ for (int segmentIndex = 0; segmentIndex < segments.size(); segmentIndex++) {
+ final SimpleInterval segment = segments.get(segmentIndex);
+ final List<AllelicCount> allelicCountsInSegment = allelicCountOverlapDetector.getOverlaps(segment).stream()
+ .sorted(SampleLocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR)
+ .collect(Collectors.toList());
+ final int segmentStartIndex = startIndex;
+ final int si = segmentIndex;
+ IntStream.range(0, allelicCountsInSegment.size()).boxed()
+ .map(i -> new IndexedAllelicCount(allelicCountsInSegment.get(i), segmentStartIndex + i, si))
+ .forEach(indexedAllelicCounts::add);
+ indexRangesPerSegment.add(new IndexRange(segmentStartIndex, segmentStartIndex + allelicCountsInSegment.size()));
+ startIndex += allelicCountsInSegment.size();
+ }
+ }
+
+ AllelicCountCollection getAllelicCounts() {
+ return allelicCounts;
+ }
+
+ List<SimpleInterval> getSegments() {
+ return Collections.unmodifiableList(segments);
+ }
+
+ int getNumSegments() {
+ return segments.size();
+ }
+
+ int getNumPoints() {
+ return allelicCounts.size();
+ }
+
+ List<IndexedAllelicCount> getIndexedAllelicCounts() {
+ return Collections.unmodifiableList(indexedAllelicCounts);
+ }
+
+ List<IndexedAllelicCount> getIndexedAllelicCountsInSegment(final int segmentIndex) {
+ return Collections.unmodifiableList(indexedAllelicCounts.subList(
+ indexRangesPerSegment.get(segmentIndex).from, indexRangesPerSegment.get(segmentIndex).to));
+ }
+
+ static final class IndexedAllelicCount extends AllelicCount {
+ private final int index;
+ private final int segmentIndex;
+
+ private IndexedAllelicCount(final AllelicCount allelicCount,
+ final int index,
+ final int segmentIndex) {
+ super(allelicCount.getInterval(), allelicCount.getRefReadCount(), allelicCount.getAltReadCount(), allelicCount.getRefNucleotide(), allelicCount.getAltNucleotide());
+ this.index = index;
+ this.segmentIndex = segmentIndex;
+ }
+
+ int getIndex() {
+ return index;
+ }
+
+ int getSegmentIndex() {
+ return segmentIndex;
+ }
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionState.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionState.java
new file mode 100644
index 00000000000..c983fd6cacc
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionState.java
@@ -0,0 +1,69 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+import org.broadinstitute.hellbender.utils.mcmc.Parameter;
+import org.broadinstitute.hellbender.utils.mcmc.ParameterizedState;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * The state of the allele-fraction model, containing:
+ * 1. the global mean reference bias
+ * 2. the global variance of the reference bias
+ * 3. the global outlier probability
+ * 4. minor-allele fractions for each segment
+ *
+ * See docs/CNVs/CNV-methods.pdf for details.
+ *
+ * @author David Benjamin <davidben@broadinstitute.org>
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+ final class AlleleFractionState extends ParameterizedState<AlleleFractionParameter> {
+ static final class MinorFractions extends ArrayList<Double> {
+ private static final long serialVersionUID = 1029384756L;
+
+ MinorFractions(final int numSegments) {
+ super(numSegments);
+ }
+
+ MinorFractions(final List<Double> minorFractions) {
+ super(new ArrayList<>(minorFractions));
+ }
+ }
+
+ AlleleFractionState(final double meanBias,
+ final double biasVariance,
+ final double outlierProbability,
+ final MinorFractions minorFractions) {
+ super(Arrays.asList(
+ new Parameter<>(AlleleFractionParameter.MEAN_BIAS, meanBias),
+ new Parameter<>(AlleleFractionParameter.BIAS_VARIANCE, biasVariance),
+ new Parameter<>(AlleleFractionParameter.OUTLIER_PROBABILITY, outlierProbability),
+ new Parameter<>(AlleleFractionParameter.MINOR_ALLELE_FRACTIONS, minorFractions)));
+ }
+
+ double meanBias() {
+ return get(AlleleFractionParameter.MEAN_BIAS, Double.class);
+ }
+
+ double biasVariance() {
+ return get(AlleleFractionParameter.BIAS_VARIANCE, Double.class);
+ }
+
+ double outlierProbability() {
+ return get(AlleleFractionParameter.OUTLIER_PROBABILITY, Double.class);
+ }
+
+ double segmentMinorFraction(final int segment) {
+ return get(AlleleFractionParameter.MINOR_ALLELE_FRACTIONS, MinorFractions.class).get(segment);
+ }
+
+ AlleleFractionGlobalParameters globalParameters() {
+ return new AlleleFractionGlobalParameters(meanBias(), biasVariance(), outlierProbability());
+ }
+
+ MinorFractions minorFractions() {
+ return get(AlleleFractionParameter.MINOR_ALLELE_FRACTIONS, MinorFractions.class);
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioModeller.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioModeller.java
new file mode 100644
index 00000000000..dbdfc437936
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioModeller.java
@@ -0,0 +1,166 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ParameterDecileCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.ModeledSegment;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.mcmc.DecileCollection;
+import org.broadinstitute.hellbender.utils.mcmc.GibbsSampler;
+import org.broadinstitute.hellbender.utils.mcmc.ParameterSampler;
+import org.broadinstitute.hellbender.utils.mcmc.ParameterizedModel;
+import org.broadinstitute.hellbender.utils.param.ParamUtils;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Represents a segmented model for copy ratio fit to denoised log2 copy-ratio data.
+ * The log2 copy ratios in each segment are fit by a mixture model with a normal-distribution component
+ * and a uniform outlier component. The variance of the normal-distribution component and the relative
+ * contribution of the uniform outlier component in all segments are both assumed to be global parameters.
+ * The mean of the normal-distribution component in each segment is taken to be a segment-level parameter.
+ * The component (i.e., normal or outlier) that each copy-ratio point is drawn from is determined by a latent
+ * point-level indicator.
+ *
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+final class CopyRatioModeller {
+ private static final String DOUBLE_FORMAT = MultidimensionalModeller.DOUBLE_FORMAT;
+
+ private static final double EPSILON = 1E-6;
+ static final double LOG2_COPY_RATIO_MIN = -50.;
+ static final double LOG2_COPY_RATIO_MAX = 10.;
+ private static final double LOG2_COPY_RATIO_RANGE = LOG2_COPY_RATIO_MAX - LOG2_COPY_RATIO_MIN;
+ private static final double VARIANCE_MIN = EPSILON;
+
+ private static final double OUTLIER_PROBABILITY_INITIAL = 0.05;
+ private static final double OUTLIER_PROBABILITY_PRIOR_ALPHA = 5.;
+ private static final double OUTLIER_PROBABILITY_PRIOR_BETA = 95.;
+
+ private final SampleMetadata sampleMetadata;
+ private final ParameterizedModel<CopyRatioParameter, CopyRatioState, CopyRatioSegmentedData> model;
+
+ private final List<Double> varianceSamples = new ArrayList<>();
+ private final List<Double> outlierProbabilitySamples = new ArrayList<>();
+ private final List<CopyRatioState.SegmentMeans> segmentMeansSamples = new ArrayList<>();
+
+ /**
+ * Constructs a copy-ratio model given copy ratios and segments.
+ * Initial point estimates of parameters are set to empirical estimates where available.
+ */
+ CopyRatioModeller(final CopyRatioCollection copyRatios,
+ final List<SimpleInterval> segments) {
+ Utils.nonNull(copyRatios);
+ Utils.nonEmpty(segments);
+
+ sampleMetadata = copyRatios.getSampleMetadata();
+ final CopyRatioSegmentedData data = new CopyRatioSegmentedData(copyRatios, segments);
+
+ //set widths for slice sampling of variance and segment-mean posteriors using empirical variance estimate.
+ //variance posterior is inverse chi-squared, segment-mean posteriors are Gaussian; the below expressions
+ //approximate the standard deviations of these distributions.
+ //we also make sure all initial values are within appropriate bounds
+ final double dataRangeOrNaN = data.getMaxLog2CopyRatioValue() - data.getMinLog2CopyRatioValue();
+ final double dataRange = Double.isNaN(dataRangeOrNaN) ? LOG2_COPY_RATIO_RANGE : dataRangeOrNaN;
+ final double varianceEstimateOrNaN = data.estimateVariance();
+ final double varianceEstimate = Double.isNaN(varianceEstimateOrNaN) ? VARIANCE_MIN : Math.max(varianceEstimateOrNaN, VARIANCE_MIN);
+ final double varianceSliceSamplingWidth = 2. * varianceEstimate;
+ final double varianceMax = Math.max(10. * varianceEstimate, dataRange * dataRange);
+ final double meanSliceSamplingWidth = Math.sqrt(varianceEstimate * data.getNumSegments() / data.getNumPoints());
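+ //e.g., with ~(numPoints / numSegments) points per segment, the Gaussian posterior for a segment mean has
+ //standard deviation ~sqrt(variance / (numPoints / numSegments)), which is the slice-sampling width used here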
+ final List<Double> segmentMeans = data.estimateSegmentMeans().stream()
+ .map(m -> Math.max(LOG2_COPY_RATIO_MIN, Math.min(LOG2_COPY_RATIO_MAX, m)))
+ .collect(Collectors.toList());
+
+ //the uniform log-likelihood for outliers is determined by the minimum and maximum coverages in the dataset;
+ //the outlier-probability parameter should be interpreted accordingly
+ final double outlierUniformLogLikelihood = -Math.log(dataRange);
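+ //e.g., if the log2 copy ratios span [-5, 5], then dataRange = 10 and each outlier point contributes
+ //log(1 / 10) ~ -2.3 to the log-likelihood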
+
+ //use empirical segment means and empirical average variance across segments to initialize CopyRatioState
+ final CopyRatioState initialState = new CopyRatioState(varianceEstimate, CopyRatioModeller.OUTLIER_PROBABILITY_INITIAL,
+ new CopyRatioState.SegmentMeans(segmentMeans), new CopyRatioState.OutlierIndicators(Collections.nCopies(data.getNumPoints(), false)));
+
+ //define ParameterSamplers
+ final ParameterSampler<Double, CopyRatioParameter, CopyRatioState, CopyRatioSegmentedData> varianceSampler =
+ new CopyRatioSamplers.VarianceSampler(VARIANCE_MIN, varianceMax, varianceSliceSamplingWidth);
+ final ParameterSampler<Double, CopyRatioParameter, CopyRatioState, CopyRatioSegmentedData> outlierProbabilitySampler =
+ new CopyRatioSamplers.OutlierProbabilitySampler(OUTLIER_PROBABILITY_PRIOR_ALPHA, OUTLIER_PROBABILITY_PRIOR_BETA);
+ final ParameterSampler<CopyRatioState.SegmentMeans, CopyRatioParameter, CopyRatioState, CopyRatioSegmentedData> segmentMeansSampler =
+ new CopyRatioSamplers.SegmentMeansSampler(LOG2_COPY_RATIO_MIN, LOG2_COPY_RATIO_MAX, meanSliceSamplingWidth);
+ final ParameterSampler<CopyRatioState.OutlierIndicators, CopyRatioParameter, CopyRatioState, CopyRatioSegmentedData> outlierIndicatorsSampler =
+ new CopyRatioSamplers.OutlierIndicatorsSampler(outlierUniformLogLikelihood);
+
+ model = new ParameterizedModel.GibbsBuilder<>(initialState, data)
+ .addParameterSampler(CopyRatioParameter.VARIANCE, varianceSampler, Double.class)
+ .addParameterSampler(CopyRatioParameter.OUTLIER_PROBABILITY, outlierProbabilitySampler, Double.class)
+ .addParameterSampler(CopyRatioParameter.SEGMENT_MEANS, segmentMeansSampler, CopyRatioState.SegmentMeans.class)
+ .addParameterSampler(CopyRatioParameter.OUTLIER_INDICATORS, outlierIndicatorsSampler, CopyRatioState.OutlierIndicators.class)
+ .build();
+ }
+
+ /**
+ * Adds {@code numSamples - numBurnIn} Markov-Chain Monte-Carlo samples of the parameter posteriors (generated using
+ * Gibbs sampling) to the collections held internally. The current {@link CopyRatioState} held internally is used
+ * to initialize the Markov Chain.
+ * @param numSamples total number of samples per posterior
+ * @param numBurnIn number of burn-in samples to discard
+ */
+ void fitMCMC(final int numSamples,
+ final int numBurnIn) {
+ ParamUtils.isPositiveOrZero(numBurnIn, "Number of burn-in samples must be non-negative.");
+ Utils.validateArg(numBurnIn < numSamples, "Number of samples must be greater than number of burn-in samples.");
+
+ //run MCMC
+ final GibbsSampler<CopyRatioParameter, CopyRatioState, CopyRatioSegmentedData> gibbsSampler = new GibbsSampler<>(numSamples, model);
+ gibbsSampler.runMCMC();
+
+ //update posterior samples
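+ //per the javadoc above, getSamples(..., numBurnIn) discards the first numBurnIn samples of the chain,
+ //so only the numSamples - numBurnIn post-burn-in draws are added to the collections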
+ varianceSamples.addAll(gibbsSampler.getSamples(CopyRatioParameter.VARIANCE, Double.class, numBurnIn));
+ outlierProbabilitySamples.addAll(gibbsSampler.getSamples(CopyRatioParameter.OUTLIER_PROBABILITY, Double.class, numBurnIn));
+ segmentMeansSamples.addAll(gibbsSampler.getSamples(CopyRatioParameter.SEGMENT_MEANS, CopyRatioState.SegmentMeans.class, numBurnIn));
+ }
+
+ List<Double> getVarianceSamples() {
+ return Collections.unmodifiableList(varianceSamples);
+ }
+
+ List<Double> getOutlierProbabilitySamples() {
+ return Collections.unmodifiableList(outlierProbabilitySamples);
+ }
+
+ List<CopyRatioState.SegmentMeans> getSegmentMeansSamples() {
+ return Collections.unmodifiableList(segmentMeansSamples);
+ }
+
+ /**
+ * Should only be called after {@link #fitMCMC} has been called.
+ */
+ List<ModeledSegment.SimplePosteriorSummary> getSegmentMeansPosteriorSummaries() {
+ if (segmentMeansSamples.isEmpty()) {
+ throw new IllegalStateException("Attempted to get posterior summaries for segment means before MCMC was performed.");
+ }
+ final int numSegments = segmentMeansSamples.get(0).size();
+ final List<ModeledSegment.SimplePosteriorSummary> posteriorSummaries = new ArrayList<>(numSegments);
+ for (int segment = 0; segment < numSegments; segment++) {
+ final int j = segment;
+ final List<Double> meanSamples =
+ segmentMeansSamples.stream().map(s -> s.get(j)).collect(Collectors.toList());
+ posteriorSummaries.add(new ModeledSegment.SimplePosteriorSummary(meanSamples));
+ }
+ return posteriorSummaries;
+ }
+
+ /**
+ * Should only be called after {@link #fitMCMC} has been called.
+ */
+ ParameterDecileCollection<CopyRatioParameter> getGlobalParameterDeciles() {
+ if (varianceSamples.isEmpty()) {
+ throw new IllegalStateException("Attempted to get posterior summaries for global parameters before MCMC was performed.");
+ }
+ final Map<CopyRatioParameter, DecileCollection> parameterToDecilesMap = new LinkedHashMap<>();
+ parameterToDecilesMap.put(CopyRatioParameter.VARIANCE, new DecileCollection(varianceSamples));
+ parameterToDecilesMap.put(CopyRatioParameter.OUTLIER_PROBABILITY, new DecileCollection(outlierProbabilitySamples));
+ return new ParameterDecileCollection<>(sampleMetadata, parameterToDecilesMap, CopyRatioParameter.class, DOUBLE_FORMAT);
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioParameter.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioParameter.java
new file mode 100644
index 00000000000..53bb863691d
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioParameter.java
@@ -0,0 +1,21 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+import org.broadinstitute.hellbender.utils.mcmc.ParameterEnum;
+
+/**
+ * Enumerates the parameters for {@link CopyRatioState}.
+ *
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+enum CopyRatioParameter implements ParameterEnum {
+ VARIANCE("CR_variance"),
+ OUTLIER_PROBABILITY("CR_outlier_probability"),
+ SEGMENT_MEANS("CR_segment_means"),
+ OUTLIER_INDICATORS("CR_outlier_indicators");
+
+ final String name;
+
+ CopyRatioParameter(final String name) {
+ this.name = name;
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSamplers.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSamplers.java
new file mode 100644
index 00000000000..e60b0d47e71
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSamplers.java
@@ -0,0 +1,198 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+import org.apache.commons.math3.distribution.BetaDistribution;
+import org.apache.commons.math3.random.RandomGenerator;
+import org.apache.commons.math3.util.FastMath;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.utils.MathUtils;
+import org.broadinstitute.hellbender.utils.mcmc.ParameterSampler;
+import org.broadinstitute.hellbender.utils.mcmc.SliceSampler;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+final class CopyRatioSamplers {
+ private static final Logger logger = LogManager.getLogger(CopyRatioSamplers.class);
+
+ private static final int NUM_POINTS_GLOBAL_SUBSAMPLE_THRESHOLD = 10000;
+ private static final int NUM_POINTS_SEGMENT_SUBSAMPLE_THRESHOLD = 1000;
+
+ private CopyRatioSamplers() {}
+
+ //Calculates the exponent for a normal distribution; used in log-likelihood calculation below.
+ private static double normalTerm(final double quantity,
+ final double mean,
+ final double variance) {
+ return (quantity - mean) * (quantity - mean) / (2. * variance);
+ }
+
+ //samples log conditional posterior for the variance parameter, assuming uniform prior; this is given by
+ //the product of Gaussian likelihoods for each non-outlier point t:
+ // log[product_{non-outlier t} variance^(-1/2) * exp(-(log2cr_t - mean_t)^2 / (2 * variance))] + constant
+ //where mean_t is identical for all points in a segment
+ static final class VarianceSampler implements ParameterSampler<Double, CopyRatioParameter, CopyRatioState, CopyRatioSegmentedData> {
+ private final double varianceMin;
+ private final double varianceMax;
+ private final double varianceSliceSamplingWidth;
+
+ VarianceSampler(final double varianceMin,
+ final double varianceMax,
+ final double varianceSliceSamplingWidth) {
+ this.varianceMin = varianceMin;
+ this.varianceMax = varianceMax;
+ this.varianceSliceSamplingWidth = varianceSliceSamplingWidth;
+ }
+
+ @Override
+ public Double sample(final RandomGenerator rng,
+ final CopyRatioState state,
+ final CopyRatioSegmentedData data) {
+ logger.debug("Sampling variance...");
+ final List<CopyRatioSegmentedData.IndexedCopyRatio> indexedCopyRatiosSubsample = subsample(
+ rng, data.getIndexedCopyRatios(), NUM_POINTS_GLOBAL_SUBSAMPLE_THRESHOLD);
+ final double scalingFactor = (double) data.getNumPoints() / indexedCopyRatiosSubsample.size();
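+ //e.g., 50000 points subsampled to the 10000-point threshold gives scalingFactor = 5, so the
+ //subsample log-likelihood below is scaled up to approximate the full-data conditional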
+ final Function<Double, Double> logConditionalPDF = newVariance -> {
+ final double gaussianLogNormalization = 0.5 * FastMath.log(newVariance);
+ double ll = 0.;
+ for (final CopyRatioSegmentedData.IndexedCopyRatio indexedCopyRatio : indexedCopyRatiosSubsample) {
+ if (!state.outlierIndicator(indexedCopyRatio.getIndex())) {
+ ll -= gaussianLogNormalization +
+ normalTerm(
+ indexedCopyRatio.getLog2CopyRatioValue(),
+ state.segmentMean(indexedCopyRatio.getSegmentIndex()),
+ newVariance);
+ }
+ }
+ return scalingFactor * ll;
+ };
+ return new SliceSampler(rng, logConditionalPDF, varianceMin, varianceMax, varianceSliceSamplingWidth).sample(state.variance());
+ }
+ }
+
+ //samples log conditional posterior for the outlier-probability parameter, assuming Beta(alpha, beta) prior;
+ //this is given by:
+ // log Beta(alpha + number of outlier points, beta + number of non-outlier points) + constant
+ static final class OutlierProbabilitySampler implements ParameterSampler<Double, CopyRatioParameter, CopyRatioState, CopyRatioSegmentedData> {
+ private final double outlierProbabilityPriorAlpha;
+ private final double outlierProbabilityPriorBeta;
+
+ OutlierProbabilitySampler(final double outlierProbabilityPriorAlpha,
+ final double outlierProbabilityPriorBeta) {
+ this.outlierProbabilityPriorAlpha = outlierProbabilityPriorAlpha;
+ this.outlierProbabilityPriorBeta = outlierProbabilityPriorBeta;
+ }
+
+ @Override
+ public Double sample(final RandomGenerator rng,
+ final CopyRatioState state,
+ final CopyRatioSegmentedData data) {
+ logger.debug("Sampling outlier probability...");
+ final int numOutliers = (int) IntStream.range(0, data.getNumPoints()).filter(state::outlierIndicator).count();
+ return new BetaDistribution(rng,
+ outlierProbabilityPriorAlpha + numOutliers,
+ outlierProbabilityPriorBeta + data.getNumPoints() - numOutliers).sample();
+ }
+ }
+
+ //samples log conditional posteriors for the segment-mean parameters, assuming uniform priors bounded by minimum and maximum log2 copy-ratio values;
+ //for each segment s, this is given by the product of Gaussian likelihoods for each non-outlier point t:
+ // log[product_{non-outlier t in s} exp(-(log2cr_t - mean_s)^2 / (2 * variance))] + constant
+ static final class SegmentMeansSampler implements ParameterSampler<CopyRatioState.SegmentMeans, CopyRatioParameter, CopyRatioState, CopyRatioSegmentedData> {
+ private final double meanMin;
+ private final double meanMax;
+ private final double meanSliceSamplingWidth;
+
+ SegmentMeansSampler(final double meanMin,
+ final double meanMax,
+ final double meanSliceSamplingWidth) {
+ this.meanMin = meanMin;
+ this.meanMax = meanMax;
+ this.meanSliceSamplingWidth = meanSliceSamplingWidth;
+ }
+
+ @Override
+ public CopyRatioState.SegmentMeans sample(final RandomGenerator rng,
+ final CopyRatioState state,
+ final CopyRatioSegmentedData data) {
+ final List<Double> means = new ArrayList<>(data.getNumSegments());
+ for (int segment = 0; segment < data.getNumSegments(); segment++) {
+ final List<CopyRatioSegmentedData.IndexedCopyRatio> indexedCopyRatiosInSegment = data.getIndexedCopyRatiosInSegment(segment);
+ if (indexedCopyRatiosInSegment.isEmpty()) {
+ means.add(Double.NaN);
+ } else {
+ logger.debug(String.format("Sampling mean for segment %d...", segment));
+ final List<CopyRatioSegmentedData.IndexedCopyRatio> indexedCopyRatiosInSegmentSubsample = subsample(
+ rng, indexedCopyRatiosInSegment, NUM_POINTS_SEGMENT_SUBSAMPLE_THRESHOLD);
+ final double scalingFactor = (double) indexedCopyRatiosInSegment.size() / indexedCopyRatiosInSegmentSubsample.size();
+ final Function<Double, Double> logConditionalPDF = newMean ->
+ scalingFactor * indexedCopyRatiosInSegmentSubsample.stream()
+ .filter(c -> !state.outlierIndicator(c.getIndex()))
+ .mapToDouble(c -> -normalTerm(c.getLog2CopyRatioValue(), newMean, state.variance()))
+ .sum();
+ final SliceSampler sampler = new SliceSampler(rng, logConditionalPDF, meanMin, meanMax, meanSliceSamplingWidth);
+ means.add(sampler.sample(state.segmentMean(segment)));
+ }
+ }
+ return new CopyRatioState.SegmentMeans(means);
+ }
+ }
+
+ //samples log conditional posteriors for the outlier-indicator parameters; for each point t, this is given by:
+ // z_t * [log outlier_prob + outlierUniformLogLikelihood]
+ // + (1 - z_t) * [log(1 - outlier_prob) - log(2 * pi * variance)/2 - (log2cr_t - mean_t)^2 / (2 * variance)]
+ // + const
+ //where z_t is the indicator for point t, and outlier_prob is the outlier probability.
+ //note that we compute the normalizing constant, so that we can sample a new indicator value by simply sampling
+ //uniformly in [0, 1] and checking whether the resulting value is less than the probability of being an outlier
+ //(corresponding to the first line in the unnormalized expression above)
+ static final class OutlierIndicatorsSampler implements ParameterSampler<CopyRatioState.OutlierIndicators, CopyRatioParameter, CopyRatioState, CopyRatioSegmentedData> {
+ private final double outlierUniformLogLikelihood;
+
+ OutlierIndicatorsSampler(final double outlierUniformLogLikelihood) {
+ this.outlierUniformLogLikelihood = outlierUniformLogLikelihood;
+ }
+
+ @Override
+ public CopyRatioState.OutlierIndicators sample(final RandomGenerator rng,
+ final CopyRatioState state,
+ final CopyRatioSegmentedData data) {
+ logger.debug("Sampling outlier indicators...");
+ final double outlierUnnormalizedLogProbability =
+ Math.log(state.outlierProbability()) + outlierUniformLogLikelihood;
+ final double notOutlierUnnormalizedLogProbabilityPrefactor =
+ Math.log(1. - state.outlierProbability()) - 0.5 * Math.log(2 * Math.PI * state.variance());
+ final List<Boolean> indicators = new ArrayList<>();
+ for (int segment = 0; segment < data.getNumSegments(); segment++) {
+ final List<CopyRatioSegmentedData.IndexedCopyRatio> indexedCopyRatiosInSegment = data.getIndexedCopyRatiosInSegment(segment);
+ for (final CopyRatioSegmentedData.IndexedCopyRatio indexedCopyRatio : indexedCopyRatiosInSegment) {
+ final double notOutlierUnnormalizedLogProbability =
+ notOutlierUnnormalizedLogProbabilityPrefactor
+ - normalTerm(indexedCopyRatio.getLog2CopyRatioValue(), state.segmentMean(segment), state.variance());
+ //note: we are working in natural log space, so we divide by ln(10) before using normalizeFromLog10
+ final double conditionalProbability =
+ MathUtils.normalizeFromLog10ToLinearSpace(new double[]{
+ MathUtils.logToLog10(outlierUnnormalizedLogProbability),
+ MathUtils.logToLog10(notOutlierUnnormalizedLogProbability)})[0];
+ indicators.add(rng.nextDouble() < conditionalProbability);
+ }
+ }
+ return new CopyRatioState.OutlierIndicators(indicators);
+ }
+ }
+
+ private static <T> List<T> subsample(final RandomGenerator rng,
+ final List<T> copyRatios,
+ final int numPointsSubsampleThreshold) {
+ //subsample the data if we are above the threshold
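+ //note that indices are drawn uniformly with replacement, so the subsample may contain duplicates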
+ return copyRatios.size() > numPointsSubsampleThreshold
+ ? IntStream.range(0, numPointsSubsampleThreshold).boxed().map(i -> rng.nextInt(copyRatios.size())).map(copyRatios::get).collect(Collectors.toList())
+ : copyRatios;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSegmentedData.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSegmentedData.java
new file mode 100644
index 00000000000..d2025a6fbeb
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSegmentedData.java
@@ -0,0 +1,145 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+import com.google.common.primitives.Doubles;
+import htsjdk.samtools.util.OverlapDetector;
+import org.apache.commons.math3.stat.descriptive.moment.Mean;
+import org.apache.commons.math3.stat.descriptive.moment.Variance;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio;
+import org.broadinstitute.hellbender.utils.IndexRange;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.mcmc.DataCollection;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * {@link DataCollection} for the copy-ratio model containing the copy-ratio data grouped by segment.
+ *
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+final class CopyRatioSegmentedData implements DataCollection {
+ private final CopyRatioCollection copyRatios;
+ private final List<SimpleInterval> segments;
+ private final double minLog2CopyRatioValue;
+ private final double maxLog2CopyRatioValue;
+
+ private final List<IndexedCopyRatio> indexedCopyRatios;
+ private final List<IndexRange> indexRangesPerSegment;
+
+ CopyRatioSegmentedData(final CopyRatioCollection copyRatios,
+ final List<SimpleInterval> segments) {
+ this.copyRatios = Utils.nonNull(copyRatios);
+ this.segments = Utils.nonEmpty(segments).stream().sorted(SampleLocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR).collect(Collectors.toList());
+
+ final List<Double> log2CopyRatioValues = copyRatios.getLog2CopyRatioValues();
+ minLog2CopyRatioValue = log2CopyRatioValues.stream().min(Double::compareTo).orElse(Double.NaN);
+ maxLog2CopyRatioValue = log2CopyRatioValues.stream().max(Double::compareTo).orElse(Double.NaN);
+
+ indexedCopyRatios = new ArrayList<>(copyRatios.size());
+ indexRangesPerSegment = new ArrayList<>(segments.size());
+
+ //construct list of lists of copy ratios with an index in order corresponding to that of segments;
+ //segment assignment is based on midpoint of copy-ratio interval
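+ //e.g., a copy-ratio interval spanning 1001-2000 is assigned to whichever segment contains its midpoint at 1500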
+ final OverlapDetector<CopyRatio> copyRatioMidpointOverlapDetector = copyRatios.getMidpointOverlapDetector();
+ int index = 0;
+ for (int segmentIndex = 0; segmentIndex < segments.size(); segmentIndex++) {
+ final SimpleInterval segment = segments.get(segmentIndex);
+ final List<CopyRatio> copyRatiosInSegment = copyRatioMidpointOverlapDetector.getOverlaps(segment).stream()
+ .sorted(SampleLocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR)
+ .collect(Collectors.toList());
+ final int segmentStartIndex = index;
+ final int si = segmentIndex;
+ IntStream.range(0, copyRatiosInSegment.size()).boxed()
+ .map(i -> new IndexedCopyRatio(copyRatiosInSegment.get(i), segmentStartIndex + i, si))
+ .forEach(indexedCopyRatios::add);
+ indexRangesPerSegment.add(new IndexRange(segmentStartIndex, segmentStartIndex + copyRatiosInSegment.size()));
+ index += copyRatiosInSegment.size();
+ }
+ }
+
+ CopyRatioCollection getCopyRatios() {
+ return copyRatios;
+ }
+
+ List<SimpleInterval> getSegments() {
+ return Collections.unmodifiableList(segments);
+ }
+
+ int getNumSegments() {
+ return segments.size();
+ }
+
+ int getNumPoints() {
+ return copyRatios.size();
+ }
+
+ String getSampleName() {
+ return copyRatios.getSampleName();
+ }
+
+ double getMinLog2CopyRatioValue() {
+ return minLog2CopyRatioValue;
+ }
+
+ double getMaxLog2CopyRatioValue() {
+ return maxLog2CopyRatioValue;
+ }
+
+ List<IndexedCopyRatio> getIndexedCopyRatios() {
+ return Collections.unmodifiableList(indexedCopyRatios);
+ }
+
+ List<IndexedCopyRatio> getIndexedCopyRatiosInSegment(final int segmentIndex) {
+ return Collections.unmodifiableList(indexedCopyRatios.subList(
+ indexRangesPerSegment.get(segmentIndex).from, indexRangesPerSegment.get(segmentIndex).to));
+ }
+
+ //estimate global variance empirically by taking average of all per-segment variances
+ double estimateVariance() {
+ return IntStream.range(0, segments.size())
+ .mapToDouble(s -> new Variance().evaluate(Doubles.toArray(
+ getIndexedCopyRatiosInSegment(s).stream()
+ .map(IndexedCopyRatio::getLog2CopyRatioValue)
+ .collect(Collectors.toList()))))
+ .filter(v -> !Double.isNaN(v))
+ .average().orElse(Double.NaN);
+ }
+
+ //estimate segment means empirically by taking averages of log2 copy ratios in each segment
+ CopyRatioState.SegmentMeans estimateSegmentMeans() {
+ final List<Double> means = IntStream.range(0, segments.size()).boxed()
+ .map(s -> new Mean().evaluate(Doubles.toArray(
+ getIndexedCopyRatiosInSegment(s).stream()
+ .map(IndexedCopyRatio::getLog2CopyRatioValue)
+ .collect(Collectors.toList()))))
+ .collect(Collectors.toList());
+ return new CopyRatioState.SegmentMeans(means);
+ }
+
+ static final class IndexedCopyRatio extends CopyRatio {
+ private final int index;
+ private final int segmentIndex;
+
+ private IndexedCopyRatio(final CopyRatio copyRatio,
+ final int index,
+ final int segmentIndex) {
+ super(copyRatio.getInterval(), copyRatio.getLog2CopyRatioValue());
+ this.index = index;
+ this.segmentIndex = segmentIndex;
+ }
+
+ int getIndex() {
+ return index;
+ }
+
+ int getSegmentIndex() {
+ return segmentIndex;
+ }
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioState.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioState.java
new file mode 100644
index 00000000000..445663c4f84
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioState.java
@@ -0,0 +1,67 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+import org.broadinstitute.hellbender.utils.mcmc.Parameter;
+import org.broadinstitute.hellbender.utils.mcmc.ParameterizedState;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.List;
+import java.util.stream.IntStream;
+
+/**
+ * The state of the copy-ratio model, containing:
+ * 1. the global variance
+ * 2. the global outlier probability
+ * 3. log2 mean copy ratios for each segment
+ * 4. outlier indicators for each copy-ratio interval
+ *
+ * See docs/CNVs/CNV-methods.pdf for details.
+ *
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+ final class CopyRatioState extends ParameterizedState<CopyRatioParameter> {
+ static final class SegmentMeans extends ArrayList<Double> {
+ private static final long serialVersionUID = 951753L;
+
+ SegmentMeans(final List<Double> segmentMeans) {
+ super(new ArrayList<>(segmentMeans));
+ }
+ }
+
+ static final class OutlierIndicators extends BitSet {
+ private static final long serialVersionUID = 357159L;
+
+ OutlierIndicators(final List<Boolean> outlierIndicators) {
+ super(outlierIndicators.size());
+ IntStream.range(0, outlierIndicators.size()).filter(outlierIndicators::get).forEach(this::set);
+ }
+ }
+
+ CopyRatioState(final double variance,
+ final double outlierProbability,
+ final SegmentMeans segmentMeans,
+ final OutlierIndicators outlierIndicators) {
+ super(Arrays.asList(
+ new Parameter<>(CopyRatioParameter.VARIANCE, variance),
+ new Parameter<>(CopyRatioParameter.OUTLIER_PROBABILITY, outlierProbability),
+ new Parameter<>(CopyRatioParameter.SEGMENT_MEANS, segmentMeans),
+ new Parameter<>(CopyRatioParameter.OUTLIER_INDICATORS, outlierIndicators)));
+ }
+
+ double variance() {
+ return get(CopyRatioParameter.VARIANCE, Double.class);
+ }
+
+ double outlierProbability() {
+ return get(CopyRatioParameter.OUTLIER_PROBABILITY, Double.class);
+ }
+
+ double segmentMean(final int segmentIndex) {
+ return get(CopyRatioParameter.SEGMENT_MEANS, CopyRatioState.SegmentMeans.class).get(segmentIndex);
+ }
+
+ boolean outlierIndicator(final int copyRatioIndex) {
+ return get(CopyRatioParameter.OUTLIER_INDICATORS, CopyRatioState.OutlierIndicators.class).get(copyRatioIndex);
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/MultidimensionalModeller.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/MultidimensionalModeller.java
new file mode 100644
index 00000000000..d1455e617cd
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/MultidimensionalModeller.java
@@ -0,0 +1,311 @@
+package org.broadinstitute.hellbender.tools.copynumber.models;
+
+import com.google.common.annotations.VisibleForTesting;
+import htsjdk.samtools.util.OverlapDetector;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ModeledSegmentCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.MultidimensionalSegmentCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.ModeledSegment;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.param.ParamUtils;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+/**
+ * Represents a segmented model for copy ratio and allele fraction.
+ *
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+public final class MultidimensionalModeller {
+ private static final Logger logger = LogManager.getLogger(MultidimensionalModeller.class);
+
+ public static final String DOUBLE_FORMAT = "%6.6f";
+
+ private final SampleMetadata sampleMetadata;
+ private final CopyRatioCollection denoisedCopyRatios;
+ private final OverlapDetector<CopyRatio> copyRatioMidpointOverlapDetector;
+ private final AllelicCountCollection allelicCounts;
+ private final OverlapDetector<AllelicCount> allelicCountOverlapDetector;
+ private final AlleleFractionPrior alleleFractionPrior;
+
+ private CopyRatioModeller copyRatioModeller;
+ private AlleleFractionModeller alleleFractionModeller;
+
+ private List<SimpleInterval> currentSegments;
+ private final List<ModeledSegment> modeledSegments = new ArrayList<>();
+
+ //similar-segment merging may leave model in a state where it is not properly fit (deciles may be estimated naively)
+ private boolean isModelFit;
+
+ private final int numSamplesCopyRatio;
+ private final int numBurnInCopyRatio;
+ private final int numSamplesAlleleFraction;
+ private final int numBurnInAlleleFraction;
+
+ /**
+ * Constructs a copy-ratio and allele-fraction modeller, specifying number of total samples
+ * and number of burn-in samples for Markov-Chain Monte Carlo model fitting.
+ * An initial model fit is performed.
+ */
+ public MultidimensionalModeller(final MultidimensionalSegmentCollection multidimensionalSegments,
+ final CopyRatioCollection denoisedCopyRatios,
+ final AllelicCountCollection allelicCounts,
+ final AlleleFractionPrior alleleFractionPrior,
+ final int numSamplesCopyRatio,
+ final int numBurnInCopyRatio,
+ final int numSamplesAlleleFraction,
+ final int numBurnInAlleleFraction) {
+ Utils.validateArg(Stream.of(
+ Utils.nonNull(multidimensionalSegments).getSampleName(),
+ Utils.nonNull(denoisedCopyRatios).getSampleName(),
+ Utils.nonNull(allelicCounts).getSampleName()).distinct().count() == 1,
+ "Sample names from all inputs must match.");
+ ParamUtils.isPositive(multidimensionalSegments.size(), "Number of segments must be positive.");
+ sampleMetadata = multidimensionalSegments.getSampleMetadata();
+ currentSegments = multidimensionalSegments.getIntervals();
+ this.denoisedCopyRatios = denoisedCopyRatios;
+ copyRatioMidpointOverlapDetector = denoisedCopyRatios.getMidpointOverlapDetector();
+ this.allelicCounts = allelicCounts;
+ allelicCountOverlapDetector = allelicCounts.getOverlapDetector();
+ this.alleleFractionPrior = Utils.nonNull(alleleFractionPrior);
+ this.numSamplesCopyRatio = numSamplesCopyRatio;
+ this.numBurnInCopyRatio = numBurnInCopyRatio;
+ this.numSamplesAlleleFraction = numSamplesAlleleFraction;
+ this.numBurnInAlleleFraction = numBurnInAlleleFraction;
+ logger.info("Fitting initial model...");
+ fitModel();
+ }
+
+ public ModeledSegmentCollection getModeledSegments() {
+ return new ModeledSegmentCollection(sampleMetadata, modeledSegments);
+ }
+
+ /**
+ * Performs Markov-Chain Monte Carlo model fitting using the
+ * number of total samples and number of burn-in samples specified at construction.
+ */
+ private void fitModel() {
+ //perform MCMC to generate posterior samples
+ logger.info("Fitting copy-ratio model...");
+ copyRatioModeller = new CopyRatioModeller(denoisedCopyRatios, currentSegments);
+ copyRatioModeller.fitMCMC(numSamplesCopyRatio, numBurnInCopyRatio);
+ logger.info("Fitting allele-fraction model...");
+ alleleFractionModeller = new AlleleFractionModeller(allelicCounts, currentSegments, alleleFractionPrior);
+ alleleFractionModeller.fitMCMC(numSamplesAlleleFraction, numBurnInAlleleFraction);
+
+ //update list of ModeledSegment with new PosteriorSummaries
+ modeledSegments.clear();
+ final List<ModeledSegment.SimplePosteriorSummary> segmentMeansPosteriorSummaries =
+ copyRatioModeller.getSegmentMeansPosteriorSummaries();
+ final List<ModeledSegment.SimplePosteriorSummary> minorAlleleFractionsPosteriorSummaries =
+ alleleFractionModeller.getMinorAlleleFractionsPosteriorSummaries();
+ for (int segmentIndex = 0; segmentIndex < currentSegments.size(); segmentIndex++) {
+ final SimpleInterval segment = currentSegments.get(segmentIndex);
+ final int numPointsCopyRatio = copyRatioMidpointOverlapDetector.getOverlaps(segment).size();
+ final int numPointsAlleleFraction = allelicCountOverlapDetector.getOverlaps(segment).size();
+ final ModeledSegment.SimplePosteriorSummary segmentMeansPosteriorSummary = segmentMeansPosteriorSummaries.get(segmentIndex);
+ final ModeledSegment.SimplePosteriorSummary minorAlleleFractionPosteriorSummary = minorAlleleFractionsPosteriorSummaries.get(segmentIndex);
+ modeledSegments.add(new ModeledSegment(
+ segment, numPointsCopyRatio, numPointsAlleleFraction, segmentMeansPosteriorSummary, minorAlleleFractionPosteriorSummary));
+ }
+ isModelFit = true;
+ }
+
+ /**
+ * @param numSmoothingIterationsPerFit if this is zero, no refitting will be performed between smoothing iterations
+ */
+ public void smoothSegments(final int maxNumSmoothingIterations,
+ final int numSmoothingIterationsPerFit,
+ final double smoothingCredibleIntervalThresholdCopyRatio,
+ final double smoothingCredibleIntervalThresholdAlleleFraction) {
+ ParamUtils.isPositiveOrZero(maxNumSmoothingIterations,
+ "The maximum number of smoothing iterations must be non-negative.");
+ ParamUtils.isPositiveOrZero(numSmoothingIterationsPerFit,
+ "The number of smoothing iterations per fit must be non-negative.");
+ ParamUtils.isPositiveOrZero(smoothingCredibleIntervalThresholdCopyRatio,
+ "The copy-ratio credible-interval threshold for segmentation smoothing must be non-negative.");
+ ParamUtils.isPositiveOrZero(smoothingCredibleIntervalThresholdAlleleFraction,
+ "The allele-fraction credible-interval threshold for segmentation smoothing must be non-negative.");
+ logger.info(String.format("Initial number of segments before smoothing: %d", modeledSegments.size()));
+ //perform iterations of similar-segment merging until all similar segments are merged
+ for (int numIterations = 1; numIterations <= maxNumSmoothingIterations; numIterations++) {
+ logger.info(String.format("Smoothing iteration: %d", numIterations));
+ final int prevNumSegments = modeledSegments.size();
+ if (numSmoothingIterationsPerFit > 0 && numIterations % numSmoothingIterationsPerFit == 0) {
+ //refit model after this merge iteration
+ performSmoothingIteration(smoothingCredibleIntervalThresholdCopyRatio, smoothingCredibleIntervalThresholdAlleleFraction, true);
+ } else {
+ //do not refit model after this merge iteration (posterior modes will be identical to posterior medians)
+ performSmoothingIteration(smoothingCredibleIntervalThresholdCopyRatio, smoothingCredibleIntervalThresholdAlleleFraction, false);
+ }
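+ //if no segments were merged in this iteration, smoothing has converged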
+ if (modeledSegments.size() == prevNumSegments) {
+ break;
+ }
+ }
+ if (!isModelFit) {
+ //make sure final model is completely fit (i.e., posterior modes are specified)
+ fitModel();
+ }
+ logger.info(String.format("Final number of segments after smoothing: %d", modeledSegments.size()));
+ }
+
+ /**
+ * Performs one iteration of similar-segment merging on the list of {@link ModeledSegment} held internally.
+ * Markov-Chain Monte Carlo model fitting is optionally performed after each iteration using the
+ * number of total samples and number of burn-in samples specified at construction.
+ * @param intervalThresholdSegmentMean threshold number of credible intervals for segment-mean similarity
+ * @param intervalThresholdMinorAlleleFraction threshold number of credible intervals for minor-allele-fraction similarity
+ * @param doModelFit if true, refit MCMC model after merging
+ */
+ private void performSmoothingIteration(final double intervalThresholdSegmentMean,
+ final double intervalThresholdMinorAlleleFraction,
+ final boolean doModelFit) {
+ logger.info("Number of segments before smoothing iteration: " + modeledSegments.size());
+ final List<ModeledSegment> mergedSegments = SimilarSegmentUtils.mergeSimilarSegments(modeledSegments, intervalThresholdSegmentMean, intervalThresholdMinorAlleleFraction);
+ logger.info("Number of segments after smoothing iteration: " + mergedSegments.size());
+ currentSegments = mergedSegments.stream().map(ModeledSegment::getInterval).collect(Collectors.toList());
+ if (doModelFit) {
+ fitModel();
+ } else {
+ modeledSegments.clear();
+ modeledSegments.addAll(mergedSegments);
+ isModelFit = false;
+ }
+ }
+
+ /**
+ * Writes posterior summaries for the global model parameters to a file.
+ */
+ public void writeModelParameterFiles(final File copyRatioParameterFile,
+ final File alleleFractionParameterFile) {
+ Utils.nonNull(copyRatioParameterFile);
+ Utils.nonNull(alleleFractionParameterFile);
+ ensureModelIsFit();
+ logger.info("Writing posterior summaries for copy-ratio global parameters to " + copyRatioParameterFile);
+ copyRatioModeller.getGlobalParameterDeciles().write(copyRatioParameterFile);
+ logger.info("Writing posterior summaries for allele-fraction global parameters to " + alleleFractionParameterFile);
+ alleleFractionModeller.getGlobalParameterDeciles().write(alleleFractionParameterFile);
+ }
+
+ @VisibleForTesting
+ CopyRatioModeller getCopyRatioModeller() {
+ return copyRatioModeller;
+ }
+
+ @VisibleForTesting
+ AlleleFractionModeller getAlleleFractionModeller() {
+ return alleleFractionModeller;
+ }
+
+ private void ensureModelIsFit() {
+ if (!isModelFit) {
+ logger.warn("Attempted to write ACNV results to file when model was not completely fit. Performing model fit now.");
+ fitModel();
+ }
+ }
+
+ /**
+ * Contains private methods for similar-segment merging.
+ */
+ private static final class SimilarSegmentUtils {
+ /**
+ * Returns a new, modifiable list of segments with similar segments (i.e., adjacent segments with both
+ * segment-mean and minor-allele-fraction posteriors similar; posteriors are similar if the difference between
+ * posterior central tendencies is less than intervalThreshold times the posterior credible interval of either summary)
+ * merged. The list of segments is traversed once from beginning to end, and each segment is checked for similarity
+ * with the segment to the right and merged until it is no longer similar.
+ * @param intervalThresholdSegmentMean threshold number of credible intervals for segment-mean similarity
+ * @param intervalThresholdMinorAlleleFraction threshold number of credible intervals for minor-allele-fraction similarity
+ */
+ private static List<ModeledSegment> mergeSimilarSegments(final List<ModeledSegment> segments,
+ final double intervalThresholdSegmentMean,
+ final double intervalThresholdMinorAlleleFraction) {
+ final List<ModeledSegment> mergedSegments = new ArrayList<>(segments);
+ int index = 0;
+ while (index < mergedSegments.size() - 1) {
+ final ModeledSegment segment1 = mergedSegments.get(index);
+ final ModeledSegment segment2 = mergedSegments.get(index + 1);
+ if (segment1.getContig().equals(segment2.getContig()) &&
+ areSimilar(segment1, segment2,
+ intervalThresholdSegmentMean, intervalThresholdMinorAlleleFraction)) {
+ mergedSegments.set(index, merge(segment1, segment2));
+ mergedSegments.remove(index + 1);
+ index--; //if merge performed, stay on current segment during next iteration
+ }
+ index++; //if no merge performed, go to next segment during next iteration
+ }
+ return mergedSegments;
+ }
+
+ //checks similarity of posterior summaries to within a credible-interval threshold;
+ //posterior summaries are similar if the difference between posterior central tendencies is less than
+ //intervalThreshold times the credible-interval width for both summaries
+ private static boolean areSimilar(final ModeledSegment.SimplePosteriorSummary summary1,
+ final ModeledSegment.SimplePosteriorSummary summary2,
+ final double intervalThreshold) {
+ if (Double.isNaN(summary1.getDecile50()) || Double.isNaN(summary2.getDecile50())) {
+ return true;
+ }
+ final double absoluteDifference = Math.abs(summary1.getDecile50() - summary2.getDecile50());
+ return absoluteDifference < intervalThreshold * (summary1.getDecile90() - summary1.getDecile10()) &&
+ absoluteDifference < intervalThreshold * (summary2.getDecile90() - summary2.getDecile10());
+ }
+
+ //checks similarity of modeled segments to within credible-interval thresholds for segment mean and minor allele fraction
+ private static boolean areSimilar(final ModeledSegment segment1,
+ final ModeledSegment segment2,
+ final double intervalThresholdSegmentMean,
+ final double intervalThresholdMinorAlleleFraction) {
+ return areSimilar(segment1.getLog2CopyRatioSimplePosteriorSummary(), segment2.getLog2CopyRatioSimplePosteriorSummary(), intervalThresholdSegmentMean) &&
+ areSimilar(segment1.getMinorAlleleFractionSimplePosteriorSummary(), segment2.getMinorAlleleFractionSimplePosteriorSummary(), intervalThresholdMinorAlleleFraction);
+ }
+
+ //merges posterior summaries naively by approximating posteriors as normal
+ private static ModeledSegment.SimplePosteriorSummary merge(final ModeledSegment.SimplePosteriorSummary summary1,
+ final ModeledSegment.SimplePosteriorSummary summary2) {
+ if (Double.isNaN(summary1.getDecile50()) && !Double.isNaN(summary2.getDecile50())) {
+ return summary2;
+ }
+ if ((!Double.isNaN(summary1.getDecile50()) && Double.isNaN(summary2.getDecile50())) ||
+ (Double.isNaN(summary1.getDecile50()) && Double.isNaN(summary2.getDecile50()))) {
+ return summary1;
+ }
+ //use credible half-interval as standard deviation
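+ //the merged posterior is then the precision-weighted (inverse-variance) combination of the two normal
+ //approximations: variance = 1 / (1 / sd1^2 + 1 / sd2^2), mean = variance * (m1 / sd1^2 + m2 / sd2^2)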
+ final double standardDeviation1 = (summary1.getDecile90() - summary1.getDecile10()) / 2.;
+ final double standardDeviation2 = (summary2.getDecile90() - summary2.getDecile10()) / 2.;
+ final double variance = 1. / (1. / Math.pow(standardDeviation1, 2.) + 1. / Math.pow(standardDeviation2, 2.));
+ final double mean =
+ (summary1.getDecile50() / Math.pow(standardDeviation1, 2.) + summary2.getDecile50() / Math.pow(standardDeviation2, 2.))
+ * variance;
+ final double standardDeviation = Math.sqrt(variance);
+ return new ModeledSegment.SimplePosteriorSummary(mean, mean - standardDeviation, mean + standardDeviation);
+ }
+
+ private static ModeledSegment merge(final ModeledSegment segment1,
+ final ModeledSegment segment2) {
+ return new ModeledSegment(mergeSegments(segment1.getInterval(), segment2.getInterval()),
+ segment1.getNumPointsCopyRatio() + segment2.getNumPointsCopyRatio(),
+ segment1.getNumPointsAlleleFraction() + segment2.getNumPointsAlleleFraction(),
+ merge(segment1.getLog2CopyRatioSimplePosteriorSummary(), segment2.getLog2CopyRatioSimplePosteriorSummary()),
+ merge(segment1.getMinorAlleleFractionSimplePosteriorSummary(), segment2.getMinorAlleleFractionSimplePosteriorSummary()));
+ }
+
+ private static SimpleInterval mergeSegments(final SimpleInterval segment1,
+ final SimpleInterval segment2) {
+ Utils.validateArg(segment1.getContig().equals(segment2.getContig()),
+ String.format("Cannot join segments %s and %s on different chromosomes.", segment1.toString(), segment2.toString()));
+ final int start = Math.min(segment1.getStart(), segment2.getStart());
+ final int end = Math.max(segment1.getEnd(), segment2.getEnd());
+ return new SimpleInterval(segment1.getContig(), start, end);
+ }
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatios.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatios.java
index 88b8c7acc3f..cf92cde73aa 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatios.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatios.java
@@ -8,8 +8,8 @@
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.DenoiseReadCounts;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
import org.broadinstitute.hellbender.utils.R.RScriptExecutor;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegments.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegments.java
index f6e23eb8f3c..b925b6ed9ea 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegments.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegments.java
@@ -7,13 +7,14 @@
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.exceptions.UserException;
-import org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount.AllelicCount;
-import org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount.AllelicCountCollection;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatio;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection;
+import org.broadinstitute.hellbender.tools.copynumber.ModelSegments;
import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument;
-import org.broadinstitute.hellbender.tools.copynumber.multidimensional.model.ModeledSegment;
-import org.broadinstitute.hellbender.tools.copynumber.multidimensional.model.ModeledSegmentCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ModeledSegmentCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.ModeledSegment;
import org.broadinstitute.hellbender.utils.R.RScriptExecutor;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
@@ -27,7 +28,7 @@
import java.util.stream.Collectors;
/**
- * Plots segmented copy-ratio and minor-allele-fraction modeling results.
+ * Plots segmented copy-ratio and minor-allele-fraction modeling results from {@link ModelSegments}.
*
* The order and representation of contigs in plots follows the contig ordering within the required reference sequence dictionary.
*
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/AlleleFractionKernelSegmenter.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/AlleleFractionKernelSegmenter.java
new file mode 100644
index 00000000000..ce33ae4868b
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/AlleleFractionKernelSegmenter.java
@@ -0,0 +1,114 @@
+package org.broadinstitute.hellbender.tools.copynumber.segmentation;
+
+import org.apache.commons.math3.util.FastMath;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AlleleFractionSegmentCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AlleleFractionSegment;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount;
+import org.broadinstitute.hellbender.tools.copynumber.utils.segmentation.KernelSegmenter;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.param.ParamUtils;
+
+import java.util.*;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+/**
+ * Segments alternate-allele-fraction data using kernel segmentation. Segments do not span chromosomes.
+ *
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+public final class AlleleFractionKernelSegmenter {
+ private static final Logger logger = LogManager.getLogger(AlleleFractionKernelSegmenter.class);
+
+ private static final int MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME = 10;
+
+ //Gaussian kernel for a specified variance; if variance is zero, use a linear kernel
+ private static final Function<Double, BiFunction<Double, Double, Double>> KERNEL =
+ variance -> variance == 0.
+ ? (x, y) -> x * y
+ : (x, y) -> FastMath.exp(-(x - y) * (x - y) / (2. * variance));
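+ //with a positive variance, the Gaussian kernel is sensitive to changes in the full distribution of
+ //allele fractions rather than just the mean; e.g., kernelVariance = 0.025 gives k(x, y) = exp(-(x - y)^2 / 0.05)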
+
+ private final AllelicCountCollection allelicCounts;
+ private final Map<String, List<AllelicCount>> allelicCountsPerChromosome;
+
+ public AlleleFractionKernelSegmenter(final AllelicCountCollection allelicCounts) {
+ Utils.nonNull(allelicCounts);
+ this.allelicCounts = allelicCounts;
+ allelicCountsPerChromosome = allelicCounts.getRecords().stream()
+ .collect(Collectors.groupingBy(
+ AllelicCount::getContig,
+ LinkedHashMap::new,
+ Collectors.mapping(Function.identity(), Collectors.toList())));
+ }
+
+ /**
+ * Segments the internally held {@link AllelicCountCollection} using a separate {@link KernelSegmenter} for each chromosome.
+ * @param kernelVariance variance of the Gaussian kernel; if zero, a linear kernel is used instead
+ */
+ public AlleleFractionSegmentCollection findSegmentation(final int maxNumChangepointsPerChromosome,
+ final double kernelVariance,
+ final int kernelApproximationDimension,
+ final List<Integer> windowSizes,
+ final double numChangepointsPenaltyLinearFactor,
+ final double numChangepointsPenaltyLogLinearFactor) {
+ ParamUtils.isPositiveOrZero(maxNumChangepointsPerChromosome, "Maximum number of changepoints must be non-negative.");
+ ParamUtils.isPositiveOrZero(kernelVariance, "Variance of Gaussian kernel must be non-negative (if zero, a linear kernel will be used).");
+ ParamUtils.isPositive(kernelApproximationDimension, "Dimension of kernel approximation must be positive.");
+ Utils.validateArg(windowSizes.stream().allMatch(ws -> ws > 0), "Window sizes must all be positive.");
+ Utils.validateArg(new HashSet<>(windowSizes).size() == windowSizes.size(), "Window sizes must all be unique.");
+ ParamUtils.isPositiveOrZero(numChangepointsPenaltyLinearFactor,
+ "Linear factor for the penalty on the number of changepoints per chromosome must be non-negative.");
+ ParamUtils.isPositiveOrZero(numChangepointsPenaltyLogLinearFactor,
+ "Log-linear factor for the penalty on the number of changepoints per chromosome must be non-negative.");
+
+ logger.info(String.format("Finding changepoints in %d data points and %d chromosomes...",
+ allelicCounts.getRecords().size(), allelicCountsPerChromosome.size()));
+
+ //loop over chromosomes, find changepoints, and create allele-fraction segments
+        final List<AlleleFractionSegment> segments = new ArrayList<>();
+ for (final String chromosome : allelicCountsPerChromosome.keySet()) {
+            final List<AllelicCount> allelicCountsInChromosome = allelicCountsPerChromosome.get(chromosome);
+ final int numAllelicCountsInChromosome = allelicCountsInChromosome.size();
+ logger.info(String.format("Finding changepoints in %d data points in chromosome %s...",
+ numAllelicCountsInChromosome, chromosome));
+
+ if (numAllelicCountsInChromosome < MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME) {
+ logger.warn(String.format("Number of points in chromosome %s (%d) is less than that required (%d), skipping segmentation...",
+ chromosome, numAllelicCountsInChromosome, MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME));
+ final int start = allelicCountsInChromosome.get(0).getStart();
+ final int end = allelicCountsInChromosome.get(numAllelicCountsInChromosome - 1).getEnd();
+ segments.add(new AlleleFractionSegment(
+ new SimpleInterval(chromosome, start, end), numAllelicCountsInChromosome));
+ continue;
+ }
+
+            final List<Double> alternateAlleleFractionsInChromosome = allelicCountsPerChromosome.get(chromosome).stream()
+ .map(AllelicCount::getAlternateAlleleFraction)
+ .collect(Collectors.toList());
+            final List<Integer> changepoints = new ArrayList<>(new KernelSegmenter<>(alternateAlleleFractionsInChromosome)
+ .findChangepoints(maxNumChangepointsPerChromosome, KERNEL.apply(kernelVariance), kernelApproximationDimension,
+ windowSizes, numChangepointsPenaltyLinearFactor, numChangepointsPenaltyLogLinearFactor, KernelSegmenter.ChangepointSortOrder.INDEX));
+
+ if (!changepoints.contains(numAllelicCountsInChromosome)) {
+ changepoints.add(numAllelicCountsInChromosome - 1);
+ }
+ int previousChangepoint = -1;
+ for (final int changepoint : changepoints) {
+ final int start = allelicCountsPerChromosome.get(chromosome).get(previousChangepoint + 1).getStart();
+ final int end = allelicCountsPerChromosome.get(chromosome).get(changepoint).getEnd();
+                final List<AllelicCount> allelicCountsInSegment = allelicCountsInChromosome.subList(
+ previousChangepoint + 1, changepoint + 1);
+ segments.add(new AlleleFractionSegment(
+ new SimpleInterval(chromosome, start, end), allelicCountsInSegment));
+ previousChangepoint = changepoint;
+ }
+ }
+ logger.info(String.format("Found %d segments in %d chromosomes.", segments.size(), allelicCountsPerChromosome.keySet().size()));
+ return new AlleleFractionSegmentCollection(allelicCounts.getSampleMetadata(), segments);
+ }
+}
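
Aside: the KERNEL constant above maps a variance to a kernel function, so the Gaussian-vs-linear choice is made once per segmentation run and the returned BiFunction is then evaluated per pair of data points. A minimal self-contained sketch of the same pattern, using plain Math.exp in place of FastMath (the class name and values here are illustrative only, not part of this patch):

    import java.util.function.BiFunction;
    import java.util.function.Function;

    public final class KernelSketch {
        // Maps a variance to a kernel k(x, y); zero variance selects the linear
        // kernel x * y, matching the convention used by the segmenters above.
        private static final Function<Double, BiFunction<Double, Double, Double>> KERNEL =
                variance -> variance == 0.
                        ? (x, y) -> x * y
                        : (x, y) -> Math.exp(-(x - y) * (x - y) / (2. * variance));

        public static void main(final String[] args) {
            final BiFunction<Double, Double, Double> gaussian = KERNEL.apply(0.025);
            final BiFunction<Double, Double, Double> linear = KERNEL.apply(0.);
            System.out.println(gaussian.apply(0.5, 0.5)); // 1.0 for identical points
            System.out.println(gaussian.apply(0., 1.));   // exp(-20), roughly 2.1e-9
            System.out.println(linear.apply(0.5, 0.5));   // 0.25
        }
    }
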
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/CopyRatioKernelSegmenter.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/CopyRatioKernelSegmenter.java
new file mode 100644
index 00000000000..e038b10aa02
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/CopyRatioKernelSegmenter.java
@@ -0,0 +1,118 @@
+package org.broadinstitute.hellbender.tools.copynumber.segmentation;
+
+import org.apache.commons.math3.util.FastMath;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioSegmentCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatioSegment;
+import org.broadinstitute.hellbender.tools.copynumber.utils.segmentation.KernelSegmenter;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.param.ParamUtils;
+
+import java.util.*;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+/**
+ * Segments copy-ratio data using kernel segmentation. Segments do not span chromosomes.
+ *
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+public final class CopyRatioKernelSegmenter {
+ private static final Logger logger = LogManager.getLogger(CopyRatioKernelSegmenter.class);
+
+ private static final int MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME = 10;
+
+ //Gaussian kernel for a specified variance; if variance is zero, use a linear kernel
+    private static final Function<Double, BiFunction<Double, Double, Double>> KERNEL =
+ variance -> variance == 0.
+ ? (x, y) -> x * y
+ : (x, y) -> FastMath.exp(-(x - y) * (x - y) / (2. * variance));
+
+ private final CopyRatioCollection denoisedCopyRatios;
+    private final Map<String, List<CopyRatio>> denoisedCopyRatiosPerChromosome; //in log2 space
+
+ /**
+ * @param denoisedCopyRatios in log2 space
+ */
+ public CopyRatioKernelSegmenter(final CopyRatioCollection denoisedCopyRatios) {
+ Utils.nonNull(denoisedCopyRatios);
+ this.denoisedCopyRatios = denoisedCopyRatios;
+ denoisedCopyRatiosPerChromosome = denoisedCopyRatios.getRecords().stream()
+ .collect(Collectors.groupingBy(
+ CopyRatio::getContig,
+ LinkedHashMap::new,
+ Collectors.mapping(Function.identity(), Collectors.toList())));
+ }
+
+ /**
+ * Segments the internally held {@link CopyRatioCollection} using a separate {@link KernelSegmenter} for each chromosome.
+ * @param kernelVariance variance of the Gaussian kernel; if zero, a linear kernel is used instead
+ */
+ public CopyRatioSegmentCollection findSegmentation(final int maxNumChangepointsPerChromosome,
+ final double kernelVariance,
+ final int kernelApproximationDimension,
+                                                       final List<Integer> windowSizes,
+ final double numChangepointsPenaltyLinearFactor,
+ final double numChangepointsPenaltyLogLinearFactor) {
+ ParamUtils.isPositiveOrZero(maxNumChangepointsPerChromosome, "Maximum number of changepoints must be non-negative.");
+ ParamUtils.isPositiveOrZero(kernelVariance, "Variance of Gaussian kernel must be non-negative (if zero, a linear kernel will be used).");
+ ParamUtils.isPositive(kernelApproximationDimension, "Dimension of kernel approximation must be positive.");
+ Utils.validateArg(windowSizes.stream().allMatch(ws -> ws > 0), "Window sizes must all be positive.");
+ Utils.validateArg(new HashSet<>(windowSizes).size() == windowSizes.size(), "Window sizes must all be unique.");
+ ParamUtils.isPositiveOrZero(numChangepointsPenaltyLinearFactor,
+ "Linear factor for the penalty on the number of changepoints per chromosome must be non-negative.");
+ ParamUtils.isPositiveOrZero(numChangepointsPenaltyLogLinearFactor,
+ "Log-linear factor for the penalty on the number of changepoints per chromosome must be non-negative.");
+
+ logger.info(String.format("Finding changepoints in %d data points and %d chromosomes...",
+ denoisedCopyRatios.getRecords().size(), denoisedCopyRatiosPerChromosome.size()));
+
+ //loop over chromosomes, find changepoints, and create copy-ratio segments
+        final List<CopyRatioSegment> segments = new ArrayList<>();
+ for (final String chromosome : denoisedCopyRatiosPerChromosome.keySet()) {
+            final List<CopyRatio> denoisedCopyRatiosInChromosome = denoisedCopyRatiosPerChromosome.get(chromosome);
+ final int numDenoisedCopyRatiosInChromosome = denoisedCopyRatiosInChromosome.size();
+ logger.info(String.format("Finding changepoints in %d data points in chromosome %s...",
+ numDenoisedCopyRatiosInChromosome, chromosome));
+
+ if (numDenoisedCopyRatiosInChromosome < MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME) {
+ logger.warn(String.format("Number of points in chromosome %s (%d) is less than that required (%d), skipping segmentation...",
+ chromosome, numDenoisedCopyRatiosInChromosome, MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME));
+ final int start = denoisedCopyRatiosPerChromosome.get(chromosome).get(0).getStart();
+ final int end = denoisedCopyRatiosPerChromosome.get(chromosome).get(numDenoisedCopyRatiosInChromosome - 1).getEnd();
+ segments.add(new CopyRatioSegment(
+ new SimpleInterval(chromosome, start, end), denoisedCopyRatiosInChromosome));
+ continue;
+ }
+
+            final List<Double> denoisedLog2CopyRatioValuesInChromosome = denoisedCopyRatiosInChromosome.stream()
+ .map(CopyRatio::getLog2CopyRatioValue)
+ .collect(Collectors.toList());
+            final List<Integer> changepoints = new ArrayList<>(new KernelSegmenter<>(denoisedLog2CopyRatioValuesInChromosome)
+ .findChangepoints(maxNumChangepointsPerChromosome, KERNEL.apply(kernelVariance), kernelApproximationDimension,
+ windowSizes, numChangepointsPenaltyLinearFactor, numChangepointsPenaltyLogLinearFactor, KernelSegmenter.ChangepointSortOrder.INDEX));
+
+ if (!changepoints.contains(numDenoisedCopyRatiosInChromosome)) {
+ changepoints.add(numDenoisedCopyRatiosInChromosome - 1);
+ }
+ int previousChangepoint = -1;
+ for (final int changepoint : changepoints) {
+ final int start = denoisedCopyRatiosPerChromosome.get(chromosome).get(previousChangepoint + 1).getStart();
+ final int end = denoisedCopyRatiosPerChromosome.get(chromosome).get(changepoint).getEnd();
+                final List<CopyRatio> denoisedCopyRatiosInSegment = denoisedCopyRatiosInChromosome.subList(
+ previousChangepoint + 1, changepoint + 1);
+ segments.add(new CopyRatioSegment(
+ new SimpleInterval(chromosome, start, end),
+ denoisedCopyRatiosInSegment));
+ previousChangepoint = changepoint;
+ }
+ }
+ logger.info(String.format("Found %d segments in %d chromosomes.", segments.size(), denoisedCopyRatiosPerChromosome.keySet().size()));
+ return new CopyRatioSegmentCollection(denoisedCopyRatios.getSampleMetadata(), segments);
+ }
+}
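
Aside: in both segmenters above, KernelSegmenter is invoked with ChangepointSortOrder.INDEX and returns changepoints as the index of the last data point in each segment; the per-chromosome loops then walk (previousChangepoint + 1, changepoint) pairs to recover segment index ranges, with the final index appended if absent so the last segment is closed. A small sketch of that bookkeeping in isolation (names here are hypothetical):

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    public final class ChangepointsToSegments {
        // Converts changepoint indices (index of the last point in each segment,
        // in increasing order, with the final index closing the last segment)
        // into inclusive [start, end] index pairs, mirroring the loops above.
        static List<int[]> toIndexRanges(final List<Integer> changepoints) {
            final List<int[]> ranges = new ArrayList<>();
            int previousChangepoint = -1;
            for (final int changepoint : changepoints) {
                ranges.add(new int[]{previousChangepoint + 1, changepoint});
                previousChangepoint = changepoint;
            }
            return ranges;
        }

        public static void main(final String[] args) {
            // 10 points with changepoints at indices 3 and 6; index 9 closes the last segment.
            for (final int[] range : toIndexRanges(Arrays.asList(3, 6, 9))) {
                System.out.println(range[0] + "-" + range[1]); // prints 0-3, 4-6, 7-9
            }
        }
    }
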
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/MultidimensionalKernelSegmenter.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/MultidimensionalKernelSegmenter.java
new file mode 100644
index 00000000000..10640e0ceaa
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/MultidimensionalKernelSegmenter.java
@@ -0,0 +1,199 @@
+package org.broadinstitute.hellbender.tools.copynumber.segmentation;
+
+import htsjdk.samtools.util.Locatable;
+import htsjdk.samtools.util.OverlapDetector;
+import org.apache.commons.math3.distribution.NormalDistribution;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.MultidimensionalSegmentCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.MultidimensionalSegment;
+import org.broadinstitute.hellbender.tools.copynumber.utils.segmentation.KernelSegmenter;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.param.ParamUtils;
+
+import java.util.*;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+/**
+ * Segments copy-ratio and alternate-allele-fraction data using kernel segmentation. Segments do not span chromosomes.
+ * Only the first allele-fraction site in each copy-ratio interval is used. The alternate-allele fraction in
+ * copy-ratio intervals that do not contain any sites is imputed to be balanced at 0.5.
+ *
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+public final class MultidimensionalKernelSegmenter {
+ private static final Logger logger = LogManager.getLogger(MultidimensionalKernelSegmenter.class);
+
+ private static final int MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME = 10;
+
+ //assume alternate-allele fraction is 0.5 for missing data
+ private static final SimpleInterval DUMMY_INTERVAL = new SimpleInterval("DUMMY", 1, 1);
+ private static final AllelicCount BALANCED_ALLELIC_COUNT = new AllelicCount(DUMMY_INTERVAL, 1, 1);
+
+    //Gaussian kernel for a specified standard deviation; if standard deviation is zero, use a linear kernel
+    private static final Function<Double, BiFunction<Double, Double, Double>> KERNEL =
+ standardDeviation -> standardDeviation == 0.
+ ? (x, y) -> x * y
+ : (x, y) -> new NormalDistribution(null, x, standardDeviation).density(y);
+
+ static final class MultidimensionalPoint implements Locatable {
+ private final SimpleInterval interval;
+ private final double log2CopyRatio;
+ private final double alternateAlleleFraction;
+
+ MultidimensionalPoint(final SimpleInterval interval,
+ final double log2CopyRatio,
+ final double alternateAlleleFraction) {
+ this.interval = interval;
+ this.log2CopyRatio = log2CopyRatio;
+ this.alternateAlleleFraction = alternateAlleleFraction;
+ }
+
+ @Override
+ public String getContig() {
+ return interval.getContig();
+ }
+
+ @Override
+ public int getStart() {
+ return interval.getStart();
+ }
+
+ @Override
+ public int getEnd() {
+ return interval.getEnd();
+ }
+ }
+
+ private final CopyRatioCollection denoisedCopyRatios;
+    private final OverlapDetector<CopyRatio> copyRatioMidpointOverlapDetector;
+ private final AllelicCountCollection allelicCounts;
+    private final OverlapDetector<AllelicCount> allelicCountOverlapDetector;
+    private final Map<String, List<MultidimensionalPoint>> multidimensionalPointsPerChromosome;
+
+ public MultidimensionalKernelSegmenter(final CopyRatioCollection denoisedCopyRatios,
+ final AllelicCountCollection allelicCounts) {
+ Utils.nonNull(denoisedCopyRatios);
+ Utils.nonNull(allelicCounts);
+ Utils.validateArg(denoisedCopyRatios.getSampleName().equals(allelicCounts.getSampleName()),
+ "Sample names do not match.");
+ this.denoisedCopyRatios = denoisedCopyRatios;
+ copyRatioMidpointOverlapDetector = denoisedCopyRatios.getMidpointOverlapDetector();
+ this.allelicCounts = allelicCounts;
+ allelicCountOverlapDetector = allelicCounts.getOverlapDetector();
+ final int numAllelicCountsToUse = (int) denoisedCopyRatios.getRecords().stream()
+ .filter(allelicCountOverlapDetector::overlapsAny)
+ .count();
+ logger.info(String.format("Using first allelic-count site in each copy-ratio interval (%d / %d) for multidimensional segmentation...",
+ numAllelicCountsToUse, allelicCounts.size()));
+ multidimensionalPointsPerChromosome = denoisedCopyRatios.getRecords().stream()
+ .map(cr -> new MultidimensionalPoint(
+ cr.getInterval(),
+ cr.getLog2CopyRatioValue(),
+ allelicCountOverlapDetector.getOverlaps(cr).stream()
+ .min(SampleLocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR::compare)
+ .orElse(BALANCED_ALLELIC_COUNT).getAlternateAlleleFraction()))
+ .collect(Collectors.groupingBy(
+ MultidimensionalPoint::getContig,
+ LinkedHashMap::new,
+ Collectors.toList()));
+ }
+
+ /**
+ * Segments the internally held {@link CopyRatioCollection} and {@link AllelicCountCollection}
+ * using a separate {@link KernelSegmenter} for each chromosome.
+ * @param kernelVarianceCopyRatio variance of the Gaussian kernel used for copy-ratio data;
+ * if zero, a linear kernel is used instead
+ * @param kernelVarianceAlleleFraction variance of the Gaussian kernel used for allele-fraction data;
+ * if zero, a linear kernel is used instead
+ * @param kernelScalingAlleleFraction relative scaling S of the kernel K_AF for allele-fraction data
+ * to the kernel K_CR for copy-ratio data;
+ * the total kernel is K_CR + S * K_AF
+ */
+ public MultidimensionalSegmentCollection findSegmentation(final int maxNumChangepointsPerChromosome,
+ final double kernelVarianceCopyRatio,
+ final double kernelVarianceAlleleFraction,
+ final double kernelScalingAlleleFraction,
+ final int kernelApproximationDimension,
+                                                              final List<Integer> windowSizes,
+ final double numChangepointsPenaltyLinearFactor,
+ final double numChangepointsPenaltyLogLinearFactor) {
+ ParamUtils.isPositiveOrZero(maxNumChangepointsPerChromosome, "Maximum number of changepoints must be non-negative.");
+ ParamUtils.isPositiveOrZero(kernelVarianceCopyRatio, "Variance of copy-ratio Gaussian kernel must be non-negative (if zero, a linear kernel will be used).");
+ ParamUtils.isPositiveOrZero(kernelVarianceAlleleFraction, "Variance of allele-fraction Gaussian kernel must be non-negative (if zero, a linear kernel will be used).");
+ ParamUtils.isPositiveOrZero(kernelScalingAlleleFraction, "Scaling of allele-fraction Gaussian kernel must be non-negative.");
+ ParamUtils.isPositive(kernelApproximationDimension, "Dimension of kernel approximation must be positive.");
+ Utils.validateArg(windowSizes.stream().allMatch(ws -> ws > 0), "Window sizes must all be positive.");
+ Utils.validateArg(new HashSet<>(windowSizes).size() == windowSizes.size(), "Window sizes must all be unique.");
+ ParamUtils.isPositiveOrZero(numChangepointsPenaltyLinearFactor,
+ "Linear factor for the penalty on the number of changepoints per chromosome must be non-negative.");
+ ParamUtils.isPositiveOrZero(numChangepointsPenaltyLogLinearFactor,
+ "Log-linear factor for the penalty on the number of changepoints per chromosome must be non-negative.");
+
+        final BiFunction<MultidimensionalPoint, MultidimensionalPoint, Double> kernel = constructKernel(
+ kernelVarianceCopyRatio, kernelVarianceAlleleFraction, kernelScalingAlleleFraction);
+
+ logger.info(String.format("Finding changepoints in (%d, %d) data points and %d chromosomes...",
+ denoisedCopyRatios.getRecords().size(), allelicCounts.size(), multidimensionalPointsPerChromosome.size()));
+
+        //loop over chromosomes, find changepoints, and create multidimensional segments
+        final List<MultidimensionalSegment> segments = new ArrayList<>();
+ for (final String chromosome : multidimensionalPointsPerChromosome.keySet()) {
+            final List<MultidimensionalPoint> multidimensionalPointsInChromosome = multidimensionalPointsPerChromosome.get(chromosome);
+ final int numMultidimensionalPointsInChromosome = multidimensionalPointsInChromosome.size();
+ logger.info(String.format("Finding changepoints in %d data points in chromosome %s...",
+ numMultidimensionalPointsInChromosome, chromosome));
+
+ if (numMultidimensionalPointsInChromosome < MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME) {
+ logger.warn(String.format("Number of points in chromosome %s (%d) is less than that required (%d), skipping segmentation...",
+ chromosome, numMultidimensionalPointsInChromosome, MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME));
+ final int start = multidimensionalPointsInChromosome.get(0).getStart();
+ final int end = multidimensionalPointsInChromosome.get(numMultidimensionalPointsInChromosome - 1).getEnd();
+ segments.add(new MultidimensionalSegment(
+ new SimpleInterval(chromosome, start, end),
+ copyRatioMidpointOverlapDetector,
+ allelicCountOverlapDetector));
+ continue;
+ }
+
+            final List<Integer> changepoints = new ArrayList<>(new KernelSegmenter<>(multidimensionalPointsInChromosome)
+ .findChangepoints(maxNumChangepointsPerChromosome, kernel, kernelApproximationDimension,
+ windowSizes, numChangepointsPenaltyLinearFactor, numChangepointsPenaltyLogLinearFactor, KernelSegmenter.ChangepointSortOrder.INDEX));
+
+ if (!changepoints.contains(numMultidimensionalPointsInChromosome)) {
+ changepoints.add(numMultidimensionalPointsInChromosome - 1);
+ }
+ int previousChangepoint = -1;
+ for (final int changepoint : changepoints) {
+ final int start = multidimensionalPointsPerChromosome.get(chromosome).get(previousChangepoint + 1).getStart();
+ final int end = multidimensionalPointsPerChromosome.get(chromosome).get(changepoint).getEnd();
+ segments.add(new MultidimensionalSegment(
+ new SimpleInterval(chromosome, start, end),
+ copyRatioMidpointOverlapDetector,
+ allelicCountOverlapDetector));
+ previousChangepoint = changepoint;
+ }
+ }
+ logger.info(String.format("Found %d segments in %d chromosomes.", segments.size(), multidimensionalPointsPerChromosome.keySet().size()));
+ return new MultidimensionalSegmentCollection(allelicCounts.getSampleMetadata(), segments);
+ }
+
+    private BiFunction<MultidimensionalPoint, MultidimensionalPoint, Double> constructKernel(final double kernelVarianceCopyRatio,
+ final double kernelVarianceAlleleFraction,
+ final double kernelScalingAlleleFraction) {
+ final double standardDeviationCopyRatio = Math.sqrt(kernelVarianceCopyRatio);
+ final double standardDeviationAlleleFraction = Math.sqrt(kernelVarianceAlleleFraction);
+ return (p1, p2) ->
+ KERNEL.apply(standardDeviationCopyRatio).apply(p1.log2CopyRatio, p2.log2CopyRatio) +
+ kernelScalingAlleleFraction * KERNEL.apply(standardDeviationAlleleFraction).apply(p1.alternateAlleleFraction, p2.alternateAlleleFraction);
+
+ }
+}
\ No newline at end of file
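
Aside: per the javadoc of findSegmentation above, the total kernel is K_CR + S * K_AF evaluated on (log2 copy ratio, alternate-allele fraction) points, where copy-ratio intervals without sites have already had their allele fraction imputed to a balanced 0.5 at construction time. Note also that this class builds each one-dimensional Gaussian factor from a normalized NormalDistribution density, unlike the unnormalized FastMath.exp kernel in the two one-dimensional segmenters. A self-contained sketch of the combination, using unnormalized Gaussians for simplicity (all names here are hypothetical):

    import java.util.function.BiFunction;

    public final class CombinedKernelSketch {
        // A point in (log2 copy ratio, alternate-allele fraction) space.
        static final class Point {
            final double log2CopyRatio;
            final double alternateAlleleFraction;

            Point(final double log2CopyRatio, final double alternateAlleleFraction) {
                this.log2CopyRatio = log2CopyRatio;
                this.alternateAlleleFraction = alternateAlleleFraction;
            }
        }

        // Unnormalized Gaussian kernel for a given variance; zero variance selects a linear kernel.
        static BiFunction<Double, Double, Double> kernel(final double variance) {
            return variance == 0.
                    ? (x, y) -> x * y
                    : (x, y) -> Math.exp(-(x - y) * (x - y) / (2. * variance));
        }

        // Total kernel K_CR + S * K_AF, as described in the javadoc above.
        static BiFunction<Point, Point, Double> combinedKernel(final double varianceCopyRatio,
                                                               final double varianceAlleleFraction,
                                                               final double scalingAlleleFraction) {
            final BiFunction<Double, Double, Double> kernelCopyRatio = kernel(varianceCopyRatio);
            final BiFunction<Double, Double, Double> kernelAlleleFraction = kernel(varianceAlleleFraction);
            return (p1, p2) -> kernelCopyRatio.apply(p1.log2CopyRatio, p2.log2CopyRatio)
                    + scalingAlleleFraction * kernelAlleleFraction.apply(p1.alternateAlleleFraction, p2.alternateAlleleFraction);
        }

        public static void main(final String[] args) {
            final BiFunction<Point, Point, Double> k = combinedKernel(0.025, 0.025, 1.);
            System.out.println(k.apply(new Point(0., 0.5), new Point(0., 0.5))); // 2.0: both factors equal 1
        }
    }
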
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/segmentation/KernelSegmenter.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/segmentation/KernelSegmenter.java
index 24bb5d3513f..baf717cef39 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/segmentation/KernelSegmenter.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/segmentation/KernelSegmenter.java
@@ -8,7 +8,6 @@
import org.apache.commons.math3.random.RandomGeneratorFactory;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
-import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.copynumber.utils.optimization.PersistenceOptimizer;
import org.broadinstitute.hellbender.utils.IndexRange;
import org.broadinstitute.hellbender.utils.MathUtils;
@@ -134,18 +133,18 @@ public List<Integer> findChangepoints(final int maxNumChangepoints,
return Collections.emptyList();
}
- logger.info(String.format("Finding up to %d changepoints in %d data points...", maxNumChangepoints, data.size()));
+ logger.debug(String.format("Finding up to %d changepoints in %d data points...", maxNumChangepoints, data.size()));
final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED));
- logger.info("Calculating low-rank approximation to kernel matrix...");
+ logger.debug("Calculating low-rank approximation to kernel matrix...");
final RealMatrix reducedObservationMatrix = calculateReducedObservationMatrix(rng, data, kernel, kernelApproximationDimension);
final double[] kernelApproximationDiagonal = calculateKernelApproximationDiagonal(reducedObservationMatrix);
- logger.info(String.format("Finding changepoint candidates for all window sizes %s...", windowSizes.toString()));
+ logger.debug(String.format("Finding changepoint candidates for all window sizes %s...", windowSizes.toString()));
         final List<Integer> changepointCandidates = findChangepointCandidates(
data, reducedObservationMatrix, kernelApproximationDiagonal, maxNumChangepoints, windowSizes);
- logger.info("Performing backward model selection on changepoint candidates...");
+ logger.debug("Performing backward model selection on changepoint candidates...");
return selectChangepoints(
changepointCandidates, maxNumChangepoints, numChangepointsPenaltyLinearFactor, numChangepointsPenaltyLogLinearFactor,
reducedObservationMatrix, kernelApproximationDiagonal).stream()
@@ -198,19 +197,19 @@ private static <DATA> RealMatrix calculateReducedObservationMatrix(final RandomG
                                                                         final BiFunction<DATA, DATA, Double> kernel,
final int kernelApproximationDimension) {
if (kernelApproximationDimension > data.size()) {
- logger.warn("Specified dimension of the kernel approximation exceeds the number of data points to segment; " +
- "using all data points to calculate kernel matrix.");
+ logger.warn(String.format("Specified dimension of the kernel approximation (%d) exceeds the number of data points (%d) to segment; " +
+ "using all data points to calculate kernel matrix.", kernelApproximationDimension, data.size()));
}
//subsample data with replacement
final int numSubsample = Math.min(kernelApproximationDimension, data.size());
- logger.info(String.format("Subsampling %d points from data to find kernel approximation...", numSubsample));
+ logger.debug(String.format("Subsampling %d points from data to find kernel approximation...", numSubsample));
         final List<DATA> dataSubsample = numSubsample == data.size()
? data
: IntStream.range(0, numSubsample).mapToObj(i -> data.get(rng.nextInt(data.size()))).collect(Collectors.toList());
//calculate (symmetric) kernel matrix of subsampled data
- logger.info(String.format("Calculating kernel matrix of subsampled data (%d x %d)...", numSubsample, numSubsample));
+ logger.debug(String.format("Calculating kernel matrix of subsampled data (%d x %d)...", numSubsample, numSubsample));
final RealMatrix subKernelMatrix = new Array2DRowRealMatrix(numSubsample, numSubsample);
for (int i = 0; i < numSubsample; i++) {
for (int j = 0; j < i; j++) {
@@ -222,11 +221,11 @@ private static RealMatrix calculateReducedObservationMatrix(final RandomG
}
//perform SVD of kernel matrix of subsampled data
- logger.info(String.format("Performing SVD of kernel matrix of subsampled data (%d x %d)...", numSubsample, numSubsample));
+ logger.debug(String.format("Performing SVD of kernel matrix of subsampled data (%d x %d)...", numSubsample, numSubsample));
final SingularValueDecomposition svd = new SingularValueDecomposition(subKernelMatrix);
//calculate reduced observation matrix
- logger.info(String.format("Calculating reduced observation matrix (%d x %d)...", data.size(), numSubsample));
+ logger.debug(String.format("Calculating reduced observation matrix (%d x %d)...", data.size(), numSubsample));
final double[] invSqrtSingularValues = Arrays.stream(svd.getSingularValues()).map(Math::sqrt).map(x -> 1. / (x + EPSILON)).toArray();
final RealMatrix subKernelUMatrix = new Array2DRowRealMatrix(numSubsample, numSubsample);
subKernelUMatrix.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@@ -267,7 +266,7 @@ private static <DATA> List<Integer> findChangepointCandidates(final List<DATA> d
logger.debug(String.format("Calculating local changepoints costs for window size %d...", windowSize));
if (windowSize > data.size()) {
logger.warn(String.format("Number of points needed to calculate local changepoint costs (2 * window size = %d) " +
- "exceeds number of data points %d. Local changepoint costs will not be calculated for this window size.",
+ "exceeds number of data points (%d). Local changepoint costs will not be calculated for this window size.",
2 * windowSize, data.size()));
continue;
}
@@ -281,7 +280,7 @@ private static <DATA> List<Integer> findChangepointCandidates(final List<DATA> d
}
if (changepointCandidates.isEmpty()) {
- throw new GATKException.ShouldNeverReachHereException("No changepoint candidates found.");
+ logger.warn("No changepoint candidates were found. The specified window sizes may be inappropriate, or there may be insufficient data points");
}
return changepointCandidates;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genome/SparkGenomeReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/genome/SparkGenomeReadCounts.java
index b2addafcd45..7d7fe6f27f1 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/genome/SparkGenomeReadCounts.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/genome/SparkGenomeReadCounts.java
@@ -16,8 +16,8 @@
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.exceptions.UserException;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCount;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata;
import org.broadinstitute.hellbender.tools.exome.ReadCountCollectionUtils;
import org.broadinstitute.hellbender.tools.exome.SampleCollection;
@@ -31,7 +31,6 @@
import java.io.File;
import java.util.*;
import java.util.stream.Collectors;
-import java.util.stream.IntStream;
/**
* Collects read counts on whole genome sequencing (WGS) alignments using Spark.
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterReader.java b/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterReader.java
index ab07fa3f838..e341ea87a0b 100644
--- a/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterReader.java
+++ b/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterReader.java
@@ -27,19 +27,19 @@ public ParameterReader(final File file, final Class<T> parameterClass) throws IO
     protected Map.Entry<T, PosteriorSummary> createRecord(final DataLine dataLine) {
final String parameterName = dataLine.get(ParameterTableColumn.PARAMETER_NAME);
final T parameter = Enum.valueOf(parameterClass, parameterName);
- final double center = dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_MODE);
- final double lower = dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_LOWER);
- final double upper = dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_UPPER);
+ final double center = dataLine.getDouble(ParameterTableColumn.POSTERIOR_MODE);
+ final double lower = dataLine.getDouble(ParameterTableColumn.POSTERIOR_LOWER);
+ final double upper = dataLine.getDouble(ParameterTableColumn.POSTERIOR_UPPER);
final DecileCollection deciles = new DecileCollection(Arrays.asList(
- dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_10),
- dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_20),
- dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_30),
- dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_40),
- dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_50),
- dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_60),
- dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_70),
- dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_80),
- dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_90)));
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_10),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_20),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_30),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_40),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_50),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_60),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_70),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_80),
+ dataLine.getDouble(ParameterTableColumn.POSTERIOR_90)));
final PosteriorSummary posteriorSummary = new PosteriorSummary(center, lower, upper);
posteriorSummary.setDeciles(deciles);
return new AbstractMap.SimpleEntry<>(parameter, posteriorSummary);
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterTableColumn.java b/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterTableColumn.java
index 36be10d8e6c..e1361f561eb 100644
--- a/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterTableColumn.java
+++ b/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterTableColumn.java
@@ -6,28 +6,20 @@
* @author Samuel Lee <slee@broadinstitute.org>
*/
public enum ParameterTableColumn {
- PARAMETER_NAME("Parameter"),
- PARAMETER_POSTERIOR_MODE("Post_Mode"),
- PARAMETER_POSTERIOR_LOWER("Post_Lo"),
- PARAMETER_POSTERIOR_UPPER("Post_Hi"),
- PARAMETER_POSTERIOR_10("Post_10"),
- PARAMETER_POSTERIOR_20("Post_20"),
- PARAMETER_POSTERIOR_30("Post_30"),
- PARAMETER_POSTERIOR_40("Post_40"),
- PARAMETER_POSTERIOR_50("Post_50"),
- PARAMETER_POSTERIOR_60("Post_60"),
- PARAMETER_POSTERIOR_70("Post_70"),
- PARAMETER_POSTERIOR_80("Post_80"),
- PARAMETER_POSTERIOR_90("Post_90");
+ PARAMETER_NAME,
+ POSTERIOR_MODE,
+ POSTERIOR_LOWER,
+ POSTERIOR_UPPER,
+ POSTERIOR_10,
+ POSTERIOR_20,
+ POSTERIOR_30,
+ POSTERIOR_40,
+ POSTERIOR_50,
+ POSTERIOR_60,
+ POSTERIOR_70,
+ POSTERIOR_80,
+ POSTERIOR_90;
- private final String columnName; //store the column names
-
- ParameterTableColumn(final String columnName) { this.columnName = columnName; }
-
- @Override
- public String toString() {
- return columnName;
- }
public static final TableColumnCollection COLUMNS = new TableColumnCollection((Object[]) values());
}
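
Aside: COLUMNS is built from values(), which presumably uses each constant's string form, so with the custom toString() removed the column headers fall back to Enum.toString(), i.e. the constant names themselves (e.g. POSTERIOR_MODE rather than Post_Mode). This is consistent with the renamed column lookups in ParameterReader earlier in this patch. A toy illustration of that default-name behavior (the enum here is illustrative, not the GATK one):

    public final class EnumColumnNames {
        enum Column {
            PARAMETER_NAME,
            POSTERIOR_MODE,
            POSTERIOR_LOWER;
            // No toString() override: Enum.toString() falls back to name().
        }

        public static void main(final String[] args) {
            // Prints PARAMETER_NAME, POSTERIOR_MODE, POSTERIOR_LOWER: the strings
            // a column collection built from values() would use as headers.
            for (final Column column : Column.values()) {
                System.out.println(column);
            }
        }
    }
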
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/GetSampleNameIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/GetSampleNameIntegrationTest.java
index f247bdebbd4..f1e87b1fac5 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/GetSampleNameIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/GetSampleNameIntegrationTest.java
@@ -11,31 +11,30 @@
import java.nio.file.Files;
public class GetSampleNameIntegrationTest extends CommandLineProgramTest {
- private static final String TEST_SUB_DIR = publicTestDir + "org/broadinstitute/hellbender/tools/copynumber/allelic";
- private static final File NORMAL_BAM_FILE = new File(TEST_SUB_DIR, "collect-allelic-counts-normal.bam");
- private static final String TEST_SUB_DIR2 = publicTestDir + "org/broadinstitute/hellbender/tools";
- private static final File MS_BAD_BAM_FILE = new File(TEST_SUB_DIR2, "multi_sample_bam_header.bam");
+ private static final File SINGLE_SAMPLE_BAM_FILE = new File(toolsTestDir, "valid.bam");
+ private static final File BAD_MULTI_SAMPLE_BAM_FILE = new File(toolsTestDir, "multi_sample_bam_header.bam");
@Test
public void testBasicUsage() throws IOException {
final File outputFile = createTempFile("get-sample-name", ".txt");
final String[] arguments = {
- "-" + StandardArgumentDefinitions.INPUT_SHORT_NAME, NORMAL_BAM_FILE.getAbsolutePath(),
- "-" + StandardArgumentDefinitions.OUTPUT_SHORT_NAME, outputFile.getAbsolutePath()
+ "-" + StandardArgumentDefinitions.INPUT_SHORT_NAME, SINGLE_SAMPLE_BAM_FILE.getAbsolutePath(),
+ "-" + StandardArgumentDefinitions.OUTPUT_SHORT_NAME, outputFile.getAbsolutePath(),
+ "--verbosity", "INFO"
};
runCommandLine(arguments);
Assert.assertTrue(outputFile.exists());
Assert.assertTrue(outputFile.length() > 0);
Assert.assertTrue(Files.readAllLines(outputFile.toPath()).stream().count() == 1);
- Assert.assertTrue(Files.readAllLines(outputFile.toPath()).stream().filter(n -> n.equals("20")).count() == 1);
+ Assert.assertTrue(Files.readAllLines(outputFile.toPath()).stream().filter(n -> n.equals("Hi,Mom!")).count() == 1);
}
@Test(expectedExceptions = UserException.class)
public void testMultiSampleBam() {
final File outputFile = createTempFile("get-sample-name-ms", ".txt");
final String[] arguments = {
- "-" + StandardArgumentDefinitions.INPUT_SHORT_NAME, MS_BAD_BAM_FILE.getAbsolutePath(),
+ "-" + StandardArgumentDefinitions.INPUT_SHORT_NAME, BAD_MULTI_SAMPLE_BAM_FILE.getAbsolutePath(),
"-" + StandardArgumentDefinitions.OUTPUT_SHORT_NAME, outputFile.getAbsolutePath()
};
runCommandLine(arguments);
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervalsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervalsIntegrationTest.java
index 35471e1e0a3..9380063106f 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervalsIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervalsIntegrationTest.java
@@ -1,10 +1,10 @@
package org.broadinstitute.hellbender.tools.copynumber;
import org.broadinstitute.hellbender.CommandLineProgramTest;
-import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotatedInterval;
-import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotatedIntervalCollection;
-import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotationSet;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.LocatableCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotationSet;
import org.broadinstitute.hellbender.utils.IntervalMergingRule;
import org.broadinstitute.hellbender.utils.IntervalSetRule;
import org.broadinstitute.hellbender.utils.SimpleInterval;
@@ -21,7 +21,7 @@
* @author Samuel Lee <slee@broadinstitute.org>
*/
public final class AnnotateIntervalsIntegrationTest extends CommandLineProgramTest {
- private static final String TEST_SUB_DIR = toolsTestDir + "copynumber/";
+ private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber");
private static final File INTERVALS_FILE = new File(TEST_SUB_DIR, "annotate-intervals-test.interval_list");
private static final File REFERENCE_FILE = new File(b37_reference_20_21);
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegmentsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegmentsIntegrationTest.java
index 3ec326471db..6a7752fd897 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegmentsIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegmentsIntegrationTest.java
@@ -1,9 +1,8 @@
package org.broadinstitute.hellbender.tools.copynumber;
import org.broadinstitute.hellbender.CommandLineProgramTest;
-import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.caller.CalledCopyRatioSegmentCollection;
-import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CalledCopyRatioSegmentCollection;
+import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder;
import org.testng.Assert;
import org.testng.annotations.Test;
@@ -13,20 +12,16 @@
* Integration test for {@link CallCopyRatioSegments}.
*/
public final class CallCopyRatioSegmentsIntegrationTest extends CommandLineProgramTest {
- private static final File TEST_DIR = new File(toolsTestDir, "copynumber/coverage/caller");
- private static final File TEST_DENOISED_COPY_RATIOS = new File(TEST_DIR, "call-copy-ratio-segments-denoised-copy-ratios.tsv");
- private static final File TEST_SEGMENTS = new File(TEST_DIR, "call-copy-ratio-segments-segments.seg");
+ private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber");
+ private static final File TEST_SEGMENTS = new File(TEST_SUB_DIR, "call-copy-ratio-segments-segments.seg");
@Test
public void testCallSegments() {
final File outputFile = createTempFile("test.called",".seg");
-
- final String[] arguments = {
- "-" + CopyNumberStandardArgument.DENOISED_COPY_RATIOS_FILE_SHORT_NAME, TEST_DENOISED_COPY_RATIOS.getAbsolutePath(),
- "-" + CopyNumberStandardArgument.SEGMENTS_FILE_SHORT_NAME, TEST_SEGMENTS.getAbsolutePath(),
- "-" + StandardArgumentDefinitions.OUTPUT_SHORT_NAME, outputFile.getAbsolutePath()
- };
- runCommandLine(arguments);
+ final ArgumentsBuilder argsBuilder = new ArgumentsBuilder()
+ .addInput(TEST_SEGMENTS)
+ .addOutput(outputFile);
+ runCommandLine(argsBuilder);
final CalledCopyRatioSegmentCollection calledCopyRatioSegments = new CalledCopyRatioSegmentCollection(outputFile);
Assert.assertEquals(calledCopyRatioSegments.getRecords().stream().map(s -> s.getCall().getOutputString()).toArray(), new String[] {"+", "-", "0", "0"});
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCountsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCountsIntegrationTest.java
index 1286162cda6..6e3bfdfad6c 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCountsIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCountsIntegrationTest.java
@@ -2,9 +2,9 @@
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
-import org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount.AllelicCount;
-import org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount.AllelicCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.testng.Assert;
@@ -21,7 +21,7 @@
*/
public final class CollectAllelicCountsIntegrationTest extends CommandLineProgramTest {
- private static final String TEST_SUB_DIR = toolsTestDir + "copynumber/allelic";
+ private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber");
private static final File NORMAL_BAM_FILE = new File(TEST_SUB_DIR, "collect-allelic-counts-normal.bam");
private static final File TUMOR_BAM_FILE = new File(TEST_SUB_DIR, "collect-allelic-counts-tumor.bam");
private static final File SITES_FILE = new File(TEST_SUB_DIR, "collect-allelic-counts-sites.interval_list");
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCountsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCountsIntegrationTest.java
index 0797b38ca26..43e7a9046e6 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCountsIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCountsIntegrationTest.java
@@ -2,7 +2,7 @@
import htsjdk.samtools.SAMFileHeader;
import org.broadinstitute.hellbender.CommandLineProgramTest;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.utils.IntervalMergingRule;
import org.broadinstitute.hellbender.utils.IntervalSetRule;
import org.broadinstitute.hellbender.utils.SimpleInterval;
@@ -22,7 +22,7 @@
* @author Andrey Smirnov <asmirnov@broadinstitute.org>
*/
public class CollectFragmentCountsIntegrationTest extends CommandLineProgramTest {
- private static final String TEST_SUB_DIR = publicTestDir + "org/broadinstitute/hellbender/tools/copynumber/collectfragmentcounts";
+ private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber");
private static final File NA12878_BAM = new File(TEST_SUB_DIR, "collect-fragment-counts-NA12878.bam");
private static final File NA12878_FRAGMENT_COUNTS_EXPECTED_OUTPUT = new File(TEST_SUB_DIR, "collect-fragment-counts-NA12878-expected.tsv");
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormalsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormalsIntegrationTest.java
index c45bf0b941f..eeae497d848 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormalsIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormalsIntegrationTest.java
@@ -6,17 +6,17 @@
import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;
import org.broadinstitute.hdf5.HDF5File;
import org.broadinstitute.hellbender.CommandLineProgramTest;
-import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotatedInterval;
-import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotatedIntervalCollection;
-import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotationSet;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.HDF5SVDReadCountPanelOfNormals;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.SVDDenoisedCopyRatioResult;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.SVDReadCountPanelOfNormals;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCount;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.denoising.HDF5SVDReadCountPanelOfNormals;
+import org.broadinstitute.hellbender.tools.copynumber.denoising.SVDDenoisedCopyRatioResult;
+import org.broadinstitute.hellbender.tools.copynumber.denoising.SVDReadCountPanelOfNormals;
import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotationSet;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder;
import org.testng.Assert;
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCountsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCountsIntegrationTest.java
index a8587e6a9e7..dd214333a51 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCountsIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCountsIntegrationTest.java
@@ -2,8 +2,8 @@
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
@@ -20,7 +20,7 @@
* @author Samuel Lee <slee@broadinstitute.org>
*/
public final class DenoiseReadCountsIntegrationTest extends CommandLineProgramTest {
- private static final String TEST_SUB_DIR = toolsTestDir + "copynumber/coverage/denoising";
+ private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber");
private static final File WGS_READ_COUNTS_TSV_FILE = new File(TEST_SUB_DIR, "denoise-read-counts-wgs-read-counts-HCC1143_BL-n1-chr20-downsampled-deduplicated.tsv");
private static final File WGS_READ_COUNTS_HDF5_FILE = new File(TEST_SUB_DIR, "denoise-read-counts-wgs-read-counts-HCC1143_BL-n1-chr20-downsampled-deduplicated.hdf5");
private static final File WGS_ANNOTATED_INTERVALS_FILE = new File(TEST_SUB_DIR, "denoise-read-counts-wgs-annotated-intervals.tsv");
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervalsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervalsIntegrationTest.java
index dd022a132eb..8a0ea20316a 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervalsIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervalsIntegrationTest.java
@@ -10,10 +10,11 @@
import java.io.File;
import java.util.Arrays;
+import java.util.Collections;
import java.util.List;
public final class PreprocessIntervalsIntegrationTest extends CommandLineProgramTest {
- private static final String TEST_SUB_DIR = toolsTestDir + "copynumber/";
+ private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber");
private static final File INTERVAL_LIST_FILE = new File(TEST_SUB_DIR, "preprocess-intervals-test.interval_list");
private static final File REFERENCE_FILE = new File(b37_reference_20_21);
@@ -32,6 +33,18 @@ public Object[][] testData() {
new Interval("20", 13_000, 20_000)
);
+ // Test for no binning (specified by zero bin length)
+ final int binLengthNoBinningTest = 0;
+ final int paddingLengthNoBinningTest = 0;
+        final List<Interval> inputIntervalsNoBinningTest = Arrays.asList(
+ new Interval("20", 3_000, 20_000),
+ new Interval("20", 200, 1_900)
+ );
+        final List<Interval> expectedBinsNoBinningTest = Arrays.asList(
+ new Interval("20", 200, 1_900),
+ new Interval("20", 3_000, 20_000)
+ );
+
// Test for overlapping intervals
final int binLengthOverlappingIntervalTest = 10_000;
final int paddingLengthOverlappingIntervalTest = 500;
@@ -61,7 +74,7 @@ public Object[][] testData() {
// Test for whole chromosome
final int binLengthWholeChromosomeTest = 10_000_000;
final int paddingLengthWholeChromosomeTest = 500;
-        final List<Interval> inputIntervalsWholeChromosomeTest = Arrays.asList(new Interval("20", 1, 63_025_520));
+        final List<Interval> inputIntervalsWholeChromosomeTest = Collections.singletonList(new Interval("20", 1, 63_025_520));
         final List<Interval> expectedBinsWholeChromosomeTest = Arrays.asList(
new Interval("20", 1, 10_000_000),
new Interval("20", 10_000_001, 20_000_000),
@@ -75,7 +88,7 @@ public Object[][] testData() {
// Test for whole genome -- when we don't give any intervals, then the tool assumes that the user wants to sequence the whole genome
final int binLengthWholeGenomeTest = 10_000_000;
final int paddingLengthWholeGenomeTest = 500;
- final List