diff --git a/.gitignore b/.gitignore
index b552d8eeb2d..9d47c41ba71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,7 +24,4 @@ client_secret.json
 servicekey.json
 
 #Test Generated File
-I_SHOULD_HAVE_BEEN_DELETED
-
-/scripts/cnv_wdl/somatic/cnv_common_tasks.wdl
-/scripts/cnv_wdl/germline/cnv_common_tasks.wdl
+I_SHOULD_HAVE_BEEN_DELETED
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
index ca272a21522..40d46276874 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,6 +13,7 @@ env:
     - TEST_TYPE=integration TEST_DOCKER=true TEST_VERBOSITY=minimal
     - TEST_TYPE=unit TEST_DOCKER=true TEST_VERBOSITY=minimal
     - TEST_TYPE=python TEST_DOCKER=true TEST_VERBOSITY=minimal
+    - RUN_CNV_SOMATIC_WDL=true
    - RUN_CNV_SOMATIC_LEGACY_WDL=true
     - RUN_M2_WDL=true
   global:
@@ -87,7 +88,7 @@ before_install:
     sudo Rscript scripts/docker/gatkbase/install_R_packages.R;
   fi
 # Download Cromwell jar
-- if [[ $RUN_CNV_SOMATIC_LEGACY_WDL == true || $RUN_M2_WDL == true || $RUN_CNV_GERMLINE_WDL == true ]]; then
+- if [[ $RUN_CNV_SOMATIC_LEGACY_WDL == true || $RUN_CNV_SOMATIC_WDL == true || $RUN_CNV_GERMLINE_WDL == true || $RUN_M2_WDL == true ]]; then
     wget -O ~/cromwell-0.28.jar https://github.com/broadinstitute/cromwell/releases/download/28/cromwell-28.jar;
   fi
 # Download Picard jar
@@ -107,7 +108,7 @@ install:
   else
     ./gradlew assemble;
     ./gradlew installDist;
-    if [[ $RUN_CNV_SOMATIC_LEGACY_WDL == true || $RUN_M2_WDL == true || $RUN_CNV_GERMLINE_WDL == true ]]; then
+    if [[ $RUN_CNV_SOMATIC_LEGACY_WDL == true || $RUN_CNV_SOMATIC_WDL == true || $RUN_CNV_GERMLINE_WDL == true || $RUN_M2_WDL == true ]]; then
       echo "building a shadow jar for the wdl";
       ./gradlew shadowJar;
   elif [[ $TEST_TYPE == cloud ]]; then
@@ -122,12 +123,12 @@ script:
     echo "Not running any tests for nightly builds";
   elif [[ $TRAVIS_SECURE_ENV_VARS == false && $TEST_TYPE == cloud ]]; then
     echo "Can't run cloud tests without keys so don't run tests";
+  elif [[ $RUN_CNV_SOMATIC_WDL == true ]]; then
+    echo "Running CNV somatic workflows";
+    bash scripts/cnv_cromwell_tests/somatic/run_cnv_somatic_workflows.sh;
   elif [[ $RUN_CNV_SOMATIC_LEGACY_WDL == true ]]; then
     echo "Running legacy CNV somatic workflows";
     bash scripts/cnv_cromwell_tests/somatic_legacy/run_cnv_somatic_workflows.sh;
-  elif [[ $RUN_CNV_GERMLINE_WDL == true ]]; then
-    echo "Running CNV germline workflows";
-    bash scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh;
   elif [[ $RUN_M2_WDL == true ]]; then
     echo "Deleting some unused files before running M2 WDL...";
     rm -Rf src/test/resources/large/VQSR;
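The new `RUN_CNV_SOMATIC_WDL` flag follows the existing pattern: each WDL test gets its own build-matrix entry, and the `script` phase dispatches on whichever flag is set. A minimal sketch of that dispatch in isolation (flag names and commands are taken from the diff above; the real `.travis.yml` chain has additional branches):

```bash
#!/bin/bash
# Exactly one RUN_* flag is set per matrix entry, so only one branch fires.
if [[ ${RUN_CNV_SOMATIC_WDL:-false} == true ]]; then
    echo "Running CNV somatic workflows"
    bash scripts/cnv_cromwell_tests/somatic/run_cnv_somatic_workflows.sh
elif [[ ${RUN_CNV_SOMATIC_LEGACY_WDL:-false} == true ]]; then
    echo "Running legacy CNV somatic workflows"
    bash scripts/cnv_cromwell_tests/somatic_legacy/run_cnv_somatic_workflows.sh
fi
```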
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta", - "CNVGermlineCohortWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.dict", - "CNVGermlineCohortWorkflow.gatk_jar": "/root/gatk.jar", - "CNVGermlineCohortWorkflow.targets": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/ice_targets_chr20xy.tsv", - "CNVGermlineCohortWorkflow.contig_ploidy_annotations": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/contig_annots.tsv", - "CNVGermlineCohortWorkflow.transition_prior_table": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_priors.tsv", - "CNVGermlineCohortWorkflow.copy_number_transition_prior_files": [ "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv" ], - "CNVGermlineCohortWorkflow.model_path": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/wes_pon/model_final/", - "CNVGermlineCohortWorkflow.output_path": "output", - "CNVGermlineCohortWorkflow.gatk_docker": "__GATK_DOCKER__" -} diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_workflow_wgs.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_workflow_wgs.json deleted file mode 100755 index e12b8a4caf3..00000000000 --- a/scripts/cnv_cromwell_tests/germline/cnv_germline_cohort_workflow_wgs.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "CNVGermlineCohortWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv", - "CNVGermlineCohortWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta.fai", - "CNVGermlineCohortWorkflow.num_latents": "1", - "CNVGermlineCohortWorkflow.sex_genotypes": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/sex_genotypes.tsv", - "CNVGermlineCohortWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta", - "CNVGermlineCohortWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.dict", - "CNVGermlineCohortWorkflow.gatk_jar": "/root/gatk.jar", - "CNVGermlineCohortWorkflow.contig_ploidy_annotations": 
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/contig_annots.tsv", - "CNVGermlineCohortWorkflow.transition_prior_table": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_priors.tsv", - "CNVGermlineCohortWorkflow.copy_number_transition_prior_files": [ "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv" ], - "CNVGermlineCohortWorkflow.model_path": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/wgs_pon/model_final/", - "CNVGermlineCohortWorkflow.output_path": "output", - "CNVGermlineCohortWorkflow.gatk_docker": "__GATK_DOCKER__" -} diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_panel_workflow_wes.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_panel_workflow_wes.json deleted file mode 100755 index 0d51652eb94..00000000000 --- a/scripts/cnv_cromwell_tests/germline/cnv_germline_panel_workflow_wes.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "CNVGermlinePanelWorkflow.gatk_jar": "/root/gatk.jar", - "CNVGermlinePanelWorkflow.sex_genotypes": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/sex_genotypes.tsv", - "CNVGermlinePanelWorkflow.contig_ploidy_annotations": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/contig_annots.tsv", - "CNVGermlinePanelWorkflow.transition_prior_table": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_priors.tsv", - "CNVGermlinePanelWorkflow.copy_number_transition_prior_files": [ "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv" ], - "CNVGermlinePanelWorkflow.num_latents": "2", - "CNVGermlinePanelWorkflow.targets": 
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/ice_targets_chr20xy.tsv", - "CNVGermlinePanelWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv", - "CNVGermlinePanelWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta.fai", - "CNVGermlinePanelWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.dict", - "CNVGermlinePanelWorkflow.pon_output_path": "test_pon", - "CNVGermlinePanelWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta", - "CNVGermlinePanelWorkflow.gatk_docker": "__GATK_DOCKER__" -} diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_panel_workflow_wgs.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_panel_workflow_wgs.json deleted file mode 100755 index be4c75e545b..00000000000 --- a/scripts/cnv_cromwell_tests/germline/cnv_germline_panel_workflow_wgs.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "CNVGermlinePanelWorkflow.gatk_jar": "/root/gatk.jar", - "CNVGermlinePanelWorkflow.sex_genotypes": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/sex_genotypes.tsv", - "CNVGermlinePanelWorkflow.contig_ploidy_annotations": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/contig_annots.tsv", - "CNVGermlinePanelWorkflow.transition_prior_table": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_priors.tsv", - "CNVGermlinePanelWorkflow.copy_number_transition_prior_files": [ "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv" ], - "CNVGermlinePanelWorkflow.num_latents": "1", - "CNVGermlinePanelWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv", - "CNVGermlinePanelWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta.fai", - "CNVGermlinePanelWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.dict", - "CNVGermlinePanelWorkflow.pon_output_path": "test_pon", - "CNVGermlinePanelWorkflow.ref_fasta": 
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta", - "CNVGermlinePanelWorkflow.gatk_docker": "__GATK_DOCKER__" -} diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wes.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wes.json deleted file mode 100755 index 138e3685dee..00000000000 --- a/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wes.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "CNVGermlineSingleSampleWorkflow.normal_bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74NEG_20xy-downsampled.bam", - "CNVGermlineSingleSampleWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta.fai", - "CNVGermlineSingleSampleWorkflow.normal_bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74NEG_20xy-downsampled.bam.bai", - "CNVGermlineSingleSampleWorkflow.num_latents": "2", - "CNVGermlineSingleSampleWorkflow.sex_genotypes": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/sex_genotypes.tsv", - "CNVGermlineSingleSampleWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta", - "CNVGermlineSingleSampleWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.dict", - "CNVGermlineSingleSampleWorkflow.gatk_jar": "/root/gatk.jar", - "CNVGermlineSingleSampleWorkflow.targets": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/ice_targets_chr20xy.tsv", - "CNVGermlineSingleSampleWorkflow.contig_ploidy_annotations": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/contig_annots.tsv", - "CNVGermlineSingleSampleWorkflow.transition_prior_table": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_priors.tsv", - "CNVGermlineSingleSampleWorkflow.copy_number_transition_prior_files": [ "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv" ], - "CNVGermlineSingleSampleWorkflow.model_path": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/wes_pon/model_final/", - 
"CNVGermlineSingleSampleWorkflow.output_path": "output", - "CNVGermlineSingleSampleWorkflow.gatk_docker": "__GATK_DOCKER__" -} diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wgs.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wgs.json deleted file mode 100755 index 15d1b8f8a2b..00000000000 --- a/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wgs.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "CNVGermlineSingleSampleWorkflow.normal_bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74NEG_20xy-downsampled.bam", - "CNVGermlineSingleSampleWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta.fai", - "CNVGermlineSingleSampleWorkflow.normal_bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74NEG_20xy-downsampled.bam.bai", - "CNVGermlineSingleSampleWorkflow.num_latents": "1", - "CNVGermlineSingleSampleWorkflow.sex_genotypes": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/sex_genotypes.tsv", - "CNVGermlineSingleSampleWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta", - "CNVGermlineSingleSampleWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.dict", - "CNVGermlineSingleSampleWorkflow.gatk_jar": "/root/gatk.jar", - "CNVGermlineSingleSampleWorkflow.contig_ploidy_annotations": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/contig_annots.tsv", - "CNVGermlineSingleSampleWorkflow.transition_prior_table": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_priors.tsv", - "CNVGermlineSingleSampleWorkflow.copy_number_transition_prior_files": [ "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv", - "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv" ], - "CNVGermlineSingleSampleWorkflow.model_path": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/wgs_pon/model_final/", - "CNVGermlineSingleSampleWorkflow.output_path": "output", - "CNVGermlineSingleSampleWorkflow.gatk_docker": "__GATK_DOCKER__" -} diff --git a/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv b/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv deleted file mode 100755 index 
diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wgs.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wgs.json
deleted file mode 100755
index 15d1b8f8a2b..00000000000
--- a/scripts/cnv_cromwell_tests/germline/cnv_germline_single_sample_workflow_wgs.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
-  "CNVGermlineSingleSampleWorkflow.normal_bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74NEG_20xy-downsampled.bam",
-  "CNVGermlineSingleSampleWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta.fai",
-  "CNVGermlineSingleSampleWorkflow.normal_bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74NEG_20xy-downsampled.bam.bai",
-  "CNVGermlineSingleSampleWorkflow.num_latents": "1",
-  "CNVGermlineSingleSampleWorkflow.sex_genotypes": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/sex_genotypes.tsv",
-  "CNVGermlineSingleSampleWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.fasta",
-  "CNVGermlineSingleSampleWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/Homo_sapiens_assembly19.truncated.dict",
-  "CNVGermlineSingleSampleWorkflow.gatk_jar": "/root/gatk.jar",
-  "CNVGermlineSingleSampleWorkflow.contig_ploidy_annotations": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/contig_annots.tsv",
-  "CNVGermlineSingleSampleWorkflow.transition_prior_table": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_priors.tsv",
-  "CNVGermlineSingleSampleWorkflow.copy_number_transition_prior_files": [ "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv",
-      "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv",
-      "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv",
-      "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv",
-      "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv" ],
-  "CNVGermlineSingleSampleWorkflow.model_path": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/wgs_pon/model_final/",
-  "CNVGermlineSingleSampleWorkflow.output_path": "output",
-  "CNVGermlineSingleSampleWorkflow.gatk_docker": "__GATK_DOCKER__"
-}
diff --git a/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv b/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv
deleted file mode 100755
index eae8ed2f969..00000000000
--- a/scripts/cnv_cromwell_tests/germline/normal_bam_list.tsv
+++ /dev/null
@@ -1,3 +0,0 @@
-/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74NEG_20xy-downsampled.bam /home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74NEG_20xy-downsampled.bam.bai
-/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74P2T_20xy-downsampled.bam /home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74P2T_20xy-downsampled.bam.bai
-/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74P35_20xy-downsampled.bam /home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_germline_workflows_test_files/inputs/bams/SM-74P35_20xy-downsampled.bam.bai
\ No newline at end of file
diff --git a/scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh b/scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh
deleted file mode 100644
index 2dd6b804618..00000000000
--- a/scripts/cnv_cromwell_tests/germline/run_cnv_germline_workflows.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash -l
-set -e
-#cd in the directory of the script in order to use relative paths
-script_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P )
-cd "$script_path"
-
-ln -fs /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/cnv_common_tasks.wdl
-ln -fs /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_panel_workflow.wdl
-ln -fs /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow.wdl
-ln -fs /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
-
-WORKING_DIR=/home/travis/build/broadinstitute
-
-pushd .
-echo "Building docker without running unit tests... ========="
-cd $WORKING_DIR/gatk
-# IMPORTANT: This code is duplicated in the M2 WDL test.
-if [ ${TRAVIS_PULL_REQUEST} != false ]; then
-  HASH_TO_USE=FETCH_HEAD
-  sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/ -t ${TRAVIS_PULL_REQUEST};
-else
-  HASH_TO_USE=${TRAVIS_COMMIT}
-  sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/;
-fi
-echo "Docker build done =========="
-popd
-
-echo "Inserting docker image into json ========"
-CNV_CROMWELL_TEST_DIR="${WORKING_DIR}/gatk/scripts/cnv_cromwell_tests/germline/"
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_cohort_workflow_wes.json >cnv_germline_cohort_workflow_wes_mod.json
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_cohort_workflow_wgs.json >cnv_germline_cohort_workflow_wgs_mod.json
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_panel_workflow_wes.json >cnv_germline_panel_workflow_wes_mod.json
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_panel_workflow_wgs.json >cnv_germline_panel_workflow_wgs_mod.json
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_single_sample_workflow_wes.json >cnv_germline_single_sample_workflow_wes_mod.json
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_germline_single_sample_workflow_wgs.json >cnv_germline_single_sample_workflow_wgs_mod.json
-
-echo "Running ========"
-
-CROMWELL_JAR="cromwell-0.28.jar"
-
-# Panel WES
-java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_panel_workflow.wdl cnv_germline_panel_workflow_wes_mod.json
-# Panel WGS
-java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_panel_workflow.wdl cnv_germline_panel_workflow_wgs_mod.json
-
-# Single sample WES calling
-java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow.wdl cnv_germline_single_sample_workflow_wes_mod.json
-# Single sample WGS calling
-java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow.wdl cnv_germline_single_sample_workflow_wgs_mod.json
-
-# Cohort WES calling
-java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl cnv_germline_cohort_workflow_wes_mod.json
-# Cohort WGS calling
-java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl cnv_germline_cohort_workflow_wgs_mod.json
\ No newline at end of file
"CNVSomaticPairWorkflow.normal_bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74NEG-v1-chr20-downsampled.deduplicated.bam", + "CNVSomaticPairWorkflow.normal_bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74NEG-v1-chr20-downsampled.deduplicated.bam.bai", + "CNVSomaticPairWorkflow.read_count_pon": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/wes-no-gc.pon.hdf5", + "CNVSomaticPairWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.dict", + "CNVSomaticPairWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta.fai", + "CNVSomaticPairWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta", + "CNVSomaticPairWorkflow.tumor_bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74P4M-v1-chr20-downsampled.deduplicated.bam", + "CNVSomaticPairWorkflow.tumor_bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74P4M-v1-chr20-downsampled.deduplicated.bam.bai" +} diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_pair_wgs_workflow.json b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_pair_wgs_workflow.json new file mode 100644 index 00000000000..b34d6bb3874 --- /dev/null +++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_pair_wgs_workflow.json @@ -0,0 +1,14 @@ +{ + "CNVSomaticPairWorkflow.common_sites": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/common_snps_sample-chr20.interval_list", + "CNVSomaticPairWorkflow.gatk_docker": "__GATK_DOCKER__", + "CNVSomaticPairWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/chr20.interval_list", + "CNVSomaticPairWorkflow.normal_bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143_BL-n1-chr20-downsampled.deduplicated.bam", + "CNVSomaticPairWorkflow.normal_bam_idx": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143_BL-n1-chr20-downsampled.deduplicated.bam.bai", + "CNVSomaticPairWorkflow.PreprocessIntervals.bin_length": "10000", + "CNVSomaticPairWorkflow.read_count_pon": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/wgs-no-gc.pon.hdf5", + "CNVSomaticPairWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.dict", + "CNVSomaticPairWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta.fai", + "CNVSomaticPairWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta", + "CNVSomaticPairWorkflow.tumor_bam": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143-t1-chr20-downsampled.deduplicated.bam", + "CNVSomaticPairWorkflow.tumor_bam_idx": 
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143-t1-chr20-downsampled.deduplicated.bam.bai" +} \ No newline at end of file diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_do-gc_workflow.json b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_do-gc_workflow.json new file mode 100644 index 00000000000..90c7ec34562 --- /dev/null +++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_do-gc_workflow.json @@ -0,0 +1,10 @@ +{ + "CNVSomaticPanelWorkflow.do_explicit_gc_correction": "true", + "CNVSomaticPanelWorkflow.gatk_docker": "__GATK_DOCKER__", + "CNVSomaticPanelWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/ice_targets_sample-chr20.interval_list", + "CNVSomaticPanelWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_input.tsv", + "CNVSomaticPanelWorkflow.pon_entity_id": "test", + "CNVSomaticPanelWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.dict", + "CNVSomaticPanelWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta.fai", + "CNVSomaticPanelWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta" +} diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_input.tsv b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_input.tsv new file mode 100644 index 00000000000..c0cdb12c282 --- /dev/null +++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_input.tsv @@ -0,0 +1,2 @@ +/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74NEG-v1-chr20-downsampled.deduplicated.bam /home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74NEG-v1-chr20-downsampled.deduplicated.bam.bai +/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74P4M-v1-chr20-downsampled.deduplicated.bam /home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/SM-74P4M-v1-chr20-downsampled.deduplicated.bam.bai diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_workflow.json b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_workflow.json new file mode 100644 index 00000000000..c227b993414 --- /dev/null +++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_workflow.json @@ -0,0 +1,9 @@ +{ + "CNVSomaticPanelWorkflow.gatk_docker": "__GATK_DOCKER__", + "CNVSomaticPanelWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/ice_targets_sample-chr20.interval_list", + "CNVSomaticPanelWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wes_input.tsv", + "CNVSomaticPanelWorkflow.pon_entity_id": "test", + "CNVSomaticPanelWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.dict", + "CNVSomaticPanelWorkflow.ref_fasta_fai": 
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta.fai", + "CNVSomaticPanelWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta" +} \ No newline at end of file diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_do-gc_workflow.json b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_do-gc_workflow.json new file mode 100644 index 00000000000..26df09b9695 --- /dev/null +++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_do-gc_workflow.json @@ -0,0 +1,11 @@ +{ + "CNVSomaticPanelWorkflow.do_explicit_gc_correction": "true", + "CNVSomaticPanelWorkflow.gatk_docker": "__GATK_DOCKER__", + "CNVSomaticPanelWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/chr20.interval_list", + "CNVSomaticPanelWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_input.tsv", + "CNVSomaticPanelWorkflow.pon_entity_id": "test", + "CNVSomaticPanelWorkflow.PreprocessIntervals.bin_length": "10000", + "CNVSomaticPanelWorkflow.ref_fasta_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.dict", + "CNVSomaticPanelWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta.fai", + "CNVSomaticPanelWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta" +} diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_input.tsv b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_input.tsv new file mode 100644 index 00000000000..269d8a013b9 --- /dev/null +++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_input.tsv @@ -0,0 +1,2 @@ +/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143_BL-n1-chr20-downsampled.deduplicated.bam /home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143_BL-n1-chr20-downsampled.deduplicated.bam.bai +/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143-t1-chr20-downsampled.deduplicated.bam /home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/HCC1143-t1-chr20-downsampled.deduplicated.bam.bai diff --git a/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_workflow.json b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_workflow.json new file mode 100644 index 00000000000..3e2ab86ac06 --- /dev/null +++ b/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_workflow.json @@ -0,0 +1,10 @@ +{ + "CNVSomaticPanelWorkflow.gatk_docker": "__GATK_DOCKER__", + "CNVSomaticPanelWorkflow.intervals": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/chr20.interval_list", + "CNVSomaticPanelWorkflow.normal_bams_list": "/home/travis/build/broadinstitute/gatk/scripts/cnv_cromwell_tests/somatic/cnv_somatic_panel_wgs_input.tsv", + "CNVSomaticPanelWorkflow.pon_entity_id": "test", + "CNVSomaticPanelWorkflow.PreprocessIntervals.bin_length": "10000", + "CNVSomaticPanelWorkflow.ref_fasta_dict": 
"/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.dict", + "CNVSomaticPanelWorkflow.ref_fasta_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta.fai", + "CNVSomaticPanelWorkflow.ref_fasta": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/cnv_somatic_workflows_test_files/human_g1k_v37.chr-20.truncated.fasta" +} diff --git a/scripts/cnv_cromwell_tests/somatic/run_cnv_somatic_workflows.sh b/scripts/cnv_cromwell_tests/somatic/run_cnv_somatic_workflows.sh new file mode 100644 index 00000000000..891fbe680f6 --- /dev/null +++ b/scripts/cnv_cromwell_tests/somatic/run_cnv_somatic_workflows.sh @@ -0,0 +1,50 @@ +#!/bin/bash -l +set -e +#cd in the directory of the script in order to use relative paths +script_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P ) +cd "$script_path" + +ln -fs /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/cnv_common_tasks.wdl + +WORKING_DIR=/home/travis/build/broadinstitute + +pushd . +echo "Building docker without running unit tests... =========" +cd $WORKING_DIR/gatk +# IMPORTANT: This code is duplicated in the M2 WDL test. +if [ ${TRAVIS_PULL_REQUEST} != false ]; then + HASH_TO_USE=FETCH_HEAD + sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/ -t ${TRAVIS_PULL_REQUEST}; +else + HASH_TO_USE=${TRAVIS_COMMIT} + sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/; +fi +echo "Docker build done ==========" + +popd + +echo "Inserting docker image into json ========" +CNV_CROMWELL_TEST_DIR="${WORKING_DIR}/gatk/scripts/cnv_cromwell_tests/somatic/" +sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_panel_wes_workflow.json >cnv_somatic_panel_wes_workflow_mod.json +sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_panel_wgs_workflow.json >cnv_somatic_panel_wgs_workflow_mod.json +sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_panel_wes_do-gc_workflow.json >cnv_somatic_panel_wes_do-gc_workflow_mod.json +sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_panel_wgs_do-gc_workflow.json >cnv_somatic_panel_wgs_do-gc_workflow_mod.json +sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_pair_wes_workflow.json >cnv_somatic_pair_wes_workflow_mod.json +sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_pair_wgs_workflow.json >cnv_somatic_pair_wgs_workflow_mod.json + +echo "Running ========" +CROMWELL_JAR="cromwell-0.28.jar" + +# Panel WES +java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl cnv_somatic_panel_wes_workflow_mod.json +# Panel WGS +java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl cnv_somatic_panel_wgs_workflow_mod.json +# Panel WES w/ explicit GC correction +java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl cnv_somatic_panel_wes_do-gc_workflow_mod.json +# Panel WGS w/ explicit GC correction +java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl 
diff --git a/scripts/cnv_cromwell_tests/somatic/run_cnv_somatic_workflows.sh b/scripts/cnv_cromwell_tests/somatic/run_cnv_somatic_workflows.sh
new file mode 100644
index 00000000000..891fbe680f6
--- /dev/null
+++ b/scripts/cnv_cromwell_tests/somatic/run_cnv_somatic_workflows.sh
@@ -0,0 +1,50 @@
+#!/bin/bash -l
+set -e
+#cd in the directory of the script in order to use relative paths
+script_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P )
+cd "$script_path"
+
+ln -fs /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/cnv_common_tasks.wdl
+
+WORKING_DIR=/home/travis/build/broadinstitute
+
+pushd .
+echo "Building docker without running unit tests... ========="
+cd $WORKING_DIR/gatk
+# IMPORTANT: This code is duplicated in the M2 WDL test.
+if [ ${TRAVIS_PULL_REQUEST} != false ]; then
+  HASH_TO_USE=FETCH_HEAD
+  sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/ -t ${TRAVIS_PULL_REQUEST};
+else
+  HASH_TO_USE=${TRAVIS_COMMIT}
+  sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/;
+fi
+echo "Docker build done =========="
+
+popd
+
+echo "Inserting docker image into json ========"
+CNV_CROMWELL_TEST_DIR="${WORKING_DIR}/gatk/scripts/cnv_cromwell_tests/somatic/"
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_panel_wes_workflow.json >cnv_somatic_panel_wes_workflow_mod.json
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_panel_wgs_workflow.json >cnv_somatic_panel_wgs_workflow_mod.json
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_panel_wes_do-gc_workflow.json >cnv_somatic_panel_wes_do-gc_workflow_mod.json
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_panel_wgs_do-gc_workflow.json >cnv_somatic_panel_wgs_do-gc_workflow_mod.json
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_pair_wes_workflow.json >cnv_somatic_pair_wes_workflow_mod.json
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_pair_wgs_workflow.json >cnv_somatic_pair_wgs_workflow_mod.json
+
+echo "Running ========"
+CROMWELL_JAR="cromwell-0.28.jar"
+
+# Panel WES
+java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl cnv_somatic_panel_wes_workflow_mod.json
+# Panel WGS
+java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl cnv_somatic_panel_wgs_workflow_mod.json
+# Panel WES w/ explicit GC correction
+java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl cnv_somatic_panel_wes_do-gc_workflow_mod.json
+# Panel WGS w/ explicit GC correction
+java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl cnv_somatic_panel_wgs_do-gc_workflow_mod.json
+
+# Pair WES
+java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl cnv_somatic_pair_wes_workflow_mod.json
+# Pair WGS
+java -jar ~/${CROMWELL_JAR} run /home/travis/build/broadinstitute/gatk/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl cnv_somatic_pair_wgs_workflow_mod.json
\ No newline at end of file
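The six `sed` invocations in this script are intentionally explicit, one per input JSON; they could equally be collapsed into a loop. A sketch, assuming the same filename conventions as above:

```bash
# Substitute the docker placeholder in every somatic workflow JSON at once.
for json in ${CNV_CROMWELL_TEST_DIR}/cnv_somatic_*_workflow.json; do
    base=$(basename "${json%.json}")
    sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" "$json" > "${base}_mod.json"
done
```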
diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl
index f35717da41e..55a27ab5605 100755
--- a/scripts/cnv_wdl/cnv_common_tasks.wdl
+++ b/scripts/cnv_wdl/cnv_common_tasks.wdl
@@ -1,48 +1,9 @@
-# Tasks common to both the CNV somatic panel and case workflows.
-#
-#############
-
-# Pad targets in the target file by the specified amount (this was found to improve sensitivity and specificity)
-task PadTargets {
-    File targets
-    Int? padding
-    String gatk_jar
-
-    # Runtime parameters
-    Int? mem
-    String gatk_docker
-    Int? preemptible_attempts
-    Int? disk_space_gb
-
-    # Determine output filename
-    String filename = select_first([targets, ""])
-    String base_filename = basename(filename, ".tsv")
-
-    command {
-        java -Xmx${default="1" mem}g -jar ${gatk_jar} PadTargets \
-            --targets ${targets} \
-            --padding ${default="250" padding} \
-            --output ${base_filename}.padded.tsv
-    }
-
-    runtime {
-        docker: "${gatk_docker}"
-        memory: select_first([mem, 2]) + " GB"
-        disks: "local-disk " + select_first([disk_space_gb, 40]) + " HDD"
-        preemptible: select_first([preemptible_attempts, 2])
-    }
-
-    output {
-        File padded_targets = "${base_filename}.padded.tsv"
-    }
-}
-
 task PreprocessIntervals {
     File? intervals
     File ref_fasta_dict
     Int? padding
     Int? bin_length
-    String gatk_jar
+    File? gatk4_jar_override
 
     # Runtime parameters
     Int? mem
@@ -54,21 +15,24 @@ task PreprocessIntervals {
     String filename = select_first([intervals, "wgs"])
     String base_filename = basename(filename, ".interval_list")
 
-    command {
-        java -Xmx${default="2" mem}g -jar ${gatk_jar} PreprocessIntervals \
+    command <<<
+        set -e
+        GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+        java -Xmx${default="2" mem}g -jar $GATK_JAR PreprocessIntervals \
             ${"-L " + intervals} \
-            -sequenceDictionary ${ref_fasta_dict} \
+            --sequence-dictionary ${ref_fasta_dict} \
             --padding ${default="250" padding} \
             --binLength ${default="1000" bin_length} \
-            --interval_merging_rule OVERLAPPING_ONLY \
+            --interval-merging-rule OVERLAPPING_ONLY \
             --output ${base_filename}.preprocessed.interval_list
-    }
+    >>>
 
     runtime {
         docker: "${gatk_docker}"
         memory: select_first([mem, 2]) + " GB"
         disks: "local-disk " + select_first([disk_space_gb, 40]) + " HDD"
-        preemptible: select_first([preemptible_attempts, 2])
+        preemptible: select_first([preemptible_attempts, 5])
     }
 
     output {
@@ -76,47 +40,12 @@
     }
 }
 
-# Create a target file with GC annotations
-task AnnotateTargets {
-    String entity_id
-    File intervals
-    File ref_fasta
-    File ref_fasta_fai
-    File ref_fasta_dict
-    String gatk_jar
-
-    # Runtime parameters
-    Int? mem
-    String gatk_docker
-    Int? preemptible_attempts
-    Int? disk_space_gb
-
-    command {
-        java -Xmx${default="4" mem}g -jar ${gatk_jar} AnnotateTargets \
-            --targets ${intervals} \
-            --reference ${ref_fasta} \
-            --interval_merging_rule OVERLAPPING_ONLY \
-            --output ${entity_id}.annotated.tsv
-    }
-
-    runtime {
-        docker: "${gatk_docker}"
-        memory: select_first([mem, 5]) + " GB"
-        disks: "local-disk " + select_first([disk_space_gb, ceil(size(ref_fasta, "GB")) + 50]) + " HDD"
-        preemptible: select_first([preemptible_attempts, 2])
-    }
-
-    output {
-        File annotated_intervals = "${entity_id}.annotated.tsv"
-    }
-}
-
 task AnnotateIntervals {
     File intervals
     File ref_fasta
     File ref_fasta_fai
     File ref_fasta_dict
-    String gatk_jar
+    File? gatk4_jar_override
 
     # Runtime parameters
     Int? mem
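The recurring `gatk4_jar_override` change swaps the required `gatk_jar` string for an optional `File`, with WDL's `${default=... }` interpolation falling back to the jar baked into the Docker image. A shell analogue of what the generated command ends up doing (the override environment variable name here is illustrative, not part of the WDL):

```bash
# Fall back to the jar shipped at /root/gatk.jar in the broadinstitute/gatk
# image unless an override jar was supplied to the task.
GATK_JAR=${GATK4_JAR_OVERRIDE:-/root/gatk.jar}
java -Xmx4g -jar "$GATK_JAR" AnnotateIntervals ...   # stand-in invocation
```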
@@ -124,19 +53,22 @@ task AnnotateIntervals {
     String gatk_docker
     Int? preemptible_attempts
     Int? disk_space_gb
 
-    command {
-        java -Xmx${default="4" mem}g -jar ${gatk_jar} AnnotateIntervals \
+    command <<<
+        set -e
+        GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+        java -Xmx${default="4" mem}g -jar $GATK_JAR AnnotateIntervals \
             -L ${intervals} \
             --reference ${ref_fasta} \
-            --interval_merging_rule OVERLAPPING_ONLY \
+            --interval-merging-rule OVERLAPPING_ONLY \
             --output annotated_intervals.tsv
-    }
+    >>>
 
     runtime {
         docker: "${gatk_docker}"
         memory: select_first([mem, 5]) + " GB"
         disks: "local-disk " + select_first([disk_space_gb, ceil(size(ref_fasta, "GB")) + 50]) + " HDD"
-        preemptible: select_first([preemptible_attempts, 2])
+        preemptible: select_first([preemptible_attempts, 5])
     }
 
     output {
@@ -144,91 +76,12 @@
         File annotated_intervals = "annotated_intervals.tsv"
     }
 }
 
-# Collect read counts for germline workflow (TSV output in target format)
-task CollectReadCounts {
-    File? padded_targets
-    File bam
-    File bam_idx
-    File ref_fasta
-    File ref_fasta_fai
-    File ref_fasta_dict
-    Int? wgs_bin_length
-    Boolean? keep_non_autosomes
-    Boolean? disable_all_read_filters
-    Boolean? disable_sequence_dictionary_validation
-    Boolean? keep_duplicate_reads
-    String gatk_jar
-
-    # Runtime parameters
-    Int? mem
-    String gatk_docker
-    Int? preemptible_attempts
-    Int? disk_space_gb
-
-    # If no padded target file is input, then do WGS workflow
-    Boolean is_wgs = !defined(padded_targets)
-
-    # Sample name is derived from the bam filename
-    String base_filename = basename(bam, ".bam")
-
-    String read_counts_tsv_filename = "${base_filename}.readCounts.tsv"
-    String read_counts_hdf5_filename = if is_wgs then "${base_filename}.readCounts.hdf5" else ""
-    String intervals_filename = if is_wgs then "${base_filename}.readCounts.intervals.tsv" else select_first([padded_targets, ""])
-
-    command <<<
-        if [ ${is_wgs} = true ]
-            then
-                java -Xmx${default="8" mem}g -jar ${gatk_jar} SparkGenomeReadCounts \
-                    --input ${bam} \
-                    --reference ${ref_fasta} \
-                    --binLength ${default="1000" wgs_bin_length} \
-                    --keepXYMT ${default="false" keep_non_autosomes} \
-                    --disable-tool-default-read-filters ${default="false" disable_all_read_filters} \
-                    --disable-sequence-dictionary-validation ${default="true" disable_sequence_dictionary_validation} \
-                    $(if [ ${default="true" keep_duplicate_reads} = true ]; then echo " --disable-read-filter NotDuplicateReadFilter "; else echo ""; fi) \
-                    --output ${read_counts_tsv_filename} \
-                    --writeHdf5
-            else
-                java -Xmx${default="4" mem}g -jar ${gatk_jar} CalculateTargetCoverage \
-                    --input ${bam} \
-                    --reference ${ref_fasta} \
-                    --targets ${padded_targets} \
-                    --groupBy SAMPLE \
-                    --transform RAW \
-                    --targetInformationColumns FULL \
-                    --interval-set-rule UNION \
-                    --interval-merging-rule OVERLAPPING_ONLY \
-                    --interval-padding 0 \
-                    --seconds-between-progress-updates 10.0 \
-                    --disable-tool-default-read-filters ${default="false" disable_all_read_filters} \
-                    --disable-sequence-dictionary-validation ${default="true" disable_sequence_dictionary_validation} \
-                    $(if [ ${default="true" keep_duplicate_reads} = true ]; then echo " --disable-read-filter NotDuplicateReadFilter "; else echo ""; fi) \
-                    --output ${read_counts_tsv_filename}
-        fi
-    >>>
-
-    runtime {
-        docker: "${gatk_docker}"
-        memory: select_first([mem, 5]) + " GB"
-        disks: "local-disk " + select_first([disk_space_gb, ceil(size(bam, "GB")) + 50]) + " HDD"
-        preemptible: select_first([preemptible_attempts, 2])
-    }
-
-    output {
-        String entity_id = base_filename
-        File read_counts = read_counts_tsv_filename
-        File read_counts_hdf5 = read_counts_hdf5_filename #"" if is_wgs = false
-        File intervals = intervals_filename #padded_targets if is_wgs = false
-    }
-}
-
-# Collect counts for ModelSegments workflow
 task CollectCounts {
     File intervals
     File bam
     File bam_idx
     String? output_format
-    String gatk_jar
+    File? gatk4_jar_override
 
     # Runtime parameters
     Int? mem
@@ -240,20 +93,23 @@ task CollectCounts {
     String base_filename = basename(bam, ".bam")
     String counts_filename = if !defined(output_format) then "${base_filename}.counts.hdf5" else "${base_filename}.counts.tsv"
 
-    command {
-        java -Xmx${default="8" mem}g -jar ${gatk_jar} CollectFragmentCounts \
+    command <<<
+        set -e
+        GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+        java -Xmx${default="8" mem}g -jar $GATK_JAR CollectFragmentCounts \
             --input ${bam} \
             -L ${intervals} \
             --outputFormat ${default="HDF5" output_format} \
             --interval-merging-rule OVERLAPPING_ONLY \
             --output ${counts_filename}
-    }
+    >>>
 
     runtime {
         docker: "${gatk_docker}"
         memory: select_first([mem, 8]) + " GB"
         disks: "local-disk " + select_first([disk_space_gb, ceil(size(bam, "GB")) + 50]) + " HDD"
-        preemptible: select_first([preemptible_attempts, 2])
+        preemptible: select_first([preemptible_attempts, 5])
     }
 
     output {
@@ -262,7 +118,6 @@
         File counts = counts_filename
     }
 }
 
-# Collect allelic counts
 task CollectAllelicCounts {
     File common_sites
     File bam
@@ -271,7 +126,7 @@ task CollectAllelicCounts {
     File ref_fasta_fai
     File ref_fasta_dict
     Int? minimum_base_quality
-    String gatk_jar
+    File? gatk4_jar_override
 
     # Runtime parameters
    Int? mem
@@ -279,61 +134,36 @@ task CollectAllelicCounts {
     Int? preemptible_attempts
     Int? disk_space_gb
 
+    # Mem is in units of GB but our command and memory runtime values are in MB
+    Int machine_mem = if defined(mem) then mem * 1000 else 13000
+    Int command_mem = machine_mem - 1000
+
     # Sample name is derived from the bam filename
     String base_filename = basename(bam, ".bam")
 
     String allelic_counts_filename = "${base_filename}.allelicCounts.tsv"
 
-    command {
-        java -Xmx${default="8" mem}g -jar ${gatk_jar} CollectAllelicCounts \
+    command <<<
+        set -e
+        GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+        java -Xmx${command_mem}m -jar $GATK_JAR CollectAllelicCounts \
            -L ${common_sites} \
            --input ${bam} \
            --reference ${ref_fasta} \
            --minimumBaseQuality ${default="20" minimum_base_quality} \
            --output ${allelic_counts_filename}
-    }
+    >>>
 
     runtime {
         docker: "${gatk_docker}"
-        memory: select_first([mem, 5]) + " GB"
+        memory: machine_mem + " MB"
         disks: "local-disk " + select_first([disk_space_gb, ceil(size(bam, "GB")) + 50]) + " HDD"
-        preemptible: select_first([preemptible_attempts, 2])
+        preemptible: select_first([preemptible_attempts, 5])
     }
 
     output {
         String entity_id = base_filename
         File allelic_counts = allelic_counts_filename
     }
-}
-
-# Correct coverage profile(s) for sample-specific GC bias
-task CorrectGCBias {
-    String entity_id
-    File coverage # This can be either single-sample or multi-sample
-    File annotated_intervals
-    String gatk_jar
-
-    # Runtime parameters
-    Int? mem
-    String gatk_docker
-    Int? preemptible_attempts
-    Int? disk_space_gb
-
-    command {
-        java -Xmx${default=4 mem}g -jar ${gatk_jar} CorrectGCBias \
-            --input ${coverage} \
-            --targets ${annotated_intervals} \
-            --output ${entity_id}.gc_corrected.tsv
-    }
-
-    runtime {
-        docker: "${gatk_docker}"
-        memory: select_first([mem, 5]) + " GB"
-        disks: "local-disk " + select_first([disk_space_gb, ceil(size(coverage, "GB"))+50]) + " HDD"
-        preemptible: select_first([preemptible_attempts, 2])
-    }
-
-    output {
-        File corrected_coverage = "${entity_id}.gc_corrected.tsv"
-    }
 }
\ No newline at end of file
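The new memory bookkeeping in `CollectAllelicCounts` is worth spelling out: `mem` is taken in GB, the runtime request is expressed in MB, and `-Xmx` is set 1000 MB below the machine total so the JVM heap never exhausts the container. The same arithmetic as shell, with `MEM_GB` standing in for the optional WDL `mem` input:

```bash
machine_mem=$(( ${MEM_GB:-13} * 1000 ))   # WDL: if defined(mem) then mem * 1000 else 13000
command_mem=$(( machine_mem - 1000 ))     # leave 1 GB of headroom for JVM/container overhead
echo "java -Xmx${command_mem}m ...  (runtime memory request: ${machine_mem} MB)"
```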
diff --git a/scripts/cnv_wdl/germline/README.md b/scripts/cnv_wdl/germline/README.md
deleted file mode 100644
index 4a4f052233c..00000000000
--- a/scripts/cnv_wdl/germline/README.md
+++ /dev/null
@@ -1,86 +0,0 @@
-## Running the Germline CNV WDL
-
-### Which WDL should you use?
-- Building a panel of normals (PoN): ``cnv_germline_panel_workflow.wdl``
-- Calling events on a single normal sample: ``cnv_germline_single_sample_workflow.wdl``
-- Calling events on a cohort of normal samples: ``cnv_germline_cohort_workflow.wdl``
-
-#### Setting up parameter json file for a run
-
-To get started, copy the relevant ``*_template.json`` for the workflow you wish to run and adjust parameters accordingly.
-You can find all required resource inputs needed to run the workflows in the ``/resources`` directory. These inputs could be run out-of-the-box.
-
-*Please note that there are task-level parameters that do not appear in the template files. These are set to reasonable values by default, but can also be adjusted if desired.
-
-#### Fields of germline CNV panel of normals creation workflow
-
- ``CNVGermlinePanelWorkflow.sex_genotypes`` -- path to table of per-sample sex genotypes
- ``CNVGermlinePanelWorkflow.contig_ploidy_annotations`` -- path to the germline contig ploidy annotations table; located in ``/resources`` directory
- ``CNVGermlinePanelWorkflow.transition_prior_table`` -- path to copy number transition priors table; located in ``/resources`` directory
- ``CNVGermlinePanelWorkflow.transition_matrix_XY_Y`` -- path to copy number transition prior for Y contig for XY-genotyped samples; located in ``/resources`` directory
- ``CNVGermlinePanelWorkflow.transition_matrix_XX_X`` -- path to copy number transition prior for X contig for XX-genotyped samples; located in ``/resources`` directory
- ``CNVGermlinePanelWorkflow.transition_matrix_XY_X`` -- path to copy number transition prior for X contig for XY-genotyped samples; located in ``/resources`` directory
- ``CNVGermlinePanelWorkflow.transition_matrix_XX_Y`` -- path to copy number transition prior for Y contig for XX-genotyped samples; located in ``/resources`` directory
- ``CNVGermlinePanelWorkflow.transition_matrix_autosomal`` -- path to transition prior on autosomal loci; located in ``/resources`` directory,
- ``CNVGermlinePanelWorkflow.normal_bams_list`` -- TSV file consisting of corresponding bam and corresponding index files as described in cnv_germline_panel_workflow.wdl
- ``CNVGermlinePanelWorkflow.pon_output_path`` -- name of the final output directory
- ``CNVGermlinePanelWorkflow.num_latents`` -- (advanced) maximum number of principal components. Must be strictly less than the number of samples. The recommended value is 20 ~ 30 for large cohorts. For smaller cohorts, use 0.5 * number of samples. Unnecessary principal components are automatically pruned during PoN creation
- ``CNVGermlinePanelWorkflow.ref_fasta`` -- path to reference fasta file
- ``CNVGermlinePanelWorkflow.ref_fasta_dict`` -- path to reference dict file
- ``CNVGermlinePanelWorkflow.ref_fasta_fai`` -- path to reference fasta fai file
- ``CNVGermlinePanelWorkflow.gatk_jar`` -- absolute path to gatk.jar
- ``CNVGermlinePanelWorkflow.targets`` -- (optional) Target file (NOT in BED format) corresponding to the genomic loci of enriched targets in WES sample (e.g. Agilent, Illumina, etc). Please run ConvertBedToTargetFile to convert a BED file to a target file. If provided, then WES workflow will be run; otherwise, WGS workflow will be run
-
- In addition, there are several task-level parameters that may be set by advanced users; for example:
-
- - ``CNVGermlinePanelWorkflow.CollectReadCounts.wgs_bin_length`` -- Size of bins (in bp) for WGS coverage collection. *This must be the same value used for all samples.* Ignored if not running WGS.
- - ``CNVGermlinePanelWorkflow.PadTargets.padding`` -- Amount of padding (in bp) to add to both sides of targets for WES coverage collection. *This must be the same value used for all samples.* Ignored if not running WES.
-
- Further explanation of these task-level parameters may be found by invoking the ``--help`` documentation available in the gatk.jar for each tool.
Unnecessary principal components are automatically pruned during PoN creation - ``CNVGermlineSingleSampleWorkflow.model_path`` -- absolute path of the PoN model (posterior_finals directory of the panel creation output) - ``CNVGermlineSingleSampleWorkflow.normal_bam`` -- path to the normal bam file - ``CNVGermlineSingleSampleWorkflow.normal_bam_idx`` -- path to the corresponding bam index file - ``CNVGermlineSingleSampleWorkflow.ref_fasta`` -- path to reference fasta file - ``CNVGermlineSingleSampleWorkflow.ref_fasta_dict`` -- path to reference dict file - ``CNVGermlineSingleSampleWorkflow.ref_fasta_fai`` -- path to reference fasta fai file - ``CNVGermlineSingleSampleWorkflow.gatk_jar`` -- absolute path to gatk.jar - ``CNVGermlineSingleSampleWorkflow.targets`` -- (optional) Target file (NOT in BED format) corresponding to the genomic loci of enriched targets in WES sample (e.g. Agilent, Illumina, etc). Please run ConvertBedToTargetFile to convert a BED file to a target file. If provided, then WES workflow will be run; otherwise, WGS workflow will be run - - -#### Fields of germline CNV cohort calling workflow - -The reference used must be the same between PoN and case samples. - - ``CNVGermlineCohortWorkflow.sex_genotypes`` -- path to table of per-sample sex genotypes - ``CNVGermlineCohortWorkflow.contig_ploidy_annotations`` -- path to the germline contig ploidy annotations table; located in ``/resources`` directory - ``CNVGermlineCohortWorkflow.transition_prior_table`` -- path to copy number transition priors table; located in ``/resources`` directory - ``CNVGermlineCohortWorkflow.transition_matrix_XY_Y`` -- path to copy number transition prior for Y contig for XY-genotyped samples; located in ``/resources`` directory - ``CNVGermlineCohortWorkflow.transition_matrix_XX_X`` -- path to copy number transition prior for X contig for XX-genotyped samples; located in ``/resources`` directory - ``CNVGermlineCohortWorkflow.transition_matrix_XY_X`` -- path to copy number transition prior for X contig for XY-genotyped samples; located in ``/resources`` directory - ``CNVGermlineCohortWorkflow.transition_matrix_XX_Y`` -- path to copy number transition prior for Y contig for XX-genotyped samples; located in ``/resources`` directory - ``CNVGermlineCohortWorkflow.transition_matrix_autosomal`` -- path to transition prior on autosomal loci; located in ``/resources`` directory - ``CNVGermlineCohortWorkflow.output_path`` -- name of the final output directory - ``CNVGermlineCohortWorkflow.num_latents`` -- (advanced) maximum number of principal components. Must be strictly less than the number of samples. The recommended value is 20 ~ 30 for large cohorts. For smaller cohorts, use 0.5 * number of samples. 
Unnecessary principal components are automatically pruned during PoN creation - ``CNVGermlineCohortWorkflow.model_path`` -- absolute path of the PoN model (posterior_finals directory of the panel creation output) - ``CNVGermlineCohortWorkflow.normal_bams_list`` -- TSV file consisting of corresponding bam and corresponding index files as described in cnv_germline_cohort_workflow.wdl - ``CNVGermlineCohortWorkflow.ref_fasta`` -- path to reference fasta file - ``CNVGermlineCohortWorkflow.ref_fasta_dict`` -- path to reference dict file - ``CNVGermlineCohortWorkflow.ref_fasta_fai`` -- path to reference fasta fai file - ``CNVGermlineCohortWorkflow.gatk_jar`` -- absolute path to gatk.jar - ``CNVGermlineCohortWorkflow.targets`` -- (optional) Target file (NOT in BED format) corresponding to the genomic loci of enriched targets in WES sample (e.g. Agilent, Illumina, etc). Please run ConvertBedToTargetFile to convert a BED file to a target file. If provided, then WES workflow will be run; otherwise, WGS workflow will be run \ No newline at end of file diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl deleted file mode 100755 index dafd517161f..00000000000 --- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl +++ /dev/null @@ -1,89 +0,0 @@ -# This workflow is used for running germline CNV on a cohort of germline samples -# Notes: -# -# -Basic sex genotype tab-separated table for homo sapiens must be formatted as follows (Refer to the Javadoc of SexGenotypeTableReader for full description): -# SAMPLE_NAME SEX_GENOTYPE -# sample_name_1 SEX_XX -# sample_name_2 SEX_XY -# sample_name_3 SEX_XY -# sample_name_4 SEX_XX -# Sex genotype identifiers (SEX_XX and SEX_XY in the above example) must match those in the tab-separated germline contig ploidy annotation table. -# The latter is formatted as follows: -# CONTIG CLASS SEX_XX SEX_XY -# 1 AUTOSOMAL 2 2 -# 2 AUTOSOMAL 2 2 -# ... ... ... ... -# X ALLOSOMAL 2 0 -# Y ALLOSOMAL 1 1 -# -# - Input file (normal_bams_list) must contain file paths to bam and bam index files separated by tabs in the following format: -# normal_bam_1 bam_idx_1 -# normal_bam_2 bam_idx_2 -# -# - The target file (targets) is required for the WES workflow and should be a tab-separated file with the column headers: -# contig start stop name -# These targets will be padded on both sides by the amount specified by PadTargets.padding (default 250). -# -# - If a target file is not provided, then the WGS workflow will be run instead and the specified value of -# wgs_bin_length (default 1000) will be used. -# -# - Example invocation: -# java -jar cromwell.jar run cnv_germline_cohort_workflow.wdl myParameters.json -# We recommend taking cnv_germline_cohort_workflow.json as a template json file and modifying it accordingly (please save -# your modified version with a different filename and do not commit to the gatk repository). -################ - - -import "cnv_germline_single_sample_workflow.wdl" as CNVGermlineSingleSampleWorkflow - -workflow CNVGermlineCohortWorkflow { - # Workflow input files - File? 
targets - File normal_bams_list - Array[Array[String]]+ normal_bams = read_tsv(normal_bams_list) - File ref_fasta - File ref_fasta_dict - File ref_fasta_fai - File sex_genotypes - File contig_ploidy_annotations - String gatk_jar - String gatk_docker - - # Transition prior table files - File transition_prior_table - Array[File] copy_number_transition_prior_files - - # Model directory and parameters - File model_path - Int num_latents - - # Output path - String output_path - - scatter (normal_bam in normal_bams) { - call CNVGermlineSingleSampleWorkflow.CNVGermlineSingleSampleWorkflow as SingleSampleWorkflow { - input: - targets = targets, - normal_bam = normal_bam[0], - normal_bam_idx = normal_bam[1], - ref_fasta = ref_fasta, - ref_fasta_dict = ref_fasta_dict, - ref_fasta_fai = ref_fasta_fai, - sex_genotypes = sex_genotypes, - contig_ploidy_annotations = contig_ploidy_annotations, - gatk_jar = gatk_jar, - gatk_docker = gatk_docker, - transition_prior_table = transition_prior_table, - copy_number_transition_prior_files = copy_number_transition_prior_files, - model_path = model_path, - output_path = output_path, - num_latents = num_latents - } - } - - output { - Array[Array[File]] posterior_files = SingleSampleWorkflow.posteriors - Array[Array[File]] segment_files = SingleSampleWorkflow.segments - } - -} diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow_template.json b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow_template.json deleted file mode 100755 index 152f4e62125..00000000000 --- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow_template.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "CNVGermlineCohortWorkflow.sex_genotypes": "File", - "CNVGermlineCohortWorkflow.contig_ploidy_annotations": "File", - "CNVGermlineCohortWorkflow.transition_prior_table": "File", - "CNVGermlineCohortWorkflow.copy_number_transition_prior_files": "Array[File]", - "CNVGermlineCohortWorkflow.output_path": "String", - "CNVGermlineCohortWorkflow.num_latents": "Int", - "CNVGermlineCohortWorkflow.model_path": "String", - "CNVGermlineCohortWorkflow.normal_bams_list": "File", - "CNVGermlineCohortWorkflow.ref_fasta": "File", - "CNVGermlineCohortWorkflow.ref_fasta_dict": "File", - "CNVGermlineCohortWorkflow.ref_fasta_fai": "File", - "CNVGermlineCohortWorkflow.gatk_jar": "String", - "CNVGermlineCohortWorkflow.targets": "(optional) File?" -} diff --git a/scripts/cnv_wdl/germline/cnv_germline_panel_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_panel_workflow.wdl deleted file mode 100755 index 73c7d24f373..00000000000 --- a/scripts/cnv_wdl/germline/cnv_germline_panel_workflow.wdl +++ /dev/null @@ -1,211 +0,0 @@ -# Workflow for creating a panel of normals for germline CNV pipeline -# Notes: -# -# -Basic sex genotype tab-separated table for homo sapiens must be formatted as follows (Refer to the Javadoc of SexGenotypeTableReader for full description): -# SAMPLE_NAME SEX_GENOTYPE -# sample_name_1 SEX_XX -# sample_name_2 SEX_XY -# sample_name_3 SEX_XY -# sample_name_4 SEX_XX -# Sex genotype identifiers (SEX_XX and SEX_XY in the above example) must match those in the tab-separated germline contig ploidy annotation table. -# The latter is formatted as follows: -# CONTIG CLASS SEX_XX SEX_XY -# 1 AUTOSOMAL 2 2 -# 2 AUTOSOMAL 2 2 -# ... ... ... ... 
-# X ALLOSOMAL 2 0 -# Y ALLOSOMAL 1 1 -# -# - Input file (normal_bams_list) must contain file paths to bam and bam index files separated by tabs in the following format: -# normal_bam_1 bam_idx_1 -# normal_bam_2 bam_idx_2 -# -# - The target file (targets) is required for the WES workflow and should be a tab-separated file with the column headers: -# contig start stop name -# These targets will be padded on both sides by the amount specified by PadTargets.padding (default 250). -# -# - If a target file is not provided, then the WGS workflow will be run instead and the specified value of -# wgs_bin_length (default 1000) will be used. -# -# - Example invocation: -# java -jar cromwell.jar run cnv_germline_panel_workflow.wdl myParameters.json -# We recommend taking cnv_germline_cohort_workflow.json as a template json file and modifying it accordingly (please save -# your modified version with a different filename and do not commit to the gatk repository). -################## - -import "cnv_common_tasks.wdl" as CNVTasks - -workflow CNVGermlinePanelWorkflow { - # Workflow input files - File? targets - File normal_bams_list - Array[Array[String]]+ normal_bams = read_tsv(normal_bams_list) - File sex_genotypes - File contig_ploidy_annotations - File transition_prior_table - Array[File] copy_number_transition_prior_files - File ref_fasta - File ref_fasta_dict - File ref_fasta_fai - String gatk_jar - String gatk_docker - - # Model parameters - Int num_latents - # CombineReadCounts name - String combined_entity_id = "combined_coverage" - # Sex genotypes file name - String sex_genotypes_entity_id = "sex_genotypes" - # PoN output path - String pon_output_path - # If no target file is input, then do WGS workflow - Boolean is_wgs = !defined(targets) - - if (!is_wgs) { - call CNVTasks.PadTargets { - input: - # This is a bit of a hack. The task will fail if targets is not defined when it gets here. 
- targets = select_first([targets, ""]), - gatk_jar = gatk_jar, - gatk_docker = gatk_docker - } - } - - scatter (normal_bam in normal_bams) { - call CNVTasks.CollectReadCounts { - input: - padded_targets = PadTargets.padded_targets, - keep_non_autosomes = true, - bam = normal_bam[0], - bam_idx = normal_bam[1], - ref_fasta = ref_fasta, - ref_fasta_fai = ref_fasta_fai, - ref_fasta_dict = ref_fasta_dict, - gatk_jar = gatk_jar, - gatk_docker = gatk_docker - } - } - - call CombineReadCounts { - input: - combined_entity_id = combined_entity_id, - coverage_file_list = CollectReadCounts.read_counts, - gatk_jar = gatk_jar, - gatk_docker = gatk_docker - } - - call CNVTasks.AnnotateTargets { - input: - entity_id = combined_entity_id, - intervals = CollectReadCounts.intervals[0], - ref_fasta = ref_fasta, - ref_fasta_fai = ref_fasta_fai, - ref_fasta_dict = ref_fasta_dict, - gatk_jar = gatk_jar, - gatk_docker = gatk_docker - } - - call CNVTasks.CorrectGCBias { - input: - entity_id = combined_entity_id, - coverage = CombineReadCounts.combined_coverage, - annotated_intervals = AnnotateTargets.annotated_intervals, - gatk_jar = gatk_jar, - gatk_docker = gatk_docker - } - - call GermlineCNVCaller { - input: - coverage = CorrectGCBias.corrected_coverage, - contig_ploidy_annotations = contig_ploidy_annotations, - sex_genotypes = sex_genotypes, - transition_prior_table = transition_prior_table, - copy_number_transition_prior_files = copy_number_transition_prior_files, - pon_output_path = pon_output_path, - num_latents = num_latents, - gatk_jar = gatk_jar, - gatk_docker = gatk_docker - } - - output { - Array[File] posteriors = GermlineCNVCaller.posteriors - Array[File] model = GermlineCNVCaller.model - Array[File] segments = GermlineCNVCaller.segments - } -} - -# Combine sample-level coverage files into a single file -task CombineReadCounts { - String combined_entity_id - Array[File]+ coverage_file_list - Int? max_open_files - String gatk_jar - - # Runtime parameters - Int? mem - String gatk_docker - Int? preemptible_attempts - Int? disk_space_gb - - command { - java -Xmx${default=4 mem}g -jar ${gatk_jar} CombineReadCounts \ - --input ${sep=" --input " coverage_file_list} \ - --maxOpenFiles ${default=100 max_open_files} \ - --output ${combined_entity_id}.tsv - } - - runtime { - docker: "${gatk_docker}" - memory: select_first([mem, 5]) + " GB" - disks: "local-disk " + select_first([disk_space_gb, 150]) + " HDD" - preemptible: select_first([preemptible_attempts, 2]) - } - - output { - File combined_coverage = "${combined_entity_id}.tsv" - } -} - -# Learn the coverage model -task GermlineCNVCaller { - File coverage - File contig_ploidy_annotations - File sex_genotypes - File transition_prior_table - Array[File] copy_number_transition_prior_files - String pon_output_path - Int num_latents - String gatk_jar - - # Runtime parameters - Int? mem - String gatk_docker - Int? preemptible_attempts - Int? 
disk_space_gb - - command { - java -Xmx${default=4 mem}g -Ddtype=double -jar ${gatk_jar} GermlineCNVCaller \ - --input ${coverage} \ - --contigAnnotationsTable ${contig_ploidy_annotations} \ - --sexGenotypeTable ${sex_genotypes} \ - --copyNumberTransitionPriorTable ${transition_prior_table} \ - --outputPath ${pon_output_path} \ - --jobType LEARN_AND_CALL \ - --numLatents ${default=5 num_latents} \ - --rddCheckpointing false \ - --disableSpark true - } - - runtime { - docker: "${gatk_docker}" - memory: select_first([mem, 5]) + " GB" - disks: "local-disk " + select_first([disk_space_gb, 200]) + " HDD" - preemptible: select_first([preemptible_attempts, 2]) - } - - output { - Array[File] posteriors = glob("./${pon_output_path}/posteriors_final/*") - Array[File] model = glob("./${pon_output_path}/model_final/*") - Array[File] segments = glob("./${pon_output_path}/posteriors_final/segments/*") - } -} diff --git a/scripts/cnv_wdl/germline/cnv_germline_panel_workflow_template.json b/scripts/cnv_wdl/germline/cnv_germline_panel_workflow_template.json deleted file mode 100755 index 53ac97a4c5c..00000000000 --- a/scripts/cnv_wdl/germline/cnv_germline_panel_workflow_template.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "CNVGermlinePanelWorkflow.sex_genotypes": "File", - "CNVGermlinePanelWorkflow.gatk_jar": "File", - "CNVGermlinePanelWorkflow.contig_ploidy_annotations": "File", - "CNVGermlinePanelWorkflow.targets": "(optional) File?", - "CNVGermlinePanelWorkflow.normal_bams_list": "File", - "CNVGermlinePanelWorkflow.num_latents": "Int", - "CNVGermlinePanelWorkflow.pon_output_path": "String", - "CNVGermlinePanelWorkflow.ref_fasta": "File", - "CNVGermlinePanelWorkflow.ref_fasta_dict": "File", - "CNVGermlinePanelWorkflow.ref_fasta_fai": "File", - "CNVGermlinePanelWorkflow.transition_prior_table": "File", - "CNVGermlinePanelWorkflow.copy_number_transition_prior_files": "Array[File]" -} diff --git a/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow.wdl deleted file mode 100755 index a2d663d4623..00000000000 --- a/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow.wdl +++ /dev/null @@ -1,165 +0,0 @@ -# Subworkflow for running GATK germline CNV on a single BAM. Supports both WGS and WES samples. -# Notes: -# -# -Basic sex genotype tab-separated table for homo sapiens must be formatted as follows (Refer to the Javadoc of SexGenotypeTableReader for full description): -# SAMPLE_NAME SEX_GENOTYPE -# sample_name_1 SEX_XX -# sample_name_2 SEX_XY -# sample_name_3 SEX_XY -# sample_name_4 SEX_XX -# Sex genotype identifiers (SEX_XX and SEX_XY in the above example) must match those in the tab-separated germline contig ploidy annotation table. -# The latter is formatted as follows: -# CONTIG CLASS SEX_XX SEX_XY -# 1 AUTOSOMAL 2 2 -# 2 AUTOSOMAL 2 2 -# ... ... ... ... -# X ALLOSOMAL 2 0 -# Y ALLOSOMAL 1 1 -# -# - The target file (targets) is required for the WES workflow and should be a tab-separated file with the column headers: -# contig start stop name -# These targets will be padded on both sides by the amount specified by PadTargets.padding (default 250). -# -# - If a target file is not provided, then the WGS workflow will be run instead and the specified value of -# wgs_bin_length (default 1000) will be used. 
-# -# - Example invocation: -# java -jar cromwell.jar run cnv_germline_single_sample_workflow.wdl myParameters.json -# We recommend taking cnv_germline_cohort_workflow.json as a template json file and modifying it accordingly (please save -# your modified version with a different filename and do not commit to the gatk repository). -################ - -import "cnv_common_tasks.wdl" as CNVTasks - -workflow CNVGermlineSingleSampleWorkflow { - # Workflow input files - File? targets - File normal_bam - File normal_bam_idx - File ref_fasta - File ref_fasta_dict - File ref_fasta_fai - File sex_genotypes - File contig_ploidy_annotations - String gatk_jar - String gatk_docker - - # Transtion prior table files - File transition_prior_table - Array[File] copy_number_transition_prior_files - - # Model directory and parameters - File model_path - Int num_latents - - # Output path - String output_path - - # If no target file is input, then do WGS workflow - Boolean is_wgs = !defined(targets) - - if (!is_wgs) { - call CNVTasks.PadTargets { - input: - # The task will fail if targets is not defined when it gets here, but that should not be allowed to happen. - targets = select_first([targets, ""]), - gatk_jar = gatk_jar, - gatk_docker = gatk_docker - } - } - - call CNVTasks.CollectReadCounts { - input: - padded_targets = PadTargets.padded_targets, - bam = normal_bam, - bam_idx = normal_bam_idx, - ref_fasta = ref_fasta, - ref_fasta_fai = ref_fasta_fai, - ref_fasta_dict = ref_fasta_dict, - gatk_jar = gatk_jar, - gatk_docker = gatk_docker - } - - call CNVTasks.AnnotateTargets { - input: - entity_id = CollectReadCounts.entity_id, - intervals = CollectReadCounts.intervals, - ref_fasta = ref_fasta, - ref_fasta_fai = ref_fasta_fai, - ref_fasta_dict = ref_fasta_dict, - gatk_jar = gatk_jar, - gatk_docker = gatk_docker - } - - call CNVTasks.CorrectGCBias { - input: - entity_id = CollectReadCounts.entity_id, - coverage = CollectReadCounts.read_counts, - annotated_intervals = AnnotateTargets.annotated_intervals, - gatk_jar = gatk_jar, - gatk_docker = gatk_docker - } - - call GermlineCNVCaller { - input: - coverage = CorrectGCBias.corrected_coverage, - contig_ploidy_annotations = contig_ploidy_annotations, - sex_genotypes = sex_genotypes, - transition_prior_table = transition_prior_table, - copy_number_transition_prior_files = copy_number_transition_prior_files, - model_path = model_path, - num_latents = num_latents, - output_path = output_path, - gatk_jar = gatk_jar, - gatk_docker = gatk_docker - } - - output { - Array[File] posteriors = GermlineCNVCaller.posteriors - Array[File] segments = GermlineCNVCaller.segments - } -} - -task GermlineCNVCaller { - File coverage - File contig_ploidy_annotations - File sex_genotypes - File transition_prior_table - Array[File] copy_number_transition_prior_files - String output_path - File model_path - Int num_latents - String gatk_jar - - # Runtime parameters - Int? mem - String gatk_docker - Int? preemptible_attempts - Int? 
disk_space_gb - - command { - java -Xmx${default=4 mem}g -Ddtype=double -jar ${gatk_jar} GermlineCNVCaller \ - --input ${coverage} \ - --inputModelPath ${model_path} \ - --contigAnnotationsTable ${contig_ploidy_annotations} \ - --sexGenotypeTable ${sex_genotypes} \ - --copyNumberTransitionPriorTable ${transition_prior_table} \ - --outputPath ${output_path} \ - --numLatents ${default=5 num_latents} \ - --jobType CALL_ONLY \ - --rddCheckpointing false \ - --disableSpark true - } - - runtime { - docker: "${gatk_docker}" - memory: select_first([mem, 5]) + " GB" - disks: "local-disk " + select_first([disk_space_gb, 200]) + " HDD" - preemptible: select_first([preemptible_attempts, 2]) - } - - output { - Array[File] posteriors = glob("./${output_path}/posteriors_final/*") - Array[File] segments = glob("./${output_path}/posteriors_final/segments/*") - } -} diff --git a/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow_template.json b/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow_template.json deleted file mode 100755 index fe582d98ca0..00000000000 --- a/scripts/cnv_wdl/germline/cnv_germline_single_sample_workflow_template.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "CNVGermlineSingleSampleWorkflow.normal_bam": "File", - "CNVGermlineSingleSampleWorkflow.normal_bam_idx": "File", - "CNVGermlineSingleSampleWorkflow.num_latents": "Int", - "CNVGermlineSingleSampleWorkflow.sex_genotypes": "File", - "CNVGermlineSingleSampleWorkflow.ref_fasta": "File", - "CNVGermlineSingleSampleWorkflow.ref_fasta_dict": "File", - "CNVGermlineSingleSampleWorkflow.ref_fasta_fai": "File", - "CNVGermlineSingleSampleWorkflow.model_path": "String", - "CNVGermlineSingleSampleWorkflow.gatk_jar": "String", - "CNVGermlineSingleSampleWorkflow.targets": "(optional) File?", - "CNVGermlineSingleSampleWorkflow.contig_ploidy_annotations": "File", - "CNVGermlineSingleSampleWorkflow.transition_prior_table": "File", - "CNVGermlineSingleSampleWorkflow.copy_number_transition_prior_files": "Array[File]", - "CNVGermlineSingleSampleWorkflow.output_path": "String", -} diff --git a/scripts/cnv_wdl/germline/resources/contig_annots.tsv b/scripts/cnv_wdl/germline/resources/contig_annots.tsv deleted file mode 100755 index 406629d7c90..00000000000 --- a/scripts/cnv_wdl/germline/resources/contig_annots.tsv +++ /dev/null @@ -1,25 +0,0 @@ -CONTIG CLASS SEX_XX SEX_XY -1 AUTOSOMAL 2 2 -2 AUTOSOMAL 2 2 -3 AUTOSOMAL 2 2 -4 AUTOSOMAL 2 2 -5 AUTOSOMAL 2 2 -6 AUTOSOMAL 2 2 -7 AUTOSOMAL 2 2 -8 AUTOSOMAL 2 2 -9 AUTOSOMAL 2 2 -10 AUTOSOMAL 2 2 -11 AUTOSOMAL 2 2 -12 AUTOSOMAL 2 2 -13 AUTOSOMAL 2 2 -14 AUTOSOMAL 2 2 -15 AUTOSOMAL 2 2 -16 AUTOSOMAL 2 2 -17 AUTOSOMAL 2 2 -18 AUTOSOMAL 2 2 -19 AUTOSOMAL 2 2 -20 AUTOSOMAL 2 2 -21 AUTOSOMAL 2 2 -22 AUTOSOMAL 2 2 -X ALLOSOMAL 2 1 -Y ALLOSOMAL 0 1 diff --git a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_priors.tsv b/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_priors.tsv deleted file mode 100755 index 8f56c282bf2..00000000000 --- a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_priors.tsv +++ /dev/null @@ -1,25 +0,0 @@ -CONTIG SEX_XX SEX_XY -1 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -2 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -3 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -4 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv 
homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -5 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -6 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -7 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -8 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -9 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -10 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -11 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -12 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -13 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -14 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -15 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -16 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -17 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -18 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -19 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -20 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -21 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -22 homo_sapiens_germline_CN_transition_matrix_autosomal.tsv homo_sapiens_germline_CN_transition_matrix_autosomal.tsv -X homo_sapiens_germline_CN_transition_matrix_XX_X.tsv homo_sapiens_germline_CN_transition_matrix_XY_X.tsv -Y homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv diff --git a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv b/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv deleted file mode 100755 index ea18c070fb0..00000000000 --- a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XX_X.tsv +++ /dev/null @@ -1,7 +0,0 @@ -#The following germline copy number transition matrix is obtained from analyzing Genome STRiP calls on a cohort of 170 blood normal TCGA samples -T_MATRIX_XX_X FROM_0 FROM_1 FROM_2 FROM_3 FROM_4 -TO_0 0.99966751443861601 0.0 4.6641935242897276e-08 0.0 0.0 -TO_1 0.0 0.9997899779920747 9.3423238818193651e-08 0.0 0.0 -TO_2 0.00033248556138398773 0.00021002200792527603 0.99999985473174158 4.541929579905158e-05 7.8833267638943636e-05 -TO_3 0.0 0.0 5.0172599663674365e-09 0.99995458070420096 0.0 -TO_4 0.0 0.0 1.8582444319879394e-10 0.0 0.99992116673236109 diff --git a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv b/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv deleted file mode 100755 index 
f8d0228761f..00000000000 --- a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XX_Y.tsv +++ /dev/null @@ -1,3 +0,0 @@ -#A trivial transition matrix for enforcing zero ploidy on Y contig in XX samples -T_MATRIX_XX_Y TO_0 -FROM_0 1.0 diff --git a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv b/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv deleted file mode 100755 index 992f224460f..00000000000 --- a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XY_X.tsv +++ /dev/null @@ -1,6 +0,0 @@ -#The following germline copy number transition matrix is obtained from analyzing Genome STRiP calls on a cohort of 170 blood normal TCGA samples -T_MATRIX_XY_X FROM_0 FROM_1 FROM_2 FROM_3 -TO_0 0.99971173098797461 1.0067714836234777e-07 0.0 0.0 -TO_1 0.00028826901202540391 0.99999989259309574 7.456796468461193e-05 4.0420371867421184e-05 -TO_2 0.0 6.5615120089096975e-09 0.99992504963549644 8.0840743734842364e-06 -TO_3 0.0 1.6824389766435122e-10 3.8239981889544576e-07 0.99995149555375906 diff --git a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv b/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv deleted file mode 100755 index 8c80b9eaa16..00000000000 --- a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_XY_Y.tsv +++ /dev/null @@ -1,7 +0,0 @@ -#The following germline copy number transition matrix is obtained from analyzing Genome STRiP calls on a cohort of 170 blood normal TCGA samples -T_MATRIX_XY_Y FROM_0 FROM_1 FROM_2 FROM_3 FROM_4 -TO_0 0.99966851990709416 5.9399783434370542e-08 0.0 0.0 0.0 -TO_1 0.00033148009290586881 0.99999937404917871 0.00016831138093714932 0.00035529148884256304 0.00027047913446676971 -TO_2 0.0 5.2251326738303193e-07 0.99983149068329535 6.4209305212511401e-06 0.0 -TO_3 0.0 3.4001255345191416e-08 1.9793576746822735e-07 0.99963614727046246 5.519982336056525e-06 -TO_4 0.0 1.0036515132014333e-08 0.0 2.1403101737503797e-06 0.99972400088319713 diff --git a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv b/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv deleted file mode 100755 index 5b0ef7c5cd6..00000000000 --- a/scripts/cnv_wdl/germline/resources/homo_sapiens_germline_CN_transition_matrix_autosomal.tsv +++ /dev/null @@ -1,13 +0,0 @@ -#The following germline copy number transition matrix is obtained from analyzing Genome STRiP calls on a cohort of 170 blood normal TCGA samples -T_MATRIX_AUTOSOMAL FROM_0 FROM_1 FROM_2 FROM_3 FROM_4 FROM_5 FROM_6 FROM_7 FROM_8 FROM_9 FROM_10 -TO_0 0.9997389770672177 2.1467075095351557e-07 5.9100196666398515e-08 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -TO_1 7.032645704368021e-07 0.99981467801052126 1.376696014985822e-07 1.1958483005083788e-08 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -TO_2 0.00026031966821188487 0.00018510163208542175 0.99999972823891037 9.7745650462803608e-05 0.00010292959512329779 8.0138884782014782e-05 9.0232386606850211e-05 7.9777767777145916e-05 7.5782475345150855e-05 8.8396952830754563e-05 8.9130531663621367e-05 -TO_3 0.0 5.6866424093646516e-09 3.4570453537193609e-08 0.99990217064015618 6.1564638523662337e-08 1.1699107267447413e-07 4.469162288600803e-08 0.0 0.0 0.0 0.0 -TO_4 0.0 0.0 3.3588164155451664e-08 5.6802794274147987e-08 0.99989682090607845 1.1699107267447413e-07 2.4133476358444333e-06 0.0 0.0 0.0 0.0 -TO_5 0.0 0.0 
2.8971721269891569e-09 1.1958483005083788e-08 1.2960976531297335e-08 0.99991953938976808 1.3407486865802407e-07 0.0 0.0 0.0 0.0
-TO_6 0.0 0.0 2.1348140599967544e-09 2.989620751270947e-09 1.7497318317251403e-07 8.7743304505855591e-08 0.99990713080764293 1.0766230469250462e-07 0.0 0.0 0.0
-TO_7 0.0 0.0 7.8350530879524278e-10 0.0 0.0 0.0 4.469162288600803e-08 0.99992011456991814 0.0 0.0 0.0
-TO_8 0.0 0.0 9.4105226022640503e-10 0.0 0.0 0.0 0.0 0.0 0.99992421752465488 0.0 0.0
-TO_9 0.0 0.0 5.9212277047953571e-11 0.0 0.0 0.0 0.0 0.0 0.0 0.9999116030471692 0.0
-TO_10 0.0 0.0 4.2294483605681122e-12 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.99991086946833641
diff --git a/scripts/cnv_wdl/somatic/README.md b/scripts/cnv_wdl/somatic/README.md
new file mode 100644
index 00000000000..61bffbb3474
--- /dev/null
+++ b/scripts/cnv_wdl/somatic/README.md
@@ -0,0 +1,52 @@
+## Running the Somatic CNV WDL
+
+### Which WDL should you use?
+
+- Building a panel of normals (PoN): ``cnv_somatic_panel_workflow.wdl``
+- Running a matched pair: ``cnv_somatic_pair_workflow.wdl``
+
+#### Setting up parameter json file for a run
+
+To get started, create the json template (using ``java -jar wdltool.jar inputs <workflow>``) for the workflow you wish to run and adjust parameters accordingly.
+
+*Please note that there are optional workflow-level and task-level parameters that do not appear in the template file. These are set to reasonable values by default, but can also be adjusted if desired.*
+
+#### Required parameters in the somatic panel workflow
+
+The reference used must be the same between PoN and case samples.
+
+- ``CNVSomaticPanelWorkflow.gatk_docker`` -- GATK Docker image (e.g., ``broadinstitute/gatk:latest``).
+- ``CNVSomaticPanelWorkflow.intervals`` -- Picard or GATK-style interval list. For WGS, this should typically only include the autosomal chromosomes.
+- ``CNVSomaticPanelWorkflow.normal_bams_list`` -- TSV file consisting of bam files and their corresponding index files, as described in cnv_somatic_panel_workflow.wdl.
+- ``CNVSomaticPanelWorkflow.pon_entity_id`` -- Name of the final PoN file.
+- ``CNVSomaticPanelWorkflow.ref_fasta_dict`` -- Path to reference dict file.
+- ``CNVSomaticPanelWorkflow.ref_fasta_fai`` -- Path to reference fasta fai file.
+- ``CNVSomaticPanelWorkflow.ref_fasta`` -- Path to reference fasta file.
+
+In addition, there are optional workflow-level and task-level parameters that may be set by advanced users; for example:
+
+- ``CNVSomaticPanelWorkflow.do_explicit_gc_correction`` -- (optional) If true, perform explicit GC-bias correction when creating the PoN and in subsequent denoising of case samples. If false, rely on PCA-based denoising to correct for GC bias.
+- ``CNVSomaticPanelWorkflow.PreprocessIntervals.bin_length`` -- Size of bins (in bp) for coverage collection. *This must be the same value used for all case samples.*
+- ``CNVSomaticPanelWorkflow.PreprocessIntervals.padding`` -- Amount of padding (in bp) to add to both sides of targets for WES coverage collection. *This must be the same value used for all case samples.*
+
+Further explanation of other task-level parameters may be found by invoking the ``--help`` documentation available in the gatk.jar for each tool.
+
+#### Required parameters in the somatic pair workflow
+
+The reference (and bins, if specified) used must be the same between PoN and case samples.
+
+- ``CNVSomaticPairWorkflow.common_sites`` -- Picard or GATK-style interval list of common sites to use for collecting allelic counts.
+- ``CNVSomaticPairWorkflow.gatk_docker`` -- GATK Docker image (e.g., ``broadinstitute/gatk:latest``).
+- ``CNVSomaticPairWorkflow.intervals`` -- Picard or GATK-style interval list. For WGS, this should typically only include the autosomal chromosomes.
+- ``CNVSomaticPairWorkflow.normal_bam`` -- File path or storage location (depending on backend) of the normal BAM file.
+- ``CNVSomaticPairWorkflow.normal_bam_idx`` -- File path or storage location (depending on backend) of the normal BAM file index.
+- ``CNVSomaticPairWorkflow.read_count_pon`` -- Path to the read-count PoN created by the panel workflow.
+- ``CNVSomaticPairWorkflow.ref_fasta_dict`` -- Path to reference dict file.
+- ``CNVSomaticPairWorkflow.ref_fasta_fai`` -- Path to reference fasta fai file.
+- ``CNVSomaticPairWorkflow.ref_fasta`` -- Path to reference fasta file.
+- ``CNVSomaticPairWorkflow.tumor_bam`` -- File path or storage location (depending on backend) of the tumor BAM file.
+- ``CNVSomaticPairWorkflow.tumor_bam_idx`` -- File path or storage location (depending on backend) of the tumor BAM file index.
+
+In addition, there are several task-level parameters that may be set by advanced users as above.
+
+Further explanation of these task-level parameters may be found by invoking the ``--help`` documentation available in the gatk.jar for each tool.
\ No newline at end of file
diff --git a/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl b/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl
new file mode 100644
index 00000000000..35e153d24cc
--- /dev/null
+++ b/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl
@@ -0,0 +1,506 @@
+# Workflow for running the GATK CNV pipeline on a matched pair. Supports both WGS and WES.
+#
+# Notes:
+#
+# - The interval-list file is required for both WGS and WES workflows and should be a Picard or GATK-style interval list.
+#   These intervals will be padded on both sides by the amount specified by PreprocessIntervals.padding (default 250)
+#   and split into bins of length specified by PreprocessIntervals.bin_length (default 1000; specify 0 to skip binning).
+#   For WGS, the intervals should simply cover the autosomal chromosomes (sex chromosomes may be included, but care
+#   should be taken to 1) avoid creating panels of mixed sex, and 2) denoise case samples only with panels containing
+#   individuals of the same sex as the case samples).
+#
+# - The sites file (common_sites) should be a Picard or GATK-style interval list. This is a list of sites
+#   of known variation at which allelic counts will be collected for use in modeling minor-allele fractions.
+#
+# - Example invocation:
+#       java -jar cromwell.jar run cnv_somatic_pair_workflow.wdl myParameters.json
+#   See cnv_somatic_pair_workflow_template.json for a template json file to modify with your own parameters (please save
+#   your modified version with a different filename and do not commit to the gatk repository).
+#
+#############
+
+import "cnv_common_tasks.wdl" as CNVTasks
+
+workflow CNVSomaticPairWorkflow {
+    File common_sites
+    File intervals
+    File tumor_bam
+    File tumor_bam_idx
+    File normal_bam
+    File normal_bam_idx
+    File read_count_pon
+    File ref_fasta_dict
+    File ref_fasta_fai
+    File ref_fasta
+    String gatk_docker
+    File? gatk4_jar_override
+
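+    # A worked example of the disk-sizing arithmetic defined below, with purely hypothetical
+    # sizes: a 3 GB reference, a 1 GB PoN, and a 60 GB tumor BAM give ref_size = 3,
+    # read_count_pon_size = 1, and tumor_bam_size = 60, so a task such as CollectCountsTumor
+    # requests 60 + ceil(size(preprocessed_intervals)) + disk_pad GB of disk;
+    # emergency_extra_disk simply adds a constant to the shared disk_pad.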
+    # Use as a last resort to increase the disk given to every task in case of ill-behaving data
+    Int? emergency_extra_disk
+
+    Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_fasta_dict, "GB") + size(ref_fasta_fai, "GB"))
+    Int read_count_pon_size = ceil(size(read_count_pon, "GB"))
+    Int tumor_bam_size = ceil(size(tumor_bam, "GB") + size(tumor_bam_idx, "GB"))
+    Int normal_bam_size = ceil(size(normal_bam, "GB") + size(normal_bam_idx, "GB"))
+
+    Int gatk4_override_size = if defined(gatk4_jar_override) then ceil(size(gatk4_jar_override, "GB")) else 0
+    # This is added to every task as padding; increase it if you systematically need more disk for every call
+    Int disk_pad = 20 + ceil(size(intervals, "GB")) + ceil(size(common_sites, "GB")) + gatk4_override_size + select_first([emergency_extra_disk,0])
+
+    Int process_disk = ref_size + disk_pad
+    call CNVTasks.PreprocessIntervals {
+        input:
+            intervals = intervals,
+            ref_fasta_dict = ref_fasta_dict,
+            gatk4_jar_override = gatk4_jar_override,
+            gatk_docker = gatk_docker,
+            disk_space_gb = process_disk
+    }
+
+    Int collect_counts_tumor_disk = tumor_bam_size + ceil(size(PreprocessIntervals.preprocessed_intervals, "GB")) + disk_pad
+    call CNVTasks.CollectCounts as CollectCountsTumor {
+        input:
+            intervals = PreprocessIntervals.preprocessed_intervals,
+            bam = tumor_bam,
+            bam_idx = tumor_bam_idx,
+            gatk4_jar_override = gatk4_jar_override,
+            gatk_docker = gatk_docker,
+            disk_space_gb = collect_counts_tumor_disk
+    }
+
+    Int collect_counts_normal_disk = normal_bam_size + ceil(size(PreprocessIntervals.preprocessed_intervals, "GB")) + disk_pad
+    call CNVTasks.CollectCounts as CollectCountsNormal {
+        input:
+            intervals = PreprocessIntervals.preprocessed_intervals,
+            bam = normal_bam,
+            bam_idx = normal_bam_idx,
+            gatk4_jar_override = gatk4_jar_override,
+            gatk_docker = gatk_docker,
+            disk_space_gb = collect_counts_normal_disk
+    }
+
+    Int collect_allelic_counts_tumor_disk = tumor_bam_size + ref_size + disk_pad
+    call CNVTasks.CollectAllelicCounts as CollectAllelicCountsTumor {
+        input:
+            common_sites = common_sites,
+            bam = tumor_bam,
+            bam_idx = tumor_bam_idx,
+            ref_fasta = ref_fasta,
+            ref_fasta_dict = ref_fasta_dict,
+            ref_fasta_fai = ref_fasta_fai,
+            gatk4_jar_override = gatk4_jar_override,
+            gatk_docker = gatk_docker,
+            disk_space_gb = collect_allelic_counts_tumor_disk
+    }
+
+    Int collect_allelic_counts_normal_disk = normal_bam_size + ref_size + disk_pad
+    call CNVTasks.CollectAllelicCounts as CollectAllelicCountsNormal {
+        input:
+            common_sites = common_sites,
+            bam = normal_bam,
+            bam_idx = normal_bam_idx,
+            ref_fasta = ref_fasta,
+            ref_fasta_dict = ref_fasta_dict,
+            ref_fasta_fai = ref_fasta_fai,
+            gatk4_jar_override = gatk4_jar_override,
+            gatk_docker = gatk_docker,
+            disk_space_gb = collect_allelic_counts_normal_disk
+    }
+
+    Int denoise_read_counts_tumor_disk = read_count_pon_size + ceil(size(CollectCountsTumor.counts, "GB")) + disk_pad
+    call DenoiseReadCounts as DenoiseReadCountsTumor {
+        input:
+            entity_id = CollectCountsTumor.entity_id,
+            read_counts = CollectCountsTumor.counts,
+            read_count_pon = read_count_pon,
+            gatk4_jar_override = gatk4_jar_override,
+            gatk_docker = gatk_docker,
+            disk_space_gb = denoise_read_counts_tumor_disk
+    }
+
+    Int denoise_read_counts_normal_disk = read_count_pon_size + ceil(size(CollectCountsNormal.counts, "GB")) + disk_pad
+    call DenoiseReadCounts as DenoiseReadCountsNormal {
+        input:
+            entity_id = CollectCountsNormal.entity_id,
+            read_counts = CollectCountsNormal.counts,
+            read_count_pon = read_count_pon,
+            gatk4_jar_override = gatk4_jar_override,
+            gatk_docker = gatk_docker,
+            disk_space_gb =
denoise_read_counts_normal_disk
+    }
+
+    Int model_segments_disk = ceil(size(DenoiseReadCountsTumor.denoised_copy_ratios, "GB")) + ceil(size(CollectAllelicCountsTumor.allelic_counts, "GB")) + ceil(size(CollectAllelicCountsNormal.allelic_counts, "GB")) + disk_pad
+    call ModelSegments as ModelSegmentsTumor {
+        input:
+            entity_id = CollectCountsTumor.entity_id,
+            denoised_copy_ratios = DenoiseReadCountsTumor.denoised_copy_ratios,
+            allelic_counts = CollectAllelicCountsTumor.allelic_counts,
+            normal_allelic_counts = CollectAllelicCountsNormal.allelic_counts,
+            gatk4_jar_override = gatk4_jar_override,
+            gatk_docker = gatk_docker,
+            disk_space_gb = model_segments_disk
+    }
+
+    call ModelSegments as ModelSegmentsNormal {
+        input:
+            entity_id = CollectCountsNormal.entity_id,
+            denoised_copy_ratios = DenoiseReadCountsNormal.denoised_copy_ratios,
+            allelic_counts = CollectAllelicCountsNormal.allelic_counts,
+            gatk4_jar_override = gatk4_jar_override,
+            gatk_docker = gatk_docker,
+            disk_space_gb = model_segments_disk
+    }
+
+    Int copy_ratio_segments_tumor_disk = ceil(size(DenoiseReadCountsTumor.denoised_copy_ratios, "GB")) + ceil(size(ModelSegmentsTumor.copy_ratio_only_segments, "GB")) + disk_pad
+    call CallCopyRatioSegments as CallCopyRatioSegmentsTumor {
+        input:
+            entity_id = CollectCountsTumor.entity_id,
+            copy_ratio_segments = ModelSegmentsTumor.copy_ratio_only_segments,
+            gatk4_jar_override = gatk4_jar_override,
+            gatk_docker = gatk_docker,
+            disk_space_gb = copy_ratio_segments_tumor_disk
+    }
+
+    Int copy_ratio_segments_normal_disk = ceil(size(DenoiseReadCountsNormal.denoised_copy_ratios, "GB")) + ceil(size(ModelSegmentsNormal.copy_ratio_only_segments, "GB")) + disk_pad
+    call CallCopyRatioSegments as CallCopyRatioSegmentsNormal {
+        input:
+            entity_id = CollectCountsNormal.entity_id,
+            copy_ratio_segments = ModelSegmentsNormal.copy_ratio_only_segments,
+            gatk4_jar_override = gatk4_jar_override,
+            gatk_docker = gatk_docker,
+            disk_space_gb = copy_ratio_segments_normal_disk
+    }
+
+    # The files from other tasks are small enough to just combine into one disk variable and pass to the tumor plotting tasks
+    Int plot_tumor_disk = ref_size + ceil(size(DenoiseReadCountsTumor.standardized_copy_ratios, "GB")) + ceil(size(DenoiseReadCountsTumor.denoised_copy_ratios, "GB")) + ceil(size(ModelSegmentsTumor.het_allelic_counts, "GB")) + ceil(size(ModelSegmentsTumor.modeled_segments, "GB")) + disk_pad
+    call PlotDenoisedCopyRatios as PlotDenoisedCopyRatiosTumor {
+        input:
+            entity_id = CollectCountsTumor.entity_id,
+            standardized_copy_ratios = DenoiseReadCountsTumor.standardized_copy_ratios,
+            denoised_copy_ratios = DenoiseReadCountsTumor.denoised_copy_ratios,
+            ref_fasta_dict = ref_fasta_dict,
+            gatk4_jar_override = gatk4_jar_override,
+            gatk_docker = gatk_docker,
+            disk_space_gb = plot_tumor_disk
+    }
+    # The files from other tasks are small enough to just combine into one disk variable and pass to the normal plotting tasks
+    Int plot_normal_disk = ref_size + ceil(size(DenoiseReadCountsNormal.standardized_copy_ratios, "GB")) + ceil(size(DenoiseReadCountsNormal.denoised_copy_ratios, "GB")) + ceil(size(ModelSegmentsNormal.het_allelic_counts, "GB")) + ceil(size(ModelSegmentsNormal.modeled_segments, "GB")) + disk_pad
+    call PlotDenoisedCopyRatios as PlotDenoisedCopyRatiosNormal {
+        input:
+            entity_id = CollectCountsNormal.entity_id,
+            standardized_copy_ratios = DenoiseReadCountsNormal.standardized_copy_ratios,
+            denoised_copy_ratios = DenoiseReadCountsNormal.denoised_copy_ratios,
+            ref_fasta_dict = ref_fasta_dict,
+
gatk4_jar_override = gatk4_jar_override, + gatk_docker = gatk_docker, + disk_space_gb = plot_normal_disk + } + + call PlotModeledSegments as PlotModeledSegmentsTumor { + input: + entity_id = CollectCountsTumor.entity_id, + denoised_copy_ratios = DenoiseReadCountsTumor.denoised_copy_ratios, + het_allelic_counts = ModelSegmentsTumor.het_allelic_counts, + modeled_segments = ModelSegmentsTumor.modeled_segments, + ref_fasta_dict = ref_fasta_dict, + gatk4_jar_override = gatk4_jar_override, + gatk_docker = gatk_docker, + disk_space_gb = plot_tumor_disk + } + + call PlotModeledSegments as PlotModeledSegmentsNormal { + input: + entity_id = CollectCountsNormal.entity_id, + denoised_copy_ratios = DenoiseReadCountsNormal.denoised_copy_ratios, + het_allelic_counts = ModelSegmentsNormal.het_allelic_counts, + modeled_segments = ModelSegmentsNormal.modeled_segments, + ref_fasta_dict = ref_fasta_dict, + gatk4_jar_override = gatk4_jar_override, + gatk_docker = gatk_docker, + disk_space_gb = plot_normal_disk + } +} + +task DenoiseReadCounts { + String entity_id + File read_counts + File read_count_pon + Int? number_of_eigensamples #use all eigensamples in panel by default + File? gatk4_jar_override + + # Runtime parameters + Int? mem + String gatk_docker + Int? preemptible_attempts + Int disk_space_gb + + # Mem is in units of GB but our command and memory runtime values are in MB + Int machine_mem = if defined(mem) then mem * 1000 else 13000 + Int command_mem = machine_mem - 1000 + + command <<< + set -e + GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override} + + java -Xmx${command_mem}m -jar $GATK_JAR DenoiseReadCounts \ + --input ${read_counts} \ + --readCountPanelOfNormals ${read_count_pon} \ + ${"--numberOfEigensamples " + number_of_eigensamples} \ + --standardizedCopyRatios ${entity_id}.standardizedCR.tsv \ + --denoisedCopyRatios ${entity_id}.denoisedCR.tsv + >>> + + runtime { + docker: "${gatk_docker}" + memory: machine_mem + " MB" + disks: "local-disk " + disk_space_gb + " HDD" + preemptible: select_first([preemptible_attempts, 5]) + } + + output { + File standardized_copy_ratios = "${entity_id}.standardizedCR.tsv" + File denoised_copy_ratios = "${entity_id}.denoisedCR.tsv" + } +} + +task ModelSegments { + String entity_id + File denoised_copy_ratios + File allelic_counts + File? normal_allelic_counts + Int? max_num_segments_per_chromosome + Int? min_total_allele_count + Float? genotyping_homozygous_log_ratio_threshold + Float? genotyping_base_error_rate + Float? kernel_variance_copy_ratio + Float? kernel_variance_allele_fraction + Float? kernel_scaling_allele_fraction + Int? kernel_approximation_dimension + Array[Int]+? window_sizes = [8, 16, 32, 64, 128, 256] + Float? num_changepoints_penalty_factor + Float? minor_allele_fraction_prior_alpha + Int? num_samples_copy_ratio + Int? num_burn_in_copy_ratio + Int? num_samples_allele_fraction + Int? num_burn_in_allele_fraction + Float? smoothing_threshold_copy_ratio + Float? smoothing_threshold_allele_fraction + Int? max_num_smoothing_iterations + Int? num_smoothing_iterations_per_fit + String? output_dir + File? gatk4_jar_override + + # Runtime parameters + Int? mem + String gatk_docker + Int? 
preemptible_attempts
+    Int disk_space_gb
+
+    # Mem is in units of GB but our command and memory runtime values are in MB
+    Int machine_mem = if defined(mem) then mem * 1000 else 13000
+    # ModelSegments seems to need at least 3GB of overhead to run
+    Int command_mem = machine_mem - 3000
+
+    # If optional output_dir not specified, use "out"
+    String output_dir_ = select_first([output_dir, "out"])
+
+    command <<<
+        set -e
+        mkdir ${output_dir_}
+        GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+        java -Xmx${command_mem}m -jar $GATK_JAR ModelSegments \
+            --denoisedCopyRatios ${denoised_copy_ratios} \
+            --allelicCounts ${allelic_counts} \
+            ${"--normalAllelicCounts " + normal_allelic_counts} \
+            --maxNumSegmentsPerChromosome ${default="500" max_num_segments_per_chromosome} \
+            --minTotalAlleleCount ${default="30" min_total_allele_count} \
+            --genotypingHomozygousLogRatioThreshold ${default="-10.0" genotyping_homozygous_log_ratio_threshold} \
+            --genotypingBaseErrorRate ${default="0.05" genotyping_base_error_rate} \
+            --kernelVarianceCopyRatio ${default="0.0" kernel_variance_copy_ratio} \
+            --kernelVarianceAlleleFraction ${default="0.025" kernel_variance_allele_fraction} \
+            --kernelScalingAlleleFraction ${default="1.0" kernel_scaling_allele_fraction} \
+            --kernelApproximationDimension ${default="100" kernel_approximation_dimension} \
+            --windowSize ${sep= " --windowSize " window_sizes} \
+            --numChangepointsPenaltyFactor ${default="1.0" num_changepoints_penalty_factor} \
+            --minorAlleleFractionPriorAlpha ${default="25.0" minor_allele_fraction_prior_alpha} \
+            --numSamplesCopyRatio ${default=100 num_samples_copy_ratio} \
+            --numBurnInCopyRatio ${default=50 num_burn_in_copy_ratio} \
+            --numSamplesAlleleFraction ${default=100 num_samples_allele_fraction} \
+            --numBurnInAlleleFraction ${default=50 num_burn_in_allele_fraction} \
+            --smoothingThresholdCopyRatio ${default="2.0" smoothing_threshold_copy_ratio} \
+            --smoothingThresholdAlleleFraction ${default="2.0" smoothing_threshold_allele_fraction} \
+            --maxNumSmoothingIterations ${default=10 max_num_smoothing_iterations} \
+            --numSmoothingIterationsPerFit ${default=0 num_smoothing_iterations_per_fit} \
+            --output ${output_dir_} \
+            --outputPrefix ${entity_id}
+
+        # We need to create this file even if the command above does not, so that there is something to delocalize;
+        # if no such file was produced above, an empty file is copied out instead
+        touch ${output_dir_}/${entity_id}.hets.normal.tsv
+    >>>
+
+    runtime {
+        docker: "${gatk_docker}"
+        memory: machine_mem + " MB"
+        disks: "local-disk " + disk_space_gb + " HDD"
+        preemptible: select_first([preemptible_attempts, 5])
+    }
+
+    output {
+        File het_allelic_counts = "${output_dir_}/${entity_id}.hets.tsv"
+        File normal_het_allelic_counts = "${output_dir_}/${entity_id}.hets.normal.tsv"
+        File copy_ratio_only_segments = "${output_dir_}/${entity_id}.cr.seg"
+        File modeled_segments_begin = "${output_dir_}/${entity_id}.modelBegin.seg"
+        File copy_ratio_parameters_begin = "${output_dir_}/${entity_id}.modelBegin.cr.param"
+        File allele_fraction_parameters_begin = "${output_dir_}/${entity_id}.modelBegin.af.param"
+        File modeled_segments = "${output_dir_}/${entity_id}.modelFinal.seg"
+        File copy_ratio_parameters = "${output_dir_}/${entity_id}.modelFinal.cr.param"
+        File allele_fraction_parameters = "${output_dir_}/${entity_id}.modelFinal.af.param"
+    }
+}
+
+task CallCopyRatioSegments {
+    String entity_id
+    File copy_ratio_segments
+    Float? neutral_segment_copy_ratio_threshold
+    Float?
outlier_neutral_segment_copy_ratio_z_score_threshold + Float? calling_copy_ratio_z_score_threshold + File? gatk4_jar_override + + # Runtime parameters + Int? mem + String gatk_docker + Int? preemptible_attempts + Int disk_space_gb + + # Mem is in units of GB but our command and memory runtime values are in MB + Int machine_mem = if defined(mem) then mem * 1000 else 7000 + Int command_mem = machine_mem - 1000 + + command <<< + set -e + GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override} + + java -Xmx${command_mem}m -jar $GATK_JAR CallCopyRatioSegments \ + --input ${copy_ratio_segments} \ + --neutralSegmentCopyRatioThreshold ${default="0.1" neutral_segment_copy_ratio_threshold} \ + --outlierNeutralSegmentCopyRatioZScoreThreshold ${default="2.0" outlier_neutral_segment_copy_ratio_z_score_threshold} \ + --callingCopyRatioZScoreThreshold ${default="2.0" calling_copy_ratio_z_score_threshold} \ + --output ${entity_id}.called.seg + >>> + + runtime { + docker: "${gatk_docker}" + memory: machine_mem + " MB" + disks: "local-disk " + disk_space_gb + " HDD" + preemptible: select_first([preemptible_attempts, 5]) + } + + output { + File called_copy_ratio_segments = "${entity_id}.called.seg" + } +} + +task PlotDenoisedCopyRatios { + String entity_id + File standardized_copy_ratios + File denoised_copy_ratios + File ref_fasta_dict + Int? minimum_contig_length + String? output_dir + File? gatk4_jar_override + + # Runtime parameters + Int? mem + String gatk_docker + Int? preemptible_attempts + Int disk_space_gb + + # Mem is in units of GB but our command and memory runtime values are in MB + Int machine_mem = if defined(mem) then mem * 1000 else 7000 + Int command_mem = machine_mem - 1000 + + # If optional output_dir not specified, use "out" + String output_dir_ = select_first([output_dir, "out"]) + + command <<< + set -e + mkdir ${output_dir_} + GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override} + + java -Xmx${command_mem}m -jar $GATK_JAR PlotDenoisedCopyRatios \ + --standardizedCopyRatios ${standardized_copy_ratios} \ + --denoisedCopyRatios ${denoised_copy_ratios} \ + -SD ${ref_fasta_dict} \ + --minimumContigLength ${default="1000000" minimum_contig_length} \ + --output ${output_dir_} \ + --outputPrefix ${entity_id} + >>> + + runtime { + docker: "${gatk_docker}" + memory: machine_mem + " MB" + disks: "local-disk " + disk_space_gb + " HDD" + preemptible: select_first([preemptible_attempts, 5]) + } + + output { + File denoised_copy_ratios_plot = "${output_dir_}/${entity_id}.denoised.png" + File denoised_copy_ratios_lim_4_plot = "${output_dir_}/${entity_id}.denoisedLimit4.png" + File standardized_MAD = "${output_dir_}/${entity_id}.standardizedMAD.txt" + File denoised_MAD = "${output_dir_}/${entity_id}.denoisedMAD.txt" + File delta_MAD = "${output_dir_}/${entity_id}.deltaMAD.txt" + File scaled_delta_MAD = "${output_dir_}/${entity_id}.scaledDeltaMAD.txt" + } +} + +task PlotModeledSegments { + String entity_id + File denoised_copy_ratios + File het_allelic_counts + File modeled_segments + File ref_fasta_dict + Int? minimum_contig_length + String? output_dir + File? gatk4_jar_override + + # Runtime parameters + Int? mem + String gatk_docker + Int? 
preemptible_attempts
+    Int disk_space_gb
+
+    # Mem is in units of GB but our command and memory runtime values are in MB
+    Int machine_mem = if defined(mem) then mem * 1000 else 7000
+    Int command_mem = machine_mem - 1000
+
+    # If optional output_dir not specified, use "out"
+    String output_dir_ = select_first([output_dir, "out"])
+
+    command <<<
+        set -e
+        mkdir ${output_dir_}
+        GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+        java -Xmx${command_mem}m -jar $GATK_JAR PlotModeledSegments \
+            --denoisedCopyRatios ${denoised_copy_ratios} \
+            --allelicCounts ${het_allelic_counts} \
+            --segments ${modeled_segments} \
+            -SD ${ref_fasta_dict} \
+            --minimumContigLength ${default="1000000" minimum_contig_length} \
+            --output ${output_dir_} \
+            --outputPrefix ${entity_id}
+    >>>
+
+    runtime {
+        docker: "${gatk_docker}"
+        memory: machine_mem + " MB"
+        disks: "local-disk " + disk_space_gb + " HDD"
+        preemptible: select_first([preemptible_attempts, 5])
+    }
+
+    output {
+        File modeled_segments_plot = "${output_dir_}/${entity_id}.modeled.png"
+    }
+}
diff --git a/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl b/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl
new file mode 100644
index 00000000000..ec83f4a75d2
--- /dev/null
+++ b/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl
@@ -0,0 +1,137 @@
+# Workflow for creating a GATK CNV Panel of Normals given a list of normal samples. Supports both WGS and WES.
+#
+# Notes:
+#
+# - Input file (normal_bams_list) must contain file paths to bam and bam index files separated by tabs in the following format:
+#   normal_bam_1    bam_idx_1
+#   normal_bam_2    bam_idx_2
+#   ...
+#
+# - The interval-list file is required for both WGS and WES workflows and should be a Picard or GATK-style interval list.
+#   These intervals will be padded on both sides by the amount specified by PreprocessIntervals.padding (default 250)
+#   and split into bins of length specified by PreprocessIntervals.bin_length (default 1000; specify 0 to skip binning).
+#   For WGS, the intervals should simply cover the autosomal chromosomes (sex chromosomes may be included, but care
+#   should be taken to 1) avoid creating panels of mixed sex, and 2) denoise case samples only with panels containing
+#   individuals of the same sex as the case samples).
+#
+# - Example invocation:
+#       java -jar cromwell.jar run cnv_somatic_panel_workflow.wdl myParameters.json
+#   See cnv_somatic_panel_workflow_template.json for a template json file to modify with your own parameters (please save
+#   your modified version with a different filename and do not commit to the gatk repository).
+#
+#############
+
+import "cnv_common_tasks.wdl" as CNVTasks
+
+workflow CNVSomaticPanelWorkflow {
+    File intervals
+    File normal_bams_list
+    Array[Array[String]]+ normal_bams = read_tsv(normal_bams_list)
+    String pon_entity_id
+    File ref_fasta_dict
+    File ref_fasta_fai
+    File ref_fasta
+    String gatk_docker
+    File? gatk4_jar_override
+    Int? mem_for_create_read_count_pon
+
+    # If true, AnnotateIntervals will be run to create GC annotations, and the PoN generated by
+    # CreateReadCountPanelOfNormals will use them to perform explicit GC correction before PCA
+    # is performed on subsequent cases; see the sketch just below.
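+    # A sketch of the corresponding entry in a parameter json file (the remaining entries follow
+    # the template described in the README for this workflow):
+    #     "CNVSomaticPanelWorkflow.do_explicit_gc_correction": true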
do_explicit_gc_correction
+
+    call CNVTasks.PreprocessIntervals {
+        input:
+            intervals = intervals,
+            ref_fasta_dict = ref_fasta_dict,
+            gatk4_jar_override = gatk4_jar_override,
+            gatk_docker = gatk_docker
+    }
+
+    if (select_first([do_explicit_gc_correction, false])) {
+        call CNVTasks.AnnotateIntervals {
+            input:
+                intervals = PreprocessIntervals.preprocessed_intervals,
+                ref_fasta = ref_fasta,
+                ref_fasta_fai = ref_fasta_fai,
+                ref_fasta_dict = ref_fasta_dict,
+                gatk4_jar_override = gatk4_jar_override,
+                gatk_docker = gatk_docker
+        }
+    }
+
+    scatter (normal_bam in normal_bams) {
+        call CNVTasks.CollectCounts {
+            input:
+                intervals = PreprocessIntervals.preprocessed_intervals,
+                bam = normal_bam[0],
+                bam_idx = normal_bam[1],
+                gatk4_jar_override = gatk4_jar_override,
+                gatk_docker = gatk_docker
+        }
+    }
+
+    call CreateReadCountPanelOfNormals {
+        input:
+            pon_entity_id = pon_entity_id,
+            read_count_files = CollectCounts.counts,
+            annotated_intervals = AnnotateIntervals.annotated_intervals,
+            gatk4_jar_override = gatk4_jar_override,
+            gatk_docker = gatk_docker,
+            mem = mem_for_create_read_count_pon
+    }
+
+    output {
+        File read_count_pon = CreateReadCountPanelOfNormals.read_count_pon
+    }
+}
+
+task CreateReadCountPanelOfNormals {
+    String pon_entity_id
+    Array[File] read_count_files
+    Float? minimum_interval_median_percentile
+    Float? maximum_zeros_in_sample_percentage
+    Float? maximum_zeros_in_interval_percentage
+    Float? extreme_sample_median_percentile
+    Boolean? do_impute_zeros
+    Float? extreme_outlier_truncation_percentile
+    Int? number_of_eigensamples
+    File? annotated_intervals    #do not perform explicit GC correction by default
+    File? gatk4_jar_override
+
+    # Runtime parameters
+    Int? mem
+    String gatk_docker
+    Int? preemptible_attempts
+    Int? disk_space_gb
+
+    # Mem is in units of GB, but the JVM heap is given in MB; reserve 500MB of machine memory for non-heap overhead
+    Int machine_mem = if defined(mem) then select_first([mem]) else 8
+    Int command_mem = machine_mem * 1000 - 500
+
+    command <<<
+        set -e
+        GATK_JAR=${default="/root/gatk.jar" gatk4_jar_override}
+
+        java -Xmx${command_mem}m -jar $GATK_JAR CreateReadCountPanelOfNormals \
+            --input ${sep=" --input " read_count_files} \
+            --minimumIntervalMedianPercentile ${default="10.0" minimum_interval_median_percentile} \
+            --maximumZerosInSamplePercentage ${default="5.0" maximum_zeros_in_sample_percentage} \
+            --maximumZerosInIntervalPercentage ${default="5.0" maximum_zeros_in_interval_percentage} \
+            --extremeSampleMedianPercentile ${default="2.5" extreme_sample_median_percentile} \
+            --doImputeZeros ${default="true" do_impute_zeros} \
+            --extremeOutlierTruncationPercentile ${default="0.1" extreme_outlier_truncation_percentile} \
+            --numberOfEigensamples ${default="20" number_of_eigensamples} \
+            ${"--annotatedIntervals " + annotated_intervals} \
+            --output ${pon_entity_id}.pon.hdf5
+    >>>
+
+    runtime {
+        docker: "${gatk_docker}"
+        memory: machine_mem + " GB"
+        disks: "local-disk " + select_first([disk_space_gb, 150]) + " HDD"
+        preemptible: select_first([preemptible_attempts, 2])
+    }
+
+    output {
+        File read_count_pon = "${pon_entity_id}.pon.hdf5"
+    }
+}
\ No newline at end of file
diff --git a/scripts/cnv_wdl/somatic_legacy/cnv_common_tasks.wdl b/scripts/cnv_wdl/somatic_legacy/cnv_common_tasks.wdl
index 520a0324ac9..2041bd221f0 100755
--- a/scripts/cnv_wdl/somatic_legacy/cnv_common_tasks.wdl
+++ b/scripts/cnv_wdl/somatic_legacy/cnv_common_tasks.wdl
@@ -18,12 +18,12 @@ task PadTargets {
     String filename = select_first([targets, ""])
     String base_filename = basename(filename, ".tsv")
 
-    command {
+    command <<<
        java -Xmx${default="1" mem}g -jar 
${gatk_jar} PadTargets \ --targets ${targets} \ --padding ${default="250" padding} \ --output ${base_filename}.padded.tsv - } + >>> runtime { docker: "${gatk_docker}" @@ -126,12 +126,12 @@ task AnnotateTargets { Int? preemptible_attempts Int? disk_space_gb - command { + command <<< java -Xmx${default=4 mem}g -jar ${gatk_jar} AnnotateTargets \ --targets ${targets} \ --reference ${ref_fasta} \ --output ${entity_id}.annotated.tsv - } + >>> runtime { docker: "${gatk_docker}" @@ -158,12 +158,12 @@ task CorrectGCBias { Int? preemptible_attempts Int? disk_space_gb - command { + command <<< java -Xmx${default=4 mem}g -jar ${gatk_jar} CorrectGCBias \ --input ${coverage} \ --targets ${annotated_targets} \ --output ${entity_id}.gc_corrected.tsv - } + >>> runtime { docker: "${gatk_docker}" diff --git a/scripts/cnv_wdl/somatic_legacy/cnv_somatic_allele_fraction_pair_workflow.wdl b/scripts/cnv_wdl/somatic_legacy/cnv_somatic_allele_fraction_pair_workflow.wdl index 3223b108fb4..4dfb7e421fd 100644 --- a/scripts/cnv_wdl/somatic_legacy/cnv_somatic_allele_fraction_pair_workflow.wdl +++ b/scripts/cnv_wdl/somatic_legacy/cnv_somatic_allele_fraction_pair_workflow.wdl @@ -248,7 +248,7 @@ task PlotACNVResults { # If optional output_dir not specified, use "." String output_dir_ = select_first([output_dir, "."]) - command { + command <<< mkdir -p ${output_dir_}; \ java -Xmx${default=4 mem}g -jar ${gatk_jar} PlotACNVResults \ --hets ${hets} \ @@ -257,7 +257,7 @@ task PlotACNVResults { -SD ${ref_fasta_dict} \ --output ${output_dir_} \ --outputPrefix ${entity_id} - } + >>> runtime { docker: "${gatk_docker}" @@ -289,14 +289,14 @@ task ConvertACNVResults { # If optional output_dir not specified, use "." String output_dir_ = select_first([output_dir, "."]) - command { + command <<< mkdir -p ${output_dir_}; \ java -Xmx${default=4 mem}g -jar ${gatk_jar} ConvertACNVResults \ --tumorHets ${hets} \ --tangentNormalized ${tn_coverage} \ --segments ${acnv_segments} \ --outputDir ${output_dir_} - } + >>> runtime { docker: "${gatk_docker}" diff --git a/scripts/cnv_wdl/somatic_legacy/cnv_somatic_copy_ratio_bam_workflow.wdl b/scripts/cnv_wdl/somatic_legacy/cnv_somatic_copy_ratio_bam_workflow.wdl index 09416567afd..af928c1cad9 100644 --- a/scripts/cnv_wdl/somatic_legacy/cnv_somatic_copy_ratio_bam_workflow.wdl +++ b/scripts/cnv_wdl/somatic_legacy/cnv_somatic_copy_ratio_bam_workflow.wdl @@ -119,7 +119,7 @@ task NormalizeSomaticReadCounts { Int? preemptible_attempts Int? disk_space_gb - command { + command <<< java -Xmx${default=4 mem}g -jar ${gatk_jar} NormalizeSomaticReadCounts \ --input ${coverage} \ --targets ${padded_targets} \ @@ -128,7 +128,7 @@ task NormalizeSomaticReadCounts { --factorNormalizedOutput ${entity_id}.fnt.tsv \ --preTangentNormalized ${entity_id}.preTN.tsv \ --betaHatsOutput ${entity_id}.betaHats.tsv - } + >>> runtime { docker: "${gatk_docker}" @@ -231,13 +231,13 @@ task CallSegments { Int? preemptible_attempts Int? disk_space_gb - command { + command <<< java -Xmx${default=4 mem}g -jar ${gatk_jar} CallSegments \ --tangentNormalized ${tn_coverage} \ --segments ${segments} \ --legacy false \ --output ${entity_id}.called - } + >>> runtime { docker: "${gatk_docker}" @@ -269,7 +269,7 @@ task PlotSegmentedCopyRatio { # If optional output_dir not specified, use "." 
String output_dir_ = select_first([output_dir, "."]) - command { + command <<< mkdir -p ${output_dir_}; \ java -Xmx${default=4 mem}g -jar ${gatk_jar} PlotSegmentedCopyRatio \ --tangentNormalized ${tn_coverage} \ @@ -278,7 +278,7 @@ task PlotSegmentedCopyRatio { -SD ${ref_fasta_dict} \ --output ${output_dir_} \ --outputPrefix ${entity_id} - } + >>> runtime { docker: "${gatk_docker}" diff --git a/scripts/cnv_wdl/somatic_legacy/cnv_somatic_panel_workflow.wdl b/scripts/cnv_wdl/somatic_legacy/cnv_somatic_panel_workflow.wdl index 75226576532..a1310909901 100644 --- a/scripts/cnv_wdl/somatic_legacy/cnv_somatic_panel_workflow.wdl +++ b/scripts/cnv_wdl/somatic_legacy/cnv_somatic_panel_workflow.wdl @@ -120,12 +120,12 @@ task CombineReadCounts { Int? preemptible_attempts Int? disk_space_gb - command { + command <<< java -Xmx${default=4 mem}g -jar ${gatk_jar} CombineReadCounts \ --input ${sep=" --input " coverage_file_list} \ --maxOpenFiles ${default=100 max_open_files} \ --output ${combined_entity_id}.tsv - } + >>> runtime { docker: "${gatk_docker}" @@ -152,7 +152,7 @@ task CreatePanelOfNormals { Int? preemptible_attempts Int? disk_space_gb - command { + command <<< # If there are no removed samples the output file still needs to be created touch "${pon_entity_id}.pon.removed_samples.txt" ; \ java -Xmx${default=4 mem}g -jar ${gatk_jar} CreatePanelOfNormals \ @@ -161,7 +161,7 @@ task CreatePanelOfNormals { --truncatePercentileThreshold 0.1 \ --noQC ${default="false" no_qc} \ --output ${pon_entity_id}.pon - } + >>> runtime { docker: "${gatk_docker}" diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervals.java index 2a3cdd9f4d9..623c973671d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervals.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervals.java @@ -8,10 +8,10 @@ import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup; import org.broadinstitute.hellbender.engine.*; -import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotatedInterval; -import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotatedIntervalCollection; -import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotationSet; import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberArgumentValidationUtils; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotationSet; import org.broadinstitute.hellbender.utils.Nucleotide; import org.broadinstitute.hellbender.utils.SimpleInterval; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegments.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegments.java index 09537ff243f..b9124f0a1e6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegments.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegments.java @@ -7,12 +7,9 @@ import org.broadinstitute.hellbender.cmdline.CommandLineProgram; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import 
org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup; -import org.broadinstitute.hellbender.tools.copynumber.coverage.caller.CalledCopyRatioSegmentCollection; -import org.broadinstitute.hellbender.tools.copynumber.coverage.caller.ReCapSegCaller; -import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection; -import org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation.CopyRatioSegmentCollection; -import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument; -import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.tools.copynumber.caller.SimpleCopyRatioCaller; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CalledCopyRatioSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioSegmentCollection; import java.io.File; @@ -20,16 +17,16 @@ * Calls segments as amplified, deleted or copy number neutral given files containing denoised copy ratios * and a list of segments. * - * @author David Benjamin - * *

 * <h3>Examples</h3>
 *
 * <pre>
 * gatk-launch --javaOptions "-Xmx4g" CallCopyRatioSegments \
- *   --denoisedCopyRatios tumor.denoisedCR.tsv \
- *   --segments tumor.cr.seg \
+ *   --input tumor.cr.seg \
 *   --output tumor.called
 * </pre>
+ * + * @author David Benjamin <davidben@broadinstitute.org> + * @author Samuel Lee <slee@broadinstitute.org> */ @CommandLineProgramProperties( summary = "Call copy-ratio segments as amplified, deleted, or copy number neutral.", @@ -39,18 +36,19 @@ @DocumentedFeature @BetaFeature public final class CallCopyRatioSegments extends CommandLineProgram { + public static final String NEUTRAL_SEGMENT_COPY_RATIO_THRESHOLD_LONG_NAME = "neutralSegmentCopyRatioThreshold"; + public static final String NEUTRAL_SEGMENT_COPY_RATIO_THRESHOLD_SHORT_NAME = "neutralTh"; - @Argument( - doc = "Input file containing denoised copy-ratio profile (output of DenoiseReadCounts).", - fullName = CopyNumberStandardArgument.DENOISED_COPY_RATIOS_FILE_LONG_NAME, - shortName = CopyNumberStandardArgument.DENOISED_COPY_RATIOS_FILE_SHORT_NAME - ) - private File inputDenoisedCopyRatiosFile; + public static final String OUTLIER_NEUTRAL_SEGMENT_COPY_RATIO_Z_SCORE_THRESHOLD_LONG_NAME = "outlierNeutralSegmentCopyRatioZScoreThreshold"; + public static final String OUTLIER_NEUTRAL_SEGMENT_COPY_RATIO_Z_SCORE_THRESHOLD_SHORT_NAME = "outlierTh"; + + public static final String CALLING_COPY_RATIO_Z_SCORE_THRESHOLD_LONG_NAME = "callingCopyRatioZScoreThreshold"; + public static final String CALLING_COPY_RATIO_Z_SCORE_THRESHOLD_SHORT_NAME = "callingTh"; @Argument( doc = "Input file containing copy-ratio segments (.cr.seg output of ModelSegments).", - fullName = CopyNumberStandardArgument.SEGMENTS_FILE_LONG_NAME, - shortName = CopyNumberStandardArgument.SEGMENTS_FILE_SHORT_NAME + fullName = StandardArgumentDefinitions.INPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME ) private File segmentsFile; @@ -61,15 +59,43 @@ public final class CallCopyRatioSegments extends CommandLineProgram { ) private File outFile; + @Argument( + doc = "Threshold on non-log2 copy ratio used for determining copy-neutral segments. " + + "If non-log2 copy ratio is within 1 +/- this threshold, a segment is considered copy-neutral.", + fullName = NEUTRAL_SEGMENT_COPY_RATIO_THRESHOLD_LONG_NAME, + shortName = NEUTRAL_SEGMENT_COPY_RATIO_THRESHOLD_SHORT_NAME, + optional = true + ) + private double neutralSegmentCopyRatioThreshold = 0.1; + + @Argument( + doc = "Threshold on z-score of non-log2 copy ratio used for determining outlier copy-neutral segments. " + + "If non-log2 copy ratio z-score is above this threshold for a copy-neutral segment, " + + "it is considered an outlier and not used in the calculation of the length-weighted mean and standard deviation " + + "used for calling.", + fullName = OUTLIER_NEUTRAL_SEGMENT_COPY_RATIO_Z_SCORE_THRESHOLD_LONG_NAME, + shortName = OUTLIER_NEUTRAL_SEGMENT_COPY_RATIO_Z_SCORE_THRESHOLD_SHORT_NAME, + optional = true, + minValue = 0. + ) + private double outlierNeutralSegmentCopyRatioZScoreThreshold = 2.; + + @Argument( + doc = "Threshold on z-score of non-log2 copy ratio used for calling segments.", + fullName = CALLING_COPY_RATIO_Z_SCORE_THRESHOLD_LONG_NAME, + shortName = CALLING_COPY_RATIO_Z_SCORE_THRESHOLD_SHORT_NAME, + optional = true, + minValue = 0. 
+    )
+    private double callingCopyRatioZScoreThreshold = 2.;
+
     @Override
     protected Object doWork() {
-        final CopyRatioCollection denoisedCopyRatios = new CopyRatioCollection(inputDenoisedCopyRatiosFile);
         final CopyRatioSegmentCollection copyRatioSegments = new CopyRatioSegmentCollection(segmentsFile);
 
-        Utils.validateArg(denoisedCopyRatios.getSampleName().equals(copyRatioSegments.getSampleName()),
-                "Denoised copy ratios and copy-ratio segments do not have the same sample name.");
-
-        final CalledCopyRatioSegmentCollection calledCopyRatioSegments =
-                new ReCapSegCaller(denoisedCopyRatios, copyRatioSegments).makeCalls();
+        final CalledCopyRatioSegmentCollection calledCopyRatioSegments =
+                new SimpleCopyRatioCaller(copyRatioSegments,
+                        neutralSegmentCopyRatioThreshold, outlierNeutralSegmentCopyRatioZScoreThreshold, callingCopyRatioZScoreThreshold)
+                        .makeCalls();
 
         calledCopyRatioSegments.write(outFile);
 
         return "SUCCESS";
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCounts.java
index 1f877728325..aad65cd3c22 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCounts.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCounts.java
@@ -14,7 +14,7 @@
 import org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter;
 import org.broadinstitute.hellbender.engine.filters.ReadFilter;
 import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
-import org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount.AllelicCountCollector;
+import org.broadinstitute.hellbender.tools.copynumber.datacollection.AllelicCountCollector;
 import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
 import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleNameUtils;
 import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCounts.java
index ba9b1e5324d..bd66bcc29df 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCounts.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCounts.java
@@ -19,12 +19,12 @@
 import org.broadinstitute.hellbender.engine.filters.ReadFilter;
 import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
 import org.broadinstitute.hellbender.exceptions.GATKException;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCount;
-import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection;
 import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberArgumentValidationUtils;
+import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
 import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata;
 import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleNameUtils;
 import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount;
 import org.broadinstitute.hellbender.utils.IntervalUtils;
 import org.broadinstitute.hellbender.utils.SimpleInterval;
 import org.broadinstitute.hellbender.utils.Utils;
diff --git 
a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java index 019c78b49d0..b20db1143a5 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java @@ -12,10 +12,10 @@ import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup; import org.broadinstitute.hellbender.engine.spark.SparkCommandLineProgram; import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.tools.copynumber.annotation.GCBiasCorrector; -import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.HDF5SVDReadCountPanelOfNormals; -import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.denoising.GCBiasCorrector; +import org.broadinstitute.hellbender.tools.copynumber.denoising.HDF5SVDReadCountPanelOfNormals; import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.io.IOUtils; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCounts.java index a2de829ffde..deef2ef33ec 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCounts.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCounts.java @@ -10,13 +10,9 @@ import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup; import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.tools.copynumber.annotation.GCBiasCorrector; -import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.HDF5SVDReadCountPanelOfNormals; -import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.SVDDenoisedCopyRatioResult; -import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.SVDDenoisingUtils; -import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.SVDReadCountPanelOfNormals; -import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.denoising.*; import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.io.IOUtils; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/ModelSegments.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/ModelSegments.java new file mode 100644 index 00000000000..3007cb20b70 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/ModelSegments.java @@ -0,0 +1,573 @@ +package 
org.broadinstitute.hellbender.tools.copynumber; + +import com.google.common.collect.ImmutableSet; +import htsjdk.samtools.util.OverlapDetector; +import org.apache.commons.math3.special.Beta; +import org.apache.commons.math3.util.FastMath; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.CommandLineProgram; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.*; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatioSegment; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.MultidimensionalSegment; +import org.broadinstitute.hellbender.tools.copynumber.models.AlleleFractionPrior; +import org.broadinstitute.hellbender.tools.copynumber.models.MultidimensionalModeller; +import org.broadinstitute.hellbender.tools.copynumber.segmentation.AlleleFractionKernelSegmenter; +import org.broadinstitute.hellbender.tools.copynumber.segmentation.CopyRatioKernelSegmenter; +import org.broadinstitute.hellbender.tools.copynumber.segmentation.MultidimensionalKernelSegmenter; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.io.IOUtils; + +import java.io.File; +import java.util.*; +import java.util.stream.Collectors; + +/** + * @author Samuel Lee <slee@broadinstitute.org> + */ +@CommandLineProgramProperties( + summary = "Model segmented copy ratio from denoised read counts and segmented minor-allele fraction from allelic counts.", + oneLineSummary = "Model segmented copy ratio from denoised read counts and segmented minor-allele fraction from allelic counts.", + programGroup = CopyNumberProgramGroup.class +) +@DocumentedFeature +@BetaFeature +public final class ModelSegments extends CommandLineProgram { + //filename tags for output + public static final String HET_ALLELIC_COUNTS_FILE_SUFFIX = ".hets.tsv"; + public static final String NORMAL_HET_ALLELIC_COUNTS_FILE_SUFFIX = ".hets.normal.tsv"; + public static final String SEGMENTS_FILE_SUFFIX = ".seg"; + public static final String BEGIN_FIT_FILE_TAG = ".modelBegin"; + public static final String FINAL_FIT_FILE_TAG = ".modelFinal"; + public static final String COPY_RATIO_MODEL_PARAMETER_FILE_SUFFIX = ".cr.param"; + public static final String ALLELE_FRACTION_MODEL_PARAMETER_FILE_SUFFIX = ".af.param"; + public static final String COPY_RATIO_SEGMENTS_FOR_CALLER_FILE = ".cr" + SEGMENTS_FILE_SUFFIX; + + public static final String MAXIMUM_NUMBER_OF_SEGMENTS_PER_CHROMOSOME_LONG_NAME = "maxNumSegmentsPerChromosome"; + public static final String MAXIMUM_NUMBER_OF_SEGMENTS_PER_CHROMOSOME_SHORT_NAME = "maxNumSegsPerChr"; + + public static final String MINIMUM_TOTAL_ALLELE_COUNT_LONG_NAME = 
"minTotalAlleleCount"; + public static final String MINIMUM_TOTAL_ALLELE_COUNT_SHORT_NAME = "minAC"; + + public static final String GENOTYPING_HOMOZYGOUS_LOG_RATIO_THRESHOLD_LONG_NAME = "genotypingHomozygousLogRatioThreshold"; + public static final String GENOTYPING_HOMOZYGOUS_LOG_RATIO_THRESHOLD_SHORT_NAME = "homLRT"; + + public static final String GENOTYPING_BASE_ERROR_RATE_LONG_NAME = "genotypingBaseErrorRate"; + public static final String GENOTYPING_BASE_ERROR_RATE_SHORT_NAME = "baseErrRate"; + + public static final String KERNEL_VARIANCE_COPY_RATIO_LONG_NAME = "kernelVarianceCopyRatio"; + public static final String KERNEL_VARIANCE_COPY_RATIO_SHORT_NAME = "kernVarCR"; + + public static final String KERNEL_VARIANCE_ALLELE_FRACTION_LONG_NAME = "kernelVarianceAlleleFraction"; + public static final String KERNEL_VARIANCE_ALLELE_FRACTION_SHORT_NAME = "kernVarAF"; + + public static final String KERNEL_SCALING_ALLELE_FRACTION_LONG_NAME = "kernelScalingAlleleFraction"; + public static final String KERNEL_SCALING_ALLELE_FRACTION_SHORT_NAME = "kernSclAF"; + + public static final String KERNEL_APPROXIMATION_DIMENSION_LONG_NAME = "kernelApproximationDimension"; + public static final String KERNEL_APPROXIMATION_DIMENSION_SHORT_NAME = "kernApproxDim"; + + public static final String WINDOW_SIZE_LONG_NAME = "windowSize"; + public static final String WINDOW_SIZE_SHORT_NAME = "winSize"; + + public static final String NUM_CHANGEPOINTS_PENALTY_FACTOR_LONG_NAME = "numChangepointsPenaltyFactor"; + public static final String NUM_CHANGEPOINTS_PENALTY_FACTOR_SHORT_NAME = "numChangeptsPen"; + + public static final String MINOR_ALLELE_FRACTION_PRIOR_ALPHA_LONG_NAME = "minorAlleleFractionPriorAlpha"; + public static final String MINOR_ALLELE_FRACTION_PRIOR_ALPHA_SHORT_NAME = "alphaAF"; + + public static final String NUM_SAMPLES_COPY_RATIO_LONG_NAME = "numSamplesCopyRatio"; + public static final String NUM_SAMPLES_COPY_RATIO_SHORT_NAME = "numSampCR"; + + public static final String NUM_BURN_IN_COPY_RATIO_LONG_NAME = "numBurnInCopyRatio"; + public static final String NUM_BURN_IN_COPY_RATIO_SHORT_NAME = "numBurnCR"; + + public static final String NUM_SAMPLES_ALLELE_FRACTION_LONG_NAME = "numSamplesAlleleFraction"; + public static final String NUM_SAMPLES_ALLELE_FRACTION_SHORT_NAME = "numSampAF"; + + public static final String NUM_BURN_IN_ALLELE_FRACTION_LONG_NAME = "numBurnInAlleleFraction"; + public static final String NUM_BURN_IN_ALLELE_FRACTION_SHORT_NAME = "numBurnAF"; + + public static final String SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_COPY_RATIO_LONG_NAME = "smoothingThresholdCopyRatio"; + public static final String SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_COPY_RATIO_SHORT_NAME = "smoothThCR"; + + public static final String SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_ALLELE_FRACTION_LONG_NAME = "smoothingThresholdAlleleFraction"; + public static final String SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_ALLELE_FRACTION_SHORT_NAME = "smoothThAF"; + + public static final String MAX_NUM_SMOOTHING_ITERATIONS_LONG_NAME = "maxNumSmoothingIterations"; + public static final String MAX_NUM_SMOOTHING_ITERATIONS_SHORT_NAME = "maxNumSmoothIter"; + + public static final String NUM_SMOOTHING_ITERATIONS_PER_FIT_LONG_NAME = "numSmoothingIterationsPerFit"; + public static final String NUM_SMOOTHING_ITERATIONS_PER_FIT_SHORT_NAME = "numSmoothIterPerFit"; + + @Argument( + doc = "Input file containing denoised copy-ratio profile (output of DenoiseReadCounts).", + fullName = CopyNumberStandardArgument.DENOISED_COPY_RATIOS_FILE_LONG_NAME, + shortName 
= CopyNumberStandardArgument.DENOISED_COPY_RATIOS_FILE_SHORT_NAME, + optional = true + ) + private File inputDenoisedCopyRatiosFile = null; + + @Argument( + doc = "Input file containing allelic counts (output of CollectAllelicCounts).", + fullName = CopyNumberStandardArgument.ALLELIC_COUNTS_FILE_LONG_NAME, + shortName = CopyNumberStandardArgument.ALLELIC_COUNTS_FILE_SHORT_NAME, + optional = true + ) + private File inputAllelicCountsFile = null; + + @Argument( + doc = "Input file containing allelic counts for a matched normal (output of CollectAllelicCounts).", + fullName = CopyNumberStandardArgument.NORMAL_ALLELIC_COUNTS_FILE_LONG_NAME, + shortName = CopyNumberStandardArgument.NORMAL_ALLELIC_COUNTS_FILE_SHORT_NAME, + optional = true + ) + private File inputNormalAllelicCountsFile = null; + + @Argument( + doc = "Prefix for output files.", + fullName = CopyNumberStandardArgument.OUTPUT_PREFIX_LONG_NAME, + shortName = CopyNumberStandardArgument.OUTPUT_PREFIX_SHORT_NAME + ) + private String outputPrefix; + + @Argument( + doc = "Output directory.", + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME + ) + private String outputDir; + + @Argument( + doc = "Maximum number of segments allowed per chromosome.", + fullName = MAXIMUM_NUMBER_OF_SEGMENTS_PER_CHROMOSOME_LONG_NAME, + shortName = MAXIMUM_NUMBER_OF_SEGMENTS_PER_CHROMOSOME_SHORT_NAME, + minValue = 1, + optional = true + ) + private int maxNumSegmentsPerChromosome = 1000; + + @Argument( + doc = "Minimum total count for filtering allelic counts, if available.", + fullName = MINIMUM_TOTAL_ALLELE_COUNT_LONG_NAME, + shortName = MINIMUM_TOTAL_ALLELE_COUNT_SHORT_NAME, + minValue = 0, + optional = true + ) + private int minTotalAlleleCount = 30; + + @Argument( + doc = "Log-ratio threshold for genotyping and filtering homozygous allelic counts, if available. " + + "Increasing this value will increase the number of sites assumed to be heterozygous for modeling.", + fullName = GENOTYPING_HOMOZYGOUS_LOG_RATIO_THRESHOLD_LONG_NAME, + shortName = GENOTYPING_HOMOZYGOUS_LOG_RATIO_THRESHOLD_SHORT_NAME, + optional = true + ) + private double genotypingHomozygousLogRatioThreshold = -10.; + + @Argument( + doc = "Maximum base-error rate for genotyping and filtering homozygous allelic counts, if available. " + + "The likelihood for an allelic count to be generated from a homozygous site will be integrated " + + "from zero base-error rate up to this value. Decreasing this value will increase " + + "the number of sites assumed to be heterozygous for modeling.", + fullName = GENOTYPING_BASE_ERROR_RATE_LONG_NAME, + shortName = GENOTYPING_BASE_ERROR_RATE_SHORT_NAME, + optional = true + ) + private double genotypingBaseErrorRate = 5E-2; + + @Argument( + doc = "Variance of Gaussian kernel for copy-ratio segmentation, if performed. If zero, a linear kernel will be used.", + fullName = KERNEL_VARIANCE_COPY_RATIO_LONG_NAME, + shortName = KERNEL_VARIANCE_COPY_RATIO_SHORT_NAME, + minValue = 0., + optional = true + ) + private double kernelVarianceCopyRatio = 0.; + + @Argument( + doc = "Variance of Gaussian kernel for allele-fraction segmentation, if performed. 
If zero, a linear kernel will be used.",
+            fullName = KERNEL_VARIANCE_ALLELE_FRACTION_LONG_NAME,
+            shortName = KERNEL_VARIANCE_ALLELE_FRACTION_SHORT_NAME,
+            minValue = 0.,
+            optional = true
+    )
+    private double kernelVarianceAlleleFraction = 0.025;
+
+    @Argument(
+            doc = "Relative scaling S of the kernel K_AF for allele-fraction segmentation to the kernel K_CR for copy-ratio segmentation. " +
+                    "If multidimensional segmentation is performed, the total kernel used will be K_CR + S * K_AF.",
+            fullName = KERNEL_SCALING_ALLELE_FRACTION_LONG_NAME,
+            shortName = KERNEL_SCALING_ALLELE_FRACTION_SHORT_NAME,
+            minValue = 0.,
+            optional = true
+    )
+    private double kernelScalingAlleleFraction = 1.0;
+
+    @Argument(
+            doc = "Dimension of the kernel approximation. A subsample containing this number of data points " +
+                    "will be used to construct the approximation for each chromosome. " +
+                    "If the total number of data points in a chromosome is less " +
+                    "than this number, then all data points in the chromosome will be used. " +
+                    "Time complexity scales quadratically and space complexity scales linearly with this parameter.",
+            fullName = KERNEL_APPROXIMATION_DIMENSION_LONG_NAME,
+            shortName = KERNEL_APPROXIMATION_DIMENSION_SHORT_NAME,
+            minValue = 1,
+            optional = true
+    )
+    private int kernelApproximationDimension = 100;
+
+    @Argument(
+            doc = "Window sizes to use for calculating local changepoint costs. " +
+                    "For each window size, the cost for each data point to be a changepoint will be calculated " +
+                    "assuming that it demarcates two adjacent segments of that size. " +
+                    "Including small (large) window sizes will increase sensitivity to small (large) events. " +
+                    "Duplicate values will be ignored.",
+            fullName = WINDOW_SIZE_LONG_NAME,
+            shortName = WINDOW_SIZE_SHORT_NAME,
+            minValue = 1,
+            optional = true
+    )
+    private List<Integer> windowSizes = new ArrayList<>(Arrays.asList(8, 16, 32, 64, 128, 256));
+
+    @Argument(
+            doc = "Factor A for the penalty on the number of changepoints per chromosome for segmentation. " +
+                    "Adds a penalty of the form A * C * [1 + log (N / C)], " +
+                    "where C is the number of changepoints in the chromosome, " +
+                    "to the cost function for each chromosome. " +
+                    "Must be non-negative.",
+            fullName = NUM_CHANGEPOINTS_PENALTY_FACTOR_LONG_NAME,
+            shortName = NUM_CHANGEPOINTS_PENALTY_FACTOR_SHORT_NAME,
+            minValue = 0.,
+            optional = true
+    )
+    private double numChangepointsPenaltyFactor = 1.;
+
+    @Argument(
+            doc = "Alpha hyperparameter for the 4-parameter beta-distribution prior on segment minor-allele fraction. " +
+                    "The prior for the minor-allele fraction f in each segment is assumed to be Beta(alpha, 1, 0, 1/2). 
" + + "Increasing this hyperparameter will reduce the effect of reference bias at the expense of sensitivity.", + fullName = MINOR_ALLELE_FRACTION_PRIOR_ALPHA_LONG_NAME, + shortName = MINOR_ALLELE_FRACTION_PRIOR_ALPHA_SHORT_NAME, + optional = true, + minValue = 1 + ) + private double minorAlleleFractionPriorAlpha = 25.; + + @Argument( + doc = "Total number of MCMC samples for copy-ratio model.", + fullName = NUM_SAMPLES_COPY_RATIO_LONG_NAME, + shortName = NUM_SAMPLES_COPY_RATIO_SHORT_NAME, + optional = true, + minValue = 1 + ) + private int numSamplesCopyRatio = 100; + + @Argument( + doc = "Number of burn-in samples to discard for copy-ratio model.", + fullName = NUM_BURN_IN_COPY_RATIO_LONG_NAME, + shortName = NUM_BURN_IN_COPY_RATIO_SHORT_NAME, + optional = true, + minValue = 0 + ) + private int numBurnInCopyRatio = 50; + + @Argument( + doc = "Total number of MCMC samples for allele-fraction model.", + fullName = NUM_SAMPLES_ALLELE_FRACTION_LONG_NAME, + shortName = NUM_SAMPLES_ALLELE_FRACTION_SHORT_NAME, + optional = true, + minValue = 1 + ) + private int numSamplesAlleleFraction = 100; + + @Argument( + doc = "Number of burn-in samples to discard for allele-fraction model.", + fullName = NUM_BURN_IN_ALLELE_FRACTION_LONG_NAME, + shortName = NUM_BURN_IN_ALLELE_FRACTION_SHORT_NAME, + optional = true, + minValue = 0 + ) + private int numBurnInAlleleFraction = 50; + + @Argument( + doc = "Number of 10% equal-tailed credible-interval widths to use for copy-ratio segmentation smoothing.", + fullName = SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_COPY_RATIO_LONG_NAME, + shortName = SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_COPY_RATIO_SHORT_NAME, + optional = true, + minValue = 0. + ) + private double smoothingCredibleIntervalThresholdCopyRatio = 2.; + + @Argument( + doc = "Number of 10% equal-tailed credible-interval widths to use for allele-fraction segmentation smoothing.", + fullName = SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_ALLELE_FRACTION_LONG_NAME, + shortName = SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_ALLELE_FRACTION_SHORT_NAME, + optional = true, + minValue = 0. + ) + private double smoothingCredibleIntervalThresholdAlleleFraction = 2.; + + @Argument( + doc = "Maximum number of iterations allowed for segmentation smoothing.", + fullName = MAX_NUM_SMOOTHING_ITERATIONS_LONG_NAME, + shortName = MAX_NUM_SMOOTHING_ITERATIONS_SHORT_NAME, + optional = true, + minValue = 0 + ) + private int maxNumSmoothingIterations = 25; + + @Argument( + doc = "Number of segmentation-smoothing iterations per MCMC model refit. " + + "(Increasing this will decrease runtime, but the final number of segments may be higher. 
" + + "Setting this to 0 will completely disable model refitting between iterations.)", + fullName = NUM_SMOOTHING_ITERATIONS_PER_FIT_LONG_NAME, + shortName = NUM_SMOOTHING_ITERATIONS_PER_FIT_SHORT_NAME, + optional = true, + minValue = 0 + ) + private int numSmoothingIterationsPerFit = 0; + + //initialize data variables, some of which may be optional + private CopyRatioCollection denoisedCopyRatios = null; + private AllelicCountCollection hetAllelicCounts = null; + + @Override + protected Object doWork() { + validateArguments(); + + //perform one-dimensional or multidimensional segmentation as appropriate and write to file + //(for use by CallCopyRatioSegments, if copy ratios are available) + final MultidimensionalSegmentCollection multidimensionalSegments; + if (inputDenoisedCopyRatiosFile != null && inputAllelicCountsFile == null) { + readDenoisedCopyRatios(); + final CopyRatioSegmentCollection copyRatioSegments = performCopyRatioSegmentation(); + multidimensionalSegments = new MultidimensionalSegmentCollection( + copyRatioSegments.getSampleMetadata(), + copyRatioSegments.getRecords().stream() + .map(s -> new MultidimensionalSegment(s.getInterval(), s.getNumPoints(), 0, s.getMeanLog2CopyRatio())) + .collect(Collectors.toList())); + hetAllelicCounts = new AllelicCountCollection(denoisedCopyRatios.getSampleMetadata(), Collections.emptyList()); //create an empty collection with the appropriate name + } else if (inputDenoisedCopyRatiosFile == null && inputAllelicCountsFile != null) { + readAndFilterAllelicCounts(); + final AlleleFractionSegmentCollection alleleFractionSegments = performAlleleFractionSegmentation(); + multidimensionalSegments = new MultidimensionalSegmentCollection( + alleleFractionSegments.getSampleMetadata(), + alleleFractionSegments.getRecords().stream() + .map(s -> new MultidimensionalSegment(s.getInterval(), 0, s.getNumPoints(), Double.NaN)) + .collect(Collectors.toList())); + denoisedCopyRatios = new CopyRatioCollection(hetAllelicCounts.getSampleMetadata(), Collections.emptyList()); //create an empty collection with the appropriate name + } else { + readDenoisedCopyRatios(); + readAndFilterAllelicCounts(); + multidimensionalSegments = new MultidimensionalKernelSegmenter(denoisedCopyRatios, hetAllelicCounts) + .findSegmentation(maxNumSegmentsPerChromosome, + kernelVarianceCopyRatio, kernelVarianceAlleleFraction, kernelScalingAlleleFraction, kernelApproximationDimension, + ImmutableSet.copyOf(windowSizes).asList(), + numChangepointsPenaltyFactor, numChangepointsPenaltyFactor); + } + + logger.info("Modeling available denoised copy ratios and heterozygous allelic counts..."); + //initial MCMC model fitting performed by MultidimensionalModeller constructor + final AlleleFractionPrior alleleFractionPrior = new AlleleFractionPrior(minorAlleleFractionPriorAlpha); + final MultidimensionalModeller modeller = new MultidimensionalModeller( + multidimensionalSegments, denoisedCopyRatios, hetAllelicCounts, alleleFractionPrior, + numSamplesCopyRatio, numBurnInCopyRatio, + numSamplesAlleleFraction, numBurnInAlleleFraction); + + //write initial segments and parameters to file + writeModeledSegmentsAndParameterFiles(modeller, BEGIN_FIT_FILE_TAG); + + //segmentation smoothing + modeller.smoothSegments( + maxNumSmoothingIterations, numSmoothingIterationsPerFit, + smoothingCredibleIntervalThresholdCopyRatio, smoothingCredibleIntervalThresholdAlleleFraction); + + //write final segments and parameters to file + writeModeledSegmentsAndParameterFiles(modeller, FINAL_FIT_FILE_TAG); + + //write 
final segments for copy-ratio caller (TODO remove this and MEAN_LOG2_COPY_RATIO column when new caller is available) + final OverlapDetector copyRatioMidpointOverlapDetector = denoisedCopyRatios.getMidpointOverlapDetector(); + final CopyRatioSegmentCollection copyRatioSegmentsFinal = new CopyRatioSegmentCollection( + modeller.getModeledSegments().getSampleMetadata(), + modeller.getModeledSegments().getIntervals().stream() + .map(s -> new CopyRatioSegment(s, new ArrayList<>(copyRatioMidpointOverlapDetector.getOverlaps(s)))) + .collect(Collectors.toList())); + writeSegments(copyRatioSegmentsFinal, COPY_RATIO_SEGMENTS_FOR_CALLER_FILE); + + logger.info("SUCCESS: ModelSegments run complete."); + + return "SUCCESS"; + } + + private void validateArguments() { + Utils.nonNull(outputPrefix); + Utils.validateArg(!(inputDenoisedCopyRatiosFile == null && inputAllelicCountsFile == null), + "Must provide at least a denoised copy-ratio profile file or an allelic-counts file."); + Utils.validateArg(!(inputAllelicCountsFile == null && inputNormalAllelicCountsFile != null), + "Must provide an allelic-counts file for the case sample to run in matched-normal mode."); + if (inputDenoisedCopyRatiosFile != null) { + IOUtils.canReadFile(inputDenoisedCopyRatiosFile); + } + if (inputAllelicCountsFile != null) { + IOUtils.canReadFile(inputAllelicCountsFile); + } + if (inputNormalAllelicCountsFile != null) { + IOUtils.canReadFile(inputNormalAllelicCountsFile); + } + if (!new File(outputDir).exists()) { + throw new UserException(String.format("Output directory %s does not exist.", outputDir)); + } + } + + private void readDenoisedCopyRatios() { + logger.info(String.format("Reading denoised copy-ratio profile file (%s)...", inputDenoisedCopyRatiosFile)); + denoisedCopyRatios = new CopyRatioCollection(inputDenoisedCopyRatiosFile); + } + + private CopyRatioSegmentCollection performCopyRatioSegmentation() { + logger.info("Starting segmentation of denoised copy ratios..."); + final int maxNumChangepointsPerChromosome = maxNumSegmentsPerChromosome - 1; + return new CopyRatioKernelSegmenter(denoisedCopyRatios) + .findSegmentation(maxNumChangepointsPerChromosome, kernelVarianceCopyRatio, kernelApproximationDimension, + ImmutableSet.copyOf(windowSizes).asList(), + numChangepointsPenaltyFactor, numChangepointsPenaltyFactor); + } + + private void readAndFilterAllelicCounts() { + //read in case sample + logger.info(String.format("Reading allelic-counts file (%s)...", inputAllelicCountsFile)); + final AllelicCountCollection unfilteredAllelicCounts = new AllelicCountCollection(inputAllelicCountsFile); + final SampleMetadata sampleMetadata = unfilteredAllelicCounts.getSampleMetadata(); + + //filter on total count in case sample + logger.info(String.format("Filtering allelic counts with total count less than %d...", minTotalAlleleCount)); + AllelicCountCollection filteredAllelicCounts = new AllelicCountCollection( + sampleMetadata, + unfilteredAllelicCounts.getRecords().stream() + .filter(ac -> ac.getTotalReadCount() >= minTotalAlleleCount) + .collect(Collectors.toList())); + logger.info(String.format("Retained %d / %d sites after filtering on total count...", + filteredAllelicCounts.getRecords().size(), unfilteredAllelicCounts.getRecords().size())); + + //filter on overlap with copy-ratio intervals, if available + if (denoisedCopyRatios != null) { + logger.info("Filtering allelic-count sites not overlapping with copy-ratio intervals..."); + final OverlapDetector copyRatioOverlapDetector = denoisedCopyRatios.getOverlapDetector(); + 
filteredAllelicCounts = new AllelicCountCollection( + sampleMetadata, + filteredAllelicCounts.getRecords().stream() + .filter(copyRatioOverlapDetector::overlapsAny) + .collect(Collectors.toList())); + logger.info(String.format("Retained %d / %d sites after filtering on overlap with copy-ratio intervals...", + filteredAllelicCounts.getRecords().size(), unfilteredAllelicCounts.getRecords().size())); + } + + if (inputNormalAllelicCountsFile == null) { + //filter on homozygosity in case sample + logger.info("No matched normal was provided, not running in matched-normal mode..."); + logger.info("Performing binomial testing and filtering homozygous allelic counts..."); + hetAllelicCounts = new AllelicCountCollection( + sampleMetadata, + filteredAllelicCounts.getRecords().stream() + .filter(ac -> calculateHomozygousLogRatio(ac, genotypingBaseErrorRate) < genotypingHomozygousLogRatioThreshold) + .collect(Collectors.toList())); + final File hetAllelicCountsFile = new File(outputDir, outputPrefix + HET_ALLELIC_COUNTS_FILE_SUFFIX); + hetAllelicCounts.write(hetAllelicCountsFile); + logger.info(String.format("Retained %d / %d sites after testing for heterozygosity...", + hetAllelicCounts.getRecords().size(), unfilteredAllelicCounts.getRecords().size())); + logger.info(String.format("Heterozygous allelic counts written to %s.", hetAllelicCountsFile)); + } else { + //read in matched normal + logger.info("Matched normal was provided, running in matched-normal mode..."); + logger.info("Performing binomial testing and filtering homozygous allelic counts in matched normal..."); + final AllelicCountCollection unfilteredNormalAllelicCounts = new AllelicCountCollection(inputNormalAllelicCountsFile); + if (!unfilteredNormalAllelicCounts.getIntervals().equals(unfilteredAllelicCounts.getIntervals())) { + throw new UserException.BadInput("Allelic-count sites in case sample and matched normal do not match. 
" + + "Run CollectAllelicCounts using the same interval list of sites for both samples."); + } + final SampleMetadata normalSampleMetadata = unfilteredNormalAllelicCounts.getSampleMetadata(); + + //filter on total count in matched normal + logger.info(String.format("Filtering allelic counts in matched normal with total count less than %d...", minTotalAlleleCount)); + final AllelicCountCollection filteredNormalAllelicCounts = new AllelicCountCollection( + normalSampleMetadata, + unfilteredNormalAllelicCounts.getRecords().stream() + .filter(ac -> ac.getTotalReadCount() >= minTotalAlleleCount) + .collect(Collectors.toList())); + logger.info(String.format("Retained %d / %d sites in matched normal after filtering on total count...", + filteredNormalAllelicCounts.getRecords().size(), unfilteredNormalAllelicCounts.getRecords().size())); + + //filter on homozygosity in matched normal + final AllelicCountCollection hetNormalAllelicCounts = new AllelicCountCollection( + normalSampleMetadata, + filteredNormalAllelicCounts.getRecords().stream() + .filter(ac -> calculateHomozygousLogRatio(ac, genotypingBaseErrorRate) < genotypingHomozygousLogRatioThreshold) + .collect(Collectors.toList())); + final File hetNormalAllelicCountsFile = new File(outputDir, outputPrefix + NORMAL_HET_ALLELIC_COUNTS_FILE_SUFFIX); + hetNormalAllelicCounts.write(hetNormalAllelicCountsFile); + logger.info(String.format("Retained %d / %d sites in matched normal after testing for heterozygosity...", + hetNormalAllelicCounts.getRecords().size(), unfilteredNormalAllelicCounts.getRecords().size())); + logger.info(String.format("Heterozygous allelic counts for matched normal written to %s.", hetNormalAllelicCountsFile)); + + //retrieve sites in case sample + logger.info("Retrieving allelic counts at these sites in case sample..."); + final Set hetNormalAllelicCountSites = new HashSet<>(hetNormalAllelicCounts.getIntervals()); + hetAllelicCounts = new AllelicCountCollection( + sampleMetadata, + filteredAllelicCounts.getRecords().stream() + .filter(ac -> hetNormalAllelicCountSites.contains(ac.getInterval())) + .collect(Collectors.toList())); + final File hetAllelicCountsFile = new File(outputDir, outputPrefix + HET_ALLELIC_COUNTS_FILE_SUFFIX); + hetAllelicCounts.write(hetAllelicCountsFile); + logger.info(String.format("Allelic counts for case sample at heterozygous sites in matched normal written to %s.", hetAllelicCountsFile)); + } + } + + private static double calculateHomozygousLogRatio(final AllelicCount allelicCount, + final double genotypingBaseErrorRate) { + final int r = allelicCount.getRefReadCount(); + final int n = allelicCount.getTotalReadCount(); + final double betaAll = Beta.regularizedBeta(1, r + 1, n - r + 1); + final double betaError = Beta.regularizedBeta(genotypingBaseErrorRate, r + 1, n - r + 1); + final double betaOneMinusError = Beta.regularizedBeta(1 - genotypingBaseErrorRate, r + 1, n - r + 1); + final double betaHom = betaError + betaAll - betaOneMinusError; + final double betaHet = betaOneMinusError - betaError; + return FastMath.log(betaHom) - FastMath.log(betaHet); + } + + private AlleleFractionSegmentCollection performAlleleFractionSegmentation() { + logger.info("Starting segmentation of heterozygous allelic counts..."); + final int maxNumChangepointsPerChromosome = maxNumSegmentsPerChromosome - 1; + return new AlleleFractionKernelSegmenter(hetAllelicCounts) + .findSegmentation(maxNumChangepointsPerChromosome, kernelVarianceAlleleFraction, kernelApproximationDimension, + 
ImmutableSet.copyOf(windowSizes).asList(), + numChangepointsPenaltyFactor, numChangepointsPenaltyFactor); + } + + private void writeModeledSegmentsAndParameterFiles(final MultidimensionalModeller modeller, + final String fileTag) { + final ModeledSegmentCollection modeledSegments = modeller.getModeledSegments(); + writeSegments(modeledSegments, fileTag + SEGMENTS_FILE_SUFFIX); + final File copyRatioParameterFile = new File(outputDir, outputPrefix + fileTag + COPY_RATIO_MODEL_PARAMETER_FILE_SUFFIX); + final File alleleFractionParameterFile = new File(outputDir, outputPrefix + fileTag + ALLELE_FRACTION_MODEL_PARAMETER_FILE_SUFFIX); + modeller.writeModelParameterFiles(copyRatioParameterFile, alleleFractionParameterFile); + } + + private void writeSegments(final SampleLocatableCollection segments, + final String fileSuffix) { + final File segmentsFile = new File(outputDir, outputPrefix + fileSuffix); + segments.write(segmentsFile); + logger.info(String.format("Segments written to %s", segmentsFile)); + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervals.java index b75a63e127c..10c365fbdea 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervals.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervals.java @@ -30,7 +30,8 @@ * IntervalArgumentCollection. However, we encourage using only the -P flag. * *

 * <p>
 *     The user can also specify the length of the bins (in bp) using the -BL option. If this is not commensurate with
- *     the length of the padded intervals, then the last bin will be of different length than the others.
+ *     the length of the padded intervals, then the last bin will be of different length than the others. If zero is
+ *     specified, then no binning will be performed.
 * </p>
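+ *
+ * <p>
+ *     For example, with the default padding of 250 and bin length of 1000, an input interval 1:501-1500 is padded
+ *     to 1:251-1750 and then split into bins 1:251-1250 and 1:1251-1750; the 1500-bp padded interval is not a
+ *     multiple of the bin length, so the last bin is only 500 bp long.
+ * </p>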

 *
 * <p>
 *     The -O argument specifies a filename for the output bins, stored as a Picard interval list.
 * </p>
* @@ -66,11 +67,11 @@ public final class PreprocessIntervals extends GATKTool { public static final String PADDING_SHORT_NAME = "P"; @Argument( - doc = "Length (in bp) of the bins.", + doc = "Length (in bp) of the bins. If zero, no binning will be performed.", fullName = BIN_LENGTH_LONG_NAME, shortName = BIN_LENGTH_SHORT_NAME, optional = true, - minValue = 1 + minValue = 0 ) private int binLength = 1000; @@ -120,6 +121,9 @@ private static IntervalList padAndMergeIntervals(final List inpu } private static IntervalList generateBins(final IntervalList preparedIntervalList, final int binLength, final SAMSequenceDictionary sequenceDictionary) { + if (binLength == 0) { + return IntervalList.copyOf(preparedIntervalList); + } final IntervalList bins = new IntervalList(sequenceDictionary); for (final Interval interval : preparedIntervalList) { for (int binStart = interval.getStart(); binStart <= interval.getEnd(); binStart += binLength) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/caller/SimpleCopyRatioCaller.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/caller/SimpleCopyRatioCaller.java new file mode 100644 index 00000000000..f6427e6acc7 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/caller/SimpleCopyRatioCaller.java @@ -0,0 +1,135 @@ +package org.broadinstitute.hellbender.tools.copynumber.caller; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CalledCopyRatioSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CalledCopyRatioSegment; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatioSegment; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.param.ParamUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * This caller is loosely based on the legacy ReCapSeg caller that was originally implemented in ReCapSeg v1.4.5.0, + * but introduces major changes. The method is as follows: + * 1) use the non-log2 mean copy ratio to determine copy-neutral segments, + * 2) weight segments by length for determining the mean and standard deviation of the non-log2 copy ratio in copy-neutral segments, + * 3) filter outlier copy-neutral segments by non-log2 copy ratio z-score, + * 4) use the filtered copy-neutral segments to determine a length-weighted mean and standard deviation, + * 5) call segments using z-score based on this mean and standard deviation. 
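+ *
+ * For example, with the default thresholds used by CallCopyRatioSegments (0.1 for copy neutrality and z-scores of 2.0
+ * for both outlier filtering and calling), a segment with mean log2 copy ratio 0.08 has non-log2 copy ratio
+ * 2^0.08 ~= 1.057, which falls within 1 +/- 0.1 and is called neutral in step 1 without any z-score test;
+ * segments outside that window are called amplified or deleted only if their non-log2 copy ratio lies more than
+ * 2.0 length-weighted standard deviations above or below the mean computed in step 4, and are otherwise also
+ * called neutral.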
+ *
+ * @author David Benjamin <davidben@broadinstitute.org>
+ * @author Samuel Lee <slee@broadinstitute.org>
+ */
+public final class SimpleCopyRatioCaller {
+    private static final Logger logger = LogManager.getLogger(SimpleCopyRatioCaller.class);
+
+    private final double neutralSegmentCopyRatioThreshold;
+    private final double outlierNeutralSegmentCopyRatioZScoreThreshold;
+    private final double callingCopyRatioZScoreThreshold;
+    private final Statistics callingStatistics;
+
+    private final CopyRatioSegmentCollection copyRatioSegments;
+
+    /**
+     * @param neutralSegmentCopyRatioThreshold  non-log2 copy ratio must be within 1 +/- this threshold for a segment to be copy neutral
+     * @param outlierNeutralSegmentCopyRatioZScoreThreshold  z-score on non-log2 copy ratio above which a copy-neutral segment is assumed to be an outlier
+     *                                                       and not included in the calculation of the length-weighted standard deviation of
+     *                                                       non-log2 copy ratio in copy-neutral segments
+     * @param callingCopyRatioZScoreThreshold  z-score with respect to length-weighted standard deviation of non-log2 copy ratio
+     *                                         in non-outlier copy-neutral segments used for calling segments
+     */
+    public SimpleCopyRatioCaller(final CopyRatioSegmentCollection copyRatioSegments,
+                                 final double neutralSegmentCopyRatioThreshold,
+                                 final double outlierNeutralSegmentCopyRatioZScoreThreshold,
+                                 final double callingCopyRatioZScoreThreshold) {
+        ParamUtils.isPositive(neutralSegmentCopyRatioThreshold, "Copy-neutral threshold must be positive.");
+        ParamUtils.isPositive(outlierNeutralSegmentCopyRatioZScoreThreshold, "Outlier z-score threshold must be positive.");
+        ParamUtils.isPositive(callingCopyRatioZScoreThreshold, "Calling z-score threshold must be positive.");
+        this.copyRatioSegments = Utils.nonNull(copyRatioSegments);
+        this.neutralSegmentCopyRatioThreshold = neutralSegmentCopyRatioThreshold;
+        this.outlierNeutralSegmentCopyRatioZScoreThreshold = outlierNeutralSegmentCopyRatioZScoreThreshold;
+        this.callingCopyRatioZScoreThreshold = callingCopyRatioZScoreThreshold;
+        callingStatistics = calculateCallingStatistics();
+    }
+
+    public CalledCopyRatioSegmentCollection makeCalls() {
+        final List<CopyRatioSegment> segments = copyRatioSegments.getRecords();
+        final List<CalledCopyRatioSegment> calledSegments = new ArrayList<>(segments.size());
+        for (final CopyRatioSegment segment : segments) {
+            final double copyRatioMean = Math.pow(2., segment.getMeanLog2CopyRatio());
+            if (Math.abs(1. - copyRatioMean) < neutralSegmentCopyRatioThreshold) {
+                calledSegments.add(new CalledCopyRatioSegment(segment, CalledCopyRatioSegment.Call.NEUTRAL));
+            } else {
+                final double copyRatioDeviation = copyRatioMean - callingStatistics.mean;
+                if (copyRatioDeviation < -callingStatistics.standardDeviation * callingCopyRatioZScoreThreshold) {
+                    calledSegments.add(new CalledCopyRatioSegment(segment, CalledCopyRatioSegment.Call.DELETION));
+                } else if (copyRatioDeviation > callingStatistics.standardDeviation * callingCopyRatioZScoreThreshold) {
+                    calledSegments.add(new CalledCopyRatioSegment(segment, CalledCopyRatioSegment.Call.AMPLIFICATION));
+                } else {
+                    calledSegments.add(new CalledCopyRatioSegment(segment, CalledCopyRatioSegment.Call.NEUTRAL));
+                }
+            }
+        }
+        return new CalledCopyRatioSegmentCollection(copyRatioSegments.getSampleMetadata(), calledSegments);
+    }
+
+    private Statistics calculateCallingStatistics() {
+        //get the segments that fall within the copy-neutral region
+        final List<CopyRatioSegment> copyNeutralSegments = copyRatioSegments.getRecords().stream()
+                .filter(s -> Math.abs(1. 
- Math.pow(2., s.getMeanLog2CopyRatio())) < neutralSegmentCopyRatioThreshold) + .collect(Collectors.toList()); + logger.info(String.format("%d segments in copy-neutral region [%.4f, %.4f]...", copyNeutralSegments.size(), + 1. - neutralSegmentCopyRatioThreshold, 1. + neutralSegmentCopyRatioThreshold)); + + //calculate length-weighted statistics of unfiltered copy-neutral segments + final Statistics unfilteredStatistics = calculateLengthWeightedStatistics(copyNeutralSegments); + logger.info(String.format("Length-weighted mean of segments in copy-neutral region (CR space): %.4f", unfilteredStatistics.mean)); + logger.info(String.format("Length-weighted standard deviation of segments in copy-neutral region (CR space): %.4f", unfilteredStatistics.standardDeviation)); + + //filter outlier segments by only including those within the outlier z-score threshold + final List<CopyRatioSegment> filteredCopyNeutralSegments = copyNeutralSegments.stream() + .filter(s -> Math.abs(Math.pow(2., s.getMeanLog2CopyRatio()) - unfilteredStatistics.mean) + <= unfilteredStatistics.standardDeviation * outlierNeutralSegmentCopyRatioZScoreThreshold) + .collect(Collectors.toList()); + logger.info(String.format("%d / %d segments in copy-neutral region remain after outliers filtered using z-score threshold (%.4f)...", + filteredCopyNeutralSegments.size(), copyNeutralSegments.size(), outlierNeutralSegmentCopyRatioZScoreThreshold)); + + final Statistics statistics = calculateLengthWeightedStatistics(filteredCopyNeutralSegments); + logger.info(String.format("Length-weighted mean for z-score calling (CR space): %.4f", statistics.mean)); + logger.info(String.format("Length-weighted standard deviation for z-score calling (CR space): %.4f", statistics.standardDeviation)); + + return statistics; + } + + private static Statistics calculateLengthWeightedStatistics(final List<CopyRatioSegment> copyRatioSegments) { + final List<Integer> segmentLengths = copyRatioSegments.stream() + .map(c -> c.getInterval().getLengthOnReference()) + .collect(Collectors.toList()); + final double totalLength = segmentLengths.stream().mapToDouble(Integer::doubleValue).sum(); + final int numSegments = segmentLengths.size(); + final double lengthWeightedCopyRatioMean = IntStream.range(0, numSegments) + .mapToDouble(i -> segmentLengths.get(i) * Math.pow(2., copyRatioSegments.get(i).getMeanLog2CopyRatio())) + .sum() / totalLength; + final double lengthWeightedCopyRatioStandardDeviation = Math.sqrt(IntStream.range(0, numSegments) + .mapToDouble(i -> segmentLengths.get(i) * Math.pow(Math.pow(2., copyRatioSegments.get(i).getMeanLog2CopyRatio()) - lengthWeightedCopyRatioMean, 2)) + .sum() / (((double) (numSegments - 1) / numSegments) * totalLength)); + return new Statistics(lengthWeightedCopyRatioMean, lengthWeightedCopyRatioStandardDeviation); + } + + private static final class Statistics { + private final double mean; + private final double standardDeviation; + + private Statistics(final double mean, + final double standardDeviation) { + this.mean = mean; + this.standardDeviation = standardDeviation; + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/ReCapSegCaller.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/ReCapSegCaller.java deleted file mode 100644 index 363c292b081..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/ReCapSegCaller.java +++ /dev/null @@ -1,107 +0,0 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.caller; - -import
htsjdk.samtools.util.OverlapDetector; -import org.apache.commons.math3.stat.descriptive.moment.Mean; -import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatio; -import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection; -import org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation.CopyRatioSegment; -import org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation.CopyRatioSegmentCollection; -import org.broadinstitute.hellbender.utils.Utils; - -import java.util.*; -import java.util.stream.Collectors; - -/** - *
This caller mimics the legacy ReCapSeg caller that was originally implemented in ReCapSeg v1.4.5.0. - * - * There is a small difference. The python code was using the same algorithm as intersectBed, which was causing it to drop - * the first interval of each segment in calculations of the copy-neutral intervals. The code here does - * not do this. This difference between the two codebases can cause a slight difference in the T calculation. Hence, the - * results of this code and the python code will not be exactly the same, but will be - * very close. A fix (to make this code match the python) has been deemed unworthy of our time.
- */ -public final class ReCapSegCaller { - private static final Logger logger = LogManager.getLogger(ReCapSegCaller.class); - - //bounds on log_2 coverage for high-confidence neutral segments - private static final double COPY_NEUTRAL_CUTOFF = 0.1; - // Number of standard deviations before assuming that an interval was an outlier in a segment - private static final double Z_THRESHOLD = 2; - - private final CopyRatioSegmentCollection copyRatioSegments; - private final LinkedHashMap> segmentToCopyRatiosMap; - - /** - * @param denoisedCopyRatios in log2 space - */ - public ReCapSegCaller(final CopyRatioCollection denoisedCopyRatios, - final CopyRatioSegmentCollection copyRatioSegments) { - this.copyRatioSegments = Utils.nonNull(copyRatioSegments); - Utils.validateArg(denoisedCopyRatios.getSampleName().equals(copyRatioSegments.getSampleName()), - "Denoised copy ratios and copy-ratio segments do not have the same sample name."); - segmentToCopyRatiosMap = constructSegmentToCopyRatiosMap(denoisedCopyRatios, copyRatioSegments); - } - - private static LinkedHashMap> constructSegmentToCopyRatiosMap(final CopyRatioCollection denoisedCopyRatios, - final CopyRatioSegmentCollection copyRatioSegments) { - final LinkedHashMap> segmentToCopyRatiosMap = new LinkedHashMap<>(); - final OverlapDetector copyRatioMidpointOverlapDetector = denoisedCopyRatios.getMidpointOverlapDetector(); - for (final CopyRatioSegment segment : copyRatioSegments.getRecords()) { - final int numPointsExpected = segment.getNumPoints(); - final Set copyRatiosInSegment = copyRatioMidpointOverlapDetector.getOverlaps(segment); - if (copyRatiosInSegment.size() != numPointsExpected) { - throw new IllegalArgumentException("Denoised copy ratios and copy-ratio segments are not consistent."); - } - segmentToCopyRatiosMap.put(segment, copyRatiosInSegment); - } - return segmentToCopyRatiosMap; - } - - private double calculateT() { - //Get the segments that are likely copy neutral. - //Math.abs removed to mimic python... - final List copyNeutralSegments = segmentToCopyRatiosMap.keySet().stream() - .filter(s -> s.getMeanLog2CopyRatio() < COPY_NEUTRAL_CUTOFF).collect(Collectors.toList()); - - //Get the intervals that correspond to the copyNeutralSegments... note that individual intervals, due to noise, - //can be far away from copy neutral - final double[] copyNeutralIntervals = copyNeutralSegments.stream() - .flatMap(s -> segmentToCopyRatiosMap.get(s).stream()) - .mapToDouble(CopyRatio::getLog2CopyRatioValue).toArray(); - - final double meanCopyNeutralIntervals = new Mean().evaluate(copyNeutralIntervals); - final double sigmaCopyNeutralIntervals = new StandardDeviation().evaluate(copyNeutralIntervals); - - // Now we filter outliers by only including those w/in 2 standard deviations. - final double [] filteredCopyNeutralIntervals = Arrays.stream(copyNeutralIntervals) - .filter(c -> Math.abs(c - meanCopyNeutralIntervals) < sigmaCopyNeutralIntervals * Z_THRESHOLD).toArray(); - - return new StandardDeviation().evaluate(filteredCopyNeutralIntervals); - } - - public CalledCopyRatioSegmentCollection makeCalls() { - final double t = calculateT(); - - logger.info("Running caller that mimics the ReCapSeg 1.4.5.0 (python) caller."); - // Log some information about thresholds chosen for the segments. 
- logger.info(String.format("Copy neutral (log2CR space) [%.4f, %.4f]", -t, t)); - logger.info(String.format("Copy neutral (CR space) [%.4f, %.4f]", Math.pow(2, -t), Math.pow(2, t))); - - final Set segments = segmentToCopyRatiosMap.keySet(); - final List calledSegments = new ArrayList<>(segments.size()); - for (final CopyRatioSegment segment : segments) { - if (segment.getMeanLog2CopyRatio() < -t) { - calledSegments.add(new CalledCopyRatioSegment(segment, CalledCopyRatioSegment.Call.DELETION)); - } else if (segment.getMeanLog2CopyRatio() > t) { - calledSegments.add(new CalledCopyRatioSegment(segment, CalledCopyRatioSegment.Call.AMPLIFICATION)); - } else { - calledSegments.add(new CalledCopyRatioSegment(segment, CalledCopyRatioSegment.Call.NEUTRAL)); - } - } - - return new CalledCopyRatioSegmentCollection(copyRatioSegments.getSampleMetadata(), calledSegments); - } -} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollector.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/datacollection/AllelicCountCollector.java similarity index 94% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollector.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/datacollection/AllelicCountCollector.java index c0c4847011f..b129911e346 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollector.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/datacollection/AllelicCountCollector.java @@ -1,9 +1,11 @@ -package org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount; +package org.broadinstitute.hellbender.tools.copynumber.datacollection; import htsjdk.samtools.util.Locatable; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; import org.broadinstitute.hellbender.utils.Nucleotide; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/GCBiasCorrector.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/GCBiasCorrector.java similarity index 98% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/GCBiasCorrector.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/GCBiasCorrector.java index fefe9ae2337..6be8504971f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/GCBiasCorrector.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/GCBiasCorrector.java @@ -1,4 +1,4 @@ -package org.broadinstitute.hellbender.tools.copynumber.annotation; +package org.broadinstitute.hellbender.tools.copynumber.denoising; import org.apache.commons.math3.linear.ArrayRealVector; import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor; @@ -7,6 +7,7 @@ import org.apache.commons.math3.stat.descriptive.rank.Median; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import 
org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.param.ParamUtils; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/HDF5SVDReadCountPanelOfNormals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/HDF5SVDReadCountPanelOfNormals.java similarity index 99% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/HDF5SVDReadCountPanelOfNormals.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/HDF5SVDReadCountPanelOfNormals.java index 7500e2214b0..ab1087c1b12 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/HDF5SVDReadCountPanelOfNormals.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/HDF5SVDReadCountPanelOfNormals.java @@ -1,4 +1,4 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd; +package org.broadinstitute.hellbender.tools.copynumber.denoising; import htsjdk.samtools.util.Lazy; import org.apache.commons.math3.linear.Array2DRowRealMatrix; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDDenoisedCopyRatioResult.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisedCopyRatioResult.java similarity index 92% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDDenoisedCopyRatioResult.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisedCopyRatioResult.java index 402127f210f..fa3a70ee228 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDDenoisedCopyRatioResult.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisedCopyRatioResult.java @@ -1,9 +1,9 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd; +package org.broadinstitute.hellbender.tools.copynumber.denoising; import org.apache.commons.math3.linear.RealMatrix; -import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatio; -import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDDenoisingUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisingUtils.java similarity index 99% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDDenoisingUtils.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisingUtils.java index 9039c72ebfa..4497481ae4e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDDenoisingUtils.java +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDDenoisingUtils.java @@ -1,4 +1,4 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd; +package org.broadinstitute.hellbender.tools.copynumber.denoising; import com.google.common.primitives.Doubles; import org.apache.commons.math3.linear.Array2DRowRealMatrix; @@ -10,8 +10,7 @@ import org.apache.logging.log4j.Logger; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.copynumber.CreateReadCountPanelOfNormals; -import org.broadinstitute.hellbender.tools.copynumber.annotation.GCBiasCorrector; -import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; import org.broadinstitute.hellbender.utils.GATKProtectedMathUtils; import org.broadinstitute.hellbender.utils.MatrixSummaryUtils; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDReadCountPanelOfNormals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDReadCountPanelOfNormals.java similarity index 94% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDReadCountPanelOfNormals.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDReadCountPanelOfNormals.java index 356956e553b..29f37769a17 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/svd/SVDReadCountPanelOfNormals.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/denoising/SVDReadCountPanelOfNormals.java @@ -1,6 +1,6 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd; +package org.broadinstitute.hellbender.tools.copynumber.denoising; -import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection; import org.broadinstitute.hellbender.utils.SimpleInterval; import java.util.List; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AlleleFractionSegmentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AlleleFractionSegmentCollection.java new file mode 100644 index 00000000000..8b2b0e48860 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AlleleFractionSegmentCollection.java @@ -0,0 +1,52 @@ +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; + +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AlleleFractionSegment; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.tsv.DataLine; +import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection; + +import java.io.File; +import java.util.List; +import java.util.function.BiConsumer; +import java.util.function.Function; + +/** + * Represents an allele-fraction segmentation. 
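A brief usage sketch for the new collection (the input path is a placeholder, not from this patch; the class itself follows):

    // Hypothetical round trip: read segments from a TSV with the columns declared below.
    // getRecords() returns the segments sorted by the lexicographical interval comparator.
    final AlleleFractionSegmentCollection segments =
            new AlleleFractionSegmentCollection(new File("sample.af.seg"));
    for (final AlleleFractionSegment segment : segments.getRecords()) {
        System.out.printf("%s:%d-%d contains %d allele-fraction points%n",
                segment.getContig(), segment.getStart(), segment.getEnd(), segment.getNumPoints());
    }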
+ * + * @author Samuel Lee <slee@broadinstitute.org> + */ +public final class AlleleFractionSegmentCollection extends SampleLocatableCollection { + enum AlleleFractionSegmentTableColumn { + CONTIG, + START, + END, + NUM_POINTS_ALLELE_FRACTION; + + static final TableColumnCollection COLUMNS = new TableColumnCollection((Object[]) values()); + } + + private static final Function ALLELE_FRACTION_SEGMENT_DATA_LINE_TO_RECORD_FUNCTION = dataLine -> { + final String contig = dataLine.get(AlleleFractionSegmentTableColumn.CONTIG); + final int start = dataLine.getInt(AlleleFractionSegmentTableColumn.START); + final int end = dataLine.getInt(AlleleFractionSegmentTableColumn.END); + final int numPoints = dataLine.getInt(AlleleFractionSegmentTableColumn.NUM_POINTS_ALLELE_FRACTION); + final SimpleInterval interval = new SimpleInterval(contig, start, end); + return new AlleleFractionSegment(interval, numPoints); + }; + + private static final BiConsumer ALLELE_FRACTION_SEGMENT_RECORD_AND_DATA_LINE_BI_CONSUMER = (alleleFractionSegment, dataLine) -> + dataLine.append(alleleFractionSegment.getContig()) + .append(alleleFractionSegment.getStart()) + .append(alleleFractionSegment.getEnd()) + .append(alleleFractionSegment.getNumPoints()); + + public AlleleFractionSegmentCollection(final File inputFile) { + super(inputFile, AlleleFractionSegmentTableColumn.COLUMNS, ALLELE_FRACTION_SEGMENT_DATA_LINE_TO_RECORD_FUNCTION, ALLELE_FRACTION_SEGMENT_RECORD_AND_DATA_LINE_BI_CONSUMER); + } + + public AlleleFractionSegmentCollection(final SampleMetadata sampleMetadata, + final List AlleleFractionSegments) { + super(sampleMetadata, AlleleFractionSegments, AlleleFractionSegmentTableColumn.COLUMNS, ALLELE_FRACTION_SEGMENT_DATA_LINE_TO_RECORD_FUNCTION, ALLELE_FRACTION_SEGMENT_RECORD_AND_DATA_LINE_BI_CONSUMER); + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AllelicCountCollection.java similarity index 94% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollection.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AllelicCountCollection.java index 627d9761adf..2b2d20bcd32 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AllelicCountCollection.java @@ -1,7 +1,7 @@ -package org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount; +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; -import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; import org.broadinstitute.hellbender.utils.Nucleotide; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.tsv.DataLine; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotatedIntervalCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AnnotatedIntervalCollection.java similarity index 90% rename from 
src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotatedIntervalCollection.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AnnotatedIntervalCollection.java index 8b5baccab64..4fc80398244 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotatedIntervalCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AnnotatedIntervalCollection.java @@ -1,6 +1,7 @@ -package org.broadinstitute.hellbender.tools.copynumber.annotation; +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; -import org.broadinstitute.hellbender.tools.copynumber.formats.collections.LocatableCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotationSet; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.tsv.DataLine; import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/CalledCopyRatioSegmentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CalledCopyRatioSegmentCollection.java similarity index 92% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/CalledCopyRatioSegmentCollection.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CalledCopyRatioSegmentCollection.java index 43606d172ee..50caf45a610 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/CalledCopyRatioSegmentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CalledCopyRatioSegmentCollection.java @@ -1,9 +1,9 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.caller; +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation.CopyRatioSegment; -import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CalledCopyRatioSegment; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatioSegment; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.tsv.DataLine; import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/copyratio/CopyRatioCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CopyRatioCollection.java similarity index 94% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/copyratio/CopyRatioCollection.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CopyRatioCollection.java index fdb36df96f7..3e18aeda89d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/copyratio/CopyRatioCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CopyRatioCollection.java @@ 
-1,8 +1,8 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio; +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; import htsjdk.samtools.util.OverlapDetector; -import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.tsv.DataLine; import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/segmentation/CopyRatioSegmentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CopyRatioSegmentCollection.java similarity index 93% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/segmentation/CopyRatioSegmentCollection.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CopyRatioSegmentCollection.java index a6eb23cd105..eedc2042e96 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/segmentation/CopyRatioSegmentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/CopyRatioSegmentCollection.java @@ -1,7 +1,7 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation; +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; -import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatioSegment; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.tsv.DataLine; import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/HDF5SimpleCountCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/HDF5SimpleCountCollection.java similarity index 95% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/HDF5SimpleCountCollection.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/HDF5SimpleCountCollection.java index f7a25cc6cd2..56fc46774d4 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/HDF5SimpleCountCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/HDF5SimpleCountCollection.java @@ -1,10 +1,9 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.readcount; +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; import htsjdk.samtools.util.Lazy; import org.apache.commons.math3.linear.Array2DRowRealMatrix; import org.apache.commons.math3.linear.RealMatrix; import org.broadinstitute.hdf5.HDF5File; -import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; import 
org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/LocatableCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/LocatableCollection.java index 20e76949c97..372cf141c74 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/LocatableCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/LocatableCollection.java @@ -34,10 +34,10 @@ public abstract class LocatableCollection extends Reco /** * Records are sorted using {@code LEXICOGRAPHICAL_ORDER_COMPARATOR}. */ - protected LocatableCollection(final List records, - final TableColumnCollection mandatoryColumns, - final Function recordFromDataLineDecoder, - final BiConsumer recordToDataLineEncoder) { + LocatableCollection(final List records, + final TableColumnCollection mandatoryColumns, + final Function recordFromDataLineDecoder, + final BiConsumer recordToDataLineEncoder) { super( Utils.nonNull(records).stream().sorted(LEXICOGRAPHICAL_ORDER_COMPARATOR).collect(Collectors.toList()), mandatoryColumns, @@ -49,10 +49,10 @@ protected LocatableCollection(final List records, /** * @throws IllegalArgumentException if records are not sorted using {@code LEXICOGRAPHICAL_ORDER_COMPARATOR} */ - protected LocatableCollection(final File inputFile, - final TableColumnCollection mandatoryColumns, - final Function recordFromDataLineDecoder, - final BiConsumer recordToDataLineEncoder) { + LocatableCollection(final File inputFile, + final TableColumnCollection mandatoryColumns, + final Function recordFromDataLineDecoder, + final BiConsumer recordToDataLineEncoder) { super(inputFile, mandatoryColumns, recordFromDataLineDecoder, recordToDataLineEncoder); validateIntervals(getRecords()); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/multidimensional/model/ModeledSegmentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/ModeledSegmentCollection.java similarity index 93% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/multidimensional/model/ModeledSegmentCollection.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/ModeledSegmentCollection.java index cbfe844c88c..78a01ea36cc 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/multidimensional/model/ModeledSegmentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/ModeledSegmentCollection.java @@ -1,7 +1,8 @@ -package org.broadinstitute.hellbender.tools.copynumber.multidimensional.model; +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; -import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.ModeledSegment; +import org.broadinstitute.hellbender.tools.copynumber.models.MultidimensionalModeller; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.tsv.DataLine; import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection; @@ -15,7 +16,7 @@ * @author Samuel Lee <slee@broadinstitute.org> */ public final class ModeledSegmentCollection extends SampleLocatableCollection { - private static 
final String DOUBLE_FORMAT = "%6.6f"; //TODO replace this with MultidimensionalModeller.DOUBLE_FORMAT from sl_wgs_acnv branch + private static final String DOUBLE_FORMAT = MultidimensionalModeller.DOUBLE_FORMAT; enum ModeledSegmentTableColumn { CONTIG, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/MultidimensionalSegmentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/MultidimensionalSegmentCollection.java new file mode 100644 index 00000000000..807b359a0bb --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/MultidimensionalSegmentCollection.java @@ -0,0 +1,56 @@ +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; + +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.MultidimensionalSegment; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.tsv.DataLine; +import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection; + +import java.io.File; +import java.util.List; +import java.util.function.BiConsumer; +import java.util.function.Function; + +/** + * @author Samuel Lee <slee@broadinstitute.org> + */ +public final class MultidimensionalSegmentCollection extends SampleLocatableCollection { + enum MultidimensionalSegmentTableColumn { + CONTIG, + START, + END, + NUM_POINTS_COPY_RATIO, + NUM_POINTS_ALLELE_FRACTION, + MEAN_LOG2_COPY_RATIO; + + static final TableColumnCollection COLUMNS = new TableColumnCollection((Object[]) values()); + } + + private static final Function MULTIDIMENSIONAL_SEGMENT_DATA_LINE_TO_RECORD_FUNCTION = dataLine -> { + final String contig = dataLine.get(MultidimensionalSegmentTableColumn.CONTIG); + final int start = dataLine.getInt(MultidimensionalSegmentTableColumn.START); + final int end = dataLine.getInt(MultidimensionalSegmentTableColumn.END); + final int numPointsCopyRatio = dataLine.getInt(MultidimensionalSegmentTableColumn.NUM_POINTS_COPY_RATIO); + final int numPointsAlleleFraction = dataLine.getInt(MultidimensionalSegmentTableColumn.NUM_POINTS_ALLELE_FRACTION); + final double meanLog2CopyRatio = dataLine.getDouble(MultidimensionalSegmentTableColumn.MEAN_LOG2_COPY_RATIO); + final SimpleInterval interval = new SimpleInterval(contig, start, end); + return new MultidimensionalSegment(interval, numPointsCopyRatio, numPointsAlleleFraction, meanLog2CopyRatio); + }; + + private static final BiConsumer MULTIDIMENSIONAL_SEGMENT_RECORD_AND_DATA_LINE_BI_CONSUMER = (alleleFractionSegment, dataLine) -> + dataLine.append(alleleFractionSegment.getContig()) + .append(alleleFractionSegment.getStart()) + .append(alleleFractionSegment.getEnd()) + .append(alleleFractionSegment.getNumPointsCopyRatio()) + .append(alleleFractionSegment.getNumPointsAlleleFraction()) + .append(alleleFractionSegment.getMeanLog2CopyRatio()); + + public MultidimensionalSegmentCollection(final File inputFile) { + super(inputFile, MultidimensionalSegmentTableColumn.COLUMNS, MULTIDIMENSIONAL_SEGMENT_DATA_LINE_TO_RECORD_FUNCTION, MULTIDIMENSIONAL_SEGMENT_RECORD_AND_DATA_LINE_BI_CONSUMER); + } + + public MultidimensionalSegmentCollection(final SampleMetadata sampleMetadata, + final List multidimensionalSegments) { + super(sampleMetadata, multidimensionalSegments, MultidimensionalSegmentTableColumn.COLUMNS, MULTIDIMENSIONAL_SEGMENT_DATA_LINE_TO_RECORD_FUNCTION, 
MULTIDIMENSIONAL_SEGMENT_RECORD_AND_DATA_LINE_BI_CONSUMER); + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/ParameterDecileCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/ParameterDecileCollection.java new file mode 100644 index 00000000000..7a8fc4eb537 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/ParameterDecileCollection.java @@ -0,0 +1,112 @@ +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; + +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.mcmc.Decile; +import org.broadinstitute.hellbender.utils.mcmc.DecileCollection; +import org.broadinstitute.hellbender.utils.mcmc.ParameterEnum; +import org.broadinstitute.hellbender.utils.tsv.DataLine; +import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection; + +import java.io.File; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * @author Samuel Lee <slee@broadinstitute.org> + */ +public final class ParameterDecileCollection & ParameterEnum> extends SampleRecordCollection> { + enum ParameterTableColumn { + PARAMETER_NAME, + POSTERIOR_10, + POSTERIOR_20, + POSTERIOR_30, + POSTERIOR_40, + POSTERIOR_50, + POSTERIOR_60, + POSTERIOR_70, + POSTERIOR_80, + POSTERIOR_90; + + static final TableColumnCollection COLUMNS = new TableColumnCollection((Object[]) values()); + } + + private static DecileCollection parseDecilesFromDataLine(final DataLine dataLine) { + return new DecileCollection(Arrays.asList( + dataLine.getDouble(ParameterTableColumn.POSTERIOR_10), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_20), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_30), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_40), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_50), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_60), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_70), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_80), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_90))); + } + + private static void appendDecilesToDataLine(final DataLine dataLine, + final DecileCollection deciles, + final String doubleFormat) { + dataLine.append(String.format(doubleFormat, deciles.get(Decile.DECILE_10))) + .append(String.format(doubleFormat, deciles.get(Decile.DECILE_20))) + .append(String.format(doubleFormat, deciles.get(Decile.DECILE_30))) + .append(String.format(doubleFormat, deciles.get(Decile.DECILE_40))) + .append(String.format(doubleFormat, deciles.get(Decile.DECILE_50))) + .append(String.format(doubleFormat, deciles.get(Decile.DECILE_60))) + .append(String.format(doubleFormat, deciles.get(Decile.DECILE_70))) + .append(String.format(doubleFormat, deciles.get(Decile.DECILE_80))) + .append(String.format(doubleFormat, deciles.get(Decile.DECILE_90))); + } + + private final Map parameterToDecileCollectionMap; + + public ParameterDecileCollection(final SampleMetadata sampleMetadata, + final Map parameterToDecileCollectionMap, + final Class parameterClass, + final String doubleFormat) { + super( + Utils.nonNull(sampleMetadata), + new ArrayList<>(parameterToDecileCollectionMap.entrySet()), + ParameterTableColumn.COLUMNS, + dataLine -> { + final String parameterName = 
dataLine.get(ParameterTableColumn.PARAMETER_NAME); + final T parameter = Enum.valueOf(Utils.nonNull(parameterClass), parameterName); + final DecileCollection deciles = parseDecilesFromDataLine(dataLine); + return new AbstractMap.SimpleEntry<>(parameter, deciles);}, + (record, dataLine) -> { + final T parameter = record.getKey(); + final DecileCollection deciles = record.getValue(); + appendDecilesToDataLine(dataLine.append(parameter.toString()), deciles, doubleFormat); + } + ); + this.parameterToDecileCollectionMap = parameterToDecileCollectionMap; + } + + public ParameterDecileCollection(final File file, + final Class parameterClass, + final String doubleFormat) { + super( + Utils.nonNull(file), + ParameterTableColumn.COLUMNS, + dataLine -> { + final String parameterName = dataLine.get(ParameterTableColumn.PARAMETER_NAME); + final T parameter = Enum.valueOf(Utils.nonNull(parameterClass), parameterName); + final DecileCollection deciles = parseDecilesFromDataLine(dataLine); + return new AbstractMap.SimpleEntry<>(parameter, deciles);}, + (record, dataLine) -> { + final T parameter = record.getKey(); + final DecileCollection deciles = record.getValue(); + dataLine.append(parameter.toString()); + appendDecilesToDataLine(dataLine, deciles, doubleFormat); + } + ); + parameterToDecileCollectionMap = getRecords().stream().collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + + public DecileCollection getDeciles(final T parameter) { + return parameterToDecileCollectionMap.get(parameter); + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/RecordCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/RecordCollection.java index 0f9e2af1fe9..c9ae2837fb0 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/RecordCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/RecordCollection.java @@ -36,10 +36,10 @@ public abstract class RecordCollection { * @param recordFromDataLineDecoder lambda for decoding a record from a {@link DataLine} when reading from a TSV file * @param recordToDataLineEncoder lambda for encoding a record to a {@link DataLine} when writing to a TSV file */ - protected RecordCollection(final List records, - final TableColumnCollection mandatoryColumns, - final Function recordFromDataLineDecoder, - final BiConsumer recordToDataLineEncoder) { + RecordCollection(final List records, + final TableColumnCollection mandatoryColumns, + final Function recordFromDataLineDecoder, + final BiConsumer recordToDataLineEncoder) { this.records = ImmutableList.copyOf(Utils.nonNull(records)); this.mandatoryColumns = Utils.nonNull(mandatoryColumns); this.recordFromDataLineDecoder = Utils.nonNull(recordFromDataLineDecoder); @@ -56,10 +56,10 @@ protected RecordCollection(final List records, * @param recordFromDataLineDecoder lambda for decoding a record from a {@link DataLine} when reading from a TSV file * @param recordToDataLineEncoder lambda for encoding a record to a {@link DataLine} when writing to a TSV file */ - protected RecordCollection(final File inputFile, - final TableColumnCollection mandatoryColumns, - final Function recordFromDataLineDecoder, - final BiConsumer recordToDataLineEncoder) { + RecordCollection(final File inputFile, + final TableColumnCollection mandatoryColumns, + final Function recordFromDataLineDecoder, + final BiConsumer recordToDataLineEncoder) { 
IOUtils.canReadFile(inputFile); this.mandatoryColumns = Utils.nonNull(mandatoryColumns); this.recordFromDataLineDecoder = Utils.nonNull(recordFromDataLineDecoder); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleLocatableCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleLocatableCollection.java index 2263130a9d9..925ac1de213 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleLocatableCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleLocatableCollection.java @@ -35,11 +35,11 @@ public abstract class SampleLocatableCollection extend /** * Records are sorted using {@code LEXICOGRAPHICAL_ORDER_COMPARATOR}. */ - protected SampleLocatableCollection(final SampleMetadata sampleMetadata, - final List records, - final TableColumnCollection mandatoryColumns, - final Function recordFromDataLineDecoder, - final BiConsumer recordToDataLineEncoder) { + SampleLocatableCollection(final SampleMetadata sampleMetadata, + final List records, + final TableColumnCollection mandatoryColumns, + final Function recordFromDataLineDecoder, + final BiConsumer recordToDataLineEncoder) { super( sampleMetadata, Utils.nonNull(records).stream().sorted(LEXICOGRAPHICAL_ORDER_COMPARATOR).collect(Collectors.toList()), @@ -52,10 +52,10 @@ protected SampleLocatableCollection(final SampleMetadata sampleMetadata, /** * @throws IllegalArgumentException if records are not sorted using {@code LEXICOGRAPHICAL_ORDER_COMPARATOR} */ - protected SampleLocatableCollection(final File inputFile, - final TableColumnCollection mandatoryColumns, - final Function recordFromDataLineDecoder, - final BiConsumer recordToDataLineEncoder) { + SampleLocatableCollection(final File inputFile, + final TableColumnCollection mandatoryColumns, + final Function recordFromDataLineDecoder, + final BiConsumer recordToDataLineEncoder) { super(inputFile, mandatoryColumns, recordFromDataLineDecoder, recordToDataLineEncoder); validateIntervals(getSampleName(), getRecords()); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleRecordCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleRecordCollection.java index 9eecc6d8b6d..e02bd1db06e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleRecordCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleRecordCollection.java @@ -41,11 +41,11 @@ public abstract class SampleRecordCollection implements SampleMetadata { * @param recordFromDataLineDecoder lambda for decoding a record from a {@link DataLine} when reading from a TSV file * @param recordToDataLineEncoder lambda for encoding a record to a {@link DataLine} when writing to a TSV file */ - protected SampleRecordCollection(final SampleMetadata sampleMetadata, - final List records, - final TableColumnCollection mandatoryColumns, - final Function recordFromDataLineDecoder, - final BiConsumer recordToDataLineEncoder) { + SampleRecordCollection(final SampleMetadata sampleMetadata, + final List records, + final TableColumnCollection mandatoryColumns, + final Function recordFromDataLineDecoder, + final BiConsumer recordToDataLineEncoder) { this.sampleMetadata = Utils.nonNull(sampleMetadata); this.records = ImmutableList.copyOf(Utils.nonNull(records)); 
this.mandatoryColumns = Utils.nonNull(mandatoryColumns); @@ -64,10 +64,10 @@ protected SampleRecordCollection(final SampleMetadata sampleMetadata, * @param recordFromDataLineDecoder lambda for decoding a record from a {@link DataLine} when reading from a TSV file * @param recordToDataLineEncoder lambda for encoding a record to a {@link DataLine} when writing to a TSV file */ - protected SampleRecordCollection(final File inputFile, - final TableColumnCollection mandatoryColumns, - final Function recordFromDataLineDecoder, - final BiConsumer recordToDataLineEncoder) { + SampleRecordCollection(final File inputFile, + final TableColumnCollection mandatoryColumns, + final Function recordFromDataLineDecoder, + final BiConsumer recordToDataLineEncoder) { IOUtils.canReadFile(inputFile); this.mandatoryColumns = Utils.nonNull(mandatoryColumns); this.recordFromDataLineDecoder = Utils.nonNull(recordFromDataLineDecoder); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCountCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SimpleCountCollection.java similarity index 95% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCountCollection.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SimpleCountCollection.java index 97b8f178149..dbc7928879c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCountCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SimpleCountCollection.java @@ -1,9 +1,9 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.readcount; +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; import org.broadinstitute.hdf5.HDF5File; import org.broadinstitute.hdf5.HDF5LibException; -import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.io.IOUtils; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AlleleFractionSegment.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AlleleFractionSegment.java new file mode 100644 index 00000000000..621e2ddf5dc --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AlleleFractionSegment.java @@ -0,0 +1,82 @@ +package org.broadinstitute.hellbender.tools.copynumber.formats.records; + +import htsjdk.samtools.util.Locatable; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.param.ParamUtils; + +import java.util.List; + +public class AlleleFractionSegment implements Locatable { + private final SimpleInterval interval; + private final int numPoints; + + public AlleleFractionSegment(final SimpleInterval interval, + final int numPoints) { + Utils.nonNull(interval); + ParamUtils.isPositiveOrZero(numPoints, "Number of points must be non-negative."); + this.interval = interval; + this.numPoints = numPoints; + } + + public AlleleFractionSegment(final SimpleInterval 
interval, + final List allelicCounts) { + Utils.nonNull(interval); + Utils.nonNull(allelicCounts); + this.interval = interval; + numPoints = allelicCounts.size(); + } + + @Override + public String getContig() { + return interval.getContig(); + } + + @Override + public int getStart() { + return interval.getStart(); + } + + @Override + public int getEnd() { + return interval.getEnd(); + } + + public SimpleInterval getInterval() { + return interval; + } + + public int getNumPoints() { + return numPoints; + } + + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final AlleleFractionSegment that = (AlleleFractionSegment) o; + return numPoints == that.numPoints && + interval.equals(that.interval); + } + + @Override + public int hashCode() { + int result; + result = interval.hashCode(); + result = 31 * result + numPoints; + return result; + } + + @Override + public String toString() { + return "AlleleFractionSegment{" + + "interval=" + interval + + ", numPoints=" + numPoints + + '}'; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCount.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AllelicCount.java similarity index 98% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCount.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AllelicCount.java index 6bcb21d9093..c5db911c6db 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCount.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AllelicCount.java @@ -1,4 +1,4 @@ -package org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount; +package org.broadinstitute.hellbender.tools.copynumber.formats.records; import htsjdk.samtools.util.Locatable; import org.broadinstitute.hellbender.utils.Nucleotide; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotatedInterval.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AnnotatedInterval.java similarity index 96% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotatedInterval.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AnnotatedInterval.java index c06fcee370c..1bc35f57ed1 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotatedInterval.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AnnotatedInterval.java @@ -1,4 +1,4 @@ -package org.broadinstitute.hellbender.tools.copynumber.annotation; +package org.broadinstitute.hellbender.tools.copynumber.formats.records; import htsjdk.samtools.util.Locatable; import htsjdk.tribble.Feature; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotationSet.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AnnotationSet.java similarity index 86% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotationSet.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AnnotationSet.java index cba098ab86b..dd3a55beddb 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/annotation/AnnotationSet.java +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/AnnotationSet.java @@ -1,5 +1,6 @@ -package org.broadinstitute.hellbender.tools.copynumber.annotation; +package org.broadinstitute.hellbender.tools.copynumber.formats.records; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection; import org.broadinstitute.hellbender.utils.Utils; /** @@ -7,7 +8,7 @@ * * @author Samuel Lee <slee@broadinstitute.org> */ -public class AnnotationSet { +public final class AnnotationSet { /** * If additional annotation fields are added here, then {@link AnnotatedIntervalCollection} * should be updated accordingly. diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/CalledCopyRatioSegment.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CalledCopyRatioSegment.java similarity index 90% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/CalledCopyRatioSegment.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CalledCopyRatioSegment.java index 9f23254a4b4..178dab834a0 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/CalledCopyRatioSegment.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CalledCopyRatioSegment.java @@ -1,6 +1,5 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.caller; +package org.broadinstitute.hellbender.tools.copynumber.formats.records; -import org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation.CopyRatioSegment; import org.broadinstitute.hellbender.utils.Utils; public class CalledCopyRatioSegment extends CopyRatioSegment { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/copyratio/CopyRatio.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CopyRatio.java similarity index 96% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/copyratio/CopyRatio.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CopyRatio.java index 2e3a4cc4591..42dc271016e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/copyratio/CopyRatio.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CopyRatio.java @@ -1,4 +1,4 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio; +package org.broadinstitute.hellbender.tools.copynumber.formats.records; import htsjdk.samtools.util.Locatable; import org.broadinstitute.hellbender.utils.SimpleInterval; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/segmentation/CopyRatioSegment.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CopyRatioSegment.java similarity index 85% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/segmentation/CopyRatioSegment.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CopyRatioSegment.java index 94cce16acd9..fb55015449d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/segmentation/CopyRatioSegment.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/CopyRatioSegment.java @@ -1,7 +1,6 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation; +package 
org.broadinstitute.hellbender.tools.copynumber.formats.records; import htsjdk.samtools.util.Locatable; -import org.broadinstitute.hellbender.utils.GATKProtectedMathUtils; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.param.ParamUtils; @@ -24,13 +23,12 @@ public CopyRatioSegment(final SimpleInterval interval, } public CopyRatioSegment(final SimpleInterval interval, - final List denoisedLog2CopyRatios) { + final List denoisedLog2CopyRatios) { Utils.nonNull(interval); Utils.nonNull(denoisedLog2CopyRatios); this.interval = interval; numPoints = denoisedLog2CopyRatios.size(); - final double meanCopyRatio = denoisedLog2CopyRatios.stream().mapToDouble(log2CR -> Math.pow(2., log2CR)).average().orElse(Double.NaN); - meanLog2CopyRatio = Math.log(meanCopyRatio) * GATKProtectedMathUtils.INV_LOG_2; + meanLog2CopyRatio = denoisedLog2CopyRatios.stream().mapToDouble(CopyRatio::getLog2CopyRatioValue).average().orElse(Double.NaN); } @Override diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/multidimensional/model/ModeledSegment.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/ModeledSegment.java similarity index 97% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/multidimensional/model/ModeledSegment.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/ModeledSegment.java index abfad139019..f3e36851d40 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/multidimensional/model/ModeledSegment.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/ModeledSegment.java @@ -1,4 +1,4 @@ -package org.broadinstitute.hellbender.tools.copynumber.multidimensional.model; +package org.broadinstitute.hellbender.tools.copynumber.formats.records; import htsjdk.samtools.util.Locatable; import org.broadinstitute.hellbender.utils.SimpleInterval; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/MultidimensionalSegment.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/MultidimensionalSegment.java new file mode 100644 index 00000000000..c0cd8ac822c --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/MultidimensionalSegment.java @@ -0,0 +1,125 @@ +package org.broadinstitute.hellbender.tools.copynumber.formats.records; + +import htsjdk.samtools.util.Locatable; +import htsjdk.samtools.util.OverlapDetector; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; + +import java.util.List; +import java.util.stream.Collectors; + +public class MultidimensionalSegment implements Locatable { + private final SimpleInterval interval; + private final int numPointsCopyRatio; + private final int numPointsAlleleFraction; + private final double meanLog2CopyRatio; + + public MultidimensionalSegment(final SimpleInterval interval, + final int numPointsCopyRatio, + final int numPointsAlleleFraction, + final double meanLog2CopyRatio) { + Utils.nonNull(interval); + Utils.validateArg(numPointsCopyRatio > 0 || numPointsAlleleFraction > 0, + String.format("Number of copy-ratio points or number of allele-fraction points must be positive: %s", interval)); + this.interval = interval; + 
this.numPointsCopyRatio = numPointsCopyRatio; + this.numPointsAlleleFraction = numPointsAlleleFraction; + this.meanLog2CopyRatio = meanLog2CopyRatio; + } + + public MultidimensionalSegment(final SimpleInterval interval, + final List denoisedLog2CopyRatios, + final List allelicCounts) { + Utils.nonNull(interval); + Utils.nonNull(denoisedLog2CopyRatios); + Utils.nonNull(allelicCounts); + this.interval = interval; + numPointsCopyRatio = denoisedLog2CopyRatios.size(); + numPointsAlleleFraction = allelicCounts.size(); + meanLog2CopyRatio = new CopyRatioSegment(interval, denoisedLog2CopyRatios).getMeanLog2CopyRatio(); + } + + public MultidimensionalSegment(final SimpleInterval interval, + final OverlapDetector copyRatioMidpointOverlapDetector, + final OverlapDetector allelicCountOverlapDetector) { + this( + interval, + copyRatioMidpointOverlapDetector.getOverlaps(interval).stream() + .sorted(SampleLocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR) + .collect(Collectors.toList()), + allelicCountOverlapDetector.getOverlaps(interval).stream() + .sorted(SampleLocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR) + .collect(Collectors.toList())); + } + + @Override + public String getContig() { + return interval.getContig(); + } + + @Override + public int getStart() { + return interval.getStart(); + } + + @Override + public int getEnd() { + return interval.getEnd(); + } + + public SimpleInterval getInterval() { + return interval; + } + + public int getNumPointsCopyRatio() { + return numPointsCopyRatio; + } + + public int getNumPointsAlleleFraction() { + return numPointsAlleleFraction; + } + + public double getMeanLog2CopyRatio() { + return meanLog2CopyRatio; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + final MultidimensionalSegment that = (MultidimensionalSegment) o; + + return numPointsCopyRatio == that.numPointsCopyRatio && + numPointsAlleleFraction == that.numPointsAlleleFraction && + Double.compare(that.meanLog2CopyRatio, meanLog2CopyRatio) == 0 && + interval.equals(that.interval); + } + + @Override + public int hashCode() { + int result; + long temp; + result = interval.hashCode(); + result = 31 * result + numPointsCopyRatio; + result = 31 * result + numPointsAlleleFraction; + temp = Double.doubleToLongBits(meanLog2CopyRatio); + result = 31 * result + (int) (temp ^ (temp >>> 32)); + return result; + } + + @Override + public String toString() { + return "MultidimensionalSegment{" + + "interval=" + interval + + ", numPointsCopyRatio=" + numPointsCopyRatio + + ", numPointsAlleleFraction=" + numPointsAlleleFraction + + ", meanLog2CopyRatio=" + meanLog2CopyRatio + + '}'; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCount.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/SimpleCount.java similarity index 95% rename from src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCount.java rename to src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/SimpleCount.java index a74b04dc5f8..7e99d2b1e9e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCount.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/SimpleCount.java @@ -1,4 +1,4 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.readcount; +package 
org.broadinstitute.hellbender.tools.copynumber.formats.records; import htsjdk.samtools.util.Locatable; import org.broadinstitute.hellbender.utils.SimpleInterval; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionGlobalParameters.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionGlobalParameters.java new file mode 100644 index 00000000000..806fcdcb19b --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionGlobalParameters.java @@ -0,0 +1,66 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +/** + * Encapsulates the global parameters of the allele fraction model: the mean and variance of the common prior on + * allelic biases and the outlier probability. + * + * @author David Benjamin <davidben@broadinstitute.org> + */ +final class AlleleFractionGlobalParameters { + static final String DOUBLE_FORMAT = MultidimensionalModeller.DOUBLE_FORMAT; + + private final double meanBias; + private final double biasVariance; + private final double outlierProbability; + + AlleleFractionGlobalParameters(final double meanBias, + final double biasVariance, + final double outlierProbability) { + this.meanBias = meanBias; + this.biasVariance = biasVariance; + this.outlierProbability = outlierProbability; + } + + double getMeanBias() { + return meanBias; + } + + double getBiasVariance() { + return biasVariance; + } + + double getOutlierProbability() { + return outlierProbability; + } + + //get the gamma distribution alpha parameter + double getAlpha() { + return meanBias * meanBias / biasVariance; + } + + //get the gamma distribution beta parameter + double getBeta() { + return meanBias / biasVariance; + } + + AlleleFractionGlobalParameters copyWithNewMeanBias(final double newMeanBias) { + return new AlleleFractionGlobalParameters(newMeanBias, biasVariance, outlierProbability); + } + + AlleleFractionGlobalParameters copyWithNewBiasVariance(final double newBiasVariance) { + return new AlleleFractionGlobalParameters(meanBias, newBiasVariance, outlierProbability); + } + + AlleleFractionGlobalParameters copyWithNewOutlierProbability(final double newOutlierProbability) { + return new AlleleFractionGlobalParameters(meanBias, biasVariance, newOutlierProbability); + } + + @Override + public String toString() { + return String.format("AlleleFractionGlobalParameters{" + + "meanBias=" + DOUBLE_FORMAT + + ", biasVariance=" + DOUBLE_FORMAT + + ", outlierProbability=" + DOUBLE_FORMAT + + '}', meanBias, biasVariance, outlierProbability); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionInitializer.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionInitializer.java new file mode 100644 index 00000000000..287f1ad97f4 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionInitializer.java @@ -0,0 +1,185 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.apache.commons.math3.exception.MaxCountExceededException; +import org.apache.commons.math3.special.Beta; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; +import org.broadinstitute.hellbender.utils.OptimizationUtils; +import org.broadinstitute.hellbender.utils.Utils; + +import java.util.function.Function; +import java.util.stream.Collectors; +import 
java.util.stream.IntStream; + +/** + * The allele-fraction model (after marginalizing latent parameters as described in docs/CNVs/CNV-methods.pdf) + * contains the following parameters: + * 1. minor-allele fractions for each segment + * 2. a global outlier probability + * 3. the mean allelic bias + * 4. the rate (mean / variance) of the allelic bias + * + * Note that 3 and 4 are hyperparameters specifying a gamma distribution prior on allelic bias -- the latent variables + * for bias at each het site have been marginalized but the hyperparameters have not. + * + * The allele-fraction model samples the distribution of these parameters using Markov chain Monte Carlo and in principle + * an initialization step is not necessary. However, in practice this initialization finds the mode of the posterior + * distributions in only a few iterations, whereas sampling would require many more. Thus we greatly reduce the + * number of burn-in samples that we must discard. + * + * The initialization is straightforward: first we set the minor fractions to reasonable guesses based on alt and ref + * counts, assuming no allelic bias. Then we numerically maximize the likelihood with respect to each parameter until + * the likelihood converges to a maximum. In practice this is the unique global maximum. + * + * @author David Benjamin <davidben@broadinstitute.org> + * @author Samuel Lee <slee@broadinstitute.org> + */ +final class AlleleFractionInitializer { + private static final Logger logger = LogManager.getLogger(AlleleFractionInitializer.class); + + private static final double INITIAL_OUTLIER_PROBABILITY = 0.01; + private static final double INITIAL_MEAN_BIAS = 1.0; + private static final double INITIAL_BIAS_VARIANCE = 0.05; //this is an overestimate, but starting small makes it slow for + //mean bias to escape a bad initial guess + private static final AlleleFractionGlobalParameters INITIAL_GLOBAL_PARAMETERS = + new AlleleFractionGlobalParameters(INITIAL_MEAN_BIAS, INITIAL_BIAS_VARIANCE, INITIAL_OUTLIER_PROBABILITY); + + private static final double LOG_LIKELIHOOD_CONVERGENCE_THRESHOLD = 0.5; + private static final int MAX_ITERATIONS = 50; + + //define maxima of search intervals for maximum likelihood -- parameter values above these would be ridiculous + static final double MAX_REASONABLE_OUTLIER_PROBABILITY = 0.15; + static final double MAX_REASONABLE_MEAN_BIAS = 5.0; + static final double MAX_REASONABLE_BIAS_VARIANCE = 0.5; + private static final double EPSILON_FOR_NEAR_MAX_WARNING = 1E-2; + + //the minor-allele fraction of a segment must be less than one half by definition + private static final double MAX_MINOR_ALLELE_FRACTION = 0.5; + + private final AlleleFractionSegmentedData data; + private AlleleFractionGlobalParameters globalParameters; + private AlleleFractionState.MinorFractions minorFractions; + + /** + * This constructor performs the initialization. 
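+ * <p>
+ * Illustrative usage (a sketch added here, not in the original javadoc): callers typically chain
+ * construction with {@link #getInitializedState}, e.g.
+ * {@code final AlleleFractionState initialState = new AlleleFractionInitializer(data).getInitializedState();}
+ * </p>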
+ */ + AlleleFractionInitializer(final AlleleFractionSegmentedData data) { + this.data = Utils.nonNull(data); + globalParameters = INITIAL_GLOBAL_PARAMETERS; + minorFractions = calculateInitialMinorFractions(data); + double previousIterationLogLikelihood; + double nextIterationLogLikelihood = Double.NEGATIVE_INFINITY; + logger.info(String.format("Initializing allele-fraction model, iterating until log likelihood converges to within %.3f...", + LOG_LIKELIHOOD_CONVERGENCE_THRESHOLD)); + int iteration = 1; + do { + previousIterationLogLikelihood = nextIterationLogLikelihood; + globalParameters = new AlleleFractionGlobalParameters( + estimateMeanBias(), estimateBiasVariance(), estimateOutlierProbability()); + minorFractions = estimateMinorFractions(); + + nextIterationLogLikelihood = AlleleFractionLikelihoods.logLikelihood(globalParameters, minorFractions, data); + logger.info(String.format("Iteration %d, model log likelihood = %.3f...", iteration, nextIterationLogLikelihood)); + logger.info(globalParameters); + iteration++; + } while (iteration < MAX_ITERATIONS && + nextIterationLogLikelihood - previousIterationLogLikelihood > LOG_LIKELIHOOD_CONVERGENCE_THRESHOLD); + warnIfNearMax(AlleleFractionParameter.MEAN_BIAS.name, globalParameters.getMeanBias(), MAX_REASONABLE_MEAN_BIAS, EPSILON_FOR_NEAR_MAX_WARNING); + warnIfNearMax(AlleleFractionParameter.BIAS_VARIANCE.name, globalParameters.getBiasVariance(), MAX_REASONABLE_BIAS_VARIANCE, EPSILON_FOR_NEAR_MAX_WARNING); + warnIfNearMax(AlleleFractionParameter.OUTLIER_PROBABILITY.name, globalParameters.getOutlierProbability(), MAX_REASONABLE_OUTLIER_PROBABILITY, EPSILON_FOR_NEAR_MAX_WARNING); + } + + private static void warnIfNearMax(final String parameterName, + final double value, + final double maxValue, + final double epsilon) { + if (maxValue - value < epsilon) { + logger.warn(String.format("The maximum-likelihood estimate for the global parameter %s (%s) was near its boundary (%s), " + + "the model is likely not a good fit to the data! Consider changing parameters for filtering homozygous sites.", + parameterName, + String.format(AlleleFractionGlobalParameters.DOUBLE_FORMAT, value), + String.format(AlleleFractionGlobalParameters.DOUBLE_FORMAT, maxValue))); + } + } + + AlleleFractionState getInitializedState() { + return new AlleleFractionState( + globalParameters.getMeanBias(), globalParameters.getBiasVariance(), globalParameters.getOutlierProbability(), minorFractions); + } + + /** + *
+ * <p>
+ * Initialize minor fractions assuming no allelic bias.
+ * </p>
+ * <p>
+ * We integrate over f to get posterior probabilities (responsibilities) of alt / ref minor;
+ * that is, the responsibility of alt minor is int_{0 to 1/2} f^a (1 - f)^r df and the
+ * responsibility of ref minor is int_{0 to 1/2} f^r (1 - f)^a df.
+ * These are proportional to I(1/2, a + 1, r + 1) and I(1/2, r + 1, a + 1),
+ * respectively, where I is the (incomplete) regularized Beta function.
+ * By definition, these responsibilities sum to 1, i.e., they are already normalized.
+ * </p>
+ * <p>
+ * Finally, we set each minor fraction to the responsibility-weighted total count of
+ * reads in the minor allele divided by the total count of reads, ignoring outliers.
+ * </p>
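+ * <p>
+ * A worked example with illustrative numbers (not from the original docs): a het site with a = 10 alt reads
+ * and r = 90 ref reads has altMinorResponsibility = I(1/2, 11, 91), which is very nearly 1, so the site
+ * contributes ~10 reads to the minor-allele count and 100 reads to the total count; a segment containing
+ * only such a site is therefore initialized at (10 + 1) / (100 + 2) ~ 0.108.
+ * </p>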
+ */ + private AlleleFractionState.MinorFractions calculateInitialMinorFractions(final AlleleFractionSegmentedData data) { + final int numSegments = data.getNumSegments(); + final AlleleFractionState.MinorFractions result = new AlleleFractionState.MinorFractions(numSegments); + for (int segment = 0; segment < numSegments; segment++) { + double responsibilityWeightedMinorAlleleReadCount = 0.0; + double responsibilityWeightedTotalReadCount = 0.0; + for (final AllelicCount count : data.getIndexedAllelicCountsInSegment(segment)) { + final int a = count.getAltReadCount(); + final int r = count.getRefReadCount(); + double altMinorResponsibility; + try { + altMinorResponsibility = Beta.regularizedBeta(0.5, a + 1, r + 1); + } catch (final MaxCountExceededException e) { + altMinorResponsibility = a < r ? 1.0 : 0.0; //if the special function can't be computed, give an all-or-nothing responsibility + } + responsibilityWeightedMinorAlleleReadCount += altMinorResponsibility * a + (1 - altMinorResponsibility) * r; + responsibilityWeightedTotalReadCount += a + r; + } + + // we achieve a flat prior via a single pseudocount for minor and non-minor reads, hence the +1 and +2 + result.add((responsibilityWeightedMinorAlleleReadCount + 1)/(responsibilityWeightedTotalReadCount + 2)); + } + return result; + } + + private double estimateOutlierProbability() { + final Function objective = outlierProbability -> + AlleleFractionLikelihoods.logLikelihood(globalParameters.copyWithNewOutlierProbability(outlierProbability), minorFractions, data); + return OptimizationUtils.argmax(objective, 0.0, MAX_REASONABLE_OUTLIER_PROBABILITY, globalParameters.getOutlierProbability()); + } + + private double estimateMeanBias() { + final Function objective = meanBias -> + AlleleFractionLikelihoods.logLikelihood(globalParameters.copyWithNewMeanBias(meanBias), minorFractions, data); + return OptimizationUtils.argmax(objective, 0.0, MAX_REASONABLE_MEAN_BIAS, globalParameters.getMeanBias()); + } + + private double estimateBiasVariance() { + final Function objective = biasVariance -> + AlleleFractionLikelihoods.logLikelihood(globalParameters.copyWithNewBiasVariance(biasVariance), minorFractions, data); + return OptimizationUtils.argmax(objective, 0.0, MAX_REASONABLE_BIAS_VARIANCE, globalParameters.getBiasVariance()); + } + + private double estimateMinorFraction(final int segment) { + final Function objective = minorFraction -> + AlleleFractionLikelihoods.segmentLogLikelihood(globalParameters, minorFraction, data.getIndexedAllelicCountsInSegment(segment)); + return OptimizationUtils.argmax(objective, 0.0, MAX_MINOR_ALLELE_FRACTION, minorFractions.get(segment)); + } + + private AlleleFractionState.MinorFractions estimateMinorFractions() { + return new AlleleFractionState.MinorFractions( + IntStream.range(0, data.getNumSegments()).boxed() + .map(this::estimateMinorFraction) + .collect(Collectors.toList())); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionLikelihoods.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionLikelihoods.java new file mode 100644 index 00000000000..40abdc00ff4 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionLikelihoods.java @@ -0,0 +1,188 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.apache.commons.math3.special.Gamma; +import org.apache.commons.math3.util.FastMath; +import 
org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; +import org.broadinstitute.hellbender.utils.GATKProtectedMathUtils; + +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.IntStream; + +import static org.apache.commons.math3.util.FastMath.sqrt; +import static org.broadinstitute.hellbender.utils.MathUtils.log10Factorial; +import static org.broadinstitute.hellbender.utils.MathUtils.log10ToLog; + +/** + * Contains likelihood methods for the allele-fraction model. + * See docs/CNVs/CNV-methods.pdf for a thorough description of the model. + * + * We can compute the log-likelihood of a alt reads and r ref reads given minor fraction f and gamma hyperparameters + * (specifying the distribution on allelic biases) mu (mean) and beta (rate = mean/variance) and given + * an alt minor, ref minor, or outlier indicator state. Note that this is a partially collapsed log-likelihood in that the + * latent variable corresponding to the allelic bias at this site has been marginalized out but the indicator + * variable has not been marginalized out. + *
+ * <p>
+ * See docs/CNVs/CNV-methods.pdf for the derivation.
+ * </p>
+ * <p>
+ * Finally, note that the likelihood methods here are static and do not get mu, beta, and minorFraction
+ * from an AlleleFractionState object. We need such functionality because MCMC evaluates the likelihood
+ * under proposed parameter changes.
+ * </p>
+ * <p>
+ * if indicator == ALT_MINOR:
+ * log { [beta^alpha / Gamma(alpha)] [(1 - pi) / 2] * int_{0 to infty} f^a * (1 - f)^r * lambda^(alpha + r - 1) * exp(-beta * lambda) / (f + (1 - f) * lambda)^n d lambda }
+ * </p>
+ * <p>
+ * if indicator == REF_MINOR, same as ALT_MINOR but with f <--> 1 - f;
+ * </p>
+ * <p>
+ * if indicator == OUTLIER: log { pi * a! r! / (n + 1)! },
+ * </p>
+ * where alpha = mu*beta and n = a + r. + * + * @author David Benjamin <davidben@broadinstitute.org> + * @author Samuel Lee <slee@broadinstitute.org> + */ +final class AlleleFractionLikelihoods { + private static final double EPSILON = 1E-10; + + private static final FunctionCache logGammaCache = new FunctionCache(Gamma::logGamma); + private static final FunctionCache logCache = new FunctionCache(FastMath::log); + + private static final class FunctionCache extends LinkedHashMap { + private static final long serialVersionUID = 19841647L; + private static final int MAX_SIZE = 100_000; + + private final Function mappingFunction; + + FunctionCache(final Function mappingFunction) { + this.mappingFunction = mappingFunction; + } + + Double computeIfAbsent(final Double key) { + return super.computeIfAbsent(key, mappingFunction); + } + + @Override + protected boolean removeEldestEntry(final Map.Entry eldest) { + return size() >= MAX_SIZE; + } + } + + private AlleleFractionLikelihoods() {} + + static double segmentLogLikelihood(final AlleleFractionGlobalParameters parameters, + final double minorFraction, + final List allelicCountsInSegment) { + final double alpha = parameters.getAlpha(); + final double beta = parameters.getBeta(); + final double pi = parameters.getOutlierProbability(); + + //we compute some quantities that will be reused + final double logPi = logCache.computeIfAbsent(pi); + final double logNotPi = logCache.computeIfAbsent((1 - pi) / 2); + final double logcCommon = alpha * logCache.computeIfAbsent(beta) - logGammaCache.computeIfAbsent(alpha); + final double majorFraction = 1 - minorFraction; + final double logMinorFraction = log(minorFraction); + final double logMajorFraction = log(majorFraction); + + double logLikelihood = 0.; + for (final AllelicCount allelicCount : allelicCountsInSegment) { + final int a = allelicCount.getAltReadCount(); + final int r = allelicCount.getRefReadCount(); + final int n = a + r; + + //alt-minor calculation + final double lambda0AltMinor = biasPosteriorMode(alpha, beta, minorFraction, a, r); + final double kappaAltMinor = biasPosteriorCurvature(alpha, minorFraction, r, n, lambda0AltMinor); + final double rhoAltMinor = biasPosteriorEffectiveAlpha(lambda0AltMinor, kappaAltMinor); + final double tauAltMinor = biasPosteriorEffectiveBeta(lambda0AltMinor, kappaAltMinor); + final double logcAltMinor = logcCommon + a * logMinorFraction + r * logMajorFraction + + (r + alpha - rhoAltMinor) * log(lambda0AltMinor) + (tauAltMinor - beta) * lambda0AltMinor + - n * log(minorFraction + majorFraction * lambda0AltMinor); + final double altMinorLogLikelihood = logNotPi + logcAltMinor + Gamma.logGamma(rhoAltMinor) - rhoAltMinor * log(tauAltMinor); + + //ref-minor calculation + final double lambda0RefMinor = biasPosteriorMode(alpha, beta, majorFraction, a, r); + final double kappaRefMinor = biasPosteriorCurvature(alpha, majorFraction, r, n, lambda0RefMinor); + final double rhoRefMinor = biasPosteriorEffectiveAlpha(lambda0RefMinor, kappaRefMinor); + final double tauRefMinor = biasPosteriorEffectiveBeta(lambda0RefMinor, kappaRefMinor); + final double logcRefMinor = logcCommon + a * logMajorFraction + r * logMinorFraction + + (r + alpha - rhoRefMinor) * log(lambda0RefMinor) + (tauRefMinor - beta) * lambda0RefMinor + - n * log(majorFraction + minorFraction * lambda0RefMinor); + final double refMinorLogLikelihood = logNotPi + logcRefMinor + Gamma.logGamma(rhoRefMinor) - rhoRefMinor * log(tauRefMinor); + + final double outlierLogLikelihood = logPi + 
log10ToLog(log10Factorial(a) + log10Factorial(r) - log10Factorial(a + r + 1)); + + logLikelihood += GATKProtectedMathUtils.logSumExp(altMinorLogLikelihood, refMinorLogLikelihood, outlierLogLikelihood); + } + return logLikelihood; + } + + /** + * The total log likelihood of all segments. + */ + static double logLikelihood(final AlleleFractionGlobalParameters parameters, + final AlleleFractionState.MinorFractions minorFractions, + final AlleleFractionSegmentedData data) { + return IntStream.range(0, data.getNumSegments()) + .mapToDouble(segment -> segmentLogLikelihood(parameters, minorFractions.get(segment), data.getIndexedAllelicCountsInSegment(segment))) + .sum(); + } + + /** + * Calculates the mode of the exact allelic-bias posterior at given values of the hyperparameters for the + * * allelic-bias Gamma-distribution prior, the minor-allele fraction parameter, and the observed + * counts at a site. See docs/CNVs/CNV-methods.pdf (where this quantity is referred to as lambda_0) for details. + * @param alpha alpha hyperparameter for allelic-bias Gamma-distribution prior + * @param beta beta hyperparameter for allelic-bias Gamma-distribution prior + * @param f minor-allele fraction + * @param a alt counts + * @param r ref counts + */ + private static double biasPosteriorMode(final double alpha, final double beta, final double f, final int a, final int r) { + final double w = (1 - f) * (a - alpha + 1) + beta * f; + return Math.max((sqrt(w * w + 4 * beta * f * (1 - f) * (r + alpha - 1)) - w) / (2 * beta * (1 - f)), EPSILON); + } + + /** + * Calculates the curvature (second derivative at the mode) of the exact allelic-bias log posterior + * at given values of the hyperparameters for the allelic-bias Gamma-distribution prior, + * the minor-allele fraction parameter, and the observed counts at a site. + * See docs/CNVs/CNV-methods.pdf (where this quantity is referred to as kappa) for details. + * @param alpha alpha hyperparameter for allelic-bias Gamma-distribution prior + * @param f minor-allele fraction + * @param r ref counts + * @param n total counts + * @param lambda0 mode of allelic-bias posterior + */ + private static double biasPosteriorCurvature(final double alpha, final double f, final int r, final int n, final double lambda0) { + final double y = (1 - f) / (f + (1 - f) * lambda0); + return n * y * y - (r + alpha - 1) / (lambda0 * lambda0); + } + + /** + * Calculates the effective alpha hyperparameter for the Gamma-distribution approximation of the exact allelic-bias posterior. + * See docs/CNVs/CNV-methods.pdf (where this quantity is referred to as rho) for details. + * @param lambda0 mode of allelic-bias posterior + * @param kappa curvature of allelic-bias posterior + */ + private static double biasPosteriorEffectiveAlpha(final double lambda0, final double kappa) { + return Math.max(1 - kappa * lambda0 * lambda0, EPSILON); + } + + /** + * Calculates the effective beta hyperparameter for the Gamma-distribution approximation of the exact allelic-bias posterior. + * See docs/CNVs/CNV-methods.pdf (where this quantity is referred to as tau) for details. 
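+ * <p>
+ * A consistency check (added for clarity): with rho = 1 - kappa * lambda0^2 and tau = -kappa * lambda0,
+ * the approximating Gamma(rho, tau) density has mode (rho - 1) / tau = lambda0 and log-density curvature
+ * -(rho - 1) / lambda0^2 = kappa at that mode, so it matches the mode and curvature of the exact
+ * allelic-bias posterior by construction.
+ * </p>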
+ * @param lambda0 mode of allelic-bias posterior + * @param kappa curvature of allelic-bias posterior + */ + private static double biasPosteriorEffectiveBeta(final double lambda0, final double kappa) { + return Math.max(-kappa * lambda0, EPSILON); + } + + private static double log(final double x) { + return FastMath.log(Math.max(EPSILON, x)); + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionModeller.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionModeller.java new file mode 100644 index 00000000000..d097d427392 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionModeller.java @@ -0,0 +1,193 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ParameterDecileCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.ModeledSegment; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.mcmc.DecileCollection; +import org.broadinstitute.hellbender.utils.mcmc.GibbsSampler; +import org.broadinstitute.hellbender.utils.mcmc.ParameterSampler; +import org.broadinstitute.hellbender.utils.mcmc.ParameterizedModel; + +import java.util.*; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Given segments and counts of alt and ref reads over a list of het sites, + * infers the minor-allele fraction of each segment. For example, a segment + * with (alt,ref) counts (10,90), (11,93), (88,12), (90,10) probably has a minor-allele fraction + * somewhere around 0.1. The model takes into account allelic bias due to mapping etc. by learning + * a global gamma distribution on allelic bias ratios. + *
+ * <p>
+ * We define the bias ratio of each het locus to be the expected ratio of
+ * mapped ref reads to mapped alt reads given equal amounts of DNA (that is, given
+ * a germline het). The model learns a common gamma distribution:
+ * bias ratio ~ Gamma(alpha = mu^2 / sigma^2, beta = mu / sigma^2)
+ * where mu and sigma^2 are the global mean and variance of bias ratios, and
+ * alpha, beta are the natural parameters of the gamma distribution.
+ * </p>
+ * <p>
+ * Each segment has a minor-allele fraction f, and for each het within the locus
+ * the number of alt reads is drawn from a binomial distribution with total count
+ * n = #alt reads + #ref reads and alt probability f / (f + (1 - f) * bias ratio) if the
+ * locus is alt minor and (1 - f) / (1 - f + f * bias ratio) if the locus is ref minor.
+ * </p>
+ * <p>
+ * Conceptually, the model contains latent variables corresponding to the bias ratio
+ * and indicators for alt minor/ref minor at each het locus. However, we integrate them
+ * out and the MCMC model below only contains the minor-allele fractions and
+ * the three hyperparameters of the model: the two parameters of the gamma distribution
+ * along with the global outlier probability.
+ * </p>
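+ * <p>
+ * For intuition, an illustrative example (numbers not from the original description): at an alt-minor het
+ * in a segment with f = 0.1 and a locus bias ratio of 1.2, the probability that a read is an alt read is
+ * 0.1 / (0.1 + 0.9 * 1.2) ~ 0.085, slightly below f because the bias favors ref reads.
+ * </p>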
+ * See docs/CNVs/CNV-methods.pdf for a thorough description of the model. + * + * @author David Benjamin <davidben@broadinstitute.org> + * @author Samuel Lee <slee@broadinstitute.org> + */ +public final class AlleleFractionModeller { + private static final String DOUBLE_FORMAT = MultidimensionalModeller.DOUBLE_FORMAT; + + private static final double MAX_REASONABLE_MEAN_BIAS = AlleleFractionInitializer.MAX_REASONABLE_MEAN_BIAS; + private static final double MAX_REASONABLE_BIAS_VARIANCE = AlleleFractionInitializer.MAX_REASONABLE_BIAS_VARIANCE; + private static final double MAX_REASONABLE_OUTLIER_PROBABILITY = AlleleFractionInitializer.MAX_REASONABLE_OUTLIER_PROBABILITY; + private static final double MIN_MINOR_FRACTION_SAMPLING_WIDTH = 1E-3; + + private final SampleMetadata sampleMetadata; + private final ParameterizedModel model; + + private final List meanBiasSamples = new ArrayList<>(); + private final List biasVarianceSamples = new ArrayList<>(); + private final List outlierProbabilitySamples = new ArrayList<>(); + private final List minorFractionsSamples = new ArrayList<>(); + + /** + * Constructs an allele-fraction model given allelic counts and segments. + * {@link AlleleFractionInitializer} is used for initialization and slice-sampling widths are estimated. + */ + AlleleFractionModeller(final AllelicCountCollection allelicCounts, + final List segments, + final AlleleFractionPrior prior) { + Utils.nonNull(allelicCounts); + Utils.nonEmpty(segments); + Utils.nonNull(prior); + + sampleMetadata = allelicCounts.getSampleMetadata(); + final AlleleFractionSegmentedData data = new AlleleFractionSegmentedData(allelicCounts, segments); + + //initialization gets us to the mode of the likelihood + final AlleleFractionState initialState = new AlleleFractionInitializer(data).getInitializedState(); + final AlleleFractionGlobalParameters initialParameters = initialState.globalParameters(); + final AlleleFractionState.MinorFractions initialMinorFractions = initialState.minorFractions(); + + //if we approximate conditionals as normal, we can guess the width from the curvature at the mode and use as the slice-sampling widths + final double meanBiasSamplingWidths = approximatePosteriorWidthAtMode(meanBias -> + AlleleFractionLikelihoods.logLikelihood(initialParameters.copyWithNewMeanBias(meanBias), initialMinorFractions, data), initialParameters.getMeanBias()); + final double biasVarianceSamplingWidths = approximatePosteriorWidthAtMode(biasVariance -> + AlleleFractionLikelihoods.logLikelihood(initialParameters.copyWithNewBiasVariance(biasVariance), initialMinorFractions, data), initialParameters.getBiasVariance()); + final double outlierProbabilitySamplingWidths = approximatePosteriorWidthAtMode(outlierProbability -> + AlleleFractionLikelihoods.logLikelihood(initialParameters.copyWithNewOutlierProbability(outlierProbability), initialMinorFractions, data), initialParameters.getOutlierProbability()); + + final List minorFractionsSliceSamplingWidths = IntStream.range(0, data.getNumSegments()).boxed() + .map(segment -> approximatePosteriorWidthAtMode( + f -> AlleleFractionLikelihoods.segmentLogLikelihood(initialParameters, f, data.getIndexedAllelicCountsInSegment(segment)), initialMinorFractions.get(segment))) + .map(w -> Math.max(w, MIN_MINOR_FRACTION_SAMPLING_WIDTH)) + .collect(Collectors.toList()); + + final ParameterSampler meanBiasSampler = + new AlleleFractionSamplers.MeanBiasSampler(MAX_REASONABLE_MEAN_BIAS, meanBiasSamplingWidths); + final ParameterSampler biasVarianceSampler = + new 
AlleleFractionSamplers.BiasVarianceSampler(MAX_REASONABLE_BIAS_VARIANCE, biasVarianceSamplingWidths); + final ParameterSampler outlierProbabilitySampler = + new AlleleFractionSamplers.OutlierProbabilitySampler(MAX_REASONABLE_OUTLIER_PROBABILITY, outlierProbabilitySamplingWidths); + final ParameterSampler minorFractionsSampler = + new AlleleFractionSamplers.MinorFractionsSampler(prior, minorFractionsSliceSamplingWidths); + + model = new ParameterizedModel.GibbsBuilder<>(initialState, data) + .addParameterSampler(AlleleFractionParameter.MEAN_BIAS, meanBiasSampler, Double.class) + .addParameterSampler(AlleleFractionParameter.BIAS_VARIANCE, biasVarianceSampler, Double.class) + .addParameterSampler(AlleleFractionParameter.OUTLIER_PROBABILITY, outlierProbabilitySampler, Double.class) + .addParameterSampler(AlleleFractionParameter.MINOR_ALLELE_FRACTIONS, minorFractionsSampler, AlleleFractionState.MinorFractions.class) + .build(); + } + + /** + * Adds {@code numSamples - numBurnIn} Markov-Chain Monte-Carlo samples of the parameter posteriors (generated using + * Gibbs sampling) to the collections held internally. The current {@link AlleleFractionState} held internally is used + * to initialize the Markov Chain. + * @param numSamples total number of samples per posterior + * @param numBurnIn number of burn-in samples to discard + */ + void fitMCMC(final int numSamples, final int numBurnIn) { + //run MCMC + final GibbsSampler gibbsSampler = new GibbsSampler<>(numSamples, model); + gibbsSampler.runMCMC(); + + //update posterior samples + meanBiasSamples.addAll(gibbsSampler.getSamples(AlleleFractionParameter.MEAN_BIAS, Double.class, numBurnIn)); + biasVarianceSamples.addAll(gibbsSampler.getSamples(AlleleFractionParameter.BIAS_VARIANCE, Double.class, numBurnIn)); + outlierProbabilitySamples.addAll(gibbsSampler.getSamples(AlleleFractionParameter.OUTLIER_PROBABILITY, Double.class, numBurnIn)); + minorFractionsSamples.addAll(gibbsSampler.getSamples(AlleleFractionParameter.MINOR_ALLELE_FRACTIONS, AlleleFractionState.MinorFractions.class, numBurnIn)); + } + + List getMeanBiasSamples() { + return Collections.unmodifiableList(meanBiasSamples); + } + + List getBiasVarianceSamples() { + return Collections.unmodifiableList(biasVarianceSamples); + } + + List getOutlierProbabilitySamples() { + return Collections.unmodifiableList(outlierProbabilitySamples); + } + + List getMinorFractionsSamples() { + return Collections.unmodifiableList(minorFractionsSamples); + } + + /** + * Should only be called after {@link #fitMCMC} has been called. + */ + List getMinorAlleleFractionsPosteriorSummaries() { + if (minorFractionsSamples.isEmpty()) { + throw new IllegalStateException("Attempted to get posterior summaries for minor-allele fractions before MCMC was performed."); + } + final int numSegments = minorFractionsSamples.get(0).size(); + final List posteriorSummaries = new ArrayList<>(numSegments); + for (int segment = 0; segment < numSegments; segment++) { + final int j = segment; + final List minorFractionSamples = + minorFractionsSamples.stream().map(s -> s.get(j)).collect(Collectors.toList()); + posteriorSummaries.add(new ModeledSegment.SimplePosteriorSummary(minorFractionSamples)); + } + return posteriorSummaries; + } + + /** + * Should only be called after {@link #fitMCMC} has been called. 
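+ * <p>
+ * Illustrative usage (a sketch): {@code modeller.fitMCMC(numSamples, numBurnIn);} followed by
+ * {@code modeller.getGlobalParameterDeciles();}; an {@link IllegalStateException} is thrown if no
+ * samples have been collected.
+ * </p>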
+ */ + ParameterDecileCollection getGlobalParameterDeciles() { + if (meanBiasSamples.isEmpty()) { + throw new IllegalStateException("Attempted to get posterior summaries for global parameters before MCMC was performed."); + } + final Map parameterToDecilesMap = new LinkedHashMap<>(); + parameterToDecilesMap.put(AlleleFractionParameter.MEAN_BIAS, new DecileCollection(meanBiasSamples)); + parameterToDecilesMap.put(AlleleFractionParameter.BIAS_VARIANCE, new DecileCollection(biasVarianceSamples)); + parameterToDecilesMap.put(AlleleFractionParameter.OUTLIER_PROBABILITY, new DecileCollection(outlierProbabilitySamples)); + return new ParameterDecileCollection<>(sampleMetadata, parameterToDecilesMap, AlleleFractionParameter.class, DOUBLE_FORMAT); + } + + //use width of a probability distribution given the position of its mode (estimated from Gaussian approximation) as step size + private static double approximatePosteriorWidthAtMode(final Function logPDF, + final double mode) { + final double absMode = Math.abs(mode); + final double epsilon = Math.min(1E-6, absMode / 2); //adjust scale if mode is very near zero + final double defaultWidth = absMode / 10; //if "mode" is not close to true mode of logPDF, approximation may not apply; just use 1 / 10 of absMode in this case + final double secondDerivative = (logPDF.apply(mode + epsilon) - 2 * logPDF.apply(mode) + logPDF.apply(mode - epsilon)) / (epsilon * epsilon); + return secondDerivative < 0 ? Math.sqrt(-1.0 / secondDerivative) : defaultWidth; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionParameter.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionParameter.java new file mode 100644 index 00000000000..cff195ea12a --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionParameter.java @@ -0,0 +1,21 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.broadinstitute.hellbender.utils.mcmc.ParameterEnum; + +/** + * Enumerates the parameters for {@link AlleleFractionState}. + * + * @author Samuel Lee <slee@broadinstitute.org> + */ +enum AlleleFractionParameter implements ParameterEnum { + MEAN_BIAS("AF_reference_bias_mean"), + BIAS_VARIANCE("AF_reference_bias_variance"), + OUTLIER_PROBABILITY("AF_outlier_probability"), + MINOR_ALLELE_FRACTIONS("AF_minor_allele_fractions"); + + final String name; + + AlleleFractionParameter(final String name) { + this.name = name; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionPrior.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionPrior.java new file mode 100644 index 00000000000..db7ac3132f6 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionPrior.java @@ -0,0 +1,23 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.broadinstitute.hellbender.utils.Utils; + +/** + * Represents priors for the allele-fraction model. 
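+ * <p>
+ * Note (added for clarity): the single hyperparameter here is the alpha of a beta-distribution prior on
+ * 2 * minorAlleleFraction, with beta fixed to 1 in the minor-fraction sampler; alpha = 1 therefore yields
+ * a flat prior on [0, 1/2], and larger alpha shifts prior mass toward a minor-allele fraction of 1/2.
+ * </p>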
+ * + * @author Samuel Lee <slee@broadinstitute.org> + */ +public final class AlleleFractionPrior { + private final double minorAlleleFractionPriorAlpha; + + public AlleleFractionPrior(final double minorAlleleFractionPriorAlpha) { + Utils.validateArg(minorAlleleFractionPriorAlpha >= 1, + "Alpha hyperparameter for the 4-parameter beta-distribution prior on " + + "segment minor-allele fraction must be greater than or equal to one."); + this.minorAlleleFractionPriorAlpha = minorAlleleFractionPriorAlpha; + } + + double getMinorAlleleFractionPriorAlpha() { + return minorAlleleFractionPriorAlpha; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSamplers.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSamplers.java new file mode 100644 index 00000000000..6b474e20aad --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSamplers.java @@ -0,0 +1,184 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.apache.commons.math3.distribution.BetaDistribution; +import org.apache.commons.math3.random.RandomGenerator; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.utils.mcmc.ParameterSampler; +import org.broadinstitute.hellbender.utils.mcmc.SliceSampler; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Sampler classes for the allele-fraction model. + * + * @author David Benjamin <davidben@broadinstitute.org> + * @author Samuel Lee <slee@broadinstitute.org> + */ +final class AlleleFractionSamplers { + private static final Logger logger = LogManager.getLogger(AlleleFractionSamplers.class); + + private static final int NUM_POINTS_GLOBAL_SUBSAMPLE_THRESHOLD = 10000; + private static final int NUM_POINTS_SEGMENT_SUBSAMPLE_THRESHOLD = 1000; + + private AlleleFractionSamplers() {} + + static final class MeanBiasSampler implements ParameterSampler { + private static final double MIN_MEAN_BIAS = 0.; + + private final double maxMeanBias; + private final double meanBiasSliceSamplingWidth; + + MeanBiasSampler(final double maxMeanBias, + final double meanBiasSliceSamplingWidth) { + this.maxMeanBias = maxMeanBias; + this.meanBiasSliceSamplingWidth = meanBiasSliceSamplingWidth; + } + + @Override + public Double sample(final RandomGenerator rng, + final AlleleFractionState state, + final AlleleFractionSegmentedData data) { + logger.debug("Sampling mean bias..."); + final Function logLikelihoodEstimate = logLikelihoodFromSubsample( + rng, state.minorFractions(), data, NUM_POINTS_GLOBAL_SUBSAMPLE_THRESHOLD); + return new SliceSampler(rng, + x -> logLikelihoodEstimate.apply(state.globalParameters().copyWithNewMeanBias(x)), + MIN_MEAN_BIAS, maxMeanBias, meanBiasSliceSamplingWidth) + .sample(state.meanBias()); + } + } + + static final class BiasVarianceSampler implements ParameterSampler { + private static final double MIN_BIAS_VARIANCE = 1E-10; + + private final double maxBiasVariance; + private final double biasVarianceSliceSamplingWidth; + + BiasVarianceSampler(final double maxBiasVariance, + final double biasVarianceSliceSamplingWidth) { + this.maxBiasVariance = maxBiasVariance; + this.biasVarianceSliceSamplingWidth = biasVarianceSliceSamplingWidth; + } + + @Override + public Double sample(final RandomGenerator rng, + final 
AlleleFractionState state, + final AlleleFractionSegmentedData data) { + logger.debug("Sampling bias variance..."); + final Function logLikelihoodEstimate = logLikelihoodFromSubsample( + rng, state.minorFractions(), data, NUM_POINTS_GLOBAL_SUBSAMPLE_THRESHOLD); + return new SliceSampler(rng, + x -> logLikelihoodEstimate.apply(state.globalParameters().copyWithNewBiasVariance(x)), + MIN_BIAS_VARIANCE, maxBiasVariance, biasVarianceSliceSamplingWidth) + .sample(state.biasVariance()); + } + } + + static final class OutlierProbabilitySampler implements ParameterSampler { + private static final double MIN_OUTLIER_PROBABILITY = 0.; + + private final double maxOutlierProbability; + private final double outlierProbabilitySliceSamplingWidth; + + OutlierProbabilitySampler(final double maxOutlierProbability, + final double outlierProbabilitySliceSamplingWidth) { + this.maxOutlierProbability = maxOutlierProbability; + this.outlierProbabilitySliceSamplingWidth = outlierProbabilitySliceSamplingWidth; + } + + @Override + public Double sample(final RandomGenerator rng, + final AlleleFractionState state, + final AlleleFractionSegmentedData data) { + logger.debug("Sampling outlier probability..."); + final Function logLikelihoodEstimate = logLikelihoodFromSubsample( + rng, state.minorFractions(), data, NUM_POINTS_GLOBAL_SUBSAMPLE_THRESHOLD); + return new SliceSampler(rng, + x -> logLikelihoodEstimate.apply(state.globalParameters().copyWithNewOutlierProbability(x)), + MIN_OUTLIER_PROBABILITY, maxOutlierProbability, outlierProbabilitySliceSamplingWidth) + .sample(state.outlierProbability()); + } + } + + // sample minor fractions of all segments + static final class MinorFractionsSampler implements ParameterSampler { + private static double MIN_MINOR_FRACTION = 0.; + private static double MAX_MINOR_FRACTION = 0.5; + private static final double PRIOR_BETA = 1.; + + private final Function logPrior; + private final List sliceSamplingWidths; + + MinorFractionsSampler(final AlleleFractionPrior prior, + final List sliceSamplingWidths) { + logPrior = f -> new BetaDistribution(null, prior.getMinorAlleleFractionPriorAlpha(), PRIOR_BETA).logDensity(2 * f); + this.sliceSamplingWidths = sliceSamplingWidths; + } + + @Override + public AlleleFractionState.MinorFractions sample(final RandomGenerator rng, final AlleleFractionState state, final AlleleFractionSegmentedData data) { + final List minorFractions = new ArrayList<>(data.getNumSegments()); + for (int segment = 0; segment < data.getNumSegments(); segment++) { + logger.debug(String.format("Sampling minor fraction for segment %d...", segment)); + final List allelicCountsInSegment = + data.getIndexedAllelicCountsInSegment(segment); + if (allelicCountsInSegment.isEmpty()){ + minorFractions.add(Double.NaN); + } else { + final Function segmentLogLikelihoodEstimate = segmentLogLikelihoodFromSubsample( + rng, state.globalParameters(), allelicCountsInSegment, NUM_POINTS_SEGMENT_SUBSAMPLE_THRESHOLD); + final SliceSampler sampler = new SliceSampler(rng, + f -> logPrior.apply(f) + segmentLogLikelihoodEstimate.apply(f), + MIN_MINOR_FRACTION, MAX_MINOR_FRACTION, sliceSamplingWidths.get(segment)); + minorFractions.add(sampler.sample(state.segmentMinorFraction(segment))); + } + } + return new AlleleFractionState.MinorFractions(minorFractions); + } + } + + private static List subsample(final RandomGenerator rng, + final List allelicCounts, + final int numPointsSubsampleThreshold) { + //subsample the data if we are above the threshold + return allelicCounts.size() > 
numPointsSubsampleThreshold + ? IntStream.range(0, numPointsSubsampleThreshold).boxed().map(i -> rng.nextInt(allelicCounts.size())).map(allelicCounts::get).collect(Collectors.toList()) + : allelicCounts; + } + + private static Function logLikelihoodFromSubsample(final RandomGenerator rng, + final AlleleFractionState.MinorFractions minorFractions, + final AlleleFractionSegmentedData data, + final int numPointsSubsampleThreshold) { + final List subsampledAllelicCounts = + subsample(rng, data.getIndexedAllelicCounts(), numPointsSubsampleThreshold); + final double scalingFactor = (double) data.getNumPoints() / subsampledAllelicCounts.size(); + final Map> segmentIndexToSubsampledAllelicCountsInSegmentMap = + subsampledAllelicCounts.stream() + .collect(Collectors.groupingBy(AlleleFractionSegmentedData.IndexedAllelicCount::getSegmentIndex, Collectors.toList())); + return parameters -> { + double logLikelihood = 0.; + for (final int segmentIndex : segmentIndexToSubsampledAllelicCountsInSegmentMap.keySet()) { + logLikelihood += AlleleFractionLikelihoods.segmentLogLikelihood( + parameters, minorFractions.get(segmentIndex), segmentIndexToSubsampledAllelicCountsInSegmentMap.get(segmentIndex)); + } + return scalingFactor * logLikelihood; + }; + } + + private static Function segmentLogLikelihoodFromSubsample(final RandomGenerator rng, + final AlleleFractionGlobalParameters parameters, + final List allelicCountsInSegment, + final int numPointsSubsampleThreshold) { + final List subsampledAllelicCountsInSegment = + subsample(rng, allelicCountsInSegment, numPointsSubsampleThreshold); + final double scalingFactor = (double) allelicCountsInSegment.size() / subsampledAllelicCountsInSegment.size(); + return minorFraction -> scalingFactor * AlleleFractionLikelihoods.segmentLogLikelihood(parameters, minorFraction, subsampledAllelicCountsInSegment); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSegmentedData.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSegmentedData.java new file mode 100644 index 00000000000..afd35c6e28e --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSegmentedData.java @@ -0,0 +1,100 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import htsjdk.samtools.util.OverlapDetector; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; +import org.broadinstitute.hellbender.utils.IndexRange; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.mcmc.DataCollection; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * {@link DataCollection} for the allele-fraction model containing the het alt and ref counts grouped by segment. 
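+ * <p>
+ * An illustrative example (hypothetical coordinates): given segments 1:1-100 and 1:101-200 and het sites
+ * at 1:50, 1:60, and 1:150, the flattened list of IndexedAllelicCounts has indices 0-2 and the per-segment
+ * index ranges are [0, 2) and [2, 3), so {@code getIndexedAllelicCountsInSegment(0)} returns the first
+ * two sites.
+ * </p>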
+ * + * @author Samuel Lee <slee@broadinstitute.org> + */ +final class AlleleFractionSegmentedData implements DataCollection { + private final AllelicCountCollection allelicCounts; + private final List segments; + + private final List indexedAllelicCounts; + private final List indexRangesPerSegment; + + AlleleFractionSegmentedData(final AllelicCountCollection allelicCounts, + final List segments) { + this.allelicCounts = Utils.nonNull(allelicCounts); + this.segments = Utils.nonEmpty(segments).stream().sorted(SampleLocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR).collect(Collectors.toList()); + + indexedAllelicCounts = new ArrayList<>(allelicCounts.size()); + indexRangesPerSegment = new ArrayList<>(segments.size()); + + final OverlapDetector allelicCountOverlapDetector = allelicCounts.getOverlapDetector(); + int startIndex = 0; + for (int segmentIndex = 0; segmentIndex < segments.size(); segmentIndex++) { + final SimpleInterval segment = segments.get(segmentIndex); + final List allelicCountsInSegment = allelicCountOverlapDetector.getOverlaps(segment).stream() + .sorted(SampleLocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR) + .collect(Collectors.toList()); + final int segmentStartIndex = startIndex; + final int si = segmentIndex; + IntStream.range(0, allelicCountsInSegment.size()).boxed() + .map(i -> new IndexedAllelicCount(allelicCountsInSegment.get(i), segmentStartIndex + i, si)) + .forEach(indexedAllelicCounts::add); + indexRangesPerSegment.add(new IndexRange(segmentStartIndex, segmentStartIndex + allelicCountsInSegment.size())); + startIndex += allelicCountsInSegment.size(); + } + } + + AllelicCountCollection getAllelicCounts() { + return allelicCounts; + } + + List getSegments() { + return Collections.unmodifiableList(segments); + } + + int getNumSegments() { + return segments.size(); + } + + int getNumPoints() { + return allelicCounts.size(); + } + + List getIndexedAllelicCounts() { + return Collections.unmodifiableList(indexedAllelicCounts); + } + + List getIndexedAllelicCountsInSegment(final int segmentIndex) { + return Collections.unmodifiableList(indexedAllelicCounts.subList( + indexRangesPerSegment.get(segmentIndex).from, indexRangesPerSegment.get(segmentIndex).to)); + } + + static final class IndexedAllelicCount extends AllelicCount { + private final int index; + private final int segmentIndex; + + private IndexedAllelicCount(final AllelicCount allelicCount, + final int index, + final int segmentIndex) { + super(allelicCount.getInterval(), allelicCount.getRefReadCount(), allelicCount.getAltReadCount(), allelicCount.getRefNucleotide(), allelicCount.getAltNucleotide()); + this.index = index; + this.segmentIndex = segmentIndex; + } + + int getIndex() { + return index; + } + + int getSegmentIndex() { + return segmentIndex; + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionState.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionState.java new file mode 100644 index 00000000000..c983fd6cacc --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionState.java @@ -0,0 +1,69 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.broadinstitute.hellbender.utils.mcmc.Parameter; +import org.broadinstitute.hellbender.utils.mcmc.ParameterizedState; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * The state of the allele-fraction model, containing:
+ * <p>
+ * 1. the global mean reference bias
+ * 2. the global variance of the reference bias
+ * 3. the global outlier probability
+ * 4. minor-allele fractions for each segment
+ * </p>
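+ * <p>
+ * An illustrative construction (hypothetical values), assuming two segments:
+ * {@code new AlleleFractionState(1.05, 0.02, 0.01, new MinorFractions(Arrays.asList(0.4, 0.1)))}
+ * yields {@code meanBias() == 1.05} and {@code segmentMinorFraction(1) == 0.1}.
+ * </p>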
+ * See docs/CNVs/CNV-methods.pdf for details. + * + * @author David Benjamin <davidben@broadinstitute.org> + * @author Samuel Lee <slee@broadinstitute.org> + */ +final class AlleleFractionState extends ParameterizedState { + static final class MinorFractions extends ArrayList { + private static final long serialVersionUID = 1029384756L; + + MinorFractions(final int numSegments) { + super(numSegments); + } + + MinorFractions(final List minorFractions) { + super(new ArrayList<>(minorFractions)); + } + } + + AlleleFractionState(final double meanBias, + final double biasVariance, + final double outlierProbability, + final MinorFractions minorFractions) { + super(Arrays.asList( + new Parameter<>(AlleleFractionParameter.MEAN_BIAS, meanBias), + new Parameter<>(AlleleFractionParameter.BIAS_VARIANCE, biasVariance), + new Parameter<>(AlleleFractionParameter.OUTLIER_PROBABILITY, outlierProbability), + new Parameter<>(AlleleFractionParameter.MINOR_ALLELE_FRACTIONS, minorFractions))); + } + + double meanBias() { + return get(AlleleFractionParameter.MEAN_BIAS, Double.class); + } + + double biasVariance() { + return get(AlleleFractionParameter.BIAS_VARIANCE, Double.class); + } + + double outlierProbability() { + return get(AlleleFractionParameter.OUTLIER_PROBABILITY, Double.class); + } + + double segmentMinorFraction(final int segment) { + return get(AlleleFractionParameter.MINOR_ALLELE_FRACTIONS, MinorFractions.class).get(segment); + } + + AlleleFractionGlobalParameters globalParameters() { + return new AlleleFractionGlobalParameters(meanBias(), biasVariance(), outlierProbability()); + } + + MinorFractions minorFractions() { + return get(AlleleFractionParameter.MINOR_ALLELE_FRACTIONS, MinorFractions.class); + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioModeller.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioModeller.java new file mode 100644 index 00000000000..dbdfc437936 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioModeller.java @@ -0,0 +1,166 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ParameterDecileCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.ModeledSegment; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.mcmc.DecileCollection; +import org.broadinstitute.hellbender.utils.mcmc.GibbsSampler; +import org.broadinstitute.hellbender.utils.mcmc.ParameterSampler; +import org.broadinstitute.hellbender.utils.mcmc.ParameterizedModel; +import org.broadinstitute.hellbender.utils.param.ParamUtils; + +import java.util.*; +import java.util.stream.Collectors; + +/** + * Represents a segmented model for copy ratio fit to denoised log2 copy-ratio data. + * The log2 copy ratios in each segment are fit by a mixture model with a normal-distribution component + * and a uniform outlier component. The variance of the normal-distribution component and the relative + * contribution of the uniform outlier component in all segments are both assumed to be global parameters. 
+ * The mean of the normal-distribution component in each segment is taken to be a segment-level parameter. + * The component (i.e., normal or outlier) that each copy-ratio point is drawn from is determined by a latent + * point-level indicator. + * + * @author Samuel Lee <slee@broadinstitute.org> + */ +final class CopyRatioModeller { + private static final String DOUBLE_FORMAT = MultidimensionalModeller.DOUBLE_FORMAT; + + private static final double EPSILON = 1E-6; + static final double LOG2_COPY_RATIO_MIN = -50.; + static final double LOG2_COPY_RATIO_MAX = 10.; + private static final double LOG2_COPY_RATIO_RANGE = LOG2_COPY_RATIO_MAX - LOG2_COPY_RATIO_MIN; + private static final double VARIANCE_MIN = EPSILON; + + private static final double OUTLIER_PROBABILITY_INITIAL = 0.05; + private static final double OUTLIER_PROBABILITY_PRIOR_ALPHA = 5.; + private static final double OUTLIER_PROBABILITY_PRIOR_BETA = 95.; + + private final SampleMetadata sampleMetadata; + private final ParameterizedModel model; + + private final List varianceSamples = new ArrayList<>(); + private final List outlierProbabilitySamples = new ArrayList<>(); + private final List segmentMeansSamples = new ArrayList<>(); + + /** + * Constructs a copy-ratio model given copy ratios and segments. + * Initial point estimates of parameters are set to empirical estimates where available. + */ + CopyRatioModeller(final CopyRatioCollection copyRatios, + final List segments) { + Utils.nonNull(copyRatios); + Utils.nonEmpty(segments); + + sampleMetadata = copyRatios.getSampleMetadata(); + final CopyRatioSegmentedData data = new CopyRatioSegmentedData(copyRatios, segments); + + //set widths for slice sampling of variance and segment-mean posteriors using empirical variance estimate. + //variance posterior is inverse chi-squared, segment-mean posteriors are Gaussian; the below expressions + //approximate the standard deviations of these distributions. + //we also make sure all initial values are within appropriate bounds + final double dataRangeOrNaN = data.getMaxLog2CopyRatioValue() - data.getMinLog2CopyRatioValue(); + final double dataRange = Double.isNaN(dataRangeOrNaN) ? LOG2_COPY_RATIO_RANGE : dataRangeOrNaN; + final double varianceEstimateOrNaN = data.estimateVariance(); + final double varianceEstimate = Double.isNaN(varianceEstimateOrNaN) ? VARIANCE_MIN : Math.max(varianceEstimateOrNaN, VARIANCE_MIN); + final double varianceSliceSamplingWidth = 2. * varianceEstimate; + final double varianceMax = Math.max(10. 
* varianceEstimate, dataRange * dataRange); + final double meanSliceSamplingWidth = Math.sqrt(varianceEstimate * data.getNumSegments() / data.getNumPoints()); + final List segmentMeans = data.estimateSegmentMeans().stream() + .map(m -> Math.max(LOG2_COPY_RATIO_MIN, Math.min(LOG2_COPY_RATIO_MAX, m))) + .collect(Collectors.toList()); + + //the uniform log-likelihood for outliers is determined by the minimum and maximum coverages in the dataset; + //the outlier-probability parameter should be interpreted accordingly + final double outlierUniformLogLikelihood = -Math.log(dataRange); + + //use empirical segment means and empirical average variance across segments to initialize CopyRatioState + final CopyRatioState initialState = new CopyRatioState(varianceEstimate, CopyRatioModeller.OUTLIER_PROBABILITY_INITIAL, + new CopyRatioState.SegmentMeans(segmentMeans), new CopyRatioState.OutlierIndicators(Collections.nCopies(data.getNumPoints(), false))); + + //define ParameterSamplers + final ParameterSampler varianceSampler = + new CopyRatioSamplers.VarianceSampler(VARIANCE_MIN, varianceMax, varianceSliceSamplingWidth); + final ParameterSampler outlierProbabilitySampler = + new CopyRatioSamplers.OutlierProbabilitySampler(OUTLIER_PROBABILITY_PRIOR_ALPHA, OUTLIER_PROBABILITY_PRIOR_BETA); + final ParameterSampler segmentMeansSampler = + new CopyRatioSamplers.SegmentMeansSampler(LOG2_COPY_RATIO_MIN, LOG2_COPY_RATIO_MAX, meanSliceSamplingWidth); + final ParameterSampler outlierIndicatorsSampler = + new CopyRatioSamplers.OutlierIndicatorsSampler(outlierUniformLogLikelihood); + + model = new ParameterizedModel.GibbsBuilder<>(initialState, data) + .addParameterSampler(CopyRatioParameter.VARIANCE, varianceSampler, Double.class) + .addParameterSampler(CopyRatioParameter.OUTLIER_PROBABILITY, outlierProbabilitySampler, Double.class) + .addParameterSampler(CopyRatioParameter.SEGMENT_MEANS, segmentMeansSampler, CopyRatioState.SegmentMeans.class) + .addParameterSampler(CopyRatioParameter.OUTLIER_INDICATORS, outlierIndicatorsSampler, CopyRatioState.OutlierIndicators.class) + .build(); + } + + /** + * Adds {@code numSamples - numBurnIn} Markov-Chain Monte-Carlo samples of the parameter posteriors (generated using + * Gibbs sampling) to the collections held internally. The current {@link CopyRatioState} held internally is used + * to initialize the Markov Chain. 
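In rough terms, the intended call sequence is: construct the modeller, run fitMCMC, then read back the retained draws. A minimal sketch, in which denoisedCopyRatios and segments stand in for upstream outputs and the sample counts are arbitrary illustrative values:

    // Hedged sketch of driving CopyRatioModeller; the constructor inputs here
    // are assumed to come from upstream denoising/segmentation steps.
    final CopyRatioModeller modeller = new CopyRatioModeller(denoisedCopyRatios, segments);
    modeller.fitMCMC(100, 50);   // 100 total draws per parameter; the first 50 are discarded as burn-in
    final List<Double> varianceDraws = modeller.getVarianceSamples();   // 50 retained draws
    final List<ModeledSegment.SimplePosteriorSummary> segmentMeanSummaries =
            modeller.getSegmentMeansPosteriorSummaries();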
+ * @param numSamples total number of samples per posterior + * @param numBurnIn number of burn-in samples to discard + */ + void fitMCMC(final int numSamples, + final int numBurnIn) { + ParamUtils.isPositiveOrZero(numBurnIn, "Number of burn-in samples must be non-negative."); + Utils.validateArg(numBurnIn < numSamples, "Number of samples must be greater than number of burn-in samples."); + + //run MCMC + final GibbsSampler gibbsSampler = new GibbsSampler<>(numSamples, model); + gibbsSampler.runMCMC(); + + //update posterior samples + varianceSamples.addAll(gibbsSampler.getSamples(CopyRatioParameter.VARIANCE, Double.class, numBurnIn)); + outlierProbabilitySamples.addAll(gibbsSampler.getSamples(CopyRatioParameter.OUTLIER_PROBABILITY, Double.class, numBurnIn)); + segmentMeansSamples.addAll(gibbsSampler.getSamples(CopyRatioParameter.SEGMENT_MEANS, CopyRatioState.SegmentMeans.class, numBurnIn)); + } + + List getVarianceSamples() { + return Collections.unmodifiableList(varianceSamples); + } + + List getOutlierProbabilitySamples() { + return Collections.unmodifiableList(outlierProbabilitySamples); + } + + List getSegmentMeansSamples() { + return Collections.unmodifiableList(segmentMeansSamples); + } + + /** + * Should only be called after {@link #fitMCMC} has been called. + */ + List getSegmentMeansPosteriorSummaries() { + if (segmentMeansSamples.isEmpty()) { + throw new IllegalStateException("Attempted to get posterior summaries for segment means before MCMC was performed."); + } + final int numSegments = segmentMeansSamples.get(0).size(); + final List posteriorSummaries = new ArrayList<>(numSegments); + for (int segment = 0; segment < numSegments; segment++) { + final int j = segment; + final List meanSamples = + segmentMeansSamples.stream().map(s -> s.get(j)).collect(Collectors.toList()); + posteriorSummaries.add(new ModeledSegment.SimplePosteriorSummary(meanSamples)); + } + return posteriorSummaries; + } + + /** + * Should only be called after {@link #fitMCMC} has been called. + */ + ParameterDecileCollection getGlobalParameterDeciles() { + if (varianceSamples.isEmpty()) { + throw new IllegalStateException("Attempted to get posterior summaries for global parameters before MCMC was performed."); + } + final Map parameterToDecilesMap = new LinkedHashMap<>(); + parameterToDecilesMap.put(CopyRatioParameter.VARIANCE, new DecileCollection(varianceSamples)); + parameterToDecilesMap.put(CopyRatioParameter.OUTLIER_PROBABILITY, new DecileCollection(outlierProbabilitySamples)); + return new ParameterDecileCollection<>(sampleMetadata, parameterToDecilesMap, CopyRatioParameter.class, DOUBLE_FORMAT); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioParameter.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioParameter.java new file mode 100644 index 00000000000..53bb863691d --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioParameter.java @@ -0,0 +1,21 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.broadinstitute.hellbender.utils.mcmc.ParameterEnum; + +/** + * Enumerates the parameters for {@link CopyRatioState}. 
+ * + * @author Samuel Lee <slee@broadinstitute.org> + */ +enum CopyRatioParameter implements ParameterEnum { + VARIANCE("CR_variance"), + OUTLIER_PROBABILITY("CR_outlier_probability"), + SEGMENT_MEANS("CR_segment_means"), + OUTLIER_INDICATORS("CR_outlier_indicators"); + + final String name; + + CopyRatioParameter(final String name) { + this.name = name; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSamplers.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSamplers.java new file mode 100644 index 00000000000..e60b0d47e71 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSamplers.java @@ -0,0 +1,198 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.apache.commons.math3.distribution.BetaDistribution; +import org.apache.commons.math3.random.RandomGenerator; +import org.apache.commons.math3.util.FastMath; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.utils.MathUtils; +import org.broadinstitute.hellbender.utils.mcmc.ParameterSampler; +import org.broadinstitute.hellbender.utils.mcmc.SliceSampler; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * @author Samuel Lee <slee@broadinstitute.org> + */ +final class CopyRatioSamplers { + private static final Logger logger = LogManager.getLogger(CopyRatioSamplers.class); + + private static final int NUM_POINTS_GLOBAL_SUBSAMPLE_THRESHOLD = 10000; + private static final int NUM_POINTS_SEGMENT_SUBSAMPLE_THRESHOLD = 1000; + + private CopyRatioSamplers() {} + + //Calculates the exponent for a normal distribution; used in log-likelihood calculation below. + private static double normalTerm(final double quantity, + final double mean, + final double variance) { + return (quantity - mean) * (quantity - mean) / (2. 
* variance); + } + + //samples log conditional posterior for the variance parameter, assuming uniform prior; this is given by + //the product of Gaussian likelihoods for each non-outlier point t: + // log[product_{non-outlier t} variance^(-1/2) * exp(-(log2cr_t - mean_t)^2 / (2 * variance))] + constant + //where mean_t is identical for all points in a segment + static final class VarianceSampler implements ParameterSampler { + private final double varianceMin; + private final double varianceMax; + private final double varianceSliceSamplingWidth; + + VarianceSampler(final double varianceMin, + final double varianceMax, + final double varianceSliceSamplingWidth) { + this.varianceMin = varianceMin; + this.varianceMax = varianceMax; + this.varianceSliceSamplingWidth = varianceSliceSamplingWidth; + } + + @Override + public Double sample(final RandomGenerator rng, + final CopyRatioState state, + final CopyRatioSegmentedData data) { + logger.debug("Sampling variance..."); + final List indexedCopyRatiosSubsample = subsample( + rng, data.getIndexedCopyRatios(), NUM_POINTS_GLOBAL_SUBSAMPLE_THRESHOLD); + final double scalingFactor = (double) data.getNumPoints() / indexedCopyRatiosSubsample.size(); + final Function logConditionalPDF = newVariance -> { + final double gaussianLogNormalization = 0.5 * FastMath.log(newVariance); + double ll = 0.; + for (final CopyRatioSegmentedData.IndexedCopyRatio indexedCopyRatio : indexedCopyRatiosSubsample) { + if (!state.outlierIndicator(indexedCopyRatio.getIndex())) { + ll -= gaussianLogNormalization + + normalTerm( + indexedCopyRatio.getLog2CopyRatioValue(), + state.segmentMean(indexedCopyRatio.getSegmentIndex()), + newVariance); + } + } + return scalingFactor * ll; + }; + return new SliceSampler(rng, logConditionalPDF, varianceMin, varianceMax, varianceSliceSamplingWidth).sample(state.variance()); + } + } + + //samples log conditional posterior for the outlier-probability parameter, assuming Beta(alpha, beta) prior; + //this is given by: + // log Beta(alpha + number of outlier points, beta + number of non-outlier points) + constant + static final class OutlierProbabilitySampler implements ParameterSampler { + private final double outlierProbabilityPriorAlpha; + private final double outlierProbabilityPriorBeta; + + OutlierProbabilitySampler(final double outlierProbabilityPriorAlpha, + final double outlierProbabilityPriorBeta) { + this.outlierProbabilityPriorAlpha = outlierProbabilityPriorAlpha; + this.outlierProbabilityPriorBeta = outlierProbabilityPriorBeta; + } + + @Override + public Double sample(final RandomGenerator rng, + final CopyRatioState state, + final CopyRatioSegmentedData data) { + logger.debug("Sampling outlier probability..."); + final int numOutliers = (int) IntStream.range(0, data.getNumPoints()).filter(state::outlierIndicator).count(); + return new BetaDistribution(rng, + outlierProbabilityPriorAlpha + numOutliers, + outlierProbabilityPriorBeta + data.getNumPoints() - numOutliers).sample(); + } + } + + //samples log conditional posteriors for the segment-mean parameters, assuming uniform priors bounded by minimum and maximum log2 copy-ratio values; + //for each segment s, this is given by the product of Gaussian likelihoods for each non-outlier point t: + // log[product_{non-outlier t in s} exp(-(log2cr_t - mean_s)^2 / (2 * variance))] + constant + static final class SegmentMeansSampler implements ParameterSampler { + private final double meanMin; + private final double meanMax; + private final double meanSliceSamplingWidth; + + 
SegmentMeansSampler(final double meanMin, + final double meanMax, + final double meanSliceSamplingWidth) { + this.meanMin = meanMin; + this.meanMax = meanMax; + this.meanSliceSamplingWidth = meanSliceSamplingWidth; + } + + @Override + public CopyRatioState.SegmentMeans sample(final RandomGenerator rng, + final CopyRatioState state, + final CopyRatioSegmentedData data) { + final List means = new ArrayList<>(data.getNumSegments()); + for (int segment = 0; segment < data.getNumSegments(); segment++) { + final List indexedCopyRatiosInSegment = data.getIndexedCopyRatiosInSegment(segment); + if (indexedCopyRatiosInSegment.isEmpty()) { + means.add(Double.NaN); + } else { + logger.debug(String.format("Sampling mean for segment %d...", segment)); + final List indexedCopyRatiosInSegmentSubsample = subsample( + rng, indexedCopyRatiosInSegment, NUM_POINTS_SEGMENT_SUBSAMPLE_THRESHOLD); + final double scalingFactor = (double) indexedCopyRatiosInSegment.size() / indexedCopyRatiosInSegmentSubsample.size(); + final Function logConditionalPDF = newMean -> + scalingFactor * indexedCopyRatiosInSegmentSubsample.stream() + .filter(c -> !state.outlierIndicator(c.getIndex())) + .mapToDouble(c -> -normalTerm(c.getLog2CopyRatioValue(), newMean, state.variance())) + .sum(); + final SliceSampler sampler = new SliceSampler(rng, logConditionalPDF, meanMin, meanMax, meanSliceSamplingWidth); + means.add(sampler.sample(state.segmentMean(segment))); + } + } + return new CopyRatioState.SegmentMeans(means); + } + } + + //samples log conditional posteriors for the outlier-indicator parameters; for each point t, this is given by: + // z_t * [log outlier_prob + outlierUniformLogLikelihood] + // + (1 - z_t) * [log(1 - outlier_prob) - log(2 * pi * variance)/2 - (log2cr_t - mean_t)^2 / (2 * variance)] + // + const + //where z_t is the indicator for point t, and outlier_prob is the outlier probability. + //note that we compute the normalizing constant, so that we can sample a new indicator value by simply sampling + //uniformly in [0, 1] and checking whether the resulting value is less than the probability of being an outlier + //(corresponding to the first line in the unnormalized expression above) + static final class OutlierIndicatorsSampler implements ParameterSampler { + private final double outlierUniformLogLikelihood; + + OutlierIndicatorsSampler(final double outlierUniformLogLikelihood) { + this.outlierUniformLogLikelihood = outlierUniformLogLikelihood; + } + + @Override + public CopyRatioState.OutlierIndicators sample(final RandomGenerator rng, + final CopyRatioState state, + final CopyRatioSegmentedData data) { + logger.debug("Sampling outlier indicators..."); + final double outlierUnnormalizedLogProbability = + Math.log(state.outlierProbability()) + outlierUniformLogLikelihood; + final double notOutlierUnnormalizedLogProbabilityPrefactor = + Math.log(1. 
- state.outlierProbability()) - 0.5 * Math.log(2 * Math.PI * state.variance()); + final List indicators = new ArrayList<>(); + for (int segment = 0; segment < data.getNumSegments(); segment++) { + final List indexedCopyRatiosInSegment = data.getIndexedCopyRatiosInSegment(segment); + for (final CopyRatioSegmentedData.IndexedCopyRatio indexedCopyRatio : indexedCopyRatiosInSegment) { + final double notOutlierUnnormalizedLogProbability = + notOutlierUnnormalizedLogProbabilityPrefactor + - normalTerm(indexedCopyRatio.getLog2CopyRatioValue(), state.segmentMean(segment), state.variance()); + //note: we are working in natural log space, so we divide by ln(10) before using normalizeFromLog10 + final double conditionalProbability = + MathUtils.normalizeFromLog10ToLinearSpace(new double[]{ + MathUtils.logToLog10(outlierUnnormalizedLogProbability), + MathUtils.logToLog10(notOutlierUnnormalizedLogProbability)})[0]; + indicators.add(rng.nextDouble() < conditionalProbability); + } + } + return new CopyRatioState.OutlierIndicators(indicators); + } + } + + private static List subsample(final RandomGenerator rng, + final List copyRatios, + final int numPointsSubsampleThreshold) { + //subsample the data if we are above the threshold + return copyRatios.size() > numPointsSubsampleThreshold + ? IntStream.range(0, numPointsSubsampleThreshold).boxed().map(i -> rng.nextInt(copyRatios.size())).map(copyRatios::get).collect(Collectors.toList()) + : copyRatios; + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSegmentedData.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSegmentedData.java new file mode 100644 index 00000000000..d2025a6fbeb --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSegmentedData.java @@ -0,0 +1,145 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import com.google.common.primitives.Doubles; +import htsjdk.samtools.util.OverlapDetector; +import org.apache.commons.math3.stat.descriptive.moment.Mean; +import org.apache.commons.math3.stat.descriptive.moment.Variance; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio; +import org.broadinstitute.hellbender.utils.IndexRange; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.mcmc.DataCollection; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * {@link DataCollection} for the copy-ratio model containing the copy-ratio data grouped by segment. 
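A note on the outlier-indicator update in CopyRatioSamplers above: the two-component normalization it performs (by converting natural logs to log10 and normalizing) is equivalent to a two-term log-sum-exp. A standalone sketch of that computation, not the GATK utility itself:

    // Given unnormalized natural-log probabilities for "outlier" and "not outlier",
    // return P(outlier) in a numerically stable way.
    static double outlierProbabilityFromLogs(final double logOutlier, final double logNotOutlier) {
        final double max = Math.max(logOutlier, logNotOutlier);
        final double outlier = Math.exp(logOutlier - max);        // rescaled to avoid underflow
        final double notOutlier = Math.exp(logNotOutlier - max);
        return outlier / (outlier + notOutlier);
        // the sampler then flags an outlier if rng.nextDouble() falls below this value
    }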
+ * + * @author Samuel Lee <slee@broadinstitute.org> + */ +final class CopyRatioSegmentedData implements DataCollection { + private final CopyRatioCollection copyRatios; + private final List segments; + private final double minLog2CopyRatioValue; + private final double maxLog2CopyRatioValue; + + private final List indexedCopyRatios; + private final List indexRangesPerSegment; + + CopyRatioSegmentedData(final CopyRatioCollection copyRatios, + final List segments) { + this.copyRatios = Utils.nonNull(copyRatios); + this.segments = Utils.nonEmpty(segments).stream().sorted(SampleLocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR).collect(Collectors.toList()); + + final List log2CopyRatioValues = copyRatios.getLog2CopyRatioValues(); + minLog2CopyRatioValue = log2CopyRatioValues.stream().min(Double::compareTo).orElse(Double.NaN); + maxLog2CopyRatioValue = log2CopyRatioValues.stream().max(Double::compareTo).orElse(Double.NaN); + + indexedCopyRatios = new ArrayList<>(copyRatios.size()); + indexRangesPerSegment = new ArrayList<>(segments.size()); + + //construct list of lists of copy ratios with an index in order corresponding to that of segments; + //segment assignment is based on midpoint of copy-ratio interval + final OverlapDetector copyRatioMidpointOverlapDetector = copyRatios.getMidpointOverlapDetector(); + int index = 0; + for (int segmentIndex = 0; segmentIndex < segments.size(); segmentIndex++) { + final SimpleInterval segment = segments.get(segmentIndex); + final List copyRatiosInSegment = copyRatioMidpointOverlapDetector.getOverlaps(segment).stream() + .sorted(SampleLocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR) + .collect(Collectors.toList()); + final int segmentStartIndex = index; + final int si = segmentIndex; + IntStream.range(0, copyRatiosInSegment.size()).boxed() + .map(i -> new IndexedCopyRatio(copyRatiosInSegment.get(i), segmentStartIndex + i, si)) + .forEach(indexedCopyRatios::add); + indexRangesPerSegment.add(new IndexRange(segmentStartIndex, segmentStartIndex + copyRatiosInSegment.size())); + index += copyRatiosInSegment.size(); + } + } + + CopyRatioCollection getCopyRatios() { + return copyRatios; + } + + List getSegments() { + return Collections.unmodifiableList(segments); + } + + int getNumSegments() { + return segments.size(); + } + + int getNumPoints() { + return copyRatios.size(); + } + + String getSampleName() { + return copyRatios.getSampleName(); + } + + double getMinLog2CopyRatioValue() { + return minLog2CopyRatioValue; + } + + double getMaxLog2CopyRatioValue() { + return maxLog2CopyRatioValue; + } + + List getIndexedCopyRatios() { + return Collections.unmodifiableList(indexedCopyRatios); + } + + List getIndexedCopyRatiosInSegment(final int segmentIndex) { + return Collections.unmodifiableList(indexedCopyRatios.subList( + indexRangesPerSegment.get(segmentIndex).from, indexRangesPerSegment.get(segmentIndex).to)); + } + + //estimate global variance empirically by taking average of all per-segment variances + double estimateVariance() { + return IntStream.range(0, segments.size()) + .mapToDouble(s -> new Variance().evaluate(Doubles.toArray( + getIndexedCopyRatiosInSegment(s).stream() + .map(IndexedCopyRatio::getLog2CopyRatioValue) + .collect(Collectors.toList())))) + .filter(v -> !Double.isNaN(v)) + .average().orElse(Double.NaN); + } + + //estimate segment means empirically by taking averages of log2 copy ratios in each segment + CopyRatioState.SegmentMeans estimateSegmentMeans() { + final List means = IntStream.range(0, segments.size()).boxed() + .map(s -> new 
Mean().evaluate(Doubles.toArray( + getIndexedCopyRatiosInSegment(s).stream() + .map(IndexedCopyRatio::getLog2CopyRatioValue) + .collect(Collectors.toList())))) + .collect(Collectors.toList()); + return new CopyRatioState.SegmentMeans(means); + } + + static final class IndexedCopyRatio extends CopyRatio { + private final int index; + private final int segmentIndex; + + private IndexedCopyRatio(final CopyRatio copyRatio, + final int index, + final int segmentIndex) { + super(copyRatio.getInterval(), copyRatio.getLog2CopyRatioValue()); + this.index = index; + this.segmentIndex = segmentIndex; + } + + int getIndex() { + return index; + } + + int getSegmentIndex() { + return segmentIndex; + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioState.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioState.java new file mode 100644 index 00000000000..445663c4f84 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioState.java @@ -0,0 +1,67 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.broadinstitute.hellbender.utils.mcmc.Parameter; +import org.broadinstitute.hellbender.utils.mcmc.ParameterizedState; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.List; +import java.util.stream.IntStream; + +/** + * The state of the copy-ratio model, containing:

+ * 1. the global variance
+ * 2. the global outlier probability
+ * 3. log2 mean copy ratios for each segment
+ * 4. outlier indicators for each copy-ratio interval
+ *
+ * See docs/CNVs/CNV-methods.pdf for details. + * + * @author Samuel Lee <slee@broadinstitute.org> + */ +final class CopyRatioState extends ParameterizedState { + static final class SegmentMeans extends ArrayList { + private static final long serialVersionUID = 951753L; + + SegmentMeans(final List segmentMeans) { + super(new ArrayList<>(segmentMeans)); + } + } + + static final class OutlierIndicators extends BitSet { + private static final long serialVersionUID = 357159L; + + OutlierIndicators(final List outlierIndicators) { + super(outlierIndicators.size()); + IntStream.range(0, outlierIndicators.size()).filter(outlierIndicators::get).forEach(this::set); + } + } + + CopyRatioState(final double variance, + final double outlierProbability, + final SegmentMeans segmentMeans, + final OutlierIndicators outlierIndicators) { + super(Arrays.asList( + new Parameter<>(CopyRatioParameter.VARIANCE, variance), + new Parameter<>(CopyRatioParameter.OUTLIER_PROBABILITY, outlierProbability), + new Parameter<>(CopyRatioParameter.SEGMENT_MEANS, segmentMeans), + new Parameter<>(CopyRatioParameter.OUTLIER_INDICATORS, outlierIndicators))); + } + + double variance() { + return get(CopyRatioParameter.VARIANCE, Double.class); + } + + double outlierProbability() { + return get(CopyRatioParameter.OUTLIER_PROBABILITY, Double.class); + } + + double segmentMean(final int segmentIndex) { + return get(CopyRatioParameter.SEGMENT_MEANS, CopyRatioState.SegmentMeans.class).get(segmentIndex); + } + + boolean outlierIndicator(final int copyRatioIndex) { + return get(CopyRatioParameter.OUTLIER_INDICATORS, CopyRatioState.OutlierIndicators.class).get(copyRatioIndex); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/MultidimensionalModeller.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/MultidimensionalModeller.java new file mode 100644 index 00000000000..d1455e617cd --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/models/MultidimensionalModeller.java @@ -0,0 +1,311 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import com.google.common.annotations.VisibleForTesting; +import htsjdk.samtools.util.OverlapDetector; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ModeledSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.MultidimensionalSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.ModeledSegment; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.param.ParamUtils; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Represents a segmented model for copy ratio and allele fraction. 
+ * + * @author Samuel Lee <slee@broadinstitute.org> + */ +public final class MultidimensionalModeller { + private static final Logger logger = LogManager.getLogger(MultidimensionalModeller.class); + + public static final String DOUBLE_FORMAT = "%6.6f"; + + private final SampleMetadata sampleMetadata; + private final CopyRatioCollection denoisedCopyRatios; + private final OverlapDetector copyRatioMidpointOverlapDetector; + private final AllelicCountCollection allelicCounts; + private final OverlapDetector allelicCountOverlapDetector; + private final AlleleFractionPrior alleleFractionPrior; + + private CopyRatioModeller copyRatioModeller; + private AlleleFractionModeller alleleFractionModeller; + + private List currentSegments; + private final List modeledSegments = new ArrayList<>(); + + //similar-segment merging may leave model in a state where it is not properly fit (deciles may be estimated naively) + private boolean isModelFit; + + private final int numSamplesCopyRatio; + private final int numBurnInCopyRatio; + private final int numSamplesAlleleFraction; + private final int numBurnInAlleleFraction; + + /** + * Constructs a copy-ratio and allele-fraction modeller, specifying number of total samples + * and number of burn-in samples for Markov-Chain Monte Carlo model fitting. + * An initial model fit is performed. + */ + public MultidimensionalModeller(final MultidimensionalSegmentCollection multidimensionalSegments, + final CopyRatioCollection denoisedCopyRatios, + final AllelicCountCollection allelicCounts, + final AlleleFractionPrior alleleFractionPrior, + final int numSamplesCopyRatio, + final int numBurnInCopyRatio, + final int numSamplesAlleleFraction, + final int numBurnInAlleleFraction) { + Utils.validateArg(Stream.of( + Utils.nonNull(multidimensionalSegments).getSampleName(), + Utils.nonNull(denoisedCopyRatios).getSampleName(), + Utils.nonNull(allelicCounts).getSampleName()).distinct().count() == 1, + "Sample names from all inputs must match."); + ParamUtils.isPositive(multidimensionalSegments.size(), "Number of segments must be positive."); + sampleMetadata = multidimensionalSegments.getSampleMetadata(); + currentSegments = multidimensionalSegments.getIntervals(); + this.denoisedCopyRatios = denoisedCopyRatios; + copyRatioMidpointOverlapDetector = denoisedCopyRatios.getMidpointOverlapDetector(); + this.allelicCounts = allelicCounts; + allelicCountOverlapDetector = allelicCounts.getOverlapDetector(); + this.alleleFractionPrior = Utils.nonNull(alleleFractionPrior); + this.numSamplesCopyRatio = numSamplesCopyRatio; + this.numBurnInCopyRatio = numBurnInCopyRatio; + this.numSamplesAlleleFraction = numSamplesAlleleFraction; + this.numBurnInAlleleFraction = numBurnInAlleleFraction; + logger.info("Fitting initial model..."); + fitModel(); + } + + public ModeledSegmentCollection getModeledSegments() { + return new ModeledSegmentCollection(sampleMetadata, modeledSegments); + } + + /** + * Performs Markov-Chain Monte Carlo model fitting using the + * number of total samples and number of burn-in samples specified at construction. 
+ */ + private void fitModel() { + //perform MCMC to generate posterior samples + logger.info("Fitting copy-ratio model..."); + copyRatioModeller = new CopyRatioModeller(denoisedCopyRatios, currentSegments); + copyRatioModeller.fitMCMC(numSamplesCopyRatio, numBurnInCopyRatio); + logger.info("Fitting allele-fraction model..."); + alleleFractionModeller = new AlleleFractionModeller(allelicCounts, currentSegments, alleleFractionPrior); + alleleFractionModeller.fitMCMC(numSamplesAlleleFraction, numBurnInAlleleFraction); + + //update list of ModeledSegment with new PosteriorSummaries + modeledSegments.clear(); + final List segmentMeansPosteriorSummaries = + copyRatioModeller.getSegmentMeansPosteriorSummaries(); + final List minorAlleleFractionsPosteriorSummaries = + alleleFractionModeller.getMinorAlleleFractionsPosteriorSummaries(); + for (int segmentIndex = 0; segmentIndex < currentSegments.size(); segmentIndex++) { + final SimpleInterval segment = currentSegments.get(segmentIndex); + final int numPointsCopyRatio = copyRatioMidpointOverlapDetector.getOverlaps(segment).size(); + final int numPointsAlleleFraction = allelicCountOverlapDetector.getOverlaps(segment).size(); + final ModeledSegment.SimplePosteriorSummary segmentMeansPosteriorSummary = segmentMeansPosteriorSummaries.get(segmentIndex); + final ModeledSegment.SimplePosteriorSummary minorAlleleFractionPosteriorSummary = minorAlleleFractionsPosteriorSummaries.get(segmentIndex); + modeledSegments.add(new ModeledSegment( + segment, numPointsCopyRatio, numPointsAlleleFraction, segmentMeansPosteriorSummary, minorAlleleFractionPosteriorSummary)); + } + isModelFit = true; + } + + /** + * @param numSmoothingIterationsPerFit if this is zero, no refitting will be performed between smoothing iterations + */ + public void smoothSegments(final int maxNumSmoothingIterations, + final int numSmoothingIterationsPerFit, + final double smoothingCredibleIntervalThresholdCopyRatio, + final double smoothingCredibleIntervalThresholdAlleleFraction) { + ParamUtils.isPositiveOrZero(maxNumSmoothingIterations, + "The maximum number of smoothing iterations must be non-negative."); + ParamUtils.isPositiveOrZero(smoothingCredibleIntervalThresholdCopyRatio, + "The number of smoothing iterations per fit must be non-negative."); + ParamUtils.isPositiveOrZero(smoothingCredibleIntervalThresholdAlleleFraction, + "The allele-fraction credible-interval threshold for segmentation smoothing must be non-negative."); + logger.info(String.format("Initial number of segments before smoothing: %d", modeledSegments.size())); + //perform iterations of similar-segment merging until all similar segments are merged + for (int numIterations = 1; numIterations <= maxNumSmoothingIterations; numIterations++) { + logger.info(String.format("Smoothing iteration: %d", numIterations)); + final int prevNumSegments = modeledSegments.size(); + if (numSmoothingIterationsPerFit > 0 && numIterations % numSmoothingIterationsPerFit == 0) { + //refit model after this merge iteration + performSmoothingIteration(smoothingCredibleIntervalThresholdCopyRatio, smoothingCredibleIntervalThresholdAlleleFraction, true); + } else { + //do not refit model after this merge iteration (posterior modes will be identical to posterior medians) + performSmoothingIteration(smoothingCredibleIntervalThresholdCopyRatio, smoothingCredibleIntervalThresholdAlleleFraction, false); + } + if (modeledSegments.size() == prevNumSegments) { + break; + } + } + if (!isModelFit) { + //make sure final model is completely fit (i.e., posterior 
modes are specified) + fitModel(); + } + logger.info(String.format("Final number of segments after smoothing: %d", modeledSegments.size())); + } + + /** + * Performs one iteration of similar-segment merging on the list of {@link ModeledSegment} held internally. + * Markov-Chain Monte Carlo model fitting is optionally performed after each iteration using the + * number of total samples and number of burn-in samples specified at construction. + * @param intervalThresholdSegmentMean threshold number of credible intervals for segment-mean similarity + * @param intervalThresholdMinorAlleleFraction threshold number of credible intervals for minor-allele-fraction similarity + * @param doModelFit if true, refit MCMC model after merging + */ + private void performSmoothingIteration(final double intervalThresholdSegmentMean, + final double intervalThresholdMinorAlleleFraction, + final boolean doModelFit) { + logger.info("Number of segments before smoothing iteration: " + modeledSegments.size()); + final List mergedSegments = SimilarSegmentUtils.mergeSimilarSegments(modeledSegments, intervalThresholdSegmentMean, intervalThresholdMinorAlleleFraction); + logger.info("Number of segments after smoothing iteration: " + mergedSegments.size()); + currentSegments = mergedSegments.stream().map(ModeledSegment::getInterval).collect(Collectors.toList()); + if (doModelFit) { + fitModel(); + } else { + modeledSegments.clear(); + modeledSegments.addAll(mergedSegments); + isModelFit = false; + } + } + + /** + * Writes posterior summaries for the global model parameters to a file. + */ + public void writeModelParameterFiles(final File copyRatioParameterFile, + final File alleleFractionParameterFile) { + Utils.nonNull(copyRatioParameterFile); + Utils.nonNull(alleleFractionParameterFile); + ensureModelIsFit(); + logger.info("Writing posterior summaries for copy-ratio global parameters to " + copyRatioParameterFile); + copyRatioModeller.getGlobalParameterDeciles().write(copyRatioParameterFile); + logger.info("Writing posterior summaries for allele-fraction global parameters to " + alleleFractionParameterFile); + alleleFractionModeller.getGlobalParameterDeciles().write(alleleFractionParameterFile); + } + + @VisibleForTesting + CopyRatioModeller getCopyRatioModeller() { + return copyRatioModeller; + } + + @VisibleForTesting + AlleleFractionModeller getAlleleFractionModeller() { + return alleleFractionModeller; + } + + private void ensureModelIsFit() { + if (!isModelFit) { + logger.warn("Attempted to write ACNV results to file when model was not completely fit. Performing model fit now."); + fitModel(); + } + } + + /** + * Contains private methods for similar-segment merging. + */ + private static final class SimilarSegmentUtils { + /** + * Returns a new, modifiable list of segments with similar segments (i.e., adjacent segments with both + * segment-mean and minor-allele-fractions posteriors similar; posteriors are similar if the difference between + * posterior central tendencies is less than intervalThreshold times the posterior credible interval of either summary) + * merged. The list of segments is traversed once from beginning to end, and each segment is checked for similarity + * with the segment to the right and merged until it is no longer similar. 
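The similarity test described here reduces to comparing the difference of posterior medians against the credible-interval widths of both summaries. A self-contained sketch mirroring the private helper defined below (the decile arguments are the 10th, 50th, and 90th percentiles of each posterior):

    // Two posterior summaries are "similar" if their medians differ by less than
    // intervalThreshold credible-interval widths of BOTH summaries.
    static boolean areSimilar(final double median1, final double decile10_1, final double decile90_1,
                              final double median2, final double decile10_2, final double decile90_2,
                              final double intervalThreshold) {
        final double absoluteDifference = Math.abs(median1 - median2);
        return absoluteDifference < intervalThreshold * (decile90_1 - decile10_1)
                && absoluteDifference < intervalThreshold * (decile90_2 - decile10_2);
    }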
+ * @param intervalThresholdSegmentMean threshold number of credible intervals for segment-mean similarity + * @param intervalThresholdMinorAlleleFraction threshold number of credible intervals for minor-allele-fraction similarity + */ + private static List mergeSimilarSegments(final List segments, + final double intervalThresholdSegmentMean, + final double intervalThresholdMinorAlleleFraction) { + final List mergedSegments = new ArrayList<>(segments); + int index = 0; + while (index < mergedSegments.size() - 1) { + final ModeledSegment segment1 = mergedSegments.get(index); + final ModeledSegment segment2 = mergedSegments.get(index + 1); + if (segment1.getContig().equals(segment2.getContig()) && + areSimilar(segment1, segment2, + intervalThresholdSegmentMean, intervalThresholdMinorAlleleFraction)) { + mergedSegments.set(index, merge(segment1, segment2)); + mergedSegments.remove(index + 1); + index--; //if merge performed, stay on current segment during next iteration + } + index++; //if no merge performed, go to next segment during next iteration + } + return mergedSegments; + } + + //checks similarity of posterior summaries to within a credible-interval threshold; + //posterior summaries are similar if the difference between posterior central tendencies is less than + //intervalThreshold times the credible-interval width for both summaries + private static boolean areSimilar(final ModeledSegment.SimplePosteriorSummary summary1, + final ModeledSegment.SimplePosteriorSummary summary2, + final double intervalThreshold) { + if (Double.isNaN(summary1.getDecile50()) || Double.isNaN(summary2.getDecile50())) { + return true; + } + final double absoluteDifference = Math.abs(summary1.getDecile50() - summary2.getDecile50()); + return absoluteDifference < intervalThreshold * (summary1.getDecile90() - summary1.getDecile10()) && + absoluteDifference < intervalThreshold * (summary2.getDecile90() - summary2.getDecile10()); + } + + //checks similarity of modeled segments to within credible-interval thresholds for segment mean and minor allele fraction + private static boolean areSimilar(final ModeledSegment segment1, + final ModeledSegment segment2, + final double intervalThresholdSegmentMean, + final double intervalThresholdMinorAlleleFraction) { + return areSimilar(segment1.getLog2CopyRatioSimplePosteriorSummary(), segment2.getLog2CopyRatioSimplePosteriorSummary(), intervalThresholdSegmentMean) && + areSimilar(segment1.getMinorAlleleFractionSimplePosteriorSummary(), segment2.getMinorAlleleFractionSimplePosteriorSummary(), intervalThresholdMinorAlleleFraction); + } + + //merges posterior summaries naively by approximating posteriors as normal + private static ModeledSegment.SimplePosteriorSummary merge(final ModeledSegment.SimplePosteriorSummary summary1, + final ModeledSegment.SimplePosteriorSummary summary2) { + if (Double.isNaN(summary1.getDecile50()) && !Double.isNaN(summary2.getDecile50())) { + return summary2; + } + if ((!Double.isNaN(summary1.getDecile50()) && Double.isNaN(summary2.getDecile50())) || + (Double.isNaN(summary1.getDecile50()) && Double.isNaN(summary2.getDecile50()))) { + return summary1; + } + //use credible half-interval as standard deviation + final double standardDeviation1 = (summary1.getDecile90() - summary1.getDecile10()) / 2.; + final double standardDeviation2 = (summary2.getDecile90() - summary2.getDecile10()) / 2.; + final double variance = 1. / (1. / Math.pow(standardDeviation1, 2.) + 1. 
/ Math.pow(standardDeviation2, 2.)); + final double mean = + (summary1.getDecile50() / Math.pow(standardDeviation1, 2.) + summary2.getDecile50() / Math.pow(standardDeviation2, 2.)) + * variance; + final double standardDeviation = Math.sqrt(variance); + return new ModeledSegment.SimplePosteriorSummary(mean, mean - standardDeviation, mean + standardDeviation); + } + + private static ModeledSegment merge(final ModeledSegment segment1, + final ModeledSegment segment2) { + return new ModeledSegment(mergeSegments(segment1.getInterval(), segment2.getInterval()), + segment1.getNumPointsCopyRatio() + segment2.getNumPointsCopyRatio(), + segment1.getNumPointsAlleleFraction() + segment2.getNumPointsAlleleFraction(), + merge(segment1.getLog2CopyRatioSimplePosteriorSummary(), segment2.getLog2CopyRatioSimplePosteriorSummary()), + merge(segment1.getMinorAlleleFractionSimplePosteriorSummary(), segment2.getMinorAlleleFractionSimplePosteriorSummary())); + } + + private static SimpleInterval mergeSegments(final SimpleInterval segment1, + final SimpleInterval segment2) { + Utils.validateArg(segment1.getContig().equals(segment2.getContig()), + String.format("Cannot join segments %s and %s on different chromosomes.", segment1.toString(), segment2.toString())); + final int start = Math.min(segment1.getStart(), segment2.getStart()); + final int end = Math.max(segment1.getEnd(), segment2.getEnd()); + return new SimpleInterval(segment1.getContig(), start, end); + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatios.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatios.java index 88b8c7acc3f..cf92cde73aa 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatios.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatios.java @@ -8,8 +8,8 @@ import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.copynumber.DenoiseReadCounts; -import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection; import org.broadinstitute.hellbender.utils.R.RScriptExecutor; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.io.IOUtils; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegments.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegments.java index f6e23eb8f3c..b925b6ed9ea 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegments.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegments.java @@ -7,13 +7,14 @@ import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup; import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount.AllelicCount; -import org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount.AllelicCountCollection; -import 
org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatio; -import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection; +import org.broadinstitute.hellbender.tools.copynumber.ModelSegments; import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument; -import org.broadinstitute.hellbender.tools.copynumber.multidimensional.model.ModeledSegment; -import org.broadinstitute.hellbender.tools.copynumber.multidimensional.model.ModeledSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ModeledSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.ModeledSegment; import org.broadinstitute.hellbender.utils.R.RScriptExecutor; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.io.IOUtils; @@ -27,7 +28,7 @@ import java.util.stream.Collectors; /** - * Plots segmented copy-ratio and minor-allele-fraction modeling results. + * Plots segmented copy-ratio and minor-allele-fraction modeling results from {@link ModelSegments}. * *

* The order and representation of contigs in plots follows the contig ordering within the required reference sequence dictionary.
* diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/AlleleFractionKernelSegmenter.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/AlleleFractionKernelSegmenter.java new file mode 100644 index 00000000000..ce33ae4868b --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/AlleleFractionKernelSegmenter.java @@ -0,0 +1,114 @@ +package org.broadinstitute.hellbender.tools.copynumber.segmentation; + +import org.apache.commons.math3.util.FastMath; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AlleleFractionSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AlleleFractionSegment; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; +import org.broadinstitute.hellbender.tools.copynumber.utils.segmentation.KernelSegmenter; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.param.ParamUtils; + +import java.util.*; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.stream.Collectors; + +/** + * Segments alternate-allele-fraction data using kernel segmentation. Segments do not span chromosomes. + * + * @author Samuel Lee <slee@broadinstitute.org> + */ +public final class AlleleFractionKernelSegmenter { + private static final Logger logger = LogManager.getLogger(AlleleFractionKernelSegmenter.class); + + private static final int MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME = 10; + + //Gaussian kernel for a specified variance; if variance is zero, use a linear kernel + private static final Function> KERNEL = + variance -> variance == 0. + ? (x, y) -> x * y + : (x, y) -> FastMath.exp(-(x - y) * (x - y) / (2. * variance)); + + private final AllelicCountCollection allelicCounts; + private final Map> allelicCountsPerChromosome; + + public AlleleFractionKernelSegmenter(final AllelicCountCollection allelicCounts) { + Utils.nonNull(allelicCounts); + this.allelicCounts = allelicCounts; + allelicCountsPerChromosome = allelicCounts.getRecords().stream() + .collect(Collectors.groupingBy( + AllelicCount::getContig, + LinkedHashMap::new, + Collectors.mapping(Function.identity(), Collectors.toList()))); + } + + /** + * Segments the internally held {@link AllelicCountCollection} using a separate {@link KernelSegmenter} for each chromosome. 
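A hedged usage sketch of this segmenter; allelicCounts is an assumed upstream AllelicCountCollection, and all numeric arguments are purely illustrative, not recommended defaults:

    final AlleleFractionKernelSegmenter segmenter = new AlleleFractionKernelSegmenter(allelicCounts);
    final AlleleFractionSegmentCollection segmentation = segmenter.findSegmentation(
            25,                             // maxNumChangepointsPerChromosome
            0.025,                          // kernelVariance (0 selects the linear kernel)
            100,                            // kernelApproximationDimension
            Arrays.asList(8, 16, 32, 64),   // windowSizes
            1.,                             // numChangepointsPenaltyLinearFactor
            1.);                            // numChangepointsPenaltyLogLinearFactor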
+ * @param kernelVariance variance of the Gaussian kernel; if zero, a linear kernel is used instead + */ + public AlleleFractionSegmentCollection findSegmentation(final int maxNumChangepointsPerChromosome, + final double kernelVariance, + final int kernelApproximationDimension, + final List windowSizes, + final double numChangepointsPenaltyLinearFactor, + final double numChangepointsPenaltyLogLinearFactor) { + ParamUtils.isPositiveOrZero(maxNumChangepointsPerChromosome, "Maximum number of changepoints must be non-negative."); + ParamUtils.isPositiveOrZero(kernelVariance, "Variance of Gaussian kernel must be non-negative (if zero, a linear kernel will be used)."); + ParamUtils.isPositive(kernelApproximationDimension, "Dimension of kernel approximation must be positive."); + Utils.validateArg(windowSizes.stream().allMatch(ws -> ws > 0), "Window sizes must all be positive."); + Utils.validateArg(new HashSet<>(windowSizes).size() == windowSizes.size(), "Window sizes must all be unique."); + ParamUtils.isPositiveOrZero(numChangepointsPenaltyLinearFactor, + "Linear factor for the penalty on the number of changepoints per chromosome must be non-negative."); + ParamUtils.isPositiveOrZero(numChangepointsPenaltyLogLinearFactor, + "Log-linear factor for the penalty on the number of changepoints per chromosome must be non-negative."); + + logger.info(String.format("Finding changepoints in %d data points and %d chromosomes...", + allelicCounts.getRecords().size(), allelicCountsPerChromosome.size())); + + //loop over chromosomes, find changepoints, and create allele-fraction segments + final List segments = new ArrayList<>(); + for (final String chromosome : allelicCountsPerChromosome.keySet()) { + final List allelicCountsInChromosome = allelicCountsPerChromosome.get(chromosome); + final int numAllelicCountsInChromosome = allelicCountsInChromosome.size(); + logger.info(String.format("Finding changepoints in %d data points in chromosome %s...", + numAllelicCountsInChromosome, chromosome)); + + if (numAllelicCountsInChromosome < MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME) { + logger.warn(String.format("Number of points in chromosome %s (%d) is less than that required (%d), skipping segmentation...", + chromosome, numAllelicCountsInChromosome, MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME)); + final int start = allelicCountsInChromosome.get(0).getStart(); + final int end = allelicCountsInChromosome.get(numAllelicCountsInChromosome - 1).getEnd(); + segments.add(new AlleleFractionSegment( + new SimpleInterval(chromosome, start, end), numAllelicCountsInChromosome)); + continue; + } + + final List alternateAlleleFractionsInChromosome = allelicCountsPerChromosome.get(chromosome).stream() + .map(AllelicCount::getAlternateAlleleFraction) + .collect(Collectors.toList()); + final List changepoints = new ArrayList<>(new KernelSegmenter<>(alternateAlleleFractionsInChromosome) + .findChangepoints(maxNumChangepointsPerChromosome, KERNEL.apply(kernelVariance), kernelApproximationDimension, + windowSizes, numChangepointsPenaltyLinearFactor, numChangepointsPenaltyLogLinearFactor, KernelSegmenter.ChangepointSortOrder.INDEX)); + + if (!changepoints.contains(numAllelicCountsInChromosome)) { + changepoints.add(numAllelicCountsInChromosome - 1); + } + int previousChangepoint = -1; + for (final int changepoint : changepoints) { + final int start = allelicCountsPerChromosome.get(chromosome).get(previousChangepoint + 1).getStart(); + final int end = allelicCountsPerChromosome.get(chromosome).get(changepoint).getEnd(); + final List 
allelicCountsInSegment = allelicCountsInChromosome.subList( + previousChangepoint + 1, changepoint + 1); + segments.add(new AlleleFractionSegment( + new SimpleInterval(chromosome, start, end), allelicCountsInSegment)); + previousChangepoint = changepoint; + } + } + logger.info(String.format("Found %d segments in %d chromosomes.", segments.size(), allelicCountsPerChromosome.keySet().size())); + return new AlleleFractionSegmentCollection(allelicCounts.getSampleMetadata(), segments); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/CopyRatioKernelSegmenter.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/CopyRatioKernelSegmenter.java new file mode 100644 index 00000000000..e038b10aa02 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/CopyRatioKernelSegmenter.java @@ -0,0 +1,118 @@ +package org.broadinstitute.hellbender.tools.copynumber.segmentation; + +import org.apache.commons.math3.util.FastMath; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatioSegment; +import org.broadinstitute.hellbender.tools.copynumber.utils.segmentation.KernelSegmenter; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.param.ParamUtils; + +import java.util.*; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.stream.Collectors; + +/** + * Segments copy-ratio data using kernel segmentation. Segments do not span chromosomes. + * + * @author Samuel Lee <slee@broadinstitute.org> + */ +public final class CopyRatioKernelSegmenter { + private static final Logger logger = LogManager.getLogger(CopyRatioKernelSegmenter.class); + + private static final int MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME = 10; + + //Gaussian kernel for a specified variance; if variance is zero, use a linear kernel + private static final Function> KERNEL = + variance -> variance == 0. + ? (x, y) -> x * y + : (x, y) -> FastMath.exp(-(x - y) * (x - y) / (2. * variance)); + + private final CopyRatioCollection denoisedCopyRatios; + private final Map> denoisedCopyRatiosPerChromosome; //in log2 space + + /** + * @param denoisedCopyRatios in log2 space + */ + public CopyRatioKernelSegmenter(final CopyRatioCollection denoisedCopyRatios) { + Utils.nonNull(denoisedCopyRatios); + this.denoisedCopyRatios = denoisedCopyRatios; + denoisedCopyRatiosPerChromosome = denoisedCopyRatios.getRecords().stream() + .collect(Collectors.groupingBy( + CopyRatio::getContig, + LinkedHashMap::new, + Collectors.mapping(Function.identity(), Collectors.toList()))); + } + + /** + * Segments the internally held {@link CopyRatioCollection} using a separate {@link KernelSegmenter} for each chromosome. 
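Both kernel segmenters share the same bookkeeping for converting changepoints into genomic segments; a standalone sketch of that step, assuming the points are sorted by position and the changepoint list already ends with the last point index, as the code above arranges:

    // Changepoints are inclusive end indices into the per-chromosome point list,
    // so each segment spans points (previousChangepoint + 1)..changepoint.
    static List<SimpleInterval> changepointsToSegments(final String chromosome,
                                                       final List<CopyRatio> points,
                                                       final List<Integer> changepoints) {
        final List<SimpleInterval> segments = new ArrayList<>();
        int previousChangepoint = -1;
        for (final int changepoint : changepoints) {
            final int start = points.get(previousChangepoint + 1).getStart();
            final int end = points.get(changepoint).getEnd();
            segments.add(new SimpleInterval(chromosome, start, end));
            previousChangepoint = changepoint;
        }
        return segments;
    }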
+ * @param kernelVariance variance of the Gaussian kernel; if zero, a linear kernel is used instead + */ + public CopyRatioSegmentCollection findSegmentation(final int maxNumChangepointsPerChromosome, + final double kernelVariance, + final int kernelApproximationDimension, + final List windowSizes, + final double numChangepointsPenaltyLinearFactor, + final double numChangepointsPenaltyLogLinearFactor) { + ParamUtils.isPositiveOrZero(maxNumChangepointsPerChromosome, "Maximum number of changepoints must be non-negative."); + ParamUtils.isPositiveOrZero(kernelVariance, "Variance of Gaussian kernel must be non-negative (if zero, a linear kernel will be used)."); + ParamUtils.isPositive(kernelApproximationDimension, "Dimension of kernel approximation must be positive."); + Utils.validateArg(windowSizes.stream().allMatch(ws -> ws > 0), "Window sizes must all be positive."); + Utils.validateArg(new HashSet<>(windowSizes).size() == windowSizes.size(), "Window sizes must all be unique."); + ParamUtils.isPositiveOrZero(numChangepointsPenaltyLinearFactor, + "Linear factor for the penalty on the number of changepoints per chromosome must be non-negative."); + ParamUtils.isPositiveOrZero(numChangepointsPenaltyLogLinearFactor, + "Log-linear factor for the penalty on the number of changepoints per chromosome must be non-negative."); + + logger.info(String.format("Finding changepoints in %d data points and %d chromosomes...", + denoisedCopyRatios.getRecords().size(), denoisedCopyRatiosPerChromosome.size())); + + //loop over chromosomes, find changepoints, and create copy-ratio segments + final List segments = new ArrayList<>(); + for (final String chromosome : denoisedCopyRatiosPerChromosome.keySet()) { + final List denoisedCopyRatiosInChromosome = denoisedCopyRatiosPerChromosome.get(chromosome); + final int numDenoisedCopyRatiosInChromosome = denoisedCopyRatiosInChromosome.size(); + logger.info(String.format("Finding changepoints in %d data points in chromosome %s...", + numDenoisedCopyRatiosInChromosome, chromosome)); + + if (numDenoisedCopyRatiosInChromosome < MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME) { + logger.warn(String.format("Number of points in chromosome %s (%d) is less than that required (%d), skipping segmentation...", + chromosome, numDenoisedCopyRatiosInChromosome, MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME)); + final int start = denoisedCopyRatiosPerChromosome.get(chromosome).get(0).getStart(); + final int end = denoisedCopyRatiosPerChromosome.get(chromosome).get(numDenoisedCopyRatiosInChromosome - 1).getEnd(); + segments.add(new CopyRatioSegment( + new SimpleInterval(chromosome, start, end), denoisedCopyRatiosInChromosome)); + continue; + } + + final List denoisedLog2CopyRatioValuesInChromosome = denoisedCopyRatiosInChromosome.stream() + .map(CopyRatio::getLog2CopyRatioValue) + .collect(Collectors.toList()); + final List changepoints = new ArrayList<>(new KernelSegmenter<>(denoisedLog2CopyRatioValuesInChromosome) + .findChangepoints(maxNumChangepointsPerChromosome, KERNEL.apply(kernelVariance), kernelApproximationDimension, + windowSizes, numChangepointsPenaltyLinearFactor, numChangepointsPenaltyLogLinearFactor, KernelSegmenter.ChangepointSortOrder.INDEX)); + + if (!changepoints.contains(numDenoisedCopyRatiosInChromosome)) { + changepoints.add(numDenoisedCopyRatiosInChromosome - 1); + } + int previousChangepoint = -1; + for (final int changepoint : changepoints) { + final int start = denoisedCopyRatiosPerChromosome.get(chromosome).get(previousChangepoint + 1).getStart(); + final int end = 
denoisedCopyRatiosPerChromosome.get(chromosome).get(changepoint).getEnd(); + final List<CopyRatio> denoisedCopyRatiosInSegment = denoisedCopyRatiosInChromosome.subList( + previousChangepoint + 1, changepoint + 1); + segments.add(new CopyRatioSegment( + new SimpleInterval(chromosome, start, end), + denoisedCopyRatiosInSegment)); + previousChangepoint = changepoint; + } + } + logger.info(String.format("Found %d segments in %d chromosomes.", segments.size(), denoisedCopyRatiosPerChromosome.keySet().size())); + return new CopyRatioSegmentCollection(denoisedCopyRatios.getSampleMetadata(), segments); + } +}
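The segment-construction loop above turns a sorted list of changepoint indices into contiguous, inclusive index ranges; the last data index is appended first so the final segment is always closed. A worked toy version of just that bookkeeping (our illustration, not PR code):

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    public final class ChangepointBookkeepingDemo {
        public static void main(final String[] args) {
            final int numPoints = 10;
            final List<Integer> changepoints = new ArrayList<>(Arrays.asList(3, 6));
            if (!changepoints.contains(numPoints)) {
                changepoints.add(numPoints - 1); // close the final segment
            }
            int previousChangepoint = -1;
            for (final int changepoint : changepoints) {
                // each segment spans point indices [previousChangepoint + 1, changepoint]
                System.out.printf("segment over point indices [%d, %d]%n",
                        previousChangepoint + 1, changepoint);
                previousChangepoint = changepoint;
            }
            // prints [0, 3], [4, 6], [7, 9]: every point lands in exactly one segment
        }
    }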
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/MultidimensionalKernelSegmenter.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/MultidimensionalKernelSegmenter.java new file mode 100644 index 00000000000..10640e0ceaa --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/MultidimensionalKernelSegmenter.java @@ -0,0 +1,199 @@ +package org.broadinstitute.hellbender.tools.copynumber.segmentation; + +import htsjdk.samtools.util.Locatable; +import htsjdk.samtools.util.OverlapDetector; +import org.apache.commons.math3.distribution.NormalDistribution; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.MultidimensionalSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.MultidimensionalSegment; +import org.broadinstitute.hellbender.tools.copynumber.utils.segmentation.KernelSegmenter; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.param.ParamUtils; + +import java.util.*; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.stream.Collectors; + +/** + * Segments copy-ratio and alternate-allele-fraction data using kernel segmentation. Segments do not span chromosomes. + * Only the first allele-fraction site in each copy-ratio interval is used. The alternate-allele fraction in + * copy-ratio intervals that do not contain any sites is imputed to be balanced at 0.5. + * + * @author Samuel Lee <slee@broadinstitute.org> + */ +public final class MultidimensionalKernelSegmenter { + private static final Logger logger = LogManager.getLogger(MultidimensionalKernelSegmenter.class); + + private static final int MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME = 10; + + //assume alternate-allele fraction is 0.5 for missing data + private static final SimpleInterval DUMMY_INTERVAL = new SimpleInterval("DUMMY", 1, 1); + private static final AllelicCount BALANCED_ALLELIC_COUNT = new AllelicCount(DUMMY_INTERVAL, 1, 1); + + //Gaussian kernel for a specified variance; if variance is zero, use a linear kernel + private static final Function<Double, BiFunction<Double, Double, Double>> KERNEL = + standardDeviation -> standardDeviation == 0. + ? (x, y) -> x * y + : (x, y) -> new NormalDistribution(null, x, standardDeviation).density(y); + + static final class MultidimensionalPoint implements Locatable { + private final SimpleInterval interval; + private final double log2CopyRatio; + private final double alternateAlleleFraction; + + MultidimensionalPoint(final SimpleInterval interval, + final double log2CopyRatio, + final double alternateAlleleFraction) { + this.interval = interval; + this.log2CopyRatio = log2CopyRatio; + this.alternateAlleleFraction = alternateAlleleFraction; + } + + @Override + public String getContig() { + return interval.getContig(); + } + + @Override + public int getStart() { + return interval.getStart(); + } + + @Override + public int getEnd() { + return interval.getEnd(); + } + } + + private final CopyRatioCollection denoisedCopyRatios; + private final OverlapDetector<CopyRatio> copyRatioMidpointOverlapDetector; + private final AllelicCountCollection allelicCounts; + private final OverlapDetector<AllelicCount> allelicCountOverlapDetector; + private final Map<String, List<MultidimensionalPoint>> multidimensionalPointsPerChromosome; + + public MultidimensionalKernelSegmenter(final CopyRatioCollection denoisedCopyRatios, + final AllelicCountCollection allelicCounts) { + Utils.nonNull(denoisedCopyRatios); + Utils.nonNull(allelicCounts); + Utils.validateArg(denoisedCopyRatios.getSampleName().equals(allelicCounts.getSampleName()), + "Sample names do not match."); + this.denoisedCopyRatios = denoisedCopyRatios; + copyRatioMidpointOverlapDetector = denoisedCopyRatios.getMidpointOverlapDetector(); + this.allelicCounts = allelicCounts; + allelicCountOverlapDetector = allelicCounts.getOverlapDetector(); + final int numAllelicCountsToUse = (int) denoisedCopyRatios.getRecords().stream() + .filter(allelicCountOverlapDetector::overlapsAny) + .count(); + logger.info(String.format("Using first allelic-count site in each copy-ratio interval (%d / %d) for multidimensional segmentation...", + numAllelicCountsToUse, allelicCounts.size())); + multidimensionalPointsPerChromosome = denoisedCopyRatios.getRecords().stream() + .map(cr -> new MultidimensionalPoint( + cr.getInterval(), + cr.getLog2CopyRatioValue(), + allelicCountOverlapDetector.getOverlaps(cr).stream() + .min(SampleLocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR::compare) + .orElse(BALANCED_ALLELIC_COUNT).getAlternateAlleleFraction())) + .collect(Collectors.groupingBy( + MultidimensionalPoint::getContig, + LinkedHashMap::new, + Collectors.toList())); + } + + /** + * Segments the internally held {@link CopyRatioCollection} and {@link AllelicCountCollection} + * using a separate {@link KernelSegmenter} for each chromosome.
+ * @param kernelVarianceCopyRatio variance of the Gaussian kernel used for copy-ratio data; + * if zero, a linear kernel is used instead + * @param kernelVarianceAlleleFraction variance of the Gaussian kernel used for allele-fraction data; + * if zero, a linear kernel is used instead + * @param kernelScalingAlleleFraction relative scaling S of the kernel K_AF for allele-fraction data + * to the kernel K_CR for copy-ratio data; + * the total kernel is K_CR + S * K_AF + */ + public MultidimensionalSegmentCollection findSegmentation(final int maxNumChangepointsPerChromosome, + final double kernelVarianceCopyRatio, + final double kernelVarianceAlleleFraction, + final double kernelScalingAlleleFraction, + final int kernelApproximationDimension, + final List<Integer> windowSizes, + final double numChangepointsPenaltyLinearFactor, + final double numChangepointsPenaltyLogLinearFactor) { + ParamUtils.isPositiveOrZero(maxNumChangepointsPerChromosome, "Maximum number of changepoints must be non-negative."); + ParamUtils.isPositiveOrZero(kernelVarianceCopyRatio, "Variance of copy-ratio Gaussian kernel must be non-negative (if zero, a linear kernel will be used)."); + ParamUtils.isPositiveOrZero(kernelVarianceAlleleFraction, "Variance of allele-fraction Gaussian kernel must be non-negative (if zero, a linear kernel will be used)."); + ParamUtils.isPositiveOrZero(kernelScalingAlleleFraction, "Scaling of allele-fraction Gaussian kernel must be non-negative."); + ParamUtils.isPositive(kernelApproximationDimension, "Dimension of kernel approximation must be positive."); + Utils.validateArg(windowSizes.stream().allMatch(ws -> ws > 0), "Window sizes must all be positive."); + Utils.validateArg(new HashSet<>(windowSizes).size() == windowSizes.size(), "Window sizes must all be unique."); + ParamUtils.isPositiveOrZero(numChangepointsPenaltyLinearFactor, + "Linear factor for the penalty on the number of changepoints per chromosome must be non-negative."); + ParamUtils.isPositiveOrZero(numChangepointsPenaltyLogLinearFactor, + "Log-linear factor for the penalty on the number of changepoints per chromosome must be non-negative."); + + final BiFunction<MultidimensionalPoint, MultidimensionalPoint, Double> kernel = constructKernel( + kernelVarianceCopyRatio, kernelVarianceAlleleFraction, kernelScalingAlleleFraction); + + logger.info(String.format("Finding changepoints in (%d, %d) data points and %d chromosomes...", + denoisedCopyRatios.getRecords().size(), allelicCounts.size(), multidimensionalPointsPerChromosome.size())); + + //loop over chromosomes, find changepoints, and create multidimensional segments + final List<MultidimensionalSegment> segments = new ArrayList<>(); + for (final String chromosome : multidimensionalPointsPerChromosome.keySet()) { + final List<MultidimensionalPoint> multidimensionalPointsInChromosome = multidimensionalPointsPerChromosome.get(chromosome); + final int numMultidimensionalPointsInChromosome = multidimensionalPointsInChromosome.size(); + logger.info(String.format("Finding changepoints in %d data points in chromosome %s...", + numMultidimensionalPointsInChromosome, chromosome)); + + if (numMultidimensionalPointsInChromosome < MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME) { + logger.warn(String.format("Number of points in chromosome %s (%d) is less than that required (%d), skipping segmentation...", + chromosome, numMultidimensionalPointsInChromosome, MIN_NUM_POINTS_REQUIRED_PER_CHROMOSOME)); + final int start = multidimensionalPointsInChromosome.get(0).getStart(); + final int end = multidimensionalPointsInChromosome.get(numMultidimensionalPointsInChromosome - 1).getEnd(); + segments.add(new
MultidimensionalSegment( + new SimpleInterval(chromosome, start, end), + copyRatioMidpointOverlapDetector, + allelicCountOverlapDetector)); + continue; + } + + final List<Integer> changepoints = new ArrayList<>(new KernelSegmenter<>(multidimensionalPointsInChromosome) + .findChangepoints(maxNumChangepointsPerChromosome, kernel, kernelApproximationDimension, + windowSizes, numChangepointsPenaltyLinearFactor, numChangepointsPenaltyLogLinearFactor, KernelSegmenter.ChangepointSortOrder.INDEX)); + + if (!changepoints.contains(numMultidimensionalPointsInChromosome)) { + changepoints.add(numMultidimensionalPointsInChromosome - 1); + } + int previousChangepoint = -1; + for (final int changepoint : changepoints) { + final int start = multidimensionalPointsPerChromosome.get(chromosome).get(previousChangepoint + 1).getStart(); + final int end = multidimensionalPointsPerChromosome.get(chromosome).get(changepoint).getEnd(); + segments.add(new MultidimensionalSegment( + new SimpleInterval(chromosome, start, end), + copyRatioMidpointOverlapDetector, + allelicCountOverlapDetector)); + previousChangepoint = changepoint; + } + } + logger.info(String.format("Found %d segments in %d chromosomes.", segments.size(), multidimensionalPointsPerChromosome.keySet().size())); + return new MultidimensionalSegmentCollection(allelicCounts.getSampleMetadata(), segments); + } + + private BiFunction<MultidimensionalPoint, MultidimensionalPoint, Double> constructKernel(final double kernelVarianceCopyRatio, + final double kernelVarianceAlleleFraction, + final double kernelScalingAlleleFraction) { + final double standardDeviationCopyRatio = Math.sqrt(kernelVarianceCopyRatio); + final double standardDeviationAlleleFraction = Math.sqrt(kernelVarianceAlleleFraction); + return (p1, p2) -> + KERNEL.apply(standardDeviationCopyRatio).apply(p1.log2CopyRatio, p2.log2CopyRatio) + + kernelScalingAlleleFraction * KERNEL.apply(standardDeviationAlleleFraction).apply(p1.alternateAlleleFraction, p2.alternateAlleleFraction); + + } +} \ No newline at end of file
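Two details of the class above are easy to miss. First, a copy-ratio interval with no overlapping site falls back to BALANCED_ALLELIC_COUNT, whose alternate-allele fraction is 1 / (1 + 1) = 0.5, which is the imputation described in the class javadoc. Second, constructKernel builds the total kernel K(p1, p2) = K_CR + S * K_AF advertised in the findSegmentation javadoc. A rough standalone sketch of that combination, substituting an unnormalized unit-variance Gaussian for the NormalDistribution density (all names here are illustrative, not PR code):

    import java.util.function.BiFunction;

    public final class CombinedKernelDemo {
        // toy stand-in for MultidimensionalPoint: one copy-ratio and one allele-fraction coordinate
        record Point(double log2CopyRatio, double altAlleleFraction) {}

        public static void main(final String[] args) {
            final double scalingAlleleFraction = 1.; // S in K = K_CR + S * K_AF
            final BiFunction<Double, Double, Double> gaussian =
                    (x, y) -> Math.exp(-(x - y) * (x - y) / 2.);
            final BiFunction<Point, Point, Double> kernel = (p1, p2) ->
                    gaussian.apply(p1.log2CopyRatio(), p2.log2CopyRatio())
                            + scalingAlleleFraction * gaussian.apply(p1.altAlleleFraction(), p2.altAlleleFraction());
            final Point a = new Point(0., 0.5); // interval without sites: imputed fraction 0.5
            final Point b = new Point(1., 0.4);
            System.out.println(kernel.apply(a, b)); // exp(-0.5) + exp(-0.005) ~ 1.602
        }
    }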
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/segmentation/KernelSegmenter.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/segmentation/KernelSegmenter.java index 24bb5d3513f..baf717cef39 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/segmentation/KernelSegmenter.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/segmentation/KernelSegmenter.java @@ -8,7 +8,6 @@ import org.apache.commons.math3.random.RandomGeneratorFactory; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.tools.copynumber.utils.optimization.PersistenceOptimizer; import org.broadinstitute.hellbender.utils.IndexRange; import org.broadinstitute.hellbender.utils.MathUtils; @@ -134,18 +133,18 @@ public List<Integer> findChangepoints(final int maxNumChangepoints, return Collections.emptyList(); } - logger.info(String.format("Finding up to %d changepoints in %d data points...", maxNumChangepoints, data.size())); + logger.debug(String.format("Finding up to %d changepoints in %d data points...", maxNumChangepoints, data.size())); final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED)); - logger.info("Calculating low-rank approximation to kernel matrix..."); + logger.debug("Calculating low-rank approximation to kernel matrix..."); final RealMatrix reducedObservationMatrix = calculateReducedObservationMatrix(rng, data, kernel, kernelApproximationDimension); final double[] kernelApproximationDiagonal = calculateKernelApproximationDiagonal(reducedObservationMatrix); - logger.info(String.format("Finding changepoint candidates for all window sizes %s...", windowSizes.toString())); + logger.debug(String.format("Finding changepoint candidates for all window sizes %s...", windowSizes.toString())); final List<Integer> changepointCandidates = findChangepointCandidates( data, reducedObservationMatrix, kernelApproximationDiagonal, maxNumChangepoints, windowSizes); - logger.info("Performing backward model selection on changepoint candidates..."); + logger.debug("Performing backward model selection on changepoint candidates..."); return selectChangepoints( changepointCandidates, maxNumChangepoints, numChangepointsPenaltyLinearFactor, numChangepointsPenaltyLogLinearFactor, reducedObservationMatrix, kernelApproximationDiagonal).stream() @@ -198,19 +197,19 @@ private static <DATA> RealMatrix calculateReducedObservationMatrix(final RandomG final BiFunction<DATA, DATA, Double> kernel, final int kernelApproximationDimension) { if (kernelApproximationDimension > data.size()) { - logger.warn("Specified dimension of the kernel approximation exceeds the number of data points to segment; " + - "using all data points to calculate kernel matrix."); + logger.warn(String.format("Specified dimension of the kernel approximation (%d) exceeds the number of data points (%d) to segment; " + + "using all data points to calculate kernel matrix.", kernelApproximationDimension, data.size())); } //subsample data with replacement final int numSubsample = Math.min(kernelApproximationDimension, data.size()); - logger.info(String.format("Subsampling %d points from data to find kernel approximation...", numSubsample)); + logger.debug(String.format("Subsampling %d points from data to find kernel approximation...", numSubsample)); final List<DATA> dataSubsample = numSubsample == data.size() ? data : IntStream.range(0, numSubsample).mapToObj(i -> data.get(rng.nextInt(data.size()))).collect(Collectors.toList()); //calculate (symmetric) kernel matrix of subsampled data - logger.info(String.format("Calculating kernel matrix of subsampled data (%d x %d)...", numSubsample, numSubsample)); + logger.debug(String.format("Calculating kernel matrix of subsampled data (%d x %d)...", numSubsample, numSubsample)); final RealMatrix subKernelMatrix = new Array2DRowRealMatrix(numSubsample, numSubsample); for (int i = 0; i < numSubsample; i++) { for (int j = 0; j < i; j++) { @@ -222,11 +221,11 @@ } //perform SVD of kernel matrix of subsampled data - logger.info(String.format("Performing SVD of kernel matrix of subsampled data (%d x %d)...", numSubsample, numSubsample)); + logger.debug(String.format("Performing SVD of kernel matrix of subsampled data (%d x %d)...", numSubsample, numSubsample)); final SingularValueDecomposition svd = new SingularValueDecomposition(subKernelMatrix); //calculate reduced observation matrix - logger.info(String.format("Calculating reduced observation matrix (%d x %d)...", data.size(), numSubsample)); + logger.debug(String.format("Calculating reduced observation matrix (%d x %d)...", data.size(), numSubsample)); final double[] invSqrtSingularValues = Arrays.stream(svd.getSingularValues()).map(Math::sqrt).map(x -> 1.
/ (x + EPSILON)).toArray(); final RealMatrix subKernelUMatrix = new Array2DRowRealMatrix(numSubsample, numSubsample); subKernelUMatrix.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() { @@ -267,7 +266,7 @@ private static <DATA> List<Integer> findChangepointCandidates(final List<DATA> d logger.debug(String.format("Calculating local changepoints costs for window size %d...", windowSize)); if (windowSize > data.size()) { logger.warn(String.format("Number of points needed to calculate local changepoint costs (2 * window size = %d) " + - "exceeds number of data points %d. Local changepoint costs will not be calculated for this window size.", + "exceeds number of data points (%d). Local changepoint costs will not be calculated for this window size.", 2 * windowSize, data.size())); continue; } @@ -281,7 +280,7 @@ } if (changepointCandidates.isEmpty()) { - throw new GATKException.ShouldNeverReachHereException("No changepoint candidates found."); + logger.warn("No changepoint candidates were found. The specified window sizes may be inappropriate, or there may be insufficient data points."); } return changepointCandidates;
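For context on the hunks above: calculateReducedObservationMatrix appears to implement a Nystrom-style low-rank approximation. It subsamples up to kernelApproximationDimension points, builds their symmetric kernel matrix, takes its SVD, and projects every data point through U and the inverse square roots of the singular values, so the full kernel matrix is approximated by Z * Z^T. The demotion of the exception to logger.warn also fits this picture, since an empty candidate list can arise from legitimate user inputs (too few points, oversized windows) rather than a programming error. A compact in-sample illustration of the Z * Z^T identity with commons-math (toy matrix, our own demo, not PR code):

    import org.apache.commons.math3.linear.Array2DRowRealMatrix;
    import org.apache.commons.math3.linear.RealMatrix;
    import org.apache.commons.math3.linear.SingularValueDecomposition;

    public final class LowRankKernelDemo {
        public static void main(final String[] args) {
            // symmetric positive-definite kernel matrix of three subsampled points
            final RealMatrix subKernelMatrix = new Array2DRowRealMatrix(new double[][] {
                    {1.0, 0.8, 0.1},
                    {0.8, 1.0, 0.2},
                    {0.1, 0.2, 1.0}});
            final SingularValueDecomposition svd = new SingularValueDecomposition(subKernelMatrix);
            final RealMatrix u = svd.getU();
            final double[] s = svd.getSingularValues();
            // Z = U * sqrt(S); for a symmetric PSD matrix, Z * Z^T reproduces K exactly
            // when all components are kept, and approximately when truncated
            final RealMatrix z = u.copy();
            for (int j = 0; j < s.length; j++) {
                for (int i = 0; i < s.length; i++) {
                    z.setEntry(i, j, u.getEntry(i, j) * Math.sqrt(s[j]));
                }
            }
            System.out.println(z.multiply(z.transpose())); // recovers K up to rounding
        }
    }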
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/genome/SparkGenomeReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/genome/SparkGenomeReadCounts.java index b2addafcd45..7d7fe6f27f1 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/genome/SparkGenomeReadCounts.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/genome/SparkGenomeReadCounts.java @@ -16,8 +16,8 @@ import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter; import org.broadinstitute.hellbender.engine.spark.GATKSparkTool; import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCount; -import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; import org.broadinstitute.hellbender.tools.exome.ReadCountCollectionUtils; import org.broadinstitute.hellbender.tools.exome.SampleCollection; @@ -31,7 +31,6 @@ import java.io.File; import java.util.*; import java.util.stream.Collectors; -import java.util.stream.IntStream; /** * Collects read counts on whole genome sequencing (WGS) alignments using Spark. diff --git a/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterReader.java b/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterReader.java index ab07fa3f838..e341ea87a0b 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterReader.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterReader.java @@ -27,19 +27,19 @@ public ParameterReader(final File file, final Class<T> parameterClass) throws IO protected Map.Entry<T, PosteriorSummary> createRecord(final DataLine dataLine) { final String parameterName = dataLine.get(ParameterTableColumn.PARAMETER_NAME); final T parameter = Enum.valueOf(parameterClass, parameterName); - final double center = dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_MODE); - final double lower = dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_LOWER); - final double upper = dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_UPPER); + final double center = dataLine.getDouble(ParameterTableColumn.POSTERIOR_MODE); + final double lower = dataLine.getDouble(ParameterTableColumn.POSTERIOR_LOWER); + final double upper = dataLine.getDouble(ParameterTableColumn.POSTERIOR_UPPER); final DecileCollection deciles = new DecileCollection(Arrays.asList( - dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_10), - dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_20), - dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_30), - dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_40), - dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_50), - dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_60), - dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_70), - dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_80), - dataLine.getDouble(ParameterTableColumn.PARAMETER_POSTERIOR_90))); + dataLine.getDouble(ParameterTableColumn.POSTERIOR_10), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_20), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_30), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_40), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_50), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_60), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_70), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_80), + dataLine.getDouble(ParameterTableColumn.POSTERIOR_90))); final PosteriorSummary posteriorSummary = new PosteriorSummary(center, lower, upper); posteriorSummary.setDeciles(deciles); return new AbstractMap.SimpleEntry<>(parameter, posteriorSummary); diff --git a/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterTableColumn.java b/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterTableColumn.java index 36be10d8e6c..e1361f561eb 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterTableColumn.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/mcmc/ParameterTableColumn.java @@ -6,28 +6,20 @@ * @author Samuel Lee <slee@broadinstitute.org> */ public enum ParameterTableColumn { - PARAMETER_NAME("Parameter"), - PARAMETER_POSTERIOR_MODE("Post_Mode"), - PARAMETER_POSTERIOR_LOWER("Post_Lo"), - PARAMETER_POSTERIOR_UPPER("Post_Hi"), - PARAMETER_POSTERIOR_10("Post_10"), - PARAMETER_POSTERIOR_20("Post_20"), - PARAMETER_POSTERIOR_30("Post_30"), - PARAMETER_POSTERIOR_40("Post_40"), - PARAMETER_POSTERIOR_50("Post_50"), - PARAMETER_POSTERIOR_60("Post_60"), - PARAMETER_POSTERIOR_70("Post_70"), - PARAMETER_POSTERIOR_80("Post_80"), - PARAMETER_POSTERIOR_90("Post_90"); +
PARAMETER_NAME, + POSTERIOR_MODE, + POSTERIOR_LOWER, + POSTERIOR_UPPER, + POSTERIOR_10, + POSTERIOR_20, + POSTERIOR_30, + POSTERIOR_40, + POSTERIOR_50, + POSTERIOR_60, + POSTERIOR_70, + POSTERIOR_80, + POSTERIOR_90; - private final String columnName; //store the column names - - ParameterTableColumn(final String columnName) { this.columnName = columnName; } - - @Override - public String toString() { - return columnName; - } public static final TableColumnCollection COLUMNS = new TableColumnCollection((Object[]) values()); } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/GetSampleNameIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/GetSampleNameIntegrationTest.java index f247bdebbd4..f1e87b1fac5 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/GetSampleNameIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/GetSampleNameIntegrationTest.java @@ -11,31 +11,30 @@ import java.nio.file.Files; public class GetSampleNameIntegrationTest extends CommandLineProgramTest { - private static final String TEST_SUB_DIR = publicTestDir + "org/broadinstitute/hellbender/tools/copynumber/allelic"; - private static final File NORMAL_BAM_FILE = new File(TEST_SUB_DIR, "collect-allelic-counts-normal.bam"); - private static final String TEST_SUB_DIR2 = publicTestDir + "org/broadinstitute/hellbender/tools"; - private static final File MS_BAD_BAM_FILE = new File(TEST_SUB_DIR2, "multi_sample_bam_header.bam"); + private static final File SINGLE_SAMPLE_BAM_FILE = new File(toolsTestDir, "valid.bam"); + private static final File BAD_MULTI_SAMPLE_BAM_FILE = new File(toolsTestDir, "multi_sample_bam_header.bam"); @Test public void testBasicUsage() throws IOException { final File outputFile = createTempFile("get-sample-name", ".txt"); final String[] arguments = { - "-" + StandardArgumentDefinitions.INPUT_SHORT_NAME, NORMAL_BAM_FILE.getAbsolutePath(), - "-" + StandardArgumentDefinitions.OUTPUT_SHORT_NAME, outputFile.getAbsolutePath() + "-" + StandardArgumentDefinitions.INPUT_SHORT_NAME, SINGLE_SAMPLE_BAM_FILE.getAbsolutePath(), + "-" + StandardArgumentDefinitions.OUTPUT_SHORT_NAME, outputFile.getAbsolutePath(), + "--verbosity", "INFO" }; runCommandLine(arguments); Assert.assertTrue(outputFile.exists()); Assert.assertTrue(outputFile.length() > 0); Assert.assertTrue(Files.readAllLines(outputFile.toPath()).stream().count() == 1); - Assert.assertTrue(Files.readAllLines(outputFile.toPath()).stream().filter(n -> n.equals("20")).count() == 1); + Assert.assertTrue(Files.readAllLines(outputFile.toPath()).stream().filter(n -> n.equals("Hi,Mom!")).count() == 1); } @Test(expectedExceptions = UserException.class) public void testMultiSampleBam() { final File outputFile = createTempFile("get-sample-name-ms", ".txt"); final String[] arguments = { - "-" + StandardArgumentDefinitions.INPUT_SHORT_NAME, MS_BAD_BAM_FILE.getAbsolutePath(), + "-" + StandardArgumentDefinitions.INPUT_SHORT_NAME, BAD_MULTI_SAMPLE_BAM_FILE.getAbsolutePath(), "-" + StandardArgumentDefinitions.OUTPUT_SHORT_NAME, outputFile.getAbsolutePath() }; runCommandLine(arguments); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervalsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervalsIntegrationTest.java index 35471e1e0a3..9380063106f 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervalsIntegrationTest.java +++ 
b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervalsIntegrationTest.java @@ -1,10 +1,10 @@ package org.broadinstitute.hellbender.tools.copynumber; import org.broadinstitute.hellbender.CommandLineProgramTest; -import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotatedInterval; -import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotatedIntervalCollection; -import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotationSet; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.collections.LocatableCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotationSet; import org.broadinstitute.hellbender.utils.IntervalMergingRule; import org.broadinstitute.hellbender.utils.IntervalSetRule; import org.broadinstitute.hellbender.utils.SimpleInterval; @@ -21,7 +21,7 @@ * @author Samuel Lee <slee@broadinstitute.org> */ public final class AnnotateIntervalsIntegrationTest extends CommandLineProgramTest { - private static final String TEST_SUB_DIR = toolsTestDir + "copynumber/"; + private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber"); private static final File INTERVALS_FILE = new File(TEST_SUB_DIR, "annotate-intervals-test.interval_list"); private static final File REFERENCE_FILE = new File(b37_reference_20_21); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegmentsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegmentsIntegrationTest.java index 3ec326471db..6a7752fd897 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegmentsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CallCopyRatioSegmentsIntegrationTest.java @@ -1,9 +1,8 @@ package org.broadinstitute.hellbender.tools.copynumber; import org.broadinstitute.hellbender.CommandLineProgramTest; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import org.broadinstitute.hellbender.tools.copynumber.coverage.caller.CalledCopyRatioSegmentCollection; -import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CalledCopyRatioSegmentCollection; +import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder; import org.testng.Assert; import org.testng.annotations.Test; @@ -13,20 +12,16 @@ * Integration test for {@link CallCopyRatioSegments}. 
*/ public final class CallCopyRatioSegmentsIntegrationTest extends CommandLineProgramTest { - private static final File TEST_DIR = new File(toolsTestDir, "copynumber/coverage/caller"); - private static final File TEST_DENOISED_COPY_RATIOS = new File(TEST_DIR, "call-copy-ratio-segments-denoised-copy-ratios.tsv"); - private static final File TEST_SEGMENTS = new File(TEST_DIR, "call-copy-ratio-segments-segments.seg"); + private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber"); + private static final File TEST_SEGMENTS = new File(TEST_SUB_DIR, "call-copy-ratio-segments-segments.seg"); @Test public void testCallSegments() { final File outputFile = createTempFile("test.called",".seg"); - - final String[] arguments = { - "-" + CopyNumberStandardArgument.DENOISED_COPY_RATIOS_FILE_SHORT_NAME, TEST_DENOISED_COPY_RATIOS.getAbsolutePath(), - "-" + CopyNumberStandardArgument.SEGMENTS_FILE_SHORT_NAME, TEST_SEGMENTS.getAbsolutePath(), - "-" + StandardArgumentDefinitions.OUTPUT_SHORT_NAME, outputFile.getAbsolutePath() - }; - runCommandLine(arguments); + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder() + .addInput(TEST_SEGMENTS) + .addOutput(outputFile); + runCommandLine(argsBuilder); final CalledCopyRatioSegmentCollection calledCopyRatioSegments = new CalledCopyRatioSegmentCollection(outputFile); Assert.assertEquals(calledCopyRatioSegments.getRecords().stream().map(s -> s.getCall().getOutputString()).toArray(), new String[] {"+", "-", "0", "0"}); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCountsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCountsIntegrationTest.java index 1286162cda6..6e3bfdfad6c 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCountsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectAllelicCountsIntegrationTest.java @@ -2,9 +2,9 @@ import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount.AllelicCount; -import org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount.AllelicCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; import org.broadinstitute.hellbender.utils.Nucleotide; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.testng.Assert; @@ -21,7 +21,7 @@ */ public final class CollectAllelicCountsIntegrationTest extends CommandLineProgramTest { - private static final String TEST_SUB_DIR = toolsTestDir + "copynumber/allelic"; + private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber"); private static final File NORMAL_BAM_FILE = new File(TEST_SUB_DIR, "collect-allelic-counts-normal.bam"); private static final File TUMOR_BAM_FILE = new File(TEST_SUB_DIR, "collect-allelic-counts-tumor.bam"); private static final File SITES_FILE = new File(TEST_SUB_DIR, "collect-allelic-counts-sites.interval_list"); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCountsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCountsIntegrationTest.java index 
0797b38ca26..43e7a9046e6 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCountsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CollectFragmentCountsIntegrationTest.java @@ -2,7 +2,7 @@ import htsjdk.samtools.SAMFileHeader; import org.broadinstitute.hellbender.CommandLineProgramTest; -import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection; import org.broadinstitute.hellbender.utils.IntervalMergingRule; import org.broadinstitute.hellbender.utils.IntervalSetRule; import org.broadinstitute.hellbender.utils.SimpleInterval; @@ -22,7 +22,7 @@ * @author Andrey Smirnov <asmirnov@broadinstitute.org> */ public class CollectFragmentCountsIntegrationTest extends CommandLineProgramTest { - private static final String TEST_SUB_DIR = publicTestDir + "org/broadinstitute/hellbender/tools/copynumber/collectfragmentcounts"; + private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber"); private static final File NA12878_BAM = new File(TEST_SUB_DIR, "collect-fragment-counts-NA12878.bam"); private static final File NA12878_FRAGMENT_COUNTS_EXPECTED_OUTPUT = new File(TEST_SUB_DIR, "collect-fragment-counts-NA12878-expected.tsv"); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormalsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormalsIntegrationTest.java index c45bf0b941f..eeae497d848 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormalsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormalsIntegrationTest.java @@ -6,17 +6,17 @@ import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation; import org.broadinstitute.hdf5.HDF5File; import org.broadinstitute.hellbender.CommandLineProgramTest; -import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotatedInterval; -import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotatedIntervalCollection; -import org.broadinstitute.hellbender.tools.copynumber.annotation.AnnotationSet; -import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection; -import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.HDF5SVDReadCountPanelOfNormals; -import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.SVDDenoisedCopyRatioResult; -import org.broadinstitute.hellbender.tools.copynumber.coverage.denoising.svd.SVDReadCountPanelOfNormals; -import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCount; -import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.denoising.HDF5SVDReadCountPanelOfNormals; +import org.broadinstitute.hellbender.tools.copynumber.denoising.SVDDenoisedCopyRatioResult; +import org.broadinstitute.hellbender.tools.copynumber.denoising.SVDReadCountPanelOfNormals; import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection; +import 
org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotationSet; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder; import org.testng.Assert; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCountsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCountsIntegrationTest.java index a8587e6a9e7..dd214333a51 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCountsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCountsIntegrationTest.java @@ -2,8 +2,8 @@ import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberStandardArgument; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection; import org.broadinstitute.hellbender.utils.test.ArgumentsBuilder; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -20,7 +20,7 @@ * @author Samuel Lee <slee@broadinstitute.org> */ public final class DenoiseReadCountsIntegrationTest extends CommandLineProgramTest { - private static final String TEST_SUB_DIR = toolsTestDir + "copynumber/coverage/denoising"; + private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber"); private static final File WGS_READ_COUNTS_TSV_FILE = new File(TEST_SUB_DIR, "denoise-read-counts-wgs-read-counts-HCC1143_BL-n1-chr20-downsampled-deduplicated.tsv"); private static final File WGS_READ_COUNTS_HDF5_FILE = new File(TEST_SUB_DIR, "denoise-read-counts-wgs-read-counts-HCC1143_BL-n1-chr20-downsampled-deduplicated.hdf5"); private static final File WGS_ANNOTATED_INTERVALS_FILE = new File(TEST_SUB_DIR, "denoise-read-counts-wgs-annotated-intervals.tsv"); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervalsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervalsIntegrationTest.java index dd022a132eb..8a0ea20316a 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervalsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/PreprocessIntervalsIntegrationTest.java @@ -10,10 +10,11 @@ import java.io.File; import java.util.Arrays; +import java.util.Collections; import java.util.List; public final class PreprocessIntervalsIntegrationTest extends CommandLineProgramTest { - private static final String TEST_SUB_DIR = toolsTestDir + "copynumber/"; + private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber"); private static final File INTERVAL_LIST_FILE = new File(TEST_SUB_DIR, "preprocess-intervals-test.interval_list"); private static final File REFERENCE_FILE = new File(b37_reference_20_21); @@ -32,6 +33,18 @@ public Object[][] testData() { new Interval("20", 13_000, 20_000) ); + // Test for no binning 
(specified by zero bin length) + final int binLengthNoBinningTest = 0; + final int paddingLengthNoBinningTest = 0; + final List<Interval> inputIntervalsNoBinningTest = Arrays.asList( + new Interval("20", 3_000, 20_000), + new Interval("20", 200, 1_900) + ); + final List<Interval> expectedBinsNoBinningTest = Arrays.asList( + new Interval("20", 200, 1_900), + new Interval("20", 3_000, 20_000) + ); + // Test for overlapping intervals final int binLengthOverlappingIntervalTest = 10_000; final int paddingLengthOverlappingIntervalTest = 500; @@ -61,7 +74,7 @@ public Object[][] testData() { // Test for whole chromosome final int binLengthWholeChromosomeTest = 10_000_000; final int paddingLengthWholeChromosomeTest = 500; - final List<Interval> inputIntervalsWholeChromosomeTest = Arrays.asList(new Interval("20", 1, 63_025_520)); + final List<Interval> inputIntervalsWholeChromosomeTest = Collections.singletonList(new Interval("20", 1, 63_025_520)); final List<Interval> expectedBinsWholeChromosomeTest = Arrays.asList( new Interval("20", 1, 10_000_000), new Interval("20", 10_000_001, 20_000_000), @@ -75,7 +88,7 @@ public Object[][] testData() { // Test for whole genome -- when we don't give any intervals, then the tool assumes that the user wants to sequence the whole genome final int binLengthWholeGenomeTest = 10_000_000; final int paddingLengthWholeGenomeTest = 500; - final List<Interval> inputIntervalsWholeGenomeTest = Arrays.asList(); + final List<Interval> inputIntervalsWholeGenomeTest = Collections.emptyList(); final List<Interval> expectedBinsWholeGenomeTest = Arrays.asList( new Interval("20", 1, 10_000_000), new Interval("20", 10_000_001, 20_000_000), @@ -94,6 +107,7 @@ public Object[][] testData() { // Return all test data return new Object[][]{ {binLengthSeparateIntervalTest, paddingLengthSeparateIntervalTest, inputIntervalsSeparateIntervalTest, expectedBinsSeparateIntervalTest}, + {binLengthNoBinningTest, paddingLengthNoBinningTest, inputIntervalsNoBinningTest, expectedBinsNoBinningTest}, {binLengthOverlappingIntervalTest, paddingLengthOverlappingIntervalTest, inputIntervalsOverlappingIntervalTest, expectedBinsOverlappingIntervalTest}, {binLengthEdgeIntervalTest, paddingLengthEdgeIntervalTest, inputIntervalsEdgeIntervalTest, expectedBinsEdgeIntervalTest}, {binLengthWholeChromosomeTest, paddingLengthWholeChromosomeTest, inputIntervalsWholeChromosomeTest, expectedBinsWholeChromosomeTest},
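The whole-chromosome rows above follow from simple arithmetic: a 63,025,520 bp contig cut into 10,000,000 bp bins gives ceil(63,025,520 / 10,000,000) = 7 bins, with the last truncated at the contig end, while a zero bin length disables binning so the input intervals pass through unbinned (here just sorted). A throwaway sketch of the fixed-length slicing (hypothetical helper, not the tool's implementation):

    import java.util.ArrayList;
    import java.util.List;

    public final class BinningDemo {
        // cuts [1, contigLength] into consecutive 1-based, inclusive bins of at most binLength bases
        static List<int[]> bin(final int contigLength, final int binLength) {
            final List<int[]> bins = new ArrayList<>();
            for (int start = 1; start <= contigLength; start += binLength) {
                bins.add(new int[] {start, Math.min(start + binLength - 1, contigLength)});
            }
            return bins;
        }

        public static void main(final String[] args) {
            // chr20 in b37: seven bins, the last one ending at 63,025,520
            bin(63_025_520, 10_000_000)
                    .forEach(b -> System.out.println(b[0] + "-" + b[1]));
        }
    }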
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/caller/SimpleCopyRatioCallerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/caller/SimpleCopyRatioCallerUnitTest.java new file mode 100644 index 00000000000..f089b6a640f --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/caller/SimpleCopyRatioCallerUnitTest.java @@ -0,0 +1,76 @@ +package org.broadinstitute.hellbender.tools.copynumber.caller; + +import org.apache.commons.math3.random.RandomGenerator; +import org.apache.commons.math3.random.RandomGeneratorFactory; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CalledCopyRatioSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CalledCopyRatioSegment; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatioSegment; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.param.ParamUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.stream.Collectors; + +import static org.broadinstitute.hellbender.tools.copynumber.formats.records.CalledCopyRatioSegment.Call.*; + +public final class SimpleCopyRatioCallerUnitTest extends GATKBaseTest { + private static final double NEUTRAL_SEGMENT_COPY_RATIO_THRESHOLD = 0.1; + private static final double OUTLIER_NEUTRAL_SEGMENT_COPY_RATIO_Z_SCORE_THRESHOLD = 2.; + private static final double CALLING_COPY_RATIO_Z_SCORE_THRESHOLD = 2.; + + private static final int RANDOM_SEED = 42; + private static final double EPSILON = 1E-10; + + @Test + public void testMakeCalls() { + final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED)); + final SampleMetadata sampleMetadata = new SimpleSampleMetadata("Sample"); + final double segmentNoise = 0.05; + final double intervalLog2Noise = 0.2; + final List<Double> segmentCopyRatios = Arrays.asList(2., 3., 1., 1., 0.25, 1., 5., 1., 0., 0.5); + final List<Integer> numIntervalsPerSegment = Arrays.asList(10, 5, 5, 100, 10, 10, 20, 10, 10, 5); + final List<CalledCopyRatioSegment.Call> expectedCalls = Arrays.asList( + AMPLIFICATION, AMPLIFICATION, NEUTRAL, NEUTRAL, DELETION, NEUTRAL, AMPLIFICATION, NEUTRAL, DELETION, DELETION); + + final List<CopyRatioSegment> segments = new ArrayList<>(); + for (int segmentIndex = 0; segmentIndex < numIntervalsPerSegment.size(); segmentIndex++) { + final String contig = "chr" + segmentIndex; + final List<CopyRatio> intervalLog2CopyRatiosInSegment = new ArrayList<>(numIntervalsPerSegment.size()); + for (int intervalIndex = 0; intervalIndex < numIntervalsPerSegment.get(segmentIndex); intervalIndex++) { + final double log2CopyRatioValue = ParamUtils.log2(Math.max(EPSILON, + segmentCopyRatios.get(segmentIndex) + rng.nextGaussian() * segmentNoise)) + intervalLog2Noise * rng.nextGaussian(); + intervalLog2CopyRatiosInSegment.add(new CopyRatio( + new SimpleInterval(contig, intervalIndex + 1, intervalIndex + 1), log2CopyRatioValue)); + } + segments.add(new CopyRatioSegment( + new SimpleInterval(contig, 1, numIntervalsPerSegment.get(segmentIndex)), + intervalLog2CopyRatiosInSegment)); + } + final CopyRatioSegmentCollection copyRatioSegments = new CopyRatioSegmentCollection(sampleMetadata, segments); + + final CalledCopyRatioSegmentCollection calledCopyRatioSegments = + new SimpleCopyRatioCaller(copyRatioSegments, + NEUTRAL_SEGMENT_COPY_RATIO_THRESHOLD, OUTLIER_NEUTRAL_SEGMENT_COPY_RATIO_Z_SCORE_THRESHOLD, CALLING_COPY_RATIO_Z_SCORE_THRESHOLD) + .makeCalls(); + + Assert.assertEquals(copyRatioSegments.getSampleName(), calledCopyRatioSegments.getSampleName()); + Assert.assertEquals( + copyRatioSegments.getIntervals(), calledCopyRatioSegments.getIntervals()); + Assert.assertEquals( + copyRatioSegments.getRecords().stream().map(CopyRatioSegment::getMeanLog2CopyRatio).collect(Collectors.toList()), + calledCopyRatioSegments.getRecords().stream().map(CopyRatioSegment::getMeanLog2CopyRatio).collect(Collectors.toList())); + Assert.assertEquals( + calledCopyRatioSegments.getRecords().stream().map(CalledCopyRatioSegment::getCall).collect(Collectors.toList()), + expectedCalls); + } +} \ No newline at end of file diff
--git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/ReCapSegCallerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/ReCapSegCallerUnitTest.java deleted file mode 100644 index 30f6d79b028..00000000000 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/ReCapSegCallerUnitTest.java +++ /dev/null @@ -1,85 +0,0 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.caller; - -import org.apache.commons.math3.linear.Array2DRowRealMatrix; -import org.apache.commons.math3.linear.RealMatrix; -import org.broadinstitute.hellbender.GATKBaseTest; -import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatio; -import org.broadinstitute.hellbender.tools.copynumber.coverage.copyratio.CopyRatioCollection; -import org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation.CopyRatioSegment; -import org.broadinstitute.hellbender.tools.copynumber.coverage.segmentation.CopyRatioSegmentCollection; -import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; -import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; -import org.broadinstitute.hellbender.utils.SimpleInterval; -import org.broadinstitute.hellbender.utils.param.ParamUtils; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -import static org.broadinstitute.hellbender.tools.copynumber.coverage.caller.CalledCopyRatioSegment.Call.*; - -public final class ReCapSegCallerUnitTest extends GATKBaseTest { - @Test - public void testMakeCalls() { - - final SampleMetadata sampleMetadata = new SimpleSampleMetadata("Sample"); - final List<SimpleInterval> intervals = new ArrayList<>(); - final List<Double> testData = new ArrayList<>(); - - //add amplification intervals - for (int i = 0; i < 10; i++) { - final SimpleInterval interval = new SimpleInterval("chr", 101 + i, 101 + i); - intervals.add(interval); - testData.add(ParamUtils.log2(2.0)); - } - //add deletion intervals - for (int i = 0; i < 10; i++) { - final SimpleInterval interval = new SimpleInterval("chr", 201 + i, 201 + i); - intervals.add(interval); - testData.add(ParamUtils.log2(0.5)); - } - //add obviously neutral intervals with some small spread - for (int i = 0; i < 10; i++) { - final SimpleInterval interval = new SimpleInterval("chr", 301 + i, 301 + i); - intervals.add(interval); - testData.add(ParamUtils.log2(0.01 * (i - 5) + 1)); - } - //add spread-out intervals to a neutral segment (mean near zero) - for (int i = 0; i < 10; i++) { - final SimpleInterval interval = new SimpleInterval("chr", 401 + i, 401 + i); - intervals.add(interval); - testData.add(ParamUtils.log2(0.1 * (i - 5) + 1)); - } - - final RealMatrix denoisedCopyRatioValues = new Array2DRowRealMatrix(1, intervals.size()); - denoisedCopyRatioValues.setRow(0, testData.stream().mapToDouble(x -> x).toArray()); - final CopyRatioCollection denoisedCopyRatios = new CopyRatioCollection( - sampleMetadata, - IntStream.range(0, intervals.size()) - .mapToObj(i -> new CopyRatio(intervals.get(i), denoisedCopyRatioValues.getEntry(0, i))) - .collect(Collectors.toList())); - - final CopyRatioSegmentCollection copyRatioSegments = new CopyRatioSegmentCollection(sampleMetadata, - Arrays.asList( - new CopyRatioSegment(new SimpleInterval("chr", 101, 110), 10, ParamUtils.log2(2.0)), //amplification - new
CopyRatioSegment(new SimpleInterval("chr", 201, 210), 10, ParamUtils.log2(0.5)), //deletion - new CopyRatioSegment(new SimpleInterval("chr", 301, 310), 10, ParamUtils.log2(1)), //neutral - new CopyRatioSegment(new SimpleInterval("chr", 401, 410), 10, ParamUtils.log2(1)))); //neutral - - final CalledCopyRatioSegmentCollection calledCopyRatioSegments = new ReCapSegCaller(denoisedCopyRatios, copyRatioSegments).makeCalls(); - - Assert.assertEquals(copyRatioSegments.getSampleName(), calledCopyRatioSegments.getSampleName()); - Assert.assertEquals( - copyRatioSegments.getIntervals(), calledCopyRatioSegments.getIntervals()); - Assert.assertEquals( - copyRatioSegments.getRecords().stream().map(CopyRatioSegment::getMeanLog2CopyRatio).collect(Collectors.toList()), - calledCopyRatioSegments.getRecords().stream().map(CopyRatioSegment::getMeanLog2CopyRatio).collect(Collectors.toList())); - Assert.assertEquals( - calledCopyRatioSegments.getRecords().stream().map(CalledCopyRatioSegment::getCall).collect(Collectors.toList()), - Arrays.asList(AMPLIFICATION, DELETION, NEUTRAL, NEUTRAL)); - } -} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/annotation/GCBiasCorrectorUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/denoising/GCBiasCorrectorUnitTest.java similarity index 98% rename from src/test/java/org/broadinstitute/hellbender/tools/copynumber/annotation/GCBiasCorrectorUnitTest.java rename to src/test/java/org/broadinstitute/hellbender/tools/copynumber/denoising/GCBiasCorrectorUnitTest.java index 841f0ce80eb..f9318e2970f 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/annotation/GCBiasCorrectorUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/denoising/GCBiasCorrectorUnitTest.java @@ -1,4 +1,4 @@ -package org.broadinstitute.hellbender.tools.copynumber.annotation; +package org.broadinstitute.hellbender.tools.copynumber.denoising; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollectionUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AllelicCountCollectionUnitTest.java similarity index 94% rename from src/test/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollectionUnitTest.java rename to src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AllelicCountCollectionUnitTest.java index d72b5b7fe39..7bd521f047f 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/allelic/alleliccount/AllelicCountCollectionUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/AllelicCountCollectionUnitTest.java @@ -1,9 +1,10 @@ -package org.broadinstitute.hellbender.tools.copynumber.allelic.alleliccount; +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; import org.apache.commons.io.FileUtils; import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; import org.broadinstitute.hellbender.utils.Nucleotide; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.testng.Assert; @@ -19,7 +20,7 @@ 
* @author Samuel Lee <slee@broadinstitute.org> */ public final class AllelicCountCollectionUnitTest extends GATKBaseTest { - private static final String TEST_SUB_DIR = publicTestDir + "org/broadinstitute/hellbender/tools/copynumber/allelic"; + private static final File TEST_SUB_DIR = new File(toolsTestDir + "copynumber/formats/collections"); private static final File ALLELIC_COUNTS_FILE = new File(TEST_SUB_DIR, "allelic-count-collection-normal.tsv"); private static final File ALLELIC_COUNTS_MISSING_NUCLEOTIDES_FILE = new File(TEST_SUB_DIR, "allelic-count-collection-normal-missing-nucleotides.tsv"); private static final String SAMPLE_NAME_EXPECTED = "test"; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/HDF5SimpleCountCollectionUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/HDF5SimpleCountCollectionUnitTest.java similarity index 94% rename from src/test/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/HDF5SimpleCountCollectionUnitTest.java rename to src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/HDF5SimpleCountCollectionUnitTest.java index fef6649763f..470ffdb35db 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/HDF5SimpleCountCollectionUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/HDF5SimpleCountCollectionUnitTest.java @@ -1,4 +1,4 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.readcount; +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; import org.broadinstitute.hdf5.HDF5File; import org.broadinstitute.hellbender.utils.SimpleInterval; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/SampleLocatableCollectionUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleLocatableCollectionUnitTest.java similarity index 97% rename from src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/SampleLocatableCollectionUnitTest.java rename to src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleLocatableCollectionUnitTest.java index 64b07e7bba4..a6d3e785601 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/SampleLocatableCollectionUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SampleLocatableCollectionUnitTest.java @@ -1,10 +1,9 @@ -package org.broadinstitute.hellbender.tools.copynumber.formats; +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; import htsjdk.samtools.util.Locatable; import org.apache.commons.io.FileUtils; import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SampleLocatableCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; import org.broadinstitute.hellbender.utils.SimpleInterval; @@ -27,7 +26,7 @@ * @author Samuel Lee <slee@broadinstitute.org> */ public final class SampleLocatableCollectionUnitTest extends GATKBaseTest { - private static final String TEST_SUB_DIR = toolsTestDir + "copynumber/formats"; + private static final File TEST_SUB_DIR = new File(toolsTestDir, 
"copynumber/formats/collections"); private static final File SIMPLE_LOCATABLE_COLLECTION_FILE = new File(TEST_SUB_DIR, "locatable-collection-tsv-simple-locatable-collection.tsv"); private static final File SIMPLE_LOCATABLE_COLLECTION_NON_LEXICOGRAPHICAL_ORDER_FILE = diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCountCollectionUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SimpleCountCollectionUnitTest.java similarity index 91% rename from src/test/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCountCollectionUnitTest.java rename to src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SimpleCountCollectionUnitTest.java index 75dce36dd1f..a3667ac85fc 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/SimpleCountCollectionUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/formats/collections/SimpleCountCollectionUnitTest.java @@ -1,9 +1,10 @@ -package org.broadinstitute.hellbender.tools.copynumber.coverage.readcount; +package org.broadinstitute.hellbender.tools.copynumber.formats.collections; import org.apache.commons.math3.linear.Array2DRowRealMatrix; import org.apache.commons.math3.linear.RealMatrix; import org.broadinstitute.hellbender.GATKBaseTest; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.testng.Assert; import org.testng.annotations.Test; @@ -14,7 +15,7 @@ public final class SimpleCountCollectionUnitTest extends GATKBaseTest { - private static final String TEST_SUB_DIR = toolsTestDir + "copynumber/coverage/readcount/"; + private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber/formats/collections"); private static final File INTEGER_COUNTS_FILE = new File(TEST_SUB_DIR,"simple-count-collection-integer-counts.tsv"); private static final File DOUBLE_COUNTS_FILE = new File(TEST_SUB_DIR, "simple-count-collection-double-counts.tsv"); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionInitializerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionInitializerUnitTest.java new file mode 100644 index 00000000000..df8890bee38 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionInitializerUnitTest.java @@ -0,0 +1,50 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.apache.commons.math3.random.RandomGenerator; +import org.apache.commons.math3.random.RandomGeneratorFactory; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.Random; +import java.util.stream.IntStream; + +/** + * Tests the initialization performed by {@link AlleleFractionInitializer}. 
+ * + * @author David Benjamin <davidben@broadinstitute.org> + * @author Samuel Lee <slee@broadinstitute.org> + */ +public final class AlleleFractionInitializerUnitTest { + private static final int RANDOM_SEED = 13; + private static final double ABSOLUTE_TOLERANCE = 0.01; + + @Test + public void testInitialization() { + final double meanBias = 1.2; + final double biasVariance = 0.04; + final double outlierProbability = 0.02; + final AlleleFractionGlobalParameters globalParameters = new AlleleFractionGlobalParameters(meanBias, biasVariance, outlierProbability); + final int numSegments = 100; + final double averageHetsPerSegment = 50.; + final double averageDepth = 50.; + final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED)); + + final SampleMetadata sampleMetadata = new SimpleSampleMetadata("test"); + final AlleleFractionSimulatedData simulatedData = new AlleleFractionSimulatedData( + sampleMetadata, globalParameters, numSegments, averageHetsPerSegment, averageDepth, rng); + + final AlleleFractionSegmentedData data = simulatedData.getData(); + final AlleleFractionState initializedState = new AlleleFractionInitializer(data).getInitializedState(); + + Assert.assertEquals(initializedState.meanBias(), meanBias, ABSOLUTE_TOLERANCE); + Assert.assertEquals(initializedState.biasVariance(), biasVariance, ABSOLUTE_TOLERANCE); + Assert.assertEquals(initializedState.outlierProbability(), outlierProbability, ABSOLUTE_TOLERANCE); + + final double averageMinorFractionError = IntStream.range(0, numSegments) + .mapToDouble(s -> Math.abs(initializedState.segmentMinorFraction(s) - simulatedData.getTrueState().segmentMinorFraction(s))) + .average().getAsDouble(); + Assert.assertEquals(averageMinorFractionError, 0, ABSOLUTE_TOLERANCE); + } +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionModellerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionModellerUnitTest.java new file mode 100644 index 00000000000..f81efad3957 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionModellerUnitTest.java @@ -0,0 +1,113 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.apache.commons.math3.random.RandomGenerator; +import org.apache.commons.math3.random.RandomGeneratorFactory; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ParameterDecileCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.ModeledSegment; +import org.broadinstitute.hellbender.utils.mcmc.Decile; +import org.broadinstitute.hellbender.utils.mcmc.DecileCollection; +import org.broadinstitute.hellbender.utils.test.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.List; +import java.util.Random; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Tests the MCMC inference performed by {@link AlleleFractionModeller}. Only recovery of posterior centers is tested. 
+ * + * @author David Benjamin &lt;davidben@broadinstitute.org&gt; + * @author Samuel Lee &lt;slee@broadinstitute.org&gt; + */ +public final class AlleleFractionModellerUnitTest extends BaseTest { + private static final int RANDOM_SEED = 13; + + // note: the following tolerance could actually be made much smaller if we used more segments and/or + // more hets -- most of the error is the sampling error of a finite simulated data set, not numerical error of MCMC + private static final double ABSOLUTE_TOLERANCE = 0.01; + + @Test + public void testMCMC() { + final double meanBias = 1.2; + final double biasVariance = 0.04; + final double outlierProbability = 0.02; + final AlleleFractionGlobalParameters globalParameters = new AlleleFractionGlobalParameters(meanBias, biasVariance, outlierProbability); + final double minorAlleleFractionPriorAlpha = 1.; + final AlleleFractionPrior prior = new AlleleFractionPrior(minorAlleleFractionPriorAlpha); + final int numSegments = 50; + final double averageHetsPerSegment = 50.; + final double averageDepth = 50.; + final int numSamples = 150; + final int numBurnIn = 50; + final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED)); + + final SampleMetadata sampleMetadata = new SimpleSampleMetadata("test"); + final AlleleFractionSimulatedData simulatedData = new AlleleFractionSimulatedData( + sampleMetadata, globalParameters, numSegments, averageHetsPerSegment, averageDepth, rng); + + final AlleleFractionModeller modeller = new AlleleFractionModeller(simulatedData.getData().getAllelicCounts(), simulatedData.getData().getSegments(), prior); + modeller.fitMCMC(numSamples, numBurnIn); + + assertAlleleFractionPosteriorCenters(modeller, simulatedData); + } + + static void assertAlleleFractionPosteriorCenters(final AlleleFractionModeller modeller, + final AlleleFractionSimulatedData simulatedData) { + final AlleleFractionState trueState = simulatedData.getTrueState(); + final int numSegments = simulatedData.getData().getNumSegments(); + + //check centers from samples + final List<Double> meanBiasSamples = modeller.getMeanBiasSamples(); + final List<Double> biasVarianceSamples = modeller.getBiasVarianceSamples(); + final List<Double> outlierProbabilitySamples = modeller.getOutlierProbabilitySamples(); + final List<AlleleFractionState.MinorFractions> minorFractionsSamples = modeller.getMinorFractionsSamples(); + + Assert.assertEquals(numSegments, minorFractionsSamples.get(0).size()); + final List<List<Double>> minorFractionsSamplesBySegment = IntStream.range(0, numSegments) + .mapToObj(i -> minorFractionsSamples.stream().map(s -> s.get(i)).collect(Collectors.toList())) + .collect(Collectors.toList()); + + final double meanBiasResult = meanBiasSamples.stream().mapToDouble(x -> x).average().getAsDouble(); + final double biasVarianceResult = biasVarianceSamples.stream().mapToDouble(x -> x).average().getAsDouble(); + final double outlierProbabilityResult = outlierProbabilitySamples.stream().mapToDouble(x -> x).average().getAsDouble(); + final List<Double> minorFractionsResult = minorFractionsSamplesBySegment + .stream().map(list -> list.stream().mapToDouble(x -> x).average().getAsDouble()) + .collect(Collectors.toList()); + + final double totalSegmentError = IntStream.range(0, numSegments) + .mapToDouble(s -> Math.abs(minorFractionsResult.get(s) - trueState.segmentMinorFraction(s))) + .sum(); + + Assert.assertEquals(meanBiasResult, trueState.meanBias(), ABSOLUTE_TOLERANCE); + Assert.assertEquals(biasVarianceResult, trueState.biasVariance(), ABSOLUTE_TOLERANCE); + Assert.assertEquals(outlierProbabilityResult, trueState.outlierProbability(), ABSOLUTE_TOLERANCE); + Assert.assertEquals(totalSegmentError / numSegments, 0.0, ABSOLUTE_TOLERANCE); + + //check centers from summaries + final ParameterDecileCollection<AlleleFractionParameter> globalParameterDeciles = modeller.getGlobalParameterDeciles(); + final DecileCollection meanBiasDeciles = globalParameterDeciles.getDeciles(AlleleFractionParameter.MEAN_BIAS); + final double meanBiasPosteriorCenter = meanBiasDeciles.get(Decile.DECILE_50); + Assert.assertEquals(meanBiasPosteriorCenter, trueState.meanBias(), ABSOLUTE_TOLERANCE); + + final DecileCollection biasVarianceDeciles = globalParameterDeciles.getDeciles(AlleleFractionParameter.BIAS_VARIANCE); + final double biasVariancePosteriorCenter = biasVarianceDeciles.get(Decile.DECILE_50); + Assert.assertEquals(biasVariancePosteriorCenter, trueState.biasVariance(), ABSOLUTE_TOLERANCE); + + final DecileCollection outlierProbabilityDeciles = globalParameterDeciles.getDeciles(AlleleFractionParameter.OUTLIER_PROBABILITY); + final double outlierProbabilityPosteriorCenter = outlierProbabilityDeciles.get(Decile.DECILE_50); + Assert.assertEquals(outlierProbabilityPosteriorCenter, trueState.outlierProbability(), ABSOLUTE_TOLERANCE); + + final List<ModeledSegment.SimplePosteriorSummary> minorFractionsPosteriorSummaries = modeller.getMinorAlleleFractionsPosteriorSummaries(); + Assert.assertEquals(numSegments, minorFractionsPosteriorSummaries.size()); + final List<Double> minorFractionsPosteriorCenters = minorFractionsPosteriorSummaries.stream().map(ModeledSegment.SimplePosteriorSummary::getDecile50).collect(Collectors.toList()); + double totalPosteriorCentersSegmentError = 0.0; + for (int segment = 0; segment < numSegments; segment++) { + totalPosteriorCentersSegmentError += Math.abs(minorFractionsPosteriorCenters.get(segment) - trueState.segmentMinorFraction(segment)); + } + Assert.assertEquals(totalPosteriorCentersSegmentError / numSegments, 0.0, ABSOLUTE_TOLERANCE); + } +} \ No newline at end of file
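The test above is the template the model tests in this diff follow: simulate data from known global parameters, fit by MCMC, and compare posterior centers against the truth. A minimal standalone sketch of that recipe, using only constructors and accessors that appear in this diff (the class name is hypothetical, and it would have to sit in the models package because AlleleFractionSimulatedData, introduced in the next file, is package-private there):

package org.broadinstitute.hellbender.tools.copynumber.models;

import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.random.RandomGeneratorFactory;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata;

import java.util.Random;

//editorial sketch, not part of this commit: condenses the simulate/fit/check recipe of
//AlleleFractionModellerUnitTest into a standalone driver
final class AlleleFractionMcmcSketch {
    public static void main(final String[] args) {
        //truth: meanBias, biasVariance, outlierProbability, as in the test above
        final AlleleFractionGlobalParameters truth = new AlleleFractionGlobalParameters(1.2, 0.04, 0.02);
        final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(13));
        final AlleleFractionSimulatedData simulatedData = new AlleleFractionSimulatedData(
                new SimpleSampleMetadata("sketch"), truth, 50, 50., 50., rng);
        final AlleleFractionModeller modeller = new AlleleFractionModeller(
                simulatedData.getData().getAllelicCounts(),
                simulatedData.getData().getSegments(),
                new AlleleFractionPrior(1.));
        modeller.fitMCMC(150, 50);  //150 samples, 50 burn-in, as in the test
        //average the raw posterior samples, matching the first half of the assertions above
        final double meanBiasPosteriorMean = modeller.getMeanBiasSamples().stream()
                .mapToDouble(x -> x).average().getAsDouble();
        System.out.printf("true mean bias = %.3f, posterior mean = %.3f%n",
                truth.getMeanBias(), meanBiasPosteriorMean);
    }
}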
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSimulatedData.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSimulatedData.java new file mode 100644 index 00000000000..538b2baa218 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/AlleleFractionSimulatedData.java @@ -0,0 +1,108 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.apache.commons.math3.distribution.BinomialDistribution; +import org.apache.commons.math3.distribution.GammaDistribution; +import org.apache.commons.math3.distribution.PoissonDistribution; +import org.apache.commons.math3.distribution.UniformRealDistribution; +import org.apache.commons.math3.random.RandomGenerator; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.LocatableCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; +import org.broadinstitute.hellbender.utils.SimpleInterval; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Simulates {@link AlleleFractionSegmentedData} given parameter values for use in test classes.
+ * + * @author David Benjamin &lt;davidben@broadinstitute.org&gt; + * @author Samuel Lee &lt;slee@broadinstitute.org&gt; + */ +final class AlleleFractionSimulatedData { + private static final int MIN_HETS_PER_SEGMENT = 3; + + private static PoissonDistribution makePoisson(final RandomGenerator rng, final double mean) { + return new PoissonDistribution(rng, mean, PoissonDistribution.DEFAULT_EPSILON, PoissonDistribution.DEFAULT_MAX_ITERATIONS); + } + + private final AlleleFractionSegmentedData data; + private final AlleleFractionState trueState; + + AlleleFractionSimulatedData(final SampleMetadata sampleMetadata, + final AlleleFractionGlobalParameters globalParameters, + final int numSegments, + final double averageHetsPerSegment, + final double averageDepth, + final RandomGenerator rng) { + final AlleleFractionState.MinorFractions minorFractions = new AlleleFractionState.MinorFractions(numSegments); + final List<AllelicCount> allelicCounts = new ArrayList<>(); + final List<SimpleInterval> segments = new ArrayList<>(); + + final PoissonDistribution segmentLengthGenerator = makePoisson(rng, averageHetsPerSegment); + final PoissonDistribution readDepthGenerator = makePoisson(rng, averageDepth); + final UniformRealDistribution minorFractionGenerator = new UniformRealDistribution(rng, 0.0, 0.5); + + final double meanBias = globalParameters.getMeanBias(); + final double biasVariance = globalParameters.getBiasVariance(); + final double outlierProbability = globalParameters.getOutlierProbability(); + + //translate to ApacheCommons' parametrization of the gamma distribution + final double gammaShape = meanBias * meanBias / biasVariance; + final double gammaScale = biasVariance / meanBias; + final GammaDistribution biasGenerator = new GammaDistribution(rng, gammaShape, gammaScale); + + //put each segment on its own chromosome and sort by lexicographical order + final List<String> chromosomes = IntStream.range(0, numSegments) + .mapToObj(Integer::toString) + .sorted((c1, c2) -> LocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR.compare(new SimpleInterval(c1, 1, 1), new SimpleInterval(c2, 1, 1))) + .collect(Collectors.toList()); + + for (final String chromosome : chromosomes) { + // calculate the range of het indices for this segment + final int numHetsInSegment = Math.max(MIN_HETS_PER_SEGMENT, segmentLengthGenerator.sample()); + + final double minorFraction = minorFractionGenerator.sample(); + minorFractions.add(minorFraction); + + //we will put all the hets in this segment/chromosome at loci 1, 2, 3 etc + segments.add(new SimpleInterval(chromosome, 1, numHetsInSegment)); + for (int het = 1; het < numHetsInSegment + 1; het++) { + final double bias = biasGenerator.sample(); + + //flip a coin to decide alt minor (alt fraction = minor fraction) or ref minor (alt fraction = 1 - minor fraction) + final boolean isAltMinor = rng.nextDouble() < 0.5; + final double altFraction = isAltMinor ?
minorFraction : 1 - minorFraction; + + //the probability of an alt read is the alt fraction modified by the bias or, in the case of an outlier, random + final double pAlt; + if (rng.nextDouble() < outlierProbability) { + pAlt = rng.nextDouble(); + } else { + pAlt = altFraction / (altFraction + (1 - altFraction) * bias); + } + + final int numReads = readDepthGenerator.sample(); + final int numAltReads = new BinomialDistribution(rng, numReads, pAlt).sample(); + final int numRefReads = numReads - numAltReads; + allelicCounts.add(new AllelicCount(new SimpleInterval(chromosome, het, het), numRefReads, numAltReads)); + } + } + + data = new AlleleFractionSegmentedData(new AllelicCountCollection(sampleMetadata, allelicCounts), segments); + trueState = new AlleleFractionState(meanBias, biasVariance, outlierProbability, minorFractions); + } + + AlleleFractionSegmentedData getData() { + return data; + } + + AllelicCountCollection getAllelicCounts() { + return data.getAllelicCounts(); + } + + AlleleFractionState getTrueState() { return trueState; } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioModellerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioModellerUnitTest.java new file mode 100644 index 00000000000..3c8b28dac03 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioModellerUnitTest.java @@ -0,0 +1,101 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.apache.commons.math3.random.RandomGenerator; +import org.apache.commons.math3.random.RandomGeneratorFactory; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ParameterDecileCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.ModeledSegment; +import org.broadinstitute.hellbender.utils.mcmc.Decile; +import org.broadinstitute.hellbender.utils.mcmc.DecileCollection; +import org.broadinstitute.hellbender.utils.test.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.List; +import java.util.Random; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Tests the MCMC inference performed by {@link CopyRatioModeller}. Only recovery of posterior centers is tested. 
+ * + * @author David Benjamin &lt;davidben@broadinstitute.org&gt; + * @author Samuel Lee &lt;slee@broadinstitute.org&gt; + */ +public final class CopyRatioModellerUnitTest extends BaseTest { + private static final int RANDOM_SEED = 13; + + // note: the following tolerance could actually be made much smaller if we used more segments and/or + // more intervals -- most of the error is the sampling error of a finite simulated data set, not numerical error of MCMC + private static final double ABSOLUTE_TOLERANCE = 0.01; + + @Test + public void testMCMC() { + final double variance = 0.01; + final double outlierProbability = 0.05; + final int numSegments = 100; + final double averageIntervalsPerSegment = 100.; + final int numSamples = 150; + final int numBurnIn = 50; + final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED)); + + final SampleMetadata sampleMetadata = new SimpleSampleMetadata("test"); + final CopyRatioSimulatedData simulatedData = new CopyRatioSimulatedData( + sampleMetadata, variance, outlierProbability, numSegments, averageIntervalsPerSegment, rng); + + final CopyRatioModeller modeller = new CopyRatioModeller(simulatedData.getData().getCopyRatios(), simulatedData.getData().getSegments()); + modeller.fitMCMC(numSamples, numBurnIn); + + assertCopyRatioPosteriorCenters(modeller, simulatedData); + } + + static void assertCopyRatioPosteriorCenters(final CopyRatioModeller modeller, + final CopyRatioSimulatedData simulatedData) { + final CopyRatioState trueState = simulatedData.getTrueState(); + final int numSegments = simulatedData.getData().getNumSegments(); + + //check centers from samples + final List<Double> varianceSamples = modeller.getVarianceSamples(); + final List<Double> outlierProbabilitySamples = modeller.getOutlierProbabilitySamples(); + final List<CopyRatioState.SegmentMeans> segmentMeansSamples = modeller.getSegmentMeansSamples(); + + Assert.assertEquals(numSegments, segmentMeansSamples.get(0).size()); + final List<List<Double>> segmentMeansSamplesBySegment = IntStream.range(0, numSegments) + .mapToObj(i -> segmentMeansSamples.stream().map(s -> s.get(i)).collect(Collectors.toList())) + .collect(Collectors.toList()); + + final double varianceResult = varianceSamples.stream().mapToDouble(x -> x).average().getAsDouble(); + final double outlierProbabilityResult = outlierProbabilitySamples.stream().mapToDouble(x -> x).average().getAsDouble(); + final List<Double> segmentMeansResult = segmentMeansSamplesBySegment + .stream().map(list -> list.stream().mapToDouble(x -> x).average().getAsDouble()) + .collect(Collectors.toList()); + + final double totalSegmentError = IntStream.range(0, numSegments) + .mapToDouble(s -> Math.abs(segmentMeansResult.get(s) - trueState.segmentMean(s))) + .sum(); + + Assert.assertEquals(varianceResult, trueState.variance(), ABSOLUTE_TOLERANCE); + Assert.assertEquals(outlierProbabilityResult, trueState.outlierProbability(), ABSOLUTE_TOLERANCE); + Assert.assertEquals(totalSegmentError / numSegments, 0.0, ABSOLUTE_TOLERANCE); + + //check centers from summaries + final ParameterDecileCollection<CopyRatioParameter> globalParameterDeciles = modeller.getGlobalParameterDeciles(); + final DecileCollection varianceDeciles = globalParameterDeciles.getDeciles(CopyRatioParameter.VARIANCE); + final double variancePosteriorCenter = varianceDeciles.get(Decile.DECILE_50); + Assert.assertEquals(variancePosteriorCenter, trueState.variance(), ABSOLUTE_TOLERANCE); + + final DecileCollection outlierProbabilityDeciles = globalParameterDeciles.getDeciles(CopyRatioParameter.OUTLIER_PROBABILITY); + final double outlierProbabilityPosteriorCenter = outlierProbabilityDeciles.get(Decile.DECILE_50); + Assert.assertEquals(outlierProbabilityPosteriorCenter, trueState.outlierProbability(), ABSOLUTE_TOLERANCE); + + final List<ModeledSegment.SimplePosteriorSummary> segmentMeansPosteriorSummaries = modeller.getSegmentMeansPosteriorSummaries(); + Assert.assertEquals(numSegments, segmentMeansPosteriorSummaries.size()); + final List<Double> segmentMeansPosteriorCenters = segmentMeansPosteriorSummaries.stream().map(ModeledSegment.SimplePosteriorSummary::getDecile50).collect(Collectors.toList()); + double totalPosteriorCentersSegmentError = 0.0; + for (int segment = 0; segment < numSegments; segment++) { + totalPosteriorCentersSegmentError += Math.abs(segmentMeansPosteriorCenters.get(segment) - trueState.segmentMean(segment)); + } + Assert.assertEquals(totalPosteriorCentersSegmentError / numSegments, 0.0, ABSOLUTE_TOLERANCE); + } +} \ No newline at end of file
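As in the allele-fraction test, centers are checked twice: from raw MCMC samples and from the decile summaries. The summary route reduces to a two-line lookup; a sketch with hypothetical names (it would live in the models package alongside CopyRatioParameter, and uses only calls that appear in the assertions above):

package org.broadinstitute.hellbender.tools.copynumber.models;

import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ParameterDecileCollection;
import org.broadinstitute.hellbender.utils.mcmc.Decile;
import org.broadinstitute.hellbender.utils.mcmc.DecileCollection;

//editorial sketch, not part of this commit
final class PosteriorMedianSketch {
    //posterior median (fifth decile) of a global copy-ratio model parameter,
    //exactly the lookup performed by the assertions above
    static double posteriorMedian(final ParameterDecileCollection<CopyRatioParameter> globalParameterDeciles,
                                  final CopyRatioParameter parameter) {
        final DecileCollection deciles = globalParameterDeciles.getDeciles(parameter);
        return deciles.get(Decile.DECILE_50);
    }
}

For example, posteriorMedian(modeller.getGlobalParameterDeciles(), CopyRatioParameter.VARIANCE) recovers the value asserted against trueState.variance() above.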
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSimulatedData.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSimulatedData.java new file mode 100644 index 00000000000..ce9a725c26a --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/CopyRatioSimulatedData.java @@ -0,0 +1,97 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.apache.commons.math3.distribution.PoissonDistribution; +import org.apache.commons.math3.distribution.UniformRealDistribution; +import org.apache.commons.math3.random.RandomGenerator; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.LocatableCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio; +import org.broadinstitute.hellbender.utils.SimpleInterval; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Simulates {@link CopyRatioSegmentedData} given parameter values for use in test classes.
+ * + * @author David Benjamin &lt;davidben@broadinstitute.org&gt; + * @author Samuel Lee &lt;slee@broadinstitute.org&gt; + */ +final class CopyRatioSimulatedData { + private static final int MIN_INTERVALS_PER_SEGMENT = 3; + private static final double LOG2_COPY_RATIO_MIN = CopyRatioModeller.LOG2_COPY_RATIO_MIN; + private static final double LOG2_COPY_RATIO_MAX = CopyRatioModeller.LOG2_COPY_RATIO_MAX; + private static final double SEGMENT_MEAN_MIN = -10.; + private static final double SEGMENT_MEAN_MAX = 5.; + + private static PoissonDistribution makePoisson(final RandomGenerator rng, final double mean) { + return new PoissonDistribution(rng, mean, PoissonDistribution.DEFAULT_EPSILON, PoissonDistribution.DEFAULT_MAX_ITERATIONS); + } + + private final CopyRatioSegmentedData data; + private final CopyRatioState trueState; + + CopyRatioSimulatedData(final SampleMetadata sampleMetadata, + final double variance, + final double outlierProbability, + final int numSegments, + final double averageIntervalsPerSegment, + final RandomGenerator rng) { + final List<Double> segmentMeans = new ArrayList<>(numSegments); + final List<Boolean> outlierIndicators = new ArrayList<>(); + final List<CopyRatio> copyRatios = new ArrayList<>(); + final List<SimpleInterval> segments = new ArrayList<>(); + + final double standardDeviation = Math.sqrt(variance); + + final PoissonDistribution segmentLengthGenerator = makePoisson(rng, averageIntervalsPerSegment); + final UniformRealDistribution segmentMeanGenerator = new UniformRealDistribution(rng, SEGMENT_MEAN_MIN, SEGMENT_MEAN_MAX); + final UniformRealDistribution outlierGenerator = new UniformRealDistribution(rng, LOG2_COPY_RATIO_MIN, LOG2_COPY_RATIO_MAX); + + //put each segment on its own chromosome and sort by lexicographical order + final List<String> chromosomes = IntStream.range(0, numSegments) + .mapToObj(Integer::toString) + .sorted((c1, c2) -> LocatableCollection.LEXICOGRAPHICAL_ORDER_COMPARATOR.compare(new SimpleInterval(c1, 1, 1), new SimpleInterval(c2, 1, 1))) + .collect(Collectors.toList()); + + for (final String chromosome : chromosomes) { + // calculate the range of interval indices for this segment + final int numIntervalsInSegment = Math.max(MIN_INTERVALS_PER_SEGMENT, segmentLengthGenerator.sample()); + + final double segmentMean = segmentMeanGenerator.sample(); + segmentMeans.add(segmentMean); + + //we will put all the intervals in this segment/chromosome at loci 1, 2, 3 etc + segments.add(new SimpleInterval(chromosome, 1, numIntervalsInSegment)); + for (int interval = 1; interval < numIntervalsInSegment + 1; interval++) { + + final double log2CopyRatio; + if (rng.nextDouble() < outlierProbability) { + outlierIndicators.add(true); + log2CopyRatio = outlierGenerator.sample(); + } else { + outlierIndicators.add(false); + log2CopyRatio = segmentMean + rng.nextGaussian() * standardDeviation; + } + + copyRatios.add(new CopyRatio(new SimpleInterval(chromosome, interval, interval), log2CopyRatio)); + } + } + + data = new CopyRatioSegmentedData(new CopyRatioCollection(sampleMetadata, copyRatios), segments); + trueState = new CopyRatioState(variance, outlierProbability, new CopyRatioState.SegmentMeans(segmentMeans), new CopyRatioState.OutlierIndicators(outlierIndicators)); + } + + CopyRatioSegmentedData getData() { + return data; + } + + CopyRatioCollection getCopyRatios() { + return data.getCopyRatios(); + } + + CopyRatioState getTrueState() { return trueState; } +}
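Both simulated-data classes put every segment on its own chromosome and sort contig names lexicographically. The allele-fraction counterpart additionally draws a multiplicative reference bias from a gamma distribution; since Apache Commons' GammaDistribution is parametrized by shape $k$ and scale $\theta$ while the model specifies the mean and variance of the bias, the conversion used there is

$$\mu = k\theta, \qquad \sigma^2 = k\theta^2 \quad\Longrightarrow\quad k = \frac{\mu^2}{\sigma^2}, \qquad \theta = \frac{\sigma^2}{\mu},$$

with $\mu$ = meanBias and $\sigma^2$ = biasVariance, which is exactly gammaShape = meanBias * meanBias / biasVariance and gammaScale = biasVariance / meanBias in the code above.

diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/MultidimensionalModellerUnitTest.java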
b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/MultidimensionalModellerUnitTest.java new file mode 100644 index 00000000000..2b2e0b69f56 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/models/MultidimensionalModellerUnitTest.java @@ -0,0 +1,114 @@ +package org.broadinstitute.hellbender.tools.copynumber.models; + +import org.apache.commons.math3.random.RandomGenerator; +import org.apache.commons.math3.random.RandomGeneratorFactory; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.MultidimensionalSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.MultidimensionalSegment; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.test.BaseTest; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import java.util.stream.Collectors; + +/** + * Tests the MCMC inference performed by {@link MultidimensionalModeller}. Only recovery of posterior centers is tested. + * Merging of adjacent similar segments is also tested. + * + * @author David Benjamin <davidben@broadinstitute.org> + * @author Samuel Lee <slee@broadinstitute.org> + */ +public final class MultidimensionalModellerUnitTest extends BaseTest { + private static final int RANDOM_SEED = 13; + + @Test + public void testMCMC() { + final int numSegments = 25; + final int numSamples = 150; + final int numBurnIn = 50; + final RandomGenerator rng = RandomGeneratorFactory.createRandomGenerator(new Random(RANDOM_SEED)); + + //copy-ratio model parameters + final double varianceCR = 0.01; + final double outlierProbabilityCR = 0.05; + final double averageIntervalsPerSegment = 100.; + + //allele-fraction model parameters + final double meanBiasAF = 1.2; + final double biasVarianceAF = 0.04; + final double outlierProbabilityAF = 0.02; + final AlleleFractionGlobalParameters globalParametersAF = new AlleleFractionGlobalParameters(meanBiasAF, biasVarianceAF, outlierProbabilityAF); + final double minorAlleleFractionPriorAlpha = 1.; + final AlleleFractionPrior priorAF = new AlleleFractionPrior(minorAlleleFractionPriorAlpha); + final double averageHetsPerSegment = 50.; + final double averageDepthAF = 50.; + + //similar-segment merging parameters + final int maxNumSmoothingIterations = 10; + final int numSmoothingIterationsPerFit = 0; + final double smoothingCredibleIntervalThresholdCopyRatio = 2.; + final double smoothingCredibleIntervalThresholdAlleleFraction = 2.; + + //recall that both CR and AF data points are at loci 1, 2, 3, etc. 
and that each segment is on a different contig + final SampleMetadata sampleMetadata = new SimpleSampleMetadata("test"); + final CopyRatioSimulatedData simulatedDataCR = new CopyRatioSimulatedData( + sampleMetadata, varianceCR, outlierProbabilityCR, numSegments, averageIntervalsPerSegment, rng); + final AlleleFractionSimulatedData simulatedDataAF = new AlleleFractionSimulatedData( + sampleMetadata, globalParametersAF, numSegments, averageHetsPerSegment, averageDepthAF, rng); + + //we introduce extra segments, which we will later merge to test similar-segment merging + final MultidimensionalSegmentCollection oversegmentedSegments = new MultidimensionalSegmentCollection( + sampleMetadata, + constructOversegmentedSegments(simulatedDataCR, simulatedDataAF)); + + final MultidimensionalModeller modeller = new MultidimensionalModeller( + oversegmentedSegments, + simulatedDataCR.getCopyRatios(), + simulatedDataAF.getAllelicCounts(), priorAF, + numSamples, numBurnIn, numSamples, numBurnIn); + modeller.smoothSegments(maxNumSmoothingIterations, numSmoothingIterationsPerFit, smoothingCredibleIntervalThresholdCopyRatio, smoothingCredibleIntervalThresholdAlleleFraction); + + CopyRatioModellerUnitTest.assertCopyRatioPosteriorCenters(modeller.getCopyRatioModeller(), simulatedDataCR); + AlleleFractionModellerUnitTest.assertAlleleFractionPosteriorCenters(modeller.getAlleleFractionModeller(), simulatedDataAF); + } + + private List<MultidimensionalSegment> constructOversegmentedSegments(final CopyRatioSimulatedData simulatedDataCR, + final AlleleFractionSimulatedData simulatedDataAF) { + final int numSegments = simulatedDataCR.getData().getNumSegments(); + final List<String> contigs = simulatedDataCR.getData().getSegments().stream().map(SimpleInterval::getContig).distinct().collect(Collectors.toList()); + final List<MultidimensionalSegment> segments = new ArrayList<>(2 * numSegments); //we split every real segment into two + for (int segmentIndex = 0; segmentIndex < numSegments; segmentIndex++) { + final String contig = contigs.get(segmentIndex); + final List<CopyRatio> copyRatiosInSegment = + simulatedDataCR.getData().getIndexedCopyRatiosInSegment(segmentIndex).stream() + .map(icr -> (CopyRatio) icr) + .collect(Collectors.toList()); + final List<AllelicCount> allelicCountsInSegment = + simulatedDataAF.getData().getIndexedAllelicCountsInSegment(segmentIndex).stream() + .map(iac -> (AllelicCount) iac) + .collect(Collectors.toList()); + + //take half of whichever data source has fewer points, take the same number from the other data source, and make a segment + final int numPointsCR = copyRatiosInSegment.size(); + final int numPointsAF = allelicCountsInSegment.size(); + final int numPointsMinHalf = Math.min(numPointsCR, numPointsAF) / 2; + segments.add(new MultidimensionalSegment( + new SimpleInterval(contig, 1, numPointsMinHalf), + copyRatiosInSegment.subList(0, numPointsMinHalf + 1), + allelicCountsInSegment.subList(0, numPointsMinHalf + 1))); + //add the remaining points to another segment + final int numPointsMax = Math.max(numPointsCR, numPointsAF); + segments.add(new MultidimensionalSegment( + new SimpleInterval(contig, numPointsMinHalf + 1, numPointsMax), + copyRatiosInSegment.subList(numPointsMinHalf + 1, numPointsCR), + allelicCountsInSegment.subList(numPointsMinHalf + 1, numPointsAF))); + } + return segments; + } +} \ No newline at end of file
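The oversegmentation above splits every simulated segment in two so that smoothSegments has genuine merges to perform. A sketch restating that call with the argument roles spelled out; the threshold interpretation follows the parameter names, with the precise merge criterion living in MultidimensionalModeller, so treat the comments as assumptions:

package org.broadinstitute.hellbender.tools.copynumber.models;

//editorial sketch, not part of this commit
final class SmoothingSketch {
    static void smooth(final MultidimensionalModeller modeller) {
        final int maxNumSmoothingIterations = 10;   //hard cap on merge passes
        final int numSmoothingIterationsPerFit = 0; //0 => refit the model only after merging finishes
        //merge adjacent segments whose posteriors differ by less than this many
        //credible-interval widths, per dimension (interpretation from the names)
        final double smoothingCredibleIntervalThresholdCopyRatio = 2.;
        final double smoothingCredibleIntervalThresholdAlleleFraction = 2.;
        modeller.smoothSegments(maxNumSmoothingIterations, numSmoothingIterationsPerFit,
                smoothingCredibleIntervalThresholdCopyRatio, smoothingCredibleIntervalThresholdAlleleFraction);
    }
}

diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatiosIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatiosIntegrationTest.java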
index 80a6a5c29c7..1dfe0a906be 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatiosIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatiosIntegrationTest.java @@ -14,7 +14,7 @@ * @author Samuel Lee <slee@broadinstitute.org> */ public final class PlotDenoisedCopyRatiosIntegrationTest extends CommandLineProgramTest { - private static final String TEST_SUB_DIR = toolsTestDir + "copynumber/plotting/"; + private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber/plotting"); //test files private static final File STANDARDIZED_COPY_RATIOS_FILE = new File(TEST_SUB_DIR, "plotting-copy-ratios.tsv"); //just use the same file for both standardized and denoised diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegmentsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegmentsIntegrationTest.java index 9013cf2a608..15ff2b3b9a2 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegmentsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegmentsIntegrationTest.java @@ -13,7 +13,7 @@ * @author Samuel Lee <slee@broadinstitute.org> */ public final class PlotModeledSegmentsIntegrationTest extends CommandLineProgramTest { - private static final String TEST_SUB_DIR = toolsTestDir + "copynumber/plotting/"; + private static final File TEST_SUB_DIR = new File(toolsTestDir, "copynumber/plotting"); //test files private static final File DENOISED_COPY_RATIOS_FILE = new File(TEST_SUB_DIR, "plotting-copy-ratios.tsv"); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/AlleleFractionKernelSegmenterUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/AlleleFractionKernelSegmenterUnitTest.java new file mode 100644 index 00000000000..c65e62a5885 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/AlleleFractionKernelSegmenterUnitTest.java @@ -0,0 +1,106 @@ +package org.broadinstitute.hellbender.tools.copynumber.segmentation; + +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AlleleFractionSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AlleleFractionSegment; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.AllelicCount; +import org.broadinstitute.hellbender.tools.copynumber.utils.segmentation.KernelSegmenterUnitTest; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.test.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * @author Samuel Lee <slee@broadinstitute.org> + */ +public final class AlleleFractionKernelSegmenterUnitTest extends BaseTest { + private static final int RANDOM_SEED = 1; //reset seed before each simulated test case + + /** + 
* Generates alternate-allele-fraction-like data (similar to zero-mean multimodal test data + * in {@link KernelSegmenterUnitTest#dataKernelSegmenter()}), + * but introduces further segments by placing data on different chromosomes. + */ + @DataProvider(name = "dataAlleleFractionKernelSegmenter") + public Object[][] dataAlleleFractionKernelSegmenter() { + final int numPoints = 10000; + final SampleMetadata sampleMetadata = new SimpleSampleMetadata("testSample"); + final double noiseLevel = 0.001; + final double homFraction = 0.1; //low hom fraction minimizes uncertainty in the changepoints coming from runs of adjacent homs near the changepoints + + final Random rng = new Random(RANDOM_SEED); + rng.setSeed(RANDOM_SEED); + final List<Double> minorAlleleFractions = Arrays.asList(0.45, 0.05, 0.25, 0.45, 0.05, 0.25, 0.45, 0.05, 0.25, 0.45, 0.05, 0.25); + final List<Double> alternateAlleleFractions = IntStream.range(0, numPoints).boxed() + .map(i -> rng.nextFloat() < homFraction + ? rng.nextBoolean() + ? 0. + noiseLevel * Math.abs(rng.nextGaussian()) //hom ref + : 1. - noiseLevel * Math.abs(rng.nextGaussian()) //hom alt + : rng.nextBoolean() + ? Math.max(minorAlleleFractions.get(i / 1000) + noiseLevel * rng.nextGaussian(), 0.) //het alt minor + : Math.min(1. - minorAlleleFractions.get(i / 1000) + noiseLevel * rng.nextGaussian(), 1.)) //het ref minor + .collect(Collectors.toList()); //changepoints at 999, 1999, 2999, 3999, 4999, 5999, 6999, 7999, 8999 + + final List<SimpleInterval> intervals = IntStream.range(0, numPoints).boxed() + .map(i -> new SimpleInterval( + Integer.toString(i / 2500 + 1), //start a new chromosome every 2500 points, which adds additional changepoints + (i % 2500) + 1, + (i % 2500) + 1)) + .collect(Collectors.toList()); + + final int globalDepth = 100; + final List<AllelicCount> allelicCountsList = IntStream.range(0, numPoints).boxed() + .map(i -> new AllelicCount( + intervals.get(i), + (int) ((1 - alternateAlleleFractions.get(i)) * globalDepth), + (int) (alternateAlleleFractions.get(i) * globalDepth))) + .collect(Collectors.toList()); + final AllelicCountCollection allelicCounts = new AllelicCountCollection(sampleMetadata, allelicCountsList); + + final AlleleFractionSegmentCollection segmentsExpected = + new AlleleFractionSegmentCollection( + sampleMetadata, + Arrays.asList( + new AlleleFractionSegment(new SimpleInterval("1", 1, 1000), allelicCountsList.subList(0, 1000)), + new AlleleFractionSegment(new SimpleInterval("1", 1001, 2000), allelicCountsList.subList(1000, 2000)), + new AlleleFractionSegment(new SimpleInterval("1", 2001, 2500), allelicCountsList.subList(2000, 2500)), + new AlleleFractionSegment(new SimpleInterval("2", 1, 500), allelicCountsList.subList(2500, 3000)), + new AlleleFractionSegment(new SimpleInterval("2", 501, 1500), allelicCountsList.subList(3000, 4000)), + new AlleleFractionSegment(new SimpleInterval("2", 1501, 2500), allelicCountsList.subList(4000, 5000)), + new AlleleFractionSegment(new SimpleInterval("3", 1, 1000), allelicCountsList.subList(5000, 6000)), + new AlleleFractionSegment(new SimpleInterval("3", 1001, 2000), allelicCountsList.subList(6000, 7000)), + new AlleleFractionSegment(new SimpleInterval("3", 2001, 2500), allelicCountsList.subList(7000, 7500)), + new AlleleFractionSegment(new SimpleInterval("4", 1, 500), allelicCountsList.subList(7500, 8000)), + new AlleleFractionSegment(new SimpleInterval("4", 501, 1500), allelicCountsList.subList(8000, 9000)), + new AlleleFractionSegment(new SimpleInterval("4", 1501, 2500), allelicCountsList.subList(9000, 10000)))); + + return new Object[][]{ + {allelicCounts, segmentsExpected} + }; + } + + @Test(dataProvider = "dataAlleleFractionKernelSegmenter") + public void testAlleleFractionKernelSegmenter(final AllelicCountCollection allelicCounts, + final AlleleFractionSegmentCollection segmentsExpected) { + final int maxNumChangepointsPerChromosome = 25; + final double kernelVariance = 0.01; + final int kernelApproximationDimension = 20; + final List<Integer> windowSizes = Arrays.asList(8, 16, 32, 64); + final double numChangepointsPenaltyLinearFactor = 1.; + final double numChangepointsPenaltyLogLinearFactor = 1.; + + final AlleleFractionSegmentCollection segments = new AlleleFractionKernelSegmenter(allelicCounts) + .findSegmentation(maxNumChangepointsPerChromosome, kernelVariance, kernelApproximationDimension, + windowSizes, numChangepointsPenaltyLinearFactor, numChangepointsPenaltyLogLinearFactor); + Assert.assertEquals(segments, segmentsExpected); + } +} \ No newline at end of file
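The twelve expected segments above come from two independent sources of changepoints: the simulated minor-allele fraction changes every 1000 points, and a new chromosome starts every 2500 points. A self-contained sketch that derives the same twelve intervals from that arithmetic (plain JDK, hypothetical class name):

import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;

//editorial sketch, not part of this commit: derives the twelve segments in
//segmentsExpected above from the two breakpoint sources in the simulation
final class ExpectedSegmentsSketch {
    public static void main(final String[] args) {
        final int numPoints = 10000;
        final TreeSet<Integer> segmentStarts = new TreeSet<>();
        for (int i = 0; i < numPoints; i += 1000) { segmentStarts.add(i); }  //minor-allele fraction changes every 1000 points
        for (int i = 0; i < numPoints; i += 2500) { segmentStarts.add(i); }  //a new chromosome every 2500 points forces a changepoint
        final List<Integer> starts = new ArrayList<>(segmentStarts);
        for (int s = 0; s < starts.size(); s++) {
            final int start = starts.get(s);
            final int end = (s + 1 < starts.size() ? starts.get(s + 1) : numPoints) - 1;
            //global index -> (contig, 1-based position within contig), as in the data provider
            final String contig = Integer.toString(start / 2500 + 1);
            System.out.println(contig + ":" + (start % 2500 + 1) + "-" + (end % 2500 + 1));
        }
    }
}

Running it prints 1:1-1000 through 4:1501-2500, matching the boundaries listed in segmentsExpected.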
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/CopyRatioKernelSegmenterUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/CopyRatioKernelSegmenterUnitTest.java new file mode 100644 index 00000000000..c3d856e5900 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/segmentation/CopyRatioKernelSegmenterUnitTest.java @@ -0,0 +1,92 @@ +package org.broadinstitute.hellbender.tools.copynumber.segmentation; + +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioSegmentCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatio; +import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyRatioSegment; +import org.broadinstitute.hellbender.tools.copynumber.utils.segmentation.KernelSegmenterUnitTest; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * @author Samuel Lee &lt;slee@broadinstitute.org&gt; + */ +public class CopyRatioKernelSegmenterUnitTest { + private static final int RANDOM_SEED = 1; //reset seed before each simulated test case + + /** + * Generates same Gaussian test data as {@link KernelSegmenterUnitTest#dataKernelSegmenter()}, + * but introduces further segments by placing data on different chromosomes.
+ */ + @DataProvider(name = "dataCopyRatioKernelSegmenter") + public Object[][] dataCopyRatioKernelSegmenter() { + final int numPoints = 1000; + final SampleMetadata sampleMetadata = new SimpleSampleMetadata("testSample"); + + final Random rng = new Random(RANDOM_SEED); + rng.setSeed(RANDOM_SEED); + final List<Double> dataGaussian = IntStream.range(0, numPoints).boxed() + .map(i -> Math.abs(i / 100 - 5) + 0.1 * rng.nextGaussian()) + .collect(Collectors.toList()); //changepoints at 99, 199, 299, 399, 499, 599, 699, 799, 899 + + final List<SimpleInterval> intervals = IntStream.range(0, numPoints).boxed() + .map(i -> new SimpleInterval( + Integer.toString(i / 250 + 1), //start a new chromosome every 250 points, which adds additional changepoints + (i % 250) * 10 + 1, + (i % 250) * 10 + 10)) //intervals for copy-ratio data points have length = 10 + .collect(Collectors.toList()); + + final CopyRatioCollection denoisedCopyRatios = new CopyRatioCollection( + sampleMetadata, + IntStream.range(0, intervals.size()).boxed() + .map(i -> new CopyRatio(intervals.get(i), dataGaussian.get(i))) + .collect(Collectors.toList())); + + final CopyRatioSegmentCollection segmentsExpected = + new CopyRatioSegmentCollection( + sampleMetadata, + Arrays.asList( + new CopyRatioSegment(new SimpleInterval("1", 1, 1000), denoisedCopyRatios.getRecords().subList(0, 100)), + new CopyRatioSegment(new SimpleInterval("1", 1001, 2000), denoisedCopyRatios.getRecords().subList(100, 200)), + new CopyRatioSegment(new SimpleInterval("1", 2001, 2500), denoisedCopyRatios.getRecords().subList(200, 250)), + new CopyRatioSegment(new SimpleInterval("2", 1, 500), denoisedCopyRatios.getRecords().subList(250, 300)), + new CopyRatioSegment(new SimpleInterval("2", 501, 1500), denoisedCopyRatios.getRecords().subList(300, 400)), + new CopyRatioSegment(new SimpleInterval("2", 1501, 2500), denoisedCopyRatios.getRecords().subList(400, 500)), + new CopyRatioSegment(new SimpleInterval("3", 1, 1000), denoisedCopyRatios.getRecords().subList(500, 600)), + new CopyRatioSegment(new SimpleInterval("3", 1001, 2000), denoisedCopyRatios.getRecords().subList(600, 700)), + new CopyRatioSegment(new SimpleInterval("3", 2001, 2500), denoisedCopyRatios.getRecords().subList(700, 750)), + new CopyRatioSegment(new SimpleInterval("4", 1, 500), denoisedCopyRatios.getRecords().subList(750, 800)), + new CopyRatioSegment(new SimpleInterval("4", 501, 1500), denoisedCopyRatios.getRecords().subList(800, 900)), + new CopyRatioSegment(new SimpleInterval("4", 1501, 2500), denoisedCopyRatios.getRecords().subList(900, 1000)))); + + return new Object[][]{ + {denoisedCopyRatios, segmentsExpected} + }; + } + + @Test(dataProvider = "dataCopyRatioKernelSegmenter") + public void testCopyRatioKernelSegmenter(final CopyRatioCollection denoisedCopyRatios, + final CopyRatioSegmentCollection segmentsExpected) { + final int maxNumChangepointsPerChromosome = 25; + final double kernelVariance = 0.; + final int kernelApproximationDimension = 20; + final List<Integer> windowSizes = Arrays.asList(8, 16, 32, 64); + final double numChangepointsPenaltyLinearFactor = 2.; + final double numChangepointsPenaltyLogLinearFactor = 2.; + + final CopyRatioSegmentCollection segments = new CopyRatioKernelSegmenter(denoisedCopyRatios) + .findSegmentation(maxNumChangepointsPerChromosome, kernelVariance, kernelApproximationDimension, + windowSizes, numChangepointsPenaltyLinearFactor, numChangepointsPenaltyLogLinearFactor); + Assert.assertEquals(segments, segmentsExpected); + } +} \ No newline at end of file
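The two segmenter tests differ in kernelVariance: 0.01 for the allele-fraction data and 0. for the copy-ratio data. The kernel arguments in the KernelSegmenterUnitTest hunk below have the BiFunction<Double, Double, Double> shape; the conventional mapping, treated here as an assumption rather than a statement about this codebase, is that zero variance selects a linear kernel (sensitive to mean shifts) and positive variance a Gaussian kernel (sensitive to more general distributional changes):

import java.util.function.BiFunction;

//editorial sketch, not part of this commit: the two kernels implied by the
//kernelVariance settings above; the zero-variance => linear mapping is an assumption
final class KernelSketch {
    static BiFunction<Double, Double, Double> kernel(final double kernelVariance) {
        return kernelVariance == 0.
                ? (x, y) -> x * y                                                  //linear kernel
                : (x, y) -> Math.exp(-(x - y) * (x - y) / (2. * kernelVariance));  //Gaussian kernel
    }
}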
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/utils/segmentation/KernelSegmenterUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/utils/segmentation/KernelSegmenterUnitTest.java index d1b43d96062..125bc552526 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/utils/segmentation/KernelSegmenterUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/utils/segmentation/KernelSegmenterUnitTest.java @@ -106,8 +106,8 @@ public void testKernelSegmenterTruncateChangepoints(final List<Double> data, @Test(dataProvider = "dataKernelSegmenter") public void testKernelSegmenterExtremePenalty(final List<Double> data, - final BiFunction<Double, Double, Double> kernel, - final List<Integer> changepointsExpected) { + final BiFunction<Double, Double, Double> kernel, + final List<Integer> changepointsExpected) { final int maxNumChangepoints = 25; final int kernelApproximationDimension = 20; final List<Integer> windowSizes = Arrays.asList(8, 16, 32, 64); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/genome/SparkGenomeReadCountsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/genome/SparkGenomeReadCountsIntegrationTest.java index a5353e9ecb1..d1add5ff325 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/genome/SparkGenomeReadCountsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/genome/SparkGenomeReadCountsIntegrationTest.java @@ -1,10 +1,9 @@ package org.broadinstitute.hellbender.tools.genome; import org.apache.commons.io.FilenameUtils; -import org.broadinstitute.hdf5.HDF5File; import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import org.broadinstitute.hellbender.tools.copynumber.coverage.readcount.SimpleCountCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection; import org.broadinstitute.hellbender.tools.exome.ReadCountCollection; import org.broadinstitute.hellbender.tools.exome.ReadCountCollectionUtils; import org.broadinstitute.hellbender.tools.exome.Target; diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/call-copy-ratio-segments-segments.seg b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/call-copy-ratio-segments-segments.seg similarity index 100% rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/call-copy-ratio-segments-segments.seg rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/call-copy-ratio-segments-segments.seg diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/allelic/collect-allelic-counts-normal.bam b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-allelic-counts-normal.bam similarity index 100% rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/allelic/collect-allelic-counts-normal.bam rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-allelic-counts-normal.bam diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/allelic/collect-allelic-counts-normal.bam.bai b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-allelic-counts-normal.bam.bai similarity index 100% rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/allelic/collect-allelic-counts-normal.bam.bai rename to
src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-allelic-counts-normal.bam.bai
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/allelic/collect-allelic-counts-sites.interval_list b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-allelic-counts-sites.interval_list
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/allelic/collect-allelic-counts-sites.interval_list
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-allelic-counts-sites.interval_list
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/allelic/collect-allelic-counts-tumor.bam b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-allelic-counts-tumor.bam
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/allelic/collect-allelic-counts-tumor.bam
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-allelic-counts-tumor.bam
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/allelic/collect-allelic-counts-tumor.bam.bai b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-allelic-counts-tumor.bam.bai
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/allelic/collect-allelic-counts-tumor.bam.bai
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-allelic-counts-tumor.bam.bai
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collectfragmentcounts/collect-fragment-counts-NA12878-expected.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-fragment-counts-NA12878-expected.tsv
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collectfragmentcounts/collect-fragment-counts-NA12878-expected.tsv
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-fragment-counts-NA12878-expected.tsv
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collectfragmentcounts/collect-fragment-counts-NA12878.bam b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-fragment-counts-NA12878.bam
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collectfragmentcounts/collect-fragment-counts-NA12878.bam
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-fragment-counts-NA12878.bam
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collectfragmentcounts/collect-fragment-counts-NA12878.bam.bai b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-fragment-counts-NA12878.bam.bai
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collectfragmentcounts/collect-fragment-counts-NA12878.bam.bai
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-fragment-counts-NA12878.bam.bai
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collectfragmentcounts/collect-fragment-counts-test.interval_list b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-fragment-counts-test.interval_list
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collectfragmentcounts/collect-fragment-counts-test.interval_list
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/collect-fragment-counts-test.interval_list
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/call-copy-ratio-segments-denoised-copy-ratios.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/call-copy-ratio-segments-denoised-copy-ratios.tsv
deleted file mode 100644
index c27b606b7f4..00000000000
--- a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/caller/call-copy-ratio-segments-denoised-copy-ratios.tsv
+++ /dev/null
@@ -1,42 +0,0 @@
-#SAMPLE_NAME=test
-CONTIG START END LOG2_COPY_RATIO
-chr 101 101 1
-chr 102 102 1
-chr 103 103 1
-chr 104 104 1
-chr 105 105 1
-chr 106 106 1
-chr 107 107 1
-chr 108 108 1
-chr 109 109 1
-chr 110 110 1
-chr 201 201 -1
-chr 202 202 -1
-chr 203 203 -1
-chr 204 204 -1
-chr 205 205 -1
-chr 206 206 -1
-chr 207 207 -1
-chr 208 208 -1
-chr 209 209 -1
-chr 210 210 -1
-chr 301 301 0.05
-chr 302 302 0.04
-chr 303 303 0.03
-chr 304 304 0.02
-chr 305 305 0.01
-chr 306 306 -0.01
-chr 307 307 -0.02
-chr 308 308 -0.03
-chr 309 309 -0.04
-chr 310 310 -0.05
-chr 401 401 0.05
-chr 402 402 0.04
-chr 403 403 0.03
-chr 404 404 0.02
-chr 405 405 0.01
-chr 406 406 -0.01
-chr 407 407 -0.02
-chr 408 408 -0.03
-chr 409 409 -0.04
-chr 410 410 -0.05
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/denoise-read-counts-wgs-annotated-intervals.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/denoise-read-counts-wgs-annotated-intervals.tsv
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/denoise-read-counts-wgs-annotated-intervals.tsv
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/denoise-read-counts-wgs-annotated-intervals.tsv
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/denoise-read-counts-wgs-read-counts-HCC1143_BL-n1-chr20-downsampled-deduplicated.hdf5 b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/denoise-read-counts-wgs-read-counts-HCC1143_BL-n1-chr20-downsampled-deduplicated.hdf5
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/denoise-read-counts-wgs-read-counts-HCC1143_BL-n1-chr20-downsampled-deduplicated.hdf5
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/denoise-read-counts-wgs-read-counts-HCC1143_BL-n1-chr20-downsampled-deduplicated.hdf5
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/denoise-read-counts-wgs-read-counts-HCC1143_BL-n1-chr20-downsampled-deduplicated.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/denoise-read-counts-wgs-read-counts-HCC1143_BL-n1-chr20-downsampled-deduplicated.tsv
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/denoising/denoise-read-counts-wgs-read-counts-HCC1143_BL-n1-chr20-downsampled-deduplicated.tsv
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/denoise-read-counts-wgs-read-counts-HCC1143_BL-n1-chr20-downsampled-deduplicated.tsv
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/allelic/allelic-count-collection-normal-missing-nucleotides.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/collections/allelic-count-collection-normal-missing-nucleotides.tsv
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/allelic/allelic-count-collection-normal-missing-nucleotides.tsv
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/collections/allelic-count-collection-normal-missing-nucleotides.tsv
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/allelic/allelic-count-collection-normal.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/collections/allelic-count-collection-normal.tsv
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/allelic/allelic-count-collection-normal.tsv
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/collections/allelic-count-collection-normal.tsv
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/locatable-collection-tsv-simple-locatable-collection-missing-column.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/collections/locatable-collection-tsv-simple-locatable-collection-missing-column.tsv
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/locatable-collection-tsv-simple-locatable-collection-missing-column.tsv
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/collections/locatable-collection-tsv-simple-locatable-collection-missing-column.tsv
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/locatable-collection-tsv-simple-locatable-collection-non-lexicographical-order.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/collections/locatable-collection-tsv-simple-locatable-collection-non-lexicographical-order.tsv
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/locatable-collection-tsv-simple-locatable-collection-non-lexicographical-order.tsv
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/collections/locatable-collection-tsv-simple-locatable-collection-non-lexicographical-order.tsv
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/locatable-collection-tsv-simple-locatable-collection.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/collections/locatable-collection-tsv-simple-locatable-collection.tsv
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/locatable-collection-tsv-simple-locatable-collection.tsv
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/collections/locatable-collection-tsv-simple-locatable-collection.tsv
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/simple-count-collection-double-counts.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/collections/simple-count-collection-double-counts.tsv
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/simple-count-collection-double-counts.tsv
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/collections/simple-count-collection-double-counts.tsv
diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/simple-count-collection-integer-counts.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/collections/simple-count-collection-integer-counts.tsv
similarity index 100%
rename from src/test/resources/org/broadinstitute/hellbender/tools/copynumber/coverage/readcount/simple-count-collection-integer-counts.tsv
rename to src/test/resources/org/broadinstitute/hellbender/tools/copynumber/formats/collections/simple-count-collection-integer-counts.tsv