From c6856d6744806372747261592afe90aa959d8a42 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Tue, 2 Jul 2024 17:01:44 +0200 Subject: [PATCH 1/7] setup some groundwork for vqsr --- main.nf | 39 +++++++ modules.json | 10 ++ .../nf-core/gatk4/applyvqsr/environment.yml | 7 ++ modules/nf-core/gatk4/applyvqsr/main.nf | 63 +++++++++++ modules/nf-core/gatk4/applyvqsr/meta.yml | 76 +++++++++++++ .../applyvqsr/tests/allelspecificity.config | 5 + .../gatk4/applyvqsr/tests/main.nf.test | 106 ++++++++++++++++++ .../gatk4/applyvqsr/tests/main.nf.test.snap | 95 ++++++++++++++++ .../tests/no-allelspecificity.config | 5 + .../nf-core/gatk4/applyvqsr/tests/tags.yml | 2 + .../gatk4/variantrecalibrator/environment.yml | 7 ++ .../nf-core/gatk4/variantrecalibrator/main.nf | 71 ++++++++++++ .../gatk4/variantrecalibrator/meta.yml | 84 ++++++++++++++ nextflow.config | 1 + nextflow_schema.json | 84 ++++++++++++++ workflows/germline.nf | 11 ++ 16 files changed, 666 insertions(+) create mode 100644 modules/nf-core/gatk4/applyvqsr/environment.yml create mode 100644 modules/nf-core/gatk4/applyvqsr/main.nf create mode 100644 modules/nf-core/gatk4/applyvqsr/meta.yml create mode 100644 modules/nf-core/gatk4/applyvqsr/tests/allelspecificity.config create mode 100644 modules/nf-core/gatk4/applyvqsr/tests/main.nf.test create mode 100644 modules/nf-core/gatk4/applyvqsr/tests/main.nf.test.snap create mode 100644 modules/nf-core/gatk4/applyvqsr/tests/no-allelspecificity.config create mode 100644 modules/nf-core/gatk4/applyvqsr/tests/tags.yml create mode 100644 modules/nf-core/gatk4/variantrecalibrator/environment.yml create mode 100644 modules/nf-core/gatk4/variantrecalibrator/main.nf create mode 100644 modules/nf-core/gatk4/variantrecalibrator/meta.yml diff --git a/main.nf b/main.nf index c49a9e0d..696cdbfd 100644 --- a/main.nf +++ b/main.nf @@ -39,6 +39,16 @@ params.alphamissense = getGenomeAttribute('alphamissense') params.alphamissense_tbi = getGenomeAttribute('alphamissense_tbi') params.vcfanno_resources = getGenomeAttribute('vcfanno_resources') params.vcfanno_config = getGenomeAttribute('vcfanno_config') +params.hapmap = getGenomeAttribute('hapmap') +params.hapmap_tbi = getGenomeAttribute('hapmap_tbi') +params.omni_1000G = getGenomeAttribute('omni_1000G') +params.omni_1000G_tbi = getGenomeAttribute('omni_1000G_tbi') +params.snps_1000G = getGenomeAttribute('snps_1000G') +params.snps_1000G_tbi = getGenomeAttribute('snps_1000G_tbi') +params.indels_1000G = getGenomeAttribute('indels_1000G') +params.indels_1000G_tbi = getGenomeAttribute('indels_1000G_tbi') +params.known_indels = getGenomeAttribute('known_indels') +params.known_indels_tbi = getGenomeAttribute('known_indels_tbi') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -80,6 +90,24 @@ if (params.output_suffix && callers.size() > 1) { error("Cannot use --output_suffix with more than one caller") } +if(params.vqsr) { + if(!params.hapmap || !params.hapmap_tbi) { + error("Please provide --hapmap and --hapmap_tbi when using --vqsr") + } + if(!params.omni_1000G || !params.omni_1000G_tbi) { + error("Please provide --omni_1000G and --omni_1000G_tbi when using --vqsr") + } + if(!params.snps_1000G || !params.snps_1000G_tbi) { + error("Please provide --snps_1000G and --snps_1000G_tbi when using --vqsr") + } + if(!params.indels_1000G || !params.indels_1000G_tbi) { + error("Please provide --indels_1000G and --indels_1000G_tbi when using --vqsr") + } + if(!params.known_indels || !params.known_indels_tbi) { + error("Please provide --known_indels and --known_indels_tbi when using --vqsr") + } +} + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES @@ -150,6 +178,16 @@ workflow NFCMGG_GERMLINE { params.automap_repeats, params.automap_panel, params.outdir, + params.hapmap, + params.hapmap_tbi, + params.omni_1000G, + params.omni_1000G_tbi, + params.snps_1000G, + params.snps_1000G_tbi, + params.indels_1000G, + params.indels_1000G_tbi, + params.known_indels, + params.known_indels_tbi, // Boolean inputs params.dragstr, @@ -169,6 +207,7 @@ workflow NFCMGG_GERMLINE { params.vep_mastermind, params.vep_eog, params.vep_alphamissense, + params.vqsr, // Value inputs params.genome, diff --git a/modules.json b/modules.json index 6f8e48ba..6bc737c7 100644 --- a/modules.json +++ b/modules.json @@ -79,6 +79,11 @@ "installed_by": ["vcf_annotate_ensemblvep_snpeff"], "patch": "modules/nf-core/ensemblvep/vep/ensemblvep-vep.diff" }, + "gatk4/applyvqsr": { + "branch": "master", + "git_sha": "cee8fe33d3ef1a220dee67dac75a32f7c872f63f", + "installed_by": ["modules"] + }, "gatk4/calibratedragstrmodel": { "branch": "master", "git_sha": "d742e3143f2ccb8853c29b35cfcf50b5e5026980", @@ -111,6 +116,11 @@ "installed_by": ["modules"], "patch": "modules/nf-core/gatk4/haplotypecaller/gatk4-haplotypecaller.diff" }, + "gatk4/variantrecalibrator": { + "branch": "master", + "git_sha": "d742e3143f2ccb8853c29b35cfcf50b5e5026980", + "installed_by": ["modules"] + }, "gawk": { "branch": "master", "git_sha": "dc3527855e7358c6d8400828754c0caa5f11698f", diff --git a/modules/nf-core/gatk4/applyvqsr/environment.yml b/modules/nf-core/gatk4/applyvqsr/environment.yml new file mode 100644 index 00000000..c043cd63 --- /dev/null +++ b/modules/nf-core/gatk4/applyvqsr/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_applyvqsr +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.5.0.0 diff --git a/modules/nf-core/gatk4/applyvqsr/main.nf b/modules/nf-core/gatk4/applyvqsr/main.nf new file mode 100644 index 00000000..047321b8 --- /dev/null +++ b/modules/nf-core/gatk4/applyvqsr/main.nf @@ -0,0 +1,63 @@ +process GATK4_APPLYVQSR { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.5.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.5.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(vcf), path(vcf_tbi), path(recal), path(recal_index), path(tranches) + path fasta + path fai + path dict + + output: + tuple val(meta), path("*.vcf.gz"), emit: vcf + tuple val(meta), path("*.tbi") , emit: tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference_command = fasta ? "--reference $fasta" : '' + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK ApplyVQSR] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + ApplyVQSR \\ + --variant ${vcf} \\ + --output ${prefix}.vcf.gz \\ + $reference_command \\ + --tranches-file $tranches \\ + --recal-file $recal \\ + --tmp-dir . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + echo "" | gzip > ${prefix}.vcf.gz + touch ${prefix}.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/applyvqsr/meta.yml b/modules/nf-core/gatk4/applyvqsr/meta.yml new file mode 100644 index 00000000..de5d6d06 --- /dev/null +++ b/modules/nf-core/gatk4/applyvqsr/meta.yml @@ -0,0 +1,76 @@ +name: gatk4_applyvqsr +description: | + Apply a score cutoff to filter variants based on a recalibration table. + AplyVQSR performs the second pass in a two-stage process called Variant Quality Score Recalibration (VQSR). + Specifically, it applies filtering to the input variants based on the recalibration table produced + in the first step by VariantRecalibrator and a target sensitivity value. +keywords: + - gatk4 + - variant quality score recalibration + - vcf + - vqsr +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - vcf: + type: file + description: VCF file to be recalibrated, this should be the same file as used for the first stage VariantRecalibrator. + pattern: "*.vcf" + - vcf_tbi: + type: file + description: tabix index for the input vcf file. + pattern: "*.vcf.tbi" + - recal: + type: file + description: Recalibration file produced when the input vcf was run through VariantRecalibrator in stage 1. + pattern: "*.recal" + - recal_index: + type: file + description: Index file for the recalibration file. + pattern: ".recal.idx" + - tranches: + type: file + description: Tranches file produced when the input vcf was run through VariantRecalibrator in stage 1. + pattern: ".tranches" + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" +output: + - vcf: + type: file + description: compressed vcf file containing the recalibrated variants. + pattern: "*.vcf.gz" + - tbi: + type: file + description: Index of recalibrated vcf file. + pattern: "*vcf.gz.tbi" + - versions: + type: file + description: File containing software versions. + pattern: "versions.yml" +authors: + - "@GCJMackenzie" +maintainers: + - "@GCJMackenzie" diff --git a/modules/nf-core/gatk4/applyvqsr/tests/allelspecificity.config b/modules/nf-core/gatk4/applyvqsr/tests/allelspecificity.config new file mode 100644 index 00000000..ac368bd0 --- /dev/null +++ b/modules/nf-core/gatk4/applyvqsr/tests/allelspecificity.config @@ -0,0 +1,5 @@ +process { + withName: GATK4_APPLYVQSR { + ext.args = '--mode SNP --truth-sensitivity-filter-level 99.0 -AS' + } +} diff --git a/modules/nf-core/gatk4/applyvqsr/tests/main.nf.test b/modules/nf-core/gatk4/applyvqsr/tests/main.nf.test new file mode 100644 index 00000000..104eb7f0 --- /dev/null +++ b/modules/nf-core/gatk4/applyvqsr/tests/main.nf.test @@ -0,0 +1,106 @@ +nextflow_process { + + name "Test Process GATK4_APPLYVQSR" + script "../main.nf" + process "GATK4_APPLYVQSR" + + tag "modules" + tag "modules_nfcore" + tag "gatk4" + tag "gatk4/applyvqsr" + + test("human - vcf") { + + config "./no-allelspecificity.config" + + when { + process { + """ + input[0] = [ [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.ann.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.ann.vcf.gz.tbi', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2.recal', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2.recal.idx', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2.tranches', checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.fasta', checkIfExists: true) + input[2] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.fasta.fai', checkIfExists: true) + input[3] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.dict', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match("versions") }, + { assert path(process.out.vcf.get(0).get(1)).linesGzip.contains("##fileformat=VCFv4.2") }, + { assert snapshot(file(process.out.tbi.get(0).get(1)).name).match() } + ) + } + + } + + test("human - vcf - allele-specific") { + + config "./allelspecificity.config" + + when { + process { + """ + input[0] = [ [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.ann.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.ann.vcf.gz.tbi', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2_allele_specific.recal', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2_allele_specific.recal.idx', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2_allele_specific.tranches', checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.fasta', checkIfExists: true) + input[2] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.fasta.fai', checkIfExists: true) + input[3] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.dict', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match("versions_allelspecific") }, + { assert path(process.out.vcf.get(0).get(1)).linesGzip.contains("##fileformat=VCFv4.2") }, + { assert snapshot(file(process.out.tbi.get(0).get(1)).name).match() } + ) + } + + } + + test("human - vcf - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.ann.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/haplotypecaller_calls/test2_haplotc.ann.vcf.gz.tbi', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2.recal', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2.recal.idx', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/variantrecalibrator/test2.tranches', checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.fasta', checkIfExists: true) + input[2] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.fasta.fai', checkIfExists: true) + input[3] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.dict', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/gatk4/applyvqsr/tests/main.nf.test.snap b/modules/nf-core/gatk4/applyvqsr/tests/main.nf.test.snap new file mode 100644 index 00000000..ad2fe8be --- /dev/null +++ b/modules/nf-core/gatk4/applyvqsr/tests/main.nf.test.snap @@ -0,0 +1,95 @@ +{ + "human - vcf": { + "content": [ + "test.vcf.gz.tbi" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.0" + }, + "timestamp": "2024-05-15T13:31:50.727658" + }, + "human - vcf - allele-specific": { + "content": [ + "test.vcf.gz.tbi" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.0" + }, + "timestamp": "2024-05-15T13:29:42.331816" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,4a6890d486a62ce6f2edfd2f8961da4f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.0" + }, + "timestamp": "2024-05-15T13:00:49.353138" + }, + "human - vcf - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,4a6890d486a62ce6f2edfd2f8961da4f" + ], + "tbi": [ + [ + { + "id": "test" + }, + "test.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "vcf": [ + [ + { + "id": "test" + }, + "test.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,4a6890d486a62ce6f2edfd2f8961da4f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.0" + }, + "timestamp": "2024-05-15T13:01:24.370421" + }, + "versions_allelspecific": { + "content": [ + [ + "versions.yml:md5,4a6890d486a62ce6f2edfd2f8961da4f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.0" + }, + "timestamp": "2024-05-15T13:01:08.104194" + } +} \ No newline at end of file diff --git a/modules/nf-core/gatk4/applyvqsr/tests/no-allelspecificity.config b/modules/nf-core/gatk4/applyvqsr/tests/no-allelspecificity.config new file mode 100644 index 00000000..c08c918e --- /dev/null +++ b/modules/nf-core/gatk4/applyvqsr/tests/no-allelspecificity.config @@ -0,0 +1,5 @@ +process { + withName: GATK4_APPLYVQSR { + ext.args = '--mode SNP --truth-sensitivity-filter-level 99.0' + } +} diff --git a/modules/nf-core/gatk4/applyvqsr/tests/tags.yml b/modules/nf-core/gatk4/applyvqsr/tests/tags.yml new file mode 100644 index 00000000..c6570630 --- /dev/null +++ b/modules/nf-core/gatk4/applyvqsr/tests/tags.yml @@ -0,0 +1,2 @@ +gatk4/applyvqsr: + - "modules/nf-core/gatk4/applyvqsr/**" diff --git a/modules/nf-core/gatk4/variantrecalibrator/environment.yml b/modules/nf-core/gatk4/variantrecalibrator/environment.yml new file mode 100644 index 00000000..95b744c4 --- /dev/null +++ b/modules/nf-core/gatk4/variantrecalibrator/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_variantrecalibrator +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.5.0.0 diff --git a/modules/nf-core/gatk4/variantrecalibrator/main.nf b/modules/nf-core/gatk4/variantrecalibrator/main.nf new file mode 100644 index 00000000..24844ce0 --- /dev/null +++ b/modules/nf-core/gatk4/variantrecalibrator/main.nf @@ -0,0 +1,71 @@ +process GATK4_VARIANTRECALIBRATOR { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.5.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.5.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(vcf), path(tbi) // input vcf and tbi of variants to recalibrate + path resource_vcf // resource vcf + path resource_tbi // resource tbi + val labels // string (or list of strings) containing dedicated resource labels already formatted with '--resource:' tag + path fasta + path fai + path dict + + output: + tuple val(meta), path("*.recal") , emit: recal + tuple val(meta), path("*.idx") , emit: idx + tuple val(meta), path("*.tranches"), emit: tranches + tuple val(meta), path("*plots.R") , emit: plots, optional:true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference_command = fasta ? "--reference $fasta " : '' + def labels_command = labels.join(' ') + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK VariantRecalibrator] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + VariantRecalibrator \\ + --variant $vcf \\ + --output ${prefix}.recal \\ + --tranches-file ${prefix}.tranches \\ + $reference_command \\ + --tmp-dir . \\ + $labels_command \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.recal + touch ${prefix}.idx + touch ${prefix}.tranches + touch ${prefix}plots.R + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/variantrecalibrator/meta.yml b/modules/nf-core/gatk4/variantrecalibrator/meta.yml new file mode 100644 index 00000000..39a415b6 --- /dev/null +++ b/modules/nf-core/gatk4/variantrecalibrator/meta.yml @@ -0,0 +1,84 @@ +name: gatk4_variantrecalibrator +description: | + Build a recalibration model to score variant quality for filtering purposes. + It is highly recommended to follow GATK best practices when using this module, + the gaussian mixture model requires a large number of samples to be used for the + tool to produce optimal results. For example, 30 samples for exome data. For more details see + https://gatk.broadinstitute.org/hc/en-us/articles/4402736812443-Which-training-sets-arguments-should-I-use-for-running-VQSR- +keywords: + - gatk4 + - recalibration model + - variantrecalibrator +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - vcf: + type: file + description: input vcf file containing the variants to be recalibrated + pattern: "*.vcf.gz" + - tbi: + type: file + description: tbi file matching with -vcf + pattern: "*.vcf.gz.tbi" + - resource_vcf: + type: file + description: all resource vcf files that are used with the corresponding '--resource' label + pattern: "*.vcf.gz" + - resource_tbi: + type: file + description: all resource tbi files that are used with the corresponding '--resource' label + pattern: "*.vcf.gz.tbi" + - labels: + type: string + description: necessary arguments for GATK VariantRecalibrator. Specified to directly match the resources provided. More information can be found at https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" +output: + - recal: + type: file + description: Output recal file used by ApplyVQSR + pattern: "*.recal" + - idx: + type: file + description: Index file for the recal output file + pattern: "*.idx" + - tranches: + type: file + description: Output tranches file used by ApplyVQSR + pattern: "*.tranches" + - plots: + type: file + description: Optional output rscript file to aid in visualization of the input data and learned model. + pattern: "*plots.R" + - version: + type: file + description: File containing software versions + pattern: "*.versions.yml" +authors: + - "@GCJMackenzie" + - "@nickhsmith" +maintainers: + - "@GCJMackenzie" + - "@nickhsmith" diff --git a/nextflow.config b/nextflow.config index 64821d10..230c2b4f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,6 +15,7 @@ params { // Pipeline specific parameters scatter_count = 40 filter = false + vqsr = false annotate = false gemini = false add_ped = false diff --git a/nextflow_schema.json b/nextflow_schema.json index 36e7b721..94088c08 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -208,6 +208,86 @@ "pattern": "^\\S+\\.tbi$", "mimetype": "text/plain" }, + "hapmap": { + "type": "string", + "fa_icon": "far fa-file-alt", + "description": "Path to the HapMap VCF file", + "pattern": "^\\S+\\.vcf\\.gz", + "format": "file-path", + "mimetype": "text/plain" + }, + "hapmap_tbi": { + "type": "string", + "fa_icon": "far fa-file-alt", + "description": "Path to the HapMap VCF index file", + "pattern": "^\\S+\\.vcf\\.gz\\.tbi", + "format": "file-path", + "mimetype": "text/plain" + }, + "omni_1000G": { + "type": "string", + "fa_icon": "far fa-file-alt", + "description": "Path to the 1000 Genomes omni VCF file", + "pattern": "^\\S+\\.vcf\\.gz", + "format": "file-path", + "mimetype": "text/plain" + }, + "omni_1000G_tbi": { + "type": "string", + "fa_icon": "far fa-file-alt", + "description": "Path to the 1000 Genomes omni VCF index file", + "pattern": "^\\S+\\.vcf\\.gz\\.tbi", + "format": "file-path", + "mimetype": "text/plain" + }, + "snps_1000G": { + "type": "string", + "fa_icon": "far fa-file-alt", + "description": "Path to the 1000 Genomes SNPs VCF file", + "pattern": "^\\S+\\.vcf\\.gz", + "format": "file-path", + "mimetype": "text/plain" + }, + "snps_1000G_tbi": { + "type": "string", + "fa_icon": "far fa-file-alt", + "description": "Path to the 1000 Genomes SNPs VCF index file", + "pattern": "^\\S+\\.vcf\\.gz\\.tbi", + "format": "file-path", + "mimetype": "text/plain" + }, + "indels_1000G": { + "type": "string", + "fa_icon": "far fa-file-alt", + "description": "Path to the 1000 Genomes indels VCF file", + "pattern": "^\\S+\\.vcf\\.gz", + "format": "file-path", + "mimetype": "text/plain" + }, + "indels_1000G_tbi": { + "type": "string", + "fa_icon": "far fa-file-alt", + "description": "Path to the 1000 Genomes indels VCF index file", + "pattern": "^\\S+\\.vcf\\.gz\\.tbi", + "format": "file-path", + "mimetype": "text/plain" + }, + "known_indels": { + "type": "string", + "fa_icon": "far fa-file-alt", + "description": "Path to the known indels VCF file", + "pattern": "^\\S+\\.vcf\\.gz", + "format": "file-path", + "mimetype": "text/plain" + }, + "known_indels_tbi": { + "type": "string", + "fa_icon": "far fa-file-alt", + "description": "Path to the known indels VCF index file", + "pattern": "^\\S+\\.vcf\\.gz\\.tbi", + "format": "file-path", + "mimetype": "text/plain" + }, "somalier_sites": { "type": "string", "default": "https://github.com/brentp/somalier/files/3412456/sites.hg38.vcf.gz", @@ -217,6 +297,10 @@ "format": "file-path", "mimetype": "text/plain" }, + "vqsr": { + "type": "boolean", + "description": "Perform variant recalibration on the VCFs created by GATK tooling" + }, "only_call": { "type": "boolean", "description": "Only call the variants without doing any post-processing" diff --git a/workflows/germline.nf b/workflows/germline.nf index ef118e7e..2fdd1bbc 100644 --- a/workflows/germline.nf +++ b/workflows/germline.nf @@ -96,6 +96,16 @@ workflow GERMLINE { automap_repeats // string: path to the Automap repeats file automap_panel // string: path to the Automap panel file outdir // string: path to the output directory + hapmap // string: path to the hapmap VCF + hapmap_tbi // string: path to the hapmap VCF index + omni_1000G // string: path to the 1000 Genomes omni VCF + omni_1000G_tbi // string: path to the 1000 Genomes omni VCF index + snps_1000G // string: path to the 1000 Genomes SNPs VCF + snps_1000G_tbi // string: path to the 1000 Genomes SNPs VCF index + indels_1000G // string: path to the 1000 Genomes indels VCF + indels_1000G_tbi // string: path to the 1000 Genomes indels VCF index + known_indels // string: path to the known indels VCF + known_indels_tbi // string: path to the known indels VCF index // Boolean inputs dragstr // boolean: create a dragstr model and use it for haplotypecaller @@ -115,6 +125,7 @@ workflow GERMLINE { vep_mastermind // boolean: use the Mastermind VEP plugin vep_eog // boolean: use the EOG VEP plugin vep_alphamissense // boolean: use the AlphaMissense VEP plugin + vqsr // boolean: perform variant recalibration on GATK output // Value inputs genome // string: the genome used by the pipeline run From be46d9cf42436a4ca216525baac37dc2de1f5b63 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Wed, 3 Jul 2024 14:06:51 +0200 Subject: [PATCH 2/7] implement vqsr --- conf/modules.config | 47 +++++++ main.nf | 10 +- nextflow_schema.json | 40 +++--- nf-test.config | 2 +- .../local/cram_call_genotype_gatk4/main.nf | 31 ++++- subworkflows/local/vcf_vqsr_gatk4/main.nf | 119 ++++++++++++++++++ workflows/germline.nf | 27 +++- 7 files changed, 240 insertions(+), 36 deletions(-) create mode 100644 subworkflows/local/vcf_vqsr_gatk4/main.nf diff --git a/conf/modules.config b/conf/modules.config index 6ae71597..eecf6906 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -21,6 +21,7 @@ def enableOutput(state) { "add_ped": params.add_ped, "normalize": params.normalize, "filter": params.filter, + "vqsr": params.vqsr, "original": true ] @@ -257,6 +258,52 @@ process { ] // SAVE } + if(params.vqsr) { + withName: "^.*VCF_VQSR_GATK4:GATK4_VARIANTRECALIBRATOR_SNPS" { + // -an DP => This option is not advised for WES runs, see if this can somehow still be added to WGS runs + ext.args = [ + "-an QD", + "-an MQ", + "-an MQRankSum", + "-an ReadPosRankSum", + "-an FS", + "-an SOR", + "-an InbreedingCoeff", + "-mode SNP" + ].join(" ") + } + + withName: "^.*VCF_VQSR_GATK4:GATK4_VARIANTRECALIBRATOR_INDELS" { + // -an DP => This option is not advised for WES runs, see if this can somehow still be added to WGS runs + ext.args = [ + "-an QD", + "-an MQRankSum", + "-an ReadPosRankSum", + "-an FS", + "-an SOR", + "-an InbreedingCoeff", + "-mode INDEL" + ].join(" ") + } + + withName: "^.*VCF_VQSR_GATK4:GATK4_APPLYVQSR_SNPS" { + ext.args = "--truth-sensitivity-filter-level 99.9 -mode SNP" + ext.prefix = { "${meta.id}_snp" } + } + + withName: "^.*VCF_VQSR_GATK4:GATK4_APPLYVQSR_INDELS" { + ext.args = "--truth-sensitivity-filter-level 99.9 -mode INDEL" + ext.prefix = enableOutput("vqsr") ? final_prefix : { "${meta.id}.vqsr" } + publishDir = [ + enabled: enableOutput("vqsr"), + overwrite: true, + path: final_output, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] // SAVE + } + } + if(params.filter){ withName: "^.*CRAM_CALL_GENOTYPE_GATK4:VCF_FILTER_BCFTOOLS:FILTER_1\$" { ext.prefix = { "${meta.id}_filtered_snps" } diff --git a/main.nf b/main.nf index 696cdbfd..13e4869f 100644 --- a/main.nf +++ b/main.nf @@ -47,8 +47,6 @@ params.snps_1000G = getGenomeAttribute('snps_1000G') params.snps_1000G_tbi = getGenomeAttribute('snps_1000G_tbi') params.indels_1000G = getGenomeAttribute('indels_1000G') params.indels_1000G_tbi = getGenomeAttribute('indels_1000G_tbi') -params.known_indels = getGenomeAttribute('known_indels') -params.known_indels_tbi = getGenomeAttribute('known_indels_tbi') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -103,9 +101,9 @@ if(params.vqsr) { if(!params.indels_1000G || !params.indels_1000G_tbi) { error("Please provide --indels_1000G and --indels_1000G_tbi when using --vqsr") } - if(!params.known_indels || !params.known_indels_tbi) { - error("Please provide --known_indels and --known_indels_tbi when using --vqsr") - } + if(!params.dbsnp || !params.dbsnp_tbi) { + error("Please provide --dbsnp and --dbsnp_tbi when using --vqsr") + } } /* @@ -186,8 +184,6 @@ workflow NFCMGG_GERMLINE { params.snps_1000G_tbi, params.indels_1000G, params.indels_1000G_tbi, - params.known_indels, - params.known_indels_tbi, // Boolean inputs params.dragstr, diff --git a/nextflow_schema.json b/nextflow_schema.json index 94088c08..04d6ce45 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -214,7 +214,8 @@ "description": "Path to the HapMap VCF file", "pattern": "^\\S+\\.vcf\\.gz", "format": "file-path", - "mimetype": "text/plain" + "mimetype": "text/plain", + "exists": true }, "hapmap_tbi": { "type": "string", @@ -222,7 +223,8 @@ "description": "Path to the HapMap VCF index file", "pattern": "^\\S+\\.vcf\\.gz\\.tbi", "format": "file-path", - "mimetype": "text/plain" + "mimetype": "text/plain", + "exists": true }, "omni_1000G": { "type": "string", @@ -230,7 +232,8 @@ "description": "Path to the 1000 Genomes omni VCF file", "pattern": "^\\S+\\.vcf\\.gz", "format": "file-path", - "mimetype": "text/plain" + "mimetype": "text/plain", + "exists": true }, "omni_1000G_tbi": { "type": "string", @@ -238,7 +241,8 @@ "description": "Path to the 1000 Genomes omni VCF index file", "pattern": "^\\S+\\.vcf\\.gz\\.tbi", "format": "file-path", - "mimetype": "text/plain" + "mimetype": "text/plain", + "exists": true }, "snps_1000G": { "type": "string", @@ -246,7 +250,8 @@ "description": "Path to the 1000 Genomes SNPs VCF file", "pattern": "^\\S+\\.vcf\\.gz", "format": "file-path", - "mimetype": "text/plain" + "mimetype": "text/plain", + "exists": true }, "snps_1000G_tbi": { "type": "string", @@ -254,7 +259,8 @@ "description": "Path to the 1000 Genomes SNPs VCF index file", "pattern": "^\\S+\\.vcf\\.gz\\.tbi", "format": "file-path", - "mimetype": "text/plain" + "mimetype": "text/plain", + "exists": true }, "indels_1000G": { "type": "string", @@ -262,7 +268,8 @@ "description": "Path to the 1000 Genomes indels VCF file", "pattern": "^\\S+\\.vcf\\.gz", "format": "file-path", - "mimetype": "text/plain" + "mimetype": "text/plain", + "exists": true }, "indels_1000G_tbi": { "type": "string", @@ -270,23 +277,8 @@ "description": "Path to the 1000 Genomes indels VCF index file", "pattern": "^\\S+\\.vcf\\.gz\\.tbi", "format": "file-path", - "mimetype": "text/plain" - }, - "known_indels": { - "type": "string", - "fa_icon": "far fa-file-alt", - "description": "Path to the known indels VCF file", - "pattern": "^\\S+\\.vcf\\.gz", - "format": "file-path", - "mimetype": "text/plain" - }, - "known_indels_tbi": { - "type": "string", - "fa_icon": "far fa-file-alt", - "description": "Path to the known indels VCF index file", - "pattern": "^\\S+\\.vcf\\.gz\\.tbi", - "format": "file-path", - "mimetype": "text/plain" + "mimetype": "text/plain", + "exists": true }, "somalier_sites": { "type": "string", diff --git a/nf-test.config b/nf-test.config index 6d58c41d..d1eae8c6 100644 --- a/nf-test.config +++ b/nf-test.config @@ -6,7 +6,7 @@ config { profile "nf_test,docker" plugins { - load "nft-bam@0.1.1" + load "nft-bam@0.3.0" } } diff --git a/subworkflows/local/cram_call_genotype_gatk4/main.nf b/subworkflows/local/cram_call_genotype_gatk4/main.nf index 2a33b0ab..201a17e2 100644 --- a/subworkflows/local/cram_call_genotype_gatk4/main.nf +++ b/subworkflows/local/cram_call_genotype_gatk4/main.nf @@ -4,6 +4,7 @@ include { CRAM_CALL_GATK4 } from '../cram_call_gatk4/main' include { GVCF_JOINT_GENOTYPE_GATK4 } from '../gvcf_joint_genotype_gatk4/main' +include { VCF_VQSR_GATK4 } from '../vcf_vqsr_gatk4/main' include { VCF_FILTER_BCFTOOLS } from '../vcf_filter_bcftools/main' workflow CRAM_CALL_GENOTYPE_GATK4 { @@ -16,9 +17,14 @@ workflow CRAM_CALL_GENOTYPE_GATK4 { ch_strtablefile // channel: [optional] [ path(strtablefile) ] => STR table file ch_dbsnp // channel: [optional] [ path(dbsnp) ] => The VCF containing the dbsnp variants ch_dbsnp_tbi // channel: [optional] [ path(dbsnp_tbi) ] => The index of the dbsnp VCF + ch_hapmap // channel: [mandatory] [ path(vcf), path(tbi) ] + ch_omni_1000G // channel: [mandatory] [ path(vcf), path(tbi) ] + ch_snps_1000G // channel: [mandatory] [ path(vcf), path(tbi) ] + ch_indels_1000G // channel: [mandatory] [ path(vcf), path(tbi) ] dragstr // boolean: create a DragSTR model and run haplotypecaller with it only_call // boolean: only run the variant calling only_merge // boolean: run until the family merging + vqsr // boolean: run variant recalibration filter // boolean: filter the VCFs scatter_count // integer: the amount of times the VCFs should be scattered @@ -66,9 +72,30 @@ workflow CRAM_CALL_GENOTYPE_GATK4 { if(!only_call && !only_merge) { + if(vqsr) { + VCF_VQSR_GATK4( + GVCF_JOINT_GENOTYPE_GATK4.out.vcfs, + ch_fasta, + ch_fai, + ch_dict, + ch_hapmap, + ch_omni_1000G, + ch_snps_1000G, + ch_dbsnp.combine(ch_dbsnp_tbi).collect(), + ch_indels_1000G, + ) + ch_versions = ch_versions.mix(VCF_VQSR_GATK4.out.versions) + + VCF_VQSR_GATK4.out.vcfs.set { ch_vqsr_vcfs } + + } else { + GVCF_JOINT_GENOTYPE_GATK4.out.vcfs + .set { ch_vqsr_vcfs } + } + if(filter) { VCF_FILTER_BCFTOOLS( - GVCF_JOINT_GENOTYPE_GATK4.out.vcfs, + ch_vqsr_vcfs, true ) ch_versions = ch_versions.mix(VCF_FILTER_BCFTOOLS.out.versions) @@ -76,7 +103,7 @@ workflow CRAM_CALL_GENOTYPE_GATK4 { VCF_FILTER_BCFTOOLS.out.vcfs .set { ch_vcfs } } else { - GVCF_JOINT_GENOTYPE_GATK4.out.vcfs + ch_vqsr_vcfs .set { ch_vcfs } } diff --git a/subworkflows/local/vcf_vqsr_gatk4/main.nf b/subworkflows/local/vcf_vqsr_gatk4/main.nf new file mode 100644 index 00000000..bd5534d0 --- /dev/null +++ b/subworkflows/local/vcf_vqsr_gatk4/main.nf @@ -0,0 +1,119 @@ +// +// Run Variant recalibration +// + +include { GATK4_VARIANTRECALIBRATOR as GATK4_VARIANTRECALIBRATOR_SNPS } from '../../../modules/nf-core/gatk4/variantrecalibrator/main' +include { GATK4_VARIANTRECALIBRATOR as GATK4_VARIANTRECALIBRATOR_INDELS } from '../../../modules/nf-core/gatk4/variantrecalibrator/main' +include { GATK4_APPLYVQSR as GATK4_APPLYVQSR_SNPS } from '../../../modules/nf-core/gatk4/applyvqsr/main' +include { GATK4_APPLYVQSR as GATK4_APPLYVQSR_INDELS } from '../../../modules/nf-core/gatk4/applyvqsr/main' + +workflow VCF_VQSR_GATK4 { + take: + ch_vcfs // channel: [mandatory] [ val(meta), path(vcf), path(tbi) ] => The post-processed VCFs + ch_fasta // channel: [mandatory] [ val(meta2), path(fasta) ] + ch_fai // channel: [mandatory] [ val(meta3), path(fai) ] + ch_dict // channel: [mandatory] [ val(meta4), path(dict) ] + ch_hapmap // channel: [mandatory] [ path(vcf), path(tbi) ] + ch_omni_1000G // channel: [mandatory] [ path(vcf), path(tbi) ] + ch_snps_1000G // channel: [mandatory] [ path(vcf), path(tbi) ] + ch_dbsnp // channel: [mandatory] [ path(vcf), path(tbi) ] + ch_indels_1000G // channel: [mandatory] [ path(vcf), path(tbi) ] + + main: + + ch_versions = Channel.empty() + + ch_hapmap.map { vcf, tbi -> + def label = "--resource hapmap,known=false,training=true,truth=true,prior=15.0:${vcf.name}" + [ label, vcf, tbi ] + }.mix( + ch_omni_1000G.map { vcf, tbi -> + def label = "--resource omni,known=false,training=true,truth=false,prior=12.0:${vcf.name}" + [ label, vcf, tbi ] + }, + ch_snps_1000G.map { vcf, tbi -> + def label = "--resource 1000G,known=false,training=true,truth=false,prior=10.0:${vcf.name}" + [ label, vcf, tbi ] + }, + ch_dbsnp.map { vcf, tbi -> + def label = "--resource dbsnp,known=true,training=false,truth=false,prior=2.0:${vcf.name}" + [ label, vcf, tbi ] + } + ).multiMap { label, vcf, tbi -> + labels: [ label ] + resources: [ vcf, tbi ] + } + .set { ch_snp_resources } + + ch_indels_1000G.map { vcf, tbi -> + def label = "--resource 1000G,known=false,truth=true,training=true:${vcf.name}" + [ label, vcf, tbi ] + }.mix( + ch_dbsnp.map { vcf, tbi -> + def label = "--resource dbsnp,known=true,truth=false,training=false:${vcf.name}" + [ label, vcf, tbi ] + } + ).multiMap { label, vcf, tbi -> + labels: [ label ] + resources: [ vcf, tbi ] + } + .set { ch_indel_resources } + + GATK4_VARIANTRECALIBRATOR_SNPS( + ch_vcfs, + ch_snp_resources.resources, + [], + ch_snp_resources.labels, + ch_fasta.map { it[1] }, + ch_fai.map { it[1] }, + ch_dict.map { it[1] } + ) + ch_versions = ch_versions.mix(GATK4_VARIANTRECALIBRATOR_SNPS.out.versions.first()) + + GATK4_VARIANTRECALIBRATOR_INDELS( + ch_vcfs, + ch_indel_resources.resources, + [], + ch_indel_resources.labels, + ch_fasta.map { it[1] }, + ch_fai.map { it[1] }, + ch_dict.map { it[1] } + ) + ch_versions = ch_versions.mix(GATK4_VARIANTRECALIBRATOR_INDELS.out.versions.first()) + + ch_vcfs + .join(GATK4_VARIANTRECALIBRATOR_SNPS.out.recal, failOnDuplicate:true, failOnMismatch:true) + .join(GATK4_VARIANTRECALIBRATOR_SNPS.out.idx, failOnDuplicate:true, failOnMismatch:true) + .join(GATK4_VARIANTRECALIBRATOR_SNPS.out.tranches, failOnDuplicate:true, failOnMismatch:true) + .set { ch_applyvqsr_snps_input } + + GATK4_APPLYVQSR_SNPS( + ch_applyvqsr_snps_input, + ch_fasta.map { it[1] }, + ch_fai.map { it[1] }, + ch_dict.map { it[1] } + ) + ch_versions = ch_versions.mix(GATK4_APPLYVQSR_SNPS.out.versions.first()) + + GATK4_APPLYVQSR_SNPS.out.vcf + .join(GATK4_APPLYVQSR_SNPS.out.tbi, failOnDuplicate:true, failOnMismatch:true) + .join(GATK4_VARIANTRECALIBRATOR_INDELS.out.recal, failOnDuplicate:true, failOnMismatch:true) + .join(GATK4_VARIANTRECALIBRATOR_INDELS.out.idx, failOnDuplicate:true, failOnMismatch:true) + .join(GATK4_VARIANTRECALIBRATOR_INDELS.out.tranches, failOnDuplicate:true, failOnMismatch:true) + .set { ch_applyvqsr_indels_input } + + GATK4_APPLYVQSR_INDELS( + ch_applyvqsr_indels_input, + ch_fasta.map { it[1] }, + ch_fai.map { it[1] }, + ch_dict.map { it[1] } + ) + ch_versions = ch_versions.mix(GATK4_APPLYVQSR_INDELS.out.versions.first()) + + ch_vcfs = GATK4_APPLYVQSR_INDELS.out.vcf + .join(GATK4_APPLYVQSR_INDELS.out.tbi, failOnDuplicate:true, failOnMismatch:true) + + emit: + vcfs = ch_vcfs + versions = ch_versions // [ path(versions) ] +} diff --git a/workflows/germline.nf b/workflows/germline.nf index 2fdd1bbc..8c99b78c 100644 --- a/workflows/germline.nf +++ b/workflows/germline.nf @@ -104,8 +104,6 @@ workflow GERMLINE { snps_1000G_tbi // string: path to the 1000 Genomes SNPs VCF index indels_1000G // string: path to the 1000 Genomes indels VCF indels_1000G_tbi // string: path to the 1000 Genomes indels VCF index - known_indels // string: path to the known indels VCF - known_indels_tbi // string: path to the known indels VCF index // Boolean inputs dragstr // boolean: create a dragstr model and use it for haplotypecaller @@ -169,6 +167,26 @@ workflow GERMLINE { ch_automap_repeats = automap_repeats ? Channel.fromPath(automap_repeats).map { [[id:"repeats"], it]}.collect() : [] ch_automap_panel = automap_panel ? Channel.fromPath(automap_panel).map { [[id:"automap_panel"], it]}.collect() : [[],[]] + if(vqsr) { + ch_hapmap = Channel.of( + file(hapmap), file(hapmap_tbi) + ).collect() + ch_omni_1000G = Channel.of( + file(omni_1000G), file(omni_1000G_tbi) + ).collect() + ch_snps_1000G = Channel.of( + file(snps_1000G), file(snps_1000G_tbi) + ).collect() + ch_indels_1000G = Channel.of( + file(indels_1000G), file(indels_1000G_tbi) + ).collect() + } else { + ch_hapmap = [] + ch_omni_1000G = [] + ch_snps_1000G = [] + ch_indels_1000G = [] + } + // // Check for the presence of EnsemblVEP plugins that use extra files // @@ -427,9 +445,14 @@ workflow GERMLINE { ch_strtablefile_ready, ch_dbsnp_ready, ch_dbsnp_tbi_ready, + ch_hapmap, + ch_omni_1000G, + ch_snps_1000G, + ch_indels_1000G, dragstr, only_call, only_merge, + vqsr, filter, scatter_count ) From 7e972686a2889f4589cdda897bfb8a702790fcea Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Tue, 16 Jul 2024 12:57:14 +0200 Subject: [PATCH 3/7] small update to the resource labels --- subworkflows/local/vcf_vqsr_gatk4/main.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/subworkflows/local/vcf_vqsr_gatk4/main.nf b/subworkflows/local/vcf_vqsr_gatk4/main.nf index bd5534d0..14778406 100644 --- a/subworkflows/local/vcf_vqsr_gatk4/main.nf +++ b/subworkflows/local/vcf_vqsr_gatk4/main.nf @@ -46,11 +46,11 @@ workflow VCF_VQSR_GATK4 { .set { ch_snp_resources } ch_indels_1000G.map { vcf, tbi -> - def label = "--resource 1000G,known=false,truth=true,training=true:${vcf.name}" + def label = "--resource 1000G,known=false,truth=true,training=true,prior=10.0:${vcf.name}" [ label, vcf, tbi ] }.mix( ch_dbsnp.map { vcf, tbi -> - def label = "--resource dbsnp,known=true,truth=false,training=false:${vcf.name}" + def label = "--resource dbsnp,known=true,truth=false,training=false,prior=2.0:${vcf.name}" [ label, vcf, tbi ] } ).multiMap { label, vcf, tbi -> @@ -61,9 +61,9 @@ workflow VCF_VQSR_GATK4 { GATK4_VARIANTRECALIBRATOR_SNPS( ch_vcfs, - ch_snp_resources.resources, + ch_snp_resources.resources.collect(), [], - ch_snp_resources.labels, + ch_snp_resources.labels.collect().view(), ch_fasta.map { it[1] }, ch_fai.map { it[1] }, ch_dict.map { it[1] } @@ -72,9 +72,9 @@ workflow VCF_VQSR_GATK4 { GATK4_VARIANTRECALIBRATOR_INDELS( ch_vcfs, - ch_indel_resources.resources, + ch_indel_resources.resources.collect(), [], - ch_indel_resources.labels, + ch_indel_resources.labels.collect().view(), ch_fasta.map { it[1] }, ch_fai.map { it[1] }, ch_dict.map { it[1] } From e27d66b64d72a1a133499bf9d5fb617ca64b0b63 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Tue, 16 Jul 2024 14:13:16 +0200 Subject: [PATCH 4/7] use correct way of supplying resources --- drs.config | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 drs.config diff --git a/drs.config b/drs.config new file mode 100644 index 00000000..639bca51 --- /dev/null +++ b/drs.config @@ -0,0 +1,30 @@ +params { + outdir = "s3://test-data/nf-drs-germline-test/" +} + +plugins { + id "nf-drs@0.1.1" +} + +drs { + user = "admin" // Can also be fetched from DRS_USERNAME + password = "admin" // Can also be fetched from DRS_PASSWORD + enabled = true + endpoint = "http://localhost:8000" // Can also be fetched from DRS_URL + // allowedExtensions = [ + // "vcf.gz", + // "html" + // ] + run = "germline_test" + summary = "$params.outdir/drs_summary.csv" +} + +aws { + region = "uz" + client { + endpoint = "https://s3.ugent.be" + protocol = "https" + s3PathStyleAccess = true + connectionTimeout = 60000 + } +} From b1e70d1edad8cbc2c8bfa83dc4b58c47eea92c8b Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Tue, 16 Jul 2024 15:06:52 +0200 Subject: [PATCH 5/7] use correct way of supplying resources --- drs.config | 30 ----------------------- subworkflows/local/vcf_vqsr_gatk4/main.nf | 12 ++++----- 2 files changed, 6 insertions(+), 36 deletions(-) delete mode 100644 drs.config diff --git a/drs.config b/drs.config deleted file mode 100644 index 639bca51..00000000 --- a/drs.config +++ /dev/null @@ -1,30 +0,0 @@ -params { - outdir = "s3://test-data/nf-drs-germline-test/" -} - -plugins { - id "nf-drs@0.1.1" -} - -drs { - user = "admin" // Can also be fetched from DRS_USERNAME - password = "admin" // Can also be fetched from DRS_PASSWORD - enabled = true - endpoint = "http://localhost:8000" // Can also be fetched from DRS_URL - // allowedExtensions = [ - // "vcf.gz", - // "html" - // ] - run = "germline_test" - summary = "$params.outdir/drs_summary.csv" -} - -aws { - region = "uz" - client { - endpoint = "https://s3.ugent.be" - protocol = "https" - s3PathStyleAccess = true - connectionTimeout = 60000 - } -} diff --git a/subworkflows/local/vcf_vqsr_gatk4/main.nf b/subworkflows/local/vcf_vqsr_gatk4/main.nf index 14778406..3cdf3f7a 100644 --- a/subworkflows/local/vcf_vqsr_gatk4/main.nf +++ b/subworkflows/local/vcf_vqsr_gatk4/main.nf @@ -24,19 +24,19 @@ workflow VCF_VQSR_GATK4 { ch_versions = Channel.empty() ch_hapmap.map { vcf, tbi -> - def label = "--resource hapmap,known=false,training=true,truth=true,prior=15.0:${vcf.name}" + def label = "--resource:hapmap,known=false,training=true,truth=true,prior=15.0 ${vcf.name}" [ label, vcf, tbi ] }.mix( ch_omni_1000G.map { vcf, tbi -> - def label = "--resource omni,known=false,training=true,truth=false,prior=12.0:${vcf.name}" + def label = "--resource:omni,known=false,training=true,truth=false,prior=12.0 ${vcf.name}" [ label, vcf, tbi ] }, ch_snps_1000G.map { vcf, tbi -> - def label = "--resource 1000G,known=false,training=true,truth=false,prior=10.0:${vcf.name}" + def label = "--resource:1000G,known=false,training=true,truth=false,prior=10.0 ${vcf.name}" [ label, vcf, tbi ] }, ch_dbsnp.map { vcf, tbi -> - def label = "--resource dbsnp,known=true,training=false,truth=false,prior=2.0:${vcf.name}" + def label = "--resource:dbsnp,known=true,training=false,truth=false,prior=2.0 ${vcf.name}" [ label, vcf, tbi ] } ).multiMap { label, vcf, tbi -> @@ -46,11 +46,11 @@ workflow VCF_VQSR_GATK4 { .set { ch_snp_resources } ch_indels_1000G.map { vcf, tbi -> - def label = "--resource 1000G,known=false,truth=true,training=true,prior=10.0:${vcf.name}" + def label = "--resource:1000G,known=false,truth=true,training=true,prior=10.0 ${vcf.name}" [ label, vcf, tbi ] }.mix( ch_dbsnp.map { vcf, tbi -> - def label = "--resource dbsnp,known=true,truth=false,training=false,prior=2.0:${vcf.name}" + def label = "--resource:dbsnp,known=true,truth=false,training=false,prior=2.0 ${vcf.name}" [ label, vcf, tbi ] } ).multiMap { label, vcf, tbi -> From 12b734542ee4a29275adb86e08e710b09b706996 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Tue, 16 Jul 2024 17:08:35 +0200 Subject: [PATCH 6/7] remove inbreeding coeff for indels --- conf/modules.config | 1 - 1 file changed, 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index eecf6906..3cfd15e0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -281,7 +281,6 @@ process { "-an ReadPosRankSum", "-an FS", "-an SOR", - "-an InbreedingCoeff", "-mode INDEL" ].join(" ") } From b8ad690a46db2b481fe79d7915978023dd5f0568 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Wed, 17 Jul 2024 09:09:41 +0200 Subject: [PATCH 7/7] remove inbreeding coeff for snps --- conf/modules.config | 1 - 1 file changed, 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 3cfd15e0..c3db2212 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -268,7 +268,6 @@ process { "-an ReadPosRankSum", "-an FS", "-an SOR", - "-an InbreedingCoeff", "-mode SNP" ].join(" ") }