nf-core · Patricie34 · Apr 25, 2024 · Apr 25, 2024 · Apr 25, 2024 · Apr 25, 2024
@@ -1,3 +1,5 @@
+
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     Config file for defining DSL2 per module options and publishing paths
@@ -16,6 +18,30 @@
 
 process {
 
+    withName: 'GERMLINE_VCFS_NORM'{ // bcftools norm settings for the germline concatenation branch
+        ext.args   = { [
+            '--multiallelics -both', // split multiallelic sites into biallelic records, SNPs and indels alike; '-both' must stay a single token, otherwise 'both' is passed on as a separate argument
+            '--rm-dup all'           // output only the first instance of a record which is present multiple times
+        ].join(' ') }
+        ext.when   = { params.concatenate_vcfs } // only run when concatenation is requested
+        publishDir = [
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/variant_calling/concat/${meta.id}/" }
+        ]
+    }
+
+    withName: 'VCFS_NORM'{ // bcftools norm settings for the normalization branch
+        ext.args   = { [
+            '--multiallelics -both', // split multiallelic sites into biallelic records, SNPs and indels alike; '-both' must stay a single token, otherwise 'both' is passed on as a separate argument
+            '--rm-dup all'           // output only the first instance of a record which is present multiple times
+        ].join(' ') }
+        ext.when   = { params.normalized_vcfs } // only run when normalization is requested
+        publishDir = [
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/variant_calling/normalized/${meta.id}/" }
+        ]
+    }
+
     withName: 'GERMLINE_VCFS_CONCAT'{
         ext.args   = { "-a" }
         ext.when   = { params.concatenate_vcfs }
@@ -34,11 +60,25 @@ process {
         ]
     }
 
+    withName: 'VCFS_SORT'{ // selector must equal the include alias VCFS_SORT (single underscore) or these settings are never applied
+        ext.prefix = { "${meta.id}.norm" } // output files are named <sample id>.norm
+        ext.when   = { params.normalized_vcfs } // only run when normalization is requested
+        publishDir = [
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/variant_calling/normalized/${meta.id}/" }
+        ]
+    }
+
     withName: 'TABIX_EXT_VCF' { // settings for the process selected by the name TABIX_EXT_VCF
         ext.prefix = { "${input.baseName}" } // prefix is the base name of the task's `input` value
         ext.when   = { params.concatenate_vcfs } // gated on params.concatenate_vcfs
     }
 
+    withName: 'TABIX_VCF' { // settings for the process selected by the name TABIX_VCF
+        ext.prefix = { "${input.baseName}" } // prefix is the base name of the task's `input` value
+        ext.when   = { params.normalized_vcfs } // gated on params.normalized_vcfs
+    }
+
     withName: 'TABIX_GERMLINE_VCFS_CONCAT_SORT'{
         ext.prefix = { "${meta.id}.germline" }
         ext.when   = { params.concatenate_vcfs }
@@ -47,4 +87,13 @@ process {
                 path: { "${params.outdir}/variant_calling/concat/${meta.id}/" }
         ]
     }
+
+    withName: 'TABIX_VCFS_INDEX'{ // index step of the normalization branch
+        ext.prefix = { "${meta.id}.norm" } // same <sample id>.norm prefix as the sorted VCF it indexes
+        ext.when   = { params.normalized_vcfs } // only run when normalization is requested
+        publishDir = [
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/variant_calling/normalized/${meta.id}/" } // same directory as the normalized VCFs, so each index sits beside its VCF
+        ]
+    }
 }
@@ -71,6 +71,7 @@ params {
     ignore_soft_clipped_bases         = false // no --dont-use-soft-clipped-bases for GATK Mutect2
     joint_germline                    = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected
     joint_mutect2                     = false // if true, enables patient-wise multi-sample somatic variant calling
+    normalized_vcfs                   = false // by default we don't normalize the vcf-files                     
     only_paired_variant_calling       = false // if true, skips germline variant calling for normal-paired sample
     sentieon_dnascope_emit_mode       = 'variant' // default value for Sentieon dnascope
     sentieon_dnascope_pcr_indel_model = 'CONSERVATIVE'

@@ -3,23 +3,33 @@
 //
 
 include { CONCATENATE_GERMLINE_VCFS } from '../vcf_concatenate_germline/main'
+include { NORMALIZE_VCFS } from '../vcf_normalization/main'
 
 workflow POST_VARIANTCALLING {
 
     take:
     vcfs
+    fasta
     concatenate_vcfs
-
+    normalized_vcfs
+
     main:
     versions = Channel.empty()
 
     if (concatenate_vcfs){
-        CONCATENATE_GERMLINE_VCFS(vcfs)
+        CONCATENATE_GERMLINE_VCFS(vcfs, fasta)
 
         vcfs = vcfs.mix(CONCATENATE_GERMLINE_VCFS.out.vcfs)
         versions = versions.mix(CONCATENATE_GERMLINE_VCFS.out.versions)
     }
 
+    if (normalized_vcfs){
+        NORMALIZE_VCFS(vcfs, fasta)
+
+        vcfs = vcfs.mix(NORMALIZE_VCFS.out.vcfs)
+        versions = versions.mix(NORMALIZE_VCFS.out.versions)
+    }
+
     emit:
     vcfs // post processed vcfs
 

@@ -1,42 +1,58 @@
-//
 // CONCATENATE Germline VCFs
-//
 
 // Concatenation of germline vcf-files
-include { ADD_INFO_TO_VCF                                     } from '../../../modules/local/add_info_to_vcf/main'
-include { TABIX_BGZIPTABIX as TABIX_EXT_VCF                   } from '../../../modules/nf-core/tabix/bgziptabix/main'
-include { BCFTOOLS_CONCAT  as GERMLINE_VCFS_CONCAT            } from '../../../modules/nf-core/bcftools/concat/main'
-include { BCFTOOLS_SORT    as GERMLINE_VCFS_CONCAT_SORT       } from '../../../modules/nf-core/bcftools/sort/main'
-include { TABIX_TABIX      as TABIX_GERMLINE_VCFS_CONCAT_SORT } from '../../../modules/nf-core/tabix/tabix/main'
+include { ADD_INFO_TO_VCF  } from '../../../modules/local/add_info_to_vcf/main'
+include { TABIX_BGZIPTABIX as TABIX_EXT_VCF } from '../../../modules/nf-core/tabix/bgziptabix/main'
+include { BCFTOOLS_NORM as GERMLINE_VCFS_NORM } from '../../../modules/nf-core/bcftools/norm/main'
+include { BCFTOOLS_CONCAT as GERMLINE_VCFS_CONCAT } from '../../../modules/nf-core/bcftools/concat/main'
+include { BCFTOOLS_SORT as GERMLINE_VCFS_CONCAT_SORT } from '../../../modules/nf-core/bcftools/sort/main'
+include { TABIX_TABIX as TABIX_GERMLINE_VCFS_CONCAT_SORT } from '../../../modules/nf-core/tabix/tabix/main'
 
 workflow CONCATENATE_GERMLINE_VCFS { // chains ADD_INFO_TO_VCF, bgzip/tabix, bcftools norm, concat, sort and index over the incoming VCFs
 
     take:
     vcfs
+    fasta // reference passed on to GERMLINE_VCFS_NORM
 
     main:
     versions = Channel.empty()
 
-    // Concatenate vcf-files
+    // Add additional information to VCF files
     ADD_INFO_TO_VCF(vcfs)
+
+    // Compress the VCF files with bgzip
     TABIX_EXT_VCF(ADD_INFO_TO_VCF.out.vcf)
 
+    // Normalize the VCF files with BCFTOOLS_NORM — NOTE(review): arguments are written as name: value pairs, unlike every other process call in this workflow, which is positional; confirm Nextflow accepts this form
+    GERMLINE_VCFS_NORM(vcf: ADD_INFO_TO_VCF.out.vcf, fasta: fasta)
+
+    // Compress the normalized VCF files with bgzip — NOTE(review): second invocation of TABIX_EXT_VCF in this workflow (the first takes ADD_INFO_TO_VCF.out.vcf); Nextflow DSL2 expects a separate include alias per invocation
+    TABIX_EXT_VCF(GERMLINE_VCFS_NORM.out.vcf)
+
+    // Index the compressed normalized VCF files — NOTE(review): TABIX_GERMLINE_VCFS_CONCAT_SORT is invoked again after the sort step below; also this reads `.out.gz` while the removed line read `.out.gz_tbi` from the same process — confirm the emit name
+    TABIX_GERMLINE_VCFS_CONCAT_SORT(TABIX_EXT_VCF.out.gz)
+
     // Gather vcfs and vcf-tbis for concatenating germline-vcfs
-    germline_vcfs_with_tbis = TABIX_EXT_VCF.out.gz_tbi.map{ meta, vcf, tbi -> [ meta.subMap('id'), vcf, tbi ] }.groupTuple()
+    germline_vcfs_with_tbis = TABIX_GERMLINE_VCFS_CONCAT_SORT.out.map { meta, vcf, tbi -> [meta.subMap('id'), vcf, tbi] }.groupTuple() // NOTE(review): `.map` is applied to the whole `.out` of a TABIX_TABIX alias rather than to one named channel — confirm which channel carries [meta, vcf, tbi]
 
+    // Concatenate the VCF files
     GERMLINE_VCFS_CONCAT(germline_vcfs_with_tbis)
+
+    // Sort the concatenated VCF files
     GERMLINE_VCFS_CONCAT_SORT(GERMLINE_VCFS_CONCAT.out.vcf)
+
+    // Index the sorted concatenated VCF files
     TABIX_GERMLINE_VCFS_CONCAT_SORT(GERMLINE_VCFS_CONCAT_SORT.out.vcf)
 
     // Gather versions of all tools used
     versions = versions.mix(ADD_INFO_TO_VCF.out.versions)
     versions = versions.mix(TABIX_EXT_VCF.out.versions)
+    versions = versions.mix(GERMLINE_VCFS_NORM.out.versions)
     versions = versions.mix(GERMLINE_VCFS_CONCAT.out.versions)
-    versions = versions.mix(GERMLINE_VCFS_CONCAT.out.versions)
+    versions = versions.mix(GERMLINE_VCFS_CONCAT_SORT.out.versions)
     versions = versions.mix(TABIX_GERMLINE_VCFS_CONCAT_SORT.out.versions)
 
     emit:
-    vcfs = germline_vcfs_with_tbis // post processed vcfs
-
+    vcfs = TABIX_GERMLINE_VCFS_CONCAT_SORT.out.gz_tbi // post-processed VCFs — NOTE(review): elsewhere in this file `gz_tbi` is read only from the TABIX_BGZIPTABIX alias TABIX_EXT_VCF; confirm this TABIX_TABIX alias emits it
     versions // channel: [ versions.yml ]
-}
+}
@@ -0,0 +1,46 @@
+// Normalize all unannotated VCFs
+
+// Import modules
+include { ADD_INFO_TO_VCF  } from '../../../modules/local/add_info_to_vcf/main'
+include { TABIX_BGZIPTABIX as TABIX_VCF } from '../../../modules/nf-core/tabix/bgziptabix/main'
+include { BCFTOOLS_NORM as VCFS_NORM } from '../../../modules/nf-core/bcftools/norm/main'
+include { BCFTOOLS_SORT as VCFS_SORT } from '../../../modules/nf-core/bcftools/sort/main'
+include { TABIX_TABIX as TABIX_VCFS_INDEX } from '../../../modules/nf-core/tabix/tabix/main'
+
+// Workflow to annotate, compress, normalize, sort and index VCF files
+workflow NORMALIZE_VCFS {
+
+    take:
+    vcfs  // VCFs to normalize; handed as-is to ADD_INFO_TO_VCF
+    fasta // reference forwarded to bcftools norm — NOTE(review): confirm its shape matches the module's fasta input
+
+    main:
+    versions = Channel.empty()
+
+    // Add additional information to VCF files
+    ADD_INFO_TO_VCF(vcfs)
+
+    // Compress and index the annotated VCF files first, so each VCF reaches bcftools norm together with its index
+    TABIX_VCF(ADD_INFO_TO_VCF.out.vcf)
+
+    // Normalize the VCF files with BCFTOOLS_NORM; inputs are passed positionally and include the reference
+    VCFS_NORM(TABIX_VCF.out.gz_tbi, fasta)
+
+    // Sort the normalized VCF files
+    VCFS_SORT(VCFS_NORM.out.vcf)
+
+    // Index the sorted VCF files
+    TABIX_VCFS_INDEX(VCFS_SORT.out.vcf)
+
+    // Gather versions of all tools used
+    versions = versions.mix(ADD_INFO_TO_VCF.out.versions)
+    versions = versions.mix(TABIX_VCF.out.versions)
+    versions = versions.mix(VCFS_NORM.out.versions)
+    versions = versions.mix(VCFS_SORT.out.versions)
+    versions = versions.mix(TABIX_VCFS_INDEX.out.versions)
+
+    emit:
+    vcfs = VCFS_SORT.out.vcf // sorted, normalized VCFs; named `vcfs` because POST_VARIANTCALLING reads NORMALIZE_VCFS.out.vcfs
+    versions // Channel: [versions.yml]
+}
+
@@ -794,7 +794,11 @@ workflow SAREK {
 
         // POST VARIANTCALLING
         POST_VARIANTCALLING(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_all,
-                            params.concatenate_vcfs)
+                BAM_VARIANT_CALLING_TUMOR_ONLY_ALL.out.vcf_all,
+                BAM_VARIANT_CALLING_SOMATIC_ALL.out.vcf_all,
+                fasta,
+                params.concatenate_vcfs,
+                params.normalized_vcfs)
 
         // Gather vcf files for annotation and QC
         vcf_to_annotate = Channel.empty()