Skip to content

Commit

Permalink
Merge pull request #56 from CCBR/cnv
Browse files Browse the repository at this point in the history
CNV and GT fixes
  • Loading branch information
dnousome authored Jul 10, 2024
2 parents 3fce8c6 + 2f8fe6d commit 12b3b9c
Show file tree
Hide file tree
Showing 16 changed files with 273 additions and 163 deletions.
96 changes: 56 additions & 40 deletions bin/convertStrelka.py → bin/strelka_convert.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/usr/bin/env python
import os
import numpy as np
import vcfpy
import cyvcf2
import sys
import gzip
import os

def _tumor_normal_genotypes(ref, alt, info):
"""Retrieve standard 0/0, 0/1, 1/1 style genotypes from INFO field.
Expand Down Expand Up @@ -33,7 +34,7 @@ def name_to_gt(val):
# print(fname, coords, ref, alt, info, val)
return "0/1"
def alleles_to_gt(val):
gt_indices = {gt.upper(): i for i, gt in enumerate([ref] + [alt])}
gt_indices = {gt.upper(): i for i, gt in enumerate([ref] + alt)}
tumor_gts = [gt_indices[x.upper()] for x in val if x in gt_indices]
if tumor_gts and val not in known_names:
if max(tumor_gts) == 0:
Expand All @@ -45,13 +46,13 @@ def alleles_to_gt(val):
else:
tumor_gt = name_to_gt(val)
return tumor_gt
nt_val = info.get('NT').split("=")[-1]
nt_val = [x.split("=")[-1] for x in info if x.startswith("NT=")][0]
normal_gt = name_to_gt(nt_val)
sgt_val = info.get('SGT').split("=")[-1]
sgt_val = [x.split("=")[-1] for x in info if x.startswith("SGT=")]
if not sgt_val:
tumor_gt = "0/0"
else:
sgt_val = sgt_val.split("->")[-1]
sgt_val = sgt_val[0].split("->")[-1]
tumor_gt = alleles_to_gt(sgt_val)
return normal_gt, tumor_gt

Expand All @@ -70,47 +71,62 @@ def _af_annotate_and_filter(in_file,out_file):
#data = paired.tumor_data if paired else items[0]
#min_freq = float(utils.get_in(data["config"], ("algorithm", "min_allele_fraction"), 10)) / 100.0
#logger.debug("Filtering Strelka2 calls with allele fraction threshold of %s" % min_freq)
vcf = vcfpy.Reader.from_path(in_file)
vcf.header.add_format_line(vcfpy.OrderedDict([
('ID', 'AF'),
('Description', 'Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), TIR/DPI (somatic indels)'),
('Type','Float'),
('Number', '.')
]))
vcf.header.add_format_line(vcfpy.OrderedDict([
('ID', 'GT'),
('Description', 'Genotype'),
('Type','String'),
('Number', '1')
]))
writer = vcfpy.Writer.from_path(out_file, vcf.header)
vcf = cyvcf2.VCF(in_file)
vcf.add_format_to_header(dict(
ID='AF', Number='1',Type='Float',
Description='Allele frequency, as calculated in bcbio: AD/DP (germline), <ALT>U/DP (somatic snps), TIR/DPI (somatic indels)'
))
writer = cyvcf2.Writer(out_file, vcf)
for rec in vcf:
#print(rec)
if rec.is_snv(): # snps?
alt_counts_n = rec.calls[0].data[rec.ALT[0].value + "U"] # {ALT}U=tier1_depth,tier2_depth
alt_counts_t = rec.calls[1].data[rec.ALT[0].value + "U"] # {ALT}U=tier1_depth,tier2_depth
if rec.is_snp: # snps?
alt_counts = rec.format(rec.ALT[0] +'U')[:,0] # {ALT}U=tier1_depth,tier2_depth
else: # indels
alt_counts_n = rec.calls[0].data['TIR'] # TIR=tier1_depth,tier2_depth
alt_counts_t = rec.calls[1].data['TIR']
DP_n=rec.calls[0].data["DP"]
DP_t=rec.calls[1].data["DP"]
if DP_n is not None and DP_t is not None:
alt_counts = rec.format('TIR')[:,0] # TIR=tier1_depth,tier2_depth
dp = rec.format('DP')[:,0]
if dp is not None :
with np.errstate(divide='ignore', invalid='ignore'): # ignore division by zero and put AF=.0
#alt_n = alt_counts_n[0]/DP_n
#alt_t = alt_counts_t[0]/DP_t
af_n = np.true_divide(alt_counts_n[0], DP_n)
af_t = np.true_divide(alt_counts_t[0], DP_t)
rec.add_format('AF',0)
rec.calls[0].data["AF"]= [round(af_n,5)]
rec.calls[1].data["AF"]= [round(af_t,5)]
normal_gt, tumor_gt= _tumor_normal_genotypes(rec.REF,rec.ALT[0].value,rec.INFO)
rec.add_format('GT',"1/0")
rec.calls[0].data["GT"]=normal_gt
rec.calls[1].data["GT"]=tumor_gt
writer.write_record(rec)
af = np.true_divide(alt_counts, dp)
rec.set_format('AF',np.round(af,5))
writer.write_record(rec)
writer.close()

#def is_gzipped(path):
# return path.endswith(".gz")

def _add_gt(in_file):
##Set genotypes now
out_file = os.path.basename(in_file).replace(".vcf.gz", "-fixed.vcf")
#open_fn = gzip.open if is_gzipped(in_file) else open
with gzip.open(in_file,'rt') as in_handle:
with open(out_file,"wt") as out_handle:
added_gt = False
for line in in_handle:
if line.startswith("##FORMAT") and not added_gt:
added_gt = True
out_handle.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
out_handle.write(line)
elif line.startswith("#CHROM"):
assert added_gt
out_handle.write(line)
elif line.startswith("#"):
out_handle.write(line)
else:
parts = line.rstrip().split("\t")
normal_gt,tumor_gt = _tumor_normal_genotypes(parts[3], parts[4].split(","),
parts[7].split(";"))
parts[8] = "GT:%s" % parts[8]
parts[9] = "%s:%s" % (normal_gt, parts[9])
parts[10] = "%s:%s" % (tumor_gt, parts[10])
out_handle.write("\t".join(parts) + "\n")

if __name__ == '__main__':
filename = sys.argv[1]
outname = sys.argv[2]
_af_annotate_and_filter(filename, outname)

_add_gt(filename)
newname = os.path.basename(filename).replace(".vcf.gz", "-fixed.vcf")
_af_annotate_and_filter(newname,outname)
os.remove(newname)

5 changes: 5 additions & 0 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ process {
memory = { check_max( 48.GB * task.attempt, 'memory' ) }
time = { check_max( 72.h * task.attempt, 'time' ) }
}
withName:bwamem2 {
cpus = { check_max( 20 * task.attempt, 'cpus' ) }
memory = { check_max( 200.GB * task.attempt, 'memory' ) }
time = { check_max( 72.h * task.attempt, 'time' ) }
}
withLabel:error_ignore {
errorStrategy = 'ignore'
}
Expand Down
5 changes: 3 additions & 2 deletions conf/containers.config
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
params {
containers {
base = 'docker://nciccbr/ccbr_ubuntu_base_20.04:v6.1'
logan = 'docker://dnousome/ccbr_logan_base:v0.3.5'
logan = 'docker://dnousome/ccbr_logan_base:v0.3.6'
vcf2maf = 'docker://dnousome/ccbr_vcf2maf:v102.0.0'
lofreq = 'docker://dnousome/ccbr_lofreq:v0.0.1'
octopus = 'docker://dancooke/octopus:latest'
annotcnvsv = 'docker://dnousome/ccbr_annotate_cnvsv:v0.0.1'
annotsv = "docker://quay.io/biocontainers/annotsv:3.4.2--py312hdfd78af_0"
annotcnvsv = 'docker://dnousome/ccbr_annotate_cnvsv:v0.0.2'
}
}
69 changes: 57 additions & 12 deletions conf/genomes.config
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ params {
chromosomes = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
//HMFTOOLS
GENOMEVER = "38"
HOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/KnownHotspots.somatic.38.vcf.gz"
HMFGENOME = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/bwamem2/GRCh38.d1.vd1.fa"
SOMATICHOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/KnownHotspots.somatic.38.vcf.gz"
GERMLINEHOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/KnownHotspots.germline.38.vcf.gz"
PANELBED = "-panel_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/ActionableCodingPanel.38.bed.gz"
HCBED = "-high_confidence_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed.gz"
ENSEMBLCACHE = "-ensembl_data_dir /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/common/ensembl_data"
Expand All @@ -45,7 +47,7 @@ params {
'hg19' {
genome = "/data/CCBR_Pipeliner/db/PipeDB/lib/hg19.with_extra.fa"
genomefai = "/data/CCBR_Pipeliner/db/PipeDB/lib/hg19.with_extra.fa.fai"
bwagenome= "/data/CCBR_Pipeliner/db/PipeDB/lib/hs37d5.fa"
bwagenome = "/data/CCBR_Pipeliner/db/PipeDB/lib/hs37d5.fa"
genomedict= "/data/CCBR_Pipeliner/db/PipeDB/lib/hg19.with_extra.dict"
intervals= "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hg19_noblacklist_maincontig.bed"
INDELREF = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/GATKbundle/Mills_and_1000G_gold_standard.indels.hg19.vcf.gz"
Expand All @@ -71,16 +73,16 @@ params {
chromosomes = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
//HMFTOOLS
GENOMEVER = "37"
HOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/KnownHotspots.38.vcf.gz"
PANELBED = "-panel_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/ActionableCodingPanel.38.bed.gz"
HCBED = "-high_confidence_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed.gz"
ENSEMBLCACHE = "-ensembl_data_dir /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/common/ensembl_data"
//PURPLE
GERMLINEHET = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/copy_number/AmberGermlineSites.38.tsv.gz"
GCPROFILE = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/PURPLE/GC_profile.1000bp.38.cnp"
DIPLODREG = '/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/PURPLE/DiploidRegions.38.bed.gz'
ENSEMBLCACHE = '/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/PURPLE/ensembl_data/'
DRIVERS = '/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/PURPLE/DriverGenePanel.38.tsv'
HMFGENOME = "/data/CCBR_Pipeliner/db/PipeDB/lib/hs37d5.fa"
SOMATICHOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/variants/KnownHotspots.somatic.37.vcf.gz"
GERMLINEHOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/variants/KnownHotspots.germline.37.vcf.gz"
PANELBED = "-panel_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/variants/ActionableCodingPanel.37.bed.gz"
HCBED = "-high_confidence_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/variants/NA12878_GIAB_highconf_IllFB-IllGATKHC-CG-Ion-Solid_ALLCHROM_v3.2.2_highconf.bed.gz"
ENSEMBLCACHE = "-ensembl_data_dir /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/common/ensembl_data"
GERMLINEHET = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/copy_number/AmberGermlineSites.37.tsv.gz"
GCPROFILE = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/copy_number/GC_profile.1000bp.37.cnp"
DIPLODREG = '/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/copy_number/DiploidRegions.37.bed.gz'
DRIVERS = '/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg19/hmftools/v5_34/ref/37/common/DriverGenePanel.37.tsv'
}

'mm10' {
Expand Down Expand Up @@ -119,5 +121,48 @@ params {
chromosomes = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chrX','chrY','chrM']

}
'hg38_sf' {
genome = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/genome/bwamem2/Homo_sapiens_assembly38.fasta"
genomefai = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/genome/bwamem2/Homo_sapiens_assembly38.fasta.fai"
bwagenome= "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/genome/Homo_sapiens_assembly38.fasta"
genomedict= "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/genome/Homo_sapiens_assembly38.dict"
wgsregion = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/resources_broad_hg38_v0_wgs_calling_regions.hg38.interval_list"
intervals= "${projectDir}/assets/hg38_v0_wgs_calling_regions.hg38.bed"
fullinterval = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/genomes/hg38_main.bed"
INDELREF = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz"
KNOWNINDELS = "-known /data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz -known /data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz"
KNOWNRECAL = '--known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz --known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz --known-sites /data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/GATK_resource_bundle/ALL.wgs.1000G_phase3.GRCh38.ncbi_remapper.20150424.shapeit2_indels.vcf.gz'
dbsnp = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GATK_resource_bundle/dbsnp_138.hg38.vcf.gz"
gnomad = '--germline-resource /data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz'
pon = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/PON/updatedpon.vcf.gz" //pon="/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/PON/hg38.noCOSMIC_ClinVar.pon.vcf.gz" //file{params.pon}
germline_resource = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz"
KRAKENBACDB = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/kraken/20180907_standard_kraken2"
snpeff_genome = "GRCh38.86"
snpeff_config = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/snpEff/4.3t/snpEff.config"
snpeff_bundle = "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/snpEff/4.3t/"
sites_vcf= "/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/somalier/sites.hg38.vcf.gz"
somalier_ancestrydb="/data/CCBR_Pipeliner/CCBR_Pipeliner_Legacy/Exome-seek/hg38/somalier/1kg-somalier"
vepcache = "/fdb/VEP/102/cache"
vepspecies = "homo_sapiens"
vepbuild = "GRCh38"
annotsvgenome = "GRCh38"
octopus_sforest= "--somatic-forest /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/octopus/somatic.v0.7.4.forest"
octopus_gforest= "--forest /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/octopus/germline.v0.7.4.forest"
SEQUENZAGC = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/SEQUENZA/hg38_gc50Base.txt.gz"
chromosomes = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
//HMFTOOLS
GENOMEVER = "38"
HMFGENOME = "/data/CCBR_Pipeliner/Pipelines/XAVIER/resources/hg38/bwamem2/GRCh38.d1.vd1.fa"
SOMATICHOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/KnownHotspots.somatic.38.vcf.gz"
GERMLINEHOTSPOTS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/KnownHotspots.germline.38.vcf.gz"
PANELBED = "-panel_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/ActionableCodingPanel.38.bed.gz"
HCBED = "-high_confidence_bed /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/variants/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed.gz"
ENSEMBLCACHE = "-ensembl_data_dir /data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/common/ensembl_data"
//PURPLE
GERMLINEHET = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/copy_number/AmberGermlineSites.38.tsv.gz"
GCPROFILE = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/copy_number/GC_profile.1000bp.38.cnp"
DIPLODREG = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/copy_number/DiploidRegions.38.bed.gz"
DRIVERS = "/data/CCBR_Pipeliner/Pipelines/LOGAN/resources/hg38/hmftools/v5_34/ref/38/common/DriverGenePanel.38.tsv"
}
}
}
6 changes: 3 additions & 3 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ process {
]
}

withName: 'purple' {
withName: 'purple|purple_tonly|purple_novc' {
publishDir = [
path: { "${params.outdir}/cnv/purple" },
mode: 'copy'
Expand Down Expand Up @@ -224,14 +224,14 @@ process {
]
}

withName: 'contamination_tumoronly|learnreadorientationmodel_tonly|learnreadorientationmodel|mergemut2stats|mergemut2stats_tonly|contamination_paired|mutect2filter' {
withName: 'learnreadorientationmodel|mergemut2stats|contamination_paired|mutect2filter' {
publishDir = [
path: { "${params.outdir}/vcfs/mutect2" },
mode: 'copy'
]
}

withName: 'mutect2filter_tonly' {
withName: 'mutect2filter_tonly|contamination_tumoronly|learnreadorientationmodel_tonly|mergemut2stats_tonly' {
publishDir = [
path: { "${params.outdir}/vcfs/mutect2_tonly" },
mode: 'copy'
Expand Down
Loading

0 comments on commit 12b3b9c

Please sign in to comment.