diff --git a/README.md b/README.md index 0159005..2ca86e4 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ The following include tested versions in parenthesis when applicable; later vers 1. python (2.7.12) + argparse + os + + pysam 2. samtools (1.2) 3. bcftools (1.2) 4. htslib (1.2.1) diff --git a/global-config.human.txt b/global-config.human.txt new file mode 100644 index 0000000..08c7471 --- /dev/null +++ b/global-config.human.txt @@ -0,0 +1,38 @@ +#This file ignores blank and commented ('#') lines. +#Values are noted as " " + +#Path to snpEff installation +SNPEFF /n/data1/hms/dbmi/park/cbohrson/installed/local/bin/snpEff + +#Path to DBSNP database +DBSNP /n/data1/hms/dbmi/park/cbohrson/common_all_20160601.vcf.gz + +#Path to downloaded 1000 genomes haplotype reference panel +KGEN /n/data1/hms/dbmi/park/simon_chu/projects/data/1000G + +#Path to picard JAR file +PICARD /n/data1/hms/dbmi/park/cbohrson/installed/local/bin/picard.jar + +#Gap between SNPs required for regions to be chunked separately +GAP_REQUIREMENT 2000 + +#Target number of reads per LiRA job +READS_TARGET 100000 + +#Partition to submit to for parallelization +PARTITION short + +#Batch size for parallelization (number of scripts per job) +BATCH_SIZE 10 + +#Method to use for creating sSNV control curve of composite coverage vs. genome-wide sSNV rate. Options are 'germline' (use germline sSNVs) or 'general' (use all powered sites). 'general' is not yet implemented. +CONTROL_METHOD germline + +#For 10X, the max distance between two SNPs to be considered within a barcode family +MAX_DISTANCE_10X 50000 + +#Script in $LIRA_DIR/scripts to use for parallelization +PARALLEL_SCRIPT slurm.R + +#Number of sampling replicates to use for CONTROL_METHOD +BOOTSTRAP_REPLICATES 100 diff --git a/global-config.txt b/global-config.txt index 08c7471..87242dd 100644 --- a/global-config.txt +++ b/global-config.txt @@ -5,10 +5,11 @@ SNPEFF /n/data1/hms/dbmi/park/cbohrson/installed/local/bin/snpEff #Path to DBSNP database -DBSNP /n/data1/hms/dbmi/park/cbohrson/common_all_20160601.vcf.gz +#DBSNP /n/data1/hms/dbmi/park/cbohrson/common_all_20160601.vcf.gz +DBSNP /n/data1/hms/dbmi/park/splitseq_analysis/vcf_contig_rename/mgp.v5.merged.snps_all.dbSNP142.contig_rename_2.vcf.gz #Path to downloaded 1000 genomes haplotype reference panel -KGEN /n/data1/hms/dbmi/park/simon_chu/projects/data/1000G +#KGEN /n/data1/hms/dbmi/park/simon_chu/projects/data/1000G #Path to picard JAR file PICARD /n/data1/hms/dbmi/park/cbohrson/installed/local/bin/picard.jar @@ -36,3 +37,5 @@ PARALLEL_SCRIPT slurm.R #Number of sampling replicates to use for CONTROL_METHOD BOOTSTRAP_REPLICATES 100 + +reference_identifier mm10 diff --git a/scripts/functions.R b/scripts/functions.R index cb055fa..54ec866 100644 --- a/scripts/functions.R +++ b/scripts/functions.R @@ -49,9 +49,11 @@ annotate.and.phase.vcf <- function(chromosome) { database.tmp <- list.files(db.dir,pattern=paste("ALL.*chr",gsub("chr","",chromosome),"_.*vcf.gz$",sep=""),full.names = T) out.log.cmd(paste("bcftools view --force-samples -s . ",database.tmp," -O z -o db.vcf.gz && tabix -f db.vcf.gz",sep="")) database <- "db.vcf.gz" + } else if (config$phasing_software == "crossbred_mouse") { + database <- global$DBSNP } - out.log.cmd(paste("java -jar ",global$SNPEFF,"/SnpSift.jar annotate -exists LIRA_GHET -tabix -name \"DBSNP_\" ",database," calls.vcf.gz > calls.id.vcf && bgzip -f calls.id.vcf && tabix -f calls.id.vcf.gz",sep="")) - + out.log.cmd(paste("java -jar ",global$SNPEFF,"/SnpSift.jar annotate -exists LIRA_GHET -tabix -name \"DBSNP_\" ",database," calls.vcf.gz > calls.id.vcf && bgzip -f calls.id.vcf && tabix -f calls.id.vcf.gz",sep="")) # this step is where variants called from the cell are annotated to indicate whcih variant calls are known SNPs from the SNP database + suppressWarnings(dir.create("phasing")) setwd("phasing") out.log("Make phasing input from population-polymorphic SNPs") @@ -64,7 +66,7 @@ annotate.and.phase.vcf <- function(chromosome) { " | cut -f 1-10", " | grep -e '#' -e '0/0' -e '0/1' -e '1/0' -e '1/1'", ifelse(hg19.convert," | sed 's#^#chr#g'",""), - " >> phasing-input.vcf",sep="")) + " >> phasing-input.vcf",sep="")) # this step is meant to obtain the variant calls that are known SNPs and are genotyped within the cell. if(config$phasing_software == "shapeit") { tmp <- paste("[.]chr",gsub("chr","",chromosome),"[.]",sep="") if(chromosome == "X") { @@ -74,8 +76,8 @@ annotate.and.phase.vcf <- function(chromosome) { add.args <- "--chrX --input-sex sex.ped" } else { add.args <- "" - } - l <- list.files(global$KGEN,recursive=T,pattern=tmp,full.names = T) + } # meant to annotate the X in case the sample is from a female + l <- list.files(global$KGEN,recursive=T,pattern=tmp,full.names = T) # list files in the haplotype reference panel. Meant for phasing analysis. Don't use for crossbred organisms legend <- l[grepl("legend",l)] hap <- l[grepl("hap",l)] map <- l[grepl("genetic_map",l)] @@ -104,6 +106,24 @@ annotate.and.phase.vcf <- function(chromosome) { map <- list.files(paste(global$EAGLE,"/tables",sep=""),pattern="hg38",full.names = T) } out.log.cmd(paste(global$EAGLE,"/eagle --geneticMapFile ",map," --vcfRef ",vcfRef," --vcfTarget phasing-input.vcf.gz --vcfOutFormat z --outPrefix phasing-output && tabix -f phasing-output.vcf.gz",sep="")) + } else if (config$phasing_software == "crossbred_mouse") { + out.log("No phasing software to be run--VCF for heterozygous crossbred mouse is expected") + # don't run any phasing analysis; how should the phasing-output.vcf be formatted? + # this needs a phasing-output.vcf.gz for the mouse; otherwise, we cannot perform bulk phasing! + #out.log.cmd(paste(global$EAGLE,"/eagle --geneticMapFile ",map," --vcfRef ",vcfRef," --vcfTarget phasing-input.vcf.gz --vcfOutFormat z --outPrefix phasing-output && tabix -f phasing-output.vcf.gz",sep="")) + # out.log.cmd(paste("shapeit -V phasing-input.vcf -M ",map, + # " --input-ref ",hap," ",legend," ",sample, + # " --exclude-snp shapeit-check.snp.strand.exclude -O phasing-output ",add.args, + # " && shapeit -convert --input-haps phasing-output --output-vcf phasing-output.vcf",sep="")) + + out.log.cmd("cp phasing-input.vcf phasing-output.temp.vcf") + out.log.cmd(paste("sed 's#/#|#g' phasing-output.temp.vcf |", + "awk 'BEGIN {OFS=\"\t\"} {if(NF == 10){$7 = \"PASS\"; print $0} else {print $0}}' > phasing-output.vcf")) # just to make sure that the PASS is present as it should be in phasing-output.vcf + + # awk 'BEGIN {FS=OFS="\t"} {if(NF == 10) {$7 = "PASS"; print $0} else {print $0}}' + + out.log.cmd("bgzip phasing-output.vcf && tabix -f phasing-output.vcf.gz") + # does the exclusion of KGEN affect the phasing-output.vcf.gz? No, it does not } } @@ -132,7 +152,7 @@ split <- function(config,chromosome) { cum.reads <- cumsum(coverage[,4]) #assign regions to jobs targeting READS_TARGET - run.assignment <- ceiling(cum.reads/global$READS_TARGET) + run.assignment <- ceiling(cum.reads/global$READS_TARGET) # split the jobs based on the number of reads covering the sites in sites.bed. jobs <- base::split(coverage[,1:3],run.assignment) dir.create("jobs") for(j in seq_along(jobs)) { @@ -187,7 +207,7 @@ local.region.function <- function(config,work.dir) { linkage <- data.frame(RR=numeric(0),RV=numeric(0),VR=numeric(0),VV=numeric(0)) save(linkage,file=paste("linkage.rda",sep="")) return(0) - } + } # if no results were found (i.e no sites), then save an empty linkage.rda #Save work save(results,file=paste("results.rda",sep="")) @@ -285,11 +305,16 @@ local.region.function <- function(config,work.dir) { get_alt_counts() } + # print(config$phasing_software) + get_vcf_info <- function() { if(config$phasing_software == "eagle") { caf.col <- "%INFO/DBSNP_AF" } else if(config$phasing_software == "shapeit") { caf.col <- "%INFO/DBSNP_CAF" + } else if (config$phasing_software == "crossbred_mouse") { + #caf.col <- "%INFO/DBSNP_CAF" # the code behind the lack of pop.ref.freq, I think, is here + caf.col <- "%INFO/AF" # check with craig if this is valid # for a crossbred mouse, the AF of a standard heterozygous SNP in the whole population should be 0.5. Can't be more or less! } cmds <- c(paste("bcftools query -i 'TYPE=\"snp\" & N_ALT=1' -s ",config$sample," -R targets -f '",c("%CHROM;%POS;%REF;%ALT\\t%ID", ifelse(config$bulk,caf.col,"."), @@ -303,15 +328,43 @@ local.region.function <- function(config,work.dir) { " | awk '{print $1\"\\t\"$2-2\"\\t\"$2+1}' > .tmp5"),sep="",collapse=" && "), paste("bedtools getfasta -fi ",config$reference_file," -bed .tmp5 -fo - | grep -v '>' | tr 'actg' 'ACTG' > .tmp6",sep=""), "paste .tmp1 .tmp2 .tmp3 .tmp4 .tmp6 > .tmp7", - "rm .tmp1", - "rm .tmp2", - "rm .tmp3", - "rm .tmp4", - "rm .tmp5", - "rm .tmp6", "echo \"site\tid\tpop.ref.freq\tgenotype\tref\talt\tgq\tcontext\" > vcf-info.txt", "cat .tmp7 | sed -E 's#^([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t*$#\\1\\t\\2\\t\\3\\t\\4\\t\\5\\t\\6\\t0\\t\\7#g' >> vcf-info.txt", "rm .tmp7") + # when creating .tmp2, all entries seem to be "."'s and thus converted to 1. + # what does the .tmp2 for the BULK look like? the sc pop.ref.freq is set to the one calculated for bulk. That's important*** + ### I FOUND THE ERROR! + # the calls.id.vcf.gz file does NOT have a DBSNP_CAF entry in the info tab. For the crossbred mouse, AF is sufficent. + # vvv3@compute-a-16-164:~/parklab/parklab/splitseq_analysis/variant_calls_100cns/lira_calls/test2/bulk_splitseq_126cns/chr1/jobs/1$ bcftools query -i 'TYPE="snp" & N_ALT=1' -s 1 -R targets -f '%INFO/DBSNP_CAF\n' /n/data1/hms/dbmi/park/splitseq_analysis/variant_calls_100cns/lira_calls/test2/bulk_splitseq_126cns/chr1/calls.id.vcf.gz| tr ',' '\t' | cut -f 1 | sed 's#^[.]$#1#g' + # Error: no such tag defined in the VCF header: INFO/DBSNP_CAF + + # cmds <- c(paste("bcftools query -i 'TYPE=\"snp\" & N_ALT=1' -s ",config$sample," -R targets -f '",c("%CHROM;%POS;%REF;%ALT\\t%ID", + # ifelse(config$bulk,caf.col,"."), + # "[%GT]", + # "[%AD]\\t[%GQ]", + # "%CHROM\\t%POS"),"\\n' ",paste(config$analysis_path,"/",system("cat sites.bed | cut -f 1 | uniq",intern=T),"/",ifelse(config$bulk,"calls.id.vcf.gz","calls.vcf.gz"),sep=""), + # c("> .tmp1", + # "| tr ',' '\\t' | cut -f 1 | sed 's#^[.]$#1#g' > .tmp2", + # "> .tmp3", + # " | tr ',.' '\\t0' > .tmp4", + # " | awk '{print $1\"\\t\"$2-2\"\\t\"$2+1}' > .tmp5"),sep="",collapse=" && "), + # paste("bedtools getfasta -fi ",config$reference_file," -bed .tmp5 -fo - | grep -v '>' | tr 'actg' 'ACTG' > .tmp6",sep=""), + # "paste .tmp1 .tmp2 .tmp3 .tmp4 .tmp6 > .tmp7", + # "rm .tmp1", + # "rm .tmp2", + # "rm .tmp3", + # "rm .tmp4", + # "rm .tmp5", + # "rm .tmp6", + # "echo \"site\tid\tpop.ref.freq\tgenotype\tref\talt\tgq\tcontext\" > vcf-info.txt", + # "cat .tmp7 | sed -E 's#^([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t*$#\\1\\t\\2\\t\\3\\t\\4\\t\\5\\t\\6\\t0\\t\\7#g' >> vcf-info.txt", + # "rm .tmp7") + + # why is pop.ref.freq empty? For this cell, many of the targets do not have genotype information available. + # the pop.ref.freq field is 1 for all entries in single-cell. They get converted to the pop.ref.freq in the bulk sample. + # so, examine the pop.ref.freq in the bulk sample's in the bulk sample. + + # check the creation of calls.id.vcf.gz cmd <- paste(cmds,collapse=" && ") out.log.cmd(cmd) site.frame <- read.table("vcf-info.txt",sep="\t",header=T) @@ -340,7 +393,8 @@ local.region.function <- function(config,work.dir) { names(site.frame)[names(site.frame) == "single.cell.gq"] <- "bulk.gq" #load phasing dumb <- system(paste("bcftools query -R targets -f '%CHROM;%POS;%REF;%ALT\t[%GT]\n' ../../phasing/phasing-output.vcf.gz 2> /dev/null | grep -e '0|1' -e '1|0' > phasing.txt",sep="")) - phasing <- try(read.table("phasing.txt",sep="\t"),silent = T) + phasing <- try(read.table("phasing.txt",sep="\t"),silent = T) # is phasing.txt empty? # yes! it appears that phasing.txt has not even been read! #This has been fixed! + print(phasing) if(class(phasing) != "try-error") { vec <- numeric(nrow(phasing)) names(vec) <- phasing[,1] @@ -406,6 +460,7 @@ compare <- function(config, bulk.config, chromosome, overwrite, wait) { } } } + print(paste("Getting linkage information from",jobs.dir)) linkage <- do.call(rbind,lapply(paste(jobs.dir,"/",j,"/linkage.rda",sep=""),function(x){load(x); return(linkage)})) linkage$site1 <- gsub("([^~]*)~([^~]*)","\\1",rownames(linkage)) linkage$site2 <- gsub("([^~]*)~([^~]*)","\\2",rownames(linkage)) @@ -424,62 +479,108 @@ compare <- function(config, bulk.config, chromosome, overwrite, wait) { } #Load single cell info - out.log("Loading single cell info") + out.log("Loading single cell info 1") + print("sc 1") + #out.log("ctrl1") + #### NEED TO FIGURE OUT WHY THE JOBS EXIT AT THIS POINT + #print(config) + + out.log("Loading bulk info 1") + #print(config) + print("bulk 1") + #print(bulk.config) + obj <- process.local(config,paste("Jobs still undone from plink. See ",config$analysis_path,"/",chromosome,"/compare/log.txt",sep="")) + out.log(obj) + #out.log("ctrl2") single.cell.linkage <- obj$linkage + out.log(single.cell.linkage) + print("save single cell linkage") save(single.cell.linkage,file="single.cell.linkage.rda") single.cell.vcf.info <- obj$vcf.info + print("save single cell vcf info") save(single.cell.vcf.info,file="single.cell.vcf.info.rda") #Load bulk info out.log("Loading bulk info") + print("bulk config") obj <- process.local(bulk.config,paste("Bulk jobs still undone from plink. See ",config$analysis_path,"/",chromosome,"/compare/log.txt",sep="")) bulk.linkage <- obj$linkage + # print("save bulk linkage") + # print(head(bulk.linkage)) save(bulk.linkage,file="bulk.linkage.rda") bulk.vcf.info <- obj$vcf.info + # print("save bulk vcf") save(bulk.vcf.info,file="bulk.vcf.info.rda") #make a site-specific table + print("make a site-specific table") site.frame <- single.cell.vcf.info site.frame$id <- bulk.vcf.info$id - site.frame$pop.ref.freq <- bulk.vcf.info$pop.ref.freq + site.frame$pop.ref.freq <- bulk.vcf.info$pop.ref.freq # this line is important!!! #pull bulk info + print("pull bulk info---------") site.frame[,"bulk.gt"] <- bulk.vcf.info$bulk.gt site.frame[,"bulk.gq"] <- bulk.vcf.info$bulk.gq site.frame[,"bulk.ref"] <- bulk.vcf.info$bulk.ref site.frame[,"bulk.alt"] <- bulk.vcf.info$bulk.alt site.frame[,"bulk.phasing"] <- bulk.vcf.info$bulk.phasing - site.frame$bulk.phasing[site.frame$bulk.phasing == 0] <- NA + site.frame$bulk.phasing[site.frame$bulk.phasing == 0] <- NA + #only look for linkage near bulk het sites in 1KG #somatic candidates: bulk.gt == "0/0" or bulk.gt == "./." + print("only look for linkage near bulk het sites in 1KG") site.frame$in.linkage <- rownames(site.frame) %in% c(single.cell.linkage$site1,single.cell.linkage$site2) - site.frame$onek.bulk.het <- (site.frame$pop.ref.freq != 1) & (site.frame$bulk.gt == "0/1") + site.frame$onek.bulk.het <- (site.frame$pop.ref.freq != 1) & (site.frame$bulk.gt == "0/1") # (site.frame$bulk.gt == "0/1") # this step removes all of the one.bulk.het sites + # removing the pop.ref.freq requirement lets the script go to completion. Why? + print("values of pop.ref.freq") + print(table(site.frame$pop.ref.freq)) + print("bulk genotypes at the single-cell sites") + print(table(site.frame$bulk.gt)) + print("is the site heterozygous in the bulk? Based on OneK") + print(table(site.frame$onek.bulk.het)) site.frame$somatic <- (site.frame$bulk.gt == "0/0")|(site.frame$bulk.gt == "./.") - onek.bulk.sites <- rownames(site.frame)[site.frame$onek.bulk.het] + onek.bulk.sites <- rownames(site.frame)[site.frame$onek.bulk.het] # this turns up empty #ERROR! # fix this issue or at least find out why all of the one.bulk.het's are false + print("onekbulk sites") + print(head(onek.bulk.sites)) somatic.sites <- rownames(site.frame)[site.frame$somatic] + print("somatic.sites") + print(head(somatic.sites)) single.cell.linkage <- single.cell.linkage[((single.cell.linkage$site1 %in% c(onek.bulk.sites,somatic.sites))&(single.cell.linkage$site2 %in% c(onek.bulk.sites,somatic.sites))),] + print("single.cell.linkage") + print(head(single.cell.linkage)) + print("sclinkage") single.cell.linkage <- single.cell.linkage[single.cell.linkage$pair_id %in% bulk.linkage$pair_id,] rownames(single.cell.linkage) <- single.cell.linkage$pair_id single.cell.linkage$pair_id <- NULL + print(head(single.cell.linkage)) rownames(bulk.linkage) <- bulk.linkage$pair_id bulk.linkage<- bulk.linkage[rownames(single.cell.linkage),] bulk.linkage$pair_id <- NULL + print("bl") + print(head(bulk.linkage)) + print("bulklinkage") bulk.linkage$phased.orientation <- "" ind <- site.frame[bulk.linkage$site1,"bulk.phasing"] == site.frame[bulk.linkage$site2,"bulk.phasing"] ind1 <- ind2 <- ind ind1[is.na(ind1)] <- F bulk.linkage$phased.orientation[ind1] <- "cis" - + print(head(site.frame)) + # why are all of the bulk.phasing entries at "NA"? + # all of the pop.ref.freq are at 1! + + print("trans") ind2 <- !ind2 ind2[is.na(ind2)] <- F bulk.linkage$phased.orientation[ind2] <- "trans" + print(bulk.linkage) # this fills out fine haplotype.parser <- function(ind, type, site, orientation, hc.bulk.function, hc.single.cell.function, dc.single.cell.function,flag.functions=NULL) { bulk.tmp <- bulk.linkage[ind,] @@ -504,12 +605,17 @@ compare <- function(config, bulk.config, chromosome, overwrite, wait) { } #onek.bulk to onek.bulk + print("site.frame") tmp <- site.frame[bulk.linkage$site1,"onek.bulk.het"] & site.frame[bulk.linkage$site2,"onek.bulk.het"] site.frame$bulk.onek.bulk.linked <- F site.frame$bulk.onek.bulk.linked[rownames(site.frame) %in% c(bulk.linkage$site1[tmp],bulk.linkage$site2[tmp])] <- T flag.functions <- list() + # this command is determining if any of the single-cell variants are linked in the bulk sample + # the site.frame is from the single-cell vcf info ('vcf-info.txt') + #trans, site 1 + print("trans 1") flag.functions[["bad_bulk_haplotypes"]] <- function(b,s) {(b$RR != 0) | (b$VV != 0)} flag.functions[["sc_dropout"]] <- function(b,s){s$VR == 0} flag.functions[["bad_phasing"]] <- function(b,s){b$phased.orientation == "cis"} @@ -521,8 +627,10 @@ compare <- function(config, bulk.config, chromosome, overwrite, wait) { hc.single.cell.function=function(b,s){s$VR}, dc.single.cell.function=function(b,s){s$RR + s$VV},#should be just s$RR? flag.functions=flag.functions) + print(onek.bulk.site1.trans.info) #trans, site 2 + print("trans 2") flag.functions[["sc_dropout"]] <- function(b,s){s$RV == 0} onek.bulk.site2.trans.info <- haplotype.parser(ind=with(bulk.linkage,RV > VV) & tmp, type="1KG", @@ -534,6 +642,7 @@ compare <- function(config, bulk.config, chromosome, overwrite, wait) { flag.functions=flag.functions) #cis, site 1 + print("cis 1") flag.functions[["bad_bulk_haplotypes"]] <- function(b,s) {(b$RV != 0) | (b$VR != 0)} flag.functions[["sc_dropout"]] <- function(b,s) {s$VV == 0} flag.functions[["bad_phasing"]] <- function(b,s) {b$phased.orientation == "trans"} @@ -547,6 +656,7 @@ compare <- function(config, bulk.config, chromosome, overwrite, wait) { flag.functions=flag.functions) #cis, site 2 + print("cis 2") onek.bulk.site2.cis.info <- haplotype.parser(ind=with(bulk.linkage,VV > RV) & tmp, type="1KG", site=2, @@ -555,16 +665,26 @@ compare <- function(config, bulk.config, chromosome, overwrite, wait) { hc.single.cell.function=function(b,s){s$VV}, dc.single.cell.function=function(b,s){s$RV + s$VR},#should be just s$RV? flag.functions=flag.functions) - + + print("onek") combined.onek <- rbind(onek.bulk.site1.cis.info,onek.bulk.site2.cis.info,onek.bulk.site1.trans.info,onek.bulk.site2.trans.info) - flags <- sapply(base::split(combined.onek$flags,combined.onek$site),function(x){paste(x,collapse=", ")}) + #print(combined.onek) # combined.onek is empty + # print(combined.onek$flags,combined.onek$site) #these are empty + flags <- sapply(base::split(combined.onek$flags,combined.onek$site),function(x){paste(x,collapse=", ")}) # this list is empty, and it is throwing the error + print("site.frames") + ###The compare issue interferes with runtime below here + print(flags) site.frame[names(flags),"onek.flags"] <- flags + print(site.frame) combined.onek$flags <- NULL #site 1 somatic, site 2 onek.bulk + print("tmp") tmp <- (site.frame[single.cell.linkage$site1,"somatic"]) & (site.frame[single.cell.linkage$site2,"onek.bulk.het"]) & with(bulk.linkage,(VR == 0) & (VV == 0)) + ###The compare issue interferes with runtime above here #trans + print("trans site2 somatic") somatic.site1.trans.info <- haplotype.parser(ind=with(single.cell.linkage,VR > VV) & tmp, type="somatic", site=1, @@ -573,6 +693,7 @@ compare <- function(config, bulk.config, chromosome, overwrite, wait) { hc.single.cell.function=function(b,s){s$VR}, dc.single.cell.function=function(b,s){s$RR}) #cis + print("cis site2 somatic") somatic.site1.cis.info <- haplotype.parser(ind=with(single.cell.linkage,VV > VR) & tmp, type="somatic", site=1, @@ -585,6 +706,7 @@ compare <- function(config, bulk.config, chromosome, overwrite, wait) { tmp <- (site.frame[single.cell.linkage$site1,"onek.bulk.het"]) & (site.frame[single.cell.linkage$site2,"somatic"]) & with(bulk.linkage,(RV == 0) & (VV == 0)) #trans + print("trans site2 somatic") somatic.site2.trans.info <- haplotype.parser(ind=with(single.cell.linkage,RV > VV) & tmp, type="somatic", site=2, @@ -593,6 +715,7 @@ compare <- function(config, bulk.config, chromosome, overwrite, wait) { hc.single.cell.function=function(b,s){s$RV}, dc.single.cell.function=function(b,s){s$RR}) #cis + print("cis site2 somatic") somatic.site2.cis.info <- haplotype.parser(ind=with(single.cell.linkage,VV > RV) & tmp, type="somatic", site=2, diff --git a/scripts/linkage.py b/scripts/linkage.py index 7a73e86..365edc3 100755 --- a/scripts/linkage.py +++ b/scripts/linkage.py @@ -20,14 +20,24 @@ with open(args.bed) as bed: reader = csv.reader(bed, delimiter="\t") sites = list(reader) - + for site in sites: start = int(site[1]) end = int(site[2]) + # print(start) # this prints pileup = bamfile.pileup(site[0],start,end,stepper="all",max_depth=500000) for pileupColumn in pileup: for pileupRead in pileupColumn.pileups: - if (pileupColumn.pos >= start) and (pileupColumn.pos < end) and (not pileupRead.is_refskip) and (pileupRead.alignment.mapping_quality == 60) and (pileupRead.alignment.is_proper_pair): + + # print(pileupRead.is_refskip) + # print(pileupRead.alignment.is_proper_pair) + #print(pileupRead.alignment.mapping_quality) + # print('----------') + + #print(pileupColumn.pos >= start,pileupColumn.pos < end,pileupRead.is_refskip) + + if (pileupColumn.pos >= start) and (pileupColumn.pos < end) and (not pileupRead.is_refskip) and (pileupRead.alignment.mapping_quality > 0): #and (pileupRead.alignment.mapping_quality == 60): #and (pileupRead.alignment.is_proper_pair): + #print(pileupRead.alignment.cigartuples) if(len(pileupRead.alignment.cigartuples) == 1): if(pileupRead.is_del): base = "*" diff --git a/scripts/main.R b/scripts/main.R index 0277082..3b38fd3 100644 --- a/scripts/main.R +++ b/scripts/main.R @@ -129,6 +129,7 @@ if(cmd == "plink") { scripts <- scripts[!ind] } tot <- length(scripts) + print(tot) cmds <- paste(getwd(),"/job_scripts/",scripts,sep="") batches <- batcher(cmds,batch.size) if(tot > 0) { diff --git a/scripts/utils.R b/scripts/utils.R index 480a81b..a196708 100644 --- a/scripts/utils.R +++ b/scripts/utils.R @@ -86,6 +86,11 @@ get.chromosomes <- function(config) { if(config$gender == "female") { chromosomes <- c(chromosomes,"X") } + } else if(config$reference_identifier == "mm10") { + chromosomes <- paste("chr",1:19,sep="") + if(config$gender == "female") { + chromosomes <- c(chromosomes,"chrX") + } } else { stop("Cannot get chromosomes.") }