parklab · vinayakvsv · Jun 20, 2018 · Jun 20, 2018 · Jun 21, 2018 · Jun 23, 2018
diff --git a/README.md b/README.md
@@ -12,6 +12,7 @@ The following include tested versions in parenthesis when applicable; later vers
 1. python (2.7.12)
 	+ argparse
 	+ os
+	+ pysam
 2. samtools (1.2)
 3. bcftools (1.2)
 4. htslib (1.2.1)

diff --git a/global-config.human.txt b/global-config.human.txt
@@ -0,0 +1,38 @@
+#Blank lines and lines beginning with '#' are ignored.
+#Each setting is written on its own line as "<KEY>	<VALUE>" (key and value separated by a tab).
+
+#Path to snpEff installation
+SNPEFF	/n/data1/hms/dbmi/park/cbohrson/installed/local/bin/snpEff
+
+#Path to DBSNP database
+DBSNP	/n/data1/hms/dbmi/park/cbohrson/common_all_20160601.vcf.gz
+
+#Path to downloaded 1000 genomes haplotype reference panel
+KGEN	/n/data1/hms/dbmi/park/simon_chu/projects/data/1000G
+
+#Path to picard JAR file
+PICARD	/n/data1/hms/dbmi/park/cbohrson/installed/local/bin/picard.jar
+
+#Gap between SNPs required for regions to be chunked separately
+GAP_REQUIREMENT	2000
+
+#Target number of reads per LiRA job
+READS_TARGET	100000
+
+#Partition to submit to for parallelization
+PARTITION	short
+
+#Batch size for parallelization (number of scripts per job)
+BATCH_SIZE	10
+
+#Method to use for creating sSNV control curve of composite coverage vs. genome-wide sSNV rate.  Options are 'germline' (use germline sSNVs) or 'general' (use all powered sites).  'general' is not yet implemented.
+CONTROL_METHOD	germline
+
+#For 10X, the max distance between two SNPs to be considered within a barcode family
+MAX_DISTANCE_10X	50000
+
+#Script in $LIRA_DIR/scripts to use for parallelization
+PARALLEL_SCRIPT	slurm.R
+
+#Number of sampling replicates to use for CONTROL_METHOD
+BOOTSTRAP_REPLICATES	100
diff --git a/global-config.txt b/global-config.txt
@@ -5,10 +5,11 @@
 SNPEFF	/n/data1/hms/dbmi/park/cbohrson/installed/local/bin/snpEff
 
 #Path to DBSNP database
-DBSNP	/n/data1/hms/dbmi/park/cbohrson/common_all_20160601.vcf.gz
+#DBSNP	/n/data1/hms/dbmi/park/cbohrson/common_all_20160601.vcf.gz
+DBSNP	/n/data1/hms/dbmi/park/splitseq_analysis/vcf_contig_rename/mgp.v5.merged.snps_all.dbSNP142.contig_rename_2.vcf.gz
 
 #Path to downloaded 1000 genomes haplotype reference panel
-KGEN	/n/data1/hms/dbmi/park/simon_chu/projects/data/1000G
+#KGEN	/n/data1/hms/dbmi/park/simon_chu/projects/data/1000G
 
 #Path to picard JAR file
 PICARD	/n/data1/hms/dbmi/park/cbohrson/installed/local/bin/picard.jar
@@ -36,3 +37,5 @@ PARALLEL_SCRIPT	slurm.R
 
 #Number of sampling replicates to use for CONTROL_METHOD
 BOOTSTRAP_REPLICATES	100
+
+reference_identifier	mm10
diff --git a/scripts/functions.R b/scripts/functions.R
diff --git a/scripts/linkage.py b/scripts/linkage.py
@@ -20,14 +20,24 @@
 with open(args.bed) as bed:
 	reader = csv.reader(bed, delimiter="\t")
 	sites = list(reader)
-
+	
 for site in sites:
 	start = int(site[1])
 	end = int(site[2])
+	# print(start) # this prints
 	pileup = bamfile.pileup(site[0],start,end,stepper="all",max_depth=500000)
 	for pileupColumn in pileup:
 		for pileupRead in pileupColumn.pileups:
-			if (pileupColumn.pos >= start) and (pileupColumn.pos < end) and (not pileupRead.is_refskip) and (pileupRead.alignment.mapping_quality == 60) and (pileupRead.alignment.is_proper_pair):
+
+			# print(pileupRead.is_refskip)
+			# print(pileupRead.alignment.is_proper_pair)
+			#print(pileupRead.alignment.mapping_quality)
+			# print('----------')
+
+			#print(pileupColumn.pos >= start,pileupColumn.pos < end,pileupRead.is_refskip)
+
+			if (pileupColumn.pos >= start) and (pileupColumn.pos < end) and (not pileupRead.is_refskip) and (pileupRead.alignment.mapping_quality > 0): #and (pileupRead.alignment.mapping_quality == 60): #and (pileupRead.alignment.is_proper_pair):
+				#print(pileupRead.alignment.cigartuples)
 				if(len(pileupRead.alignment.cigartuples) == 1):
 					if(pileupRead.is_del):
 						base = "*"

diff --git a/scripts/main.R b/scripts/main.R
@@ -129,6 +129,7 @@ if(cmd == "plink") {
     scripts <- scripts[!ind]
   }
   tot <- length(scripts)
+  print(tot)
   cmds <- paste(getwd(),"/job_scripts/",scripts,sep="")
   batches <- batcher(cmds,batch.size)
   if(tot > 0) {

diff --git a/scripts/utils.R b/scripts/utils.R
@@ -86,6 +86,11 @@ get.chromosomes <- function(config) {
     if(config$gender == "female") {
       chromosomes <- c(chromosomes,"X")
     }
+  } else if(config$reference_identifier == "mm10") {
+    chromosomes <- paste("chr",1:19,sep="")
+    if(config$gender == "female") {
+      chromosomes <- c(chromosomes,"chrX")
+    }
   } else {
     stop("Cannot get chromosomes.")
   }