Merge branch 'master' of https://github.com/calico/basenji
ok
davek44 committed Sep 19, 2017
2 parents 918492b + 159c684 commit 344e02a
Showing 16 changed files with 353 additions and 556 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -37,7 +37,7 @@ To verify the install, launch python and run
---------------------------------------------------------------------------------------------------
### Documentation

- At this stage, Basenji is something in between personal research code and accessible software for wide use. The primary challenge is uncertainty what the best role for this type of toolkit is going to be in functional genomics and statistical genetics. The computational requirements don't make it easy either. Thus, this package is under active development, and I encourage anyone to get in touch to relate your experience and request clarifications or additional features, documentation, or tutorials.
+ At this stage, Basenji is something in between personal research code and accessible software for wide use. The primary challenge is uncertainty in what the best role for this type of toolkit is going to be in functional genomics and statistical genetics. The computational requirements don't make it easy either. Thus, this package is under active development, and I encourage anyone to get in touch to relate your experience and request clarifications or additional features, documentation, or tutorials.

- [Preprocess](docs/preprocess.md)
- [bam_cov.py](docs/preprocess.md#bam_cov)
@@ -71,4 +71,4 @@ These are a work in progress, so forgive incompleteness for the moment. If there
- [Test gene expression predictions.](tutorials/genes.ipynb)
- Study
- [Execute an in silico saturated mutagenesis](tutorials/sat_mut.ipynb)
- - [Compute SNP Activity Difference (SAD) and Expression Difference (SED) scores.](tutorials/sad.ipynb)
+ - [Compute SNP Activity Difference (SAD) and Expression Difference (SED) scores.](tutorials/sad.ipynb)
60 changes: 38 additions & 22 deletions bin/bam_cov.py
@@ -73,17 +73,18 @@ def main():
usage = 'usage: %prog [options] <bam_file> <output_file>'
parser = OptionParser(usage)
parser.add_option('-a', dest='adaptive_max', default=8, type='int', help='Maximum coverage at a single position, adaptively-determined [Default: %default]')
- parser.add_option('-c', dest='cut_bias_kmer', default=None, action='store_true', help='Normalize coverage for a cutting bias model for k-mers [Default: %default]')
+ parser.add_option('-b', dest='cut_bias_kmer', default=None, action='store_true', help='Normalize coverage for a cutting bias model for k-mers [Default: %default]')
+ parser.add_option('-c', dest='shift_center', default=False, action='store_true', help='Shift the event to the fragment center, learning the distribution for single end reads [Default: %default]')
parser.add_option('-d', dest='duplicate_max', default=None, type='int', help='Maximum coverage at a single position for initial clipping [Default: %default]')
parser.add_option('-f', dest='fasta_file', default='%s/assembly/hg19.fa'%os.environ['HG19'], help='FASTA to obtain sequence to control for GC% [Default: %default]')
parser.add_option('-g', dest='gc', default=False, action='store_true', help='Control for local GC% [Default: %default]')
parser.add_option('-m', dest='multi_em', default=0, type='int', help='Iterations of EM to distribute multi-mapping reads [Default: %default]')
parser.add_option('--multi_max', dest='multi_max', default=1, type='int', help='Maximum coverage at a single position from multi-mapping reads [Default: %default]')
parser.add_option('-o', dest='out_dir', default='bam_cov', help='Output directory [Default: %default]')
- parser.add_option('-r', dest='strand_corr_shift', default=False, action='store_true', help='Learn the fragment shift by fitting a strand cross correlation model [Default: %default]')
parser.add_option('-s', dest='smooth_sd', default=32, type='float', help='Gaussian standard deviation to smooth coverage estimates with [Default: %default]')
- parser.add_option('-t', dest='shift', default=0, type='int', help='Fragment shift [Default: %default]')
parser.add_option('-u', dest='unsorted', default=False, action='store_true', help='Alignments are unsorted [Default: %default]')
+ parser.add_option('-v', dest='shift_forward_end', default=0, type='int', help='Fragment shift for forward end read [Default: %default]')
+ parser.add_option('-w', dest='shift_reverse_end', default=0, type='int', help='Fragment shift for reverse end read [Default: %default]')
(options,args) = parser.parse_args()

if len(args) != 2:
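The remapping above resolves a flag collision: `-c` previously pointed at `cut_bias_kmer`, which moves to `-b`, freeing `-c` for the new `shift_center` behavior, while `-v` and `-w` expose fixed per-strand shifts. A minimal sketch of how the reworked flags parse (file names hypothetical):

```python
from optparse import OptionParser

# Reduced sketch of the parser above: only the remapped flags are shown.
parser = OptionParser('usage: %prog [options] <bam_file> <output_file>')
parser.add_option('-b', dest='cut_bias_kmer', default=None, action='store_true',
                  help='Normalize coverage for a cutting bias model for k-mers')
parser.add_option('-c', dest='shift_center', default=False, action='store_true',
                  help='Shift the event to the fragment center')
parser.add_option('-v', dest='shift_forward_end', default=0, type='int',
                  help='Fragment shift for forward end read')
parser.add_option('-w', dest='shift_reverse_end', default=0, type='int',
                  help='Fragment shift for reverse end read')

options, args = parser.parse_args(['-c', 'aln.bam', 'out.h5'])
assert options.shift_center and options.cut_bias_kmer is None
```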
Expand All @@ -110,10 +111,17 @@ def main():
options.duplicate_max *= 2

# initialize
- genome_coverage = GenomeCoverage(chrom_lengths, smooth_sd=options.smooth_sd, duplicate_max=options.duplicate_max, multi_max=options.multi_max, shift=options.shift, fasta_file=options.fasta_file)
+ genome_coverage = GenomeCoverage(chrom_lengths,
+                                  smooth_sd=options.smooth_sd,
+                                  duplicate_max=options.duplicate_max,
+                                  multi_max=options.multi_max,
+                                  shift_center=options.shift_center,
+                                  shift_forward=options.shift_forward_end,
+                                  shift_reverse=options.shift_reverse_end,
+                                  fasta_file=options.fasta_file)

# estimate fragment shift
- if options.strand_corr_shift:
+ if options.shift_center:
if sp == 'single':
genome_coverage.learn_shift_single(bam_file, out_dir=options.out_dir)
else:
@@ -366,7 +374,7 @@ class GenomeCoverage:
shift (int): Alignment shift to maximize cross-strand coverage correlation
'''

- def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2, multi_max=1, shift=0, fasta_file=None):
+ def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2, multi_max=1, shift_center=False, shift_forward=0, shift_reverse=0, fasta_file=None):
self.chrom_lengths = chrom_lengths
self.genome_length = sum(chrom_lengths.values())
self.unique_counts = np.zeros(self.genome_length, dtype='uint16')
@@ -375,7 +383,10 @@ def __init__(self, chrom_lengths, smooth_sd=32, adaptive_max=8, duplicate_max=2,
self.smooth_sd = smooth_sd
self.duplicate_max = duplicate_max
self.multi_max = multi_max
- self.shift = shift
+
+ self.shift_center = shift_center
+ self.shift_forward = shift_forward
+ self.shift_reverse = shift_reverse

self.adaptive_max = adaptive_max
self.adaptive_cdf = .05
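The constructor keeps unique_counts as one flat array across all chromosomes. A hedged sketch of the indexing this implies; the cumulative-offset scheme here is an assumption, and genome_index below is the real method:

```python
import numpy as np

# Toy chromosome lengths; reference_id order matches the list below.
chrom_lengths = {'chr1': 1000, 'chr2': 800}
chroms = list(chrom_lengths)
offsets = np.cumsum([0] + [chrom_lengths[c] for c in chroms[:-1]])

def genome_index(reference_id, chrom_pos):
    """Map (chromosome, position) to an index into one flat coverage array."""
    return offsets[reference_id] + chrom_pos

coverage = np.zeros(sum(chrom_lengths.values()), dtype='uint16')
coverage[genome_index(1, 5)] += 1   # a read landing at chr2:5
```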
@@ -849,14 +860,15 @@ def learn_shift_pair(self, bam_file):
template_lengths.append(abs(align.template_length))

# compute mean
- self.shift = int(np.round(np.mean(template_lengths) / 2))
+ self.shift_forward = int(np.round(np.mean(template_lengths) / 2))

print(' Done in %ds.' % (time.time()-t0))
- print('Shift: %d' % self.shift, flush=True)
+ print('Shift: %d' % self.shift_forward, flush=True)
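So for paired-end data the learned shift is just half the mean absolute template length; a toy check of that arithmetic (values invented):

```python
import numpy as np

template_lengths = [180, 200, 220, -190]   # signed, as pysam reports TLEN
shift = int(np.round(np.mean(np.abs(template_lengths)) / 2))
print(shift)   # 99: each strand's events move halfway toward the fragment center
```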


def learn_shift_single(self, bam_file, shift_min=50, shift_max=350, out_dir=None):
- ''' Learn the optimal fragment shift that maximizes across strand correlation '''
+ ''' Learn the optimal fragment shift that maximizes across strand correlation
+ (to be applied for single end discordant alignments.) '''

t0 = time.time()
print('Learning shift from single-end sequences.', end='', flush=True)
@@ -928,10 +940,10 @@ def learn_shift_single(self, bam_file, shift_min=50, shift_max=350, out_dir=None
strand_corrs_smooth = gaussian_filter1d(strand_corrs, sigma=12, truncate=3)

# find max
- self.shift = (shift_min + np.argmax(strand_corrs_smooth)) // 2
+ self.shift_forward = (shift_min + np.argmax(strand_corrs_smooth)) // 2

print(' Done in %ds.' % (time.time()-t0))
- print('Shift: %d' % self.shift, flush=True)
+ print('Shift: %d' % self.shift_forward, flush=True)

if out_dir is not None:
# plot training fit
@@ -940,7 +952,7 @@ def learn_shift_single(self, bam_file, shift_min=50, shift_max=350, out_dir=None
# print table
shift_out = open('%s/shift_table.txt' % out_dir, 'w')
for si in range(len(shifts)):
- print(shifts[si], strand_corrs[si], strand_corrs_smooth[si], '*'*int(2*self.shift==shifts[si]), file=shift_out)
+ print(shifts[si], strand_corrs[si], strand_corrs_smooth[si], '*'*int(2*self.shift_forward==shifts[si]), file=shift_out)
shift_out.close()
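Putting the single-end procedure together, a hedged sketch assuming per-strand coverage arrays are already in hand (fwd_cov and rev_cov are hypothetical inputs): scan candidate fragment lengths, correlate the two strands at each offset, smooth as above, and keep half of the best offset.

```python
import numpy as np
from scipy.ndimage import gaussian_filter1d

def learn_shift(fwd_cov, rev_cov, shift_min=50, shift_max=350):
    """Half the fragment length that best aligns + and - strand coverage."""
    shifts = np.arange(shift_min, shift_max + 1)
    corrs = np.array([np.corrcoef(fwd_cov[:-s], rev_cov[s:])[0, 1]
                      for s in shifts])
    smooth = gaussian_filter1d(corrs, sigma=12, truncate=3)
    best_fragment = int(shifts[np.argmax(smooth)])
    return best_fragment // 2
```

On synthetic data where reverse-strand coverage lags forward-strand coverage by about 200 bp, this returns roughly 100.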


@@ -966,22 +978,26 @@ def read_bam(self, bam_file, genome_sorted=False):
if not align.is_unmapped:
read_id = (align.query_name, align.is_read1)

- # determine shift
- if self.shift == 0:
- # don't shift anyone
- align_shift = 0
- else:
+ if self.shift_center:
if align.is_proper_pair:
# shift proper pairs according to mate
- align_shift = abs(align.template_length) // 2
+ align_shift_forward = abs(align.template_length) // 2
else:
# shift others by estimated amount
- align_shift = self.shift
+ align_shift_forward = self.shift_forward
+
+ # match reverse to forward
+ align_shift_reverse = align_shift_forward
+
+ else:
+ # apply user-specific shifts
+ align_shift_forward = self.shift_forward
+ align_shift_reverse = self.shift_reverse

# determine alignment position
- chrom_pos = align.reference_start + align_shift
+ chrom_pos = align.reference_start + align_shift_forward
if align.is_reverse:
- chrom_pos = align.reference_end - align_shift
+ chrom_pos = align.reference_end - align_shift_reverse

# determine genome index
gi = self.genome_index(align.reference_id, chrom_pos)
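A toy check of the centering branch above (coordinates invented): under -c, a proper pair with a 200 bp fragment maps both mates' events to the same fragment midpoint.

```python
tlen = 200                  # fragment (template) length
fwd_start = 1000            # forward mate's reference_start
rev_end = fwd_start + tlen  # reverse mate's reference_end
shift = abs(tlen) // 2
# forward: start + shift; reverse: end - shift; both hit the midpoint
assert fwd_start + shift == rev_end - shift == 1100
```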
8 changes: 4 additions & 4 deletions bin/basenji_sad.py
@@ -143,10 +143,10 @@ def main():
target_labels = []
for line in open(options.targets_file):
a = line.strip().split('\t')
- target_ids.append(a[1])
+ target_ids.append(a[0])
target_labels.append(a[2])

- header_cols = ('rsid', 'index', 'score', 'ref', 'alt', 'ref_pred', 'alt pred', 'sad', 'target_index', 'target_id', 'target_label')
+ header_cols = ('rsid', 'index', 'score', 'ref', 'alt', 'ref_upred', 'alt_upred', 'usad', 'usar', 'ref_xpred', 'alt_xpred', 'xsad', 'xsar', 'target_index', 'target_id', 'target_label')
if options.csv:
sad_out = open('%s/sad_table.csv' % options.out_dir, 'w')
print(','.join(header_cols), file=sad_out)
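The a[1] to a[0] fix matches a tab-delimited targets file whose first column is the identifier and whose third is the label, as in tutorials/data/gm12878_wigs.txt added below. A sketch of the expected row layout (path abbreviated):

```python
# One row per target: identifier <TAB> coverage HDF5 path <TAB> label.
row = 'ENCSR000EJD_3_1\tcovse/ENCSR000EJD_3_1.h5\tDNASE:GM12878'
a = row.strip().split('\t')
target_id, target_label = a[0], a[2]   # was a[1]: off by one column
```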
@@ -238,10 +238,10 @@ def main():
# profile the max difference position
max_li = 0
max_sad = alt_preds[max_li,ti] - ref_preds[max_li,ti]
- max_sar = np.log2(alt_preds[max_li,ti]) - np.log2(ref_preds[max_li,ti])
+ max_sar = np.log2(alt_preds[max_li,ti]+1) - np.log2(ref_preds[max_li,ti]+1)
for li in range(ref_preds.shape[0]):
sad_li = alt_preds[li,ti] - ref_preds[li,ti]
- sar_li = np.log2(alt_preds[li,ti]) - np.log2(ref_preds[li,ti])
+ sar_li = np.log2(alt_preds[li,ti]+1) - np.log2(ref_preds[li,ti]+1)
# if abs(sad_li) > abs(max_sad):
if abs(sar_li) > abs(max_sar):
max_li = li
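The +1 pseudocount added above keeps the log ratio finite when a prediction is zero; without it, log2(0) = -inf would always win the max-magnitude search. A toy illustration (values invented):

```python
import numpy as np

ref_pred, alt_pred = 0.0, 0.5
sad = alt_pred - ref_pred                            # raw-scale difference
sar = np.log2(alt_pred + 1) - np.log2(ref_pred + 1)  # finite log2 ratio
assert np.isfinite(sar)
```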
3 changes: 3 additions & 0 deletions tutorials/.gitignore
@@ -1 +1,4 @@
.ipynb_checkpoints
+ data/gm12878_test
+ pim1_sat
+ rfx6_sad
39 changes: 39 additions & 0 deletions tutorials/data/gm12878_wigs.txt
@@ -0,0 +1,39 @@
ENCSR000EJD_3_1 ../../../data/human/dnase/encode_fastq/covse/ENCSR000EJD_3_1.h5 DNASE:GM12878
ENCSR000EMT_2_1 ../../../data/human/dnase/encode_fastq/covse/ENCSR000EMT_2_1.h5 DNASE:GM12878
ENCSR000EMT_1_1 ../../../data/human/dnase/encode_fastq/covse/ENCSR000EMT_1_1.h5 DNASE:GM12878
ENCSR000EJD_1_1 ../../../data/human/dnase/encode_fastq/covse/ENCSR000EJD_1_1.h5 DNASE:GM12878
ENCSR000EJD_2_1 ../../../data/human/dnase/encode_fastq/covse/ENCSR000EJD_2_1.h5 DNASE:GM12878
ENCSR057BWO_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR057BWO_2_1.h5 HISTONE:H3K4me3 GM12878
ENCSR000AKE_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKE_1_1.h5 HISTONE:H3K36me3 GM12878
ENCSR000AKF_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKF_2_1.h5 HISTONE:H3K4me1 GM12878
ENCSR000AOV_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AOV_2_1.h5 HISTONE:H2AFZ GM12878
ENCSR000AKI_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKI_2_1.h5 HISTONE:H4K20me1 GM12878
ENCSR000AKA_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKA_2_1.h5 HISTONE:H3K4me3 GM12878
ENCSR000AOX_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AOX_2_1.h5 HISTONE:H3K9me3 GM12878
ENCSR000DRW_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000DRW_1_1.h5 HISTONE:H3K36me3 GM12878
ENCSR000AOW_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AOW_1_1.h5 HISTONE:H3K79me2 GM12878
ENCSR000AKD_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKD_1_1.h5 HISTONE:H3K27me3 GM12878
ENCSR000AKE_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKE_2_1.h5 HISTONE:H3K36me3 GM12878
ENCSR000DRW_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000DRW_2_1.h5 HISTONE:H3K36me3 GM12878
ENCSR000AKA_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKA_1_1.h5 HISTONE:H3K4me3 GM12878
ENCSR000AKH_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKH_1_1.h5 HISTONE:H3K9ac GM12878
ENCSR000AOX_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AOX_1_1.h5 HISTONE:H3K9me3 GM12878
ENCSR000AKG_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKG_1_1.h5 HISTONE:H3K4me2 GM12878
ENCSR057BWO_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR057BWO_1_1.h5 HISTONE:H3K4me3 GM12878
ENCSR000DRY_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000DRY_2_1.h5 HISTONE:H3K4me3 GM12878
ENCSR000DRY_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000DRY_1_1.h5 HISTONE:H3K4me3 GM12878
ENCSR000DRX_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000DRX_2_1.h5 HISTONE:H3K27me3 GM12878
ENCSR000AKG_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKG_2_1.h5 HISTONE:H3K4me2 GM12878
ENCSR000AKD_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKD_2_1.h5 HISTONE:H3K27me3 GM12878
ENCSR000AKC_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKC_2_1.h5 HISTONE:H3K27ac GM12878
ENCSR000AOX_3_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AOX_3_1.h5 HISTONE:H3K9me3 GM12878
ENCSR000DRX_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000DRX_1_1.h5 HISTONE:H3K27me3 GM12878
ENCSR000AKF_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKF_1_1.h5 HISTONE:H3K4me1 GM12878
ENCSR000AOW_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AOW_2_1.h5 HISTONE:H3K79me2 GM12878
ENCSR000AKI_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKI_1_1.h5 HISTONE:H4K20me1 GM12878
ENCSR000AKH_2_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKH_2_1.h5 HISTONE:H3K9ac GM12878
ENCSR000AKC_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AKC_1_1.h5 HISTONE:H3K27ac GM12878
ENCSR000AOV_1_1 ../../../data/human/histone/encode_fastq/covse/ENCSR000AOV_1_1.h5 HISTONE:H2AFZ GM12878
CNhs12332 ../../../data/human/cage/fantom/coverage/CNhs12332.h5 CAGE:B lymphoblastoid cell line: GM12878 ENCODE, biol_rep2
CNhs12333 ../../../data/human/cage/fantom/coverage/CNhs12333.h5 CAGE:B lymphoblastoid cell line: GM12878 ENCODE, biol_rep3
CNhs12331 ../../../data/human/cage/fantom/coverage/CNhs12331.h5 CAGE:B lymphoblastoid cell line: GM12878 ENCODE, biol_rep1
1 change: 1 addition & 0 deletions tutorials/data/pim1.bed
@@ -0,0 +1 @@
chr6 37006697 37268841
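A note on the interval above: BED coordinates are 0-based and half-open, so the row spans exactly 262,144 = 2^18 bases around PIM1, presumably matching the tutorial model's input window; a quick check:

```python
start, end = 37006697, 37268841
assert end - start == 262144 == 2**18   # power-of-two window around PIM1
```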
2 changes: 2 additions & 0 deletions tutorials/data/pim1.fa

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions tutorials/data/rs339331.vcf
@@ -0,0 +1,2 @@
##fileformat=VCFv4.0
chr6 117210052 rs339331 T C 100
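For reference, a minimal sketch of pulling the fields basenji_sad.py needs from a VCF row like the one above; the hand parsing here is illustrative, not the script's actual reader:

```python
# Columns: CHROM POS ID REF ALT QUAL (tab-delimited per VCF 4.0).
row = 'chr6\t117210052\trs339331\tT\tC\t100'
chrom, pos, rsid, ref, alt = row.split('\t')[:5]
pos = int(pos)   # 1-based position per the VCF spec
```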
Binary file added tutorials/models/gm12878.tf.data-00000-of-00001
Binary file not shown.
Binary file added tutorials/models/gm12878.tf.index
Binary file not shown.
Binary file added tutorials/models/gm12878.tf.meta
Binary file not shown.
54 changes: 0 additions & 54 deletions tutorials/models/params_large.txt

This file was deleted.
