-
Notifications
You must be signed in to change notification settings - Fork 72
/
cyvcf2.pyx
2370 lines (2047 loc) · 83.3 KB
/
cyvcf2.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#cython: profile=False
#cython:language_level=3
#cython: embedsignature=True
from __future__ import print_function
import os
import sys
from collections import defaultdict
import atexit
import tempfile
import numpy as np
from array import array
import math
import ctypes
try:
from pathlib import Path
except ImportError:
from pathlib2 import Path # python 2 backport
from libc cimport stdlib
cimport numpy as np
np.seterr(invalid='ignore')
np.import_array()
import locale
ENC = locale.getpreferredencoding()
from cython cimport view
from cpython.version cimport PY_MAJOR_VERSION
# overcome lack of __file__ in cython
import inspect
if not hasattr(sys.modules[__name__], '__file__'):
__file__ = inspect.getfile(inspect.currentframe())
def par_relatedness(vcf_path, samples, ncpus, sites, min_depth=5, each=1):
    """Compute pairwise relatedness in parallel across `ncpus` workers.

    Each worker (see _par_relatedness) processes an interleaved slice of
    `sites` and writes its partial ibs/n/hets arrays to a temporary .npz
    file; the partials are summed here and handed to
    VCF._relatedness_finish for the final per-pair summary dict.
    """
    from multiprocessing import Pool
    cdef np.ndarray aibs, an, ahets
    p = Pool(ncpus)
    try:
        for i, fname in enumerate(p.imap_unordered(_par_relatedness, [
            (vcf_path, samples, min_depth, i, ncpus, each, sites) for i in range(ncpus)])):
            arrays = np.load(fname)
            os.unlink(fname)
            if i == 0:
                aibs, an, ahets = arrays['ibs'], arrays['n'], arrays['hets']
            else:
                aibs += arrays['ibs']
                an += arrays['n']
                ahets += arrays['hets']
    finally:
        # reap the worker processes; the original leaked the pool.
        p.close()
        p.join()
    return VCF(vcf_path, samples=samples)._relatedness_finish(aibs, an, ahets)
def par_het(vcf_path, samples, ncpus, qsites, min_depth=8, percentiles=(10, 90),
            int each=1, int offset=0):
    """Run het_check across `ncpus` worker processes and merge the results.

    Workers (see _par_het) each scan an interleaved slice of `qsites`;
    their partial counts and allele-balance lists are merged here and
    finished by VCF._finish_het.
    """
    from multiprocessing import Pool
    p = Pool(ncpus)
    any_counts, sum_counts, het_counts = 0, 0, 0
    all_gt_types, mean_depths, sites = [], [], []
    maf_lists = defaultdict(list)
    try:
        for ret in p.imap_unordered(_par_het, [(vcf_path, samples, qsites, min_depth, i, ncpus, each) for i in range(ncpus)]):
            (mean_depths_, maf_lists_, het_counts_, sum_counts_,
             all_gt_types_, sites_, any_counts_) = ret
            mean_depths.extend(mean_depths_)
            any_counts += any_counts_
            sites.extend(sites_)
            all_gt_types.extend(all_gt_types_)
            sum_counts += sum_counts_  # an array
            het_counts += het_counts_  # an array
            for k in maf_lists_:
                maf_lists[k].extend(maf_lists_[k])
    finally:
        # reap the worker processes; the original leaked the pool.
        p.close()
        p.join()
    mean_depths = np.array(mean_depths, dtype=np.int32).T
    vcf = VCF(vcf_path, samples=samples, gts012=True)
    return vcf._finish_het(mean_depths, maf_lists,
                           percentiles,
                           het_counts, sum_counts, all_gt_types,
                           sites, any_counts)
def _par_het(args):
    # Worker for par_het: unpack the pickled argument tuple and run the
    # single-process het check over an interleaved slice of the sites
    # (stride = each * ncpus, starting at `offset`).
    (path, sample_list, qsites, depth_cutoff, offset, ncpus, each) = args
    vcf = VCF(path, samples=sample_list, gts012=True)
    return vcf.het_check(min_depth=depth_cutoff, sites=qsites,
                         each=each * ncpus, offset=offset, _finish=False)
def _par_relatedness(args):
    """Worker for par_relatedness: compute partial ibs/n/hets arrays over an
    interleaved slice of the sites and return the path of an .npz file
    holding them."""
    vcf_path, samples, min_depth, offset, ncpus, each, sites = args
    vcf = VCF(vcf_path, samples=samples, gts012=True)
    each = each * ncpus
    vibs, vn, vhet = vcf._site_relatedness(min_depth=min_depth, offset=offset, each=each, sites=sites)
    # to get around limits of multiprocessing size of transmitted data, we save
    # the arrays to disk and return the file.
    # mkstemp (unlike the race-prone, deprecated mktemp the original used)
    # creates the file atomically; close the inherited descriptor since
    # np.savez_compressed reopens the path by name.
    fd, fname = tempfile.mkstemp(suffix=".npz")
    os.close(fd)
    # the parent unlinks the file after loading it, so the atexit fallback
    # must tolerate the file already being gone.
    def _cleanup(path=fname):
        try:
            os.unlink(path)
        except OSError:
            pass
    atexit.register(_cleanup)
    np.savez_compressed(fname, ibs=np.asarray(vibs), hets=np.asarray(vhet), n=np.asarray(vn))
    return fname
cdef unicode xstr(s):
    # Coerce `s` to unicode, with a fast path for the common case.
    if type(s) is unicode:
        # fast path for most common case(s)
        return <unicode>s
    elif PY_MAJOR_VERSION < 3 and isinstance(s, bytes):
        # only accept byte strings in Python 2.x, not in Py3
        return (<bytes>s).decode('ascii')
    elif isinstance(s, unicode):
        # an evil cast to <unicode> might work here in some(!) cases,
        # depending on what the further processing does. to be safe,
        # we can always create a copy instead
        return unicode(s)
    else:
        # the original raised TypeError(...) -- literally the Ellipsis
        # object -- which produced an unhelpful message.
        raise TypeError("cannot convert %s to unicode" % type(s))
def r_(int32_t[::view.contiguous] a_gts, int32_t[::view.contiguous] b_gts, float f, int32_t n_samples):
    # Python-visible wrapper over the C r_unphased kernel.
    # Assumes both genotype vectors have at least n_samples entries --
    # TODO confirm callers always guarantee this.
    return r_unphased(&a_gts[0], &b_gts[0], f, n_samples)
cdef set_constants(VCF v):
    # Install the genotype-code constants on the VCF instance.
    # HOM_REF/HET are fixed; the gts012 flag decides whether HOM_ALT=2 /
    # UNKNOWN=3 (gts012) or the two codes are swapped (legacy encoding).
    v.HOM_REF = 0
    v.HET = 1
    if not v.gts012:
        v.UNKNOWN = 2
        v.HOM_ALT = 3
    else:
        v.HOM_ALT = 2
        v.UNKNOWN = 3
cdef class HTSFile:
    """Owns an htslib `htsFile` handle together with the name and mode it
    was opened with; base class providing open/close plumbing."""
    cdef htsFile *hts       # underlying htslib handle; NULL once closed
    cdef bytes fname        # file name (bytes), or None for fd/file-object sources
    cdef bytes mode         # mode string handed to hts_open
    cdef bint from_path     # True when opened from str/Path (close() owns the handle)
    cdef _open_htsfile(self, fname, mode):
        """Opens an htsfile for reading or writing.
        Parameters
        ----------
        fname: str
            filename (str or Path), file descriptor (int), or file-like object (has fileno method).
        mode: str
            the mode to pass to hts_open.
        """
        cdef hFILE *hf
        self.mode = to_bytes(mode)
        reading = b"r" in self.mode
        writing = b"w" in self.mode
        if not reading and not writing:
            raise IOError("No 'r' or 'w' in mode %s" % str(self.mode))
        self.from_path = isinstance(fname, (basestring, Path))
        if self.from_path:
            self.fname = to_bytes(str(fname))
            # "-" is shorthand for stdin (reading) / stdout (writing).
            if self.fname == b"-":
                self.fname = to_bytes(b"/dev/stdin") if reading else to_bytes(b"/dev/stdout")
            self.hts = hts_open(self.fname, self.mode)
        # from a file descriptor
        elif isinstance(fname, int):
            hf = hdopen(int(fname), self.mode)
            self.hts = hts_hopen(hf, "<file>", self.mode)
            self.fname = None
        # reading from a File object or other object with fileno
        elif hasattr(fname, "fileno"):
            if fname.closed:
                raise IOError('I/O operation on closed file')
            hf = hdopen(fname.fileno(), self.mode)
            self.hts = hts_hopen(hf, "<file>", self.mode)
            # .name can be TextIOWrapper
            try:
                self.fname = to_bytes(fname.name)
            except AttributeError:
                self.fname = None
        else:
            raise IOError("Cannot open '%s' for writing." % str(type(fname)))
        if self.hts == NULL:
            raise IOError("Error opening %s" % str(fname))
        # sanity-check the container format htslib detected against the mode.
        if reading:
            if self.hts.format.format != vcf and self.hts.format.format != bcf:
                raise IOError(
                    "%s is not valid bcf or vcf (format: %s mode: %s)" % (fname, self.hts.format.format, mode)
                )
        else:
            if self.hts.format.format != text_format and self.hts.format.format != binary_format:
                raise IOError(
                    "%s is not valid text_format or binary_format (format: %s mode: %s)" % (fname, self.hts.format.format, mode)
                )
    def close(self):
        # NOTE(review): only path-opened handles get hts_close'd; handles
        # built from a caller's fd or file object are dropped without
        # closing -- presumably to leave the caller's descriptor usable,
        # but confirm this is not leaking the hFILE wrapper.
        if self.hts != NULL:
            if self.from_path:
                hts_close(self.hts)
            self.hts = NULL
cdef class VCF(HTSFile):
    """
    VCF class holds methods to iterate over and query a VCF.
    Parameters
    ----------
    fname: str
        path to file
    gts012: bool
        if True, then gt_types will be 0=HOM_REF, 1=HET, 2=HOM_ALT, 3=UNKNOWN. If False, 3, 2 are flipped.
    lazy: bool
        if True, then don't unpack (parse) the underlying record until needed.
    strict_gt: bool
        if True, then any '.' present in a genotype will classify the corresponding element in the gt_types array as UNKNOWN.
    samples: list
        list of samples to extract from full set in file.
    threads: int
        the number of threads to use including this reader.
    Returns
    -------
    VCF object for iterating and querying.
    """
    cdef const bcf_hdr_t *hdr     # parsed header; freed in __dealloc__
    cdef tbx_t *idx               # tabix (.tbi) index, loaded lazily
    cdef hts_idx_t *hidx          # csi index for bcf files, loaded lazily
    cdef int n_samples            # sample count after any subsetting
    cdef int PASS                 # sentinel, -1 until set (use established outside this chunk)
    cdef bint gts012              # genotype-code layout flag (see set_constants)
    cdef bint lazy                # defer per-record unpacking
    cdef bint strict_gt           # treat any '.' allele as UNKNOWN
    cdef list _seqnames           # cached result of the seqnames property
    cdef list _seqlens            # cached result of the seqlens property
    # holds a lookup of format field -> type.
    cdef dict format_types
    #: The constant used to indicate the the genotype is HOM_REF.
    cdef readonly int HOM_REF
    #: The constant used to indicate the the genotype is HET.
    cdef readonly int HET
    #: The constant used to indicate the the genotype is HOM_ALT.
    cdef readonly int HOM_ALT
    #: The constant used to indicate the the genotype is UNKNOWN.
    cdef readonly int UNKNOWN
def __init__(self, fname, mode="r", gts012=False, lazy=False, strict_gt=False, samples=None, threads=None):
    # Open the file, read the header, then apply any sample subset before
    # caching the sample count.
    cdef bcf_hdr_t *hdr
    self._open_htsfile(fname, mode)
    hdr = self.hdr = bcf_hdr_read(self.hts)
    # NOTE(review): no NULL check on the header here; a truncated or
    # non-VCF payload would only surface later -- confirm intent.
    if samples is not None:
        self.set_samples(samples)
    self.n_samples = bcf_hdr_nsamples(self.hdr)
    self.PASS = -1  # sentinel; resolved elsewhere (not visible in this chunk)
    self.gts012 = gts012
    self.lazy = lazy
    self.strict_gt = strict_gt
    self._seqnames = []
    self._seqlens = []
    set_constants(self)
    self.format_types = {}
    if threads is not None:
        self.set_threads(threads)
def set_threads(self, int n):
    """Use `n` threads (including this reader) for the underlying htsFile."""
    cdef int ret = hts_set_threads(self.hts, n)
    if ret < 0:
        raise Exception("error setting number of threads: %d" % ret)
cdef get_type(self, fmt):
    # Cached lookup of a FORMAT field's declared Type from the header;
    # the first query for a field populates self.format_types.
    key = from_bytes(fmt)
    if key not in self.format_types:
        hdr_rec = self.get_header_type(key, order=[BCF_HL_FMT])
        self.format_types[key] = hdr_rec["Type"]
    return from_bytes(self.format_types[key])
def add_to_header(self, line):
    """Add a new line to the VCF header.
    Parameters
    ----------
    line: str
        full vcf header line.
    """
    ret = bcf_hdr_append(self.hdr, to_bytes(line))
    if ret != 0:
        # the original never interpolated `line`, so the message showed
        # a literal '%s'.
        raise Exception("couldn't add '%s' to header" % line)
    ret = bcf_hdr_sync(self.hdr)
    if ret != 0:
        raise Exception("couldn't add '%s' to header" % line)
    if line.startswith("##contig"):
        # need to trigger a refresh of seqnames
        self._seqnames = []
    return ret
def add_info_to_header(self, adict):
    """Add a INFO line to the VCF header.
    Parameters
    ----------
    adict: dict
        dict containing keys for ID, Number, Type, Description.
    """
    tmpl = "##INFO=<ID={ID},Number={Number},Type={Type},Description=\"{Description}\">"
    return self.add_to_header(tmpl.format(**adict))
def add_format_to_header(self, adict):
    """Add a FORMAT line to the VCF header.
    Parameters
    ----------
    adict: dict
        dict containing keys for ID, Number, Type, Description.
    """
    tmpl = "##FORMAT=<ID={ID},Number={Number},Type={Type},Description=\"{Description}\">"
    return self.add_to_header(tmpl.format(**adict))
def add_filter_to_header(self, adict):
    """Add a FILTER line to the VCF header.
    Parameters
    ----------
    adict: dict
        dict containing keys for ID, Description.
    """
    tmpl = "##FILTER=<ID={ID},Description=\"{Description}\">"
    return self.add_to_header(tmpl.format(**adict))
def set_samples(self, samples):
    """Set the samples to be pulled from the VCF; this must be called before any iteration.
    Parameters
    ----------
    samples: list
        list of samples to extract.
    """
    if samples is None:
        samples = "-".encode()  # htslib convention: "-" means all samples
    if isinstance(samples, list):
        samples = to_bytes(",".join(samples))
    else:
        samples = to_bytes(samples)
    ret = bcf_hdr_set_samples(self.hdr, <const char *>samples, 0)
    assert ret >= 0, ("error setting samples", ret)
    # `samples` is bytes at this point, so compare against bytes; the
    # original compared against the str "-", which is never equal on
    # python 3 and so the warning path ran even for the all-samples case.
    if ret != 0 and samples != b"-":
        s = from_bytes(samples).split(",")
        if ret < len(s):
            sys.stderr.write("warning: not all requested samples found in VCF\n")
    self.n_samples = bcf_hdr_nsamples(self.hdr)
def update(self, id, type, number, description):
    """Update the header with an INFO field of the given parameters.
    Parameters
    ----------
    id: str
        ID
    type: str
        valid VCF type
    number: str
        valid VCF number
    description: str
        description of added line.
    """
    line = "##INFO=<ID={id},Number={number},Type={type},Description=\"{description}\">".format(
        id=id, type=type, number=number, description=description)
    # encode before handing to htslib, as every other header call here does.
    ret = bcf_hdr_append(self.hdr, to_bytes(line))
    if ret != 0:
        # the original used a comma instead of %-formatting, so the
        # message was a tuple and the code never rendered.
        raise Exception("unable to update to header: %d" % ret)
    ret = bcf_hdr_sync(self.hdr)
    if ret != 0:
        raise Exception("unable to update to header")
def set_index(self, index_path=""):
    """Attach the index at `index_path` (.tbi tabix or .csi) for region queries."""
    if index_path.endswith(".tbi"):
        self.idx = tbx_index_load2(to_bytes(self.fname), to_bytes(index_path))
        if self.idx != NULL:
            return
    # not a (loadable) tabix index: try hts/.csi, then tabix once more.
    self.hidx = hts_idx_load2(to_bytes(self.fname), to_bytes(index_path))
    if self.hidx == NULL:
        self.idx = tbx_index_load2(to_bytes(self.fname), to_bytes(index_path))
    if self.hidx == NULL and self.idx == NULL:
        raise OSError("unable to open index:'%s' for '%s'" % (index_path, self.fname))
def _bcf_region(VCF self, region):
    """Yield Variants overlapping `region` of a bcf via the .csi index."""
    if self.hidx == NULL:
        self.hidx = bcf_index_load(self.fname)
    assert self.hidx != NULL, ("error loading .csi index for %s" % self.fname)
    cdef bcf1_t *b
    cdef int ret
    cdef hts_itr_t *itr
    itr = bcf_itr_querys(self.hidx, self.hdr, to_bytes(region))
    if itr == NULL:
        sys.stderr.write("no intervals found for %s at %s\n" % (self.fname, region))
        # PEP 479: `raise StopIteration` inside a generator (as the
        # original did) becomes a RuntimeError under language_level=3;
        # a plain return ends the iteration cleanly.
        return
    try:
        while True:
            b = bcf_init()
            ret = bcf_itr_next(self.hts, itr, b)
            if ret < 0:
                bcf_destroy(b)
                break
            if bcf_subset_format(self.hdr, b) != 0:
                sys.stderr.write("could not subset variant")
                bcf_destroy(b)
                break
            yield newVariant(b, self)
    finally:
        if itr != NULL:
            hts_itr_destroy(itr)
def __call__(VCF self, region=None):
    """
    Extract the region from the VCF.
    Parameters
    ----------
    region: str
        region string like chr1:1234-34566 or 'chr7
    Returns
    -------
    An Iterator over the requested region.
    """
    # PEP 479: `raise StopIteration` inside a generator (as the original
    # did in three places below) becomes a RuntimeError under
    # language_level=3; use `return` to end the generator instead.
    if not region:
        yield from self
        return
    if self.fname.decode(ENC).endswith(('.bcf', '.bcf.gz')):
        yield from self._bcf_region(region)
        return
    if self.idx == NULL:
        self.idx = tbx_index_load(to_bytes(self.fname))
        assert self.idx != NULL, "error loading tabix index for %s" % self.fname
    cdef hts_itr_t *itr
    cdef kstring_t s
    cdef bcf1_t *b
    cdef int slen = 1, ret = 0
    cdef bytes bregion = to_bytes(region)
    cdef char *cregion = bregion
    # initialize the kstring so the finally-block free() is safe even if
    # the first tbx_itr_next never fills it (free(NULL) is a no-op).
    s.s, s.l, s.m = NULL, 0, 0
    with nogil:
        itr = tbx_itr_querys(self.idx, cregion)
    if itr == NULL:
        sys.stderr.write("no intervals found for %s at %s\n" % (self.fname, region))
        return
    try:
        while 1:
            with nogil:
                slen = tbx_itr_next(self.hts, self.idx, itr, &s)
                if slen > 0:
                    b = bcf_init()
                    ret = vcf_parse(&s, self.hdr, b)
            if slen <= 0:
                break
            if ret > 0:
                bcf_destroy(b)
                # s.s and itr are released by the finally block; freeing
                # them here as well (as the original did) double-freed.
                raise Exception("error parsing")
            yield newVariant(b, self)
    finally:
        stdlib.free(s.s)
        hts_itr_destroy(itr)
def header_iter(self):
    """
    Iterate over fields in the HEADER
    """
    cdef int i
    # one HREC wrapper per raw header record.
    for i in range(self.hdr.nhrec):
        yield newHREC(self.hdr.hrec[i], self.hdr)
def ibd(self, int nmax=-1):
    """Accumulate per-pair IBD run statistics across the file.

    Iterates at most `nmax` variants (-1 == all) and feeds each sample
    pair's genotypes to the C `ibd` kernel, which maintains a running
    length per pair and a 16-bin histogram.

    Returns a dict mapping (sample_a, sample_b) -> int32 bin array.
    """
    assert self.gts012, "ibd() requires the VCF to be opened with gts012=True"
    import itertools
    cdef int i, rl, n_bins = 16
    samples = self.samples
    # map each sample to its column once; the original called
    # samples.index(s) per sample, which is O(n**2) overall.
    sample_to_idx = {s: i for i, s in enumerate(samples)}
    sample_pairs = list(itertools.combinations(samples, 2))
    # values of bins, run_length
    cdef int n = 0
    cdef float pi
    cdef int[:] b
    cdef int[:] gts
    cdef int idx0, idx1
    bins = np.zeros((len(sample_pairs), n_bins), dtype=np.int32)
    rls = np.zeros(len(sample_pairs), dtype=np.int32)
    for v in self:
        if n == nmax: break
        n += 1
        gts = v.gt_types
        pi = v.aaf
        for i, (s0, s1) in enumerate(sample_pairs):
            b = bins[i, :]
            idx0, idx1 = sample_to_idx[s0], sample_to_idx[s1]
            rls[i] = ibd(gts[idx0], gts[idx1], rls[i], pi, &b[0], n_bins)
    return {sample_pairs[i]: bins[i, :] for i in range(len(sample_pairs))}
# pull something out of the HEADER, e.g. CSQ
cpdef get_header_type(self, key, order=[BCF_HL_INFO, BCF_HL_FMT]):
    """Extract a field from the VCF header by id.
    Parameters
    ----------
    key: str
        ID to pull from the header.
    order: list
        the record categories (BCF_HL_*) to search, in priority order.
        NOTE: the shared list default is never mutated here.
    Returns
    -------
    rec: dict
        dictionary containing header information.
    """
    key = to_bytes(key)
    # initialize to NULL: the original left `b` uninitialized, so an
    # empty `order` made the NULL checks below read a garbage pointer.
    cdef bcf_hrec_t *b = NULL
    cdef int i
    for typ in order:
        b = bcf_hdr_get_hrec(self.hdr, typ, b"ID", key, NULL)
        if b != NULL:
            break
    if b == NULL:
        # fall back to generic (##key=value) header lines.
        b = bcf_hdr_get_hrec(self.hdr, BCF_HL_GEN, key, NULL, NULL)
        if b == NULL:
            raise KeyError(key)
        d = {from_bytes(b.key): from_bytes(b.value)}
    else:
        d = {from_bytes(b.keys[i]): from_bytes(b.vals[i]) for i in range(b.nkeys)}
    #bcf_hrec_destroy(b)
    return d
def __getitem__(self, key):
    # dict-style access to header records; raises KeyError when absent.
    return self.get_header_type(key)
def __contains__(self, key):
    """Check if the given ID is in the header."""
    # EAFP: reuse the __getitem__ lookup and translate KeyError to False.
    try:
        self[key]
        return True
    except KeyError:
        return False
contains = __contains__  # backwards-compatible method alias
def __dealloc__(self):
    # free the header before closing the file handle, then drop any
    # loaded indexes.
    if self.hts != NULL and self.hdr != NULL:
        bcf_hdr_destroy(self.hdr)
        self.hdr = NULL
    self.close()
    if self.idx != NULL:
        tbx_destroy(self.idx)
    if self.hidx != NULL:
        hts_idx_destroy(self.hidx)
def __iter__(self):
    # the VCF object is its own iterator; records come from __next__.
    return self
def __next__(self):
    """Return the next Variant, or raise StopIteration at end-of-file."""
    cdef bcf1_t *b
    cdef int ret, errcode
    if self.hts == NULL:
        raise Exception("attempt to iterate over closed/invalid VCF")
    with nogil:
        b = bcf_init()
        ret = bcf_read(self.hts, self.hdr, b)
    if ret >= 0 or b.errcode == BCF_ERR_CTG_UNDEF:
        # records whose contig is missing from the header are tolerated
        # and still handed back to the caller.
        return newVariant(b, self)
    # capture the error code *before* freeing the record; the original
    # read b.errcode after bcf_destroy(b) (use-after-free).
    errcode = b.errcode
    bcf_destroy(b)
    if ret == -1:  # end-of-file
        raise StopIteration
    raise Exception("error parsing variant with `htslib::bcf_read` error-code: %d" % (errcode))
property samples:
    "list of samples pulled from the VCF."
    def __get__(self):
        cdef int i
        # decode the C sample-name array into python strings.
        return [str(self.hdr.samples[i].decode('utf-8')) for i in range(self.n_samples)]
property raw_header:
    "string of the raw header from the VCF"
    def __get__(self):
        cdef kstring_t s
        s.s, s.l, s.m = NULL, 0, 0
        bcf_hdr_format(self.hdr, 0, &s)
        try:
            return from_bytes(s.s)
        finally:
            # bcf_hdr_format allocates into s.s; the original never
            # freed it, leaking the whole header text per access.
            stdlib.free(s.s)
property seqlens:
    # lengths of the contigs declared in the header, parallel to seqnames.
    def __get__(self):
        # cached after the first successful lookup.
        if len(self._seqlens) > 0: return self._seqlens
        cdef int32_t nseq;
        cdef int32_t* sls = bcf_hdr_seqlen(self.hdr, &nseq)
        if sls == NULL or nseq <= 0:
            raise AttributeError("no sequence lengths found in header")
        self._seqlens = [sls[i] for i in range(nseq)]
        # only the array itself is ours to free.
        stdlib.free(sls)
        return self._seqlens
property seqnames:
    "list of chromosomes in the VCF"
    def __get__(self):
        # cached after the first successful lookup.
        if len(self._seqnames) > 0: return self._seqnames
        cdef char **cnames
        cdef int i, n = 0
        cnames = bcf_hdr_seqnames(self.hdr, &n)
        # header had no contig lines: fall back to the index.
        if n == 0 and self.fname.decode(ENC).endswith(('.bcf', '.bcf.gz')):
            if self.hidx == NULL:
                self.hidx = bcf_index_load(self.fname)
            if self.hidx != NULL:
                cnames = bcf_index_seqnames(self.hidx, self.hdr, &n)
        elif n == 0:
            if self.idx == NULL:
                self.idx = tbx_index_load(to_bytes(self.fname))
            if self.idx !=NULL:
                cnames = tbx_seqnames(self.idx, &n)
        self._seqnames = [cnames[i].decode() for i in range(n)]
        # free only the array; the strings belong to the header/index.
        stdlib.free(cnames)
        return self._seqnames
def plot_relatedness(self, riter):
    """Scatter + histogram plot of rows from a relatedness iterator.

    Requires pandas, matplotlib and seaborn. Returns the figure.
    """
    import pandas as pd
    from matplotlib import pyplot as plt
    from matplotlib import gridspec
    import seaborn as sns
    sns.set_style("ticks")
    df = []
    for row in riter:
        row['jtags'] = '|'.join(row['tags'])
        df.append(row)
    df = pd.DataFrame(df)
    fig = plt.figure(figsize=(9, 9))
    gs = gridspec.GridSpec(2, 1, height_ratios=[3.5, 1])
    ax0, ax1 = plt.subplot(gs[0]), plt.subplot(gs[1])
    if "error" in df.columns:
        # plot all gray except points that don't match our expectation.
        import matplotlib
        matplotlib.rcParams['pdf.fonttype'] = 42  # embed TrueType fonts in pdf output
        import matplotlib.colors as mc
        colors = [mc.hex2color(h) for h in ('#b6b6b6', '#ff3333')]
        for i, err in enumerate(("ok", "error")):
            subset = df[df.error == err]
            subset.plot(kind='scatter', x='rel', y='ibs0', c=colors[i],
                        edgecolor=colors[0],
                        label=err, ax=ax0, s=17 if i == 0 else 35)
        # label the mismatching pairs directly on the plot.
        sub = df[df.error == "error"]
        for i, row in sub.iterrows():
            ax0.annotate(row['sample_a'] + "\n" + row['sample_b'],
                         (row['rel'], row['ibs0']), fontsize=8)
    else:
        # color by the relation derived from the genotypes.
        colors = sns.color_palette("Set1", len(set(df.jtags)))
        for i, tag in enumerate(set(df.jtags)):
            subset = df[df.jtags == tag]
            subset.plot(kind='scatter', x='rel', y='ibs0', c=colors[i],
                        label=tag, ax=ax0)
    ax0.legend()
    # matplotlib >= 3.3 removed the ymin=/xmin= keywords in favor of
    # bottom=/left=; support both vintages.
    try:
        ax0.set_ylim(ymin=0)
        ax0.set_xlim(xmin=df.rel.min())
    except TypeError:
        ax0.set_ylim(bottom=0)
        ax0.set_xlim(left=df.rel.min())
    ax1.set_xlim(*ax0.get_xlim())
    ax1.hist(df.rel, 40)
    # `nonposy` was likewise removed in matplotlib >= 3.3.
    try:
        ax1.set_yscale('log', nonposy='clip')
    except (TypeError, ValueError):
        ax1.set_yscale('log', nonpositive='clip')
    return fig
def gen_variants(self, sites,
                 offset=0, each=1, call_rate=0.8):
    """Yield (index, Variant) pairs.

    When `sites` is given (a filename of chrom:pos[:ref:alt] lines, or a
    pre-parsed list), each listed site is looked up by region query.
    Otherwise, well-spaced, well-covered biallelic SNPs are sampled from
    the file (at most 20000 candidates), taking every `each`-th starting
    at `offset`.
    """
    seqnames = set(self.seqnames)
    if sites is not None:
        if isinstance(sites, basestring):
            isites = []
            # close the sites file when done (the original leaked the handle).
            with open(sites) as fh:
                for i in (x.strip().split(":") for x in fh):
                    # add a 'chr' prefix when that is how this VCF names contigs.
                    if not i[0] in seqnames and 'chr' + i[0] in seqnames:
                        i[0] = 'chr' + i[0]
                    i[1] = int(i[1])
                    isites.append(i)
        else:
            isites = sites
            for i in isites:
                if not i[0] in seqnames and 'chr' + i[0] in seqnames:
                    i[0] = 'chr' + i[0]
    cdef Variant v
    cdef int k, last_pos
    if sites:
        isites = isites[offset::each]
        for i, osite in enumerate(isites):
            # reset per site so a chrom:pos-only entry does not inherit
            # the ref/alt of a preceding chrom:pos:ref:alt entry (the
            # original carried them over between iterations).
            ref, alt = None, None
            if len(osite) >= 4:
                chrom, pos, ref, alt = osite[:4]
            else:
                chrom, pos = osite[:2]
            for v in self("%s:%s-%s" % (chrom, pos, pos)):
                if len(v.ALT) != 1: continue
                if ref is not None and v.REF != ref: continue
                if alt is not None and v.ALT[0] != alt: continue
                if v.call_rate < call_rate: continue
                yield i, v
                break
    else:
        last_pos, k = -10000, 0
        for v in self:
            # spacing, biallelic-SNP, call-rate, frequency and depth filters.
            if abs(v.POS - last_pos) < 5000: continue
            if len(v.REF) != 1: continue
            if len(v.ALT) != 1: continue
            if v.call_rate < 0.5: continue
            if not 0.03 < v.aaf < 0.6: continue
            if np.mean(v.gt_depths > 7) < 0.5: continue
            last_pos = v.POS
            if k >= offset and k % each == 0:
                yield k, v
            k += 1
            if k > 20000: break
def het_check(self, min_depth=8, percentiles=(10, 90), _finish=True,
              int each=1, int offset=0,
              sites=None):
    """Collect per-sample heterozygosity statistics over (a subset of) sites:
    het counts, allele-balance lists for well-covered het calls, depth
    sums, and the genotype matrix of the sites used.

    When _finish is True, returns the summary from _finish_het; otherwise
    returns the raw accumulators so parallel callers (par_het) can merge.
    """
    cdef int i, k, n_samples = len(self.samples)
    cdef Variant v
    cdef np.ndarray het_counts = np.zeros((n_samples,), dtype=np.int32)
    cdef np.ndarray sum_depths = np.zeros((n_samples,), dtype=np.int32)
    cdef np.ndarray sum_counts = np.zeros((n_samples,), dtype=np.int32)
    cdef int any_counts = 0
    # keep the sites and gts that we used for PCA
    used_sites, all_gt_types = [], []
    mean_depths = []
    maf_lists = defaultdict(list)
    idxs = np.arange(n_samples)
    for i, v in self.gen_variants(sites, each=each, offset=offset):
        # stop once X is reached -- NOTE(review): presumably to keep the
        # stats autosomal (assumes sorted, X-last-ish input); confirm.
        if v.CHROM in ('X', 'chrX'): break
        if v.aaf < 0.01: continue
        if v.call_rate < 0.5: continue
        used_sites.append("%s:%d:%s:%s" % (v.CHROM, v.start + 1, v.REF, v.ALT[0]))
        alts = v.gt_alt_depths
        assert len(alts) == n_samples
        depths = (alts + v.gt_ref_depths).astype(np.int32)
        sum_depths += depths
        sum_counts += (depths > min_depth)
        any_counts += 1
        mean_depths.append(depths)
        # per-sample alt-allele balance at this site.
        mafs = alts / depths.astype(float)
        gt_types = v.gt_types
        hets = gt_types == 1
        het_counts[hets] += 1
        # record allele balance only for het calls above the depth cutoff.
        for k in idxs[hets]:
            if depths[k] <= min_depth: continue
            maf_lists[k].append(mafs[k])
        all_gt_types.append(np.array(gt_types, dtype=np.uint8))
    if _finish:
        mean_depths = np.array(mean_depths, dtype=np.int32).T
        return self._finish_het(mean_depths, maf_lists,
                                percentiles,
                                het_counts, sum_counts, all_gt_types,
                                used_sites, any_counts)
    return (mean_depths, maf_lists, het_counts, sum_counts,
            all_gt_types, used_sites, any_counts)
def _finish_het(self, mean_depths, maf_lists, percentiles, het_counts,
                sum_counts, all_gt_types, sites, any_counts):
    """Summarize the het_check accumulators into a per-sample dict of
    allele-balance percentiles, het ratios and depth statistics.

    Returns (sample_ranges, sites, transposed genotype matrix).
    """
    sample_ranges = {}
    for i, sample in enumerate(self.samples):
        # [0] placeholder keeps np.percentile happy for samples with no data.
        qs = np.asarray(np.percentile(maf_lists[i] or [0], percentiles))
        sample_ranges[sample] = dict(zip(['p' + str(p) for p in percentiles], qs))
        sample_ranges[sample]['range'] = qs.max() - qs.min()
        sample_ranges[sample]['het_ratio'] = het_counts[i] / float(any_counts)
        sample_ranges[sample]['het_count'] = het_counts[i]
        sample_ranges[sample]['sampled_sites'] = sum_counts[i]
        sample_ranges[sample]['mean_depth'] = np.mean(mean_depths[i])
        sample_ranges[sample]['median_depth'] = np.median(mean_depths[i])
    # used for the peddy paper.
    if os.environ.get('PEDDY_MAF_DUMP'):
        path = os.environ['PEDDY_MAF_DUMP']
        # the original imported cPickle, which does not exist on python 3,
        # and leaked the dump file handle.
        try:
            import cPickle as pickle_mod
        except ImportError:
            import pickle as pickle_mod
        with open(path, 'wb') as fh:
            pickle_mod.dump({s: maf_lists[i] for i, s in enumerate(self.samples)}, fh)
    return sample_ranges, sites, np.transpose(all_gt_types)
def site_relatedness(self, sites=None,
                     min_depth=5, each=1):
    """Compute pairwise relatedness at the given sites and summarize it."""
    ibs_arr, n_arr, het_arr = self._site_relatedness(
        sites=sites, min_depth=min_depth, each=each)
    return self._relatedness_finish(ibs_arr, n_arr, het_arr)
cdef _site_relatedness(self, sites=None,
                       int min_depth=5, int each=1, int offset=0):
    """
    sites must be an file of format: chrom:pos1:ref:alt where
    we match on all parts.
    it must have a matching file with a suffix of .bin.gz that is the binary
    genotype data. with 0 == hom_ref, 1 == het, 2 == hom_alt, 3 == unknown.
    min_depth applies per-sample

    Returns the raw (ibs, n, hets) accumulator arrays filled by krelated.
    """
    cdef int n_samples = len(self.samples)
    cdef int k, i
    assert each >= 0
    # pairwise accumulators handed to the C kernel.
    cdef int32_t[:, ::view.contiguous] ibs = np.zeros((n_samples, n_samples), np.int32)
    cdef int32_t[:, ::view.contiguous] n = np.zeros((n_samples, n_samples), np.int32)
    cdef int32_t[:] hets = np.zeros((n_samples, ), np.int32)
    cdef int32_t[:] gt_types = np.zeros((n_samples, ), np.int32)
    cdef int32_t[:] depths = np.zeros((n_samples, ), np.int32)
    cdef double[:] alt_freqs = np.zeros((n_samples,), np.double)
    cdef Variant v
    for j, (i, v) in enumerate(self.gen_variants(sites, offset=offset, each=each)):
        gt_types = v.gt_types
        alt_freqs = v.gt_alt_freqs
        # accumulate pairwise ibs/shared-het counts for this site.
        # (the source line here was corrupted -- "krelated(>_types[0]" --
        # restored to pass the genotype buffer's address.)
        krelated(&gt_types[0], &ibs[0, 0], &n[0, 0], &hets[0], n_samples,
                 &alt_freqs[0])
    return ibs, n, hets
def relatedness(self, int n_variants=35000, int gap=30000, float min_af=0.04,
                float max_af=0.8, float linkage_max=0.2, min_depth=8):
    """Accumulate pairwise relatedness over up to `n_variants` sites that
    pass spacing (`gap`), call-rate, depth, allele-frequency and linkage
    filters, then summarize via _relatedness_finish."""
    cdef Variant v
    cdef int last = -gap, nv = 0, nvt=0
    cdef int32_t *last_gts = NULL
    samples = self.samples
    cdef int n_samples = len(samples)
    cdef float aaf = 0.0
    cdef int n_unlinked = 0
    # pairwise accumulators filled in-place by Variant.relatedness().
    cdef int32_t[:, ::view.contiguous] ibs = np.zeros((n_samples, n_samples), np.int32)
    cdef int32_t[:, ::view.contiguous] n = np.zeros((n_samples, n_samples), np.int32)
    cdef int32_t[:] hets = np.zeros((n_samples, ), np.int32)
    cdef int32_t[:] gt_types = np.zeros((n_samples, ), np.int32)
    for v in self:
        nvt += 1
        if last_gts == NULL:
            # touching v.gt_types populates the cached v._gt_types buffer.
            if v._gt_types == NULL:
                v.gt_types
            last_gts = v._gt_types
        # enforce `gap` bases since the previously-used site
        # (v.POS <= last presumably signals a new chromosome -- confirm).
        if v.POS - last < gap and v.POS > last:
            continue
        if v.call_rate < 0.5: continue
        # require half of the samples to meet the min depth
        if np.mean(v.gt_depths > min_depth) < 0.5: continue
        aaf = v.aaf
        if aaf < min_af: continue
        if aaf > max_af: continue
        # nearby site: reject if correlated with the last used genotypes.
        if linkage_max < 1 and v.POS - last < 40000:
            if v._gt_types == NULL:
                v.gt_types
            # require 5 unlinked variants
            if r_unphased(last_gts, v._gt_types, 1e-5, n_samples) > linkage_max:
                continue
            n_unlinked += 1
            if n_unlinked < 5:
                continue
            n_unlinked = 0
        if v._gt_types == NULL:
            gt_types = v.gt_types
        last, last_gts = v.POS, v._gt_types
        v.relatedness(ibs, n, hets)
        nv += 1
        if nv == n_variants:
            break
    sys.stderr.write("tested: %d variants out of %d\n" % (nv, nvt))
    return self._relatedness_finish(ibs, n, hets)
cdef dict _relatedness_finish(self,
                              int32_t[:, ::view.contiguous] _ibs,
                              int32_t[:, ::view.contiguous] _n,
                              int32_t[:] _hets):
    """Turn the accumulated pairwise matrices into a column-oriented dict.

    Layout (per the reads below, for sj < sk): _ibs[sj, sk] = ibs0,
    _ibs[sk, sj] = shared hets, _n[sj, sk] = n, _n[sk, sj] = ibs2 --
    TODO confirm against the accumulation kernel.
    """
    samples = self.samples
    cdef int sj, sk, ns = len(samples)
    # typed arrays keep the result compact for many sample pairs.
    res = {'sample_a': [], 'sample_b': [],
           'rel': array('f'),
           'hets_a': array('I'),
           'hets_b': array('I'),
           'shared_hets': array('I'),
           'ibs0': array('I'),
           'ibs2': array('I'),
           'n': array('I')}
    cdef float bot
    for sj in range(ns):
        sample_j = samples[sj]
        if _hets[sj] == 0:
            print("peddy: no hets found for sample %s\n" % sample_j, file=sys.stderr)
        for sk in range(sj, ns):
            if sj == sk: continue
            sample_k = samples[sk]
            # denominator: the smaller of the pair's het counts (earlier
            # geometric-mean / average variants kept below for reference).
            #bot = math.exp(0.5 * (math.log(1 + _hets[sk]) + math.log(1 + _hets[sj])))
            #bot = (_hets[sk] + _hets[sj])/2.0
            bot = min(_hets[sk], _hets[sj])
            if bot == 0:
                bot = max(_hets[sk], _hets[sj])
            if bot == 0:
                # set to negative value if we are unable to calculate it.
                bot = -1
            phi = (_ibs[sk, sj] - 2.0 * _ibs[sj, sk]) / (bot)
            res['sample_a'].append(sample_j)
            res['sample_b'].append(sample_k)
            res['hets_a'].append(_hets[sj])
            res['hets_b'].append(_hets[sk])
            res['rel'].append(phi) # rel is 2*kinship
            res['ibs0'].append(_ibs[sj, sk])
            res['shared_hets'].append(_ibs[sk, sj])
            res['ibs2'].append(_n[sk, sj])
            res['n'].append(_n[sj, sk])
    return res
cdef class Allele(object):
    """View over one genotype slot in a raw htslib GT array.

    Per the decode/encode below, a non-negative raw value packs
    (allele_index + 1) << 1 | phased_bit; negative raw values are
    sentinels (missing / end-of-ploidy) and pass through unchanged.
    """
    cdef int32_t *_raw   # borrowed pointer into the genotype buffer
    cdef int i           # index of this allele within that buffer
    cdef int _value(self):
        # decode the allele index; negatives are returned as-is.
        if self._raw[self.i] < 0: return self._raw[self.i]
        return (self._raw[self.i] >> 1) - 1
    @property
    def phased(self):
        # low bit of the raw encoding is the phase flag.
        return self._raw[self.i] & 1 == 1
    @phased.setter
    def phased(self, bint ph):
        # re-encode the current value with the requested phase bit.
        # NOTE(review): for a negative (sentinel) value this re-encodes
        # the sentinel itself -- confirm setters are only used on real alleles.
        if ph:
            self._raw[self.i] = (self._value() + 1)<<1|1
        else:
            self._raw[self.i] = (self._value() + 1)<<1
    @property
    def value(self):
        # allele index (presumably 0 == REF); negative == missing/sentinel.
        if self._raw[self.i] < 0: return self._raw[self.i]
        return (self._raw[self.i] >> 1) - 1
    @value.setter
    def value(self, int value):
        if value < 0:
            # store sentinels untouched.
            self._raw[self.i] = value
            return
        # preserve the existing phase bit while re-encoding.
        if self.phased:
            self._raw[self.i] = (value + 1)<<1|1
        else:
            self._raw[self.i] = (value + 1)<<1
    def __repr__(self):
        # "." for missing; otherwise the index plus its phase separator.
        if self.value < 0: return "."
        return str(self.value) + ("|" if self.phased else "/")
cdef inline Allele newAllele(int32_t *raw, int i):
    # factory: build an Allele view over slot `i` of the raw GT buffer
    # without running __init__; `raw` is borrowed, not copied.
    cdef Allele a = Allele.__new__(Allele)
    a._raw = raw
    a.i = i
    return a