FalsePosForBed.py
# adapted from https://github.com/cjieming/alleleDB/tree/master/alleledb_pipeline/FalsePos.py
''' Some notes on what is going on here.
Basically, we want to use simulation to explicitly calculate an FDR for binomial tests on unbalanced alleles. We use
a binomial p-value test to determine whether the ratio of alleles departs significantly from what would be expected
from a fair coin toss.
However, since the number of trials varies a lot from one test to another, it seems best to use an explicit method.
Imagine that you want a particular overall FDR, say 0.1, for the entire dataset. Then, what p-value threshold would correspond to that?
Say we have n trials, where the number of coin flips in a trial varies and is given by cnt(i).
FDR = Nsim/Nact, where:
Nsim = sum( indicator(test(i) < pval) ) over i. This is the number of trials of the fair coin that had a "surprising" outcome, i.e.
were further in the tail than the pval threshold. In a perfect, non-discrete world, Nsim/n would equal pval, but the whole point of this
exercise is that in the discrete, imperfect world it doesn't.
Nact = the number of actual datapoints observed to have a binomial probability less than pval.
So, for a given pval, say 0.05, we can calculate the FDR, which will be larger. The first output from this code consists of a sampling of
example pvals and their corresponding FDRs. We are interested in the reverse of this: having picked an FDR, we want the pval that would best give us
that FDR.
That's the point of the second part of the output. Starting from the largest pval, we work our way down, calculating the FDR for each test,
until the FDR falls below our target.
Note that the FDR is NOT monotonically decreasing as we do this. It's true that both Nsim and Nact decrease, but Nact is strictly decreasing
while Nsim can hold steady, which results in a temporarily increasing FDR over that interval.
Also note that we do multiple simulations and use the mean of the Nsim values, in order to smooth out the results.
'''
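# A purely illustrative example of the calculation described above (made-up numbers,
# not from any real dataset): at a threshold of pval = 0.05, if an average of 120
# simulated fair-coin tests fall below 0.05 (Nsim) while 400 of the actual tests do
# (Nact), then the estimated FDR at that threshold is 120/400 = 0.3, far larger than
# the nominal 0.05.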
import sys, bisect, random, numpy, pdb
import math
import scipy.stats
def binomtest(x, n, p):
    # Two-sided binomial test of x successes in n trials with success probability p;
    # switches to a normal approximation when the expected count n*p is large, for speed.
#return (scipy.stats.binom_test(x, n, p), normal_approx(x, n, p))
if n*p > 50:
return normal_approx(x, n, p)
else:
return scipy.stats.binom_test(x, n, p)
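# For instance, binomtest(12, 40, 0.5) takes the exact scipy.stats.binom_test path
# (40*0.5 = 20 <= 50), whereas binomtest(520, 1200, 0.5) uses normal_approx below
# (1200*0.5 = 600 > 50); the cutoff of 50 expected successes is simply the
# speed/accuracy trade-off chosen here.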
def normal_approx(x, n, p):
    # Two-sided p-value from the normal approximation to Binomial(n, p), with a 0.5
    # continuity correction applied to the relevant tail.
if abs(x-n*p) < 1e-5:
return 1.0
u=p*n
s=math.sqrt(n*p*(1-p))
norm=scipy.stats.distributions.norm(u,s)
if x<n*p:
pval=2*norm.cdf(x+.5) # add 0.5 for continuity correction
else:
pval=2*(1-norm.cdf(x-.5))
return pval
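# Rough sanity check: normal_approx(480, 1000, 0.5) gives a two-sided p-value of
# about 0.22, close to the exact binomial result for 480 heads in 1000 fair flips.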
class binomMemo(object):
    # Memoise two-sided binomial p-values (p = 0.5) for every possible outcome of up to
    # n-1 coin flips, so repeated simulation draws do not recompute the same tests.
def __init__(self, n):
self.n=n
self.cache=[[binomtest(j, i, 0.5) for j in range(i+1)] for i in range(n)]
def binomtest(self, a, cnt):
if cnt<self.n:
return self.cache[cnt][a]
else:
return binomtest(a, cnt, 0.5)
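# Example: bm = binomMemo(60); bm.binomtest(3, 10) returns the cached two-sided
# p-value for 3 heads out of 10 fair flips (0.34375), while counts of 60 or more
# flips fall back to a direct call of the module-level binomtest.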
def simpval(cnt,bm):
    # Simulate cnt fair coin flips and return the two-sided binomial p-value of the outcome.
a=sum([random.randint(0,1) for i in range(cnt)])
pval=bm.binomtest(a, cnt)
return pval
def simpval2(cnt,bm):
    # Like simpval, but returns the simulated number of successes rather than its
    # p-value; not used in the main block below.
    a=sum([random.randint(0,1) for i in range(cnt)])
    return a
if __name__=='__main__':
    ifile=sys.argv[1]      # tab-delimited counts file
    sims=int(sys.argv[2])  # number of fair-coin simulations to average over
    data = numpy.loadtxt(ifile, dtype=str, delimiter='\t', usecols=range(0,9), skiprows=0)
    #verbose=False
    verbose = len(sys.argv)==5 and sys.argv[4]=='-v'
    bestFDR=bestPV=None
    random.seed(0)
    target=float(sys.argv[3]) # target is the FDR we are looking for; we want to find the corresponding pval
    print "#"," ".join(sys.argv)
    print "pval\tP\tFP\tFDR"
    bm=binomMemo(60)
    n=len(data)
    #g=h.getAllAnnotationsGenerator();
    act_pvals=numpy.array(data[:,-1],float) # pval as reported in counts file
    cnt_sums =numpy.array(data[:,5],int) + numpy.array(data[:,6],int) # sum of mat and pat alleles
    act_pvals.sort()
    # one sorted list of simulated p-values per simulation run
    sim_pvals=numpy.array([ sorted([simpval(cnt_sums[j],bm) for j in xrange(n)]) for i in xrange(sims)])
    #sim_pvals_means=numpy.mean(sim_pvals, 0)
    pvs=[e*0.001 for e in range(10)]+[e*0.01 for e in range(1,10)]+[e*0.1 for e in range(1,10)]
    # For a given test pv, find the number of actual pvals that are smaller and the number of sim pvals that are smaller.
    # FDR is the ratio.
    for pv in pvs:
        Nact=bisect.bisect(act_pvals, pv)
        mean_Nsims=numpy.mean([bisect.bisect(sim_pvals[i], pv) for i in xrange(sims)])
        FDR=mean_Nsims/(Nact+1) # +1 presumably guards against division by zero when Nact is 0
        print "%f\t%s\t%f\t%f" % (pv, Nact, mean_Nsims, FDR)
    # This is my attempt to find the act_pval that corresponds best to the desired target FDR.
    # This version walks from the largest observed pvalue to the smallest.
    if target:
        last_FDR=last_pv=0.0
        for Nact, pv in sorted(enumerate(act_pvals), reverse=True):
            # act_pvals is sorted, so the index Nact counts the actual p-values below pv
            mean_Nsims=numpy.mean([bisect.bisect(sim_pvals[i], pv) for i in xrange(sims)])
            FDR=mean_Nsims/(Nact+1)
            if verbose: print "test %d %f %f %f" % (Nact,mean_Nsims,FDR, pv)
            if bestFDR is None and FDR < target: # record the first crossing below the target
                print "target %f" % target
                print "before %f %f" % (last_FDR, last_pv)
                print "after %f %f" % (FDR, pv)
                bestFDR = FDR; bestPV = pv
            last_FDR=FDR; last_pv=pv
        if bestPV is not None:
            print "Target %f FDR %f pv %f" % (target, bestFDR, bestPV)
        else:
            print "Target %f was never reached by any observed p-value" % target