FalsePosForBed.py
# adapted from https://github.com/cjieming/alleleDB/tree/master/alleledb_pipeline/FalsePos.py
''' Some notes on what is going on here.
Basically, we want to use simulation to explicitly calculate an FDR for binomial tests on unbalanced alleles. We use
a binomial p-value test to determine whether the ratio of alleles departs significantly from what would be expected
from a fair coin toss.
However, since the number of trials varies a lot from one test to another, it seems best to use an explicit method.
Imagine that you want a particular overall FDR, say 0.1, for the entire dataset. Then, what p-value threshold would correspond to that?
Say we have n trials, where the number of coin flips in a trial varies and is given by cnt(i).
FDR = Nsim/Nact, where:
Nsim = sum( indicator(test(i) < pval) ) over i. This is the number of trials of the fair coin that had a "surprising" outcome, i.e.
were further in the tail than the pval threshold. In a perfect, non-discrete world, Nsim/n would equal pval, but the whole point of this
exercise is that in the discrete, imperfect world it doesn't.
Nact = the number of actual datapoints observed to have a binomial probability less than pval.
So, for a given pval, say 0.05, we can calculate the FDR, which will be larger. The first output from this code consists of a sampling of
example pvals and their corresponding FDRs. We are interested in the reverse of this: having picked an FDR, we want the pval that would best give us
that FDR.
That's the point of the second part of the output. Starting from the largest pval, we work our way down, calculating the FDR for each test,
until the FDR falls below our target.
Note that the FDR is NOT monotonically decreasing as we do this. It's true that both Nsim and Nact decrease, but Nact is strictly decreasing
while Nsim can hold steady, which results in a temporarily increasing FDR over that interval.
Also note that we do multiple simulations and use the mean of the Nsim values, in order to smooth out the results.
'''
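# A purely illustrative example of the calculation described above (made-up numbers,
# not from any real dataset): at a threshold of pval = 0.05, if an average of 120
# simulated fair-coin tests fall below 0.05 (Nsim) while 400 of the actual tests do
# (Nact), then the estimated FDR at that threshold is 120/400 = 0.3, far larger than
# the nominal 0.05.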
import sys, bisect, random, numpy, pdb
import math
import scipy.stats
def binomtest(x, n, p):
    # Two-sided binomial test of x successes in n trials with success probability p;
    # switches to a normal approximation when the expected count n*p is large, for speed.
#return (scipy.stats.binom_test(x, n, p), normal_approx(x, n, p))
if n*p > 50:
return normal_approx(x, n, p)
else:
return scipy.stats.binom_test(x, n, p)
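# For instance, binomtest(12, 40, 0.5) takes the exact scipy.stats.binom_test path
# (40*0.5 = 20 <= 50), whereas binomtest(520, 1200, 0.5) uses normal_approx below
# (1200*0.5 = 600 > 50); the cutoff of 50 expected successes is simply the
# speed/accuracy trade-off chosen here.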
def normal_approx(x, n, p):
    # Two-sided p-value from the normal approximation to Binomial(n, p), with a 0.5
    # continuity correction applied to the relevant tail.
if abs(x-n*p) < 1e-5:
return 1.0
u=p*n
s=math.sqrt(n*p*(1-p))
norm=scipy.stats.distributions.norm(u,s)
if x<n*p:
pval=2*norm.cdf(x+.5) # add 0.5 for continuity correction
else:
pval=2*(1-norm.cdf(x-.5))
return pval
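# Rough sanity check: normal_approx(480, 1000, 0.5) gives a two-sided p-value of
# about 0.22, close to the exact binomial result for 480 heads in 1000 fair flips.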
class binomMemo(object):
    # Memoise two-sided binomial p-values (p = 0.5) for every possible outcome of up to
    # n-1 coin flips, so repeated simulation draws do not recompute the same tests.
def __init__(self, n):
self.n=n
self.cache=[[binomtest(j, i, 0.5) for j in range(i+1)] for i in range(n)]
def binomtest(self, a, cnt):
if cnt<self.n:
return self.cache[cnt][a]
else:
return binomtest(a, cnt, 0.5)
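# Example: bm = binomMemo(60); bm.binomtest(3, 10) returns the cached two-sided
# p-value for 3 heads out of 10 fair flips (0.34375), while counts of 60 or more
# flips fall back to a direct call of the module-level binomtest.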
def simpval(cnt,bm):
    # Simulate cnt fair coin flips and return the two-sided binomial p-value of the outcome.
a=sum([random.randint(0,1) for i in range(cnt)])
pval=bm.binomtest(a, cnt)
return pval
def simpval2(cnt,bm):
    # Like simpval, but returns the simulated number of successes rather than its
    # p-value; not used in the main block below.
    a=sum([random.randint(0,1) for i in range(cnt)])
    return a
if __name__=='__main__':
    ifile=sys.argv[1]      # tab-delimited counts file
    sims=int(sys.argv[2])  # number of fair-coin simulations to average over
    data = numpy.loadtxt(ifile, dtype=str, delimiter='\t', usecols=range(0,9), skiprows=0)
    #verbose=False
    verbose = len(sys.argv)==5 and sys.argv[4]=='-v'
    bestFDR=bestPV=None
    random.seed(0)
    target=float(sys.argv[3]) # target is the FDR we are looking for; we want to find the corresponding pval
    print "#"," ".join(sys.argv)
    print "pval\tP\tFP\tFDR"
    bm=binomMemo(60)
    n=len(data)
    #g=h.getAllAnnotationsGenerator();
    act_pvals=numpy.array(data[:,-1],float) # pval as reported in counts file
    cnt_sums =numpy.array(data[:,5],int) + numpy.array(data[:,6],int) # sum of mat and pat alleles
    act_pvals.sort()
    # one sorted list of simulated p-values per simulation run
    sim_pvals=numpy.array([ sorted([simpval(cnt_sums[j],bm) for j in xrange(n)]) for i in xrange(sims)])
    #sim_pvals_means=numpy.mean(sim_pvals, 0)
    pvs=[e*0.001 for e in range(10)]+[e*0.01 for e in range(1,10)]+[e*0.1 for e in range(1,10)]
    # For a given test pv, find the number of actual pvals that are smaller and the number of sim pvals that are smaller.
    # FDR is the ratio.
    for pv in pvs:
        Nact=bisect.bisect(act_pvals, pv)
        mean_Nsims=numpy.mean([bisect.bisect(sim_pvals[i], pv) for i in xrange(sims)])
        FDR=mean_Nsims/(Nact+1) # +1 presumably guards against division by zero when Nact is 0
        print "%f\t%s\t%f\t%f" % (pv, Nact, mean_Nsims, FDR)
    # This is my attempt to find the act_pval that corresponds best to the desired target FDR.
    # This version walks from the largest observed pvalue to the smallest.
    if target:
        last_FDR=last_pv=0.0
        for Nact, pv in sorted(enumerate(act_pvals), reverse=True):
            # act_pvals is sorted, so the index Nact counts the actual p-values below pv
            mean_Nsims=numpy.mean([bisect.bisect(sim_pvals[i], pv) for i in xrange(sims)])
            FDR=mean_Nsims/(Nact+1)
            if verbose: print "test %d %f %f %f" % (Nact,mean_Nsims,FDR, pv)
            if bestFDR is None and FDR < target: # record the first crossing below the target
                print "target %f" % target
                print "before %f %f" % (last_FDR, last_pv)
                print "after %f %f" % (FDR, pv)
                bestFDR = FDR; bestPV = pv
            last_FDR=FDR; last_pv=pv
        if bestPV is not None:
            print "Target %f FDR %f pv %f" % (target, bestFDR, bestPV)
        else:
            print "Target %f was never reached by any observed p-value" % target