# predict.py
# forked from MasterprojectRK/Hi-cGAN
import csv
import os

import click
import numpy as np
import tensorflow as tf

import dataContainer
import hicGAN
import records
import utils
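
# Example invocation (a sketch; the model file and folder names below are
# illustrative, not shipped with the repository):
#   python predict.py --trainedModel generator.h5 \
#                     --testChromPath ./test_bigwigs/ \
#                     --testChroms "8, 21" \
#                     --windowsize 64 --binsize 25000 \
#                     --outfolder ./predictions/
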
@click.option("--trainedModel", "-trm", required=True,
type=click.Path(exists=True, readable=True, dir_okay=False),
help="Trained generator model to predict from")
@click.option("--testChromPath", "-tcp", required=True,
type=click.Path(exists=True, readable=True, file_okay=False),
help="Path where test data (bigwig files) resides")
@click.option("--testChroms", "-tchroms", required=True,
type=str,
help="Chromosomes for testing. Must be available in all bigwig files")
@click.option("--outfolder", "-o", required=False,
type=click.Path(exists=True, writable=True, file_okay=False),
default="./", show_default=True,
help="Output path for predicted coolers")
@click.option("--multiplier", "-mul", required=False,
type=click.IntRange(min=1),
default=10, show_default=True)
@click.option("--binsize", "-b", required=True,
type=click.IntRange(min=1000),
help="bin size for binning the chromatin features")
@click.option("--batchsize", "-bs", required=False,
type=click.IntRange(min=1),
default=32, show_default=True,
help="batchsize for predicting")
@click.option("--windowsize", "-ws", required=True,
type=click.Choice(choices=["64", "128", "256"]),
help="windowsize for predicting; must be the same as in trained model. Supported values are 64, 128 and 256")
@click.command()
def prediction(trainedmodel,
               testchrompath,
               testchroms,
               outfolder,
               multiplier,
               binsize,
               batchsize,
               windowsize):
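    """Predict Hi-C contact matrices from chromatin features (bigwig files)
    with a trained Hi-cGAN generator and write them to a single cooler file.
    (Summary inferred from the code below, not authoritative documentation.)
    """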
    scalefactors = True
    clampfactors = False
    scalematrix = True
    maxdist = None
    windowsize = int(windowsize)
    flankingsize = windowsize
    paramDict = locals().copy()
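    # locals() at this point contains exactly the CLI arguments plus the fixed
    # settings above; the copy is written out to predParams.csv at the end of
    # the run, so each prediction can be traced back to its parameters.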
    #extract chromosome names from the input;
    #split() without arguments tolerates repeated whitespace, e.g. from "8, 21"
    chromNameList = testchroms.replace(",", " ").split()
    #strip a leading "chr" prefix; note that str.lstrip("chr") would also eat
    #leading c/h/r characters of unprefixed names, so slice instead
    chromNameList = sorted([x[3:] if x.startswith("chr") else x for x in chromNameList])
    containerCls = dataContainer.DataContainer
    testdataContainerList = []
    for chrom in chromNameList:
        testdataContainerList.append(containerCls(chromosome=chrom,
                                                  matrixfilepath=None,
                                                  chromatinFolder=testchrompath,
                                                  binsize=binsize))
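    # matrixfilepath=None: at prediction time there is no ground-truth Hi-C
    # matrix to load, so the containers only carry the chromatin features.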
    #define the load params for the containers
    loadParams = {"scaleFeatures": scalefactors,
                  "clampFeatures": clampfactors,
                  "scaleTargets": scalematrix,
                  "windowsize": windowsize,
                  "flankingsize": flankingsize,
                  "maxdist": maxdist}
    #now load the data and write TFRecords, one container at a time.
    if len(testdataContainerList) == 0:
        msg = "Exiting. No data found"
        print(msg)
        return #nothing to do
    container0 = testdataContainerList[0]
    nr_factors = container0.nr_factors
    tfRecordFilenames = []
    sampleSizeList = []
    for container in testdataContainerList:
        container.loadData(**loadParams)
        if not container0.checkCompatibility(container):
            msg = "Aborting. Incompatible data"
            raise SystemExit(msg)
        tfRecordFilenames.append(container.writeTFRecord(pOutfolder=outfolder,
                                                         pRecordSize=None)[0]) #list with 1 entry
        sampleSizeList.append(int(np.ceil(container.getNumberSamples() / batchsize)))
    #data is no longer needed, unload it
    for container in testdataContainerList:
        container.unloadData()
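    # sampleSizeList holds ceil(samples / batchsize) per chromosome, i.e. the
    # number of batches in each TFRecord; it is passed to predict() below as
    # steps_per_record so that exactly one pass is made over each record.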
    trained_GAN = hicGAN.HiCGAN(log_dir=outfolder, number_factors=nr_factors)
    trained_GAN.loadGenerator(trainedModelPath=trainedmodel)

    predList = []
    for record, container, nr_samples in zip(tfRecordFilenames, testdataContainerList, sampleSizeList):
        storedFeaturesDict = container.storedFeatures
        testDs = tf.data.TFRecordDataset(record,
                                         num_parallel_reads=None,
                                         compression_type="GZIP")
        testDs = testDs.map(lambda x: records.parse_function(x, storedFeaturesDict),
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)
        #do NOT drop the last batch; it may be smaller when the batch size
        #does not evenly divide the number of samples per chromosome
        testDs = testDs.batch(batchsize, drop_remainder=False)
        testDs = testDs.prefetch(tf.data.experimental.AUTOTUNE)
        predArray = trained_GAN.predict(test_ds=testDs, steps_per_record=nr_samples)
        #keep only the upper triangle of each predicted (symmetric) submatrix
        triu_indices = np.triu_indices(windowsize)
        predArray = np.array([np.array(x[triu_indices]) for x in predArray])
        predList.append(predArray)
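    # Size check for illustration: each flattened upper triangle has
    # windowsize * (windowsize + 1) / 2 entries, e.g. 64 * 65 / 2 = 2080
    # values per sample at windowsize 64.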
    #stitch the predicted triangles back into one matrix per chromosome,
    #then scale and write all chromosomes into a single cooler file
    predList = [utils.rebuildMatrix(pArrayOfTriangles=x, pWindowSize=windowsize, pFlankingSize=windowsize) for x in predList]
    predList = [utils.scaleArray(x) * multiplier for x in predList]
    matrixname = os.path.join(outfolder, "predMatrix.cool")
    utils.writeCooler(pMatrixList=predList,
                      pBinSizeInt=binsize,
                      pOutfile=matrixname,
                      pChromosomeList=chromNameList)
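    # The written file can be inspected with the cooler Python package
    # (assumed available, since writeCooler presumably builds on it), e.g.:
    #   import cooler
    #   c = cooler.Cooler(matrixname)
    #   mat = c.matrix(balance=False).fetch(chromNameList[0])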
    #store the parameters used for this prediction run
    parameterFile = os.path.join(outfolder, "predParams.csv")
    with open(parameterFile, "w", newline="") as csvfile:
        dictWriter = csv.DictWriter(csvfile, fieldnames=sorted(list(paramDict.keys())))
        dictWriter.writeheader()
        dictWriter.writerow(paramDict)
    #the TFRecords were only needed as model input; clean them up
    for tfrecordfile in tfRecordFilenames:
        if os.path.exists(tfrecordfile):
            os.remove(tfrecordfile)


if __name__ == "__main__":
    prediction() #pylint: disable=no-value-for-parameter