forked from gordanlab/SVR_models
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_predict_tf_binding.py
79 lines (64 loc) · 3.85 KB
/
test_predict_tf_binding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import unittest
import json
from predict_tf_binding import svr_features_from_sequence, generate_matching_sequences, predict, load_model
class TestPredictGenome(unittest.TestCase):
    """Unit tests for the helpers imported from ``predict_tf_binding``.

    Covers k-mer feature generation (``svr_features_from_sequence``) and
    core-based sequence matching (``generate_matching_sequences``).
    """

    def test_svr_features_from_sequence_1mer(self):
        """1-mer features of a 3-base sequence: 12 entries, 3 with value 1."""
        sequence = 'AGC'
        features = svr_features_from_sequence(sequence, [1])
        # with 4 nucleotides and 3 positions there should be 12 features
        self.assertEqual(len(features), 12, "Feature matrix size mismatch")
        just_matches = [x for x in features if x['value'] == 1]
        # For 1mers, there should only be 3 items with value 1
        self.assertEqual(len(just_matches), 3, "Features of value 1 size mismatch")
        # Every matching feature must carry a position entry.
        for match in just_matches:
            self.assertIn('position', match, "Feature is missing its position")
        # Extract the original sequence
        just_nucleotides = [x['feature'] for x in just_matches]
        self.assertEqual(''.join(just_nucleotides), sequence, "Reconstructed sequence does not match")

    def test_svr_features_from_sequence_1_2_3mer(self):
        """1-, 2- and 3-mer features of a 36-base sequence have the expected sizes."""
        sequence = 'CAGGCTTTGGGAGCCAGCGGGGCGGGAGCGGCGAAG'
        features = svr_features_from_sequence(sequence, [1, 2, 3])
        # with 4 nucleotides and 1,2,3 positions, total should be
        # 4^1 * 36 + 4^2 * 35 + 4^3 * 34 = 2880
        self.assertEqual(len(features), 2880, "Feature matrix size mismatch")
        just_matches = [x for x in features if x['value'] == 1]
        # Should be 36+35+34 = 105 items where value is 1
        self.assertEqual(len(just_matches), 105, "Features of value 1 size mismatch")
        # Every matching feature must carry a position entry.
        for match in just_matches:
            self.assertIn('position', match, "Feature is missing its position")
        # Extract the original sequence, just the 1mers: the test relies on
        # the first len(sequence) matches being the 1-mer features, in order.
        just_nucleotides = [x['feature'] for x in just_matches[:len(sequence)]]
        self.assertEqual(''.join(just_nucleotides), sequence, "Reconstructed sequence does not match")

    def test_svr_features_from_sequence_1_2_3mer_exact(self):
        """Generated features equal the reference matrix stored in test_matrix.json."""
        sequence = 'CAGGCTTTGGGAGCCAGCGGGGCGGGAGCGGCGAAG'
        features = svr_features_from_sequence(sequence, [1, 2, 3])
        # NOTE: the path is relative, so the test must be run from the
        # directory that contains test_matrix.json.
        with open('test_matrix.json', 'r') as f:
            expected = json.load(f)
        self.assertEqual(features, expected, "Generated feature matrix does not match")

    def _check_generated_matches(self, sequence, core, width, expected_matches):
        """Assert that the generated matches are exactly ``expected_matches``.

        The comparison ignores order. Every generated match must be expected,
        the counts must agree, and every expected match must have been
        generated -- the last check prevents a duplicated match from masking
        a missing one.
        """
        matches = list(generate_matching_sequences(sequence, core, width))
        for match in matches:
            self.assertIn(match, expected_matches, 'Mismatch : {} not in {}'.format(match, expected_matches))
        self.assertEqual(len(matches), len(expected_matches), 'Unexpected number of matches')
        for expected in expected_matches:
            self.assertIn(expected, matches, 'Missing : {} not in {}'.format(expected, matches))

    def test_generates_matching_sequences(self):
        """Both occurrences of the core are reported with their start offsets."""
        sequence = 'ACCTTAGCCTTGATAT'
        core = 'CCTT'
        expected_matches = [(0, ('ACCTTA',)), (6, ('GCCTTG',))]
        self._check_generated_matches(sequence, core, 6, expected_matches)

    def test_skips_unknown_bases(self):
        """A window containing an 'N' is not reported as a match."""
        sequence = 'ACCTTAGCCTTNATAT'
        core = 'CCTT'
        expected_matches = [(0, ('ACCTTA',))]
        self._check_generated_matches(sequence, core, 6, expected_matches)

    def load_model(self):
        """Check the size recorded in the ELK1 model file.

        Deliberately not prefixed with ``test_`` so the unittest runner does
        not collect it; call it manually when the model file is available.
        """
        # This model is 178MB in size, so not practical to include in the source repo
        # The call below resolves to the module-level ``load_model`` import,
        # not to this method.
        model_dict = load_model('ELK1_100nM_Bound_filtered_normalized_GGAA_1a2a3mer_format.model')
        self.assertEqual(model_dict['size'], 2881)

    def test_generates_matching_reverse_complements(self):
        """A reverse-complemented core is matched and reported reverse-complemented."""
        core = 'GCTG'  # must not be palindromic
        sequence = 'ATTCAGCGAA'  # Reverse complement in the middle
        expected_matches = [(2, ('CGCTGA',))]
        self._check_generated_matches(sequence, core, 6, expected_matches)

    def test_generates_matching_palindromes(self):
        """A palindromic core yields one position with both strand sequences."""
        core = 'GGCC'  # Must be palindromic
        sequence = 'ATTGGCCGAA'  # Core in the middle
        # Palindromes yield a position and two sequences
        expected_matches = [(2, ('TGGCCG', 'CGGCCA'))]
        self._check_generated_matches(sequence, core, 6, expected_matches)
# Run the whole suite when this file is executed directly as a script;
# importing the module does not trigger a test run.
if __name__ == '__main__':
    unittest.main()