-
Notifications
You must be signed in to change notification settings - Fork 0
/
FastaSeqExtract.py
196 lines (168 loc) · 6.96 KB
/
FastaSeqExtract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
"""
Sequence Extraction
by Tovah
(Based on Sunny's sequence extraction scripts)
This file does not require BioPython.
Updated: 2/25/15, Now includes personalized error messages
Now works with both Python and Python3
Inputs: genome fasta file, file with positions, query (optional), range (optional)
Outputs: fasta file of specific regions
"""
##################################################################################
# Modules
import optparse
import re
import sys
##################################################################################
# Functions
def main(fastaFile, inputFile, output, r_up, r_down, query):
# read in fasta sequence
genome = load_FASTA(fastaFile)
# run readBedGff
input = readBedGff(inputFile)
r_up = int(r_up)
r_down = int(r_down)
# note chromosome number starts with 1 and python indexing starts with 0
# takes into account range
# if query, find query within input file
if query is not "None":
queryInfo= [ i for i in input if query in i[3] ]
if not queryInfo:
print( "Query not found in " + inputFile + ". Try systematic name." )
sys.exit()
else:
queryInfo = input
# extract sequences and write output FASTA file
output_fasta = [0] * len(queryInfo)
for i in range( len(queryInfo) ):
output_fasta[i] = seq_extract( queryInfo[i], genome, r_up, r_down )
# write output file
f = open( output, 'w' )
f.write( '\n'.join( i for i in output_fasta ) )
f.close()
def seq_extract( row, fasta, r_up, r_down ):
# take "row from readBedGff file and extract sequence
# put into FASTA format
chrNum = [ chrN for chrN in range(len(fasta)) if fasta[chrN][0] == row[0] ][0]
if row[4] == '+':
start = row[1] - r_up - 1
if start < 0:
start = 0
print( "Clipped upstream of " + row[3] )
end = row[2] + r_down
if end > len(fasta[chrNum][1]):
end = len(fasta[chrNum][1])
print( "Clipped downstream of " + row[3] )
sequence = fasta[chrNum][1][start:end]
else:
chr = fasta[ row[0]-1 ][1]
start = row[2] + r_up
if start > len(fasta[chrNum][1]):
start = len(fasta[chrNum][1])
print( "Clipped upstream of " + row[3] )
end = row[1] - r_down - 1
if end < 0:
end = 0
print( "Clipped downstream of " + row[3] )
sequence = fasta[chrNum][1][end:start]
sequence = reverse_complement(sequence)
if "=" in row[3]:
name = ( re.search(r'ID=(.*);', row[3] ).group(1) + ','
+ re.search('Name=(.*)', row[3] ).group(1) )
else:
name = row[3]
out_fasta = ( '>' + name + '\t' + fasta[chrNum][0]
+ ':' + str(start) + '-' + str(end) + '(' + row[4]
+ ')' + '\n' + sequence )
return out_fasta
def reverse_complement(seq):
Complement = { 'A':'T', 'C':'G', 'T':'A', 'G':'C', 'N':'N' }
sequence = [ Complement.get(seq[i]) for i in range( len(seq) ) ]
return ''.join( sequence[::-1] )
def load_FASTA(seq_filename):
# load the raw data
f = open( seq_filename , 'r' )
lines = f.read().split( '>' )
f.close()
seq=[]
for i in lines:
if i: # not empty
# split by breakline
line = i.strip().split( '\n' )
chr = line[0]
sequence = ''.join( line[1:] )
seq.append( [chr, sequence] )
return(seq)
def readBedGff(inputName):
# function to read in Gff or Bed files and convert into a universal format
# read in input file
f = open(inputName,'r')
input = f.readlines()
f.close()
# split into a table
input = [ row.strip().split('\t') for row in input if not row.startswith("#") ]
# determine type of input file and make four column table
# column 1: chr#, column 2: start, column 3: end, column 4: name, column 5: orientation
if len(input[0]) == 9: # gff file
print( "Input file is a GFF" )
for i in range(len(input)):
chr = input[i][0]
input[i] = [ chr, int(input[i][3]), int(input[i][4]), input[i][8], input[i][6] ]
elif len(input[0]) == 3: # simplest bed file
print( "Input file is a 3-column BED" )
for i in range(len(input)):
chr = input[i][0]
input[i] = [ chr, int(input[i][1]), int(input[i][2]), 'id'+i, '+']
elif len(input[0]) == 5: # bed file made for summits
print( "Input file is a 5-column BED" )
for i in range(len(input)):
chr =input[i][0]
input[i] = [ chr, int(input[i][1]), int(input[i][2]), input[i][3], '+']
else:
print( inputFile + " cannot be read. Make sure you are using a GFF file, a \
3 column BED file, or a 5 column BED file.")
sys.exit()
return (input)
################################################################################
# Main
# parse object for managing input options
desc="""
Purpose: to extract sequence(s) from a FASTA file based upon sequence coordinates
given from a separate GFF or BED file.
If no specific query is given, the output file will contain sequences for every
row in the input file. This function will produce an output as long as the
chromosome naming system is the same in both FASTA and input file. R_up and r_down
expands the coordinates of the sequence to be extracted beyond those in the input
file. Example:
python FastaSeqExtract.py -f sk1_MvO_V1.fasta -i SK1_annotation_modified.gff
-q "YIL072W" -o "YIL072W.fa"
"""
parser = optparse.OptionParser(description=desc)
# essential data, defines commandline options
parser.add_option ('-f', dest = 'fastaFile', default = '', help = "This input \
is the name of the FASTA file of the entire genome.")
parser.add_option ('-i', dest = 'inputFile', default = '', help = "This input \
is the name of the file that contains positions of sequences to be extracted.")
parser.add_option ('-u', dest = 'r_up', default = '0', help = "This input \
is the length of sequence to be extracted upstream of the designated feature. \
Default = 0bp")
parser.add_option ('-d', dest = 'r_down', default = '0', help = "This input \
is the length of sequence to be extracted downstream of the designated feature. \
Default = 0bp")
parser.add_option ('-q', dest = 'query', default = 'None', help = "This input \
is the name of a specific query to search for. This includes: GeneID, Gene Name, \
and summit name. This is case-sensitive. Default: entire input file")
parser.add_option ('-o', dest = 'output', default = '', help = "This input \
is the name of the newly created output FASTA file.")
# load the inputs
(options,args) = parser.parse_args()
# reads the inputs from commandline
fastaFile = options.fastaFile
inputFile = options.inputFile
r_up = options.r_up
r_down = options.r_down
query = options.query
output = options.output
a = main(fastaFile, inputFile, output, r_up, r_down, query)
#fastaFile="sk1_MvO_V1.fasta"
#inputFile="RPLS.gff"