-
-
Notifications
You must be signed in to change notification settings - Fork 20
/
index.jl
366 lines (320 loc) · 11.5 KB
/
index.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
# FASTA Index
# ===========
#
# Index for random access to FASTA files.
#
# This file is a part of BioJulia.
# License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md
"""
Index(src::Union{IO, AbstractString})
FASTA index object, which allows constant-time seeking of FASTA files by name.
The index is assumed to be in FAI format.
Notable methods:
* `Index(::Union{IO, AbstractString})`: Read FAI file from IO or file at path
* `write(::IO, ::Index)`: Write index in FAI format
* `faidx(::IO)::Index`: Index FASTA file
* `seekrecord(::Reader, ::AbstractString)`: Go to position of seq
* `extract(::Reader, ::AbstractString)`: Extract part of sequence
Note that the FAI specs are stricter than FASTX.jl's definition of FASTA,
such that some valid FASTA records may not be indexable.
See the specs at: http://www.htslib.org/doc/faidx.html
See also: [`FASTA.Reader`](@ref)
# Examples
```jldoctest
julia> src = IOBuffer("seqname\\t9\\t14\\t6\\t8\\nA\\t1\\t3\\t1\\t2");
julia> fna = IOBuffer(">A\\nG\\n>seqname\\nACGTAC\\r\\nTTG");
julia> rdr = FASTA.Reader(fna; index=src);
julia> seekrecord(rdr, "seqname");
julia> sequence(String, first(rdr))
"ACGTACTTG"
```
"""
struct Index
# Vector index for the record's sequence by header: See above specification
names::Dict{String, Int}
lengths::Vector{Int}
offsets::Vector{Int}
# Upper bit is linewidth - linebases - 1, whose only valid values
# are 0 or 1.
encoded_linebases::Vector{UInt}
# According to specs, the index need not be ordered by the offset in the FASTA
# file. However, we make sure the Index object is, because it make seeking easier.
function Index(
names::Dict{String, Int},
lengths::Vector{Int},
offsets::Vector{Int},
encoded_linebases::Vector{UInt}
)
issorted(offsets) && return new(names, lengths, offsets, encoded_linebases)
perm = sortperm(offsets)
new(
Dict(name => perm[i] for (name, i) in names),
lengths[perm],
offsets[perm],
encoded_linebases[perm]
)
end
end
function linebases_width(index::Index, i::Integer)
enc = index.encoded_linebases[i]
linebases = (enc % Int) & typemax(Int)
linewidth = linebases + 1 + (enc ≥ (typemin(Int) % UInt))
(linebases, linewidth)
end
function Base.show(io::IO, index::Index)
print(io, summary(index) * ":\n")
nrows = min(10, length(index.names))
elided = nrows < length(index.names)
names = Vector{String}(undef, nrows)
found = 0
for (name, i) in index.names
if i ≤ nrows
names[i] = name
found += 1
end
found == nrows && break
end
for i in 1:nrows
print(io, " ")
writeline(io, index, i, names)
# Do not write trailing newline
if elided || i < nrows
write(io, UInt8('\n'))
end
end
elided && print(io, '⋮')
end
index_machine = let
newline = let
lf = onenter!(re"\n", :countline)
Re.opt('\r') * lf
end
# The specs refer to the SAM specs, which contain this regex
name = onexit!(onenter!(re"[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*", :mark), :name)
number = onexit!(onall!(re"[0-9]+", :digit), :number)
line = name * re"\t" * number * re"\t" * number * re"\t" * number * re"\t" * number
fai = Re.opt(line * Re.rep(newline * line)) * Re.rep(newline)
Automa.compile(fai)
end
index_actions = Dict{Symbol, Expr}(
:mark => :(start = p),
:countline => :(linenum += 1),
:name => quote
let n = p - start
name = unsafe_string(pointer(data, start), n)
names[name] = linenum
end
end,
:digit => quote
num2 = 10*num + (byte - 0x30)
if num2 < num
error("Integer overflow on line " * string(linenum))
end
num = num2
end,
:number => quote
nnum = mod1(nnum + 1, 4)
if nnum == 1
push!(vectors[1], num)
elseif nnum == 2
num < 2 && error("First offset must be at least 2 in a valid FAI index")
push!(vectors[2], num)
# Number of basepairs per line obviously cannot exceed the sequence length.
elseif nnum == 3
if num > vectors[1][end]
error("Bases per line exceed sequence length on line ", string(linenum))
end
linebases = num
# Linewidth is linebases plus the length of the line terminator.
# Since we only accept \n and \r\n as line terminator, validate
# linewidth is linebases +1 or +2.
elseif nnum == 4
if num ∉ (linebases+1, linebases+2)
error("Linewidth must be equal to linebases +1 or +2 at line ", string(linenum))
end
# Encode linebases
encoded_linebases = (linebases % UInt)
encoded_linebases |= ifelse(num == linebases+1, UInt(0), typemin(Int) % UInt)
push!(vectors[3], encoded_linebases)
end
num = 0
end,
)
@noinline function throw_index_error(data::Vector{UInt8}, linenum::Integer, p::Integer)
p_newline = findprev(isequal(UInt8('\n')), data, p)
offset = p_newline === nothing ? 0 : Int(p_newline)
col = p - offset
error("Error when parsing FAI file: Unexpected byte at index $p (line $linenum col $col)")
end
@eval function read_faidx(data::Vector{UInt8})
start = 0
linenum = 1
names = Dict{String, Int}()
num = num2 = 0
nnum = 0
linebases = 0
linebases_num = 0
vectors = (Int[], Int[], UInt[])
$(Automa.generate_code(CONTEXT, index_machine, index_actions))
return Index(names, vectors...)
end
Index(io::IO) = read_faidx(read(io))
Index(filepath::AbstractString) = open(i -> Index(i), filepath)
function writeline(io::IO, index::Index, line::Integer, names::Vector{String})
(linebases, linewidth) = linebases_width(index, line)
write(io,
names[line], UInt8('\t'),
string(index.lengths[line]), UInt8('\t'),
string(index.offsets[line]), UInt8('\t'),
string(linebases), UInt8('\t'),
string(linewidth),
)
end
function Base.write(io::IO, index::Index)
# Put names dict in a sorted array
names = Vector{String}(undef, length(index.names))
for (name, i) in index.names
names[i] = name
end
n = 0
for i in eachindex(names)
n += writeline(io, index, i, names)
n += write(io, UInt8('\n'))
end
n
end
function Base.print(io::IO, index::Index)
buffer = IOBuffer()
write(buffer, index)
String(take!(buffer))
end
index_fasta_actions = Dict(
:mark => :(@mark),
:countline => :(linenum += 1),
:identifier => quote
identifier = unsafe_string(pointer(data, @markpos), p - @markpos)
end,
# Not used in fai files, the newline byte is consistent within one record
# and since this is the first newline in a record, we set it here
:description => quote
uses_rn_newline = byte == UInt8('\r')
no_more_seqlines = false
# Disturbingly, there is no API to get the absolute position of
# an Automa machine operating on a stream. We ought to fix this.
# This workaround works ONLY for a NoopStream,
# and relies on abusing the internals.
buffer_offset = buffer.transcoded - buffer.marginpos + 1
# We want 0-indexed, p is one-indexed, and we need the offset of first sequence
offset = buffer_offset + p + uses_rn_newline
end,
:seqline => quote
# Validate line terminator is same, i.e. no seq have have both \r\n and \n
if p < p_end && (uses_rn_newline ⊻ (byte == UInt8('\r')))
error("Line endings must be same within one record to index, but is not on line ", string(linenum))
end
# Validate sequence length is the same for all lines
let current_seqwidth = p - @markpos
# If on first line, seqwidth is -1, we set it correctly
if seqwidth == -1
seqwidth = current_seqwidth
# If we are not supposed to see more lines, or the next line
# is longer than expected, error
elseif no_more_seqlines || current_seqwidth > seqwidth
error("Sequence line width must be consistent to index, but is not on line ", string(linenum))
# If we see a shorter line, then it must be the last line
elseif current_seqwidth < seqwidth
no_more_seqlines = true
end
seqlen += current_seqwidth
end
end,
:record => quote
record_count += 1
names[identifier] = record_count
push!(lengths, seqlen)
push!(offsets, offset)
enc_linebases = (seqwidth % UInt)
enc_linebases |= ifelse(uses_rn_newline, typemin(Int) % UInt, UInt(0))
push!(encoded_linebases, enc_linebases)
seqwidth = -1
seqlen = 0
end
)
initcode = quote
names = Dict{String, Int}()
lengths = Int[]
offsets = Int[]
encoded_linebases = UInt[]
seqwidth = -1
seqlen = 0
linenum = 1
uses_rn_newline = false
# Marks whether the current seqline must be the last in the record
# (which is the case if its shorter than the previous)
no_more_seqlines = false
record_count = 0
end
returncode = quote
if cs < 0
throw_parser_error(data, p, linenum)
end
return Index(names, lengths, offsets, encoded_linebases)
end
Automa.generate_reader(
:faidx_,
machine,
actions = index_fasta_actions,
context = CONTEXT,
initcode = initcode,
returncode = returncode
) |> eval
# Set the reading position of `input` to the starting position of the record `name`.
function seekrecord(io::IO, index::Index, name::AbstractString)
seekrecord(io, index, index.names[name])
end
# We seek to previous sequence to find the > start of next sequence
function seekrecord(io::IO, index::Index, i::Integer)
i == 1 && return seekstart(io)
linebases, linewidth = linebases_width(index, i-1)
len = index.lengths[i-1]
prev_offset = index.offsets[i-1]
nlines = cld(len, linebases)
offset = prev_offset + len + nlines * (linewidth - linebases)
seek(io, offset)
return nothing
end
# Note: Current implementation relies on the IO being a NoopStream exactly,
# no other transcoding stream will do.
# This is because an indexer needs to find the absolute position of the mark
# in the stream, and this is, as far as I can tell, not supported in Automa.
# As a hacky workaround, I reach into the internals of NoopStream in the
# action dict code.
"""
faidx(io::IO)::Index
Read a `FASTA.Index` from `io`.
See also: [`Index`](@ref)
# Examples
```jldoctest
julia> ind = faidx(IOBuffer(">ab\\nTA\\nT\\n>x y\\nGAG\\nGA"))
Index:
ab 3 4 2 3
x 5 14 3 4
```
"""
faidx(x::IO) = faidx_(NoopStream(x))
faidx(x::NoopStream) = faidx_(x)
# High-level interface - not sure on this yet!
"""
faidx(fnapath::AbstractString, [idxpath::AbstractString], check=true)
Index FASTA path at `fnapath` and write index to `idxpath`.
If `idxpath` is not given, default to same name as `fnapath * ".fai"`.
If `check`, throw an error if the output file already exists
See also: [`Index`](@ref)
"""
function faidx(fnapath::AbstractString, faidxpath::AbstractString; check::Bool=true)
check && ispath(faidxpath) && error("Output path $faidxpath already exsists")
index = open(faidx, fnapath)
open(i -> write(i, index), faidxpath, "w")
index
end
faidx(path::AbstractString; check::Bool=true) = faidx(path, path * ".fai"; check=check)