Skip to content

Commit

Permalink
Merge pull request #180 from chrovis/feature/vcf-reader-with-index
Browse files Browse the repository at this point in the history
Add support for random reading of vcf file.
  • Loading branch information
athos authored Nov 13, 2019
2 parents e94da96 + a6d6692 commit 5d9ea2d
Show file tree
Hide file tree
Showing 15 changed files with 297 additions and 126 deletions.
6 changes: 4 additions & 2 deletions src/cljam/io/bam_index.clj
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
(ns cljam.io.bam-index
"Parser for a BAM index file."
(:require [cljam.io.bam-index.core :as bai-core]))
(:require
[cljam.io.bam-index.core :as bai-core]
[cljam.io.util.bin :as util-bin]))

(defn bin-index
"Returns binning index for the given reference index."
Expand All @@ -15,7 +17,7 @@
(defn get-spans
"Returns regions of a BAM file that may contain an alignment for the given range."
[bai ref-idx beg end]
(bai-core/get-spans bai ref-idx beg end))
(util-bin/get-spans bai ref-idx beg end))

(defn bam-index
"Returns a cljam.bam-index.core.BAMIndex."
Expand Down
57 changes: 18 additions & 39 deletions src/cljam/io/bam_index/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,22 @@
"The core of BAM index features."
(:require [clojure.java.io :as cio]
[clojure.tools.logging :as logging]
[cljam.io.bam-index [common :refer :all]
[chunk :as chunk]
[reader :as reader]
[writer :as writer]]
[cljam.util :as util])
[cljam.io.bam-index [reader :as reader]
[writer :as writer]
[common :as common]]
[cljam.util :as util]
[cljam.io.util.bin :as util-bin])
(:import [java.io DataOutputStream FileOutputStream]
cljam.io.bam_index.reader.BAIReader
cljam.io.bam_index.writer.BAIWriter))

(deftype BAMIndex [url bidx lidx])
(deftype BAMIndex [url bidx lidx]
util-bin/IBinaryIndex
(get-chunks [this ref-idx bins]
(into [] (mapcat (get (.bidx this) ref-idx) bins)))
(get-min-offset [this ref-idx beg]
(get (get (.lidx this) ref-idx)
(util-bin/pos->lidx-offset beg common/linear-index-shift) 0)))

(defn bam-index [f]
(let [{:keys [bidx lidx]} (with-open [r ^BAIReader (reader/reader f)] (reader/read-all-index! r))]
Expand All @@ -29,39 +35,6 @@
(with-open [r ^BAIReader (reader/reader f)]
(reader/read-linear-index! r ref-idx)))

(defn- reg->bins*
"Returns candidate bins for the specified region as a vector."
[^long beg ^long end]
(let [max-pos 0x1FFFFFFF
beg (if (<= beg 0) 0 (bit-and (dec beg) max-pos))
end (if (<= end 0) max-pos (bit-and (dec end) max-pos))]
(if (<= beg end)
(loop [bins (transient [0])
xs [[1 26] [9 23] [73 20] [585 17] [4681 14]]]
(if-let [[^long ini shift] (first xs)]
(let [ini* (+ ini (bit-shift-right beg shift))
end* (+ ini (bit-shift-right end shift))]
(recur
(loop [b bins k ini*]
(if (<= k end*)
(recur (conj! b k) (inc k))
b))
(next xs)))
(persistent! bins))))))

(def ^:private reg->bins (memoize reg->bins*))

(defn get-spans
[^BAMIndex bai ^long ref-idx ^long beg ^long end]
(let [bins (reg->bins beg end)
bidx (get (.bidx bai) ref-idx)
lidx (get (.lidx bai) ref-idx)
chunks (into [] (comp (map bidx) cat) bins)
lin-beg (writer/pos->lidx-offset beg)
min-offset (get lidx lin-beg 0)]
(->> (chunk/optimize-chunks chunks min-offset)
(map vals))))

(defn get-unplaced-spans
[^BAMIndex bai]
(if-let [begin (some->>
Expand All @@ -74,8 +47,14 @@
[[begin Long/MAX_VALUE]]
[]))

(defn get-spans
[^BAMIndex bai ^long ref-idx ^long beg ^long end]
(util-bin/get-spans bai ref-idx beg end))


;; ## Writing


(defn ^BAIWriter writer
[f refs]
(BAIWriter. (DataOutputStream. (FileOutputStream. (cio/file f)))
Expand Down
12 changes: 6 additions & 6 deletions src/cljam/io/bam_index/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
(:require [clojure.java.io :as cio]
[cljam.io.util.lsb :as lsb]
[cljam.io.bam-index.common :refer [bai-magic]]
[cljam.io.bam-index.chunk :as chunk]
[cljam.io.util.chunk :as chunk]
[cljam.util :as util])
(:import java.util.Arrays
[java.io FileInputStream Closeable IOException]
[cljam.io.bam_index.chunk Chunk]))
[cljam.io.util.chunk Chunk]))

(deftype BAIReader [url reader]
Closeable
Expand Down Expand Up @@ -49,10 +49,10 @@
(defn- read-chunks!
[rdr]
(let [n (lsb/read-int rdr)]
(loop [i 0, chunks []]
(if (< i n)
(recur (inc i) (conj chunks (Chunk. (lsb/read-long rdr) (lsb/read-long rdr))))
chunks))))
(loop [i 0, chunks []]
(if (< i n)
(recur (inc i) (conj chunks (Chunk. (lsb/read-long rdr) (lsb/read-long rdr))))
chunks))))

(defn- read-bin-index**!
[rdr]
Expand Down
18 changes: 7 additions & 11 deletions src/cljam/io/bam_index/writer.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
[cljam.io.sam.util :as sam-util]
[cljam.io.util.bgzf :as bgzf]
[cljam.io.util.lsb :as lsb]
[cljam.io.util.bin :as util-bin]
[cljam.io.bam-index.common :refer :all]
[cljam.io.bam-index.chunk :as chunk]
[cljam.io.util.chunk :as chunk]
[cljam.io.bam.decoder :as bam-decoder])
(:import [java.io DataOutputStream Closeable]
[java.nio ByteBuffer ByteOrder]
[cljam.io.bam.decoder BAMPointerBlock]
[cljam.io.bam_index.chunk Chunk]))
[cljam.io.util.chunk Chunk]))

(declare make-index-from-blocks)

Expand All @@ -22,18 +23,13 @@
(close [this]
(.close writer)))

;; Indexing
;; --------

(defn pos->lidx-offset
[^long pos]
(bit-shift-right (if (<= pos 0) 0 (dec pos)) linear-index-shift))

;; ### Intermediate data definitions
;;
;; Use record for performance.
;; Record is faster than map for retrieving elements.


(defrecord MetaData [^long first-offset ^long last-offset ^long aligned-alns ^long unaligned-alns])

(defrecord IndexStatus [^MetaData meta-data bin-index linear-index])
Expand Down Expand Up @@ -95,11 +91,11 @@
aln-beg (.pos aln)
aln-end (.end aln)
win-beg (if (zero? aln-end)
(pos->lidx-offset (dec aln-beg))
(pos->lidx-offset aln-beg))
(util-bin/pos->lidx-offset (dec aln-beg) linear-index-shift)
(util-bin/pos->lidx-offset aln-beg linear-index-shift))
win-end (if (zero? aln-end)
win-beg
(pos->lidx-offset aln-end))
(util-bin/pos->lidx-offset aln-end linear-index-shift))
min* (fn [x]
(if x (min x beg) beg))]
(loop [i win-beg, ret linear-index]
Expand Down
91 changes: 53 additions & 38 deletions src/cljam/io/tabix.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,41 +2,59 @@
"Alpha - subject to change.
Reader of a TABIX format file."
(:require [cljam.io.util.bgzf :as bgzf]
[cljam.io.util.lsb :as lsb])
[cljam.io.util.lsb :as lsb]
[cljam.io.util.bin :as util-bin]
[clojure.string :as cstr])
(:import java.util.Arrays
[java.io DataInputStream IOException]))
[java.io DataInputStream IOException]
[cljam.io.util.chunk Chunk]))

(def ^:const linear-index-shift 14)

(deftype Tabix [n-ref preset sc bc ec meta skip seq bidx lidx]
util-bin/IBinaryIndex
(get-chunks [this ref-idx bins]
(into [] (mapcat (get (.bidx this) ref-idx) bins)))
(get-min-offset [this ref-idx beg]
(get (get (.lidx this) ref-idx)
(util-bin/pos->lidx-offset beg linear-index-shift) 0))
(get-ref-index [this chr]
(.indexOf
^clojure.lang.PersistentVector
(.seq this)
chr)))

(def tabix-magic "TBI\1")

(defn- read-chunks!
[rdr]
(->> #(Chunk. (lsb/read-long rdr) (lsb/read-long rdr))
(repeatedly (lsb/read-int rdr))
vec))

(defn- read-seq
[buf len]
(loop [i 0, j 0, seq* []]
(if (< i len)
(if (zero? (nth buf i))
(let [ba-len (- i j)
ba (byte-array ba-len)]
(System/arraycopy buf j ba 0 ba-len)
(recur (inc i) (inc i) (conj seq* (String. ba))))
(recur (inc i) j seq*))
seq*)))
[^bytes buf]
(cstr/split (String. buf) #"\00"))

(defn- read-chunk
[^DataInputStream rdr]
{:beg (lsb/read-long rdr)
:end (lsb/read-long rdr)})
(defn- read-bin-index
[rdr]
(->> #(hash-map
:bin (lsb/read-int rdr)
:chunks (read-chunks! rdr))
(repeatedly (lsb/read-int rdr))
vec))

(defn- read-bin
[^DataInputStream rdr]
(let [bin (lsb/read-int rdr)
n-chunk (lsb/read-int rdr)]
{:bin bin
:chunks (doall (map (fn [_] (read-chunk rdr)) (range n-chunk)))}))
(defn- read-linear-index
[rdr]
(->> #(lsb/read-long rdr)
(repeatedly (lsb/read-int rdr))
vec))

(defn- read-index*
[^DataInputStream rdr]
(when-not (Arrays/equals ^bytes (lsb/read-bytes rdr 4) (.getBytes ^String tabix-magic))
(throw (IOException. "Invalid TABIX file")))
(let [n-seq (lsb/read-int rdr)
(let [n-ref (lsb/read-int rdr)
preset (lsb/read-int rdr)
sc (lsb/read-int rdr)
bc (lsb/read-int rdr)
Expand All @@ -45,21 +63,18 @@
skip (lsb/read-int rdr)
len (lsb/read-int rdr)
buf (lsb/read-bytes rdr len)
seq (read-seq buf len)
index (loop [i 0
bin-index []
linear-index []]
(if (< i n-seq)
(let [n-bin (lsb/read-int rdr)
new-bin-index (doall (map (fn [_] (read-bin rdr)) (range n-bin)))
n-linear-index (lsb/read-int rdr)
new-linear-index (doall (map (fn [_] (lsb/read-long rdr)) (range n-linear-index)))]
(recur (inc i)
(conj bin-index new-bin-index)
(conj linear-index new-linear-index)))
[bin-index linear-index]))]
{:n-seq n-seq, :preset preset, :sc sc, :bc bc, :ec ec, :meta meta,
:skip skip, :seq seq ,:bin-index (first index), :linear-index (last index)}))
seq (read-seq buf)
refs (range n-ref)
all-idx (map (fn [_] [(read-bin-index rdr) (read-linear-index rdr)]) refs)
bidx-seq (map first all-idx)
bidx (zipmap
refs
(map (fn [bins]
(into {} (map (juxt :bin :chunks)) bins))
bidx-seq))
lidx (zipmap refs (map second all-idx))]
(->Tabix n-ref preset sc bc ec meta
skip seq bidx lidx)))

(defn read-index
[f]
Expand Down
44 changes: 44 additions & 0 deletions src/cljam/io/util/bin.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
(ns cljam.io.util.bin
(:require [cljam.io.util.chunk :as util-chunk]))

(def ^:const linear-index-shift 14)

(defn- reg->bins*
"Returns candidate bins for the specified region as a vector."
[^long beg ^long end]
(let [max-pos 0x1FFFFFFF
beg (if (<= beg 0) 0 (bit-and (dec beg) max-pos))
end (if (<= end 0) max-pos (bit-and (dec end) max-pos))]
(if (<= beg end)
(loop [bins (transient [0])
xs [[1 26] [9 23] [73 20] [585 17] [4681 14]]]
(if-let [[^long ini shift] (first xs)]
(let [ini* (+ ini (bit-shift-right beg shift))
end* (+ ini (bit-shift-right end shift))]
(recur
(loop [b bins k ini*]
(if (<= k end*)
(recur (conj! b k) (inc k))
b))
(next xs)))
(persistent! bins))))))

(def ^:private reg->bins (memoize reg->bins*))

(defprotocol IBinaryIndex
(get-chunks [this ref-idx bins])
(get-min-offset [this ref-idx beg])
(get-ref-index [this chr]))

(defn pos->lidx-offset
[^long pos ^long linear-index-shift]
(bit-shift-right (if (<= pos 0) 0 (dec pos)) linear-index-shift))

(defn get-spans
"Calculate span information for random access from index data such as tabix."
[index-data ^long ref-idx ^long beg ^long end]
(let [bins (reg->bins beg end)
chunks (get-chunks index-data ref-idx bins)
min-offset (get-min-offset index-data ref-idx beg)]
(->> (util-chunk/optimize-chunks chunks min-offset)
(map vals))))
14 changes: 7 additions & 7 deletions src/cljam/io/bam_index/chunk.clj → src/cljam/io/util/chunk.clj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(ns cljam.io.bam-index.chunk
(ns cljam.io.util.chunk
(:refer-clojure :exclude [compare])
(:require [cljam.io.util.bgzf :as bgzf]))

Expand Down Expand Up @@ -52,10 +52,10 @@
ret (transient [])]
(if f
(cond
(<= (.end f) min-offset) (recur r last-chunk ret)
(nil? last-chunk) (recur r f (conj! ret f))
(and (not (overlap? last-chunk f))
(not (adjacent? last-chunk f))) (recur r f (conj! ret f))
(> (.end f) (.end last-chunk)) (let [l (assoc last-chunk :end (.end f))] (recur r l (conj! (pop! ret) l)))
:else (recur r last-chunk ret))
(<= (.end f) min-offset) (recur r last-chunk ret)
(nil? last-chunk) (recur r f (conj! ret f))
(and (not (overlap? last-chunk f))
(not (adjacent? last-chunk f))) (recur r f (conj! ret f))
(> (.end f) (.end last-chunk)) (let [l (assoc last-chunk :end (.end f))] (recur r l (conj! (pop! ret) l)))
:else (recur r last-chunk ret))
(persistent! ret)))))
Loading

0 comments on commit 5d9ea2d

Please sign in to comment.