Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for random reading of vcf file. #180

Merged
merged 5 commits into from
Nov 13, 2019
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/cljam/io/bam_index.clj
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
(ns cljam.io.bam-index
"Parser for a BAM index file."
(:require [cljam.io.bam-index.core :as bai-core]))
(:require
[cljam.io.bam-index.core :as bai-core]
[cljam.io.util.bin :as util-bin]))

(defn bin-index
"Returns binning index for the given reference index."
Expand All @@ -15,7 +17,7 @@
(defn get-spans
"Returns regions of a BAM file that may contain an alignment for the given range."
[bai ref-idx beg end]
(bai-core/get-spans bai ref-idx beg end))
(util-bin/get-spans bai ref-idx beg end))

(defn bam-index
"Returns a cljam.bam-index.core.BAMIndex."
Expand Down
57 changes: 18 additions & 39 deletions src/cljam/io/bam_index/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,22 @@
"The core of BAM index features."
(:require [clojure.java.io :as cio]
[clojure.tools.logging :as logging]
[cljam.io.bam-index [common :refer :all]
[chunk :as chunk]
[reader :as reader]
[writer :as writer]]
[cljam.util :as util])
[cljam.io.bam-index [reader :as reader]
[writer :as writer]
[common :as common]]
[cljam.util :as util]
[cljam.io.util.bin :as util-bin])
(:import [java.io DataOutputStream FileOutputStream]
cljam.io.bam_index.reader.BAIReader
cljam.io.bam_index.writer.BAIWriter))

(deftype BAMIndex [url bidx lidx])
(deftype BAMIndex [url bidx lidx]
util-bin/IBinaryIndex
(get-chunks [this ref-idx bins]
(into [] (mapcat (get (.bidx this) ref-idx) bins)))
(get-min-offset [this ref-idx beg]
(get (get (.lidx this) ref-idx)
(util-bin/pos->lidx-offset beg common/linear-index-shift) 0)))

(defn bam-index [f]
(let [{:keys [bidx lidx]} (with-open [r ^BAIReader (reader/reader f)] (reader/read-all-index! r))]
Expand All @@ -29,39 +35,6 @@
(with-open [r ^BAIReader (reader/reader f)]
(reader/read-linear-index! r ref-idx)))

(defn- reg->bins*
"Returns candidate bins for the specified region as a vector."
[^long beg ^long end]
(let [max-pos 0x1FFFFFFF
beg (if (<= beg 0) 0 (bit-and (dec beg) max-pos))
end (if (<= end 0) max-pos (bit-and (dec end) max-pos))]
(if (<= beg end)
(loop [bins (transient [0])
xs [[1 26] [9 23] [73 20] [585 17] [4681 14]]]
(if-let [[^long ini shift] (first xs)]
(let [ini* (+ ini (bit-shift-right beg shift))
end* (+ ini (bit-shift-right end shift))]
(recur
(loop [b bins k ini*]
(if (<= k end*)
(recur (conj! b k) (inc k))
b))
(next xs)))
(persistent! bins))))))

(def ^:private reg->bins (memoize reg->bins*))

(defn get-spans
[^BAMIndex bai ^long ref-idx ^long beg ^long end]
(let [bins (reg->bins beg end)
bidx (get (.bidx bai) ref-idx)
lidx (get (.lidx bai) ref-idx)
chunks (into [] (comp (map bidx) cat) bins)
lin-beg (writer/pos->lidx-offset beg)
min-offset (get lidx lin-beg 0)]
(->> (chunk/optimize-chunks chunks min-offset)
alumi marked this conversation as resolved.
Show resolved Hide resolved
(map vals))))

(defn get-unplaced-spans
[^BAMIndex bai]
(if-let [begin (some->>
Expand All @@ -74,8 +47,14 @@
[[begin Long/MAX_VALUE]]
[]))

(defn get-spans
[^BAMIndex bai ^long ref-idx ^long beg ^long end]
(util-bin/get-spans bai ref-idx beg end))


;; ## Writing


(defn ^BAIWriter writer
[f refs]
(BAIWriter. (DataOutputStream. (FileOutputStream. (cio/file f)))
Expand Down
12 changes: 6 additions & 6 deletions src/cljam/io/bam_index/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
(:require [clojure.java.io :as cio]
[cljam.io.util.lsb :as lsb]
[cljam.io.bam-index.common :refer [bai-magic]]
[cljam.io.bam-index.chunk :as chunk]
[cljam.io.util.chunk :as chunk]
[cljam.util :as util])
(:import java.util.Arrays
[java.io FileInputStream Closeable IOException]
[cljam.io.bam_index.chunk Chunk]))
[cljam.io.util.chunk Chunk]))

(deftype BAIReader [url reader]
Closeable
Expand Down Expand Up @@ -49,10 +49,10 @@
(defn- read-chunks!
[rdr]
(let [n (lsb/read-int rdr)]
(loop [i 0, chunks []]
(if (< i n)
(recur (inc i) (conj chunks (Chunk. (lsb/read-long rdr) (lsb/read-long rdr))))
chunks))))
(loop [i 0, chunks []]
(if (< i n)
(recur (inc i) (conj chunks (Chunk. (lsb/read-long rdr) (lsb/read-long rdr))))
chunks))))

(defn- read-bin-index**!
[rdr]
Expand Down
18 changes: 7 additions & 11 deletions src/cljam/io/bam_index/writer.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
[cljam.io.sam.util :as sam-util]
[cljam.io.util.bgzf :as bgzf]
[cljam.io.util.lsb :as lsb]
[cljam.io.util.bin :as util-bin]
[cljam.io.bam-index.common :refer :all]
[cljam.io.bam-index.chunk :as chunk]
[cljam.io.util.chunk :as chunk]
[cljam.io.bam.decoder :as bam-decoder])
(:import [java.io DataOutputStream Closeable]
[java.nio ByteBuffer ByteOrder]
[cljam.io.bam.decoder BAMPointerBlock]
[cljam.io.bam_index.chunk Chunk]))
[cljam.io.util.chunk Chunk]))

(declare make-index-from-blocks)

Expand All @@ -22,18 +23,13 @@
(close [this]
(.close writer)))

;; Indexing
;; --------

(defn pos->lidx-offset
[^long pos]
(bit-shift-right (if (<= pos 0) 0 (dec pos)) linear-index-shift))

;; ### Intermediate data definitions
;;
;; Use record for performance.
;; Record is faster than map for retrieving elements.


(defrecord MetaData [^long first-offset ^long last-offset ^long aligned-alns ^long unaligned-alns])

(defrecord IndexStatus [^MetaData meta-data bin-index linear-index])
Expand Down Expand Up @@ -95,11 +91,11 @@
aln-beg (.pos aln)
aln-end (.end aln)
win-beg (if (zero? aln-end)
(pos->lidx-offset (dec aln-beg))
(pos->lidx-offset aln-beg))
(util-bin/pos->lidx-offset (dec aln-beg) linear-index-shift)
(util-bin/pos->lidx-offset aln-beg linear-index-shift))
win-end (if (zero? aln-end)
win-beg
(pos->lidx-offset aln-end))
(util-bin/pos->lidx-offset aln-end linear-index-shift))
min* (fn [x]
(if x (min x beg) beg))]
(loop [i win-beg, ret linear-index]
Expand Down
91 changes: 53 additions & 38 deletions src/cljam/io/tabix.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,41 +2,59 @@
"Alpha - subject to change.
Reader of a TABIX format file."
(:require [cljam.io.util.bgzf :as bgzf]
[cljam.io.util.lsb :as lsb])
[cljam.io.util.lsb :as lsb]
[cljam.io.util.bin :as util-bin]
[clojure.string :as cstr])
(:import java.util.Arrays
[java.io DataInputStream IOException]))
[java.io DataInputStream IOException]
[cljam.io.util.chunk Chunk]))

(def ^:const linear-index-shift 14)

(deftype Tabix [n-ref preset sc bc ec meta skip seq bidx lidx]
util-bin/IBinaryIndex
(get-chunks [this ref-idx bins]
(into [] (mapcat (get (.bidx this) ref-idx) bins)))
(get-min-offset [this ref-idx beg]
(get (get (.lidx this) ref-idx)
(util-bin/pos->lidx-offset beg linear-index-shift) 0))
(get-ref-index [this chr]
(.indexOf
^clojure.lang.PersistentVector
(.seq this)
chr)))

(def tabix-magic "TBI\1")

(defn- read-chunks!
[rdr]
(->> #(Chunk. (lsb/read-long rdr) (lsb/read-long rdr))
(repeatedly (lsb/read-int rdr))
vec))

(defn- read-seq
[buf len]
(loop [i 0, j 0, seq* []]
(if (< i len)
(if (zero? (nth buf i))
(let [ba-len (- i j)
ba (byte-array ba-len)]
(System/arraycopy buf j ba 0 ba-len)
(recur (inc i) (inc i) (conj seq* (String. ba))))
(recur (inc i) j seq*))
seq*)))
[^bytes buf]
(cstr/split (String. buf) #"\00"))

(defn- read-chunk
[^DataInputStream rdr]
{:beg (lsb/read-long rdr)
:end (lsb/read-long rdr)})
(defn- read-bin-index
[rdr]
(->> #(hash-map
:bin (lsb/read-int rdr)
:chunks (read-chunks! rdr))
(repeatedly (lsb/read-int rdr))
vec))

(defn- read-bin
[^DataInputStream rdr]
(let [bin (lsb/read-int rdr)
n-chunk (lsb/read-int rdr)]
{:bin bin
:chunks (doall (map (fn [_] (read-chunk rdr)) (range n-chunk)))}))
(defn- read-linear-index
[rdr]
(->> #(lsb/read-long rdr)
(repeatedly (lsb/read-int rdr))
vec))

(defn- read-index*
[^DataInputStream rdr]
(when-not (Arrays/equals ^bytes (lsb/read-bytes rdr 4) (.getBytes ^String tabix-magic))
(throw (IOException. "Invalid TABIX file")))
(let [n-seq (lsb/read-int rdr)
(let [n-ref (lsb/read-int rdr)
preset (lsb/read-int rdr)
sc (lsb/read-int rdr)
bc (lsb/read-int rdr)
Expand All @@ -45,21 +63,18 @@
skip (lsb/read-int rdr)
len (lsb/read-int rdr)
buf (lsb/read-bytes rdr len)
seq (read-seq buf len)
index (loop [i 0
bin-index []
linear-index []]
(if (< i n-seq)
(let [n-bin (lsb/read-int rdr)
new-bin-index (doall (map (fn [_] (read-bin rdr)) (range n-bin)))
n-linear-index (lsb/read-int rdr)
new-linear-index (doall (map (fn [_] (lsb/read-long rdr)) (range n-linear-index)))]
(recur (inc i)
(conj bin-index new-bin-index)
(conj linear-index new-linear-index)))
[bin-index linear-index]))]
{:n-seq n-seq, :preset preset, :sc sc, :bc bc, :ec ec, :meta meta,
:skip skip, :seq seq ,:bin-index (first index), :linear-index (last index)}))
seq (read-seq buf)
refs (range n-ref)
all-idx (map (fn [_] [(read-bin-index rdr) (read-linear-index rdr)]) refs)
bidx-seq (map first all-idx)
bidx (zipmap
refs
(map (fn [bins]
(into {} (map (juxt :bin :chunks)) bins))
bidx-seq))
lidx (zipmap refs (map second all-idx))]
(->Tabix n-ref preset sc bc ec meta
skip seq bidx lidx)))

(defn read-index
[f]
Expand Down
44 changes: 44 additions & 0 deletions src/cljam/io/util/bin.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
(ns cljam.io.util.bin
(:require [cljam.io.util.chunk :as util-chunk]))

(def ^:const linear-index-shift 14)

(defn- reg->bins*
"Returns candidate bins for the specified region as a vector."
[^long beg ^long end]
(let [max-pos 0x1FFFFFFF
beg (if (<= beg 0) 0 (bit-and (dec beg) max-pos))
end (if (<= end 0) max-pos (bit-and (dec end) max-pos))]
(if (<= beg end)
(loop [bins (transient [0])
xs [[1 26] [9 23] [73 20] [585 17] [4681 14]]]
(if-let [[^long ini shift] (first xs)]
(let [ini* (+ ini (bit-shift-right beg shift))
end* (+ ini (bit-shift-right end shift))]
(recur
(loop [b bins k ini*]
(if (<= k end*)
(recur (conj! b k) (inc k))
b))
(next xs)))
(persistent! bins))))))

(def ^:private reg->bins (memoize reg->bins*))

(defprotocol IBinaryIndex
(get-chunks [this ref-idx bins])
(get-min-offset [this ref-idx beg])
(get-ref-index [this chr]))

(defn pos->lidx-offset
[^long pos ^long linear-index-shift]
(bit-shift-right (if (<= pos 0) 0 (dec pos)) linear-index-shift))

(defn get-spans
"Calculate span information for random access from ndex data such as tabix."
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"Calculate span information for random access from ndex data such as tabix."
"Calculate span information for random access from index data such as tabix."

[index-data ^long ref-idx ^long beg ^long end]
(let [bins (reg->bins beg end)
chunks (get-chunks index-data ref-idx bins)
min-offset (get-min-offset index-data ref-idx beg)]
(->> (util-chunk/optimize-chunks chunks min-offset)
(map vals))))
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(ns cljam.io.bam-index.chunk
(ns cljam.io.util.chunk
(:refer-clojure :exclude [compare])
(:require [cljam.io.util.bgzf :as bgzf]))

Expand Down Expand Up @@ -52,10 +52,10 @@
ret (transient [])]
(if f
(cond
(<= (.end f) min-offset) (recur r last-chunk ret)
(nil? last-chunk) (recur r f (conj! ret f))
(and (not (overlap? last-chunk f))
(not (adjacent? last-chunk f))) (recur r f (conj! ret f))
(> (.end f) (.end last-chunk)) (let [l (assoc last-chunk :end (.end f))] (recur r l (conj! (pop! ret) l)))
:else (recur r last-chunk ret))
(<= (.end f) min-offset) (recur r last-chunk ret)
(nil? last-chunk) (recur r f (conj! ret f))
(and (not (overlap? last-chunk f))
(not (adjacent? last-chunk f))) (recur r f (conj! ret f))
(> (.end f) (.end last-chunk)) (let [l (assoc last-chunk :end (.end f))] (recur r l (conj! (pop! ret) l)))
:else (recur r last-chunk ret))
(persistent! ret)))))
Loading