Skip to content

Commit

Permalink
Support reading CRAM files with embedded references
Browse files Browse the repository at this point in the history
  • Loading branch information
athos committed Apr 24, 2024
1 parent 1dd64cd commit 21bd22e
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 23 deletions.
3 changes: 1 addition & 2 deletions src/cljam/io/cram/decode/record.clj
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,7 @@

(defn- record-seq
[seq-resolver {:keys [preservation-map]} {:keys [rname pos end] :as record} features]
(let [region {:chr rname :start pos :end end}
ref-bases (.getBytes ^String (resolver/resolve-sequence seq-resolver region))
(let [^bytes ref-bases (resolver/resolve-sequence seq-resolver rname pos end)
len (long (::len record))
bs (byte-array len (byte (int \N)))
subst (:SM preservation-map)]
Expand Down
27 changes: 25 additions & 2 deletions src/cljam/io/cram/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
(:require [cljam.io.cram.decode.data-series :as ds]
[cljam.io.cram.decode.record :as record]
[cljam.io.cram.decode.structure :as struct]
[cljam.io.cram.seq-resolver.protocol :as resolver]
[cljam.io.protocols :as protocols]
[cljam.io.util.byte-buffer :as bb]
[cljam.util.intervals :as intervals])
(:import [java.io Closeable FileNotFoundException]
[java.nio Buffer ByteBuffer ByteOrder]
[java.nio.channels FileChannel FileChannel$MapMode]))
[java.nio.channels FileChannel FileChannel$MapMode]
[java.util Arrays]))

(declare read-alignments read-alignments-in-region)

Expand Down Expand Up @@ -62,13 +65,33 @@
(read-to-buffer rdr 26)
(struct/decode-file-definition (.-buffer rdr)))

(defn- seq-resolver-for-slice [^CRAMReader rdr slice-header blocks]
(let [embedded-ref-content-id (long (:embedded-reference slice-header))]
(if (pos? embedded-ref-content-id)
(let [ref-bases-block (when (pos? embedded-ref-content-id)
(->> blocks
(filter #(= (:content-id %) embedded-ref-content-id))
first))
offset (long (:start slice-header))
^bytes bs (bb/read-bytes (:data ref-bases-block)
(:raw-size ref-bases-block))]
(reify resolver/ISeqResolver
;; According to the CRAM specification v3.1 §8.5, a slice with an embedded
;; reference must not be a multiple reference slice, so the temporary
;; sequence resolver here can safely ignore the passed chr
(resolve-sequence [_ _chr start end]
(Arrays/copyOfRange bs
(- (long start) offset)
(inc (- (long end) offset))))))
(.-seq-resolver rdr))))

(defn- read-slice-records [^CRAMReader rdr bb compression-header]
(let [slice-header (struct/decode-slice-header-block bb)
blocks (into [] (map (fn [_] (struct/decode-block bb)))
(range (:blocks slice-header)))
ds-decoders (ds/build-data-series-decoders compression-header blocks)
tag-decoders (ds/build-tag-decoders compression-header blocks)]
(record/decode-slice-records (.-seq-resolver rdr)
(record/decode-slice-records (seq-resolver-for-slice rdr slice-header blocks)
@(.-header rdr)
compression-header
slice-header
Expand Down
7 changes: 4 additions & 3 deletions src/cljam/io/cram/seq_resolver.clj
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
(close [_]
(.close ^Closeable seq-reader))
proto/ISeqResolver
(resolve-sequence [_ region]
(cseq/read-sequence seq-reader region)))
(resolve-sequence [_ chr start end]
(when-let [s (cseq/read-sequence seq-reader {:chr chr :start start :end end})]
(.getBytes ^String s))))

(defn seq-resolver
"Creates e new sequence resolver from the given sequence file."
"Creates a new sequence resolver from the given sequence file."
[seq-file]
(->SeqResolver (cseq/reader seq-file)))

Expand Down
6 changes: 3 additions & 3 deletions src/cljam/io/cram/seq_resolver/protocol.clj
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
(ns cljam.io.cram.seq-resolver.protocol)

(defprotocol ISeqResolver
(resolve-sequence [this region]))
(resolve-sequence [this chr start end]))

(extend-protocol ISeqResolver
nil
(resolve-sequence [_ region]
(resolve-sequence [_ chr start end]
(throw
(ex-info "reference was not specified, but tried to resolve sequence"
region))))
{:chr chr :start start :end end}))))
4 changes: 2 additions & 2 deletions test/cljam/io/cram/decode/record_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@
(let [seqs (with-open [r (cseq/reader common/test-fa-file)]
(into {} (map (juxt :name :sequence)) (cseq/read-all-sequences r)))]
(reify resolver/ISeqResolver
(resolve-sequence [_ {:keys [chr start end]}]
(resolve-sequence [_ chr start end]
(let [s (get seqs chr)]
(subs s (dec (long start)) end))))))
(.getBytes (subs s (dec (long start)) end)))))))

(deftest record-end-test
(are [?record ?features ?expected]
Expand Down
19 changes: 8 additions & 11 deletions test/cljam/io/cram/seq_resolver_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,17 @@
(:require [cljam.io.cram.seq-resolver :as resolver]
[cljam.io.cram.seq-resolver.protocol :as proto]
[cljam.test-common :as common]
[clojure.test :refer [are deftest is testing]]))
[clojure.test :refer [deftest is testing]]))

(deftest resolve-sequence-test
(testing "seq resolver resolves sequence for specified region"
(let [resolver (resolver/seq-resolver common/test-fa-file)
resolver' (resolver/clone-seq-resolver resolver)]
(are [?region ?expected]
(= ?expected
(proto/resolve-sequence resolver ?region)
(proto/resolve-sequence resolver' ?region))
{:chr "ref" :start 1 :end 5}
"AGCAT"

{:chr "unknown" :start 1 :end 5}
nil)))
(is (= "AGCAT"
(String. (proto/resolve-sequence resolver "ref" 1 5))
(String. (proto/resolve-sequence resolver' "ref" 1 5))))
(is (= nil
(proto/resolve-sequence resolver "unknown" 1 5)
(proto/resolve-sequence resolver' "unknown" 1 5)))))
(testing "resolver can be omitted, but in that case, trying to resolve seq will end up with error"
(is (thrown? Exception (proto/resolve-sequence nil {:chr "ref" :start 1 :end 5})))))
(is (thrown? Exception (proto/resolve-sequence nil "ref" 1 5)))))

0 comments on commit 21bd22e

Please sign in to comment.