From 21bd22e3c7e56e7ad736ead2635002fdcf0cac56 Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Wed, 17 Apr 2024 15:38:54 +0900 Subject: [PATCH] Support reading CRAM files with embedded references --- src/cljam/io/cram/decode/record.clj | 3 +-- src/cljam/io/cram/reader.clj | 27 +++++++++++++++++++-- src/cljam/io/cram/seq_resolver.clj | 7 +++--- src/cljam/io/cram/seq_resolver/protocol.clj | 6 ++--- test/cljam/io/cram/decode/record_test.clj | 4 +-- test/cljam/io/cram/seq_resolver_test.clj | 19 ++++++--------- 6 files changed, 43 insertions(+), 23 deletions(-) diff --git a/src/cljam/io/cram/decode/record.clj b/src/cljam/io/cram/decode/record.clj index 38e2b6cc..bca78490 100644 --- a/src/cljam/io/cram/decode/record.clj +++ b/src/cljam/io/cram/decode/record.clj @@ -98,8 +98,7 @@ (defn- record-seq [seq-resolver {:keys [preservation-map]} {:keys [rname pos end] :as record} features] - (let [region {:chr rname :start pos :end end} - ref-bases (.getBytes ^String (resolver/resolve-sequence seq-resolver region)) + (let [^bytes ref-bases (resolver/resolve-sequence seq-resolver rname pos end) len (long (::len record)) bs (byte-array len (byte (int \N))) subst (:SM preservation-map)] diff --git a/src/cljam/io/cram/reader.clj b/src/cljam/io/cram/reader.clj index e9d09e5d..1c5d8500 100644 --- a/src/cljam/io/cram/reader.clj +++ b/src/cljam/io/cram/reader.clj @@ -2,11 +2,14 @@ (:require [cljam.io.cram.decode.data-series :as ds] [cljam.io.cram.decode.record :as record] [cljam.io.cram.decode.structure :as struct] + [cljam.io.cram.seq-resolver.protocol :as resolver] [cljam.io.protocols :as protocols] + [cljam.io.util.byte-buffer :as bb] [cljam.util.intervals :as intervals]) (:import [java.io Closeable FileNotFoundException] [java.nio Buffer ByteBuffer ByteOrder] - [java.nio.channels FileChannel FileChannel$MapMode])) + [java.nio.channels FileChannel FileChannel$MapMode] + [java.util Arrays])) (declare read-alignments read-alignments-in-region) @@ -62,13 +65,33 @@ (read-to-buffer rdr 26) (struct/decode-file-definition (.-buffer rdr))) +(defn- seq-resolver-for-slice [^CRAMReader rdr slice-header blocks] + (let [embedded-ref-content-id (long (:embedded-reference slice-header))] + (if (pos? embedded-ref-content-id) + (let [ref-bases-block (when (pos? embedded-ref-content-id) + (->> blocks + (filter #(= (:content-id %) embedded-ref-content-id)) + first)) + offset (long (:start slice-header)) + ^bytes bs (bb/read-bytes (:data ref-bases-block) + (:raw-size ref-bases-block))] + (reify resolver/ISeqResolver + ;; According to the CRAM specification v3.1 ยง8.5, a slice with an embedded + ;; reference must not be a multiple reference slice, so the temporary + ;; sequence resolver here can safely ignore the passed chr + (resolve-sequence [_ _chr start end] + (Arrays/copyOfRange bs + (- (long start) offset) + (inc (- (long end) offset)))))) + (.-seq-resolver rdr)))) + (defn- read-slice-records [^CRAMReader rdr bb compression-header] (let [slice-header (struct/decode-slice-header-block bb) blocks (into [] (map (fn [_] (struct/decode-block bb))) (range (:blocks slice-header))) ds-decoders (ds/build-data-series-decoders compression-header blocks) tag-decoders (ds/build-tag-decoders compression-header blocks)] - (record/decode-slice-records (.-seq-resolver rdr) + (record/decode-slice-records (seq-resolver-for-slice rdr slice-header blocks) @(.-header rdr) compression-header slice-header diff --git a/src/cljam/io/cram/seq_resolver.clj b/src/cljam/io/cram/seq_resolver.clj index 3dee9f99..0fa95ead 100644 --- a/src/cljam/io/cram/seq_resolver.clj +++ b/src/cljam/io/cram/seq_resolver.clj @@ -8,11 +8,12 @@ (close [_] (.close ^Closeable seq-reader)) proto/ISeqResolver - (resolve-sequence [_ region] - (cseq/read-sequence seq-reader region))) + (resolve-sequence [_ chr start end] + (when-let [s (cseq/read-sequence seq-reader {:chr chr :start start :end end})] + (.getBytes ^String s)))) (defn seq-resolver - "Creates e new sequence resolver from the given sequence file." + "Creates a new sequence resolver from the given sequence file." [seq-file] (->SeqResolver (cseq/reader seq-file))) diff --git a/src/cljam/io/cram/seq_resolver/protocol.clj b/src/cljam/io/cram/seq_resolver/protocol.clj index ddebb14d..f9bf4829 100644 --- a/src/cljam/io/cram/seq_resolver/protocol.clj +++ b/src/cljam/io/cram/seq_resolver/protocol.clj @@ -1,11 +1,11 @@ (ns cljam.io.cram.seq-resolver.protocol) (defprotocol ISeqResolver - (resolve-sequence [this region])) + (resolve-sequence [this chr start end])) (extend-protocol ISeqResolver nil - (resolve-sequence [_ region] + (resolve-sequence [_ chr start end] (throw (ex-info "reference was not specified, but tried to resolve sequence" - region)))) + {:chr chr :start start :end end})))) diff --git a/test/cljam/io/cram/decode/record_test.clj b/test/cljam/io/cram/decode/record_test.clj index fd214b7e..eefbf031 100644 --- a/test/cljam/io/cram/decode/record_test.clj +++ b/test/cljam/io/cram/decode/record_test.clj @@ -48,9 +48,9 @@ (let [seqs (with-open [r (cseq/reader common/test-fa-file)] (into {} (map (juxt :name :sequence)) (cseq/read-all-sequences r)))] (reify resolver/ISeqResolver - (resolve-sequence [_ {:keys [chr start end]}] + (resolve-sequence [_ chr start end] (let [s (get seqs chr)] - (subs s (dec (long start)) end)))))) + (.getBytes (subs s (dec (long start)) end))))))) (deftest record-end-test (are [?record ?features ?expected] diff --git a/test/cljam/io/cram/seq_resolver_test.clj b/test/cljam/io/cram/seq_resolver_test.clj index 25cffb13..984d108d 100644 --- a/test/cljam/io/cram/seq_resolver_test.clj +++ b/test/cljam/io/cram/seq_resolver_test.clj @@ -2,20 +2,17 @@ (:require [cljam.io.cram.seq-resolver :as resolver] [cljam.io.cram.seq-resolver.protocol :as proto] [cljam.test-common :as common] - [clojure.test :refer [are deftest is testing]])) + [clojure.test :refer [deftest is testing]])) (deftest resolve-sequence-test (testing "seq resolver resolves sequence for specified region" (let [resolver (resolver/seq-resolver common/test-fa-file) resolver' (resolver/clone-seq-resolver resolver)] - (are [?region ?expected] - (= ?expected - (proto/resolve-sequence resolver ?region) - (proto/resolve-sequence resolver' ?region)) - {:chr "ref" :start 1 :end 5} - "AGCAT" - - {:chr "unknown" :start 1 :end 5} - nil))) + (is (= "AGCAT" + (String. (proto/resolve-sequence resolver "ref" 1 5)) + (String. (proto/resolve-sequence resolver' "ref" 1 5)))) + (is (= nil + (proto/resolve-sequence resolver "unknown" 1 5) + (proto/resolve-sequence resolver' "unknown" 1 5))))) (testing "resolver can be omitted, but in that case, trying to resolve seq will end up with error" - (is (thrown? Exception (proto/resolve-sequence nil {:chr "ref" :start 1 :end 5}))))) + (is (thrown? Exception (proto/resolve-sequence nil "ref" 1 5)))))