diff --git a/src/cljam/io/cram/core.clj b/src/cljam/io/cram/core.clj index 2e3322d0..c7dc75ff 100644 --- a/src/cljam/io/cram/core.clj +++ b/src/cljam/io/cram/core.clj @@ -1,15 +1,29 @@ (ns cljam.io.cram.core (:require [cljam.io.crai :as crai] - [cljam.io.cram.seq-resolver :as resolver] [cljam.io.cram.reader :as reader.core] + [cljam.io.cram.seq-resolver :as resolver] [cljam.io.sam.util.refs :as util.refs] [cljam.io.util.byte-buffer :as bb] [cljam.util :as util] - [clojure.java.io :as cio]) + [clojure.java.io :as cio] + [clojure.string :as str]) (:import [cljam.io.cram.reader CRAMReader] + [java.net URL] [java.nio.channels FileChannel] [java.nio.file OpenOption StandardOpenOption])) +(defn- qname-generator [^URL url] + (let [prefix (-> (.getPath url) + (str/replace #"^.*/" "") + (str/replace #"[^!-?A-~]+" "_")) + len (inc (count prefix))] + (fn [i] + (let [digits (str (inc (long i))) + len' (+ len (count digits))] + (cond-> (str prefix \: digits) + (> len' 254) + (subs (- len' 254) len')))))) + (defn reader "Creates a new CRAM reader that reads a CRAM file f. @@ -22,10 +36,11 @@ (into-array OpenOption [StandardOpenOption/READ])) bb (bb/allocate-lsb-byte-buffer 256) seq-resolver (some-> reference resolver/seq-resolver) + qname-gen (qname-generator url) header (volatile! nil) refs (delay (util.refs/make-refs @header)) idx (delay (crai/read-index (str f ".crai") @refs)) - rdr (reader.core/->CRAMReader url ch bb header refs idx seq-resolver)] + rdr (reader.core/->CRAMReader url ch bb header refs idx seq-resolver qname-gen)] (reader.core/read-file-definition rdr) (vreset! header (reader.core/read-header rdr)) rdr)) @@ -43,7 +58,8 @@ (delay @(.-header rdr)) (delay @(.-refs rdr)) (delay @(.-index rdr)) - seq-resolver)] + seq-resolver + (.-qname-generator rdr))] (reader.core/read-file-definition rdr') (reader.core/skip-container rdr') rdr')) diff --git a/src/cljam/io/cram/decode/record.clj b/src/cljam/io/cram/decode/record.clj index 38e2b6cc..14bb2ee4 100644 --- a/src/cljam/io/cram/decode/record.clj +++ b/src/cljam/io/cram/decode/record.clj @@ -39,10 +39,10 @@ (defn- build-read-name-decoder [{:keys [preservation-map]} {:keys [RN]}] (if (:RN preservation-map) #(assoc % :qname (String. ^bytes (RN))) - (throw (ex-info "Omitted read names are not supported yet." {})))) + identity)) -(defn- build-mate-read-decoder [cram-header {:keys [MF NS NP TS NF]}] - (fn [{:keys [rname] :as record}] +(defn- build-mate-read-decoder [cram-header {:keys [MF NS RN NP TS NF]}] + (fn [{:keys [rname qname] :as record}] (let [flag (long (::flag record))] (if (pos? (bit-and flag 0x02)) (let [mate-flag (long (MF)) @@ -54,6 +54,7 @@ (bit-or (sam.flag/encoded #{:next-unmapped}))) rnext (ref-name cram-header (NS))] (assoc record + :qname (or qname (String. ^bytes (RN))) :flag bam-flag :rnext (if (and (= rname rnext) (not= rname "*")) "=" rnext) :pnext (NP) @@ -338,9 +339,23 @@ (aset records i record') (aset records j mate')))))) +(defn- assign-record-names [qname-generator slice-header ^objects records] + (let [counter-start (long (:counter slice-header))] + (dotimes [i (alength records)] + (let [record (aget records i)] + (when (nil? (:qname record)) + ;; detached records always have qname, so if the control comes in here, + ;; it means this record is not detached and must have ::next-fragment + (let [j (inc (+ i (long (::next-fragment record)))) + mate (aget records j) + qname (qname-generator (+ counter-start i))] + (aset records i (assoc record :qname qname)) + (aset records j (assoc mate :qname qname)))))))) + (defn decode-slice-records "Decodes CRAM records in a slice all at once and returns them as a sequence of maps." - [seq-resolver cram-header compression-header slice-header ds-decoders tag-decoders] + [seq-resolver qname-generator cram-header compression-header slice-header + ds-decoders tag-decoders] (let [record-decoder (build-cram-record-decoder seq-resolver cram-header compression-header @@ -352,4 +367,6 @@ (dotimes [i n] (aset records i (record-decoder))) (resolve-mate-records records) + (when-not (get-in compression-header [:preservation-map :RN]) + (assign-record-names qname-generator slice-header records)) (map #(dissoc % ::flag ::len ::next-fragment) records))) diff --git a/src/cljam/io/cram/reader.clj b/src/cljam/io/cram/reader.clj index e9d09e5d..180fc07d 100644 --- a/src/cljam/io/cram/reader.clj +++ b/src/cljam/io/cram/reader.clj @@ -10,7 +10,7 @@ (declare read-alignments read-alignments-in-region) -(deftype CRAMReader [url channel buffer header refs index seq-resolver] +(deftype CRAMReader [url channel buffer header refs index seq-resolver qname-generator] Closeable (close [_] (when seq-resolver @@ -69,6 +69,7 @@ ds-decoders (ds/build-data-series-decoders compression-header blocks) tag-decoders (ds/build-tag-decoders compression-header blocks)] (record/decode-slice-records (.-seq-resolver rdr) + (.-qname-generator rdr) @(.-header rdr) compression-header slice-header diff --git a/test/cljam/io/cram/decode/record_test.clj b/test/cljam/io/cram/decode/record_test.clj index fd214b7e..c8bfc4cb 100644 --- a/test/cljam/io/cram/decode/record_test.clj +++ b/test/cljam/io/cram/decode/record_test.clj @@ -426,6 +426,7 @@ :cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE" :options []}] (record/decode-slice-records test-seq-resolver + nil cram-header compression-header slice-header @@ -483,6 +484,73 @@ :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD" :options []}] (record/decode-slice-records test-seq-resolver + nil + cram-header + compression-header + slice-header + ds-decoders + tag-decoders))))) + (testing "unnamed reads" + (let [cram-header {:SQ + [{:SN "ref"} + {:SN "ref2"}]} + compression-header {:preservation-map + {:RN false + :AP true + :RR true + :SM {\A {0 \C, 1 \G, 2 \T, 3 \N} + \C {0 \A, 1 \G, 2 \T, 3 \N} + \G {0 \A, 1 \C, 2 \T, 3 \N} + \T {0 \A, 1 \C, 2 \G, 3 \N} + \N {0 \A, 1 \C, 2 \G, 3 \T}} + :TD [[]]}} + slice-header {:ref-seq-id 0 + :start 10 + :records 7 + :counter 100} + ds-decoders (build-stub-decoders + {:BF [67 67 147 147 147 67 0] + :CF [5 5 1 1 3 3 3] + :RL [5 5 5 5 5 5 5] + :AP [0 0 20 0 0 0 0] + :RG [-1 -1 -1 -1 -1 -1 -1] + :RN (->> [nil nil nil nil "q003" "q004" "q005"] + (keep #(some-> ^String % .getBytes))) + :MF [nil nil nil nil 0 1 2] + :NF [1 1 nil nil nil nil nil] + :NS [nil nil nil nil 0 1 -1] + :NP [nil nil nil nil 50 100 0] + :TS [nil nil nil nil 25 0 0] + :TL [0 0 0 0 0 0 0] + :FN [0 0 0 0 0 0 0] + :QS (->> (repeat 7 "#####") + (mapcat #(.getBytes ^String %)) + (map #(- (long %) 33))) + :MQ [40 40 40 40 40 40 40]}) + tag-decoders (build-stub-tag-decoders {})] + (is (= [{:qname "gen:101", :flag 99, :rname "ref", :pos 10, :end 14, :mapq 40, + :cigar "5M", :rnext "=", :pnext 30, :tlen 25, :seq "GATAA", :qual "#####" + :options []} + {:qname "gen:102", :flag 99, :rname "ref", :pos 10, :end 14, :mapq 40, + :cigar "5M", :rnext "=", :pnext 30, :tlen 25, :seq "GATAA", :qual "#####" + :options []} + {:qname "gen:101", :flag 147, :rname "ref", :pos 30, :end 34, :mapq 40, + :cigar "5M", :rnext "=", :pnext 10, :tlen -25, :seq "AGGCA", :qual "#####" + :options []} + {:qname "gen:102", :flag 147, :rname "ref", :pos 30, :end 34, :mapq 40, + :cigar "5M", :rnext "=", :pnext 10, :tlen -25, :seq "AGGCA", :qual "#####" + :options []} + {:qname "q003", :flag 147, :rname "ref", :pos 30, :end 34, :mapq 40, + :cigar "5M", :rnext "=", :pnext 50, :tlen 25, :seq "AGGCA", :qual "#####" + :options []} + {:qname "q004", :flag 99, :rname "ref", :pos 30, :end 34, :mapq 40, + :cigar "5M", :rnext "ref2", :pnext 100, :tlen 0, :seq "AGGCA", :qual "#####" + :options []} + {:qname "q005", :flag 8, :rname "ref", :pos 30, :end 34, :mapq 40, + :cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "AGGCA", :qual "#####" + :options []}] + (record/decode-slice-records test-seq-resolver + #(str "gen:" (inc %)) cram-header compression-header slice-header