Skip to content

Commit

Permalink
Generate CRAM qname if it's absent
Browse files Browse the repository at this point in the history
  • Loading branch information
athos committed Apr 30, 2024
1 parent 1dd64cd commit d0143c8
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 9 deletions.
24 changes: 20 additions & 4 deletions src/cljam/io/cram/core.clj
Original file line number Diff line number Diff line change
@@ -1,15 +1,29 @@
(ns cljam.io.cram.core
(:require [cljam.io.crai :as crai]
[cljam.io.cram.seq-resolver :as resolver]
[cljam.io.cram.reader :as reader.core]
[cljam.io.cram.seq-resolver :as resolver]
[cljam.io.sam.util.refs :as util.refs]
[cljam.io.util.byte-buffer :as bb]
[cljam.util :as util]
[clojure.java.io :as cio])
[clojure.java.io :as cio]
[clojure.string :as str])
(:import [cljam.io.cram.reader CRAMReader]
[java.net URL]
[java.nio.channels FileChannel]
[java.nio.file OpenOption StandardOpenOption]))

(defn- qname-generator [^URL url]
(let [prefix (-> (.getPath url)
(str/replace #"^.*/" "")
(str/replace #"[^!-?A-~]+" "_"))
len (inc (count prefix))]
(fn [i]
(let [digits (str (inc (long i)))
len' (+ len (count digits))]
(cond-> (str prefix \: digits)
(> len' 254)
(subs (- len' 254) len'))))))

(defn reader
"Creates a new CRAM reader that reads a CRAM file f.
Expand All @@ -22,10 +36,11 @@
(into-array OpenOption [StandardOpenOption/READ]))
bb (bb/allocate-lsb-byte-buffer 256)
seq-resolver (some-> reference resolver/seq-resolver)
qname-gen (qname-generator url)
header (volatile! nil)
refs (delay (util.refs/make-refs @header))
idx (delay (crai/read-index (str f ".crai") @refs))
rdr (reader.core/->CRAMReader url ch bb header refs idx seq-resolver)]
rdr (reader.core/->CRAMReader url ch bb header refs idx seq-resolver qname-gen)]
(reader.core/read-file-definition rdr)
(vreset! header (reader.core/read-header rdr))
rdr))
Expand All @@ -43,7 +58,8 @@
(delay @(.-header rdr))
(delay @(.-refs rdr))
(delay @(.-index rdr))
seq-resolver)]
seq-resolver
(.-qname-generator rdr))]
(reader.core/read-file-definition rdr')
(reader.core/skip-container rdr')
rdr'))
25 changes: 21 additions & 4 deletions src/cljam/io/cram/decode/record.clj
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@
(defn- build-read-name-decoder [{:keys [preservation-map]} {:keys [RN]}]
(if (:RN preservation-map)
#(assoc % :qname (String. ^bytes (RN)))
(throw (ex-info "Omitted read names are not supported yet." {}))))
identity))

(defn- build-mate-read-decoder [cram-header {:keys [MF NS NP TS NF]}]
(fn [{:keys [rname] :as record}]
(defn- build-mate-read-decoder [cram-header {:keys [MF NS RN NP TS NF]}]
(fn [{:keys [rname qname] :as record}]
(let [flag (long (::flag record))]
(if (pos? (bit-and flag 0x02))
(let [mate-flag (long (MF))
Expand All @@ -54,6 +54,7 @@
(bit-or (sam.flag/encoded #{:next-unmapped})))
rnext (ref-name cram-header (NS))]
(assoc record
:qname (or qname (String. ^bytes (RN)))
:flag bam-flag
:rnext (if (and (= rname rnext) (not= rname "*")) "=" rnext)
:pnext (NP)
Expand Down Expand Up @@ -338,9 +339,23 @@
(aset records i record')
(aset records j mate'))))))

(defn- assign-record-names [qname-generator slice-header ^objects records]
(let [counter-start (long (:counter slice-header))]
(dotimes [i (alength records)]
(let [record (aget records i)]
(when (nil? (:qname record))
;; detached records always have qname, so if the control comes in here,
;; it means this record is not detached and must have ::next-fragment
(let [j (inc (+ i (long (::next-fragment record))))
mate (aget records j)
qname (qname-generator (+ counter-start i))]
(aset records i (assoc record :qname qname))
(aset records j (assoc mate :qname qname))))))))

(defn decode-slice-records
"Decodes CRAM records in a slice all at once and returns them as a sequence of maps."
[seq-resolver cram-header compression-header slice-header ds-decoders tag-decoders]
[seq-resolver qname-generator cram-header compression-header slice-header
ds-decoders tag-decoders]
(let [record-decoder (build-cram-record-decoder seq-resolver
cram-header
compression-header
Expand All @@ -352,4 +367,6 @@
(dotimes [i n]
(aset records i (record-decoder)))
(resolve-mate-records records)
(when-not (get-in compression-header [:preservation-map :RN])
(assign-record-names qname-generator slice-header records))
(map #(dissoc % ::flag ::len ::next-fragment) records)))
3 changes: 2 additions & 1 deletion src/cljam/io/cram/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

(declare read-alignments read-alignments-in-region)

(deftype CRAMReader [url channel buffer header refs index seq-resolver]
(deftype CRAMReader [url channel buffer header refs index seq-resolver qname-generator]
Closeable
(close [_]
(when seq-resolver
Expand Down Expand Up @@ -69,6 +69,7 @@
ds-decoders (ds/build-data-series-decoders compression-header blocks)
tag-decoders (ds/build-tag-decoders compression-header blocks)]
(record/decode-slice-records (.-seq-resolver rdr)
(.-qname-generator rdr)
@(.-header rdr)
compression-header
slice-header
Expand Down
68 changes: 68 additions & 0 deletions test/cljam/io/cram/decode/record_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,7 @@
:cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE"
:options []}]
(record/decode-slice-records test-seq-resolver
nil
cram-header
compression-header
slice-header
Expand Down Expand Up @@ -483,6 +484,73 @@
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD"
:options []}]
(record/decode-slice-records test-seq-resolver
nil
cram-header
compression-header
slice-header
ds-decoders
tag-decoders)))))
(testing "unnamed reads"
(let [cram-header {:SQ
[{:SN "ref"}
{:SN "ref2"}]}
compression-header {:preservation-map
{:RN false
:AP true
:RR true
:SM {\A {0 \C, 1 \G, 2 \T, 3 \N}
\C {0 \A, 1 \G, 2 \T, 3 \N}
\G {0 \A, 1 \C, 2 \T, 3 \N}
\T {0 \A, 1 \C, 2 \G, 3 \N}
\N {0 \A, 1 \C, 2 \G, 3 \T}}
:TD [[]]}}
slice-header {:ref-seq-id 0
:start 10
:records 7
:counter 100}
ds-decoders (build-stub-decoders
{:BF [67 67 147 147 147 67 0]
:CF [5 5 1 1 3 3 3]
:RL [5 5 5 5 5 5 5]
:AP [0 0 20 0 0 0 0]
:RG [-1 -1 -1 -1 -1 -1 -1]
:RN (->> [nil nil nil nil "q003" "q004" "q005"]
(keep #(some-> ^String % .getBytes)))
:MF [nil nil nil nil 0 1 2]
:NF [1 1 nil nil nil nil nil]
:NS [nil nil nil nil 0 1 -1]
:NP [nil nil nil nil 50 100 0]
:TS [nil nil nil nil 25 0 0]
:TL [0 0 0 0 0 0 0]
:FN [0 0 0 0 0 0 0]
:QS (->> (repeat 7 "#####")
(mapcat #(.getBytes ^String %))
(map #(- (long %) 33)))
:MQ [40 40 40 40 40 40 40]})
tag-decoders (build-stub-tag-decoders {})]
(is (= [{:qname "gen:101", :flag 99, :rname "ref", :pos 10, :end 14, :mapq 40,
:cigar "5M", :rnext "=", :pnext 30, :tlen 25, :seq "GATAA", :qual "#####"
:options []}
{:qname "gen:102", :flag 99, :rname "ref", :pos 10, :end 14, :mapq 40,
:cigar "5M", :rnext "=", :pnext 30, :tlen 25, :seq "GATAA", :qual "#####"
:options []}
{:qname "gen:101", :flag 147, :rname "ref", :pos 30, :end 34, :mapq 40,
:cigar "5M", :rnext "=", :pnext 10, :tlen -25, :seq "AGGCA", :qual "#####"
:options []}
{:qname "gen:102", :flag 147, :rname "ref", :pos 30, :end 34, :mapq 40,
:cigar "5M", :rnext "=", :pnext 10, :tlen -25, :seq "AGGCA", :qual "#####"
:options []}
{:qname "q003", :flag 147, :rname "ref", :pos 30, :end 34, :mapq 40,
:cigar "5M", :rnext "=", :pnext 50, :tlen 25, :seq "AGGCA", :qual "#####"
:options []}
{:qname "q004", :flag 99, :rname "ref", :pos 30, :end 34, :mapq 40,
:cigar "5M", :rnext "ref2", :pnext 100, :tlen 0, :seq "AGGCA", :qual "#####"
:options []}
{:qname "q005", :flag 8, :rname "ref", :pos 30, :end 34, :mapq 40,
:cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "AGGCA", :qual "#####"
:options []}]
(record/decode-slice-records test-seq-resolver
#(str "gen:" (inc %))
cram-header
compression-header
slice-header
Expand Down

0 comments on commit d0143c8

Please sign in to comment.