Skip to content

Commit

Permalink
Merge pull request #325 from chrovis/feature/freq-based-subst-mat
Browse files Browse the repository at this point in the history
SNV frequency-based substitution matrix construction for CRAM writer
  • Loading branch information
r6eve authored Oct 22, 2024
2 parents 943f181 + b07d48d commit fb6193b
Show file tree
Hide file tree
Showing 5 changed files with 193 additions and 71 deletions.
11 changes: 5 additions & 6 deletions src/cljam/io/cram/encode/context.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns cljam.io.cram.encode.context
(:require [cljam.io.cram.data-series :as ds]
[cljam.io.cram.encode.subst-matrix :as subst-mat]
[cljam.io.cram.encode.tag-dict :as tag-dict]
[cljam.io.sam.util.header :as sam.header]))

Expand All @@ -13,17 +14,13 @@
(= (sam.header/sort-order cram-header)
sam.header/order-coordinate)
(assoc :AP true))
subst-mat {\A {\T 0, \G 1, \C 2, \N 3}
\T {\A 0, \G 1, \C 2, \N 3}
\G {\A 0, \T 1, \C 2, \N 3}
\C {\A 0, \T 1, \G 2, \N 3}
\N {\A 0, \T 1, \G 2, \C 3}}
subst-mat-builder (subst-mat/make-subst-matrix-builder)
tag-dict-builder (tag-dict/make-tag-dict-builder)]
{:cram-header cram-header
:rname->idx rname->idx
:preservation-map preservation-map
:subst-mat subst-mat
:seq-resolver seq-resolver
:subst-mat-builder subst-mat-builder
:tag-dict-builder tag-dict-builder
:options options}))

Expand All @@ -36,12 +33,14 @@
tag-compressor-overrides]} (:options container-ctx)
ds-encodings (-> ds/default-data-series-encodings
(ds/apply-ds-compressor-overrides ds-compressor-overrides))
subst-mat (subst-mat/build-subst-matrix (:subst-mat-builder container-ctx))
tag-dict (tag-dict/build-tag-dict (:tag-dict-builder container-ctx))
tag-encodings (-> (tag-dict/build-tag-encodings tag-dict)
(ds/apply-tag-compressor-overrides tag-compressor-overrides))]
(assoc container-ctx
:alignment-stats alignment-stats
:ds-encodings ds-encodings
:subst-mat subst-mat
:tag-dict tag-dict
:tag-encodings tag-encodings)))

Expand Down
17 changes: 8 additions & 9 deletions src/cljam/io/cram/encode/record.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns cljam.io.cram.encode.record
(:require [cljam.io.cram.encode.alignment-stats :as stats]
[cljam.io.cram.encode.subst-matrix :as subst-mat]
[cljam.io.cram.encode.tag-dict :as tag-dict]
[cljam.io.cram.seq-resolver.protocol :as resolver]
[cljam.io.sam.util.cigar :as sam.cigar]
Expand Down Expand Up @@ -82,7 +83,7 @@
(BA (:base f))
(QS (:qual f)))
:subst (do (FC (int \X))
(BS (:subst f)))
(BS @(:subst f)))
:insertion (do (FC (int \I))
(IN (:bases f)))
:deletion (do (FC (int \D))
Expand Down Expand Up @@ -149,7 +150,7 @@
(run! record-encoder records)))

(defn- add-mismatches
[n subst-mat ^bytes ref-bases rpos ^bytes read-bases ^bytes qs spos fs]
[n subst-mat-builder ^bytes ref-bases rpos ^bytes read-bases ^bytes qs spos fs]
(loop [i (long n), rpos (long rpos), spos (long spos), fs fs]
(if (zero? i)
fs
Expand All @@ -164,13 +165,11 @@
:base read-base
:qual (if qs (- (aget qs spos) 33) -1)}
{:code :subst :pos pos
:subst (-> subst-mat
(get (char ref-base))
(get (char read-base)))})]
:subst (subst-mat/assign-code! subst-mat-builder ref-base read-base)})]
(recur (dec i) (inc rpos) (inc spos) (conj! fs f))))))))

(defn- calculate-read-features&end
[seq-resolver subst-mat {:keys [rname ^long pos qual cigar] :as record}]
[seq-resolver subst-mat-builder {:keys [rname ^long pos qual cigar] :as record}]
(if (or (zero? pos) (= (:seq record) "*"))
[[] pos]
(let [ref-bases ^bytes (resolver/resolve-sequence seq-resolver rname)
Expand All @@ -185,7 +184,7 @@
(let [pos (inc spos)]
(case op
(\M \X \=) (recur more (+ rpos n) (+ spos n)
(add-mismatches n subst-mat ref-bases rpos
(add-mismatches n subst-mat-builder ref-bases rpos
read-bases qs spos fs))
\I (let [spos' (+ spos n)
bs (Arrays/copyOfRange read-bases spos spos')]
Expand All @@ -203,7 +202,7 @@
"Preprocesses slice records to calculate some record fields prior to record
encoding that are necessary for the CRAM writer to generate some header
components."
[{:keys [rname->idx subst-mat seq-resolver tag-dict-builder]} ^List records]
[{:keys [rname->idx seq-resolver subst-mat-builder tag-dict-builder]} ^List records]
(let [stats-builder (stats/make-alignment-stats-builder)]
(dotimes [i (.size records)]
(let [record (.get records i)
Expand All @@ -215,7 +214,7 @@
(= (:seq record) "*") (bit-or 0x08))
ri (ref-index rname->idx (:rname record))
tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record))
[fs end] (calculate-read-features&end seq-resolver subst-mat record)
[fs end] (calculate-read-features&end seq-resolver subst-mat-builder record)
record' (assoc record
::flag cf ::ref-index ri ::end end
::features fs ::tags-index tags-id)]
Expand Down
65 changes: 65 additions & 0 deletions src/cljam/io/cram/encode/subst_matrix.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
(ns cljam.io.cram.encode.subst-matrix)

(defprotocol ICounter
(inc! [this]))

(defprotocol IMutableInt
(set-val! [this v]))

(deftype MutableInt [^:unsynchronized-mutable ^long n]
ICounter
(inc! [_]
(set! n (inc n)))
IMutableInt
(set-val! [_ v]
(set! n (long v)))
clojure.lang.IDeref
(deref [_] n))

(defprotocol ISubstMatrixBuilder
(assign-code! [this ref alt])
(build-subst-matrix [this]))

(definline ^:private base->index [b]
`(case (long ~b)
~(int \A) 0
~(int \C) 1
~(int \G) 2
~(int \T) 3
~(int \N) 4))

(deftype SubstMatrixBuilder [^objects freqs]
ISubstMatrixBuilder
(assign-code! [_ r a]
(let [mut (aget ^objects (aget freqs (base->index r)) (base->index a))]
(inc! mut)
mut))
(build-subst-matrix [_]
;; Up to this point, the `MutableInt`s contained in the freqs hold
;; the frequency of how many times the corresponding SNV has been observed.
;; From this point on, a base substitution code (BS code) is assigned to
;; each SNV based on the frequencies, and the value of each `MutableInt` is
;; replaced with the BS code of the corresponding SNV.
(let [all-bases [\A \C \G \T \N]]
(reduce
(fn [m r]
(let [^objects freqs (aget freqs (base->index (int r)))]
(->> all-bases
(keep (fn [a]
(when (not= r a)
[a (aget freqs (base->index (int a)))])))
(sort-by (comp - deref second))
(map-indexed vector)
(reduce (fn [m [i [a mut]]]
(set-val! mut i)
(assoc-in m [r a] i))
m))))
{} all-bases))))

(defn make-subst-matrix-builder
"Creates a new substitution matrix builder."
[]
(->> (fn [] (into-array (repeatedly 5 #(->MutableInt 0))))
(repeatedly 5)
into-array
->SubstMatrixBuilder))
131 changes: 75 additions & 56 deletions test/cljam/io/cram/encode/record_test.clj
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
(ns cljam.io.cram.encode.record-test
(:require [cljam.io.cram.encode.context :as context]
[cljam.io.cram.encode.record :as record]
[cljam.io.cram.encode.subst-matrix :as subst-mat]
[cljam.io.cram.encode.tag-dict :as tag-dict]
[cljam.io.cram.seq-resolver.protocol :as resolver]
[cljam.io.sequence :as cseq]
Expand All @@ -19,66 +20,79 @@
(let [s (get seqs chr)]
(.getBytes (subs s (dec (long start)) end)))))))

(def subst-mat
{\A {\T 0, \G 1, \C 2, \N 3}
\T {\A 0, \G 1, \C 2, \N 3}
\G {\A 0, \T 1, \C 2, \N 3}
\C {\A 0, \T 1, \G 2, \N 3}
\N {\A 0, \T 1, \G 2, \C 3}})
(defn- test-subst-mat-builder [m]
(let [^objects arr (make-array Object 5 5)
bs (map-indexed vector "ACGTN")]
(doseq [[i r] bs
[j a] bs
:when (not= i j)]
(aset ^objects (aget arr i) j
(or (get-in m [r a]) (subst-mat/->MutableInt 0))))
(subst-mat/->SubstMatrixBuilder arr)))

(deftest calculate-read-features&end
(are [?record ?expected]
(= ?expected
(walk/prewalk
#(if (instance? (Class/forName "[B") %) (vec %) %)
(#'record/calculate-read-features&end test-seq-resolver subst-mat ?record)))
{:rname "ref", :pos 2, :seq "GCATG", :qual "ABCDE", :cigar "5M"}
[[]
6]

{:rname "ref", :pos 2, :seq "GCCTG", :qual "ABCDE", :cigar "5M"}
[[{:code :subst, :pos 3, :subst 2}]
6]

{:rname "ref", :pos 2, :seq "GCRTG", :qual "ABCDE", :cigar "5M"}
[[{:code :read-base, :pos 3, :base (int \R), :qual (- (int \C) 33)}]
6]

{:rname "ref", :pos 2, :seq "GCCAA", :qual "ABCDE", :cigar "2M2I1M"}
[[{:code :insertion, :pos 3, :bases (mapv int "CA")}]
4]

{:rname "ref", :pos 2, :seq "GTCAA", :qual "ABCDE", :cigar "2M2I1M"}
[[{:code :subst, :pos 2, :subst 1}
{:code :insertion, :pos 3, :bases (mapv int "CA")}]
4]

{:rname "ref", :pos 2, :seq "GCGTT", :qual "ABCDE", :cigar "2M2D3M"}
[[{:code :deletion, :pos 3, :len 2}]
8]

{:rname "ref", :pos 2, :seq "GCCTT", :qual "ABCDE", :cigar "2M2D3M"}
[[{:code :deletion, :pos 3, :len 2}
{:code :subst, :pos 3, :subst 2}]
8]

{:rname "ref", :pos 2, :seq "GCATT", :qual "ABCDE", :cigar "3M2S"}
[[{:code :softclip, :pos 4, :bases (mapv int "TT")}]
4]

{:rname "ref", :pos 2, :seq "TTGCA", :qual "ABCDE", :cigar "2S3M"}
[[{:code :softclip, :pos 1, :bases (mapv int "TT")}]
4]))

(defn- preprocess-slice-records [cram-header records]
(let [a->c (subst-mat/->MutableInt 0)
c->t (subst-mat/->MutableInt 0)
g->c (subst-mat/->MutableInt 0)
subst-mat-builder (test-subst-mat-builder {\A {\C a->c}, \C {\T c->t}, \G {\C g->c}})]
(are [?record ?expected]
(= ?expected
(walk/prewalk
#(if (instance? (Class/forName "[B") %) (vec %) %)
(#'record/calculate-read-features&end test-seq-resolver subst-mat-builder ?record)))
{:rname "ref", :pos 2, :seq "GCATG", :qual "ABCDE", :cigar "5M"}
[[]
6]

{:rname "ref", :pos 2, :seq "GCCTG", :qual "ABCDE", :cigar "5M"}
[[{:code :subst, :pos 3, :subst a->c}]
6]

{:rname "ref", :pos 2, :seq "GCRTG", :qual "ABCDE", :cigar "5M"}
[[{:code :read-base, :pos 3, :base (int \R), :qual (- (int \C) 33)}]
6]

{:rname "ref", :pos 2, :seq "GCCAA", :qual "ABCDE", :cigar "2M2I1M"}
[[{:code :insertion, :pos 3, :bases (mapv int "CA")}]
4]

{:rname "ref", :pos 2, :seq "GTCAA", :qual "ABCDE", :cigar "2M2I1M"}
[[{:code :subst, :pos 2, :subst c->t}
{:code :insertion, :pos 3, :bases (mapv int "CA")}]
4]

{:rname "ref", :pos 2, :seq "GCGTT", :qual "ABCDE", :cigar "2M2D3M"}
[[{:code :deletion, :pos 3, :len 2}]
8]

{:rname "ref", :pos 2, :seq "GCCTT", :qual "ABCDE", :cigar "2M2D3M"}
[[{:code :deletion, :pos 3, :len 2}
{:code :subst, :pos 3, :subst g->c}]
8]

{:rname "ref", :pos 2, :seq "GCATT", :qual "ABCDE", :cigar "3M2S"}
[[{:code :softclip, :pos 4, :bases (mapv int "TT")}]
4]

{:rname "ref", :pos 2, :seq "TTGCA", :qual "ABCDE", :cigar "2S3M"}
[[{:code :softclip, :pos 1, :bases (mapv int "TT")}]
4])
(is (= 1 @a->c))
(is (= 1 @c->t))
(is (= 1 @g->c))))

(defn- preprocess-slice-records [cram-header subst-mat-init records]
(let [opts {:ds-compressor-overrides (constantly :raw)
:tag-compressor-overrides (constantly (constantly (constantly {:external :raw})))}
container-ctx (context/make-container-context cram-header test-seq-resolver opts)
container-ctx (-> (context/make-container-context cram-header test-seq-resolver opts)
(assoc :subst-mat-builder (test-subst-mat-builder subst-mat-init)))
stats (record/preprocess-slice-records container-ctx records)]
(context/finalize-container-context container-ctx [stats])))

(deftest preprocess-slice-records-test
(let [cram-header {:SQ [{:SN "ref"}]}
c->a (subst-mat/->MutableInt 0)
subst-mat-init {\C {\A c->a}}
records (ArrayList.
[{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH"
:options [{:RG {:type "Z", :value "rg001"}}
Expand All @@ -100,13 +114,13 @@
:options []}
{:rname "*", :pos 10, :cigar "*", :seq "*", :qual "*"
:options []}])
container-ctx (preprocess-slice-records cram-header records)]
container-ctx (preprocess-slice-records cram-header subst-mat-init records)]
(is (= [{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "2C2"}}
{:NM {:type "c", :value 1}}]
::record/flag 0x03, ::record/ref-index 0, ::record/end 5, ::record/tags-index 0
::record/features [{:code :subst, :pos 3 :subst 0}]}
::record/features [{:code :subst, :pos 3 :subst c->a}]}
{:rname "ref", :pos 5, :cigar "2S3M", :seq "CCTGT", :qual "##AAC"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "3"}}
Expand Down Expand Up @@ -134,6 +148,7 @@
::record/features []}]
(walk/prewalk #(if (.isArray (class %)) (vec %) %)
(vec records))))
(is (= 0 @c->a))
(is (= [[{:tag :MD, :type \Z} {:tag :NM, :type \c}]
[]]
(:tag-dict container-ctx)))
Expand All @@ -160,6 +175,8 @@
:RG
[{:ID "rg001"}
{:ID "rg002"}]}
c->a (subst-mat/->MutableInt 0)
subst-mat-init {\C {\A c->a}}
records (ArrayList.
[{:qname "q001", :flag 99, :rname "ref", :pos 1, :end 5, :mapq 0,
:cigar "5M", :rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH"
Expand All @@ -184,7 +201,8 @@
{:qname "q005", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0,
:cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE"
:options []}])
slice-ctx (context/make-slice-context (preprocess-slice-records cram-header records) 0)
slice-ctx (-> (preprocess-slice-records cram-header subst-mat-init records)
(context/make-slice-context 0))
_ (record/encode-slice-records slice-ctx records)
ds-res (walk/prewalk #(if (fn? %) (%) %) (:ds-encoders slice-ctx))
tag-res (walk/prewalk #(if (fn? %) (%) %) (:tag-encoders slice-ctx))]
Expand Down Expand Up @@ -279,7 +297,7 @@

(is (= 1 (count (get ds-res :BS))))
(is (= 23 (get-in ds-res [:BS 0 :content-id])))
(is (= [0] (seq (get-in ds-res [:BS 0 :data]))))
(is (= [@c->a] (seq (get-in ds-res [:BS 0 :data]))))

(is (= 2 (count (get ds-res :IN))))
(is (= 24 (get-in ds-res [:IN 0 :content-id])))
Expand Down Expand Up @@ -355,7 +373,8 @@
{:qname "q003", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD"
:options []}])
slice-ctx (context/make-slice-context (preprocess-slice-records cram-header records) 0)
slice-ctx (-> (preprocess-slice-records cram-header {} records)
(context/make-slice-context 0))
_ (record/encode-slice-records slice-ctx records)
ds-res (walk/prewalk #(if (fn? %) (%) %) (:ds-encoders slice-ctx))
tag-res (walk/prewalk #(if (fn? %) (%) %) (:tag-encoders slice-ctx))]
Expand Down
Loading

0 comments on commit fb6193b

Please sign in to comment.