From 03e8b212b6d0ffec2e633c01e176b1445acc8bd8 Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Wed, 4 Sep 2024 10:56:56 +0900 Subject: [PATCH] Embed ref seqs if :embed-reference? is set to true --- src/cljam/io/cram.clj | 17 +-- src/cljam/io/cram/data_series.clj | 68 ++++++----- src/cljam/io/cram/encode/context.clj | 13 ++- src/cljam/io/cram/encode/partitioning.clj | 30 +++-- src/cljam/io/cram/encode/structure.clj | 7 +- src/cljam/io/cram/writer.clj | 119 ++++++++++++------- test/cljam/io/cram/encode/record_test.clj | 136 +++++++++++----------- test/cljam/io/cram_test.clj | 60 +++++++++- 8 files changed, 283 insertions(+), 167 deletions(-) diff --git a/src/cljam/io/cram.clj b/src/cljam/io/cram.clj index f618eb65..267fb2f6 100644 --- a/src/cljam/io/cram.clj +++ b/src/cljam/io/cram.clj @@ -72,13 +72,16 @@ a tag keyword, returns a keyword or a set of keywords representing compression method. It may return another function to add more conditions for the tag type and/or block encoding. - - create-index?: If true, creates a .crai index file in the course of CRAM - file writing. - - skip-sort-order-check?: When creating a CRAM index for the CRAM file, - the CRAM writer, by default, checks if the header is declared as - `SO:coordinate` and raises an error if not. - If this option is set to true, the CRAM writer will skip the header check - and create an index file regardless of the header declaration." + - create-index?: If set to true, the CRAM writer creates a .crai index file + in the course of CRAM file writing. + - embed-reference?: If set to true, the CRAM writer embeds the reference + sequences into the resulting CRAM file, allowing it to be read without + needing an external reference file. + - skip-sort-order-check?: When creating a CRAM index or embedding reference + sequences into a CRAM file, the CRAM writer, by default, checks if + the header specifies `SO:coordinate` and raises an error if it does not. + If this option is set to true, the CRAM writer will skip this header + check and proceed regardless of the header's sort order declaration." (^CRAMWriter [f] (writer f {})) (^CRAMWriter [f option] (cram/writer f option))) diff --git a/src/cljam/io/cram/data_series.clj b/src/cljam/io/cram/data_series.clj index 3f427d0a..8ed65155 100644 --- a/src/cljam/io/cram/data_series.clj +++ b/src/cljam/io/cram/data_series.clj @@ -147,42 +147,46 @@ (def ^{:doc "Default encodings for all the data series"} default-data-series-encodings - {:BF {:content-id 1, :codec :external, :compressor :gzip} - :CF {:content-id 2, :codec :external, :compressor :gzip} - :RI {:content-id 3, :codec :external, :compressor :gzip} - :RL {:content-id 4, :codec :external, :compressor :gzip} - :AP {:content-id 5, :codec :external, :compressor :gzip} - :RG {:content-id 6, :codec :external, :compressor :gzip} - :RN {:content-id 7, :codec :byte-array-stop, :stop-byte (int \tab), :compressor :gzip} - :MF {:content-id 8, :codec :external, :compressor :gzip} - :NS {:content-id 9, :codec :external, :compressor :gzip} - :NP {:content-id 10, :codec :external, :compressor :gzip} - :TS {:content-id 11, :codec :external, :compressor :gzip} - :NF {:content-id 12, :codec :external, :compressor :gzip} - :TL {:content-id 13, :codec :external, :compressor :gzip} - :FN {:content-id 14, :codec :external, :compressor :gzip} - :FC {:content-id 15, :codec :external, :compressor :gzip} - :FP {:content-id 16, :codec :external, :compressor :gzip} - :DL {:content-id 17, :codec :external, :compressor :gzip} + {;; :embedded-ref is a pseudo data series and won't be used for the real encoding + ;; except for the value for :compressor, which is used when compressing embedded + ;; reference blocks + :embedded-ref {:content-id 1, :codec :external, :compressor :gzip} + :BF {:content-id 2, :codec :external, :compressor :gzip} + :CF {:content-id 3, :codec :external, :compressor :gzip} + :RI {:content-id 4, :codec :external, :compressor :gzip} + :RL {:content-id 5, :codec :external, :compressor :gzip} + :AP {:content-id 6, :codec :external, :compressor :gzip} + :RG {:content-id 7, :codec :external, :compressor :gzip} + :RN {:content-id 8, :codec :byte-array-stop, :stop-byte (int \tab), :compressor :gzip} + :MF {:content-id 9, :codec :external, :compressor :gzip} + :NS {:content-id 10, :codec :external, :compressor :gzip} + :NP {:content-id 11, :codec :external, :compressor :gzip} + :TS {:content-id 12, :codec :external, :compressor :gzip} + :NF {:content-id 13, :codec :external, :compressor :gzip} + :TL {:content-id 14, :codec :external, :compressor :gzip} + :FN {:content-id 15, :codec :external, :compressor :gzip} + :FC {:content-id 16, :codec :external, :compressor :gzip} + :FP {:content-id 17, :codec :external, :compressor :gzip} + :DL {:content-id 18, :codec :external, :compressor :gzip} :BB {:codec :byte-array-len - :len-encoding {:codec :external, :content-id 18, :compressor :gzip} - :val-encoding {:codec :external, :content-id 19, :compressor :gzip}} + :len-encoding {:codec :external, :content-id 19, :compressor :gzip} + :val-encoding {:codec :external, :content-id 20, :compressor :gzip}} :QQ {:codec :byte-array-len - :len-encoding {:codec :external, :content-id 20, :compressor :gzip} - :val-encoding {:codec :external, :content-id 21, :compressor :gzip}} - :BS {:content-id 22, :codec :external, :compressor :gzip} + :len-encoding {:codec :external, :content-id 21, :compressor :gzip} + :val-encoding {:codec :external, :content-id 22, :compressor :gzip}} + :BS {:content-id 23, :codec :external, :compressor :gzip} :IN {:codec :byte-array-len - :len-encoding {:codec :external, :content-id 23, :compressor :gzip} - :val-encoding {:codec :external, :content-id 24, :compressor :gzip}} - :RS {:content-id 25, :codec :external, :compressor :gzip} - :PD {:content-id 26, :codec :external, :compressor :gzip} - :HC {:content-id 27, :codec :external, :compressor :gzip} + :len-encoding {:codec :external, :content-id 24, :compressor :gzip} + :val-encoding {:codec :external, :content-id 25, :compressor :gzip}} + :RS {:content-id 26, :codec :external, :compressor :gzip} + :PD {:content-id 27, :codec :external, :compressor :gzip} + :HC {:content-id 28, :codec :external, :compressor :gzip} :SC {:codec :byte-array-len - :len-encoding {:codec :external, :content-id 28, :compressor :gzip} - :val-encoding {:codec :external, :content-id 29, :compressor :gzip}} - :MQ {:content-id 30, :codec :external, :compressor :gzip} - :BA {:content-id 31, :codec :external, :compressor :gzip} - :QS {:content-id 32, :codec :external, :compressor :gzip}}) + :len-encoding {:codec :external, :content-id 29, :compressor :gzip} + :val-encoding {:codec :external, :content-id 30, :compressor :gzip}} + :MQ {:content-id 31, :codec :external, :compressor :gzip} + :BA {:content-id 32, :codec :external, :compressor :gzip} + :QS {:content-id 33, :codec :external, :compressor :gzip}}) (defn- build-codec-encoder [{:keys [codec content-id compressor] :as params} data-type content-id->state] diff --git a/src/cljam/io/cram/encode/context.clj b/src/cljam/io/cram/encode/context.clj index 4f926eeb..fb48aa03 100644 --- a/src/cljam/io/cram/encode/context.clj +++ b/src/cljam/io/cram/encode/context.clj @@ -5,7 +5,7 @@ (defn make-container-context "Creates a new container context." - [cram-header seq-resolver] + [cram-header seq-resolver options] (let [rname->idx (into {} (map-indexed (fn [i {:keys [SN]}] [SN i])) (:SQ cram-header)) @@ -24,14 +24,17 @@ :preservation-map preservation-map :subst-mat subst-mat :seq-resolver seq-resolver - :tag-dict-builder tag-dict-builder})) + :tag-dict-builder tag-dict-builder + :options options})) (defn finalize-container-context "Finalizes the builders in the container context and returns a new container context containing those builders' results. This operation must be done before creating a slice context." - [container-ctx alignment-stats ds-compressor-overrides tag-compressor-overrides] - (let [ds-encodings (-> ds/default-data-series-encodings + [container-ctx alignment-stats] + (let [{:keys [ds-compressor-overrides + tag-compressor-overrides]} (:options container-ctx) + ds-encodings (-> ds/default-data-series-encodings (ds/apply-ds-compressor-overrides ds-compressor-overrides)) tag-dict (tag-dict/build-tag-dict (:tag-dict-builder container-ctx)) tag-encodings (-> (tag-dict/build-tag-encodings tag-dict) @@ -46,7 +49,7 @@ "Creates a slice context for the ith slice from the given container context. Note that the container context must be finalized with `finalize-container-context`." [{:keys [alignment-stats ds-encodings tag-encodings] :as container-ctx} i] - (let [ds-encoders (ds/build-data-series-encoders ds-encodings) + (let [ds-encoders (ds/build-data-series-encoders (dissoc ds-encodings :embedded-ref)) tag-encoders (ds/build-tag-encoders tag-encodings)] (assoc container-ctx :alignment-stats (nth alignment-stats i) diff --git a/src/cljam/io/cram/encode/partitioning.clj b/src/cljam/io/cram/encode/partitioning.clj index 3cffa4b7..11eb9192 100644 --- a/src/cljam/io/cram/encode/partitioning.clj +++ b/src/cljam/io/cram/encode/partitioning.clj @@ -3,8 +3,9 @@ (:import [java.util ArrayList List])) (defn- slice-record-collector - [header {:keys [^long records-per-slice ^long min-single-ref-slice-size] - :or {records-per-slice 10000, min-single-ref-slice-size 1000}}] + [header + {:keys [^long records-per-slice ^long min-single-ref-slice-size embed-reference?] + :or {records-per-slice 10000, min-single-ref-slice-size 1000}}] (let [sort-by-coord? (= (sam.header/sort-order header) sam.header/order-coordinate) ;; CRAM files sorted by coord can easily get back from a multi-ref state ;; to a single-ref state. To support this, multi-ref slices should be kept @@ -26,15 +27,17 @@ :unmapped (if (< n-records records-per-slice) - (let [ref-state' (cond (= rname "*") ref-state - sort-by-coord? (throw - (ex-info - (str "Unmapped records " - "must be last") - {})) - :else :multi-ref)] - (.add slice-records aln) - (recur ref-state' more)) + (if-let [ref-state' (cond (= rname "*") ref-state + sort-by-coord? (throw + (ex-info + (str "Unmapped records " + "must be last") + {})) + embed-reference? nil + :else :multi-ref)] + (do (.add slice-records aln) + (recur ref-state' more)) + [ref-state slice-records alns]) [ref-state slice-records alns]) :multi-ref @@ -52,7 +55,10 @@ ;; slices, instead of creating and adding a new multi-ref slice ;; to that container, add the accumulated single-ref slice ;; to the container and add the current record to the next slice - (if (and empty-container? (< n-records min-single-ref-slice-size)) + (if (and empty-container? + ;; reference embedding is not compatible with multi-ref slices + (not embed-reference?) + (< n-records min-single-ref-slice-size)) (do (.add slice-records aln) (recur :multi-ref more)) [ref-state slice-records alns])))) diff --git a/src/cljam/io/cram/encode/structure.clj b/src/cljam/io/cram/encode/structure.clj index fbb84ef9..c00a05c4 100644 --- a/src/cljam/io/cram/encode/structure.clj +++ b/src/cljam/io/cram/encode/structure.clj @@ -210,10 +210,13 @@ (defn encode-compression-header-block "Encodes a compression header block to the given OutputStream." [out preservation-map subst-mat tag-dict ds-encodings tag-encodings] - (let [bs (with-out-byte-array + ;; ensure that ds-encodings does not contain the encoding for the :embedded-ref + ;; pseudo data series + (let [ds-encodings' (dissoc ds-encodings :embedded-ref) + bs (with-out-byte-array (fn [out'] (encode-preservation-map out' preservation-map subst-mat tag-dict) - (encode-data-series-encodings out' ds-encodings) + (encode-data-series-encodings out' ds-encodings') (encode-tag-encoding-map out' tag-encodings)))] (encode-block out :raw 1 0 (alength bs) bs))) diff --git a/src/cljam/io/cram/writer.clj b/src/cljam/io/cram/writer.clj index 1a8b7bd7..2f787c71 100644 --- a/src/cljam/io/cram/writer.clj +++ b/src/cljam/io/cram/writer.clj @@ -1,6 +1,7 @@ (ns cljam.io.cram.writer (:require [cljam.io.crai :as crai] [cljam.io.cram.encode.alignment-stats :as stats] + [cljam.io.cram.encode.compressor :as compressor] [cljam.io.cram.encode.context :as context] [cljam.io.cram.encode.partitioning :as partition] [cljam.io.cram.encode.record :as record] @@ -41,46 +42,69 @@ (defn write-header "Writes the CRAM header." [^CRAMWriter wtr header] - (when (and (.-index-writer wtr) - (not (:skip-sort-order-check? (.-options wtr))) - (not= (sam.header/sort-order header) sam.header/order-coordinate)) - (throw - (ex-info "Cannot create CRAM index file for CRAM file not declared as sorted by coordinate" - {:sort-order (sam.header/sort-order header)}))) - (struct/encode-cram-header-container (.-stream wtr) header)) + (let [opts (.-options wtr) + sort-order (sam.header/sort-order header)] + (when (and (not (= sort-order sam.header/order-coordinate)) + (not (:skip-sort-order-check? opts))) + (when (.-index-writer wtr) + (throw + (ex-info (str "Cannot create CRAM index file for CRAM file not declared " + "as sorted by coordinate") + {:sort-order sort-order}))) + (when (:embed-reference? opts) + (throw + (ex-info (str "Cannot embed reference sequences for CRAM file not declared " + "as sorted by coordinate") + {:sort-order sort-order})))) + (struct/encode-cram-header-container (.-stream wtr) header))) (defn- preprocess-records [cram-header seq-resolver options ^objects container-records] - (let [container-ctx (context/make-container-context cram-header seq-resolver) - {:keys [ds-compressor-overrides tag-compressor-overrides]} options + (let [container-ctx (context/make-container-context cram-header seq-resolver options) stats (mapv (partial record/preprocess-slice-records container-ctx) container-records)] - (context/finalize-container-context container-ctx - stats - ds-compressor-overrides - tag-compressor-overrides))) - -(defn- generate-blocks [slice-ctx] - (->> (context/encoding-results slice-ctx) - (keep (fn [{:keys [^long raw-size] :as block}] - (when (pos? raw-size) - (update block :data - (fn [^bytes data] - (struct/generate-block (:compressor block) 4 - (:content-id block) raw-size - data)))))) - ;; sort + dedupe by :content-id - (into (sorted-map) (map (juxt :content-id identity))) - vals - (cons {:content-id 0 - :data (struct/generate-block :raw 5 0 0 (byte-array 0))}))) - -(defn- reference-md5 - [{:keys [seq-resolver cram-header]} {:keys [^long ri ^long start ^long end]}] + (context/finalize-container-context container-ctx stats))) + +(def ^:private ^:const embedded-ref-content-id 1) + +(defn- generate-embedded-ref-block [slice-ctx ^bytes embedded-ref] + (let [raw-size (alength embedded-ref) + compr (compressor/compressor (get-in slice-ctx [:ds-encodings :embedded-ref :compressor])) + _ (with-open [^OutputStream os (compressor/compressor-output-stream compr)] + (.write os embedded-ref)) + {:keys [compressor data]} (compressor/->compressed-result compr)] + {:content-id embedded-ref-content-id + :data (struct/generate-block compressor 4 embedded-ref-content-id raw-size data)})) + +(defn- generate-blocks [slice-ctx ^bytes embedded-ref] + (letfn [(inject-embedded-ref-block [blocks] + (cond->> blocks + embedded-ref + (cons (generate-embedded-ref-block slice-ctx embedded-ref))))] + (->> (context/encoding-results slice-ctx) + (keep (fn [{:keys [^long raw-size] :as block}] + (when (pos? raw-size) + (update block :data + (fn [^bytes data] + (struct/generate-block (:compressor block) 4 + (:content-id block) raw-size + data)))))) + ;; sort + dedupe by :content-id + (into (sorted-map) (map (juxt :content-id identity))) + vals + inject-embedded-ref-block + (cons {:content-id 0 + :data (struct/generate-block :raw 5 0 0 (byte-array 0))})))) + +(defn- resolve-reference-seq + ^bytes [{:keys [seq-resolver cram-header]} {:keys [^long ri ^long start ^long end]}] + (let [chr (:SN (nth (:SQ cram-header) ri))] + (resolver/resolve-sequence seq-resolver chr start end))) + +(defn- reference-md5 [slice-ctx ^bytes embedded-ref {:keys [^long ri] :as stats}] (if (neg? ri) (byte-array 16) - (let [chr (:SN (nth (:SQ cram-header) ri)) - ref-bases (resolver/resolve-sequence seq-resolver chr start end) + (let [ref-bases (or embedded-ref (resolve-reference-seq slice-ctx stats)) md5 (MessageDigest/getInstance "md5")] (.digest md5 ref-bases)))) @@ -94,11 +118,14 @@ (defn- generate-slice [slice-ctx counter slice-records] (record/encode-slice-records slice-ctx slice-records) (let [stats (:alignment-stats slice-ctx) - blocks (generate-blocks slice-ctx) - ref-md5 (reference-md5 slice-ctx stats) + embedded-ref (when (and (:embed-reference? (:options slice-ctx)) + (>= (long (:ri stats)) 0)) + (resolve-reference-seq slice-ctx stats)) + blocks (generate-blocks slice-ctx embedded-ref) + ref-md5 (reference-md5 slice-ctx embedded-ref stats) header (assoc (stats->header-base stats) :counter counter - :embedded-reference -1 + :embedded-reference (if embedded-ref embedded-ref-content-id -1) :reference-md5 ref-md5) header-block (struct/generate-slice-header-block header blocks) block-data (mapv :data blocks)] @@ -119,9 +146,20 @@ acc))) (defn- generate-compression-header-block - ^bytes [{:keys [preservation-map subst-mat tag-dict ds-encodings tag-encodings]}] - (struct/generate-compression-header-block preservation-map subst-mat tag-dict - ds-encodings tag-encodings)) + ^bytes + [{:keys [preservation-map subst-mat tag-dict ds-encodings tag-encodings]} slices] + (let [preservation-map' (cond-> preservation-map + ;; A reference file is not required when either of + ;; the following holds for every slice in the container: + ;; - the slice has an embedded reference block + ;; - the slice only contains unmapped records + (every? (fn [{:keys [header]}] + (or (>= (long (:embedded-reference header)) 0) + (= (long (:ref-seq-id header)) -1))) + slices) + (assoc :RR false))] + (struct/generate-compression-header-block preservation-map' subst-mat tag-dict + ds-encodings tag-encodings))) (defn- generate-container-header [container-ctx ^bytes compression-header-block slices] (let [stats (stats/merge-stats (:alignment-stats container-ctx)) @@ -175,7 +213,8 @@ (let [container-ctx (preprocess-records cram-header (.-seq-resolver wtr) (.-options wtr) container-records) slices (generate-slices container-ctx counter container-records) - compression-header-block (generate-compression-header-block container-ctx) + compression-header-block (generate-compression-header-block container-ctx + slices) container-header (generate-container-header container-ctx compression-header-block slices) diff --git a/test/cljam/io/cram/encode/record_test.clj b/test/cljam/io/cram/encode/record_test.clj index 9e7d4a59..f00ed6e1 100644 --- a/test/cljam/io/cram/encode/record_test.clj +++ b/test/cljam/io/cram/encode/record_test.clj @@ -71,11 +71,11 @@ 4])) (defn- preprocess-slice-records [cram-header records] - (let [container-ctx (context/make-container-context cram-header test-seq-resolver) + (let [opts {:ds-compressor-overrides (constantly :raw) + :tag-compressor-overrides (constantly (constantly (constantly {:external :raw})))} + container-ctx (context/make-container-context cram-header test-seq-resolver opts) stats (record/preprocess-slice-records container-ctx records)] - (context/finalize-container-context container-ctx [stats] - (constantly :raw) - (constantly (constantly (constantly {:external :raw})))))) + (context/finalize-container-context container-ctx [stats]))) (deftest preprocess-slice-records-test (let [cram-header {:SQ [{:SN "ref"}]} @@ -192,129 +192,129 @@ (into {} (:alignment-stats slice-ctx)))) (is (= 1 (count (get ds-res :BF)))) - (is (= 1 (get-in ds-res [:BF 0 :content-id]))) + (is (= 2 (get-in ds-res [:BF 0 :content-id]))) (is (= [0x43 0x43 0x80 0x91 0x80 0x93 0x41] (map #(bit-and % 0xff) (get-in ds-res [:BF 0 :data])))) (is (= 1 (count (get ds-res :CF)))) - (is (= 2 (get-in ds-res [:CF 0 :content-id]))) + (is (= 3 (get-in ds-res [:CF 0 :content-id]))) (is (= [3 3 3 3 3] (seq (get-in ds-res [:CF 0 :data])))) (is (= 1 (count (get ds-res :RI)))) - (is (= 3 (get-in ds-res [:RI 0 :content-id]))) + (is (= 4 (get-in ds-res [:RI 0 :content-id]))) (is (= [0 0 0 0 0] (seq (get-in ds-res [:RI 0 :data])))) (is (= 1 (count (get ds-res :RL)))) - (is (= 4 (get-in ds-res [:RL 0 :content-id]))) + (is (= 5 (get-in ds-res [:RL 0 :content-id]))) (is (= [5 5 5 5 5] (seq (get-in ds-res [:RL 0 :data])))) (is (= 1 (count (get ds-res :AP)))) - (is (= 5 (get-in ds-res [:AP 0 :content-id]))) + (is (= 6 (get-in ds-res [:AP 0 :content-id]))) (is (= [0 4 5 5 5] (seq (get-in ds-res [:AP 0 :data])))) (is (= 1 (count (get ds-res :RG)))) - (is (= 6 (get-in ds-res [:RG 0 :content-id]))) + (is (= 7 (get-in ds-res [:RG 0 :content-id]))) (is (= [0 0 1 1 0xff 0xff 0xff 0xff 0x0f] (map #(bit-and % 0xff) (get-in ds-res [:RG 0 :data])))) (is (= 1 (count (get ds-res :RN)))) - (is (= 7 (get-in ds-res [:RN 0 :content-id]))) + (is (= 8 (get-in ds-res [:RN 0 :content-id]))) (is (= "q001\tq002\tq003\tq004\tq005\t" (String. ^bytes (get-in ds-res [:RN 0 :data])))) (is (= 1 (count (get ds-res :MF)))) - (is (= 8 (get-in ds-res [:MF 0 :content-id]))) + (is (= 9 (get-in ds-res [:MF 0 :content-id]))) (is (= [1 1 1 0 2] (seq (get-in ds-res [:MF 0 :data])))) (is (= 1 (count (get ds-res :NS)))) - (is (= 9 (get-in ds-res [:NS 0 :content-id]))) + (is (= 10 (get-in ds-res [:NS 0 :content-id]))) (is (= [0 0 1 0 0xff 0xff 0xff 0xff 0x0f] (map #(bit-and % 0xff) (get-in ds-res [:NS 0 :data])))) (is (= 1 (count (get ds-res :NP)))) - (is (= 10 (get-in ds-res [:NP 0 :content-id]))) + (is (= 11 (get-in ds-res [:NP 0 :content-id]))) (is (= [0x80 0x97 0x0f 0x64 0x05 0x00] (map #(bit-and % 0xff) (get-in ds-res [:NP 0 :data])))) (is (= 1 (count (get ds-res :TS)))) - (is (= 11 (get-in ds-res [:TS 0 :content-id]))) + (is (= 12 (get-in ds-res [:TS 0 :content-id]))) (is (= [0x80 0x96 0x0f 0x00 0xff 0xff 0xff 0xff 0x01 0x00] (map #(bit-and % 0xff) (get-in ds-res [:TS 0 :data])))) (is (= 1 (count (get ds-res :NF)))) - (is (= 12 (get-in ds-res [:NF 0 :content-id]))) + (is (= 13 (get-in ds-res [:NF 0 :content-id]))) (is (zero? (count (get-in ds-res [:NF 0 :data])))) (is (= 1 (count (get ds-res :TL)))) - (is (= 13 (get-in ds-res [:TL 0 :content-id]))) + (is (= 14 (get-in ds-res [:TL 0 :content-id]))) (is (= [0 0 0 0 1] (seq (get-in ds-res [:TL 0 :data])))) (is (= 1 (count (get ds-res :FN)))) - (is (= 14 (get-in ds-res [:FN 0 :content-id]))) + (is (= 15 (get-in ds-res [:FN 0 :content-id]))) (is (= [1 1 0 2 0] (seq (get-in ds-res [:FN 0 :data])))) (is (= 1 (count (get ds-res :FC)))) - (is (= 15 (get-in ds-res [:FC 0 :content-id]))) + (is (= 16 (get-in ds-res [:FC 0 :content-id]))) (is (= [(int \X) (int \S) (int \I) (int \D)] (seq (get-in ds-res [:FC 0 :data])))) (is (= 1 (count (get ds-res :FP)))) - (is (= 16 (get-in ds-res [:FP 0 :content-id]))) + (is (= 17 (get-in ds-res [:FP 0 :content-id]))) (is (= [3 1 2 2] (seq (get-in ds-res [:FP 0 :data])))) (is (= 1 (count (get ds-res :DL)))) - (is (= 17 (get-in ds-res [:DL 0 :content-id]))) + (is (= 18 (get-in ds-res [:DL 0 :content-id]))) (is (= [1] (seq (get-in ds-res [:DL 0 :data])))) (is (= 2 (count (get ds-res :BB)))) - (is (= 18 (get-in ds-res [:BB 0 :content-id]))) + (is (= 19 (get-in ds-res [:BB 0 :content-id]))) (is (zero? (count (get-in ds-res [:BB 0 :data])))) - (is (= 19 (get-in ds-res [:BB 1 :content-id]))) + (is (= 20 (get-in ds-res [:BB 1 :content-id]))) (is (zero? (count (get-in ds-res [:BB 1 :data])))) (is (= 2 (count (get ds-res :QQ)))) - (is (= 20 (get-in ds-res [:QQ 0 :content-id]))) + (is (= 21 (get-in ds-res [:QQ 0 :content-id]))) (is (zero? (count (get-in ds-res [:QQ 0 :data])))) - (is (= 21 (get-in ds-res [:QQ 1 :content-id]))) + (is (= 22 (get-in ds-res [:QQ 1 :content-id]))) (is (zero? (count (get-in ds-res [:QQ 1 :data])))) (is (= 1 (count (get ds-res :BS)))) - (is (= 22 (get-in ds-res [:BS 0 :content-id]))) + (is (= 23 (get-in ds-res [:BS 0 :content-id]))) (is (= [0] (seq (get-in ds-res [:BS 0 :data])))) (is (= 2 (count (get ds-res :IN)))) - (is (= 23 (get-in ds-res [:IN 0 :content-id]))) + (is (= 24 (get-in ds-res [:IN 0 :content-id]))) (is (= [1] (seq (get-in ds-res [:IN 0 :data])))) - (is (= 24 (get-in ds-res [:IN 1 :content-id]))) + (is (= 25 (get-in ds-res [:IN 1 :content-id]))) (is (= "A" (String. ^bytes (get-in ds-res [:IN 1 :data])))) (is (= 1 (count (get ds-res :RS)))) - (is (= 25 (get-in ds-res [:RS 0 :content-id]))) + (is (= 26 (get-in ds-res [:RS 0 :content-id]))) (is (zero? (count (get-in ds-res [:RS 0 :data])))) (is (= 1 (count (get ds-res :PD)))) - (is (= 26 (get-in ds-res [:PD 0 :content-id]))) + (is (= 27 (get-in ds-res [:PD 0 :content-id]))) (is (zero? (count (get-in ds-res [:PD 0 :data])))) (is (= 1 (count (get ds-res :HC)))) - (is (= 27 (get-in ds-res [:HC 0 :content-id]))) + (is (= 28 (get-in ds-res [:HC 0 :content-id]))) (is (zero? (count (get-in ds-res [:HC 0 :data])))) (is (= 2 (count (get ds-res :SC)))) - (is (= 28 (get-in ds-res [:SC 0 :content-id]))) + (is (= 29 (get-in ds-res [:SC 0 :content-id]))) (is (= [2] (seq (get-in ds-res [:SC 0 :data])))) - (is (= 29 (get-in ds-res [:SC 1 :content-id]))) + (is (= 30 (get-in ds-res [:SC 1 :content-id]))) (is (= "CC" (String. ^bytes (get-in ds-res [:SC 1 :data])))) (is (= 1 (count (get ds-res :MQ)))) - (is (= 30 (get-in ds-res [:MQ 0 :content-id]))) + (is (= 31 (get-in ds-res [:MQ 0 :content-id]))) (is (= [0 15 60 15 0] (seq (get-in ds-res [:MQ 0 :data])))) (is (= 1 (count (get ds-res :BA)))) - (is (= 31 (get-in ds-res [:BA 0 :content-id]))) + (is (= 32 (get-in ds-res [:BA 0 :content-id]))) (is (zero? (count (get-in ds-res [:BA 0 :data])))) (is (= 1 (count (get ds-res :QS)))) - (is (= 32 (get-in ds-res [:QS 0 :content-id]))) + (is (= 33 (get-in ds-res [:QS 0 :content-id]))) (is (= "HFHHH##AACCCCFFEBBFFAEEEE" (->> (get-in ds-res [:QS 0 :data]) (map #(+ (long %) 33)) @@ -363,16 +363,16 @@ (into {} (:alignment-stats slice-ctx)))) (is (= 1 (count (get ds-res :BF)))) - (is (= 1 (get-in ds-res [:BF 0 :content-id]))) + (is (= 2 (get-in ds-res [:BF 0 :content-id]))) (is (= [0x45 0x80 0x85 0x45 0x80 0x85 0x45] (map #(bit-and % 0xff) (get-in ds-res [:BF 0 :data])))) (is (= 1 (count (get ds-res :CF)))) - (is (= 2 (get-in ds-res [:CF 0 :content-id]))) + (is (= 3 (get-in ds-res [:CF 0 :content-id]))) (is (= [3 3 3 3 3] (seq (get-in ds-res [:CF 0 :data])))) (is (= 1 (count (get ds-res :RI)))) - (is (= 3 (get-in ds-res [:RI 0 :content-id]))) + (is (= 4 (get-in ds-res [:RI 0 :content-id]))) (is (= [0xff 0xff 0xff 0xff 0x0f 0xff 0xff 0xff 0xff 0x0f 0xff 0xff 0xff 0xff 0x0f @@ -381,15 +381,15 @@ (map #(bit-and % 0xff) (get-in ds-res [:RI 0 :data])))) (is (= 1 (count (get ds-res :RL)))) - (is (= 4 (get-in ds-res [:RL 0 :content-id]))) + (is (= 5 (get-in ds-res [:RL 0 :content-id]))) (is (= [5 5 5 5 5] (seq (get-in ds-res [:RL 0 :data])))) (is (= 1 (count (get ds-res :AP)))) - (is (= 5 (get-in ds-res [:AP 0 :content-id]))) + (is (= 6 (get-in ds-res [:AP 0 :content-id]))) (is (= [0 0 0 0 0] (seq (get-in ds-res [:AP 0 :data])))) (is (= 1 (count (get ds-res :RG)))) - (is (= 6 (get-in ds-res [:RG 0 :content-id]))) + (is (= 7 (get-in ds-res [:RG 0 :content-id]))) (is (= [0xff 0xff 0xff 0xff 0x0f 0xff 0xff 0xff 0xff 0x0f 0xff 0xff 0xff 0xff 0x0f @@ -398,15 +398,15 @@ (map #(bit-and % 0xff) (get-in ds-res [:RG 0 :data])))) (is (= 1 (count (get ds-res :RN)))) - (is (= 7 (get-in ds-res [:RN 0 :content-id]))) + (is (= 8 (get-in ds-res [:RN 0 :content-id]))) (is (= "q001\tq001\tq002\tq002\tq003\t" (String. ^bytes (get-in ds-res [:RN 0 :data])))) (is (= 1 (count (get ds-res :MF)))) - (is (= 8 (get-in ds-res [:MF 0 :content-id]))) + (is (= 9 (get-in ds-res [:MF 0 :content-id]))) (is (= [2 2 2 2 2] (seq (get-in ds-res [:MF 0 :data])))) (is (= 1 (count (get ds-res :NS)))) - (is (= 9 (get-in ds-res [:NS 0 :content-id]))) + (is (= 10 (get-in ds-res [:NS 0 :content-id]))) (is (= [0xff 0xff 0xff 0xff 0x0f 0xff 0xff 0xff 0xff 0x0f 0xff 0xff 0xff 0xff 0x0f @@ -415,88 +415,88 @@ (map #(bit-and % 0xff) (get-in ds-res [:NS 0 :data])))) (is (= 1 (count (get ds-res :NP)))) - (is (= 10 (get-in ds-res [:NP 0 :content-id]))) + (is (= 11 (get-in ds-res [:NP 0 :content-id]))) (is (= [0 0 0 0 0] (seq (get-in ds-res [:NP 0 :data])))) (is (= 1 (count (get ds-res :TS)))) - (is (= 11 (get-in ds-res [:TS 0 :content-id]))) + (is (= 12 (get-in ds-res [:TS 0 :content-id]))) (is (= [0 0 0 0 0] (seq (get-in ds-res [:TS 0 :data])))) (is (= 1 (count (get ds-res :NF)))) - (is (= 12 (get-in ds-res [:NF 0 :content-id]))) + (is (= 13 (get-in ds-res [:NF 0 :content-id]))) (is (zero? (count (get-in ds-res [:NF 0 :data])))) (is (= 1 (count (get ds-res :TL)))) - (is (= 13 (get-in ds-res [:TL 0 :content-id]))) + (is (= 14 (get-in ds-res [:TL 0 :content-id]))) (is (= [0 0 0 0 0] (seq (get-in ds-res [:TL 0 :data])))) (is (= 1 (count (get ds-res :FN)))) - (is (= 14 (get-in ds-res [:FN 0 :content-id]))) + (is (= 15 (get-in ds-res [:FN 0 :content-id]))) (is (zero? (count (get-in ds-res [:FN 0 :data])))) (is (= 1 (count (get ds-res :FC)))) - (is (= 15 (get-in ds-res [:FC 0 :content-id]))) + (is (= 16 (get-in ds-res [:FC 0 :content-id]))) (is (zero? (count (get-in ds-res [:FC 0 :data])))) (is (= 1 (count (get ds-res :FP)))) - (is (= 16 (get-in ds-res [:FP 0 :content-id]))) + (is (= 17 (get-in ds-res [:FP 0 :content-id]))) (is (zero? (count (get-in ds-res [:FP 0 :data])))) (is (= 1 (count (get ds-res :DL)))) - (is (= 17 (get-in ds-res [:DL 0 :content-id]))) + (is (= 18 (get-in ds-res [:DL 0 :content-id]))) (is (zero? (count (get-in ds-res [:DL 0 :data])))) (is (= 2 (count (get ds-res :BB)))) - (is (= 18 (get-in ds-res [:BB 0 :content-id]))) + (is (= 19 (get-in ds-res [:BB 0 :content-id]))) (is (zero? (count (get-in ds-res [:BB 0 :data])))) - (is (= 19 (get-in ds-res [:BB 1 :content-id]))) + (is (= 20 (get-in ds-res [:BB 1 :content-id]))) (is (zero? (count (get-in ds-res [:BB 1 :data])))) (is (= 2 (count (get ds-res :QQ)))) - (is (= 20 (get-in ds-res [:QQ 0 :content-id]))) + (is (= 21 (get-in ds-res [:QQ 0 :content-id]))) (is (zero? (count (get-in ds-res [:QQ 0 :data])))) - (is (= 21 (get-in ds-res [:QQ 1 :content-id]))) + (is (= 22 (get-in ds-res [:QQ 1 :content-id]))) (is (zero? (count (get-in ds-res [:QQ 1 :data])))) (is (= 1 (count (get ds-res :BS)))) - (is (= 22 (get-in ds-res [:BS 0 :content-id]))) + (is (= 23 (get-in ds-res [:BS 0 :content-id]))) (is (zero? (count (get-in ds-res [:BS 0 :data])))) (is (= 2 (count (get ds-res :IN)))) - (is (= 23 (get-in ds-res [:IN 0 :content-id]))) + (is (= 24 (get-in ds-res [:IN 0 :content-id]))) (is (zero? (count (get-in ds-res [:IN 0 :data])))) - (is (= 24 (get-in ds-res [:IN 1 :content-id]))) + (is (= 25 (get-in ds-res [:IN 1 :content-id]))) (is (zero? (count (get-in ds-res [:IN 1 :data])))) (is (= 1 (count (get ds-res :RS)))) - (is (= 25 (get-in ds-res [:RS 0 :content-id]))) + (is (= 26 (get-in ds-res [:RS 0 :content-id]))) (is (zero? (count (get-in ds-res [:RS 0 :data])))) (is (= 1 (count (get ds-res :PD)))) - (is (= 26 (get-in ds-res [:PD 0 :content-id]))) + (is (= 27 (get-in ds-res [:PD 0 :content-id]))) (is (zero? (count (get-in ds-res [:PD 0 :data])))) (is (= 1 (count (get ds-res :HC)))) - (is (= 27 (get-in ds-res [:HC 0 :content-id]))) + (is (= 28 (get-in ds-res [:HC 0 :content-id]))) (is (zero? (count (get-in ds-res [:HC 0 :data])))) (is (= 2 (count (get ds-res :SC)))) - (is (= 28 (get-in ds-res [:SC 0 :content-id]))) + (is (= 29 (get-in ds-res [:SC 0 :content-id]))) (is (zero? (count (get-in ds-res [:SC 0 :data])))) - (is (= 29 (get-in ds-res [:SC 1 :content-id]))) + (is (= 30 (get-in ds-res [:SC 1 :content-id]))) (is (zero? (count (get-in ds-res [:SC 1 :data])))) (is (= 1 (count (get ds-res :MQ)))) - (is (= 30 (get-in ds-res [:MQ 0 :content-id]))) + (is (= 31 (get-in ds-res [:MQ 0 :content-id]))) (is (zero? (count (get-in ds-res [:MQ 0 :data])))) (is (= 1 (count (get ds-res :BA)))) - (is (= 31 (get-in ds-res [:BA 0 :content-id]))) + (is (= 32 (get-in ds-res [:BA 0 :content-id]))) (is (= "AATCCATTGTTGGTATCTTGGCACA" (String. ^bytes (get-in ds-res [:BA 0 :data])))) (is (= 1 (count (get ds-res :QS)))) - (is (= 32 (get-in ds-res [:QS 0 :content-id]))) + (is (= 33 (get-in ds-res [:QS 0 :content-id]))) (is (= "CCFFFBDFADADDHFDDDFDBCCFD" (->> (get-in ds-res [:QS 0 :data]) (map #(+ (long %) 33)) diff --git a/test/cljam/io/cram_test.clj b/test/cljam/io/cram_test.clj index 68086d16..b04b9f7c 100644 --- a/test/cljam/io/cram_test.clj +++ b/test/cljam/io/cram_test.clj @@ -1,12 +1,16 @@ (ns cljam.io.cram-test (:require [cljam.io.cram :as cram] + [cljam.io.cram.decode.structure :as struct] + [cljam.io.cram.reader :as reader] [cljam.io.sam :as sam] [cljam.test-common :as common :refer [clean-cache! deftest-remote prepare-cache! prepare-cavia! with-before-after]] [clojure.java.io :as io] - [clojure.test :refer [are deftest is testing]])) + [clojure.test :refer [are deftest is testing]]) + (:import [cljam.io.cram.reader CRAMReader] + [java.nio.channels FileChannel])) (def ^:private temp-cram-file (io/file common/temp-dir "test.cram")) (def ^:private temp-cram-file-2 (io/file common/temp-dir "test2.cram")) @@ -125,6 +129,60 @@ (is (= (cram/read-alignments r) (cram/read-alignments r')))))) +(defn- all-compression-headers [^CRAMReader r] + (let [^FileChannel ch (.-channel r)] + (letfn [(read1 [container-header bb] + (when-not (struct/eof-container? container-header) + (struct/decode-compression-header-block bb))) + (step [] + (when (< (.position ch) (.size ch)) + (when-let [header (#'reader/read-container-with r read1)] + (cons header (lazy-seq (step))))))] + (step)))) + +(deftest writer-with-reference-embedding-test + (with-before-after {:before (prepare-cache!) + :after (clean-cache!)} + (testing "Reference embedding will be enabled if `:embed-reference?` is set to true" + (with-open [r (cram/reader common/test-sorted-cram-file + {:reference common/test-fa-file}) + w (cram/writer temp-sorted-cram-file + {:reference common/test-fa-file + :embed-reference? true + :skip-sort-order-check? true + :ds-compressor-overrides {:embedded-ref :raw}})] + (cram/write-header w (cram/read-header r)) + (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) + (with-open [r (cram/reader common/test-sorted-cram-file + {:reference common/test-fa-file}) + r' (cram/reader temp-sorted-cram-file)] + (is (= (cram/read-header r) + (cram/read-header r'))) + (is (= (cram/read-alignments r) + (cram/read-alignments r')))) + (with-open [r (cram/reader temp-sorted-cram-file)] + (let [headers (all-compression-headers r)] + (is (seq headers)) + (is (every? #(false? (get-in % [:preservation-map :RR])) headers))))) + (testing "Error when trying to embed reference sequences for a CRAM file not declared as `SO:coordinate`" + (with-open [r (cram/reader common/test-sorted-with-unknown-so-cram-file + {:reference common/test-fa-file}) + w (cram/writer temp-cram-file + {:reference common/test-fa-file + :embed-reference? true + :ds-compressor-overrides {:embedded-ref :raw}})] + (is (thrown-with-msg? Exception #"Cannot embed reference sequences for CRAM file not declared as sorted by coordinate" + (cram/write-header w (cram/read-header r)))))) + (testing "`:skip-sort-order-check?` skips the header check when embedding reference sequences" + (with-open [r (cram/reader common/test-sorted-with-unknown-so-cram-file + {:reference common/test-fa-file}) + w (cram/writer temp-cram-file + {:reference common/test-fa-file + :embed-reference? true + :skip-sort-order-check? true + :ds-compressor-overrides {:embedded-ref :raw}})] + (is (common/not-throw? (cram/write-header w (cram/read-header r)))))))) + (deftest writer-index-options-test (with-before-after {:before (prepare-cache!) :after (clean-cache!)}