From aad39bd847ea71c086f6bec29147e8bf1d978d4c Mon Sep 17 00:00:00 2001
From: Mikhail Karasikov
Date: Sat, 28 Nov 2020 23:39:04 +0100
Subject: [PATCH 01/23] row-diff transform for relation counts

---
 .../src/annotation/annotation_converters.cpp  |  27 +++-
 .../src/annotation/annotation_converters.hpp  |   3 +-
 .../annotate_column_compressed.hpp            |   3 +-
 metagraph/src/annotation/row_diff_builder.cpp | 147 ++++++++++++++++--
 metagraph/src/annotation/row_diff_builder.hpp |   3 +-
 metagraph/src/cli/config/config.cpp           |   1 +
 metagraph/src/cli/transform_annotation.cpp    |   3 +-
 7 files changed, 163 insertions(+), 24 deletions(-)

diff --git a/metagraph/src/annotation/annotation_converters.cpp b/metagraph/src/annotation/annotation_converters.cpp
index c9d65ac9da..27e89446a6 100644
--- a/metagraph/src/annotation/annotation_converters.cpp
+++ b/metagraph/src/annotation/annotation_converters.cpp
@@ -1122,7 +1122,8 @@ void convert_to_row_diff(const std::vector &files,
                          size_t mem_bytes,
                          uint32_t max_path_length,
                          std::filesystem::path dest_dir,
-                         bool optimize) {
+                         bool optimize,
+                         bool with_counts) {
     if (!files.size())
         return;
 
@@ -1142,9 +1143,24 @@ void convert_to_row_diff(const std::vector &files,
         size_t mem_bytes_left = mem_bytes;
         std::vector file_batch;
         for ( ; i < files.size(); ++i) {
-            // also include two buffers (fwd and back) for each column transformed
-            uint64_t file_size = std::filesystem::file_size(files[i])
-                    + ROW_DIFF_BUFFER_SIZE * sizeof(uint64_t) * 2;
+            // take annotation size
+            uint64_t file_size = std::filesystem::file_size(files[i]);
+            if (!with_counts) {
+                // also include two buffers (fwd and back) for each column transformed
+                file_size += ROW_DIFF_BUFFER_SIZE * sizeof(uint64_t) * 2;
+            } else {
+                // each buffer will store pairs (idx, count)
+                file_size += ROW_DIFF_BUFFER_SIZE * (sizeof(uint64_t) * 2) * 2;
+                // also add k-mer counts; check they exist first, file_size() throws if missing
+                const auto &counts_fname = files[i] + ".counts";
+                if (!std::filesystem::exists(counts_fname)) {
+                    logger->warn("Could not find counts for annotation {}, skipped",
+                                 counts_fname);
+                    continue;
+                }
+                file_size += std::filesystem::file_size(counts_fname);
+            }
+
             if (file_size > mem_bytes) {
                 logger->warn(
                     "Not enough memory to process {}, requires {} MB, skipped",
@@ -1180,7 +1196,8 @@ void convert_to_row_diff(const std::vector &files,
         convert_batch_to_row_diff(
                 graph_fname,
                 graph_fname + kRowDiffAnchorExt + (optimize ? "" : ".unopt"),
-                file_batch, dest_dir, row_reduction_fname, ROW_DIFF_BUFFER_SIZE);
+                file_batch, dest_dir, row_reduction_fname, ROW_DIFF_BUFFER_SIZE,
+                with_counts);
         logger->trace("Batch transformed in {} sec", timer.elapsed());
     }
 
diff --git a/metagraph/src/annotation/annotation_converters.hpp b/metagraph/src/annotation/annotation_converters.hpp
index d4ed5294a6..ee0d828c3f 100644
--- a/metagraph/src/annotation/annotation_converters.hpp
+++ b/metagraph/src/annotation/annotation_converters.hpp
@@ -108,7 +108,8 @@ void convert_to_row_diff(const std::vector &files,
                          size_t mem_bytes,
                          uint32_t max_path_length,
                          std::filesystem::path dest_dir,
-                         bool optimize = false);
+                         bool optimize = false,
+                         bool with_counts = false);
 
 void convert_row_diff_to_col_compressed(const std::vector &files,
                                         const std::string &outfbase);
diff --git a/metagraph/src/annotation/representation/column_compressed/annotate_column_compressed.hpp b/metagraph/src/annotation/representation/column_compressed/annotate_column_compressed.hpp
index db832a88a7..3b089d17ca 100644
--- a/metagraph/src/annotation/representation/column_compressed/annotate_column_compressed.hpp
+++ b/metagraph/src/annotation/representation/column_compressed/annotate_column_compressed.hpp
@@ -25,6 +25,7 @@ class ColumnCompressed : public MultiLabelEncoded