Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IntRowDiff to annotate k-mer counts #315

Merged
merged 28 commits into from
May 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
aad39bd
row-diff transform for relation counts
karasikov Nov 28, 2020
bf91064
Merge branch 'rd_counts' into mk/ann
karasikov Apr 16, 2021
7e432e4
row-diff transform for counts
karasikov Apr 16, 2021
9b9d551
minor: removed constexpr
karasikov Apr 17, 2021
832f7d5
minor
karasikov Apr 18, 2021
938e709
cleanup
karasikov Apr 20, 2021
2b7256d
added IntRowDiff -- a representation for rd-transformed integer matrices
karasikov Apr 20, 2021
f4a8976
transform to IntRowDiff<Int-Multi-BRWT>
karasikov Apr 20, 2021
6f6260b
increment k-mer counts from any input source
karasikov Apr 20, 2021
c753da9
pack int-row-diff into column-compressed
karasikov Apr 21, 2021
df8e577
fixed ci
karasikov Apr 21, 2021
5a062e5
unified row-diff annotations and their loading
karasikov Apr 21, 2021
6bdd9f6
code negative values to pack into smaller integers, also other fixes
karasikov Apr 21, 2021
12bc0aa
added row-diff-int-brwt tests
karasikov Apr 21, 2021
8be1554
cleanup
karasikov Apr 21, 2021
1a919b0
minor
karasikov Apr 22, 2021
409ff91
Merge branch 'dev' into mk/ann
karasikov Apr 22, 2021
c1751be
Merge branch 'dev' into mk/ann
karasikov Apr 26, 2021
1eb2a68
skip empty columns, renamed relation counts -> values
karasikov Apr 26, 2021
63aca3b
allow missing values for empty columns
karasikov Apr 26, 2021
c546c99
skip empty annotations
karasikov Apr 26, 2021
fa8990e
minor
karasikov Apr 28, 2021
2020bcb
skip zero diffs, changed encoding
karasikov Apr 28, 2021
2a4a1ec
Merge remote-tracking branch 'origin/dev' into mk/ann
karasikov Apr 28, 2021
0e87207
added construction scripts
karasikov May 3, 2021
5e654cc
Merge remote-tracking branch 'origin/dev' into mk/ann
karasikov May 3, 2021
ec4aa46
added comment
karasikov May 3, 2021
64a80f8
cleanup
karasikov May 3, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 74 additions & 15 deletions metagraph/integration_tests/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
'hashstr': '.hashstrdbg'}

anno_file_extension = {'column': '.column.annodbg',
'row': '.row.annodbg'}
'row': '.row.annodbg',
'row_diff': '.row_diff.annodbg'}

GRAPH_TYPES = [graph_type for graph_type, _ in graph_file_extension.items()]

Expand All @@ -30,8 +31,8 @@ def setUpClass(cls):
cls.tempdir = TemporaryDirectory()

@staticmethod
def _get_stats(graph_filename):
stats_command = METAGRAPH + ' stats ' + graph_filename
def _get_stats(graph_path):
# Run `<METAGRAPH> stats <graph_path>` and return the finished process
# object, with stdout and stderr captured through pipes.
# The command string is split on whitespace before execution, so a
# `graph_path` containing spaces is passed as several separate arguments.
stats_command = METAGRAPH + ' stats ' + graph_path
res = subprocess.run(stats_command.split(), stdout=PIPE, stderr=PIPE)
return res

Expand Down Expand Up @@ -102,15 +103,73 @@ def _clean(graph, output, extra_params=''):
assert res.returncode == 0

@staticmethod
def _annotate_graph(input, graph_path, output, anno_repr):
# Build a single `annotate --anno-header` command line for the graph at
# `graph_path`, requesting annotation type `anno_repr`, output prefix
# `output`, NUM_THREADS threads and the input file(s) in `input`.
# The command is executed through the shell and must exit with status 0.
annotate_command = '{exe} annotate --anno-header -i {graph} \
--anno-type {anno_repr} -o {outfile} -p {num_threads} {input}'.format(
exe=METAGRAPH,
num_threads=NUM_THREADS,
graph=graph_path,
anno_repr=anno_repr,
outfile=output,
input=input
)
res = subprocess.run([annotate_command], shell=True)
assert res.returncode == 0
def _annotate_graph(input, graph_path, output, anno_repr,
separate=False, no_fork_opt=False, no_anchor_opt=False):
# Annotate the graph at `graph_path` with `input` and, when the requested
# representation cannot be produced by `annotate` directly, convert the
# result with one or more `transform_anno` runs. Every external command is
# run through the shell and must exit with status 0.
#
# input          -- input file(s) appended to the `annotate` command line
# graph_path     -- path to the graph passed with `-i`
# output         -- output prefix passed with `-o`
# anno_repr      -- requested annotation representation
# separate       -- additionally run each row-diff stage once without the
#                   input annotation, feeding `echo ""` to its stdin
# no_fork_opt    -- when set, the transform run without `--row-diff-stage`
#                   is not requested
# no_anchor_opt  -- when set, the `--row-diff-stage 1` run is not requested
#
# NOTE(review): indentation is not visible in this view, so the exact
# nesting of the conditionals below should be confirmed against the file.
target_anno = anno_repr
# Pick the base representation that `annotate` builds first: 'column' for
# 'row_sparse', anything ending in 'brwt' or starting with 'row_diff';
# 'row' for 'flat' and 'rbfish'. `target_anno` keeps the requested name
# (it already equals `anno_repr` at this point).
if anno_repr in {'row_sparse'} or anno_repr.endswith('brwt') or anno_repr.startswith('row_diff'):
target_anno = anno_repr
anno_repr = 'column'
elif anno_repr in {'flat', 'rbfish'}:
target_anno = anno_repr
anno_repr = 'row'

command = f'{METAGRAPH} annotate -p {NUM_THREADS} --anno-header \
-i {graph_path} --anno-type {anno_repr} \
-o {output} {input}'

# Representations ending in 'int_brwt' are built with `--count-kmers`.
with_counts = target_anno.endswith('int_brwt')
if with_counts:
command += ' --count-kmers'

res = subprocess.run([command], shell=True)
assert(res.returncode == 0)

# Nothing to convert when the base representation is the requested one.
if target_anno == anno_repr:
return

# Every 'row_diff*' target is first transformed to plain 'row_diff';
# `final_anno` remembers the representation that was actually requested.
final_anno = target_anno
if final_anno.startswith('row_diff'):
target_anno = 'row_diff'

# The input of the transform is the annotation file written above.
command = f'{METAGRAPH} transform_anno -p {NUM_THREADS} \
--anno-type {target_anno} -o {output} \
{output + anno_file_extension[anno_repr]}'

# Extra flag appended to the transform invocations that take the input
# annotation (not to the `echo ""` invocations below).
other_args = ' --count-kmers' if with_counts else ''

# The row-diff transform is also given the graph.
if target_anno == 'row_diff':
command += ' -i ' + graph_path

# Transform run without any `--row-diff-stage` flag; presumably the run
# and its assert are guarded by `no_fork_opt` -- confirm nesting.
if not no_fork_opt:
if target_anno.startswith('row_diff'):
print('-- Building RowDiff without fork optimization...')
res = subprocess.run([command + other_args], shell=True)
assert(res.returncode == 0)

if target_anno == 'row_diff':
# Drop the third-from-last space-separated token. Since ' -i <graph>'
# was appended last, that token is the input annotation path (as long as
# `graph_path` itself contains no spaces).
without_input_anno = command.split(' ')
without_input_anno.pop(-3)
without_input_anno = ' '.join(without_input_anno)
if not no_anchor_opt:
if separate:
print('-- Building RowDiff succ/pred...')
res = subprocess.run(['echo \"\" | ' + without_input_anno + ' --row-diff-stage 1'], shell=True)
assert(res.returncode == 0)
res = subprocess.run([command + ' --row-diff-stage 1' + other_args], shell=True)
assert(res.returncode == 0)
if separate:
print('-- Assigning anchors...')
res = subprocess.run(['echo \"\" | ' + without_input_anno + ' --row-diff-stage 2'], shell=True)
assert(res.returncode == 0)
res = subprocess.run([command + ' --row-diff-stage 2' + other_args], shell=True)
assert(res.returncode == 0)

# A final conversion is needed when the requested representation is not
# the plain one produced above. Its input is the '.column.annodbg' file
# when counts are used and the '.row_diff.annodbg' file otherwise; that
# intermediate file is removed afterwards. When no final conversion is
# needed, the base annotation file is removed instead.
if final_anno != target_anno:
rd_type = 'column' if with_counts else 'row_diff'
command = f'{METAGRAPH} transform_anno --anno-type {final_anno} --greedy -o {output} ' \
f'-i {graph_path} -p {NUM_THREADS} {output}.{rd_type}.annodbg'
res = subprocess.run([command], shell=True)
assert (res.returncode == 0)
os.remove(output + anno_file_extension[rd_type])
else:
os.remove(output + anno_file_extension[anno_repr])
112 changes: 17 additions & 95 deletions metagraph/integration_tests/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,13 @@
import glob
import os
from helpers import get_test_class_name
from base import TestingBase, METAGRAPH, TEST_DATA_DIR, graph_file_extension


"""Test graph construction"""

METAGRAPH = './metagraph'
DNA_MODE = os.readlink(METAGRAPH).endswith("_DNA")
PROTEIN_MODE = os.readlink(METAGRAPH).endswith("_Protein")
TEST_DATA_DIR = os.path.dirname(os.path.realpath(__file__)) + '/../tests/data'

graph_file_extension = {'succinct': '.dbg',
'bitmap': '.bitmapdbg',
'hash': '.orhashdbg',
'hashfast': '.hashfastdbg',
'hashstr': '.hashstrdbg'}

anno_file_extension = {'column': '.column.annodbg',
'row': '.row.annodbg',
Expand All @@ -30,6 +23,8 @@
'row_diff_sparse': '.row_diff_sparse.annodbg',
'rb_brwt': '.rb_brwt.annodbg',
'brwt': '.brwt.annodbg',
'int_brwt': '.int_brwt.annodbg',
'row_diff_int_brwt': '.row_diff_int_brwt.annodbg',
'rbfish': '.rbfish.annodbg',
'flat': '.flat.annodbg'}

Expand All @@ -46,79 +41,6 @@ def product(graph_types, anno_types):
result.append((graph, anno))
return result

def build_annotation(graph_filename, input_fasta, anno_repr, output_filename,
separate=False, no_fork_opt=False, no_anchor_opt=False):
# Annotate the graph in `graph_filename` with `input_fasta` and convert the
# result to `anno_repr` when `annotate` cannot produce it directly. Every
# external command is run through the shell and must exit with status 0.
#
# separate       -- additionally run each row-diff stage once without the
#                   input annotation, feeding `echo ""` to its stdin
# no_fork_opt    -- when set, the transform run without `--row-diff-stage`
#                   is not requested
# no_anchor_opt  -- when set, the `--row-diff-stage 1` run is not requested
#
# NOTE(review): indentation is not visible in this view, so the exact
# nesting of the conditionals below should be confirmed against the file.
target_anno = anno_repr
# Base representation built by `annotate`: 'column' for 'rb_brwt', 'brwt',
# 'row_sparse' and anything starting with 'row_diff'; 'row' for 'flat' and
# 'rbfish'. `target_anno` keeps the requested name.
if anno_repr in {'rb_brwt', 'brwt', 'row_sparse'} or anno_repr.startswith('row_diff'):
target_anno = anno_repr
anno_repr = 'column'
elif anno_repr in {'flat', 'rbfish'}:
target_anno = anno_repr
anno_repr = 'row'

annotate_command = '{exe} annotate -p {num_threads} --anno-header -i {graph} \
--anno-type {anno_repr} -o {outfile} {input}'.format(
exe=METAGRAPH,
num_threads=NUM_THREADS,
graph=graph_filename,
anno_repr=anno_repr,
outfile=output_filename,
input=input_fasta
)
res = subprocess.run([annotate_command], shell=True)
assert(res.returncode == 0)

# Nothing to convert when the base representation is the requested one.
if target_anno == anno_repr:
return

# Every 'row_diff*' target is first transformed to plain 'row_diff';
# `final_anno` remembers the representation that was actually requested.
final_anno = target_anno
if final_anno.startswith('row_diff'):
target_anno = 'row_diff'

# The input of the transform is the annotation file written above.
# (`graph=` is passed to format() but the template has no {graph} field.)
annotate_command = '{exe} transform_anno -p {num_threads} \
--anno-type {target_anno} -o {outfile} {input}'.format(
exe=METAGRAPH,
num_threads=NUM_THREADS,
graph=graph_filename,
target_anno=target_anno,
outfile=output_filename,
input=output_filename + anno_file_extension[anno_repr]
)
# The row-diff transform is also given the graph.
if target_anno == 'row_diff':
annotate_command += ' -i ' + graph_filename

# Transform run without any `--row-diff-stage` flag.
if not no_fork_opt:
print('-- Building RowDiff without fork optimization...')
res = subprocess.run([annotate_command], shell=True)
assert(res.returncode == 0)

if target_anno == 'row_diff':
# Drop the third-from-last space-separated token. Since ' -i <graph>'
# was appended last, that token is the input annotation path (as long as
# `graph_filename` itself contains no spaces).
without_input_anno = annotate_command.split(' ')
without_input_anno.pop(-3)
without_input_anno = ' '.join(without_input_anno)
if not no_anchor_opt:
if separate:
print('-- Building RowDiff succ/pred...')
res = subprocess.run(['echo \"\" | ' + without_input_anno + ' --row-diff-stage 1'], shell=True)
assert(res.returncode == 0)
res = subprocess.run([annotate_command + ' --row-diff-stage 1'], shell=True)
assert(res.returncode == 0)
if separate:
print('-- Assigning anchors...')
res = subprocess.run(['echo \"\" | ' + without_input_anno + ' --row-diff-stage 2'], shell=True)
assert(res.returncode == 0)
res = subprocess.run([annotate_command + ' --row-diff-stage 2'], shell=True)
assert(res.returncode == 0)

# The base annotation file is no longer needed once it has been transformed.
os.remove(output_filename + anno_file_extension[anno_repr])

# When a 'row_diff_*' representation was requested, convert the intermediate
# '.row_diff.annodbg' file to it with `--greedy` and remove the intermediate.
if final_anno != target_anno:
annotate_command = f'{METAGRAPH} transform_anno --anno-type {final_anno} --greedy -o {output_filename} ' \
f'-i {graph_filename} -p {NUM_THREADS} {output_filename}.row_diff.annodbg'
res = subprocess.run([annotate_command], shell=True)
assert (res.returncode == 0)
os.remove(output_filename + anno_file_extension['row_diff'])


@parameterized_class(('graph_repr', 'anno_repr'),
input_values=product(
Expand All @@ -129,7 +51,7 @@ def build_annotation(graph_filename, input_fasta, anno_repr, output_filename,
),
class_name_func=get_test_class_name
)
class TestQuery(unittest.TestCase):
class TestQuery(TestingBase):
@classmethod
def setUpClass(cls):
cls.tempdir = TemporaryDirectory()
Expand Down Expand Up @@ -189,11 +111,11 @@ def check_suffix(anno_repr, suffix):
cls.anno_repr, no_fork_opt = check_suffix(cls.anno_repr, '_no_fork_opt')
cls.anno_repr, no_anchor_opt = check_suffix(cls.anno_repr, '_no_anchor_opt')

build_annotation(
cls.tempdir.name + '/graph' + graph_file_extension[cls.graph_repr],
cls._annotate_graph(
TEST_DATA_DIR + '/transcripts_100.fa',
cls.anno_repr,
cls.tempdir.name + '/graph' + graph_file_extension[cls.graph_repr],
cls.tempdir.name + '/annotation',
cls.anno_repr,
separate,
no_fork_opt,
no_anchor_opt
Expand Down Expand Up @@ -590,7 +512,7 @@ def test_batch_query_with_tiny_batch(self):
class_name_func=get_test_class_name
)
@unittest.skipIf(PROTEIN_MODE, "No canonical mode for Protein alphabets")
class TestQueryCanonical(unittest.TestCase):
class TestQueryCanonical(TestingBase):
@classmethod
def setUpClass(cls):
cls.tempdir = TemporaryDirectory()
Expand Down Expand Up @@ -640,11 +562,11 @@ def setUpClass(cls):
res = subprocess.run([convert_command], shell=True)
assert(res.returncode == 0)

build_annotation(
cls.tempdir.name + '/graph' + graph_file_extension[cls.graph_repr],
cls._annotate_graph(
TEST_DATA_DIR + '/transcripts_100.fa',
cls.anno_repr,
cls.tempdir.name + '/annotation'
cls.tempdir.name + '/graph' + graph_file_extension[cls.graph_repr],
cls.tempdir.name + '/annotation',
cls.anno_repr
)

# check annotation
Expand Down Expand Up @@ -774,7 +696,7 @@ def test_batch_query_with_tiny_batch(self):
class_name_func=get_test_class_name
)
@unittest.skipIf(PROTEIN_MODE, "No canonical mode for Protein alphabets")
class TestQueryPrimary(unittest.TestCase):
class TestQueryPrimary(TestingBase):
@classmethod
def setUpClass(cls):
cls.tempdir = TemporaryDirectory()
Expand Down Expand Up @@ -848,11 +770,11 @@ def setUpClass(cls):
res = subprocess.run([convert_command], shell=True)
assert(res.returncode == 0)

build_annotation(
cls.tempdir.name + '/graph' + graph_file_extension[cls.graph_repr],
cls._annotate_graph(
TEST_DATA_DIR + '/transcripts_100.fa',
cls.anno_repr,
cls.tempdir.name + '/annotation'
cls.tempdir.name + '/graph' + graph_file_extension[cls.graph_repr],
cls.tempdir.name + '/annotation',
cls.anno_repr
)

# check annotation
Expand Down
19 changes: 17 additions & 2 deletions metagraph/src/annotation/annotation_converters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1146,7 +1146,8 @@ void convert_to_row_diff(const std::vector<std::string> &files,
fs::path out_dir,
fs::path swap_dir,
RowDiffStage construction_stage,
fs::path count_vector_fname) {
fs::path count_vector_fname,
bool with_values) {
if (out_dir.empty())
out_dir = "./";

Expand Down Expand Up @@ -1184,6 +1185,19 @@ void convert_to_row_diff(const std::vector<std::string> &files,
for ( ; i < files.size(); ++i) {
// also add some space for buffers for each column
uint64_t file_size = fs::file_size(files[i]) + ROW_DIFF_BUFFER_BYTES;
if (with_values) {
// also add k-mer counts
try {
const auto &values_fname
hmusta marked this conversation as resolved.
Show resolved Hide resolved
= utils::remove_suffix(files[i], ColumnCompressed<>::kExtension)
+ ColumnCompressed<>::kCountExtension;
file_size += fs::file_size(values_fname);
} catch (...) {
// Count vectors may be missing for empty annotations. If a count file
// is missing for a non-empty annotation, the error will be thrown later
// in convert_batch_to_row_diff, so we skip it here in any case.
}
}
if (file_size > mem_bytes) {
logger->warn("Not enough memory to process {}, requires {} MB, skipped",
files[i], file_size / 1e6);
Expand Down Expand Up @@ -1214,7 +1228,8 @@ void convert_to_row_diff(const std::vector<std::string> &files,
} else {
convert_batch_to_row_diff(graph_fname,
file_batch, out_dir, swap_dir, count_vector_fname, ROW_DIFF_BUFFER_BYTES,
construction_stage == RowDiffStage::COMPUTE_REDUCTION);
construction_stage == RowDiffStage::COMPUTE_REDUCTION,
with_values);
}

logger->trace("Batch processed in {} sec", timer.elapsed());
Expand Down
4 changes: 3 additions & 1 deletion metagraph/src/annotation/annotation_converters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ void convert_to_row_annotator(const ColumnCompressed<Label> &annotator,
* @param out_dir directory where the transformed columns will be dumped. Filenames are
* kept, extension is changed from 'column.annodbg' to 'row_diff.annodbg'
* @param swap_dir directory for temporary files
* @param with_values row-diff transform with k-mer counts/attributes
*/
enum class RowDiffStage { COUNT_LABELS = 0, COMPUTE_REDUCTION, CONVERT };
void convert_to_row_diff(const std::vector<std::string> &files,
Expand All @@ -113,7 +114,8 @@ void convert_to_row_diff(const std::vector<std::string> &files,
std::filesystem::path out_dir,
std::filesystem::path swap_dir,
RowDiffStage construction_stage,
std::filesystem::path count_vector_fname = "");
std::filesystem::path count_vector_fname = "",
bool with_values = false);

void convert_row_diff_to_col_compressed(const std::vector<std::string> &files,
const std::string &outfbase);
Expand Down
14 changes: 0 additions & 14 deletions metagraph/src/annotation/binary_matrix/row_diff/row_diff.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,6 @@ namespace mtg {
namespace annot {
namespace binmat {

template <class BaseMatrix>
void RowDiff<BaseMatrix>::serialize(const std::string &filename) const {
    // Open `filename` for binary output and delegate to the stream overload.
    // The stream is flushed and closed by its destructor on scope exit.
    std::ofstream out(filename, ios::binary);
    serialize(out);
}

template <class BaseMatrix>
bool RowDiff<BaseMatrix>::load(const std::string &filename) {
    // Open `filename` for binary input and forward the outcome of the
    // stream overload unchanged.
    std::ifstream in(filename, ios::binary);
    return load(in);
}

hmusta marked this conversation as resolved.
Show resolved Hide resolved
template <class BaseMatrix>
void RowDiff<BaseMatrix>::load_anchor(const std::string &filename) {
if (!std::filesystem::exists(filename)) {
Expand Down
Loading