forked from raymondr/cosmo
-
Notifications
You must be signed in to change notification settings - Fork 2
/
cosmo-build.cpp
73 lines (58 loc) · 2.62 KB
/
cosmo-build.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#include <iostream>
#include <fstream>
#include <libgen.h> // basename
#include "tclap/CmdLine.h"
#include <sdsl/bit_vectors.hpp>
#include <sdsl/wavelet_trees.hpp>
#include "io.hpp"
#include "debruijn_graph.hpp"
#include "algorithm.hpp"
using namespace std;
using namespace sdsl;
// File extension appended to the output prefix to form the graph's output
// filename; also quoted in the command-line help text.
string extension = ".dbg";
// Command-line options collected by parse_arguments().
// Both fields start out as empty strings.
struct parameters_t {
  // Path of the input file given as the unlabeled command-line argument.
  std::string input_filename{};
  // Value of the -o/--output_prefix option; empty when not supplied.
  std::string output_prefix{};
};
void parse_arguments(int argc, char **argv, parameters_t & params);
void parse_arguments(int argc, char **argv, parameters_t & params)
{
TCLAP::CmdLine cmd("Cosmo Copyright (c) Alex Bowe (alexbowe.com) 2014", ' ', VERSION);
TCLAP::UnlabeledValueArg<std::string> input_filename_arg("input",
".packed edge file (output from pack-edges).", true, "", "input_file", cmd);
string output_short_form = "output_prefix";
TCLAP::ValueArg<std::string> output_prefix_arg("o", "output_prefix",
"Output prefix. Graph will be written to [" + output_short_form + "]" + extension + ". " +
"Default prefix: basename(input_file).", false, "", output_short_form, cmd);
cmd.parse( argc, argv );
params.input_filename = input_filename_arg.getValue();
params.output_prefix = output_prefix_arg.getValue();
}
int main(int argc, char* argv[]) {
parameters_t p;
parse_arguments(argc, argv, p);
ifstream input(p.input_filename, ios::in|ios::binary|ios::ate);
// Can add this to save a couple seconds off traversal - not really worth it.
//vector<size_t> minus_positions;
debruijn_graph<> dbg = debruijn_graph<>::load_from_packed_edges(input, "$ACGT"/*, &minus_positions*/);
input.close();
cerr << "k : " << dbg.k << endl;
cerr << "num_nodes() : " << dbg.num_nodes() << endl;
cerr << "num_edges() : " << dbg.num_edges() << endl;
cerr << "Total size : " << size_in_mega_bytes(dbg) << " MB" << endl;
cerr << "Bits per edge : " << bits_per_element(dbg) << " Bits" << endl;
// The parameter should be const... On my computer the parameter
// isn't const though, yet it doesn't modify the string...
// This is still done AFTER loading the file just in case
char * base_name = basename(const_cast<char*>(p.input_filename.c_str()));
string outfilename = ((p.output_prefix == "")? base_name : p.output_prefix) + extension;
store_to_file(dbg, outfilename);
#ifdef VAR_ORDER
wt_int<rrr_vector<63>> lcs;
construct(lcs, base_name + string(".lcs"), 1);
cerr << "LCS size : " << size_in_mega_bytes(lcs) << " MB" << endl;
cerr << "LCS bits/edge : " << bits_per_element(lcs) << " Bits" << endl;
store_to_file(lcs, outfilename + ".lcs.wt");
// TODO: Write compressed LCS
#endif
}