forked from raymondr/cosmo
-
Notifications
You must be signed in to change notification settings - Fork 2
/
io.hpp
148 lines (119 loc) · 4.78 KB
/
io.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#ifndef IO_H
#define IO_H
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#include <cstring>
#include <iostream>
#include <fstream>
#include <vector>
#include <tuple>
#include "dummies.hpp"
#include "kmer.hpp"
#include "debug.h"
#include "kmc_api/kmc_file.h"
static const size_t MAX_BITS_PER_KMER = 128;
static const size_t BUFFER_SIZE = 0x8000; // 32Kb data buffer
using namespace std;
#define DSK_FILE_RECORD_SIZE(NBITS) (((NBITS)/8) + 4)
int kmc_read_header(std::string fname, uint32_t &kmer_num_bits, uint32_t &k, uint64 &_total_kmers, uint32_t &num_colors, std::vector<CKMCFile *> &kmer_data_bases);
// Reads the DSK file input header
int dsk_read_header(int, uint32_t *, uint32_t *);
// Counts the number of records in the file - for allocation purposes
int dsk_num_records(int handle, uint32_t kmer_num_bits, size_t * num_records);
// Read kmers from file into the output array
size_t dsk_read_kmers(int handle, uint32_t kmer_num_bits, uint64_t * kmers_output, uint32_t k);
// Reads the cortex file input header
int cortex_read_header(int, uint32_t *, uint32_t *);
// Counts the number of records in the file - for allocation purposes
int cortex_num_records(const int handle, const uint32_t kmer_num_bits, size_t &num_records, uint32_t &number_of_colours);
// Read kmers from file into the output array
size_t kmc_read_kmers(const int handle, const uint32_t kmer_num_bits, const uint32_t num_colors, uint32_t k, std::vector<uint64_t>& kmers_output, std::vector<color_bv> &kmer_colors, std::vector<CKMCFile *> &kmer_data_bases);
size_t cortex_read_kmers(const int handle, const uint32_t kmer_num_bits, const uint32_t num_colors, const uint32_t k, uint64_t *const &kmers_output, std::vector<color_bv> &kmer_colors);
//void merge_and_output(FILE * outfile, uint64_t * table_a, uint64_t * table_b, uint64_t * incoming_dummies, size_t num_records, size_t num_incoming_dummies, uint32_t k);
typedef uint8_t packed_edge;
#define PACKED_WIDTH (5)
#define PACKED_CAPACITY (bitwidth<uint64_t>::width/PACKED_WIDTH)
inline
packed_edge pack_edge(uint8_t symbol, bool start_flag, bool end_flag) {
return (symbol << 2) | (start_flag << 1) | (end_flag);
}
inline
void append_packed_edge(uint64_t & block, packed_edge edge) {
block = (block << PACKED_WIDTH) | edge;
}
class PackedEdgeOutputer {
static const size_t capacity = PACKED_CAPACITY;
ostream & _os;
uint64_t _buf = 0;
size_t _len = 0;
vector<size_t> _counts = vector<size_t>(DNA_RADIX + 1, 0);
bool closed = false;
public:
PackedEdgeOutputer(ostream & os) : _os(os) {}
~PackedEdgeOutputer() {
close();
}
void flush() {
if (_len > 0) {
// Make it so its easy to access the ith member in a block uniformly
_buf <<= ((capacity - _len) * PACKED_WIDTH);
_os.write((char*)&_buf, sizeof(uint64_t));
}
//_os.flush(); // let _os handle this
_buf = 0;
_len = 0;
}
private:
void write_counts() {
vector<size_t> accum(_counts);
for (int i = 1; i < DNA_RADIX+1; i++) {
accum[i] += accum[i-1]; // accumulate
}
// write out counts
_os.write((char*)&accum[0], (DNA_RADIX+1) * sizeof(size_t));
}
public:
void close() {
if (closed) return;
flush();
write_counts();
closed = true;
//_os.close();
}
template <typename kmer_t>
// Might be nicer if this was a Functor with operator() instead
// but then need copy ctor etc
void write(edge_tag tag, const kmer_t & x, const uint32_t k, bool first_start_node, bool first_end_node) {
uint8_t f_sym = get_f(tag, x, k);
uint8_t w_sym = get_w(tag, x);
//printf("counts[%c]: %zu -> ", "$acgt"[f_sym], _counts[f_sym]);
_counts[f_sym]++;
//printf("%zu\n", _counts[f_sym]);
packed_edge edge = pack_edge(w_sym, first_start_node, first_end_node);
//cout << ((edge & 2) >> 1) << endl;
append_packed_edge(_buf, edge);
if (++_len == capacity) flush();
}
};
inline packed_edge get_packed_edge_from_block(uint64_t block, size_t i) {
return (block >> (PACKED_WIDTH * (PACKED_CAPACITY-i-1))) & 31;
}
template <typename BlockIterator>
inline packed_edge get_packed_edge(BlockIterator blocks, size_t i) {
size_t block_idx = i/PACKED_CAPACITY;
size_t local_idx = i%PACKED_CAPACITY;
return get_packed_edge_from_block(blocks[block_idx], local_idx);
}
typedef std::tuple<uint8_t, bool, bool> edge_tuple;
static inline uint8_t unpack_symbol(packed_edge x) { return x >> 2; }
static inline bool unpack_node_flag(packed_edge x) { return ((x & 2) >> 1); }
static inline bool unpack_edge_flag(packed_edge x) { return (x & 1); }
inline edge_tuple unpack_to_tuple(packed_edge x) {
return edge_tuple(unpack_symbol(x), unpack_node_flag(x), unpack_edge_flag(x));
}
template <typename BlockIterator>
inline edge_tuple get_edge(BlockIterator blocks, size_t i) {
return unpack_to_tuple(get_packed_edge(blocks, i));
}
#endif