diff --git a/benchmark/wavelet_trees/src/wt_time_and_space.cpp b/benchmark/wavelet_trees/src/wt_time_and_space.cpp index 6e050e81b..acf28fd5a 100644 --- a/benchmark/wavelet_trees/src/wt_time_and_space.cpp +++ b/benchmark/wavelet_trees/src/wt_time_and_space.cpp @@ -46,26 +46,27 @@ uint64_t test_inverse_select(const t_wt& wt, const vector& is, uint64 return cnt; } -// test interval_symbols +// test ys_in_x_range template uint64_t -test_interval_symbols(typename enable_if::value), - t_wt>::type&, const vector&, const vector&, size_type&, uint64_t, uint64_t) +test_ys_in_x_range(typename enable_if::value), + t_wt>::type&, const vector&, const vector&, uint64_t, uint64_t) { - return 0; // interval_symbols not implemented + return 0; // ys_in_x_range not implemented } template uint64_t -test_interval_symbols(typename enable_if::value, - t_wt>::type& wt, const vector& is, const vector& js, size_type& k, uint64_t mask, uint64_t times=100000000) +test_ys_in_x_range(typename enable_if::value, + t_wt>::type& wt, const vector& is, const vector& js, uint64_t mask, uint64_t times=100000000) { - vector tmp(wt.sigma); - vector tmp2(wt.sigma); uint64_t cnt=0; for (uint64_t i=0; i(*y_it)-std::get<1>(*y_it)); + ++y_it; + } } return cnt; } @@ -171,20 +172,24 @@ void prepare_for_select(const t_iv& iv, vector& cs, vector struct wt_trait { - static uint64_t test_access(const t_wt& wt, const vector& is, uint64_t mask, uint64_t times=100000000) { + static uint64_t test_access(const t_wt& wt, const vector& is, uint64_t mask, uint64_t times=100000000) + { return ::test_access(wt, is, mask, times); } - static uint64_t test_inverse_select(const t_wt& wt, const vector& is, uint64_t mask, uint64_t times=100000000) { + static uint64_t test_inverse_select(const t_wt& wt, const vector& is, uint64_t mask, uint64_t times=100000000) + { return ::test_inverse_select(wt, is, mask, times); } }; template struct wt_trait> { - static uint64_t test_access(const wt_gmr_rs&, const vector&, uint64_t, uint64_t) { + static uint64_t test_access(const wt_gmr_rs&, const vector&, uint64_t, uint64_t) + { return 0; } - static uint64_t test_inverse_select(const wt_gmr_rs&, const vector&, uint64_t, uint64_t) { + static uint64_t test_inverse_select(const wt_gmr_rs&, const vector&, uint64_t, uint64_t) + { return 0; } }; @@ -203,8 +208,6 @@ int main(int argc, char* argv[]) uint64_t check = 0; uint64_t size = 1< cs(size); vector is(size); vector is2(size); @@ -255,13 +258,13 @@ int main(int argc, char* argv[]) cout << "# inverse_select_time = " << duration_cast(stop-start).count()/(double)reps << endl; cout << "# inverse_select_check = " << check << endl; - // interval_symbols - const uint64_t reps_interval_symbols = wt.sigma < 10000 ? reps : reps/100; + // ys_in_x_range + const uint64_t reps_ys_in_x_range = wt.sigma < 10000 ? reps : reps/100; start = timer::now(); - check = test_interval_symbols(wt, is, js, k, mask, reps_interval_symbols); + check = test_ys_in_x_range(wt, is, js, mask, reps_ys_in_x_range); stop = timer::now(); - cout << "# interval_symbols_time = " << duration_cast(stop-start).count()/(double)reps_interval_symbols << endl; - cout << "# interval_symbols_check = " << check << endl; + cout << "# ys_in_x_range_time = " << duration_cast(stop-start).count()/(double)reps_ys_in_x_range << endl; + cout << "# ys_in_x_range_check = " << check << endl; // lex_count start = timer::now(); diff --git a/benchmark/wavelet_trees/visualize/wt.R b/benchmark/wavelet_trees/visualize/wt.R index b296a895d..68bdfdfb8 100644 --- a/benchmark/wavelet_trees/visualize/wt.R +++ b/benchmark/wavelet_trees/visualize/wt.R @@ -154,9 +154,9 @@ for(tc in unique(maindata$TC_ID)){ widths=c(1.35,1), heights=c(1,1,1)) #interval-symbols-plot - ivs <-data['interval_symbols_time'] + ivs <-data['ys_in_x_range_time'] rownames(ivs)<-id - plot_time_figure(t(ivs),"\\tt{interval\\_symbols}",xmax=max(xmax,max(ivs))) + plot_time_figure(t(ivs),"\\tt{ys\\_in\\_x\\_range}",xmax=max(xmax,max(ivs))) #constructor-plot con <-data['constructs_time'] diff --git a/examples/top_wt.cpp b/examples/top_wt.cpp new file mode 100644 index 000000000..3cbf28936 --- /dev/null +++ b/examples/top_wt.cpp @@ -0,0 +1,31 @@ +#include +#include +#include + +using namespace std; +using namespace sdsl; + +int main() +{ + wt_topk<> wtk; + construct_im(wtk, {{0,0,2},{1,2,3},{2,1,2},{3,0,2},{4,0,1},{5,1,4},{6,0,1},{7,1,1},{8,0,8},{9,2,5}}); + wtk.print_info(); + + auto topk_it = top_k(wtk, {2,0}, {7,1}); + while (topk_it) { + auto point_weight = *topk_it; + cout << point_weight.first <<" weight: "<,rmq_succinct_sct, dac_vector<>, false> wtk2; + construct_im(wtk2, {{0,0,2},{1,2,3},{2,1,2},{3,0,2},{4,0,1},{5,1,4},{6,0,1},{7,1,1},{8,0,8},{9,2,5}}); + wtk2.print_info(); + + auto topk_it2 = top_k(wtk2, {2,0}, {7,1}); + while (topk_it2) { + auto point_weight = *topk_it2; + cout << point_weight.first <<" weight: "< +#include +#include +#include + +using namespace sdsl; +using namespace std; + + +int main() +{ + typedef wt_int<> t_wt; + t_wt wt; + construct_im(wt, "9 4 3 2 1 4 6 3 1 4 6 5 3 2 1 3 5 3 2 3 4",'d'); + cout << wt << endl; + auto y_it = ys_in_x_range(wt, 0, wt.size()); + while (y_it) { + auto y = *y_it; + cout << get<0>(y) << " ("<< get<1>(y) << "," << get<2>(y) << ")" << endl; + ++y_it; + } + + cout << "count[0,"<(*mts_it); + cout << " ["<(get<1>(*mts_it)) <<","<(get<1>(*mts_it))<<"]"<& partial_lcp, bit_vector& index_done, std::s template void create_C_array(std::vector& C, const tWT& wt) { - uint64_t quantity; // quantity of characters in interval - std::vector cs(wt.sigma); // list of characters in the interval - std::vector rank_c_i(wt.sigma); // number of occurrence of character in [0 .. i-1] - std::vector rank_c_j(wt.sigma); // number of occurrence of character in [0 .. j-1] - C = std::vector(257, 0); - interval_symbols(wt, 0, wt.size(), quantity, cs, rank_c_i, rank_c_j); - for (uint64_t i=0; i(*y_it); + C[c+1] = std::get<2>(*y_it); } for (uint64_t i=1; i::size_type size_type; using node_type = k2_treap_ns::node_type; using point_type = k2_treap_ns::point_type; diff --git a/include/sdsl/k2_treap_algorithm.hpp b/include/sdsl/k2_treap_algorithm.hpp index 89a43dd26..20fdd3149 100644 --- a/include/sdsl/k2_treap_algorithm.hpp +++ b/include/sdsl/k2_treap_algorithm.hpp @@ -37,6 +37,15 @@ namespace sdsl { +// forward declaration +template +class k2_treap; + + + namespace k2_treap_ns { @@ -271,13 +280,17 @@ class range_iterator * \return Iterator to result in decreasing order. * \pre real(p1) <= real(p2) and imag(p1)<=imag(p2) */ -template -k2_treap_ns::top_k_iterator -top_k(const t_k2_treap& t, - k2_treap_ns::point_type p1, - k2_treap_ns::point_type p2) + +template +k2_treap_ns::top_k_iterator> + top_k(const k2_treap& t, + k2_treap_ns::point_type p1, + k2_treap_ns::point_type p2) { - return k2_treap_ns::top_k_iterator(t, p1, p2); + return k2_treap_ns::top_k_iterator>(t, p1, p2); } @@ -290,16 +303,21 @@ top_k(const t_k2_treap& t, * \pre real(p1) <= real(p2) and imag(p1)<=imag(p2) * real(range) <= imag(range) */ -template -k2_treap_ns::range_iterator -range_3d(const t_k2_treap& t, - k2_treap_ns::point_type p1, - k2_treap_ns::point_type p2, - k2_treap_ns::range_type range) +template +k2_treap_ns::range_iterator> + range_3d(const k2_treap& t, + k2_treap_ns::point_type p1, + k2_treap_ns::point_type p2, + k2_treap_ns::range_type range) { - return k2_treap_ns::range_iterator(t, p1, p2, range); + return k2_treap_ns::range_iterator>(t, p1, p2, range); } +namespace k2_treap_ns +{ // forward declaration template @@ -307,8 +325,10 @@ uint64_t __count(const t_k2_treap&, typename t_k2_treap::node_type); // forward declaration template -uint64_t _count(const t_k2_treap&, k2_treap_ns::point_type, - k2_treap_ns::point_type, typename t_k2_treap::node_type); +uint64_t _count(const t_k2_treap&, point_type, + point_type, typename t_k2_treap::node_type); + +} //! Count how many points are in the rectangle (p1,p2) /*! \param treap k2-treap @@ -317,34 +337,38 @@ uint64_t _count(const t_k2_treap&, k2_treap_ns::point_type, * \return The number of points in rectangle (p1,p2). * \pre real(p1) <= real(p2) and imag(p1)<=imag(p2) */ -template +template uint64_t -count(const t_k2_treap& treap, +count(const k2_treap& t, k2_treap_ns::point_type p1, k2_treap_ns::point_type p2) { - if (treap.size() > 0) { - return _count(treap, p1, p2, treap.root()); + if (t.size() > 0) { + return k2_treap_ns::_count(t, p1, p2, t.root()); } return 0; } +namespace k2_treap_ns +{ template uint64_t -_count(const t_k2_treap& treap, - k2_treap_ns::point_type p1, - k2_treap_ns::point_type p2, +_count(const t_k2_treap& t, + point_type p1, + point_type p2, typename t_k2_treap::node_type v) { - using namespace k2_treap_ns; if (contained(p1, p2, v)) { - return __count(treap, v); + return __count(t, v); } else if (overlap(p1, p2, v)) { uint64_t res = contained(v.max_p, p1, p2); - auto nodes = treap.children(v); + auto nodes = t.children(v); for (auto node : nodes) { - res += _count(treap, p1, p2, node); + res += _count(t, p1, p2, node); } return res; } @@ -354,24 +378,17 @@ _count(const t_k2_treap& treap, template uint64_t -__count(const t_k2_treap& treap, +__count(const t_k2_treap& t, typename t_k2_treap::node_type v) { uint64_t res = 1; // count the point at the node - auto nodes = treap.children(v); + auto nodes = t.children(v); for (auto node : nodes) - res += __count(treap, node); + res += __count(t, node); return res; } - -// forward declaration -template -class k2_treap; - +} //! Specialized version of method ,,construct'' for k2_treaps. template point_type; typedef std::vector point_vec_type; @@ -98,7 +99,8 @@ class wm_int mutable int_vector<64> m_path_off; // array keeps track of path offset in select-like methods mutable int_vector<64> m_path_rank_off;// array keeps track of rank values for the offsets - void copy(const wm_int& wt) { + void copy(const wm_int& wt) + { m_size = wt.m_size; m_sigma = wt.m_sigma; m_tree = wt.m_tree; @@ -117,7 +119,8 @@ class wm_int private: - void init_buffers(uint32_t max_level) { + void init_buffers(uint32_t max_level) + { m_path_off = int_vector<64>(max_level+1); m_path_rank_off = int_vector<64>(max_level+1); } @@ -129,7 +132,8 @@ class wm_int const uint32_t& max_level = m_max_level; //!< Maximal level of the wavelet tree. //! Default constructor - wm_int() { + wm_int() + { init_buffers(m_max_level); }; @@ -145,7 +149,8 @@ class wm_int */ template wm_int(int_vector_buffer& buf, size_type size, - uint32_t max_level=0) : m_size(size) { + uint32_t max_level=0) : m_size(size) + { init_buffers(m_max_level); if (0 == m_size) return; @@ -231,17 +236,20 @@ class wm_int } //! Copy constructor - wm_int(const wm_int& wt) { + wm_int(const wm_int& wt) + { copy(wt); } //! Copy constructor - wm_int(wm_int&& wt) { + wm_int(wm_int&& wt) + { *this = std::move(wt); } //! Assignment operator - wm_int& operator=(const wm_int& wt) { + wm_int& operator=(const wm_int& wt) + { if (this != &wt) { copy(wt); } @@ -249,7 +257,8 @@ class wm_int } //! Assignment move operator - wm_int& operator=(wm_int&& wt) { + wm_int& operator=(wm_int&& wt) + { if (this != &wt) { m_size = wt.m_size; m_sigma = wt.m_sigma; @@ -270,7 +279,8 @@ class wm_int } //! Swap operator - void swap(wm_int& wt) { + void swap(wm_int& wt) + { if (this != &wt) { std::swap(m_size, wt.m_size); std::swap(m_sigma, wt.m_sigma); @@ -287,12 +297,14 @@ class wm_int } //! Returns the size of the original vector. - size_type size()const { + size_type size()const + { return m_size; } //! Returns whether the wavelet tree contains no data. - bool empty()const { + bool empty()const + { return m_size == 0; } @@ -302,7 +314,8 @@ class wm_int * \par Precondition * \f$ i < size() \f$ */ - value_type operator[](size_type i)const { + value_type operator[](size_type i)const + { assert(i < size()); value_type res = 0; for (uint32_t k=0; k < m_max_level; ++k) { @@ -329,7 +342,8 @@ class wm_int * \par Precondition * \f$ i \leq size() \f$ */ - size_type rank(size_type i, value_type c)const { + size_type rank(size_type i, value_type c)const + { assert(i <= size()); if (((1ULL)<<(m_max_level))<=c) { // c is greater than any symbol in wt return 0; @@ -360,7 +374,8 @@ class wm_int * \f$ i < size() \f$ */ std::pair - inverse_select(size_type i)const { + inverse_select(size_type i)const + { assert(i < size()); value_type c = 0; size_type b = 0; // start position of the interval @@ -392,7 +407,8 @@ class wm_int * \par Precondition * \f$ 1 \leq i \leq rank(size(), c) \f$ */ - size_type select(size_type i, value_type c)const { + size_type select(size_type i, value_type c)const + { assert(1 <= i and i <= rank(size(), c)); uint64_t mask = 1ULL << (m_max_level-1); m_path_off[0] = m_path_rank_off[0] = 0; @@ -438,7 +454,8 @@ class wm_int */ std::pair>> range_search_2d(size_type lb, size_type rb, value_type vlb, value_type vrb, - bool report=true) const { + bool report=true) const + { if (vrb > (1ULL << m_max_level)) vrb = (1ULL << m_max_level); @@ -460,7 +477,8 @@ class wm_int value_type vrb, size_type ilb, size_type is[], size_type rank_off[], point_vec_type& point_vec, bool report, size_type& cnt_answers) - const { + const + { using std::get; if (get<0>(r) > get<1>(r)) return; @@ -506,18 +524,21 @@ class wm_int } //! Returns a const_iterator to the first element. - const_iterator begin()const { + const_iterator begin()const + { return const_iterator(this, 0); } //! Returns a const_iterator to the element after the last element. - const_iterator end()const { + const_iterator end()const + { return const_iterator(this, size()); } //! Serializes the data structure into the given ostream - size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const { + size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const + { structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); size_type written_bytes = 0; written_bytes += write_member(m_size, out, child, "size"); @@ -534,7 +555,8 @@ class wm_int } //! Loads the data structure from the given istream. - void load(std::istream& in) { + void load(std::istream& in) + { read_member(m_size, in); read_member(m_sigma, in); m_tree.load(in); @@ -572,36 +594,43 @@ class wm_int node_type& operator=(node_type&&) = default; // Comparator operator - bool operator==(const node_type& v) const { + bool operator==(const node_type& v) const + { return offset == v.offset; } // Smaller operator - bool operator<(const node_type& v) const { + bool operator<(const node_type& v) const + { return offset < v.offset; } // Greater operator - bool operator>(const node_type& v) const { + bool operator>(const node_type& v) const + { return offset > v.offset; } }; //! Checks if the node is a leaf node - bool is_leaf(const node_type& v) const { + bool is_leaf(const node_type& v) const + { return v.level == m_max_level; } - value_type sym(const node_type& v) const { + value_type sym(const node_type& v) const + { return v.sym; } - bool empty(const node_type& v) const { + bool empty(const node_type& v) const + { return v.size == (size_type)0; } //! Return the root node - node_type root() const { + node_type root() const + { return node_type(0, m_size, 0, 0); } @@ -610,8 +639,9 @@ class wm_int * \return Return a pair of nodes (left child, right child). * \pre !is_leaf(v) */ - std::pair - expand(const node_type& v) const { + std::array + expand(const node_type& v) const + { node_type v_right = v; return expand(std::move(v_right)); } @@ -621,8 +651,9 @@ class wm_int * \return Return a pair of nodes (left child, right child). * \pre !is_leaf(v) */ - std::pair - expand(node_type&& v) const { + std::array + expand(node_type&& v) const + { node_type v_left; size_type rank_b = m_tree_rank(v.offset); size_type ones = m_tree_rank(v.offset+v.size)-rank_b; // ones in [b..size) @@ -638,7 +669,7 @@ class wm_int v.level = v.level + 1; v.sym = (v.sym<<1)|1; - return std::make_pair(std::move(v_left), v); + return {std::move(v_left), v}; } //! Returns for each range its left and right child ranges @@ -651,9 +682,10 @@ class wm_int * range mapped to the right child of v. * \pre !is_leaf(v) and s>=v_s and e<=v_e */ - std::pair + std::array expand(const node_type& v, - const range_vec_type& ranges) const { + const range_vec_type& ranges) const + { auto ranges_copy = ranges; return expand(v, std::move(ranges_copy)); } @@ -668,9 +700,10 @@ class wm_int * range mapped to the right child of v. * \pre !is_leaf(v) and s>=v_s and e<=v_e */ - std::pair + std::array expand(const node_type& v, - range_vec_type&& ranges) const { + range_vec_type&& ranges) const + { auto v_sp_rank = m_tree_rank(v.offset); // this is already calculated in expand(v) range_vec_type res(ranges.size()); size_t i = 0; @@ -686,7 +719,7 @@ class wm_int r = range_type(left_sp, left_sp + left_size - 1); res[i++] = range_type(right_sp, right_sp + right_size - 1); } - return make_pair(ranges, std::move(res)); + return {ranges, std::move(res)}; } //! Returns for a range its left and right child ranges @@ -699,8 +732,9 @@ class wm_int * range mapped to the right child of v. * \pre !is_leaf(v) and s>=v_s and e<=v_e */ - std::pair - expand(const node_type& v, const range_type& r) const { + std::array + expand(const node_type& v, const range_type& r) const + { auto v_sp_rank = m_tree_rank(v.offset); // this is already calculated in expand(v) auto sp_rank = m_tree_rank(v.offset + r.first); auto right_size = m_tree_rank(v.offset + r.second + 1) @@ -710,14 +744,26 @@ class wm_int auto right_sp = sp_rank - v_sp_rank; auto left_sp = r.first - right_sp; - return make_pair(range_type(left_sp, left_sp + left_size - 1), - range_type(right_sp, right_sp + right_size - 1)); + return {range_type(left_sp, left_sp + left_size - 1), + range_type(right_sp, right_sp + right_size - 1) + }; } //! return the path to the leaf for a given symbol - std::pair path(value_type c) const { + std::pair path(value_type c) const + { return {m_max_level,c}; } + + //! Return the value range of a node v + std::array + value_range(const node_type& v) const + { + const uint64_t size = 1ULL << (m_max_level-v.level); + return {(v.sym<<(m_max_level-v.level)), (v.sym<<(m_max_level-v.level))+size-1}; + } + + }; }// end namespace sdsl diff --git a/include/sdsl/wt_algorithm.hpp b/include/sdsl/wt_algorithm.hpp index 61daeb715..b7c1a3320 100644 --- a/include/sdsl/wt_algorithm.hpp +++ b/include/sdsl/wt_algorithm.hpp @@ -2,19 +2,433 @@ #define INCLUDED_SDSL_WT_ALGORITHM #include +#include #include namespace sdsl { +template +struct has_expand; + +//! Returns if range1 overlaps with range2 +/*! + * \param range1 First range [r1, l1] + * \param range2 Second range [r2, l2]. + * \return If the two ranges overlap. + */ +template +bool +overlaps(const r_t1& range1, const r_t2& range2) +{ + return std::get<1>(range1) >= std::get<0>(range2) + and std::get<0>(range1) <= std::get<1>(range2); +} + +//! Returns if range1 is contained in range2 +/*! + * \param range1 First range [r1, l1] + * \param range2 Second range [r2, l2]. + * \return If range1 is contained in range2. + */ +template +bool +is_contained(const r_t1& range1, const r_t2& range2) +{ + return std::get<0>(range1) >= std::get<0>(range2) + and std::get<1>(range1) <= std::get<1>(range2); +} + + + +//! Count how many points are in [x_0,x_1] x [y_0, y_1] +/*! + * \param wt Wavelet tree representing a sequence. + * \param x_range x-range [x_0,x_1] in wt. + * \param y_range y-range [y_0,y_1] in wt. + * + * \return + */ +template +typename t_wt::size_type +count(const t_wt& wt, const range_type& x_range, const range_type& y_range, + SDSL_UNUSED typename std::enable_if::value, wt_tag>::type x = wt_tag() + ) +{ + typedef typename t_wt::size_type size_type; + typedef typename t_wt::node_type node_type; + + if (x_range.first >= wt.size()) + return 0; + size_type res = 0; + std::stack> s; + + auto push_node = [&wt, &s, &y_range, &res](const node_type& v, const range_type& x_range) { + if (size(x_range) > 0) { + auto value_range = wt.value_range(v); + if (is_contained(value_range, y_range)) { + res += size(x_range); + } else if (overlaps(y_range, value_range)) { + s.emplace(v, x_range); + } + } + }; + + push_node(wt.root(), x_range); + while (!s.empty()) { + auto v = std::get<0>(s.top()); + auto x_range = std::get<1>(s.top()); + s.pop(); + if (!wt.is_leaf(v)) { + auto child_v = wt.expand(v); + auto child_r = wt.expand(v, x_range); + for (size_t i=0, j=child_v.size()-1; i < child_v.size(); ++i,--j) { + push_node(child_v[j], child_r[j]); + } + } + } + return res; +} + + template -struct has_interval_symbols; +class map_to_sorted_iterator +{ + static_assert(t_wt::traversable, "map_to_sorted_sequence requires t_wt to be traversable."); + static_assert(t_wt::lex_ordered, "map_to_sorted_sequence requires t_wt to be lex_ordered."); + public: + typedef void(*t_mfptr)(); + typedef typename t_wt::value_type value_type; + typedef typename t_wt::size_type size_type; + typedef std::tuple ret_type; + private: + typedef typename t_wt::node_type node_type; + typedef std::tuple state_type; + + const t_wt* m_wt = nullptr; + range_type m_y_range; + std::stack m_stack; + ret_type m_ret; + bool m_valid = false; + + void cond_push(const node_type& v, const range_type& x_range, size_type lex_sml) + { + if (size(x_range) > 0) { + auto value_range = m_wt->value_range(v); + if (overlaps(m_y_range, value_range)) { + m_stack.emplace(v, x_range, lex_sml); + } + } + } -template -struct _interval_symbols_wt; + public: + map_to_sorted_iterator() = default; + map_to_sorted_iterator(const map_to_sorted_iterator&) = default; + map_to_sorted_iterator(map_to_sorted_iterator&&) = default; + map_to_sorted_iterator& operator=(const map_to_sorted_iterator&) = default; + map_to_sorted_iterator& operator=(map_to_sorted_iterator&&) = default; + + map_to_sorted_iterator(const t_wt* wt, const range_type& x_range, + const range_type& y_range) : m_wt(wt), + m_y_range(y_range) + { + if (wt!=nullptr and size(x_range) > 0 and std::get<0>(x_range) < wt->size()) { + if (overlaps(y_range, m_wt->value_range(wt->root()))) { + m_stack.emplace(m_wt->root(), x_range, 0); + ++(*this); + } + } + } + + map_to_sorted_iterator& operator++() + { + m_valid = false; + while (!m_stack.empty()) { + auto v = std::get<0>(m_stack.top()); + auto x_range = std::get<1>(m_stack.top()); + auto lex_sml = std::get<2>(m_stack.top()); + m_stack.pop(); + if (!m_wt->is_leaf(v)) { + auto child_v = m_wt->expand(v); + auto child_r = m_wt->expand(v, x_range); + for (int i=1; i >= 0; --i) { + if (size(child_r[i]) > 0) { + if (i==1) + cond_push(child_v[i], child_r[i], lex_sml+m_wt->size(child_v[0])); + else + cond_push(child_v[i], child_r[i], lex_sml); + } + } + } else { + m_ret = ret_type(m_wt->sym(v), range_type(std::get<0>(x_range)+lex_sml,std::get<1>(x_range)+lex_sml), + lex_sml); + m_valid = true; + break; + } + } + return *this; + } + + //! Postfix increment of the iterator + map_to_sorted_iterator operator++(int) + { + map_to_sorted_iterator it = *this; + ++(*this); + return it; + } + + ret_type operator*() const + { + return m_ret; + } + + operator t_mfptr() const + { + return (t_mfptr)(m_valid); + } +}; + +template +map_to_sorted_iterator +map_to_sorted_sequence(const t_wt& wt, const range_type& x_range, const range_type& y_range, + SDSL_UNUSED typename std::enable_if::value, wt_tag>::type x = wt_tag() + ) +{ + static_assert(t_wt::traversable, "map_to_sorted_sequence requires t_wt to be traversable."); + static_assert(t_wt::lex_ordered, "map_to_sorted_sequence requires t_wt to be lex_ordered."); + return map_to_sorted_iterator(&wt, x_range, y_range); +} + + + +template +class y_iterator +{ + static_assert(t_wt::traversable, "y_iterator requires t_wt to be traversable."); + public: + typedef void(*t_mfptr)(); + typedef typename t_wt::value_type value_type; + typedef typename t_wt::size_type size_type; + typedef std::tuple t_ret; + + private: + typedef typename t_wt::node_type node_type; + typedef std::pair t_state; + + const t_wt* m_wt = nullptr; + std::stack m_stack; + t_ret m_ret; + bool m_valid = false; + + public: + y_iterator() = default; + y_iterator(const y_iterator&) = default; + y_iterator(y_iterator&&) = default; + y_iterator& operator=(const y_iterator&) = default; + y_iterator& operator=(y_iterator&&) = default; + // wt wavelet tree + // lb inclusive + // rb exclusive + y_iterator(const t_wt& wt, size_type lb, size_type rb) : + m_wt(&wt), m_valid(wt.size()>0) + { + if (m_wt->size() > 0) { + if ((lb+1) == rb) { + auto res = m_wt->inverse_select(lb); + m_ret = t_ret(res.second, res.first, res.first+1); + } else if (rb > lb) { + m_stack.emplace(wt.root(), range_type(lb, rb-1)); + ++(*this); + } + } + } + + //! Prefix increment of the iterator + y_iterator& operator++() + { + m_valid = false; + while (!m_stack.empty()) { + auto v = std::get<0>(m_stack.top()); + auto r = std::get<1>(m_stack.top()); + m_stack.pop(); + if (m_wt->is_leaf(v)) { + m_ret = t_ret(m_wt->sym(v), r.first, r.second+1); + m_valid = true; + break; + } else { + auto child_v = m_wt->expand(v); + auto child_r = m_wt->expand(v, r); + if (!sdsl::empty(std::get<1>(child_r))) { + m_stack.emplace(std::get<1>(child_v), std::get<1>(child_r)); + } + if (!sdsl::empty(std::get<0>(child_r))) { + m_stack.emplace(std::get<0>(child_v), std::get<0>(child_r)); + } + } + } + return *this; + } + + //! Postfix increment of the iterator + y_iterator operator++(int) + { + y_iterator it = *this; + ++(*this); + return it; + } + + t_ret operator*() const + { + return m_ret; + } + + operator t_mfptr() const + { + return (t_mfptr)(m_valid); + } +}; + +//! Returns an iterator over all values in wt[i..j-1] +/*! + * \param wt The wavelet tree. + * \param i The start index (inclusive) of the interval. + * \param j The end index (exclusive) of the interval. + * \return Iterator to the result. The iterator points to + * triples (y-value, sp, ep), where [sp,ep) is the + * range in the leaf. I.e. ep-sp is the number of + * ys in the x-range [i..j). + * + * \par Time complexity + * Iterating over all k values in wt[i..j] takes \f$\Order{k\log\sigma} time. + * + * \par Precondition + * \f$ i \leq j \leq size() \f$ + */ +template +y_iterator +ys_in_x_range(const t_wt& wt, typename t_wt::size_type i, + typename t_wt::size_type j, + SDSL_UNUSED typename std::enable_if::value, wt_tag>::type x = wt_tag() + ) +{ + static_assert(t_wt::traversable, "ys_in_x_range requires t_wt to be traversable."); + return y_iterator(wt, i, j); +} -template -struct has_expand; + +template +class yoff_iterator +{ + static_assert(t_wt::traversable, "yoff_iterator requires t_wt to be traversable."); + static_assert(t_wt::lex_ordered, "yoff_iterator requires t_wt to be lex_ordered."); + public: + typedef void(*t_mfptr)(); + typedef typename t_wt::value_type value_type; + typedef typename t_wt::size_type size_type; + typedef std::tuple t_ret; + + private: + typedef typename t_wt::node_type node_type; + typedef std::tuple t_state; + + const t_wt* m_wt = nullptr; + std::stack m_stack; + t_ret m_ret; + bool m_valid = false; + + public: + yoff_iterator() = default; + yoff_iterator(const yoff_iterator&) = default; + yoff_iterator(yoff_iterator&&) = default; + yoff_iterator& operator=(const yoff_iterator&) = default; + yoff_iterator& operator=(yoff_iterator&&) = default; + // wt wavelet tree + // lb inclusive + // rb exclusive + yoff_iterator(const t_wt& wt, size_type lb, size_type rb) : + m_wt(&wt), m_valid(wt.size()>0) + { + if (m_wt->size() > 0) { + if (rb > lb) { + m_stack.emplace(wt.root(), range_type(lb, rb-1), 0); + ++(*this); + } + } + } + + //! Prefix increment of the iterator + yoff_iterator& operator++() + { + m_valid = false; + while (!m_stack.empty()) { + auto v = std::get<0>(m_stack.top()); + auto r = std::get<1>(m_stack.top()); + auto lex_smaller = std::get<2>(m_stack.top()); + m_stack.pop(); + if (m_wt->is_leaf(v)) { + m_ret = t_ret(m_wt->sym(v), r.first, r.second+1, lex_smaller); + m_valid = true; + break; + } else { + auto child_v = m_wt->expand(v); + auto child_r = m_wt->expand(v, r); + if (!sdsl::empty(std::get<1>(child_r))) { + m_stack.emplace(std::get<1>(child_v), std::get<1>(child_r), + lex_smaller + m_wt->size(std::get<0>(child_v))); + } + if (!sdsl::empty(std::get<0>(child_r))) { + m_stack.emplace(std::get<0>(child_v), std::get<0>(child_r), lex_smaller); + } + } + } + return *this; + } + + //! Postfix increment of the iterator + yoff_iterator operator++(int) + { + yoff_iterator it = *this; + ++(*this); + return it; + } + + t_ret operator*() const + { + return m_ret; + } + + operator t_mfptr() const + { + return (t_mfptr)(m_valid); + } +}; + +//! Returns an iterator over all values in wt[i..j-1] +/*! + * \param wt The wavelet tree. + * \param i The start index (inclusive) of the interval. + * \param j The end index (exclusive) of the interval. + * \return Iterator to the result. The iterator points to + * triples (y-value, sp, ep), where [sp,ep) is the + * range in the leaf. I.e. ep-sp is the number of + * ys in the x-range [i..j). + * + * \par Time complexity + * Iterating over all k values in wt[i..j] takes \f$\Order{k\log\sigma} time. + * + * \par Precondition + * \f$ i \leq j \leq size() \f$ + */ +template +yoff_iterator +ys_and_off_in_x_range(const t_wt& wt, typename t_wt::size_type i, + typename t_wt::size_type j, + SDSL_UNUSED typename std::enable_if::value, wt_tag>::type x = wt_tag() + ) +{ + static_assert(t_wt::traversable, "ys_in_x_range requires t_wt to be traversable."); + return yoff_iterator(wt, i, j); +} //! Intersection of elements in WT[s_0,e_0], WT[s_1,e_1],...,WT[s_k,e_k] /*! \param wt The wavelet tree object. @@ -25,10 +439,13 @@ struct has_expand; * contained in t different ranges. Frequency = accumulated * frequencies in all ranges. The tuples are ordered according * to value, if t_wt::lex_ordered=1. + * TODO: should return an iterator to the result */ template std::vector< std::pair > -intersect(const t_wt& wt, const std::vector& ranges, typename t_wt::size_type t=0) +intersect(const t_wt& wt, const std::vector& ranges, typename t_wt::size_type t=0, + SDSL_UNUSED typename std::enable_if::value, wt_tag>::type x = wt_tag() + ) { using std::get; using size_type = typename t_wt::size_type; @@ -37,7 +454,7 @@ intersect(const t_wt& wt, const std::vector& ranges, typename t_wt:: using pnvr_type = std::pair; typedef std::stack stack_type; - static_assert(has_expand(const node_type&)>::value, + static_assert(has_expand(const node_type&)>::value, "intersect requires t_wt to have expand(const node_type&)"); using p_t = std::pair; @@ -94,13 +511,15 @@ intersect(const t_wt& wt, const std::vector& ranges, typename t_wt:: template std::pair quantile_freq(const t_wt& wt, typename t_wt::size_type lb, - typename t_wt::size_type rb, typename t_wt::size_type q) + typename t_wt::size_type rb, typename t_wt::size_type q, + SDSL_UNUSED typename std::enable_if::value, wt_tag>::type x = wt_tag() + ) { static_assert(t_wt::lex_ordered, "quantile_freq requires a lex_ordered WT"); using std::get; using node_type = typename t_wt::node_type; - static_assert(has_expand(const node_type&)>::value, + static_assert(has_expand(const node_type&)>::value, "quantile_freq requires t_wt to have expand(const node_type&)"); node_type v = wt.root(); @@ -123,151 +542,6 @@ quantile_freq(const t_wt& wt, typename t_wt::size_type lb, return {wt.sym(v), size(r)}; }; - -template -void -_interval_symbols_rec(const t_wt& wt, range_type r, - typename t_wt::size_type& k, - std::vector& cs, - std::vector& rank_c_i, - std::vector& rank_c_j, - const typename t_wt::node_type& v) -{ - using std::get; - if (wt.is_leaf(v)) { - rank_c_i[k] = r.first; - rank_c_j[k] = r.second+1; - cs[k++] = wt.sym(v); - } else { - auto child = wt.expand(v); - auto child_ranges = wt.expand(v, r); - if (!empty(get<0>(child_ranges))) { - _interval_symbols_rec(wt, get<0>(child_ranges), k, cs, rank_c_i, - rank_c_j, get<0>(child)); - } - if (!empty(get<1>(child_ranges))) { - _interval_symbols_rec(wt, get<1>(child_ranges), k, cs, rank_c_i, - rank_c_j, get<1>(child)); - } - } -} - -template -void -_interval_symbols(const t_wt& wt, typename t_wt::size_type i, - typename t_wt::size_type j, - typename t_wt::size_type& k, - std::vector& cs, - std::vector& rank_c_i, - std::vector& rank_c_j) -{ - - assert(i <= j and j <= wt.size()); - k=0; - if ((i+1)==j) { - auto res = wt.inverse_select(i); - cs[0]=res.second; - rank_c_i[0]=res.first; - rank_c_j[0]=res.first+1; - k=1; - return; - } else if (j>i) { - _interval_symbols_rec(wt, range_type(i,j-1), k, cs, - rank_c_i, rank_c_j, wt.root()); - } -} - -//! For each symbol c in wt[i..j-1] get rank(i,c) and rank(j,c). -/*! - * \param i The start index (inclusive) of the interval. - * \param j The end index (exclusive) of the interval. - * \param k Reference for number of different symbols in [i..j-1]. - * \param cs Reference to a vector that will contain in - * cs[0..k-1] all symbols that occur in [i..j-1] in - * ascending order. - * \param rank_c_i Reference to a vector which equals - * rank_c_i[p] = rank(i,cs[p]), for \f$ 0 \leq p < k \f$. - * \param rank_c_j Reference to a vector which equals - * rank_c_j[p] = rank(j,cs[p]), for \f$ 0 \leq p < k \f$. - * \par Time complexity - * \f$ \Order{\min{\sigma, k \log \sigma}} \f$ - * - * \par Precondition - * \f$ i \leq j \leq size() \f$ - * \f$ cs.size() \geq \sigma \f$ - * \f$ rank_{c_i}.size() \geq \sigma \f$ - * \f$ rank_{c_j}.size() \geq \sigma \f$ - */ -template -void -interval_symbols(const t_wt& wt, typename t_wt::size_type i, - typename t_wt::size_type j, - typename t_wt::size_type& k, - std::vector& cs, - std::vector& rank_c_i, - std::vector& rank_c_j) -{ - // check if wt has a built-in interval_symbols method - constexpr bool has_own = has_interval_symbols::value; - if (has_own) { // if yes, call it - _interval_symbols_wt::call(wt, i, j, k, - cs, rank_c_i, rank_c_j); - } else { // otherwise use generic implementation based on expand - _interval_symbols(wt, i,j, k, cs, rank_c_i, rank_c_j); - } -} - - - -// has_interval_symbols::value is true if class X has -// implement method interval_symbols -// Adapted solution from jrok's proposal: -// http://stackoverflow.com/questions/87372/check-if-a-class-has-a-member-function-of-a-given-signature -template -struct has_interval_symbols { - template - static constexpr auto check(T*) - -> typename - std::is_same< - decltype(std::declval().interval_symbols( - std::declval(), - std::declval(), - std::declval(), - std::declval&>(), - std::declval&>(), - std::declval&>() - )), - void>::type {return std::true_type();} - template - static constexpr std::false_type check(...) {return std::false_type();} - typedef decltype(check(nullptr)) type; - static constexpr bool value = type::value; -}; - -template -struct _interval_symbols_wt { - typedef typename t_wt::size_type size_type; - typedef typename t_wt::value_type value_type; - - static void call(const t_wt& wt, size_type i, size_type j, size_type& k, - std::vector& cs, std::vector& rank_c_i, - std::vector& rank_c_j) { - wt.interval_symbols(i,j,k,cs,rank_c_i,rank_c_j); - } -}; - - -template -struct _interval_symbols_wt { - typedef typename t_wt::size_type size_type; - typedef typename t_wt::value_type value_type; - - static void call(const t_wt&, size_type, size_type, size_type&, - std::vector&, std::vector&, - std::vector&) { - } -}; - template struct has_expand { static_assert(std::integral_constant::value, @@ -286,7 +560,7 @@ struct has_expand { static constexpr std::false_type check(...) { return std::false_type();} typedef decltype(check(nullptr)) type; static constexpr bool value = type::value; -}; + }; template struct has_range_search_2d { @@ -416,12 +690,14 @@ struct _symbols_calls_wt { typedef typename t_wt::value_type value_type; static std::pair - call_symbol_gte(const t_wt& wt,value_type c) { + call_symbol_gte(const t_wt& wt,value_type c) + { return wt.symbol_gte(c); } static std::pair - call_symbol_lte(const t_wt& wt,value_type c) { + call_symbol_lte(const t_wt& wt,value_type c) + { return wt.symbol_lte(c); } }; @@ -432,12 +708,14 @@ struct _symbols_calls_wt { typedef typename t_wt::value_type value_type; static std::pair - call_symbol_gte(const t_wt& wt,value_type c) { + call_symbol_gte(const t_wt& wt,value_type c) + { return _symbol_gte(wt,c); } static std::pair - call_symbol_lte(const t_wt& wt,value_type c) { + call_symbol_lte(const t_wt& wt,value_type c) + { return _symbol_lte(wt,c); } }; @@ -460,32 +738,36 @@ struct has_symbols_wt { //! Returns for a symbol c the previous smaller or equal symbol in the WT. /*! \param c the symbol - * \return A pair. The first element of the pair consititues if + * \return A pair. The first element of the pair indicates if * a valid answer was found (true) or no valid answer (false) * could be found. The second element contains the found symbol. */ template std::pair -symbol_lte(const t_wt& wt, typename t_wt::value_type c) +symbol_lte(const t_wt& wt, typename t_wt::value_type c, + SDSL_UNUSED typename std::enable_if::value, wt_tag>::type x = wt_tag() + ) { static_assert(t_wt::lex_ordered, "symbols_lte requires a lex_ordered WT"); - // check if wt has a built-in interval_symbols method + // check if wt has a built-in symbols_gte method constexpr bool has_own = has_symbols_wt::value; return _symbols_calls_wt::call_symbol_lte(wt,c); } //! Returns for a symbol c the next larger or equal symbol in the WT. /*! \param c the symbol - * \return A pair. The first element of the pair consititues if + * \return A pair. The first element of the pair indicates if * a valid answer was found (true) or no valid answer (false) * could be found. The second element contains the found symbol. */ template std::pair -symbol_gte(const t_wt& wt, typename t_wt::value_type c) +symbol_gte(const t_wt& wt, typename t_wt::value_type c, + SDSL_UNUSED typename std::enable_if::value, wt_tag>::type x = wt_tag() + ) { static_assert(t_wt::lex_ordered, "symbols_gte requires a lex_ordered WT"); - // check if wt has a built-in interval_symbols method + // check if wt has a built-in symbol_gte_method constexpr bool has_own = has_symbols_wt::value; return _symbols_calls_wt::call_symbol_gte(wt,c); } @@ -496,7 +778,8 @@ symbol_gte(const t_wt& wt, typename t_wt::value_type c) * \param x_j upper bound of the x range * \param y_i lower bound of the y range * \param y_j upper bound of the y range - * \return a vector of increasing y values occuring in the range [x_i,x_j] + * \return a vector of increasing y values occurring in the range [x_i,x_j] + * TODO: should return an iterator to the result */ template std::vector @@ -511,15 +794,15 @@ restricted_unique_range_values(const t_wt& wt, std::vector unique_values; // make sure things are within bounds - if( x_j > wt.size()-1 ) x_j = wt.size()-1; - if( (x_i > x_j) || (y_i > y_j) ) { + if (x_j > wt.size()-1) x_j = wt.size()-1; + if ((x_i > x_j) || (y_i > y_j)) { return unique_values; } auto lower_y_bound = symbol_gte(wt,y_i); auto upper_y_bound = symbol_lte(wt,y_j); // is the y range valid? - if( !lower_y_bound.first || !upper_y_bound.first - || (lower_y_bound.second > upper_y_bound.second) ) { + if (!lower_y_bound.first || !upper_y_bound.first + || (lower_y_bound.second > upper_y_bound.second)) { return unique_values; } diff --git a/include/sdsl/wt_gmr.hpp b/include/sdsl/wt_gmr.hpp index c2bdbb989..495a0ca38 100644 --- a/include/sdsl/wt_gmr.hpp +++ b/include/sdsl/wt_gmr.hpp @@ -74,7 +74,8 @@ class inv_multi_perm_support inv_multi_perm_support() {}; //! Constructor - inv_multi_perm_support(const iv_type* perm, int_vector<>& iv, uint64_t chunksize) : m_perm(perm), m_chunksize(chunksize) { + inv_multi_perm_support(const iv_type* perm, int_vector<>& iv, uint64_t chunksize) : m_perm(perm), m_chunksize(chunksize) + { bit_vector marked(iv.size(), 0); bit_vector done(m_chunksize, 0); @@ -142,17 +143,20 @@ class inv_multi_perm_support //! Copy constructor inv_multi_perm_support(const inv_multi_perm_support& p) : m_perm(p.m_perm), m_chunksize(p.m_chunksize), m_back_pointer(p.m_back_pointer), m_marked(p.m_marked), - m_marked_rank(p.m_marked_rank) { + m_marked_rank(p.m_marked_rank) + { m_marked_rank.set_vector(&m_marked); } //! Move constructor - inv_multi_perm_support(inv_multi_perm_support&& p) { + inv_multi_perm_support(inv_multi_perm_support&& p) + { *this = std::move(p); } //! Assignment operation - inv_multi_perm_support& operator=(const inv_multi_perm_support& p) { + inv_multi_perm_support& operator=(const inv_multi_perm_support& p) + { if (this != &p) { m_perm = p.m_perm; m_chunksize = p.m_chunksize; @@ -165,7 +169,8 @@ class inv_multi_perm_support } //! Assignment move operation - inv_multi_perm_support& operator=(inv_multi_perm_support&& p) { + inv_multi_perm_support& operator=(inv_multi_perm_support&& p) + { if (this != &p) { m_perm = std::move(p.m_perm); m_chunksize = std::move(p.m_chunksize); @@ -178,7 +183,8 @@ class inv_multi_perm_support } //! Swap operation - void swap(inv_multi_perm_support& p) { + void swap(inv_multi_perm_support& p) + { if (this != &p) { std::swap(m_chunksize, p.m_chunksize); m_back_pointer.swap(p.m_back_pointer); @@ -188,12 +194,14 @@ class inv_multi_perm_support } //! Returns the size of the original vector. - size_type size() const { + size_type size() const + { return nullptr == m_perm ? 0 : m_perm->size(); } //! Returns whether the original vector contains no data. - bool empty()const { + bool empty()const + { return size() == 0; } @@ -202,7 +210,8 @@ class inv_multi_perm_support * \par Time complexity * \f$ \Order{t_s} \f$ */ - value_type operator[](size_type i) const { + value_type operator[](size_type i) const + { size_type off = (i/m_chunksize)*m_chunksize; size_type j = i, j_new=0; while ((j_new=((*m_perm)[j])+off) != i) { @@ -217,19 +226,22 @@ class inv_multi_perm_support } //! Returns a const_iterator to the first element. - const_iterator begin()const { + const_iterator begin()const + { return const_iterator(this, 0); } //! Returns a const_iterator to the element after the last element. - const_iterator end()const { + const_iterator end()const + { return const_iterator(this, size()); } void set_vector(const iv_type* v) { m_perm = v; } //! Serialize into stream - size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const { + size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const + { structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); size_type written_bytes = 0; written_bytes += write_member(m_chunksize, out, child, "chunksize"); @@ -241,7 +253,8 @@ class inv_multi_perm_support } //! Load sampling from disk - void load(std::istream& in, const iv_type* v=nullptr) { + void load(std::istream& in, const iv_type* v=nullptr) + { set_vector(v); read_member(m_chunksize, in); m_back_pointer.load(in); @@ -301,6 +314,7 @@ class wt_gmr_rs typedef wt_tag index_category; typedef int_alphabet_tag alphabet_category; enum {lex_ordered=0}; + enum {traversable=false}; private: @@ -325,7 +339,8 @@ class wt_gmr_rs * \param size Size of the prefix of v, which should be indexed. */ template - wt_gmr_rs(int_vector_buffer& input, size_type size) : m_size(size) { + wt_gmr_rs(int_vector_buffer& input, size_type size) : m_size(size) + { // Determine max. symbol for (uint64_t i=0; im_block_size-1) { return 0; } @@ -513,7 +535,8 @@ class wt_gmr_rs * \par Precondition * \f$ i \leq size() \f$ */ - std::pair inverse_select(size_type i)const { + std::pair inverse_select(size_type i)const + { assert(i - wt_gmr(int_vector_buffer& input, size_type size) : m_size(size) { + wt_gmr(int_vector_buffer& input, size_type size) : m_size(size) + { // Determine max. symbol for (uint64_t i=0; im_max_symbol-1) { @@ -855,7 +890,8 @@ class wt_gmr * \par Precondition * \f$ i < size() \f$ */ - std::pair inverse_select(size_type i)const { + std::pair inverse_select(size_type i)const + { assert(i < size()); uint64_t chunk = i/m_chunksize; uint64_t x = m_ips[i]; @@ -877,7 +913,8 @@ class wt_gmr * \par Precondition * \f$ 1 \leq i \leq rank(size(), c) \f$ */ - size_type select(size_type i, value_type c)const { + size_type select(size_type i, value_type c)const + { assert(1 <= i and i <= rank(size(), c)); uint64_t ones_before_c = m_bv_blocks_select0(c*m_chunks+1)-(c*m_chunks); @@ -889,7 +926,8 @@ class wt_gmr } //! Serializes the data structure into the given ostream - size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const { + size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const + { structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); size_type written_bytes = 0; written_bytes += write_member(m_size, out, child, "size"); @@ -910,7 +948,8 @@ class wt_gmr } //! Loads the data structure from the given istream. - void load(std::istream& in) { + void load(std::istream& in) + { read_member(m_size, in); read_member(m_max_symbol, in); read_member(m_chunks, in); diff --git a/include/sdsl/wt_int.hpp b/include/sdsl/wt_int.hpp index 1d7b4ca66..cbd0a4aa5 100644 --- a/include/sdsl/wt_int.hpp +++ b/include/sdsl/wt_int.hpp @@ -73,6 +73,7 @@ class wt_int typedef wt_tag index_category; typedef int_alphabet_tag alphabet_category; enum {lex_ordered=1}; + enum {traversable=true }; typedef std::pair point_type; typedef std::vector point_vec_type; @@ -91,7 +92,8 @@ class wt_int mutable int_vector<64> m_path_off; // array keeps track of path offset in select-like methods mutable int_vector<64> m_path_rank_off;// array keeps track of rank values for the offsets - void copy(const wt_int& wt) { + void copy(const wt_int& wt) + { m_size = wt.m_size; m_sigma = wt.m_sigma; m_tree = wt.m_tree; @@ -108,53 +110,12 @@ class wt_int private: - void init_buffers(uint32_t max_level) { + void init_buffers(uint32_t max_level) + { m_path_off = int_vector<64>(max_level+1); m_path_rank_off = int_vector<64>(max_level+1); } - // recursive internal version of the method interval_symbols - void _interval_symbols(size_type i, size_type j, size_type& k, - std::vector& cs, - std::vector& rank_c_i, - std::vector& rank_c_j, - size_type level, - size_type path, - size_type node_size, - size_type offset) const { - // invariant: j>i - - if (level >= m_max_level) { - rank_c_i[k]= i; - rank_c_j[k]= j; - cs[k++]= path; - return; - } - - size_type ones_before_o = m_tree_rank(offset); - size_type ones_before_i = m_tree_rank(offset+i) - ones_before_o; - size_type ones_before_j = m_tree_rank(offset+j) - ones_before_o; - size_type ones_before_end = m_tree_rank(offset+ node_size) - ones_before_o; - - // goto left child - if ((j-i)-(ones_before_j-ones_before_i)>0) { - size_type new_offset = offset + m_size; - size_type new_node_size = node_size - ones_before_end; - size_type new_i = i - ones_before_i; - size_type new_j = j - ones_before_j; - _interval_symbols(new_i, new_j, k, cs, rank_c_i, rank_c_j, level+1, path<<1, new_node_size, new_offset); - } - - // goto right child - if ((ones_before_j-ones_before_i)>0) { - size_type new_offset = offset+(node_size - ones_before_end) + m_size; - size_type new_node_size = ones_before_end; - size_type new_i = ones_before_i; - size_type new_j = ones_before_j; - _interval_symbols(new_i, new_j, k, cs, rank_c_i, rank_c_j, level+1, (path<<1)|1, new_node_size, new_offset); - } - } - public: const size_type& sigma = m_sigma; //!< Effective alphabet size of the wavelet tree. @@ -162,7 +123,8 @@ class wt_int const uint32_t& max_level = m_max_level; //!< Maximal level of the wavelet tree. //! Default constructor - wt_int() { + wt_int() + { init_buffers(m_max_level); }; @@ -178,7 +140,8 @@ class wt_int */ template wt_int(int_vector_buffer& buf, size_type size, - uint32_t max_level=0) : m_size(size) { + uint32_t max_level=0) : m_size(size) + { init_buffers(m_max_level); if (0 == m_size) return; @@ -268,17 +231,20 @@ class wt_int } //! Copy constructor - wt_int(const wt_int& wt) { + wt_int(const wt_int& wt) + { copy(wt); } //! Copy constructor - wt_int(wt_int&& wt) { + wt_int(wt_int&& wt) + { *this = std::move(wt); } //! Assignment operator - wt_int& operator=(const wt_int& wt) { + wt_int& operator=(const wt_int& wt) + { if (this != &wt) { copy(wt); } @@ -286,7 +252,8 @@ class wt_int } //! Assignment move operator - wt_int& operator=(wt_int&& wt) { + wt_int& operator=(wt_int&& wt) + { if (this != &wt) { m_size = wt.m_size; m_sigma = wt.m_sigma; @@ -305,7 +272,8 @@ class wt_int } //! Swap operator - void swap(wt_int& wt) { + void swap(wt_int& wt) + { if (this != &wt) { std::swap(m_size, wt.m_size); std::swap(m_sigma, wt.m_sigma); @@ -320,12 +288,14 @@ class wt_int } //! Returns the size of the original vector. - size_type size()const { + size_type size()const + { return m_size; } //! Returns whether the wavelet tree contains no data. - bool empty()const { + bool empty()const + { return m_size == 0; } @@ -335,7 +305,8 @@ class wt_int * \par Precondition * \f$ i < size() \f$ */ - value_type operator[](size_type i)const { + value_type operator[](size_type i)const + { assert(i < size()); size_type offset = 0; value_type res = 0; @@ -369,7 +340,8 @@ class wt_int * \par Precondition * \f$ i \leq size() \f$ */ - size_type rank(size_type i, value_type c)const { + size_type rank(size_type i, value_type c)const + { assert(i <= size()); if (((1ULL)<<(m_max_level))<=c) { // c is greater than any symbol in wt return 0; @@ -405,7 +377,8 @@ class wt_int * \f$ i < size() \f$ */ std::pair - inverse_select(size_type i)const { + inverse_select(size_type i)const + { assert(i < size()); value_type c = 0; @@ -438,7 +411,8 @@ class wt_int * \par Precondition * \f$ 1 \leq i \leq rank(size(), c) \f$ */ - size_type select(size_type i, value_type c)const { + size_type select(size_type i, value_type c)const + { assert(1 <= i and i <= rank(size(), c)); // possible optimization: if the array is a permutation we can start at the bottom of the tree size_type offset = 0; @@ -478,50 +452,6 @@ class wt_int return i-1; }; - - //! For each symbol c in wt[i..j-1] get rank(i,c) and rank(j,c). - /*! - * \param i The start index (inclusive) of the interval. - * \param j The end index (exclusive) of the interval. - * \param k Reference for number of different symbols in [i..j-1]. - * \param cs Reference to a vector that will contain in - * cs[0..k-1] all symbols that occur in [i..j-1] in - * ascending order. - * \param rank_c_i Reference to a vector which equals - * rank_c_i[p] = rank(i,cs[p]), for \f$ 0 \leq p < k \f$. - * \param rank_c_j Reference to a vector which equals - * rank_c_j[p] = rank(j,cs[p]), for \f$ 0 \leq p < k \f$. - * \par Time complexity - * \f$ \Order{\min{\sigma, k \log \sigma}} \f$ - * - * \par Precondition - * \f$ i \leq j \leq size() \f$ - * \f$ cs.size() \geq \sigma \f$ - * \f$ rank_{c_i}.size() \geq \sigma \f$ - * \f$ rank_{c_j}.size() \geq \sigma \f$ - */ - void interval_symbols(size_type i, size_type j, size_type& k, - std::vector& cs, - std::vector& rank_c_i, - std::vector& rank_c_j) const { - assert(i <= j and j <= size()); - k=0; - if (i==j) { - return; - } - if ((i+1)==j) { - auto res = inverse_select(i); - cs[0]=res.second; - rank_c_i[0]=res.first; - rank_c_j[0]=res.first+1; - k=1; - return; - } - - _interval_symbols(i, j, k, cs, rank_c_i, rank_c_j, 0, 0, m_size, 0); - - } - //! How many symbols are lexicographic smaller/greater than c in [i..j-1]. /*! * \param i Start index (inclusive) of the interval. @@ -536,7 +466,8 @@ class wt_int * \f$ i \leq j \leq size() \f$ */ template> - t_ret_type lex_count(size_type i, size_type j, value_type c)const { + t_ret_type lex_count(size_type i, size_type j, value_type c)const + { assert(i <= j and j <= size()); if (((1ULL)<<(m_max_level))<=c) { // c is greater than any symbol in wt return t_ret_type {0, j-i, 0}; @@ -580,7 +511,8 @@ class wt_int * \f$ i \leq size() \f$ */ template> - t_ret_type lex_smaller_count(size_type i, value_type c) const { + t_ret_type lex_smaller_count(size_type i, value_type c) const + { assert(i <= size()); if (((1ULL)<<(m_max_level))<=c) { // c is greater than any symbol in wt return t_ret_type {0, i}; @@ -619,7 +551,8 @@ class wt_int */ std::pair>> range_search_2d(size_type lb, size_type rb, value_type vlb, value_type vrb, - bool report=true) const { + bool report=true) const + { size_type offsets[m_max_level+1]; size_type ones_before_os[m_max_level+1]; offsets[0] = 0; @@ -638,7 +571,8 @@ class wt_int size_type ilb, size_type node_size, size_type offsets[], size_type ones_before_os[], size_type path, point_vec_type& point_vec, bool report, size_type& cnt_answers) - const { + const + { if (lb > rb) return; if (level == m_max_level) { @@ -693,18 +627,21 @@ class wt_int } //! Returns a const_iterator to the first element. - const_iterator begin()const { + const_iterator begin()const + { return const_iterator(this, 0); } //! Returns a const_iterator to the element after the last element. - const_iterator end()const { + const_iterator end()const + { return const_iterator(this, size()); } //! Serializes the data structure into the given ostream - size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const { + size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, std::string name="")const + { structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this)); size_type written_bytes = 0; written_bytes += write_member(m_size, out, child, "size"); @@ -719,7 +656,8 @@ class wt_int } //! Loads the data structure from the given istream. - void load(std::istream& in) { + void load(std::istream& in) + { read_member(m_size, in); read_member(m_sigma, in); m_tree.load(in); @@ -736,11 +674,12 @@ class wt_int size_type size = 0; size_type level = 0; value_type sym = 0; + size_type rank = 0; // Default constructor node_type(size_type o=0, size_type sz=0, size_type l=0, - value_type sy=0) : - offset(o), size(sz), level(l), sym(sy) {} + value_type sy=0, size_type r=0) : + offset(o), size(sz), level(l), sym(sy), rank(r) {} // Copy constructor node_type(const node_type&) = default; @@ -755,37 +694,58 @@ class wt_int node_type& operator=(node_type&&) = default; // Comparator operator - bool operator==(const node_type& v) const { + bool operator==(const node_type& v) const + { return offset == v.offset; } // Smaller operator - bool operator<(const node_type& v) const { + bool operator<(const node_type& v) const + { return offset < v.offset; } // Greater operator - bool operator>(const node_type& v) const { + bool operator>(const node_type& v) const + { return offset > v.offset; } }; //! Checks if the node is a leaf node - bool is_leaf(const node_type& v) const { + bool is_leaf(const node_type& v) const + { return v.level == m_max_level; } - value_type sym(const node_type& v) const { + value_type sym(const node_type& v) const + { return v.sym; } - bool empty(const node_type& v) const { + bool empty(const node_type& v) const + { return v.size == (size_type)0; } + size_type size(const node_type& v) const + { + return v.size; + } + //! Return the root node - node_type root() const { - return node_type(0, m_size, 0, 0); + node_type root() const + { + return node_type(0, m_size, 0, 0, 0); + } + + template + size_type rank(size_type i, const node_type& v) const + { + if (t_b) + return m_tree_rank(v.offset+i)-m_tree_rank(v.offset); + else + return v.size-rank<1>(i,v); } //! Returns the two child nodes of an inner node @@ -793,19 +753,21 @@ class wt_int * \return Return a pair of nodes (left child, right child). * \pre !is_leaf(v) */ - std::pair - expand(const node_type& v) const { + std::array + expand(const node_type& v) const + { node_type v_right = v; return expand(std::move(v_right)); } //! Returns the two child nodes of an inner node /*! \param v An inner node of a wavelet tree. - * \return Return a pair of nodes (left child, right child). + * \return Return an array of nodes (left child, right child). * \pre !is_leaf(v) */ - std::pair - expand(node_type&& v) const { + std::array + expand(node_type&& v) const + { node_type v_left; size_type offset_rank = m_tree_rank(v.offset); size_type ones = m_tree_rank(v.offset + v.size) - offset_rank; @@ -820,7 +782,7 @@ class wt_int v.level = v.level + 1; v.sym = (v.sym<<1)|1; - return std::make_pair(std::move(v_left), v); + return {std::move(v_left), v}; } //! Returns for each range its left and right child ranges @@ -833,9 +795,10 @@ class wt_int * range mapped to the right child of v. * \pre !is_leaf(v) and s>=v_s and e<=v_e */ - std::pair + std::array expand(const node_type& v, - const range_vec_type& ranges) const { + const range_vec_type& ranges) const + { auto ranges_copy = ranges; return expand(v, std::move(ranges_copy)); } @@ -850,9 +813,10 @@ class wt_int * range mapped to the right child of v. * \pre !is_leaf(v) and s>=v_s and e<=v_e */ - std::pair + std::array expand(const node_type& v, - range_vec_type&& ranges) const { + range_vec_type&& ranges) const + { auto v_sp_rank = m_tree_rank(v.offset); // this is already calculated in expand(v) range_vec_type res(ranges.size()); size_t i = 0; @@ -868,7 +832,7 @@ class wt_int r = range_type(left_sp, left_sp + left_size - 1); res[i++] = range_type(right_sp, right_sp + right_size - 1); } - return make_pair(ranges, std::move(res)); + return {ranges, std::move(res)}; } //! Returns for a range its left and right child ranges @@ -881,8 +845,9 @@ class wt_int * range mapped to the right child of v. * \pre !is_leaf(v) and s>=v_s and e<=v_e */ - std::pair - expand(const node_type& v, const range_type& r) const { + std::array + expand(const node_type& v, const range_type& r) const + { auto v_sp_rank = m_tree_rank(v.offset); // this is already calculated in expand(v) auto sp_rank = m_tree_rank(v.offset + r.first); auto right_size = m_tree_rank(v.offset + r.second + 1) @@ -892,14 +857,24 @@ class wt_int auto right_sp = sp_rank - v_sp_rank; auto left_sp = r.first - right_sp; - return make_pair(range_type(left_sp, left_sp + left_size - 1), - range_type(right_sp, right_sp + right_size - 1)); + return {range_type(left_sp, left_sp + left_size - 1), + range_type(right_sp, right_sp + right_size - 1) + }; } //! return the path to the leaf for a given symbol - std::pair path(value_type c) const { + std::pair path(value_type c) const + { return {m_max_level,c}; } + + //! Return the value range of a node v + std::array + value_range(const node_type& v) const + { + const uint64_t size = 1ULL << (m_max_level-v.level); + return {(v.sym<<(m_max_level-v.level)), (v.sym<<(m_max_level-v.level))+size-1}; + } }; }// end namespace sdsl diff --git a/include/sdsl/wt_pc.hpp b/include/sdsl/wt_pc.hpp index edfb46f13..8ac477c01 100644 --- a/include/sdsl/wt_pc.hpp +++ b/include/sdsl/wt_pc.hpp @@ -47,36 +47,37 @@ namespace sdsl * @ingroup wt */ template - > + class t_bitvector = bit_vector, + class t_rank = typename t_bitvector::rank_1_type, + class t_select = typename t_bitvector::select_1_type, + class t_select_zero = typename t_bitvector::select_0_type, + class t_tree_strat = byte_tree<> +> class wt_pc { - public: - typedef typename - t_tree_strat::template type tree_strat_type; - typedef int_vector<>::size_type size_type; - typedef typename - tree_strat_type::value_type value_type; - typedef typename t_bitvector::difference_type difference_type; - typedef random_access_const_iterator const_iterator; - typedef const_iterator iterator; - typedef t_bitvector bit_vector_type; - typedef t_rank rank_1_type; - typedef t_select select_1_type; - typedef t_select_zero select_0_type; - typedef wt_tag index_category; - typedef typename - tree_strat_type::alphabet_category alphabet_category; - typedef typename - t_shape::template type shape_type; - enum { lex_ordered=shape_type::lex_ordered }; - using node_type = typename tree_strat_type::node_type; - - private: +public: + typedef typename + t_tree_strat::template type tree_strat_type; + typedef int_vector<>::size_type size_type; + typedef typename + tree_strat_type::value_type value_type; + typedef typename t_bitvector::difference_type difference_type; + typedef random_access_const_iterator const_iterator; + typedef const_iterator iterator; + typedef t_bitvector bit_vector_type; + typedef t_rank rank_1_type; + typedef t_select select_1_type; + typedef t_select_zero select_0_type; + typedef wt_tag index_category; + typedef typename + tree_strat_type::alphabet_category alphabet_category; + typedef typename + t_shape::template type shape_type; + enum { lex_ordered=shape_type::lex_ordered }; + enum { traversable=true }; + using node_type = typename tree_strat_type::node_type; + +private: #ifdef WT_PC_CACHE mutable value_type m_last_access_answer; @@ -84,716 +85,644 @@ class wt_pc mutable size_type m_last_access_rl; #endif - size_type m_size = 0; // original text size - size_type m_sigma = 0; // alphabet size - bit_vector_type m_bv; // bit vector to store the wavelet tree - rank_1_type m_bv_rank; // rank support for the wavelet tree bit vector - select_1_type m_bv_select1; // select support for the wavelet tree bit vector - select_0_type m_bv_select0; - tree_strat_type m_tree; - - void copy(const wt_pc& wt) { - m_size = wt.m_size; - m_sigma = wt.m_sigma; - m_bv = wt.m_bv; - m_bv_rank = wt.m_bv_rank; - m_bv_rank.set_vector(&m_bv); - m_bv_select1 = wt.m_bv_select1; - m_bv_select1.set_vector(&m_bv); - m_bv_select0 = wt.m_bv_select0; - m_bv_select0.set_vector(&m_bv); - m_tree = wt.m_tree; - } - - // insert a character into the wavelet tree, see construct method - void insert_char(value_type old_chr, std::vector& bv_node_pos, - size_type times, bit_vector& bv) { - uint64_t p = m_tree.bit_path(old_chr); - uint32_t path_len = p>>56; - node_type v = m_tree.root(); - for (uint32_t l=0; l>= 1) { - if (p&1) { - bv.set_int(bv_node_pos[v], 0xFFFFFFFFFFFFFFFFULL,times); - } - bv_node_pos[v] += times; - v = m_tree.child(v, p&1); + size_type m_size = 0; // original text size + size_type m_sigma = 0; // alphabet size + bit_vector_type m_bv; // bit vector to store the wavelet tree + rank_1_type m_bv_rank; // rank support for the wavelet tree bit vector + select_1_type m_bv_select1; // select support for the wavelet tree bit vector + select_0_type m_bv_select0; + tree_strat_type m_tree; + + void copy(const wt_pc& wt) + { + m_size = wt.m_size; + m_sigma = wt.m_sigma; + m_bv = wt.m_bv; + m_bv_rank = wt.m_bv_rank; + m_bv_rank.set_vector(&m_bv); + m_bv_select1 = wt.m_bv_select1; + m_bv_select1.set_vector(&m_bv); + m_bv_select0 = wt.m_bv_select0; + m_bv_select0.set_vector(&m_bv); + m_tree = wt.m_tree; + } + + // insert a character into the wavelet tree, see construct method + void insert_char(value_type old_chr, std::vector& bv_node_pos, + size_type times, bit_vector& bv) + { + uint64_t p = m_tree.bit_path(old_chr); + uint32_t path_len = p>>56; + node_type v = m_tree.root(); + for (uint32_t l=0; l>= 1) { + if (p&1) { + bv.set_int(bv_node_pos[v], 0xFFFFFFFFFFFFFFFFULL,times); } + bv_node_pos[v] += times; + v = m_tree.child(v, p&1); } - - - - // calculates the tree shape returns the size of the WT bit vector - size_type construct_tree_shape(const std::vector& C) { - // vector for node of the tree - std::vector temp_nodes; //(2*m_sigma-1); - shape_type::construct_tree(C, temp_nodes); - // Convert code tree into BFS order in memory and - // calculate bv_pos values - size_type bv_size = 0; - tree_strat_type temp_tree(temp_nodes, bv_size, this); - m_tree.swap(temp_tree); - return bv_size; + } + + + + // calculates the tree shape returns the size of the WT bit vector + size_type construct_tree_shape(const std::vector& C) + { + // vector for node of the tree + std::vector temp_nodes; //(2*m_sigma-1); + shape_type::construct_tree(C, temp_nodes); + // Convert code tree into BFS order in memory and + // calculate bv_pos values + size_type bv_size = 0; + tree_strat_type temp_tree(temp_nodes, bv_size, this); + m_tree.swap(temp_tree); + return bv_size; + } + + void construct_init_rank_select() + { + util::init_support(m_bv_rank, &m_bv); + util::init_support(m_bv_select0, &m_bv); + util::init_support(m_bv_select1, &m_bv); + } + +public: + + const size_type& sigma = m_sigma; + const bit_vector_type& bv = m_bv; + + // Default constructor + wt_pc() {}; + + //! Construct the wavelet tree from a file_buffer + /*! + * \param input_buf File buffer of the input. + * \param size The length of the prefix. + * \par Time complexity + * \f$ \Order{n\log|\Sigma|}\f$, where \f$n=size\f$ + */ + wt_pc(int_vector_buffer& input_buf, + size_type size):m_size(size) + { + if (0 == m_size) + return; + // O(n + |\Sigma|\log|\Sigma|) algorithm for calculating node sizes + // TODO: C should also depend on the tree_strategy. C is just a mapping + // from a symbol to its frequency. So a map could be + // used for integer alphabets... + std::vector C; + // 1. Count occurrences of characters + calculate_character_occurences(input_buf, m_size, C); + // 2. Calculate effective alphabet size + calculate_effective_alphabet_size(C, m_sigma); + // 3. Generate tree shape + size_type tree_size = construct_tree_shape(C); + // 4. Generate wavelet tree bit sequence m_bv + bit_vector temp_bv(tree_size, 0); + + // Initializing starting position of wavelet tree nodes + std::vector bv_node_pos(m_tree.size(), 0); + for (size_type v=0; v < m_tree.size(); ++v) { + bv_node_pos[v] = m_tree.bv_pos(v); } - - void construct_init_rank_select() { - util::init_support(m_bv_rank, &m_bv); - util::init_support(m_bv_select0, &m_bv); - util::init_support(m_bv_select1, &m_bv); + if (input_buf.size() < size) { + throw std::logic_error("Stream size is smaller than size!"); + return; } - - // recursive internal version of the method interval_symbols - void - _interval_symbols(size_type i, size_type j, size_type& k, - std::vector& cs, - std::vector& rank_c_i, - std::vector& rank_c_j, node_type v) const { - // invariant: j>i - size_type i_new = (m_bv_rank(m_tree.bv_pos(v) + i) - - m_tree.bv_pos_rank(v)); - size_type j_new = (m_bv_rank(m_tree.bv_pos(v) + j) - - m_tree.bv_pos_rank(v)); - // goto left child - i -= i_new; j -= j_new; - if (i != j) { - node_type v_new = m_tree.child(v, 0); - if (!m_tree.is_leaf(v_new)) { - _interval_symbols(i, j, k, cs, rank_c_i, rank_c_j, v_new); - } else { - rank_c_i[k] = i; - rank_c_j[k] = j; - cs[k++] = m_tree.bv_pos_rank(v_new); - } - } - // goto right child - if (i_new!=j_new) { - node_type v_new = m_tree.child(v, 1); - if (!m_tree.is_leaf(v_new)) { - _interval_symbols(i_new, j_new, k, cs, rank_c_i, rank_c_j, - v_new); - } else { - rank_c_i[k] = i_new; - rank_c_j[k] = j_new; - cs[k++] = m_tree.bv_pos_rank(v_new); - } - } - } - - public: - - const size_type& sigma = m_sigma; - const bit_vector_type& bv = m_bv; - - // Default constructor - wt_pc() {}; - - //! Construct the wavelet tree from a file_buffer - /*! - * \param input_buf File buffer of the input. - * \param size The length of the prefix. - * \par Time complexity - * \f$ \Order{n\log|\Sigma|}\f$, where \f$n=size\f$ - */ - wt_pc(int_vector_buffer& input_buf, - size_type size):m_size(size) { - if (0 == m_size) - return; - // O(n + |\Sigma|\log|\Sigma|) algorithm for calculating node sizes - // TODO: C should also depend on the tree_strategy. C is just a mapping - // from a symbol to its frequency. So a map could be - // used for integer alphabets... - std::vector C; - // 1. Count occurrences of characters - calculate_character_occurences(input_buf, m_size, C); - // 2. Calculate effective alphabet size - calculate_effective_alphabet_size(C, m_sigma); - // 3. Generate tree shape - size_type tree_size = construct_tree_shape(C); - // 4. Generate wavelet tree bit sequence m_bv - bit_vector temp_bv(tree_size, 0); - - // Initializing starting position of wavelet tree nodes - std::vector bv_node_pos(m_tree.size(), 0); - for (size_type v=0; v < m_tree.size(); ++v) { - bv_node_pos[v] = m_tree.bv_pos(v); - } - if (input_buf.size() < size) { - throw std::logic_error("Stream size is smaller than size!"); - return; - } - value_type old_chr = input_buf[0]; - uint32_t times = 0; - for (size_type i=0; i < m_size; ++i) { - value_type chr = input_buf[i]; - if (chr != old_chr) { + value_type old_chr = input_buf[0]; + uint32_t times = 0; + for (size_type i=0; i < m_size; ++i) { + value_type chr = input_buf[i]; + if (chr != old_chr) { + insert_char(old_chr, bv_node_pos, times, temp_bv); + times = 1; + old_chr = chr; + } else { // chr == old_chr + ++times; + if (times == 64) { insert_char(old_chr, bv_node_pos, times, temp_bv); - times = 1; - old_chr = chr; - } else { // chr == old_chr - ++times; - if (times == 64) { - insert_char(old_chr, bv_node_pos, times, temp_bv); - times = 0; - } + times = 0; } } - if (times > 0) { - insert_char(old_chr, bv_node_pos, times, temp_bv); - } - m_bv = bit_vector_type(std::move(temp_bv)); - // 5. Initialize rank and select data structures for m_bv - construct_init_rank_select(); - // 6. Finish inner nodes by precalculating the bv_pos_rank values - m_tree.init_node_ranks(m_bv_rank); } - - - //! Copy constructor - wt_pc(const wt_pc& wt) { copy(wt); } - - wt_pc(wt_pc&& wt) { - *this = std::move(wt); + if (times > 0) { + insert_char(old_chr, bv_node_pos, times, temp_bv); } - - //! Assignment operator - wt_pc& operator=(const wt_pc& wt) { - if (this != &wt) { - copy(wt); - } - return *this; + m_bv = bit_vector_type(std::move(temp_bv)); + // 5. Initialize rank and select data structures for m_bv + construct_init_rank_select(); + // 6. Finish inner nodes by precalculating the bv_pos_rank values + m_tree.init_node_ranks(m_bv_rank); + } + + + //! Copy constructor + wt_pc(const wt_pc& wt) { copy(wt); } + + wt_pc(wt_pc&& wt) + { + *this = std::move(wt); + } + + //! Assignment operator + wt_pc& operator=(const wt_pc& wt) + { + if (this != &wt) { + copy(wt); } + return *this; + } - //! Assignment operator - wt_pc& operator=(wt_pc&& wt) { - if (this != &wt) { - m_size = wt.m_size; - m_sigma = wt.m_sigma; - m_bv = std::move(wt.m_bv); - m_bv_rank = std::move(wt.m_bv_rank); - m_bv_rank.set_vector(&m_bv); - m_bv_select1 = std::move(wt.m_bv_select1); - m_bv_select1.set_vector(&m_bv); - m_bv_select0 = std::move(wt.m_bv_select0); - m_bv_select0.set_vector(&m_bv); - m_tree = std::move(wt.m_tree); - } - return *this; + //! Assignment operator + wt_pc& operator=(wt_pc&& wt) + { + if (this != &wt) { + m_size = wt.m_size; + m_sigma = wt.m_sigma; + m_bv = std::move(wt.m_bv); + m_bv_rank = std::move(wt.m_bv_rank); + m_bv_rank.set_vector(&m_bv); + m_bv_select1 = std::move(wt.m_bv_select1); + m_bv_select1.set_vector(&m_bv); + m_bv_select0 = std::move(wt.m_bv_select0); + m_bv_select0.set_vector(&m_bv); + m_tree = std::move(wt.m_tree); } - - - //! Swap operator - void swap(wt_pc& wt) { - if (this != &wt) { - std::swap(m_size, wt.m_size); - std::swap(m_sigma, wt.m_sigma); - m_bv.swap(wt.m_bv); - util::swap_support(m_bv_rank, wt.m_bv_rank, - &m_bv, &(wt.m_bv)); - - util::swap_support(m_bv_select1, wt.m_bv_select1, - &m_bv, &(wt.m_bv)); - util::swap_support(m_bv_select0, wt.m_bv_select0, - &m_bv, &(wt.m_bv)); - m_tree.swap(wt.m_tree); - } + return *this; + } + + + //! Swap operator + void swap(wt_pc& wt) + { + if (this != &wt) { + std::swap(m_size, wt.m_size); + std::swap(m_sigma, wt.m_sigma); + m_bv.swap(wt.m_bv); + util::swap_support(m_bv_rank, wt.m_bv_rank, + &m_bv, &(wt.m_bv)); + + util::swap_support(m_bv_select1, wt.m_bv_select1, + &m_bv, &(wt.m_bv)); + util::swap_support(m_bv_select0, wt.m_bv_select0, + &m_bv, &(wt.m_bv)); + m_tree.swap(wt.m_tree); } - - //! Returns the size of the original vector. - size_type size()const { return m_size; } - - //! Returns whether the wavelet tree contains no data. - bool empty()const { return m_size == 0; } - - //! Recovers the i-th symbol of the original vector. - /*! - * \param i Index in the original vector. - * \return The i-th symbol of the original vector. - * \par Time complexity - * \f$ \Order{H_0} \f$ on average, where \f$ H_0 \f$ is the - * zero order entropy of the sequence - * - * \par Precondition - * \f$ i < size() \f$ - */ - value_type operator[](size_type i)const { - assert(i < size()); - // which stores how many of the next symbols are equal - // with the current char - node_type v = m_tree.root(); // start at root node - while (!m_tree.is_leaf(v)) { // while not a leaf - if (m_bv[ m_tree.bv_pos(v) + i]) { // goto right child - i = m_bv_rank(m_tree.bv_pos(v) + i) + } + + //! Returns the size of the original vector. + size_type size()const { return m_size; } + + //! Returns whether the wavelet tree contains no data. + bool empty()const { return m_size == 0; } + + //! Recovers the i-th symbol of the original vector. + /*! + * \param i Index in the original vector. + * \return The i-th symbol of the original vector. + * \par Time complexity + * \f$ \Order{H_0} \f$ on average, where \f$ H_0 \f$ is the + * zero order entropy of the sequence + * + * \par Precondition + * \f$ i < size() \f$ + */ + value_type operator[](size_type i)const + { + assert(i < size()); + // which stores how many of the next symbols are equal + // with the current char + node_type v = m_tree.root(); // start at root node + while (!m_tree.is_leaf(v)) { // while not a leaf + if (m_bv[ m_tree.bv_pos(v) + i]) { // goto right child + i = m_bv_rank(m_tree.bv_pos(v) + i) - m_tree.bv_pos_rank(v); - v = m_tree.child(v,1); - } else { // goto the left child - i -= (m_bv_rank(m_tree.bv_pos(v) + i) - - m_tree.bv_pos_rank(v)); - v = m_tree.child(v,0); - } - } - // if v is a leaf bv_pos_rank returns symbol itself - return m_tree.bv_pos_rank(v); - }; - - //! Calculates how many symbols c are in the prefix [0..i-1]. - /*! - * \param i Exclusive right bound of the range. - * \param c Symbol c. - * \return Number of occurrences of symbol c in the prefix [0..i-1]. - * \par Time complexity - * \f$ \Order{H_0} \f$ on average, where \f$ H_0 \f$ is the - * zero order entropy of the sequence - * - * \par Precondition - * \f$ i \leq size() \f$ - */ - size_type rank(size_type i, value_type c)const { - assert(i <= size()); - if (!m_tree.is_valid(m_tree.c_to_leaf(c))) { - return 0; // if `c` was not in the text - } - if (m_sigma == 1) { - return i; // if m_sigma == 1 answer is trivial - } - uint64_t p = m_tree.bit_path(c); - uint32_t path_len = (p>>56); - size_type result = i; - node_type v = m_tree.root(); - for (uint32_t l=0; l>= 1) { - if (p&1) { - result = (m_bv_rank(m_tree.bv_pos(v)+result) - - m_tree.bv_pos_rank(v)); - } else { - result -= (m_bv_rank(m_tree.bv_pos(v)+result) - - m_tree.bv_pos_rank(v)); - } - v = m_tree.child(v, p&1); // goto child - } - return result; - }; - - //! Calculates how many times symbol wt[i] occurs in the prefix [0..i-1]. - /*! - * \param i The index of the symbol. - * \return Pair (rank(wt[i],i),wt[i]) - * \par Time complexity - * \f$ \Order{H_0} \f$ - * - * \par Precondition - * \f$ i < size() \f$ - */ - std::pair - inverse_select(size_type i)const { - assert(i < size()); - node_type v = m_tree.root(); - while (!m_tree.is_leaf(v)) { // while not a leaf - if (m_bv[m_tree.bv_pos(v) + i]) { // goto right child - i = (m_bv_rank(m_tree.bv_pos(v) + i) - - m_tree.bv_pos_rank(v)); - v = m_tree.child(v, 1); - } else { // goto left child - i -= (m_bv_rank(m_tree.bv_pos(v) + i) - - m_tree.bv_pos_rank(v)); - v = m_tree.child(v,0); - } + v = m_tree.child(v,1); + } else { // goto the left child + i -= (m_bv_rank(m_tree.bv_pos(v) + i) + - m_tree.bv_pos_rank(v)); + v = m_tree.child(v,0); } - // if v is a leaf bv_pos_rank returns symbol itself - return std::make_pair(i, (value_type)m_tree.bv_pos_rank(v)); } - - //! Calculates the ith occurrence of the symbol c in the supported vector. - /*! - * \param i The ith occurrence. - * \param c The symbol c. - * \par Time complexity - * \f$ \Order{H_0} \f$ on average, where \f$ H_0 \f$ is the zero order - * entropy of the sequence - * - * \par Precondition - * \f$ 1 \leq i \leq rank(size(), c) \f$ - */ - size_type select(size_type i, value_type c)const { - assert(1 <= i and i <= rank(size(), c)); - node_type v = m_tree.c_to_leaf(c); - if (!m_tree.is_valid(v)) { // if c was not in the text - return m_size; // -> return a position right to the end - } - if (m_sigma == 1) { - return std::min(i-1,m_size); - } - size_type result = i-1; // otherwise - uint64_t p = m_tree.bit_path(c); - uint32_t path_len = (p>>56); - // path_len > 0, since we have handled m_sigma = 1. - p <<= (64-path_len); - for (uint32_t l=0; l& cs, - std::vector& rank_c_i, - std::vector& rank_c_j) const { - assert(i <= j and j <= size()); - if (i==j) { - k = 0; - } else if (1==m_sigma) { - k = 1; - cs[0] = m_tree.bv_pos_rank(m_tree.root()); - rank_c_i[0] = std::min(i,m_size); - rank_c_j[0] = std::min(j,m_size); - } else if ((j-i)==1) { - k = 1; - auto rc = inverse_select(i); - rank_c_i[0] = rc.first; cs[0] = rc.second; - rank_c_j[0] = rank_c_i[0]+1; - } else if ((j-i)==2) { - auto rc = inverse_select(i); - rank_c_i[0] = rc.first; cs[0] = rc.second; - rc = inverse_select(i+1); - rank_c_i[1] = rc.first; cs[1] = rc.second; - - if (cs[0]==cs[1]) { - k = 1; - rank_c_j[0] = rank_c_i[0]+2; - } else { - k = 2; - if (lex_ordered and cs[0] > cs[1]) { - std::swap(cs[0], cs[1]); - std::swap(rank_c_i[0], rank_c_i[1]); - } - rank_c_j[0] = rank_c_i[0]+1; - rank_c_j[1] = rank_c_i[1]+1; - } + // if v is a leaf bv_pos_rank returns symbol itself + return m_tree.bv_pos_rank(v); + }; + + //! Calculates how many symbols c are in the prefix [0..i-1]. + /*! + * \param i Exclusive right bound of the range. + * \param c Symbol c. + * \return Number of occurrences of symbol c in the prefix [0..i-1]. + * \par Time complexity + * \f$ \Order{H_0} \f$ on average, where \f$ H_0 \f$ is the + * zero order entropy of the sequence + * + * \par Precondition + * \f$ i \leq size() \f$ + */ + size_type rank(size_type i, value_type c)const + { + assert(i <= size()); + if (!m_tree.is_valid(m_tree.c_to_leaf(c))) { + return 0; // if `c` was not in the text + } + if (m_sigma == 1) { + return i; // if m_sigma == 1 answer is trivial + } + uint64_t p = m_tree.bit_path(c); + uint32_t path_len = (p>>56); + size_type result = i; + node_type v = m_tree.root(); + for (uint32_t l=0; l>= 1) { + if (p&1) { + result = (m_bv_rank(m_tree.bv_pos(v)+result) + - m_tree.bv_pos_rank(v)); } else { - k = 0; - _interval_symbols(i, j, k, cs, rank_c_i, rank_c_j, 0); + result -= (m_bv_rank(m_tree.bv_pos(v)+result) + - m_tree.bv_pos_rank(v)); } + v = m_tree.child(v, p&1); // goto child } - - - //! How many symbols are lexicographic smaller/greater than c in [i..j-1]. - /*! - * \param i Start index (inclusive) of the interval. - * \param j End index (exclusive) of the interval. - * \param c Symbol c. - * \return A triple containing: - * * rank(i,c) - * * #symbols smaller than c in [i..j-1] - * * #symbols greater than c in [i..j-1] - * - * \par Precondition - * \f$ i \leq j \leq size() \f$ - * \note - * This method is only available if lex_ordered = true - */ - template> - typename std::enable_if::type - lex_count(size_type i, size_type j, value_type c) const { - assert(i <= j and j <= size()); - if (1==m_sigma) { - value_type _c = m_tree.bv_pos_rank(m_tree.root()); - if (c == _c) { // c is the only symbol in the wt - return t_ret_type {i,0,0}; - } else if (c < _c) { - return t_ret_type {0,0,j-i}; - } else { - return t_ret_type {0,j-i,0}; - } - } - if (i==j) { - return t_ret_type {rank(i,c),0,0}; - } - uint64_t p = m_tree.bit_path(c); - uint32_t path_len = p>>56; - if (path_len == 0) { // path_len=0: => c is not present - value_type _c = (value_type)p; - if (c == _c) { // c is smaller than any symbol in wt - return t_ret_type {0, 0, j-i}; - } - auto res = lex_count(i, j, _c); - return t_ret_type {0, j-i-std::get<2>(res),std::get<2>(res)}; - } - size_type smaller = 0, greater = 0; - node_type v = m_tree.root(); - for (uint32_t l=0; l>= 1) { - size_type r1_1 = (m_bv_rank(m_tree.bv_pos(v)+i) - - m_tree.bv_pos_rank(v)); - size_type r1_2 = (m_bv_rank(m_tree.bv_pos(v)+j) - - m_tree.bv_pos_rank(v)); - - if (p&1) { - smaller += j - r1_2 - i + r1_1; - i = r1_1; - j = r1_2; - } else { - greater += r1_2 - r1_1; - i -= r1_1; - j -= r1_2; - } - v = m_tree.child(v, p&1); + return result; + }; + + //! Calculates how many times symbol wt[i] occurs in the prefix [0..i-1]. + /*! + * \param i The index of the symbol. + * \return Pair (rank(wt[i],i),wt[i]) + * \par Time complexity + * \f$ \Order{H_0} \f$ + * + * \par Precondition + * \f$ i < size() \f$ + */ + std::pair + inverse_select(size_type i)const + { + assert(i < size()); + node_type v = m_tree.root(); + while (!m_tree.is_leaf(v)) { // while not a leaf + if (m_bv[m_tree.bv_pos(v) + i]) { // goto right child + i = (m_bv_rank(m_tree.bv_pos(v) + i) + - m_tree.bv_pos_rank(v)); + v = m_tree.child(v, 1); + } else { // goto left child + i -= (m_bv_rank(m_tree.bv_pos(v) + i) + - m_tree.bv_pos_rank(v)); + v = m_tree.child(v,0); } - return t_ret_type {i, smaller, greater}; - }; - - //! How many symbols are lexicographic smaller than c in [0..i-1]. - /*! - * \param i Exclusive right bound of the range. - * \param c Symbol c. - * \return A tuple containing: - * * rank(i,c) - * * #symbols smaller than c in [0..i-1] - * \par Precondition - * \f$ i \leq size() \f$ - * \note - * This method is only available if lex_ordered = true - */ - template> - typename std::enable_if::type - lex_smaller_count(size_type i, value_type c)const { - assert(i <= size()); - if (1==m_sigma) { - value_type _c = m_tree.bv_pos_rank(m_tree.root()); - if (c == _c) { // c is the only symbol in the wt - return t_ret_type {i,0}; - } else if (c < _c) { - return t_ret_type {0,0}; - } else { - return t_ret_type {0,i}; - } - } - - uint64_t p = m_tree.bit_path(c); - uint32_t path_len = p>>56; - if (path_len == 0) { // path_len=0: => c is not present - value_type _c = (value_type)p; - if (c == _c) { // c is smaller than any symbol in wt - return t_ret_type {0, 0}; - } - auto res = lex_smaller_count(i, _c); - return t_ret_type {0, std::get<0>(res)+std::get<1>(res)}; - } - size_type result = 0; - size_type all = i; // possible occurrences of c - node_type v = m_tree.root(); - for (uint32_t l=0; l>= 1) { - size_type ones = (m_bv_rank(m_tree.bv_pos(v)+all) - - m_tree.bv_pos_rank(v)); - if (p&1) { - result += all - ones; - all = ones; - } else { - all -= ones; - } - v = m_tree.child(v, p&1); - } - return t_ret_type {all, result}; - } - - //! Returns a const_iterator to the first element. - const_iterator begin()const { - return const_iterator(this, 0); } - - //! Returns a const_iterator to the element after the last element. - const_iterator end()const { - return const_iterator(this, size()); + // if v is a leaf bv_pos_rank returns symbol itself + return std::make_pair(i, (value_type)m_tree.bv_pos_rank(v)); + } + + //! Calculates the ith occurrence of the symbol c in the supported vector. + /*! + * \param i The ith occurrence. + * \param c The symbol c. + * \par Time complexity + * \f$ \Order{H_0} \f$ on average, where \f$ H_0 \f$ is the zero order + * entropy of the sequence + * + * \par Precondition + * \f$ 1 \leq i \leq rank(size(), c) \f$ + */ + size_type select(size_type i, value_type c)const + { + assert(1 <= i and i <= rank(size(), c)); + node_type v = m_tree.c_to_leaf(c); + if (!m_tree.is_valid(v)) { // if c was not in the text + return m_size; // -> return a position right to the end } - - //! Serializes the data structure into the given ostream - size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, - std::string name="") const { - structure_tree_node* child = structure_tree::add_child( - v, name, util::class_name(*this)); - size_type written_bytes = 0; - written_bytes += write_member(m_size,out,child, "size"); - written_bytes += write_member(m_sigma,out,child, "sigma"); - written_bytes += m_bv.serialize(out,child,"bv"); - written_bytes += m_bv_rank.serialize(out,child,"bv_rank"); - written_bytes += m_bv_select1.serialize(out,child,"bv_select_1"); - written_bytes += m_bv_select0.serialize(out,child,"bv_select_0"); - written_bytes += m_tree.serialize(out,child,"tree"); - structure_tree::add_size(child, written_bytes); - return written_bytes; + if (m_sigma == 1) { + return std::min(i-1,m_size); } - - //! Loads the data structure from the given istream. - void load(std::istream& in) { - read_member(m_size, in); - read_member(m_sigma, in); - m_bv.load(in); - m_bv_rank.load(in, &m_bv); - m_bv_select1.load(in, &m_bv); - m_bv_select0.load(in, &m_bv); - m_tree.load(in); + size_type result = i-1; // otherwise + uint64_t p = m_tree.bit_path(c); + uint32_t path_len = (p>>56); + // path_len > 0, since we have handled m_sigma = 1. + p <<= (64-path_len); + for (uint32_t l=0; l> + typename std::enable_if::type + lex_count(size_type i, size_type j, value_type c) const + { + assert(i <= j and j <= size()); + if (1==m_sigma) { + value_type _c = m_tree.bv_pos_rank(m_tree.root()); + if (c == _c) { // c is the only symbol in the wt + return t_ret_type {i,0,0}; + } else if (c < _c) { + return t_ret_type {0,0,j-i}; + } else { + return t_ret_type {0,j-i,0}; + } } - - //! Symbol for a leaf - value_type sym(const node_type& v) const { - return m_tree.bv_pos_rank(v); + if (i==j) { + return t_ret_type {rank(i,c),0,0}; } - - bool empty(const node_type&) const { - return true; + uint64_t p = m_tree.bit_path(c); + uint32_t path_len = p>>56; + if (path_len == 0) { // path_len=0: => c is not present + value_type _c = (value_type)p; + if (c == _c) { // c is smaller than any symbol in wt + return t_ret_type {0, 0, j-i}; + } + auto res = lex_count(i, j, _c); + return t_ret_type {0, j-i-std::get<2>(res),std::get<2>(res)}; } - - //! Returns the root node - node_type root() const { - return m_tree.root(); + size_type smaller = 0, greater = 0; + node_type v = m_tree.root(); + for (uint32_t l=0; l>= 1) { + size_type r1_1 = (m_bv_rank(m_tree.bv_pos(v)+i) + - m_tree.bv_pos_rank(v)); + size_type r1_2 = (m_bv_rank(m_tree.bv_pos(v)+j) + - m_tree.bv_pos_rank(v)); + + if (p&1) { + smaller += j - r1_2 - i + r1_1; + i = r1_1; + j = r1_2; + } else { + greater += r1_2 - r1_1; + i -= r1_1; + j -= r1_2; + } + v = m_tree.child(v, p&1); } - - //! Returns the two child nodes of an inner node - /*! \param v An inner node of a wavelet tree. - * \return Return a pair of nodes (left child, right child). - * \pre !is_leaf(v) - */ - std::pair - expand(const node_type& v) const { - return std::make_pair(m_tree.child(v,0), m_tree.child(v,1)); + return t_ret_type {i, smaller, greater}; + }; + + //! How many symbols are lexicographic smaller than c in [0..i-1]. + /*! + * \param i Exclusive right bound of the range. + * \param c Symbol c. + * \return A tuple containing: + * * rank(i,c) + * * #symbols smaller than c in [0..i-1] + * \par Precondition + * \f$ i \leq size() \f$ + * \note + * This method is only available if lex_ordered = true + */ + template> + typename std::enable_if::type + lex_smaller_count(size_type i, value_type c)const + { + assert(i <= size()); + if (1==m_sigma) { + value_type _c = m_tree.bv_pos_rank(m_tree.root()); + if (c == _c) { // c is the only symbol in the wt + return t_ret_type {i,0}; + } else if (c < _c) { + return t_ret_type {0,0}; + } else { + return t_ret_type {0,i}; + } } - //! Returns for each range its left and right child ranges - /*! \param v An inner node of an wavelet tree. - * \param ranges A vector of ranges. Each range [s,e] - * has to be contained in v=[v_s,v_e]. - * \return A vector a range pairs. The first element of each - * range pair correspond to the original range - * mapped to the left child of v; the second element to the - * range mapped to the right child of v. - * \pre !is_leaf(v) and s>=v_s and e<=v_e - */ - std::pair - expand(const node_type& v, - const range_vec_type& ranges) const { - auto ranges_copy = ranges; - return expand(v, std::move(ranges_copy)); + uint64_t p = m_tree.bit_path(c); + uint32_t path_len = p>>56; + if (path_len == 0) { // path_len=0: => c is not present + value_type _c = (value_type)p; + if (c == _c) { // c is smaller than any symbol in wt + return t_ret_type {0, 0}; + } + auto res = lex_smaller_count(i, _c); + return t_ret_type {0, std::get<0>(res)+std::get<1>(res)}; } - - //! Returns for each range its left and right child ranges - /*! \param v An inner node of an wavelet tree. - * \param ranges A vector of ranges. Each range [s,e] - * has to be contained in v=[v_s,v_e]. - * \return A vector a range pairs. The first element of each - * range pair correspond to the original range - * mapped to the left child of v; the second element to the - * range mapped to the right child of v. - * \pre !is_leaf(v) and s>=v_s and e<=v_e - */ - std::pair - expand(const node_type& v, - range_vec_type&& ranges) const { - auto v_sp_rank = m_tree.bv_pos_rank(v); - range_vec_type res(ranges.size()); - size_t i = 0; - for (auto& r : ranges) { - auto sp_rank = m_bv_rank(m_tree.bv_pos(v) + r.first); - auto right_size = m_bv_rank(m_tree.bv_pos(v) + r.second + 1) - - sp_rank; - auto left_size = (r.second-r.first+1)-right_size; - - auto right_sp = sp_rank - v_sp_rank; - auto left_sp = r.first - right_sp; - - r = range_type(left_sp, left_sp + left_size - 1); - res[i++] = range_type(right_sp, right_sp + right_size - 1); + size_type result = 0; + size_type all = i; // possible occurrences of c + node_type v = m_tree.root(); + for (uint32_t l=0; l>= 1) { + size_type ones = (m_bv_rank(m_tree.bv_pos(v)+all) + - m_tree.bv_pos_rank(v)); + if (p&1) { + result += all - ones; + all = ones; + } else { + all -= ones; } - return make_pair(ranges, std::move(res)); + v = m_tree.child(v, p&1); } - - //! Returns for a range its left and right child ranges - /*! \param v An inner node of an wavelet tree. - * \param r A ranges [s,e], such that [s,e] is - * contained in v=[v_s,v_e]. - * \return A range pair. The first element of the - * range pair correspond to the original range - * mapped to the left child of v; the second element to the - * range mapped to the right child of v. - * \pre !is_leaf(v) and s>=v_s and e<=v_e - */ - std::pair - expand(const node_type& v, const range_type& r) const { - auto v_sp_rank = m_tree.bv_pos_rank(v); + return t_ret_type {all, result}; + } + + //! Returns a const_iterator to the first element. + const_iterator begin()const + { + return const_iterator(this, 0); + } + + //! Returns a const_iterator to the element after the last element. + const_iterator end()const + { + return const_iterator(this, size()); + } + + //! Serializes the data structure into the given ostream + size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, + std::string name="") const + { + structure_tree_node* child = structure_tree::add_child( + v, name, util::class_name(*this)); + size_type written_bytes = 0; + written_bytes += write_member(m_size,out,child, "size"); + written_bytes += write_member(m_sigma,out,child, "sigma"); + written_bytes += m_bv.serialize(out,child,"bv"); + written_bytes += m_bv_rank.serialize(out,child,"bv_rank"); + written_bytes += m_bv_select1.serialize(out,child,"bv_select_1"); + written_bytes += m_bv_select0.serialize(out,child,"bv_select_0"); + written_bytes += m_tree.serialize(out,child,"tree"); + structure_tree::add_size(child, written_bytes); + return written_bytes; + } + + //! Loads the data structure from the given istream. + void load(std::istream& in) + { + read_member(m_size, in); + read_member(m_sigma, in); + m_bv.load(in); + m_bv_rank.load(in, &m_bv); + m_bv_select1.load(in, &m_bv); + m_bv_select0.load(in, &m_bv); + m_tree.load(in); + } + + //! Checks if the node is a leaf node + bool is_leaf(const node_type& v) const + { + return m_tree.is_leaf(v); + } + + //! Symbol for a leaf + value_type sym(const node_type& v) const + { + return m_tree.bv_pos_rank(v); + } + + bool empty(const node_type&) const + { + return true; + } + + //! Returns the root node + node_type root() const + { + return m_tree.root(); + } + + //! Returns the two child nodes of an inner node + /*! \param v An inner node of a wavelet tree. + * \return Return a pair of nodes (left child, right child). + * \pre !is_leaf(v) + */ + std::pair + expand(const node_type& v) const + { + return std::make_pair(m_tree.child(v,0), m_tree.child(v,1)); + } + + //! Returns for each range its left and right child ranges + /*! \param v An inner node of an wavelet tree. + * \param ranges A vector of ranges. Each range [s,e] + * has to be contained in v=[v_s,v_e]. + * \return A vector a range pairs. The first element of each + * range pair correspond to the original range + * mapped to the left child of v; the second element to the + * range mapped to the right child of v. + * \pre !is_leaf(v) and s>=v_s and e<=v_e + */ + std::pair + expand(const node_type& v, + const range_vec_type& ranges) const + { + auto ranges_copy = ranges; + return expand(v, std::move(ranges_copy)); + } + + //! Returns for each range its left and right child ranges + /*! \param v An inner node of an wavelet tree. + * \param ranges A vector of ranges. Each range [s,e] + * has to be contained in v=[v_s,v_e]. + * \return A vector a range pairs. The first element of each + * range pair correspond to the original range + * mapped to the left child of v; the second element to the + * range mapped to the right child of v. + * \pre !is_leaf(v) and s>=v_s and e<=v_e + */ + std::pair + expand(const node_type& v, + range_vec_type&& ranges) const + { + auto v_sp_rank = m_tree.bv_pos_rank(v); + range_vec_type res(ranges.size()); + size_t i = 0; + for (auto& r : ranges) { auto sp_rank = m_bv_rank(m_tree.bv_pos(v) + r.first); auto right_size = m_bv_rank(m_tree.bv_pos(v) + r.second + 1) - - sp_rank; + - sp_rank; auto left_size = (r.second-r.first+1)-right_size; auto right_sp = sp_rank - v_sp_rank; auto left_sp = r.first - right_sp; - return make_pair(range_type(left_sp, left_sp + left_size - 1), - range_type(right_sp, right_sp + right_size - 1)); - } - - //! return the path to the leaf for a given symbol - std::pair path(value_type c) const { - uint64_t path = m_tree.bit_path(c); - uint64_t path_len = path >> 56; - // reverse the path till we fix the ordering - path = bits::rev(path); - path = path >> (64-path_len); // remove the length - return {path_len,path}; - } - - //! Returns for a symbol c the next larger or equal symbol in the WT. - /*! \param c the symbol - * \return A pair. The first element of the pair consititues if - * a valid answer was found (true) or no valid answer (false) - * could be found. The second element contains the found symbol. - */ - std::pair symbol_gte(value_type c) const { - return m_tree.symbol_gte(c); - } - - //! Returns for a symbol c the previous smaller or equal symbol in the WT. - /*! \param c the symbol - * \return A pair. The first element of the pair consititues if - * a valid answer was found (true) or no valid answer (false) - * could be found. The second element contains the found symbol. - */ - std::pair symbol_lte(value_type c) const { - return m_tree.symbol_lte(c); + r = range_type(left_sp, left_sp + left_size - 1); + res[i++] = range_type(right_sp, right_sp + right_size - 1); } + return make_pair(ranges, std::move(res)); + } + + //! Returns for a range its left and right child ranges + /*! \param v An inner node of an wavelet tree. + * \param r A ranges [s,e], such that [s,e] is + * contained in v=[v_s,v_e]. + * \return A range pair. The first element of the + * range pair correspond to the original range + * mapped to the left child of v; the second element to the + * range mapped to the right child of v. + * \pre !is_leaf(v) and s>=v_s and e<=v_e + */ + std::pair + expand(const node_type& v, const range_type& r) const + { + auto v_sp_rank = m_tree.bv_pos_rank(v); + auto sp_rank = m_bv_rank(m_tree.bv_pos(v) + r.first); + auto right_size = m_bv_rank(m_tree.bv_pos(v) + r.second + 1) + - sp_rank; + auto left_size = (r.second-r.first+1)-right_size; + + auto right_sp = sp_rank - v_sp_rank; + auto left_sp = r.first - right_sp; + + return make_pair(range_type(left_sp, left_sp + left_size - 1), + range_type(right_sp, right_sp + right_size - 1)); + } + + //! return the path to the leaf for a given symbol + std::pair path(value_type c) const + { + uint64_t path = m_tree.bit_path(c); + uint64_t path_len = path >> 56; + // reverse the path till we fix the ordering + path = bits::rev(path); + path = path >> (64-path_len); // remove the length + return {path_len,path}; + } + + //! Returns for a symbol c the next larger or equal symbol in the WT. + /*! \param c the symbol + * \return A pair. The first element of the pair consititues if + * a valid answer was found (true) or no valid answer (false) + * could be found. The second element contains the found symbol. + */ + std::pair symbol_gte(value_type c) const + { + return m_tree.symbol_gte(c); + } + + //! Returns for a symbol c the previous smaller or equal symbol in the WT. + /*! \param c the symbol + * \return A pair. The first element of the pair consititues if + * a valid answer was found (true) or no valid answer (false) + * could be found. The second element contains the found symbol. + */ + std::pair symbol_lte(value_type c) const + { + return m_tree.symbol_lte(c); + } }; } -#endif +#endif \ No newline at end of file diff --git a/include/sdsl/wt_rlmn.hpp b/include/sdsl/wt_rlmn.hpp index d28d0d118..811afc9bd 100644 --- a/include/sdsl/wt_rlmn.hpp +++ b/include/sdsl/wt_rlmn.hpp @@ -44,16 +44,19 @@ struct wt_rlmn_trait { typedef int_vector<> C_type; typedef int_vector<> C_bf_rank_type; - static std::map temp_C() { + static std::map temp_C() + { return std::map(); } - static C_type init_C(std::map& C, uint64_t size) { + static C_type init_C(std::map& C, uint64_t size) + { uint64_t max_symbol = (--C.end())->first; return C_type(max_symbol+1, 0, bits::hi(size)+1); } - static C_bf_rank_type init_C_bf_rank(const C_type& C, uint64_t size) { + static C_bf_rank_type init_C_bf_rank(const C_type& C, uint64_t size) + { return C_bf_rank_type(C.size(), 0, bits::hi(size)+1); } }; @@ -64,15 +67,18 @@ struct wt_rlmn_trait { typedef int_vector<64> C_type; typedef int_vector<64> C_bf_rank_type; - static int_vector<64> temp_C() { + static int_vector<64> temp_C() + { return int_vector<64>(256, 0); } - static C_type init_C(C_type& C, uint64_t) { + static C_type init_C(C_type& C, uint64_t) + { return C; } - static C_bf_rank_type init_C_bf_rank(const C_type&, uint64_t) { + static C_bf_rank_type init_C_bf_rank(const C_type&, uint64_t) + { return int_vector<64>(256,0); } }; @@ -117,6 +123,7 @@ class wt_rlmn typedef wt_tag index_category; typedef typename t_wt::alphabet_category alphabet_category; enum { lex_ordered=false }; // TODO: is should be possible + enum { traversable=false }; enum { width = wt_rlmn_trait::width }; typedef typename wt_rlmn_trait::C_type C_type; @@ -144,7 +151,8 @@ class wt_rlmn // the prefixes m_bf[0..m_C[0]],m_bf[0..m_C[1]],....,m_bf[0..m_C[255]]; // named C_s in the original paper - void copy(const wt_rlmn& wt) { + void copy(const wt_rlmn& wt) + { m_size = wt.m_size; m_bl = wt.m_bl; m_bf = wt.m_bf; @@ -173,7 +181,8 @@ class wt_rlmn * \param size The length of the prefix of the text, for which * the wavelet tree should be build. */ - wt_rlmn(int_vector_buffer& text_buf, size_type size):m_size(size) { + wt_rlmn(int_vector_buffer& text_buf, size_type size):m_size(size) + { std::string temp_file = text_buf.filename() + + "_wt_rlmn_" + util::to_string(util::pid()) + "_" + util::to_string(util::id()); @@ -234,17 +243,20 @@ class wt_rlmn } //! Copy constructor - wt_rlmn(const wt_rlmn& wt) { + wt_rlmn(const wt_rlmn& wt) + { copy(wt); } //! Move constructor - wt_rlmn(wt_rlmn&& wt) { + wt_rlmn(wt_rlmn&& wt) + { *this = std::move(wt); } //! Assignment operator - wt_rlmn& operator=(const wt_rlmn& wt) { + wt_rlmn& operator=(const wt_rlmn& wt) + { if (this != &wt) { copy(wt); } @@ -252,7 +264,8 @@ class wt_rlmn } //! Assignment move operator - wt_rlmn& operator=(wt_rlmn&& wt) { + wt_rlmn& operator=(wt_rlmn&& wt) + { if (this != &wt) { m_size = std::move(wt.m_size); m_bl = std::move(wt.m_bl); @@ -273,7 +286,8 @@ class wt_rlmn } //! Swap operator - void swap(wt_rlmn& wt) { + void swap(wt_rlmn& wt) + { if (this != &wt) { std::swap(m_size, wt.m_size); m_bl.swap(wt.m_bl); @@ -300,12 +314,14 @@ class wt_rlmn } //! Returns the size of the original vector. - size_type size()const { + size_type size()const + { return m_size; } //! Returns whether the wavelet tree contains no data. - bool empty()const { + bool empty()const + { return 0 == m_size; } @@ -316,7 +332,8 @@ class wt_rlmn * \f$ \Order{H_0} \f$ on average, where \f$ H_0 \f$ is the * zero order entropy of the sequence */ - value_type operator[](size_type i)const { + value_type operator[](size_type i)const + { assert(i < size()); return m_wt[m_bl_rank(i+1)-1]; }; @@ -330,7 +347,8 @@ class wt_rlmn * \f$ \Order{H_0} \f$ on average, where \f$ H_0 \f$ is the * zero order entropy of the sequence */ - size_type rank(size_type i, value_type c)const { + size_type rank(size_type i, value_type c)const + { assert(i <= size()); if (i == 0) return 0; @@ -354,7 +372,8 @@ class wt_rlmn * \f$ \Order{H_0} \f$ */ std::pair - inverse_select(size_type i)const { + inverse_select(size_type i)const + { assert(i < size()); if (i == 0) { return std::make_pair(0, m_wt[0]); @@ -381,7 +400,8 @@ class wt_rlmn * \f$ \Order{H_0} \f$ on average, where \f$ H_0 \f$ is the zero order * entropy of the sequence */ - size_type select(size_type i, value_type c)const { + size_type select(size_type i, value_type c)const + { assert(i > 0); assert(i <= rank(size(), c)); size_type c_runs = m_bf_rank(m_C[c]+i) - m_C_bf_rank[c]; @@ -390,18 +410,21 @@ class wt_rlmn }; //! Returns a const_iterator to the first element. - const_iterator begin()const { + const_iterator begin()const + { return const_iterator(this, 0); } //! Returns a const_iterator to the element after the last element. - const_iterator end()const { + const_iterator end()const + { return const_iterator(this, size()); } //! Serializes the data structure into the given ostream size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, - std::string name="")const { + std::string name="")const + { structure_tree_node* child = structure_tree::add_child( v, name, util::class_name(*this)); size_type written_bytes = 0; @@ -420,7 +443,8 @@ class wt_rlmn } //! Loads the data structure from the given istream. - void load(std::istream& in) { + void load(std::istream& in) + { read_member(m_size, in); m_bl.load(in); m_bf.load(in); diff --git a/include/sdsl/wt_topk.hpp b/include/sdsl/wt_topk.hpp new file mode 100644 index 000000000..acf5b9265 --- /dev/null +++ b/include/sdsl/wt_topk.hpp @@ -0,0 +1,294 @@ +/* sdsl - succinct data structures library + Copyright (C) 2013 Simon Gog + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see http://www.gnu.org/licenses/ . +*/ +/*! \file wt_topk.hpp + \brief wt_topk.hpp contains a class for the wavelet tree plus rmq data structure. + This data structure is based on the solution for solving top-k queries on grids by + G.Navarro, Y. Nekrich and L. Russo, Space-Efficient Data-Analysis Queries on Grids. + Theoretical Computer Science 482:60-72, 2013 + + \author Simon Gog, Roberto Konow +*/ + +#ifndef INCLUDED_SDSL_WT_TOPK +#define INCLUDED_SDSL_WT_TOPK + +#include "sdsl/vectors.hpp" +#include "sdsl/wavelet_trees.hpp" +#include "sdsl/wt_algorithm.hpp" +#include "sdsl/rmq_support.hpp" +#include "sdsl/wt_topk_algorithm.hpp" +#include +#include +#include +#include +#include + +namespace sdsl +{ + +typedef std::complex _point_type; +typedef std::pair<_point_type, uint64_t> _point_val_type; + + +template +struct _permute { + template + static _point_val_type x(const t_wt& wt, t_w weight, t_c c, t_pos cpos, t_pos cbegin) + { + return {{wt.select(cpos-cbegin+1, c)}, weight}; + } +}; + +template<> +struct _permute { + template + static _point_val_type x(const t_wt&, t_w weight, t_c c, t_pos cpos, t_pos) + { + return {{cpos, c}, weight}; + } +}; + +template, + typename t_rmq=rmq_succinct_sct, + typename t_weight_vec=dac_vector<>, + bool t_perm_x=false> + +class wt_topk +{ + public: + typedef std::complex point_type; + typedef int_vector<>::size_type size_type; + typedef sdsl::range_type range_type; + + class top_k_iterator + { + public: + typedef void(*t_mfptr)(); + typedef std::pair t_point_val; + // state consists of weight, range, index of minimum in range, y-value, begin y-interval in sorted sequence + typedef std::array t_state; + + private: + const wt_topk* m_topk; + std::priority_queue m_pq; + t_point_val m_point_val; + bool m_valid = false; + public: + top_k_iterator() = default; + top_k_iterator(const top_k_iterator&) = default; + top_k_iterator(top_k_iterator&&) = default; + top_k_iterator& operator=(const top_k_iterator&) = default; + top_k_iterator& operator=(top_k_iterator&&) = default; + top_k_iterator(const wt_topk& topk, point_type p1, point_type p2) : + m_topk(&topk), m_valid(topk.size()>0) + { + if (m_topk->size() > 0) { + auto iv_it = map_to_sorted_sequence(m_topk->m_wt, + {real(p1), real(p2)}, {imag(p1), imag(p2)}); + while (iv_it) { + auto r = std::get<1>(*iv_it); + uint64_t max_idx = m_topk->m_rmq(std::get<0>(r), std::get<1>(r)); + uint64_t max_w = m_topk->m_weights[max_idx]; + m_pq.push({max_w, std::get<0>(r), std::get<1>(r), max_idx, + std::get<0>(*iv_it), std::get<2>(*iv_it) + }); + ++iv_it; + } + ++(*this); + } + } + + //! Prefix increment of the iterator + top_k_iterator& operator++() + { + auto push_node = [this](uint64_t begin, uint64_t end, uint64_t y, uint64_t offset) { + if (end > begin) { + uint64_t max_idx = m_topk->m_rmq(begin, end-1); + uint64_t max_w = m_topk->m_weights[max_idx]; + m_pq.push({max_w, begin, end-1, max_idx, y, offset}); + } + }; + m_valid = false; + if (!m_pq.empty()) { + auto s = m_pq.top(); + + m_point_val = _permute::x(m_topk->m_wt, s[0], s[4], s[3], s[5]); + + m_pq.pop(); + push_node(s[1], s[3], s[4],s[5]); + push_node(s[3]+1, s[2]+1, s[4], s[5]); + m_valid = true; + } + return *this; + } + + //! Postfix increment of the iterator + top_k_iterator operator++(int) + { + top_k_iterator it = *this; + ++(*this); + return it; + } + + t_point_val operator*() const + { + return m_point_val; + } + + operator t_mfptr() const + { + return (t_mfptr)(m_valid); + } + }; + + private: + t_wt m_wt; // wavelet tree over y-values + t_weight_vec m_weights; // array for weights + t_rmq m_rmq; // range maximum query structure on top of the weights + + public: + enum { permuted_x = t_perm_x }; + + wt_topk() = default; + + wt_topk(const wt_topk& tr) = default; + wt_topk(wt_topk&& tr) = default; + wt_topk& operator=(const wt_topk& tr) = default; + wt_topk& operator=(wt_topk&& tr) = default; + + //! Number of points in the grid + size_type size() const + { + return m_wt.size(); + } + + void swap(wt_topk& tr) + { + if (this != &tr) { + m_wt.swap(tr.m_wt); + m_rmq.swap(tr.m_rmq); + m_weights.swap(tr.m_weights); + } + } + + wt_topk(int_vector_buffer<>&, + int_vector_buffer<>& buf_y, + int_vector_buffer<>& buf_w) + { + std::string temp_file = buf_w.filename() + + + "_wt_topk_" + std::to_string(util::pid()) + + "_" + std::to_string(util::id()); + // (1) Calculate permuted weight vector + { + int_vector<> perm = sorted_perm(buf_y); + int_vector<> weights; + load_from_file(weights, buf_w.filename()); + + int_vector_buffer<> permuted_weights(temp_file, std::ios::out); + for (size_type i=0; i permuted_weights; + load_from_file(permuted_weights, temp_file); + m_rmq = t_rmq(&permuted_weights); + } + // (3) Construct the WT over y values + m_wt = t_wt(buf_y, buf_y.size()); // construct the wavelet trees over the y-values + // (4) Load the permuted weights and delete temporary file + { + int_vector_buffer<> permuted_weights(temp_file); + m_weights = t_weight_vec(permuted_weights); + } + sdsl::remove(temp_file); + } + + top_k_iterator topk(point_type p1, point_type p2) const + { + return top_k_iterator(*this, p1, p2); + } + + + //! Serializes the data structure into the given ostream + size_type serialize(std::ostream& out, structure_tree_node* v=nullptr, + std::string name="") const + { + structure_tree_node* child = structure_tree::add_child( + v, name, util::class_name(*this)); + size_type written_bytes = 0; + written_bytes += m_wt.serialize(out, child, "wt"); + written_bytes += m_rmq.serialize(out, child, "rmq"); + written_bytes += m_weights.serialize(out, child, "weights"); + structure_tree::add_size(child, written_bytes); + return written_bytes; + } + + //! Loads the data structure from the given istream. + void load(std::istream& in) + { + m_wt.load(in); + m_rmq.load(in); + m_weights.load(in); + } + + // Count how many points are in the rectangle (p1,p2) + /*! \param p1 Lower left corner of the rectangle. + * \param p2 Upper right corner of the rectangle. + * \return The number of points in rectangle (p1,p2). + * \pre real(p1) <= real(p2) and imag(p1)<=imag(p2) + */ + uint64_t count(point_type p1, point_type p2) const + { + const range_type x_r(real(p1), real(p2)); + const range_type y_r(imag(p1), imag(p2)); + return sdsl::count(m_wt, x_r, y_r); + } + + void print_info() const + { + std::cout<<"m_wt ="< buf_y(file+".y", std::ios::in); + int_vector_buffer<> buf_w(file+".w", std::ios::in); + wt_topk tmp(buf_x, buf_y, buf_w); + tmp.swap(idx); +} + +//! Specialized version of method ,,construct_im'' for wt_topk +template +void +construct_im(wt_topk& idx, std::vector> data) +{ + std::string y_file = ram_file_name(std::to_string(util::pid())+"_"+std::to_string(util::id())); + std::string w_file = ram_file_name(std::to_string(util::pid())+"_"+std::to_string(util::id())); + { + int_vector<> y(data.size()); + int_vector<> w(data.size()); + for (size_t i=0; i y_buf(y_file); + int_vector_buffer<> w_buf(w_file); + wt_topk tmp(y_buf, y_buf, w_buf); + tmp.swap(idx); +} + +//! Count how many points are in the rectangle (p1,p2) +template +uint64_t +count(const wt_topk& wt, + typename wt_topk::point_type p1, + typename wt_topk::point_type p2) +{ + return wt.count(p1, p2); +} + + + +} + +#endif diff --git a/lib/construct_lcp.cpp b/lib/construct_lcp.cpp index 0b897872b..72525d67d 100644 --- a/lib/construct_lcp.cpp +++ b/lib/construct_lcp.cpp @@ -644,11 +644,6 @@ void construct_lcp_bwt_based(cache_config& config) size_type use_queue_and_wt = n/2048; // if intervals < use_queue_and_wt, then use queue and wavelet tree // else use dictionary and wavelet tree - size_type quantity; // quantity of characters in interval - std::vector cs(wt_bwt.sigma); // list of characters in the interval - std::vector rank_c_i(wt_bwt.sigma);// number of occurrence of character in [0 .. i-1] - std::vector rank_c_j(wt_bwt.sigma);// number of occurrence of character in [0 .. j-1] - // Calculate how many bit are for each lcp value available, to limit the memory usage to 20n bit = 2,5n byte, use at moste 8 bit size_type bb = (n*20-size_in_bytes(wt_bwt)*8*1.25-5*n)/n; // 20n - size of wavelet tree * 1.25 for rank support - 5n for bit arrays - n for index_done array if (n*20 < size_in_bytes(wt_bwt)*8*1.25+5*n) { @@ -680,11 +675,11 @@ void construct_lcp_bwt_based(cache_config& config) // calculate first intervals partial_lcp[0] = 0; index_done[0] = true; - interval_symbols(wt_bwt, 0, n, quantity, cs, rank_c_i, rank_c_j); - for (size_type i=0; i(*y_it); + size_type a_new = C[c] + std::get<1>(*y_it); + size_type b_new = C[c] + std::get<2>(*y_it); // Save LCP value if not seen before if (!index_done[b_new]) { @@ -740,11 +735,11 @@ void construct_lcp_bwt_based(cache_config& config) size_type b = q.front(); q.pop(); --intervals; - interval_symbols(wt_bwt, a, b, quantity, cs, rank_c_i, rank_c_j); - for (size_type i=0; i(*y_it); + size_type a_new = C[c] + std::get<1>(*y_it); + size_type b_new = C[c] + std::get<2>(*y_it); // Save LCP value if not seen before if (!index_done[b_new] and phase == 0) { @@ -774,11 +769,11 @@ void construct_lcp_bwt_based(cache_config& config) size_type b2 = util::next_bit(dict[source], a2+1); while (b2 < dict[source].size()) { - interval_symbols(wt_bwt, ((a2-1)>>1), (b2>>1), quantity, cs, rank_c_i, rank_c_j); - for (size_type i=0; i>1), (b2>>1)); + for (size_type i=0; y_it; ++i, ++y_it) { + unsigned char c = std::get<0>(*y_it); + size_type a_new = C[c] + std::get<1>(*y_it); + size_type b_new = C[c] + std::get<2>(*y_it); // Save LCP value if not seen before if (!index_done[b_new] and phase == 0) { partial_lcp[b_new] = lcp_value; @@ -874,11 +869,6 @@ void construct_lcp_bwt_based2(cache_config& config) size_type use_queue_and_wt = n/2048; // if intervals < use_queue_and_wt, then use queue and wavelet tree // else use dictionary and wavelet tree - size_type quantity; // quantity of characters in interval - std::vector cs(wt_bwt.sigma); // list of characters in the interval - std::vector rank_c_i(wt_bwt.sigma); // number of occurrence of character in [0 .. i-1] - std::vector rank_c_j(wt_bwt.sigma); // number of occurrence of character in [0 .. j-1] - // External storage of LCP-Positions-Array bool new_lcp_value = false; uint8_t int_width = bits::hi(n)+2; @@ -902,11 +892,11 @@ void construct_lcp_bwt_based2(cache_config& config) index_done[0] = true; // calculate first intervals - interval_symbols(wt_bwt, 0, n, quantity, cs, rank_c_i, rank_c_j); - for (size_type i=0; i(*y_it); + size_type a_new = C[c] + std::get<1>(*y_it); + size_type b_new = C[c] + std::get<2>(*y_it); // Save LCP value and corresponding interval if not seen before if (!index_done[b_new]) { @@ -967,11 +957,11 @@ void construct_lcp_bwt_based2(cache_config& config) size_type b = q.front(); q.pop(); --intervals; - interval_symbols(wt_bwt, a, b, quantity, cs, rank_c_i, rank_c_j); - for (size_type i=0; i(*y_it); + size_type a_new = C[c] + std::get<1>(*y_it); + size_type b_new = C[c] + std::get<2>(*y_it); // Save LCP value and corresponding interval if not seen before if (!index_done[b_new]) { @@ -1000,11 +990,11 @@ void construct_lcp_bwt_based2(cache_config& config) size_type b2 = util::next_bit(dict[source], a2+1); while (b2 < dict[source].size()) { - interval_symbols(wt_bwt, ((a2-1)>>1), (b2>>1), quantity, cs, rank_c_i, rank_c_j); - for (size_type i=0; i>1), (b2>>1)); + for (size_type i=0; y_it; ++i, ++y_it) { + unsigned char c = std::get<0>(*y_it); + size_type a_new = C[c] + std::get<1>(*y_it); + size_type b_new = C[c] + std::get<2>(*y_it); // Save LCP value if not seen before if (!index_done[b_new]) { // Save position of LCP-value diff --git a/test/K2TreapTest.cpp b/test/K2TreapTest.cpp index e2d070600..e50f9f87c 100644 --- a/test/K2TreapTest.cpp +++ b/test/K2TreapTest.cpp @@ -98,7 +98,7 @@ TYPED_TEST(K2TreapTest, SizeAndTopk) maxy = *max_element(y.begin(), y.end()); } uint64_t minx=0, miny=0; - topk_test(k2treap, {minx,maxx}, {miny,maxy}, x, y, w); + topk_test(k2treap, {minx,miny}, {maxx,maxy}, x, y, w); if (x.size() > 0) { std::mt19937_64 rng; diff --git a/test/Makefile b/test/Makefile index 119b88d82..4cc440f32 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,5 +1,6 @@ +SHELL=bash include ../Make.helper -CXX_FLAGS=$(MY_CXX_FLAGS) -Wall -Werror -Wunused-parameter -g -O3 \ +CXX_FLAGS=$(MY_CXX_FLAGS) -g -O3 \ -I$(INC_DIR) \ -I../build/external/gtest-1.6.0/include \ -L$(LIB_DIR) \ @@ -54,6 +55,9 @@ SEARCH_BIDIRECTIONAL_TESTS:=$(patsubst %,search-bidirectional-test/%,$(SEARCH_BI K2TREAP_TC_PATH:=$(call config_column,K2TreapTest.config,1) K2TREAP_TESTS:=$(patsubst %,k2treap-test/%,$(K2TREAP_TC_PATH)) +WTTOPK_TC_PATH:=$(call config_column,WtTopkTest.config,1) +WTTOPK_TESTS:=$(patsubst %,wttopk-test/%,$(WTTOPK_TC_PATH)) + TC_PATHS:= $(WT_BYTE_TC_PATHS) $(WT_INT_TC_PATHS) \ $(CSA_BYTE_TC_PATHS) $(CSA_INT_TC_PATHS) \ $(CST_BYTE_TC_PATHS) $(CST_INT_TC_PATHS) @@ -83,7 +87,8 @@ test: bits-test \ search-bidirectional-test \ sorted-int-stack-test \ nn-dict-dynamic-test \ - k2treap-test + k2treap-test \ + wttopk-test build-test: $(EXECS) cd ../tutorial; make build-test @@ -160,6 +165,9 @@ search-bidirectional-test: generators $(SEARCH_BIDIRECTIONAL_TESTS) k2treap-test: ./K2TreapTest.x generators $(K2TREAP_TESTS) +wttopk-test: ./WtTopkTest.x generators $(WTTOPK_TESTS) + + bit-vector-test/test_cases/%: ./BitVectorTest.x test_cases/% @echo "TEST_CASE: test_cases/$*" @$(PREFIX) ./BitVectorTest.x test_cases/$* @@ -242,6 +250,20 @@ k2treap-test/%: ./K2TreapTest.x @echo "TEST_CASE: test_cases/$* in-memory" $(PREFIX) ./K2TreapTest.x test_cases/K2T.$* $(TMP_DIR)/K2T.$* in-memory +wttopk-test/%: ./WtTopkTest.x + $(eval X:=$(call config_select,WtTopkTest.config,$*,2)) + $(eval Y:=$(call config_select,WtTopkTest.config,$*,3)) + $(eval W:=$(call config_select,WtTopkTest.config,$*,4)) + make $(X) $(Y) $(W) + @echo "X=$X, Y=$Y, W=$W" + mv $(X) test_cases/WTK.$*.x + mv $(Y) test_cases/WTK.$*.y + mv $(W) test_cases/WTK.$*.w + @echo "TEST_CASE: $* semi-external" + $(PREFIX) ./WtTopkTest.x test_cases/WTK.$* $(TMP_DIR)/WTK.$* + @echo "TEST_CASE: test_cases/$* in-memory" + $(PREFIX) ./WtTopkTest.x test_cases/WTK.$* $(TMP_DIR)/WTK.$* in-memory + # Format: test_cases/int-vec.[LEN].[WIDTH].[DEFAULT].[SEED] test_cases/int-vec.%: IntVectorGenerator.x @echo "Generate input test_cases/int-vec.$*" @@ -270,6 +292,17 @@ test_cases/int-vec-k2t.%: IntVectorGenerator.x $(eval SEED:=$(call dim,4,$*)) @./IntVectorGenerator.x $@ $(LEN) $(WIDTH) $(DEFAULT) $(SEED) +# Format: test_cases/int-vec-wtk.[LEN].[WIDTH].[DEFAULT].[SEED].[x|y|w] +test_cases/int-vec-wtk.%: IntVectorGenerator.x + @echo "Generate input test_cases/int-vec-wtk.$*" + $(eval LEN:=$(call dim,1,$*)) + $(eval WIDTH:=$(call dim,2,$*)) + $(eval DEFAULT:=$(call dim,3,$*)) + $(eval SEED:=$(call dim,4,$*)) + @./IntVectorGenerator.x $@ $(LEN) $(WIDTH) $(DEFAULT) + + + test_cases/bit-vec.%: BitVectorGenerator.x @echo "Generate input test_cases/bit-vec.$*" @./BitVectorGenerator.x $@ $* diff --git a/test/WtByteTest.cpp b/test/WtByteTest.cpp index 0d5adc7e0..5cddf4511 100644 --- a/test/WtByteTest.cpp +++ b/test/WtByteTest.cpp @@ -177,52 +177,47 @@ TYPED_TEST(WtByteTest, InverseSelect) template void -test_interval_symbols(typename std::enable_if::value), - t_wt>::type&) +test_ys_in_x_range(typename std::enable_if::value), + t_wt>::type&) { - // interval_symbols not implemented + // ys_in_x_range not implemented } template void -test_interval_symbols(typename std::enable_if::value, - t_wt>::type& wt) +test_ys_in_x_range(typename std::enable_if::value, + t_wt>::type& wt) { - - typedef typename t_wt::value_type value_type; ASSERT_TRUE(load_from_file(wt, temp_file)); - int_vector<8> text; - ASSERT_TRUE(load_vector_from_file(text, test_file, 1)); std::mt19937_64 rng; std::uniform_int_distribution distribution(0, wt.size()); auto dice = bind(distribution, rng); - size_type k; - std::vector cs(wt.sigma); - std::vector rank_c_i(wt.sigma); - std::vector rank_c_j(wt.sigma); for (size_type t=0; t<(wt.size()/100+100); ++t) { + vector cs; size_type i = dice(), j = dice(); if (i>j) { std::swap(j,i); } - interval_symbols(wt, i, j, k, cs, rank_c_i, rank_c_j); + auto y_it = ys_in_x_range(wt, i, j); size_type symbols = (j-i); - for (size_type m = 0; m(*y_it); + cs.push_back(c); + ASSERT_EQ(wt.rank(i, c), get<1>(*y_it)); + ASSERT_EQ(wt.rank(j, c), get<2>(*y_it)); + ASSERT_LT((size_type)0, get<2>(*y_it) - get<1>(*y_it)); + symbols -= (get<2>(*y_it) - get<1>(*y_it)); if (m>0 and t_wt::lex_ordered) { - ASSERT_LT(cs[m-1],cs[m]); + ASSERT_LT(cs[m-1],cs[m])<<"m="<(res2)) << "lex_smaller_count(" << i << "," << c << ")"; + ASSERT_EQ(rank_c_i_n[c], get<0>(res2)) << "lex_smaller_count(" << i << "," << c << ")"; + ASSERT_EQ(num_i_s, get<1>(res2)) << "lex_smaller_count(" << i << "," << c << ")"; num_i_s += rank_c_i_n[c]; auto res3 = wt.lex_smaller_count(j, (value_type)c); - ASSERT_EQ(rank_c_j_n[c], std::get<0>(res3)) << "lex_smaller_count(" << i << "," << c << ")"; - ASSERT_EQ(num_j_s, std::get<1>(res3)) << "lex_smaller_count(" << i << "," << c << ")"; + ASSERT_EQ(rank_c_j_n[c], get<0>(res3)) << "lex_smaller_count(" << i << "," << c << ")"; + ASSERT_EQ(num_j_s, get<1>(res3)) << "lex_smaller_count(" << i << "," << c << ")"; num_j_s += rank_c_j_n[c]; } } diff --git a/test/WtIntTest.cpp b/test/WtIntTest.cpp index 9d4ce08e1..26b2dfcfb 100644 --- a/test/WtIntTest.cpp +++ b/test/WtIntTest.cpp @@ -178,60 +178,56 @@ TYPED_TEST(WtIntTest, LoadAndInverseSelect) template void -test_interval_symbols(typename enable_if::value), - t_wt>::type&) +test_ys_in_x_range(typename enable_if::type&) { - // interval_symbols not implemented + // ys_in_x_range not implemented } template void -test_interval_symbols(typename enable_if::value, - t_wt>::type& wt) +test_ys_in_x_range(typename enable_if::type& wt) { ASSERT_TRUE(load_from_file(wt, temp_file)); - size_type k = 0; - vector rank_c_i(wt.sigma); - vector rank_c_j(wt.sigma); - vector::value_type> cs(wt.sigma); - mt19937_64 rng; for (size_type n=1; n<4; ++n) { uniform_int_distribution distribution(0, n*n*n*10); auto dice = bind(distribution, rng); for (size_type i=0, j=0; i < wt.size(); i=j) { + vector cs; j = min(wt.size(),i+dice()); - interval_symbols(wt, i, j, k, cs, rank_c_i, rank_c_j); + auto y_it = ys_in_x_range(wt, i, j); size_type symbols = (j-i); - for (size_type m = 0; m(*y_it); + cs.emplace_back(c); + ASSERT_EQ(wt.rank(i, c), get<1>(*y_it)); + ASSERT_EQ(wt.rank(j, c), get<2>(*y_it)); + ASSERT_LT((size_type)0, get<2>(*y_it)-get<1>(*y_it)); + symbols -= (get<2>(*y_it)-get<1>(*y_it)); if (m>0 and t_wt::lex_ordered) { - ASSERT_LT(cs[m-1],cs[m]); + ASSERT_LT(cs[m-1],cs[m])<<"m="< +#include +#include +#include // for std::min. std::sort +#include + +namespace +{ + +using namespace sdsl; +using namespace std; + +typedef int_vector<>::size_type size_type; +typedef tuple t_xyw; + +string test_file; +string temp_file; +bool in_memory; + +template +class WtTopkTest : public ::testing::Test { }; + +using testing::Types; + +typedef Types< +<<<<<<< HEAD +wt_topk, rmq_succinct_sct, dac_vector<> > +======= +wt_topk<> +>>>>>>> wt_topk +> Implementations; + +TYPED_TEST_CASE(WtTopkTest, Implementations); + +TYPED_TEST(WtTopkTest, CreateAndStoreTest) +{ +<<<<<<< HEAD + TypeParam topk_wt; + construct(topk_wt, test_file); + ASSERT_TRUE(store_to_file(topk_wt, temp_file)); +} + +struct my_xyw_comp { + bool operator()(const t_xyw& a, const t_xyw& b) const + { + if (get<2>(a) != get<2>(b)) + return get<2>(a) > get<2>(b); + else if (get<0>(a) != get<0>(b)) + return get<0>(a) < get<0>(b); + return get<1>(a) < get<1>(b); + } +}; + +template +void topk_test( + const t_topk_wt& topk_wt, +======= + TypeParam wttopk; + construct(wttopk, test_file); + ASSERT_TRUE(store_to_file(wttopk, temp_file)); +} + + +template +void topk_test( + const t_wttopk& wttopk, +>>>>>>> wt_topk + complex min_xy, + complex max_xy, + const int_vector<>& x, + const int_vector<>& y, + const int_vector<>& w) +{ +<<<<<<< HEAD + auto res_it = top_k(topk_wt, {real(min_xy),imag(min_xy)}, {real(max_xy),imag(max_xy)}); +======= + auto res_it = top_k(wttopk, {real(min_xy),imag(min_xy)}, {real(max_xy),imag(max_xy)}); + typedef tuple t_xyw; +>>>>>>> wt_topk + vector vec; + for (uint64_t i = 0; i < x.size(); ++i) { + if (x[i] >= real(min_xy) and x[i] <= real(max_xy) + and y[i] >= imag(min_xy) and y[i] <= imag(max_xy)) { + vec.emplace_back(x[i], y[i], w[i]); + } + } +<<<<<<< HEAD + sort(vec.begin(), vec.end(), my_xyw_comp()); + uint64_t cnt = 0; + vector vec2; + while (res_it) { + ASSERT_TRUE(cnt < vec.size()); + auto p = *res_it; + vec2.emplace_back(real(p.first), imag(p.first), p.second); + if (vec2.size() > 1) { + EXPECT_TRUE(get<2>(vec2[vec2.size()-2]) >= get<2>(vec2[vec2.size()-1])) + << get<2>(vec2[vec2.size()-2]) <<" < " << get<2>(vec2[vec2.size()-1]); + } + ++res_it; + ++cnt; +======= + sort(vec.begin(), vec.end(), [](const t_xyw& a, const t_xyw& b) { + return get<2>(a) > get<2>(b); + }); + + uint64_t cnt = 0; + if (vec.size() != 0) { + while (res_it) { + ASSERT_TRUE(cnt < vec.size()); + auto p = *res_it; + ASSERT_EQ(get<2>(vec[cnt]), p.second); + ++res_it; + ++cnt; + } +>>>>>>> wt_topk + } + ASSERT_FALSE(res_it); + sort(vec2.begin(), vec2.end(), my_xyw_comp()); + ASSERT_EQ(vec.size(), vec2.size()); + for (size_t i=0; i>>>>>> wt_topk + uint64_t maxx=0, maxy=0; + if (x.size() > 0) { + maxx = *max_element(x.begin(), x.end()); + maxy = *max_element(y.begin(), y.end()); + } + uint64_t minx=0, miny=0; +<<<<<<< HEAD + topk_test(topk_wt, {minx,miny}, {maxx,maxy}, x, y, w); +======= + topk_test(wttopk, {minx,maxx}, {miny,maxy}, x, y, w); +>>>>>>> wt_topk + + if (x.size() > 0) { + std::mt19937_64 rng; + std::uniform_int_distribution distribution(0, x.size()-1); + auto dice = bind(distribution, rng); + for (size_t i=0; i<20; ++i) { + auto idx = dice(); + uint64_t xx = x[idx]; + uint64_t yy = y[idx]; + uint64_t dd = 20; + uint64_t minx=0, miny=0, maxx=xx+dd, maxy=yy+dd; + if (xx >= dd) + minx = xx - dd; + if (y >= dd) + miny = yy - dd; +<<<<<<< HEAD + topk_test(topk_wt, {minx, miny}, {maxx,maxy}, x, y, w); +======= + topk_test(wttopk, {minx, miny}, {maxx,maxy}, x, y, w); +>>>>>>> wt_topk + } + } +} +// RK: Commenting this one, as we may implement range_3d +// +//template +//void range3d_test( +// const t_wttopk& wttopk, +// complex min_xy, +// complex max_xy, +// complex z, +// const int_vector<>& x, +// const int_vector<>& y, +// const int_vector<>& w) +//{ +// auto res_it = range_3d(wttopk, {real(min_xy),imag(min_xy)}, +// {real(max_xy),imag(max_xy)}, +// {real(z), imag(z)}); +// typedef tuple t_xyw; +// vector vec; +// for (uint64_t i = 0; i < x.size(); ++i) { +// if (x[i] >= real(min_xy) and x[i] <= real(max_xy) +// and y[i] >= imag(min_xy) and y[i] <= imag(max_xy)) { +// vec.emplace_back(x[i], y[i], w[i]); +// } +// } +// sort(vec.begin(), vec.end(), [](const t_xyw& a, const t_xyw& b) { +// if (get<2>(a) != get<2>(b)) +// return get<2>(a) > get<2>(b); +// else if (get<0>(a) != get<0>(b)) +// return get<0>(a) < get<0>(b); +// return get<1>(a) < get<1>(b); +// }); +// uint64_t cnt = 0; +// while (res_it) { +// ASSERT_TRUE(cnt < vec.size()); +// auto p = *res_it; +// ASSERT_EQ(get<2>(vec[cnt]), p.second); +// ASSERT_EQ(get<0>(vec[cnt]), real(p.first)); +// ASSERT_EQ(get<1>(vec[cnt]), imag(p.first)); +// ++res_it; +// ++cnt; +// } +// ASSERT_FALSE(res_it); +//} +// +//TYPED_TEST(WtTopkTest, Range3d) +//{ +// TypeParam wttopk; +// ASSERT_TRUE(load_from_file(wttopk, temp_file)); +// int_vector<> x,y,w; +// ASSERT_TRUE(load_from_file(x, test_file+".x")); +// ASSERT_TRUE(load_from_file(y, test_file+".y")); +// ASSERT_EQ(x.size(), y.size()); +// ASSERT_TRUE(load_from_file(w, test_file+".w")); +// ASSERT_EQ(x.size(), w.size()); +// ASSERT_EQ(x.size(), k2treap.size()); +// if (x.size() > 0) { +// std::mt19937_64 rng; +// std::uniform_int_distribution distribution(0, x.size()-1); +// auto dice = bind(distribution, rng); +// for (size_t i=0; i<20; ++i) { +// auto idx = dice(); +// uint64_t xx = x[idx]; +// uint64_t yy = y[idx]; +// uint64_t ww = w[idx]; +// uint64_t dd = 20; +// uint64_t dw = 100; +// uint64_t minx=0, miny=0, maxx=xx+dd, maxy=yy+dd, minw=0, maxw=ww+dw; +// if (xx >= dd) +// minx = xx - dd; +// if (yy >= dd) +// miny = yy - dd; +// if (ww >= dw) +// minw = ww - dw; +// range3d_test(wttopk, {minx, miny}, {maxx,maxy}, {minw,maxw}, x, y, w); +// } +// } +//} + +<<<<<<< HEAD +/* + +template +void range3d_test( + const t_topk_wt& topk_wt, + complex min_xy, + complex max_xy, + complex z, + const int_vector<>& x, + const int_vector<>& y, + const int_vector<>& w) +{ + auto res_it = range_3d(topk_wt, {real(min_xy),imag(min_xy)}, + {real(max_xy),imag(max_xy)}, + {real(z), imag(z)}); + typedef tuple t_xyw; + vector vec; + for (uint64_t i = 0; i < x.size(); ++i) { + if (x[i] >= real(min_xy) and x[i] <= real(max_xy) + and y[i] >= imag(min_xy) and y[i] <= imag(max_xy)) { + vec.emplace_back(x[i], y[i], w[i]); + } + } + sort(vec.begin(), vec.end(), [](const t_xyw& a, const t_xyw& b) { + if (get<2>(a) != get<2>(b)) + return get<2>(a) > get<2>(b); + else if (get<0>(a) != get<0>(b)) + return get<0>(a) < get<0>(b); + return get<1>(a) < get<1>(b); + }); + uint64_t cnt = 0; + while (res_it) { + ASSERT_TRUE(cnt < vec.size()); + auto p = *res_it; + ASSERT_EQ(get<2>(vec[cnt]), p.second); + ASSERT_EQ(get<0>(vec[cnt]), real(p.first)); + ASSERT_EQ(get<1>(vec[cnt]), imag(p.first)); + ++res_it; + ++cnt; + } + ASSERT_FALSE(res_it); +} +*/ + +/* +TYPED_TEST(WtTopkTest, Range3d) +{ + TypeParam topk_wt; + ASSERT_TRUE(load_from_file(topk_wt, temp_file)); + int_vector<> x,y,w; + ASSERT_TRUE(load_from_file(x, test_file+".x")); + ASSERT_TRUE(load_from_file(y, test_file+".y")); + ASSERT_EQ(x.size(), y.size()); + ASSERT_TRUE(load_from_file(w, test_file+".w")); + ASSERT_EQ(x.size(), w.size()); + ASSERT_EQ(x.size(), topk_wt.size()); + if (x.size() > 0) { + std::mt19937_64 rng; + std::uniform_int_distribution distribution(0, x.size()-1); + auto dice = bind(distribution, rng); + for (size_t i=0; i<20; ++i) { + auto idx = dice(); + uint64_t xx = x[idx]; + uint64_t yy = y[idx]; + uint64_t ww = w[idx]; + uint64_t dd = 20; + uint64_t dw = 100; + uint64_t minx=0, miny=0, maxx=xx+dd, maxy=yy+dd, minw=0, maxw=ww+dw; + if (xx >= dd) + minx = xx - dd; + if (yy >= dd) + miny = yy - dd; + if (ww >= dw) + minw = ww - dw; + range3d_test(topk_wt, {minx, miny}, {maxx,maxy}, {minw,maxw}, x, y, w); + } + } +} + +*/ + +template +void count_test( + const t_topk_wt& topk_wt, + complex min_xy, + complex max_xy, + const int_vector<>& x, + const int_vector<>& y) +{ + uint64_t cnt = 0; + for (uint64_t i = 0; i < x.size(); ++i) { + if (x[i] >= real(min_xy) and x[i] <= real(max_xy) + and y[i] >= imag(min_xy) and y[i] <= imag(max_xy)) { + ++cnt; + } + } + ASSERT_EQ(cnt, count(topk_wt, {real(min_xy),imag(min_xy)}, {real(max_xy),imag(max_xy)})); +} + +TYPED_TEST(WtTopkTest, Count) +{ + TypeParam topk_wt; + ASSERT_TRUE(load_from_file(topk_wt, temp_file)); + int_vector<> x,y; + ASSERT_TRUE(load_from_file(x, test_file+".x")); + ASSERT_TRUE(load_from_file(y, test_file+".y")); + ASSERT_EQ(x.size(), y.size()); + ASSERT_EQ(x.size(), topk_wt.size()); + if (x.size() > 0) { + std::mt19937_64 rng; + std::uniform_int_distribution distribution(0, x.size()-1); + auto dice = bind(distribution, rng); + for (size_t i=0; i<3; ++i) { + auto idx1 = dice(); + auto idx2 = dice(); + uint64_t x1 = x[idx1]; + uint64_t y1 = y[idx1]; + uint64_t x2 = x[idx2]; + uint64_t y2 = y[idx2]; + count_test(topk_wt, {std::min(x1,x2), std::min(y1,y2)}, {std::max(x1,x2),std::max(y1,y2)}, x, y); + } + } +} +======= +>>>>>>> wt_topk + +} // namespace + +int main(int argc, char** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + if (argc < 3) { + // LCOV_EXCL_START + cout << "Usage: " << argv[0] << " file temp_file [in-memory]" << endl; + cout << " (1) Generates a k2-treap out of file.x, file.y, and file.w." << endl; + cout << " Result is stored in temp_file." << endl; + cout << " If `in-memory` is specified, the in-memory construction is tested." << endl; + cout << " (2) Performs tests." << endl; + cout << " (3) Deletes temp_file." << endl; + return 1; + // LCOV_EXCL_STOP + } + test_file = argv[1]; + temp_file = argv[2]; + in_memory = argc > 3; + if (in_memory) { + auto load_and_store_in_mem = [&](string suf) { + int_vector<> data; + string file = temp_file + suf; + load_vector_from_file(data,file); + string ram_file = ram_file_name(file); + store_to_file(data, ram_file); + }; + load_and_store_in_mem("x"); + load_and_store_in_mem("y"); + load_and_store_in_mem("w"); + temp_file = ram_file_name(temp_file); + } + return RUN_ALL_TESTS(); +} diff --git a/test/test_cases/100a.txt b/test/test_cases/100a.txt deleted file mode 100644 index d69076544..000000000 --- a/test/test_cases/100a.txt +++ /dev/null @@ -1 +0,0 @@ -aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa \ No newline at end of file diff --git a/test/test_cases/all_symbols.txt b/test/test_cases/all_symbols.txt deleted file mode 100644 index c86626638..000000000 Binary files a/test/test_cases/all_symbols.txt and /dev/null differ diff --git a/test/test_cases/empty.txt b/test/test_cases/empty.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/test/test_cases/example01.txt b/test/test_cases/example01.txt deleted file mode 100644 index 1f4f5feea..000000000 --- a/test/test_cases/example01.txt +++ /dev/null @@ -1 +0,0 @@ -mississippi diff --git a/test/test_cases/keeper.int b/test/test_cases/keeper.int deleted file mode 100644 index 105d98d61..000000000 Binary files a/test/test_cases/keeper.int and /dev/null differ