From c2be3cfa3a29b6419adf3d908fff7edb80604c92 Mon Sep 17 00:00:00 2001 From: Mohit Chhaya Date: Thu, 11 Jul 2024 13:11:21 -0800 Subject: [PATCH] Improve non-GCC implementation of statistics functions Quantile/Rank, fixes #77 Signed-off-by: Mohit Chhaya Signed-off-by: Tim Paine --- cpp/csp/cppnodes/statsimpl.h | 243 +++++++++-------------------------- vcpkg.json | 1 + 2 files changed, 65 insertions(+), 179 deletions(-) diff --git a/cpp/csp/cppnodes/statsimpl.h b/cpp/csp/cppnodes/statsimpl.h index ee5cf267..61cda234 100644 --- a/cpp/csp/cppnodes/statsimpl.h +++ b/cpp/csp/cppnodes/statsimpl.h @@ -8,11 +8,9 @@ #include #include #include - -#ifdef __linux__ -#include -#include -#endif +#include +#include +#include namespace csp::cppnodes { @@ -1087,19 +1085,8 @@ class WeightedKurtosis bool m_excess; }; -#ifdef __linux__ -template -using ost = __gnu_pbds::tree; - -template -void ost_erase( ost &t, double & v ) -{ - int rank = t.order_of_key( v ); - auto it = t.find_by_order( rank ); - t.erase( it ); -} -#endif +template +using ost = boost::multi_index::multi_index_container, Comparator>>>; class Quantile { @@ -1142,11 +1129,7 @@ class Quantile void remove( double x ) { - #ifdef __linux__ - ost_erase( m_tree, x ); - #else m_tree.erase( m_tree.find( x ) ); - #endif } void reset() @@ -1165,113 +1148,60 @@ class Quantile double target = std::get( m_quants[index]._data ) * ( m_tree.size() - 1 ); int ft = floor( target ); int ct = ceil( target ); + auto fIt = m_tree.get<0>().nth( ft ); + auto cIt = ( ft == ct ) ? fIt : std::next( fIt ); double qtl = 0.0; - #ifdef __linux__ - switch ( m_interpolation ) - { - case LINEAR: - if( ft == target ) - { - qtl = *m_tree.find_by_order( ft ); - } - else - { - double lower = *m_tree.find_by_order( ft ); - double higher = *m_tree.find_by_order( ct ); - qtl = ( 1 - target + ft ) * lower + ( 1 - ct + target ) * higher; - } - break; - case LOWER: - qtl = *m_tree.find_by_order( ft ); - break; - case HIGHER: - qtl = *m_tree.find_by_order( ct ); - break; - case MIDPOINT: - if( ft == target ) - { - qtl = *m_tree.find_by_order( ft ); - } - else - { - double lower = *m_tree.find_by_order( ft ); - double higher = *m_tree.find_by_order( ct ); - qtl = ( higher+lower ) / 2; - } - break; - case NEAREST: - if( target - ft < ct - target ) - { - qtl = *m_tree.find_by_order( ft ); - } - else - { - qtl = *m_tree.find_by_order( ct ); - } - break; - default: - break; - } - #else - auto it = m_tree.begin(); - std::advance( it, ft ); switch ( m_interpolation ) { - case LINEAR: - if( ft == target ) - { - qtl = *it; - } - else - { - double lower = *it; - double higher = *++it; - qtl = ( 1 - target + ft ) * lower + ( 1 - ct + target ) * higher; - } - break; - case LOWER: - qtl = *it; - break; - case HIGHER: - qtl = ( ft == ct ? *it : *++it ); - break; - case MIDPOINT: - if( ft == target ) - { - qtl = *it; - } - else - { - double lower = *it; - double higher = *++it; - qtl = ( higher+lower ) / 2; - } - break; - case NEAREST: - if( target - ft <= ct - target ) - { - qtl = *it; - } - else - { - qtl = *++it; - } - break; - default: - break; + case LINEAR: + if ( ft == target ) + { + qtl = *fIt; + } + else + { + double lower = *fIt; + double higher = *cIt; + qtl = ( 1 - target + ft ) * lower + ( 1 - ct + target ) * higher; + } + break; + case LOWER: + qtl = *fIt; + break; + case HIGHER: + qtl = *cIt; + break; + case MIDPOINT: + if ( ft == target ) + { + qtl = *fIt; + } + else + { + double lower = *fIt; + double higher = *cIt; + qtl = ( higher + lower ) / 2; + } + break; + case NEAREST: + if ( target - ft < ct - target ) + { + qtl = *fIt; + } + else + { + qtl = *cIt; + } + break; + default: + break; } - #endif return qtl; } private: - - #ifdef __linux__ - ost> m_tree; - #else - std::multiset m_tree; - #endif + ost> m_tree; std::vector m_quants; int64_t m_interpolation; }; @@ -1359,14 +1289,10 @@ class Rank else { m_lastval = x; - #ifdef __linux__ if( m_method == MAX ) m_maxtree.insert( x ); else m_mintree.insert( x ); - #else - m_tree.insert( x ); - #endif } } @@ -1374,104 +1300,63 @@ class Rank { if( likely( !isnan( x ) ) ) { - #ifdef __linux__ - if( m_method == MAX ) - ost_erase( m_maxtree, x ); + if ( m_method == MAX ) + m_maxtree.erase ( m_maxtree.find( x ) ); else - ost_erase( m_mintree, x ); - #else - m_tree.erase( m_tree.find( x ) ); - #endif + m_mintree.erase ( m_mintree.find( x ) ); } } void reset() { - #ifdef __linux__ if( m_method == MAX ) m_maxtree.clear(); else m_mintree.clear(); - #else - m_tree.clear(); - #endif } double compute() const { // Verify tree is not empty and lastValue is valid // Last value can only ever be NaN if the "keep" nan option is used - #ifdef __linux__ if( likely( !isnan( m_lastval ) && ( ( m_method == MAX && m_maxtree.size() > 0 ) || m_mintree.size() > 0 ) ) ) { switch( m_method ) { case MIN: { - if( m_mintree.size() == 1 ) + if ( m_mintree.size() == 1 ) return 0; - return m_mintree.order_of_key( m_lastval ); + return m_mintree.get<0>().find_rank( m_lastval ); } case MAX: { - if( m_maxtree.size() == 1 ) + if ( m_maxtree.size() == 1 ) return 0; - return m_maxtree.size() - 1 - m_maxtree.order_of_key( m_lastval ); + return m_maxtree.size() - 1 - m_maxtree.get<0>().find_rank( m_lastval ); } case AVG: { - // Need to iterate to find average rank - if( m_mintree.size() == 1 ) + if ( m_mintree.size() == 1 ) return 0; - - int min_rank = m_mintree.order_of_key( m_lastval ); + + int min_rank = m_mintree.get<0>().find_rank( m_lastval ); int max_rank = min_rank; - auto it = m_mintree.find_by_order( min_rank ); + auto it = m_mintree.get<0>().nth( min_rank ); it++; - for( ; it != m_mintree.end() && *it == m_lastval ; it++ ) max_rank++; + for( ; it != m_mintree.end() && *it == m_lastval ; it++ ) max_rank++; // While this is in theory O(n), in reality this loop is only interated once, since there are likely no duplicate values or very few. return ( double )( min_rank + max_rank ) / 2; } - - default: - break; - } - } - #else - if( likely( !isnan( m_lastval ) && m_tree.size() > 0 ) ) - { - switch( m_method ) - { - case MIN: - { - return std::distance( m_tree.begin(), m_tree.find( m_lastval ) ); - } - case MAX: - { - auto end_range = m_tree.equal_range( m_lastval ).second; - return std::distance( m_tree.begin(), std::prev( end_range ) ); - } - case AVG: - { - auto range = m_tree.equal_range( m_lastval ); - return std::distance( m_tree.begin(), range.first ) + ( double )std::distance( range.first, std::prev( range.second ) ) / 2; - } default: break; } } - #endif - return std::numeric_limits::quiet_NaN(); } private: - - #ifdef __linux__ - ost> m_mintree; - ost> m_maxtree; - #else - std::multiset m_tree; - #endif + ost> m_mintree; + ost> m_maxtree; double m_lastval; int64_t m_method; diff --git a/vcpkg.json b/vcpkg.json index 324ae504..7373e063 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -5,6 +5,7 @@ "abseil", "arrow", "boost-beast", + "boost-multi-index", "brotli", "exprtk", "gtest",