From bcf9425a8fc8bfe4a08840749a16cf83e1bc89e8 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 8 Oct 2024 14:54:04 +0100 Subject: [PATCH] Compute whole column variance using numerically stable approach (#16448) We use the pairwise approach of Chan, Golub, and LeVeque (1983). - Closes #16444 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/16448 --- .../reduction/detail/reduction_operators.cuh | 48 ++++--- cpp/src/reductions/compound.cuh | 26 ++-- cpp/tests/reductions/reduction_tests.cpp | 131 ++++++++---------- .../java/ai/rapids/cudf/ReductionTest.java | 4 +- .../cudf/cudf/core/column/numerical_base.py | 10 +- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/tests/test_datetime.py | 18 +-- 7 files changed, 116 insertions(+), 123 deletions(-) diff --git a/cpp/include/cudf/reduction/detail/reduction_operators.cuh b/cpp/include/cudf/reduction/detail/reduction_operators.cuh index 4cf8564ab3a..5694362af8f 100644 --- a/cpp/include/cudf/reduction/detail/reduction_operators.cuh +++ b/cpp/include/cudf/reduction/detail/reduction_operators.cuh @@ -31,17 +31,41 @@ namespace detail { // intermediate data structure to compute `var`, `std` template struct var_std { - ResultType value; /// the value - ResultType value_squared; /// the value of squared - - CUDF_HOST_DEVICE inline var_std(ResultType _value = 0, ResultType _value_squared = 0) - : value(_value), value_squared(_value_squared){}; + // Uses the pairwise approach of Chan, Golub, and LeVeque, + // _Algorithms for computing the sample variance: analysis and + // recommendations_ (1983) + // https://doi.org/10.1080/00031305.1983.10483115 + // Also http://www.cs.yale.edu/publications/techreports/tr222.pdf + // This is a modification of Youngs and Cramer's online approach. + ResultType running_sum; + ResultType running_square_deviations; + size_type count; + + CUDF_HOST_DEVICE inline var_std(ResultType t = 0, ResultType s = 0, size_type n = 0) + : running_sum(t), running_square_deviations(s), count(n){}; using this_t = var_std; CUDF_HOST_DEVICE inline this_t operator+(this_t const& rhs) const { - return this_t((this->value + rhs.value), (this->value_squared + rhs.value_squared)); + // Updates as per equations 1.5a and 1.5b in the paper + // T_{1,m+n} = T_{1,m} + T_{m+1,n+1} + // S_{1,m+n} = S_{1,m} + S_{m+1,n+1} + m/(n(m+n)) * (n/m T_{1,m} - T_{m+1,n+1})**2 + // Here the first m samples are in this, the remaining n samples are in rhs. + auto const m = this->count; + auto const n = rhs.count; + // Avoid division by zero. + if (m == 0) { return rhs; } + if (n == 0) { return *this; } + auto const tm = this->running_sum; + auto const tn = rhs.running_sum; + auto const sm = this->running_square_deviations; + auto const sn = rhs.running_square_deviations; + auto const tmn = tm + tn; + auto const diff = ((static_cast(n) / m) * tm) - tn; + // Computing m/n(m+n) as m/n/(m+n) to avoid integer overflow + auto const smn = sm + sn + ((static_cast(m) / n) / (m + n)) * diff * diff; + return {tmn, smn, m + n}; }; }; @@ -50,10 +74,7 @@ template struct transformer_var_std { using OutputType = var_std; - CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) - { - return OutputType(value, value * value); - }; + CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) { return {value, 0, 1}; }; }; // ------------------------------------------------------------------------ @@ -257,12 +278,7 @@ struct variance : public compound_op { cudf::size_type const& count, cudf::size_type const& ddof) { - ResultType mean = input.value / count; - ResultType asum = input.value_squared; - cudf::size_type div = count - ddof; - ResultType var = asum / div - ((mean * mean) * count) / div; - - return var; + return input.running_square_deviations / (count - ddof); }; }; }; diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh index 6bc8b48832f..cd9fade164a 100644 --- a/cpp/src/reductions/compound.cuh +++ b/cpp/src/reductions/compound.cuh @@ -18,13 +18,18 @@ #include #include +#include #include +#include #include #include #include #include +#include +#include + namespace cudf { namespace reduction { namespace compound { @@ -53,9 +58,17 @@ std::unique_ptr compound_reduction(column_view const& col, { auto const valid_count = col.size() - col.null_count(); + // All null input produces all null output + if (valid_count == 0 || + // Only care about ddof for standard deviation and variance right now + valid_count <= ddof && (std::is_same_v || + std::is_same_v)) { + auto result = cudf::make_fixed_width_scalar(output_dtype, stream, mr); + result->set_valid_async(false, stream); + return result; + } // reduction by iterator auto dcol = cudf::column_device_view::create(col, stream); - std::unique_ptr result; Op compound_op{}; if (!cudf::is_dictionary(col.type())) { @@ -63,25 +76,21 @@ std::unique_ptr compound_reduction(column_view const& col, auto it = thrust::make_transform_iterator( dcol->pair_begin(), compound_op.template get_null_replacing_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } else { auto it = thrust::make_transform_iterator( dcol->begin(), compound_op.template get_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } } else { auto it = thrust::make_transform_iterator( cudf::dictionary::detail::make_dictionary_pair_iterator(*dcol, col.has_nulls()), compound_op.template get_null_replacing_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } - - // set scalar is valid - result->set_valid_async(col.null_count() < col.size(), stream); - return result; }; // @brief result type dispatcher for compound reduction (a.k.a. mean, var, std) @@ -137,6 +146,7 @@ struct element_type_dispatcher { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_EXPECTS(ddof >= 0, "ddof must be non-negative", std::domain_error); return cudf::type_dispatcher( output_dtype, result_type_dispatcher(), col, output_dtype, ddof, stream, mr); } diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 1e9e13ded93..bdb98372836 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -33,8 +33,12 @@ #include #include +#include #include +#include +#include +#include #include using aggregation = cudf::aggregation; @@ -765,6 +769,25 @@ TYPED_TEST(MultiStepReductionTest, Mean) expected_value_nulls); } +template +double calc_var(std::vector const& v, int ddof, std::vector const& mask = {}) +{ + auto const values = [&]() { + if (mask.empty()) { return v; } + std::vector masked{}; + thrust::copy_if( + v.begin(), v.end(), mask.begin(), std::back_inserter(masked), [](auto m) { return m; }); + return masked; + }(); + auto const valid_count = values.size(); + double const mean = std::accumulate(values.cbegin(), values.cend(), double{0}) / valid_count; + double const sq_sum_of_differences = + std::accumulate(values.cbegin(), values.cend(), double{0}, [mean](double acc, auto const v) { + return acc + std::pow(v - mean, 2); + }); + return sq_sum_of_differences / (valid_count - ddof); +} + // This test is disabled for only a Debug build because a compiler error // documented in cpp/src/reductions/std.cu and cpp/src/reductions/var.cu #ifdef NDEBUG @@ -777,25 +800,12 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std) std::vector int_values({-3, 2, 1, 0, 5, -3, -2, 28}); std::vector host_bools({true, true, false, true, true, true, false, true}); - auto calc_var = [](std::vector& v, cudf::size_type valid_count, int ddof) { - double mean = std::accumulate(v.begin(), v.end(), double{0}); - mean /= valid_count; - - double sum_of_sq = std::accumulate( - v.begin(), v.end(), double{0}, [](double acc, TypeParam i) { return acc + i * i; }); - - cudf::size_type div = valid_count - ddof; - - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls std::vector v = convert_values(int_values); cudf::test::fixed_width_column_wrapper col(v.begin(), v.end()); auto const ddof = 1; - double var = calc_var(v, v.size(), ddof); + double var = calc_var(v, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -811,23 +821,19 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std) // test with nulls cudf::test::fixed_width_column_wrapper col_nulls = construct_null_column(v, host_bools); - cudf::size_type valid_count = - cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count(); - auto replaced_array = replace_nulls(v, host_bools, T{0}); - - double var_nulls = calc_var(replaced_array, valid_count, ddof); - double std_nulls = std::sqrt(var_nulls); + double var_nulls = calc_var(v, ddof, host_bools); + double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - std_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + var_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + std_nulls); } // ---------------------------------------------------------------------------- @@ -1139,23 +1145,10 @@ TEST_P(ReductionParamTest, DISABLED_std_var) std::vector int_values({-3, 2, 1, 0, 5, -3, -2, 28}); std::vector host_bools({true, true, false, true, true, true, false, true}); - auto calc_var = [ddof](std::vector& v, cudf::size_type valid_count) { - double mean = std::accumulate(v.begin(), v.end(), double{0}); - mean /= valid_count; - - double sum_of_sq = std::accumulate( - v.begin(), v.end(), double{0}, [](double acc, double i) { return acc + i * i; }); - - cudf::size_type div = valid_count - ddof; - - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls cudf::test::fixed_width_column_wrapper col(int_values.begin(), int_values.end()); - double var = calc_var(int_values, int_values.size()); + double var = calc_var(int_values, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -1172,23 +1165,19 @@ TEST_P(ReductionParamTest, DISABLED_std_var) // test with nulls cudf::test::fixed_width_column_wrapper col_nulls = construct_null_column(int_values, host_bools); - cudf::size_type valid_count = - cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count(); - auto replaced_array = replace_nulls(int_values, host_bools, int{0}); - - double var_nulls = calc_var(replaced_array, valid_count); + double var_nulls = calc_var(int_values, ddof, host_bools); double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - std_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + var_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + std_nulls); } //------------------------------------------------------------------- @@ -2471,21 +2460,11 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd) std::vector v = convert_values(int_values); cudf::data_type output_type{cudf::type_to_id()}; - auto calc_var = [](std::vector const& v, cudf::size_type valid_count, cudf::size_type ddof) { - double mean = std::accumulate(v.cbegin(), v.cend(), double{0}); - mean /= valid_count; - double sum_of_sq = std::accumulate( - v.cbegin(), v.cend(), double{0}, [](double acc, TypeParam i) { return acc + i * i; }); - auto const div = valid_count - ddof; - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls cudf::test::dictionary_column_wrapper col(v.begin(), v.end()); cudf::size_type const ddof = 1; - double var = calc_var(v, v.size(), ddof); + double var = calc_var(v, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -2497,15 +2476,13 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd) std::vector validity({true, true, false, true, true, true, false, true}); cudf::test::dictionary_column_wrapper col_nulls(v.begin(), v.end(), validity.begin()); - cudf::size_type const valid_count = std::count(validity.begin(), validity.end(), true); - - double var_nulls = calc_var(replace_nulls(v, validity, T{0}), valid_count, ddof); + double var_nulls = calc_var(v, ddof, validity); double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this->template reduction_test(col_nulls, *var_agg, output_type).first, - var_nulls); - EXPECT_EQ(this->template reduction_test(col_nulls, *std_agg, output_type).first, - std_nulls); + EXPECT_DOUBLE_EQ(this->template reduction_test(col_nulls, *var_agg, output_type).first, + var_nulls); + EXPECT_DOUBLE_EQ(this->template reduction_test(col_nulls, *std_agg, output_type).first, + std_nulls); } TYPED_TEST(DictionaryReductionTest, NthElement) diff --git a/java/src/test/java/ai/rapids/cudf/ReductionTest.java b/java/src/test/java/ai/rapids/cudf/ReductionTest.java index 8cc7df1ce7f..6bd6603d71b 100644 --- a/java/src/test/java/ai/rapids/cudf/ReductionTest.java +++ b/java/src/test/java/ai/rapids/cudf/ReductionTest.java @@ -612,13 +612,13 @@ void testWithSetOutputType() { assertEquals(expected, result); } - try (Scalar expected = Scalar.fromFloat(1.666667f); + try (Scalar expected = Scalar.fromFloat(1.6666666f); ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4}); Scalar result = cv.variance(DType.FLOAT32)) { assertEquals(expected, result); } - try (Scalar expected = Scalar.fromFloat(1.2909945f); + try (Scalar expected = Scalar.fromFloat(1.2909944f); ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4}); Scalar result = cv.standardDeviation(DType.FLOAT32)) { assertEquals(expected, result); diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 3b8dd05c13a..f6ab91f2f01 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -180,9 +180,12 @@ def var( min_count: int = 0, ddof=1, ): - return self._reduce( + result = self._reduce( "var", skipna=skipna, min_count=min_count, ddof=ddof ) + if result is NA: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return result def std( self, @@ -190,9 +193,12 @@ def std( min_count: int = 0, ddof=1, ): - return self._reduce( + result = self._reduce( "std", skipna=skipna, min_count=min_count, ddof=ddof ) + if result is NA: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return result def median(self, skipna: bool | None = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index acd97c2047c..41ee94b72c8 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2943,7 +2943,7 @@ def corr(self, other, method="pearson", min_periods=None): >>> ser1 = cudf.Series([0.9, 0.13, 0.62]) >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) >>> ser1.corr(ser2, method="pearson") - -0.20454263717316112 + -0.20454263717316126 >>> ser1.corr(ser2, method="spearman") -0.5 """ diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4a2345fc009..976b12a9ab5 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2525,23 +2525,7 @@ def test_dti_asi8(): @pytest.mark.parametrize( "method, kwargs", - [ - ["mean", {}], - pytest.param( - "std", - {}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - pytest.param( - "std", - {"ddof": 0}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - ], + [["mean", {}], ["std", {}], ["std", {"ddof": 0}]], ) def test_dti_reduction(method, kwargs): pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo")