From 0e7485cd9dce54338b9ff22678d430470dfe742c Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 13 Oct 2023 14:03:58 +0800 Subject: [PATCH 01/54] wip Signed-off-by: Haoyang Li --- src/main/cpp/CMakeLists.txt | 1 + src/main/cpp/src/cast_float_to_string.cu | 360 ++++++++++++++++++ src/main/cpp/src/cast_string.hpp | 5 + src/main/cpp/tests/CMakeLists.txt | 3 + src/main/cpp/tests/cast_float_to_string.cpp | 90 +++++ .../nvidia/spark/rapids/jni/CastStrings.java | 7 +- 6 files changed, 463 insertions(+), 3 deletions(-) create mode 100644 src/main/cpp/src/cast_float_to_string.cu create mode 100644 src/main/cpp/tests/cast_float_to_string.cpp diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 600e6ac245..6f5c3d2239 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -161,6 +161,7 @@ add_library( src/ZOrderJni.cpp src/bloom_filter.cu src/cast_decimal_to_string.cu + src/cast_float_to_string.cu src/cast_string.cu src/cast_string_to_float.cu src/datetime_rebase.cu diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu new file mode 100644 index 0000000000..fed57f5e91 --- /dev/null +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cast_string.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace cudf; + +namespace spark_rapids_jni { + +namespace detail { +namespace { + +struct float_to_string_fn { + // significant digits is independent of scientific notation range + // digits more than this may require using long values instead of ints + static constexpr unsigned int significant_digits = 10; + // maximum power-of-10 that will fit in 32-bits + static constexpr unsigned int nine_digits = 1000000000; // 1x10^9 + // Range of numbers here is for normalizing the value. + // If the value is above or below the following limits, the output is converted to + // scientific notation in order to show (at most) the number of significant digits. + static constexpr double upper_limit = 1000000000; // max is 1x10^9 + static constexpr double lower_limit = 0.0001; // printf uses scientific notation below this + // Tables for doing normalization: converting to exponent form + // IEEE double float has maximum exponent of 305 so these should cover everything + double const upper10[9] = {10, 100, 10000, 1e8, 1e16, 1e32, 1e64, 1e128, 1e256}; + double const lower10[9] = {.1, .01, .0001, 1e-8, 1e-16, 1e-32, 1e-64, 1e-128, 1e-256}; + double const blower10[9] = {1.0, .1, .001, 1e-7, 1e-15, 1e-31, 1e-63, 1e-127, 1e-255}; + + // utility for quickly converting known integer range to character array + __device__ char* int2str(int value, char* output) + { + if (value == 0) { + *output++ = '0'; + return output; + } + char buffer[significant_digits]; // should be big-enough for significant digits + char* ptr = buffer; + while (value > 0) { + *ptr++ = (char)('0' + (value % 10)); + value /= 10; + } + while (ptr != buffer) + *output++ = *--ptr; // 54321 -> 12345 + return output; + } + + /** + * @brief Dissect 
a float value into integer, decimal, and exponent components. + * + * @return The number of decimal places. + */ + __device__ int dissect_value(double value, + unsigned int& integer, + unsigned int& decimal, + int& exp10) + { + int decimal_places = significant_digits - 1; + // normalize step puts value between lower-limit and upper-limit + // by adjusting the exponent up or down + exp10 = 0; + if (value > upper_limit) { + int fx = 256; + for (int idx = 8; idx >= 0; --idx) { + if (value >= upper10[idx]) { + value *= lower10[idx]; + exp10 += fx; + } + fx = fx >> 1; + } + } else if ((value > 0.0) && (value < lower_limit)) { + int fx = 256; + for (int idx = 8; idx >= 0; --idx) { + if (value < blower10[idx]) { + value *= upper10[idx]; + exp10 -= fx; + } + fx = fx >> 1; + } + } + // + unsigned int max_digits = nine_digits; + integer = (unsigned int)value; + for (unsigned int i = integer; i >= 10; i /= 10) { + --decimal_places; + max_digits /= 10; + } + double remainder = (value - (double)integer) * (double)max_digits; + decimal = (unsigned int)remainder; + remainder -= (double)decimal; + decimal += (unsigned int)(2.0 * remainder); + if (decimal >= max_digits) { + decimal = 0; + ++integer; + if (exp10 && (integer >= 10)) { + ++exp10; + integer = 1; + } + } + // + while ((decimal % 10) == 0 && (decimal_places > 0)) { + decimal /= 10; + --decimal_places; + } + return decimal_places; + } + + /** + * @brief Main kernel method for converting float value to char output array. + * + * Output need not be more than (significant_digits + 7) bytes: + * 7 = 1 sign, 1 decimal point, 1 exponent ('e'), 1 exponent-sign, 3 digits for exponent + * + * @param value Float value to convert. + * @param output Memory to write output characters. + * @return Number of bytes written. 
+ */ + __device__ int float_to_string(double value, char* output) + { + // check for valid value + if (std::isnan(value)) { + memcpy(output, "NaN", 3); + return 3; + } + bool bneg = false; + if (signbit(value)) { // handles -0.0 too + value = -value; + bneg = true; + } + if (std::isinf(value)) { + if (bneg) + memcpy(output, "-Inf", 4); + else + memcpy(output, "Inf", 3); + return bneg ? 4 : 3; + } + + // dissect value into components + unsigned int integer = 0, decimal = 0; + int exp10 = 0; + int decimal_places = dissect_value(value, integer, decimal, exp10); + // + // now build the string from the + // components: sign, integer, decimal, exp10, decimal_places + // + // sign + char* ptr = output; + if (bneg) *ptr++ = '-'; + // integer + ptr = int2str(integer, ptr); + // decimal + *ptr++ = '.'; + if (decimal_places) { + char buffer[10]; + char* pb = buffer; + while (decimal_places--) { + *pb++ = (char)('0' + (decimal % 10)); + decimal /= 10; + } + while (pb != buffer) // reverses the digits + *ptr++ = *--pb; // e.g. 54321 -> 12345 + } else + *ptr++ = '0'; // always include at least .0 + // exponent + if (exp10) { + *ptr++ = 'E'; + if (exp10 < 0) { + *ptr++ = '-'; + exp10 = -exp10; + } else + *ptr++ = '+'; + if (exp10 < 10) *ptr++ = '0'; // extra zero-pad + ptr = int2str(exp10, ptr); + } + // done + return (int)(ptr - output); // number of bytes written + } + + /** + * @brief Compute how man bytes are needed to hold the output string. + * + * @param value Float value to convert. + * @return Number of bytes required. 
+ */ + __device__ int compute_ftos_size(double value) + { + if (std::isnan(value)) return 3; // NaN + bool bneg = false; + if (signbit(value)) { // handles -0.0 too + value = -value; + bneg = true; + } + if (std::isinf(value)) return 3 + (int)bneg; // Inf + + // dissect float into parts + unsigned int integer = 0, decimal = 0; + int exp10 = 0; + int decimal_places = dissect_value(value, integer, decimal, exp10); + // now count up the components + // sign + int count = (int)bneg; + // integer + count += (int)(integer == 0); + while (integer > 0) { + integer /= 10; + ++count; + } // log10(integer) + // decimal + ++count; // decimal point + if (decimal_places) + count += decimal_places; + else + ++count; // always include .0 + // exponent + if (exp10) { + count += 2; // 'e±' + if (exp10 < 0) exp10 = -exp10; + count += (int)(exp10 < 10); // padding + while (exp10 > 0) { + exp10 /= 10; + ++count; + } // log10(exp10) + } + return count; + } +}; + +template +struct from_floats_fn { + column_device_view d_floats; + size_type* d_offsets; + char* d_chars; + + __device__ size_type compute_output_size(FloatType value) + { + float_to_string_fn fts; + return static_cast(fts.compute_ftos_size(static_cast(value))); + } + + __device__ void float_to_string(size_type idx) + { + FloatType value = d_floats.element(idx); + float_to_string_fn fts; + fts.float_to_string(static_cast(value), d_chars + d_offsets[idx]); + } + + __device__ void operator()(size_type idx) + { + if (d_floats.is_null(idx)) { + if (d_chars == nullptr) { d_offsets[idx] = 0; } + return; + } + if (d_chars != nullptr) { + float_to_string(idx); + } else { + d_offsets[idx] = compute_output_size(d_floats.element(idx)); + } + } +}; + +/** + * @brief This dispatch method is for converting floats into strings. + * + * The template function declaration ensures only float types are allowed. 
+ */ +struct dispatch_from_floats_fn { + template >* = nullptr> + std::unique_ptr operator()(column_view const& floats, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + size_type strings_count = floats.size(); + auto column = column_device_view::create(floats, stream); + auto d_column = *column; + + // copy the null mask + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr); + + auto [offsets, chars] = + cudf::strings::detail::make_strings_children(from_floats_fn{d_column}, strings_count, stream, mr); + + return make_strings_column(strings_count, + std::move(offsets), + std::move(chars), + floats.null_count(), + std::move(null_mask)); + } + + // non-float types throw an exception + template >* = nullptr> + std::unique_ptr operator()(column_view const&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const + { + CUDF_FAIL("Values for from_floats function must be a float type."); + } +}; + +} // namespace + +// This will convert all float column types into a strings column. 
+std::unique_ptr from_floats(column_view const& floats, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size_type strings_count = floats.size(); + if (strings_count == 0) return make_empty_column(type_id::STRING); + + return type_dispatcher(floats.type(), dispatch_from_floats_fn{}, floats, stream, mr); +} + +} // namespace detail + +// external API +std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::from_floats(floats, cudf::get_default_stream(), mr); +} + +} // namespace spark_rapids_jni \ No newline at end of file diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp index df74407355..fc2270ca8c 100644 --- a/src/main/cpp/src/cast_string.hpp +++ b/src/main/cpp/src/cast_string.hpp @@ -115,6 +115,11 @@ std::unique_ptr string_to_float( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr float_to_string( + cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + std::unique_ptr decimal_to_non_ansi_string( cudf::column_view const& input, rmm::cuda_stream_view stream, diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index 5b95291351..b33c3955af 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -51,6 +51,9 @@ ConfigureTest(CAST_STRING ConfigureTest(CAST_DECIMAL_TO_STRING cast_decimal_to_string.cpp) +ConfigureTest(CAST_FLOAT_TO_STRING + cast_float_to_string.cpp) + ConfigureTest(DATETIME_REBASE datetime_rebase.cpp) diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp new file mode 100644 index 0000000000..ae342087d0 --- /dev/null +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +using namespace cudf; + +struct FloatToStringTests : public cudf::test::BaseFixture {}; + +TEST_F(StringsConvertTest, FromFloats32) +{ + std::vector h_floats{100, + 654321.25, + -12761.125, + 0, + 5, + -4, + std::numeric_limits::quiet_NaN(), + 839542223232.79, + -0.0}; + std::vector h_expected{ + "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.395422433e+11", "-0.0"}; + + cudf::test::fixed_width_column_wrapper floats( + h_floats.begin(), + h_floats.end(), + thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + + auto results = cudf::strings::from_floats(floats); + + cudf::test::strings_column_wrapper expected( + h_expected.begin(), + h_expected.end(), + thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); +} + +TEST_F(StringsConvertTest, FromFloats64) +{ + std::vector h_floats{100, + 654321.25, + -12761.125, + 0, + 5, + -4, + std::numeric_limits::quiet_NaN(), + 839542223232.794248339, + -0.0}; + std::vector h_expected{ + "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.395422232e+11", "-0.0"}; + + cudf::test::fixed_width_column_wrapper floats( + h_floats.begin(), + h_floats.end(), + 
thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + + auto results = cudf::strings::from_floats(floats); + + cudf::test::strings_column_wrapper expected( + h_expected.begin(), + h_expected.end(), + thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); +} \ No newline at end of file diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index eab42c41f6..7a31b0241b 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -81,13 +81,13 @@ public static ColumnVector toDecimal(ColumnView cv, boolean ansiMode, boolean st } /** - * Convert a decimal column to a string column. + * Convert a float column to a string column. * * @param cv the column data to process * @return the converted column */ - public static ColumnVector fromDecimal(ColumnView cv) { - return new ColumnVector(fromDecimal(cv.getNativeView())); + public static ColumnVector fromFloat(ColumnView cv) { + return new ColumnVector(fromFloat(cv.getNativeView())); } /** @@ -137,6 +137,7 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled int precision, int scale); private static native long toFloat(long nativeColumnView, boolean ansi_enabled, int dtype); private static native long fromDecimal(long nativeColumnView); + private static native long fromFloat(long nativeColumnView); private static native long toIntegersWithBase(long nativeColumnView, int base, boolean ansiEnabled, int dtype); private static native long fromIntegersWithBase(long nativeColumnView, int base); From 2c04fff68fd1c2ca51f1dd74616326615905b625 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Wed, 18 Oct 2023 09:55:20 +0800 Subject: [PATCH 02/54] wip Signed-off-by: Haoyang Li --- 
src/main/cpp/src/CastStringJni.cpp | 15 +++++ src/main/cpp/src/cast_float_to_string.cu | 69 ++++++++++++--------- src/main/cpp/tests/cast_float_to_string.cpp | 17 +++-- 3 files changed, 65 insertions(+), 36 deletions(-) diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index d09bc33e4c..ff8ee2afd4 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -109,6 +109,21 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toFloat( CATCH_CAST_EXCEPTION(env, 0); } +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloat( + JNIEnv* env, jclass, jlong input_column, jint j_dtype) +{ + JNI_NULL_CHECK(env, input_column, "input column is null", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::column_view cv{*reinterpret_cast(input_column)}; + return cudf::jni::release_as_jlong( + spark_rapids_jni::float_to_string(cv, cudf::get_default_stream())); + } + CATCH_CAST_EXCEPTION(env, 0); +} + JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromDecimal(JNIEnv* env, jclass, jlong input_column) diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index fed57f5e91..3560c375e3 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -49,12 +49,14 @@ namespace spark_rapids_jni { namespace detail { namespace { -struct float_to_string_fn { +struct ftos_converter { // significant digits is independent of scientific notation range // digits more than this may require using long values instead of ints - static constexpr unsigned int significant_digits = 10; + static constexpr unsigned int significant_digits = 17; // maximum power-of-10 that will fit in 32-bits - static constexpr unsigned int nine_digits = 1000000000; // 1x10^9 + // static constexpr unsigned long long nine_digits = 1000000000; // 1x10^9 + static constexpr unsigned long long fifteen_digits = 
1000000000000000; + static constexpr unsigned long long sixteen_digits = 10000000000000000; // Range of numbers here is for normalizing the value. // If the value is above or below the following limits, the output is converted to // scientific notation in order to show (at most) the number of significant digits. @@ -91,10 +93,9 @@ struct float_to_string_fn { */ __device__ int dissect_value(double value, unsigned int& integer, - unsigned int& decimal, + unsigned long long& decimal, int& exp10) { - int decimal_places = significant_digits - 1; // normalize step puts value between lower-limit and upper-limit // by adjusting the exponent up or down exp10 = 0; @@ -118,16 +119,18 @@ struct float_to_string_fn { } } // - unsigned int max_digits = nine_digits; + int decimal_places = significant_digits - (exp10? 2 : 1); + unsigned long long max_digits = (exp10? fifteen_digits : sixteen_digits); integer = (unsigned int)value; for (unsigned int i = integer; i >= 10; i /= 10) { --decimal_places; max_digits /= 10; } - double remainder = (value - (double)integer) * (double)max_digits; - decimal = (unsigned int)remainder; + double diff = value - (double)integer; + double remainder = diff * (double)max_digits; + decimal = (unsigned long long)remainder; remainder -= (double)decimal; - decimal += (unsigned int)(2.0 * remainder); + decimal += (unsigned long long)(2.0 * remainder); // round up if (decimal >= max_digits) { decimal = 0; ++integer; @@ -168,14 +171,15 @@ struct float_to_string_fn { } if (std::isinf(value)) { if (bneg) - memcpy(output, "-Inf", 4); + memcpy(output, "-Infinity", 9); else - memcpy(output, "Inf", 3); - return bneg ? 4 : 3; + memcpy(output, "Infinity", 8); + return bneg ? 
9 : 8; } // dissect value into components - unsigned int integer = 0, decimal = 0; + unsigned int integer = 0; + unsigned long long decimal = 0; int exp10 = 0; int decimal_places = dissect_value(value, integer, decimal, exp10); // @@ -190,7 +194,7 @@ struct float_to_string_fn { // decimal *ptr++ = '.'; if (decimal_places) { - char buffer[10]; + char buffer[17]; char* pb = buffer; while (decimal_places--) { *pb++ = (char)('0' + (decimal % 10)); @@ -206,9 +210,8 @@ struct float_to_string_fn { if (exp10 < 0) { *ptr++ = '-'; exp10 = -exp10; - } else - *ptr++ = '+'; - if (exp10 < 10) *ptr++ = '0'; // extra zero-pad + } + // if (exp10 < 10) *ptr++ = '0'; // extra zero-pad ptr = int2str(exp10, ptr); } // done @@ -232,7 +235,8 @@ struct float_to_string_fn { if (std::isinf(value)) return 3 + (int)bneg; // Inf // dissect float into parts - unsigned int integer = 0, decimal = 0; + unsigned int integer = 0; + unsigned long long decimal = 0; int exp10 = 0; int decimal_places = dissect_value(value, integer, decimal, exp10); // now count up the components @@ -252,8 +256,11 @@ struct float_to_string_fn { ++count; // always include .0 // exponent if (exp10) { - count += 2; // 'e±' - if (exp10 < 0) exp10 = -exp10; + count ++; // 'e±' + if (exp10 < 0) { + count ++; + exp10 = -exp10; + } count += (int)(exp10 < 10); // padding while (exp10 > 0) { exp10 /= 10; @@ -265,21 +272,21 @@ struct float_to_string_fn { }; template -struct from_floats_fn { +struct float_to_string_fn { column_device_view d_floats; size_type* d_offsets; char* d_chars; __device__ size_type compute_output_size(FloatType value) { - float_to_string_fn fts; + ftos_converter fts; return static_cast(fts.compute_ftos_size(static_cast(value))); } __device__ void float_to_string(size_type idx) { FloatType value = d_floats.element(idx); - float_to_string_fn fts; + ftos_converter fts; fts.float_to_string(static_cast(value), d_chars + d_offsets[idx]); } @@ -302,7 +309,7 @@ struct from_floats_fn { * * The template function 
declaration ensures only float types are allowed. */ -struct dispatch_from_floats_fn { +struct dispatch_float_to_string_fn { template >* = nullptr> std::unique_ptr operator()(column_view const& floats, rmm::cuda_stream_view stream, @@ -316,7 +323,7 @@ struct dispatch_from_floats_fn { rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr); auto [offsets, chars] = - cudf::strings::detail::make_strings_children(from_floats_fn{d_column}, strings_count, stream, mr); + cudf::strings::detail::make_strings_children(float_to_string_fn{d_column}, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets), @@ -331,30 +338,32 @@ struct dispatch_from_floats_fn { rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const { - CUDF_FAIL("Values for from_floats function must be a float type."); + CUDF_FAIL("Values for float_to_string function must be a float type."); } }; } // namespace // This will convert all float column types into a strings column. 
-std::unique_ptr from_floats(column_view const& floats, +std::unique_ptr float_to_string(column_view const& floats, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { size_type strings_count = floats.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); - return type_dispatcher(floats.type(), dispatch_from_floats_fn{}, floats, stream, mr); + return type_dispatcher(floats.type(), dispatch_float_to_string_fn{}, floats, stream, mr); } } // namespace detail // external API -std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr) +std::unique_ptr float_to_string(column_view const& floats, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_floats(floats, cudf::get_default_stream(), mr); + return detail::float_to_string(floats, stream, mr); } } // namespace spark_rapids_jni \ No newline at end of file diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp index ae342087d0..605df95742 100644 --- a/src/main/cpp/tests/cast_float_to_string.cpp +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -29,9 +29,11 @@ using namespace cudf; +constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; + struct FloatToStringTests : public cudf::test::BaseFixture {}; -TEST_F(StringsConvertTest, FromFloats32) +TEST_F(FloatToStringTests, FromFloats32) { std::vector h_floats{100, 654321.25, @@ -43,14 +45,14 @@ TEST_F(StringsConvertTest, FromFloats32) 839542223232.79, -0.0}; std::vector h_expected{ - "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.395422433e+11", "-0.0"}; + "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.3954222323279E11", "-0.0"}; cudf::test::fixed_width_column_wrapper floats( h_floats.begin(), h_floats.end(), thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - auto 
results = cudf::strings::from_floats(floats); + auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream()); cudf::test::strings_column_wrapper expected( h_expected.begin(), @@ -60,11 +62,13 @@ TEST_F(StringsConvertTest, FromFloats32) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); } -TEST_F(StringsConvertTest, FromFloats64) +TEST_F(FloatToStringTests, FromFloats64) { std::vector h_floats{100, 654321.25, -12761.125, + 1.123456789123456789, + 0.000000000000000000123456789123456789, 0, 5, -4, @@ -72,14 +76,15 @@ TEST_F(StringsConvertTest, FromFloats64) 839542223232.794248339, -0.0}; std::vector h_expected{ - "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.395422232e+11", "-0.0"}; + "100.0", "654321.25", "-12761.125", "1.1234567891234568", "1.234567891234568E-19", + "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"}; cudf::test::fixed_width_column_wrapper floats( h_floats.begin(), h_floats.end(), thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - auto results = cudf::strings::from_floats(floats); + auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream()); cudf::test::strings_column_wrapper expected( h_expected.begin(), From cbce72469eadfb29bc88bcc4c07afe84872c60f5 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Wed, 18 Oct 2023 18:22:05 +0800 Subject: [PATCH 03/54] Add float to string kernel Signed-off-by: Haoyang Li --- src/main/cpp/src/cast_float_to_string.cu | 23 ++++++++++++------- .../nvidia/spark/rapids/jni/CastStrings.java | 10 ++++++++ thirdparty/cudf | 2 +- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index 3560c375e3..13a71754e4 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -55,13 +55,13 @@ struct ftos_converter { static constexpr unsigned int 
significant_digits = 17; // maximum power-of-10 that will fit in 32-bits // static constexpr unsigned long long nine_digits = 1000000000; // 1x10^9 - static constexpr unsigned long long fifteen_digits = 1000000000000000; + // static constexpr unsigned long long fifteen_digits = 1000000000000000; static constexpr unsigned long long sixteen_digits = 10000000000000000; // Range of numbers here is for normalizing the value. // If the value is above or below the following limits, the output is converted to // scientific notation in order to show (at most) the number of significant digits. - static constexpr double upper_limit = 1000000000; // max is 1x10^9 - static constexpr double lower_limit = 0.0001; // printf uses scientific notation below this + static constexpr double upper_limit = 10000000; // max is 1x10^7 + static constexpr double lower_limit = 0.001; // printf uses scientific notation below this // Tables for doing normalization: converting to exponent form // IEEE double float has maximum exponent of 305 so these should cover everything double const upper10[9] = {10, 100, 10000, 1e8, 1e16, 1e32, 1e64, 1e128, 1e256}; @@ -119,8 +119,16 @@ struct ftos_converter { } } // - int decimal_places = significant_digits - (exp10? 2 : 1); - unsigned long long max_digits = (exp10? fifteen_digits : sixteen_digits); + // int decimal_places = significant_digits - (exp10? 2 : 1); + // unsigned long long max_digits = (exp10? 
fifteen_digits : sixteen_digits); + int decimal_places = significant_digits - 1; + unsigned long long max_digits = sixteen_digits; + double temp_value = value; + while (temp_value < 1.0 && temp_value > 0.0) { + max_digits *= 10; + temp_value *= 10.0; + decimal_places++; + } integer = (unsigned int)value; for (unsigned int i = integer; i >= 10; i /= 10) { --decimal_places; @@ -194,7 +202,7 @@ struct ftos_converter { // decimal *ptr++ = '.'; if (decimal_places) { - char buffer[17]; + char buffer[18]; char* pb = buffer; while (decimal_places--) { *pb++ = (char)('0' + (decimal % 10)); @@ -232,7 +240,7 @@ struct ftos_converter { value = -value; bneg = true; } - if (std::isinf(value)) return 3 + (int)bneg; // Inf + if (std::isinf(value)) return 8 + (int)bneg; // Inf // dissect float into parts unsigned int integer = 0; @@ -261,7 +269,6 @@ struct ftos_converter { count ++; exp10 = -exp10; } - count += (int)(exp10 < 10); // padding while (exp10 > 0) { exp10 /= 10; ++count; diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index 7a31b0241b..3002e1cdab 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -90,6 +90,16 @@ public static ColumnVector fromFloat(ColumnView cv) { return new ColumnVector(fromFloat(cv.getNativeView())); } + /** + * Convert a decimal column to a string column. + * + * @param cv the column data to process + * @return the converted column + */ + public static ColumnVector fromDecimal(ColumnView cv) { + return new ColumnVector(fromDecimal(cv.getNativeView())); + } + /** * Convert a string column to a given floating-point type column. 
* diff --git a/thirdparty/cudf b/thirdparty/cudf index 5f05c180b8..fa4e8ab1af 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5f05c180b80b70fc09ea58aef2494c57edc44b9c +Subproject commit fa4e8ab1af4acfd2c88a619b4d9693f4a5fda168 From 8d7ead2093613de6d322b42157130182086e7891 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Thu, 19 Oct 2023 15:57:57 +0800 Subject: [PATCH 04/54] Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Mike Wilson --- src/main/cpp/src/cast_float_to_string.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index 13a71754e4..15d6e9cba5 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -178,10 +178,11 @@ struct ftos_converter { bneg = true; } if (std::isinf(value)) { - if (bneg) + if (bneg) { memcpy(output, "-Infinity", 9); - else + } else { memcpy(output, "Infinity", 8); + } return bneg ? 
9 : 8; } From 9ab20893bad7eb87d78ba5500ba9e763dad954a0 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Thu, 19 Oct 2023 15:58:05 +0800 Subject: [PATCH 05/54] Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Mike Wilson --- src/main/cpp/src/cast_float_to_string.cu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index 15d6e9cba5..ca7e9b95db 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -172,11 +172,14 @@ struct ftos_converter { memcpy(output, "NaN", 3); return 3; } - bool bneg = false; - if (signbit(value)) { // handles -0.0 too - value = -value; - bneg = true; - } + bool const bneg = [&value]() { + if (signbit(value)) { // handles -0.0 too + value = -value; + return true; + } else { + return false; + } + }(); if (std::isinf(value)) { if (bneg) { memcpy(output, "-Infinity", 9); From c3b3d6464445ee1393d48122a4192424d34b18b8 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Thu, 19 Oct 2023 17:52:57 +0800 Subject: [PATCH 06/54] address comments and use different precision for float Signed-off-by: Haoyang Li --- src/main/cpp/src/cast_float_to_string.cu | 36 +++++++++++---------- src/main/cpp/tests/cast_float_to_string.cpp | 2 +- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index ca7e9b95db..d1f66f772d 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -52,15 +52,14 @@ namespace { struct ftos_converter { // significant digits is independent of scientific notation range // digits more than this may require using long values instead of ints - static constexpr unsigned int significant_digits = 17; - // maximum power-of-10 that will fit in 32-bits - // static constexpr unsigned long long nine_digits = 1000000000; // 1x10^9 - // static 
constexpr unsigned long long fifteen_digits = 1000000000000000; - static constexpr unsigned long long sixteen_digits = 10000000000000000; + static constexpr unsigned int significant_digits_float = 9; + static constexpr unsigned int significant_digits_double = 17; + static constexpr unsigned int eight_digits = 100000000; // 1x10^8 + static constexpr unsigned long long sixteen_digits = 10000000000000000; // 1x10^16 // Range of numbers here is for normalizing the value. // If the value is above or below the following limits, the output is converted to // scientific notation in order to show (at most) the number of significant digits. - static constexpr double upper_limit = 10000000; // max is 1x10^7 + static constexpr double upper_limit = 10000000; // Spark's max is 1x10^7 static constexpr double lower_limit = 0.001; // printf uses scientific notation below this // Tables for doing normalization: converting to exponent form // IEEE double float has maximum exponent of 305 so these should cover everything @@ -75,7 +74,7 @@ struct ftos_converter { *output++ = '0'; return output; } - char buffer[significant_digits]; // should be big-enough for significant digits + char buffer[significant_digits_double]; // should be big-enough for significant digits char* ptr = buffer; while (value > 0) { *ptr++ = (char)('0' + (value % 10)); @@ -94,7 +93,8 @@ struct ftos_converter { __device__ int dissect_value(double value, unsigned int& integer, unsigned long long& decimal, - int& exp10) + int& exp10, + bool is_float = false) { // normalize step puts value between lower-limit and upper-limit // by adjusting the exponent up or down @@ -121,8 +121,8 @@ struct ftos_converter { // // int decimal_places = significant_digits - (exp10? 2 : 1); // unsigned long long max_digits = (exp10? fifteen_digits : sixteen_digits); - int decimal_places = significant_digits - 1; - unsigned long long max_digits = sixteen_digits; + int decimal_places = (is_float? 
significant_digits_float: significant_digits_double) - 1; + unsigned long long max_digits = (is_float? eight_digits: sixteen_digits); double temp_value = value; while (temp_value < 1.0 && temp_value > 0.0) { max_digits *= 10; @@ -165,7 +165,7 @@ struct ftos_converter { * @param output Memory to write output characters. * @return Number of bytes written. */ - __device__ int float_to_string(double value, char* output) + __device__ int float_to_string(double value, char* output, bool is_float) { // check for valid value if (std::isnan(value)) { @@ -193,7 +193,7 @@ struct ftos_converter { unsigned int integer = 0; unsigned long long decimal = 0; int exp10 = 0; - int decimal_places = dissect_value(value, integer, decimal, exp10); + int decimal_places = dissect_value(value, integer, decimal, exp10, is_float); // // now build the string from the // components: sign, integer, decimal, exp10, decimal_places @@ -206,7 +206,7 @@ struct ftos_converter { // decimal *ptr++ = '.'; if (decimal_places) { - char buffer[18]; + char buffer[significant_digits_double]; char* pb = buffer; while (decimal_places--) { *pb++ = (char)('0' + (decimal % 10)); @@ -236,7 +236,7 @@ struct ftos_converter { * @param value Float value to convert. * @return Number of bytes required. 
*/ - __device__ int compute_ftos_size(double value) + __device__ int compute_ftos_size(double value, bool is_float) { if (std::isnan(value)) return 3; // NaN bool bneg = false; @@ -250,7 +250,7 @@ struct ftos_converter { unsigned int integer = 0; unsigned long long decimal = 0; int exp10 = 0; - int decimal_places = dissect_value(value, integer, decimal, exp10); + int decimal_places = dissect_value(value, integer, decimal, exp10, is_float); // now count up the components // sign int count = (int)bneg; @@ -291,14 +291,16 @@ struct float_to_string_fn { __device__ size_type compute_output_size(FloatType value) { ftos_converter fts; - return static_cast(fts.compute_ftos_size(static_cast(value))); + bool is_float = std::is_same_v; + return static_cast(fts.compute_ftos_size(static_cast(value), is_float)); } __device__ void float_to_string(size_type idx) { FloatType value = d_floats.element(idx); ftos_converter fts; - fts.float_to_string(static_cast(value), d_chars + d_offsets[idx]); + bool is_float = std::is_same_v; + fts.float_to_string(static_cast(value), d_chars + d_offsets[idx], is_float); } __device__ void operator()(size_type idx) diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp index 605df95742..a86d988724 100644 --- a/src/main/cpp/tests/cast_float_to_string.cpp +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -42,7 +42,7 @@ TEST_F(FloatToStringTests, FromFloats32) 5, -4, std::numeric_limits::quiet_NaN(), - 839542223232.79, + 123456789012.34, -0.0}; std::vector h_expected{ "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.3954222323279E11", "-0.0"}; From ebb123811c15247f4c6f1fe3fea11716517dcc28 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 30 Oct 2023 09:38:15 +0800 Subject: [PATCH 07/54] a runnable format_number demo Signed-off-by: Haoyang Li --- src/main/cpp/CMakeLists.txt | 2 +- src/main/cpp/src/CastStringJni.cpp | 6 +- src/main/cpp/src/cast_float_to_string.cu | 382 ------------ 
src/main/cpp/src/cast_string.hpp | 3 +- src/main/cpp/src/format_float.cu | 576 ++++++++++++++++++ src/main/cpp/tests/CMakeLists.txt | 4 +- ...t_float_to_string.cpp => format_float.cpp} | 14 +- .../nvidia/spark/rapids/jni/CastStrings.java | 8 +- 8 files changed, 595 insertions(+), 400 deletions(-) delete mode 100644 src/main/cpp/src/cast_float_to_string.cu create mode 100644 src/main/cpp/src/format_float.cu rename src/main/cpp/tests/{cast_float_to_string.cpp => format_float.cpp} (86%) diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 8f90b9078e..745a9df2a7 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -163,7 +163,7 @@ add_library( src/ZOrderJni.cpp src/bloom_filter.cu src/cast_decimal_to_string.cu - src/cast_float_to_string.cu + src/format_float.cu src/cast_string.cu src/cast_string_to_float.cu src/datetime_rebase.cu diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index ff8ee2afd4..824ddad8e1 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -109,8 +109,8 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toFloat( CATCH_CAST_EXCEPTION(env, 0); } -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloat( - JNIEnv* env, jclass, jlong input_column, jint j_dtype) +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_formatFloat( + JNIEnv* env, jclass, jlong input_column, jint d, jint j_dtype) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); @@ -119,7 +119,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloat( cudf::column_view cv{*reinterpret_cast(input_column)}; return cudf::jni::release_as_jlong( - spark_rapids_jni::float_to_string(cv, cudf::get_default_stream())); + spark_rapids_jni::format_float(cv, d, cudf::get_default_stream())); } CATCH_CAST_EXCEPTION(env, 0); } diff --git a/src/main/cpp/src/cast_float_to_string.cu 
b/src/main/cpp/src/cast_float_to_string.cu deleted file mode 100644 index d1f66f772d..0000000000 --- a/src/main/cpp/src/cast_float_to_string.cu +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "cast_string.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -using namespace cudf; - -namespace spark_rapids_jni { - -namespace detail { -namespace { - -struct ftos_converter { - // significant digits is independent of scientific notation range - // digits more than this may require using long values instead of ints - static constexpr unsigned int significant_digits_float = 9; - static constexpr unsigned int significant_digits_double = 17; - static constexpr unsigned int eight_digits = 100000000; // 1x10^8 - static constexpr unsigned long long sixteen_digits = 10000000000000000; // 1x10^16 - // Range of numbers here is for normalizing the value. - // If the value is above or below the following limits, the output is converted to - // scientific notation in order to show (at most) the number of significant digits. 
- static constexpr double upper_limit = 10000000; // Spark's max is 1x10^7 - static constexpr double lower_limit = 0.001; // printf uses scientific notation below this - // Tables for doing normalization: converting to exponent form - // IEEE double float has maximum exponent of 305 so these should cover everything - double const upper10[9] = {10, 100, 10000, 1e8, 1e16, 1e32, 1e64, 1e128, 1e256}; - double const lower10[9] = {.1, .01, .0001, 1e-8, 1e-16, 1e-32, 1e-64, 1e-128, 1e-256}; - double const blower10[9] = {1.0, .1, .001, 1e-7, 1e-15, 1e-31, 1e-63, 1e-127, 1e-255}; - - // utility for quickly converting known integer range to character array - __device__ char* int2str(int value, char* output) - { - if (value == 0) { - *output++ = '0'; - return output; - } - char buffer[significant_digits_double]; // should be big-enough for significant digits - char* ptr = buffer; - while (value > 0) { - *ptr++ = (char)('0' + (value % 10)); - value /= 10; - } - while (ptr != buffer) - *output++ = *--ptr; // 54321 -> 12345 - return output; - } - - /** - * @brief Dissect a float value into integer, decimal, and exponent components. - * - * @return The number of decimal places. - */ - __device__ int dissect_value(double value, - unsigned int& integer, - unsigned long long& decimal, - int& exp10, - bool is_float = false) - { - // normalize step puts value between lower-limit and upper-limit - // by adjusting the exponent up or down - exp10 = 0; - if (value > upper_limit) { - int fx = 256; - for (int idx = 8; idx >= 0; --idx) { - if (value >= upper10[idx]) { - value *= lower10[idx]; - exp10 += fx; - } - fx = fx >> 1; - } - } else if ((value > 0.0) && (value < lower_limit)) { - int fx = 256; - for (int idx = 8; idx >= 0; --idx) { - if (value < blower10[idx]) { - value *= upper10[idx]; - exp10 -= fx; - } - fx = fx >> 1; - } - } - // - // int decimal_places = significant_digits - (exp10? 2 : 1); - // unsigned long long max_digits = (exp10? 
fifteen_digits : sixteen_digits); - int decimal_places = (is_float? significant_digits_float: significant_digits_double) - 1; - unsigned long long max_digits = (is_float? eight_digits: sixteen_digits); - double temp_value = value; - while (temp_value < 1.0 && temp_value > 0.0) { - max_digits *= 10; - temp_value *= 10.0; - decimal_places++; - } - integer = (unsigned int)value; - for (unsigned int i = integer; i >= 10; i /= 10) { - --decimal_places; - max_digits /= 10; - } - double diff = value - (double)integer; - double remainder = diff * (double)max_digits; - decimal = (unsigned long long)remainder; - remainder -= (double)decimal; - decimal += (unsigned long long)(2.0 * remainder); // round up - if (decimal >= max_digits) { - decimal = 0; - ++integer; - if (exp10 && (integer >= 10)) { - ++exp10; - integer = 1; - } - } - // - while ((decimal % 10) == 0 && (decimal_places > 0)) { - decimal /= 10; - --decimal_places; - } - return decimal_places; - } - - /** - * @brief Main kernel method for converting float value to char output array. - * - * Output need not be more than (significant_digits + 7) bytes: - * 7 = 1 sign, 1 decimal point, 1 exponent ('e'), 1 exponent-sign, 3 digits for exponent - * - * @param value Float value to convert. - * @param output Memory to write output characters. - * @return Number of bytes written. - */ - __device__ int float_to_string(double value, char* output, bool is_float) - { - // check for valid value - if (std::isnan(value)) { - memcpy(output, "NaN", 3); - return 3; - } - bool const bneg = [&value]() { - if (signbit(value)) { // handles -0.0 too - value = -value; - return true; - } else { - return false; - } - }(); - if (std::isinf(value)) { - if (bneg) { - memcpy(output, "-Infinity", 9); - } else { - memcpy(output, "Infinity", 8); - } - return bneg ? 
9 : 8; - } - - // dissect value into components - unsigned int integer = 0; - unsigned long long decimal = 0; - int exp10 = 0; - int decimal_places = dissect_value(value, integer, decimal, exp10, is_float); - // - // now build the string from the - // components: sign, integer, decimal, exp10, decimal_places - // - // sign - char* ptr = output; - if (bneg) *ptr++ = '-'; - // integer - ptr = int2str(integer, ptr); - // decimal - *ptr++ = '.'; - if (decimal_places) { - char buffer[significant_digits_double]; - char* pb = buffer; - while (decimal_places--) { - *pb++ = (char)('0' + (decimal % 10)); - decimal /= 10; - } - while (pb != buffer) // reverses the digits - *ptr++ = *--pb; // e.g. 54321 -> 12345 - } else - *ptr++ = '0'; // always include at least .0 - // exponent - if (exp10) { - *ptr++ = 'E'; - if (exp10 < 0) { - *ptr++ = '-'; - exp10 = -exp10; - } - // if (exp10 < 10) *ptr++ = '0'; // extra zero-pad - ptr = int2str(exp10, ptr); - } - // done - return (int)(ptr - output); // number of bytes written - } - - /** - * @brief Compute how man bytes are needed to hold the output string. - * - * @param value Float value to convert. - * @return Number of bytes required. 
- */ - __device__ int compute_ftos_size(double value, bool is_float) - { - if (std::isnan(value)) return 3; // NaN - bool bneg = false; - if (signbit(value)) { // handles -0.0 too - value = -value; - bneg = true; - } - if (std::isinf(value)) return 8 + (int)bneg; // Inf - - // dissect float into parts - unsigned int integer = 0; - unsigned long long decimal = 0; - int exp10 = 0; - int decimal_places = dissect_value(value, integer, decimal, exp10, is_float); - // now count up the components - // sign - int count = (int)bneg; - // integer - count += (int)(integer == 0); - while (integer > 0) { - integer /= 10; - ++count; - } // log10(integer) - // decimal - ++count; // decimal point - if (decimal_places) - count += decimal_places; - else - ++count; // always include .0 - // exponent - if (exp10) { - count ++; // 'e±' - if (exp10 < 0) { - count ++; - exp10 = -exp10; - } - while (exp10 > 0) { - exp10 /= 10; - ++count; - } // log10(exp10) - } - return count; - } -}; - -template -struct float_to_string_fn { - column_device_view d_floats; - size_type* d_offsets; - char* d_chars; - - __device__ size_type compute_output_size(FloatType value) - { - ftos_converter fts; - bool is_float = std::is_same_v; - return static_cast(fts.compute_ftos_size(static_cast(value), is_float)); - } - - __device__ void float_to_string(size_type idx) - { - FloatType value = d_floats.element(idx); - ftos_converter fts; - bool is_float = std::is_same_v; - fts.float_to_string(static_cast(value), d_chars + d_offsets[idx], is_float); - } - - __device__ void operator()(size_type idx) - { - if (d_floats.is_null(idx)) { - if (d_chars == nullptr) { d_offsets[idx] = 0; } - return; - } - if (d_chars != nullptr) { - float_to_string(idx); - } else { - d_offsets[idx] = compute_output_size(d_floats.element(idx)); - } - } -}; - -/** - * @brief This dispatch method is for converting floats into strings. - * - * The template function declaration ensures only float types are allowed. 
- */ -struct dispatch_float_to_string_fn { - template >* = nullptr> - std::unique_ptr operator()(column_view const& floats, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - size_type strings_count = floats.size(); - auto column = column_device_view::create(floats, stream); - auto d_column = *column; - - // copy the null mask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr); - - auto [offsets, chars] = - cudf::strings::detail::make_strings_children(float_to_string_fn{d_column}, strings_count, stream, mr); - - return make_strings_column(strings_count, - std::move(offsets), - std::move(chars), - floats.null_count(), - std::move(null_mask)); - } - - // non-float types throw an exception - template >* = nullptr> - std::unique_ptr operator()(column_view const&, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const - { - CUDF_FAIL("Values for float_to_string function must be a float type."); - } -}; - -} // namespace - -// This will convert all float column types into a strings column. 
-std::unique_ptr float_to_string(column_view const& floats, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - size_type strings_count = floats.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); - - return type_dispatcher(floats.type(), dispatch_float_to_string_fn{}, floats, stream, mr); -} - -} // namespace detail - -// external API -std::unique_ptr float_to_string(column_view const& floats, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::float_to_string(floats, stream, mr); -} - -} // namespace spark_rapids_jni \ No newline at end of file diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp index fc2270ca8c..4f64bf4ef3 100644 --- a/src/main/cpp/src/cast_string.hpp +++ b/src/main/cpp/src/cast_string.hpp @@ -115,8 +115,9 @@ std::unique_ptr string_to_float( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::unique_ptr float_to_string( +std::unique_ptr format_float( cudf::column_view const& input, + int d, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu new file mode 100644 index 0000000000..5972d108a9 --- /dev/null +++ b/src/main/cpp/src/format_float.cu @@ -0,0 +1,576 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cast_string.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace cudf; + +namespace spark_rapids_jni { + +namespace detail { +namespace { + +struct ftos_converter { + // significant digits is independent of scientific notation range + // digits more than this may require using long values instead of ints + // static constexpr unsigned int significant_digits_float = 9; + // static constexpr unsigned int significant_digits_double = 17; + // static constexpr unsigned int eight_digits = 100000000; // 1x10^8 + static constexpr unsigned long long sixteen_digits = 10000000000000000; // 1x10^16 + // Range of numbers here is for normalizing the value. + // If the value is above or below the following limits, the output is converted to + // scientific notation in order to show (at most) the number of significant digits. 
+ static constexpr double upper_limit = 10000000; // Spark's max is 1x10^7 + static constexpr double lower_limit = 0.001; // printf uses scientific notation below this + // Tables for doing normalization: converting to exponent form + // IEEE double float has maximum exponent of 305 so these should cover everything + double const upper10[9] = {10, 100, 10000, 1e8, 1e16, 1e32, 1e64, 1e128, 1e256}; + double const lower10[9] = {.1, .01, .0001, 1e-8, 1e-16, 1e-32, 1e-64, 1e-128, 1e-256}; + double const blower10[9] = {1.0, .1, .001, 1e-7, 1e-15, 1e-31, 1e-63, 1e-127, 1e-255}; + + // // utility for quickly converting known integer range to character array + // __device__ char* int2str(int value, char* output) + // { + // if (value == 0) { + // *output++ = '0'; + // return output; + // } + // char buffer[significant_digits_double]; // should be big-enough for significant digits + // char* ptr = buffer; + // while (value > 0) { + // *ptr++ = (char)('0' + (value % 10)); + // value /= 10; + // } + // while (ptr != buffer) + // *output++ = *--ptr; // 54321 -> 12345 + // return output; + // } + + // // Add separator every 3 digits for integer part + // __device__ char* format_int(int value, char* output) + // { + // if (value == 0) { + // *output++ = '0'; + // return output; + // } + // char buffer[30]; // TODO: avoid hard-coded size + // char* ptr = buffer; + // int sep_count = 0; + // while (value > 0) { + // if (sep_count == 3) { + // *ptr++ = ','; + // sep_count = 0; + // } + // *ptr++ = (char)('0' + (value % 10)); + // value /= 10; + // sep_count++; + // } + // while (ptr != buffer) + // *output++ = *--ptr; // 543,21 -> 12,345 + // return output; + // } + + __device__ char* ll2str(long long n, char* result) { + if (n == 0) { + *result++ = '0'; + return result; + } + char buffer[18]; // should be big-enough for significant digits + char* ptr = buffer; + while (n > 0) { + *ptr++ = (char)('0' + (n % 10)); + n /= 10; + } + while (ptr != buffer) + *result++ = *--ptr; // 54321 
-> 12345 + return result; + } + + // __device__ char* format_ll(long long n, char* result, char* dec_ptr, int& dec_pos, int exp10) { + // if (n == 0) { + // *result++ = '0'; + // return result; + // } + // int sep_count = 0; + // char buffer[305]; // should be big-enough for significant digits + // char* ptr = buffer; + // while (n > 0) { + // if (sep_count == 3) { + // *ptr++ = ','; + // sep_count = 0; + // } + // *ptr++ = (char)('0' + (n % 10)); + // n /= 10; + // sep_count++; + // } + // int len = dec_ptr - dec_str; + // int dec_pos = 0; + // while (exp10--) { + // if (sep_count == 3) { + // *ptr++ = ','; + // sep_count = 0; + // } + // if (dec_pos < len) { + // *ptr++ = dec_str[dec_pos++]; + // } else { + // *ptr++ = '0'; + // } + // sep_count++; + // } + // while (ptr != buffer) { + // *result++ = *--ptr; // 54321 -> 12345 + // } + // return result; + // } + + // /** + // * @brief Dissect a float value into integer, decimal, and exponent components. + // * + // * @return The number of decimal places. + // */ + // __device__ int dissect_value(double value, + // int digits, + // unsigned int& integer, + // unsigned long long& decimal, + // int& exp10, + // bool is_float = false) + // { + // // normalize step puts value between lower-limit and upper-limit + // // by adjusting the exponent up or down + // exp10 = 0; + // if (value > upper_limit) { + // int fx = 256; + // for (int idx = 8; idx >= 0; --idx) { + // if (value >= upper10[idx]) { + // value *= lower10[idx]; + // exp10 += fx; + // } + // fx = fx >> 1; + // } + // } else if ((value > 0.0) && (value < lower_limit)) { + // int fx = 256; + // for (int idx = 8; idx >= 0; --idx) { + // if (value < blower10[idx]) { + // value *= upper10[idx]; + // exp10 -= fx; + // } + // fx = fx >> 1; + // } + // } + // // + // // int decimal_places = significant_digits - (exp10? 2 : 1); + // // unsigned long long max_digits = (exp10? fifteen_digits : sixteen_digits); + // int decimal_places = (is_float? 
significant_digits_float: significant_digits_double) - 1; + // unsigned long long max_digits = (is_float? eight_digits: sixteen_digits); + // double temp_value = value; + // while (temp_value < 1.0 && temp_value > 0.0) { + // max_digits *= 10; + // temp_value *= 10.0; + // decimal_places++; + // } + // integer = (unsigned int)value; + // for (unsigned int i = integer; i >= 10; i /= 10) { + // --decimal_places; + // max_digits /= 10; + // } + // double diff = value - (double)integer; + // double remainder = diff * (double)max_digits; + // decimal = (unsigned long long)remainder; + // remainder -= (double)decimal; + // decimal += (unsigned long long)(2.0 * remainder); // round up + // if (decimal >= max_digits) { + // decimal = 0; + // ++integer; + // if (exp10 && (integer >= 10)) { + // ++exp10; + // integer = 1; + // } + // } + // // + // while ((decimal % 10) == 0 && (decimal_places > 0)) { + // decimal /= 10; + // --decimal_places; + // } + // return decimal_places; + // } + + /** + * @brief Main kernel method for converting float value to char output array. + * + * Output need not be more than (significant_digits + 7) bytes: + * 7 = 1 sign, 1 decimal point, 1 exponent ('e'), 1 exponent-sign, 3 digits for exponent + * + * @param value Float value to convert. + * @param output Memory to write output characters. + * @return Number of bytes written. + */ + __device__ int format_float(double value, int digits, char* output, bool is_float) + { + // check for valid value + if (std::isnan(value)) { + memcpy(output, "NaN", 3); + return 3; + } + bool const bneg = [&value]() { + if (signbit(value)) { // handles -0.0 too + value = -value; + return true; + } else { + return false; + } + }(); + if (std::isinf(value)) { + if (bneg) { + memcpy(output, "-Infinity", 9); + } else { + memcpy(output, "Infinity", 8); + } + return bneg ? 
9 : 8; + } + + // dissect value into components + // unsigned int integer = 0; + // unsigned long long decimal = 0; + int exp10 = 0; + // int decimal_places = dissect_value(value, digits, integer, decimal, exp10, is_float); + // + // now build the string from the + // components: sign, integer, decimal, exp10, decimal_places + // + // sign + char* ptr = output; + if (bneg) *ptr++ = '-'; + // int exp10 = 0; + if (value > upper_limit) { + int fx = 256; + for (int idx = 8; idx >= 0; --idx) { + if (value >= upper10[idx]) { + value *= lower10[idx]; + exp10 += fx; + } + fx = fx >> 1; + } + } else if ((value > 0.0) && (value < lower_limit)) { + int fx = 256; + for (int idx = 8; idx >= 0; --idx) { + if (value < blower10[idx]) { + value *= upper10[idx]; + exp10 -= fx; + } + fx = fx >> 1; + } + } + // x * 10^exp10 + char dec_str[18]; + if (exp10 > 0) { + long long int_part = static_cast(value); + double decimal_double = value - double(int_part); + long long dec_part = decimal_double * sixteen_digits; + char* dec_ptr = ll2str(dec_part, dec_str); + // ptr = format_ll(int_part, ptr, dec_ptr, dec_pos, exp10); + if (int_part == 0) { + *ptr++ = '0'; + } else { + int sep_count = 0; + char buffer[23]; // should be big-enough for significant digits + char* buf_ptr = buffer; + while (int_part > 0) { + if (sep_count == 3) { + *buf_ptr++ = ','; + sep_count = 0; + } + *buf_ptr++ = (char)('0' + (int_part % 10)); + int_part /= 10; + sep_count++; + } + while (buf_ptr != buffer) { + *ptr++ = *--buf_ptr; // 54321 -> 12345 + } + int len = dec_ptr - dec_str; + int dec_pos = 0; + while (exp10--) { + if (sep_count == 3) { + *ptr++ = ','; + sep_count = 0; + } + if (dec_pos < len) { + *ptr++ = dec_str[dec_pos++]; + } else { + *ptr++ = '0'; + } + sep_count++; + } + *ptr++ = '.'; + while (digits--) { + if (dec_pos < len) { + *ptr++ = dec_str[dec_pos++]; + } else { + *ptr++ = '0'; + } + } + } + } else if (exp10 == 0) { + long long int_part = static_cast(value); + double decimal_double = value - 
double(int_part); + long long dec_part = decimal_double * sixteen_digits; + if (int_part == 0) { + *ptr++ = '0'; + } else { + int sep_count = 0; + char buffer[23]; // should be big-enough for significant digits + char* buf_ptr = buffer; + while (int_part > 0) { + if (sep_count == 3) { + *buf_ptr++ = ','; + sep_count = 0; + } + *buf_ptr++ = (char)('0' + (int_part % 10)); + int_part /= 10; + sep_count++; + } + while (buf_ptr != buffer) { + *ptr++ = *--buf_ptr; // 54321 -> 12345 + } + } + // ptr = ll2str(int_part, ptr); + *ptr++ = '.'; + char* dec_ptr = ll2str(dec_part, dec_str); + int len = dec_ptr - dec_str; + int dec_pos = 0; + while (digits--) { + if (dec_pos < len) { + *ptr++ = dec_str[dec_pos++]; + } else { + *ptr++ = '0'; + } + } + } else { + // exp10 < 0 + *ptr++ = '0'; + *ptr++ = '.'; + long long dec_part = value * sixteen_digits; + char* dec_ptr = ll2str(dec_part, dec_str); + int len = dec_ptr - dec_str; + int dec_pos = 0; + while (digits--) { + if (exp10 < -1) { + *ptr++ = '0'; + exp10++; + } else if (dec_pos < len) { + *ptr++ = dec_str[dec_pos++]; + } else { + *ptr++ = '0'; + } + } + } + return int(ptr - output); + } + + __device__ int int_part_len(double value) + { + int exp10 = 0; + if (value > upper_limit) { + int fx = 256; + for (int idx = 8; idx >= 0; --idx) { + if (value >= upper10[idx]) { + value *= lower10[idx]; + exp10 += fx; + } + fx = fx >> 1; + } + } + int cnt = 0; + if (value == 0.0) { + return 1; + } + while (value >= 1.0) { + value /= 10.0; + ++cnt; + } + if (exp10) { + cnt += exp10; + } + return cnt; + } + + /** + * @brief Compute how man bytes are needed to hold the output string. + * + * @param value Float value to convert. + * @return Number of bytes required. 
+ */ + __device__ int compute_ftos_size(double value, int digits, bool is_float) + { + if (std::isnan(value)) return 3; // NaN + bool const bneg = [&value]() { + if (signbit(value)) { // handles -0.0 too + value = -value; + return true; + } else { + return false; + } + }(); + if (std::isinf(value)) return 8 + (int)bneg; // Inf + + int int_len = int_part_len(value); + // sign + int count = (int)bneg; + // integer + count += int_len; + // decimal + count += 1 + digits; + int sep_count = 0; + while (int_len > 0) { // speedup with math? + if (sep_count == 3) { + ++count; + sep_count = 0; + } + int_len--; + ++sep_count; + } // log10(integer) + return count; + } +}; + +template +struct format_float_fn { + column_device_view d_floats; + int digits; + size_type* d_offsets; + char* d_chars; + + __device__ size_type compute_output_size(FloatType value, int digits) + { + ftos_converter fts; + bool is_float = std::is_same_v; + return static_cast(fts.compute_ftos_size(static_cast(value), digits, is_float)); + } + + __device__ void format_float(size_type idx, int digits) + { + FloatType value = d_floats.element(idx); + ftos_converter fts; + bool is_float = std::is_same_v; + fts.format_float(static_cast(value), digits, d_chars + d_offsets[idx], is_float); + } + + __device__ void operator()(size_type idx) + { + if (d_floats.is_null(idx)) { + if (d_chars == nullptr) { d_offsets[idx] = 0; } + return; + } + if (d_chars != nullptr) { + format_float(idx, digits); + } else { + d_offsets[idx] = compute_output_size(d_floats.element(idx), digits); + } + } +}; + +/** + * @brief This dispatch method is for converting floats into strings. + * + * The template function declaration ensures only float types are allowed. 
+ */ +struct dispatch_format_float_fn { + template >* = nullptr> + std::unique_ptr operator()(column_view const& floats, + int digits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + size_type strings_count = floats.size(); + auto column = column_device_view::create(floats, stream); + auto d_column = *column; + + // copy the null mask + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr); + + auto [offsets, chars] = + cudf::strings::detail::make_strings_children(format_float_fn{d_column, digits}, strings_count, stream, mr); + + return make_strings_column(strings_count, + std::move(offsets), + std::move(chars), + floats.null_count(), + std::move(null_mask)); + } + + // non-float types throw an exception + template >* = nullptr> + std::unique_ptr operator()(column_view const&, + int, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const + { + CUDF_FAIL("Values for format_float function must be a float type."); + } +}; + +} // namespace + +// This will convert all float column types into a strings column. 
+// Returns an empty STRING column for empty input; otherwise dispatches on the
+// column's element type (non-float types fail inside the dispatch functor).
+std::unique_ptr<column> format_float(column_view const& floats,
+                                     int digits,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
+{
+  size_type strings_count = floats.size();
+  if (strings_count == 0) return make_empty_column(type_id::STRING);
+
+  return type_dispatcher(floats.type(), dispatch_format_float_fn{}, floats, digits, stream, mr);
+}
+
+}  // namespace detail
+
+// external API
+// Formats each float in `floats` as a string with `digits` fraction digits;
+// thin wrapper that adds a profiling range around detail::format_float.
+std::unique_ptr<column> format_float(column_view const& floats,
+                                     int digits,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::format_float(floats, digits, stream, mr);
+}
+
+}  // namespace spark_rapids_jni
\ No newline at end of file diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index 76f0b52912..345a669092 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -51,8 +51,8 @@ ConfigureTest(CAST_STRING ConfigureTest(CAST_DECIMAL_TO_STRING cast_decimal_to_string.cpp) -ConfigureTest(CAST_FLOAT_TO_STRING - cast_float_to_string.cpp) +ConfigureTest(FORMAT_FLOAT + format_float.cpp) ConfigureTest(DATETIME_REBASE datetime_rebase.cpp) diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/format_float.cpp similarity index 86% rename from src/main/cpp/tests/cast_float_to_string.cpp rename to src/main/cpp/tests/format_float.cpp index a86d988724..3e03578f4c 100644 --- a/src/main/cpp/tests/cast_float_to_string.cpp +++ b/src/main/cpp/tests/format_float.cpp @@ -31,9 +31,9 @@ using namespace cudf; constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; -struct FloatToStringTests : public cudf::test::BaseFixture {}; +struct FormatFloatTests : public cudf::test::BaseFixture {}; -TEST_F(FloatToStringTests, FromFloats32) +TEST_F(FormatFloatTests, FormatFloats32) { std::vector h_floats{100, 654321.25, @@ -45,14 +45,14 @@ TEST_F(FloatToStringTests, FromFloats32) 123456789012.34, -0.0}; std::vector
h_expected{ - "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.3954222323279E11", "-0.0"}; + "100.0", "654,321.25", "-12,761.125", "0.0", "5.0", "-4.0", "NaN", "8.3954222323279E11", "-0.0"}; cudf::test::fixed_width_column_wrapper floats( h_floats.begin(), h_floats.end(), thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream()); + auto results = spark_rapids_jni::format_float(floats, 5, cudf::get_default_stream()); cudf::test::strings_column_wrapper expected( h_expected.begin(), @@ -62,7 +62,7 @@ TEST_F(FloatToStringTests, FromFloats32) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); } -TEST_F(FloatToStringTests, FromFloats64) +TEST_F(FormatFloatTests, FormatFloats64) { std::vector h_floats{100, 654321.25, @@ -76,7 +76,7 @@ TEST_F(FloatToStringTests, FromFloats64) 839542223232.794248339, -0.0}; std::vector h_expected{ - "100.0", "654321.25", "-12761.125", "1.1234567891234568", "1.234567891234568E-19", + "100.0", "654,321.25", "-12,761.125", "1.1234567891234568", "1.234567891234568E-19", "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"}; cudf::test::fixed_width_column_wrapper floats( @@ -84,7 +84,7 @@ TEST_F(FloatToStringTests, FromFloats64) h_floats.end(), thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream()); + auto results = spark_rapids_jni::format_float(floats, 5, cudf::get_default_stream()); cudf::test::strings_column_wrapper expected( h_expected.begin(), diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index 3002e1cdab..ab07dc39dc 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -81,13 
+81,13 @@ public static ColumnVector toDecimal(ColumnView cv, boolean ansiMode, boolean st } /** - * Convert a float column to a string column. + * Convert a float column to a formatted string column. * * @param cv the column data to process * @return the converted column */ - public static ColumnVector fromFloat(ColumnView cv) { - return new ColumnVector(fromFloat(cv.getNativeView())); + public static ColumnVector formatFloat(ColumnView cv, int d) { + return new ColumnVector(formatFloat(cv.getNativeView(), d)); } /** @@ -147,7 +147,7 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled int precision, int scale); private static native long toFloat(long nativeColumnView, boolean ansi_enabled, int dtype); private static native long fromDecimal(long nativeColumnView); - private static native long fromFloat(long nativeColumnView); + private static native long formatFloat(long nativeColumnView, int d); private static native long toIntegersWithBase(long nativeColumnView, int base, boolean ansiEnabled, int dtype); private static native long fromIntegersWithBase(long nativeColumnView, int base); From 007cf5ebc74a623e8f82df4feafd2416c8397af9 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 6 Nov 2023 15:58:47 +0800 Subject: [PATCH 08/54] rewrite the solution with ryu Signed-off-by: Haoyang Li --- src/main/cpp/src/cast_float_to_string.cu | 1348 ++++++++++++++++--- src/main/cpp/tests/cast_float_to_string.cpp | 2 +- 2 files changed, 1127 insertions(+), 223 deletions(-) diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index d1f66f772d..a594377b4a 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -41,6 +41,11 @@ #include #include #include +#include +// #include +#include +// #include +// #include using namespace cudf; @@ -49,236 +54,1135 @@ namespace spark_rapids_jni { namespace detail { namespace { +// A floating decimal representing m * 10^e. 
+typedef struct floating_decimal_64 { + uint64_t mantissa; + // Decimal exponent's range is -324 to 308 + // inclusive, and can fit in a short if needed. + int32_t exponent; +} floating_decimal_64; + +// A floating decimal representing m * 10^e. +typedef struct floating_decimal_32 { + uint32_t mantissa; + // Decimal exponent's range is -45 to 38 + // inclusive, and can fit in a short if needed. + int32_t exponent; +} floating_decimal_32; + struct ftos_converter { - // significant digits is independent of scientific notation range - // digits more than this may require using long values instead of ints - static constexpr unsigned int significant_digits_float = 9; - static constexpr unsigned int significant_digits_double = 17; - static constexpr unsigned int eight_digits = 100000000; // 1x10^8 - static constexpr unsigned long long sixteen_digits = 10000000000000000; // 1x10^16 - // Range of numbers here is for normalizing the value. - // If the value is above or below the following limits, the output is converted to - // scientific notation in order to show (at most) the number of significant digits. 
- static constexpr double upper_limit = 10000000; // Spark's max is 1x10^7 - static constexpr double lower_limit = 0.001; // printf uses scientific notation below this - // Tables for doing normalization: converting to exponent form - // IEEE double float has maximum exponent of 305 so these should cover everything - double const upper10[9] = {10, 100, 10000, 1e8, 1e16, 1e32, 1e64, 1e128, 1e256}; - double const lower10[9] = {.1, .01, .0001, 1e-8, 1e-16, 1e-32, 1e-64, 1e-128, 1e-256}; - double const blower10[9] = {1.0, .1, .001, 1e-7, 1e-15, 1e-31, 1e-63, 1e-127, 1e-255}; - - // utility for quickly converting known integer range to character array - __device__ char* int2str(int value, char* output) - { - if (value == 0) { - *output++ = '0'; - return output; - } - char buffer[significant_digits_double]; // should be big-enough for significant digits - char* ptr = buffer; - while (value > 0) { - *ptr++ = (char)('0' + (value % 10)); - value /= 10; - } - while (ptr != buffer) - *output++ = *--ptr; // 54321 -> 12345 - return output; - } - - /** - * @brief Dissect a float value into integer, decimal, and exponent components. - * - * @return The number of decimal places. - */ - __device__ int dissect_value(double value, - unsigned int& integer, - unsigned long long& decimal, - int& exp10, - bool is_float = false) - { - // normalize step puts value between lower-limit and upper-limit - // by adjusting the exponent up or down - exp10 = 0; - if (value > upper_limit) { - int fx = 256; - for (int idx = 8; idx >= 0; --idx) { - if (value >= upper10[idx]) { - value *= lower10[idx]; - exp10 += fx; + + // These tables are generated by PrintDoubleLookupTable. 
+ static constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125; + static constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125; + static constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64); + static constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64); + static constexpr unsigned int DOUBLE_MANTISSA_BITS = 52; + static constexpr unsigned int DOUBLE_EXPONENT_BITS = 11; + static constexpr unsigned int DOUBLE_BIAS = 1023; + static constexpr unsigned int FLOAT_MANTISSA_BITS = 23; + static constexpr unsigned int FLOAT_EXPONENT_BITS = 8; + static constexpr unsigned int FLOAT_BIAS = 127; + + + // Returns the number of decimal digits in v, which must not contain more than 9 digits. + __device__ inline uint32_t decimalLength9(const uint32_t v) { + // Function precondition: v is not a 10-digit number. + // (f2s: 9 digits are sufficient for round-tripping.) + // (d2fixed: We print 9-digit blocks.) + assert(v < 1000000000); + if (v >= 100000000) { return 9; } + if (v >= 10000000) { return 8; } + if (v >= 1000000) { return 7; } + if (v >= 100000) { return 6; } + if (v >= 10000) { return 5; } + if (v >= 1000) { return 4; } + if (v >= 100) { return 3; } + if (v >= 10) { return 2; } + return 1; + } + + const uint64_t DOUBLE_POW5_INV_SPLIT2[15][2] = { + { 1u, 2305843009213693952u }, + { 5955668970331000884u, 1784059615882449851u }, + { 8982663654677661702u, 1380349269358112757u }, + { 7286864317269821294u, 2135987035920910082u }, + { 7005857020398200553u, 1652639921975621497u }, + { 17965325103354776697u, 1278668206209430417u }, + { 8928596168509315048u, 1978643211784836272u }, + { 10075671573058298858u, 1530901034580419511u }, + { 597001226353042382u, 1184477304306571148u }, + { 1527430471115325346u, 1832889850782397517u }, + { 12533209867169019542u, 1418129833677084982u }, + { 5577825024675947042u, 2194449627517475473u }, + { 11006974540203867551u, 1697873161311732311u }, + { 10313493231639821582u, 1313665730009899186u }, + { 
12701016819766672773u, 2032799256770390445u } + }; + + const uint32_t POW5_INV_OFFSETS[19] = { + 0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555, + 0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054, + 0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554, + 0x00000000 + }; + + const uint64_t DOUBLE_POW5_SPLIT2[13][2] = { + { 0u, 1152921504606846976u }, + { 0u, 1490116119384765625u }, + { 1032610780636961552u, 1925929944387235853u }, + { 7910200175544436838u, 1244603055572228341u }, + { 16941905809032713930u, 1608611746708759036u }, + { 13024893955298202172u, 2079081953128979843u }, + { 6607496772837067824u, 1343575221513417750u }, + { 17332926989895652603u, 1736530273035216783u }, + { 13037379183483547984u, 2244412773384604712u }, + { 1605989338741628675u, 1450417759929778918u }, + { 9630225068416591280u, 1874621017369538693u }, + { 665883850346957067u, 1211445438634777304u }, + { 14931890668723713708u, 1565756531257009982u } + }; + + const uint32_t POW5_OFFSETS[21] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995, + 0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540, + 0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151, + 0x55559155, 0x51405555, 0x00000105 + }; + + static constexpr uint32_t POW5_TABLE_SIZE = 26; + const uint64_t DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = { + 1ull, 5ull, 25ull, 125ull, 625ull, 3125ull, 15625ull, 78125ull, 390625ull, + 1953125ull, 9765625ull, 48828125ull, 244140625ull, 1220703125ull, 6103515625ull, + 30517578125ull, 152587890625ull, 762939453125ull, 3814697265625ull, + 19073486328125ull, 95367431640625ull, 476837158203125ull, + 2384185791015625ull, 11920928955078125ull, 59604644775390625ull, + 298023223876953125ull //, 1490116119384765625ull + }; + + // Returns e == 0 ? 1 : [log_2(5^e)]; requires 0 <= e <= 3528. 
+  __device__ inline int32_t log2pow5(const int32_t e) {
+    // This approximation works up to the point that the multiplication overflows at e = 3529.
+    // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater
+    // than 2^9297.
+    assert(e >= 0);
+    assert(e <= 3528);
+    // 1217359 / 2^19 is a fixed-point approximation of log_2(5).
+    return (int32_t) ((((uint32_t) e) * 1217359) >> 19);
+  }
+
+  // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528.
+  __device__ inline int32_t pow5bits(const int32_t e) {
+    // Same fixed-point approximation (and the same range checks) as log2pow5,
+    // plus one; delegating keeps the magic constant in a single place.
+    return log2pow5(e) + 1;
+  }
+
+  // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528.
+  __device__ inline int32_t ceil_log2pow5(const int32_t e) {
+    return log2pow5(e) + 1;
+  }
+
+  // Returns floor(log_10(2^e)); requires 0 <= e <= 1650.
+  __device__ inline uint32_t log10Pow2(const int32_t e) {
+    // The first value this approximation fails for is 2^1651 which is just greater than 10^297.
+    assert(e >= 0);
+    assert(e <= 1650);
+    // 78913 / 2^18 is a fixed-point approximation of log_10(2).
+    return (((uint32_t) e) * 78913) >> 18;
+  }
+
+  // Returns floor(log_10(5^e)); requires 0 <= e <= 2620.
+  __device__ inline uint32_t log10Pow5(const int32_t e) {
+    // The first value this approximation fails for is 5^2621 which is just greater than 10^1832.
+    assert(e >= 0);
+    assert(e <= 2620);
+    // 732923 / 2^20 is a fixed-point approximation of log_10(5).
+    return (((uint32_t) e) * 732923) >> 20;
+  }
+
+  // Returns the number of times 5 divides value; requires value != 0
+  // (zero is divisible by 5 indefinitely, so the loop would never terminate).
+  __device__ inline uint32_t pow5factor_32(uint32_t value) {
+    // Checked once: a non-zero value stays non-zero after an exact division by 5.
+    assert(value != 0);
+    uint32_t count = 0;
+    for (;;) {
+      const uint32_t q = value / 5;
+      const uint32_t r = value % 5;
+      if (r != 0) {
+        break;
+      }
+      value = q;
+      ++count;
+    }
+    return count;
+  }
+
+  // Returns true if value is divisible by 5^p.
+  __device__ inline bool multipleOfPowerOf5_32(const uint32_t value, const uint32_t p) {
+    return pow5factor_32(value) >= p;
+  }
+
+  // Returns true if value is divisible by 2^p; requires p < 32.
+  __device__ inline bool multipleOfPowerOf2_32(const uint32_t value, const uint32_t p) {
+    // Shifting a 32-bit value by 32 or more is undefined; mirror the check done
+    // by the 64-bit multipleOfPowerOf2.
+    assert(p < 32);
+    // __builtin_ctz doesn't appear to be faster here.
+    return (value & ((1u << p) - 1)) == 0;
+  }
+
+  // Returns (m * factor) >> shift, truncated to 32 bits; requires shift > 32.
+  // It seems to be slightly faster to avoid uint128_t here, although the
+  // generated code for uint128_t looks slightly nicer.
+  __device__ inline uint32_t mulShift32(const uint32_t m, const uint64_t factor, const int32_t shift) {
+    assert(shift > 32);
+
+    // The casts here help MSVC to avoid calls to the __allmul library
+    // function.
+    const uint32_t factorLo = (uint32_t)(factor);
+    const uint32_t factorHi = (uint32_t)(factor >> 32);
+    const uint64_t bits0 = (uint64_t)m * factorLo;
+    const uint64_t bits1 = (uint64_t)m * factorHi;
+
+    // The low 32 bits of bits0 are shifted out anyway (shift > 32), so only its
+    // high half contributes to the result.
+    const uint64_t sum = (bits0 >> 32) + bits1;
+    const uint64_t shiftedSum = sum >> (shift - 32);
+    assert(shiftedSum <= UINT32_MAX);
+    return (uint32_t) shiftedSum;
+  }
+
+  // Writes the text for a special value into result and returns the number of
+  // characters written (no terminator is appended):
+  //   mantissa -> "NaN" (sign ignored)
+  //   exponent -> "Infinity", preceded by '-' when sign is set
+  //   neither  -> "0.0", preceded by '-' when sign is set
+  __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa) {
+    if (mantissa) {
+      memcpy(result, "NaN", 3);
+      return 3;
+    }
+    if (sign) {
+      result[0] = '-';
+    }
+    if (exponent) {
+      memcpy(result + sign, "Infinity", 8);
+      return sign + 8;
+    }
+    memcpy(result + sign, "0.0", 3);
+    return sign + 3;
+  }
+
+  // Returns the number of characters copy_special_str writes for the same flags.
+  __device__ inline int special_str_size(const bool sign, const bool exponent, const bool mantissa) {
+    if (mantissa) {
+      return 3;
+    }
+    if (exponent) {
+      return sign + 8;
+    }
+    return sign + 3;
+  }
+
+  // Returns the bit pattern of f as an unsigned 32-bit integer.
+  __device__ inline uint32_t float_to_bits(const float f) {
+    uint32_t bits = 0;
+    memcpy(&bits, &f, sizeof(float));
+    return bits;
+  }
+
+  // Returns the bit pattern of d as an unsigned 64-bit integer.
+  __device__ inline uint64_t double_to_bits(const double d) {
+    uint64_t bits = 0;
+    memcpy(&bits, &d, sizeof(double));
+    return bits;
+  }
+
+  // Full 64x64 -> 128-bit multiply: returns the low 64 bits of a * b and stores
+  // the high 64 bits in *productHi.
+  __device__ inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
+    // __umul64hi is the CUDA intrinsic for the upper half of the product; the
+    // lower half is the ordinary wrapping 64-bit multiplication.
+    *productHi = __umul64hi(a, b);
+    return a * b;
+  }
+
+  // Returns the low 64 bits of the 128-bit value (hi:lo) shifted right by dist;
+  // requires 0 < dist < 64.
+  __device__ inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
+    // We don't need to handle the case dist >= 64 here (see above).
+    assert(dist < 64);
+    assert(dist > 0);
+    return (hi << (64 - dist)) | (lo >> dist);
+  }
+
+  __device__ inline uint64_t div5(const uint64_t x) {
+    return x / 5;
+  }
+
+  __device__ inline uint64_t div10(const uint64_t x) {
+    return x / 10;
+  }
+
+  __device__ inline uint64_t div100(const uint64_t x) {
+    return x / 100;
+  }
+
+  __device__ inline uint64_t div1e8(const uint64_t x) {
+    return x / 100000000;
+  }
+
+  __device__ inline uint64_t div1e9(const uint64_t x) {
+    return x / 1000000000;
+  }
+
+  // Returns x mod 10^9.
+  __device__ inline uint32_t mod1e9(const uint64_t x) {
+    return (uint32_t) (x - 1000000000 * div1e9(x));
+  }
+
+  // Returns the number of times 5 divides value; requires value != 0.
+  __device__ inline uint32_t pow5Factor(uint64_t value) {
+    const uint64_t m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64)
+    const uint64_t n_div_5 = 3689348814741910323u;  // #{ n | n = 0 (mod 2^64) } = 2^64 / 5
+    // Checked once: multiplying by an odd constant modulo 2^64 never turns a
+    // non-zero value into zero.
+    assert(value != 0);
+    uint32_t count = 0;
+    for (;;) {
+      // Multiplying by the modular inverse of 5 yields value / 5 exactly when 5
+      // divides value; otherwise the product lands above 2^64 / 5.
+      value *= m_inv_5;
+      if (value > n_div_5)
+        break;
+      ++count;
+    }
+    return count;
+  }
+
+  // Returns true if value is divisible by 5^p.
+  __device__ inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) {
+    // I tried a case distinction on p, but there was no performance difference.
+    return pow5Factor(value) >= p;
+  }
+
+  // Returns true if value is divisible by 2^p.
+  __device__ inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) {
+    assert(value != 0);
+    assert(p < 64);
+    // __builtin_ctzll doesn't appear to be faster here.
+    return (value & ((1ull << p) - 1)) == 0;
+  }
+
+  // Returns (m * M) >> j, where M is the 128-bit value mul[1]:mul[0]. The shift
+  // is forwarded to shiftright128 as j - 64, so 64 < j < 128 is required.
+  __device__ inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
+    // m is maximum 55 bits
+    uint64_t high1;                                   // 128
+    const uint64_t low1 = umul128(m, mul[1], &high1); // 64
+    uint64_t high0;                                   // 64
+    umul128(m, mul[0], &high0);                       // 0
+    const uint64_t sum = high0 + low1;
+    if (sum < high0) {
+      ++high1; // overflow into high1
+    }
+    return shiftright128(sum, high1, j - 64);
+  }
+
+  // Applies mulShift64 to 4m (returned), 4m + 2 (stored in *vp) and
+  // 4m - 1 - mmShift (stored in *vm).
+  __device__ inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,
+    uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
+    *vp = mulShift64(4 * m + 2, mul, j);
+    *vm = mulShift64(4 * m - 1 - mmShift, mul, j);
+    return mulShift64(4 * m, mul, j);
+  }
+
+  // Computes 5^i in the form required by Ryu, and stores it in the given pointer.
+  __device__ inline void double_computePow5(const uint32_t i, uint64_t* const result) {
+    // Split i into a multiple of POW5_TABLE_SIZE (looked up in DOUBLE_POW5_SPLIT2)
+    // and a remainder (looked up in DOUBLE_POW5_TABLE).
+    const uint32_t base = i / POW5_TABLE_SIZE;
+    const uint32_t base2 = base * POW5_TABLE_SIZE;
+    const uint32_t offset = i - base2;
+    const uint64_t* const mul = DOUBLE_POW5_SPLIT2[base];
+    if (offset == 0) {
+      result[0] = mul[0];
+      result[1] = mul[1];
+      return;
+    }
+    const uint64_t m = DOUBLE_POW5_TABLE[offset];
+    uint64_t high1;
+    const uint64_t low1 = umul128(m, mul[1], &high1);
+    uint64_t high0;
+    const uint64_t low0 = umul128(m, mul[0], &high0);
+    const uint64_t sum = high0 + low1;
+    if (sum < high0) {
+      ++high1; // overflow into high1
+    }
+    // high1 | sum | low0
+    const uint32_t delta = pow5bits(i) - pow5bits(base2);
+    // POW5_OFFSETS packs a two-bit correction per exponent, 16 exponents per word.
+    result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
+    result[1] = shiftright128(sum, high1, delta);
+  }
+
+  // Computes 5^-i in the form required by Ryu, and stores it in the given pointer.
+  __device__ inline void double_computeInvPow5(const uint32_t i, uint64_t* const result) {
+    // Round i up to the next multiple of POW5_TABLE_SIZE; that multiple is looked
+    // up in DOUBLE_POW5_INV_SPLIT2 and then scaled by 5^offset from
+    // DOUBLE_POW5_TABLE.
+    const uint32_t base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE;
+    const uint32_t base2 = base * POW5_TABLE_SIZE;
+    const uint32_t offset = base2 - i;
+    const uint64_t* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2
+    if (offset == 0) {
+      result[0] = mul[0];
+      result[1] = mul[1];
+      return;
+    }
+    const uint64_t m = DOUBLE_POW5_TABLE[offset];
+    uint64_t high1;
+    const uint64_t low1 = umul128(m, mul[1], &high1);
+    uint64_t high0;
+    // NOTE(review): mul[0] - 1 presumably removes the +1 that the stored inverse
+    // multipliers carry before scaling; a +1 is added back to result[0] below.
+    const uint64_t low0 = umul128(m, mul[0] - 1, &high0);
+    const uint64_t sum = high0 + low1;
+    if (sum < high0) {
+      ++high1; // overflow into high1
+    }
+    // high1 | sum | low0
+    const uint32_t delta = pow5bits(base2) - pow5bits(i);
+    // POW5_INV_OFFSETS packs a two-bit correction per exponent, 16 exponents per word.
+    result[0] = shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
+    result[1] = shiftright128(sum, high1, delta);
+  }
+
+  // Returns mulShift32(m, F, j), where F is the upper 64 bits of the value
+  // produced by double_computeInvPow5(q), plus one.
+  __device__ inline uint32_t mulPow5InvDivPow2(const uint32_t m, const uint32_t q, const int32_t j) {
+    // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double lookup
+    // table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely on the
+    // fact that the added 1 that's already stored in the table never overflows into the upper 64 bits.
+    uint64_t pow5[2];
+    double_computeInvPow5(q, pow5);
+    return mulShift32(m, pow5[1] + 1, j);
+  }
+
+  // Returns mulShift32(m, F, j), where F is the upper 64 bits of the value
+  // produced by double_computePow5(i).
+  __device__ inline uint32_t mulPow5divPow2(const uint32_t m, const uint32_t i, const int32_t j) {
+    uint64_t pow5[2];
+    double_computePow5(i, pow5);
+    return mulShift32(m, pow5[1], j);
+  }
+
+  __device__ inline uint32_t decimalLength17(const uint64_t v) {
+    // This is slightly faster than a loop.
+    // The average output length is 16.38 digits, so we check high-to-low.
+    // Function precondition: v is not an 18, 19, or 20-digit number.
+    // (17 digits are sufficient for round-tripping.)
+ assert(v < 100000000000000000L); + if (v >= 10000000000000000L) { return 17; } + if (v >= 1000000000000000L) { return 16; } + if (v >= 100000000000000L) { return 15; } + if (v >= 10000000000000L) { return 14; } + if (v >= 1000000000000L) { return 13; } + if (v >= 100000000000L) { return 12; } + if (v >= 10000000000L) { return 11; } + if (v >= 1000000000L) { return 10; } + if (v >= 100000000L) { return 9; } + if (v >= 10000000L) { return 8; } + if (v >= 1000000L) { return 7; } + if (v >= 100000L) { return 6; } + if (v >= 10000L) { return 5; } + if (v >= 1000L) { return 4; } + if (v >= 100L) { return 3; } + if (v >= 10L) { return 2; } + return 1; + } + + __device__ inline floating_decimal_64 d2d(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) { + int32_t e2; + uint64_t m2; + if (ieeeExponent == 0) { + // We subtract 2 so that the bounds computation has 2 additional bits. + e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + m2 = ieeeMantissa; + } else { + e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; + } + const bool even = (m2 & 1) == 0; + const bool acceptBounds = even; + + // Step 2: Determine the interval of valid decimal representations. + const uint64_t mv = 4 * m2; + // Implicit bool -> int conversion. True is 1, false is 0. + const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; + // We would compute mp and mm like this: + // uint64_t mp = 4 * m2 + 2; + // uint64_t mm = mv - 1 - mmShift; + + // Step 3: Convert to a decimal power base using 128-bit arithmetic. + uint64_t vr, vp, vm; + int32_t e10; + bool vmIsTrailingZeros = false; + bool vrIsTrailingZeros = false; + if (e2 >= 0) { + // I tried special-casing q == 0, but there was no effect on performance. + // This expression is slightly faster than max(0, log10Pow2(e2) - 1). 
+ const uint32_t q = log10Pow2(e2) - (e2 > 3); + e10 = (int32_t) q; + const int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; + const int32_t i = -e2 + (int32_t) q + k; + uint64_t pow5[2]; + double_computeInvPow5(q, pow5); + vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift); + + if (q <= 21) { + // This should use q <= 22, but I think 21 is also safe. Smaller values + // may still be safe, but it's more difficult to reason about them. + // Only one of mp, mv, and mm can be a multiple of 5, if any. + const uint32_t mvMod5 = ((uint32_t) mv) - 5 * ((uint32_t) div5(mv)); + if (mvMod5 == 0) { + vrIsTrailingZeros = multipleOfPowerOf5(mv, q); + } else if (acceptBounds) { + // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q + // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q + // <=> true && pow5Factor(mm) >= q, since e2 >= q. + vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q); + } else { + // Same as min(e2 + 1, pow5Factor(mp)) >= q. + vp -= multipleOfPowerOf5(mv + 2, q); } - fx = fx >> 1; - } - } else if ((value > 0.0) && (value < lower_limit)) { - int fx = 256; - for (int idx = 8; idx >= 0; --idx) { - if (value < blower10[idx]) { - value *= upper10[idx]; - exp10 -= fx; + } + } else { + // This expression is slightly faster than max(0, log10Pow5(-e2) - 1). + const uint32_t q = log10Pow5(-e2) - (-e2 > 1); + e10 = (int32_t) q + e2; + const int32_t i = -e2 - (int32_t) q; + const int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT; + const int32_t j = (int32_t) q - k; + + uint64_t pow5[2]; + double_computePow5(i, pow5); + vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift); + + if (q <= 1) { + // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. + // mv = 4 * m2, so it always has at least two trailing 0 bits. + vrIsTrailingZeros = true; + if (acceptBounds) { + // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. 
+ vmIsTrailingZeros = mmShift == 1; + } else { + // mp = mv + 2, so it always has at least one trailing 0 bit. + --vp; } - fx = fx >> 1; - } - } - // - // int decimal_places = significant_digits - (exp10? 2 : 1); - // unsigned long long max_digits = (exp10? fifteen_digits : sixteen_digits); - int decimal_places = (is_float? significant_digits_float: significant_digits_double) - 1; - unsigned long long max_digits = (is_float? eight_digits: sixteen_digits); - double temp_value = value; - while (temp_value < 1.0 && temp_value > 0.0) { - max_digits *= 10; - temp_value *= 10.0; - decimal_places++; - } - integer = (unsigned int)value; - for (unsigned int i = integer; i >= 10; i /= 10) { - --decimal_places; - max_digits /= 10; - } - double diff = value - (double)integer; - double remainder = diff * (double)max_digits; - decimal = (unsigned long long)remainder; - remainder -= (double)decimal; - decimal += (unsigned long long)(2.0 * remainder); // round up - if (decimal >= max_digits) { - decimal = 0; - ++integer; - if (exp10 && (integer >= 10)) { - ++exp10; - integer = 1; - } - } - // - while ((decimal % 10) == 0 && (decimal_places > 0)) { - decimal /= 10; - --decimal_places; - } - return decimal_places; - } - - /** - * @brief Main kernel method for converting float value to char output array. - * - * Output need not be more than (significant_digits + 7) bytes: - * 7 = 1 sign, 1 decimal point, 1 exponent ('e'), 1 exponent-sign, 3 digits for exponent - * - * @param value Float value to convert. - * @param output Memory to write output characters. - * @return Number of bytes written. - */ - __device__ int float_to_string(double value, char* output, bool is_float) - { - // check for valid value - if (std::isnan(value)) { - memcpy(output, "NaN", 3); - return 3; + } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here. + // We want to know if the full product has at least q trailing zeros. 
+ // We need to compute min(p2(mv), p5(mv) - e2) >= q + // <=> p2(mv) >= q && p5(mv) - e2 >= q + // <=> p2(mv) >= q (because -e2 >= q) + vrIsTrailingZeros = multipleOfPowerOf2(mv, q); + } } - bool const bneg = [&value]() { - if (signbit(value)) { // handles -0.0 too - value = -value; - return true; + + // Step 4: Find the shortest decimal representation in the interval of valid representations. + int32_t removed = 0; + uint8_t lastRemovedDigit = 0; + uint64_t output; + // On average, we remove ~2 digits. + if (vmIsTrailingZeros || vrIsTrailingZeros) { + // General case, which happens rarely (~0.7%). + for (;;) { + const uint64_t vpDiv10 = div10(vp); + const uint64_t vmDiv10 = div10(vm); + if (vpDiv10 <= vmDiv10) { + break; + } + const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10); + const uint64_t vrDiv10 = div10(vr); + const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); + vmIsTrailingZeros &= vmMod10 == 0; + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) vrMod10; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + + if (vmIsTrailingZeros) { + for (;;) { + const uint64_t vmDiv10 = div10(vm); + const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10); + if (vmMod10 != 0) { + break; + } + const uint64_t vpDiv10 = div10(vp); + const uint64_t vrDiv10 = div10(vr); + const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) vrMod10; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + } + + if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { + // Round even if the exact number is .....50..0. + lastRemovedDigit = 4; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); + } else { + // Specialized for the common case (~99.3%). 
Percentages below are relative to this. + bool roundUp = false; + const uint64_t vpDiv100 = div100(vp); + const uint64_t vmDiv100 = div100(vm); + if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%). + const uint64_t vrDiv100 = div100(vr); + const uint32_t vrMod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrDiv100); + roundUp = vrMod100 >= 50; + vr = vrDiv100; + vp = vpDiv100; + vm = vmDiv100; + removed += 2; + } + // Loop iterations below (approximately), without optimization above: + // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02% + // Loop iterations below (approximately), with optimization above: + // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02% + for (;;) { + const uint64_t vpDiv10 = div10(vp); + const uint64_t vmDiv10 = div10(vm); + if (vpDiv10 <= vmDiv10) { + break; + } + const uint64_t vrDiv10 = div10(vr); + const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); + roundUp = vrMod10 >= 5; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + (vr == vm || roundUp); + } + const int32_t exp = e10 + removed; + + floating_decimal_64 fd; + fd.exponent = exp; + fd.mantissa = output; + return fd; + } + + __device__ inline floating_decimal_32 f2d(const uint32_t ieeeMantissa, const uint32_t ieeeExponent) { + int32_t e2; + uint32_t m2; + if (ieeeExponent == 0) { + // We subtract 2 so that the bounds computation has 2 additional bits. + e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + m2 = ieeeMantissa; + } else { + e2 = (int32_t) ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa; + } + const bool even = (m2 & 1) == 0; + const bool acceptBounds = even; + + // Step 2: Determine the interval of valid decimal representations. + const uint32_t mv = 4 * m2; + const uint32_t mp = 4 * m2 + 2; + // Implicit bool -> int conversion. True is 1, false is 0. 
+ const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; + const uint32_t mm = 4 * m2 - 1 - mmShift; + + // Step 3: Convert to a decimal power base using 64-bit arithmetic. + uint32_t vr, vp, vm; + int32_t e10; + bool vmIsTrailingZeros = false; + bool vrIsTrailingZeros = false; + uint8_t lastRemovedDigit = 0; + if (e2 >= 0) { + const uint32_t q = log10Pow2(e2); + e10 = (int32_t) q; + const int32_t k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; + const int32_t i = -e2 + (int32_t) q + k; + vr = mulPow5InvDivPow2(mv, q, i); + vp = mulPow5InvDivPow2(mp, q, i); + vm = mulPow5InvDivPow2(mm, q, i); + if (q != 0 && (vp - 1) / 10 <= vm / 10) { + // We need to know one removed digit even if we are not going to loop below. We could use + // q = X - 1 above, except that would require 33 bits for the result, and we've found that + // 32-bit arithmetic is faster even on 64-bit machines. + const int32_t l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) (q - 1)) - 1; + lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t) q - 1 + l) % 10); + } + if (q <= 9) { + // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well. + // Only one of mp, mv, and mm can be a multiple of 5, if any. 
+ if (mv % 5 == 0) { + vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q); + } else if (acceptBounds) { + vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q); + } else { + vp -= multipleOfPowerOf5_32(mp, q); + } + } + } else { + const uint32_t q = log10Pow5(-e2); + e10 = (int32_t) q + e2; + const int32_t i = -e2 - (int32_t) q; + const int32_t k = pow5bits(i) - FLOAT_POW5_BITCOUNT; + int32_t j = (int32_t) q - k; + vr = mulPow5divPow2(mv, (uint32_t) i, j); + vp = mulPow5divPow2(mp, (uint32_t) i, j); + vm = mulPow5divPow2(mm, (uint32_t) i, j); + if (q != 0 && (vp - 1) / 10 <= vm / 10) { + j = (int32_t) q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); + lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, (uint32_t) (i + 1), j) % 10); + } + if (q <= 1) { + // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. + // mv = 4 * m2, so it always has at least two trailing 0 bits. + vrIsTrailingZeros = true; + if (acceptBounds) { + // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. + vmIsTrailingZeros = mmShift == 1; + } else { + // mp = mv + 2, so it always has at least one trailing 0 bit. + --vp; + } + } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here. + vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1); + } + } + + // Step 4: Find the shortest decimal representation in the interval of valid representations. + int32_t removed = 0; + uint32_t output; + if (vmIsTrailingZeros || vrIsTrailingZeros) { + // General case, which happens rarely (~4.0%). 
+ while (vp / 10 > vm / 10) { + vmIsTrailingZeros &= vm % 10 == 0; + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) (vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + if (vmIsTrailingZeros) { + while (vm % 10 == 0) { + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) (vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + } + if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { + // Round even if the exact number is .....50..0. + lastRemovedDigit = 4; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); + } else { + // Specialized for the common case (~96.0%). Percentages below are relative to this. + // Loop iterations below (approximately): + // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01% + while (vp / 10 > vm / 10) { + lastRemovedDigit = (uint8_t) (vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + (vr == vm || lastRemovedDigit >= 5); + } + const int32_t exp = e10 + removed; + + floating_decimal_32 fd; + fd.exponent = exp; + fd.mantissa = output; + return fd; + } + + __device__ inline int to_chars(const floating_decimal_64 v, const bool sign, char* const result) { + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { + result[index++] = '-'; + } + + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + // Values in the interval [1E-3, 1E7) are special. + if (scientificNotation) { + // Print in the format x.xxxxxE-yy. 
+ for (uint32_t i = 0; i < olength - 1; ++i) { + const uint32_t c = output % 10; output /= 10; + result[index + olength - i] = (char) ('0' + c); + } + result[index] = '0' + output % 10; + result[index + 1] = '.'; + index += olength + 1; + if (olength == 1) { + result[index++] = '0'; + } + // Print 'E', the exponent sign, and the exponent, which has at most three digits. + result[index++] = 'E'; + if (exp < 0) { + result[index++] = '-'; + exp = -exp; + } + if (exp >= 100) { + result[index++] = (char) ('0' + exp / 100); + exp %= 100; + result[index++] = (char) ('0' + exp / 10); + } else if (exp >= 10) { + result[index++] = (char) ('0' + exp / 10); + } + result[index++] = (char) ('0' + exp % 10); + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + result[index++] = '0'; + result[index++] = '.'; + for (int i = -1; i > exp; i--) { + result[index++] = '0'; + } + int current = index; + for (int i = 0; i < olength; i++) { + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + index++; + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + for (int i = 0; i < olength; i++) { + result[index + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength; + for (int i = olength; i < exp + 1; i++) { + result[index++] = '0'; + } + result[index++] = '.'; + result[index++] = '0'; + } else { + // Decimal dot is somewhere between the digits. 
+ int current = index + 1; + for (int i = 0; i < olength; i++) { + if (olength - i - 1 == exp) { + result[current + olength - i - 1] = '.'; + current--; + } + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength + 1; + } + } + return index; + } + + __device__ inline int d2s_size(const floating_decimal_64 v, const bool sign) { + int index = 0; + if (sign) { + index++; + } + + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + index += olength + 1; + if (olength == 1) { + index++; + } + // 'E' + index++; + if (exp < 0) { + exp = -exp; + index++; + } + if (exp >= 100) { + index += 3; + } else if (exp >= 10) { + index += 2; } else { - return false; + index++; } - }(); - if (std::isinf(value)) { - if (bneg) { - memcpy(output, "-Infinity", 9); + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + index += 1 - exp + olength; + } else if (exp + 1 >= olength) { + index += exp + 3; } else { - memcpy(output, "Infinity", 8); - } - return bneg ? 9 : 8; - } - - // dissect value into components - unsigned int integer = 0; - unsigned long long decimal = 0; - int exp10 = 0; - int decimal_places = dissect_value(value, integer, decimal, exp10, is_float); - // - // now build the string from the - // components: sign, integer, decimal, exp10, decimal_places - // - // sign - char* ptr = output; - if (bneg) *ptr++ = '-'; - // integer - ptr = int2str(integer, ptr); - // decimal - *ptr++ = '.'; - if (decimal_places) { - char buffer[significant_digits_double]; - char* pb = buffer; - while (decimal_places--) { - *pb++ = (char)('0' + (decimal % 10)); - decimal /= 10; - } - while (pb != buffer) // reverses the digits - *ptr++ = *--pb; // e.g. 
54321 -> 12345 - } else - *ptr++ = '0'; // always include at least .0 - // exponent - if (exp10) { - *ptr++ = 'E'; - if (exp10 < 0) { - *ptr++ = '-'; - exp10 = -exp10; - } - // if (exp10 < 10) *ptr++ = '0'; // extra zero-pad - ptr = int2str(exp10, ptr); - } - // done - return (int)(ptr - output); // number of bytes written - } - - /** - * @brief Compute how man bytes are needed to hold the output string. - * - * @param value Float value to convert. - * @return Number of bytes required. - */ - __device__ int compute_ftos_size(double value, bool is_float) - { - if (std::isnan(value)) return 3; // NaN - bool bneg = false; - if (signbit(value)) { // handles -0.0 too - value = -value; - bneg = true; - } - if (std::isinf(value)) return 8 + (int)bneg; // Inf - - // dissect float into parts - unsigned int integer = 0; - unsigned long long decimal = 0; - int exp10 = 0; - int decimal_places = dissect_value(value, integer, decimal, exp10, is_float); - // now count up the components - // sign - int count = (int)bneg; - // integer - count += (int)(integer == 0); - while (integer > 0) { - integer /= 10; - ++count; - } // log10(integer) - // decimal - ++count; // decimal point - if (decimal_places) - count += decimal_places; - else - ++count; // always include .0 - // exponent - if (exp10) { - count ++; // 'e±' - if (exp10 < 0) { - count ++; - exp10 = -exp10; - } - while (exp10 > 0) { - exp10 /= 10; - ++count; - } // log10(exp10) + index += olength + 1; + } } - return count; + return index; + } + + __device__ inline int to_chars(const floating_decimal_32 v, const bool sign, char* const result) { + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { + result[index++] = '-'; + } + + uint32_t output = v.mantissa; + const uint32_t olength = decimalLength9(output); + int32_t exp = v.exponent + olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + // Print in the format x.xxxxxE-yy. 
+ for (int i = 0; i < olength - 1; i++) { + int c = output % 10; output /= 10; + result[index + olength - i] = (char) ('0' + c); + } + result[index] = (char) ('0' + output % 10); + result[index + 1] = '.'; + index += olength + 1; + if (olength == 1) { + result[index++] = '0'; + } + + // Print 'E', the exponent sign, and the exponent, which has at most two digits. + result[index++] = 'E'; + if (exp < 0) { + result[index++] = '-'; + exp = -exp; + } + if (exp >= 10) { + result[index++] = (char) ('0' + exp / 10); + } + result[index++] = (char) ('0' + exp % 10); + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + result[index++] = '0'; + result[index++] = '.'; + for (int i = -1; i > exp; i--) { + result[index++] = '0'; + } + int current = index; + for (int i = 0; i < olength; i++) { + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + index++; + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + for (int i = 0; i < olength; i++) { + result[index + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength; + for (int i = olength; i < exp + 1; i++) { + result[index++] = '0'; + } + result[index++] = '.'; + result[index++] = '0'; + } else { + // Decimal dot is somewhere between the digits. + int current = index + 1; + for (int i = 0; i < olength; i++) { + if (olength - i - 1 == exp) { + result[current + olength - i - 1] = '.'; + current--; + } + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength + 1; + } + } + return index; + } + + __device__ inline int f2s_size(const floating_decimal_32 v, const bool sign) { + // Step 5: Print the decimal representation. 
+ int index = 0; + if (sign) { + index++; + } + + uint32_t output = v.mantissa; + const uint32_t olength = decimalLength9(output); + int32_t exp = v.exponent + olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + index += olength + 1; + if (olength == 1) { + index++; + } + // 'E' + index++; + if (exp < 0) { + index++; + exp = -exp; + } + if (exp >= 10) { + index++; + } + index++; + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + index += 1 - exp + olength; + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + index += exp + 3; + } else { + // Decimal dot is somewhere between the digits. + index += olength + 1; + } + } + return index; + } + + __device__ inline bool d2d_small_int(const uint64_t ieeeMantissa, const uint32_t ieeeExponent, + floating_decimal_64* const v) { + const uint64_t m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; + const int32_t e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; + + if (e2 > 0) { + // f = m2 * 2^e2 >= 2^53 is an integer. + // Ignore this case for now. + return false; + } + + if (e2 < -52) { + // f < 1. + return false; + } + + // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53. + // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0. + const uint64_t mask = (1ull << -e2) - 1; + const uint64_t fraction = m2 & mask; + if (fraction != 0) { + return false; + } + + // f is an integer in the range [1, 2^53). + // Note: mantissa might contain trailing (decimal) 0's. + // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17(). + v->mantissa = m2 >> -e2; + v->exponent = 0; + return true; + } + + __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) { + // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. 
+ const uint64_t bits = double_to_bits(f); + + // Decode bits into sign, mantissa, and exponent. + ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; + const uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1); + const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); + // Case distinction; exit early for the easy cases. + if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { + special = true; + return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent}; + } + special = false; + floating_decimal_64 v; + const bool isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v); + if (isSmallInt) { + // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros. + // For scientific notation we need to move these zeros into the exponent. + // (This is not needed for fixed-point notation, so it might be beneficial to trim + // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.) 
+ for (;;) { + const uint64_t q = div10(v.mantissa); + const uint32_t r = ((uint32_t) v.mantissa) - 10 * ((uint32_t) q); + if (r != 0) { + break; + } + v.mantissa = q; + ++v.exponent; + } + } else { + v = d2d(ieeeMantissa, ieeeExponent); + } + return v; + } + + __device__ int d2s_buffered_n(double f, char* result) { + bool sign = false, special = false; + floating_decimal_64 v = d2d(f, sign, special); + if (special) { + return copy_special_str(result, sign, v.exponent, v.mantissa); + } + return to_chars(v, sign, result); + } + + __device__ int compute_d2s_size(double value) { + bool sign = false, special = false; + floating_decimal_64 v = d2d(value, sign, special); + if (special) { + return special_str_size(sign, v.exponent, v.mantissa); + } + return d2s_size(v, sign); + } + + __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) { + // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. + const uint32_t bits = float_to_bits(f); + + // Decode bits into sign, mantissa, and exponent. + ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0; + const uint32_t ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1); + const uint32_t ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1); + + // Case distinction; exit early for the easy cases. 
+ if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { + special = true; + return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent}; + } + special = false; + return f2d(ieeeMantissa, ieeeExponent); + } + + __device__ int f2s_buffered_n(float f, char* result) { + bool sign = false, special = false; + floating_decimal_32 v = f2d(f, sign, special); + if (special) { + return copy_special_str(result, sign, v.exponent, v.mantissa); + } + return to_chars(v, sign, result); + } + + __device__ int compute_f2s_size(float value) { + bool sign = false, special = false; + floating_decimal_32 v = f2d(value, sign, special); + if (special) { + return special_str_size(sign, v.exponent, v.mantissa); + } + return f2s_size(v, sign); + } + + __device__ int compute_ftos_size(double value, bool is_float) { + if (is_float) { + return compute_f2s_size(value); + } else { + return compute_d2s_size(value); + } + } + + __device__ int float_to_string(double value, char* output, bool is_float) { + if (is_float) { + return f2s_buffered_n(value, output); + } else { + return d2s_buffered_n(value, output); + } } }; diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp index a86d988724..d75741b8a0 100644 --- a/src/main/cpp/tests/cast_float_to_string.cpp +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -45,7 +45,7 @@ TEST_F(FloatToStringTests, FromFloats32) 123456789012.34, -0.0}; std::vector h_expected{ - "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.3954222323279E11", "-0.0"}; + "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "1.2345679E11", "-0.0"}; cudf::test::fixed_width_column_wrapper floats( h_floats.begin(), From 1264317a3ca9bf820eac184c53490a31b93b6c47 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 6 Nov 2023 16:32:05 +0800 Subject: [PATCH 09/54] update license Signed-off-by: Haoyang Li --- src/main/cpp/src/cast_float_to_string.cu | 23 
+++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index a594377b4a..d23442f173 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -1,17 +1,16 @@ +/* Not a contribution + * Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as + * NVIDIA-proprietary are not a contribution and subject to the following terms and conditions: /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
*/ #include "cast_string.hpp" From a87a4039372b0b1e0bca596866c5cedbe1e0a845 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 7 Nov 2023 17:02:05 +0800 Subject: [PATCH 10/54] clean up Signed-off-by: Haoyang Li --- src/main/cpp/src/cast_float_to_string.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index d23442f173..ce2a8aedc6 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -1,7 +1,7 @@ /* Not a contribution * Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as * NVIDIA-proprietary are not a contribution and subject to the following terms and conditions: -/* + * * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: LicenseRef-NvidiaProprietary * From 979dc39bf289f037f85c752724714d7530bd6df2 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 13 Nov 2023 16:30:03 +0800 Subject: [PATCH 11/54] Split ftos_converter out Signed-off-by: Haoyang Li --- src/main/cpp/src/cast_float_to_string.cu | 1171 +--------------------- src/main/cpp/src/ftos_converter.cu | 1163 +++++++++++++++++++++ thirdparty/cudf | 2 +- 3 files changed, 1177 insertions(+), 1159 deletions(-) create mode 100644 src/main/cpp/src/ftos_converter.cu diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index ce2a8aedc6..eaf0c989b9 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -1,16 +1,17 @@ -/* Not a contribution - * Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as - * NVIDIA-proprietary are not a contribution and subject to the following terms and conditions: +/* + * Copyright (c) 2023, NVIDIA CORPORATION. * - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. - * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual - * property and proprietary rights in and to this material, related - * documentation and any modifications thereto. Any use, reproduction, - * disclosure or distribution of this material and related documentation - * without an express license agreement from NVIDIA CORPORATION or - * its affiliates is strictly prohibited. + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "cast_string.hpp" @@ -30,21 +31,7 @@ #include #include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -// #include -#include -// #include -// #include +#include using namespace cudf; @@ -53,1138 +40,6 @@ namespace spark_rapids_jni { namespace detail { namespace { -// A floating decimal representing m * 10^e. -typedef struct floating_decimal_64 { - uint64_t mantissa; - // Decimal exponent's range is -324 to 308 - // inclusive, and can fit in a short if needed. - int32_t exponent; -} floating_decimal_64; - -// A floating decimal representing m * 10^e. -typedef struct floating_decimal_32 { - uint32_t mantissa; - // Decimal exponent's range is -45 to 38 - // inclusive, and can fit in a short if needed. - int32_t exponent; -} floating_decimal_32; - -struct ftos_converter { - - // These tables are generated by PrintDoubleLookupTable. 
- static constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125; - static constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125; - static constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64); - static constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64); - static constexpr unsigned int DOUBLE_MANTISSA_BITS = 52; - static constexpr unsigned int DOUBLE_EXPONENT_BITS = 11; - static constexpr unsigned int DOUBLE_BIAS = 1023; - static constexpr unsigned int FLOAT_MANTISSA_BITS = 23; - static constexpr unsigned int FLOAT_EXPONENT_BITS = 8; - static constexpr unsigned int FLOAT_BIAS = 127; - - - // Returns the number of decimal digits in v, which must not contain more than 9 digits. - __device__ inline uint32_t decimalLength9(const uint32_t v) { - // Function precondition: v is not a 10-digit number. - // (f2s: 9 digits are sufficient for round-tripping.) - // (d2fixed: We print 9-digit blocks.) - assert(v < 1000000000); - if (v >= 100000000) { return 9; } - if (v >= 10000000) { return 8; } - if (v >= 1000000) { return 7; } - if (v >= 100000) { return 6; } - if (v >= 10000) { return 5; } - if (v >= 1000) { return 4; } - if (v >= 100) { return 3; } - if (v >= 10) { return 2; } - return 1; - } - - const uint64_t DOUBLE_POW5_INV_SPLIT2[15][2] = { - { 1u, 2305843009213693952u }, - { 5955668970331000884u, 1784059615882449851u }, - { 8982663654677661702u, 1380349269358112757u }, - { 7286864317269821294u, 2135987035920910082u }, - { 7005857020398200553u, 1652639921975621497u }, - { 17965325103354776697u, 1278668206209430417u }, - { 8928596168509315048u, 1978643211784836272u }, - { 10075671573058298858u, 1530901034580419511u }, - { 597001226353042382u, 1184477304306571148u }, - { 1527430471115325346u, 1832889850782397517u }, - { 12533209867169019542u, 1418129833677084982u }, - { 5577825024675947042u, 2194449627517475473u }, - { 11006974540203867551u, 1697873161311732311u }, - { 10313493231639821582u, 1313665730009899186u }, - { 
12701016819766672773u, 2032799256770390445u } - }; - - const uint32_t POW5_INV_OFFSETS[19] = { - 0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555, - 0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054, - 0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554, - 0x00000000 - }; - - const uint64_t DOUBLE_POW5_SPLIT2[13][2] = { - { 0u, 1152921504606846976u }, - { 0u, 1490116119384765625u }, - { 1032610780636961552u, 1925929944387235853u }, - { 7910200175544436838u, 1244603055572228341u }, - { 16941905809032713930u, 1608611746708759036u }, - { 13024893955298202172u, 2079081953128979843u }, - { 6607496772837067824u, 1343575221513417750u }, - { 17332926989895652603u, 1736530273035216783u }, - { 13037379183483547984u, 2244412773384604712u }, - { 1605989338741628675u, 1450417759929778918u }, - { 9630225068416591280u, 1874621017369538693u }, - { 665883850346957067u, 1211445438634777304u }, - { 14931890668723713708u, 1565756531257009982u } - }; - - const uint32_t POW5_OFFSETS[21] = { - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995, - 0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540, - 0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151, - 0x55559155, 0x51405555, 0x00000105 - }; - - static constexpr uint32_t POW5_TABLE_SIZE = 26; - const uint64_t DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = { - 1ull, 5ull, 25ull, 125ull, 625ull, 3125ull, 15625ull, 78125ull, 390625ull, - 1953125ull, 9765625ull, 48828125ull, 244140625ull, 1220703125ull, 6103515625ull, - 30517578125ull, 152587890625ull, 762939453125ull, 3814697265625ull, - 19073486328125ull, 95367431640625ull, 476837158203125ull, - 2384185791015625ull, 11920928955078125ull, 59604644775390625ull, - 298023223876953125ull //, 1490116119384765625ull - }; - - // Returns e == 0 ? 1 : [log_2(5^e)]; requires 0 <= e <= 3528. 
- __device__ inline int32_t log2pow5(const int32_t e) { - // This approximation works up to the point that the multiplication overflows at e = 3529. - // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater - // than 2^9297. - assert(e >= 0); - assert(e <= 3528); - return (int32_t) ((((uint32_t) e) * 1217359) >> 19); - } - - // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528. - __device__ inline int32_t pow5bits(const int32_t e) { - // This approximation works up to the point that the multiplication overflows at e = 3529. - // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater - // than 2^9297. - assert(e >= 0); - assert(e <= 3528); - return (int32_t) (((((uint32_t) e) * 1217359) >> 19) + 1); - } - - // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528. - __device__ inline int32_t ceil_log2pow5(const int32_t e) { - return log2pow5(e) + 1; - } - - // Returns floor(log_10(2^e)); requires 0 <= e <= 1650. - __device__ inline uint32_t log10Pow2(const int32_t e) { - // The first value this approximation fails for is 2^1651 which is just greater than 10^297. - assert(e >= 0); - assert(e <= 1650); - return (((uint32_t) e) * 78913) >> 18; - } - - // Returns floor(log_10(5^e)); requires 0 <= e <= 2620. - __device__ inline uint32_t log10Pow5(const int32_t e) { - // The first value this approximation fails for is 5^2621 which is just greater than 10^1832. - assert(e >= 0); - assert(e <= 2620); - return (((uint32_t) e) * 732923) >> 20; - } - - __device__ inline uint32_t pow5factor_32(uint32_t value) { - uint32_t count = 0; - for (;;) { - assert(value != 0); - const uint32_t q = value / 5; - const uint32_t r = value % 5; - if (r != 0) { - break; - } - value = q; - ++count; - } - return count; - } - - // Returns true if value is divisible by 5^p. 
- __device__ inline bool multipleOfPowerOf5_32(const uint32_t value, const uint32_t p) { - return pow5factor_32(value) >= p; - } - - // Returns true if value is divisible by 2^p. - __device__ inline bool multipleOfPowerOf2_32(const uint32_t value, const uint32_t p) { - // __builtin_ctz doesn't appear to be faster here. - return (value & ((1u << p) - 1)) == 0; - } - - // It seems to be slightly faster to avoid uint128_t here, although the - // generated code for uint128_t looks slightly nicer. - __device__ inline uint32_t mulShift32(const uint32_t m, const uint64_t factor, const int32_t shift) { - assert(shift > 32); - - // The casts here help MSVC to avoid calls to the __allmul library - // function. - const uint32_t factorLo = (uint32_t)(factor); - const uint32_t factorHi = (uint32_t)(factor >> 32); - const uint64_t bits0 = (uint64_t)m * factorLo; - const uint64_t bits1 = (uint64_t)m * factorHi; - - const uint64_t sum = (bits0 >> 32) + bits1; - const uint64_t shiftedSum = sum >> (shift - 32); - assert(shiftedSum <= UINT32_MAX); - return (uint32_t) shiftedSum; - - } - - __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa) { - if (mantissa) { - memcpy(result, "NaN", 3); - return 3; - } - if (sign) { - result[0] = '-'; - } - if (exponent) { - memcpy(result + sign, "Infinity", 8); - return sign + 8; - } - memcpy(result + sign, "0.0", 3); - return sign + 3; - } - - __device__ inline int special_str_size(const bool sign, const bool exponent, const bool mantissa) { - if (mantissa) { - return 3; - } - if (exponent) { - return sign + 8; - } - return sign + 3; - } - - __device__ inline uint32_t float_to_bits(const float f) { - uint32_t bits = 0; - memcpy(&bits, &f, sizeof(float)); - return bits; - } - - __device__ inline uint64_t double_to_bits(const double d) { - uint64_t bits = 0; - memcpy(&bits, &d, sizeof(double)); - return bits; - } - - __device__ inline uint64_t umul128(const uint64_t a, const uint64_t 
b, uint64_t* const productHi) { - // The casts here help MSVC to avoid calls to the __allmul library function. - const uint32_t aLo = (uint32_t)a; - const uint32_t aHi = (uint32_t)(a >> 32); - const uint32_t bLo = (uint32_t)b; - const uint32_t bHi = (uint32_t)(b >> 32); - - const uint64_t b00 = (uint64_t)aLo * bLo; - const uint64_t b01 = (uint64_t)aLo * bHi; - const uint64_t b10 = (uint64_t)aHi * bLo; - const uint64_t b11 = (uint64_t)aHi * bHi; - - const uint32_t b00Lo = (uint32_t)b00; - const uint32_t b00Hi = (uint32_t)(b00 >> 32); - - const uint64_t mid1 = b10 + b00Hi; - const uint32_t mid1Lo = (uint32_t)(mid1); - const uint32_t mid1Hi = (uint32_t)(mid1 >> 32); - - const uint64_t mid2 = b01 + mid1Lo; - const uint32_t mid2Lo = (uint32_t)(mid2); - const uint32_t mid2Hi = (uint32_t)(mid2 >> 32); - - const uint64_t pHi = b11 + mid1Hi + mid2Hi; - const uint64_t pLo = ((uint64_t)mid2Lo << 32) | b00Lo; - - *productHi = pHi; - return pLo; - } - - __device__ inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) { - // We don't need to handle the case dist >= 64 here (see above). 
- assert(dist < 64); - assert(dist > 0); - return (hi << (64 - dist)) | (lo >> dist); - } - - __device__ inline uint64_t div5(const uint64_t x) { - return x / 5; - } - - __device__ inline uint64_t div10(const uint64_t x) { - return x / 10; - } - - __device__ inline uint64_t div100(const uint64_t x) { - return x / 100; - } - - __device__ inline uint64_t div1e8(const uint64_t x) { - return x / 100000000; - } - - __device__ inline uint64_t div1e9(const uint64_t x) { - return x / 1000000000; - } - - __device__ inline uint32_t mod1e9(const uint64_t x) { - return (uint32_t) (x - 1000000000 * div1e9(x)); - } - - __device__ inline uint32_t pow5Factor(uint64_t value) { - const uint64_t m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64) - const uint64_t n_div_5 = 3689348814741910323u; // #{ n | n = 0 (mod 2^64) } = 2^64 / 5 - uint32_t count = 0; - for (;;) { - assert(value != 0); - value *= m_inv_5; - if (value > n_div_5) - break; - ++count; - } - return count; - } - - // Returns true if value is divisible by 5^p. - __device__ inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) { - // I tried a case distinction on p, but there was no performance difference. - return pow5Factor(value) >= p; - } - - // Returns true if value is divisible by 2^p. - __device__ inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) { - assert(value != 0); - assert(p < 64); - // __builtin_ctzll doesn't appear to be faster here. 
- return (value & ((1ull << p) - 1)) == 0; - } - - __device__ inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) { - // m is maximum 55 bits - uint64_t high1; // 128 - const uint64_t low1 = umul128(m, mul[1], &high1); // 64 - uint64_t high0; // 64 - umul128(m, mul[0], &high0); // 0 - const uint64_t sum = high0 + low1; - if (sum < high0) { - ++high1; // overflow into high1 - } - return shiftright128(sum, high1, j - 64); - } - - __device__ inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j, - uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) { - *vp = mulShift64(4 * m + 2, mul, j); - *vm = mulShift64(4 * m - 1 - mmShift, mul, j); - return mulShift64(4 * m, mul, j); - } - - // Computes 5^i in the form required by Ryu, and stores it in the given pointer. - __device__ inline void double_computePow5(const uint32_t i, uint64_t* const result) { - const uint32_t base = i / POW5_TABLE_SIZE; - const uint32_t base2 = base * POW5_TABLE_SIZE; - const uint32_t offset = i - base2; - const uint64_t* const mul = DOUBLE_POW5_SPLIT2[base]; - if (offset == 0) { - result[0] = mul[0]; - result[1] = mul[1]; - return; - } - const uint64_t m = DOUBLE_POW5_TABLE[offset]; - uint64_t high1; - const uint64_t low1 = umul128(m, mul[1], &high1); - uint64_t high0; - const uint64_t low0 = umul128(m, mul[0], &high0); - const uint64_t sum = high0 + low1; - if (sum < high0) { - ++high1; // overflow into high1 - } - // high1 | sum | low0 - const uint32_t delta = pow5bits(i) - pow5bits(base2); - result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); - result[1] = shiftright128(sum, high1, delta); - } - - // Computes 5^-i in the form required by Ryu, and stores it in the given pointer. 
- __device__ inline void double_computeInvPow5(const uint32_t i, uint64_t* const result) { - const uint32_t base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE; - const uint32_t base2 = base * POW5_TABLE_SIZE; - const uint32_t offset = base2 - i; - const uint64_t* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2 - if (offset == 0) { - result[0] = mul[0]; - result[1] = mul[1]; - return; - } - const uint64_t m = DOUBLE_POW5_TABLE[offset]; - uint64_t high1; - const uint64_t low1 = umul128(m, mul[1], &high1); - uint64_t high0; - const uint64_t low0 = umul128(m, mul[0] - 1, &high0); - const uint64_t sum = high0 + low1; - if (sum < high0) { - ++high1; // overflow into high1 - } - // high1 | sum | low0 - const uint32_t delta = pow5bits(base2) - pow5bits(i); - result[0] = shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); - result[1] = shiftright128(sum, high1, delta); - } - - __device__ inline uint32_t mulPow5InvDivPow2(const uint32_t m, const uint32_t q, const int32_t j) { - // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double lookup - // table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely on the - // fact that the added 1 that's already stored in the table never overflows into the upper 64 bits. - uint64_t pow5[2]; - double_computeInvPow5(q, pow5); - return mulShift32(m, pow5[1] + 1, j); - } - - __device__ inline uint32_t mulPow5divPow2(const uint32_t m, const uint32_t i, const int32_t j) { - uint64_t pow5[2]; - double_computePow5(i, pow5); - return mulShift32(m, pow5[1], j); - } - - __device__ inline uint32_t decimalLength17(const uint64_t v) { - // This is slightly faster than a loop. - // The average output length is 16.38 digits, so we check high-to-low. - // Function precondition: v is not an 18, 19, or 20-digit number. - // (17 digits are sufficient for round-tripping.) 
- assert(v < 100000000000000000L); - if (v >= 10000000000000000L) { return 17; } - if (v >= 1000000000000000L) { return 16; } - if (v >= 100000000000000L) { return 15; } - if (v >= 10000000000000L) { return 14; } - if (v >= 1000000000000L) { return 13; } - if (v >= 100000000000L) { return 12; } - if (v >= 10000000000L) { return 11; } - if (v >= 1000000000L) { return 10; } - if (v >= 100000000L) { return 9; } - if (v >= 10000000L) { return 8; } - if (v >= 1000000L) { return 7; } - if (v >= 100000L) { return 6; } - if (v >= 10000L) { return 5; } - if (v >= 1000L) { return 4; } - if (v >= 100L) { return 3; } - if (v >= 10L) { return 2; } - return 1; - } - - __device__ inline floating_decimal_64 d2d(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) { - int32_t e2; - uint64_t m2; - if (ieeeExponent == 0) { - // We subtract 2 so that the bounds computation has 2 additional bits. - e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; - m2 = ieeeMantissa; - } else { - e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; - m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; - } - const bool even = (m2 & 1) == 0; - const bool acceptBounds = even; - - // Step 2: Determine the interval of valid decimal representations. - const uint64_t mv = 4 * m2; - // Implicit bool -> int conversion. True is 1, false is 0. - const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; - // We would compute mp and mm like this: - // uint64_t mp = 4 * m2 + 2; - // uint64_t mm = mv - 1 - mmShift; - - // Step 3: Convert to a decimal power base using 128-bit arithmetic. - uint64_t vr, vp, vm; - int32_t e10; - bool vmIsTrailingZeros = false; - bool vrIsTrailingZeros = false; - if (e2 >= 0) { - // I tried special-casing q == 0, but there was no effect on performance. - // This expression is slightly faster than max(0, log10Pow2(e2) - 1). 
- const uint32_t q = log10Pow2(e2) - (e2 > 3); - e10 = (int32_t) q; - const int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; - const int32_t i = -e2 + (int32_t) q + k; - uint64_t pow5[2]; - double_computeInvPow5(q, pow5); - vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift); - - if (q <= 21) { - // This should use q <= 22, but I think 21 is also safe. Smaller values - // may still be safe, but it's more difficult to reason about them. - // Only one of mp, mv, and mm can be a multiple of 5, if any. - const uint32_t mvMod5 = ((uint32_t) mv) - 5 * ((uint32_t) div5(mv)); - if (mvMod5 == 0) { - vrIsTrailingZeros = multipleOfPowerOf5(mv, q); - } else if (acceptBounds) { - // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q - // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q - // <=> true && pow5Factor(mm) >= q, since e2 >= q. - vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q); - } else { - // Same as min(e2 + 1, pow5Factor(mp)) >= q. - vp -= multipleOfPowerOf5(mv + 2, q); - } - } - } else { - // This expression is slightly faster than max(0, log10Pow5(-e2) - 1). - const uint32_t q = log10Pow5(-e2) - (-e2 > 1); - e10 = (int32_t) q + e2; - const int32_t i = -e2 - (int32_t) q; - const int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT; - const int32_t j = (int32_t) q - k; - - uint64_t pow5[2]; - double_computePow5(i, pow5); - vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift); - - if (q <= 1) { - // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. - // mv = 4 * m2, so it always has at least two trailing 0 bits. - vrIsTrailingZeros = true; - if (acceptBounds) { - // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. - vmIsTrailingZeros = mmShift == 1; - } else { - // mp = mv + 2, so it always has at least one trailing 0 bit. - --vp; - } - } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here. - // We want to know if the full product has at least q trailing zeros. 
- // We need to compute min(p2(mv), p5(mv) - e2) >= q - // <=> p2(mv) >= q && p5(mv) - e2 >= q - // <=> p2(mv) >= q (because -e2 >= q) - vrIsTrailingZeros = multipleOfPowerOf2(mv, q); - } - } - - // Step 4: Find the shortest decimal representation in the interval of valid representations. - int32_t removed = 0; - uint8_t lastRemovedDigit = 0; - uint64_t output; - // On average, we remove ~2 digits. - if (vmIsTrailingZeros || vrIsTrailingZeros) { - // General case, which happens rarely (~0.7%). - for (;;) { - const uint64_t vpDiv10 = div10(vp); - const uint64_t vmDiv10 = div10(vm); - if (vpDiv10 <= vmDiv10) { - break; - } - const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10); - const uint64_t vrDiv10 = div10(vr); - const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); - vmIsTrailingZeros &= vmMod10 == 0; - vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t) vrMod10; - vr = vrDiv10; - vp = vpDiv10; - vm = vmDiv10; - ++removed; - } - - if (vmIsTrailingZeros) { - for (;;) { - const uint64_t vmDiv10 = div10(vm); - const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10); - if (vmMod10 != 0) { - break; - } - const uint64_t vpDiv10 = div10(vp); - const uint64_t vrDiv10 = div10(vr); - const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); - vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t) vrMod10; - vr = vrDiv10; - vp = vpDiv10; - vm = vmDiv10; - ++removed; - } - } - - if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { - // Round even if the exact number is .....50..0. - lastRemovedDigit = 4; - } - // We need to take vr + 1 if vr is outside bounds or we need to round up. - output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); - } else { - // Specialized for the common case (~99.3%). Percentages below are relative to this. 
- bool roundUp = false; - const uint64_t vpDiv100 = div100(vp); - const uint64_t vmDiv100 = div100(vm); - if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%). - const uint64_t vrDiv100 = div100(vr); - const uint32_t vrMod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrDiv100); - roundUp = vrMod100 >= 50; - vr = vrDiv100; - vp = vpDiv100; - vm = vmDiv100; - removed += 2; - } - // Loop iterations below (approximately), without optimization above: - // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02% - // Loop iterations below (approximately), with optimization above: - // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02% - for (;;) { - const uint64_t vpDiv10 = div10(vp); - const uint64_t vmDiv10 = div10(vm); - if (vpDiv10 <= vmDiv10) { - break; - } - const uint64_t vrDiv10 = div10(vr); - const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); - roundUp = vrMod10 >= 5; - vr = vrDiv10; - vp = vpDiv10; - vm = vmDiv10; - ++removed; - } - - // We need to take vr + 1 if vr is outside bounds or we need to round up. - output = vr + (vr == vm || roundUp); - } - const int32_t exp = e10 + removed; - - floating_decimal_64 fd; - fd.exponent = exp; - fd.mantissa = output; - return fd; - } - - __device__ inline floating_decimal_32 f2d(const uint32_t ieeeMantissa, const uint32_t ieeeExponent) { - int32_t e2; - uint32_t m2; - if (ieeeExponent == 0) { - // We subtract 2 so that the bounds computation has 2 additional bits. - e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; - m2 = ieeeMantissa; - } else { - e2 = (int32_t) ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; - m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa; - } - const bool even = (m2 & 1) == 0; - const bool acceptBounds = even; - - // Step 2: Determine the interval of valid decimal representations. - const uint32_t mv = 4 * m2; - const uint32_t mp = 4 * m2 + 2; - // Implicit bool -> int conversion. True is 1, false is 0. 
- const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; - const uint32_t mm = 4 * m2 - 1 - mmShift; - - // Step 3: Convert to a decimal power base using 64-bit arithmetic. - uint32_t vr, vp, vm; - int32_t e10; - bool vmIsTrailingZeros = false; - bool vrIsTrailingZeros = false; - uint8_t lastRemovedDigit = 0; - if (e2 >= 0) { - const uint32_t q = log10Pow2(e2); - e10 = (int32_t) q; - const int32_t k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; - const int32_t i = -e2 + (int32_t) q + k; - vr = mulPow5InvDivPow2(mv, q, i); - vp = mulPow5InvDivPow2(mp, q, i); - vm = mulPow5InvDivPow2(mm, q, i); - if (q != 0 && (vp - 1) / 10 <= vm / 10) { - // We need to know one removed digit even if we are not going to loop below. We could use - // q = X - 1 above, except that would require 33 bits for the result, and we've found that - // 32-bit arithmetic is faster even on 64-bit machines. - const int32_t l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) (q - 1)) - 1; - lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t) q - 1 + l) % 10); - } - if (q <= 9) { - // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well. - // Only one of mp, mv, and mm can be a multiple of 5, if any. 
- if (mv % 5 == 0) { - vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q); - } else if (acceptBounds) { - vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q); - } else { - vp -= multipleOfPowerOf5_32(mp, q); - } - } - } else { - const uint32_t q = log10Pow5(-e2); - e10 = (int32_t) q + e2; - const int32_t i = -e2 - (int32_t) q; - const int32_t k = pow5bits(i) - FLOAT_POW5_BITCOUNT; - int32_t j = (int32_t) q - k; - vr = mulPow5divPow2(mv, (uint32_t) i, j); - vp = mulPow5divPow2(mp, (uint32_t) i, j); - vm = mulPow5divPow2(mm, (uint32_t) i, j); - if (q != 0 && (vp - 1) / 10 <= vm / 10) { - j = (int32_t) q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); - lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, (uint32_t) (i + 1), j) % 10); - } - if (q <= 1) { - // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. - // mv = 4 * m2, so it always has at least two trailing 0 bits. - vrIsTrailingZeros = true; - if (acceptBounds) { - // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. - vmIsTrailingZeros = mmShift == 1; - } else { - // mp = mv + 2, so it always has at least one trailing 0 bit. - --vp; - } - } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here. - vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1); - } - } - - // Step 4: Find the shortest decimal representation in the interval of valid representations. - int32_t removed = 0; - uint32_t output; - if (vmIsTrailingZeros || vrIsTrailingZeros) { - // General case, which happens rarely (~4.0%). 
- while (vp / 10 > vm / 10) { - vmIsTrailingZeros &= vm % 10 == 0; - vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t) (vr % 10); - vr /= 10; - vp /= 10; - vm /= 10; - ++removed; - } - if (vmIsTrailingZeros) { - while (vm % 10 == 0) { - vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t) (vr % 10); - vr /= 10; - vp /= 10; - vm /= 10; - ++removed; - } - } - if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { - // Round even if the exact number is .....50..0. - lastRemovedDigit = 4; - } - // We need to take vr + 1 if vr is outside bounds or we need to round up. - output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); - } else { - // Specialized for the common case (~96.0%). Percentages below are relative to this. - // Loop iterations below (approximately): - // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01% - while (vp / 10 > vm / 10) { - lastRemovedDigit = (uint8_t) (vr % 10); - vr /= 10; - vp /= 10; - vm /= 10; - ++removed; - } - // We need to take vr + 1 if vr is outside bounds or we need to round up. - output = vr + (vr == vm || lastRemovedDigit >= 5); - } - const int32_t exp = e10 + removed; - - floating_decimal_32 fd; - fd.exponent = exp; - fd.mantissa = output; - return fd; - } - - __device__ inline int to_chars(const floating_decimal_64 v, const bool sign, char* const result) { - // Step 5: Print the decimal representation. - int index = 0; - if (sign) { - result[index++] = '-'; - } - - uint64_t output = v.mantissa; - const uint32_t olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t) olength - 1; - bool scientificNotation = (exp < -3) || (exp >= 7); - - // Values in the interval [1E-3, 1E7) are special. - if (scientificNotation) { - // Print in the format x.xxxxxE-yy. 
- for (uint32_t i = 0; i < olength - 1; ++i) { - const uint32_t c = output % 10; output /= 10; - result[index + olength - i] = (char) ('0' + c); - } - result[index] = '0' + output % 10; - result[index + 1] = '.'; - index += olength + 1; - if (olength == 1) { - result[index++] = '0'; - } - // Print 'E', the exponent sign, and the exponent, which has at most three digits. - result[index++] = 'E'; - if (exp < 0) { - result[index++] = '-'; - exp = -exp; - } - if (exp >= 100) { - result[index++] = (char) ('0' + exp / 100); - exp %= 100; - result[index++] = (char) ('0' + exp / 10); - } else if (exp >= 10) { - result[index++] = (char) ('0' + exp / 10); - } - result[index++] = (char) ('0' + exp % 10); - } else { - // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). - if (exp < 0) { - // Decimal dot is before any of the digits. - result[index++] = '0'; - result[index++] = '.'; - for (int i = -1; i > exp; i--) { - result[index++] = '0'; - } - int current = index; - for (int i = 0; i < olength; i++) { - result[current + olength - i - 1] = (char) ('0' + output % 10); - output /= 10; - index++; - } - } else if (exp + 1 >= olength) { - // Decimal dot is after any of the digits. - for (int i = 0; i < olength; i++) { - result[index + olength - i - 1] = (char) ('0' + output % 10); - output /= 10; - } - index += olength; - for (int i = olength; i < exp + 1; i++) { - result[index++] = '0'; - } - result[index++] = '.'; - result[index++] = '0'; - } else { - // Decimal dot is somewhere between the digits. 
- int current = index + 1; - for (int i = 0; i < olength; i++) { - if (olength - i - 1 == exp) { - result[current + olength - i - 1] = '.'; - current--; - } - result[current + olength - i - 1] = (char) ('0' + output % 10); - output /= 10; - } - index += olength + 1; - } - } - return index; - } - - __device__ inline int d2s_size(const floating_decimal_64 v, const bool sign) { - int index = 0; - if (sign) { - index++; - } - - uint64_t output = v.mantissa; - const uint32_t olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t) olength - 1; - bool scientificNotation = (exp < -3) || (exp >= 7); - - if (scientificNotation) { - index += olength + 1; - if (olength == 1) { - index++; - } - // 'E' - index++; - if (exp < 0) { - exp = -exp; - index++; - } - if (exp >= 100) { - index += 3; - } else if (exp >= 10) { - index += 2; - } else { - index++; - } - } else { - // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). - if (exp < 0) { - index += 1 - exp + olength; - } else if (exp + 1 >= olength) { - index += exp + 3; - } else { - index += olength + 1; - } - } - return index; - } - - __device__ inline int to_chars(const floating_decimal_32 v, const bool sign, char* const result) { - // Step 5: Print the decimal representation. - int index = 0; - if (sign) { - result[index++] = '-'; - } - - uint32_t output = v.mantissa; - const uint32_t olength = decimalLength9(output); - int32_t exp = v.exponent + olength - 1; - bool scientificNotation = (exp < -3) || (exp >= 7); - - if (scientificNotation) { - // Print in the format x.xxxxxE-yy. - for (int i = 0; i < olength - 1; i++) { - int c = output % 10; output /= 10; - result[index + olength - i] = (char) ('0' + c); - } - result[index] = (char) ('0' + output % 10); - result[index + 1] = '.'; - index += olength + 1; - if (olength == 1) { - result[index++] = '0'; - } - - // Print 'E', the exponent sign, and the exponent, which has at most two digits. 
- result[index++] = 'E'; - if (exp < 0) { - result[index++] = '-'; - exp = -exp; - } - if (exp >= 10) { - result[index++] = (char) ('0' + exp / 10); - } - result[index++] = (char) ('0' + exp % 10); - } else { - // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). - if (exp < 0) { - // Decimal dot is before any of the digits. - result[index++] = '0'; - result[index++] = '.'; - for (int i = -1; i > exp; i--) { - result[index++] = '0'; - } - int current = index; - for (int i = 0; i < olength; i++) { - result[current + olength - i - 1] = (char) ('0' + output % 10); - output /= 10; - index++; - } - } else if (exp + 1 >= olength) { - // Decimal dot is after any of the digits. - for (int i = 0; i < olength; i++) { - result[index + olength - i - 1] = (char) ('0' + output % 10); - output /= 10; - } - index += olength; - for (int i = olength; i < exp + 1; i++) { - result[index++] = '0'; - } - result[index++] = '.'; - result[index++] = '0'; - } else { - // Decimal dot is somewhere between the digits. - int current = index + 1; - for (int i = 0; i < olength; i++) { - if (olength - i - 1 == exp) { - result[current + olength - i - 1] = '.'; - current--; - } - result[current + olength - i - 1] = (char) ('0' + output % 10); - output /= 10; - } - index += olength + 1; - } - } - return index; - } - - __device__ inline int f2s_size(const floating_decimal_32 v, const bool sign) { - // Step 5: Print the decimal representation. - int index = 0; - if (sign) { - index++; - } - - uint32_t output = v.mantissa; - const uint32_t olength = decimalLength9(output); - int32_t exp = v.exponent + olength - 1; - bool scientificNotation = (exp < -3) || (exp >= 7); - - if (scientificNotation) { - index += olength + 1; - if (olength == 1) { - index++; - } - // 'E' - index++; - if (exp < 0) { - index++; - exp = -exp; - } - if (exp >= 10) { - index++; - } - index++; - } else { - // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). 
- if (exp < 0) { - // Decimal dot is before any of the digits. - index += 1 - exp + olength; - } else if (exp + 1 >= olength) { - // Decimal dot is after any of the digits. - index += exp + 3; - } else { - // Decimal dot is somewhere between the digits. - index += olength + 1; - } - } - return index; - } - - __device__ inline bool d2d_small_int(const uint64_t ieeeMantissa, const uint32_t ieeeExponent, - floating_decimal_64* const v) { - const uint64_t m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; - const int32_t e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; - - if (e2 > 0) { - // f = m2 * 2^e2 >= 2^53 is an integer. - // Ignore this case for now. - return false; - } - - if (e2 < -52) { - // f < 1. - return false; - } - - // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53. - // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0. - const uint64_t mask = (1ull << -e2) - 1; - const uint64_t fraction = m2 & mask; - if (fraction != 0) { - return false; - } - - // f is an integer in the range [1, 2^53). - // Note: mantissa might contain trailing (decimal) 0's. - // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17(). - v->mantissa = m2 >> -e2; - v->exponent = 0; - return true; - } - - __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) { - // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. - const uint64_t bits = double_to_bits(f); - - // Decode bits into sign, mantissa, and exponent. - ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; - const uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1); - const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); - // Case distinction; exit early for the easy cases. 
- if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { - special = true; - return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent}; - } - special = false; - floating_decimal_64 v; - const bool isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v); - if (isSmallInt) { - // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros. - // For scientific notation we need to move these zeros into the exponent. - // (This is not needed for fixed-point notation, so it might be beneficial to trim - // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.) - for (;;) { - const uint64_t q = div10(v.mantissa); - const uint32_t r = ((uint32_t) v.mantissa) - 10 * ((uint32_t) q); - if (r != 0) { - break; - } - v.mantissa = q; - ++v.exponent; - } - } else { - v = d2d(ieeeMantissa, ieeeExponent); - } - return v; - } - - __device__ int d2s_buffered_n(double f, char* result) { - bool sign = false, special = false; - floating_decimal_64 v = d2d(f, sign, special); - if (special) { - return copy_special_str(result, sign, v.exponent, v.mantissa); - } - return to_chars(v, sign, result); - } - - __device__ int compute_d2s_size(double value) { - bool sign = false, special = false; - floating_decimal_64 v = d2d(value, sign, special); - if (special) { - return special_str_size(sign, v.exponent, v.mantissa); - } - return d2s_size(v, sign); - } - - __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) { - // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. - const uint32_t bits = float_to_bits(f); - - // Decode bits into sign, mantissa, and exponent. 
- ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0; - const uint32_t ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1); - const uint32_t ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1); - - // Case distinction; exit early for the easy cases. - if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { - special = true; - return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent}; - } - special = false; - return f2d(ieeeMantissa, ieeeExponent); - } - - __device__ int f2s_buffered_n(float f, char* result) { - bool sign = false, special = false; - floating_decimal_32 v = f2d(f, sign, special); - if (special) { - return copy_special_str(result, sign, v.exponent, v.mantissa); - } - return to_chars(v, sign, result); - } - - __device__ int compute_f2s_size(float value) { - bool sign = false, special = false; - floating_decimal_32 v = f2d(value, sign, special); - if (special) { - return special_str_size(sign, v.exponent, v.mantissa); - } - return f2s_size(v, sign); - } - - __device__ int compute_ftos_size(double value, bool is_float) { - if (is_float) { - return compute_f2s_size(value); - } else { - return compute_d2s_size(value); - } - } - - __device__ int float_to_string(double value, char* output, bool is_float) { - if (is_float) { - return f2s_buffered_n(value, output); - } else { - return d2s_buffered_n(value, output); - } - } -}; - template struct float_to_string_fn { column_device_view d_floats; diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu new file mode 100644 index 0000000000..7d5cf0716f --- /dev/null +++ b/src/main/cpp/src/ftos_converter.cu @@ -0,0 +1,1163 @@ +/* Not a contribution + * Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as + * NVIDIA-proprietary are not a contribution and subject to the following terms and conditions: + * + * SPDX-FileCopyrightText: Copyright (c) 
2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. + */ + +#include +#include +#include +#include +#include + +using namespace cudf; + +namespace spark_rapids_jni { + +namespace detail { +namespace { + +// A floating decimal representing m * 10^e. +typedef struct floating_decimal_64 { + uint64_t mantissa; + // Decimal exponent's range is -324 to 308 + // inclusive, and can fit in a short if needed. + int32_t exponent; +} floating_decimal_64; + +// A floating decimal representing m * 10^e. +typedef struct floating_decimal_32 { + uint32_t mantissa; + // Decimal exponent's range is -45 to 38 + // inclusive, and can fit in a short if needed. + int32_t exponent; +} floating_decimal_32; + +struct ftos_converter { + + // These tables are generated by PrintDoubleLookupTable. + static constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125; + static constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125; + static constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64); + static constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64); + static constexpr unsigned int DOUBLE_MANTISSA_BITS = 52; + static constexpr unsigned int DOUBLE_EXPONENT_BITS = 11; + static constexpr unsigned int DOUBLE_BIAS = 1023; + static constexpr unsigned int FLOAT_MANTISSA_BITS = 23; + static constexpr unsigned int FLOAT_EXPONENT_BITS = 8; + static constexpr unsigned int FLOAT_BIAS = 127; + + + // Returns the number of decimal digits in v, which must not contain more than 9 digits. 
+ __device__ inline uint32_t decimalLength9(const uint32_t v) { + // Function precondition: v is not a 10-digit number. + // (f2s: 9 digits are sufficient for round-tripping.) + // (d2fixed: We print 9-digit blocks.) + assert(v < 1000000000); + if (v >= 100000000) { return 9; } + if (v >= 10000000) { return 8; } + if (v >= 1000000) { return 7; } + if (v >= 100000) { return 6; } + if (v >= 10000) { return 5; } + if (v >= 1000) { return 4; } + if (v >= 100) { return 3; } + if (v >= 10) { return 2; } + return 1; + } + + const uint64_t DOUBLE_POW5_INV_SPLIT2[15][2] = { + { 1u, 2305843009213693952u }, + { 5955668970331000884u, 1784059615882449851u }, + { 8982663654677661702u, 1380349269358112757u }, + { 7286864317269821294u, 2135987035920910082u }, + { 7005857020398200553u, 1652639921975621497u }, + { 17965325103354776697u, 1278668206209430417u }, + { 8928596168509315048u, 1978643211784836272u }, + { 10075671573058298858u, 1530901034580419511u }, + { 597001226353042382u, 1184477304306571148u }, + { 1527430471115325346u, 1832889850782397517u }, + { 12533209867169019542u, 1418129833677084982u }, + { 5577825024675947042u, 2194449627517475473u }, + { 11006974540203867551u, 1697873161311732311u }, + { 10313493231639821582u, 1313665730009899186u }, + { 12701016819766672773u, 2032799256770390445u } + }; + + const uint32_t POW5_INV_OFFSETS[19] = { + 0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555, + 0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054, + 0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554, + 0x00000000 + }; + + const uint64_t DOUBLE_POW5_SPLIT2[13][2] = { + { 0u, 1152921504606846976u }, + { 0u, 1490116119384765625u }, + { 1032610780636961552u, 1925929944387235853u }, + { 7910200175544436838u, 1244603055572228341u }, + { 16941905809032713930u, 1608611746708759036u }, + { 13024893955298202172u, 2079081953128979843u }, + { 6607496772837067824u, 1343575221513417750u }, + { 17332926989895652603u, 
1736530273035216783u }, + { 13037379183483547984u, 2244412773384604712u }, + { 1605989338741628675u, 1450417759929778918u }, + { 9630225068416591280u, 1874621017369538693u }, + { 665883850346957067u, 1211445438634777304u }, + { 14931890668723713708u, 1565756531257009982u } + }; + + const uint32_t POW5_OFFSETS[21] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995, + 0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540, + 0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151, + 0x55559155, 0x51405555, 0x00000105 + }; + + static constexpr uint32_t POW5_TABLE_SIZE = 26; + const uint64_t DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = { + 1ull, 5ull, 25ull, 125ull, 625ull, 3125ull, 15625ull, 78125ull, 390625ull, + 1953125ull, 9765625ull, 48828125ull, 244140625ull, 1220703125ull, 6103515625ull, + 30517578125ull, 152587890625ull, 762939453125ull, 3814697265625ull, + 19073486328125ull, 95367431640625ull, 476837158203125ull, + 2384185791015625ull, 11920928955078125ull, 59604644775390625ull, + 298023223876953125ull //, 1490116119384765625ull + }; + + // Returns e == 0 ? 1 : [log_2(5^e)]; requires 0 <= e <= 3528. + __device__ inline int32_t log2pow5(const int32_t e) { + // This approximation works up to the point that the multiplication overflows at e = 3529. + // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater + // than 2^9297. + assert(e >= 0); + assert(e <= 3528); + return (int32_t) ((((uint32_t) e) * 1217359) >> 19); + } + + // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528. + __device__ inline int32_t pow5bits(const int32_t e) { + // This approximation works up to the point that the multiplication overflows at e = 3529. + // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater + // than 2^9297. + assert(e >= 0); + assert(e <= 3528); + return (int32_t) (((((uint32_t) e) * 1217359) >> 19) + 1); + } + + // Returns e == 0 ? 
1 : ceil(log_2(5^e)); requires 0 <= e <= 3528. + __device__ inline int32_t ceil_log2pow5(const int32_t e) { + return log2pow5(e) + 1; + } + + // Returns floor(log_10(2^e)); requires 0 <= e <= 1650. + __device__ inline uint32_t log10Pow2(const int32_t e) { + // The first value this approximation fails for is 2^1651 which is just greater than 10^297. + assert(e >= 0); + assert(e <= 1650); + return (((uint32_t) e) * 78913) >> 18; + } + + // Returns floor(log_10(5^e)); requires 0 <= e <= 2620. + __device__ inline uint32_t log10Pow5(const int32_t e) { + // The first value this approximation fails for is 5^2621 which is just greater than 10^1832. + assert(e >= 0); + assert(e <= 2620); + return (((uint32_t) e) * 732923) >> 20; + } + + __device__ inline uint32_t pow5factor_32(uint32_t value) { + uint32_t count = 0; + for (;;) { + assert(value != 0); + const uint32_t q = value / 5; + const uint32_t r = value % 5; + if (r != 0) { + break; + } + value = q; + ++count; + } + return count; + } + + // Returns true if value is divisible by 5^p. + __device__ inline bool multipleOfPowerOf5_32(const uint32_t value, const uint32_t p) { + return pow5factor_32(value) >= p; + } + + // Returns true if value is divisible by 2^p. + __device__ inline bool multipleOfPowerOf2_32(const uint32_t value, const uint32_t p) { + // __builtin_ctz doesn't appear to be faster here. + return (value & ((1u << p) - 1)) == 0; + } + + // It seems to be slightly faster to avoid uint128_t here, although the + // generated code for uint128_t looks slightly nicer. + __device__ inline uint32_t mulShift32(const uint32_t m, const uint64_t factor, const int32_t shift) { + assert(shift > 32); + + // The casts here help MSVC to avoid calls to the __allmul library + // function. 
+ const uint32_t factorLo = (uint32_t)(factor); + const uint32_t factorHi = (uint32_t)(factor >> 32); + const uint64_t bits0 = (uint64_t)m * factorLo; + const uint64_t bits1 = (uint64_t)m * factorHi; + + const uint64_t sum = (bits0 >> 32) + bits1; + const uint64_t shiftedSum = sum >> (shift - 32); + assert(shiftedSum <= UINT32_MAX); + return (uint32_t) shiftedSum; + + } + + __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa) { + if (mantissa) { + memcpy(result, "NaN", 3); + return 3; + } + if (sign) { + result[0] = '-'; + } + if (exponent) { + memcpy(result + sign, "Infinity", 8); + return sign + 8; + } + memcpy(result + sign, "0.0", 3); + return sign + 3; + } + + __device__ inline int special_str_size(const bool sign, const bool exponent, const bool mantissa) { + if (mantissa) { + return 3; + } + if (exponent) { + return sign + 8; + } + return sign + 3; + } + + __device__ inline uint32_t float_to_bits(const float f) { + uint32_t bits = 0; + memcpy(&bits, &f, sizeof(float)); + return bits; + } + + __device__ inline uint64_t double_to_bits(const double d) { + uint64_t bits = 0; + memcpy(&bits, &d, sizeof(double)); + return bits; + } + + __device__ inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) { + // The casts here help MSVC to avoid calls to the __allmul library function. 
+ const uint32_t aLo = (uint32_t)a; + const uint32_t aHi = (uint32_t)(a >> 32); + const uint32_t bLo = (uint32_t)b; + const uint32_t bHi = (uint32_t)(b >> 32); + + const uint64_t b00 = (uint64_t)aLo * bLo; + const uint64_t b01 = (uint64_t)aLo * bHi; + const uint64_t b10 = (uint64_t)aHi * bLo; + const uint64_t b11 = (uint64_t)aHi * bHi; + + const uint32_t b00Lo = (uint32_t)b00; + const uint32_t b00Hi = (uint32_t)(b00 >> 32); + + const uint64_t mid1 = b10 + b00Hi; + const uint32_t mid1Lo = (uint32_t)(mid1); + const uint32_t mid1Hi = (uint32_t)(mid1 >> 32); + + const uint64_t mid2 = b01 + mid1Lo; + const uint32_t mid2Lo = (uint32_t)(mid2); + const uint32_t mid2Hi = (uint32_t)(mid2 >> 32); + + const uint64_t pHi = b11 + mid1Hi + mid2Hi; + const uint64_t pLo = ((uint64_t)mid2Lo << 32) | b00Lo; + + *productHi = pHi; + return pLo; + } + + __device__ inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) { + // We don't need to handle the case dist >= 64 here (see above). 
+ assert(dist < 64); + assert(dist > 0); + return (hi << (64 - dist)) | (lo >> dist); + } + + __device__ inline uint64_t div5(const uint64_t x) { + return x / 5; + } + + __device__ inline uint64_t div10(const uint64_t x) { + return x / 10; + } + + __device__ inline uint64_t div100(const uint64_t x) { + return x / 100; + } + + __device__ inline uint64_t div1e8(const uint64_t x) { + return x / 100000000; + } + + __device__ inline uint64_t div1e9(const uint64_t x) { + return x / 1000000000; + } + + __device__ inline uint32_t mod1e9(const uint64_t x) { + return (uint32_t) (x - 1000000000 * div1e9(x)); + } + + __device__ inline uint32_t pow5Factor(uint64_t value) { + const uint64_t m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64) + const uint64_t n_div_5 = 3689348814741910323u; // #{ n | n = 0 (mod 2^64) } = 2^64 / 5 + uint32_t count = 0; + for (;;) { + assert(value != 0); + value *= m_inv_5; + if (value > n_div_5) + break; + ++count; + } + return count; + } + + // Returns true if value is divisible by 5^p. + __device__ inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) { + // I tried a case distinction on p, but there was no performance difference. + return pow5Factor(value) >= p; + } + + // Returns true if value is divisible by 2^p. + __device__ inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) { + assert(value != 0); + assert(p < 64); + // __builtin_ctzll doesn't appear to be faster here. 
+ return (value & ((1ull << p) - 1)) == 0; + } + + __device__ inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) { + // m is maximum 55 bits + uint64_t high1; // 128 + const uint64_t low1 = umul128(m, mul[1], &high1); // 64 + uint64_t high0; // 64 + umul128(m, mul[0], &high0); // 0 + const uint64_t sum = high0 + low1; + if (sum < high0) { + ++high1; // overflow into high1 + } + return shiftright128(sum, high1, j - 64); + } + + __device__ inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j, + uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) { + *vp = mulShift64(4 * m + 2, mul, j); + *vm = mulShift64(4 * m - 1 - mmShift, mul, j); + return mulShift64(4 * m, mul, j); + } + + // Computes 5^i in the form required by Ryu, and stores it in the given pointer. + __device__ inline void double_computePow5(const uint32_t i, uint64_t* const result) { + const uint32_t base = i / POW5_TABLE_SIZE; + const uint32_t base2 = base * POW5_TABLE_SIZE; + const uint32_t offset = i - base2; + const uint64_t* const mul = DOUBLE_POW5_SPLIT2[base]; + if (offset == 0) { + result[0] = mul[0]; + result[1] = mul[1]; + return; + } + const uint64_t m = DOUBLE_POW5_TABLE[offset]; + uint64_t high1; + const uint64_t low1 = umul128(m, mul[1], &high1); + uint64_t high0; + const uint64_t low0 = umul128(m, mul[0], &high0); + const uint64_t sum = high0 + low1; + if (sum < high0) { + ++high1; // overflow into high1 + } + // high1 | sum | low0 + const uint32_t delta = pow5bits(i) - pow5bits(base2); + result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); + result[1] = shiftright128(sum, high1, delta); + } + + // Computes 5^-i in the form required by Ryu, and stores it in the given pointer. 
+ __device__ inline void double_computeInvPow5(const uint32_t i, uint64_t* const result) { + const uint32_t base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE; + const uint32_t base2 = base * POW5_TABLE_SIZE; + const uint32_t offset = base2 - i; + const uint64_t* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2 + if (offset == 0) { + result[0] = mul[0]; + result[1] = mul[1]; + return; + } + const uint64_t m = DOUBLE_POW5_TABLE[offset]; + uint64_t high1; + const uint64_t low1 = umul128(m, mul[1], &high1); + uint64_t high0; + const uint64_t low0 = umul128(m, mul[0] - 1, &high0); + const uint64_t sum = high0 + low1; + if (sum < high0) { + ++high1; // overflow into high1 + } + // high1 | sum | low0 + const uint32_t delta = pow5bits(base2) - pow5bits(i); + result[0] = shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); + result[1] = shiftright128(sum, high1, delta); + } + + __device__ inline uint32_t mulPow5InvDivPow2(const uint32_t m, const uint32_t q, const int32_t j) { + // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double lookup + // table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely on the + // fact that the added 1 that's already stored in the table never overflows into the upper 64 bits. + uint64_t pow5[2]; + double_computeInvPow5(q, pow5); + return mulShift32(m, pow5[1] + 1, j); + } + + __device__ inline uint32_t mulPow5divPow2(const uint32_t m, const uint32_t i, const int32_t j) { + uint64_t pow5[2]; + double_computePow5(i, pow5); + return mulShift32(m, pow5[1], j); + } + + __device__ inline uint32_t decimalLength17(const uint64_t v) { + // This is slightly faster than a loop. + // The average output length is 16.38 digits, so we check high-to-low. + // Function precondition: v is not an 18, 19, or 20-digit number. + // (17 digits are sufficient for round-tripping.) 
+ assert(v < 100000000000000000L); + if (v >= 10000000000000000L) { return 17; } + if (v >= 1000000000000000L) { return 16; } + if (v >= 100000000000000L) { return 15; } + if (v >= 10000000000000L) { return 14; } + if (v >= 1000000000000L) { return 13; } + if (v >= 100000000000L) { return 12; } + if (v >= 10000000000L) { return 11; } + if (v >= 1000000000L) { return 10; } + if (v >= 100000000L) { return 9; } + if (v >= 10000000L) { return 8; } + if (v >= 1000000L) { return 7; } + if (v >= 100000L) { return 6; } + if (v >= 10000L) { return 5; } + if (v >= 1000L) { return 4; } + if (v >= 100L) { return 3; } + if (v >= 10L) { return 2; } + return 1; + } + + __device__ inline floating_decimal_64 d2d(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) { + int32_t e2; + uint64_t m2; + if (ieeeExponent == 0) { + // We subtract 2 so that the bounds computation has 2 additional bits. + e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + m2 = ieeeMantissa; + } else { + e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; + } + const bool even = (m2 & 1) == 0; + const bool acceptBounds = even; + + // Step 2: Determine the interval of valid decimal representations. + const uint64_t mv = 4 * m2; + // Implicit bool -> int conversion. True is 1, false is 0. + const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; + // We would compute mp and mm like this: + // uint64_t mp = 4 * m2 + 2; + // uint64_t mm = mv - 1 - mmShift; + + // Step 3: Convert to a decimal power base using 128-bit arithmetic. + uint64_t vr, vp, vm; + int32_t e10; + bool vmIsTrailingZeros = false; + bool vrIsTrailingZeros = false; + if (e2 >= 0) { + // I tried special-casing q == 0, but there was no effect on performance. + // This expression is slightly faster than max(0, log10Pow2(e2) - 1). 
+ const uint32_t q = log10Pow2(e2) - (e2 > 3); + e10 = (int32_t) q; + const int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; + const int32_t i = -e2 + (int32_t) q + k; + uint64_t pow5[2]; + double_computeInvPow5(q, pow5); + vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift); + + if (q <= 21) { + // This should use q <= 22, but I think 21 is also safe. Smaller values + // may still be safe, but it's more difficult to reason about them. + // Only one of mp, mv, and mm can be a multiple of 5, if any. + const uint32_t mvMod5 = ((uint32_t) mv) - 5 * ((uint32_t) div5(mv)); + if (mvMod5 == 0) { + vrIsTrailingZeros = multipleOfPowerOf5(mv, q); + } else if (acceptBounds) { + // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q + // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q + // <=> true && pow5Factor(mm) >= q, since e2 >= q. + vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q); + } else { + // Same as min(e2 + 1, pow5Factor(mp)) >= q. + vp -= multipleOfPowerOf5(mv + 2, q); + } + } + } else { + // This expression is slightly faster than max(0, log10Pow5(-e2) - 1). + const uint32_t q = log10Pow5(-e2) - (-e2 > 1); + e10 = (int32_t) q + e2; + const int32_t i = -e2 - (int32_t) q; + const int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT; + const int32_t j = (int32_t) q - k; + + uint64_t pow5[2]; + double_computePow5(i, pow5); + vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift); + + if (q <= 1) { + // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. + // mv = 4 * m2, so it always has at least two trailing 0 bits. + vrIsTrailingZeros = true; + if (acceptBounds) { + // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. + vmIsTrailingZeros = mmShift == 1; + } else { + // mp = mv + 2, so it always has at least one trailing 0 bit. + --vp; + } + } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here. + // We want to know if the full product has at least q trailing zeros. 
+ // We need to compute min(p2(mv), p5(mv) - e2) >= q + // <=> p2(mv) >= q && p5(mv) - e2 >= q + // <=> p2(mv) >= q (because -e2 >= q) + vrIsTrailingZeros = multipleOfPowerOf2(mv, q); + } + } + + // Step 4: Find the shortest decimal representation in the interval of valid representations. + int32_t removed = 0; + uint8_t lastRemovedDigit = 0; + uint64_t output; + // On average, we remove ~2 digits. + if (vmIsTrailingZeros || vrIsTrailingZeros) { + // General case, which happens rarely (~0.7%). + for (;;) { + const uint64_t vpDiv10 = div10(vp); + const uint64_t vmDiv10 = div10(vm); + if (vpDiv10 <= vmDiv10) { + break; + } + const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10); + const uint64_t vrDiv10 = div10(vr); + const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); + vmIsTrailingZeros &= vmMod10 == 0; + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) vrMod10; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + + if (vmIsTrailingZeros) { + for (;;) { + const uint64_t vmDiv10 = div10(vm); + const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10); + if (vmMod10 != 0) { + break; + } + const uint64_t vpDiv10 = div10(vp); + const uint64_t vrDiv10 = div10(vr); + const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) vrMod10; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + } + + if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { + // Round even if the exact number is .....50..0. + lastRemovedDigit = 4; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); + } else { + // Specialized for the common case (~99.3%). Percentages below are relative to this. 
+ bool roundUp = false; + const uint64_t vpDiv100 = div100(vp); + const uint64_t vmDiv100 = div100(vm); + if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%). + const uint64_t vrDiv100 = div100(vr); + const uint32_t vrMod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrDiv100); + roundUp = vrMod100 >= 50; + vr = vrDiv100; + vp = vpDiv100; + vm = vmDiv100; + removed += 2; + } + // Loop iterations below (approximately), without optimization above: + // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02% + // Loop iterations below (approximately), with optimization above: + // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02% + for (;;) { + const uint64_t vpDiv10 = div10(vp); + const uint64_t vmDiv10 = div10(vm); + if (vpDiv10 <= vmDiv10) { + break; + } + const uint64_t vrDiv10 = div10(vr); + const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); + roundUp = vrMod10 >= 5; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + (vr == vm || roundUp); + } + const int32_t exp = e10 + removed; + + floating_decimal_64 fd; + fd.exponent = exp; + fd.mantissa = output; + return fd; + } + + __device__ inline floating_decimal_32 f2d(const uint32_t ieeeMantissa, const uint32_t ieeeExponent) { + int32_t e2; + uint32_t m2; + if (ieeeExponent == 0) { + // We subtract 2 so that the bounds computation has 2 additional bits. + e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + m2 = ieeeMantissa; + } else { + e2 = (int32_t) ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa; + } + const bool even = (m2 & 1) == 0; + const bool acceptBounds = even; + + // Step 2: Determine the interval of valid decimal representations. + const uint32_t mv = 4 * m2; + const uint32_t mp = 4 * m2 + 2; + // Implicit bool -> int conversion. True is 1, false is 0. 
+ const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; + const uint32_t mm = 4 * m2 - 1 - mmShift; + + // Step 3: Convert to a decimal power base using 64-bit arithmetic. + uint32_t vr, vp, vm; + int32_t e10; + bool vmIsTrailingZeros = false; + bool vrIsTrailingZeros = false; + uint8_t lastRemovedDigit = 0; + if (e2 >= 0) { + const uint32_t q = log10Pow2(e2); + e10 = (int32_t) q; + const int32_t k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; + const int32_t i = -e2 + (int32_t) q + k; + vr = mulPow5InvDivPow2(mv, q, i); + vp = mulPow5InvDivPow2(mp, q, i); + vm = mulPow5InvDivPow2(mm, q, i); + if (q != 0 && (vp - 1) / 10 <= vm / 10) { + // We need to know one removed digit even if we are not going to loop below. We could use + // q = X - 1 above, except that would require 33 bits for the result, and we've found that + // 32-bit arithmetic is faster even on 64-bit machines. + const int32_t l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) (q - 1)) - 1; + lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t) q - 1 + l) % 10); + } + if (q <= 9) { + // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well. + // Only one of mp, mv, and mm can be a multiple of 5, if any. 
+ if (mv % 5 == 0) { + vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q); + } else if (acceptBounds) { + vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q); + } else { + vp -= multipleOfPowerOf5_32(mp, q); + } + } + } else { + const uint32_t q = log10Pow5(-e2); + e10 = (int32_t) q + e2; + const int32_t i = -e2 - (int32_t) q; + const int32_t k = pow5bits(i) - FLOAT_POW5_BITCOUNT; + int32_t j = (int32_t) q - k; + vr = mulPow5divPow2(mv, (uint32_t) i, j); + vp = mulPow5divPow2(mp, (uint32_t) i, j); + vm = mulPow5divPow2(mm, (uint32_t) i, j); + if (q != 0 && (vp - 1) / 10 <= vm / 10) { + j = (int32_t) q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); + lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, (uint32_t) (i + 1), j) % 10); + } + if (q <= 1) { + // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. + // mv = 4 * m2, so it always has at least two trailing 0 bits. + vrIsTrailingZeros = true; + if (acceptBounds) { + // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. + vmIsTrailingZeros = mmShift == 1; + } else { + // mp = mv + 2, so it always has at least one trailing 0 bit. + --vp; + } + } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here. + vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1); + } + } + + // Step 4: Find the shortest decimal representation in the interval of valid representations. + int32_t removed = 0; + uint32_t output; + if (vmIsTrailingZeros || vrIsTrailingZeros) { + // General case, which happens rarely (~4.0%). 
+ while (vp / 10 > vm / 10) { + vmIsTrailingZeros &= vm % 10 == 0; + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) (vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + if (vmIsTrailingZeros) { + while (vm % 10 == 0) { + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) (vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + } + if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { + // Round even if the exact number is .....50..0. + lastRemovedDigit = 4; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); + } else { + // Specialized for the common case (~96.0%). Percentages below are relative to this. + // Loop iterations below (approximately): + // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01% + while (vp / 10 > vm / 10) { + lastRemovedDigit = (uint8_t) (vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + (vr == vm || lastRemovedDigit >= 5); + } + const int32_t exp = e10 + removed; + + floating_decimal_32 fd; + fd.exponent = exp; + fd.mantissa = output; + return fd; + } + + __device__ inline int to_chars(const floating_decimal_64 v, const bool sign, char* const result) { + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { + result[index++] = '-'; + } + + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + // Values in the interval [1E-3, 1E7) are special. + if (scientificNotation) { + // Print in the format x.xxxxxE-yy. 
+ for (uint32_t i = 0; i < olength - 1; ++i) { + const uint32_t c = output % 10; output /= 10; + result[index + olength - i] = (char) ('0' + c); + } + result[index] = '0' + output % 10; + result[index + 1] = '.'; + index += olength + 1; + if (olength == 1) { + result[index++] = '0'; + } + // Print 'E', the exponent sign, and the exponent, which has at most three digits. + result[index++] = 'E'; + if (exp < 0) { + result[index++] = '-'; + exp = -exp; + } + if (exp >= 100) { + result[index++] = (char) ('0' + exp / 100); + exp %= 100; + result[index++] = (char) ('0' + exp / 10); + } else if (exp >= 10) { + result[index++] = (char) ('0' + exp / 10); + } + result[index++] = (char) ('0' + exp % 10); + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + result[index++] = '0'; + result[index++] = '.'; + for (int i = -1; i > exp; i--) { + result[index++] = '0'; + } + int current = index; + for (int i = 0; i < olength; i++) { + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + index++; + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + for (int i = 0; i < olength; i++) { + result[index + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength; + for (int i = olength; i < exp + 1; i++) { + result[index++] = '0'; + } + result[index++] = '.'; + result[index++] = '0'; + } else { + // Decimal dot is somewhere between the digits. 
+ int current = index + 1; + for (int i = 0; i < olength; i++) { + if (olength - i - 1 == exp) { + result[current + olength - i - 1] = '.'; + current--; + } + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength + 1; + } + } + return index; + } + + __device__ inline int d2s_size(const floating_decimal_64 v, const bool sign) { + int index = 0; + if (sign) { + index++; + } + + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + index += olength + 1; + if (olength == 1) { + index++; + } + // 'E' + index++; + if (exp < 0) { + exp = -exp; + index++; + } + if (exp >= 100) { + index += 3; + } else if (exp >= 10) { + index += 2; + } else { + index++; + } + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + index += 1 - exp + olength; + } else if (exp + 1 >= olength) { + index += exp + 3; + } else { + index += olength + 1; + } + } + return index; + } + + __device__ inline int to_chars(const floating_decimal_32 v, const bool sign, char* const result) { + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { + result[index++] = '-'; + } + + uint32_t output = v.mantissa; + const uint32_t olength = decimalLength9(output); + int32_t exp = v.exponent + olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + // Print in the format x.xxxxxE-yy. + for (int i = 0; i < olength - 1; i++) { + int c = output % 10; output /= 10; + result[index + olength - i] = (char) ('0' + c); + } + result[index] = (char) ('0' + output % 10); + result[index + 1] = '.'; + index += olength + 1; + if (olength == 1) { + result[index++] = '0'; + } + + // Print 'E', the exponent sign, and the exponent, which has at most two digits. 
+ result[index++] = 'E'; + if (exp < 0) { + result[index++] = '-'; + exp = -exp; + } + if (exp >= 10) { + result[index++] = (char) ('0' + exp / 10); + } + result[index++] = (char) ('0' + exp % 10); + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + result[index++] = '0'; + result[index++] = '.'; + for (int i = -1; i > exp; i--) { + result[index++] = '0'; + } + int current = index; + for (int i = 0; i < olength; i++) { + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + index++; + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + for (int i = 0; i < olength; i++) { + result[index + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength; + for (int i = olength; i < exp + 1; i++) { + result[index++] = '0'; + } + result[index++] = '.'; + result[index++] = '0'; + } else { + // Decimal dot is somewhere between the digits. + int current = index + 1; + for (int i = 0; i < olength; i++) { + if (olength - i - 1 == exp) { + result[current + olength - i - 1] = '.'; + current--; + } + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength + 1; + } + } + return index; + } + + __device__ inline int f2s_size(const floating_decimal_32 v, const bool sign) { + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { + index++; + } + + uint32_t output = v.mantissa; + const uint32_t olength = decimalLength9(output); + int32_t exp = v.exponent + olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + index += olength + 1; + if (olength == 1) { + index++; + } + // 'E' + index++; + if (exp < 0) { + index++; + exp = -exp; + } + if (exp >= 10) { + index++; + } + index++; + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). 
+ if (exp < 0) { + // Decimal dot is before any of the digits. + index += 1 - exp + olength; + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + index += exp + 3; + } else { + // Decimal dot is somewhere between the digits. + index += olength + 1; + } + } + return index; + } + + __device__ inline bool d2d_small_int(const uint64_t ieeeMantissa, const uint32_t ieeeExponent, + floating_decimal_64* const v) { + const uint64_t m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; + const int32_t e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; + + if (e2 > 0) { + // f = m2 * 2^e2 >= 2^53 is an integer. + // Ignore this case for now. + return false; + } + + if (e2 < -52) { + // f < 1. + return false; + } + + // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53. + // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0. + const uint64_t mask = (1ull << -e2) - 1; + const uint64_t fraction = m2 & mask; + if (fraction != 0) { + return false; + } + + // f is an integer in the range [1, 2^53). + // Note: mantissa might contain trailing (decimal) 0's. + // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17(). + v->mantissa = m2 >> -e2; + v->exponent = 0; + return true; + } + + __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) { + // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. + const uint64_t bits = double_to_bits(f); + + // Decode bits into sign, mantissa, and exponent. + ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; + const uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1); + const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); + // Case distinction; exit early for the easy cases. 
+ if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { + special = true; + return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent}; + } + special = false; + floating_decimal_64 v; + const bool isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v); + if (isSmallInt) { + // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros. + // For scientific notation we need to move these zeros into the exponent. + // (This is not needed for fixed-point notation, so it might be beneficial to trim + // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.) + for (;;) { + const uint64_t q = div10(v.mantissa); + const uint32_t r = ((uint32_t) v.mantissa) - 10 * ((uint32_t) q); + if (r != 0) { + break; + } + v.mantissa = q; + ++v.exponent; + } + } else { + v = d2d(ieeeMantissa, ieeeExponent); + } + return v; + } + + __device__ int d2s_buffered_n(double f, char* result) { + bool sign = false, special = false; + floating_decimal_64 v = d2d(f, sign, special); + if (special) { + return copy_special_str(result, sign, v.exponent, v.mantissa); + } + return to_chars(v, sign, result); + } + + __device__ int compute_d2s_size(double value) { + bool sign = false, special = false; + floating_decimal_64 v = d2d(value, sign, special); + if (special) { + return special_str_size(sign, v.exponent, v.mantissa); + } + return d2s_size(v, sign); + } + + __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) { + // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. + const uint32_t bits = float_to_bits(f); + + // Decode bits into sign, mantissa, and exponent. 
+ ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0; + const uint32_t ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1); + const uint32_t ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1); + + // Case distinction; exit early for the easy cases. + if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { + special = true; + return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent}; + } + special = false; + return f2d(ieeeMantissa, ieeeExponent); + } + + __device__ int f2s_buffered_n(float f, char* result) { + bool sign = false, special = false; + floating_decimal_32 v = f2d(f, sign, special); + if (special) { + return copy_special_str(result, sign, v.exponent, v.mantissa); + } + return to_chars(v, sign, result); + } + + __device__ int compute_f2s_size(float value) { + bool sign = false, special = false; + floating_decimal_32 v = f2d(value, sign, special); + if (special) { + return special_str_size(sign, v.exponent, v.mantissa); + } + return f2s_size(v, sign); + } + + __device__ int compute_ftos_size(double value, bool is_float) { + if (is_float) { + return compute_f2s_size(value); + } else { + return compute_d2s_size(value); + } + } + + __device__ int float_to_string(double value, char* output, bool is_float) { + if (is_float) { + return f2s_buffered_n(value, output); + } else { + return d2s_buffered_n(value, output); + } + } +}; + +} +} +} \ No newline at end of file diff --git a/thirdparty/cudf b/thirdparty/cudf index fa4e8ab1af..87d2a36f04 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit fa4e8ab1af4acfd2c88a619b4d9693f4a5fda168 +Subproject commit 87d2a36f04f431a8c5236d2aee723ec79b9dc5f9 From 4c75bc797c0618705b1b37c0faae271619bedc8c Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 13 Nov 2023 16:52:13 +0800 Subject: [PATCH 12/54] clean up Signed-off-by: Haoyang Li --- src/main/cpp/CMakeLists.txt | 1 + 
src/main/cpp/src/ftos_converter.cu | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 8f90b9078e..a6ac8ac98c 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -168,6 +168,7 @@ add_library( src/cast_string_to_float.cu src/datetime_rebase.cu src/decimal_utils.cu + src/ftos_converter.cu src/histogram.cu src/map_utils.cu src/murmur_hash.cu diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu index 7d5cf0716f..2d5424319e 100644 --- a/src/main/cpp/src/ftos_converter.cu +++ b/src/main/cpp/src/ftos_converter.cu @@ -19,8 +19,6 @@ #include #include -using namespace cudf; - namespace spark_rapids_jni { namespace detail { @@ -1160,4 +1158,4 @@ struct ftos_converter { } } -} \ No newline at end of file +} From f1c11e6bace6b79de768cb4d9ad0482b5377c7fd Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 14 Nov 2023 17:53:13 +0800 Subject: [PATCH 13/54] resolve cudf conflicts Signed-off-by: Haoyang Li --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 87d2a36f04..5d09d38bc8 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 87d2a36f04f431a8c5236d2aee723ec79b9dc5f9 +Subproject commit 5d09d38bc8ea44e1bdf1fa29e11a820c7417bac5 From 760799be1b82cc7b5caf4aee63d718d1a48e921d Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 14 Nov 2023 17:57:17 +0800 Subject: [PATCH 14/54] resolve cudf conflicts Signed-off-by: Haoyang Li --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5d09d38bc8..54c00e2d2f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5d09d38bc8ea44e1bdf1fa29e11a820c7417bac5 +Subproject commit 54c00e2d2f6d7049c91594f9670f5b25d587f9f2 From bfba655b7e3dde0daff03c0822b6c152c03bfc5c Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: 
Tue, 14 Nov 2023 17:59:09 +0800 Subject: [PATCH 15/54] resolve cudf conflicts Signed-off-by: Haoyang Li --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 54c00e2d2f..5d09d38bc8 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 54c00e2d2f6d7049c91594f9670f5b25d587f9f2 +Subproject commit 5d09d38bc8ea44e1bdf1fa29e11a820c7417bac5 From ad27fee1efd9bc714ebec51d6e8e71258cbbc4e9 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 14 Nov 2023 18:20:51 +0800 Subject: [PATCH 16/54] resolve cudf conflicts Signed-off-by: Haoyang Li --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5d09d38bc8..5935ef3ce2 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5d09d38bc8ea44e1bdf1fa29e11a820c7417bac5 +Subproject commit 5935ef3ce26b1eb7136dcaa989a36b15071a9d0d From 40a4cb8f8da0bbd852dba668e417b1e0921895a2 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 14 Nov 2023 23:17:25 +0800 Subject: [PATCH 17/54] remove cudf changes Signed-off-by: Haoyang Li --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 5935ef3ce2..e982d3736f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 5935ef3ce26b1eb7136dcaa989a36b15071a9d0d +Subproject commit e982d3736f095e680298af85bde732d9b5a73122 From 05f55175d26044b196f099d42a9f55c8ab5ecd98 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 14 Nov 2023 23:22:42 +0800 Subject: [PATCH 18/54] remove cudf changes Signed-off-by: Haoyang Li --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e982d3736f..87d2a36f04 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e982d3736f095e680298af85bde732d9b5a73122 +Subproject commit 
87d2a36f04f431a8c5236d2aee723ec79b9dc5f9 From 8ed59bd538726d55c14de20c4631942beb565be6 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Thu, 16 Nov 2023 09:56:38 +0800 Subject: [PATCH 19/54] add ryu Signed-off-by: Haoyang Li --- src/main/cpp/CMakeLists.txt | 1 + src/main/cpp/src/format_float.cu | 108 --- src/main/cpp/src/ftos_converter.cu | 1162 ++++++++++++++++++++++++++++ 3 files changed, 1163 insertions(+), 108 deletions(-) create mode 100644 src/main/cpp/src/ftos_converter.cu diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 745a9df2a7..fb88705259 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -168,6 +168,7 @@ add_library( src/cast_string_to_float.cu src/datetime_rebase.cu src/decimal_utils.cu + src/ftos_converter.cu src/histogram.cu src/map_utils.cu src/murmur_hash.cu diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu index 5972d108a9..73c8bb7be1 100644 --- a/src/main/cpp/src/format_float.cu +++ b/src/main/cpp/src/format_float.cu @@ -125,114 +125,6 @@ struct ftos_converter { return result; } - // __device__ char* format_ll(long long n, char* result, char* dec_ptr, int& dec_pos, int exp10) { - // if (n == 0) { - // *result++ = '0'; - // return result; - // } - // int sep_count = 0; - // char buffer[305]; // should be big-enough for significant digits - // char* ptr = buffer; - // while (n > 0) { - // if (sep_count == 3) { - // *ptr++ = ','; - // sep_count = 0; - // } - // *ptr++ = (char)('0' + (n % 10)); - // n /= 10; - // sep_count++; - // } - // int len = dec_ptr - dec_str; - // int dec_pos = 0; - // while (exp10--) { - // if (sep_count == 3) { - // *ptr++ = ','; - // sep_count = 0; - // } - // if (dec_pos < len) { - // *ptr++ = dec_str[dec_pos++]; - // } else { - // *ptr++ = '0'; - // } - // sep_count++; - // } - // while (ptr != buffer) { - // *result++ = *--ptr; // 54321 -> 12345 - // } - // return result; - // } - - // /** - // * @brief Dissect a float value into 
integer, decimal, and exponent components. - // * - // * @return The number of decimal places. - // */ - // __device__ int dissect_value(double value, - // int digits, - // unsigned int& integer, - // unsigned long long& decimal, - // int& exp10, - // bool is_float = false) - // { - // // normalize step puts value between lower-limit and upper-limit - // // by adjusting the exponent up or down - // exp10 = 0; - // if (value > upper_limit) { - // int fx = 256; - // for (int idx = 8; idx >= 0; --idx) { - // if (value >= upper10[idx]) { - // value *= lower10[idx]; - // exp10 += fx; - // } - // fx = fx >> 1; - // } - // } else if ((value > 0.0) && (value < lower_limit)) { - // int fx = 256; - // for (int idx = 8; idx >= 0; --idx) { - // if (value < blower10[idx]) { - // value *= upper10[idx]; - // exp10 -= fx; - // } - // fx = fx >> 1; - // } - // } - // // - // // int decimal_places = significant_digits - (exp10? 2 : 1); - // // unsigned long long max_digits = (exp10? fifteen_digits : sixteen_digits); - // int decimal_places = (is_float? significant_digits_float: significant_digits_double) - 1; - // unsigned long long max_digits = (is_float? 
eight_digits: sixteen_digits); - // double temp_value = value; - // while (temp_value < 1.0 && temp_value > 0.0) { - // max_digits *= 10; - // temp_value *= 10.0; - // decimal_places++; - // } - // integer = (unsigned int)value; - // for (unsigned int i = integer; i >= 10; i /= 10) { - // --decimal_places; - // max_digits /= 10; - // } - // double diff = value - (double)integer; - // double remainder = diff * (double)max_digits; - // decimal = (unsigned long long)remainder; - // remainder -= (double)decimal; - // decimal += (unsigned long long)(2.0 * remainder); // round up - // if (decimal >= max_digits) { - // decimal = 0; - // ++integer; - // if (exp10 && (integer >= 10)) { - // ++exp10; - // integer = 1; - // } - // } - // // - // while ((decimal % 10) == 0 && (decimal_places > 0)) { - // decimal /= 10; - // --decimal_places; - // } - // return decimal_places; - // } - /** * @brief Main kernel method for converting float value to char output array. * diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu new file mode 100644 index 0000000000..55d2e4282c --- /dev/null +++ b/src/main/cpp/src/ftos_converter.cu @@ -0,0 +1,1162 @@ +/* Not a contribution + * Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as + * NVIDIA-proprietary are not a contribution and subject to the following terms and conditions: + * + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * + * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual + * property and proprietary rights in and to this material, related + * documentation and any modifications thereto. Any use, reproduction, + * disclosure or distribution of this material and related documentation + * without an express license agreement from NVIDIA CORPORATION or + * its affiliates is strictly prohibited. 
+ */ + +#include +#include +#include +#include +#include + +namespace spark_rapids_jni { + +namespace detail { +namespace { + +// A floating decimal representing m * 10^e. +typedef struct floating_decimal_64 { + uint64_t mantissa; + // Decimal exponent's range is -324 to 308 + // inclusive, and can fit in a short if needed. + int32_t exponent; +} floating_decimal_64; + +// A floating decimal representing m * 10^e. +typedef struct floating_decimal_32 { + uint32_t mantissa; + // Decimal exponent's range is -45 to 38 + // inclusive, and can fit in a short if needed. + int32_t exponent; +} floating_decimal_32; + +struct ftos_converter { + + // These tables are generated by PrintDoubleLookupTable. + static constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125; + static constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125; + static constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64); + static constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64); + static constexpr unsigned int DOUBLE_MANTISSA_BITS = 52; + static constexpr unsigned int DOUBLE_EXPONENT_BITS = 11; + static constexpr unsigned int DOUBLE_BIAS = 1023; + static constexpr unsigned int FLOAT_MANTISSA_BITS = 23; + static constexpr unsigned int FLOAT_EXPONENT_BITS = 8; + static constexpr unsigned int FLOAT_BIAS = 127; + + + // Returns the number of decimal digits in v, which must not contain more than 9 digits. + __device__ inline uint32_t decimalLength9(const uint32_t v) { + // Function precondition: v is not a 10-digit number. + // (f2s: 9 digits are sufficient for round-tripping.) + // (d2fixed: We print 9-digit blocks.) 
+ assert(v < 1000000000); + if (v >= 100000000) { return 9; } + if (v >= 10000000) { return 8; } + if (v >= 1000000) { return 7; } + if (v >= 100000) { return 6; } + if (v >= 10000) { return 5; } + if (v >= 1000) { return 4; } + if (v >= 100) { return 3; } + if (v >= 10) { return 2; } + return 1; + } + + const uint64_t DOUBLE_POW5_INV_SPLIT2[15][2] = { + { 1u, 2305843009213693952u }, + { 5955668970331000884u, 1784059615882449851u }, + { 8982663654677661702u, 1380349269358112757u }, + { 7286864317269821294u, 2135987035920910082u }, + { 7005857020398200553u, 1652639921975621497u }, + { 17965325103354776697u, 1278668206209430417u }, + { 8928596168509315048u, 1978643211784836272u }, + { 10075671573058298858u, 1530901034580419511u }, + { 597001226353042382u, 1184477304306571148u }, + { 1527430471115325346u, 1832889850782397517u }, + { 12533209867169019542u, 1418129833677084982u }, + { 5577825024675947042u, 2194449627517475473u }, + { 11006974540203867551u, 1697873161311732311u }, + { 10313493231639821582u, 1313665730009899186u }, + { 12701016819766672773u, 2032799256770390445u } + }; + + const uint32_t POW5_INV_OFFSETS[19] = { + 0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555, + 0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054, + 0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554, + 0x00000000 + }; + + const uint64_t DOUBLE_POW5_SPLIT2[13][2] = { + { 0u, 1152921504606846976u }, + { 0u, 1490116119384765625u }, + { 1032610780636961552u, 1925929944387235853u }, + { 7910200175544436838u, 1244603055572228341u }, + { 16941905809032713930u, 1608611746708759036u }, + { 13024893955298202172u, 2079081953128979843u }, + { 6607496772837067824u, 1343575221513417750u }, + { 17332926989895652603u, 1736530273035216783u }, + { 13037379183483547984u, 2244412773384604712u }, + { 1605989338741628675u, 1450417759929778918u }, + { 9630225068416591280u, 1874621017369538693u }, + { 665883850346957067u, 1211445438634777304u }, + { 
14931890668723713708u, 1565756531257009982u } + }; + + const uint32_t POW5_OFFSETS[21] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995, + 0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540, + 0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151, + 0x55559155, 0x51405555, 0x00000105 + }; + + static constexpr uint32_t POW5_TABLE_SIZE = 26; + const uint64_t DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = { + 1ull, 5ull, 25ull, 125ull, 625ull, 3125ull, 15625ull, 78125ull, 390625ull, + 1953125ull, 9765625ull, 48828125ull, 244140625ull, 1220703125ull, 6103515625ull, + 30517578125ull, 152587890625ull, 762939453125ull, 3814697265625ull, + 19073486328125ull, 95367431640625ull, 476837158203125ull, + 2384185791015625ull, 11920928955078125ull, 59604644775390625ull, + 298023223876953125ull //, 1490116119384765625ull + }; + + // Returns e == 0 ? 1 : [log_2(5^e)]; requires 0 <= e <= 3528. + __device__ inline int32_t log2pow5(const int32_t e) { + // This approximation works up to the point that the multiplication overflows at e = 3529. + // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater + // than 2^9297. + assert(e >= 0); + assert(e <= 3528); + return (int32_t) ((((uint32_t) e) * 1217359) >> 19); + } + + // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528. + __device__ inline int32_t pow5bits(const int32_t e) { + // This approximation works up to the point that the multiplication overflows at e = 3529. + // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater + // than 2^9297. + assert(e >= 0); + assert(e <= 3528); + return (int32_t) (((((uint32_t) e) * 1217359) >> 19) + 1); + } + + // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528. + __device__ inline int32_t ceil_log2pow5(const int32_t e) { + return log2pow5(e) + 1; + } + + // Returns floor(log_10(2^e)); requires 0 <= e <= 1650. 
+ __device__ inline uint32_t log10Pow2(const int32_t e) { + // The first value this approximation fails for is 2^1651 which is just greater than 10^297. + assert(e >= 0); + assert(e <= 1650); + return (((uint32_t) e) * 78913) >> 18; + } + + // Returns floor(log_10(5^e)); requires 0 <= e <= 2620. + __device__ inline uint32_t log10Pow5(const int32_t e) { + // The first value this approximation fails for is 5^2621 which is just greater than 10^1832. + assert(e >= 0); + assert(e <= 2620); + return (((uint32_t) e) * 732923) >> 20; + } + + __device__ inline uint32_t pow5factor_32(uint32_t value) { + uint32_t count = 0; + for (;;) { + assert(value != 0); + const uint32_t q = value / 5; + const uint32_t r = value % 5; + if (r != 0) { + break; + } + value = q; + ++count; + } + return count; + } + + // Returns true if value is divisible by 5^p. + __device__ inline bool multipleOfPowerOf5_32(const uint32_t value, const uint32_t p) { + return pow5factor_32(value) >= p; + } + + // Returns true if value is divisible by 2^p. + __device__ inline bool multipleOfPowerOf2_32(const uint32_t value, const uint32_t p) { + // __builtin_ctz doesn't appear to be faster here. + return (value & ((1u << p) - 1)) == 0; + } + + // It seems to be slightly faster to avoid uint128_t here, although the + // generated code for uint128_t looks slightly nicer. + __device__ inline uint32_t mulShift32(const uint32_t m, const uint64_t factor, const int32_t shift) { + assert(shift > 32); + + // The casts here help MSVC to avoid calls to the __allmul library + // function. 
+ const uint32_t factorLo = (uint32_t)(factor); + const uint32_t factorHi = (uint32_t)(factor >> 32); + const uint64_t bits0 = (uint64_t)m * factorLo; + const uint64_t bits1 = (uint64_t)m * factorHi; + + const uint64_t sum = (bits0 >> 32) + bits1; + const uint64_t shiftedSum = sum >> (shift - 32); + assert(shiftedSum <= UINT32_MAX); + return (uint32_t) shiftedSum; + + } + + __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa) { + if (mantissa) { + memcpy(result, "NaN", 3); + return 3; + } + if (sign) { + result[0] = '-'; + } + if (exponent) { + memcpy(result + sign, "Infinity", 8); + return sign + 8; + } + memcpy(result + sign, "0.0", 3); + return sign + 3; + } + + __device__ inline int special_str_size(const bool sign, const bool exponent, const bool mantissa) { + if (mantissa) { + return 3; + } + if (exponent) { + return sign + 8; + } + return sign + 3; + } + + __device__ inline uint32_t float_to_bits(const float f) { + uint32_t bits = 0; + memcpy(&bits, &f, sizeof(float)); + return bits; + } + + __device__ inline uint64_t double_to_bits(const double d) { + uint64_t bits = 0; + memcpy(&bits, &d, sizeof(double)); + return bits; + } + + __device__ inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) { + // The casts here help MSVC to avoid calls to the __allmul library function. 
+ const uint32_t aLo = (uint32_t)a; + const uint32_t aHi = (uint32_t)(a >> 32); + const uint32_t bLo = (uint32_t)b; + const uint32_t bHi = (uint32_t)(b >> 32); + + const uint64_t b00 = (uint64_t)aLo * bLo; + const uint64_t b01 = (uint64_t)aLo * bHi; + const uint64_t b10 = (uint64_t)aHi * bLo; + const uint64_t b11 = (uint64_t)aHi * bHi; + + const uint32_t b00Lo = (uint32_t)b00; + const uint32_t b00Hi = (uint32_t)(b00 >> 32); + + const uint64_t mid1 = b10 + b00Hi; + const uint32_t mid1Lo = (uint32_t)(mid1); + const uint32_t mid1Hi = (uint32_t)(mid1 >> 32); + + const uint64_t mid2 = b01 + mid1Lo; + const uint32_t mid2Lo = (uint32_t)(mid2); + const uint32_t mid2Hi = (uint32_t)(mid2 >> 32); + + const uint64_t pHi = b11 + mid1Hi + mid2Hi; + const uint64_t pLo = ((uint64_t)mid2Lo << 32) | b00Lo; + + *productHi = pHi; + return pLo; + } + + __device__ inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) { + // We don't need to handle the case dist >= 64 here (see above). 
+ assert(dist < 64); + assert(dist > 0); + return (hi << (64 - dist)) | (lo >> dist); + } + + __device__ inline uint64_t div5(const uint64_t x) { + return x / 5; + } + + __device__ inline uint64_t div10(const uint64_t x) { + return x / 10; + } + + __device__ inline uint64_t div100(const uint64_t x) { + return x / 100; + } + + __device__ inline uint64_t div1e8(const uint64_t x) { + return x / 100000000; + } + + __device__ inline uint64_t div1e9(const uint64_t x) { + return x / 1000000000; + } + + __device__ inline uint32_t mod1e9(const uint64_t x) { + return (uint32_t) (x - 1000000000 * div1e9(x)); + } + + __device__ inline uint32_t pow5Factor(uint64_t value) { + const uint64_t m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64) + const uint64_t n_div_5 = 3689348814741910323u; // #{ n | n = 0 (mod 2^64) } = 2^64 / 5 + uint32_t count = 0; + for (;;) { + assert(value != 0); + value *= m_inv_5; + if (value > n_div_5) + break; + ++count; + } + return count; + } + + // Returns true if value is divisible by 5^p. + __device__ inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) { + // I tried a case distinction on p, but there was no performance difference. + return pow5Factor(value) >= p; + } + + // Returns true if value is divisible by 2^p. + __device__ inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) { + assert(value != 0); + assert(p < 64); + // __builtin_ctzll doesn't appear to be faster here. 
+ return (value & ((1ull << p) - 1)) == 0; + } + + __device__ inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) { + // m is maximum 55 bits + uint64_t high1; // 128 + const uint64_t low1 = umul128(m, mul[1], &high1); // 64 + uint64_t high0; // 64 + umul128(m, mul[0], &high0); // 0 + const uint64_t sum = high0 + low1; + if (sum < high0) { + ++high1; // overflow into high1 + } + return shiftright128(sum, high1, j - 64); + } + + __device__ inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j, + uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) { + *vp = mulShift64(4 * m + 2, mul, j); + *vm = mulShift64(4 * m - 1 - mmShift, mul, j); + return mulShift64(4 * m, mul, j); + } + + // Computes 5^i in the form required by Ryu, and stores it in the given pointer. + __device__ inline void double_computePow5(const uint32_t i, uint64_t* const result) { + const uint32_t base = i / POW5_TABLE_SIZE; + const uint32_t base2 = base * POW5_TABLE_SIZE; + const uint32_t offset = i - base2; + const uint64_t* const mul = DOUBLE_POW5_SPLIT2[base]; + if (offset == 0) { + result[0] = mul[0]; + result[1] = mul[1]; + return; + } + const uint64_t m = DOUBLE_POW5_TABLE[offset]; + uint64_t high1; + const uint64_t low1 = umul128(m, mul[1], &high1); + uint64_t high0; + const uint64_t low0 = umul128(m, mul[0], &high0); + const uint64_t sum = high0 + low1; + if (sum < high0) { + ++high1; // overflow into high1 + } + // high1 | sum | low0 + const uint32_t delta = pow5bits(i) - pow5bits(base2); + result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); + result[1] = shiftright128(sum, high1, delta); + } + + // Computes 5^-i in the form required by Ryu, and stores it in the given pointer. 
+ __device__ inline void double_computeInvPow5(const uint32_t i, uint64_t* const result) { + const uint32_t base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE; + const uint32_t base2 = base * POW5_TABLE_SIZE; + const uint32_t offset = base2 - i; + const uint64_t* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2 + if (offset == 0) { + result[0] = mul[0]; + result[1] = mul[1]; + return; + } + const uint64_t m = DOUBLE_POW5_TABLE[offset]; + uint64_t high1; + const uint64_t low1 = umul128(m, mul[1], &high1); + uint64_t high0; + const uint64_t low0 = umul128(m, mul[0] - 1, &high0); + const uint64_t sum = high0 + low1; + if (sum < high0) { + ++high1; // overflow into high1 + } + // high1 | sum | low0 + const uint32_t delta = pow5bits(base2) - pow5bits(i); + result[0] = shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); + result[1] = shiftright128(sum, high1, delta); + } + + __device__ inline uint32_t mulPow5InvDivPow2(const uint32_t m, const uint32_t q, const int32_t j) { + // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double lookup + // table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely on the + // fact that the added 1 that's already stored in the table never overflows into the upper 64 bits. + uint64_t pow5[2]; + double_computeInvPow5(q, pow5); + return mulShift32(m, pow5[1] + 1, j); + } + + __device__ inline uint32_t mulPow5divPow2(const uint32_t m, const uint32_t i, const int32_t j) { + uint64_t pow5[2]; + double_computePow5(i, pow5); + return mulShift32(m, pow5[1], j); + } + + __device__ inline uint32_t decimalLength17(const uint64_t v) { + // This is slightly faster than a loop. + // The average output length is 16.38 digits, so we check high-to-low. + // Function precondition: v is not an 18, 19, or 20-digit number. + // (17 digits are sufficient for round-tripping.) 
+ assert(v < 100000000000000000L); + if (v >= 10000000000000000L) { return 17; } + if (v >= 1000000000000000L) { return 16; } + if (v >= 100000000000000L) { return 15; } + if (v >= 10000000000000L) { return 14; } + if (v >= 1000000000000L) { return 13; } + if (v >= 100000000000L) { return 12; } + if (v >= 10000000000L) { return 11; } + if (v >= 1000000000L) { return 10; } + if (v >= 100000000L) { return 9; } + if (v >= 10000000L) { return 8; } + if (v >= 1000000L) { return 7; } + if (v >= 100000L) { return 6; } + if (v >= 10000L) { return 5; } + if (v >= 1000L) { return 4; } + if (v >= 100L) { return 3; } + if (v >= 10L) { return 2; } + return 1; + } + + __device__ inline floating_decimal_64 d2d(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) { + int32_t e2; + uint64_t m2; + if (ieeeExponent == 0) { + // We subtract 2 so that the bounds computation has 2 additional bits. + e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + m2 = ieeeMantissa; + } else { + e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; + } + const bool even = (m2 & 1) == 0; + const bool acceptBounds = even; + + // Step 2: Determine the interval of valid decimal representations. + const uint64_t mv = 4 * m2; + // Implicit bool -> int conversion. True is 1, false is 0. + const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; + // We would compute mp and mm like this: + // uint64_t mp = 4 * m2 + 2; + // uint64_t mm = mv - 1 - mmShift; + + // Step 3: Convert to a decimal power base using 128-bit arithmetic. + uint64_t vr, vp, vm; + int32_t e10; + bool vmIsTrailingZeros = false; + bool vrIsTrailingZeros = false; + if (e2 >= 0) { + // I tried special-casing q == 0, but there was no effect on performance. + // This expression is slightly faster than max(0, log10Pow2(e2) - 1). 
+ const uint32_t q = log10Pow2(e2) - (e2 > 3); + e10 = (int32_t) q; + const int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; + const int32_t i = -e2 + (int32_t) q + k; + uint64_t pow5[2]; + double_computeInvPow5(q, pow5); + vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift); + + if (q <= 21) { + // This should use q <= 22, but I think 21 is also safe. Smaller values + // may still be safe, but it's more difficult to reason about them. + // Only one of mp, mv, and mm can be a multiple of 5, if any. + const uint32_t mvMod5 = ((uint32_t) mv) - 5 * ((uint32_t) div5(mv)); + if (mvMod5 == 0) { + vrIsTrailingZeros = multipleOfPowerOf5(mv, q); + } else if (acceptBounds) { + // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q + // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q + // <=> true && pow5Factor(mm) >= q, since e2 >= q. + vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q); + } else { + // Same as min(e2 + 1, pow5Factor(mp)) >= q. + vp -= multipleOfPowerOf5(mv + 2, q); + } + } + } else { + // This expression is slightly faster than max(0, log10Pow5(-e2) - 1). + const uint32_t q = log10Pow5(-e2) - (-e2 > 1); + e10 = (int32_t) q + e2; + const int32_t i = -e2 - (int32_t) q; + const int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT; + const int32_t j = (int32_t) q - k; + + uint64_t pow5[2]; + double_computePow5(i, pow5); + vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift); + + if (q <= 1) { + // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. + // mv = 4 * m2, so it always has at least two trailing 0 bits. + vrIsTrailingZeros = true; + if (acceptBounds) { + // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. + vmIsTrailingZeros = mmShift == 1; + } else { + // mp = mv + 2, so it always has at least one trailing 0 bit. + --vp; + } + } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here. + // We want to know if the full product has at least q trailing zeros. 
+ // We need to compute min(p2(mv), p5(mv) - e2) >= q + // <=> p2(mv) >= q && p5(mv) - e2 >= q + // <=> p2(mv) >= q (because -e2 >= q) + vrIsTrailingZeros = multipleOfPowerOf2(mv, q); + } + } + + // Step 4: Find the shortest decimal representation in the interval of valid representations. + int32_t removed = 0; + uint8_t lastRemovedDigit = 0; + uint64_t output; + // On average, we remove ~2 digits. + if (vmIsTrailingZeros || vrIsTrailingZeros) { + // General case, which happens rarely (~0.7%). + for (;;) { + const uint64_t vpDiv10 = div10(vp); + const uint64_t vmDiv10 = div10(vm); + if (vpDiv10 <= vmDiv10) { + break; + } + const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10); + const uint64_t vrDiv10 = div10(vr); + const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); + vmIsTrailingZeros &= vmMod10 == 0; + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) vrMod10; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + + if (vmIsTrailingZeros) { + for (;;) { + const uint64_t vmDiv10 = div10(vm); + const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10); + if (vmMod10 != 0) { + break; + } + const uint64_t vpDiv10 = div10(vp); + const uint64_t vrDiv10 = div10(vr); + const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) vrMod10; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + } + + if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { + // Round even if the exact number is .....50..0. + lastRemovedDigit = 4; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); + } else { + // Specialized for the common case (~99.3%). Percentages below are relative to this. 
+ bool roundUp = false; + const uint64_t vpDiv100 = div100(vp); + const uint64_t vmDiv100 = div100(vm); + if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%). + const uint64_t vrDiv100 = div100(vr); + const uint32_t vrMod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrDiv100); + roundUp = vrMod100 >= 50; + vr = vrDiv100; + vp = vpDiv100; + vm = vmDiv100; + removed += 2; + } + // Loop iterations below (approximately), without optimization above: + // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02% + // Loop iterations below (approximately), with optimization above: + // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02% + for (;;) { + const uint64_t vpDiv10 = div10(vp); + const uint64_t vmDiv10 = div10(vm); + if (vpDiv10 <= vmDiv10) { + break; + } + const uint64_t vrDiv10 = div10(vr); + const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); + roundUp = vrMod10 >= 5; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + (vr == vm || roundUp); + } + const int32_t exp = e10 + removed; + + floating_decimal_64 fd; + fd.exponent = exp; + fd.mantissa = output; + return fd; + } + + __device__ inline floating_decimal_32 f2d(const uint32_t ieeeMantissa, const uint32_t ieeeExponent) { + int32_t e2; + uint32_t m2; + if (ieeeExponent == 0) { + // We subtract 2 so that the bounds computation has 2 additional bits. + e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + m2 = ieeeMantissa; + } else { + e2 = (int32_t) ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa; + } + const bool even = (m2 & 1) == 0; + const bool acceptBounds = even; + + // Step 2: Determine the interval of valid decimal representations. + const uint32_t mv = 4 * m2; + const uint32_t mp = 4 * m2 + 2; + // Implicit bool -> int conversion. True is 1, false is 0. 
+ const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; + const uint32_t mm = 4 * m2 - 1 - mmShift; + + // Step 3: Convert to a decimal power base using 64-bit arithmetic. + uint32_t vr, vp, vm; + int32_t e10; + bool vmIsTrailingZeros = false; + bool vrIsTrailingZeros = false; + uint8_t lastRemovedDigit = 0; + if (e2 >= 0) { + const uint32_t q = log10Pow2(e2); + e10 = (int32_t) q; + const int32_t k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; + const int32_t i = -e2 + (int32_t) q + k; + vr = mulPow5InvDivPow2(mv, q, i); + vp = mulPow5InvDivPow2(mp, q, i); + vm = mulPow5InvDivPow2(mm, q, i); + if (q != 0 && (vp - 1) / 10 <= vm / 10) { + // We need to know one removed digit even if we are not going to loop below. We could use + // q = X - 1 above, except that would require 33 bits for the result, and we've found that + // 32-bit arithmetic is faster even on 64-bit machines. + const int32_t l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) (q - 1)) - 1; + lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t) q - 1 + l) % 10); + } + if (q <= 9) { + // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well. + // Only one of mp, mv, and mm can be a multiple of 5, if any. 
+ if (mv % 5 == 0) { + vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q); + } else if (acceptBounds) { + vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q); + } else { + vp -= multipleOfPowerOf5_32(mp, q); + } + } + } else { + const uint32_t q = log10Pow5(-e2); + e10 = (int32_t) q + e2; + const int32_t i = -e2 - (int32_t) q; + const int32_t k = pow5bits(i) - FLOAT_POW5_BITCOUNT; + int32_t j = (int32_t) q - k; + vr = mulPow5divPow2(mv, (uint32_t) i, j); + vp = mulPow5divPow2(mp, (uint32_t) i, j); + vm = mulPow5divPow2(mm, (uint32_t) i, j); + if (q != 0 && (vp - 1) / 10 <= vm / 10) { + j = (int32_t) q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); + lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, (uint32_t) (i + 1), j) % 10); + } + if (q <= 1) { + // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. + // mv = 4 * m2, so it always has at least two trailing 0 bits. + vrIsTrailingZeros = true; + if (acceptBounds) { + // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. + vmIsTrailingZeros = mmShift == 1; + } else { + // mp = mv + 2, so it always has at least one trailing 0 bit. + --vp; + } + } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here. + vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1); + } + } + + // Step 4: Find the shortest decimal representation in the interval of valid representations. + int32_t removed = 0; + uint32_t output; + if (vmIsTrailingZeros || vrIsTrailingZeros) { + // General case, which happens rarely (~4.0%). 
+ while (vp / 10 > vm / 10) { + vmIsTrailingZeros &= vm % 10 == 0; + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) (vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + if (vmIsTrailingZeros) { + while (vm % 10 == 0) { + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) (vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + } + if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { + // Round even if the exact number is .....50..0. + lastRemovedDigit = 4; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); + } else { + // Specialized for the common case (~96.0%). Percentages below are relative to this. + // Loop iterations below (approximately): + // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01% + while (vp / 10 > vm / 10) { + lastRemovedDigit = (uint8_t) (vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + (vr == vm || lastRemovedDigit >= 5); + } + const int32_t exp = e10 + removed; + + floating_decimal_32 fd; + fd.exponent = exp; + fd.mantissa = output; + return fd; + } + + __device__ inline int to_chars(const floating_decimal_64 v, const bool sign, char* const result) { + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { + result[index++] = '-'; + } + + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + // Values in the interval [1E-3, 1E7) are special. + if (scientificNotation) { + // Print in the format x.xxxxxE-yy. 
+ for (uint32_t i = 0; i < olength - 1; ++i) { + const uint32_t c = output % 10; output /= 10; + result[index + olength - i] = (char) ('0' + c); + } + result[index] = '0' + output % 10; + result[index + 1] = '.'; + index += olength + 1; + if (olength == 1) { + result[index++] = '0'; + } + // Print 'E', the exponent sign, and the exponent, which has at most three digits. + result[index++] = 'E'; + if (exp < 0) { + result[index++] = '-'; + exp = -exp; + } + if (exp >= 100) { + result[index++] = (char) ('0' + exp / 100); + exp %= 100; + result[index++] = (char) ('0' + exp / 10); + } else if (exp >= 10) { + result[index++] = (char) ('0' + exp / 10); + } + result[index++] = (char) ('0' + exp % 10); + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + result[index++] = '0'; + result[index++] = '.'; + for (int i = -1; i > exp; i--) { + result[index++] = '0'; + } + int current = index; + for (int i = 0; i < olength; i++) { + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + index++; + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + for (int i = 0; i < olength; i++) { + result[index + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength; + for (int i = olength; i < exp + 1; i++) { + result[index++] = '0'; + } + result[index++] = '.'; + result[index++] = '0'; + } else { + // Decimal dot is somewhere between the digits. 
+ int current = index + 1; + for (int i = 0; i < olength; i++) { + if (olength - i - 1 == exp) { + result[current + olength - i - 1] = '.'; + current--; + } + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength + 1; + } + } + return index; + } + + __device__ inline int d2s_size(const floating_decimal_64 v, const bool sign) { + int index = 0; + if (sign) { + index++; + } + + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + index += olength + 1; + if (olength == 1) { + index++; + } + // 'E' + index++; + if (exp < 0) { + exp = -exp; + index++; + } + if (exp >= 100) { + index += 3; + } else if (exp >= 10) { + index += 2; + } else { + index++; + } + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + index += 1 - exp + olength; + } else if (exp + 1 >= olength) { + index += exp + 3; + } else { + index += olength + 1; + } + } + return index; + } + + __device__ inline int to_chars(const floating_decimal_32 v, const bool sign, char* const result) { + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { + result[index++] = '-'; + } + + uint32_t output = v.mantissa; + const uint32_t olength = decimalLength9(output); + int32_t exp = v.exponent + olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + // Print in the format x.xxxxxE-yy. + for (int i = 0; i < olength - 1; i++) { + int c = output % 10; output /= 10; + result[index + olength - i] = (char) ('0' + c); + } + result[index] = (char) ('0' + output % 10); + result[index + 1] = '.'; + index += olength + 1; + if (olength == 1) { + result[index++] = '0'; + } + + // Print 'E', the exponent sign, and the exponent, which has at most two digits. 
+ result[index++] = 'E'; + if (exp < 0) { + result[index++] = '-'; + exp = -exp; + } + if (exp >= 10) { + result[index++] = (char) ('0' + exp / 10); + } + result[index++] = (char) ('0' + exp % 10); + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + result[index++] = '0'; + result[index++] = '.'; + for (int i = -1; i > exp; i--) { + result[index++] = '0'; + } + int current = index; + for (int i = 0; i < olength; i++) { + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + index++; + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + for (int i = 0; i < olength; i++) { + result[index + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength; + for (int i = olength; i < exp + 1; i++) { + result[index++] = '0'; + } + result[index++] = '.'; + result[index++] = '0'; + } else { + // Decimal dot is somewhere between the digits. + int current = index + 1; + for (int i = 0; i < olength; i++) { + if (olength - i - 1 == exp) { + result[current + olength - i - 1] = '.'; + current--; + } + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength + 1; + } + } + return index; + } + + __device__ inline int f2s_size(const floating_decimal_32 v, const bool sign) { + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { + index++; + } + + uint32_t output = v.mantissa; + const uint32_t olength = decimalLength9(output); + int32_t exp = v.exponent + olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + index += olength + 1; + if (olength == 1) { + index++; + } + // 'E' + index++; + if (exp < 0) { + index++; + exp = -exp; + } + if (exp >= 10) { + index++; + } + index++; + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). 
+ if (exp < 0) { + // Decimal dot is before any of the digits. + index += 1 - exp + olength; + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + index += exp + 3; + } else { + // Decimal dot is somewhere between the digits. + index += olength + 1; + } + } + return index; + } + + __device__ inline bool d2d_small_int(const uint64_t ieeeMantissa, const uint32_t ieeeExponent, + floating_decimal_64* const v) { + const uint64_t m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; + const int32_t e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; + + if (e2 > 0) { + // f = m2 * 2^e2 >= 2^53 is an integer. + // Ignore this case for now. + return false; + } + + if (e2 < -52) { + // f < 1. + return false; + } + + // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53. + // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0. + const uint64_t mask = (1ull << -e2) - 1; + const uint64_t fraction = m2 & mask; + if (fraction != 0) { + return false; + } + + // f is an integer in the range [1, 2^53). + // Note: mantissa might contain trailing (decimal) 0's. + // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17(). + v->mantissa = m2 >> -e2; + v->exponent = 0; + return true; + } + + __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) { + // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. + const uint64_t bits = double_to_bits(f); + + // Decode bits into sign, mantissa, and exponent. + ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; + const uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1); + const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); + // Case distinction; exit early for the easy cases. 
+ if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { + special = true; + return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent}; + } + special = false; + floating_decimal_64 v; + const bool isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v); + if (isSmallInt) { + // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros. + // For scientific notation we need to move these zeros into the exponent. + // (This is not needed for fixed-point notation, so it might be beneficial to trim + // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.) + for (;;) { + const uint64_t q = div10(v.mantissa); + const uint32_t r = ((uint32_t) v.mantissa) - 10 * ((uint32_t) q); + if (r != 0) { + break; + } + v.mantissa = q; + ++v.exponent; + } + } else { + v = d2d(ieeeMantissa, ieeeExponent); + } + return v; + } + + __device__ int d2s_buffered_n(double f, char* result) { + bool sign = false, special = false; + floating_decimal_64 v = d2d(f, sign, special); + if (special) { + return copy_special_str(result, sign, v.exponent, v.mantissa); + } + return to_chars(v, sign, result); + } + + __device__ int compute_d2s_size(double value) { + bool sign = false, special = false; + floating_decimal_64 v = d2d(value, sign, special); + if (special) { + return special_str_size(sign, v.exponent, v.mantissa); + } + return d2s_size(v, sign); + } + + __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) { + // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. + const uint32_t bits = float_to_bits(f); + + // Decode bits into sign, mantissa, and exponent. 
+ ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0; + const uint32_t ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1); + const uint32_t ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1); + + // Case distinction; exit early for the easy cases. + if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { + special = true; + return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent}; + } + special = false; + return f2d(ieeeMantissa, ieeeExponent); + } + + __device__ int f2s_buffered_n(float f, char* result) { + bool sign = false, special = false; + floating_decimal_32 v = f2d(f, sign, special); + if (special) { + return copy_special_str(result, sign, v.exponent, v.mantissa); + } + return to_chars(v, sign, result); + } + + __device__ int compute_f2s_size(float value) { + bool sign = false, special = false; + floating_decimal_32 v = f2d(value, sign, special); + if (special) { + return special_str_size(sign, v.exponent, v.mantissa); + } + return f2s_size(v, sign); + } + + __device__ int compute_ftos_size(double value, bool is_float) { + if (is_float) { + return compute_f2s_size(value); + } else { + return compute_d2s_size(value); + } + } + + __device__ int float_to_string(double value, char* output, bool is_float) { + if (is_float) { + return f2s_buffered_n(value, output); + } else { + return d2s_buffered_n(value, output); + } + } + +}; + +} +} +} \ No newline at end of file From da2197b826b86b3520150ab4dbddedc9f1f09417 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Thu, 16 Nov 2023 17:43:58 +0800 Subject: [PATCH 20/54] Add copyright and notice Signed-off-by: Haoyang Li --- NOTICE | 21 +++++++++++++++++++ src/main/cpp/src/CastStringJni.cpp | 2 +- src/main/cpp/src/cast_string.hpp | 2 +- .../nvidia/spark/rapids/jni/CastStrings.java | 2 +- 4 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 NOTICE diff --git a/NOTICE b/NOTICE new file mode 
100644 index 0000000000..53333b52c5 --- /dev/null +++ b/NOTICE @@ -0,0 +1,21 @@ +RAPIDS Accelerator JNI For Apache Spark +Copyright (c) 2022-2023, NVIDIA CORPORATION + +-------------------------------------------------------------------------------- + +This project includes code from ryu (https://github.com/ulfjack/ryu). + +ryu +Copyright (2018) Ulf Adams and contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index ff8ee2afd4..093b51188b 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp index fc2270ca8c..c4f850b47f 100644 --- a/src/main/cpp/src/cast_string.hpp +++ b/src/main/cpp/src/cast_string.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index 3002e1cdab..022cb93085 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 2c6cdcbfbf220dad145468ee003b6ca3ea6f4116 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 17 Nov 2023 10:31:26 +0800 Subject: [PATCH 21/54] Fix copyrights and license Signed-off-by: Haoyang Li --- src/main/cpp/src/ftos_converter.cu | 97 ++++++++++++++++++++---------- thirdparty/cudf | 2 +- 2 files changed, 67 insertions(+), 32 deletions(-) diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu index 2d5424319e..1ff5fe8543 100644 --- a/src/main/cpp/src/ftos_converter.cu +++ b/src/main/cpp/src/ftos_converter.cu @@ -1,7 +1,24 @@ -/* Not a contribution - * Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as - * NVIDIA-proprietary are not a contribution and subject to the following terms and conditions: - * +// Copyright 2018 Ulf Adams +// +// The contents of this file may be used under the terms of the Apache License, +// Version 2.0. +// +// (See accompanying file LICENSE-Apache or copy at +// http://www.apache.org/licenses/LICENSE-2.0) +// +// Alternatively, the contents of this file may be used under the terms of +// the Boost Software License, Version 1.0. +// (See accompanying file LICENSE-Boost or copy at +// https://www.boost.org/LICENSE_1_0.txt) +// +// Unless required by applicable law or agreed to in writing, this software +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
+ +// Not a contribution +// Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as +// NVIDIA-proprietary are not a contribution and subject to the following terms and conditions: +/* * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: LicenseRef-NvidiaProprietary * @@ -24,6 +41,7 @@ namespace spark_rapids_jni { namespace detail { namespace { +// d2s.c from ryu // A floating decimal representing m * 10^e. typedef struct floating_decimal_64 { uint64_t mantissa; @@ -32,6 +50,7 @@ typedef struct floating_decimal_64 { int32_t exponent; } floating_decimal_64; +// f2s.c from ryu // A floating decimal representing m * 10^e. typedef struct floating_decimal_32 { uint32_t mantissa; @@ -42,6 +61,8 @@ typedef struct floating_decimal_32 { struct ftos_converter { + //===== constants from ryu ===== + // These tables are generated by PrintDoubleLookupTable. static constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125; static constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125; @@ -54,24 +75,6 @@ struct ftos_converter { static constexpr unsigned int FLOAT_EXPONENT_BITS = 8; static constexpr unsigned int FLOAT_BIAS = 127; - - // Returns the number of decimal digits in v, which must not contain more than 9 digits. - __device__ inline uint32_t decimalLength9(const uint32_t v) { - // Function precondition: v is not a 10-digit number. - // (f2s: 9 digits are sufficient for round-tripping.) - // (d2fixed: We print 9-digit blocks.) 
- assert(v < 1000000000); - if (v >= 100000000) { return 9; } - if (v >= 10000000) { return 8; } - if (v >= 1000000) { return 7; } - if (v >= 100000) { return 6; } - if (v >= 10000) { return 5; } - if (v >= 1000) { return 4; } - if (v >= 100) { return 3; } - if (v >= 10) { return 2; } - return 1; - } - const uint64_t DOUBLE_POW5_INV_SPLIT2[15][2] = { { 1u, 2305843009213693952u }, { 5955668970331000884u, 1784059615882449851u }, @@ -130,6 +133,25 @@ struct ftos_converter { 298023223876953125ull //, 1490116119384765625ull }; + //===== common.h from ryu ===== + + // Returns the number of decimal digits in v, which must not contain more than 9 digits. + __device__ inline uint32_t decimalLength9(const uint32_t v) { + // Function precondition: v is not a 10-digit number. + // (f2s: 9 digits are sufficient for round-tripping.) + // (d2fixed: We print 9-digit blocks.) + assert(v < 1000000000); + if (v >= 100000000) { return 9; } + if (v >= 10000000) { return 8; } + if (v >= 1000000) { return 7; } + if (v >= 100000) { return 6; } + if (v >= 10000) { return 5; } + if (v >= 1000) { return 4; } + if (v >= 100) { return 3; } + if (v >= 10) { return 2; } + return 1; + } + // Returns e == 0 ? 1 : [log_2(5^e)]; requires 0 <= e <= 3528. __device__ inline int32_t log2pow5(const int32_t e) { // This approximation works up to the point that the multiplication overflows at e = 3529. @@ -254,6 +276,8 @@ struct ftos_converter { return bits; } + //===== d2s_intrinsics.h from ryu ===== + __device__ inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) { // The casts here help MSVC to avoid calls to the __allmul library function. const uint32_t aLo = (uint32_t)a; @@ -363,6 +387,8 @@ struct ftos_converter { return mulShift64(4 * m, mul, j); } + //===== d2s_small_table.h from ryu ===== + // Computes 5^i in the form required by Ryu, and stores it in the given pointer. 
__device__ inline void double_computePow5(const uint32_t i, uint64_t* const result) { const uint32_t base = i / POW5_TABLE_SIZE; @@ -415,6 +441,8 @@ struct ftos_converter { result[1] = shiftright128(sum, high1, delta); } + //===== f2s_intrinsics.h from ryu ===== + __device__ inline uint32_t mulPow5InvDivPow2(const uint32_t m, const uint32_t q, const int32_t j) { // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double lookup // table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely on the @@ -430,6 +458,8 @@ struct ftos_converter { return mulShift32(m, pow5[1], j); } + //===== d2s.c and f2s.c from ryu ===== + __device__ inline uint32_t decimalLength17(const uint64_t v) { // This is slightly faster than a loop. // The average output length is 16.38 digits, so we check high-to-low. @@ -1094,15 +1124,6 @@ struct ftos_converter { return to_chars(v, sign, result); } - __device__ int compute_d2s_size(double value) { - bool sign = false, special = false; - floating_decimal_64 v = d2d(value, sign, special); - if (special) { - return special_str_size(sign, v.exponent, v.mantissa); - } - return d2s_size(v, sign); - } - __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) { // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. 
const uint32_t bits = float_to_bits(f); @@ -1130,6 +1151,18 @@ struct ftos_converter { return to_chars(v, sign, result); } + + //===== compute float to string size ===== + + __device__ int compute_d2s_size(double value) { + bool sign = false, special = false; + floating_decimal_64 v = d2d(value, sign, special); + if (special) { + return special_str_size(sign, v.exponent, v.mantissa); + } + return d2s_size(v, sign); + } + __device__ int compute_f2s_size(float value) { bool sign = false, special = false; floating_decimal_32 v = f2d(value, sign, special); @@ -1139,6 +1172,8 @@ struct ftos_converter { return f2s_size(v, sign); } + //===== APIs ===== + __device__ int compute_ftos_size(double value, bool is_float) { if (is_float) { return compute_f2s_size(value); diff --git a/thirdparty/cudf b/thirdparty/cudf index 4313cfa9b3..87d2a36f04 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc +Subproject commit 87d2a36f04f431a8c5236d2aee723ec79b9dc5f9 From 32287554fd016cf77ce737f9d61a7e89724250e4 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 17 Nov 2023 12:02:13 +0800 Subject: [PATCH 22/54] cudf conflict resolve Signed-off-by: Haoyang Li --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 87d2a36f04..4313cfa9b3 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 87d2a36f04f431a8c5236d2aee723ec79b9dc5f9 +Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc From d7be0d7cc1f89e489ac6a4a74a4f6c8650500e48 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 17 Nov 2023 18:08:09 +0800 Subject: [PATCH 23/54] Add format_float kernel Signed-off-by: Haoyang Li --- src/main/cpp/src/format_float.cu | 2 +- src/main/cpp/src/ftos_converter.cu | 381 +++++++++++++++++++++++++++- src/main/cpp/tests/format_float.cpp | 39 ++- 3 files changed, 409 insertions(+), 13 deletions(-) diff --git 
a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu index 2590991629..cdac863553 100644 --- a/src/main/cpp/src/format_float.cu +++ b/src/main/cpp/src/format_float.cu @@ -51,7 +51,7 @@ struct format_float_fn { { ftos_converter fts; bool is_float = std::is_same_v; - return static_cast(fts.compute_ftos_size(static_cast(value), digits, is_float)); + return static_cast(fts.compute_format_float_size(static_cast(value), digits, is_float)); } __device__ void format_float(size_type idx, int digits) diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu index 1ff5fe8543..69197fdf64 100644 --- a/src/main/cpp/src/ftos_converter.cu +++ b/src/main/cpp/src/ftos_converter.cu @@ -238,7 +238,7 @@ struct ftos_converter { } - __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa) { + __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa, const int d=1) { if (mantissa) { memcpy(result, "NaN", 3); return 3; @@ -250,18 +250,21 @@ struct ftos_converter { memcpy(result + sign, "Infinity", 8); return sign + 8; } - memcpy(result + sign, "0.0", 3); - return sign + 3; + memcpy(result + sign, "0.", 2); + for (int i = 0; i < d; i++) { + result[sign + 2 + i] = '0'; + } + return sign + 2 + d; } - __device__ inline int special_str_size(const bool sign, const bool exponent, const bool mantissa) { + __device__ inline int special_str_size(const bool sign, const bool exponent, const bool mantissa, const int d=1) { if (mantissa) { return 3; } if (exponent) { return sign + 8; } - return sign + 3; + return sign + 2 + d; } __device__ inline uint32_t float_to_bits(const float f) { @@ -1189,6 +1192,374 @@ struct ftos_converter { return d2s_buffered_n(value, output); } } + + //===== format float ===== + + const uint64_t POW10_TABLE[19] = { + 1ull, 10ull, 100ull, 1000ull, 10000ull, 100000ull, 1000000ull, 10000000ull, + 100000000ull, 
1000000000ull, 10000000000ull, 100000000000ull, 1000000000000ull, + 10000000000000ull, 100000000000000ull, 1000000000000000ull, 10000000000000000ull, + 100000000000000000ull + }; + + template + __device__ inline T round_half_even(const T input, const int olength, const int d) { + if (d > olength) { + T num = input; + for (int i = 0; i < d - olength; i++) { + num *= 10; + } + return num; + } + T div = POW10_TABLE[olength - d]; + T mod = input % div; + T num = input / div; + if (mod > (div / 2) || ((mod == (div / 2) && (num % 2 == 1) && mod != 0))) { + num++; + } + return num; + } + + __device__ inline int to_formated_chars(const floating_decimal_64 v, const bool sign, char* const result, int d=10) { + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { + result[index++] = '-'; + } + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + if (exp < 0) { + // Decimal dot is before any of the digits. + int index_for_carrier = index; + result[index++] = '0'; + result[index++] = '.'; + int actural_round = d; + for (int i = -1; i > exp; i--) { + index_for_carrier = index; + result[index++] = '0'; + actural_round--; + if (actural_round == 0) { + break; + } + } + int actural_olength = fmin(int(olength), actural_round); + uint64_t rounded_output = round_half_even(output, olength, actural_round); + // check if carry + if (rounded_output >= POW10_TABLE[actural_olength]) { + result[index_for_carrier] = '1'; + rounded_output -= POW10_TABLE[actural_olength]; + } + int current = index; + for (int i = 0; i < actural_olength; i++) { + result[current + actural_olength - i - 1] = (char) ('0' + rounded_output % 10); + rounded_output /= 10; + index++; + } + actural_round -= actural_olength; + if (actural_round > 0) { + for (int i = 0; i < actural_round; i++) { + result[index++] = '0'; + } + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. 
+ int integer_len = index + exp + 1 + (exp - index) / 3; + int sep_cnt = 0; + int rev_index = 0; + for (int i = olength; i < exp + 1; i++) { + result[integer_len - (rev_index++) - 1] = '0'; + sep_cnt++; + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + } + for (int i = 0; i < olength; i++) { + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[integer_len - (rev_index++) - 1] = (char) ('0' + output % 10); + sep_cnt++; + output /= 10; + } + index = integer_len; + result[index++] = '.'; + for (int i = 0; i < d; i++) { + result[index++] = '0'; + } + } else { + uint32_t temp_d = d, tailing_zero = 0; + if (exp + d > olength) { + temp_d = olength - exp; + tailing_zero = d - temp_d; + } + uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1); + uint64_t pow10 = POW10_TABLE[temp_d]; + uint64_t integer = rounded_output / pow10; + uint64_t decimal = rounded_output % pow10; + uint32_t integer_len = decimalLength17(integer); + uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; + uint32_t sep_cnt = 0; + int rev_index = 0; + for (int i = 0; i < integer_len; i++) { + if (sep_cnt == 3) { + result[formated_integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[formated_integer_len - (rev_index++) - 1] = (char) ('0' + integer % 10); + sep_cnt++; + integer /= 10; + } + index = formated_integer_len; + result[index++] = '.'; + int current = index; + for (int i = 0; i < tailing_zero; i++) { + result[current + d - i - 1] = '0'; + index++; + } + for (int i = tailing_zero; i < d; i++) { + result[current + d - i - 1] = (char) ('0' + decimal % 10); + decimal /= 10; + index++; + } + } + return index; + } + + __device__ inline int format_float_size(const floating_decimal_64 v, const bool sign, int d=10) { + // Step 5: Print the decimal representation. 
+ int index = 0; + if (sign) { + index++; + } + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + if (exp < 0) { + // Decimal dot is before any of the digits. + int index_for_carrier = index; + index+=2; + int actural_round = d; + index += exp + 1; + int actural_olength = fmin(int(olength), actural_round); + index += actural_olength; + actural_round -= actural_olength; + if (actural_round > 0) { + index += actural_round; + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + int integer_len = index + exp + 1 + (exp - index) / 3; + index = integer_len; + index++; + index += d; + } else { + uint32_t temp_d = d, tailing_zero = 0; + if (exp + d > olength) { + temp_d = olength - exp; + tailing_zero = d - temp_d; + } + uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1); + uint64_t pow10 = POW10_TABLE[temp_d]; + uint64_t integer = rounded_output / pow10; + uint32_t integer_len = decimalLength17(integer); + uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; + index = formated_integer_len; + index++; + index += d; + } + return index; + } + + __device__ inline int to_formated_chars(const floating_decimal_32 v, const bool sign, char* const result, int d=10) { + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { + result[index++] = '-'; + } + uint32_t output = v.mantissa; + const uint32_t olength = decimalLength9(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + if (exp < 0) { + // Decimal dot is before any of the digits. 
+ int index_for_carrier = index; + result[index++] = '0'; + result[index++] = '.'; + int actural_round = d; + for (int i = -1; i > exp; i--) { + index_for_carrier = index; + result[index++] = '0'; + actural_round--; + if (actural_round == 0) { + break; + } + } + int actural_olength = fmin(int(olength), actural_round); + uint64_t rounded_output = round_half_even(output, olength, actural_round); + // check if carry + if (rounded_output >= POW10_TABLE[actural_olength]) { + result[index_for_carrier] = '1'; + rounded_output -= POW10_TABLE[actural_olength]; + } + int current = index; + for (int i = 0; i < actural_olength; i++) { + result[current + actural_olength - i - 1] = (char) ('0' + rounded_output % 10); + rounded_output /= 10; + index++; + } + actural_round -= actural_olength; + if (actural_round > 0) { + for (int i = 0; i < actural_round; i++) { + result[index++] = '0'; + } + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + int integer_len = index + exp + 1 + (exp - index) / 3; + int sep_cnt = 0; + int rev_index = 0; + for (int i = olength; i < exp + 1; i++) { + result[integer_len - (rev_index++) - 1] = '0'; + sep_cnt++; + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + } + for (int i = 0; i < olength; i++) { + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[integer_len - (rev_index++) - 1] = (char) ('0' + output % 10); + sep_cnt++; + output /= 10; + } + index = integer_len; + result[index++] = '.'; + for (int i = 0; i < d; i++) { + result[index++] = '0'; + } + } else { + uint32_t temp_d = d, tailing_zero = 0; + if (exp + d > olength) { + temp_d = olength - exp; + tailing_zero = d - temp_d; + } + uint32_t rounded_output = round_half_even(output, olength, exp+temp_d+1); + uint32_t pow10 = POW10_TABLE[temp_d]; + uint32_t integer = rounded_output / pow10; + uint32_t decimal = rounded_output % pow10; + uint32_t integer_len = 
decimalLength9(integer); + uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; + uint32_t sep_cnt = 0; + int rev_index = 0; + for (int i = 0; i < integer_len; i++) { + if (sep_cnt == 3) { + result[formated_integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[formated_integer_len - (rev_index++) - 1] = (char) ('0' + integer % 10); + sep_cnt++; + integer /= 10; + } + index = formated_integer_len; + result[index++] = '.'; + int current = index; + for (int i = 0; i < tailing_zero; i++) { + result[current + d - i - 1] = '0'; + index++; + } + for (int i = tailing_zero; i < d; i++) { + result[current + d - i - 1] = (char) ('0' + decimal % 10); + decimal /= 10; + index++; + } + } + return index; + } + + __device__ inline int format_float_size(const floating_decimal_32 v, const bool sign, int d=10) { + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { + index++; + } + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength9(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + if (exp < 0) { + // Decimal dot is before any of the digits. + int index_for_carrier = index; + index+=2; + int actural_round = d; + index += exp + 1; + int actural_olength = fmin(int(olength), actural_round); + index += actural_olength; + actural_round -= actural_olength; + if (actural_round > 0) { + index += actural_round; + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. 
+ int integer_len = index + exp + 1 + (exp - index) / 3; + index = integer_len; + index++; + index += d; + } else { + uint32_t temp_d = d, tailing_zero = 0; + if (exp + d > olength) { + temp_d = olength - exp; + tailing_zero = d - temp_d; + } + uint32_t rounded_output = round_half_even(output, olength, exp+temp_d+1); + uint32_t pow10 = POW10_TABLE[temp_d]; + uint32_t integer = rounded_output / pow10; + uint32_t integer_len = decimalLength9(integer); + uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; + index = formated_integer_len; + index++; + index += d; + } + return index; + } + + __device__ int compute_format_float_size(double value, int d, bool is_float) { + bool sign = false, special = false; + if (is_float) { + floating_decimal_32 v = f2d(value, sign, special); + if (special) { + return special_str_size(sign, v.exponent, v.mantissa, d); + } + return format_float_size(v, sign, d); + } else { + floating_decimal_64 v = d2d(value, sign, special); + if (special) { + return special_str_size(sign, v.exponent, v.mantissa, d); + } + return format_float_size(v, sign, d); + } + } + + __device__ int format_float(double value, int d, char* output, bool is_float) { + bool sign = false, special = false; + if (is_float) { + floating_decimal_32 v = f2d(value, sign, special); + if (special) { + return copy_special_str(output, sign, v.exponent, v.mantissa, d); + } + return to_formated_chars(v, sign, output, d); + } else { + floating_decimal_64 v = d2d(value, sign, special); + if (special) { + return copy_special_str(output, sign, v.exponent, v.mantissa, d); + } + return to_formated_chars(v, sign, output, d); + } + } + }; } diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp index 3e03578f4c..459d2e0b7f 100644 --- a/src/main/cpp/tests/format_float.cpp +++ b/src/main/cpp/tests/format_float.cpp @@ -43,9 +43,10 @@ TEST_F(FormatFloatTests, FormatFloats32) -4, std::numeric_limits::quiet_NaN(), 123456789012.34, - -0.0}; + 
-0.0 + }; std::vector h_expected{ - "100.0", "654,321.25", "-12,761.125", "0.0", "5.0", "-4.0", "NaN", "8.3954222323279E11", "-0.0"}; + "100.00000", "654,321.25000", "-12,761.12500", "0.00000", "5.00000", "-4.00000", "NaN", "123,456,790,000.00000", "-0.00000"}; cudf::test::fixed_width_column_wrapper floats( h_floats.begin(), @@ -64,20 +65,44 @@ TEST_F(FormatFloatTests, FormatFloats32) TEST_F(FormatFloatTests, FormatFloats64) { - std::vector h_floats{100, + // std::vector h_floats{100, + // 654321.25, + // -12761.125, + // 1.123456789123456789, + // 0.000000000000000000123456789123456789, + // 0, + // 5, + // -4, + // std::numeric_limits::quiet_NaN(), + // 839542223232.794248339, + // -0.0}; + // std::vector h_expected{ + // "100.0", "654,321.25", "-12,761.125", "1.1234567891234568", "1.234567891234568E-19", + // "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"}; + + std::vector h_floats{100, 654321.25, -12761.125, 1.123456789123456789, - 0.000000000000000000123456789123456789, 0, 5, -4, std::numeric_limits::quiet_NaN(), 839542223232.794248339, - -0.0}; + -0.0 + }; std::vector h_expected{ - "100.0", "654,321.25", "-12,761.125", "1.1234567891234568", "1.234567891234568E-19", - "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"}; + "100.00000", + "654,321.25000", + "-12,761.12500", + "1.12346", + "0.00000", + "5.00000", + "-4.00000", + "NaN", + "839,542,223,232.79420", + "-0.00000" + }; cudf::test::fixed_width_column_wrapper floats( h_floats.begin(), From 5397f120cfa4314fa66ec5aaffd3e2b012540108 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 17 Nov 2023 18:19:05 +0800 Subject: [PATCH 24/54] clean up Signed-off-by: Haoyang Li --- src/main/cpp/tests/format_float.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp index 459d2e0b7f..4bf1e17c56 100644 --- a/src/main/cpp/tests/format_float.cpp +++ b/src/main/cpp/tests/format_float.cpp @@ -65,21 +65,6 @@ 
TEST_F(FormatFloatTests, FormatFloats32) TEST_F(FormatFloatTests, FormatFloats64) { - // std::vector h_floats{100, - // 654321.25, - // -12761.125, - // 1.123456789123456789, - // 0.000000000000000000123456789123456789, - // 0, - // 5, - // -4, - // std::numeric_limits::quiet_NaN(), - // 839542223232.794248339, - // -0.0}; - // std::vector h_expected{ - // "100.0", "654,321.25", "-12,761.125", "1.1234567891234568", "1.234567891234568E-19", - // "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"}; - std::vector h_floats{100, 654321.25, -12761.125, From 8aeeb6b00237d7bbd1e42d3add3e853105d6c46d Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 20 Nov 2023 11:48:39 +0800 Subject: [PATCH 25/54] Fixed two bugs Signed-off-by: Haoyang Li --- src/main/cpp/src/ftos_converter.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu index 69197fdf64..e11df5faf0 100644 --- a/src/main/cpp/src/ftos_converter.cu +++ b/src/main/cpp/src/ftos_converter.cu @@ -1264,7 +1264,7 @@ struct ftos_converter { } } else if (exp + 1 >= olength) { // Decimal dot is after any of the digits. - int integer_len = index + exp + 1 + (exp - index) / 3; + int integer_len = index + exp + 1 + exp / 3; int sep_cnt = 0; int rev_index = 0; for (int i = olength; i < exp + 1; i++) { @@ -1343,6 +1343,7 @@ struct ftos_converter { index+=2; int actural_round = d; index += exp + 1; + actural_round -= exp + 1; int actural_olength = fmin(int(olength), actural_round); index += actural_olength; actural_round -= actural_olength; @@ -1351,7 +1352,7 @@ struct ftos_converter { } } else if (exp + 1 >= olength) { // Decimal dot is after any of the digits. - int integer_len = index + exp + 1 + (exp - index) / 3; + int integer_len = index + exp + 1 + exp / 3; index = integer_len; index++; index += d; @@ -1417,7 +1418,7 @@ struct ftos_converter { } } else if (exp + 1 >= olength) { // Decimal dot is after any of the digits. 
- int integer_len = index + exp + 1 + (exp - index) / 3; + int integer_len = index + exp + 1 + exp / 3; int sep_cnt = 0; int rev_index = 0; for (int i = olength; i < exp + 1; i++) { @@ -1496,6 +1497,7 @@ struct ftos_converter { index+=2; int actural_round = d; index += exp + 1; + actural_round -= exp + 1; int actural_olength = fmin(int(olength), actural_round); index += actural_olength; actural_round -= actural_olength; @@ -1504,7 +1506,7 @@ struct ftos_converter { } } else if (exp + 1 >= olength) { // Decimal dot is after any of the digits. - int integer_len = index + exp + 1 + (exp - index) / 3; + int integer_len = index + exp + 1 + exp / 3; index = integer_len; index++; index += d; From a6578c775af069cd76805ad46df72d7ad235251a Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 20 Nov 2023 12:21:57 +0800 Subject: [PATCH 26/54] Added a failed case back Signed-off-by: Haoyang Li --- src/main/cpp/tests/format_float.cpp | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp index 4bf1e17c56..b630039263 100644 --- a/src/main/cpp/tests/format_float.cpp +++ b/src/main/cpp/tests/format_float.cpp @@ -69,6 +69,7 @@ TEST_F(FormatFloatTests, FormatFloats64) 654321.25, -12761.125, 1.123456789123456789, + 0.000000000000000000123456789123456789, 0, 5, -4, @@ -77,17 +78,8 @@ TEST_F(FormatFloatTests, FormatFloats64) -0.0 }; std::vector h_expected{ - "100.00000", - "654,321.25000", - "-12,761.12500", - "1.12346", - "0.00000", - "5.00000", - "-4.00000", - "NaN", - "839,542,223,232.79420", - "-0.00000" - }; + "100.00000", "654,321.25000", "-12,761.12500", "1.12346", "0.00000", "0.00000", "5.00000", + "-4.00000", "NaN", "839,542,223,232.79420", "-0.00000"}; cudf::test::fixed_width_column_wrapper floats( h_floats.begin(), From 9b7fb4a531dcb29fe77f5698284b1a6a69f95964 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 20 Nov 2023 18:02:04 +0800 Subject: [PATCH 27/54] Refactor 
Signed-off-by: Haoyang Li --- src/main/cpp/src/ftos_converter.cu | 75 +++++++----------------------- 1 file changed, 18 insertions(+), 57 deletions(-) diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu index e11df5faf0..e516ba8910 100644 --- a/src/main/cpp/src/ftos_converter.cu +++ b/src/main/cpp/src/ftos_converter.cu @@ -1204,6 +1204,7 @@ struct ftos_converter { template __device__ inline T round_half_even(const T input, const int olength, const int d) { + // "round" a integer to d digits, with the half-even rounding mode. if (d > olength) { T num = input; for (int i = 0; i < d - olength; i++) { @@ -1220,8 +1221,7 @@ struct ftos_converter { return num; } - __device__ inline int to_formated_chars(const floating_decimal_64 v, const bool sign, char* const result, int d=10) { - // Step 5: Print the decimal representation. + __device__ inline int to_formated_chars(const floating_decimal_64 v, const bool sign, char* const result, int d) { int index = 0; if (sign) { result[index++] = '-'; @@ -1299,6 +1299,7 @@ struct ftos_converter { uint64_t pow10 = POW10_TABLE[temp_d]; uint64_t integer = rounded_output / pow10; uint64_t decimal = rounded_output % pow10; + // calculate integer length after format to cover carry case uint32_t integer_len = decimalLength17(integer); uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; uint32_t sep_cnt = 0; @@ -1328,8 +1329,7 @@ struct ftos_converter { return index; } - __device__ inline int format_float_size(const floating_decimal_64 v, const bool sign, int d=10) { - // Step 5: Print the decimal representation. + __device__ inline int format_float_size(const floating_decimal_64 v, const bool sign, int d) { int index = 0; if (sign) { index++; @@ -1338,44 +1338,24 @@ struct ftos_converter { const uint32_t olength = decimalLength17(output); int32_t exp = v.exponent + (int32_t) olength - 1; if (exp < 0) { - // Decimal dot is before any of the digits. 
- int index_for_carrier = index; - index+=2; - int actural_round = d; - index += exp + 1; - actural_round -= exp + 1; - int actural_olength = fmin(int(olength), actural_round); - index += actural_olength; - actural_round -= actural_olength; - if (actural_round > 0) { - index += actural_round; - } + index += 2 + d; } else if (exp + 1 >= olength) { - // Decimal dot is after any of the digits. - int integer_len = index + exp + 1 + exp / 3; - index = integer_len; - index++; - index += d; + index += exp + 1 + exp / 3 + 1 + d; } else { - uint32_t temp_d = d, tailing_zero = 0; + uint32_t temp_d = d; if (exp + d > olength) { temp_d = olength - exp; - tailing_zero = d - temp_d; } uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1); uint64_t pow10 = POW10_TABLE[temp_d]; uint64_t integer = rounded_output / pow10; uint32_t integer_len = decimalLength17(integer); - uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; - index = formated_integer_len; - index++; - index += d; + index += integer_len + (integer_len - 1) / 3 + 1 + d; } return index; } - __device__ inline int to_formated_chars(const floating_decimal_32 v, const bool sign, char* const result, int d=10) { - // Step 5: Print the decimal representation. 
+ __device__ inline int to_formated_chars(const floating_decimal_32 v, const bool sign, char* const result, int d) { int index = 0; if (sign) { result[index++] = '-'; @@ -1453,6 +1433,7 @@ struct ftos_converter { uint32_t pow10 = POW10_TABLE[temp_d]; uint32_t integer = rounded_output / pow10; uint32_t decimal = rounded_output % pow10; + // calculate integer length after format to cover carry case uint32_t integer_len = decimalLength9(integer); uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; uint32_t sep_cnt = 0; @@ -1482,8 +1463,7 @@ struct ftos_converter { return index; } - __device__ inline int format_float_size(const floating_decimal_32 v, const bool sign, int d=10) { - // Step 5: Print the decimal representation. + __device__ inline int format_float_size(const floating_decimal_32 v, const bool sign, int d) { int index = 0; if (sign) { index++; @@ -1492,38 +1472,19 @@ struct ftos_converter { const uint32_t olength = decimalLength9(output); int32_t exp = v.exponent + (int32_t) olength - 1; if (exp < 0) { - // Decimal dot is before any of the digits. - int index_for_carrier = index; - index+=2; - int actural_round = d; - index += exp + 1; - actural_round -= exp + 1; - int actural_olength = fmin(int(olength), actural_round); - index += actural_olength; - actural_round -= actural_olength; - if (actural_round > 0) { - index += actural_round; - } + index += 2 + d; } else if (exp + 1 >= olength) { - // Decimal dot is after any of the digits. 
- int integer_len = index + exp + 1 + exp / 3; - index = integer_len; - index++; - index += d; + index += exp + 1 + exp / 3 + 1 + d; } else { - uint32_t temp_d = d, tailing_zero = 0; + uint32_t temp_d = d; if (exp + d > olength) { temp_d = olength - exp; - tailing_zero = d - temp_d; } - uint32_t rounded_output = round_half_even(output, olength, exp+temp_d+1); - uint32_t pow10 = POW10_TABLE[temp_d]; - uint32_t integer = rounded_output / pow10; + uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1); + uint64_t pow10 = POW10_TABLE[temp_d]; + uint64_t integer = rounded_output / pow10; uint32_t integer_len = decimalLength9(integer); - uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; - index = formated_integer_len; - index++; - index += d; + index += integer_len + (integer_len - 1) / 3 + 1 + d; } return index; } From 41967d91b97398676653c19b0a7c9cb1492f7d88 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 21 Nov 2023 02:03:03 +0800 Subject: [PATCH 28/54] Handle d=0 case Signed-off-by: Haoyang Li --- src/main/cpp/src/ftos_converter.cu | 36 ++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu index e516ba8910..664feb706e 100644 --- a/src/main/cpp/src/ftos_converter.cu +++ b/src/main/cpp/src/ftos_converter.cu @@ -238,7 +238,7 @@ struct ftos_converter { } - __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa, const int d=1) { + __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa, const int d = 1) { if (mantissa) { memcpy(result, "NaN", 3); return 3; @@ -250,7 +250,12 @@ struct ftos_converter { memcpy(result + sign, "Infinity", 8); return sign + 8; } - memcpy(result + sign, "0.", 2); + result[sign] = '0'; + if (d == 0) { + return sign + 1; + } else { + result[sign + 1] = '.'; + } for 
(int i = 0; i < d; i++) { result[sign + 2 + i] = '0'; } @@ -264,6 +269,9 @@ struct ftos_converter { if (exponent) { return sign + 8; } + if (d == 0) { + return sign + 1; + } return sign + 2 + d; } @@ -1233,6 +1241,9 @@ struct ftos_converter { // Decimal dot is before any of the digits. int index_for_carrier = index; result[index++] = '0'; + if (d == 0) { + return index; + } result[index++] = '.'; int actural_round = d; for (int i = -1; i > exp; i--) { @@ -1285,6 +1296,9 @@ struct ftos_converter { output /= 10; } index = integer_len; + if (d == 0) { + return index; + } result[index++] = '.'; for (int i = 0; i < d; i++) { result[index++] = '0'; @@ -1314,6 +1328,9 @@ struct ftos_converter { integer /= 10; } index = formated_integer_len; + if (d == 0) { + return index; + } result[index++] = '.'; int current = index; for (int i = 0; i < tailing_zero; i++) { @@ -1352,6 +1369,9 @@ struct ftos_converter { uint32_t integer_len = decimalLength17(integer); index += integer_len + (integer_len - 1) / 3 + 1 + d; } + if (d == 0) { + index--; + } return index; } @@ -1367,6 +1387,9 @@ struct ftos_converter { // Decimal dot is before any of the digits. 
int index_for_carrier = index; result[index++] = '0'; + if (d == 0) { + return index; + } result[index++] = '.'; int actural_round = d; for (int i = -1; i > exp; i--) { @@ -1419,6 +1442,9 @@ struct ftos_converter { output /= 10; } index = integer_len; + if (d == 0) { + return index; + } result[index++] = '.'; for (int i = 0; i < d; i++) { result[index++] = '0'; @@ -1448,6 +1474,9 @@ struct ftos_converter { integer /= 10; } index = formated_integer_len; + if (d == 0) { + return index; + } result[index++] = '.'; int current = index; for (int i = 0; i < tailing_zero; i++) { @@ -1486,6 +1515,9 @@ struct ftos_converter { uint32_t integer_len = decimalLength9(integer); index += integer_len + (integer_len - 1) / 3 + 1 + d; } + if (d == 0) { + index--; + } return index; } From dc570cbc3f04067612580144d490485415f1b0be Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 21 Nov 2023 08:56:46 +0800 Subject: [PATCH 29/54] Add nv apache license to ftos_converter Signed-off-by: Haoyang Li --- src/main/cpp/src/ftos_converter.cu | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu index 1ff5fe8543..2190381260 100644 --- a/src/main/cpp/src/ftos_converter.cu +++ b/src/main/cpp/src/ftos_converter.cu @@ -15,19 +15,20 @@ // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. -// Not a contribution -// Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as -// NVIDIA-proprietary are not a contribution and subject to the following terms and conditions: /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * Copyright (c) 2023, NVIDIA CORPORATION. 
* - * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual - * property and proprietary rights in and to this material, related - * documentation and any modifications thereto. Any use, reproduction, - * disclosure or distribution of this material and related documentation - * without an express license agreement from NVIDIA CORPORATION or - * its affiliates is strictly prohibited. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include From 96333ca53e550ba9cdbdfe66d0c538df87e105a1 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 21 Nov 2023 08:56:46 +0800 Subject: [PATCH 30/54] Add nv apache license to ftos_converter Signed-off-by: Haoyang Li --- src/main/cpp/src/ftos_converter.cu | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu index 664feb706e..835715f3d1 100644 --- a/src/main/cpp/src/ftos_converter.cu +++ b/src/main/cpp/src/ftos_converter.cu @@ -15,19 +15,20 @@ // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. -// Not a contribution -// Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as -// NVIDIA-proprietary are not a contribution and subject to the following terms and conditions: /* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- * SPDX-License-Identifier: LicenseRef-NvidiaProprietary + * Copyright (c) 2023, NVIDIA CORPORATION. * - * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual - * property and proprietary rights in and to this material, related - * documentation and any modifications thereto. Any use, reproduction, - * disclosure or distribution of this material and related documentation - * without an express license agreement from NVIDIA CORPORATION or - * its affiliates is strictly prohibited. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ #include From c36ce9435f6b620984aa7e0978e0137804544d5e Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 21 Nov 2023 16:02:58 +0800 Subject: [PATCH 31/54] Fix an rounding bug Signed-off-by: Haoyang Li --- src/main/cpp/src/ftos_converter.cu | 6 ++++++ thirdparty/cudf | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu index 835715f3d1..d226052b46 100644 --- a/src/main/cpp/src/ftos_converter.cu +++ b/src/main/cpp/src/ftos_converter.cu @@ -1252,6 +1252,9 @@ struct ftos_converter { result[index++] = '0'; actural_round--; if (actural_round == 0) { + if (i != exp + 1) { + return index; + } // else, possible carry break; } } @@ -1398,6 +1401,9 @@ struct ftos_converter { result[index++] = '0'; actural_round--; if (actural_round == 0) { + if (i != exp + 1) { + return index; + } // else, possible carry break; } } diff --git a/thirdparty/cudf b/thirdparty/cudf index 4313cfa9b3..8a0a08f34f 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc +Subproject commit 8a0a08f34ff804a7329ea640aa1e0a9b188d2162 From 360a77bb87a83ca6bcd96f2637fb477118fdffe8 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 21 Nov 2023 23:55:53 +0800 Subject: [PATCH 32/54] Update src/main/cpp/src/ftos_converter.cu Co-authored-by: Jason Lowe --- src/main/cpp/src/ftos_converter.cu | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu index 2190381260..0c05373ed9 100644 --- a/src/main/cpp/src/ftos_converter.cu +++ b/src/main/cpp/src/ftos_converter.cu @@ -1,21 +1,5 @@ -// Copyright 2018 Ulf Adams -// -// The contents of this file may be used under the terms of the Apache License, -// Version 2.0. 
-// -// (See accompanying file LICENSE-Apache or copy at -// http://www.apache.org/licenses/LICENSE-2.0) -// -// Alternatively, the contents of this file may be used under the terms of -// the Boost Software License, Version 1.0. -// (See accompanying file LICENSE-Boost or copy at -// https://www.boost.org/LICENSE_1_0.txt) -// -// Unless required by applicable law or agreed to in writing, this software -// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. - /* + * Copyright 2018 Ulf Adams * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); From ced33b6cf203d3bd46c6f07b0dc7cd0669156503 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Wed, 22 Nov 2023 14:58:12 +0800 Subject: [PATCH 33/54] address some comments Signed-off-by: Haoyang Li --- NOTICE | 1 - src/main/cpp/CMakeLists.txt | 1 - src/main/cpp/src/cast_float_to_string.cu | 59 +- src/main/cpp/src/ftos_converter.cu | 1181 ------------------- src/main/cpp/src/ftos_converter.cuh | 1156 ++++++++++++++++++ src/main/cpp/tests/cast_float_to_string.cpp | 4 +- 6 files changed, 1185 insertions(+), 1217 deletions(-) delete mode 100644 src/main/cpp/src/ftos_converter.cu create mode 100644 src/main/cpp/src/ftos_converter.cuh diff --git a/NOTICE b/NOTICE index 53333b52c5..a0975c00c8 100644 --- a/NOTICE +++ b/NOTICE @@ -5,7 +5,6 @@ Copyright (c) 2022-2023, NVIDIA CORPORATION This project includes code from ryu (https://github.com/ulfjack/ryu). -ryu Copyright (2018) Ulf Adams and contributors. 
Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index a6ac8ac98c..8f90b9078e 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -168,7 +168,6 @@ add_library( src/cast_string_to_float.cu src/datetime_rebase.cu src/decimal_utils.cu - src/ftos_converter.cu src/histogram.cu src/map_utils.cu src/murmur_hash.cu diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index eaf0c989b9..e22947ab9e 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -15,6 +15,7 @@ */ #include "cast_string.hpp" +#include "ftos_converter.cuh" #include #include @@ -28,13 +29,10 @@ #include #include #include + #include #include -#include - -using namespace cudf; - namespace spark_rapids_jni { namespace detail { @@ -42,26 +40,25 @@ namespace { template struct float_to_string_fn { - column_device_view d_floats; - size_type* d_offsets; + cudf::column_device_view d_floats; + cudf::size_type* d_offsets; char* d_chars; - __device__ size_type compute_output_size(FloatType value) + __device__ cudf::size_type compute_output_size(FloatType value) const { - ftos_converter fts; - bool is_float = std::is_same_v; - return static_cast(fts.compute_ftos_size(static_cast(value), is_float)); + bool constexpr is_float = std::is_same_v; + return static_cast(ftos_converter::compute_ftos_size(static_cast(value), is_float)); } - __device__ void float_to_string(size_type idx) + __device__ void float_to_string(cudf::size_type idx) const { - FloatType value = d_floats.element(idx); - ftos_converter fts; - bool is_float = std::is_same_v; - fts.float_to_string(static_cast(value), d_chars + d_offsets[idx], is_float); + auto const value = d_floats.element(idx); + bool constexpr is_float = std::is_same_v; + auto const output = d_chars + d_offsets[idx]; + ftos_converter::float_to_string(static_cast(value), is_float, output); 
} - __device__ void operator()(size_type idx) + __device__ void operator()(cudf::size_type idx) const { if (d_floats.is_null(idx)) { if (d_chars == nullptr) { d_offsets[idx] = 0; } @@ -82,32 +79,28 @@ struct float_to_string_fn { */ struct dispatch_float_to_string_fn { template >* = nullptr> - std::unique_ptr operator()(column_view const& floats, + std::unique_ptr operator()(cudf::column_view const& floats, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + rmm::mr::device_memory_resource* mr) { - size_type strings_count = floats.size(); - auto column = column_device_view::create(floats, stream); - auto d_column = *column; - - // copy the null mask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr); + auto const strings_count = floats.size(); + auto const input_ptr = cudf::column_device_view::create(floats, stream); auto [offsets, chars] = - cudf::strings::detail::make_strings_children(float_to_string_fn{d_column}, strings_count, stream, mr); + cudf::strings::detail::make_strings_children(float_to_string_fn{*input_ptr}, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets), std::move(chars), floats.null_count(), - std::move(null_mask)); + std::move(cudf::detail::copy_bitmask(floats, stream, mr))); } // non-float types throw an exception template >* = nullptr> - std::unique_ptr operator()(column_view const&, + std::unique_ptr operator()(cudf::column_view const&, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const + rmm::mr::device_memory_resource*) { CUDF_FAIL("Values for float_to_string function must be a float type."); } @@ -116,12 +109,14 @@ struct dispatch_float_to_string_fn { } // namespace // This will convert all float column types into a strings column. 
-std::unique_ptr float_to_string(column_view const& floats, +std::unique_ptr float_to_string(cudf::column_view const& floats, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = floats.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); + auto const strings_count = floats.size(); + if (strings_count == 0) { + return cudf::make_empty_column(cudf::type_id::STRING); + } return type_dispatcher(floats.type(), dispatch_float_to_string_fn{}, floats, stream, mr); } @@ -129,7 +124,7 @@ std::unique_ptr float_to_string(column_view const& floats, } // namespace detail // external API -std::unique_ptr float_to_string(column_view const& floats, +std::unique_ptr float_to_string(cudf::column_view const& floats, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu deleted file mode 100644 index 0c05373ed9..0000000000 --- a/src/main/cpp/src/ftos_converter.cu +++ /dev/null @@ -1,1181 +0,0 @@ -/* - * Copyright 2018 Ulf Adams - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -namespace spark_rapids_jni { - -namespace detail { -namespace { - -// d2s.c from ryu -// A floating decimal representing m * 10^e. 
-typedef struct floating_decimal_64 { - uint64_t mantissa; - // Decimal exponent's range is -324 to 308 - // inclusive, and can fit in a short if needed. - int32_t exponent; -} floating_decimal_64; - -// f2s.c from ryu -// A floating decimal representing m * 10^e. -typedef struct floating_decimal_32 { - uint32_t mantissa; - // Decimal exponent's range is -45 to 38 - // inclusive, and can fit in a short if needed. - int32_t exponent; -} floating_decimal_32; - -struct ftos_converter { - - //===== constants from ryu ===== - - // These tables are generated by PrintDoubleLookupTable. - static constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125; - static constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125; - static constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64); - static constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64); - static constexpr unsigned int DOUBLE_MANTISSA_BITS = 52; - static constexpr unsigned int DOUBLE_EXPONENT_BITS = 11; - static constexpr unsigned int DOUBLE_BIAS = 1023; - static constexpr unsigned int FLOAT_MANTISSA_BITS = 23; - static constexpr unsigned int FLOAT_EXPONENT_BITS = 8; - static constexpr unsigned int FLOAT_BIAS = 127; - - const uint64_t DOUBLE_POW5_INV_SPLIT2[15][2] = { - { 1u, 2305843009213693952u }, - { 5955668970331000884u, 1784059615882449851u }, - { 8982663654677661702u, 1380349269358112757u }, - { 7286864317269821294u, 2135987035920910082u }, - { 7005857020398200553u, 1652639921975621497u }, - { 17965325103354776697u, 1278668206209430417u }, - { 8928596168509315048u, 1978643211784836272u }, - { 10075671573058298858u, 1530901034580419511u }, - { 597001226353042382u, 1184477304306571148u }, - { 1527430471115325346u, 1832889850782397517u }, - { 12533209867169019542u, 1418129833677084982u }, - { 5577825024675947042u, 2194449627517475473u }, - { 11006974540203867551u, 1697873161311732311u }, - { 10313493231639821582u, 1313665730009899186u }, - { 12701016819766672773u, 
2032799256770390445u } - }; - - const uint32_t POW5_INV_OFFSETS[19] = { - 0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555, - 0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054, - 0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554, - 0x00000000 - }; - - const uint64_t DOUBLE_POW5_SPLIT2[13][2] = { - { 0u, 1152921504606846976u }, - { 0u, 1490116119384765625u }, - { 1032610780636961552u, 1925929944387235853u }, - { 7910200175544436838u, 1244603055572228341u }, - { 16941905809032713930u, 1608611746708759036u }, - { 13024893955298202172u, 2079081953128979843u }, - { 6607496772837067824u, 1343575221513417750u }, - { 17332926989895652603u, 1736530273035216783u }, - { 13037379183483547984u, 2244412773384604712u }, - { 1605989338741628675u, 1450417759929778918u }, - { 9630225068416591280u, 1874621017369538693u }, - { 665883850346957067u, 1211445438634777304u }, - { 14931890668723713708u, 1565756531257009982u } - }; - - const uint32_t POW5_OFFSETS[21] = { - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995, - 0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540, - 0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151, - 0x55559155, 0x51405555, 0x00000105 - }; - - static constexpr uint32_t POW5_TABLE_SIZE = 26; - const uint64_t DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = { - 1ull, 5ull, 25ull, 125ull, 625ull, 3125ull, 15625ull, 78125ull, 390625ull, - 1953125ull, 9765625ull, 48828125ull, 244140625ull, 1220703125ull, 6103515625ull, - 30517578125ull, 152587890625ull, 762939453125ull, 3814697265625ull, - 19073486328125ull, 95367431640625ull, 476837158203125ull, - 2384185791015625ull, 11920928955078125ull, 59604644775390625ull, - 298023223876953125ull //, 1490116119384765625ull - }; - - //===== common.h from ryu ===== - - // Returns the number of decimal digits in v, which must not contain more than 9 digits. 
- __device__ inline uint32_t decimalLength9(const uint32_t v) { - // Function precondition: v is not a 10-digit number. - // (f2s: 9 digits are sufficient for round-tripping.) - // (d2fixed: We print 9-digit blocks.) - assert(v < 1000000000); - if (v >= 100000000) { return 9; } - if (v >= 10000000) { return 8; } - if (v >= 1000000) { return 7; } - if (v >= 100000) { return 6; } - if (v >= 10000) { return 5; } - if (v >= 1000) { return 4; } - if (v >= 100) { return 3; } - if (v >= 10) { return 2; } - return 1; - } - - // Returns e == 0 ? 1 : [log_2(5^e)]; requires 0 <= e <= 3528. - __device__ inline int32_t log2pow5(const int32_t e) { - // This approximation works up to the point that the multiplication overflows at e = 3529. - // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater - // than 2^9297. - assert(e >= 0); - assert(e <= 3528); - return (int32_t) ((((uint32_t) e) * 1217359) >> 19); - } - - // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528. - __device__ inline int32_t pow5bits(const int32_t e) { - // This approximation works up to the point that the multiplication overflows at e = 3529. - // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater - // than 2^9297. - assert(e >= 0); - assert(e <= 3528); - return (int32_t) (((((uint32_t) e) * 1217359) >> 19) + 1); - } - - // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528. - __device__ inline int32_t ceil_log2pow5(const int32_t e) { - return log2pow5(e) + 1; - } - - // Returns floor(log_10(2^e)); requires 0 <= e <= 1650. - __device__ inline uint32_t log10Pow2(const int32_t e) { - // The first value this approximation fails for is 2^1651 which is just greater than 10^297. - assert(e >= 0); - assert(e <= 1650); - return (((uint32_t) e) * 78913) >> 18; - } - - // Returns floor(log_10(5^e)); requires 0 <= e <= 2620. 
- __device__ inline uint32_t log10Pow5(const int32_t e) { - // The first value this approximation fails for is 5^2621 which is just greater than 10^1832. - assert(e >= 0); - assert(e <= 2620); - return (((uint32_t) e) * 732923) >> 20; - } - - __device__ inline uint32_t pow5factor_32(uint32_t value) { - uint32_t count = 0; - for (;;) { - assert(value != 0); - const uint32_t q = value / 5; - const uint32_t r = value % 5; - if (r != 0) { - break; - } - value = q; - ++count; - } - return count; - } - - // Returns true if value is divisible by 5^p. - __device__ inline bool multipleOfPowerOf5_32(const uint32_t value, const uint32_t p) { - return pow5factor_32(value) >= p; - } - - // Returns true if value is divisible by 2^p. - __device__ inline bool multipleOfPowerOf2_32(const uint32_t value, const uint32_t p) { - // __builtin_ctz doesn't appear to be faster here. - return (value & ((1u << p) - 1)) == 0; - } - - // It seems to be slightly faster to avoid uint128_t here, although the - // generated code for uint128_t looks slightly nicer. - __device__ inline uint32_t mulShift32(const uint32_t m, const uint64_t factor, const int32_t shift) { - assert(shift > 32); - - // The casts here help MSVC to avoid calls to the __allmul library - // function. 
- const uint32_t factorLo = (uint32_t)(factor); - const uint32_t factorHi = (uint32_t)(factor >> 32); - const uint64_t bits0 = (uint64_t)m * factorLo; - const uint64_t bits1 = (uint64_t)m * factorHi; - - const uint64_t sum = (bits0 >> 32) + bits1; - const uint64_t shiftedSum = sum >> (shift - 32); - assert(shiftedSum <= UINT32_MAX); - return (uint32_t) shiftedSum; - - } - - __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa) { - if (mantissa) { - memcpy(result, "NaN", 3); - return 3; - } - if (sign) { - result[0] = '-'; - } - if (exponent) { - memcpy(result + sign, "Infinity", 8); - return sign + 8; - } - memcpy(result + sign, "0.0", 3); - return sign + 3; - } - - __device__ inline int special_str_size(const bool sign, const bool exponent, const bool mantissa) { - if (mantissa) { - return 3; - } - if (exponent) { - return sign + 8; - } - return sign + 3; - } - - __device__ inline uint32_t float_to_bits(const float f) { - uint32_t bits = 0; - memcpy(&bits, &f, sizeof(float)); - return bits; - } - - __device__ inline uint64_t double_to_bits(const double d) { - uint64_t bits = 0; - memcpy(&bits, &d, sizeof(double)); - return bits; - } - - //===== d2s_intrinsics.h from ryu ===== - - __device__ inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) { - // The casts here help MSVC to avoid calls to the __allmul library function. 
- const uint32_t aLo = (uint32_t)a; - const uint32_t aHi = (uint32_t)(a >> 32); - const uint32_t bLo = (uint32_t)b; - const uint32_t bHi = (uint32_t)(b >> 32); - - const uint64_t b00 = (uint64_t)aLo * bLo; - const uint64_t b01 = (uint64_t)aLo * bHi; - const uint64_t b10 = (uint64_t)aHi * bLo; - const uint64_t b11 = (uint64_t)aHi * bHi; - - const uint32_t b00Lo = (uint32_t)b00; - const uint32_t b00Hi = (uint32_t)(b00 >> 32); - - const uint64_t mid1 = b10 + b00Hi; - const uint32_t mid1Lo = (uint32_t)(mid1); - const uint32_t mid1Hi = (uint32_t)(mid1 >> 32); - - const uint64_t mid2 = b01 + mid1Lo; - const uint32_t mid2Lo = (uint32_t)(mid2); - const uint32_t mid2Hi = (uint32_t)(mid2 >> 32); - - const uint64_t pHi = b11 + mid1Hi + mid2Hi; - const uint64_t pLo = ((uint64_t)mid2Lo << 32) | b00Lo; - - *productHi = pHi; - return pLo; - } - - __device__ inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) { - // We don't need to handle the case dist >= 64 here (see above). 
- assert(dist < 64); - assert(dist > 0); - return (hi << (64 - dist)) | (lo >> dist); - } - - __device__ inline uint64_t div5(const uint64_t x) { - return x / 5; - } - - __device__ inline uint64_t div10(const uint64_t x) { - return x / 10; - } - - __device__ inline uint64_t div100(const uint64_t x) { - return x / 100; - } - - __device__ inline uint64_t div1e8(const uint64_t x) { - return x / 100000000; - } - - __device__ inline uint64_t div1e9(const uint64_t x) { - return x / 1000000000; - } - - __device__ inline uint32_t mod1e9(const uint64_t x) { - return (uint32_t) (x - 1000000000 * div1e9(x)); - } - - __device__ inline uint32_t pow5Factor(uint64_t value) { - const uint64_t m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64) - const uint64_t n_div_5 = 3689348814741910323u; // #{ n | n = 0 (mod 2^64) } = 2^64 / 5 - uint32_t count = 0; - for (;;) { - assert(value != 0); - value *= m_inv_5; - if (value > n_div_5) - break; - ++count; - } - return count; - } - - // Returns true if value is divisible by 5^p. - __device__ inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) { - // I tried a case distinction on p, but there was no performance difference. - return pow5Factor(value) >= p; - } - - // Returns true if value is divisible by 2^p. - __device__ inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) { - assert(value != 0); - assert(p < 64); - // __builtin_ctzll doesn't appear to be faster here. 
- return (value & ((1ull << p) - 1)) == 0; - } - - __device__ inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) { - // m is maximum 55 bits - uint64_t high1; // 128 - const uint64_t low1 = umul128(m, mul[1], &high1); // 64 - uint64_t high0; // 64 - umul128(m, mul[0], &high0); // 0 - const uint64_t sum = high0 + low1; - if (sum < high0) { - ++high1; // overflow into high1 - } - return shiftright128(sum, high1, j - 64); - } - - __device__ inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j, - uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) { - *vp = mulShift64(4 * m + 2, mul, j); - *vm = mulShift64(4 * m - 1 - mmShift, mul, j); - return mulShift64(4 * m, mul, j); - } - - //===== d2s_small_table.h from ryu ===== - - // Computes 5^i in the form required by Ryu, and stores it in the given pointer. - __device__ inline void double_computePow5(const uint32_t i, uint64_t* const result) { - const uint32_t base = i / POW5_TABLE_SIZE; - const uint32_t base2 = base * POW5_TABLE_SIZE; - const uint32_t offset = i - base2; - const uint64_t* const mul = DOUBLE_POW5_SPLIT2[base]; - if (offset == 0) { - result[0] = mul[0]; - result[1] = mul[1]; - return; - } - const uint64_t m = DOUBLE_POW5_TABLE[offset]; - uint64_t high1; - const uint64_t low1 = umul128(m, mul[1], &high1); - uint64_t high0; - const uint64_t low0 = umul128(m, mul[0], &high0); - const uint64_t sum = high0 + low1; - if (sum < high0) { - ++high1; // overflow into high1 - } - // high1 | sum | low0 - const uint32_t delta = pow5bits(i) - pow5bits(base2); - result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); - result[1] = shiftright128(sum, high1, delta); - } - - // Computes 5^-i in the form required by Ryu, and stores it in the given pointer. 
- __device__ inline void double_computeInvPow5(const uint32_t i, uint64_t* const result) { - const uint32_t base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE; - const uint32_t base2 = base * POW5_TABLE_SIZE; - const uint32_t offset = base2 - i; - const uint64_t* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2 - if (offset == 0) { - result[0] = mul[0]; - result[1] = mul[1]; - return; - } - const uint64_t m = DOUBLE_POW5_TABLE[offset]; - uint64_t high1; - const uint64_t low1 = umul128(m, mul[1], &high1); - uint64_t high0; - const uint64_t low0 = umul128(m, mul[0] - 1, &high0); - const uint64_t sum = high0 + low1; - if (sum < high0) { - ++high1; // overflow into high1 - } - // high1 | sum | low0 - const uint32_t delta = pow5bits(base2) - pow5bits(i); - result[0] = shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); - result[1] = shiftright128(sum, high1, delta); - } - - //===== f2s_intrinsics.h from ryu ===== - - __device__ inline uint32_t mulPow5InvDivPow2(const uint32_t m, const uint32_t q, const int32_t j) { - // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double lookup - // table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely on the - // fact that the added 1 that's already stored in the table never overflows into the upper 64 bits. - uint64_t pow5[2]; - double_computeInvPow5(q, pow5); - return mulShift32(m, pow5[1] + 1, j); - } - - __device__ inline uint32_t mulPow5divPow2(const uint32_t m, const uint32_t i, const int32_t j) { - uint64_t pow5[2]; - double_computePow5(i, pow5); - return mulShift32(m, pow5[1], j); - } - - //===== d2s.c and f2s.c from ryu ===== - - __device__ inline uint32_t decimalLength17(const uint64_t v) { - // This is slightly faster than a loop. - // The average output length is 16.38 digits, so we check high-to-low. - // Function precondition: v is not an 18, 19, or 20-digit number. 
- // (17 digits are sufficient for round-tripping.) - assert(v < 100000000000000000L); - if (v >= 10000000000000000L) { return 17; } - if (v >= 1000000000000000L) { return 16; } - if (v >= 100000000000000L) { return 15; } - if (v >= 10000000000000L) { return 14; } - if (v >= 1000000000000L) { return 13; } - if (v >= 100000000000L) { return 12; } - if (v >= 10000000000L) { return 11; } - if (v >= 1000000000L) { return 10; } - if (v >= 100000000L) { return 9; } - if (v >= 10000000L) { return 8; } - if (v >= 1000000L) { return 7; } - if (v >= 100000L) { return 6; } - if (v >= 10000L) { return 5; } - if (v >= 1000L) { return 4; } - if (v >= 100L) { return 3; } - if (v >= 10L) { return 2; } - return 1; - } - - __device__ inline floating_decimal_64 d2d(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) { - int32_t e2; - uint64_t m2; - if (ieeeExponent == 0) { - // We subtract 2 so that the bounds computation has 2 additional bits. - e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; - m2 = ieeeMantissa; - } else { - e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; - m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; - } - const bool even = (m2 & 1) == 0; - const bool acceptBounds = even; - - // Step 2: Determine the interval of valid decimal representations. - const uint64_t mv = 4 * m2; - // Implicit bool -> int conversion. True is 1, false is 0. - const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; - // We would compute mp and mm like this: - // uint64_t mp = 4 * m2 + 2; - // uint64_t mm = mv - 1 - mmShift; - - // Step 3: Convert to a decimal power base using 128-bit arithmetic. - uint64_t vr, vp, vm; - int32_t e10; - bool vmIsTrailingZeros = false; - bool vrIsTrailingZeros = false; - if (e2 >= 0) { - // I tried special-casing q == 0, but there was no effect on performance. - // This expression is slightly faster than max(0, log10Pow2(e2) - 1). 
- const uint32_t q = log10Pow2(e2) - (e2 > 3); - e10 = (int32_t) q; - const int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; - const int32_t i = -e2 + (int32_t) q + k; - uint64_t pow5[2]; - double_computeInvPow5(q, pow5); - vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift); - - if (q <= 21) { - // This should use q <= 22, but I think 21 is also safe. Smaller values - // may still be safe, but it's more difficult to reason about them. - // Only one of mp, mv, and mm can be a multiple of 5, if any. - const uint32_t mvMod5 = ((uint32_t) mv) - 5 * ((uint32_t) div5(mv)); - if (mvMod5 == 0) { - vrIsTrailingZeros = multipleOfPowerOf5(mv, q); - } else if (acceptBounds) { - // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q - // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q - // <=> true && pow5Factor(mm) >= q, since e2 >= q. - vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q); - } else { - // Same as min(e2 + 1, pow5Factor(mp)) >= q. - vp -= multipleOfPowerOf5(mv + 2, q); - } - } - } else { - // This expression is slightly faster than max(0, log10Pow5(-e2) - 1). - const uint32_t q = log10Pow5(-e2) - (-e2 > 1); - e10 = (int32_t) q + e2; - const int32_t i = -e2 - (int32_t) q; - const int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT; - const int32_t j = (int32_t) q - k; - - uint64_t pow5[2]; - double_computePow5(i, pow5); - vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift); - - if (q <= 1) { - // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. - // mv = 4 * m2, so it always has at least two trailing 0 bits. - vrIsTrailingZeros = true; - if (acceptBounds) { - // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. - vmIsTrailingZeros = mmShift == 1; - } else { - // mp = mv + 2, so it always has at least one trailing 0 bit. - --vp; - } - } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here. - // We want to know if the full product has at least q trailing zeros. 
- // We need to compute min(p2(mv), p5(mv) - e2) >= q - // <=> p2(mv) >= q && p5(mv) - e2 >= q - // <=> p2(mv) >= q (because -e2 >= q) - vrIsTrailingZeros = multipleOfPowerOf2(mv, q); - } - } - - // Step 4: Find the shortest decimal representation in the interval of valid representations. - int32_t removed = 0; - uint8_t lastRemovedDigit = 0; - uint64_t output; - // On average, we remove ~2 digits. - if (vmIsTrailingZeros || vrIsTrailingZeros) { - // General case, which happens rarely (~0.7%). - for (;;) { - const uint64_t vpDiv10 = div10(vp); - const uint64_t vmDiv10 = div10(vm); - if (vpDiv10 <= vmDiv10) { - break; - } - const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10); - const uint64_t vrDiv10 = div10(vr); - const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); - vmIsTrailingZeros &= vmMod10 == 0; - vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t) vrMod10; - vr = vrDiv10; - vp = vpDiv10; - vm = vmDiv10; - ++removed; - } - - if (vmIsTrailingZeros) { - for (;;) { - const uint64_t vmDiv10 = div10(vm); - const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10); - if (vmMod10 != 0) { - break; - } - const uint64_t vpDiv10 = div10(vp); - const uint64_t vrDiv10 = div10(vr); - const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); - vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t) vrMod10; - vr = vrDiv10; - vp = vpDiv10; - vm = vmDiv10; - ++removed; - } - } - - if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { - // Round even if the exact number is .....50..0. - lastRemovedDigit = 4; - } - // We need to take vr + 1 if vr is outside bounds or we need to round up. - output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); - } else { - // Specialized for the common case (~99.3%). Percentages below are relative to this. 
- bool roundUp = false; - const uint64_t vpDiv100 = div100(vp); - const uint64_t vmDiv100 = div100(vm); - if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%). - const uint64_t vrDiv100 = div100(vr); - const uint32_t vrMod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrDiv100); - roundUp = vrMod100 >= 50; - vr = vrDiv100; - vp = vpDiv100; - vm = vmDiv100; - removed += 2; - } - // Loop iterations below (approximately), without optimization above: - // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02% - // Loop iterations below (approximately), with optimization above: - // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02% - for (;;) { - const uint64_t vpDiv10 = div10(vp); - const uint64_t vmDiv10 = div10(vm); - if (vpDiv10 <= vmDiv10) { - break; - } - const uint64_t vrDiv10 = div10(vr); - const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); - roundUp = vrMod10 >= 5; - vr = vrDiv10; - vp = vpDiv10; - vm = vmDiv10; - ++removed; - } - - // We need to take vr + 1 if vr is outside bounds or we need to round up. - output = vr + (vr == vm || roundUp); - } - const int32_t exp = e10 + removed; - - floating_decimal_64 fd; - fd.exponent = exp; - fd.mantissa = output; - return fd; - } - - __device__ inline floating_decimal_32 f2d(const uint32_t ieeeMantissa, const uint32_t ieeeExponent) { - int32_t e2; - uint32_t m2; - if (ieeeExponent == 0) { - // We subtract 2 so that the bounds computation has 2 additional bits. - e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; - m2 = ieeeMantissa; - } else { - e2 = (int32_t) ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; - m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa; - } - const bool even = (m2 & 1) == 0; - const bool acceptBounds = even; - - // Step 2: Determine the interval of valid decimal representations. - const uint32_t mv = 4 * m2; - const uint32_t mp = 4 * m2 + 2; - // Implicit bool -> int conversion. True is 1, false is 0. 
- const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; - const uint32_t mm = 4 * m2 - 1 - mmShift; - - // Step 3: Convert to a decimal power base using 64-bit arithmetic. - uint32_t vr, vp, vm; - int32_t e10; - bool vmIsTrailingZeros = false; - bool vrIsTrailingZeros = false; - uint8_t lastRemovedDigit = 0; - if (e2 >= 0) { - const uint32_t q = log10Pow2(e2); - e10 = (int32_t) q; - const int32_t k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; - const int32_t i = -e2 + (int32_t) q + k; - vr = mulPow5InvDivPow2(mv, q, i); - vp = mulPow5InvDivPow2(mp, q, i); - vm = mulPow5InvDivPow2(mm, q, i); - if (q != 0 && (vp - 1) / 10 <= vm / 10) { - // We need to know one removed digit even if we are not going to loop below. We could use - // q = X - 1 above, except that would require 33 bits for the result, and we've found that - // 32-bit arithmetic is faster even on 64-bit machines. - const int32_t l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) (q - 1)) - 1; - lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t) q - 1 + l) % 10); - } - if (q <= 9) { - // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well. - // Only one of mp, mv, and mm can be a multiple of 5, if any. 
- if (mv % 5 == 0) { - vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q); - } else if (acceptBounds) { - vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q); - } else { - vp -= multipleOfPowerOf5_32(mp, q); - } - } - } else { - const uint32_t q = log10Pow5(-e2); - e10 = (int32_t) q + e2; - const int32_t i = -e2 - (int32_t) q; - const int32_t k = pow5bits(i) - FLOAT_POW5_BITCOUNT; - int32_t j = (int32_t) q - k; - vr = mulPow5divPow2(mv, (uint32_t) i, j); - vp = mulPow5divPow2(mp, (uint32_t) i, j); - vm = mulPow5divPow2(mm, (uint32_t) i, j); - if (q != 0 && (vp - 1) / 10 <= vm / 10) { - j = (int32_t) q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); - lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, (uint32_t) (i + 1), j) % 10); - } - if (q <= 1) { - // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. - // mv = 4 * m2, so it always has at least two trailing 0 bits. - vrIsTrailingZeros = true; - if (acceptBounds) { - // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. - vmIsTrailingZeros = mmShift == 1; - } else { - // mp = mv + 2, so it always has at least one trailing 0 bit. - --vp; - } - } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here. - vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1); - } - } - - // Step 4: Find the shortest decimal representation in the interval of valid representations. - int32_t removed = 0; - uint32_t output; - if (vmIsTrailingZeros || vrIsTrailingZeros) { - // General case, which happens rarely (~4.0%). 
- while (vp / 10 > vm / 10) { - vmIsTrailingZeros &= vm % 10 == 0; - vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t) (vr % 10); - vr /= 10; - vp /= 10; - vm /= 10; - ++removed; - } - if (vmIsTrailingZeros) { - while (vm % 10 == 0) { - vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t) (vr % 10); - vr /= 10; - vp /= 10; - vm /= 10; - ++removed; - } - } - if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { - // Round even if the exact number is .....50..0. - lastRemovedDigit = 4; - } - // We need to take vr + 1 if vr is outside bounds or we need to round up. - output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); - } else { - // Specialized for the common case (~96.0%). Percentages below are relative to this. - // Loop iterations below (approximately): - // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01% - while (vp / 10 > vm / 10) { - lastRemovedDigit = (uint8_t) (vr % 10); - vr /= 10; - vp /= 10; - vm /= 10; - ++removed; - } - // We need to take vr + 1 if vr is outside bounds or we need to round up. - output = vr + (vr == vm || lastRemovedDigit >= 5); - } - const int32_t exp = e10 + removed; - - floating_decimal_32 fd; - fd.exponent = exp; - fd.mantissa = output; - return fd; - } - - __device__ inline int to_chars(const floating_decimal_64 v, const bool sign, char* const result) { - // Step 5: Print the decimal representation. - int index = 0; - if (sign) { - result[index++] = '-'; - } - - uint64_t output = v.mantissa; - const uint32_t olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t) olength - 1; - bool scientificNotation = (exp < -3) || (exp >= 7); - - // Values in the interval [1E-3, 1E7) are special. - if (scientificNotation) { - // Print in the format x.xxxxxE-yy. 
- for (uint32_t i = 0; i < olength - 1; ++i) { - const uint32_t c = output % 10; output /= 10; - result[index + olength - i] = (char) ('0' + c); - } - result[index] = '0' + output % 10; - result[index + 1] = '.'; - index += olength + 1; - if (olength == 1) { - result[index++] = '0'; - } - // Print 'E', the exponent sign, and the exponent, which has at most three digits. - result[index++] = 'E'; - if (exp < 0) { - result[index++] = '-'; - exp = -exp; - } - if (exp >= 100) { - result[index++] = (char) ('0' + exp / 100); - exp %= 100; - result[index++] = (char) ('0' + exp / 10); - } else if (exp >= 10) { - result[index++] = (char) ('0' + exp / 10); - } - result[index++] = (char) ('0' + exp % 10); - } else { - // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). - if (exp < 0) { - // Decimal dot is before any of the digits. - result[index++] = '0'; - result[index++] = '.'; - for (int i = -1; i > exp; i--) { - result[index++] = '0'; - } - int current = index; - for (int i = 0; i < olength; i++) { - result[current + olength - i - 1] = (char) ('0' + output % 10); - output /= 10; - index++; - } - } else if (exp + 1 >= olength) { - // Decimal dot is after any of the digits. - for (int i = 0; i < olength; i++) { - result[index + olength - i - 1] = (char) ('0' + output % 10); - output /= 10; - } - index += olength; - for (int i = olength; i < exp + 1; i++) { - result[index++] = '0'; - } - result[index++] = '.'; - result[index++] = '0'; - } else { - // Decimal dot is somewhere between the digits. 
- int current = index + 1; - for (int i = 0; i < olength; i++) { - if (olength - i - 1 == exp) { - result[current + olength - i - 1] = '.'; - current--; - } - result[current + olength - i - 1] = (char) ('0' + output % 10); - output /= 10; - } - index += olength + 1; - } - } - return index; - } - - __device__ inline int d2s_size(const floating_decimal_64 v, const bool sign) { - int index = 0; - if (sign) { - index++; - } - - uint64_t output = v.mantissa; - const uint32_t olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t) olength - 1; - bool scientificNotation = (exp < -3) || (exp >= 7); - - if (scientificNotation) { - index += olength + 1; - if (olength == 1) { - index++; - } - // 'E' - index++; - if (exp < 0) { - exp = -exp; - index++; - } - if (exp >= 100) { - index += 3; - } else if (exp >= 10) { - index += 2; - } else { - index++; - } - } else { - // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). - if (exp < 0) { - index += 1 - exp + olength; - } else if (exp + 1 >= olength) { - index += exp + 3; - } else { - index += olength + 1; - } - } - return index; - } - - __device__ inline int to_chars(const floating_decimal_32 v, const bool sign, char* const result) { - // Step 5: Print the decimal representation. - int index = 0; - if (sign) { - result[index++] = '-'; - } - - uint32_t output = v.mantissa; - const uint32_t olength = decimalLength9(output); - int32_t exp = v.exponent + olength - 1; - bool scientificNotation = (exp < -3) || (exp >= 7); - - if (scientificNotation) { - // Print in the format x.xxxxxE-yy. - for (int i = 0; i < olength - 1; i++) { - int c = output % 10; output /= 10; - result[index + olength - i] = (char) ('0' + c); - } - result[index] = (char) ('0' + output % 10); - result[index + 1] = '.'; - index += olength + 1; - if (olength == 1) { - result[index++] = '0'; - } - - // Print 'E', the exponent sign, and the exponent, which has at most two digits. 
- result[index++] = 'E'; - if (exp < 0) { - result[index++] = '-'; - exp = -exp; - } - if (exp >= 10) { - result[index++] = (char) ('0' + exp / 10); - } - result[index++] = (char) ('0' + exp % 10); - } else { - // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). - if (exp < 0) { - // Decimal dot is before any of the digits. - result[index++] = '0'; - result[index++] = '.'; - for (int i = -1; i > exp; i--) { - result[index++] = '0'; - } - int current = index; - for (int i = 0; i < olength; i++) { - result[current + olength - i - 1] = (char) ('0' + output % 10); - output /= 10; - index++; - } - } else if (exp + 1 >= olength) { - // Decimal dot is after any of the digits. - for (int i = 0; i < olength; i++) { - result[index + olength - i - 1] = (char) ('0' + output % 10); - output /= 10; - } - index += olength; - for (int i = olength; i < exp + 1; i++) { - result[index++] = '0'; - } - result[index++] = '.'; - result[index++] = '0'; - } else { - // Decimal dot is somewhere between the digits. - int current = index + 1; - for (int i = 0; i < olength; i++) { - if (olength - i - 1 == exp) { - result[current + olength - i - 1] = '.'; - current--; - } - result[current + olength - i - 1] = (char) ('0' + output % 10); - output /= 10; - } - index += olength + 1; - } - } - return index; - } - - __device__ inline int f2s_size(const floating_decimal_32 v, const bool sign) { - // Step 5: Print the decimal representation. - int index = 0; - if (sign) { - index++; - } - - uint32_t output = v.mantissa; - const uint32_t olength = decimalLength9(output); - int32_t exp = v.exponent + olength - 1; - bool scientificNotation = (exp < -3) || (exp >= 7); - - if (scientificNotation) { - index += olength + 1; - if (olength == 1) { - index++; - } - // 'E' - index++; - if (exp < 0) { - index++; - exp = -exp; - } - if (exp >= 10) { - index++; - } - index++; - } else { - // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). 
- if (exp < 0) { - // Decimal dot is before any of the digits. - index += 1 - exp + olength; - } else if (exp + 1 >= olength) { - // Decimal dot is after any of the digits. - index += exp + 3; - } else { - // Decimal dot is somewhere between the digits. - index += olength + 1; - } - } - return index; - } - - __device__ inline bool d2d_small_int(const uint64_t ieeeMantissa, const uint32_t ieeeExponent, - floating_decimal_64* const v) { - const uint64_t m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; - const int32_t e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; - - if (e2 > 0) { - // f = m2 * 2^e2 >= 2^53 is an integer. - // Ignore this case for now. - return false; - } - - if (e2 < -52) { - // f < 1. - return false; - } - - // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53. - // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0. - const uint64_t mask = (1ull << -e2) - 1; - const uint64_t fraction = m2 & mask; - if (fraction != 0) { - return false; - } - - // f is an integer in the range [1, 2^53). - // Note: mantissa might contain trailing (decimal) 0's. - // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17(). - v->mantissa = m2 >> -e2; - v->exponent = 0; - return true; - } - - __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) { - // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. - const uint64_t bits = double_to_bits(f); - - // Decode bits into sign, mantissa, and exponent. - ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; - const uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1); - const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); - // Case distinction; exit early for the easy cases. 
- if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { - special = true; - return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent}; - } - special = false; - floating_decimal_64 v; - const bool isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v); - if (isSmallInt) { - // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros. - // For scientific notation we need to move these zeros into the exponent. - // (This is not needed for fixed-point notation, so it might be beneficial to trim - // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.) - for (;;) { - const uint64_t q = div10(v.mantissa); - const uint32_t r = ((uint32_t) v.mantissa) - 10 * ((uint32_t) q); - if (r != 0) { - break; - } - v.mantissa = q; - ++v.exponent; - } - } else { - v = d2d(ieeeMantissa, ieeeExponent); - } - return v; - } - - __device__ int d2s_buffered_n(double f, char* result) { - bool sign = false, special = false; - floating_decimal_64 v = d2d(f, sign, special); - if (special) { - return copy_special_str(result, sign, v.exponent, v.mantissa); - } - return to_chars(v, sign, result); - } - - __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) { - // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. - const uint32_t bits = float_to_bits(f); - - // Decode bits into sign, mantissa, and exponent. - ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0; - const uint32_t ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1); - const uint32_t ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1); - - // Case distinction; exit early for the easy cases. 
- if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { - special = true; - return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent}; - } - special = false; - return f2d(ieeeMantissa, ieeeExponent); - } - - __device__ int f2s_buffered_n(float f, char* result) { - bool sign = false, special = false; - floating_decimal_32 v = f2d(f, sign, special); - if (special) { - return copy_special_str(result, sign, v.exponent, v.mantissa); - } - return to_chars(v, sign, result); - } - - - //===== compute float to string size ===== - - __device__ int compute_d2s_size(double value) { - bool sign = false, special = false; - floating_decimal_64 v = d2d(value, sign, special); - if (special) { - return special_str_size(sign, v.exponent, v.mantissa); - } - return d2s_size(v, sign); - } - - __device__ int compute_f2s_size(float value) { - bool sign = false, special = false; - floating_decimal_32 v = f2d(value, sign, special); - if (special) { - return special_str_size(sign, v.exponent, v.mantissa); - } - return f2s_size(v, sign); - } - - //===== APIs ===== - - __device__ int compute_ftos_size(double value, bool is_float) { - if (is_float) { - return compute_f2s_size(value); - } else { - return compute_d2s_size(value); - } - } - - __device__ int float_to_string(double value, char* output, bool is_float) { - if (is_float) { - return f2s_buffered_n(value, output); - } else { - return d2s_buffered_n(value, output); - } - } -}; - -} -} -} diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh new file mode 100644 index 0000000000..b9905ae567 --- /dev/null +++ b/src/main/cpp/src/ftos_converter.cuh @@ -0,0 +1,1156 @@ +/* + * Copyright 2018 Ulf Adams + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace spark_rapids_jni::ftos_converter { + +namespace { + +// d2s.c from ryu +// A floating decimal representing m * 10^e. +typedef struct floating_decimal_64 { + uint64_t mantissa; + // Decimal exponent's range is -324 to 308 + // inclusive, and can fit in a short if needed. + int32_t exponent; +} floating_decimal_64; + +// f2s.c from ryu +// A floating decimal representing m * 10^e. +typedef struct floating_decimal_32 { + uint32_t mantissa; + // Decimal exponent's range is -45 to 38 + // inclusive, and can fit in a short if needed. + int32_t exponent; +} floating_decimal_32; + +//===== constants from ryu ===== + +// These tables are generated by PrintDoubleLookupTable. 
+// Bit counts that accompany the power-of-5 lookup tables below.
+constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125;
+constexpr unsigned int DOUBLE_POW5_BITCOUNT     = 125;
+// The single-precision counts are the double-precision ones minus 64.
+constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = DOUBLE_POW5_INV_BITCOUNT - 64;
+constexpr unsigned int FLOAT_POW5_BITCOUNT     = DOUBLE_POW5_BITCOUNT - 64;
+
+// Field widths and exponent bias of the double encoding.
+constexpr unsigned int DOUBLE_MANTISSA_BITS = 52;
+constexpr unsigned int DOUBLE_EXPONENT_BITS = 11;
+constexpr unsigned int DOUBLE_BIAS          = 1023;
+
+// Field widths and exponent bias of the float encoding.
+constexpr unsigned int FLOAT_MANTISSA_BITS = 23;
+constexpr unsigned int FLOAT_EXPONENT_BITS = 8;
+constexpr unsigned int FLOAT_BIAS          = 127;
+
+// Base multipliers for inverse powers of 5; each row is a {low word, high word} pair.
+// double_computeInvPow5 selects a row with (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE.
+__constant__ uint64_t const DOUBLE_POW5_INV_SPLIT2[15][2] = {
+  {1u, 2305843009213693952u},
+  {5955668970331000884u, 1784059615882449851u},
+  {8982663654677661702u, 1380349269358112757u},
+  {7286864317269821294u, 2135987035920910082u},
+  {7005857020398200553u, 1652639921975621497u},
+  {17965325103354776697u, 1278668206209430417u},
+  {8928596168509315048u, 1978643211784836272u},
+  {10075671573058298858u, 1530901034580419511u},
+  {597001226353042382u, 1184477304306571148u},
+  {1527430471115325346u, 1832889850782397517u},
+  {12533209867169019542u, 1418129833677084982u},
+  {5577825024675947042u, 2194449627517475473u},
+  {11006974540203867551u, 1697873161311732311u},
+  {10313493231639821582u, 1313665730009899186u},
+  {12701016819766672773u, 2032799256770390445u}
+};
+
+// Two-bit corrections for the inverse powers, sixteen packed into every 32-bit word
+// (read as (POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3).
+__constant__ uint32_t const POW5_INV_OFFSETS[19] = {
+  0x54544554, 0x04055545, 0x10041000, 0x00400414,
+  0x40010000, 0x41155555, 0x00000454, 0x00010044,
+  0x40000000, 0x44000041, 0x50454450, 0x55550054,
+  0x51655554, 0x40004000, 0x01000001, 0x00010500,
+  0x51515411, 0x05555554, 0x00000000
+};
+
+// Base multipliers for positive powers of 5; each row is a {low word, high word} pair.
+// double_computePow5 selects a row with i / POW5_TABLE_SIZE.
+__constant__ uint64_t const DOUBLE_POW5_SPLIT2[13][2] = {
+  {0u, 1152921504606846976u},
+  {0u, 1490116119384765625u},
+  {1032610780636961552u, 1925929944387235853u},
+  {7910200175544436838u, 1244603055572228341u},
+  {16941905809032713930u, 1608611746708759036u},
+  {13024893955298202172u, 2079081953128979843u},
+  {6607496772837067824u, 1343575221513417750u},
+  {17332926989895652603u, 1736530273035216783u},
+  {13037379183483547984u, 2244412773384604712u},
+  {1605989338741628675u, 1450417759929778918u},
+  {9630225068416591280u, 1874621017369538693u},
+  {665883850346957067u, 1211445438634777304u},
+  {14931890668723713708u, 1565756531257009982u}
+};
+
+// Two-bit corrections for the positive powers, packed the same way as POW5_INV_OFFSETS.
+__constant__ uint32_t const POW5_OFFSETS[21] = {
+  0x00000000, 0x00000000, 0x00000000, 0x00000000,
+  0x40000000, 0x59695995, 0x55545555, 0x56555515,
+  0x41150504, 0x40555410, 0x44555145, 0x44504540,
+  0x45555550, 0x40004000, 0x96440440, 0x55565565,
+  0x54454045, 0x40154151, 0x55559155, 0x51405555,
+  0x00000105
+};
+
+// Number of entries in DOUBLE_POW5_TABLE.
+constexpr uint32_t POW5_TABLE_SIZE = 26;
+
+// Exact values 5^0 .. 5^25 (5^26 = 1490116119384765625 is deliberately not stored).
+__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {
+  1ull,
+  5ull,
+  25ull,
+  125ull,
+  625ull,
+  3125ull,
+  15625ull,
+  78125ull,
+  390625ull,
+  1953125ull,
+  9765625ull,
+  48828125ull,
+  244140625ull,
+  1220703125ull,
+  6103515625ull,
+  30517578125ull,
+  152587890625ull,
+  762939453125ull,
+  3814697265625ull,
+  19073486328125ull,
+  95367431640625ull,
+  476837158203125ull,
+  2384185791015625ull,
+  11920928955078125ull,
+  59604644775390625ull,
+  298023223876953125ull
+};
+
+//===== common.h from ryu =====
+
+// Counts the decimal digits of v. The caller must pass a value below 10^9
+// (nine digits are enough for round-tripping a float); the result is capped at 9.
+__device__ inline uint32_t decimalLength9(uint32_t const v) {
+  assert(v < 1000000000);
+  uint32_t digits = 1;
+  // Walk the thresholds 10, 100, ..., 10^8; stop at the first one v does not reach.
+  for (uint32_t threshold = 10; digits < 9 && v >= threshold; threshold *= 10) {
+    ++digits;
+  }
+  return digits;
+}
+
+// Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528.
+__device__ inline int32_t pow5bits(int32_t const e) {
+  // The 32-bit product below overflows starting at e = 3529, which sets the upper bound.
+  // (Carried out in 64 bits the estimate would first be wrong at 5^4004, just above 2^9297.)
+  assert(e >= 0);
+  assert(e <= 3528);
+  uint32_t const scaled = static_cast<uint32_t>(e) * 1217359u;
+  return static_cast<int32_t>((scaled >> 19) + 1);
+}
+
+// Returns floor(log_10(2^e)); requires 0 <= e <= 1650.
+__device__ inline uint32_t log10Pow2(int32_t const e) {
+  // First input for which the estimate is wrong: 2^1651, just above 10^297.
+  assert(e >= 0);
+  assert(e <= 1650);
+  uint32_t const scaled = static_cast<uint32_t>(e) * 78913u;
+  return scaled >> 18;
+}
+
+// Returns floor(log_10(5^e)); requires 0 <= e <= 2620.
+__device__ inline uint32_t log10Pow5(int32_t const e) {
+  // First input for which the estimate is wrong: 5^2621, just above 10^1832.
+  assert(e >= 0);
+  assert(e <= 2620);
+  uint32_t const scaled = static_cast<uint32_t>(e) * 732923u;
+  return scaled >> 20;
+}
+
+// Returns how many times 5 divides value; value must be non-zero.
+__device__ inline uint32_t pow5factor_32(uint32_t value) {
+  uint32_t exponent = 0;
+  while (true) {
+    assert(value != 0);
+    if (value % 5 != 0) {
+      return exponent;
+    }
+    value /= 5;
+    ++exponent;
+  }
+}
+
+// Returns true if value is divisible by 5^p.
+__device__ inline bool multipleOfPowerOf5_32(uint32_t const value, uint32_t const p) {
+  return p <= pow5factor_32(value);
+}
+
+// Returns true if value is divisible by 2^p.
+__device__ inline bool multipleOfPowerOf2_32(uint32_t const value, uint32_t const p) {
+  // Divisible exactly when the p lowest bits are all clear.
+  uint32_t const low_mask = (1u << p) - 1;
+  return (value & low_mask) == 0;
+}
+
+// Returns (m * factor) >> shift using only 32x32 -> 64-bit multiplies; requires shift > 32
+// and a result that fits in 32 bits.
+__device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, int32_t const shift) {
+  assert(shift > 32);
+
+  // Multiply m by each 32-bit half of factor separately.
+  uint64_t const low_product  = static_cast<uint64_t>(m) * static_cast<uint32_t>(factor);
+  uint64_t const high_product = static_cast<uint64_t>(m) * static_cast<uint32_t>(factor >> 32);
+
+  // Drop the low 32 bits of the low product, add the high product, apply the remaining shift.
+  uint64_t const shifted = ((low_product >> 32) + high_product) >> (shift - 32);
+  assert(shifted <= UINT32_MAX);
+  return static_cast<uint32_t>(shifted);
+}
+
+// Writes the text of a special value into result and returns the number of chars written:
+// "NaN" when mantissa is set; otherwise an optional '-' followed by "Infinity" when exponent
+// is set, or by "0.0" when it is not. No terminating NUL is written.
+__device__ inline int copy_special_str(char * const result, bool const sign, bool const exponent, bool const mantissa) {
+  if (mantissa) {
+    memcpy(result, "NaN", 3);
+    return 3;
+  }
+  int const offset = sign ? 1 : 0;
+  if (sign) {
+    result[0] = '-';
+  }
+  if (!exponent) {
+    memcpy(result + offset, "0.0", 3);
+    return offset + 3;
+  }
+  memcpy(result + offset, "Infinity", 8);
+  return offset + 8;
+}
+
+// Returns the number of chars copy_special_str writes for the same arguments.
+__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa) {
+  if (mantissa) {
+    return 3;
+  }
+  int const body = exponent ? 8 : 3;
+  return sign + body;
+}
+
+// Returns the raw bit pattern of f.
+__device__ inline uint32_t float_to_bits(float const f) {
+  uint32_t raw = 0;
+  memcpy(&raw, &f, sizeof(float));
+  return raw;
+}
+
+// Returns the raw bit pattern of d.
+__device__ inline uint64_t double_to_bits(double const d) {
+  uint64_t raw = 0;
+  memcpy(&raw, &d, sizeof(double));
+  return raw;
+}
+
+//===== d2s_intrinsics.h from ryu =====
+
+// Full 64x64 -> 128-bit multiply: returns the low 64 bits and stores the high 64 bits
+// in *productHi.
+__device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t* const productHi) {
+  uint64_t const mask32 = 0xFFFFFFFFull;
+  uint64_t const a_lo = a & mask32;
+  uint64_t const a_hi = a >> 32;
+  uint64_t const b_lo = b & mask32;
+  uint64_t const b_hi = b >> 32;
+
+  // The four 32x32 partial products.
+  uint64_t const lo_lo = a_lo * b_lo;
+  uint64_t const lo_hi = a_lo * b_hi;
+  uint64_t const hi_lo = a_hi * b_lo;
+  uint64_t const hi_hi = a_hi * b_hi;
+
+  // Fold the two middle products in one after the other so neither sum can overflow 64 bits.
+  uint64_t const cross1 = hi_lo + (lo_lo >> 32);
+  uint64_t const cross2 = lo_hi + (cross1 & mask32);
+
+  *productHi = hi_hi + (cross1 >> 32) + (cross2 >> 32);
+  return (cross2 << 32) | (lo_lo & mask32);
+}
+
+// Returns the low 64 bits of the 128-bit value (hi:lo) shifted right by dist; requires
+// 0 < dist < 64.
+__device__ inline uint64_t shiftright128(uint64_t const lo, uint64_t const hi, uint32_t const dist) {
+  assert(dist < 64);
+  assert(dist > 0);
+  uint64_t const from_hi = hi << (64 - dist);
+  uint64_t const from_lo = lo >> dist;
+  return from_hi | from_lo;
+}
+
+__device__ inline uint64_t div5(uint64_t const x) {
+  uint64_t const quotient = x / 5;
+  return quotient;
+}
+
+__device__ inline uint64_t div10(uint64_t const x) {
+  uint64_t const quotient = x / 10;
+  return quotient;
+}
+
+__device__ inline uint64_t div100(uint64_t const x) {
+  uint64_t const quotient = x / 100;
+  return quotient;
+}
+
+// Returns how many times 5 divides value; value must be non-zero.
+__device__ inline uint32_t pow5Factor(uint64_t value) {
+  uint64_t const inverse_of_5   = 14757395258967641293u; // 5 * inverse_of_5 = 1 (mod 2^64)
+  uint64_t const max_quotient   = 3689348814741910323u;  // 2^64 / 5
+  uint32_t exponent = 0;
+  while (true) {
+    assert(value != 0);
+    // Multiplying by the modular inverse yields value / 5 exactly when 5 divides value;
+    // otherwise the product lands above 2^64 / 5.
+    value *= inverse_of_5;
+    if (value > max_quotient) {
+      return exponent;
+    }
+    ++exponent;
+  }
+}
+
+// Returns true if value is divisible by 5^p.
+__device__ inline bool multipleOfPowerOf5(uint64_t const value, uint32_t const p) {
+  return p <= pow5Factor(value);
+}
+
+// Returns true if value is divisible by 2^p.
+__device__ inline bool multipleOfPowerOf2(uint64_t const value, uint32_t const p) {
+  assert(value != 0);
+  assert(p < 64);
+  // Divisible exactly when the p lowest bits are all clear.
+  uint64_t const low_bits = value & ((1ull << p) - 1);
+  return low_bits == 0;
+}
+
+// Multiplies m by the 128-bit constant mul (mul[0] = low word, mul[1] = high word) and
+// returns the product shifted right by j. The lowest 64 bits of the product are never
+// materialised, so the final shift is by j - 64.
+__device__ inline uint64_t mulShift64(uint64_t const m, uint64_t const* const mul, int32_t const j) {
+  // m is maximum 55 bits
+  uint64_t upper_hi;                                       // product bits 128 and up
+  uint64_t const upper_lo = umul128(m, mul[1], &upper_hi); // product bits 64..127
+  uint64_t lower_hi;                                       // product bits 64..127
+  umul128(m, mul[0], &lower_hi);                           // low word of this product is discarded
+
+  // Add the two contributions to bits 64..127 and propagate the carry upward.
+  uint64_t const middle = lower_hi + upper_lo;
+  if (middle < lower_hi) {
+    upper_hi += 1;
+  }
+  return shiftright128(middle, upper_hi, j - 64);
+}
+
+// Applies mulShift64 to 4m, 4m + 2 and 4m - 1 - mmShift in one call: the first result is
+// returned, the second is stored in *vp and the third in *vm.
+__device__ inline uint64_t mulShiftAll64(uint64_t const m, uint64_t const* const mul, int32_t const j,
+  uint64_t* const vp, uint64_t* const vm, uint32_t const mmShift) {
+  uint64_t const centre = 4 * m;
+  *vp = mulShift64(centre + 2, mul, j);
+  *vm = mulShift64(centre - 1 - mmShift, mul, j);
+  return mulShift64(centre, mul, j);
+}
+
+//===== d2s_small_table.h from ryu =====
+
+// Computes 5^i in the form required by Ryu, and stores it in the given pointer.
+__device__ inline void double_computePow5(uint32_t const i, uint64_t* const result) { + uint32_t const base = i / POW5_TABLE_SIZE; + uint32_t const base2 = base * POW5_TABLE_SIZE; + uint32_t const offset = i - base2; + uint64_t const* const mul = DOUBLE_POW5_SPLIT2[base]; + if (offset == 0) { + result[0] = mul[0]; + result[1] = mul[1]; + return; + } + uint64_t const m = DOUBLE_POW5_TABLE[offset]; + uint64_t high1; + uint64_t const low1 = umul128(m, mul[1], &high1); + uint64_t high0; + uint64_t const low0 = umul128(m, mul[0], &high0); + uint64_t const sum = high0 + low1; + if (sum < high0) { + ++high1; // overflow into high1 + } + // high1 | sum | low0 + uint32_t const delta = pow5bits(i) - pow5bits(base2); + result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); + result[1] = shiftright128(sum, high1, delta); +} + +// Computes 5^-i in the form required by Ryu, and stores it in the given pointer. +__device__ inline void double_computeInvPow5(uint32_t const i, uint64_t* const result) { + uint32_t const base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE; + uint32_t const base2 = base * POW5_TABLE_SIZE; + uint32_t const offset = base2 - i; + uint64_t const* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2 + if (offset == 0) { + result[0] = mul[0]; + result[1] = mul[1]; + return; + } + uint64_t const m = DOUBLE_POW5_TABLE[offset]; + uint64_t high1; + uint64_t const low1 = umul128(m, mul[1], &high1); + uint64_t high0; + uint64_t const low0 = umul128(m, mul[0] - 1, &high0); + uint64_t const sum = high0 + low1; + if (sum < high0) { + ++high1; // overflow into high1 + } + // high1 | sum | low0 + uint32_t const delta = pow5bits(base2) - pow5bits(i); + result[0] = shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); + result[1] = shiftright128(sum, high1, delta); +} + +//===== f2s_intrinsics.h from ryu ===== + +__device__ inline uint32_t mulPow5InvDivPow2(uint32_t const m, uint32_t const q, 
int32_t const j) { + // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double lookup + // table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely on the + // fact that the added 1 that's already stored in the table never overflows into the upper 64 bits. + uint64_t pow5[2]; + double_computeInvPow5(q, pow5); + return mulShift32(m, pow5[1] + 1, j); +} + +__device__ inline uint32_t mulPow5divPow2(uint32_t const m, uint32_t const i, int32_t const j) { + uint64_t pow5[2]; + double_computePow5(i, pow5); + return mulShift32(m, pow5[1], j); +} + +//===== d2s.c and f2s.c from ryu ===== + +__device__ inline uint32_t decimalLength17(uint64_t const v) { + // This is slightly faster than a loop. + // The average output length is 16.38 digits, so we check high-to-low. + // Function precondition: v is not an 18, 19, or 20-digit number. + // (17 digits are sufficient for round-tripping.) + assert(v < 100000000000000000L); + if (v >= 10000000000000000L) { return 17; } + if (v >= 1000000000000000L) { return 16; } + if (v >= 100000000000000L) { return 15; } + if (v >= 10000000000000L) { return 14; } + if (v >= 1000000000000L) { return 13; } + if (v >= 100000000000L) { return 12; } + if (v >= 10000000000L) { return 11; } + if (v >= 1000000000L) { return 10; } + if (v >= 100000000L) { return 9; } + if (v >= 10000000L) { return 8; } + if (v >= 1000000L) { return 7; } + if (v >= 100000L) { return 6; } + if (v >= 10000L) { return 5; } + if (v >= 1000L) { return 4; } + if (v >= 100L) { return 3; } + if (v >= 10L) { return 2; } + return 1; +} + +__device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t const ieeeExponent) { + int32_t e2; + uint64_t m2; + if (ieeeExponent == 0) { + // We subtract 2 so that the bounds computation has 2 additional bits. 
+ e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + m2 = ieeeMantissa; + } else { + e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; + } + bool const even = (m2 & 1) == 0; + bool const acceptBounds = even; + + // Step 2: Determine the interval of valid decimal representations. + uint64_t const mv = 4 * m2; + // Implicit bool -> int conversion. True is 1, false is 0. + uint32_t const mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; + // We would compute mp and mm like this: + // uint64_t mp = 4 * m2 + 2; + // uint64_t mm = mv - 1 - mmShift; + + // Step 3: Convert to a decimal power base using 128-bit arithmetic. + uint64_t vr, vp, vm; + int32_t e10; + bool vmIsTrailingZeros = false; + bool vrIsTrailingZeros = false; + if (e2 >= 0) { + // I tried special-casing q == 0, but there was no effect on performance. + // This expression is slightly faster than max(0, log10Pow2(e2) - 1). + uint32_t const q = log10Pow2(e2) - (e2 > 3); + e10 = (int32_t) q; + int32_t const k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; + int32_t const i = -e2 + (int32_t) q + k; + uint64_t pow5[2]; + double_computeInvPow5(q, pow5); + vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift); + + if (q <= 21) { + // This should use q <= 22, but I think 21 is also safe. Smaller values + // may still be safe, but it's more difficult to reason about them. + // Only one of mp, mv, and mm can be a multiple of 5, if any. + uint32_t const mvMod5 = ((uint32_t) mv) - 5 * ((uint32_t) div5(mv)); + if (mvMod5 == 0) { + vrIsTrailingZeros = multipleOfPowerOf5(mv, q); + } else if (acceptBounds) { + // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q + // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q + // <=> true && pow5Factor(mm) >= q, since e2 >= q. + vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q); + } else { + // Same as min(e2 + 1, pow5Factor(mp)) >= q. 
+ vp -= multipleOfPowerOf5(mv + 2, q); + } + } + } else { + // This expression is slightly faster than max(0, log10Pow5(-e2) - 1). + uint32_t const q = log10Pow5(-e2) - (-e2 > 1); + e10 = (int32_t) q + e2; + int32_t const i = -e2 - (int32_t) q; + int32_t const k = pow5bits(i) - DOUBLE_POW5_BITCOUNT; + int32_t const j = (int32_t) q - k; + + uint64_t pow5[2]; + double_computePow5(i, pow5); + vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift); + + if (q <= 1) { + // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. + // mv = 4 * m2, so it always has at least two trailing 0 bits. + vrIsTrailingZeros = true; + if (acceptBounds) { + // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. + vmIsTrailingZeros = mmShift == 1; + } else { + // mp = mv + 2, so it always has at least one trailing 0 bit. + --vp; + } + } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here. + // We want to know if the full product has at least q trailing zeros. + // We need to compute min(p2(mv), p5(mv) - e2) >= q + // <=> p2(mv) >= q && p5(mv) - e2 >= q + // <=> p2(mv) >= q (because -e2 >= q) + vrIsTrailingZeros = multipleOfPowerOf2(mv, q); + } + } + + // Step 4: Find the shortest decimal representation in the interval of valid representations. + int32_t removed = 0; + uint8_t lastRemovedDigit = 0; + uint64_t output; + // On average, we remove ~2 digits. + if (vmIsTrailingZeros || vrIsTrailingZeros) { + // General case, which happens rarely (~0.7%). 
+ for (;;) { + uint64_t const vpDiv10 = div10(vp); + uint64_t const vmDiv10 = div10(vm); + if (vpDiv10 <= vmDiv10) { + break; + } + uint32_t const vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10); + uint64_t const vrDiv10 = div10(vr); + uint32_t const vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); + vmIsTrailingZeros &= vmMod10 == 0; + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) vrMod10; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + + if (vmIsTrailingZeros) { + for (;;) { + uint64_t const vmDiv10 = div10(vm); + uint32_t const vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10); + if (vmMod10 != 0) { + break; + } + uint64_t const vpDiv10 = div10(vp); + uint64_t const vrDiv10 = div10(vr); + uint32_t const vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) vrMod10; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + } + + if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { + // Round even if the exact number is .....50..0. + lastRemovedDigit = 4; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); + } else { + // Specialized for the common case (~99.3%). Percentages below are relative to this. + bool roundUp = false; + uint64_t const vpDiv100 = div100(vp); + uint64_t const vmDiv100 = div100(vm); + if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%). 
+ uint64_t const vrDiv100 = div100(vr); + uint32_t const vrMod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrDiv100); + roundUp = vrMod100 >= 50; + vr = vrDiv100; + vp = vpDiv100; + vm = vmDiv100; + removed += 2; + } + // Loop iterations below (approximately), without optimization above: + // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02% + // Loop iterations below (approximately), with optimization above: + // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02% + for (;;) { + uint64_t const vpDiv10 = div10(vp); + uint64_t const vmDiv10 = div10(vm); + if (vpDiv10 <= vmDiv10) { + break; + } + uint64_t const vrDiv10 = div10(vr); + uint32_t const vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); + roundUp = vrMod10 >= 5; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; + ++removed; + } + + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + (vr == vm || roundUp); + } + int32_t const exp = e10 + removed; + + floating_decimal_64 fd; + fd.exponent = exp; + fd.mantissa = output; + return fd; +} + +__device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t const ieeeExponent) { + int32_t e2; + uint32_t m2; + if (ieeeExponent == 0) { + // We subtract 2 so that the bounds computation has 2 additional bits. + e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + m2 = ieeeMantissa; + } else { + e2 = (int32_t) ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa; + } + bool const even = (m2 & 1) == 0; + bool const acceptBounds = even; + + // Step 2: Determine the interval of valid decimal representations. + uint32_t const mv = 4 * m2; + uint32_t const mp = 4 * m2 + 2; + // Implicit bool -> int conversion. True is 1, false is 0. + uint32_t const mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; + uint32_t const mm = 4 * m2 - 1 - mmShift; + + // Step 3: Convert to a decimal power base using 64-bit arithmetic. 
+ uint32_t vr, vp, vm; + int32_t e10; + bool vmIsTrailingZeros = false; + bool vrIsTrailingZeros = false; + uint8_t lastRemovedDigit = 0; + if (e2 >= 0) { + uint32_t const q = log10Pow2(e2); + e10 = (int32_t) q; + int32_t const k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; + int32_t const i = -e2 + (int32_t) q + k; + vr = mulPow5InvDivPow2(mv, q, i); + vp = mulPow5InvDivPow2(mp, q, i); + vm = mulPow5InvDivPow2(mm, q, i); + if (q != 0 && (vp - 1) / 10 <= vm / 10) { + // We need to know one removed digit even if we are not going to loop below. We could use + // q = X - 1 above, except that would require 33 bits for the result, and we've found that + // 32-bit arithmetic is faster even on 64-bit machines. + int32_t const l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) (q - 1)) - 1; + lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t) q - 1 + l) % 10); + } + if (q <= 9) { + // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well. + // Only one of mp, mv, and mm can be a multiple of 5, if any. + if (mv % 5 == 0) { + vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q); + } else if (acceptBounds) { + vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q); + } else { + vp -= multipleOfPowerOf5_32(mp, q); + } + } + } else { + uint32_t const q = log10Pow5(-e2); + e10 = (int32_t) q + e2; + int32_t const i = -e2 - (int32_t) q; + int32_t const k = pow5bits(i) - FLOAT_POW5_BITCOUNT; + int32_t j = (int32_t) q - k; + vr = mulPow5divPow2(mv, (uint32_t) i, j); + vp = mulPow5divPow2(mp, (uint32_t) i, j); + vm = mulPow5divPow2(mm, (uint32_t) i, j); + if (q != 0 && (vp - 1) / 10 <= vm / 10) { + j = (int32_t) q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); + lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, (uint32_t) (i + 1), j) % 10); + } + if (q <= 1) { + // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. + // mv = 4 * m2, so it always has at least two trailing 0 bits. 
+ vrIsTrailingZeros = true; + if (acceptBounds) { + // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. + vmIsTrailingZeros = mmShift == 1; + } else { + // mp = mv + 2, so it always has at least one trailing 0 bit. + --vp; + } + } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here. + vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1); + } + } + + // Step 4: Find the shortest decimal representation in the interval of valid representations. + int32_t removed = 0; + uint32_t output; + if (vmIsTrailingZeros || vrIsTrailingZeros) { + // General case, which happens rarely (~4.0%). + while (vp / 10 > vm / 10) { + vmIsTrailingZeros &= vm % 10 == 0; + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) (vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + if (vmIsTrailingZeros) { + while (vm % 10 == 0) { + vrIsTrailingZeros &= lastRemovedDigit == 0; + lastRemovedDigit = (uint8_t) (vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + } + if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { + // Round even if the exact number is .....50..0. + lastRemovedDigit = 4; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. + output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); + } else { + // Specialized for the common case (~96.0%). Percentages below are relative to this. + // Loop iterations below (approximately): + // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01% + while (vp / 10 > vm / 10) { + lastRemovedDigit = (uint8_t) (vr % 10); + vr /= 10; + vp /= 10; + vm /= 10; + ++removed; + } + // We need to take vr + 1 if vr is outside bounds or we need to round up. 
+ output = vr + (vr == vm || lastRemovedDigit >= 5); + } + int32_t const exp = e10 + removed; + + floating_decimal_32 fd; + fd.exponent = exp; + fd.mantissa = output; + return fd; +} + +__device__ inline int to_chars(floating_decimal_64 const v, bool const sign, char* const result) { + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { + result[index++] = '-'; + } + + uint64_t output = v.mantissa; + uint32_t const olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + // Values in the interval [1E-3, 1E7) are special. + if (scientificNotation) { + // Print in the format x.xxxxxE-yy. + for (uint32_t i = 0; i < olength - 1; ++i) { + uint32_t const c = output % 10; output /= 10; + result[index + olength - i] = (char) ('0' + c); + } + result[index] = '0' + output % 10; + result[index + 1] = '.'; + index += olength + 1; + if (olength == 1) { + result[index++] = '0'; + } + // Print 'E', the exponent sign, and the exponent, which has at most three digits. + result[index++] = 'E'; + if (exp < 0) { + result[index++] = '-'; + exp = -exp; + } + if (exp >= 100) { + result[index++] = (char) ('0' + exp / 100); + exp %= 100; + result[index++] = (char) ('0' + exp / 10); + } else if (exp >= 10) { + result[index++] = (char) ('0' + exp / 10); + } + result[index++] = (char) ('0' + exp % 10); + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + result[index++] = '0'; + result[index++] = '.'; + for (int i = -1; i > exp; i--) { + result[index++] = '0'; + } + int current = index; + for (int i = 0; i < olength; i++) { + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + index++; + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. 
+ for (int i = 0; i < olength; i++) { + result[index + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength; + for (int i = olength; i < exp + 1; i++) { + result[index++] = '0'; + } + result[index++] = '.'; + result[index++] = '0'; + } else { + // Decimal dot is somewhere between the digits. + int current = index + 1; + for (int i = 0; i < olength; i++) { + if (olength - i - 1 == exp) { + result[current + olength - i - 1] = '.'; + current--; + } + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength + 1; + } + } + return index; +} + +__device__ inline int d2s_size(floating_decimal_64 const v, bool const sign) { + int index = 0; + if (sign) { + index++; + } + + uint64_t output = v.mantissa; + uint32_t const olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + index += olength + 1; + if (olength == 1) { + index++; + } + // 'E' + index++; + if (exp < 0) { + exp = -exp; + index++; + } + if (exp >= 100) { + index += 3; + } else if (exp >= 10) { + index += 2; + } else { + index++; + } + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + index += 1 - exp + olength; + } else if (exp + 1 >= olength) { + index += exp + 3; + } else { + index += olength + 1; + } + } + return index; +} + +__device__ inline int to_chars(floating_decimal_32 const v, bool const sign, char* const result) { + // Step 5: Print the decimal representation. + int index = 0; + if (sign) { + result[index++] = '-'; + } + + uint32_t output = v.mantissa; + uint32_t const olength = decimalLength9(output); + int32_t exp = v.exponent + olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + // Print in the format x.xxxxxE-yy. 
+ for (int i = 0; i < olength - 1; i++) { + int c = output % 10; output /= 10; + result[index + olength - i] = (char) ('0' + c); + } + result[index] = (char) ('0' + output % 10); + result[index + 1] = '.'; + index += olength + 1; + if (olength == 1) { + result[index++] = '0'; + } + + // Print 'E', the exponent sign, and the exponent, which has at most two digits. + result[index++] = 'E'; + if (exp < 0) { + result[index++] = '-'; + exp = -exp; + } + if (exp >= 10) { + result[index++] = (char) ('0' + exp / 10); + } + result[index++] = (char) ('0' + exp % 10); + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + result[index++] = '0'; + result[index++] = '.'; + for (int i = -1; i > exp; i--) { + result[index++] = '0'; + } + int current = index; + for (int i = 0; i < olength; i++) { + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + index++; + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + for (int i = 0; i < olength; i++) { + result[index + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength; + for (int i = olength; i < exp + 1; i++) { + result[index++] = '0'; + } + result[index++] = '.'; + result[index++] = '0'; + } else { + // Decimal dot is somewhere between the digits. + int current = index + 1; + for (int i = 0; i < olength; i++) { + if (olength - i - 1 == exp) { + result[current + olength - i - 1] = '.'; + current--; + } + result[current + olength - i - 1] = (char) ('0' + output % 10); + output /= 10; + } + index += olength + 1; + } + } + return index; +} + +__device__ inline int f2s_size(floating_decimal_32 const v, bool const sign) { + // Step 5: Print the decimal representation. 
+ int index = 0; + if (sign) { + index++; + } + + uint32_t output = v.mantissa; + uint32_t const olength = decimalLength9(output); + int32_t exp = v.exponent + olength - 1; + bool scientificNotation = (exp < -3) || (exp >= 7); + + if (scientificNotation) { + index += olength + 1; + if (olength == 1) { + index++; + } + // 'E' + index++; + if (exp < 0) { + index++; + exp = -exp; + } + if (exp >= 10) { + index++; + } + index++; + } else { + // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). + if (exp < 0) { + // Decimal dot is before any of the digits. + index += 1 - exp + olength; + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. + index += exp + 3; + } else { + // Decimal dot is somewhere between the digits. + index += olength + 1; + } + } + return index; +} + +__device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, uint32_t const ieeeExponent, + floating_decimal_64* const v) { + uint64_t const m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; + int32_t const e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; + + if (e2 > 0) { + // f = m2 * 2^e2 >= 2^53 is an integer. + // Ignore this case for now. + return false; + } + + if (e2 < -52) { + // f < 1. + return false; + } + + // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53. + // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0. + uint64_t const mask = (1ull << -e2) - 1; + uint64_t const fraction = m2 & mask; + if (fraction != 0) { + return false; + } + + // f is an integer in the range [1, 2^53). + // Note: mantissa might contain trailing (decimal) 0's. + // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17(). + v->mantissa = m2 >> -e2; + v->exponent = 0; + return true; +} + +__device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) { + // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. 
+ uint64_t const bits = double_to_bits(f); + + // Decode bits into sign, mantissa, and exponent. + ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; + uint64_t const ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1); + uint32_t const ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); + // Case distinction; exit early for the easy cases. + if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { + special = true; + return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent}; + } + special = false; + floating_decimal_64 v; + bool const isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v); + if (isSmallInt) { + // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros. + // For scientific notation we need to move these zeros into the exponent. + // (This is not needed for fixed-point notation, so it might be beneficial to trim + // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.) + for (;;) { + uint64_t const q = div10(v.mantissa); + uint32_t const r = ((uint32_t) v.mantissa) - 10 * ((uint32_t) q); + if (r != 0) { + break; + } + v.mantissa = q; + ++v.exponent; + } + } else { + v = d2d(ieeeMantissa, ieeeExponent); + } + return v; +} + +__device__ int d2s_buffered_n(double f, char* result) { + bool sign = false, special = false; + floating_decimal_64 v = d2d(f, sign, special); + if (special) { + return copy_special_str(result, sign, v.exponent, v.mantissa); + } + return to_chars(v, sign, result); +} + +__device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) { + // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. + uint32_t const bits = float_to_bits(f); + + // Decode bits into sign, mantissa, and exponent. 
+ ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0; + uint32_t const ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1); + uint32_t const ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1); + + // Case distinction; exit early for the easy cases. + if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { + special = true; + return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent}; + } + special = false; + return f2d(ieeeMantissa, ieeeExponent); +} + +__device__ int f2s_buffered_n(float f, char* result) { + bool sign = false, special = false; + floating_decimal_32 v = f2d(f, sign, special); + if (special) { + return copy_special_str(result, sign, v.exponent, v.mantissa); + } + return to_chars(v, sign, result); +} + + +//===== compute float to string size ===== + +__device__ int compute_d2s_size(double value) { + bool sign = false, special = false; + floating_decimal_64 v = d2d(value, sign, special); + if (special) { + return special_str_size(sign, v.exponent, v.mantissa); + } + return d2s_size(v, sign); +} + +__device__ int compute_f2s_size(float value) { + bool sign = false, special = false; + floating_decimal_32 v = f2d(value, sign, special); + if (special) { + return special_str_size(sign, v.exponent, v.mantissa); + } + return f2s_size(v, sign); +} + +} // namespace + +//===== APIs ===== + +__device__ int compute_ftos_size(double value, bool is_float) { + if (is_float) { + return compute_f2s_size(value); + } else { + return compute_d2s_size(value); + } +} + +__device__ int float_to_string(double value, bool is_float, char* output) { + if (is_float) { + return f2s_buffered_n(value, output); + } else { + return d2s_buffered_n(value, output); + } +} + +} // namespace spark-rapids-jni::ftos_converter diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp index d75741b8a0..0d4f62ac1b 100644 --- 
a/src/main/cpp/tests/cast_float_to_string.cpp +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include "cast_string.hpp" #include #include @@ -29,7 +29,7 @@ using namespace cudf; -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; +constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; struct FloatToStringTests : public cudf::test::BaseFixture {}; From 131e48c3f49d84e8c5ad18d59e5f9ad56b04d86e Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Thu, 23 Nov 2023 16:26:12 +0800 Subject: [PATCH 34/54] cudf conflict Signed-off-by: Haoyang Li --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 168533a8ad..823d3214a9 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 168533a8ad4086bd020be4f7bf9264a08b6d2243 +Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 From 3c09c49a175df5b89620126b47b85f23014d12a0 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Thu, 23 Nov 2023 16:28:20 +0800 Subject: [PATCH 35/54] Update src/main/cpp/src/cast_float_to_string.cu Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- src/main/cpp/src/cast_float_to_string.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index e22947ab9e..c41936c3bb 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -93,7 +93,7 @@ struct dispatch_float_to_string_fn { std::move(offsets), std::move(chars), floats.null_count(), - std::move(cudf::detail::copy_bitmask(floats, stream, mr))); + cudf::detail::copy_bitmask(floats, stream, mr)); } // non-float types throw an exception From 346c1f7e599f9f2c7c6041908526d2fde095bda2 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Thu, 23 Nov 2023 
17:17:14 +0800 Subject: [PATCH 36/54] Make it runable again Signed-off-by: Haoyang Li --- src/main/cpp/src/format_float.cu | 140 ++++++++++ src/main/cpp/src/ftos_converter.cuh | 382 +++++++++++++++++++++++++++- 2 files changed, 517 insertions(+), 5 deletions(-) diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu index e69de29bb2..24fd39367b 100644 --- a/src/main/cpp/src/format_float.cu +++ b/src/main/cpp/src/format_float.cu @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cast_string.hpp" +#include "ftos_converter.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace spark_rapids_jni { + +namespace detail { +namespace { + +template +struct format_float_fn { + cudf::column_device_view d_floats; + int digits; + cudf::size_type* d_offsets; + char* d_chars; + + __device__ cudf::size_type compute_output_size(FloatType value, int digits) + { + bool is_float = std::is_same_v; + return static_cast(ftos_converter::compute_format_float_size(static_cast(value), digits, is_float)); + } + + __device__ void format_float(cudf::size_type idx, int digits) + { + FloatType value = d_floats.element(idx); + bool is_float = std::is_same_v; + ftos_converter::format_float(static_cast(value), digits, d_chars + d_offsets[idx], is_float); + } + + __device__ void operator()(cudf::size_type idx) + { + if (d_floats.is_null(idx)) { + if (d_chars == nullptr) { d_offsets[idx] = 0; } + return; + } + if (d_chars != nullptr) { + format_float(idx, digits); + } else { + d_offsets[idx] = compute_output_size(d_floats.element(idx), digits); + } + } +}; + +/** + * @brief This dispatch method is for converting floats into strings. + * + * The template function declaration ensures only float types are allowed. 
+ */ +struct dispatch_format_float_fn { + template >* = nullptr> + std::unique_ptr operator()(cudf::column_view const& floats, + int digits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + cudf::size_type strings_count = floats.size(); + auto column = cudf::column_device_view::create(floats, stream); + auto d_column = *column; + + // copy the null mask + rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr); + + auto [offsets, chars] = + cudf::strings::detail::make_strings_children(format_float_fn{d_column, digits}, strings_count, stream, mr); + + return cudf::make_strings_column(strings_count, + std::move(offsets), + std::move(chars), + floats.null_count(), + std::move(null_mask)); + } + + // non-float types throw an exception + template >* = nullptr> + std::unique_ptr operator()(cudf::column_view const&, + int, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const + { + CUDF_FAIL("Values for format_float function must be a float type."); + } +}; + +} // namespace + +// This will convert all float column types into a strings column. 
+std::unique_ptr format_float(cudf::column_view const& floats, + int digits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + cudf::size_type strings_count = floats.size(); + if (strings_count == 0) return cudf::make_empty_column(cudf::type_id::STRING); + + return type_dispatcher(floats.type(), dispatch_format_float_fn{}, floats, digits, stream, mr); +} + +} // namespace detail + +// external API +std::unique_ptr format_float(cudf::column_view const& floats, + int digits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::format_float(floats, digits, stream, mr); +} + +} // namespace spark_rapids_jni \ No newline at end of file diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index b9905ae567..50c64996be 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -211,7 +211,7 @@ __device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, i } -__device__ inline int copy_special_str(char * const result, bool const sign, bool const exponent, bool const mantissa) { +__device__ inline int copy_special_str(char * const result, bool const sign, bool const exponent, bool const mantissa, int const d = 1) { if (mantissa) { memcpy(result, "NaN", 3); return 3; @@ -223,18 +223,29 @@ __device__ inline int copy_special_str(char * const result, bool const sign, boo memcpy(result + sign, "Infinity", 8); return sign + 8; } - memcpy(result + sign, "0.0", 3); - return sign + 3; + result[sign] = '0'; + if (d == 0) { + return sign + 1; + } else { + result[sign + 1] = '.'; + } + for (int i = 0; i < d; i++) { + result[sign + 2 + i] = '0'; + } + return sign + 2 + d; } -__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa) { +__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa, int const d=1) { if (mantissa) { return 3; } if 
(exponent) { return sign + 8; } - return sign + 3; + if (d == 0) { + return sign + 1; + } + return sign + 2 + d; } __device__ inline uint32_t float_to_bits(float const f) { @@ -1153,4 +1164,365 @@ __device__ int float_to_string(double value, bool is_float, char* output) { } } +//===== format float ===== + +__constant__ +uint64_t const POW10_TABLE[19] = { +1ull, 10ull, 100ull, 1000ull, 10000ull, 100000ull, 1000000ull, 10000000ull, +100000000ull, 1000000000ull, 10000000000ull, 100000000000ull, 1000000000000ull, +10000000000000ull, 100000000000000ull, 1000000000000000ull, 10000000000000000ull, +100000000000000000ull +}; + +template +__device__ inline T round_half_even(T const input, int const olength, int const d) { + // "round" a integer to d digits, with the half-even rounding mode. + if (d > olength) { + T num = input; + for (int i = 0; i < d - olength; i++) { + num *= 10; + } + return num; + } + T div = POW10_TABLE[olength - d]; + T mod = input % div; + T num = input / div; + if (mod > (div / 2) || ((mod == (div / 2) && (num % 2 == 1) && mod != 0))) { + num++; + } + return num; +} + +__device__ inline int to_formated_chars(floating_decimal_64 const v, bool const sign, char* const result, int d) { + int index = 0; + if (sign) { + result[index++] = '-'; + } + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + if (exp < 0) { + // Decimal dot is before any of the digits. 
+ int index_for_carrier = index; + result[index++] = '0'; + if (d == 0) { + return index; + } + result[index++] = '.'; + int actural_round = d; + for (int i = -1; i > exp; i--) { + index_for_carrier = index; + result[index++] = '0'; + actural_round--; + if (actural_round == 0) { + if (i != exp + 1) { + return index; + } // else, possible carry + break; + } + } + int actural_olength = fmin(int(olength), actural_round); + uint64_t rounded_output = round_half_even(output, olength, actural_round); + // check if carry + if (rounded_output >= POW10_TABLE[actural_olength]) { + result[index_for_carrier] = '1'; + rounded_output -= POW10_TABLE[actural_olength]; + } + int current = index; + for (int i = 0; i < actural_olength; i++) { + result[current + actural_olength - i - 1] = (char) ('0' + rounded_output % 10); + rounded_output /= 10; + index++; + } + actural_round -= actural_olength; + if (actural_round > 0) { + for (int i = 0; i < actural_round; i++) { + result[index++] = '0'; + } + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. 
+ int integer_len = index + exp + 1 + exp / 3; + int sep_cnt = 0; + int rev_index = 0; + for (int i = olength; i < exp + 1; i++) { + result[integer_len - (rev_index++) - 1] = '0'; + sep_cnt++; + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + } + for (int i = 0; i < olength; i++) { + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[integer_len - (rev_index++) - 1] = (char) ('0' + output % 10); + sep_cnt++; + output /= 10; + } + index = integer_len; + if (d == 0) { + return index; + } + result[index++] = '.'; + for (int i = 0; i < d; i++) { + result[index++] = '0'; + } + } else { + uint32_t temp_d = d, tailing_zero = 0; + if (exp + d > olength) { + temp_d = olength - exp; + tailing_zero = d - temp_d; + } + uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1); + uint64_t pow10 = POW10_TABLE[temp_d]; + uint64_t integer = rounded_output / pow10; + uint64_t decimal = rounded_output % pow10; + // calculate integer length after format to cover carry case + uint32_t integer_len = decimalLength17(integer); + uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; + uint32_t sep_cnt = 0; + int rev_index = 0; + for (int i = 0; i < integer_len; i++) { + if (sep_cnt == 3) { + result[formated_integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[formated_integer_len - (rev_index++) - 1] = (char) ('0' + integer % 10); + sep_cnt++; + integer /= 10; + } + index = formated_integer_len; + if (d == 0) { + return index; + } + result[index++] = '.'; + int current = index; + for (int i = 0; i < tailing_zero; i++) { + result[current + d - i - 1] = '0'; + index++; + } + for (int i = tailing_zero; i < d; i++) { + result[current + d - i - 1] = (char) ('0' + decimal % 10); + decimal /= 10; + index++; + } + } + return index; +} + +__device__ inline int format_float_size(floating_decimal_64 const v, bool const sign, int d) { + int index = 0; + if 
(sign) { + index++; + } + uint64_t output = v.mantissa; + const uint32_t olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + if (exp < 0) { + index += 2 + d; + } else if (exp + 1 >= olength) { + index += exp + 1 + exp / 3 + 1 + d; + } else { + uint32_t temp_d = d; + if (exp + d > olength) { + temp_d = olength - exp; + } + uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1); + uint64_t pow10 = POW10_TABLE[temp_d]; + uint64_t integer = rounded_output / pow10; + uint32_t integer_len = decimalLength17(integer); + index += integer_len + (integer_len - 1) / 3 + 1 + d; + } + if (d == 0) { + index--; + } + return index; +} + +__device__ inline int to_formated_chars(floating_decimal_32 const v, bool const sign, char* const result, int d) { + int index = 0; + if (sign) { + result[index++] = '-'; + } + uint32_t output = v.mantissa; + uint32_t const olength = decimalLength9(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + if (exp < 0) { + // Decimal dot is before any of the digits. 
+ int index_for_carrier = index; + result[index++] = '0'; + if (d == 0) { + return index; + } + result[index++] = '.'; + int actural_round = d; + for (int i = -1; i > exp; i--) { + index_for_carrier = index; + result[index++] = '0'; + actural_round--; + if (actural_round == 0) { + if (i != exp + 1) { + return index; + } // else, possible carry + break; + } + } + int actural_olength = fmin(int(olength), actural_round); + uint64_t rounded_output = round_half_even(output, olength, actural_round); + // check if carry + if (rounded_output >= POW10_TABLE[actural_olength]) { + result[index_for_carrier] = '1'; + rounded_output -= POW10_TABLE[actural_olength]; + } + int current = index; + for (int i = 0; i < actural_olength; i++) { + result[current + actural_olength - i - 1] = (char) ('0' + rounded_output % 10); + rounded_output /= 10; + index++; + } + actural_round -= actural_olength; + if (actural_round > 0) { + for (int i = 0; i < actural_round; i++) { + result[index++] = '0'; + } + } + } else if (exp + 1 >= olength) { + // Decimal dot is after any of the digits. 
+ int integer_len = index + exp + 1 + exp / 3; + int sep_cnt = 0; + int rev_index = 0; + for (int i = olength; i < exp + 1; i++) { + result[integer_len - (rev_index++) - 1] = '0'; + sep_cnt++; + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + } + for (int i = 0; i < olength; i++) { + if (sep_cnt == 3) { + result[integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[integer_len - (rev_index++) - 1] = (char) ('0' + output % 10); + sep_cnt++; + output /= 10; + } + index = integer_len; + if (d == 0) { + return index; + } + result[index++] = '.'; + for (int i = 0; i < d; i++) { + result[index++] = '0'; + } + } else { + uint32_t temp_d = d, tailing_zero = 0; + if (exp + d > olength) { + temp_d = olength - exp; + tailing_zero = d - temp_d; + } + uint32_t rounded_output = round_half_even(output, olength, exp+temp_d+1); + uint32_t pow10 = POW10_TABLE[temp_d]; + uint32_t integer = rounded_output / pow10; + uint32_t decimal = rounded_output % pow10; + // calculate integer length after format to cover carry case + uint32_t integer_len = decimalLength9(integer); + uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; + uint32_t sep_cnt = 0; + int rev_index = 0; + for (int i = 0; i < integer_len; i++) { + if (sep_cnt == 3) { + result[formated_integer_len - (rev_index++) - 1] = ','; + sep_cnt = 0; + } + result[formated_integer_len - (rev_index++) - 1] = (char) ('0' + integer % 10); + sep_cnt++; + integer /= 10; + } + index = formated_integer_len; + if (d == 0) { + return index; + } + result[index++] = '.'; + int current = index; + for (int i = 0; i < tailing_zero; i++) { + result[current + d - i - 1] = '0'; + index++; + } + for (int i = tailing_zero; i < d; i++) { + result[current + d - i - 1] = (char) ('0' + decimal % 10); + decimal /= 10; + index++; + } + } + return index; +} + +__device__ inline int format_float_size(floating_decimal_32 const v, bool const sign, int d) { + int index = 0; + if (sign) 
{ + index++; + } + uint64_t output = v.mantissa; + uint32_t const olength = decimalLength9(output); + int32_t exp = v.exponent + (int32_t) olength - 1; + if (exp < 0) { + index += 2 + d; + } else if (exp + 1 >= olength) { + index += exp + 1 + exp / 3 + 1 + d; + } else { + uint32_t temp_d = d; + if (exp + d > olength) { + temp_d = olength - exp; + } + uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1); + uint64_t pow10 = POW10_TABLE[temp_d]; + uint64_t integer = rounded_output / pow10; + uint32_t integer_len = decimalLength9(integer); + index += integer_len + (integer_len - 1) / 3 + 1 + d; + } + if (d == 0) { + index--; + } + return index; +} + +__device__ int compute_format_float_size(double value, int d, bool is_float) { + bool sign = false, special = false; + if (is_float) { + floating_decimal_32 v = f2d(value, sign, special); + if (special) { + return special_str_size(sign, v.exponent, v.mantissa, d); + } + return format_float_size(v, sign, d); + } else { + floating_decimal_64 v = d2d(value, sign, special); + if (special) { + return special_str_size(sign, v.exponent, v.mantissa, d); + } + return format_float_size(v, sign, d); + } +} + +__device__ int format_float(double value, int d, char* output, bool is_float) { + bool sign = false, special = false; + if (is_float) { + floating_decimal_32 v = f2d(value, sign, special); + if (special) { + return copy_special_str(output, sign, v.exponent, v.mantissa, d); + } + return to_formated_chars(v, sign, output, d); + } else { + floating_decimal_64 v = d2d(value, sign, special); + if (special) { + return copy_special_str(output, sign, v.exponent, v.mantissa, d); + } + return to_formated_chars(v, sign, output, d); + } +} + } // namespace spark-rapids-jni::ftos_converter From 98918ce1a8a8bb45ebd4be0e3634144bb8200368 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Thu, 23 Nov 2023 17:56:27 +0800 Subject: [PATCH 37/54] address some comments Signed-off-by: Haoyang Li --- src/main/cpp/src/CastStringJni.cpp 
| 4 +- src/main/cpp/src/cast_string.hpp | 2 +- src/main/cpp/src/format_float.cu | 36 +++--- src/main/cpp/src/ftos_converter.cuh | 120 +++++++++--------- src/main/cpp/tests/cast_decimal_to_string.cpp | 3 +- src/main/cpp/tests/cast_string.cpp | 3 +- src/main/cpp/tests/format_float.cpp | 3 +- .../nvidia/spark/rapids/jni/CastStrings.java | 7 +- 8 files changed, 91 insertions(+), 87 deletions(-) diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index 17bfeaf50b..063fabe222 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -110,7 +110,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toFloat( } JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_formatFloat( - JNIEnv* env, jclass, jlong input_column, jint d, jint j_dtype) + JNIEnv* env, jclass, jlong input_column, jint digits, jint j_dtype) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); @@ -119,7 +119,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_formatFloat cudf::column_view cv{*reinterpret_cast(input_column)}; return cudf::jni::release_as_jlong( - spark_rapids_jni::format_float(cv, d, cudf::get_default_stream())); + spark_rapids_jni::format_float(cv, digits, cudf::get_default_stream())); } CATCH_CAST_EXCEPTION(env, 0); } diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp index e194919ffb..5c370f9185 100644 --- a/src/main/cpp/src/cast_string.hpp +++ b/src/main/cpp/src/cast_string.hpp @@ -117,7 +117,7 @@ std::unique_ptr string_to_float( std::unique_ptr format_float( cudf::column_view const& input, - int d, + int digits, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu index 24fd39367b..3d3fedbfb9 100644 --- a/src/main/cpp/src/format_float.cu +++ b/src/main/cpp/src/format_float.cu @@ -29,6 
+29,7 @@ #include #include #include + #include #include @@ -40,24 +41,25 @@ namespace { template struct format_float_fn { cudf::column_device_view d_floats; - int digits; + int const digits; cudf::size_type* d_offsets; char* d_chars; - __device__ cudf::size_type compute_output_size(FloatType value, int digits) + __device__ cudf::size_type compute_output_size(FloatType value, int digits) const { - bool is_float = std::is_same_v; + bool constexpr is_float = std::is_same_v; return static_cast(ftos_converter::compute_format_float_size(static_cast(value), digits, is_float)); } - __device__ void format_float(cudf::size_type idx, int digits) + __device__ void format_float(cudf::size_type idx, int digits) const { - FloatType value = d_floats.element(idx); - bool is_float = std::is_same_v; - ftos_converter::format_float(static_cast(value), digits, d_chars + d_offsets[idx], is_float); + auto const value = d_floats.element(idx); + bool constexpr is_float = std::is_same_v; + auto const output = d_chars + d_offsets[idx]; + ftos_converter::format_float(static_cast(value), digits, is_float, output); } - __device__ void operator()(cudf::size_type idx) + __device__ void operator()(cudf::size_type idx) const { if (d_floats.is_null(idx)) { if (d_chars == nullptr) { d_offsets[idx] = 0; } @@ -83,21 +85,17 @@ struct dispatch_format_float_fn { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - cudf::size_type strings_count = floats.size(); - auto column = cudf::column_device_view::create(floats, stream); - auto d_column = *column; - - // copy the null mask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr); + auto const strings_count = floats.size(); + auto const input_ptr = cudf::column_device_view::create(floats, stream); auto [offsets, chars] = - cudf::strings::detail::make_strings_children(format_float_fn{d_column, digits}, strings_count, stream, mr); + cudf::strings::detail::make_strings_children(format_float_fn{*input_ptr, 
digits}, strings_count, stream, mr); return cudf::make_strings_column(strings_count, std::move(offsets), std::move(chars), floats.null_count(), - std::move(null_mask)); + cudf::detail::copy_bitmask(floats, stream, mr)); } // non-float types throw an exception @@ -119,8 +117,10 @@ std::unique_ptr format_float(cudf::column_view const& floats, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - cudf::size_type strings_count = floats.size(); - if (strings_count == 0) return cudf::make_empty_column(cudf::type_id::STRING); + auto const strings_count = floats.size(); + if (strings_count == 0) { + return cudf::make_empty_column(cudf::type_id::STRING); + } return type_dispatcher(floats.type(), dispatch_format_float_fn{}, floats, digits, stream, mr); } diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index 50c64996be..a3056d7e46 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -211,7 +211,7 @@ __device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, i } -__device__ inline int copy_special_str(char * const result, bool const sign, bool const exponent, bool const mantissa, int const d = 1) { +__device__ inline int copy_special_str(char * const result, bool const sign, bool const exponent, bool const mantissa, int const digits = 1) { if (mantissa) { memcpy(result, "NaN", 3); return 3; @@ -224,28 +224,28 @@ __device__ inline int copy_special_str(char * const result, bool const sign, boo return sign + 8; } result[sign] = '0'; - if (d == 0) { + if (digits == 0) { return sign + 1; } else { result[sign + 1] = '.'; } - for (int i = 0; i < d; i++) { + for (int i = 0; i < digits; i++) { result[sign + 2 + i] = '0'; } - return sign + 2 + d; + return sign + 2 + digits; } -__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa, int const d=1) { +__device__ inline int special_str_size(bool const sign, bool const exponent, bool 
const mantissa, int const digits = 1) { if (mantissa) { return 3; } if (exponent) { return sign + 8; } - if (d == 0) { + if (digits == 0) { return sign + 1; } - return sign + 2 + d; + return sign + 2 + digits; } __device__ inline uint32_t float_to_bits(float const f) { @@ -1175,16 +1175,16 @@ uint64_t const POW10_TABLE[19] = { }; template -__device__ inline T round_half_even(T const input, int const olength, int const d) { - // "round" a integer to d digits, with the half-even rounding mode. - if (d > olength) { +__device__ inline T round_half_even(T const input, int const olength, int const digits) { + // "round" a integer to digits digits, with the half-even rounding mode. + if (digits > olength) { T num = input; - for (int i = 0; i < d - olength; i++) { + for (int i = 0; i < digits - olength; i++) { num *= 10; } return num; } - T div = POW10_TABLE[olength - d]; + T div = POW10_TABLE[olength - digits]; T mod = input % div; T num = input / div; if (mod > (div / 2) || ((mod == (div / 2) && (num % 2 == 1) && mod != 0))) { @@ -1193,7 +1193,7 @@ __device__ inline T round_half_even(T const input, int const olength, int const return num; } -__device__ inline int to_formated_chars(floating_decimal_64 const v, bool const sign, char* const result, int d) { +__device__ inline int to_formated_chars(floating_decimal_64 const v, bool const sign, char* const result, int digits) { int index = 0; if (sign) { result[index++] = '-'; @@ -1205,11 +1205,11 @@ __device__ inline int to_formated_chars(floating_decimal_64 const v, bool const // Decimal dot is before any of the digits. 
int index_for_carrier = index; result[index++] = '0'; - if (d == 0) { + if (digits == 0) { return index; } result[index++] = '.'; - int actural_round = d; + int actural_round = digits; for (int i = -1; i > exp; i--) { index_for_carrier = index; result[index++] = '0'; @@ -1263,18 +1263,18 @@ __device__ inline int to_formated_chars(floating_decimal_64 const v, bool const output /= 10; } index = integer_len; - if (d == 0) { + if (digits == 0) { return index; } result[index++] = '.'; - for (int i = 0; i < d; i++) { + for (int i = 0; i < digits; i++) { result[index++] = '0'; } } else { - uint32_t temp_d = d, tailing_zero = 0; - if (exp + d > olength) { + uint32_t temp_d = digits, tailing_zero = 0; + if (exp + digits > olength) { temp_d = olength - exp; - tailing_zero = d - temp_d; + tailing_zero = digits - temp_d; } uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1); uint64_t pow10 = POW10_TABLE[temp_d]; @@ -1295,17 +1295,17 @@ __device__ inline int to_formated_chars(floating_decimal_64 const v, bool const integer /= 10; } index = formated_integer_len; - if (d == 0) { + if (digits == 0) { return index; } result[index++] = '.'; int current = index; for (int i = 0; i < tailing_zero; i++) { - result[current + d - i - 1] = '0'; + result[current + digits - i - 1] = '0'; index++; } - for (int i = tailing_zero; i < d; i++) { - result[current + d - i - 1] = (char) ('0' + decimal % 10); + for (int i = tailing_zero; i < digits; i++) { + result[current + digits - i - 1] = (char) ('0' + decimal % 10); decimal /= 10; index++; } @@ -1313,7 +1313,7 @@ __device__ inline int to_formated_chars(floating_decimal_64 const v, bool const return index; } -__device__ inline int format_float_size(floating_decimal_64 const v, bool const sign, int d) { +__device__ inline int format_float_size(floating_decimal_64 const v, bool const sign, int digits) { int index = 0; if (sign) { index++; @@ -1322,27 +1322,27 @@ __device__ inline int format_float_size(floating_decimal_64 const 
v, bool const const uint32_t olength = decimalLength17(output); int32_t exp = v.exponent + (int32_t) olength - 1; if (exp < 0) { - index += 2 + d; + index += 2 + digits; } else if (exp + 1 >= olength) { - index += exp + 1 + exp / 3 + 1 + d; + index += exp + 1 + exp / 3 + 1 + digits; } else { - uint32_t temp_d = d; - if (exp + d > olength) { + uint32_t temp_d = digits; + if (exp + digits > olength) { temp_d = olength - exp; } uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1); uint64_t pow10 = POW10_TABLE[temp_d]; uint64_t integer = rounded_output / pow10; uint32_t integer_len = decimalLength17(integer); - index += integer_len + (integer_len - 1) / 3 + 1 + d; + index += integer_len + (integer_len - 1) / 3 + 1 + digits; } - if (d == 0) { + if (digits == 0) { index--; } return index; } -__device__ inline int to_formated_chars(floating_decimal_32 const v, bool const sign, char* const result, int d) { +__device__ inline int to_formated_chars(floating_decimal_32 const v, bool const sign, char* const result, int digits) { int index = 0; if (sign) { result[index++] = '-'; @@ -1354,11 +1354,11 @@ __device__ inline int to_formated_chars(floating_decimal_32 const v, bool const // Decimal dot is before any of the digits. 
int index_for_carrier = index; result[index++] = '0'; - if (d == 0) { + if (digits == 0) { return index; } result[index++] = '.'; - int actural_round = d; + int actural_round = digits; for (int i = -1; i > exp; i--) { index_for_carrier = index; result[index++] = '0'; @@ -1412,18 +1412,18 @@ __device__ inline int to_formated_chars(floating_decimal_32 const v, bool const output /= 10; } index = integer_len; - if (d == 0) { + if (digits == 0) { return index; } result[index++] = '.'; - for (int i = 0; i < d; i++) { + for (int i = 0; i < digits; i++) { result[index++] = '0'; } } else { - uint32_t temp_d = d, tailing_zero = 0; - if (exp + d > olength) { + uint32_t temp_d = digits, tailing_zero = 0; + if (exp + digits > olength) { temp_d = olength - exp; - tailing_zero = d - temp_d; + tailing_zero = digits - temp_d; } uint32_t rounded_output = round_half_even(output, olength, exp+temp_d+1); uint32_t pow10 = POW10_TABLE[temp_d]; @@ -1444,17 +1444,17 @@ __device__ inline int to_formated_chars(floating_decimal_32 const v, bool const integer /= 10; } index = formated_integer_len; - if (d == 0) { + if (digits == 0) { return index; } result[index++] = '.'; int current = index; for (int i = 0; i < tailing_zero; i++) { - result[current + d - i - 1] = '0'; + result[current + digits - i - 1] = '0'; index++; } - for (int i = tailing_zero; i < d; i++) { - result[current + d - i - 1] = (char) ('0' + decimal % 10); + for (int i = tailing_zero; i < digits; i++) { + result[current + digits - i - 1] = (char) ('0' + decimal % 10); decimal /= 10; index++; } @@ -1462,7 +1462,7 @@ __device__ inline int to_formated_chars(floating_decimal_32 const v, bool const return index; } -__device__ inline int format_float_size(floating_decimal_32 const v, bool const sign, int d) { +__device__ inline int format_float_size(floating_decimal_32 const v, bool const sign, int digits) { int index = 0; if (sign) { index++; @@ -1471,57 +1471,57 @@ __device__ inline int format_float_size(floating_decimal_32 const 
v, bool const uint32_t const olength = decimalLength9(output); int32_t exp = v.exponent + (int32_t) olength - 1; if (exp < 0) { - index += 2 + d; + index += 2 + digits; } else if (exp + 1 >= olength) { - index += exp + 1 + exp / 3 + 1 + d; + index += exp + 1 + exp / 3 + 1 + digits; } else { - uint32_t temp_d = d; - if (exp + d > olength) { + uint32_t temp_d = digits; + if (exp + digits > olength) { temp_d = olength - exp; } uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1); uint64_t pow10 = POW10_TABLE[temp_d]; uint64_t integer = rounded_output / pow10; uint32_t integer_len = decimalLength9(integer); - index += integer_len + (integer_len - 1) / 3 + 1 + d; + index += integer_len + (integer_len - 1) / 3 + 1 + digits; } - if (d == 0) { + if (digits == 0) { index--; } return index; } -__device__ int compute_format_float_size(double value, int d, bool is_float) { +__device__ int compute_format_float_size(double value, int digits, bool is_float) { bool sign = false, special = false; if (is_float) { floating_decimal_32 v = f2d(value, sign, special); if (special) { - return special_str_size(sign, v.exponent, v.mantissa, d); + return special_str_size(sign, v.exponent, v.mantissa, digits); } - return format_float_size(v, sign, d); + return format_float_size(v, sign, digits); } else { floating_decimal_64 v = d2d(value, sign, special); if (special) { - return special_str_size(sign, v.exponent, v.mantissa, d); + return special_str_size(sign, v.exponent, v.mantissa, digits); } - return format_float_size(v, sign, d); + return format_float_size(v, sign, digits); } } -__device__ int format_float(double value, int d, char* output, bool is_float) { +__device__ int format_float(double value, int digits, bool is_float, char* output) { bool sign = false, special = false; if (is_float) { floating_decimal_32 v = f2d(value, sign, special); if (special) { - return copy_special_str(output, sign, v.exponent, v.mantissa, d); + return copy_special_str(output, sign, 
v.exponent, v.mantissa, digits); } - return to_formated_chars(v, sign, output, d); + return to_formated_chars(v, sign, output, digits); } else { floating_decimal_64 v = d2d(value, sign, special); if (special) { - return copy_special_str(output, sign, v.exponent, v.mantissa, d); + return copy_special_str(output, sign, v.exponent, v.mantissa, digits); } - return to_formated_chars(v, sign, output, d); + return to_formated_chars(v, sign, output, digits); } } diff --git a/src/main/cpp/tests/cast_decimal_to_string.cpp b/src/main/cpp/tests/cast_decimal_to_string.cpp index 1a93354339..ba1aaf05c8 100644 --- a/src/main/cpp/tests/cast_decimal_to_string.cpp +++ b/src/main/cpp/tests/cast_decimal_to_string.cpp @@ -24,9 +24,10 @@ #include -#include #include +#include + using namespace cudf; template diff --git a/src/main/cpp/tests/cast_string.cpp b/src/main/cpp/tests/cast_string.cpp index c736d5971f..1f7aaaad21 100644 --- a/src/main/cpp/tests/cast_string.cpp +++ b/src/main/cpp/tests/cast_string.cpp @@ -24,9 +24,10 @@ #include -#include #include +#include + using namespace cudf; template diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp index acafd231c4..9f02b2b0b6 100644 --- a/src/main/cpp/tests/format_float.cpp +++ b/src/main/cpp/tests/format_float.cpp @@ -24,9 +24,10 @@ #include -#include #include +#include + using namespace cudf; constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index feda669e32..6a9751fb98 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -85,9 +85,10 @@ public static ColumnVector toDecimal(ColumnView cv, boolean ansiMode, boolean st * * @param cv the column data to process * @return the converted column + * @param digits the number of digits to display 
after the decimal point */ - public static ColumnVector formatFloat(ColumnView cv, int d) { - return new ColumnVector(formatFloat(cv.getNativeView(), d)); + public static ColumnVector formatFloat(ColumnView cv, int digits) { + return new ColumnVector(formatFloat(cv.getNativeView(), digits)); } /** @@ -147,7 +148,7 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled int precision, int scale); private static native long toFloat(long nativeColumnView, boolean ansi_enabled, int dtype); private static native long fromDecimal(long nativeColumnView); - private static native long formatFloat(long nativeColumnView, int d); + private static native long formatFloat(long nativeColumnView, int digits); private static native long toIntegersWithBase(long nativeColumnView, int base, boolean ansiEnabled, int dtype); private static native long fromIntegersWithBase(long nativeColumnView, int base); From b78e3b35da66956847f222e330f0c33e9aaacc32 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 27 Nov 2023 17:28:45 +0800 Subject: [PATCH 38/54] addressed comments Signed-off-by: Haoyang Li --- src/main/cpp/src/cast_float_to_string.cu | 4 +- src/main/cpp/tests/cast_float_to_string.cpp | 53 ++++----------------- thirdparty/cudf | 2 +- 3 files changed, 13 insertions(+), 46 deletions(-) diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index c41936c3bb..1c9eb2c4d6 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -78,7 +78,7 @@ struct float_to_string_fn { * The template function declaration ensures only float types are allowed. 
*/ struct dispatch_float_to_string_fn { - template >* = nullptr> + template )> std::unique_ptr operator()(cudf::column_view const& floats, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -97,7 +97,7 @@ struct dispatch_float_to_string_fn { } // non-float types throw an exception - template >* = nullptr> + template )> std::unique_ptr operator()(cudf::column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp index 0d4f62ac1b..806b3eaad5 100644 --- a/src/main/cpp/tests/cast_float_to_string.cpp +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -35,61 +35,28 @@ struct FloatToStringTests : public cudf::test::BaseFixture {}; TEST_F(FloatToStringTests, FromFloats32) { - std::vector h_floats{100, - 654321.25, - -12761.125, - 0, - 5, - -4, - std::numeric_limits::quiet_NaN(), - 123456789012.34, - -0.0}; - std::vector h_expected{ - "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "1.2345679E11", "-0.0"}; - - cudf::test::fixed_width_column_wrapper floats( - h_floats.begin(), - h_floats.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + auto const floats = cudf::test::fixed_width_column_wrapper { + 100.0f, 654321.25f, -12761.125f, 0.f, 5.0f, -4.0f, std::numeric_limits::quiet_NaN(), 123456789012.34f, -0.0f}; auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream()); - cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + auto const expected = cudf::test::strings_column_wrapper{ + "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "1.2345679E11", "-0.0"}; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); } TEST_F(FloatToStringTests, FromFloats64) { - std::vector h_floats{100, - 
654321.25, - -12761.125, - 1.123456789123456789, - 0.000000000000000000123456789123456789, - 0, - 5, - -4, - std::numeric_limits::quiet_NaN(), - 839542223232.794248339, - -0.0}; - std::vector h_expected{ - "100.0", "654321.25", "-12761.125", "1.1234567891234568", "1.234567891234568E-19", - "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"}; - - cudf::test::fixed_width_column_wrapper floats( - h_floats.begin(), - h_floats.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + auto const floats = cudf::test::fixed_width_column_wrapper { + 100.0d, 654321.25d, -12761.125d, 1.123456789123456789d, 0.000000000000000000123456789123456789d, + 0.0d, 5.0d, -4.0d, std::numeric_limits::quiet_NaN(), 839542223232.794248339d, -0.0d}; auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream()); - cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + auto const expected = cudf::test::strings_column_wrapper{ + "100.0", "654321.25", "-12761.125", "1.1234567891234568", "1.234567891234568E-19", + "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"}; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); } \ No newline at end of file diff --git a/thirdparty/cudf b/thirdparty/cudf index 823d3214a9..168533a8ad 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3 +Subproject commit 168533a8ad4086bd020be4f7bf9264a08b6d2243 From d2cba4f165d2d3feb7bcf362ae81fde05c16557f Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 27 Nov 2023 18:02:08 +0800 Subject: [PATCH 39/54] Address comments Signed-off-by: Haoyang Li --- .../cpp/benchmarks/cast_string_to_float.cpp | 2 +- src/main/cpp/src/CastStringJni.cpp | 2 +- src/main/cpp/src/format_float.cu | 4 +- 
src/main/cpp/tests/cast_decimal_to_string.cpp | 2 +- src/main/cpp/tests/cast_string.cpp | 2 +- src/main/cpp/tests/format_float.cpp | 53 ++++--------------- .../nvidia/spark/rapids/jni/CastStrings.java | 6 +-- 7 files changed, 18 insertions(+), 53 deletions(-) diff --git a/src/main/cpp/benchmarks/cast_string_to_float.cpp b/src/main/cpp/benchmarks/cast_string_to_float.cpp index d94f9d26a0..32e245aa98 100644 --- a/src/main/cpp/benchmarks/cast_string_to_float.cpp +++ b/src/main/cpp/benchmarks/cast_string_to_float.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include "cast_string.hpp" #include diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index 063fabe222..720f8514fc 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -109,7 +109,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toFloat( CATCH_CAST_EXCEPTION(env, 0); } -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_formatFloat( +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloatWithFormat( JNIEnv* env, jclass, jlong input_column, jint digits, jint j_dtype) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu index 3d3fedbfb9..ec3c934415 100644 --- a/src/main/cpp/src/format_float.cu +++ b/src/main/cpp/src/format_float.cu @@ -79,7 +79,7 @@ struct format_float_fn { * The template function declaration ensures only float types are allowed. 
*/ struct dispatch_format_float_fn { - template >* = nullptr> + template )> std::unique_ptr operator()(cudf::column_view const& floats, int digits, rmm::cuda_stream_view stream, @@ -99,7 +99,7 @@ struct dispatch_format_float_fn { } // non-float types throw an exception - template >* = nullptr> + template )> std::unique_ptr operator()(cudf::column_view const&, int, rmm::cuda_stream_view, diff --git a/src/main/cpp/tests/cast_decimal_to_string.cpp b/src/main/cpp/tests/cast_decimal_to_string.cpp index ba1aaf05c8..05002c373c 100644 --- a/src/main/cpp/tests/cast_decimal_to_string.cpp +++ b/src/main/cpp/tests/cast_decimal_to_string.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include "cast_string.hpp" #include #include diff --git a/src/main/cpp/tests/cast_string.cpp b/src/main/cpp/tests/cast_string.cpp index 1f7aaaad21..0a3f221894 100644 --- a/src/main/cpp/tests/cast_string.cpp +++ b/src/main/cpp/tests/cast_string.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include "cast_string.hpp" #include #include diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp index 9f02b2b0b6..6d510d83a0 100644 --- a/src/main/cpp/tests/format_float.cpp +++ b/src/main/cpp/tests/format_float.cpp @@ -36,63 +36,28 @@ struct FormatFloatTests : public cudf::test::BaseFixture {}; TEST_F(FormatFloatTests, FormatFloats32) { - std::vector h_floats{100, - 654321.25, - -12761.125, - 0, - 5, - -4, - std::numeric_limits::quiet_NaN(), - 123456789012.34, - -0.0 - }; - std::vector h_expected{ + auto const floats = cudf::test::fixed_width_column_wrapper { + 100.0f, 654321.25f, -12761.125f, 0.0f, 5.0f, -4.0f, std::numeric_limits::quiet_NaN(), 123456789012.34f, -0.0f}; + + auto const expected = cudf::test::strings_column_wrapper{ "100.00000", "654,321.25000", "-12,761.12500", "0.00000", "5.00000", "-4.00000", "NaN", "123,456,790,000.00000", "-0.00000"}; - cudf::test::fixed_width_column_wrapper floats( - h_floats.begin(), - 
h_floats.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - auto results = spark_rapids_jni::format_float(floats, 5, cudf::get_default_stream()); - cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); } TEST_F(FormatFloatTests, FormatFloats64) { - std::vector h_floats{100, - 654321.25, - -12761.125, - 1.123456789123456789, - 0.000000000000000000123456789123456789, - 0, - 5, - -4, - std::numeric_limits::quiet_NaN(), - 839542223232.794248339, - -0.0 - }; - std::vector h_expected{ + auto const floats = cudf::test::fixed_width_column_wrapper { + 100.0d, 654321.25d, -12761.125d, 1.123456789123456789d, 0.000000000000000000123456789123456789d, + 0.0d, 5.0d, -4.0d, std::numeric_limits::quiet_NaN(), 839542223232.794248339d, -0.0d}; + + auto const expected = cudf::test::strings_column_wrapper{ "100.00000", "654,321.25000", "-12,761.12500", "1.12346", "0.00000", "0.00000", "5.00000", "-4.00000", "NaN", "839,542,223,232.79420", "-0.00000"}; - cudf::test::fixed_width_column_wrapper floats( - h_floats.begin(), - h_floats.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - auto results = spark_rapids_jni::format_float(floats, 5, cudf::get_default_stream()); - cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); } \ No newline at end of file diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index 6a9751fb98..6ce4687b23 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ 
b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -87,8 +87,8 @@ public static ColumnVector toDecimal(ColumnView cv, boolean ansiMode, boolean st * @return the converted column * @param digits the number of digits to display after the decimal point */ - public static ColumnVector formatFloat(ColumnView cv, int digits) { - return new ColumnVector(formatFloat(cv.getNativeView(), digits)); + public static ColumnVector fromFloatWithFormat(ColumnView cv, int digits) { + return new ColumnVector(fromFloatWithFormat(cv.getNativeView(), digits)); } /** @@ -148,7 +148,7 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled int precision, int scale); private static native long toFloat(long nativeColumnView, boolean ansi_enabled, int dtype); private static native long fromDecimal(long nativeColumnView); - private static native long formatFloat(long nativeColumnView, int digits); + private static native long fromFloatWithFormat(long nativeColumnView, int digits); private static native long toIntegersWithBase(long nativeColumnView, int base, boolean ansiEnabled, int dtype); private static native long fromIntegersWithBase(long nativeColumnView, int base); From 04d1c4f10cd95c7122de091d564138ada5aea4bc Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 4 Dec 2023 17:20:22 +0800 Subject: [PATCH 40/54] clang format Signed-off-by: Haoyang Li --- src/main/cpp/src/CastStringJni.cpp | 9 +- src/main/cpp/src/cast_float_to_string.cu | 36 +- src/main/cpp/src/ftos_converter.cuh | 705 ++++++++++---------- src/main/cpp/tests/cast_float_to_string.cpp | 42 +- thirdparty/cudf | 2 +- 5 files changed, 420 insertions(+), 374 deletions(-) diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index 093b51188b..933fc15e34 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -109,15 +109,16 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toFloat( 
CATCH_CAST_EXCEPTION(env, 0); } -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloat( - JNIEnv* env, jclass, jlong input_column, jint j_dtype) +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloat(JNIEnv* env, + jclass, + jlong input_column) { JNI_NULL_CHECK(env, input_column, "input column is null", 0); try { cudf::jni::auto_set_device(env); - cudf::column_view cv{*reinterpret_cast(input_column)}; + auto const& cv = *reinterpret_cast(input_column); return cudf::jni::release_as_jlong( spark_rapids_jni::float_to_string(cv, cudf::get_default_stream())); } @@ -133,7 +134,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromDecimal try { cudf::jni::auto_set_device(env); - cudf::column_view cv{*reinterpret_cast(input_column)}; + auto const& cv = *reinterpret_cast(input_column); return cudf::jni::release_as_jlong( spark_rapids_jni::decimal_to_non_ansi_string(cv, cudf::get_default_stream())); } diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index 1c9eb2c4d6..31d3f69d11 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -47,14 +47,15 @@ struct float_to_string_fn { __device__ cudf::size_type compute_output_size(FloatType value) const { bool constexpr is_float = std::is_same_v; - return static_cast(ftos_converter::compute_ftos_size(static_cast(value), is_float)); + return static_cast( + ftos_converter::compute_ftos_size(static_cast(value), is_float)); } __device__ void float_to_string(cudf::size_type idx) const { - auto const value = d_floats.element(idx); + auto const value = d_floats.element(idx); bool constexpr is_float = std::is_same_v; - auto const output = d_chars + d_offsets[idx]; + auto const output = d_chars + d_offsets[idx]; ftos_converter::float_to_string(static_cast(value), is_float, output); } @@ -80,14 +81,16 @@ struct float_to_string_fn { struct dispatch_float_to_string_fn { 
template )> std::unique_ptr operator()(cudf::column_view const& floats, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const strings_count = floats.size(); + if (strings_count == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } + auto const input_ptr = cudf::column_device_view::create(floats, stream); - auto [offsets, chars] = - cudf::strings::detail::make_strings_children(float_to_string_fn{*input_ptr}, strings_count, stream, mr); + auto [offsets, chars] = cudf::strings::detail::make_strings_children( + float_to_string_fn{*input_ptr}, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets), @@ -99,8 +102,8 @@ struct dispatch_float_to_string_fn { // non-float types throw an exception template )> std::unique_ptr operator()(cudf::column_view const&, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) { CUDF_FAIL("Values for float_to_string function must be a float type."); } @@ -110,23 +113,18 @@ struct dispatch_float_to_string_fn { // This will convert all float column types into a strings column. 
std::unique_ptr float_to_string(cudf::column_view const& floats, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - auto const strings_count = floats.size(); - if (strings_count == 0) { - return cudf::make_empty_column(cudf::type_id::STRING); - } - return type_dispatcher(floats.type(), dispatch_float_to_string_fn{}, floats, stream, mr); } } // namespace detail // external API -std::unique_ptr float_to_string(cudf::column_view const& floats, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr float_to_string(cudf::column_view const& floats, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::float_to_string(floats, stream, mr); diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index b9905ae567..444f790d3c 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -15,11 +15,11 @@ * limitations under the License. */ +#include #include +#include #include #include -#include -#include namespace spark_rapids_jni::ftos_converter { @@ -47,84 +47,109 @@ typedef struct floating_decimal_32 { // These tables are generated by PrintDoubleLookupTable. 
constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125; -constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125; -constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64); -constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64); -constexpr unsigned int DOUBLE_MANTISSA_BITS = 52; -constexpr unsigned int DOUBLE_EXPONENT_BITS = 11; -constexpr unsigned int DOUBLE_BIAS = 1023; -constexpr unsigned int FLOAT_MANTISSA_BITS = 23; -constexpr unsigned int FLOAT_EXPONENT_BITS = 8; -constexpr unsigned int FLOAT_BIAS = 127; - -__constant__ -uint64_t const DOUBLE_POW5_INV_SPLIT2[15][2] = { - { 1u, 2305843009213693952u }, - { 5955668970331000884u, 1784059615882449851u }, - { 8982663654677661702u, 1380349269358112757u }, - { 7286864317269821294u, 2135987035920910082u }, - { 7005857020398200553u, 1652639921975621497u }, - { 17965325103354776697u, 1278668206209430417u }, - { 8928596168509315048u, 1978643211784836272u }, - { 10075671573058298858u, 1530901034580419511u }, - { 597001226353042382u, 1184477304306571148u }, - { 1527430471115325346u, 1832889850782397517u }, - { 12533209867169019542u, 1418129833677084982u }, - { 5577825024675947042u, 2194449627517475473u }, - { 11006974540203867551u, 1697873161311732311u }, - { 10313493231639821582u, 1313665730009899186u }, - { 12701016819766672773u, 2032799256770390445u } -}; - -__constant__ -uint32_t const POW5_INV_OFFSETS[19] = { - 0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555, - 0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054, - 0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554, - 0x00000000 -}; - -__constant__ -uint64_t const DOUBLE_POW5_SPLIT2[13][2] = { - { 0u, 1152921504606846976u }, - { 0u, 1490116119384765625u }, - { 1032610780636961552u, 1925929944387235853u }, - { 7910200175544436838u, 1244603055572228341u }, - { 16941905809032713930u, 1608611746708759036u }, - { 13024893955298202172u, 2079081953128979843u }, - { 
6607496772837067824u, 1343575221513417750u }, - { 17332926989895652603u, 1736530273035216783u }, - { 13037379183483547984u, 2244412773384604712u }, - { 1605989338741628675u, 1450417759929778918u }, - { 9630225068416591280u, 1874621017369538693u }, - { 665883850346957067u, 1211445438634777304u }, - { 14931890668723713708u, 1565756531257009982u } -}; - -__constant__ -uint32_t const POW5_OFFSETS[21] = { - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995, - 0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540, - 0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151, - 0x55559155, 0x51405555, 0x00000105 -}; +constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125; +constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64); +constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64); +constexpr unsigned int DOUBLE_MANTISSA_BITS = 52; +constexpr unsigned int DOUBLE_EXPONENT_BITS = 11; +constexpr unsigned int DOUBLE_BIAS = 1023; +constexpr unsigned int FLOAT_MANTISSA_BITS = 23; +constexpr unsigned int FLOAT_EXPONENT_BITS = 8; +constexpr unsigned int FLOAT_BIAS = 127; + +__constant__ uint64_t const DOUBLE_POW5_INV_SPLIT2[15][2] = { + {1u, 2305843009213693952u}, + {5955668970331000884u, 1784059615882449851u}, + {8982663654677661702u, 1380349269358112757u}, + {7286864317269821294u, 2135987035920910082u}, + {7005857020398200553u, 1652639921975621497u}, + {17965325103354776697u, 1278668206209430417u}, + {8928596168509315048u, 1978643211784836272u}, + {10075671573058298858u, 1530901034580419511u}, + {597001226353042382u, 1184477304306571148u}, + {1527430471115325346u, 1832889850782397517u}, + {12533209867169019542u, 1418129833677084982u}, + {5577825024675947042u, 2194449627517475473u}, + {11006974540203867551u, 1697873161311732311u}, + {10313493231639821582u, 1313665730009899186u}, + {12701016819766672773u, 2032799256770390445u}}; + +__constant__ uint32_t const POW5_INV_OFFSETS[19] = 
{0x54544554, + 0x04055545, + 0x10041000, + 0x00400414, + 0x40010000, + 0x41155555, + 0x00000454, + 0x00010044, + 0x40000000, + 0x44000041, + 0x50454450, + 0x55550054, + 0x51655554, + 0x40004000, + 0x01000001, + 0x00010500, + 0x51515411, + 0x05555554, + 0x00000000}; + +__constant__ uint64_t const DOUBLE_POW5_SPLIT2[13][2] = { + {0u, 1152921504606846976u}, + {0u, 1490116119384765625u}, + {1032610780636961552u, 1925929944387235853u}, + {7910200175544436838u, 1244603055572228341u}, + {16941905809032713930u, 1608611746708759036u}, + {13024893955298202172u, 2079081953128979843u}, + {6607496772837067824u, 1343575221513417750u}, + {17332926989895652603u, 1736530273035216783u}, + {13037379183483547984u, 2244412773384604712u}, + {1605989338741628675u, 1450417759929778918u}, + {9630225068416591280u, 1874621017369538693u}, + {665883850346957067u, 1211445438634777304u}, + {14931890668723713708u, 1565756531257009982u}}; + +__constant__ uint32_t const POW5_OFFSETS[21] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995, 0x55545555, + 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540, 0x45555550, 0x40004000, + 0x96440440, 0x55565565, 0x54454045, 0x40154151, 0x55559155, 0x51405555, 0x00000105}; constexpr uint32_t POW5_TABLE_SIZE = 26; -__constant__ -uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = { -1ull, 5ull, 25ull, 125ull, 625ull, 3125ull, 15625ull, 78125ull, 390625ull, -1953125ull, 9765625ull, 48828125ull, 244140625ull, 1220703125ull, 6103515625ull, -30517578125ull, 152587890625ull, 762939453125ull, 3814697265625ull, -19073486328125ull, 95367431640625ull, 476837158203125ull, -2384185791015625ull, 11920928955078125ull, 59604644775390625ull, -298023223876953125ull //, 1490116119384765625ull +__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = { + 1ull, + 5ull, + 25ull, + 125ull, + 625ull, + 3125ull, + 15625ull, + 78125ull, + 390625ull, + 1953125ull, + 9765625ull, + 48828125ull, + 244140625ull, + 1220703125ull, + 6103515625ull, + 
30517578125ull, + 152587890625ull, + 762939453125ull, + 3814697265625ull, + 19073486328125ull, + 95367431640625ull, + 476837158203125ull, + 2384185791015625ull, + 11920928955078125ull, + 59604644775390625ull, + 298023223876953125ull //, 1490116119384765625ull }; //===== common.h from ryu ===== // Returns the number of decimal digits in v, which must not contain more than 9 digits. -__device__ inline uint32_t decimalLength9(uint32_t const v) { +__device__ inline uint32_t decimalLength9(uint32_t const v) +{ // Function precondition: v is not a 10-digit number. // (f2s: 9 digits are sufficient for round-tripping.) // (d2fixed: We print 9-digit blocks.) @@ -141,40 +166,42 @@ __device__ inline uint32_t decimalLength9(uint32_t const v) { } // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528. -__device__ inline int32_t pow5bits(int32_t const e) { +__device__ inline int32_t pow5bits(int32_t const e) +{ // This approximation works up to the point that the multiplication overflows at e = 3529. // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater // than 2^9297. assert(e >= 0); assert(e <= 3528); - return (int32_t) (((((uint32_t) e) * 1217359) >> 19) + 1); + return (int32_t)(((((uint32_t)e) * 1217359) >> 19) + 1); } // Returns floor(log_10(2^e)); requires 0 <= e <= 1650. -__device__ inline uint32_t log10Pow2(int32_t const e) { +__device__ inline uint32_t log10Pow2(int32_t const e) +{ // The first value this approximation fails for is 2^1651 which is just greater than 10^297. assert(e >= 0); assert(e <= 1650); - return (((uint32_t) e) * 78913) >> 18; + return (((uint32_t)e) * 78913) >> 18; } // Returns floor(log_10(5^e)); requires 0 <= e <= 2620. -__device__ inline uint32_t log10Pow5(int32_t const e) { +__device__ inline uint32_t log10Pow5(int32_t const e) +{ // The first value this approximation fails for is 5^2621 which is just greater than 10^1832. 
assert(e >= 0); assert(e <= 2620); - return (((uint32_t) e) * 732923) >> 20; + return (((uint32_t)e) * 732923) >> 20; } -__device__ inline uint32_t pow5factor_32(uint32_t value) { +__device__ inline uint32_t pow5factor_32(uint32_t value) +{ uint32_t count = 0; for (;;) { assert(value != 0); uint32_t const q = value / 5; uint32_t const r = value % 5; - if (r != 0) { - break; - } + if (r != 0) { break; } value = q; ++count; } @@ -182,43 +209,47 @@ __device__ inline uint32_t pow5factor_32(uint32_t value) { } // Returns true if value is divisible by 5^p. -__device__ inline bool multipleOfPowerOf5_32(uint32_t const value, uint32_t const p) { +__device__ inline bool multipleOfPowerOf5_32(uint32_t const value, uint32_t const p) +{ return pow5factor_32(value) >= p; } // Returns true if value is divisible by 2^p. -__device__ inline bool multipleOfPowerOf2_32(uint32_t const value, uint32_t const p) { +__device__ inline bool multipleOfPowerOf2_32(uint32_t const value, uint32_t const p) +{ // __builtin_ctz doesn't appear to be faster here. return (value & ((1u << p) - 1)) == 0; } // It seems to be slightly faster to avoid uint128_t here, although the // generated code for uint128_t looks slightly nicer. -__device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, int32_t const shift) { +__device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, int32_t const shift) +{ assert(shift > 32); // The casts here help MSVC to avoid calls to the __allmul library // function. 
uint32_t const factorLo = (uint32_t)(factor); uint32_t const factorHi = (uint32_t)(factor >> 32); - uint64_t const bits0 = (uint64_t)m * factorLo; - uint64_t const bits1 = (uint64_t)m * factorHi; + uint64_t const bits0 = (uint64_t)m * factorLo; + uint64_t const bits1 = (uint64_t)m * factorHi; - uint64_t const sum = (bits0 >> 32) + bits1; + uint64_t const sum = (bits0 >> 32) + bits1; uint64_t const shiftedSum = sum >> (shift - 32); assert(shiftedSum <= UINT32_MAX); - return (uint32_t) shiftedSum; - + return (uint32_t)shiftedSum; } -__device__ inline int copy_special_str(char * const result, bool const sign, bool const exponent, bool const mantissa) { +__device__ inline int copy_special_str(char* const result, + bool const sign, + bool const exponent, + bool const mantissa) +{ if (mantissa) { memcpy(result, "NaN", 3); return 3; } - if (sign) { - result[0] = '-'; - } + if (sign) { result[0] = '-'; } if (exponent) { memcpy(result + sign, "Infinity", 8); return sign + 8; @@ -227,23 +258,22 @@ __device__ inline int copy_special_str(char * const result, bool const sign, boo return sign + 3; } -__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa) { - if (mantissa) { - return 3; - } - if (exponent) { - return sign + 8; - } +__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa) +{ + if (mantissa) { return 3; } + if (exponent) { return sign + 8; } return sign + 3; } -__device__ inline uint32_t float_to_bits(float const f) { +__device__ inline uint32_t float_to_bits(float const f) +{ uint32_t bits = 0; memcpy(&bits, &f, sizeof(float)); return bits; } -__device__ inline uint64_t double_to_bits(double const d) { +__device__ inline uint64_t double_to_bits(double const d) +{ uint64_t bits = 0; memcpy(&bits, &d, sizeof(double)); return bits; @@ -251,7 +281,8 @@ __device__ inline uint64_t double_to_bits(double const d) { //===== d2s_intrinsics.h from ryu ===== -__device__ inline uint64_t 
umul128(uint64_t const a, uint64_t const b, uint64_t* const productHi) { +__device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t* const productHi) +{ // The casts here help MSVC to avoid calls to the __allmul library function. uint32_t const aLo = (uint32_t)a; uint32_t const aHi = (uint32_t)(a >> 32); @@ -266,11 +297,11 @@ __device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t* uint32_t const b00Lo = (uint32_t)b00; uint32_t const b00Hi = (uint32_t)(b00 >> 32); - uint64_t const mid1 = b10 + b00Hi; + uint64_t const mid1 = b10 + b00Hi; uint32_t const mid1Lo = (uint32_t)(mid1); uint32_t const mid1Hi = (uint32_t)(mid1 >> 32); - uint64_t const mid2 = b01 + mid1Lo; + uint64_t const mid2 = b01 + mid1Lo; uint32_t const mid2Lo = (uint32_t)(mid2); uint32_t const mid2Hi = (uint32_t)(mid2 >> 32); @@ -281,68 +312,71 @@ __device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t* return pLo; } -__device__ inline uint64_t shiftright128(uint64_t const lo, uint64_t const hi, uint32_t const dist) { +__device__ inline uint64_t shiftright128(uint64_t const lo, uint64_t const hi, uint32_t const dist) +{ // We don't need to handle the case dist >= 64 here (see above). 
assert(dist < 64); assert(dist > 0); return (hi << (64 - dist)) | (lo >> dist); } -__device__ inline uint64_t div5(uint64_t const x) { - return x / 5; -} +__device__ inline uint64_t div5(uint64_t const x) { return x / 5; } -__device__ inline uint64_t div10(uint64_t const x) { - return x / 10; -} +__device__ inline uint64_t div10(uint64_t const x) { return x / 10; } -__device__ inline uint64_t div100(uint64_t const x) { - return x / 100; -} +__device__ inline uint64_t div100(uint64_t const x) { return x / 100; } -__device__ inline uint32_t pow5Factor(uint64_t value) { - uint64_t const m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64) - uint64_t const n_div_5 = 3689348814741910323u; // #{ n | n = 0 (mod 2^64) } = 2^64 / 5 - uint32_t count = 0; +__device__ inline uint32_t pow5Factor(uint64_t value) +{ + uint64_t const m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64) + uint64_t const n_div_5 = 3689348814741910323u; // #{ n | n = 0 (mod 2^64) } = 2^64 / 5 + uint32_t count = 0; for (;;) { assert(value != 0); value *= m_inv_5; - if (value > n_div_5) - break; + if (value > n_div_5) break; ++count; } return count; } // Returns true if value is divisible by 5^p. -__device__ inline bool multipleOfPowerOf5(uint64_t const value, uint32_t const p) { +__device__ inline bool multipleOfPowerOf5(uint64_t const value, uint32_t const p) +{ // I tried a case distinction on p, but there was no performance difference. return pow5Factor(value) >= p; } // Returns true if value is divisible by 2^p. -__device__ inline bool multipleOfPowerOf2(uint64_t const value, uint32_t const p) { +__device__ inline bool multipleOfPowerOf2(uint64_t const value, uint32_t const p) +{ assert(value != 0); assert(p < 64); // __builtin_ctzll doesn't appear to be faster here. 
return (value & ((1ull << p) - 1)) == 0; } -__device__ inline uint64_t mulShift64(uint64_t const m, uint64_t const* const mul, int32_t const j) { +__device__ inline uint64_t mulShift64(uint64_t const m, uint64_t const* const mul, int32_t const j) +{ // m is maximum 55 bits - uint64_t high1; // 128 - uint64_t const low1 = umul128(m, mul[1], &high1); // 64 - uint64_t high0; // 64 - umul128(m, mul[0], &high0); // 0 + uint64_t high1; // 128 + uint64_t const low1 = umul128(m, mul[1], &high1); // 64 + uint64_t high0; // 64 + umul128(m, mul[0], &high0); // 0 uint64_t const sum = high0 + low1; if (sum < high0) { - ++high1; // overflow into high1 + ++high1; // overflow into high1 } return shiftright128(sum, high1, j - 64); } -__device__ inline uint64_t mulShiftAll64(uint64_t const m, uint64_t const* const mul, int32_t const j, - uint64_t* const vp, uint64_t* const vm, uint32_t const mmShift) { +__device__ inline uint64_t mulShiftAll64(uint64_t const m, + uint64_t const* const mul, + int32_t const j, + uint64_t* const vp, + uint64_t* const vm, + uint32_t const mmShift) +{ *vp = mulShift64(4 * m + 2, mul, j); *vm = mulShift64(4 * m - 1 - mmShift, mul, j); return mulShift64(4 * m, mul, j); @@ -351,10 +385,11 @@ __device__ inline uint64_t mulShiftAll64(uint64_t const m, uint64_t const* const //===== d2s_small_table.h from ryu ===== // Computes 5^i in the form required by Ryu, and stores it in the given pointer. 
-__device__ inline void double_computePow5(uint32_t const i, uint64_t* const result) { - uint32_t const base = i / POW5_TABLE_SIZE; - uint32_t const base2 = base * POW5_TABLE_SIZE; - uint32_t const offset = i - base2; +__device__ inline void double_computePow5(uint32_t const i, uint64_t* const result) +{ + uint32_t const base = i / POW5_TABLE_SIZE; + uint32_t const base2 = base * POW5_TABLE_SIZE; + uint32_t const offset = i - base2; uint64_t const* const mul = DOUBLE_POW5_SPLIT2[base]; if (offset == 0) { result[0] = mul[0]; @@ -366,9 +401,9 @@ __device__ inline void double_computePow5(uint32_t const i, uint64_t* const resu uint64_t const low1 = umul128(m, mul[1], &high1); uint64_t high0; uint64_t const low0 = umul128(m, mul[0], &high0); - uint64_t const sum = high0 + low1; + uint64_t const sum = high0 + low1; if (sum < high0) { - ++high1; // overflow into high1 + ++high1; // overflow into high1 } // high1 | sum | low0 uint32_t const delta = pow5bits(i) - pow5bits(base2); @@ -377,11 +412,12 @@ __device__ inline void double_computePow5(uint32_t const i, uint64_t* const resu } // Computes 5^-i in the form required by Ryu, and stores it in the given pointer. 
-__device__ inline void double_computeInvPow5(uint32_t const i, uint64_t* const result) { - uint32_t const base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE; - uint32_t const base2 = base * POW5_TABLE_SIZE; - uint32_t const offset = base2 - i; - uint64_t const* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2 +__device__ inline void double_computeInvPow5(uint32_t const i, uint64_t* const result) +{ + uint32_t const base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE; + uint32_t const base2 = base * POW5_TABLE_SIZE; + uint32_t const offset = base2 - i; + uint64_t const* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2 if (offset == 0) { result[0] = mul[0]; result[1] = mul[1]; @@ -392,28 +428,32 @@ __device__ inline void double_computeInvPow5(uint32_t const i, uint64_t* const r uint64_t const low1 = umul128(m, mul[1], &high1); uint64_t high0; uint64_t const low0 = umul128(m, mul[0] - 1, &high0); - uint64_t const sum = high0 + low1; + uint64_t const sum = high0 + low1; if (sum < high0) { - ++high1; // overflow into high1 + ++high1; // overflow into high1 } // high1 | sum | low0 uint32_t const delta = pow5bits(base2) - pow5bits(i); - result[0] = shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); + result[0] = + shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); result[1] = shiftright128(sum, high1, delta); } //===== f2s_intrinsics.h from ryu ===== -__device__ inline uint32_t mulPow5InvDivPow2(uint32_t const m, uint32_t const q, int32_t const j) { - // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double lookup - // table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely on the - // fact that the added 1 that's already stored in the table never overflows into the upper 64 bits. 
+__device__ inline uint32_t mulPow5InvDivPow2(uint32_t const m, uint32_t const q, int32_t const j) +{ + // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double + // lookup table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely + // on the fact that the added 1 that's already stored in the table never overflows into the upper + // 64 bits. uint64_t pow5[2]; double_computeInvPow5(q, pow5); return mulShift32(m, pow5[1] + 1, j); } -__device__ inline uint32_t mulPow5divPow2(uint32_t const m, uint32_t const i, int32_t const j) { +__device__ inline uint32_t mulPow5divPow2(uint32_t const m, uint32_t const i, int32_t const j) +{ uint64_t pow5[2]; double_computePow5(i, pow5); return mulShift32(m, pow5[1], j); @@ -421,7 +461,8 @@ __device__ inline uint32_t mulPow5divPow2(uint32_t const m, uint32_t const i, in //===== d2s.c and f2s.c from ryu ===== -__device__ inline uint32_t decimalLength17(uint64_t const v) { +__device__ inline uint32_t decimalLength17(uint64_t const v) +{ // This is slightly faster than a loop. // The average output length is 16.38 digits, so we check high-to-low. // Function precondition: v is not an 18, 19, or 20-digit number. 
@@ -446,7 +487,8 @@ __device__ inline uint32_t decimalLength17(uint64_t const v) { return 1; } -__device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t const ieeeExponent) { +__device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t const ieeeExponent) +{ int32_t e2; uint64_t m2; if (ieeeExponent == 0) { @@ -454,10 +496,10 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; m2 = ieeeMantissa; } else { - e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; + e2 = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; } - bool const even = (m2 & 1) == 0; + bool const even = (m2 & 1) == 0; bool const acceptBounds = even; // Step 2: Determine the interval of valid decimal representations. @@ -477,9 +519,9 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t // I tried special-casing q == 0, but there was no effect on performance. // This expression is slightly faster than max(0, log10Pow2(e2) - 1). uint32_t const q = log10Pow2(e2) - (e2 > 3); - e10 = (int32_t) q; - int32_t const k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; - int32_t const i = -e2 + (int32_t) q + k; + e10 = (int32_t)q; + int32_t const k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1; + int32_t const i = -e2 + (int32_t)q + k; uint64_t pow5[2]; double_computeInvPow5(q, pow5); vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift); @@ -488,7 +530,7 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t // This should use q <= 22, but I think 21 is also safe. Smaller values // may still be safe, but it's more difficult to reason about them. // Only one of mp, mv, and mm can be a multiple of 5, if any. 
- uint32_t const mvMod5 = ((uint32_t) mv) - 5 * ((uint32_t) div5(mv)); + uint32_t const mvMod5 = ((uint32_t)mv) - 5 * ((uint32_t)div5(mv)); if (mvMod5 == 0) { vrIsTrailingZeros = multipleOfPowerOf5(mv, q); } else if (acceptBounds) { @@ -504,10 +546,10 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t } else { // This expression is slightly faster than max(0, log10Pow5(-e2) - 1). uint32_t const q = log10Pow5(-e2) - (-e2 > 1); - e10 = (int32_t) q + e2; - int32_t const i = -e2 - (int32_t) q; - int32_t const k = pow5bits(i) - DOUBLE_POW5_BITCOUNT; - int32_t const j = (int32_t) q - k; + e10 = (int32_t)q + e2; + int32_t const i = -e2 - (int32_t)q; + int32_t const k = pow5bits(i) - DOUBLE_POW5_BITCOUNT; + int32_t const j = (int32_t)q - k; uint64_t pow5[2]; double_computePow5(i, pow5); @@ -524,7 +566,7 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t // mp = mv + 2, so it always has at least one trailing 0 bit. --vp; } - } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here. + } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here. // We want to know if the full product has at least q trailing zeros. // We need to compute min(p2(mv), p5(mv) - e2) >= q // <=> p2(mv) >= q && p5(mv) - e2 >= q @@ -534,7 +576,7 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t } // Step 4: Find the shortest decimal representation in the interval of valid representations. - int32_t removed = 0; + int32_t removed = 0; uint8_t lastRemovedDigit = 0; uint64_t output; // On average, we remove ~2 digits. 
@@ -543,36 +585,32 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t for (;;) { uint64_t const vpDiv10 = div10(vp); uint64_t const vmDiv10 = div10(vm); - if (vpDiv10 <= vmDiv10) { - break; - } - uint32_t const vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10); + if (vpDiv10 <= vmDiv10) { break; } + uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10); uint64_t const vrDiv10 = div10(vr); - uint32_t const vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); + uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); vmIsTrailingZeros &= vmMod10 == 0; vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t) vrMod10; - vr = vrDiv10; - vp = vpDiv10; - vm = vmDiv10; + lastRemovedDigit = (uint8_t)vrMod10; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; ++removed; } if (vmIsTrailingZeros) { for (;;) { uint64_t const vmDiv10 = div10(vm); - uint32_t const vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10); - if (vmMod10 != 0) { - break; - } + uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10); + if (vmMod10 != 0) { break; } uint64_t const vpDiv10 = div10(vp); uint64_t const vrDiv10 = div10(vr); - uint32_t const vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); + uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t) vrMod10; - vr = vrDiv10; - vp = vpDiv10; - vm = vmDiv10; + lastRemovedDigit = (uint8_t)vrMod10; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; ++removed; } } @@ -585,16 +623,16 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); } else { // Specialized for the common case (~99.3%). Percentages below are relative to this. 
- bool roundUp = false; + bool roundUp = false; uint64_t const vpDiv100 = div100(vp); uint64_t const vmDiv100 = div100(vm); - if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%). + if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%). uint64_t const vrDiv100 = div100(vr); - uint32_t const vrMod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrDiv100); - roundUp = vrMod100 >= 50; - vr = vrDiv100; - vp = vpDiv100; - vm = vmDiv100; + uint32_t const vrMod100 = ((uint32_t)vr) - 100 * ((uint32_t)vrDiv100); + roundUp = vrMod100 >= 50; + vr = vrDiv100; + vp = vpDiv100; + vm = vmDiv100; removed += 2; } // Loop iterations below (approximately), without optimization above: @@ -604,15 +642,13 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t for (;;) { uint64_t const vpDiv10 = div10(vp); uint64_t const vmDiv10 = div10(vm); - if (vpDiv10 <= vmDiv10) { - break; - } + if (vpDiv10 <= vmDiv10) { break; } uint64_t const vrDiv10 = div10(vr); - uint32_t const vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10); - roundUp = vrMod10 >= 5; - vr = vrDiv10; - vp = vpDiv10; - vm = vmDiv10; + uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); + roundUp = vrMod10 >= 5; + vr = vrDiv10; + vp = vpDiv10; + vm = vmDiv10; ++removed; } @@ -627,7 +663,8 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t return fd; } -__device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t const ieeeExponent) { +__device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t const ieeeExponent) +{ int32_t e2; uint32_t m2; if (ieeeExponent == 0) { @@ -635,10 +672,10 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; m2 = ieeeMantissa; } else { - e2 = (int32_t) ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; + e2 = (int32_t)ieeeExponent - FLOAT_BIAS - 
FLOAT_MANTISSA_BITS - 2; m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa; } - bool const even = (m2 & 1) == 0; + bool const even = (m2 & 1) == 0; bool const acceptBounds = even; // Step 2: Determine the interval of valid decimal representations. @@ -646,28 +683,28 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t uint32_t const mp = 4 * m2 + 2; // Implicit bool -> int conversion. True is 1, false is 0. uint32_t const mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; - uint32_t const mm = 4 * m2 - 1 - mmShift; + uint32_t const mm = 4 * m2 - 1 - mmShift; // Step 3: Convert to a decimal power base using 64-bit arithmetic. uint32_t vr, vp, vm; int32_t e10; - bool vmIsTrailingZeros = false; - bool vrIsTrailingZeros = false; + bool vmIsTrailingZeros = false; + bool vrIsTrailingZeros = false; uint8_t lastRemovedDigit = 0; if (e2 >= 0) { uint32_t const q = log10Pow2(e2); - e10 = (int32_t) q; - int32_t const k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1; - int32_t const i = -e2 + (int32_t) q + k; - vr = mulPow5InvDivPow2(mv, q, i); - vp = mulPow5InvDivPow2(mp, q, i); - vm = mulPow5InvDivPow2(mm, q, i); + e10 = (int32_t)q; + int32_t const k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1; + int32_t const i = -e2 + (int32_t)q + k; + vr = mulPow5InvDivPow2(mv, q, i); + vp = mulPow5InvDivPow2(mp, q, i); + vm = mulPow5InvDivPow2(mm, q, i); if (q != 0 && (vp - 1) / 10 <= vm / 10) { // We need to know one removed digit even if we are not going to loop below. We could use // q = X - 1 above, except that would require 33 bits for the result, and we've found that // 32-bit arithmetic is faster even on 64-bit machines. 
- int32_t const l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) (q - 1)) - 1; - lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t) q - 1 + l) % 10); + int32_t const l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)(q - 1)) - 1; + lastRemovedDigit = (uint8_t)(mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t)q - 1 + l) % 10); } if (q <= 9) { // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well. @@ -682,16 +719,16 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t } } else { uint32_t const q = log10Pow5(-e2); - e10 = (int32_t) q + e2; - int32_t const i = -e2 - (int32_t) q; - int32_t const k = pow5bits(i) - FLOAT_POW5_BITCOUNT; - int32_t j = (int32_t) q - k; - vr = mulPow5divPow2(mv, (uint32_t) i, j); - vp = mulPow5divPow2(mp, (uint32_t) i, j); - vm = mulPow5divPow2(mm, (uint32_t) i, j); + e10 = (int32_t)q + e2; + int32_t const i = -e2 - (int32_t)q; + int32_t const k = pow5bits(i) - FLOAT_POW5_BITCOUNT; + int32_t j = (int32_t)q - k; + vr = mulPow5divPow2(mv, (uint32_t)i, j); + vp = mulPow5divPow2(mp, (uint32_t)i, j); + vm = mulPow5divPow2(mm, (uint32_t)i, j); if (q != 0 && (vp - 1) / 10 <= vm / 10) { - j = (int32_t) q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); - lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, (uint32_t) (i + 1), j) % 10); + j = (int32_t)q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); + lastRemovedDigit = (uint8_t)(mulPow5divPow2(mv, (uint32_t)(i + 1), j) % 10); } if (q <= 1) { // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. @@ -704,7 +741,7 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t // mp = mv + 2, so it always has at least one trailing 0 bit. --vp; } - } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here. + } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here. 
vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1); } } @@ -717,7 +754,7 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t while (vp / 10 > vm / 10) { vmIsTrailingZeros &= vm % 10 == 0; vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t) (vr % 10); + lastRemovedDigit = (uint8_t)(vr % 10); vr /= 10; vp /= 10; vm /= 10; @@ -726,7 +763,7 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t if (vmIsTrailingZeros) { while (vm % 10 == 0) { vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t) (vr % 10); + lastRemovedDigit = (uint8_t)(vr % 10); vr /= 10; vp /= 10; vm /= 10; @@ -744,7 +781,7 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t // Loop iterations below (approximately): // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01% while (vp / 10 > vm / 10) { - lastRemovedDigit = (uint8_t) (vr % 10); + lastRemovedDigit = (uint8_t)(vr % 10); vr /= 10; vp /= 10; vm /= 10; @@ -761,45 +798,43 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t return fd; } -__device__ inline int to_chars(floating_decimal_64 const v, bool const sign, char* const result) { +__device__ inline int to_chars(floating_decimal_64 const v, bool const sign, char* const result) +{ // Step 5: Print the decimal representation. int index = 0; - if (sign) { - result[index++] = '-'; - } + if (sign) { result[index++] = '-'; } - uint64_t output = v.mantissa; - uint32_t const olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t) olength - 1; + uint64_t output = v.mantissa; + uint32_t const olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t)olength - 1; bool scientificNotation = (exp < -3) || (exp >= 7); - + // Values in the interval [1E-3, 1E7) are special. if (scientificNotation) { // Print in the format x.xxxxxE-yy. 
for (uint32_t i = 0; i < olength - 1; ++i) { - uint32_t const c = output % 10; output /= 10; - result[index + olength - i] = (char) ('0' + c); + uint32_t const c = output % 10; + output /= 10; + result[index + olength - i] = (char)('0' + c); } - result[index] = '0' + output % 10; + result[index] = '0' + output % 10; result[index + 1] = '.'; index += olength + 1; - if (olength == 1) { - result[index++] = '0'; - } + if (olength == 1) { result[index++] = '0'; } // Print 'E', the exponent sign, and the exponent, which has at most three digits. result[index++] = 'E'; if (exp < 0) { result[index++] = '-'; - exp = -exp; + exp = -exp; } if (exp >= 100) { - result[index++] = (char) ('0' + exp / 100); - exp %= 100; - result[index++] = (char) ('0' + exp / 10); - } else if (exp >= 10) { - result[index++] = (char) ('0' + exp / 10); - } - result[index++] = (char) ('0' + exp % 10); + result[index++] = (char)('0' + exp / 100); + exp %= 100; + result[index++] = (char)('0' + exp / 10); + } else if (exp >= 10) { + result[index++] = (char)('0' + exp / 10); + } + result[index++] = (char)('0' + exp % 10); } else { // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). if (exp < 0) { @@ -811,14 +846,14 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha } int current = index; for (int i = 0; i < olength; i++) { - result[current + olength - i - 1] = (char) ('0' + output % 10); + result[current + olength - i - 1] = (char)('0' + output % 10); output /= 10; index++; } } else if (exp + 1 >= olength) { // Decimal dot is after any of the digits. 
for (int i = 0; i < olength; i++) { - result[index + olength - i - 1] = (char) ('0' + output % 10); + result[index + olength - i - 1] = (char)('0' + output % 10); output /= 10; } index += olength; @@ -835,7 +870,7 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha result[current + olength - i - 1] = '.'; current--; } - result[current + olength - i - 1] = (char) ('0' + output % 10); + result[current + olength - i - 1] = (char)('0' + output % 10); output /= 10; } index += olength + 1; @@ -844,22 +879,19 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha return index; } -__device__ inline int d2s_size(floating_decimal_64 const v, bool const sign) { +__device__ inline int d2s_size(floating_decimal_64 const v, bool const sign) +{ int index = 0; - if (sign) { - index++; - } + if (sign) { index++; } - uint64_t output = v.mantissa; - uint32_t const olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t) olength - 1; + uint64_t output = v.mantissa; + uint32_t const olength = decimalLength17(output); + int32_t exp = v.exponent + (int32_t)olength - 1; bool scientificNotation = (exp < -3) || (exp >= 7); - + if (scientificNotation) { index += olength + 1; - if (olength == 1) { - index++; - } + if (olength == 1) { index++; } // 'E' index++; if (exp < 0) { @@ -886,41 +918,37 @@ __device__ inline int d2s_size(floating_decimal_64 const v, bool const sign) { return index; } -__device__ inline int to_chars(floating_decimal_32 const v, bool const sign, char* const result) { +__device__ inline int to_chars(floating_decimal_32 const v, bool const sign, char* const result) +{ // Step 5: Print the decimal representation. 
int index = 0; - if (sign) { - result[index++] = '-'; - } + if (sign) { result[index++] = '-'; } - uint32_t output = v.mantissa; - uint32_t const olength = decimalLength9(output); - int32_t exp = v.exponent + olength - 1; + uint32_t output = v.mantissa; + uint32_t const olength = decimalLength9(output); + int32_t exp = v.exponent + olength - 1; bool scientificNotation = (exp < -3) || (exp >= 7); if (scientificNotation) { // Print in the format x.xxxxxE-yy. for (int i = 0; i < olength - 1; i++) { - int c = output % 10; output /= 10; - result[index + olength - i] = (char) ('0' + c); + int c = output % 10; + output /= 10; + result[index + olength - i] = (char)('0' + c); } - result[index] = (char) ('0' + output % 10); + result[index] = (char)('0' + output % 10); result[index + 1] = '.'; index += olength + 1; - if (olength == 1) { - result[index++] = '0'; - } + if (olength == 1) { result[index++] = '0'; } // Print 'E', the exponent sign, and the exponent, which has at most two digits. result[index++] = 'E'; if (exp < 0) { result[index++] = '-'; - exp = -exp; - } - if (exp >= 10) { - result[index++] = (char) ('0' + exp / 10); + exp = -exp; } - result[index++] = (char) ('0' + exp % 10); + if (exp >= 10) { result[index++] = (char)('0' + exp / 10); } + result[index++] = (char)('0' + exp % 10); } else { // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). if (exp < 0) { @@ -932,14 +960,14 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha } int current = index; for (int i = 0; i < olength; i++) { - result[current + olength - i - 1] = (char) ('0' + output % 10); + result[current + olength - i - 1] = (char)('0' + output % 10); output /= 10; index++; } } else if (exp + 1 >= olength) { // Decimal dot is after any of the digits. 
for (int i = 0; i < olength; i++) { - result[index + olength - i - 1] = (char) ('0' + output % 10); + result[index + olength - i - 1] = (char)('0' + output % 10); output /= 10; } index += olength; @@ -956,7 +984,7 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha result[current + olength - i - 1] = '.'; current--; } - result[current + olength - i - 1] = (char) ('0' + output % 10); + result[current + olength - i - 1] = (char)('0' + output % 10); output /= 10; } index += olength + 1; @@ -965,32 +993,27 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha return index; } -__device__ inline int f2s_size(floating_decimal_32 const v, bool const sign) { +__device__ inline int f2s_size(floating_decimal_32 const v, bool const sign) +{ // Step 5: Print the decimal representation. int index = 0; - if (sign) { - index++; - } + if (sign) { index++; } - uint32_t output = v.mantissa; - uint32_t const olength = decimalLength9(output); - int32_t exp = v.exponent + olength - 1; + uint32_t output = v.mantissa; + uint32_t const olength = decimalLength9(output); + int32_t exp = v.exponent + olength - 1; bool scientificNotation = (exp < -3) || (exp >= 7); if (scientificNotation) { index += olength + 1; - if (olength == 1) { - index++; - } + if (olength == 1) { index++; } // 'E' index++; if (exp < 0) { index++; exp = -exp; } - if (exp >= 10) { - index++; - } + if (exp >= 10) { index++; } index++; } else { // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). 
@@ -1008,10 +1031,12 @@ __device__ inline int f2s_size(floating_decimal_32 const v, bool const sign) { return index; } -__device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, uint32_t const ieeeExponent, - floating_decimal_64* const v) { +__device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, + uint32_t const ieeeExponent, + floating_decimal_64* const v) +{ uint64_t const m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; - int32_t const e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; + int32_t const e2 = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; if (e2 > 0) { // f = m2 * 2^e2 >= 2^53 is an integer. @@ -1026,11 +1051,9 @@ __device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, uint32_t const // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53. // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0. - uint64_t const mask = (1ull << -e2) - 1; + uint64_t const mask = (1ull << -e2) - 1; uint64_t const fraction = m2 & mask; - if (fraction != 0) { - return false; - } + if (fraction != 0) { return false; } // f is an integer in the range [1, 2^53). // Note: mantissa might contain trailing (decimal) 0's. @@ -1040,16 +1063,19 @@ __device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, uint32_t const return true; } -__device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) { +__device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) +{ // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. uint64_t const bits = double_to_bits(f); // Decode bits into sign, mantissa, and exponent. 
- ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; + ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; uint64_t const ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1); - uint32_t const ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); + uint32_t const ieeeExponent = + (uint32_t)((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); // Case distinction; exit early for the easy cases. - if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { + if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || + (ieeeExponent == 0 && ieeeMantissa == 0)) { special = true; return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent}; } @@ -1063,10 +1089,8 @@ __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) { // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.) 
for (;;) { uint64_t const q = div10(v.mantissa); - uint32_t const r = ((uint32_t) v.mantissa) - 10 * ((uint32_t) q); - if (r != 0) { - break; - } + uint32_t const r = ((uint32_t)v.mantissa) - 10 * ((uint32_t)q); + if (r != 0) { break; } v.mantissa = q; ++v.exponent; } @@ -1076,26 +1100,27 @@ __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) { return v; } -__device__ int d2s_buffered_n(double f, char* result) { +__device__ int d2s_buffered_n(double f, char* result) +{ bool sign = false, special = false; floating_decimal_64 v = d2d(f, sign, special); - if (special) { - return copy_special_str(result, sign, v.exponent, v.mantissa); - } + if (special) { return copy_special_str(result, sign, v.exponent, v.mantissa); } return to_chars(v, sign, result); } -__device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) { +__device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) +{ // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. uint32_t const bits = float_to_bits(f); // Decode bits into sign, mantissa, and exponent. - ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0; + ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0; uint32_t const ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1); uint32_t const ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1); // Case distinction; exit early for the easy cases. 
- if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) { + if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || + (ieeeExponent == 0 && ieeeMantissa == 0)) { special = true; return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent}; } @@ -1103,54 +1128,52 @@ __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) { return f2d(ieeeMantissa, ieeeExponent); } -__device__ int f2s_buffered_n(float f, char* result) { +__device__ int f2s_buffered_n(float f, char* result) +{ bool sign = false, special = false; floating_decimal_32 v = f2d(f, sign, special); - if (special) { - return copy_special_str(result, sign, v.exponent, v.mantissa); - } + if (special) { return copy_special_str(result, sign, v.exponent, v.mantissa); } return to_chars(v, sign, result); } - //===== compute float to string size ===== -__device__ int compute_d2s_size(double value) { +__device__ int compute_d2s_size(double value) +{ bool sign = false, special = false; floating_decimal_64 v = d2d(value, sign, special); - if (special) { - return special_str_size(sign, v.exponent, v.mantissa); - } + if (special) { return special_str_size(sign, v.exponent, v.mantissa); } return d2s_size(v, sign); } -__device__ int compute_f2s_size(float value) { +__device__ int compute_f2s_size(float value) +{ bool sign = false, special = false; floating_decimal_32 v = f2d(value, sign, special); - if (special) { - return special_str_size(sign, v.exponent, v.mantissa); - } + if (special) { return special_str_size(sign, v.exponent, v.mantissa); } return f2s_size(v, sign); } -} // namespace +} // namespace //===== APIs ===== -__device__ int compute_ftos_size(double value, bool is_float) { +__device__ int compute_ftos_size(double value, bool is_float) +{ if (is_float) { - return compute_f2s_size(value); + return compute_f2s_size(value); } else { - return compute_d2s_size(value); + return compute_d2s_size(value); } } -__device__ int float_to_string(double 
value, bool is_float, char* output) { - if (is_float) { - return f2s_buffered_n(value, output); - } else { - return d2s_buffered_n(value, output); - } +__device__ int float_to_string(double value, bool is_float, char* output) +{ + if (is_float) { + return f2s_buffered_n(value, output); + } else { + return d2s_buffered_n(value, output); + } } -} // namespace spark-rapids-jni::ftos_converter +} // namespace spark_rapids_jni::ftos_converter diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp index 806b3eaad5..128b5fd592 100644 --- a/src/main/cpp/tests/cast_float_to_string.cpp +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -35,8 +34,16 @@ struct FloatToStringTests : public cudf::test::BaseFixture {}; TEST_F(FloatToStringTests, FromFloats32) { - auto const floats = cudf::test::fixed_width_column_wrapper { - 100.0f, 654321.25f, -12761.125f, 0.f, 5.0f, -4.0f, std::numeric_limits::quiet_NaN(), 123456789012.34f, -0.0f}; + auto const floats = + cudf::test::fixed_width_column_wrapper{100.0f, + 654321.25f, + -12761.125f, + 0.f, + 5.0f, + -4.0f, + std::numeric_limits::quiet_NaN(), + 123456789012.34f, + -0.0f}; auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream()); @@ -48,15 +55,32 @@ TEST_F(FloatToStringTests, FromFloats32) TEST_F(FloatToStringTests, FromFloats64) { - auto const floats = cudf::test::fixed_width_column_wrapper { - 100.0d, 654321.25d, -12761.125d, 1.123456789123456789d, 0.000000000000000000123456789123456789d, - 0.0d, 5.0d, -4.0d, std::numeric_limits::quiet_NaN(), 839542223232.794248339d, -0.0d}; + auto const floats = + cudf::test::fixed_width_column_wrapper{100.0d, + 654321.25d, + -12761.125d, + 1.123456789123456789d, + 0.000000000000000000123456789123456789d, + 0.0d, + 5.0d, + -4.0d, + std::numeric_limits::quiet_NaN(), + 839542223232.794248339d, + -0.0d}; auto results = 
spark_rapids_jni::float_to_string(floats, cudf::get_default_stream()); - auto const expected = cudf::test::strings_column_wrapper{ - "100.0", "654321.25", "-12761.125", "1.1234567891234568", "1.234567891234568E-19", - "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"}; + auto const expected = cudf::test::strings_column_wrapper{"100.0", + "654321.25", + "-12761.125", + "1.1234567891234568", + "1.234567891234568E-19", + "0.0", + "5.0", + "-4.0", + "NaN", + "8.395422232327942E11", + "-0.0"}; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); } \ No newline at end of file diff --git a/thirdparty/cudf b/thirdparty/cudf index c8074b5176..168533a8ad 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit c8074b5176a74630101c78c43c24b66141352b24 +Subproject commit 168533a8ad4086bd020be4f7bf9264a08b6d2243 From 388cb500f4ee08f02857b15bd2ff6c6799c66388 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 4 Dec 2023 18:05:59 +0800 Subject: [PATCH 41/54] Address comments Signed-off-by: Haoyang Li --- src/main/cpp/src/cast_float_to_string.cu | 7 ------- src/main/cpp/tests/cast_float_to_string.cpp | 2 -- thirdparty/cudf | 2 +- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu index 31d3f69d11..050aaf742f 100644 --- a/src/main/cpp/src/cast_float_to_string.cu +++ b/src/main/cpp/src/cast_float_to_string.cu @@ -18,16 +18,9 @@ #include "ftos_converter.cuh" #include -#include -#include #include #include -#include -#include #include -#include -#include -#include #include #include diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp index 128b5fd592..f9f8cd44a6 100644 --- a/src/main/cpp/tests/cast_float_to_string.cpp +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -18,8 +18,6 @@ #include #include -#include -#include #include diff --git a/thirdparty/cudf b/thirdparty/cudf index 
168533a8ad..c8074b5176 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 168533a8ad4086bd020be4f7bf9264a08b6d2243 +Subproject commit c8074b5176a74630101c78c43c24b66141352b24 From 54fa73cbed0840a19b0046ef0f670028cf8d1056 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Mon, 4 Dec 2023 18:12:21 +0800 Subject: [PATCH 42/54] Address comments Signed-off-by: Haoyang Li --- src/main/cpp/tests/cast_float_to_string.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp index f9f8cd44a6..ac2b2c0e24 100644 --- a/src/main/cpp/tests/cast_float_to_string.cpp +++ b/src/main/cpp/tests/cast_float_to_string.cpp @@ -19,8 +19,6 @@ #include #include -#include - #include #include From 3d19638d83336f438bb30e142f1ec4567f7c7802 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Thu, 7 Dec 2023 17:26:52 +0800 Subject: [PATCH 43/54] address comments Signed-off-by: Haoyang Li --- .../cpp/benchmarks/cast_string_to_float.cpp | 2 +- src/main/cpp/src/ftos_converter.cuh | 54 +++++++++---------- src/main/cpp/tests/cast_decimal_to_string.cpp | 2 +- src/main/cpp/tests/cast_string.cpp | 2 +- 4 files changed, 29 insertions(+), 31 deletions(-) diff --git a/src/main/cpp/benchmarks/cast_string_to_float.cpp b/src/main/cpp/benchmarks/cast_string_to_float.cpp index 32e245aa98..d94f9d26a0 100644 --- a/src/main/cpp/benchmarks/cast_string_to_float.cpp +++ b/src/main/cpp/benchmarks/cast_string_to_float.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "cast_string.hpp" +#include #include diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index 10b99b4d7e..df1772bc99 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -116,34 +116,32 @@ __constant__ uint32_t const POW5_OFFSETS[21] = { constexpr uint32_t POW5_TABLE_SIZE = 26; -__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = { - 1ull, - 5ull, - 25ull, - 125ull, - 625ull, - 3125ull, - 15625ull, - 78125ull, - 390625ull, - 1953125ull, - 9765625ull, - 48828125ull, - 244140625ull, - 1220703125ull, - 6103515625ull, - 30517578125ull, - 152587890625ull, - 762939453125ull, - 3814697265625ull, - 19073486328125ull, - 95367431640625ull, - 476837158203125ull, - 2384185791015625ull, - 11920928955078125ull, - 59604644775390625ull, - 298023223876953125ull //, 1490116119384765625ull -}; +__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {1ull, + 5ull, + 25ull, + 125ull, + 625ull, + 3125ull, + 15625ull, + 78125ull, + 390625ull, + 1953125ull, + 9765625ull, + 48828125ull, + 244140625ull, + 1220703125ull, + 6103515625ull, + 30517578125ull, + 152587890625ull, + 762939453125ull, + 3814697265625ull, + 19073486328125ull, + 95367431640625ull, + 476837158203125ull, + 2384185791015625ull, + 11920928955078125ull, + 59604644775390625ull, + 298023223876953125ull}; //===== common.h from ryu ===== diff --git a/src/main/cpp/tests/cast_decimal_to_string.cpp b/src/main/cpp/tests/cast_decimal_to_string.cpp index 05002c373c..ba1aaf05c8 100644 --- a/src/main/cpp/tests/cast_decimal_to_string.cpp +++ b/src/main/cpp/tests/cast_decimal_to_string.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "cast_string.hpp" +#include #include #include diff --git a/src/main/cpp/tests/cast_string.cpp b/src/main/cpp/tests/cast_string.cpp index 0a3f221894..1f7aaaad21 100644 --- a/src/main/cpp/tests/cast_string.cpp +++ b/src/main/cpp/tests/cast_string.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "cast_string.hpp" +#include #include #include From 8d02a3f6a23255580c2fc51483d78544e8fb6a86 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 8 Dec 2023 15:01:41 +0800 Subject: [PATCH 44/54] fix build after upmerge Signed-off-by: Haoyang Li --- src/main/cpp/src/ftos_converter.cuh | 1202 +-------------------------- 1 file changed, 12 insertions(+), 1190 deletions(-) diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index 09c58b88fb..fe71df924f 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -15,6 +15,8 @@ * limitations under the License. */ +#pragma once + #include #include #include @@ -1074,7 +1076,7 @@ __device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, return true; } -__device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) +__device__ inline floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) { // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. 
uint64_t const bits = double_to_bits(f); @@ -1111,7 +1113,7 @@ __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) return v; } -__device__ int d2s_buffered_n(double f, char* result) +__device__ inline int d2s_buffered_n(double f, char* result) { bool sign = false, special = false; floating_decimal_64 v = d2d(f, sign, special); @@ -1119,7 +1121,7 @@ __device__ int d2s_buffered_n(double f, char* result) return to_chars(v, sign, result); } -__device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) +__device__ inline floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) { // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. uint32_t const bits = float_to_bits(f); @@ -1139,7 +1141,7 @@ __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) return f2d(ieeeMantissa, ieeeExponent); } -__device__ int f2s_buffered_n(float f, char* result) +__device__ inline int f2s_buffered_n(float f, char* result) { bool sign = false, special = false; floating_decimal_32 v = f2d(f, sign, special); @@ -1149,7 +1151,7 @@ __device__ int f2s_buffered_n(float f, char* result) //===== compute float to string size ===== -__device__ int compute_d2s_size(double value) +__device__ inline int compute_d2s_size(double value) { bool sign = false, special = false; floating_decimal_64 v = d2d(value, sign, special); @@ -1157,7 +1159,7 @@ __device__ int compute_d2s_size(double value) return d2s_size(v, sign); } -__device__ int compute_f2s_size(float value) +__device__ inline int compute_f2s_size(float value) { bool sign = false, special = false; floating_decimal_32 v = f2d(value, sign, special); @@ -1169,7 +1171,7 @@ __device__ int compute_f2s_size(float value) //===== APIs ===== -__device__ int compute_ftos_size(double value, bool is_float) +__device__ inline int compute_ftos_size(double value, bool is_float) { if (is_float) { return compute_f2s_size(value); @@ -1178,7 +1180,7 @@ __device__ int 
compute_ftos_size(double value, bool is_float) } } -__device__ int float_to_string(double value, bool is_float, char* output) +__device__ inline int float_to_string(double value, bool is_float, char* output) { if (is_float) { return f2s_buffered_n(value, output); @@ -1502,7 +1504,7 @@ __device__ inline int format_float_size(floating_decimal_32 const v, bool const return index; } -__device__ int compute_format_float_size(double value, int digits, bool is_float) +__device__ inline int compute_format_float_size(double value, int digits, bool is_float) { bool sign = false, special = false; if (is_float) { @@ -1516,7 +1518,7 @@ __device__ int compute_format_float_size(double value, int digits, bool is_float } } -__device__ int format_float(double value, int digits, bool is_float, char* output) +__device__ inline int format_float(double value, int digits, bool is_float, char* output) { bool sign = false, special = false; if (is_float) { @@ -1531,1183 +1533,3 @@ __device__ int format_float(double value, int digits, bool is_float, char* outpu } } // namespace spark_rapids_jni::ftos_converter - -/* - * Copyright 2018 Ulf Adams - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -namespace spark_rapids_jni::ftos_converter { - -namespace { - -// d2s.c from ryu -// A floating decimal representing m * 10^e. 
-typedef struct floating_decimal_64 { - uint64_t mantissa; - // Decimal exponent's range is -324 to 308 - // inclusive, and can fit in a short if needed. - int32_t exponent; -} floating_decimal_64; - -// f2s.c from ryu -// A floating decimal representing m * 10^e. -typedef struct floating_decimal_32 { - uint32_t mantissa; - // Decimal exponent's range is -45 to 38 - // inclusive, and can fit in a short if needed. - int32_t exponent; -} floating_decimal_32; - -//===== constants from ryu ===== - -// These tables are generated by PrintDoubleLookupTable. -constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125; -constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125; -constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64); -constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64); -constexpr unsigned int DOUBLE_MANTISSA_BITS = 52; -constexpr unsigned int DOUBLE_EXPONENT_BITS = 11; -constexpr unsigned int DOUBLE_BIAS = 1023; -constexpr unsigned int FLOAT_MANTISSA_BITS = 23; -constexpr unsigned int FLOAT_EXPONENT_BITS = 8; -constexpr unsigned int FLOAT_BIAS = 127; - -__constant__ uint64_t const DOUBLE_POW5_INV_SPLIT2[15][2] = { - {1u, 2305843009213693952u}, - {5955668970331000884u, 1784059615882449851u}, - {8982663654677661702u, 1380349269358112757u}, - {7286864317269821294u, 2135987035920910082u}, - {7005857020398200553u, 1652639921975621497u}, - {17965325103354776697u, 1278668206209430417u}, - {8928596168509315048u, 1978643211784836272u}, - {10075671573058298858u, 1530901034580419511u}, - {597001226353042382u, 1184477304306571148u}, - {1527430471115325346u, 1832889850782397517u}, - {12533209867169019542u, 1418129833677084982u}, - {5577825024675947042u, 2194449627517475473u}, - {11006974540203867551u, 1697873161311732311u}, - {10313493231639821582u, 1313665730009899186u}, - {12701016819766672773u, 2032799256770390445u}}; - -__constant__ uint32_t const POW5_INV_OFFSETS[19] = {0x54544554, - 0x04055545, - 0x10041000, - 0x00400414, - 
0x40010000, - 0x41155555, - 0x00000454, - 0x00010044, - 0x40000000, - 0x44000041, - 0x50454450, - 0x55550054, - 0x51655554, - 0x40004000, - 0x01000001, - 0x00010500, - 0x51515411, - 0x05555554, - 0x00000000}; - -__constant__ uint64_t const DOUBLE_POW5_SPLIT2[13][2] = { - {0u, 1152921504606846976u}, - {0u, 1490116119384765625u}, - {1032610780636961552u, 1925929944387235853u}, - {7910200175544436838u, 1244603055572228341u}, - {16941905809032713930u, 1608611746708759036u}, - {13024893955298202172u, 2079081953128979843u}, - {6607496772837067824u, 1343575221513417750u}, - {17332926989895652603u, 1736530273035216783u}, - {13037379183483547984u, 2244412773384604712u}, - {1605989338741628675u, 1450417759929778918u}, - {9630225068416591280u, 1874621017369538693u}, - {665883850346957067u, 1211445438634777304u}, - {14931890668723713708u, 1565756531257009982u}}; - -__constant__ uint32_t const POW5_OFFSETS[21] = { - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995, 0x55545555, - 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540, 0x45555550, 0x40004000, - 0x96440440, 0x55565565, 0x54454045, 0x40154151, 0x55559155, 0x51405555, 0x00000105}; - -constexpr uint32_t POW5_TABLE_SIZE = 26; - -__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = { - 1ull, - 5ull, - 25ull, - 125ull, - 625ull, - 3125ull, - 15625ull, - 78125ull, - 390625ull, - 1953125ull, - 9765625ull, - 48828125ull, - 244140625ull, - 1220703125ull, - 6103515625ull, - 30517578125ull, - 152587890625ull, - 762939453125ull, - 3814697265625ull, - 19073486328125ull, - 95367431640625ull, - 476837158203125ull, - 2384185791015625ull, - 11920928955078125ull, - 59604644775390625ull, - 298023223876953125ull //, 1490116119384765625ull -}; - -//===== common.h from ryu ===== - -// Returns the number of decimal digits in v, which must not contain more than 9 digits. -__device__ inline uint32_t decimalLength9(uint32_t const v) -{ - // Function precondition: v is not a 10-digit number. 
- // (f2s: 9 digits are sufficient for round-tripping.) - // (d2fixed: We print 9-digit blocks.) - assert(v < 1000000000); - if (v >= 100000000) { return 9; } - if (v >= 10000000) { return 8; } - if (v >= 1000000) { return 7; } - if (v >= 100000) { return 6; } - if (v >= 10000) { return 5; } - if (v >= 1000) { return 4; } - if (v >= 100) { return 3; } - if (v >= 10) { return 2; } - return 1; -} - -// Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528. -__device__ inline int32_t pow5bits(int32_t const e) -{ - // This approximation works up to the point that the multiplication overflows at e = 3529. - // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater - // than 2^9297. - assert(e >= 0); - assert(e <= 3528); - return (int32_t)(((((uint32_t)e) * 1217359) >> 19) + 1); -} - -// Returns floor(log_10(2^e)); requires 0 <= e <= 1650. -__device__ inline uint32_t log10Pow2(int32_t const e) -{ - // The first value this approximation fails for is 2^1651 which is just greater than 10^297. - assert(e >= 0); - assert(e <= 1650); - return (((uint32_t)e) * 78913) >> 18; -} - -// Returns floor(log_10(5^e)); requires 0 <= e <= 2620. -__device__ inline uint32_t log10Pow5(int32_t const e) -{ - // The first value this approximation fails for is 5^2621 which is just greater than 10^1832. - assert(e >= 0); - assert(e <= 2620); - return (((uint32_t)e) * 732923) >> 20; -} - -__device__ inline uint32_t pow5factor_32(uint32_t value) -{ - uint32_t count = 0; - for (;;) { - assert(value != 0); - uint32_t const q = value / 5; - uint32_t const r = value % 5; - if (r != 0) { break; } - value = q; - ++count; - } - return count; -} - -// Returns true if value is divisible by 5^p. -__device__ inline bool multipleOfPowerOf5_32(uint32_t const value, uint32_t const p) -{ - return pow5factor_32(value) >= p; -} - -// Returns true if value is divisible by 2^p. 
-__device__ inline bool multipleOfPowerOf2_32(uint32_t const value, uint32_t const p) -{ - // __builtin_ctz doesn't appear to be faster here. - return (value & ((1u << p) - 1)) == 0; -} - -// It seems to be slightly faster to avoid uint128_t here, although the -// generated code for uint128_t looks slightly nicer. -__device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, int32_t const shift) -{ - assert(shift > 32); - - // The casts here help MSVC to avoid calls to the __allmul library - // function. - uint32_t const factorLo = (uint32_t)(factor); - uint32_t const factorHi = (uint32_t)(factor >> 32); - uint64_t const bits0 = (uint64_t)m * factorLo; - uint64_t const bits1 = (uint64_t)m * factorHi; - - uint64_t const sum = (bits0 >> 32) + bits1; - uint64_t const shiftedSum = sum >> (shift - 32); - assert(shiftedSum <= UINT32_MAX); - return (uint32_t)shiftedSum; -} - -__device__ inline int copy_special_str(char* const result, - bool const sign, - bool const exponent, - bool const mantissa) -{ - if (mantissa) { - memcpy(result, "NaN", 3); - return 3; - } - if (sign) { result[0] = '-'; } - if (exponent) { - memcpy(result + sign, "Infinity", 8); - return sign + 8; - } - memcpy(result + sign, "0.0", 3); - return sign + 3; -} - -__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa) -{ - if (mantissa) { return 3; } - if (exponent) { return sign + 8; } - return sign + 3; -} - -__device__ inline uint32_t float_to_bits(float const f) -{ - uint32_t bits = 0; - memcpy(&bits, &f, sizeof(float)); - return bits; -} - -__device__ inline uint64_t double_to_bits(double const d) -{ - uint64_t bits = 0; - memcpy(&bits, &d, sizeof(double)); - return bits; -} - -//===== d2s_intrinsics.h from ryu ===== - -__device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t* const productHi) -{ - // The casts here help MSVC to avoid calls to the __allmul library function. 
- uint32_t const aLo = (uint32_t)a; - uint32_t const aHi = (uint32_t)(a >> 32); - uint32_t const bLo = (uint32_t)b; - uint32_t const bHi = (uint32_t)(b >> 32); - - uint64_t const b00 = (uint64_t)aLo * bLo; - uint64_t const b01 = (uint64_t)aLo * bHi; - uint64_t const b10 = (uint64_t)aHi * bLo; - uint64_t const b11 = (uint64_t)aHi * bHi; - - uint32_t const b00Lo = (uint32_t)b00; - uint32_t const b00Hi = (uint32_t)(b00 >> 32); - - uint64_t const mid1 = b10 + b00Hi; - uint32_t const mid1Lo = (uint32_t)(mid1); - uint32_t const mid1Hi = (uint32_t)(mid1 >> 32); - - uint64_t const mid2 = b01 + mid1Lo; - uint32_t const mid2Lo = (uint32_t)(mid2); - uint32_t const mid2Hi = (uint32_t)(mid2 >> 32); - - uint64_t const pHi = b11 + mid1Hi + mid2Hi; - uint64_t const pLo = ((uint64_t)mid2Lo << 32) | b00Lo; - - *productHi = pHi; - return pLo; -} - -__device__ inline uint64_t shiftright128(uint64_t const lo, uint64_t const hi, uint32_t const dist) -{ - // We don't need to handle the case dist >= 64 here (see above). - assert(dist < 64); - assert(dist > 0); - return (hi << (64 - dist)) | (lo >> dist); -} - -__device__ inline uint64_t div5(uint64_t const x) { return x / 5; } - -__device__ inline uint64_t div10(uint64_t const x) { return x / 10; } - -__device__ inline uint64_t div100(uint64_t const x) { return x / 100; } - -__device__ inline uint32_t pow5Factor(uint64_t value) -{ - uint64_t const m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64) - uint64_t const n_div_5 = 3689348814741910323u; // #{ n | n = 0 (mod 2^64) } = 2^64 / 5 - uint32_t count = 0; - for (;;) { - assert(value != 0); - value *= m_inv_5; - if (value > n_div_5) break; - ++count; - } - return count; -} - -// Returns true if value is divisible by 5^p. -__device__ inline bool multipleOfPowerOf5(uint64_t const value, uint32_t const p) -{ - // I tried a case distinction on p, but there was no performance difference. - return pow5Factor(value) >= p; -} - -// Returns true if value is divisible by 2^p. 
-__device__ inline bool multipleOfPowerOf2(uint64_t const value, uint32_t const p) -{ - assert(value != 0); - assert(p < 64); - // __builtin_ctzll doesn't appear to be faster here. - return (value & ((1ull << p) - 1)) == 0; -} - -__device__ inline uint64_t mulShift64(uint64_t const m, uint64_t const* const mul, int32_t const j) -{ - // m is maximum 55 bits - uint64_t high1; // 128 - uint64_t const low1 = umul128(m, mul[1], &high1); // 64 - uint64_t high0; // 64 - umul128(m, mul[0], &high0); // 0 - uint64_t const sum = high0 + low1; - if (sum < high0) { - ++high1; // overflow into high1 - } - return shiftright128(sum, high1, j - 64); -} - -__device__ inline uint64_t mulShiftAll64(uint64_t const m, - uint64_t const* const mul, - int32_t const j, - uint64_t* const vp, - uint64_t* const vm, - uint32_t const mmShift) -{ - *vp = mulShift64(4 * m + 2, mul, j); - *vm = mulShift64(4 * m - 1 - mmShift, mul, j); - return mulShift64(4 * m, mul, j); -} - -//===== d2s_small_table.h from ryu ===== - -// Computes 5^i in the form required by Ryu, and stores it in the given pointer. 
-__device__ inline void double_computePow5(uint32_t const i, uint64_t* const result) -{ - uint32_t const base = i / POW5_TABLE_SIZE; - uint32_t const base2 = base * POW5_TABLE_SIZE; - uint32_t const offset = i - base2; - uint64_t const* const mul = DOUBLE_POW5_SPLIT2[base]; - if (offset == 0) { - result[0] = mul[0]; - result[1] = mul[1]; - return; - } - uint64_t const m = DOUBLE_POW5_TABLE[offset]; - uint64_t high1; - uint64_t const low1 = umul128(m, mul[1], &high1); - uint64_t high0; - uint64_t const low0 = umul128(m, mul[0], &high0); - uint64_t const sum = high0 + low1; - if (sum < high0) { - ++high1; // overflow into high1 - } - // high1 | sum | low0 - uint32_t const delta = pow5bits(i) - pow5bits(base2); - result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); - result[1] = shiftright128(sum, high1, delta); -} - -// Computes 5^-i in the form required by Ryu, and stores it in the given pointer. -__device__ inline void double_computeInvPow5(uint32_t const i, uint64_t* const result) -{ - uint32_t const base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE; - uint32_t const base2 = base * POW5_TABLE_SIZE; - uint32_t const offset = base2 - i; - uint64_t const* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2 - if (offset == 0) { - result[0] = mul[0]; - result[1] = mul[1]; - return; - } - uint64_t const m = DOUBLE_POW5_TABLE[offset]; - uint64_t high1; - uint64_t const low1 = umul128(m, mul[1], &high1); - uint64_t high0; - uint64_t const low0 = umul128(m, mul[0] - 1, &high0); - uint64_t const sum = high0 + low1; - if (sum < high0) { - ++high1; // overflow into high1 - } - // high1 | sum | low0 - uint32_t const delta = pow5bits(base2) - pow5bits(i); - result[0] = - shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); - result[1] = shiftright128(sum, high1, delta); -} - -//===== f2s_intrinsics.h from ryu ===== - -__device__ inline uint32_t mulPow5InvDivPow2(uint32_t const m, uint32_t const 
q, int32_t const j) -{ - // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double - // lookup table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely - // on the fact that the added 1 that's already stored in the table never overflows into the upper - // 64 bits. - uint64_t pow5[2]; - double_computeInvPow5(q, pow5); - return mulShift32(m, pow5[1] + 1, j); -} - -__device__ inline uint32_t mulPow5divPow2(uint32_t const m, uint32_t const i, int32_t const j) -{ - uint64_t pow5[2]; - double_computePow5(i, pow5); - return mulShift32(m, pow5[1], j); -} - -//===== d2s.c and f2s.c from ryu ===== - -__device__ inline uint32_t decimalLength17(uint64_t const v) -{ - // This is slightly faster than a loop. - // The average output length is 16.38 digits, so we check high-to-low. - // Function precondition: v is not an 18, 19, or 20-digit number. - // (17 digits are sufficient for round-tripping.) - assert(v < 100000000000000000L); - if (v >= 10000000000000000L) { return 17; } - if (v >= 1000000000000000L) { return 16; } - if (v >= 100000000000000L) { return 15; } - if (v >= 10000000000000L) { return 14; } - if (v >= 1000000000000L) { return 13; } - if (v >= 100000000000L) { return 12; } - if (v >= 10000000000L) { return 11; } - if (v >= 1000000000L) { return 10; } - if (v >= 100000000L) { return 9; } - if (v >= 10000000L) { return 8; } - if (v >= 1000000L) { return 7; } - if (v >= 100000L) { return 6; } - if (v >= 10000L) { return 5; } - if (v >= 1000L) { return 4; } - if (v >= 100L) { return 3; } - if (v >= 10L) { return 2; } - return 1; -} - -__device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t const ieeeExponent) -{ - int32_t e2; - uint64_t m2; - if (ieeeExponent == 0) { - // We subtract 2 so that the bounds computation has 2 additional bits. 
- e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; - m2 = ieeeMantissa; - } else { - e2 = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2; - m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; - } - bool const even = (m2 & 1) == 0; - bool const acceptBounds = even; - - // Step 2: Determine the interval of valid decimal representations. - uint64_t const mv = 4 * m2; - // Implicit bool -> int conversion. True is 1, false is 0. - uint32_t const mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; - // We would compute mp and mm like this: - // uint64_t mp = 4 * m2 + 2; - // uint64_t mm = mv - 1 - mmShift; - - // Step 3: Convert to a decimal power base using 128-bit arithmetic. - uint64_t vr, vp, vm; - int32_t e10; - bool vmIsTrailingZeros = false; - bool vrIsTrailingZeros = false; - if (e2 >= 0) { - // I tried special-casing q == 0, but there was no effect on performance. - // This expression is slightly faster than max(0, log10Pow2(e2) - 1). - uint32_t const q = log10Pow2(e2) - (e2 > 3); - e10 = (int32_t)q; - int32_t const k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1; - int32_t const i = -e2 + (int32_t)q + k; - uint64_t pow5[2]; - double_computeInvPow5(q, pow5); - vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift); - - if (q <= 21) { - // This should use q <= 22, but I think 21 is also safe. Smaller values - // may still be safe, but it's more difficult to reason about them. - // Only one of mp, mv, and mm can be a multiple of 5, if any. - uint32_t const mvMod5 = ((uint32_t)mv) - 5 * ((uint32_t)div5(mv)); - if (mvMod5 == 0) { - vrIsTrailingZeros = multipleOfPowerOf5(mv, q); - } else if (acceptBounds) { - // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q - // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q - // <=> true && pow5Factor(mm) >= q, since e2 >= q. - vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q); - } else { - // Same as min(e2 + 1, pow5Factor(mp)) >= q. 
- vp -= multipleOfPowerOf5(mv + 2, q); - } - } - } else { - // This expression is slightly faster than max(0, log10Pow5(-e2) - 1). - uint32_t const q = log10Pow5(-e2) - (-e2 > 1); - e10 = (int32_t)q + e2; - int32_t const i = -e2 - (int32_t)q; - int32_t const k = pow5bits(i) - DOUBLE_POW5_BITCOUNT; - int32_t const j = (int32_t)q - k; - - uint64_t pow5[2]; - double_computePow5(i, pow5); - vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift); - - if (q <= 1) { - // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. - // mv = 4 * m2, so it always has at least two trailing 0 bits. - vrIsTrailingZeros = true; - if (acceptBounds) { - // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. - vmIsTrailingZeros = mmShift == 1; - } else { - // mp = mv + 2, so it always has at least one trailing 0 bit. - --vp; - } - } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here. - // We want to know if the full product has at least q trailing zeros. - // We need to compute min(p2(mv), p5(mv) - e2) >= q - // <=> p2(mv) >= q && p5(mv) - e2 >= q - // <=> p2(mv) >= q (because -e2 >= q) - vrIsTrailingZeros = multipleOfPowerOf2(mv, q); - } - } - - // Step 4: Find the shortest decimal representation in the interval of valid representations. - int32_t removed = 0; - uint8_t lastRemovedDigit = 0; - uint64_t output; - // On average, we remove ~2 digits. - if (vmIsTrailingZeros || vrIsTrailingZeros) { - // General case, which happens rarely (~0.7%). 
- for (;;) { - uint64_t const vpDiv10 = div10(vp); - uint64_t const vmDiv10 = div10(vm); - if (vpDiv10 <= vmDiv10) { break; } - uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10); - uint64_t const vrDiv10 = div10(vr); - uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); - vmIsTrailingZeros &= vmMod10 == 0; - vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t)vrMod10; - vr = vrDiv10; - vp = vpDiv10; - vm = vmDiv10; - ++removed; - } - - if (vmIsTrailingZeros) { - for (;;) { - uint64_t const vmDiv10 = div10(vm); - uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10); - if (vmMod10 != 0) { break; } - uint64_t const vpDiv10 = div10(vp); - uint64_t const vrDiv10 = div10(vr); - uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); - vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t)vrMod10; - vr = vrDiv10; - vp = vpDiv10; - vm = vmDiv10; - ++removed; - } - } - - if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { - // Round even if the exact number is .....50..0. - lastRemovedDigit = 4; - } - // We need to take vr + 1 if vr is outside bounds or we need to round up. - output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); - } else { - // Specialized for the common case (~99.3%). Percentages below are relative to this. - bool roundUp = false; - uint64_t const vpDiv100 = div100(vp); - uint64_t const vmDiv100 = div100(vm); - if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%). 
- uint64_t const vrDiv100 = div100(vr); - uint32_t const vrMod100 = ((uint32_t)vr) - 100 * ((uint32_t)vrDiv100); - roundUp = vrMod100 >= 50; - vr = vrDiv100; - vp = vpDiv100; - vm = vmDiv100; - removed += 2; - } - // Loop iterations below (approximately), without optimization above: - // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02% - // Loop iterations below (approximately), with optimization above: - // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02% - for (;;) { - uint64_t const vpDiv10 = div10(vp); - uint64_t const vmDiv10 = div10(vm); - if (vpDiv10 <= vmDiv10) { break; } - uint64_t const vrDiv10 = div10(vr); - uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10); - roundUp = vrMod10 >= 5; - vr = vrDiv10; - vp = vpDiv10; - vm = vmDiv10; - ++removed; - } - - // We need to take vr + 1 if vr is outside bounds or we need to round up. - output = vr + (vr == vm || roundUp); - } - int32_t const exp = e10 + removed; - - floating_decimal_64 fd; - fd.exponent = exp; - fd.mantissa = output; - return fd; -} - -__device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t const ieeeExponent) -{ - int32_t e2; - uint32_t m2; - if (ieeeExponent == 0) { - // We subtract 2 so that the bounds computation has 2 additional bits. - e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; - m2 = ieeeMantissa; - } else { - e2 = (int32_t)ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2; - m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa; - } - bool const even = (m2 & 1) == 0; - bool const acceptBounds = even; - - // Step 2: Determine the interval of valid decimal representations. - uint32_t const mv = 4 * m2; - uint32_t const mp = 4 * m2 + 2; - // Implicit bool -> int conversion. True is 1, false is 0. - uint32_t const mmShift = ieeeMantissa != 0 || ieeeExponent <= 1; - uint32_t const mm = 4 * m2 - 1 - mmShift; - - // Step 3: Convert to a decimal power base using 64-bit arithmetic. 
- uint32_t vr, vp, vm; - int32_t e10; - bool vmIsTrailingZeros = false; - bool vrIsTrailingZeros = false; - uint8_t lastRemovedDigit = 0; - if (e2 >= 0) { - uint32_t const q = log10Pow2(e2); - e10 = (int32_t)q; - int32_t const k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1; - int32_t const i = -e2 + (int32_t)q + k; - vr = mulPow5InvDivPow2(mv, q, i); - vp = mulPow5InvDivPow2(mp, q, i); - vm = mulPow5InvDivPow2(mm, q, i); - if (q != 0 && (vp - 1) / 10 <= vm / 10) { - // We need to know one removed digit even if we are not going to loop below. We could use - // q = X - 1 above, except that would require 33 bits for the result, and we've found that - // 32-bit arithmetic is faster even on 64-bit machines. - int32_t const l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)(q - 1)) - 1; - lastRemovedDigit = (uint8_t)(mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t)q - 1 + l) % 10); - } - if (q <= 9) { - // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well. - // Only one of mp, mv, and mm can be a multiple of 5, if any. - if (mv % 5 == 0) { - vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q); - } else if (acceptBounds) { - vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q); - } else { - vp -= multipleOfPowerOf5_32(mp, q); - } - } - } else { - uint32_t const q = log10Pow5(-e2); - e10 = (int32_t)q + e2; - int32_t const i = -e2 - (int32_t)q; - int32_t const k = pow5bits(i) - FLOAT_POW5_BITCOUNT; - int32_t j = (int32_t)q - k; - vr = mulPow5divPow2(mv, (uint32_t)i, j); - vp = mulPow5divPow2(mp, (uint32_t)i, j); - vm = mulPow5divPow2(mm, (uint32_t)i, j); - if (q != 0 && (vp - 1) / 10 <= vm / 10) { - j = (int32_t)q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT); - lastRemovedDigit = (uint8_t)(mulPow5divPow2(mv, (uint32_t)(i + 1), j) % 10); - } - if (q <= 1) { - // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits. - // mv = 4 * m2, so it always has at least two trailing 0 bits. 
- vrIsTrailingZeros = true; - if (acceptBounds) { - // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1. - vmIsTrailingZeros = mmShift == 1; - } else { - // mp = mv + 2, so it always has at least one trailing 0 bit. - --vp; - } - } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here. - vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1); - } - } - - // Step 4: Find the shortest decimal representation in the interval of valid representations. - int32_t removed = 0; - uint32_t output; - if (vmIsTrailingZeros || vrIsTrailingZeros) { - // General case, which happens rarely (~4.0%). - while (vp / 10 > vm / 10) { - vmIsTrailingZeros &= vm % 10 == 0; - vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t)(vr % 10); - vr /= 10; - vp /= 10; - vm /= 10; - ++removed; - } - if (vmIsTrailingZeros) { - while (vm % 10 == 0) { - vrIsTrailingZeros &= lastRemovedDigit == 0; - lastRemovedDigit = (uint8_t)(vr % 10); - vr /= 10; - vp /= 10; - vm /= 10; - ++removed; - } - } - if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) { - // Round even if the exact number is .....50..0. - lastRemovedDigit = 4; - } - // We need to take vr + 1 if vr is outside bounds or we need to round up. - output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5); - } else { - // Specialized for the common case (~96.0%). Percentages below are relative to this. - // Loop iterations below (approximately): - // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01% - while (vp / 10 > vm / 10) { - lastRemovedDigit = (uint8_t)(vr % 10); - vr /= 10; - vp /= 10; - vm /= 10; - ++removed; - } - // We need to take vr + 1 if vr is outside bounds or we need to round up. 
- output = vr + (vr == vm || lastRemovedDigit >= 5); - } - int32_t const exp = e10 + removed; - - floating_decimal_32 fd; - fd.exponent = exp; - fd.mantissa = output; - return fd; -} - -__device__ inline int to_chars(floating_decimal_64 const v, bool const sign, char* const result) -{ - // Step 5: Print the decimal representation. - int index = 0; - if (sign) { result[index++] = '-'; } - - uint64_t output = v.mantissa; - uint32_t const olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t)olength - 1; - bool scientificNotation = (exp < -3) || (exp >= 7); - - // Values in the interval [1E-3, 1E7) are special. - if (scientificNotation) { - // Print in the format x.xxxxxE-yy. - for (uint32_t i = 0; i < olength - 1; ++i) { - uint32_t const c = output % 10; - output /= 10; - result[index + olength - i] = (char)('0' + c); - } - result[index] = '0' + output % 10; - result[index + 1] = '.'; - index += olength + 1; - if (olength == 1) { result[index++] = '0'; } - // Print 'E', the exponent sign, and the exponent, which has at most three digits. - result[index++] = 'E'; - if (exp < 0) { - result[index++] = '-'; - exp = -exp; - } - if (exp >= 100) { - result[index++] = (char)('0' + exp / 100); - exp %= 100; - result[index++] = (char)('0' + exp / 10); - } else if (exp >= 10) { - result[index++] = (char)('0' + exp / 10); - } - result[index++] = (char)('0' + exp % 10); - } else { - // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). - if (exp < 0) { - // Decimal dot is before any of the digits. - result[index++] = '0'; - result[index++] = '.'; - for (int i = -1; i > exp; i--) { - result[index++] = '0'; - } - int current = index; - for (int i = 0; i < olength; i++) { - result[current + olength - i - 1] = (char)('0' + output % 10); - output /= 10; - index++; - } - } else if (exp + 1 >= olength) { - // Decimal dot is after any of the digits. 
- for (int i = 0; i < olength; i++) { - result[index + olength - i - 1] = (char)('0' + output % 10); - output /= 10; - } - index += olength; - for (int i = olength; i < exp + 1; i++) { - result[index++] = '0'; - } - result[index++] = '.'; - result[index++] = '0'; - } else { - // Decimal dot is somewhere between the digits. - int current = index + 1; - for (int i = 0; i < olength; i++) { - if (olength - i - 1 == exp) { - result[current + olength - i - 1] = '.'; - current--; - } - result[current + olength - i - 1] = (char)('0' + output % 10); - output /= 10; - } - index += olength + 1; - } - } - return index; -} - -__device__ inline int d2s_size(floating_decimal_64 const v, bool const sign) -{ - int index = 0; - if (sign) { index++; } - - uint64_t output = v.mantissa; - uint32_t const olength = decimalLength17(output); - int32_t exp = v.exponent + (int32_t)olength - 1; - bool scientificNotation = (exp < -3) || (exp >= 7); - - if (scientificNotation) { - index += olength + 1; - if (olength == 1) { index++; } - // 'E' - index++; - if (exp < 0) { - exp = -exp; - index++; - } - if (exp >= 100) { - index += 3; - } else if (exp >= 10) { - index += 2; - } else { - index++; - } - } else { - // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). - if (exp < 0) { - index += 1 - exp + olength; - } else if (exp + 1 >= olength) { - index += exp + 3; - } else { - index += olength + 1; - } - } - return index; -} - -__device__ inline int to_chars(floating_decimal_32 const v, bool const sign, char* const result) -{ - // Step 5: Print the decimal representation. - int index = 0; - if (sign) { result[index++] = '-'; } - - uint32_t output = v.mantissa; - uint32_t const olength = decimalLength9(output); - int32_t exp = v.exponent + olength - 1; - bool scientificNotation = (exp < -3) || (exp >= 7); - - if (scientificNotation) { - // Print in the format x.xxxxxE-yy. 
- for (int i = 0; i < olength - 1; i++) { - int c = output % 10; - output /= 10; - result[index + olength - i] = (char)('0' + c); - } - result[index] = (char)('0' + output % 10); - result[index + 1] = '.'; - index += olength + 1; - if (olength == 1) { result[index++] = '0'; } - - // Print 'E', the exponent sign, and the exponent, which has at most two digits. - result[index++] = 'E'; - if (exp < 0) { - result[index++] = '-'; - exp = -exp; - } - if (exp >= 10) { result[index++] = (char)('0' + exp / 10); } - result[index++] = (char)('0' + exp % 10); - } else { - // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). - if (exp < 0) { - // Decimal dot is before any of the digits. - result[index++] = '0'; - result[index++] = '.'; - for (int i = -1; i > exp; i--) { - result[index++] = '0'; - } - int current = index; - for (int i = 0; i < olength; i++) { - result[current + olength - i - 1] = (char)('0' + output % 10); - output /= 10; - index++; - } - } else if (exp + 1 >= olength) { - // Decimal dot is after any of the digits. - for (int i = 0; i < olength; i++) { - result[index + olength - i - 1] = (char)('0' + output % 10); - output /= 10; - } - index += olength; - for (int i = olength; i < exp + 1; i++) { - result[index++] = '0'; - } - result[index++] = '.'; - result[index++] = '0'; - } else { - // Decimal dot is somewhere between the digits. - int current = index + 1; - for (int i = 0; i < olength; i++) { - if (olength - i - 1 == exp) { - result[current + olength - i - 1] = '.'; - current--; - } - result[current + olength - i - 1] = (char)('0' + output % 10); - output /= 10; - } - index += olength + 1; - } - } - return index; -} - -__device__ inline int f2s_size(floating_decimal_32 const v, bool const sign) -{ - // Step 5: Print the decimal representation. 
- int index = 0; - if (sign) { index++; } - - uint32_t output = v.mantissa; - uint32_t const olength = decimalLength9(output); - int32_t exp = v.exponent + olength - 1; - bool scientificNotation = (exp < -3) || (exp >= 7); - - if (scientificNotation) { - index += olength + 1; - if (olength == 1) { index++; } - // 'E' - index++; - if (exp < 0) { - index++; - exp = -exp; - } - if (exp >= 10) { index++; } - index++; - } else { - // Otherwise follow the Java spec for values in the interval [1E-3, 1E7). - if (exp < 0) { - // Decimal dot is before any of the digits. - index += 1 - exp + olength; - } else if (exp + 1 >= olength) { - // Decimal dot is after any of the digits. - index += exp + 3; - } else { - // Decimal dot is somewhere between the digits. - index += olength + 1; - } - } - return index; -} - -__device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, - uint32_t const ieeeExponent, - floating_decimal_64* const v) -{ - uint64_t const m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa; - int32_t const e2 = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS; - - if (e2 > 0) { - // f = m2 * 2^e2 >= 2^53 is an integer. - // Ignore this case for now. - return false; - } - - if (e2 < -52) { - // f < 1. - return false; - } - - // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53. - // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0. - uint64_t const mask = (1ull << -e2) - 1; - uint64_t const fraction = m2 & mask; - if (fraction != 0) { return false; } - - // f is an integer in the range [1, 2^53). - // Note: mantissa might contain trailing (decimal) 0's. - // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17(). - v->mantissa = m2 >> -e2; - v->exponent = 0; - return true; -} - -__device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) -{ - // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. 
- uint64_t const bits = double_to_bits(f); - - // Decode bits into sign, mantissa, and exponent. - ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0; - uint64_t const ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1); - uint32_t const ieeeExponent = - (uint32_t)((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1)); - // Case distinction; exit early for the easy cases. - if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || - (ieeeExponent == 0 && ieeeMantissa == 0)) { - special = true; - return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent}; - } - special = false; - floating_decimal_64 v; - bool const isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v); - if (isSmallInt) { - // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros. - // For scientific notation we need to move these zeros into the exponent. - // (This is not needed for fixed-point notation, so it might be beneficial to trim - // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.) - for (;;) { - uint64_t const q = div10(v.mantissa); - uint32_t const r = ((uint32_t)v.mantissa) - 10 * ((uint32_t)q); - if (r != 0) { break; } - v.mantissa = q; - ++v.exponent; - } - } else { - v = d2d(ieeeMantissa, ieeeExponent); - } - return v; -} - -__device__ int d2s_buffered_n(double f, char* result) -{ - bool sign = false, special = false; - floating_decimal_64 v = d2d(f, sign, special); - if (special) { return copy_special_str(result, sign, v.exponent, v.mantissa); } - return to_chars(v, sign, result); -} - -__device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) -{ - // Step 1: Decode the floating-point number, and unify normalized and subnormal cases. - uint32_t const bits = float_to_bits(f); - - // Decode bits into sign, mantissa, and exponent. 
- ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0; - uint32_t const ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1); - uint32_t const ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1); - - // Case distinction; exit early for the easy cases. - if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || - (ieeeExponent == 0 && ieeeMantissa == 0)) { - special = true; - return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent}; - } - special = false; - return f2d(ieeeMantissa, ieeeExponent); -} - -__device__ int f2s_buffered_n(float f, char* result) -{ - bool sign = false, special = false; - floating_decimal_32 v = f2d(f, sign, special); - if (special) { return copy_special_str(result, sign, v.exponent, v.mantissa); } - return to_chars(v, sign, result); -} - -//===== compute float to string size ===== - -__device__ int compute_d2s_size(double value) -{ - bool sign = false, special = false; - floating_decimal_64 v = d2d(value, sign, special); - if (special) { return special_str_size(sign, v.exponent, v.mantissa); } - return d2s_size(v, sign); -} - -__device__ int compute_f2s_size(float value) -{ - bool sign = false, special = false; - floating_decimal_32 v = f2d(value, sign, special); - if (special) { return special_str_size(sign, v.exponent, v.mantissa); } - return f2s_size(v, sign); -} - -} // namespace - -//===== APIs ===== - -__device__ int compute_ftos_size(double value, bool is_float) -{ - if (is_float) { - return compute_f2s_size(value); - } else { - return compute_d2s_size(value); - } -} - -__device__ int float_to_string(double value, bool is_float, char* output) -{ - if (is_float) { - return f2s_buffered_n(value, output); - } else { - return d2s_buffered_n(value, output); - } -} - -} // namespace spark_rapids_jni::ftos_converter From 62ff4f7dce335b644e4536aeb2f5482193474ecf Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Tue, 12 Dec 2023 13:24:56 +0800 Subject: [PATCH 45/54] move inf/nan 
replacement to kernel Signed-off-by: Haoyang Li --- src/main/cpp/src/ftos_converter.cuh | 69 ++++++++++++++++++++--------- src/main/cpp/tests/format_float.cpp | 4 +- 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index fe71df924f..e684f73921 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -243,8 +243,7 @@ __device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, i __device__ inline int copy_special_str(char* const result, bool const sign, bool const exponent, - bool const mantissa, - int const digits = 1) + bool const mantissa) { if (mantissa) { memcpy(result, "NaN", 3); @@ -255,27 +254,15 @@ __device__ inline int copy_special_str(char* const result, memcpy(result + sign, "Infinity", 8); return sign + 8; } - result[sign] = '0'; - if (digits == 0) { - return sign + 1; - } else { - result[sign + 1] = '.'; - } - for (int i = 0; i < digits; i++) { - result[sign + 2 + i] = '0'; - } - return sign + 2 + digits; + memcpy(result + sign, "0.0", 3); + return sign + 3; } -__device__ inline int special_str_size(bool const sign, - bool const exponent, - bool const mantissa, - int const digits = 1) +__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa) { if (mantissa) { return 3; } if (exponent) { return sign + 8; } - if (digits == 0) { return sign + 1; } - return sign + 2 + digits; + return sign + 3; } __device__ inline uint32_t float_to_bits(float const f) @@ -1504,16 +1491,54 @@ __device__ inline int format_float_size(floating_decimal_32 const v, bool const return index; } +__device__ inline int copy_format_special_str(char* const result, + bool const sign, + bool const exponent, + bool const mantissa, + int const digits = 1) +{ + if (mantissa) { + memcpy(result, "\xEF\xBF\xBD", 3); // U+FFFD, replacement character, NaN + return 3; + } + if (sign) { result[0] = '-'; } + if 
(exponent) { + memcpy(result + sign, "\xE2\x88\x9E", 3); // U+221E, infinity symbol + return sign + 3; + } + result[sign] = '0'; + if (digits == 0) { + return sign + 1; + } else { + result[sign + 1] = '.'; + } + for (int i = 0; i < digits; i++) { + result[sign + 2 + i] = '0'; + } + return sign + 2 + digits; +} + +__device__ inline int special_format_str_size(bool const sign, + bool const exponent, + bool const mantissa, + int const digits = 1) +{ + if (mantissa) { return 3; } + if (exponent) { return sign + 3; } + if (digits == 0) { return sign + 1; } + return sign + 2 + digits; +} + __device__ inline int compute_format_float_size(double value, int digits, bool is_float) { bool sign = false, special = false; if (is_float) { floating_decimal_32 v = f2d(value, sign, special); - if (special) { return special_str_size(sign, v.exponent, v.mantissa, digits); } + if (special) { return special_format_str_size(sign, v.exponent, v.mantissa, digits); } return format_float_size(v, sign, digits); } else { floating_decimal_64 v = d2d(value, sign, special); - if (special) { return special_str_size(sign, v.exponent, v.mantissa, digits); } + if (special) { return special_format_str_size(sign, v.exponent, v.mantissa, digits); } return format_float_size(v, sign, digits); } } @@ -1523,11 +1548,11 @@ __device__ inline int format_float(double value, int digits, bool is_float, char bool sign = false, special = false; if (is_float) { floating_decimal_32 v = f2d(value, sign, special); - if (special) { return copy_special_str(output, sign, v.exponent, v.mantissa, digits); } + if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); } return to_formated_chars(v, sign, output, digits); } else { floating_decimal_64 v = d2d(value, sign, special); - if (special) { return copy_special_str(output, sign, v.exponent, v.mantissa, digits); } + if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); } return to_formated_chars(v, 
sign, output, digits); } } diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp index 5d4e79eebf..c8e5d84260 100644 --- a/src/main/cpp/tests/format_float.cpp +++ b/src/main/cpp/tests/format_float.cpp @@ -48,7 +48,7 @@ TEST_F(FormatFloatTests, FormatFloats32) "0.00000", "5.00000", "-4.00000", - "NaN", + "\xEF\xBF\xBD", "123,456,790,000.00000", "-0.00000"}; @@ -80,7 +80,7 @@ TEST_F(FormatFloatTests, FormatFloats64) "0.00000", "5.00000", "-4.00000", - "NaN", + "\xEF\xBF\xBD", "839,542,223,232.79420", "-0.00000"}; From 10bfe094fd526fec275c229b28cb4daf3615c64d Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Wed, 13 Dec 2023 16:09:32 +0800 Subject: [PATCH 46/54] Apply suggestions from code review Co-authored-by: Mike Wilson Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- src/main/cpp/src/cast_string.hpp | 4 ++-- src/main/cpp/src/format_float.cu | 9 +++++---- .../java/com/nvidia/spark/rapids/jni/CastStrings.java | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp index 84df3f71b1..43ec36e576 100644 --- a/src/main/cpp/src/cast_string.hpp +++ b/src/main/cpp/src/cast_string.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -117,7 +117,7 @@ std::unique_ptr string_to_float( std::unique_ptr format_float( cudf::column_view const& input, - int digits, + int const digits, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu index cdcb75fc9a..b9bbb26cb2 100644 --- a/src/main/cpp/src/format_float.cu +++ b/src/main/cpp/src/format_float.cu @@ -23,6 +23,7 @@ #include #include + #include #include @@ -75,7 +76,7 @@ struct format_float_fn { struct dispatch_format_float_fn { template )> std::unique_ptr operator()(cudf::column_view const& floats, - int digits, + int const digits, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { @@ -97,7 +98,7 @@ struct dispatch_format_float_fn { // non-float types throw an exception template )> std::unique_ptr operator()(cudf::column_view const&, - int, + int const, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const { @@ -109,7 +110,7 @@ struct dispatch_format_float_fn { // This will convert all float column types into a strings column. std::unique_ptr format_float(cudf::column_view const& floats, - int digits, + int const digits, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -120,7 +121,7 @@ std::unique_ptr format_float(cudf::column_view const& floats, // external API std::unique_ptr format_float(cudf::column_view const& floats, - int digits, + int const digits, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index 93c3c0f21a..cd6f62371b 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From e264ba90d4d86a12f12a0777a5cd14b2183c60e0 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Wed, 13 Dec 2023 11:49:28 +0330 Subject: [PATCH 47/54] address comments Signed-off-by: Haoyang Li --- src/main/cpp/src/CastStringJni.cpp | 1 - src/main/cpp/src/format_float.cu | 11 +++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp index 2e73d0c4ab..b7d898a0c8 100644 --- a/src/main/cpp/src/CastStringJni.cpp +++ b/src/main/cpp/src/CastStringJni.cpp @@ -1,5 +1,4 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu index b9bbb26cb2..f0310c9336 100644 --- a/src/main/cpp/src/format_float.cu +++ b/src/main/cpp/src/format_float.cu @@ -23,7 +23,6 @@ #include #include - #include #include @@ -35,23 +34,23 @@ namespace { template struct format_float_fn { cudf::column_device_view d_floats; - int const digits; + int digits; cudf::size_type* d_offsets; char* d_chars; - __device__ cudf::size_type compute_output_size(FloatType value, int digits) const + __device__ cudf::size_type compute_output_size(FloatType value, int digits_) const { bool constexpr is_float = std::is_same_v; return static_cast( - ftos_converter::compute_format_float_size(static_cast(value), digits, is_float)); + ftos_converter::compute_format_float_size(static_cast(value), digits_, is_float)); } - __device__ void format_float(cudf::size_type idx, int digits) const + __device__ void format_float(cudf::size_type idx, int digits_) const { auto const value = d_floats.element(idx); bool constexpr is_float = std::is_same_v; auto const output = d_chars + d_offsets[idx]; - 
ftos_converter::format_float(static_cast(value), digits, is_float, output); + ftos_converter::format_float(static_cast(value), digits_, is_float, output); } __device__ void operator()(cudf::size_type idx) const From eab61eb08714564612b4460345136a59ad2e97f6 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 15 Dec 2023 09:01:47 +0800 Subject: [PATCH 48/54] Apply suggestions from code review Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- src/main/cpp/src/format_float.cu | 4 ++-- src/main/cpp/tests/format_float.cpp | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu index f0310c9336..e1f3dd9662 100644 --- a/src/main/cpp/src/format_float.cu +++ b/src/main/cpp/src/format_float.cu @@ -38,14 +38,14 @@ struct format_float_fn { cudf::size_type* d_offsets; char* d_chars; - __device__ cudf::size_type compute_output_size(FloatType value, int digits_) const + __device__ cudf::size_type compute_output_size(FloatType value) const { bool constexpr is_float = std::is_same_v; return static_cast( ftos_converter::compute_format_float_size(static_cast(value), digits_, is_float)); } - __device__ void format_float(cudf::size_type idx, int digits_) const + __device__ void format_float(cudf::size_type idx) const { auto const value = d_floats.element(idx); bool constexpr is_float = std::is_same_v; diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp index c8e5d84260..9aab3d566c 100644 --- a/src/main/cpp/tests/format_float.cpp +++ b/src/main/cpp/tests/format_float.cpp @@ -19,7 +19,6 @@ #include #include -#include #include From 9892cae00ecec99c8fe1afcc689a3c661f252a1a Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 15 Dec 2023 09:24:32 +0800 Subject: [PATCH 49/54] address comments Signed-off-by: Haoyang Li --- src/main/cpp/src/format_float.cu | 8 ++++---- .../java/com/nvidia/spark/rapids/jni/CastStrings.java | 2 +- thirdparty/cudf | 2 +- 3 
files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu index e1f3dd9662..d4b8ca8f16 100644 --- a/src/main/cpp/src/format_float.cu +++ b/src/main/cpp/src/format_float.cu @@ -42,7 +42,7 @@ struct format_float_fn { { bool constexpr is_float = std::is_same_v; return static_cast( - ftos_converter::compute_format_float_size(static_cast(value), digits_, is_float)); + ftos_converter::compute_format_float_size(static_cast(value), digits, is_float)); } __device__ void format_float(cudf::size_type idx) const @@ -50,7 +50,7 @@ struct format_float_fn { auto const value = d_floats.element(idx); bool constexpr is_float = std::is_same_v; auto const output = d_chars + d_offsets[idx]; - ftos_converter::format_float(static_cast(value), digits_, is_float, output); + ftos_converter::format_float(static_cast(value), digits, is_float, output); } __device__ void operator()(cudf::size_type idx) const @@ -60,9 +60,9 @@ struct format_float_fn { return; } if (d_chars != nullptr) { - format_float(idx, digits); + format_float(idx); } else { - d_offsets[idx] = compute_output_size(d_floats.element(idx), digits); + d_offsets[idx] = compute_output_size(d_floats.element(idx)); } } }; diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java index cd6f62371b..2b2267f034 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java @@ -84,8 +84,8 @@ public static ColumnVector toDecimal(ColumnView cv, boolean ansiMode, boolean st * Convert a float column to a formatted string column. 
* * @param cv the column data to process - * @return the converted column * @param digits the number of digits to display after the decimal point + * @return the converted column */ public static ColumnVector fromFloatWithFormat(ColumnView cv, int digits) { return new ColumnVector(fromFloatWithFormat(cv.getNativeView(), digits)); diff --git a/thirdparty/cudf b/thirdparty/cudf index 248aa2c887..420dc5d787 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 248aa2c8873c12e41f1e6ea2660740a0a4ddaf68 +Subproject commit 420dc5d787d4571c00266364f1a253e5ccffb094 From 8bf5b1c6477f6cd1b72e4affe7ab48b7336e1ec4 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 15 Dec 2023 15:57:15 +0800 Subject: [PATCH 50/54] cudf Signed-off-by: Haoyang Li --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 420dc5d787..cee642916c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 420dc5d787d4571c00266364f1a253e5ccffb094 +Subproject commit cee642916cfc3b8df73e819bf3bc50f1b9fc684f From 81ba4a0a56925954ae91715870cc4f5a8b7d0fff Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 15 Dec 2023 15:58:29 +0800 Subject: [PATCH 51/54] cudf Signed-off-by: Haoyang Li --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index cee642916c..2cb8f3da3a 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit cee642916cfc3b8df73e819bf3bc50f1b9fc684f +Subproject commit 2cb8f3da3a3dd539301f90dcfbccadaf06963fd2 From efb27369475333d0facd4a4aa10fef12b6c4e47c Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 15 Dec 2023 16:01:43 +0800 Subject: [PATCH 52/54] format Signed-off-by: Haoyang Li --- src/main/cpp/tests/format_float.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp index 9aab3d566c..b9d77593db 100644 --- 
a/src/main/cpp/tests/format_float.cpp +++ b/src/main/cpp/tests/format_float.cpp @@ -19,7 +19,6 @@ #include #include - #include using namespace cudf; From 0505d71745e6e8cf56dd7befc89b506016fd2406 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Fri, 15 Dec 2023 23:24:08 +0800 Subject: [PATCH 53/54] cudf reset Signed-off-by: Haoyang Li --- thirdparty/cudf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 2cb8f3da3a..248aa2c887 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 2cb8f3da3a3dd539301f90dcfbccadaf06963fd2 +Subproject commit 248aa2c8873c12e41f1e6ea2660740a0a4ddaf68 From 20415e7c739eec9f7d5ef261e039363fb162aff2 Mon Sep 17 00:00:00 2001 From: Haoyang Li Date: Sat, 16 Dec 2023 05:10:04 +0800 Subject: [PATCH 54/54] Apply suggestions from code review Co-authored-by: Mike Wilson --- src/main/cpp/src/format_float.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu index d4b8ca8f16..d9ecbe8206 100644 --- a/src/main/cpp/src/format_float.cu +++ b/src/main/cpp/src/format_float.cu @@ -38,14 +38,14 @@ struct format_float_fn { cudf::size_type* d_offsets; char* d_chars; - __device__ cudf::size_type compute_output_size(FloatType value) const + __device__ cudf::size_type compute_output_size(FloatType const value) const { bool constexpr is_float = std::is_same_v; return static_cast( ftos_converter::compute_format_float_size(static_cast(value), digits, is_float)); } - __device__ void format_float(cudf::size_type idx) const + __device__ void format_float(cudf::size_type const idx) const { auto const value = d_floats.element(idx); bool constexpr is_float = std::is_same_v; @@ -53,7 +53,7 @@ struct format_float_fn { ftos_converter::format_float(static_cast(value), digits, is_float, output); } - __device__ void operator()(cudf::size_type idx) const + __device__ void operator()(cudf::size_type const idx) 
const { if (d_floats.is_null(idx)) { if (d_chars == nullptr) { d_offsets[idx] = 0; }