From 0e7485cd9dce54338b9ff22678d430470dfe742c Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Fri, 13 Oct 2023 14:03:58 +0800
Subject: [PATCH 01/54] wip

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/CMakeLists.txt                   |   1 +
 src/main/cpp/src/cast_float_to_string.cu      | 360 ++++++++++++++++++
 src/main/cpp/src/cast_string.hpp              |   5 +
 src/main/cpp/tests/CMakeLists.txt             |   3 +
 src/main/cpp/tests/cast_float_to_string.cpp   |  90 +++++
 .../nvidia/spark/rapids/jni/CastStrings.java  |   7 +-
 6 files changed, 463 insertions(+), 3 deletions(-)
 create mode 100644 src/main/cpp/src/cast_float_to_string.cu
 create mode 100644 src/main/cpp/tests/cast_float_to_string.cpp

diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index 600e6ac245..6f5c3d2239 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -161,6 +161,7 @@ add_library(
   src/ZOrderJni.cpp
   src/bloom_filter.cu
   src/cast_decimal_to_string.cu
+  src/cast_float_to_string.cu
   src/cast_string.cu
   src/cast_string_to_float.cu
   src/datetime_rebase.cu
diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
new file mode 100644
index 0000000000..fed57f5e91
--- /dev/null
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cast_string.hpp"
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/strings/detail/convert/int_to_string.cuh>
+#include <cudf/strings/detail/converters.hpp>
+#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/execution_policy.h>
+#include <thrust/for_each.h>
+#include <thrust/generate.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/optional.h>
+#include <thrust/transform.h>
+
+#include <cuda/std/climits>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+
+using namespace cudf;
+
+namespace spark_rapids_jni {
+
+namespace detail {
+namespace {
+
+struct float_to_string_fn {
+  // significant digits is independent of scientific notation range
+  // digits more than this may require using long values instead of ints
+  static constexpr unsigned int significant_digits = 10;
+  // maximum power-of-10 that will fit in 32-bits
+  static constexpr unsigned int nine_digits = 1000000000;  // 1x10^9
+  // Range of numbers here is for normalizing the value.
+  // If the value is above or below the following limits, the output is converted to
+  // scientific notation in order to show (at most) the number of significant digits.
+  static constexpr double upper_limit = 1000000000;  // max is 1x10^9
+  static constexpr double lower_limit = 0.0001;      // printf uses scientific notation below this
+  // Tables for doing normalization: converting to exponent form
+  // IEEE double float has maximum exponent of 305 so these should cover everything
+  double const upper10[9]  = {10, 100, 10000, 1e8, 1e16, 1e32, 1e64, 1e128, 1e256};
+  double const lower10[9]  = {.1, .01, .0001, 1e-8, 1e-16, 1e-32, 1e-64, 1e-128, 1e-256};
+  double const blower10[9] = {1.0, .1, .001, 1e-7, 1e-15, 1e-31, 1e-63, 1e-127, 1e-255};
+
+  // utility for quickly converting known integer range to character array
+  __device__ char* int2str(int value, char* output)
+  {
+    if (value == 0) {
+      *output++ = '0';
+      return output;
+    }
+    char buffer[significant_digits];  // should be big-enough for significant digits
+    char* ptr = buffer;
+    while (value > 0) {
+      *ptr++ = (char)('0' + (value % 10));
+      value /= 10;
+    }
+    while (ptr != buffer)
+      *output++ = *--ptr;  // 54321 -> 12345
+    return output;
+  }
+
+  /**
+   * @brief Dissect a float value into integer, decimal, and exponent components.
+   *
+   * @return The number of decimal places.
+   */
+  __device__ int dissect_value(double value,
+                               unsigned int& integer,
+                               unsigned int& decimal,
+                               int& exp10)
+  {
+    int decimal_places = significant_digits - 1;
+    // normalize step puts value between lower-limit and upper-limit
+    // by adjusting the exponent up or down
+    exp10 = 0;
+    if (value > upper_limit) {
+      int fx = 256;
+      for (int idx = 8; idx >= 0; --idx) {
+        if (value >= upper10[idx]) {
+          value *= lower10[idx];
+          exp10 += fx;
+        }
+        fx = fx >> 1;
+      }
+    } else if ((value > 0.0) && (value < lower_limit)) {
+      int fx = 256;
+      for (int idx = 8; idx >= 0; --idx) {
+        if (value < blower10[idx]) {
+          value *= upper10[idx];
+          exp10 -= fx;
+        }
+        fx = fx >> 1;
+      }
+    }
+    //
+    unsigned int max_digits = nine_digits;
+    integer                 = (unsigned int)value;
+    for (unsigned int i = integer; i >= 10; i /= 10) {
+      --decimal_places;
+      max_digits /= 10;
+    }
+    double remainder = (value - (double)integer) * (double)max_digits;
+    decimal          = (unsigned int)remainder;
+    remainder -= (double)decimal;
+    decimal += (unsigned int)(2.0 * remainder);
+    if (decimal >= max_digits) {
+      decimal = 0;
+      ++integer;
+      if (exp10 && (integer >= 10)) {
+        ++exp10;
+        integer = 1;
+      }
+    }
+    //
+    while ((decimal % 10) == 0 && (decimal_places > 0)) {
+      decimal /= 10;
+      --decimal_places;
+    }
+    return decimal_places;
+  }
+
+  /**
+   * @brief Main kernel method for converting float value to char output array.
+   *
+   * Output need not be more than (significant_digits + 7) bytes:
+   * 7 = 1 sign, 1 decimal point, 1 exponent ('E'), 1 exponent-sign, 3 digits for exponent
+   *
+   * @param value Float value to convert.
+   * @param output Memory to write output characters.
+   * @return Number of bytes written.
+   */
+  __device__ int float_to_string(double value, char* output)
+  {
+    // check for valid value
+    if (std::isnan(value)) {
+      memcpy(output, "NaN", 3);
+      return 3;
+    }
+    bool bneg = false;
+    if (signbit(value)) {  // handles -0.0 too
+      value = -value;
+      bneg  = true;
+    }
+    if (std::isinf(value)) {
+      if (bneg)
+        memcpy(output, "-Inf", 4);
+      else
+        memcpy(output, "Inf", 3);
+      return bneg ? 4 : 3;
+    }
+
+    // dissect value into components
+    unsigned int integer = 0, decimal = 0;
+    int exp10          = 0;
+    int decimal_places = dissect_value(value, integer, decimal, exp10);
+    //
+    // now build the string from the
+    // components: sign, integer, decimal, exp10, decimal_places
+    //
+    // sign
+    char* ptr = output;
+    if (bneg) *ptr++ = '-';
+    // integer
+    ptr = int2str(integer, ptr);
+    // decimal
+    *ptr++ = '.';
+    if (decimal_places) {
+      char buffer[10];
+      char* pb = buffer;
+      while (decimal_places--) {
+        *pb++ = (char)('0' + (decimal % 10));
+        decimal /= 10;
+      }
+      while (pb != buffer)  // reverses the digits
+        *ptr++ = *--pb;     // e.g. 54321 -> 12345
+    } else
+      *ptr++ = '0';  // always include at least .0
+    // exponent
+    if (exp10) {
+      *ptr++ = 'E';
+      if (exp10 < 0) {
+        *ptr++ = '-';
+        exp10  = -exp10;
+      } else
+        *ptr++ = '+';
+      if (exp10 < 10) *ptr++ = '0';  // extra zero-pad
+      ptr = int2str(exp10, ptr);
+    }
+    // done
+    return (int)(ptr - output);  // number of bytes written
+  }
+
+  /**
+   * @brief Compute how many bytes are needed to hold the output string.
+   *
+   * @param value Float value to convert.
+   * @return Number of bytes required.
+   */
+  __device__ int compute_ftos_size(double value)
+  {
+    if (std::isnan(value)) return 3;  // NaN
+    bool bneg = false;
+    if (signbit(value)) {  // handles -0.0 too
+      value = -value;
+      bneg  = true;
+    }
+    if (std::isinf(value)) return 3 + (int)bneg;  // Inf
+
+    // dissect float into parts
+    unsigned int integer = 0, decimal = 0;
+    int exp10          = 0;
+    int decimal_places = dissect_value(value, integer, decimal, exp10);
+    // now count up the components
+    // sign
+    int count = (int)bneg;
+    // integer
+    count += (int)(integer == 0);
+    while (integer > 0) {
+      integer /= 10;
+      ++count;
+    }  // log10(integer)
+    // decimal
+    ++count;  // decimal point
+    if (decimal_places)
+      count += decimal_places;
+    else
+      ++count;  // always include .0
+    // exponent
+    if (exp10) {
+      count += 2;  // 'e±'
+      if (exp10 < 0) exp10 = -exp10;
+      count += (int)(exp10 < 10);  // padding
+      while (exp10 > 0) {
+        exp10 /= 10;
+        ++count;
+      }  // log10(exp10)
+    }
+    return count;
+  }
+};
+
+template <typename FloatType>
+struct from_floats_fn {
+  column_device_view d_floats;
+  size_type* d_offsets;
+  char* d_chars;
+
+  __device__ size_type compute_output_size(FloatType value)
+  {
+    float_to_string_fn fts;
+    return static_cast<size_type>(fts.compute_ftos_size(static_cast<double>(value)));
+  }
+
+  __device__ void float_to_string(size_type idx)
+  {
+    FloatType value = d_floats.element<FloatType>(idx);
+    float_to_string_fn fts;
+    fts.float_to_string(static_cast<double>(value), d_chars + d_offsets[idx]);
+  }
+
+  __device__ void operator()(size_type idx)
+  {
+    if (d_floats.is_null(idx)) {
+      if (d_chars == nullptr) { d_offsets[idx] = 0; }
+      return;
+    }
+    if (d_chars != nullptr) {
+      float_to_string(idx);
+    } else {
+      d_offsets[idx] = compute_output_size(d_floats.element<FloatType>(idx));
+    }
+  }
+};
+
+/**
+ * @brief This dispatch method is for converting floats into strings.
+ *
+ * The template function declaration ensures only float types are allowed.
+ */
+struct dispatch_from_floats_fn {
+  template <typename FloatType, std::enable_if_t<std::is_floating_point_v<FloatType>>* = nullptr>
+  std::unique_ptr<column> operator()(column_view const& floats,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr) const
+  {
+    size_type strings_count = floats.size();
+    auto column             = column_device_view::create(floats, stream);
+    auto d_column           = *column;
+
+    // copy the null mask
+    rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr);
+
+    auto [offsets, chars] =
+      cudf::strings::detail::make_strings_children(from_floats_fn<FloatType>{d_column}, strings_count, stream, mr);
+
+    return make_strings_column(strings_count,
+                               std::move(offsets),
+                               std::move(chars),
+                               floats.null_count(),
+                               std::move(null_mask));
+  }
+
+  // non-float types throw an exception
+  template <typename T, std::enable_if_t<not std::is_floating_point_v<T>>* = nullptr>
+  std::unique_ptr<column> operator()(column_view const&,
+                                     rmm::cuda_stream_view,
+                                     rmm::mr::device_memory_resource*) const
+  {
+    CUDF_FAIL("Values for from_floats function must be a float type.");
+  }
+};
+
+}  // namespace
+
+// This will convert all float column types into a strings column.
+std::unique_ptr<column> from_floats(column_view const& floats,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr)
+{
+  size_type strings_count = floats.size();
+  if (strings_count == 0) return make_empty_column(type_id::STRING);
+
+  return type_dispatcher(floats.type(), dispatch_from_floats_fn{}, floats, stream, mr);
+}
+
+}  // namespace detail
+
+// external API
+std::unique_ptr<column> from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::from_floats(floats, cudf::get_default_stream(), mr);
+}
+
+}  // namespace spark_rapids_jni
\ No newline at end of file
diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp
index df74407355..fc2270ca8c 100644
--- a/src/main/cpp/src/cast_string.hpp
+++ b/src/main/cpp/src/cast_string.hpp
@@ -115,6 +115,11 @@ std::unique_ptr<cudf::column> string_to_float(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+std::unique_ptr<cudf::column> float_to_string(
+  cudf::column_view const& input,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 std::unique_ptr<cudf::column> decimal_to_non_ansi_string(
   cudf::column_view const& input,
   rmm::cuda_stream_view stream,
diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt
index 5b95291351..b33c3955af 100644
--- a/src/main/cpp/tests/CMakeLists.txt
+++ b/src/main/cpp/tests/CMakeLists.txt
@@ -51,6 +51,9 @@ ConfigureTest(CAST_STRING
 ConfigureTest(CAST_DECIMAL_TO_STRING
     cast_decimal_to_string.cpp)
 
+ConfigureTest(CAST_FLOAT_TO_STRING
+    cast_float_to_string.cpp)
+
 ConfigureTest(DATETIME_REBASE
     datetime_rebase.cpp)
 
diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp
new file mode 100644
index 0000000000..ae342087d0
--- /dev/null
+++ b/src/main/cpp/tests/cast_float_to_string.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cast_string.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/table_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/strings/convert/convert_floats.hpp>
+
+#include <limits>
+#include <rmm/device_uvector.hpp>
+
+using namespace cudf;
+
+struct FloatToStringTests : public cudf::test::BaseFixture {};
+
+TEST_F(StringsConvertTest, FromFloats32)
+{
+  std::vector<float> h_floats{100,
+                              654321.25,
+                              -12761.125,
+                              0,
+                              5,
+                              -4,
+                              std::numeric_limits<float>::quiet_NaN(),
+                              839542223232.79,
+                              -0.0};
+  std::vector<char const*> h_expected{
+    "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.395422433e+11", "-0.0"};
+
+  cudf::test::fixed_width_column_wrapper<float> floats(
+    h_floats.begin(),
+    h_floats.end(),
+    thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
+
+  auto results = cudf::strings::from_floats(floats);
+
+  cudf::test::strings_column_wrapper expected(
+    h_expected.begin(),
+    h_expected.end(),
+    thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
+}
+
+TEST_F(StringsConvertTest, FromFloats64)
+{
+  std::vector<double> h_floats{100,
+                               654321.25,
+                               -12761.125,
+                               0,
+                               5,
+                               -4,
+                               std::numeric_limits<double>::quiet_NaN(),
+                               839542223232.794248339,
+                               -0.0};
+  std::vector<char const*> h_expected{
+    "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.395422232e+11", "-0.0"};
+
+  cudf::test::fixed_width_column_wrapper<double> floats(
+    h_floats.begin(),
+    h_floats.end(),
+    thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
+
+  auto results = cudf::strings::from_floats(floats);
+
+  cudf::test::strings_column_wrapper expected(
+    h_expected.begin(),
+    h_expected.end(),
+    thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
+}
\ No newline at end of file
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index eab42c41f6..7a31b0241b 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -81,13 +81,13 @@ public static ColumnVector toDecimal(ColumnView cv, boolean ansiMode, boolean st
   }
 
   /**
-   * Convert a decimal column to a string column.
+   * Convert a float column to a string column.
    *
    * @param cv the column data to process
    * @return the converted column
    */
-  public static ColumnVector fromDecimal(ColumnView cv) {
-    return new ColumnVector(fromDecimal(cv.getNativeView()));
+  public static ColumnVector fromFloat(ColumnView cv) {
+    return new ColumnVector(fromFloat(cv.getNativeView()));
   }
 
   /**
@@ -137,6 +137,7 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled
       int precision, int scale);
   private static native long toFloat(long nativeColumnView, boolean ansi_enabled, int dtype);
   private static native long fromDecimal(long nativeColumnView);
+  private static native long fromFloat(long nativeColumnView);
   private static native long toIntegersWithBase(long nativeColumnView, int base,
     boolean ansiEnabled, int dtype);
   private static native long fromIntegersWithBase(long nativeColumnView, int base);

From 2c04fff68fd1c2ca51f1dd74616326615905b625 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Wed, 18 Oct 2023 09:55:20 +0800
Subject: [PATCH 02/54] wip

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/CastStringJni.cpp          | 15 +++++
 src/main/cpp/src/cast_float_to_string.cu    | 69 ++++++++++++---------
 src/main/cpp/tests/cast_float_to_string.cpp | 17 +++--
 3 files changed, 65 insertions(+), 36 deletions(-)

diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index d09bc33e4c..ff8ee2afd4 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -109,6 +109,21 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toFloat(
   CATCH_CAST_EXCEPTION(env, 0);
 }
 
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloat(
+  JNIEnv* env, jclass, jlong input_column, jint j_dtype)
+{
+  JNI_NULL_CHECK(env, input_column, "input column is null", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+
+    cudf::column_view cv{*reinterpret_cast<cudf::column_view const*>(input_column)};
+    return cudf::jni::release_as_jlong(
+      spark_rapids_jni::float_to_string(cv, cudf::get_default_stream()));
+  }
+  CATCH_CAST_EXCEPTION(env, 0);
+}
+
 JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromDecimal(JNIEnv* env,
                                                                                  jclass,
                                                                                  jlong input_column)
diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index fed57f5e91..3560c375e3 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -49,12 +49,14 @@ namespace spark_rapids_jni {
 namespace detail {
 namespace {
 
-struct float_to_string_fn {
+struct ftos_converter {
   // significant digits is independent of scientific notation range
   // digits more than this may require using long values instead of ints
-  static constexpr unsigned int significant_digits = 10;
+  static constexpr unsigned int significant_digits = 17;
   // maximum power-of-10 that will fit in 32-bits
-  static constexpr unsigned int nine_digits = 1000000000;  // 1x10^9
+  // static constexpr unsigned long long nine_digits = 1000000000;  // 1x10^9
+  static constexpr unsigned long long fifteen_digits = 1000000000000000;
+  static constexpr unsigned long long sixteen_digits = 10000000000000000;
   // Range of numbers here is for normalizing the value.
   // If the value is above or below the following limits, the output is converted to
   // scientific notation in order to show (at most) the number of significant digits.
@@ -91,10 +93,9 @@ struct float_to_string_fn {
    */
   __device__ int dissect_value(double value,
                                unsigned int& integer,
-                               unsigned int& decimal,
+                               unsigned long long& decimal,
                                int& exp10)
   {
-    int decimal_places = significant_digits - 1;
     // normalize step puts value between lower-limit and upper-limit
     // by adjusting the exponent up or down
     exp10 = 0;
@@ -118,16 +119,18 @@ struct float_to_string_fn {
       }
     }
     //
-    unsigned int max_digits = nine_digits;
+    int decimal_places = significant_digits - (exp10? 2 : 1);
+    unsigned long long max_digits = (exp10? fifteen_digits : sixteen_digits);
     integer                 = (unsigned int)value;
     for (unsigned int i = integer; i >= 10; i /= 10) {
       --decimal_places;
       max_digits /= 10;
     }
-    double remainder = (value - (double)integer) * (double)max_digits;
-    decimal          = (unsigned int)remainder;
+    double diff = value - (double)integer;
+    double remainder = diff * (double)max_digits;
+    decimal          = (unsigned long long)remainder;
     remainder -= (double)decimal;
-    decimal += (unsigned int)(2.0 * remainder);
+    decimal += (unsigned long long)(2.0 * remainder); // round up
     if (decimal >= max_digits) {
       decimal = 0;
       ++integer;
@@ -168,14 +171,15 @@ struct float_to_string_fn {
     }
     if (std::isinf(value)) {
       if (bneg)
-        memcpy(output, "-Inf", 4);
+        memcpy(output, "-Infinity", 9);
       else
-        memcpy(output, "Inf", 3);
-      return bneg ? 4 : 3;
+        memcpy(output, "Infinity", 8);
+      return bneg ? 9 : 8;
     }
 
     // dissect value into components
-    unsigned int integer = 0, decimal = 0;
+    unsigned int integer = 0;
+    unsigned long long decimal = 0;
     int exp10          = 0;
     int decimal_places = dissect_value(value, integer, decimal, exp10);
     //
@@ -190,7 +194,7 @@ struct float_to_string_fn {
     // decimal
     *ptr++ = '.';
     if (decimal_places) {
-      char buffer[10];
+      char buffer[17];
       char* pb = buffer;
       while (decimal_places--) {
         *pb++ = (char)('0' + (decimal % 10));
@@ -206,9 +210,8 @@ struct float_to_string_fn {
       if (exp10 < 0) {
         *ptr++ = '-';
         exp10  = -exp10;
-      } else
-        *ptr++ = '+';
-      if (exp10 < 10) *ptr++ = '0';  // extra zero-pad
+      }
+      // if (exp10 < 10) *ptr++ = '0';  // extra zero-pad
       ptr = int2str(exp10, ptr);
     }
     // done
@@ -232,7 +235,8 @@ struct float_to_string_fn {
     if (std::isinf(value)) return 3 + (int)bneg;  // Inf
 
     // dissect float into parts
-    unsigned int integer = 0, decimal = 0;
+    unsigned int integer = 0;
+    unsigned long long decimal = 0;
     int exp10          = 0;
     int decimal_places = dissect_value(value, integer, decimal, exp10);
     // now count up the components
@@ -252,8 +256,11 @@ struct float_to_string_fn {
       ++count;  // always include .0
     // exponent
     if (exp10) {
-      count += 2;  // 'e±'
-      if (exp10 < 0) exp10 = -exp10;
+      count ++;  // 'e±'
+      if (exp10 < 0) {
+        count ++;
+        exp10 = -exp10;
+      }
       count += (int)(exp10 < 10);  // padding
       while (exp10 > 0) {
         exp10 /= 10;
@@ -265,21 +272,21 @@ struct float_to_string_fn {
 };
 
 template <typename FloatType>
-struct from_floats_fn {
+struct float_to_string_fn {
   column_device_view d_floats;
   size_type* d_offsets;
   char* d_chars;
 
   __device__ size_type compute_output_size(FloatType value)
   {
-    float_to_string_fn fts;
+    ftos_converter fts;
     return static_cast<size_type>(fts.compute_ftos_size(static_cast<double>(value)));
   }
 
   __device__ void float_to_string(size_type idx)
   {
     FloatType value = d_floats.element<FloatType>(idx);
-    float_to_string_fn fts;
+    ftos_converter fts;
     fts.float_to_string(static_cast<double>(value), d_chars + d_offsets[idx]);
   }
 
@@ -302,7 +309,7 @@ struct from_floats_fn {
  *
  * The template function declaration ensures only float types are allowed.
  */
-struct dispatch_from_floats_fn {
+struct dispatch_float_to_string_fn {
   template <typename FloatType, std::enable_if_t<std::is_floating_point_v<FloatType>>* = nullptr>
   std::unique_ptr<column> operator()(column_view const& floats,
                                      rmm::cuda_stream_view stream,
@@ -316,7 +323,7 @@ struct dispatch_from_floats_fn {
     rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr);
 
     auto [offsets, chars] =
-      cudf::strings::detail::make_strings_children(from_floats_fn<FloatType>{d_column}, strings_count, stream, mr);
+      cudf::strings::detail::make_strings_children(float_to_string_fn<FloatType>{d_column}, strings_count, stream, mr);
 
     return make_strings_column(strings_count,
                                std::move(offsets),
@@ -331,30 +338,32 @@ struct dispatch_from_floats_fn {
                                      rmm::cuda_stream_view,
                                      rmm::mr::device_memory_resource*) const
   {
-    CUDF_FAIL("Values for from_floats function must be a float type.");
+    CUDF_FAIL("Values for float_to_string function must be a float type.");
   }
 };
 
 }  // namespace
 
 // This will convert all float column types into a strings column.
-std::unique_ptr<column> from_floats(column_view const& floats,
+std::unique_ptr<column> float_to_string(column_view const& floats,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
   size_type strings_count = floats.size();
   if (strings_count == 0) return make_empty_column(type_id::STRING);
 
-  return type_dispatcher(floats.type(), dispatch_from_floats_fn{}, floats, stream, mr);
+  return type_dispatcher(floats.type(), dispatch_float_to_string_fn{}, floats, stream, mr);
 }
 
 }  // namespace detail
 
 // external API
-std::unique_ptr<column> from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> float_to_string(column_view const& floats, 
+                                      rmm::cuda_stream_view stream, 
+                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::from_floats(floats, cudf::get_default_stream(), mr);
+  return detail::float_to_string(floats, stream, mr);
 }
 
 }  // namespace spark_rapids_jni
\ No newline at end of file
diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp
index ae342087d0..605df95742 100644
--- a/src/main/cpp/tests/cast_float_to_string.cpp
+++ b/src/main/cpp/tests/cast_float_to_string.cpp
@@ -29,9 +29,11 @@
 
 using namespace cudf;
 
+constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS};
+
 struct FloatToStringTests : public cudf::test::BaseFixture {};
 
-TEST_F(StringsConvertTest, FromFloats32)
+TEST_F(FloatToStringTests, FromFloats32)
 {
   std::vector<float> h_floats{100,
                               654321.25,
@@ -43,14 +45,14 @@ TEST_F(StringsConvertTest, FromFloats32)
                               839542223232.79,
                               -0.0};
   std::vector<char const*> h_expected{
-    "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.395422433e+11", "-0.0"};
+    "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.3954222323279E11", "-0.0"};
 
   cudf::test::fixed_width_column_wrapper<float> floats(
     h_floats.begin(),
     h_floats.end(),
     thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
 
-  auto results = cudf::strings::from_floats(floats);
+  auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream());
 
   cudf::test::strings_column_wrapper expected(
     h_expected.begin(),
@@ -60,11 +62,13 @@ TEST_F(StringsConvertTest, FromFloats32)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
 }
 
-TEST_F(StringsConvertTest, FromFloats64)
+TEST_F(FloatToStringTests, FromFloats64)
 {
   std::vector<double> h_floats{100,
                                654321.25,
                                -12761.125,
+                               1.123456789123456789,
+                               0.000000000000000000123456789123456789,
                                0,
                                5,
                                -4,
@@ -72,14 +76,15 @@ TEST_F(StringsConvertTest, FromFloats64)
                                839542223232.794248339,
                                -0.0};
   std::vector<char const*> h_expected{
-    "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.395422232e+11", "-0.0"};
+    "100.0", "654321.25", "-12761.125", "1.1234567891234568", "1.234567891234568E-19", 
+    "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"};
 
   cudf::test::fixed_width_column_wrapper<double> floats(
     h_floats.begin(),
     h_floats.end(),
     thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
 
-  auto results = cudf::strings::from_floats(floats);
+  auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream());
 
   cudf::test::strings_column_wrapper expected(
     h_expected.begin(),

From cbce72469eadfb29bc88bcc4c07afe84872c60f5 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Wed, 18 Oct 2023 18:22:05 +0800
Subject: [PATCH 03/54] Add float to string kernel

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/cast_float_to_string.cu      | 23 ++++++++++++-------
 .../nvidia/spark/rapids/jni/CastStrings.java  | 10 ++++++++
 thirdparty/cudf                               |  2 +-
 3 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index 3560c375e3..13a71754e4 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -55,13 +55,13 @@ struct ftos_converter {
   static constexpr unsigned int significant_digits = 17;
   // maximum power-of-10 that will fit in 32-bits
   // static constexpr unsigned long long nine_digits = 1000000000;  // 1x10^9
-  static constexpr unsigned long long fifteen_digits = 1000000000000000;
+  // static constexpr unsigned long long fifteen_digits = 1000000000000000;
   static constexpr unsigned long long sixteen_digits = 10000000000000000;
   // Range of numbers here is for normalizing the value.
   // If the value is above or below the following limits, the output is converted to
   // scientific notation in order to show (at most) the number of significant digits.
-  static constexpr double upper_limit = 1000000000;  // max is 1x10^9
-  static constexpr double lower_limit = 0.0001;      // printf uses scientific notation below this
+  static constexpr double upper_limit = 10000000;  // max is 1x10^7
+  static constexpr double lower_limit = 0.001;      // printf uses scientific notation below this
   // Tables for doing normalization: converting to exponent form
   // IEEE double float has maximum exponent of 305 so these should cover everything
   double const upper10[9]  = {10, 100, 10000, 1e8, 1e16, 1e32, 1e64, 1e128, 1e256};
@@ -119,8 +119,16 @@ struct ftos_converter {
       }
     }
     //
-    int decimal_places = significant_digits - (exp10? 2 : 1);
-    unsigned long long max_digits = (exp10? fifteen_digits : sixteen_digits);
+    // int decimal_places = significant_digits - (exp10? 2 : 1);
+    // unsigned long long max_digits = (exp10? fifteen_digits : sixteen_digits);
+    int decimal_places = significant_digits - 1;
+    unsigned long long max_digits = sixteen_digits;
+    double temp_value = value;
+    while (temp_value < 1.0 && temp_value > 0.0) {
+      max_digits *= 10;
+      temp_value *= 10.0;
+      decimal_places++;
+    }
     integer                 = (unsigned int)value;
     for (unsigned int i = integer; i >= 10; i /= 10) {
       --decimal_places;
@@ -194,7 +202,7 @@ struct ftos_converter {
     // decimal
     *ptr++ = '.';
     if (decimal_places) {
-      char buffer[17];
+      char buffer[18];
       char* pb = buffer;
       while (decimal_places--) {
         *pb++ = (char)('0' + (decimal % 10));
@@ -232,7 +240,7 @@ struct ftos_converter {
       value = -value;
       bneg  = true;
     }
-    if (std::isinf(value)) return 3 + (int)bneg;  // Inf
+    if (std::isinf(value)) return 8 + (int)bneg;  // Inf
 
     // dissect float into parts
     unsigned int integer = 0;
@@ -261,7 +269,6 @@ struct ftos_converter {
         count ++;
         exp10 = -exp10;
       }
-      count += (int)(exp10 < 10);  // padding
       while (exp10 > 0) {
         exp10 /= 10;
         ++count;
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index 7a31b0241b..3002e1cdab 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -90,6 +90,16 @@ public static ColumnVector fromFloat(ColumnView cv) {
     return new ColumnVector(fromFloat(cv.getNativeView()));
   }
 
+  /**
+   * Convert a decimal column to a string column.
+   *
+   * @param cv the column data to process
+   * @return the converted column
+   */
+  public static ColumnVector fromDecimal(ColumnView cv) {
+    return new ColumnVector(fromDecimal(cv.getNativeView()));
+  }
+
   /**
    * Convert a string column to a given floating-point type column.
    *
diff --git a/thirdparty/cudf b/thirdparty/cudf
index 5f05c180b8..fa4e8ab1af 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 5f05c180b80b70fc09ea58aef2494c57edc44b9c
+Subproject commit fa4e8ab1af4acfd2c88a619b4d9693f4a5fda168

From 8d7ead2093613de6d322b42157130182086e7891 Mon Sep 17 00:00:00 2001
From: Haoyang Li <ntlihy@gmail.com>
Date: Thu, 19 Oct 2023 15:57:57 +0800
Subject: [PATCH 04/54] Update src/main/cpp/src/cast_float_to_string.cu

Co-authored-by: Mike Wilson <hyperbolic2346@users.noreply.github.com>
---
 src/main/cpp/src/cast_float_to_string.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index 13a71754e4..15d6e9cba5 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -178,10 +178,11 @@ struct ftos_converter {
       bneg  = true;
     }
     if (std::isinf(value)) {
-      if (bneg)
+      if (bneg) {
         memcpy(output, "-Infinity", 9);
-      else
+      } else {
         memcpy(output, "Infinity", 8);
+      }
       return bneg ? 9 : 8;
     }
 

From 9ab20893bad7eb87d78ba5500ba9e763dad954a0 Mon Sep 17 00:00:00 2001
From: Haoyang Li <ntlihy@gmail.com>
Date: Thu, 19 Oct 2023 15:58:05 +0800
Subject: [PATCH 05/54] Update src/main/cpp/src/cast_float_to_string.cu

Co-authored-by: Mike Wilson <hyperbolic2346@users.noreply.github.com>
---
 src/main/cpp/src/cast_float_to_string.cu | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index 15d6e9cba5..ca7e9b95db 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -172,11 +172,14 @@ struct ftos_converter {
       memcpy(output, "NaN", 3);
       return 3;
     }
-    bool bneg = false;
-    if (signbit(value)) {  // handles -0.0 too
-      value = -value;
-      bneg  = true;
-    }
+    bool const bneg = [&value]() {
+      if (signbit(value)) {  // handles -0.0 too
+        value = -value;
+        return true;
+      } else {
+        return false;
+      }
+    }();
     if (std::isinf(value)) {
       if (bneg) {
         memcpy(output, "-Infinity", 9);

From c3b3d6464445ee1393d48122a4192424d34b18b8 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Thu, 19 Oct 2023 17:52:57 +0800
Subject: [PATCH 06/54] address comments and use different precision for float

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/cast_float_to_string.cu    | 36 +++++++++++----------
 src/main/cpp/tests/cast_float_to_string.cpp |  2 +-
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index ca7e9b95db..d1f66f772d 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -52,15 +52,14 @@ namespace {
 struct ftos_converter {
   // significant digits is independent of scientific notation range
   // digits more than this may require using long values instead of ints
-  static constexpr unsigned int significant_digits = 17;
-  // maximum power-of-10 that will fit in 32-bits
-  // static constexpr unsigned long long nine_digits = 1000000000;  // 1x10^9
-  // static constexpr unsigned long long fifteen_digits = 1000000000000000;
-  static constexpr unsigned long long sixteen_digits = 10000000000000000;
+  static constexpr unsigned int significant_digits_float = 9;
+  static constexpr unsigned int significant_digits_double = 17;
+  static constexpr unsigned int eight_digits = 100000000;  // 1x10^8
+  static constexpr unsigned long long sixteen_digits = 10000000000000000; // 1x10^16
   // Range of numbers here is for normalizing the value.
   // If the value is above or below the following limits, the output is converted to
   // scientific notation in order to show (at most) the number of significant digits.
-  static constexpr double upper_limit = 10000000;  // max is 1x10^7
+  static constexpr double upper_limit = 10000000;  // Spark's max is 1x10^7
   static constexpr double lower_limit = 0.001;      // printf uses scientific notation below this
   // Tables for doing normalization: converting to exponent form
   // IEEE double float has maximum exponent of 305 so these should cover everything
@@ -75,7 +74,7 @@ struct ftos_converter {
       *output++ = '0';
       return output;
     }
-    char buffer[significant_digits];  // should be big-enough for significant digits
+    char buffer[significant_digits_double];  // should be big-enough for significant digits
     char* ptr = buffer;
     while (value > 0) {
       *ptr++ = (char)('0' + (value % 10));
@@ -94,7 +93,8 @@ struct ftos_converter {
   __device__ int dissect_value(double value,
                                unsigned int& integer,
                                unsigned long long& decimal,
-                               int& exp10)
+                               int& exp10,
+                               bool is_float = false)
   {
     // normalize step puts value between lower-limit and upper-limit
     // by adjusting the exponent up or down
@@ -121,8 +121,8 @@ struct ftos_converter {
     //
     // int decimal_places = significant_digits - (exp10? 2 : 1);
     // unsigned long long max_digits = (exp10? fifteen_digits : sixteen_digits);
-    int decimal_places = significant_digits - 1;
-    unsigned long long max_digits = sixteen_digits;
+    int decimal_places = (is_float? significant_digits_float: significant_digits_double) - 1;
+    unsigned long long max_digits = (is_float? eight_digits: sixteen_digits);
     double temp_value = value;
     while (temp_value < 1.0 && temp_value > 0.0) {
       max_digits *= 10;
@@ -165,7 +165,7 @@ struct ftos_converter {
    * @param output Memory to write output characters.
    * @return Number of bytes written.
    */
-  __device__ int float_to_string(double value, char* output)
+  __device__ int float_to_string(double value, char* output, bool is_float)
   {
     // check for valid value
     if (std::isnan(value)) {
@@ -193,7 +193,7 @@ struct ftos_converter {
     unsigned int integer = 0;
     unsigned long long decimal = 0;
     int exp10          = 0;
-    int decimal_places = dissect_value(value, integer, decimal, exp10);
+    int decimal_places = dissect_value(value, integer, decimal, exp10, is_float);
     //
     // now build the string from the
     // components: sign, integer, decimal, exp10, decimal_places
@@ -206,7 +206,7 @@ struct ftos_converter {
     // decimal
     *ptr++ = '.';
     if (decimal_places) {
-      char buffer[18];
+      char buffer[significant_digits_double];
       char* pb = buffer;
       while (decimal_places--) {
         *pb++ = (char)('0' + (decimal % 10));
@@ -236,7 +236,7 @@ struct ftos_converter {
    * @param value Float value to convert.
    * @return Number of bytes required.
    */
-  __device__ int compute_ftos_size(double value)
+  __device__ int compute_ftos_size(double value, bool is_float)
   {
     if (std::isnan(value)) return 3;  // NaN
     bool bneg = false;
@@ -250,7 +250,7 @@ struct ftos_converter {
     unsigned int integer = 0;
     unsigned long long decimal = 0;
     int exp10          = 0;
-    int decimal_places = dissect_value(value, integer, decimal, exp10);
+    int decimal_places = dissect_value(value, integer, decimal, exp10, is_float);
     // now count up the components
     // sign
     int count = (int)bneg;
@@ -291,14 +291,16 @@ struct float_to_string_fn {
   __device__ size_type compute_output_size(FloatType value)
   {
     ftos_converter fts;
-    return static_cast<size_type>(fts.compute_ftos_size(static_cast<double>(value)));
+    bool is_float = std::is_same_v<FloatType, float>;
+    return static_cast<size_type>(fts.compute_ftos_size(static_cast<double>(value), is_float));
   }
 
   __device__ void float_to_string(size_type idx)
   {
     FloatType value = d_floats.element<FloatType>(idx);
     ftos_converter fts;
-    fts.float_to_string(static_cast<double>(value), d_chars + d_offsets[idx]);
+    bool is_float = std::is_same_v<FloatType, float>;
+    fts.float_to_string(static_cast<double>(value), d_chars + d_offsets[idx], is_float);
   }
 
   __device__ void operator()(size_type idx)
diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp
index 605df95742..a86d988724 100644
--- a/src/main/cpp/tests/cast_float_to_string.cpp
+++ b/src/main/cpp/tests/cast_float_to_string.cpp
@@ -42,7 +42,7 @@ TEST_F(FloatToStringTests, FromFloats32)
                               5,
                               -4,
                               std::numeric_limits<float>::quiet_NaN(),
-                              839542223232.79,
+                              123456789012.34,
                               -0.0};
   std::vector<char const*> h_expected{
     "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.3954222323279E11", "-0.0"};

From ebb123811c15247f4c6f1fe3fea11716517dcc28 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 30 Oct 2023 09:38:15 +0800
Subject: [PATCH 07/54] a runnable format_number demo

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/CMakeLists.txt                   |   2 +-
 src/main/cpp/src/CastStringJni.cpp            |   6 +-
 src/main/cpp/src/cast_float_to_string.cu      | 382 ------------
 src/main/cpp/src/cast_string.hpp              |   3 +-
 src/main/cpp/src/format_float.cu              | 576 ++++++++++++++++++
 src/main/cpp/tests/CMakeLists.txt             |   4 +-
 ...t_float_to_string.cpp => format_float.cpp} |  14 +-
 .../nvidia/spark/rapids/jni/CastStrings.java  |   8 +-
 8 files changed, 595 insertions(+), 400 deletions(-)
 delete mode 100644 src/main/cpp/src/cast_float_to_string.cu
 create mode 100644 src/main/cpp/src/format_float.cu
 rename src/main/cpp/tests/{cast_float_to_string.cpp => format_float.cpp} (86%)

diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index 8f90b9078e..745a9df2a7 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -163,7 +163,7 @@ add_library(
   src/ZOrderJni.cpp
   src/bloom_filter.cu
   src/cast_decimal_to_string.cu
-  src/cast_float_to_string.cu
+  src/format_float.cu
   src/cast_string.cu
   src/cast_string_to_float.cu
   src/datetime_rebase.cu
diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index ff8ee2afd4..824ddad8e1 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -109,8 +109,8 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toFloat(
   CATCH_CAST_EXCEPTION(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloat(
-  JNIEnv* env, jclass, jlong input_column, jint j_dtype)
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_formatFloat(
+  JNIEnv* env, jclass, jlong input_column, jint d, jint j_dtype)
 {
   JNI_NULL_CHECK(env, input_column, "input column is null", 0);
 
@@ -119,7 +119,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloat(
 
     cudf::column_view cv{*reinterpret_cast<cudf::column_view const*>(input_column)};
     return cudf::jni::release_as_jlong(
-      spark_rapids_jni::float_to_string(cv, cudf::get_default_stream()));
+      spark_rapids_jni::format_float(cv, d, cudf::get_default_stream()));
   }
   CATCH_CAST_EXCEPTION(env, 0);
 }
diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
deleted file mode 100644
index d1f66f772d..0000000000
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ /dev/null
@@ -1,382 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "cast_string.hpp"
-
-#include <cudf/column/column_device_view.cuh>
-#include <cudf/column/column_factories.hpp>
-#include <cudf/detail/iterator.cuh>
-#include <cudf/detail/null_mask.hpp>
-#include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/strings/detail/convert/int_to_string.cuh>
-#include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
-#include <cudf/strings/string_view.cuh>
-#include <cudf/strings/strings_column_view.hpp>
-#include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/type_dispatcher.hpp>
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
-#include <thrust/generate.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/optional.h>
-#include <thrust/transform.h>
-
-#include <cuda/std/climits>
-#include <cuda/std/limits>
-#include <cuda/std/type_traits>
-
-using namespace cudf;
-
-namespace spark_rapids_jni {
-
-namespace detail {
-namespace {
-
-struct ftos_converter {
-  // significant digits is independent of scientific notation range
-  // digits more than this may require using long values instead of ints
-  static constexpr unsigned int significant_digits_float = 9;
-  static constexpr unsigned int significant_digits_double = 17;
-  static constexpr unsigned int eight_digits = 100000000;  // 1x10^8
-  static constexpr unsigned long long sixteen_digits = 10000000000000000; // 1x10^16
-  // Range of numbers here is for normalizing the value.
-  // If the value is above or below the following limits, the output is converted to
-  // scientific notation in order to show (at most) the number of significant digits.
-  static constexpr double upper_limit = 10000000;  // Spark's max is 1x10^7
-  static constexpr double lower_limit = 0.001;      // printf uses scientific notation below this
-  // Tables for doing normalization: converting to exponent form
-  // IEEE double float has maximum exponent of 305 so these should cover everything
-  double const upper10[9]  = {10, 100, 10000, 1e8, 1e16, 1e32, 1e64, 1e128, 1e256};
-  double const lower10[9]  = {.1, .01, .0001, 1e-8, 1e-16, 1e-32, 1e-64, 1e-128, 1e-256};
-  double const blower10[9] = {1.0, .1, .001, 1e-7, 1e-15, 1e-31, 1e-63, 1e-127, 1e-255};
-
-  // utility for quickly converting known integer range to character array
-  __device__ char* int2str(int value, char* output)
-  {
-    if (value == 0) {
-      *output++ = '0';
-      return output;
-    }
-    char buffer[significant_digits_double];  // should be big-enough for significant digits
-    char* ptr = buffer;
-    while (value > 0) {
-      *ptr++ = (char)('0' + (value % 10));
-      value /= 10;
-    }
-    while (ptr != buffer)
-      *output++ = *--ptr;  // 54321 -> 12345
-    return output;
-  }
-
-  /**
-   * @brief Dissect a float value into integer, decimal, and exponent components.
-   *
-   * @return The number of decimal places.
-   */
-  __device__ int dissect_value(double value,
-                               unsigned int& integer,
-                               unsigned long long& decimal,
-                               int& exp10,
-                               bool is_float = false)
-  {
-    // normalize step puts value between lower-limit and upper-limit
-    // by adjusting the exponent up or down
-    exp10 = 0;
-    if (value > upper_limit) {
-      int fx = 256;
-      for (int idx = 8; idx >= 0; --idx) {
-        if (value >= upper10[idx]) {
-          value *= lower10[idx];
-          exp10 += fx;
-        }
-        fx = fx >> 1;
-      }
-    } else if ((value > 0.0) && (value < lower_limit)) {
-      int fx = 256;
-      for (int idx = 8; idx >= 0; --idx) {
-        if (value < blower10[idx]) {
-          value *= upper10[idx];
-          exp10 -= fx;
-        }
-        fx = fx >> 1;
-      }
-    }
-    //
-    // int decimal_places = significant_digits - (exp10? 2 : 1);
-    // unsigned long long max_digits = (exp10? fifteen_digits : sixteen_digits);
-    int decimal_places = (is_float? significant_digits_float: significant_digits_double) - 1;
-    unsigned long long max_digits = (is_float? eight_digits: sixteen_digits);
-    double temp_value = value;
-    while (temp_value < 1.0 && temp_value > 0.0) {
-      max_digits *= 10;
-      temp_value *= 10.0;
-      decimal_places++;
-    }
-    integer                 = (unsigned int)value;
-    for (unsigned int i = integer; i >= 10; i /= 10) {
-      --decimal_places;
-      max_digits /= 10;
-    }
-    double diff = value - (double)integer;
-    double remainder = diff * (double)max_digits;
-    decimal          = (unsigned long long)remainder;
-    remainder -= (double)decimal;
-    decimal += (unsigned long long)(2.0 * remainder); // round up
-    if (decimal >= max_digits) {
-      decimal = 0;
-      ++integer;
-      if (exp10 && (integer >= 10)) {
-        ++exp10;
-        integer = 1;
-      }
-    }
-    //
-    while ((decimal % 10) == 0 && (decimal_places > 0)) {
-      decimal /= 10;
-      --decimal_places;
-    }
-    return decimal_places;
-  }
-
-  /**
-   * @brief Main kernel method for converting float value to char output array.
-   *
-   * Output need not be more than (significant_digits + 7) bytes:
-   * 7 = 1 sign, 1 decimal point, 1 exponent ('e'), 1 exponent-sign, 3 digits for exponent
-   *
-   * @param value Float value to convert.
-   * @param output Memory to write output characters.
-   * @return Number of bytes written.
-   */
-  __device__ int float_to_string(double value, char* output, bool is_float)
-  {
-    // check for valid value
-    if (std::isnan(value)) {
-      memcpy(output, "NaN", 3);
-      return 3;
-    }
-    bool const bneg = [&value]() {
-      if (signbit(value)) {  // handles -0.0 too
-        value = -value;
-        return true;
-      } else {
-        return false;
-      }
-    }();
-    if (std::isinf(value)) {
-      if (bneg) {
-        memcpy(output, "-Infinity", 9);
-      } else {
-        memcpy(output, "Infinity", 8);
-      }
-      return bneg ? 9 : 8;
-    }
-
-    // dissect value into components
-    unsigned int integer = 0;
-    unsigned long long decimal = 0;
-    int exp10          = 0;
-    int decimal_places = dissect_value(value, integer, decimal, exp10, is_float);
-    //
-    // now build the string from the
-    // components: sign, integer, decimal, exp10, decimal_places
-    //
-    // sign
-    char* ptr = output;
-    if (bneg) *ptr++ = '-';
-    // integer
-    ptr = int2str(integer, ptr);
-    // decimal
-    *ptr++ = '.';
-    if (decimal_places) {
-      char buffer[significant_digits_double];
-      char* pb = buffer;
-      while (decimal_places--) {
-        *pb++ = (char)('0' + (decimal % 10));
-        decimal /= 10;
-      }
-      while (pb != buffer)  // reverses the digits
-        *ptr++ = *--pb;     // e.g. 54321 -> 12345
-    } else
-      *ptr++ = '0';  // always include at least .0
-    // exponent
-    if (exp10) {
-      *ptr++ = 'E';
-      if (exp10 < 0) {
-        *ptr++ = '-';
-        exp10  = -exp10;
-      }
-      // if (exp10 < 10) *ptr++ = '0';  // extra zero-pad
-      ptr = int2str(exp10, ptr);
-    }
-    // done
-    return (int)(ptr - output);  // number of bytes written
-  }
-
-  /**
-   * @brief Compute how man bytes are needed to hold the output string.
-   *
-   * @param value Float value to convert.
-   * @return Number of bytes required.
-   */
-  __device__ int compute_ftos_size(double value, bool is_float)
-  {
-    if (std::isnan(value)) return 3;  // NaN
-    bool bneg = false;
-    if (signbit(value)) {  // handles -0.0 too
-      value = -value;
-      bneg  = true;
-    }
-    if (std::isinf(value)) return 8 + (int)bneg;  // Inf
-
-    // dissect float into parts
-    unsigned int integer = 0;
-    unsigned long long decimal = 0;
-    int exp10          = 0;
-    int decimal_places = dissect_value(value, integer, decimal, exp10, is_float);
-    // now count up the components
-    // sign
-    int count = (int)bneg;
-    // integer
-    count += (int)(integer == 0);
-    while (integer > 0) {
-      integer /= 10;
-      ++count;
-    }  // log10(integer)
-    // decimal
-    ++count;  // decimal point
-    if (decimal_places)
-      count += decimal_places;
-    else
-      ++count;  // always include .0
-    // exponent
-    if (exp10) {
-      count ++;  // 'e±'
-      if (exp10 < 0) {
-        count ++;
-        exp10 = -exp10;
-      }
-      while (exp10 > 0) {
-        exp10 /= 10;
-        ++count;
-      }  // log10(exp10)
-    }
-    return count;
-  }
-};
-
-template <typename FloatType>
-struct float_to_string_fn {
-  column_device_view d_floats;
-  size_type* d_offsets;
-  char* d_chars;
-
-  __device__ size_type compute_output_size(FloatType value)
-  {
-    ftos_converter fts;
-    bool is_float = std::is_same_v<FloatType, float>;
-    return static_cast<size_type>(fts.compute_ftos_size(static_cast<double>(value), is_float));
-  }
-
-  __device__ void float_to_string(size_type idx)
-  {
-    FloatType value = d_floats.element<FloatType>(idx);
-    ftos_converter fts;
-    bool is_float = std::is_same_v<FloatType, float>;
-    fts.float_to_string(static_cast<double>(value), d_chars + d_offsets[idx], is_float);
-  }
-
-  __device__ void operator()(size_type idx)
-  {
-    if (d_floats.is_null(idx)) {
-      if (d_chars == nullptr) { d_offsets[idx] = 0; }
-      return;
-    }
-    if (d_chars != nullptr) {
-      float_to_string(idx);
-    } else {
-      d_offsets[idx] = compute_output_size(d_floats.element<FloatType>(idx));
-    }
-  }
-};
-
-/**
- * @brief This dispatch method is for converting floats into strings.
- *
- * The template function declaration ensures only float types are allowed.
- */
-struct dispatch_float_to_string_fn {
-  template <typename FloatType, std::enable_if_t<std::is_floating_point_v<FloatType>>* = nullptr>
-  std::unique_ptr<column> operator()(column_view const& floats,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
-  {
-    size_type strings_count = floats.size();
-    auto column             = column_device_view::create(floats, stream);
-    auto d_column           = *column;
-
-    // copy the null mask
-    rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr);
-
-    auto [offsets, chars] =
-      cudf::strings::detail::make_strings_children(float_to_string_fn<FloatType>{d_column}, strings_count, stream, mr);
-
-    return make_strings_column(strings_count,
-                               std::move(offsets),
-                               std::move(chars),
-                               floats.null_count(),
-                               std::move(null_mask));
-  }
-
-  // non-float types throw an exception
-  template <typename T, std::enable_if_t<not std::is_floating_point_v<T>>* = nullptr>
-  std::unique_ptr<column> operator()(column_view const&,
-                                     rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*) const
-  {
-    CUDF_FAIL("Values for float_to_string function must be a float type.");
-  }
-};
-
-}  // namespace
-
-// This will convert all float column types into a strings column.
-std::unique_ptr<column> float_to_string(column_view const& floats,
-                                    rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
-{
-  size_type strings_count = floats.size();
-  if (strings_count == 0) return make_empty_column(type_id::STRING);
-
-  return type_dispatcher(floats.type(), dispatch_float_to_string_fn{}, floats, stream, mr);
-}
-
-}  // namespace detail
-
-// external API
-std::unique_ptr<column> float_to_string(column_view const& floats, 
-                                      rmm::cuda_stream_view stream, 
-                                      rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::float_to_string(floats, stream, mr);
-}
-
-}  // namespace spark_rapids_jni
\ No newline at end of file
diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp
index fc2270ca8c..4f64bf4ef3 100644
--- a/src/main/cpp/src/cast_string.hpp
+++ b/src/main/cpp/src/cast_string.hpp
@@ -115,8 +115,9 @@ std::unique_ptr<cudf::column> string_to_float(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
-std::unique_ptr<cudf::column> float_to_string(
+std::unique_ptr<cudf::column> format_float(
   cudf::column_view const& input,
+  int d,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu
new file mode 100644
index 0000000000..5972d108a9
--- /dev/null
+++ b/src/main/cpp/src/format_float.cu
@@ -0,0 +1,576 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cast_string.hpp"
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/strings/detail/convert/int_to_string.cuh>
+#include <cudf/strings/detail/converters.hpp>
+#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/execution_policy.h>
+#include <thrust/for_each.h>
+#include <thrust/generate.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/optional.h>
+#include <thrust/transform.h>
+
+#include <cuda/std/climits>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+
+using namespace cudf;
+
+namespace spark_rapids_jni {
+
+namespace detail {
+namespace {
+
+struct ftos_converter {
+  // significant digits is independent of scientific notation range
+  // digits more than this may require using long values instead of ints
+  // static constexpr unsigned int significant_digits_float = 9;
+  // static constexpr unsigned int significant_digits_double = 17;
+  // static constexpr unsigned int eight_digits = 100000000;  // 1x10^8
+  static constexpr unsigned long long sixteen_digits = 10000000000000000; // 1x10^16
+  // Range of numbers here is for normalizing the value.
+  // If the value is above or below the following limits, the output is converted to
+  // scientific notation in order to show (at most) the number of significant digits.
+  static constexpr double upper_limit = 10000000;  // Spark's max is 1x10^7
+  static constexpr double lower_limit = 0.001;      // printf uses scientific notation below this
+  // Tables for doing normalization: converting to exponent form
+  // The largest finite double is below 1e309, so powers of ten up to 1e256 cover everything
+  double const upper10[9]  = {10, 100, 10000, 1e8, 1e16, 1e32, 1e64, 1e128, 1e256};
+  double const lower10[9]  = {.1, .01, .0001, 1e-8, 1e-16, 1e-32, 1e-64, 1e-128, 1e-256};
+  double const blower10[9] = {1.0, .1, .001, 1e-7, 1e-15, 1e-31, 1e-63, 1e-127, 1e-255};
+
+  // // utility for quickly converting known integer range to character array
+  // __device__ char* int2str(int value, char* output)
+  // {
+  //   if (value == 0) {
+  //     *output++ = '0';
+  //     return output;
+  //   }
+  //   char buffer[significant_digits_double];  // should be big-enough for significant digits
+  //   char* ptr = buffer;
+  //   while (value > 0) {
+  //     *ptr++ = (char)('0' + (value % 10));
+  //     value /= 10;
+  //   }
+  //   while (ptr != buffer)
+  //     *output++ = *--ptr;  // 54321 -> 12345
+  //   return output;
+  // }
+
+  // // Add separator every 3 digits for integer part
+  // __device__ char* format_int(int value, char* output)
+  // {
+  //   if (value == 0) {
+  //     *output++ = '0';
+  //     return output;
+  //   }
+  //   char buffer[30];  // TODO: avoid hard-coded size
+  //   char* ptr = buffer;
+  //   int sep_count = 0;
+  //   while (value > 0) {
+  //     if (sep_count == 3) {
+  //       *ptr++ = ',';
+  //       sep_count = 0;
+  //     }
+  //     *ptr++ = (char)('0' + (value % 10));
+  //     value /= 10;
+  //     sep_count++;
+  //   }
+  //   while (ptr != buffer)
+  //     *output++ = *--ptr;  // 543,21 -> 12,345
+  //   return output;
+  // }
+
+  // Write the decimal digits of `n` (expected to be >= 0) at `result` and return a pointer
+  // just past the last digit written; a negative `n` writes nothing.
+  __device__ char* ll2str(long long n, char* result) {
+    if (n == 0) {
+      *result++ = '0';
+      return result;
+    }
+    char buffer[19];  // 19 digits is the most a positive 64-bit value can have
+    char* ptr = buffer;
+    for (; n > 0; n /= 10)
+      *ptr++ = (char)('0' + (n % 10));  // least-significant digit first
+    while (ptr != buffer)
+      *result++ = *--ptr;  // 54321 -> 12345
+    return result;
+  }
+
+  // __device__ char* format_ll(long long n, char* result, char* dec_ptr, int& dec_pos, int exp10) {
+  //   if (n == 0) {
+  //     *result++ = '0';
+  //     return result;
+  //   }
+  //   int sep_count = 0;
+  //   char buffer[305];  // should be big-enough for significant digits
+  //   char* ptr = buffer;
+  //   while (n > 0) {
+  //       if (sep_count == 3) {
+  //           *ptr++ = ',';
+  //           sep_count = 0;
+  //       }
+  //       *ptr++ = (char)('0' + (n % 10));
+  //       n /= 10;
+  //       sep_count++;
+  //   }
+  //   int len = dec_ptr - dec_str;
+  //   int dec_pos = 0;
+  //   while (exp10--) {
+  //       if (sep_count == 3) {
+  //           *ptr++ = ',';
+  //           sep_count = 0;
+  //       }
+  //       if (dec_pos < len) {
+  //         *ptr++ = dec_str[dec_pos++];
+  //       } else {
+  //         *ptr++ = '0';
+  //       }
+  //       sep_count++;
+  //   }
+  //   while (ptr != buffer) {
+  //       *result++ = *--ptr;  // 54321 -> 12345
+  //   }
+  //   return result;
+  // }
+
+  // /**
+  //  * @brief Dissect a float value into integer, decimal, and exponent components.
+  //  *
+  //  * @return The number of decimal places.
+  //  */
+  // __device__ int dissect_value(double value,
+  //                              int digits,
+  //                              unsigned int& integer,
+  //                              unsigned long long& decimal,
+  //                              int& exp10,
+  //                              bool is_float = false)
+  // {
+  //   // normalize step puts value between lower-limit and upper-limit
+  //   // by adjusting the exponent up or down
+  //   exp10 = 0;
+  //   if (value > upper_limit) {
+  //     int fx = 256;
+  //     for (int idx = 8; idx >= 0; --idx) {
+  //       if (value >= upper10[idx]) {
+  //         value *= lower10[idx];
+  //         exp10 += fx;
+  //       }
+  //       fx = fx >> 1;
+  //     }
+  //   } else if ((value > 0.0) && (value < lower_limit)) {
+  //     int fx = 256;
+  //     for (int idx = 8; idx >= 0; --idx) {
+  //       if (value < blower10[idx]) {
+  //         value *= upper10[idx];
+  //         exp10 -= fx;
+  //       }
+  //       fx = fx >> 1;
+  //     }
+  //   }
+  //   //
+  //   // int decimal_places = significant_digits - (exp10? 2 : 1);
+  //   // unsigned long long max_digits = (exp10? fifteen_digits : sixteen_digits);
+  //   int decimal_places = (is_float? significant_digits_float: significant_digits_double) - 1;
+  //   unsigned long long max_digits = (is_float? eight_digits: sixteen_digits);
+  //   double temp_value = value;
+  //   while (temp_value < 1.0 && temp_value > 0.0) {
+  //     max_digits *= 10;
+  //     temp_value *= 10.0;
+  //     decimal_places++;
+  //   }
+  //   integer                 = (unsigned int)value;
+  //   for (unsigned int i = integer; i >= 10; i /= 10) {
+  //     --decimal_places;
+  //     max_digits /= 10;
+  //   }
+  //   double diff = value - (double)integer;
+  //   double remainder = diff * (double)max_digits;
+  //   decimal          = (unsigned long long)remainder;
+  //   remainder -= (double)decimal;
+  //   decimal += (unsigned long long)(2.0 * remainder); // round up
+  //   if (decimal >= max_digits) {
+  //     decimal = 0;
+  //     ++integer;
+  //     if (exp10 && (integer >= 10)) {
+  //       ++exp10;
+  //       integer = 1;
+  //     }
+  //   }
+  //   //
+  //   while ((decimal % 10) == 0 && (decimal_places > 0)) {
+  //     decimal /= 10;
+  //     --decimal_places;
+  //   }
+  //   return decimal_places;
+  // }
+
+  /**
+   * @brief Write a floating-point value as fixed-point text with `digits` fractional digits.
+   *
+   * The text is an optional '-', the integer part with a ',' between groups of three digits
+   * (counted leftwards from the decimal point), a '.', and `digits` fractional digits.
+   * For example 1234567.891 with 2 digits is written as "1,234,567.89". NaN is written as
+   * "NaN" and infinities as "Infinity" / "-Infinity". No terminator is written.
+   *
+   * Digits are truncated, not rounded. Only the integer digits of the normalized value and
+   * the 16 digits after them are extracted; any position beyond those is written as '0'.
+   *
+   * No bounds checking is done on `output`; the byte count has to match what
+   * compute_ftos_size() reports for the same arguments.
+   *
+   * @param value Value to convert.
+   * @param digits Number of digits to write after the decimal point; nothing follows the
+   *               '.' when this is not positive.
+   * @param output Memory to write output characters.
+   * @param is_float Not used by this function.
+   * @return Number of bytes written.
+   */
+  __device__ int format_float(double value, int digits, char* output, bool is_float)
+  {
+    // check for valid value
+    if (std::isnan(value)) {
+      memcpy(output, "NaN", 3);
+      return 3;
+    }
+    bool const bneg = [&value]() {
+      if (signbit(value)) {  // handles -0.0 too
+        value = -value;
+        return true;
+      } else {
+        return false;
+      }
+    }();
+    if (std::isinf(value)) {
+      if (bneg) {
+        memcpy(output, "-Infinity", 9);
+      } else {
+        memcpy(output, "Infinity", 8);
+      }
+      return bneg ? 9 : 8;
+    }
+
+    // sign
+    char* ptr = output;
+    if (bneg) *ptr++ = '-';
+
+    // Normalize so that the magnitude equals value * 10^exp10. Values in
+    // [lower_limit, upper_limit] and zero are left alone (exp10 == 0); anything larger or
+    // smaller is scaled by the tabulated powers of ten towards [1, 10). Those powers are
+    // not all exact in binary, so the scaled value can be off in its last digits.
+    int exp10 = 0;
+    if (value > upper_limit) {
+      int fx = 256;
+      for (int idx = 8; idx >= 0; --idx) {
+        if (value >= upper10[idx]) {
+          value *= lower10[idx];
+          exp10 += fx;
+        }
+        fx = fx >> 1;
+      }
+    } else if ((value > 0.0) && (value < lower_limit)) {
+      int fx = 256;
+      for (int idx = 8; idx >= 0; --idx) {
+        if (value < blower10[idx]) {
+          value *= upper10[idx];
+          exp10 -= fx;
+        }
+        fx = fx >> 1;
+      }
+    }
+
+    // number of fraction digits taken from the normalized value: sixteen_digits == 10^16
+    constexpr int frac_digits = 16;
+    char dec_str[18];
+    if (exp10 > 0) {
+      // value is d.fff... and the magnitude is d.fff... * 10^exp10, so the first exp10
+      // fraction digits belong to the integer part of the result
+      long long int_part    = static_cast<long long>(value);
+      double decimal_double = value - double(int_part);
+      long long dec_part    = decimal_double * sixteen_digits;
+      // One digit stream: the integer digits, then the fraction padded on the left with
+      // zeros to frac_digits (ll2str drops leading zeros, e.g. those of the .05 in 1.05).
+      char sig[40];  // roomy: this branch has one or two integer digits plus the fraction
+      char* sig_end     = ll2str(int_part, sig);
+      int const int_len = int(sig_end - sig) + exp10;  // digits left of the decimal point
+      char* dec_ptr     = ll2str(dec_part, dec_str);
+      for (int pad = int(dec_ptr - dec_str); pad < frac_digits; ++pad) {
+        *sig_end++ = '0';
+      }
+      for (char* src = dec_str; src != dec_ptr; ++src) {
+        *sig_end++ = *src;
+      }
+      int const sig_len = int(sig_end - sig);
+      // Integer part, grouped from the right: a ',' goes wherever the number of integer
+      // digits still to be written is a multiple of three. Past the stream, write '0'.
+      int pos = 0;
+      for (; pos < int_len; ++pos) {
+        if (pos > 0 && (int_len - pos) % 3 == 0) { *ptr++ = ','; }
+        *ptr++ = pos < sig_len ? sig[pos] : '0';
+      }
+      // fraction: carry on along the same stream
+      *ptr++ = '.';
+      while (digits-- > 0) {
+        *ptr++ = pos < sig_len ? sig[pos] : '0';
+        ++pos;
+      }
+    } else if (exp10 == 0) {
+      long long int_part    = static_cast<long long>(value);
+      double decimal_double = value - double(int_part);
+      long long dec_part    = decimal_double * sixteen_digits;
+      if (int_part == 0) {
+        *ptr++ = '0';
+      } else {
+        // Produce the integer digits least-significant first, dropping in a ',' after
+        // every third digit, then copy them out in reverse.
+        int sep_count = 0;
+        char buffer[23];  // value <= upper_limit here: at most 8 digits and 2 separators
+        char* buf_ptr = buffer;
+        while (int_part > 0) {
+          if (sep_count == 3) {
+            *buf_ptr++ = ',';
+            sep_count  = 0;
+          }
+          *buf_ptr++ = (char)('0' + (int_part % 10));
+          int_part /= 10;
+          sep_count++;
+        }
+        while (buf_ptr != buffer) {
+          *ptr++ = *--buf_ptr;  // 543,21 -> 12,345
+        }
+      }
+      *ptr++ = '.';
+      char* dec_ptr = ll2str(dec_part, dec_str);
+      int len       = dec_ptr - dec_str;
+      // dec_part stands for a frac_digits-wide field but ll2str drops its leading zeros:
+      // start at a negative position and write '0' until the first stored digit is due.
+      int dec_pos = len - frac_digits;
+      while (digits-- > 0) {
+        if (dec_pos >= 0 && dec_pos < len) {
+          *ptr++ = dec_str[dec_pos];
+        } else {
+          *ptr++ = '0';
+        }
+        dec_pos++;
+      }
+    } else {
+      // exp10 < 0: the magnitude is below lower_limit, so the integer part is 0 and the
+      // fraction has (-exp10 - 1) zeros ahead of the digits of the normalized value
+      *ptr++ = '0';
+      *ptr++ = '.';
+      long long dec_part = value * sixteen_digits;
+      char* dec_ptr      = ll2str(dec_part, dec_str);
+      int len            = dec_ptr - dec_str;
+      int dec_pos        = 0;
+      while (digits-- > 0) {
+        if (exp10 < -1) {
+          *ptr++ = '0';
+          exp10++;
+        } else if (dec_pos < len) {
+          *ptr++ = dec_str[dec_pos++];
+        } else {
+          *ptr++ = '0';
+        }
+      }
+    }
+
+    return int(ptr - output);
+  }
+
+  /**
+   * @brief Number of digits format_float() writes to the left of the decimal point.
+   *
+   * @param value Non-negative, finite value.
+   * @return Integer digit count (separators not included); 1 for any value below 1.
+   */
+  __device__ int int_part_len(double value)
+  {
+    // Magnitudes below 1 (zero included) are written with a single '0' integer digit.
+    // Counting that digit here keeps the size pass in step with the bytes written.
+    if (value < 1.0) { return 1; }
+    int exp10 = 0;
+    if (value > upper_limit) {
+      int fx = 256;
+      for (int idx = 8; idx >= 0; --idx) {
+        if (value >= upper10[idx]) {
+          value *= lower10[idx];
+          exp10 += fx;
+        }
+        fx = fx >> 1;
+      }
+    }
+    int cnt = exp10;
+    for (; value >= 1.0; value /= 10.0) { ++cnt; }
+    return cnt;
+  }
+
+  /**
+   * @brief Compute how many bytes are needed to hold the output string.
+   *
+   * NaN takes 3 bytes and an infinity 8, plus 1 when negative. Any other value takes an
+   * optional sign, the integer digits reported by int_part_len(), a ',' for each full
+   * group of three integer digits that has another digit to its left, the '.', and
+   * `digits` fractional digits.
+   *
+   * @param value Float value to convert.
+   * @param digits Number of digits after the decimal point.
+   * @param is_float Not used by this function.
+   * @return Number of bytes required.
+   */
+  __device__ int compute_ftos_size(double value, int digits, bool is_float)
+  {
+    if (std::isnan(value)) { return 3; }  // "NaN"
+
+    // work on the magnitude and remember the sign (signbit also catches -0.0)
+    int const sign_len = signbit(value) ? 1 : 0;
+    if (sign_len != 0) { value = -value; }
+    if (std::isinf(value)) { return 8 + sign_len; }  // "Infinity"
+
+    int const int_len = int_part_len(value);
+
+    // One separator is needed each time three digits have been passed and at least one
+    // more digit follows.
+    int separators = 0;
+    int in_group   = 0;
+    for (int remaining = int_len; remaining > 0; --remaining) {
+      if (in_group == 3) {
+        ++separators;
+        in_group = 0;
+      }
+      ++in_group;
+    }
+    return sign_len + int_len + separators + 1 + digits;  // the 1 is the '.'
+  }
+};
+
+template <typename FloatType>
+struct format_float_fn {
+  column_device_view d_floats;  // input values
+  int digits;                   // number of fractional digits to write
+  size_type* d_offsets;         // row sizes while d_chars is null, write positions afterwards
+  char* d_chars;                // output bytes; null while the sizes are being computed
+
+  // Bytes needed to write `value` with `digits` fractional digits.
+  __device__ size_type compute_output_size(FloatType value, int digits)
+  {
+    bool const is_float = std::is_same_v<FloatType, float>;
+    ftos_converter converter;
+    return static_cast<size_type>(converter.compute_ftos_size(static_cast<double>(value), digits, is_float));
+  }
+
+  __device__ void format_float(size_type idx, int digits)
+  {
+    bool const is_float = std::is_same_v<FloatType, float>;
+    char* const out     = d_chars + d_offsets[idx];
+    ftos_converter converter;
+    converter.format_float(static_cast<double>(d_floats.element<FloatType>(idx)), digits, out, is_float);
+  }
+
+  __device__ void operator()(size_type idx)
+  {
+    bool const sizing = (d_chars == nullptr);
+    if (d_floats.is_null(idx)) {
+      if (sizing) { d_offsets[idx] = 0; }  // a null row takes no bytes
+    } else if (sizing) {
+      d_offsets[idx] = compute_output_size(d_floats.element<FloatType>(idx), digits);
+    } else {
+      format_float(idx, digits);
+    }
+  }
+};
+
+/**
+ * @brief Functor handed to type_dispatcher to format a column of floats as strings.
+ *
+ * The enable_if conditions route floating-point types to the first overload only.
+ */
+struct dispatch_format_float_fn {
+  template <typename FloatType, std::enable_if_t<std::is_floating_point_v<FloatType>>* = nullptr>
+  std::unique_ptr<column> operator()(column_view const& floats,
+                                     int digits,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr) const
+  {
+    auto const row_count = floats.size();
+    auto const d_floats  = column_device_view::create(floats, stream);
+
+    // the output starts from a copy of the input's null mask
+    rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr);
+
+    // offsets and chars children produced by the per-row formatter
+    auto [offsets, chars] = cudf::strings::detail::make_strings_children(
+      format_float_fn<FloatType>{*d_floats, digits}, row_count, stream, mr);
+
+    return make_strings_column(row_count,
+                               std::move(offsets),
+                               std::move(chars),
+                               floats.null_count(),
+                               std::move(null_mask));
+  }
+
+  // every other type is rejected with an exception
+  template <typename T, std::enable_if_t<not std::is_floating_point_v<T>>* = nullptr>
+  std::unique_ptr<column> operator()(column_view const&,
+                                     int,
+                                     rmm::cuda_stream_view,
+                                     rmm::mr::device_memory_resource*) const
+  {
+    CUDF_FAIL("Values for format_float function must be a float type.");
+  }
+};
+
+}  // namespace
+
+// Format every value of a float column as text with `digits` fractional digits. Throws
+// when `digits` is negative or when a non-empty column is not of a floating-point type.
+std::unique_ptr<column> format_float(column_view const& floats,
+                                    int digits,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(digits >= 0, "The number of fractional digits must not be negative.");
+  if (floats.is_empty()) return make_empty_column(type_id::STRING);
+  return type_dispatcher(floats.type(), dispatch_format_float_fn{}, floats, digits, stream, mr);
+}
+
+}  // namespace detail
+
+// Public entry point (declared in cast_string.hpp); forwards to detail::format_float.
+std::unique_ptr<column> format_float(column_view const& floats, 
+                                      int digits,
+                                      rmm::cuda_stream_view stream, 
+                                      rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::format_float(floats, digits, stream, mr);
+}
+
+}  // namespace spark_rapids_jni
\ No newline at end of file
diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt
index 76f0b52912..345a669092 100644
--- a/src/main/cpp/tests/CMakeLists.txt
+++ b/src/main/cpp/tests/CMakeLists.txt
@@ -51,8 +51,8 @@ ConfigureTest(CAST_STRING
 ConfigureTest(CAST_DECIMAL_TO_STRING
     cast_decimal_to_string.cpp)
 
-ConfigureTest(CAST_FLOAT_TO_STRING
-    cast_float_to_string.cpp)
+ConfigureTest(FORMAT_FLOAT
+    format_float.cpp)
 
 ConfigureTest(DATETIME_REBASE
     datetime_rebase.cpp)
diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/format_float.cpp
similarity index 86%
rename from src/main/cpp/tests/cast_float_to_string.cpp
rename to src/main/cpp/tests/format_float.cpp
index a86d988724..3e03578f4c 100644
--- a/src/main/cpp/tests/cast_float_to_string.cpp
+++ b/src/main/cpp/tests/format_float.cpp
@@ -31,9 +31,9 @@ using namespace cudf;
 
 constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS};
 
-struct FloatToStringTests : public cudf::test::BaseFixture {};
+struct FormatFloatTests : public cudf::test::BaseFixture {};
 
-TEST_F(FloatToStringTests, FromFloats32)
+TEST_F(FormatFloatTests, FormatFloats32)
 {
   std::vector<float> h_floats{100,
                               654321.25,
@@ -45,14 +45,14 @@ TEST_F(FloatToStringTests, FromFloats32)
                               123456789012.34,
                               -0.0};
   std::vector<char const*> h_expected{
-    "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.3954222323279E11", "-0.0"};
+    "100.0", "654,321.25", "-12,761.125", "0.0", "5.0", "-4.0", "NaN", "8.3954222323279E11", "-0.0"};
 
   cudf::test::fixed_width_column_wrapper<float> floats(
     h_floats.begin(),
     h_floats.end(),
     thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
 
-  auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream());
+  auto results = spark_rapids_jni::format_float(floats, 5, cudf::get_default_stream());
 
   cudf::test::strings_column_wrapper expected(
     h_expected.begin(),
@@ -62,7 +62,7 @@ TEST_F(FloatToStringTests, FromFloats32)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
 }
 
-TEST_F(FloatToStringTests, FromFloats64)
+TEST_F(FormatFloatTests, FormatFloats64)
 {
   std::vector<double> h_floats{100,
                                654321.25,
@@ -76,7 +76,7 @@ TEST_F(FloatToStringTests, FromFloats64)
                                839542223232.794248339,
                                -0.0};
   std::vector<char const*> h_expected{
-    "100.0", "654321.25", "-12761.125", "1.1234567891234568", "1.234567891234568E-19", 
+    "100.0", "654,321.25", "-12,761.125", "1.1234567891234568", "1.234567891234568E-19", 
     "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"};
 
   cudf::test::fixed_width_column_wrapper<double> floats(
@@ -84,7 +84,7 @@ TEST_F(FloatToStringTests, FromFloats64)
     h_floats.end(),
     thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
 
-  auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream());
+  auto results = spark_rapids_jni::format_float(floats, 5, cudf::get_default_stream());
 
   cudf::test::strings_column_wrapper expected(
     h_expected.begin(),
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index 3002e1cdab..ab07dc39dc 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -81,13 +81,13 @@ public static ColumnVector toDecimal(ColumnView cv, boolean ansiMode, boolean st
   }
 
   /**
-   * Convert a float column to a string column.
+   * Convert a float column to a formatted string column with d digits after the decimal point.
    *
    * @param cv the column data to process
    * @return the converted column
    */
-  public static ColumnVector fromFloat(ColumnView cv) {
-    return new ColumnVector(fromFloat(cv.getNativeView()));
+  public static ColumnVector formatFloat(ColumnView cv, int d) {
+    return new ColumnVector(formatFloat(cv.getNativeView(), d));
   }
 
   /**
@@ -147,7 +147,7 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled
       int precision, int scale);
   private static native long toFloat(long nativeColumnView, boolean ansi_enabled, int dtype);
   private static native long fromDecimal(long nativeColumnView);
-  private static native long fromFloat(long nativeColumnView);
+  private static native long formatFloat(long nativeColumnView, int d);
   private static native long toIntegersWithBase(long nativeColumnView, int base,
     boolean ansiEnabled, int dtype);
   private static native long fromIntegersWithBase(long nativeColumnView, int base);

From 007cf5ebc74a623e8f82df4feafd2416c8397af9 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 6 Nov 2023 15:58:47 +0800
Subject: [PATCH 08/54] rewrite the solution with ryu

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/cast_float_to_string.cu    | 1348 ++++++++++++++++---
 src/main/cpp/tests/cast_float_to_string.cpp |    2 +-
 2 files changed, 1127 insertions(+), 223 deletions(-)

diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index d1f66f772d..a594377b4a 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -41,6 +41,11 @@
 #include <cuda/std/climits>
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
+#include <cuda/std/cassert>
+// #include <cuda/std/cstdbool>
+#include <cuda/std/cstdint>
+// #include <cuda/std/cstdlib>
+// #include <cuda/std/cstring>
 
 using namespace cudf;
 
@@ -49,236 +54,1135 @@ namespace spark_rapids_jni {
 namespace detail {
 namespace {
 
+// Decimal floating-point value mantissa * 10^exponent; this is the result type of d2d().
+typedef struct floating_decimal_64 {
+  uint64_t mantissa;
+  // Decimal exponent's range is -324 to 308
+  // inclusive, and can fit in a short if needed.
+  int32_t exponent;
+} floating_decimal_64;
+
+// Decimal floating-point value mantissa * 10^exponent; this is the result type of f2d().
+typedef struct floating_decimal_32 {
+  uint32_t mantissa;
+  // Decimal exponent's range is -45 to 38
+  // inclusive, and can fit in a short if needed.
+  int32_t exponent;
+} floating_decimal_32;
+
 struct ftos_converter {
-  // significant digits is independent of scientific notation range
-  // digits more than this may require using long values instead of ints
-  static constexpr unsigned int significant_digits_float = 9;
-  static constexpr unsigned int significant_digits_double = 17;
-  static constexpr unsigned int eight_digits = 100000000;  // 1x10^8
-  static constexpr unsigned long long sixteen_digits = 10000000000000000; // 1x10^16
-  // Range of numbers here is for normalizing the value.
-  // If the value is above or below the following limits, the output is converted to
-  // scientific notation in order to show (at most) the number of significant digits.
-  static constexpr double upper_limit = 10000000;  // Spark's max is 1x10^7
-  static constexpr double lower_limit = 0.001;      // printf uses scientific notation below this
-  // Tables for doing normalization: converting to exponent form
-  // IEEE double float has maximum exponent of 305 so these should cover everything
-  double const upper10[9]  = {10, 100, 10000, 1e8, 1e16, 1e32, 1e64, 1e128, 1e256};
-  double const lower10[9]  = {.1, .01, .0001, 1e-8, 1e-16, 1e-32, 1e-64, 1e-128, 1e-256};
-  double const blower10[9] = {1.0, .1, .001, 1e-7, 1e-15, 1e-31, 1e-63, 1e-127, 1e-255};
-
-  // utility for quickly converting known integer range to character array
-  __device__ char* int2str(int value, char* output)
-  {
-    if (value == 0) {
-      *output++ = '0';
-      return output;
-    }
-    char buffer[significant_digits_double];  // should be big-enough for significant digits
-    char* ptr = buffer;
-    while (value > 0) {
-      *ptr++ = (char)('0' + (value % 10));
-      value /= 10;
-    }
-    while (ptr != buffer)
-      *output++ = *--ptr;  // 54321 -> 12345
-    return output;
-  }
-
-  /**
-   * @brief Dissect a float value into integer, decimal, and exponent components.
-   *
-   * @return The number of decimal places.
-   */
-  __device__ int dissect_value(double value,
-                               unsigned int& integer,
-                               unsigned long long& decimal,
-                               int& exp10,
-                               bool is_float = false)
-  {
-    // normalize step puts value between lower-limit and upper-limit
-    // by adjusting the exponent up or down
-    exp10 = 0;
-    if (value > upper_limit) {
-      int fx = 256;
-      for (int idx = 8; idx >= 0; --idx) {
-        if (value >= upper10[idx]) {
-          value *= lower10[idx];
-          exp10 += fx;
+
+  // Sizing constants: bit widths of the power-of-5 multipliers (the lookup tables below are generated by PrintDoubleLookupTable) and the IEEE-754 field layout of double and float.
+  static constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125;
+  static constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125;
+  static constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64);  // 61: the float path only uses the upper 64-bit word of a multiplier
+  static constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64);  // 61
+  static constexpr unsigned int DOUBLE_MANTISSA_BITS = 52;  // explicitly stored mantissa bits of a double
+  static constexpr unsigned int DOUBLE_EXPONENT_BITS = 11;
+  static constexpr unsigned int DOUBLE_BIAS = 1023;
+  static constexpr unsigned int FLOAT_MANTISSA_BITS = 23;  // explicitly stored mantissa bits of a float
+  static constexpr unsigned int FLOAT_EXPONENT_BITS = 8;
+  static constexpr unsigned int FLOAT_BIAS = 127;
+
+
+  // Returns the number of decimal digits in v (1 to 9); v must be below 10^9.
+  __device__ inline uint32_t decimalLength9(const uint32_t v) {
+    // Function precondition: v is not a 10-digit number.
+    // (f2s: 9 digits are sufficient for round-tripping.)
+    // (d2fixed: We print 9-digit blocks.)
+    assert(v < 1000000000);
+    if (v >= 100000000) { return 9; }
+    if (v >= 10000000) { return 8; }
+    if (v >= 1000000) { return 7; }
+    if (v >= 100000) { return 6; }
+    if (v >= 10000) { return 5; }
+    if (v >= 1000) { return 4; }
+    if (v >= 100) { return 3; }
+    if (v >= 10) { return 2; }
+    return 1;  // single digit; v == 0 also lands here
+  }
+
+  const uint64_t DOUBLE_POW5_INV_SPLIT2[15][2] = {  // row b: {low, high} 64-bit words of the scaled 1/5^(b * POW5_TABLE_SIZE); read by double_computeInvPow5
+    {                    1u, 2305843009213693952u },
+    {  5955668970331000884u, 1784059615882449851u },
+    {  8982663654677661702u, 1380349269358112757u },
+    {  7286864317269821294u, 2135987035920910082u },
+    {  7005857020398200553u, 1652639921975621497u },
+    { 17965325103354776697u, 1278668206209430417u },
+    {  8928596168509315048u, 1978643211784836272u },
+    { 10075671573058298858u, 1530901034580419511u },
+    {   597001226353042382u, 1184477304306571148u },
+    {  1527430471115325346u, 1832889850782397517u },
+    { 12533209867169019542u, 1418129833677084982u },
+    {  5577825024675947042u, 2194449627517475473u },
+    { 11006974540203867551u, 1697873161311732311u },
+    { 10313493231639821582u, 1313665730009899186u },
+    { 12701016819766672773u, 2032799256770390445u }
+  };
+
+  const uint32_t POW5_INV_OFFSETS[19] = {  // 2-bit correction per exponent i, 16 per word; double_computeInvPow5 reads (word[i / 16] >> (2 * (i % 16))) & 3
+    0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555,
+    0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054,
+    0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554,
+    0x00000000
+  };
+
+  const uint64_t DOUBLE_POW5_SPLIT2[13][2] = {  // row b: {low, high} 64-bit words of the scaled 5^(b * POW5_TABLE_SIZE); read by double_computePow5
+    {                    0u, 1152921504606846976u },
+    {                    0u, 1490116119384765625u },
+    {  1032610780636961552u, 1925929944387235853u },
+    {  7910200175544436838u, 1244603055572228341u },
+    { 16941905809032713930u, 1608611746708759036u },
+    { 13024893955298202172u, 2079081953128979843u },
+    {  6607496772837067824u, 1343575221513417750u },
+    { 17332926989895652603u, 1736530273035216783u },
+    { 13037379183483547984u, 2244412773384604712u },
+    {  1605989338741628675u, 1450417759929778918u },
+    {  9630225068416591280u, 1874621017369538693u },
+    {   665883850346957067u, 1211445438634777304u },
+    { 14931890668723713708u, 1565756531257009982u }
+  };
+
+  const uint32_t POW5_OFFSETS[21] = {  // 2-bit correction per exponent i, 16 per word; double_computePow5 reads (word[i / 16] >> (2 * (i % 16))) & 3
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995,
+    0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540,
+    0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151,
+    0x55559155, 0x51405555, 0x00000105
+  };
+
+  static constexpr uint32_t POW5_TABLE_SIZE = 26;  // number of entries in DOUBLE_POW5_TABLE and the exponent stride between rows of the *_SPLIT2 tables
+  const uint64_t DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {  // exact powers of five: entry k holds 5^k for k = 0..25
+  1ull, 5ull, 25ull, 125ull, 625ull, 3125ull, 15625ull, 78125ull, 390625ull,
+  1953125ull, 9765625ull, 48828125ull, 244140625ull, 1220703125ull, 6103515625ull,
+  30517578125ull, 152587890625ull, 762939453125ull, 3814697265625ull,
+  19073486328125ull, 95367431640625ull, 476837158203125ull,
+  2384185791015625ull, 11920928955078125ull, 59604644775390625ull,
+  298023223876953125ull //, 1490116119384765625ull
+  };
+
+  // Returns floor(log_2(5^e)), which is 0 for e == 0; requires 0 <= e <= 3528.
+  __device__ inline int32_t log2pow5(const int32_t e) {
+    // This approximation works up to the point that the multiplication overflows at e = 3529.
+    // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater
+    // than 2^9297.
+    assert(e >= 0);
+    assert(e <= 3528);
+    return (int32_t) ((((uint32_t) e) * 1217359) >> 19);  // 1217359 / 2^19 approximates log_2(5)
+  }
+
+  // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528.
+  __device__ inline int32_t pow5bits(const int32_t e) {
+    // This approximation works up to the point that the multiplication overflows at e = 3529.
+    // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater
+    // than 2^9297.
+    assert(e >= 0);
+    assert(e <= 3528);
+    return (int32_t) (((((uint32_t) e) * 1217359) >> 19) + 1);  // same fixed-point estimate as log2pow5(e), plus one
+  }
+
+  // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528 (asserted inside log2pow5).
+  __device__ inline int32_t ceil_log2pow5(const int32_t e) {
+    return log2pow5(e) + 1;  // evaluates to the same value as pow5bits(e)
+  }
+
+  // Returns floor(log_10(2^e)); requires 0 <= e <= 1650.
+  __device__ inline uint32_t log10Pow2(const int32_t e) {
+    // The first value this approximation fails for is 2^1651 which is just greater than 10^297.
+    assert(e >= 0);
+    assert(e <= 1650);
+    return (((uint32_t) e) * 78913) >> 18;  // 78913 / 2^18 approximates log_10(2)
+  }
+
+  // Returns floor(log_10(5^e)); requires 0 <= e <= 2620.
+  __device__ inline uint32_t log10Pow5(const int32_t e) {
+    // The first value this approximation fails for is 5^2621 which is just greater than 10^1832.
+    assert(e >= 0);
+    assert(e <= 2620);
+    return (((uint32_t) e) * 732923) >> 20;  // 732923 / 2^20 approximates log_10(5)
+  }
+
+  __device__ inline uint32_t pow5factor_32(uint32_t value) {  // Returns how many times 5 divides value; value must be nonzero.
+    uint32_t count = 0;
+    for (;;) {
+      assert(value != 0);  // zero is divisible by 5 forever, so the loop would never terminate
+      const uint32_t q = value / 5;
+      const uint32_t r = value % 5;
+      if (r != 0) {
+        break;
+      }
+      value = q;  // strip one factor of 5 and keep counting
+      ++count;
+    }
+    return count;
+  }
+
+  // Returns true if value is divisible by 5^p; value must be nonzero (asserted inside pow5factor_32).
+  __device__ inline bool multipleOfPowerOf5_32(const uint32_t value, const uint32_t p) {
+    return pow5factor_32(value) >= p;
+  }
+
+  // Returns true if value is divisible by 2^p; p must be below 32 (not asserted here, unlike the 64-bit variant).
+  __device__ inline bool multipleOfPowerOf2_32(const uint32_t value, const uint32_t p) {
+    // __builtin_ctz doesn't appear to be faster here.
+    return (value & ((1u << p) - 1)) == 0;  // true when the low p bits of value are all zero
+  }
+
+  // Returns floor((m * factor) / 2^shift); requires shift > 32 and a result that fits in 32 bits.
+  // Avoids uint128_t: this seemed slightly faster, although the uint128_t code looks slightly nicer.
+  __device__ inline uint32_t mulShift32(const uint32_t m, const uint64_t factor, const int32_t shift) {
+    assert(shift > 32);
+
+    // The casts here help MSVC to avoid calls to the __allmul library
+    // function.
+    const uint32_t factorLo = (uint32_t)(factor);
+    const uint32_t factorHi = (uint32_t)(factor >> 32);
+    const uint64_t bits0 = (uint64_t)m * factorLo;
+    const uint64_t bits1 = (uint64_t)m * factorHi;
+
+    const uint64_t sum = (bits0 >> 32) + bits1;  // equals (m * factor) >> 32
+    const uint64_t shiftedSum = sum >> (shift - 32);  // apply the remaining part of the shift
+    assert(shiftedSum <= UINT32_MAX);
+    return (uint32_t) shiftedSum;
+
+  }
+
+  __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa) {  // Writes "NaN", "[-]Infinity" or "[-]0.0" into result (no terminator) and returns the number of chars written.
+    if (mantissa) {  // the mantissa flag takes priority; NaN is written without a sign
+      memcpy(result, "NaN", 3);
+      return 3;
+    }
+    if (sign) {
+      result[0] = '-';
+    }
+    if (exponent) {  // exponent flag set with the mantissa flag clear
+      memcpy(result + sign, "Infinity", 8);  // bool sign converts to an offset of 0 or 1
+      return sign + 8;
+    }
+    memcpy(result + sign, "0.0", 3);  // neither flag set: signed zero
+    return sign + 3;
+  }
+
+  __device__ inline int special_str_size(const bool sign, const bool exponent, const bool mantissa) {  // Returns the length copy_special_str would write for the same flags, without writing anything.
+    if (mantissa) {
+      return 3;  // "NaN"
+    }
+    if (exponent) {
+      return sign + 8;  // optional '-' plus "Infinity"
+    }
+    return sign + 3;  // optional '-' plus "0.0"
+  }
+
+  __device__ inline uint32_t float_to_bits(const float f) {  // Returns the object representation of f as a 32-bit integer.
+    uint32_t bits = 0;
+    memcpy(&bits, &f, sizeof(float));  // byte copy instead of a pointer cast
+    return bits;
+  }
+
+  __device__ inline uint64_t double_to_bits(const double d) {  // Returns the object representation of d as a 64-bit integer.
+    uint64_t bits = 0;
+    memcpy(&bits, &d, sizeof(double));  // byte copy instead of a pointer cast
+    return bits;
+  }
+
+  __device__ inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {  // 64x64 -> 128-bit multiply: returns the low 64 bits and stores the high 64 bits in *productHi.
+    // The casts here help MSVC to avoid calls to the __allmul library function.
+    const uint32_t aLo = (uint32_t)a;
+    const uint32_t aHi = (uint32_t)(a >> 32);
+    const uint32_t bLo = (uint32_t)b;
+    const uint32_t bHi = (uint32_t)(b >> 32);
+
+    const uint64_t b00 = (uint64_t)aLo * bLo;  // the four 32x32 -> 64-bit partial products
+    const uint64_t b01 = (uint64_t)aLo * bHi;
+    const uint64_t b10 = (uint64_t)aHi * bLo;
+    const uint64_t b11 = (uint64_t)aHi * bHi;
+
+    const uint32_t b00Lo = (uint32_t)b00;
+    const uint32_t b00Hi = (uint32_t)(b00 >> 32);
+
+    const uint64_t mid1 = b10 + b00Hi;  // middle column, first addend plus the carry out of b00
+    const uint32_t mid1Lo = (uint32_t)(mid1);
+    const uint32_t mid1Hi = (uint32_t)(mid1 >> 32);
+
+    const uint64_t mid2 = b01 + mid1Lo;  // middle column, second addend
+    const uint32_t mid2Lo = (uint32_t)(mid2);
+    const uint32_t mid2Hi = (uint32_t)(mid2 >> 32);
+
+    const uint64_t pHi = b11 + mid1Hi + mid2Hi;  // high word: top product plus both middle-column carries
+    const uint64_t pLo = ((uint64_t)mid2Lo << 32) | b00Lo;
+
+    *productHi = pHi;
+    return pLo;
+  }
+
+  __device__ inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {  // Returns the low 64 bits of the 128-bit value hi:lo shifted right by dist.
+    // Only 0 < dist < 64 is supported (asserted below); dist >= 64 is not handled here.
+    assert(dist < 64);
+    assert(dist > 0);  // dist == 0 would make the (64 - dist) shift below a shift by the full width
+    return (hi << (64 - dist)) | (lo >> dist);
+  }
+
+  __device__ inline uint64_t div5(const uint64_t x) {  // x / 5, truncating
+    return x / 5;
+  }
+
+  __device__ inline uint64_t div10(const uint64_t x) {  // x / 10, truncating
+    return x / 10;
+  }
+
+  __device__ inline uint64_t div100(const uint64_t x) {  // x / 100, truncating
+    return x / 100;
+  }
+
+  __device__ inline uint64_t div1e8(const uint64_t x) {  // x / 10^8, truncating
+    return x / 100000000;
+  }
+
+  __device__ inline uint64_t div1e9(const uint64_t x) {  // x / 10^9, truncating
+    return x / 1000000000;
+  }
+
+  __device__ inline uint32_t mod1e9(const uint64_t x) {  // x mod 10^9, derived from the quotient; the remainder always fits in 32 bits
+    return (uint32_t) (x - 1000000000 * div1e9(x));
+  }
+
+  __device__ inline uint32_t pow5Factor(uint64_t value) {  // Returns how many times 5 divides value; value must be nonzero.
+    const uint64_t m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64)
+    const uint64_t n_div_5 = 3689348814741910323u;  // floor(2^64 / 5): value * m_inv_5 (mod 2^64) is <= this iff 5 divides value
+    uint32_t count = 0;
+    for (;;) {
+      assert(value != 0);
+      value *= m_inv_5;  // for a multiple of 5 this wrapping product is exactly value / 5
+      if (value > n_div_5)
+        break;
+      ++count;
+    }
+    return count;
+  }
+
+  // Returns true if value is divisible by 5^p; value must be nonzero (asserted inside pow5Factor).
+  __device__ inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) {
+    // I tried a case distinction on p, but there was no performance difference.
+    return pow5Factor(value) >= p;
+  }
+
+  // Returns true if value is divisible by 2^p; requires value != 0 and p < 64 (both asserted).
+  __device__ inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) {
+    assert(value != 0);
+    assert(p < 64);
+    // __builtin_ctzll doesn't appear to be faster here.
+    return (value & ((1ull << p) - 1)) == 0;  // true when the low p bits of value are all zero
+  }
+
+  __device__ inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {  // Returns the low 64 bits of (m * (mul[1] * 2^64 + mul[0])) >> j; shiftright128 requires 64 < j < 128.
+    // m is maximum 55 bits
+    uint64_t high1;                                   // 128
+    const uint64_t low1 = umul128(m, mul[1], &high1); // 64
+    uint64_t high0;                                   // 64
+    umul128(m, mul[0], &high0);                       // 0
+    const uint64_t sum = high0 + low1;  // bits 64..127 of the product; bits 0..63 are dropped because j > 64
+    if (sum < high0) {
+      ++high1; // overflow into high1
+    }
+    return shiftright128(sum, high1, j - 64);
+  }
+
+  __device__ inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,  // Runs mulShift64 on 4m (returned), 4m + 2 (stored in *vp) and 4m - 1 - mmShift (stored in *vm).
+    uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
+    *vp = mulShift64(4 * m + 2, mul, j);
+    *vm = mulShift64(4 * m - 1 - mmShift, mul, j);
+    return mulShift64(4 * m, mul, j);
+  }
+
+  // Computes 5^i in the form required by Ryu, and stores it in result (result[0] = low word, result[1] = high word).
+  __device__ inline void double_computePow5(const uint32_t i, uint64_t* const result) {
+    const uint32_t base = i / POW5_TABLE_SIZE;
+    const uint32_t base2 = base * POW5_TABLE_SIZE;  // largest multiple of POW5_TABLE_SIZE that is <= i
+    const uint32_t offset = i - base2;
+    const uint64_t* const mul = DOUBLE_POW5_SPLIT2[base];  // table entry for 5^base2
+    if (offset == 0) {  // exact table hit: copy the entry unchanged
+      result[0] = mul[0];
+      result[1] = mul[1];
+      return;
+    }
+    const uint64_t m = DOUBLE_POW5_TABLE[offset];  // exact 5^offset
+    uint64_t high1;
+    const uint64_t low1 = umul128(m, mul[1], &high1);
+    uint64_t high0;
+    const uint64_t low0 = umul128(m, mul[0], &high0);
+    const uint64_t sum = high0 + low1;
+    if (sum < high0) {
+      ++high1; // overflow into high1
+    }
+    // high1 | sum | low0
+    const uint32_t delta = pow5bits(i) - pow5bits(base2);  // extra bits of 5^i relative to 5^base2, shifted back out below
+    result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);  // low word plus the 2-bit correction stored for i
+    result[1] = shiftright128(sum, high1, delta);
+  }
+
+  // Computes 5^-i in the form required by Ryu, and stores it in result (result[0] = low word, result[1] = high word).
+  __device__ inline void double_computeInvPow5(const uint32_t i, uint64_t* const result) {
+    const uint32_t base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE;  // rounds up, so that base2 >= i
+    const uint32_t base2 = base * POW5_TABLE_SIZE;
+    const uint32_t offset = base2 - i;
+    const uint64_t* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2
+    if (offset == 0) {  // exact table hit: copy the entry unchanged
+      result[0] = mul[0];
+      result[1] = mul[1];
+      return;
+    }
+    const uint64_t m = DOUBLE_POW5_TABLE[offset];  // exact 5^offset; multiplying 1/5^base2 by it gives 1/5^i
+    uint64_t high1;
+    const uint64_t low1 = umul128(m, mul[1], &high1);
+    uint64_t high0;
+    const uint64_t low0 = umul128(m, mul[0] - 1, &high0);  // the low word is used minus one here; one is added back to result[0] below
+    const uint64_t sum = high0 + low1;
+    if (sum < high0) {
+      ++high1; // overflow into high1
+    }
+    // high1 | sum | low0
+    const uint32_t delta = pow5bits(base2) - pow5bits(i);
+    result[0] = shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);  // low word, plus one, plus the 2-bit correction stored for i
+    result[1] = shiftright128(sum, high1, delta);
+  }
+
+  __device__ inline uint32_t mulPow5InvDivPow2(const uint32_t m, const uint32_t q, const int32_t j) {  // Returns mulShift32(m, high word of the 5^-q multiplier + 1, j); used by the 32-bit (float) path.
+    // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double lookup
+    // table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely on the
+    // fact that the added 1 that's already stored in the table never overflows into the upper 64 bits.
+    uint64_t pow5[2];
+    double_computeInvPow5(q, pow5);
+    return mulShift32(m, pow5[1] + 1, j);
+  }
+
+  __device__ inline uint32_t mulPow5divPow2(const uint32_t m, const uint32_t i, const int32_t j) {  // Returns mulShift32(m, high word of the 5^i multiplier, j); used by the 32-bit (float) path.
+    uint64_t pow5[2];
+    double_computePow5(i, pow5);
+    return mulShift32(m, pow5[1], j);  // only the upper 64-bit word of the 128-bit multiplier is used
+  }
+
+  __device__ inline uint32_t decimalLength17(const uint64_t v) {  // Returns the number of decimal digits in v (1 to 17); v must be below 10^17.
+    // This is slightly faster than a loop.
+    // The average output length is 16.38 digits, so we check high-to-low.
+    // Function precondition: v is not an 18, 19, or 20-digit number.
+    // (17 digits are sufficient for round-tripping.)
+    assert(v < 100000000000000000L);
+    if (v >= 10000000000000000L) { return 17; }
+    if (v >= 1000000000000000L) { return 16; }
+    if (v >= 100000000000000L) { return 15; }
+    if (v >= 10000000000000L) { return 14; }
+    if (v >= 1000000000000L) { return 13; }
+    if (v >= 100000000000L) { return 12; }
+    if (v >= 10000000000L) { return 11; }
+    if (v >= 1000000000L) { return 10; }
+    if (v >= 100000000L) { return 9; }
+    if (v >= 10000000L) { return 8; }
+    if (v >= 1000000L) { return 7; }
+    if (v >= 100000L) { return 6; }
+    if (v >= 10000L) { return 5; }
+    if (v >= 1000L) { return 4; }
+    if (v >= 100L) { return 3; }
+    if (v >= 10L) { return 2; }
+    return 1;  // single digit; v == 0 also lands here
+  }
+
+  __device__ inline floating_decimal_64 d2d(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) {  // Ryu core for double: turns the raw IEEE mantissa/exponent fields into the shortest decimal (mantissa, exponent) inside the rounding interval. NOTE(review): presumably only called for finite, nonzero inputs - callers are not visible here.
+    int32_t e2;  // Step 1: decode the fields so that the value equals m2 * 2^(e2 + 2)
+    uint64_t m2;
+    if (ieeeExponent == 0) {
+      // We subtract 2 so that the bounds computation has 2 additional bits.
+      e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
+      m2 = ieeeMantissa;
+    } else {
+      e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
+      m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
+    }
+    const bool even = (m2 & 1) == 0;
+    const bool acceptBounds = even;  // interval end points are accepted only when m2 is even
+
+    // Step 2: Determine the interval of valid decimal representations.
+    const uint64_t mv = 4 * m2;
+    // Implicit bool -> int conversion. True is 1, false is 0.
+    const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
+    // We would compute mp and mm like this:
+    // uint64_t mp = 4 * m2 + 2;
+    // uint64_t mm = mv - 1 - mmShift;
+
+    // Step 3: Convert to a decimal power base using 128-bit arithmetic.
+    uint64_t vr, vp, vm;
+    int32_t e10;
+    bool vmIsTrailingZeros = false;
+    bool vrIsTrailingZeros = false;
+    if (e2 >= 0) {
+      // I tried special-casing q == 0, but there was no effect on performance.
+      // This expression is slightly faster than max(0, log10Pow2(e2) - 1).
+      const uint32_t q = log10Pow2(e2) - (e2 > 3);
+      e10 = (int32_t) q;
+      const int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
+      const int32_t i = -e2 + (int32_t) q + k;
+      uint64_t pow5[2];
+      double_computeInvPow5(q, pow5);
+      vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift);
+
+      if (q <= 21) {
+        // This should use q <= 22, but I think 21 is also safe. Smaller values
+        // may still be safe, but it's more difficult to reason about them.
+        // Only one of mp, mv, and mm can be a multiple of 5, if any.
+        const uint32_t mvMod5 = ((uint32_t) mv) - 5 * ((uint32_t) div5(mv));
+        if (mvMod5 == 0) {
+          vrIsTrailingZeros = multipleOfPowerOf5(mv, q);
+        } else if (acceptBounds) {
+          // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q
+          // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q
+          // <=> true && pow5Factor(mm) >= q, since e2 >= q.
+          vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q);
+        } else {
+          // Same as min(e2 + 1, pow5Factor(mp)) >= q.
+          vp -= multipleOfPowerOf5(mv + 2, q);
         }
-        fx = fx >> 1;
-      }
-    } else if ((value > 0.0) && (value < lower_limit)) {
-      int fx = 256;
-      for (int idx = 8; idx >= 0; --idx) {
-        if (value < blower10[idx]) {
-          value *= upper10[idx];
-          exp10 -= fx;
+      }
+    } else {
+      // This expression is slightly faster than max(0, log10Pow5(-e2) - 1).
+      const uint32_t q = log10Pow5(-e2) - (-e2 > 1);
+      e10 = (int32_t) q + e2;
+      const int32_t i = -e2 - (int32_t) q;
+      const int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT;
+      const int32_t j = (int32_t) q - k;
+
+      uint64_t pow5[2];
+      double_computePow5(i, pow5);
+      vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift);
+
+      if (q <= 1) {
+        // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
+        // mv = 4 * m2, so it always has at least two trailing 0 bits.
+        vrIsTrailingZeros = true;
+        if (acceptBounds) {
+          // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
+          vmIsTrailingZeros = mmShift == 1;
+        } else {
+          // mp = mv + 2, so it always has at least one trailing 0 bit.
+          --vp;
         }
-        fx = fx >> 1;
-      }
-    }
-    //
-    // int decimal_places = significant_digits - (exp10? 2 : 1);
-    // unsigned long long max_digits = (exp10? fifteen_digits : sixteen_digits);
-    int decimal_places = (is_float? significant_digits_float: significant_digits_double) - 1;
-    unsigned long long max_digits = (is_float? eight_digits: sixteen_digits);
-    double temp_value = value;
-    while (temp_value < 1.0 && temp_value > 0.0) {
-      max_digits *= 10;
-      temp_value *= 10.0;
-      decimal_places++;
-    }
-    integer                 = (unsigned int)value;
-    for (unsigned int i = integer; i >= 10; i /= 10) {
-      --decimal_places;
-      max_digits /= 10;
-    }
-    double diff = value - (double)integer;
-    double remainder = diff * (double)max_digits;
-    decimal          = (unsigned long long)remainder;
-    remainder -= (double)decimal;
-    decimal += (unsigned long long)(2.0 * remainder); // round up
-    if (decimal >= max_digits) {
-      decimal = 0;
-      ++integer;
-      if (exp10 && (integer >= 10)) {
-        ++exp10;
-        integer = 1;
-      }
-    }
-    //
-    while ((decimal % 10) == 0 && (decimal_places > 0)) {
-      decimal /= 10;
-      --decimal_places;
-    }
-    return decimal_places;
-  }
-
-  /**
-   * @brief Main kernel method for converting float value to char output array.
-   *
-   * Output need not be more than (significant_digits + 7) bytes:
-   * 7 = 1 sign, 1 decimal point, 1 exponent ('e'), 1 exponent-sign, 3 digits for exponent
-   *
-   * @param value Float value to convert.
-   * @param output Memory to write output characters.
-   * @return Number of bytes written.
-   */
-  __device__ int float_to_string(double value, char* output, bool is_float)
-  {
-    // check for valid value
-    if (std::isnan(value)) {
-      memcpy(output, "NaN", 3);
-      return 3;
+      } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here.
+        // We want to know if the full product has at least q trailing zeros.
+        // We need to compute min(p2(mv), p5(mv) - e2) >= q
+        // <=> p2(mv) >= q && p5(mv) - e2 >= q
+        // <=> p2(mv) >= q (because -e2 >= q)
+        vrIsTrailingZeros = multipleOfPowerOf2(mv, q);
+      }
     }
-    bool const bneg = [&value]() {
-      if (signbit(value)) {  // handles -0.0 too
-        value = -value;
-        return true;
+
+    // Step 4: Find the shortest decimal representation in the interval of valid representations.
+    int32_t removed = 0;
+    uint8_t lastRemovedDigit = 0;
+    uint64_t output;
+    // On average, we remove ~2 digits.
+    if (vmIsTrailingZeros || vrIsTrailingZeros) {
+      // General case, which happens rarely (~0.7%).
+      for (;;) {
+        const uint64_t vpDiv10 = div10(vp);
+        const uint64_t vmDiv10 = div10(vm);
+        if (vpDiv10 <= vmDiv10) {
+          break;
+        }
+        const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10);
+        const uint64_t vrDiv10 = div10(vr);
+        const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
+        vmIsTrailingZeros &= vmMod10 == 0;
+        vrIsTrailingZeros &= lastRemovedDigit == 0;
+        lastRemovedDigit = (uint8_t) vrMod10;
+        vr = vrDiv10;
+        vp = vpDiv10;
+        vm = vmDiv10;
+        ++removed;
+      }
+
+      if (vmIsTrailingZeros) {
+        for (;;) {
+          const uint64_t vmDiv10 = div10(vm);
+          const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10);
+          if (vmMod10 != 0) {
+            break;
+          }
+          const uint64_t vpDiv10 = div10(vp);
+          const uint64_t vrDiv10 = div10(vr);
+          const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
+          vrIsTrailingZeros &= lastRemovedDigit == 0;
+          lastRemovedDigit = (uint8_t) vrMod10;
+          vr = vrDiv10;
+          vp = vpDiv10;
+          vm = vmDiv10;
+          ++removed;
+        }
+      }
+
+      if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
+        // Round even if the exact number is .....50..0.
+        lastRemovedDigit = 4;
+      }
+      // We need to take vr + 1 if vr is outside bounds or we need to round up.
+      output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
+    } else {
+      // Specialized for the common case (~99.3%). Percentages below are relative to this.
+      bool roundUp = false;
+      const uint64_t vpDiv100 = div100(vp);
+      const uint64_t vmDiv100 = div100(vm);
+      if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%).
+        const uint64_t vrDiv100 = div100(vr);
+        const uint32_t vrMod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrDiv100);
+        roundUp = vrMod100 >= 50;
+        vr = vrDiv100;
+        vp = vpDiv100;
+        vm = vmDiv100;
+        removed += 2;
+      }
+      // Loop iterations below (approximately), without optimization above:
+      // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02%
+      // Loop iterations below (approximately), with optimization above:
+      // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02%
+      for (;;) {
+        const uint64_t vpDiv10 = div10(vp);
+        const uint64_t vmDiv10 = div10(vm);
+        if (vpDiv10 <= vmDiv10) {
+          break;
+        }
+        const uint64_t vrDiv10 = div10(vr);
+        const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
+        roundUp = vrMod10 >= 5;
+        vr = vrDiv10;
+        vp = vpDiv10;
+        vm = vmDiv10;
+        ++removed;
+      }
+
+      // We need to take vr + 1 if vr is outside bounds or we need to round up.
+      output = vr + (vr == vm || roundUp);
+    }
+    const int32_t exp = e10 + removed;  // every removed digit raises the decimal exponent by one
+
+    floating_decimal_64 fd;
+    fd.exponent = exp;
+    fd.mantissa = output;
+    return fd;
+  }
+
+  __device__ inline floating_decimal_32 f2d(const uint32_t ieeeMantissa, const uint32_t ieeeExponent) {
+    int32_t e2;
+    uint32_t m2;
+    if (ieeeExponent == 0) {
+      // We subtract 2 so that the bounds computation has 2 additional bits.
+      e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
+      m2 = ieeeMantissa;
+    } else {
+      e2 = (int32_t) ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
+      m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa;
+    }
+    const bool even = (m2 & 1) == 0;
+    const bool acceptBounds = even;
+
+    // Step 2: Determine the interval of valid decimal representations.
+    const uint32_t mv = 4 * m2;
+    const uint32_t mp = 4 * m2 + 2;
+    // Implicit bool -> int conversion. True is 1, false is 0.
+    const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
+    const uint32_t mm = 4 * m2 - 1 - mmShift;
+
+    // Step 3: Convert to a decimal power base using 64-bit arithmetic.
+    uint32_t vr, vp, vm;
+    int32_t e10;
+    bool vmIsTrailingZeros = false;
+    bool vrIsTrailingZeros = false;
+    uint8_t lastRemovedDigit = 0;
+    if (e2 >= 0) {
+      const uint32_t q = log10Pow2(e2);
+      e10 = (int32_t) q;
+      const int32_t k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
+      const int32_t i = -e2 + (int32_t) q + k;
+      vr = mulPow5InvDivPow2(mv, q, i);
+      vp = mulPow5InvDivPow2(mp, q, i);
+      vm = mulPow5InvDivPow2(mm, q, i);
+      if (q != 0 && (vp - 1) / 10 <= vm / 10) {
+        // We need to know one removed digit even if we are not going to loop below. We could use
+        // q = X - 1 above, except that would require 33 bits for the result, and we've found that
+        // 32-bit arithmetic is faster even on 64-bit machines.
+        const int32_t l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) (q - 1)) - 1;
+        lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t) q - 1 + l) % 10);
+      }
+      if (q <= 9) {
+        // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well.
+        // Only one of mp, mv, and mm can be a multiple of 5, if any.
+        if (mv % 5 == 0) {
+          vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q);
+        } else if (acceptBounds) {
+          vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q);
+        } else {
+          vp -= multipleOfPowerOf5_32(mp, q);
+        }
+      }
+    } else {
+      const uint32_t q = log10Pow5(-e2);
+      e10 = (int32_t) q + e2;
+      const int32_t i = -e2 - (int32_t) q;
+      const int32_t k = pow5bits(i) - FLOAT_POW5_BITCOUNT;
+      int32_t j = (int32_t) q - k;
+      vr = mulPow5divPow2(mv, (uint32_t) i, j);
+      vp = mulPow5divPow2(mp, (uint32_t) i, j);
+      vm = mulPow5divPow2(mm, (uint32_t) i, j);
+      if (q != 0 && (vp - 1) / 10 <= vm / 10) {
+        j = (int32_t) q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT);
+        lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, (uint32_t) (i + 1), j) % 10);
+      }
+      if (q <= 1) {
+        // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
+        // mv = 4 * m2, so it always has at least two trailing 0 bits.
+        vrIsTrailingZeros = true;
+        if (acceptBounds) {
+          // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
+          vmIsTrailingZeros = mmShift == 1;
+        } else {
+          // mp = mv + 2, so it always has at least one trailing 0 bit.
+          --vp;
+        }
+      } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here.
+        vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1);
+      }
+    }
+
+    // Step 4: Find the shortest decimal representation in the interval of valid representations.
+    int32_t removed = 0;
+    uint32_t output;
+    if (vmIsTrailingZeros || vrIsTrailingZeros) {
+      // General case, which happens rarely (~4.0%).
+      while (vp / 10 > vm / 10) {
+        vmIsTrailingZeros &= vm % 10 == 0;
+        vrIsTrailingZeros &= lastRemovedDigit == 0;
+        lastRemovedDigit = (uint8_t) (vr % 10);
+        vr /= 10;
+        vp /= 10;
+        vm /= 10;
+        ++removed;
+      }
+      if (vmIsTrailingZeros) {
+        while (vm % 10 == 0) {
+          vrIsTrailingZeros &= lastRemovedDigit == 0;
+          lastRemovedDigit = (uint8_t) (vr % 10);
+          vr /= 10;
+          vp /= 10;
+          vm /= 10;
+          ++removed;
+        }
+      }
+      if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
+        // Round even if the exact number is .....50..0.
+        lastRemovedDigit = 4;
+      }
+      // We need to take vr + 1 if vr is outside bounds or we need to round up.
+      output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
+    } else {
+      // Specialized for the common case (~96.0%). Percentages below are relative to this.
+      // Loop iterations below (approximately):
+      // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01%
+      while (vp / 10 > vm / 10) {
+        lastRemovedDigit = (uint8_t) (vr % 10);
+        vr /= 10;
+        vp /= 10;
+        vm /= 10;
+        ++removed;
+      }
+      // We need to take vr + 1 if vr is outside bounds or we need to round up.
+      output = vr + (vr == vm || lastRemovedDigit >= 5);
+    }
+    const int32_t exp = e10 + removed;
+
+    floating_decimal_32 fd;
+    fd.exponent = exp;
+    fd.mantissa = output;
+    return fd;
+  }
+
+  __device__ inline int to_chars(const floating_decimal_64 v, const bool sign, char* const result) {
+    // Step 5: Print the decimal representation of v into result; returns the number of chars written.
+    int index = 0;
+    if (sign) {
+      result[index++] = '-';
+    }
+
+    uint64_t output = v.mantissa;
+    const uint32_t olength = decimalLength17(output);
+    int32_t exp = v.exponent + (int32_t) olength - 1;  // decimal exponent of the leading digit
+    bool scientificNotation = (exp < -3) || (exp >= 7);
+    
+    // Values in the interval [1E-3, 1E7) are printed in plain decimal form; all others use scientific notation.
+    if (scientificNotation) {
+      // Print in the format x.xxxxxE-yy.
+      for (uint32_t i = 0; i < olength - 1; ++i) {
+        const uint32_t c = output % 10; output /= 10;
+        result[index + olength - i] = (char) ('0' + c);  // fill right-to-left, leaving result[index + 1] free for '.'
+      }
+      result[index] = '0' + output % 10;  // leading digit
+      result[index + 1] = '.';
+      index += olength + 1;
+      if (olength == 1) {  // a single digit is printed as "d.0"
+        result[index++] = '0';
+      }
+      // Print 'E', the exponent sign, and the exponent, which has at most three digits.
+      result[index++] = 'E';
+      if (exp < 0) {
+        result[index++] = '-';
+        exp = -exp;
+      }
+      if (exp >= 100) {
+          result[index++] = (char) ('0' + exp / 100);
+          exp %= 100;
+          result[index++] = (char) ('0' + exp / 10);
+        } else if (exp >= 10) {
+          result[index++] = (char) ('0' + exp / 10);
+        }
+        result[index++] = (char) ('0' + exp % 10);
+    } else {
+      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
+      if (exp < 0) {
+        // Decimal dot is before any of the digits.
+        result[index++] = '0';
+        result[index++] = '.';
+        for (int i = -1; i > exp; i--) {  // (-exp - 1) zeros between the dot and the first digit
+          result[index++] = '0';
+        }
+        int current = index;
+        for (int i = 0; i < olength; i++) {
+          result[current + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+          index++;  // one char per digit written
+        }
+      } else if (exp + 1 >= olength) {
+        // Decimal dot is after any of the digits.
+        for (int i = 0; i < olength; i++) {
+          result[index + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+        }
+        index += olength;
+        for (int i = olength; i < exp + 1; i++) {  // pad the integer part to exp + 1 digits
+          result[index++] = '0';
+        }
+        result[index++] = '.';
+        result[index++] = '0';
+      } else {
+        // Decimal dot is somewhere between the digits.
+        int current = index + 1;
+        for (int i = 0; i < olength; i++) {
+          if (olength - i - 1 == exp) {  // fraction is done: place the dot, then shift remaining digits one slot left
+            result[current + olength - i - 1] = '.';
+            current--;
+          }
+          result[current + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+        }
+        index += olength + 1;
+      }
+    }
+    return index;
+  }
+
+  __device__ inline int d2s_size(const floating_decimal_64 v, const bool sign) {
+    int index = 0;  // running char count, accumulated in the same order to_chars writes
+    if (sign) {
+      index++;
+    }
+
+    uint64_t output = v.mantissa;
+    const uint32_t olength = decimalLength17(output);
+    int32_t exp = v.exponent + (int32_t) olength - 1;  // decimal exponent of the leading digit
+    bool scientificNotation = (exp < -3) || (exp >= 7);
+    
+    if (scientificNotation) {
+      index += olength + 1;  // all digits plus the '.'
+      if (olength == 1) {
+        index++;  // the '0' that follows the dot when there is a single digit
+      }
+      // 'E'
+      index++;
+      if (exp < 0) {
+        exp = -exp;
+        index++;  // '-' of the exponent
+      }
+      if (exp >= 100) {
+        index += 3;
+      } else if (exp >= 10) {
+        index += 2;
       } else {
-        return false;
+        index++;
       }
-    }();
-    if (std::isinf(value)) {
-      if (bneg) {
-        memcpy(output, "-Infinity", 9);
+    } else {
+      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
+      if (exp < 0) {
+        index += 1 - exp + olength;  // "0." + (-exp - 1) zeros + digits
+      } else if (exp + 1 >= olength) {
+        index += exp + 3;  // (exp + 1) integer digits + ".0"
       } else {
-        memcpy(output, "Infinity", 8);
-      }
-      return bneg ? 9 : 8;
-    }
-
-    // dissect value into components
-    unsigned int integer = 0;
-    unsigned long long decimal = 0;
-    int exp10          = 0;
-    int decimal_places = dissect_value(value, integer, decimal, exp10, is_float);
-    //
-    // now build the string from the
-    // components: sign, integer, decimal, exp10, decimal_places
-    //
-    // sign
-    char* ptr = output;
-    if (bneg) *ptr++ = '-';
-    // integer
-    ptr = int2str(integer, ptr);
-    // decimal
-    *ptr++ = '.';
-    if (decimal_places) {
-      char buffer[significant_digits_double];
-      char* pb = buffer;
-      while (decimal_places--) {
-        *pb++ = (char)('0' + (decimal % 10));
-        decimal /= 10;
-      }
-      while (pb != buffer)  // reverses the digits
-        *ptr++ = *--pb;     // e.g. 54321 -> 12345
-    } else
-      *ptr++ = '0';  // always include at least .0
-    // exponent
-    if (exp10) {
-      *ptr++ = 'E';
-      if (exp10 < 0) {
-        *ptr++ = '-';
-        exp10  = -exp10;
-      }
-      // if (exp10 < 10) *ptr++ = '0';  // extra zero-pad
-      ptr = int2str(exp10, ptr);
-    }
-    // done
-    return (int)(ptr - output);  // number of bytes written
-  }
-
-  /**
-   * @brief Compute how man bytes are needed to hold the output string.
-   *
-   * @param value Float value to convert.
-   * @return Number of bytes required.
-   */
-  __device__ int compute_ftos_size(double value, bool is_float)
-  {
-    if (std::isnan(value)) return 3;  // NaN
-    bool bneg = false;
-    if (signbit(value)) {  // handles -0.0 too
-      value = -value;
-      bneg  = true;
-    }
-    if (std::isinf(value)) return 8 + (int)bneg;  // Inf
-
-    // dissect float into parts
-    unsigned int integer = 0;
-    unsigned long long decimal = 0;
-    int exp10          = 0;
-    int decimal_places = dissect_value(value, integer, decimal, exp10, is_float);
-    // now count up the components
-    // sign
-    int count = (int)bneg;
-    // integer
-    count += (int)(integer == 0);
-    while (integer > 0) {
-      integer /= 10;
-      ++count;
-    }  // log10(integer)
-    // decimal
-    ++count;  // decimal point
-    if (decimal_places)
-      count += decimal_places;
-    else
-      ++count;  // always include .0
-    // exponent
-    if (exp10) {
-      count ++;  // 'e±'
-      if (exp10 < 0) {
-        count ++;
-        exp10 = -exp10;
-      }
-      while (exp10 > 0) {
-        exp10 /= 10;
-        ++count;
-      }  // log10(exp10)
+        index += olength + 1;  // digits plus the embedded '.'
+      }
     }
-    return count;
+    return index;
+  }
+
+  __device__ inline int to_chars(const floating_decimal_32 v, const bool sign, char* const result) {
+    // Step 5: Print the decimal representation of v into result; returns the number of chars written.
+    int index = 0;
+    if (sign) {
+      result[index++] = '-';
+    }
+
+    uint32_t output = v.mantissa;
+    const uint32_t olength = decimalLength9(output);
+    int32_t exp = v.exponent + olength - 1;  // NOTE(review): olength is unsigned here; the 64-bit overload casts it to int32_t first
+    bool scientificNotation = (exp < -3) || (exp >= 7);
+
+    if (scientificNotation) {
+      // Print in the format x.xxxxxE-yy.
+      for (int i = 0; i < olength - 1; i++) {
+        int c = output % 10; output /= 10;
+        result[index + olength - i] = (char) ('0' + c);  // fill right-to-left, leaving result[index + 1] free for '.'
+      }
+      result[index] = (char) ('0' + output % 10);  // leading digit
+      result[index + 1] = '.';
+      index += olength + 1;
+      if (olength == 1) {  // a single digit is printed as "d.0"
+        result[index++] = '0';
+      }
+
+      // Print 'E', the exponent sign, and the exponent, which has at most two digits.
+      result[index++] = 'E';
+      if (exp < 0) {
+        result[index++] = '-';
+        exp = -exp;
+      }
+      if (exp >= 10) {
+        result[index++] = (char) ('0' + exp / 10);
+      }
+      result[index++] = (char) ('0' + exp % 10);
+    } else {
+      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
+      if (exp < 0) {
+        // Decimal dot is before any of the digits.
+        result[index++] = '0';
+        result[index++] = '.';
+        for (int i = -1; i > exp; i--) {  // (-exp - 1) zeros between the dot and the first digit
+          result[index++] = '0';
+        }
+        int current = index;
+        for (int i = 0; i < olength; i++) {
+          result[current + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+          index++;  // one char per digit written
+        }
+      } else if (exp + 1 >= olength) {
+        // Decimal dot is after any of the digits.
+        for (int i = 0; i < olength; i++) {
+          result[index + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+        }
+        index += olength;
+        for (int i = olength; i < exp + 1; i++) {  // pad the integer part to exp + 1 digits
+          result[index++] = '0';
+        }
+        result[index++] = '.';
+        result[index++] = '0';
+      } else {
+        // Decimal dot is somewhere between the digits.
+        int current = index + 1;
+        for (int i = 0; i < olength; i++) {
+          if (olength - i - 1 == exp) {  // fraction is done: place the dot, then shift remaining digits one slot left
+            result[current + olength - i - 1] = '.';
+            current--;
+          }
+          result[current + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+        }
+        index += olength + 1;
+      }
+    }
+    return index;
+  }
+
+  __device__ inline int f2s_size(const floating_decimal_32 v, const bool sign) {
+    // Count the chars the floating_decimal_32 to_chars overload would write for v; nothing is printed here.
+    int index = 0;
+    if (sign) {
+      index++;
+    }
+
+    uint32_t output = v.mantissa;
+    const uint32_t olength = decimalLength9(output);
+    int32_t exp = v.exponent + olength - 1;  // NOTE(review): olength is unsigned here; d2s_size casts it to int32_t first
+    bool scientificNotation = (exp < -3) || (exp >= 7);
+
+    if (scientificNotation) {
+      index += olength + 1;  // all digits plus the '.'
+      if (olength == 1) {
+        index++;  // the '0' that follows the dot when there is a single digit
+      }
+      // 'E'
+      index++;
+      if (exp < 0) {
+        index++;  // '-' of the exponent
+        exp = -exp;
+      }
+      if (exp >= 10) {
+        index++;  // tens digit of the exponent
+      }
+      index++;  // units digit of the exponent
+    } else {
+      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
+      if (exp < 0) {
+        // Decimal dot is before any of the digits.
+        index += 1 - exp + olength;  // "0." + (-exp - 1) zeros + digits
+      } else if (exp + 1 >= olength) {
+        // Decimal dot is after any of the digits.
+        index += exp + 3;  // (exp + 1) integer digits + ".0"
+      } else {
+        // Decimal dot is somewhere between the digits.
+        index += olength + 1;
+      }
+    }
+    return index;
+  }
+
+  __device__ inline bool d2d_small_int(const uint64_t ieeeMantissa, const uint32_t ieeeExponent,
+    floating_decimal_64* const v) {
+    const uint64_t m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;  // significand with the implicit leading 1 bit set
+    const int32_t e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS;  // binary exponent such that f = m2 * 2^e2
+
+    if (e2 > 0) {
+      // f = m2 * 2^e2 >= 2^53 is an integer.
+      // Ignore this case for now.
+      return false;
+    }
+
+    if (e2 < -52) {
+      // f < 1.
+      return false;
+    }
+
+    // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53.
+    // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0.
+    const uint64_t mask = (1ull << -e2) - 1;
+    const uint64_t fraction = m2 & mask;
+    if (fraction != 0) {
+      return false;  // f has a fractional part, so it is not a small integer
+    }
+
+    // f is an integer in the range [1, 2^53).
+    // Note: mantissa might contain trailing (decimal) 0's.
+    // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17().
+    v->mantissa = m2 >> -e2;  // the integer value itself
+    v->exponent = 0;
+    return true;
+  }
+
+  __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) {
+    // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
+    const uint64_t bits = double_to_bits(f);
+
+    // Decode bits into sign, mantissa, and exponent.
+    ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0;
+    const uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1);
+    const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1));
+    // Case distinction; exit early for the easy cases (all-ones exponent, or exponent and mantissa both zero).
+    if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) {
+      special = true;  // result below carries the raw IEEE fields, not a decimal value
+      return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent};
+    }
+    special = false;
+    floating_decimal_64 v;
+    const bool isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v);
+    if (isSmallInt) {
+      // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros.
+      // For scientific notation we need to move these zeros into the exponent.
+      // (This is not needed for fixed-point notation, so it might be beneficial to trim
+      // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.)
+      for (;;) {
+        const uint64_t q = div10(v.mantissa);
+        const uint32_t r = ((uint32_t) v.mantissa) - 10 * ((uint32_t) q);  // remainder mod 10, assuming div10 returns v.mantissa / 10
+        if (r != 0) {
+          break;
+        }
+        v.mantissa = q;
+        ++v.exponent;
+      }
+    } else {
+      v = d2d(ieeeMantissa, ieeeExponent);  // general path: the (mantissa, exponent) overload
+    }
+    return v;
+  }
+
+  __device__ int d2s_buffered_n(double f, char* result) {
+    bool sign = false, special = false;  // both are filled in by d2d
+    floating_decimal_64 v = d2d(f, sign, special);
+    if (special) {
+      return copy_special_str(result, sign, v.exponent, v.mantissa);  // v holds the raw IEEE exponent/mantissa here
+    }
+    return to_chars(v, sign, result);  // number of chars written into result
+  }
+
+  __device__ int compute_d2s_size(double value) {
+    bool sign = false, special = false;  // both are filled in by d2d
+    floating_decimal_64 v = d2d(value, sign, special);
+    if (special) {
+      return special_str_size(sign, v.exponent, v.mantissa);  // v holds the raw IEEE exponent/mantissa here
+    }
+    return d2s_size(v, sign);  // length only; no characters are written
+  }
+
+  __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) {
+    // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
+    const uint32_t bits = float_to_bits(f);
+
+    // Decode bits into sign, mantissa, and exponent.
+    ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0;
+    const uint32_t ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1);
+    const uint32_t ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1);
+
+    // Case distinction; exit early for the easy cases (all-ones exponent, or exponent and mantissa both zero).
+    if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) {
+      special = true;  // result below carries the raw IEEE fields, not a decimal value
+      return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent};
+    }
+    special = false;
+    return f2d(ieeeMantissa, ieeeExponent);  // general path: the (mantissa, exponent) overload
+  }
+
+  __device__ int f2s_buffered_n(float f, char* result) {
+    bool sign = false, special = false;  // both are filled in by f2d
+    floating_decimal_32 v = f2d(f, sign, special);
+    if (special) {
+      return copy_special_str(result, sign, v.exponent, v.mantissa);  // v holds the raw IEEE exponent/mantissa here
+    }
+    return to_chars(v, sign, result);  // number of chars written into result
+  }
+
+  __device__ int compute_f2s_size(float value) {
+    bool sign = false, special = false;  // both are filled in by f2d
+    floating_decimal_32 v = f2d(value, sign, special);
+    if (special) {
+      return special_str_size(sign, v.exponent, v.mantissa);  // v holds the raw IEEE exponent/mantissa here
+    }
+    return f2s_size(v, sign);  // length only; no characters are written
+  }
+
+  __device__ int compute_ftos_size(double value, bool is_float) {
+    if (is_float) {  // pick the 32-bit or 64-bit sizing path
+        return compute_f2s_size(value);  // value is implicitly narrowed to float by this call
+    } else {
+        return compute_d2s_size(value);
+    }
+  }
+
+  __device__ int float_to_string(double value, char* output, bool is_float) {
+      if (is_float) {  // pick the 32-bit or 64-bit formatting path
+          return f2s_buffered_n(value, output);  // value is implicitly narrowed to float by this call
+      } else {
+          return d2s_buffered_n(value, output);
+      }
   }
 };
 
diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp
index a86d988724..d75741b8a0 100644
--- a/src/main/cpp/tests/cast_float_to_string.cpp
+++ b/src/main/cpp/tests/cast_float_to_string.cpp
@@ -45,7 +45,7 @@ TEST_F(FloatToStringTests, FromFloats32)
                               123456789012.34,
                               -0.0};
   std::vector<char const*> h_expected{
-    "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "8.3954222323279E11", "-0.0"};
+    "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "1.2345679E11", "-0.0"};
 
   cudf::test::fixed_width_column_wrapper<float> floats(
     h_floats.begin(),

From 1264317a3ca9bf820eac184c53490a31b93b6c47 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 6 Nov 2023 16:32:05 +0800
Subject: [PATCH 09/54] update license

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/cast_float_to_string.cu | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index a594377b4a..d23442f173 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -1,17 +1,16 @@
+/* Not a contribution
+ * Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as
+ * NVIDIA-proprietary are not a contribution and subject to the following terms and conditions:
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
  *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
  */
 
 #include "cast_string.hpp"

From a87a4039372b0b1e0bca596866c5cedbe1e0a845 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 7 Nov 2023 17:02:05 +0800
Subject: [PATCH 10/54] clean up

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/cast_float_to_string.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index d23442f173..ce2a8aedc6 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -1,7 +1,7 @@
 /* Not a contribution
  * Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as
  * NVIDIA-proprietary are not a contribution and subject to the following terms and conditions:
-/*
+ *
  * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
  *

From 979dc39bf289f037f85c752724714d7530bd6df2 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 13 Nov 2023 16:30:03 +0800
Subject: [PATCH 11/54] Split ftos_converter out

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/cast_float_to_string.cu | 1171 +---------------------
 src/main/cpp/src/ftos_converter.cu       | 1163 +++++++++++++++++++++
 thirdparty/cudf                          |    2 +-
 3 files changed, 1177 insertions(+), 1159 deletions(-)
 create mode 100644 src/main/cpp/src/ftos_converter.cu

diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index ce2a8aedc6..eaf0c989b9 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -1,16 +1,17 @@
-/* Not a contribution
- * Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as
- * NVIDIA-proprietary are not a contribution and subject to the following terms and conditions:
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
  *
- * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
- * property and proprietary rights in and to this material, related
- * documentation and any modifications thereto. Any use, reproduction,
- * disclosure or distribution of this material and related documentation
- * without an express license agreement from NVIDIA CORPORATION or
- * its affiliates is strictly prohibited.
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 #include "cast_string.hpp"
@@ -30,21 +31,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
-#include <thrust/generate.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/optional.h>
-#include <thrust/transform.h>
-
-#include <cuda/std/climits>
-#include <cuda/std/limits>
-#include <cuda/std/type_traits>
-#include <cuda/std/cassert>
-// #include <cuda/std/cstdbool>
-#include <cuda/std/cstdint>
-// #include <cuda/std/cstdlib>
-// #include <cuda/std/cstring>
+#include <ftos_converter.cu>
 
 using namespace cudf;
 
@@ -53,1138 +40,6 @@ namespace spark_rapids_jni {
 namespace detail {
 namespace {
 
-// A floating decimal representing m * 10^e.
-typedef struct floating_decimal_64 {
-  uint64_t mantissa;
-  // Decimal exponent's range is -324 to 308
-  // inclusive, and can fit in a short if needed.
-  int32_t exponent;
-} floating_decimal_64;
-
-// A floating decimal representing m * 10^e.
-typedef struct floating_decimal_32 {
-  uint32_t mantissa;
-  // Decimal exponent's range is -45 to 38
-  // inclusive, and can fit in a short if needed.
-  int32_t exponent;
-} floating_decimal_32;
-
-struct ftos_converter {
-
-  // These tables are generated by PrintDoubleLookupTable.
-  static constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125;
-  static constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125;
-  static constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64);
-  static constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64);
-  static constexpr unsigned int DOUBLE_MANTISSA_BITS = 52;
-  static constexpr unsigned int DOUBLE_EXPONENT_BITS = 11;
-  static constexpr unsigned int DOUBLE_BIAS = 1023;
-  static constexpr unsigned int FLOAT_MANTISSA_BITS = 23;
-  static constexpr unsigned int FLOAT_EXPONENT_BITS = 8;
-  static constexpr unsigned int FLOAT_BIAS = 127;
-
-
-  // Returns the number of decimal digits in v, which must not contain more than 9 digits.
-  __device__ inline uint32_t decimalLength9(const uint32_t v) {
-    // Function precondition: v is not a 10-digit number.
-    // (f2s: 9 digits are sufficient for round-tripping.)
-    // (d2fixed: We print 9-digit blocks.)
-    assert(v < 1000000000);
-    if (v >= 100000000) { return 9; }
-    if (v >= 10000000) { return 8; }
-    if (v >= 1000000) { return 7; }
-    if (v >= 100000) { return 6; }
-    if (v >= 10000) { return 5; }
-    if (v >= 1000) { return 4; }
-    if (v >= 100) { return 3; }
-    if (v >= 10) { return 2; }
-    return 1;
-  }
-
-  const uint64_t DOUBLE_POW5_INV_SPLIT2[15][2] = {
-    {                    1u, 2305843009213693952u },
-    {  5955668970331000884u, 1784059615882449851u },
-    {  8982663654677661702u, 1380349269358112757u },
-    {  7286864317269821294u, 2135987035920910082u },
-    {  7005857020398200553u, 1652639921975621497u },
-    { 17965325103354776697u, 1278668206209430417u },
-    {  8928596168509315048u, 1978643211784836272u },
-    { 10075671573058298858u, 1530901034580419511u },
-    {   597001226353042382u, 1184477304306571148u },
-    {  1527430471115325346u, 1832889850782397517u },
-    { 12533209867169019542u, 1418129833677084982u },
-    {  5577825024675947042u, 2194449627517475473u },
-    { 11006974540203867551u, 1697873161311732311u },
-    { 10313493231639821582u, 1313665730009899186u },
-    { 12701016819766672773u, 2032799256770390445u }
-  };
-
-  const uint32_t POW5_INV_OFFSETS[19] = {
-    0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555,
-    0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054,
-    0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554,
-    0x00000000
-  };
-
-  const uint64_t DOUBLE_POW5_SPLIT2[13][2] = {
-    {                    0u, 1152921504606846976u },
-    {                    0u, 1490116119384765625u },
-    {  1032610780636961552u, 1925929944387235853u },
-    {  7910200175544436838u, 1244603055572228341u },
-    { 16941905809032713930u, 1608611746708759036u },
-    { 13024893955298202172u, 2079081953128979843u },
-    {  6607496772837067824u, 1343575221513417750u },
-    { 17332926989895652603u, 1736530273035216783u },
-    { 13037379183483547984u, 2244412773384604712u },
-    {  1605989338741628675u, 1450417759929778918u },
-    {  9630225068416591280u, 1874621017369538693u },
-    {   665883850346957067u, 1211445438634777304u },
-    { 14931890668723713708u, 1565756531257009982u }
-  };
-
-  const uint32_t POW5_OFFSETS[21] = {
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995,
-    0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540,
-    0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151,
-    0x55559155, 0x51405555, 0x00000105
-  };
-
-  static constexpr uint32_t POW5_TABLE_SIZE = 26;
-  const uint64_t DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {
-  1ull, 5ull, 25ull, 125ull, 625ull, 3125ull, 15625ull, 78125ull, 390625ull,
-  1953125ull, 9765625ull, 48828125ull, 244140625ull, 1220703125ull, 6103515625ull,
-  30517578125ull, 152587890625ull, 762939453125ull, 3814697265625ull,
-  19073486328125ull, 95367431640625ull, 476837158203125ull,
-  2384185791015625ull, 11920928955078125ull, 59604644775390625ull,
-  298023223876953125ull //, 1490116119384765625ull
-  };
-
-  // Returns e == 0 ? 1 : [log_2(5^e)]; requires 0 <= e <= 3528.
-  __device__ inline int32_t log2pow5(const int32_t e) {
-    // This approximation works up to the point that the multiplication overflows at e = 3529.
-    // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater
-    // than 2^9297.
-    assert(e >= 0);
-    assert(e <= 3528);
-    return (int32_t) ((((uint32_t) e) * 1217359) >> 19);
-  }
-
-  // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528.
-  __device__ inline int32_t pow5bits(const int32_t e) {
-    // This approximation works up to the point that the multiplication overflows at e = 3529.
-    // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater
-    // than 2^9297.
-    assert(e >= 0);
-    assert(e <= 3528);
-    return (int32_t) (((((uint32_t) e) * 1217359) >> 19) + 1);
-  }
-
-  // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528.
-  __device__ inline int32_t ceil_log2pow5(const int32_t e) {
-    return log2pow5(e) + 1;
-  }
-
-  // Returns floor(log_10(2^e)); requires 0 <= e <= 1650.
-  __device__ inline uint32_t log10Pow2(const int32_t e) {
-    // The first value this approximation fails for is 2^1651 which is just greater than 10^297.
-    assert(e >= 0);
-    assert(e <= 1650);
-    return (((uint32_t) e) * 78913) >> 18;
-  }
-
-  // Returns floor(log_10(5^e)); requires 0 <= e <= 2620.
-  __device__ inline uint32_t log10Pow5(const int32_t e) {
-    // The first value this approximation fails for is 5^2621 which is just greater than 10^1832.
-    assert(e >= 0);
-    assert(e <= 2620);
-    return (((uint32_t) e) * 732923) >> 20;
-  }
-
-  __device__ inline uint32_t pow5factor_32(uint32_t value) {
-    uint32_t count = 0;
-    for (;;) {
-      assert(value != 0);
-      const uint32_t q = value / 5;
-      const uint32_t r = value % 5;
-      if (r != 0) {
-        break;
-      }
-      value = q;
-      ++count;
-    }
-    return count;
-  }
-
-  // Returns true if value is divisible by 5^p.
-  __device__ inline bool multipleOfPowerOf5_32(const uint32_t value, const uint32_t p) {
-    return pow5factor_32(value) >= p;
-  }
-
-  // Returns true if value is divisible by 2^p.
-  __device__ inline bool multipleOfPowerOf2_32(const uint32_t value, const uint32_t p) {
-    // __builtin_ctz doesn't appear to be faster here.
-    return (value & ((1u << p) - 1)) == 0;
-  }
-
-  // It seems to be slightly faster to avoid uint128_t here, although the
-  // generated code for uint128_t looks slightly nicer.
-  __device__ inline uint32_t mulShift32(const uint32_t m, const uint64_t factor, const int32_t shift) {
-    assert(shift > 32);
-
-    // The casts here help MSVC to avoid calls to the __allmul library
-    // function.
-    const uint32_t factorLo = (uint32_t)(factor);
-    const uint32_t factorHi = (uint32_t)(factor >> 32);
-    const uint64_t bits0 = (uint64_t)m * factorLo;
-    const uint64_t bits1 = (uint64_t)m * factorHi;
-
-    const uint64_t sum = (bits0 >> 32) + bits1;
-    const uint64_t shiftedSum = sum >> (shift - 32);
-    assert(shiftedSum <= UINT32_MAX);
-    return (uint32_t) shiftedSum;
-
-  }
-
-  __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa) {
-    if (mantissa) {
-      memcpy(result, "NaN", 3);
-      return 3;
-    }
-    if (sign) {
-      result[0] = '-';
-    }
-    if (exponent) {
-      memcpy(result + sign, "Infinity", 8);
-      return sign + 8;
-    }
-    memcpy(result + sign, "0.0", 3);
-    return sign + 3;
-  }
-
-  __device__ inline int special_str_size(const bool sign, const bool exponent, const bool mantissa) {
-    if (mantissa) {
-      return 3;
-    }
-    if (exponent) {
-      return sign + 8;
-    }
-    return sign + 3;
-  }
-
-  __device__ inline uint32_t float_to_bits(const float f) {
-    uint32_t bits = 0;
-    memcpy(&bits, &f, sizeof(float));
-    return bits;
-  }
-
-  __device__ inline uint64_t double_to_bits(const double d) {
-    uint64_t bits = 0;
-    memcpy(&bits, &d, sizeof(double));
-    return bits;
-  }
-
-  __device__ inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
-    // The casts here help MSVC to avoid calls to the __allmul library function.
-    const uint32_t aLo = (uint32_t)a;
-    const uint32_t aHi = (uint32_t)(a >> 32);
-    const uint32_t bLo = (uint32_t)b;
-    const uint32_t bHi = (uint32_t)(b >> 32);
-
-    const uint64_t b00 = (uint64_t)aLo * bLo;
-    const uint64_t b01 = (uint64_t)aLo * bHi;
-    const uint64_t b10 = (uint64_t)aHi * bLo;
-    const uint64_t b11 = (uint64_t)aHi * bHi;
-
-    const uint32_t b00Lo = (uint32_t)b00;
-    const uint32_t b00Hi = (uint32_t)(b00 >> 32);
-
-    const uint64_t mid1 = b10 + b00Hi;
-    const uint32_t mid1Lo = (uint32_t)(mid1);
-    const uint32_t mid1Hi = (uint32_t)(mid1 >> 32);
-
-    const uint64_t mid2 = b01 + mid1Lo;
-    const uint32_t mid2Lo = (uint32_t)(mid2);
-    const uint32_t mid2Hi = (uint32_t)(mid2 >> 32);
-
-    const uint64_t pHi = b11 + mid1Hi + mid2Hi;
-    const uint64_t pLo = ((uint64_t)mid2Lo << 32) | b00Lo;
-
-    *productHi = pHi;
-    return pLo;
-  }
-
-  __device__ inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
-    // We don't need to handle the case dist >= 64 here (see above).
-    assert(dist < 64);
-    assert(dist > 0);
-    return (hi << (64 - dist)) | (lo >> dist);
-  }
-
-  __device__ inline uint64_t div5(const uint64_t x) {
-    return x / 5;
-  }
-
-  __device__ inline uint64_t div10(const uint64_t x) {
-    return x / 10;
-  }
-
-  __device__ inline uint64_t div100(const uint64_t x) {
-    return x / 100;
-  }
-
-  __device__ inline uint64_t div1e8(const uint64_t x) {
-    return x / 100000000;
-  }
-
-  __device__ inline uint64_t div1e9(const uint64_t x) {
-    return x / 1000000000;
-  }
-
-  __device__ inline uint32_t mod1e9(const uint64_t x) {
-    return (uint32_t) (x - 1000000000 * div1e9(x));
-  }
-
-  __device__ inline uint32_t pow5Factor(uint64_t value) {
-    const uint64_t m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64)
-    const uint64_t n_div_5 = 3689348814741910323u;  // #{ n | n = 0 (mod 2^64) } = 2^64 / 5
-    uint32_t count = 0;
-    for (;;) {
-      assert(value != 0);
-      value *= m_inv_5;
-      if (value > n_div_5)
-        break;
-      ++count;
-    }
-    return count;
-  }
-
-  // Returns true if value is divisible by 5^p.
-  __device__ inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) {
-    // I tried a case distinction on p, but there was no performance difference.
-    return pow5Factor(value) >= p;
-  }
-
-  // Returns true if value is divisible by 2^p.
-  __device__ inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) {
-    assert(value != 0);
-    assert(p < 64);
-    // __builtin_ctzll doesn't appear to be faster here.
-    return (value & ((1ull << p) - 1)) == 0;
-  }
-
-  __device__ inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
-    // m is maximum 55 bits
-    uint64_t high1;                                   // 128
-    const uint64_t low1 = umul128(m, mul[1], &high1); // 64
-    uint64_t high0;                                   // 64
-    umul128(m, mul[0], &high0);                       // 0
-    const uint64_t sum = high0 + low1;
-    if (sum < high0) {
-      ++high1; // overflow into high1
-    }
-    return shiftright128(sum, high1, j - 64);
-  }
-
-  __device__ inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,
-    uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
-    *vp = mulShift64(4 * m + 2, mul, j);
-    *vm = mulShift64(4 * m - 1 - mmShift, mul, j);
-    return mulShift64(4 * m, mul, j);
-  }
-
-  // Computes 5^i in the form required by Ryu, and stores it in the given pointer.
-  __device__ inline void double_computePow5(const uint32_t i, uint64_t* const result) {
-    const uint32_t base = i / POW5_TABLE_SIZE;
-    const uint32_t base2 = base * POW5_TABLE_SIZE;
-    const uint32_t offset = i - base2;
-    const uint64_t* const mul = DOUBLE_POW5_SPLIT2[base];
-    if (offset == 0) {
-      result[0] = mul[0];
-      result[1] = mul[1];
-      return;
-    }
-    const uint64_t m = DOUBLE_POW5_TABLE[offset];
-    uint64_t high1;
-    const uint64_t low1 = umul128(m, mul[1], &high1);
-    uint64_t high0;
-    const uint64_t low0 = umul128(m, mul[0], &high0);
-    const uint64_t sum = high0 + low1;
-    if (sum < high0) {
-      ++high1; // overflow into high1
-    }
-    // high1 | sum | low0
-    const uint32_t delta = pow5bits(i) - pow5bits(base2);
-    result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
-    result[1] = shiftright128(sum, high1, delta);
-  }
-
-  // Computes 5^-i in the form required by Ryu, and stores it in the given pointer.
-  __device__ inline void double_computeInvPow5(const uint32_t i, uint64_t* const result) {
-    const uint32_t base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE;
-    const uint32_t base2 = base * POW5_TABLE_SIZE;
-    const uint32_t offset = base2 - i;
-    const uint64_t* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2
-    if (offset == 0) {
-      result[0] = mul[0];
-      result[1] = mul[1];
-      return;
-    }
-    const uint64_t m = DOUBLE_POW5_TABLE[offset];
-    uint64_t high1;
-    const uint64_t low1 = umul128(m, mul[1], &high1);
-    uint64_t high0;
-    const uint64_t low0 = umul128(m, mul[0] - 1, &high0);
-    const uint64_t sum = high0 + low1;
-    if (sum < high0) {
-      ++high1; // overflow into high1
-    }
-    // high1 | sum | low0
-    const uint32_t delta = pow5bits(base2) - pow5bits(i);
-    result[0] = shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
-    result[1] = shiftright128(sum, high1, delta);
-  }
-
-  __device__ inline uint32_t mulPow5InvDivPow2(const uint32_t m, const uint32_t q, const int32_t j) {
-    // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double lookup
-    // table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely on the
-    // fact that the added 1 that's already stored in the table never overflows into the upper 64 bits.
-    uint64_t pow5[2];
-    double_computeInvPow5(q, pow5);
-    return mulShift32(m, pow5[1] + 1, j);
-  }
-
-  __device__ inline uint32_t mulPow5divPow2(const uint32_t m, const uint32_t i, const int32_t j) {
-    uint64_t pow5[2];
-    double_computePow5(i, pow5);
-    return mulShift32(m, pow5[1], j);
-  }
-
-  __device__ inline uint32_t decimalLength17(const uint64_t v) {
-    // This is slightly faster than a loop.
-    // The average output length is 16.38 digits, so we check high-to-low.
-    // Function precondition: v is not an 18, 19, or 20-digit number.
-    // (17 digits are sufficient for round-tripping.)
-    assert(v < 100000000000000000L);
-    if (v >= 10000000000000000L) { return 17; }
-    if (v >= 1000000000000000L) { return 16; }
-    if (v >= 100000000000000L) { return 15; }
-    if (v >= 10000000000000L) { return 14; }
-    if (v >= 1000000000000L) { return 13; }
-    if (v >= 100000000000L) { return 12; }
-    if (v >= 10000000000L) { return 11; }
-    if (v >= 1000000000L) { return 10; }
-    if (v >= 100000000L) { return 9; }
-    if (v >= 10000000L) { return 8; }
-    if (v >= 1000000L) { return 7; }
-    if (v >= 100000L) { return 6; }
-    if (v >= 10000L) { return 5; }
-    if (v >= 1000L) { return 4; }
-    if (v >= 100L) { return 3; }
-    if (v >= 10L) { return 2; }
-    return 1;
-  }
-
-  __device__ inline floating_decimal_64 d2d(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) {
-    int32_t e2;
-    uint64_t m2;
-    if (ieeeExponent == 0) {
-      // We subtract 2 so that the bounds computation has 2 additional bits.
-      e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
-      m2 = ieeeMantissa;
-    } else {
-      e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
-      m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
-    }
-    const bool even = (m2 & 1) == 0;
-    const bool acceptBounds = even;
-
-    // Step 2: Determine the interval of valid decimal representations.
-    const uint64_t mv = 4 * m2;
-    // Implicit bool -> int conversion. True is 1, false is 0.
-    const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
-    // We would compute mp and mm like this:
-    // uint64_t mp = 4 * m2 + 2;
-    // uint64_t mm = mv - 1 - mmShift;
-
-    // Step 3: Convert to a decimal power base using 128-bit arithmetic.
-    uint64_t vr, vp, vm;
-    int32_t e10;
-    bool vmIsTrailingZeros = false;
-    bool vrIsTrailingZeros = false;
-    if (e2 >= 0) {
-      // I tried special-casing q == 0, but there was no effect on performance.
-      // This expression is slightly faster than max(0, log10Pow2(e2) - 1).
-      const uint32_t q = log10Pow2(e2) - (e2 > 3);
-      e10 = (int32_t) q;
-      const int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
-      const int32_t i = -e2 + (int32_t) q + k;
-      uint64_t pow5[2];
-      double_computeInvPow5(q, pow5);
-      vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift);
-
-      if (q <= 21) {
-        // This should use q <= 22, but I think 21 is also safe. Smaller values
-        // may still be safe, but it's more difficult to reason about them.
-        // Only one of mp, mv, and mm can be a multiple of 5, if any.
-        const uint32_t mvMod5 = ((uint32_t) mv) - 5 * ((uint32_t) div5(mv));
-        if (mvMod5 == 0) {
-          vrIsTrailingZeros = multipleOfPowerOf5(mv, q);
-        } else if (acceptBounds) {
-          // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q
-          // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q
-          // <=> true && pow5Factor(mm) >= q, since e2 >= q.
-          vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q);
-        } else {
-          // Same as min(e2 + 1, pow5Factor(mp)) >= q.
-          vp -= multipleOfPowerOf5(mv + 2, q);
-        }
-      }
-    } else {
-      // This expression is slightly faster than max(0, log10Pow5(-e2) - 1).
-      const uint32_t q = log10Pow5(-e2) - (-e2 > 1);
-      e10 = (int32_t) q + e2;
-      const int32_t i = -e2 - (int32_t) q;
-      const int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT;
-      const int32_t j = (int32_t) q - k;
-
-      uint64_t pow5[2];
-      double_computePow5(i, pow5);
-      vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift);
-
-      if (q <= 1) {
-        // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
-        // mv = 4 * m2, so it always has at least two trailing 0 bits.
-        vrIsTrailingZeros = true;
-        if (acceptBounds) {
-          // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
-          vmIsTrailingZeros = mmShift == 1;
-        } else {
-          // mp = mv + 2, so it always has at least one trailing 0 bit.
-          --vp;
-        }
-      } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here.
-        // We want to know if the full product has at least q trailing zeros.
-        // We need to compute min(p2(mv), p5(mv) - e2) >= q
-        // <=> p2(mv) >= q && p5(mv) - e2 >= q
-        // <=> p2(mv) >= q (because -e2 >= q)
-        vrIsTrailingZeros = multipleOfPowerOf2(mv, q);
-      }
-    }
-
-    // Step 4: Find the shortest decimal representation in the interval of valid representations.
-    int32_t removed = 0;
-    uint8_t lastRemovedDigit = 0;
-    uint64_t output;
-    // On average, we remove ~2 digits.
-    if (vmIsTrailingZeros || vrIsTrailingZeros) {
-      // General case, which happens rarely (~0.7%).
-      for (;;) {
-        const uint64_t vpDiv10 = div10(vp);
-        const uint64_t vmDiv10 = div10(vm);
-        if (vpDiv10 <= vmDiv10) {
-          break;
-        }
-        const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10);
-        const uint64_t vrDiv10 = div10(vr);
-        const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
-        vmIsTrailingZeros &= vmMod10 == 0;
-        vrIsTrailingZeros &= lastRemovedDigit == 0;
-        lastRemovedDigit = (uint8_t) vrMod10;
-        vr = vrDiv10;
-        vp = vpDiv10;
-        vm = vmDiv10;
-        ++removed;
-      }
-
-      if (vmIsTrailingZeros) {
-        for (;;) {
-          const uint64_t vmDiv10 = div10(vm);
-          const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10);
-          if (vmMod10 != 0) {
-            break;
-          }
-          const uint64_t vpDiv10 = div10(vp);
-          const uint64_t vrDiv10 = div10(vr);
-          const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
-          vrIsTrailingZeros &= lastRemovedDigit == 0;
-          lastRemovedDigit = (uint8_t) vrMod10;
-          vr = vrDiv10;
-          vp = vpDiv10;
-          vm = vmDiv10;
-          ++removed;
-        }
-      }
-
-      if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
-        // Round even if the exact number is .....50..0.
-        lastRemovedDigit = 4;
-      }
-      // We need to take vr + 1 if vr is outside bounds or we need to round up.
-      output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
-    } else {
-      // Specialized for the common case (~99.3%). Percentages below are relative to this.
-      bool roundUp = false;
-      const uint64_t vpDiv100 = div100(vp);
-      const uint64_t vmDiv100 = div100(vm);
-      if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%).
-        const uint64_t vrDiv100 = div100(vr);
-        const uint32_t vrMod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrDiv100);
-        roundUp = vrMod100 >= 50;
-        vr = vrDiv100;
-        vp = vpDiv100;
-        vm = vmDiv100;
-        removed += 2;
-      }
-      // Loop iterations below (approximately), without optimization above:
-      // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02%
-      // Loop iterations below (approximately), with optimization above:
-      // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02%
-      for (;;) {
-        const uint64_t vpDiv10 = div10(vp);
-        const uint64_t vmDiv10 = div10(vm);
-        if (vpDiv10 <= vmDiv10) {
-          break;
-        }
-        const uint64_t vrDiv10 = div10(vr);
-        const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
-        roundUp = vrMod10 >= 5;
-        vr = vrDiv10;
-        vp = vpDiv10;
-        vm = vmDiv10;
-        ++removed;
-      }
-
-      // We need to take vr + 1 if vr is outside bounds or we need to round up.
-      output = vr + (vr == vm || roundUp);
-    }
-    const int32_t exp = e10 + removed;
-
-    floating_decimal_64 fd;
-    fd.exponent = exp;
-    fd.mantissa = output;
-    return fd;
-  }
-
-  __device__ inline floating_decimal_32 f2d(const uint32_t ieeeMantissa, const uint32_t ieeeExponent) {
-    int32_t e2;
-    uint32_t m2;
-    if (ieeeExponent == 0) {
-      // We subtract 2 so that the bounds computation has 2 additional bits.
-      e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
-      m2 = ieeeMantissa;
-    } else {
-      e2 = (int32_t) ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
-      m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa;
-    }
-    const bool even = (m2 & 1) == 0;
-    const bool acceptBounds = even;
-
-    // Step 2: Determine the interval of valid decimal representations.
-    const uint32_t mv = 4 * m2;
-    const uint32_t mp = 4 * m2 + 2;
-    // Implicit bool -> int conversion. True is 1, false is 0.
-    const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
-    const uint32_t mm = 4 * m2 - 1 - mmShift;
-
-    // Step 3: Convert to a decimal power base using 64-bit arithmetic.
-    uint32_t vr, vp, vm;
-    int32_t e10;
-    bool vmIsTrailingZeros = false;
-    bool vrIsTrailingZeros = false;
-    uint8_t lastRemovedDigit = 0;
-    if (e2 >= 0) {
-      const uint32_t q = log10Pow2(e2);
-      e10 = (int32_t) q;
-      const int32_t k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
-      const int32_t i = -e2 + (int32_t) q + k;
-      vr = mulPow5InvDivPow2(mv, q, i);
-      vp = mulPow5InvDivPow2(mp, q, i);
-      vm = mulPow5InvDivPow2(mm, q, i);
-      if (q != 0 && (vp - 1) / 10 <= vm / 10) {
-        // We need to know one removed digit even if we are not going to loop below. We could use
-        // q = X - 1 above, except that would require 33 bits for the result, and we've found that
-        // 32-bit arithmetic is faster even on 64-bit machines.
-        const int32_t l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) (q - 1)) - 1;
-        lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t) q - 1 + l) % 10);
-      }
-      if (q <= 9) {
-        // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well.
-        // Only one of mp, mv, and mm can be a multiple of 5, if any.
-        if (mv % 5 == 0) {
-          vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q);
-        } else if (acceptBounds) {
-          vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q);
-        } else {
-          vp -= multipleOfPowerOf5_32(mp, q);
-        }
-      }
-    } else {
-      const uint32_t q = log10Pow5(-e2);
-      e10 = (int32_t) q + e2;
-      const int32_t i = -e2 - (int32_t) q;
-      const int32_t k = pow5bits(i) - FLOAT_POW5_BITCOUNT;
-      int32_t j = (int32_t) q - k;
-      vr = mulPow5divPow2(mv, (uint32_t) i, j);
-      vp = mulPow5divPow2(mp, (uint32_t) i, j);
-      vm = mulPow5divPow2(mm, (uint32_t) i, j);
-      if (q != 0 && (vp - 1) / 10 <= vm / 10) {
-        j = (int32_t) q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT);
-        lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, (uint32_t) (i + 1), j) % 10);
-      }
-      if (q <= 1) {
-        // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
-        // mv = 4 * m2, so it always has at least two trailing 0 bits.
-        vrIsTrailingZeros = true;
-        if (acceptBounds) {
-          // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
-          vmIsTrailingZeros = mmShift == 1;
-        } else {
-          // mp = mv + 2, so it always has at least one trailing 0 bit.
-          --vp;
-        }
-      } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here.
-        vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1);
-      }
-    }
-
-    // Step 4: Find the shortest decimal representation in the interval of valid representations.
-    int32_t removed = 0;
-    uint32_t output;
-    if (vmIsTrailingZeros || vrIsTrailingZeros) {
-      // General case, which happens rarely (~4.0%).
-      while (vp / 10 > vm / 10) {
-        vmIsTrailingZeros &= vm % 10 == 0;
-        vrIsTrailingZeros &= lastRemovedDigit == 0;
-        lastRemovedDigit = (uint8_t) (vr % 10);
-        vr /= 10;
-        vp /= 10;
-        vm /= 10;
-        ++removed;
-      }
-      if (vmIsTrailingZeros) {
-        while (vm % 10 == 0) {
-          vrIsTrailingZeros &= lastRemovedDigit == 0;
-          lastRemovedDigit = (uint8_t) (vr % 10);
-          vr /= 10;
-          vp /= 10;
-          vm /= 10;
-          ++removed;
-        }
-      }
-      if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
-        // Round even if the exact number is .....50..0.
-        lastRemovedDigit = 4;
-      }
-      // We need to take vr + 1 if vr is outside bounds or we need to round up.
-      output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
-    } else {
-      // Specialized for the common case (~96.0%). Percentages below are relative to this.
-      // Loop iterations below (approximately):
-      // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01%
-      while (vp / 10 > vm / 10) {
-        lastRemovedDigit = (uint8_t) (vr % 10);
-        vr /= 10;
-        vp /= 10;
-        vm /= 10;
-        ++removed;
-      }
-      // We need to take vr + 1 if vr is outside bounds or we need to round up.
-      output = vr + (vr == vm || lastRemovedDigit >= 5);
-    }
-    const int32_t exp = e10 + removed;
-
-    floating_decimal_32 fd;
-    fd.exponent = exp;
-    fd.mantissa = output;
-    return fd;
-  }
-
-  __device__ inline int to_chars(const floating_decimal_64 v, const bool sign, char* const result) {
-    // Step 5: Print the decimal representation.
-    int index = 0;
-    if (sign) {
-      result[index++] = '-';
-    }
-
-    uint64_t output = v.mantissa;
-    const uint32_t olength = decimalLength17(output);
-    int32_t exp = v.exponent + (int32_t) olength - 1;
-    bool scientificNotation = (exp < -3) || (exp >= 7);
-    
-    // Values in the interval [1E-3, 1E7) are special.
-    if (scientificNotation) {
-      // Print in the format x.xxxxxE-yy.
-      for (uint32_t i = 0; i < olength - 1; ++i) {
-        const uint32_t c = output % 10; output /= 10;
-        result[index + olength - i] = (char) ('0' + c);
-      }
-      result[index] = '0' + output % 10;
-      result[index + 1] = '.';
-      index += olength + 1;
-      if (olength == 1) {
-        result[index++] = '0';
-      }
-      // Print 'E', the exponent sign, and the exponent, which has at most three digits.
-      result[index++] = 'E';
-      if (exp < 0) {
-        result[index++] = '-';
-        exp = -exp;
-      }
-      if (exp >= 100) {
-          result[index++] = (char) ('0' + exp / 100);
-          exp %= 100;
-          result[index++] = (char) ('0' + exp / 10);
-        } else if (exp >= 10) {
-          result[index++] = (char) ('0' + exp / 10);
-        }
-        result[index++] = (char) ('0' + exp % 10);
-    } else {
-      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
-      if (exp < 0) {
-        // Decimal dot is before any of the digits.
-        result[index++] = '0';
-        result[index++] = '.';
-        for (int i = -1; i > exp; i--) {
-          result[index++] = '0';
-        }
-        int current = index;
-        for (int i = 0; i < olength; i++) {
-          result[current + olength - i - 1] = (char) ('0' + output % 10);
-          output /= 10;
-          index++;
-        }
-      } else if (exp + 1 >= olength) {
-        // Decimal dot is after any of the digits.
-        for (int i = 0; i < olength; i++) {
-          result[index + olength - i - 1] = (char) ('0' + output % 10);
-          output /= 10;
-        }
-        index += olength;
-        for (int i = olength; i < exp + 1; i++) {
-          result[index++] = '0';
-        }
-        result[index++] = '.';
-        result[index++] = '0';
-      } else {
-        // Decimal dot is somewhere between the digits.
-        int current = index + 1;
-        for (int i = 0; i < olength; i++) {
-          if (olength - i - 1 == exp) {
-            result[current + olength - i - 1] = '.';
-            current--;
-          }
-          result[current + olength - i - 1] = (char) ('0' + output % 10);
-          output /= 10;
-        }
-        index += olength + 1;
-      }
-    }
-    return index;
-  }
-
-  __device__ inline int d2s_size(const floating_decimal_64 v, const bool sign) {
-    int index = 0;
-    if (sign) {
-      index++;
-    }
-
-    uint64_t output = v.mantissa;
-    const uint32_t olength = decimalLength17(output);
-    int32_t exp = v.exponent + (int32_t) olength - 1;
-    bool scientificNotation = (exp < -3) || (exp >= 7);
-    
-    if (scientificNotation) {
-      index += olength + 1;
-      if (olength == 1) {
-        index++;
-      }
-      // 'E'
-      index++;
-      if (exp < 0) {
-        exp = -exp;
-        index++;
-      }
-      if (exp >= 100) {
-        index += 3;
-      } else if (exp >= 10) {
-        index += 2;
-      } else {
-        index++;
-      }
-    } else {
-      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
-      if (exp < 0) {
-        index += 1 - exp + olength;
-      } else if (exp + 1 >= olength) {
-        index += exp + 3;
-      } else {
-        index += olength + 1;
-      }
-    }
-    return index;
-  }
-
-  __device__ inline int to_chars(const floating_decimal_32 v, const bool sign, char* const result) {
-    // Step 5: Print the decimal representation.
-    int index = 0;
-    if (sign) {
-      result[index++] = '-';
-    }
-
-    uint32_t output = v.mantissa;
-    const uint32_t olength = decimalLength9(output);
-    int32_t exp = v.exponent + olength - 1;
-    bool scientificNotation = (exp < -3) || (exp >= 7);
-
-    if (scientificNotation) {
-      // Print in the format x.xxxxxE-yy.
-      for (int i = 0; i < olength - 1; i++) {
-        int c = output % 10; output /= 10;
-        result[index + olength - i] = (char) ('0' + c);
-      }
-      result[index] = (char) ('0' + output % 10);
-      result[index + 1] = '.';
-      index += olength + 1;
-      if (olength == 1) {
-        result[index++] = '0';
-      }
-
-      // Print 'E', the exponent sign, and the exponent, which has at most two digits.
-      result[index++] = 'E';
-      if (exp < 0) {
-        result[index++] = '-';
-        exp = -exp;
-      }
-      if (exp >= 10) {
-        result[index++] = (char) ('0' + exp / 10);
-      }
-      result[index++] = (char) ('0' + exp % 10);
-    } else {
-      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
-      if (exp < 0) {
-        // Decimal dot is before any of the digits.
-        result[index++] = '0';
-        result[index++] = '.';
-        for (int i = -1; i > exp; i--) {
-          result[index++] = '0';
-        }
-        int current = index;
-        for (int i = 0; i < olength; i++) {
-          result[current + olength - i - 1] = (char) ('0' + output % 10);
-          output /= 10;
-          index++;
-        }
-      } else if (exp + 1 >= olength) {
-        // Decimal dot is after any of the digits.
-        for (int i = 0; i < olength; i++) {
-          result[index + olength - i - 1] = (char) ('0' + output % 10);
-          output /= 10;
-        }
-        index += olength;
-        for (int i = olength; i < exp + 1; i++) {
-          result[index++] = '0';
-        }
-        result[index++] = '.';
-        result[index++] = '0';
-      } else {
-        // Decimal dot is somewhere between the digits.
-        int current = index + 1;
-        for (int i = 0; i < olength; i++) {
-          if (olength - i - 1 == exp) {
-            result[current + olength - i - 1] = '.';
-            current--;
-          }
-          result[current + olength - i - 1] = (char) ('0' + output % 10);
-          output /= 10;
-        }
-        index += olength + 1;
-      }
-    }
-    return index;
-  }
-
-  __device__ inline int f2s_size(const floating_decimal_32 v, const bool sign) {
-    // Step 5: Print the decimal representation.
-    int index = 0;
-    if (sign) {
-      index++;
-    }
-
-    uint32_t output = v.mantissa;
-    const uint32_t olength = decimalLength9(output);
-    int32_t exp = v.exponent + olength - 1;
-    bool scientificNotation = (exp < -3) || (exp >= 7);
-
-    if (scientificNotation) {
-      index += olength + 1;
-      if (olength == 1) {
-        index++;
-      }
-      // 'E'
-      index++;
-      if (exp < 0) {
-        index++;
-        exp = -exp;
-      }
-      if (exp >= 10) {
-        index++;
-      }
-      index++;
-    } else {
-      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
-      if (exp < 0) {
-        // Decimal dot is before any of the digits.
-        index += 1 - exp + olength;
-      } else if (exp + 1 >= olength) {
-        // Decimal dot is after any of the digits.
-        index += exp + 3;
-      } else {
-        // Decimal dot is somewhere between the digits.
-        index += olength + 1;
-      }
-    }
-    return index;
-  }
-
-  __device__ inline bool d2d_small_int(const uint64_t ieeeMantissa, const uint32_t ieeeExponent,
-    floating_decimal_64* const v) {
-    const uint64_t m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
-    const int32_t e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS;
-
-    if (e2 > 0) {
-      // f = m2 * 2^e2 >= 2^53 is an integer.
-      // Ignore this case for now.
-      return false;
-    }
-
-    if (e2 < -52) {
-      // f < 1.
-      return false;
-    }
-
-    // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53.
-    // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0.
-    const uint64_t mask = (1ull << -e2) - 1;
-    const uint64_t fraction = m2 & mask;
-    if (fraction != 0) {
-      return false;
-    }
-
-    // f is an integer in the range [1, 2^53).
-    // Note: mantissa might contain trailing (decimal) 0's.
-    // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17().
-    v->mantissa = m2 >> -e2;
-    v->exponent = 0;
-    return true;
-  }
-
-  __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) {
-    // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
-    const uint64_t bits = double_to_bits(f);
-
-    // Decode bits into sign, mantissa, and exponent.
-    ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0;
-    const uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1);
-    const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1));
-    // Case distinction; exit early for the easy cases.
-    if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) {
-      special = true;
-      return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent};
-    }
-    special = false;
-    floating_decimal_64 v;
-    const bool isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v);
-    if (isSmallInt) {
-      // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros.
-      // For scientific notation we need to move these zeros into the exponent.
-      // (This is not needed for fixed-point notation, so it might be beneficial to trim
-      // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.)
-      for (;;) {
-        const uint64_t q = div10(v.mantissa);
-        const uint32_t r = ((uint32_t) v.mantissa) - 10 * ((uint32_t) q);
-        if (r != 0) {
-          break;
-        }
-        v.mantissa = q;
-        ++v.exponent;
-      }
-    } else {
-      v = d2d(ieeeMantissa, ieeeExponent);
-    }
-    return v;
-  }
-
-  __device__ int d2s_buffered_n(double f, char* result) {
-    bool sign = false, special = false;
-    floating_decimal_64 v = d2d(f, sign, special);
-    if (special) {
-      return copy_special_str(result, sign, v.exponent, v.mantissa);
-    }
-    return to_chars(v, sign, result);
-  }
-
-  __device__ int compute_d2s_size(double value) {
-    bool sign = false, special = false;
-    floating_decimal_64 v = d2d(value, sign, special);
-    if (special) {
-      return special_str_size(sign, v.exponent, v.mantissa);
-    }
-    return d2s_size(v, sign);
-  }
-
-  __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) {
-    // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
-    const uint32_t bits = float_to_bits(f);
-
-    // Decode bits into sign, mantissa, and exponent.
-    ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0;
-    const uint32_t ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1);
-    const uint32_t ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1);
-
-    // Case distinction; exit early for the easy cases.
-    if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) {
-      special = true;
-      return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent};
-    }
-    special = false;
-    return f2d(ieeeMantissa, ieeeExponent);
-  }
-
-  __device__ int f2s_buffered_n(float f, char* result) {
-    bool sign = false, special = false;
-    floating_decimal_32 v = f2d(f, sign, special);
-    if (special) {
-      return copy_special_str(result, sign, v.exponent, v.mantissa);
-    }
-    return to_chars(v, sign, result);
-  }
-
-  __device__ int compute_f2s_size(float value) {
-    bool sign = false, special = false;
-    floating_decimal_32 v = f2d(value, sign, special);
-    if (special) {
-      return special_str_size(sign, v.exponent, v.mantissa);
-    }
-    return f2s_size(v, sign);
-  }
-
-  __device__ int compute_ftos_size(double value, bool is_float) {
-    if (is_float) {
-        return compute_f2s_size(value);
-    } else {
-        return compute_d2s_size(value);
-    }
-  }
-
-  __device__ int float_to_string(double value, char* output, bool is_float) {
-      if (is_float) {
-          return f2s_buffered_n(value, output);
-      } else {
-          return d2s_buffered_n(value, output);
-      }
-  }
-};
-
 template <typename FloatType>
 struct float_to_string_fn {
   column_device_view d_floats;
diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu
new file mode 100644
index 0000000000..7d5cf0716f
--- /dev/null
+++ b/src/main/cpp/src/ftos_converter.cu
@@ -0,0 +1,1163 @@
+/* Not a contribution
+ * Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as
+ * NVIDIA-proprietary are not a contribution and subject to the following terms and conditions:
+ *
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
+
+#include <cuda/std/climits>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+#include <cuda/std/cassert>
+#include <cuda/std/cstdint>
+
+using namespace cudf;
+
+namespace spark_rapids_jni {
+
+namespace detail {
+namespace {
+
+// Decimal floating-point value: mantissa * 10^exponent, with a 64-bit mantissa.
+struct floating_decimal_64 {
+  // Decimal significand m.
+  uint64_t mantissa;
+  // Decimal exponent e; the stated range is -324 to 308 inclusive, so a short would also do.
+  int32_t exponent;
+};
+
+// Decimal floating-point value: mantissa * 10^exponent, with a 32-bit mantissa.
+struct floating_decimal_32 {
+  // Decimal significand m.
+  uint32_t mantissa;
+  // Decimal exponent e; the stated range is -45 to 38 inclusive, so a short would also do.
+  int32_t exponent;
+};
+
+struct ftos_converter {
+
+  // These tables are generated by PrintDoubleLookupTable.
+  static constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125;
+  static constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125;
+  static constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64);
+  static constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64);
+  static constexpr unsigned int DOUBLE_MANTISSA_BITS = 52;
+  static constexpr unsigned int DOUBLE_EXPONENT_BITS = 11;
+  static constexpr unsigned int DOUBLE_BIAS = 1023;
+  static constexpr unsigned int FLOAT_MANTISSA_BITS = 23;
+  static constexpr unsigned int FLOAT_EXPONENT_BITS = 8;
+  static constexpr unsigned int FLOAT_BIAS = 127;
+
+
+  // Returns the number of decimal digits in v, which must not contain more than 9 digits.
+  __device__ inline uint32_t decimalLength9(const uint32_t v) {
+    // Precondition: v < 10^9. The original notes why callers satisfy it: f2s needs at most
+    // 9 digits for round-tripping, and d2fixed prints 9-digit blocks.
+    assert(v < 1000000000);
+    // Compare against ascending powers of ten; the first bound v stays below fixes the count.
+    if (v < 10) { return 1; }
+    if (v < 100) { return 2; }
+    if (v < 1000) { return 3; }
+    if (v < 10000) { return 4; }
+    if (v < 100000) { return 5; }
+    if (v < 1000000) { return 6; }
+    if (v < 10000000) { return 7; }
+    if (v < 100000000) { return 8; }
+    return 9;
+  }
+
+  const uint64_t DOUBLE_POW5_INV_SPLIT2[15][2] = {
+    {                    1u, 2305843009213693952u },
+    {  5955668970331000884u, 1784059615882449851u },
+    {  8982663654677661702u, 1380349269358112757u },
+    {  7286864317269821294u, 2135987035920910082u },
+    {  7005857020398200553u, 1652639921975621497u },
+    { 17965325103354776697u, 1278668206209430417u },
+    {  8928596168509315048u, 1978643211784836272u },
+    { 10075671573058298858u, 1530901034580419511u },
+    {   597001226353042382u, 1184477304306571148u },
+    {  1527430471115325346u, 1832889850782397517u },
+    { 12533209867169019542u, 1418129833677084982u },
+    {  5577825024675947042u, 2194449627517475473u },
+    { 11006974540203867551u, 1697873161311732311u },
+    { 10313493231639821582u, 1313665730009899186u },
+    { 12701016819766672773u, 2032799256770390445u }
+  };
+
+  const uint32_t POW5_INV_OFFSETS[19] = {
+    0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555,
+    0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054,
+    0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554,
+    0x00000000
+  };
+
+  const uint64_t DOUBLE_POW5_SPLIT2[13][2] = {
+    {                    0u, 1152921504606846976u },
+    {                    0u, 1490116119384765625u },
+    {  1032610780636961552u, 1925929944387235853u },
+    {  7910200175544436838u, 1244603055572228341u },
+    { 16941905809032713930u, 1608611746708759036u },
+    { 13024893955298202172u, 2079081953128979843u },
+    {  6607496772837067824u, 1343575221513417750u },
+    { 17332926989895652603u, 1736530273035216783u },
+    { 13037379183483547984u, 2244412773384604712u },
+    {  1605989338741628675u, 1450417759929778918u },
+    {  9630225068416591280u, 1874621017369538693u },
+    {   665883850346957067u, 1211445438634777304u },
+    { 14931890668723713708u, 1565756531257009982u }
+  };
+
+  const uint32_t POW5_OFFSETS[21] = {
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995,
+    0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540,
+    0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151,
+    0x55559155, 0x51405555, 0x00000105
+  };
+
+  static constexpr uint32_t POW5_TABLE_SIZE = 26;
+  const uint64_t DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {
+  1ull, 5ull, 25ull, 125ull, 625ull, 3125ull, 15625ull, 78125ull, 390625ull,
+  1953125ull, 9765625ull, 48828125ull, 244140625ull, 1220703125ull, 6103515625ull,
+  30517578125ull, 152587890625ull, 762939453125ull, 3814697265625ull,
+  19073486328125ull, 95367431640625ull, 476837158203125ull,
+  2384185791015625ull, 11920928955078125ull, 59604644775390625ull,
+  298023223876953125ull //, 1490116119384765625ull
+  };
+
+  // Returns floor(log_2(5^e)), which is 0 for e == 0; requires 0 <= e <= 3528.
+  __device__ inline int32_t log2pow5(const int32_t e) {
+    // This approximation works up to the point that the multiplication overflows at e = 3529.
+    // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater
+    // than 2^9297.
+    assert(e >= 0);
+    assert(e <= 3528);
+    return (int32_t) ((((uint32_t) e) * 1217359) >> 19);
+  }
+
+  // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528.
+  __device__ inline int32_t pow5bits(const int32_t e) {
+    // 1217359 / 2^19 approximates log_2(5). The 32-bit product overflows from e = 3529 on; per
+    // the original note a 64-bit product would first fail at 5^4004 (just greater than 2^9297).
+    assert(e >= 0);
+    assert(e <= 3528);
+    const uint32_t scaled = static_cast<uint32_t>(e) * 1217359u;
+    return static_cast<int32_t>((scaled >> 19) + 1);
+  }
+
+  // One more than log2pow5(e): 1 for e == 0, otherwise ceil(log_2(5^e)); requires 0 <= e <= 3528.
+  __device__ inline int32_t ceil_log2pow5(const int32_t e) {
+    return 1 + log2pow5(e);
+  }
+
+  // Returns floor(log_10(2^e)); requires 0 <= e <= 1650.
+  __device__ inline uint32_t log10Pow2(const int32_t e) {
+    // 78913 / 2^18 approximates log_10(2); per the original note the estimate first fails at
+    // 2^1651, which is just greater than 10^297.
+    assert(e >= 0);
+    assert(e <= 1650);
+    return (static_cast<uint32_t>(e) * 78913u) >> 18;
+  }
+
+  // Returns floor(log_10(5^e)); requires 0 <= e <= 2620.
+  __device__ inline uint32_t log10Pow5(const int32_t e) {
+    // 732923 / 2^20 approximates log_10(5); per the original note the estimate first fails at
+    // 5^2621, which is just greater than 10^1832.
+    assert(e >= 0);
+    assert(e <= 2620);
+    return (static_cast<uint32_t>(e) * 732923u) >> 20;
+  }
+
+  // Counts how many times 5 divides value, i.e. the largest k for which 5^k divides value.
+  // value must be non-zero (asserted on every iteration).
+  __device__ inline uint32_t pow5factor_32(uint32_t value) {
+    uint32_t exponent = 0;
+    while (true) {
+      assert(value != 0);
+      if (value % 5 != 0) {
+        // A non-zero remainder means no further factor of 5 is left.
+        return exponent;
+      }
+      value /= 5;
+      ++exponent;
+    }
+  }
+
+  // Returns true if value is divisible by 5^p, i.e. value contains at least p factors of 5.
+  __device__ inline bool multipleOfPowerOf5_32(const uint32_t value, const uint32_t p) {
+    return p <= pow5factor_32(value);
+  }
+
+  // Returns true if value is divisible by 2^p, i.e. its p lowest bits are all zero.
+  __device__ inline bool multipleOfPowerOf2_32(const uint32_t value, const uint32_t p) {
+    const uint32_t low_bits_mask = (1u << p) - 1;  // original note: __builtin_ctz was not faster
+    return (value & low_bits_mask) == 0;
+  }
+
+  // Returns (m * factor) >> shift, narrowed to 32 bits (asserted to fit); requires shift > 32.
+  // Original note: avoiding uint128_t here seemed slightly faster than using it.
+  __device__ inline uint32_t mulShift32(const uint32_t m, const uint64_t factor, const int32_t shift) {
+    assert(shift > 32);
+
+    // Split the factor into 32-bit halves (the original notes that these casts keep MSVC from
+    // calling the __allmul library function).
+    const uint32_t lo_half = static_cast<uint32_t>(factor);
+    const uint32_t hi_half = static_cast<uint32_t>(factor >> 32);
+    const uint64_t lo_product = static_cast<uint64_t>(m) * lo_half;
+    const uint64_t hi_product = static_cast<uint64_t>(m) * hi_half;
+
+    // The low 32 bits of lo_product lie below the shift (shift > 32), so they are dropped
+    // first; the remaining shift - 32 bits are removed from the combined sum.
+    const uint64_t shiftedSum = ((lo_product >> 32) + hi_product) >> (shift - 32);
+    assert(shiftedSum <= UINT32_MAX);
+    return static_cast<uint32_t>(shiftedSum);
+  }
+
+  // Writes the text for NaN (mantissa set), Infinity (exponent set) or 0.0 into result, with a
+  // leading '-' for the last two when sign is set; no NUL is written. Returns the length.
+  __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa) {
+    if (mantissa) {
+      // NaN is written without a sign.
+      memcpy(result, "NaN", 3);
+      return 3;
+    }
+    const int prefix = sign ? 1 : 0;
+    if (sign) { result[0] = '-'; }
+    const char* const text = exponent ? "Infinity" : "0.0";
+    const int text_len = exponent ? 8 : 3;
+    memcpy(result + prefix, text, text_len);
+    return prefix + text_len;
+  }
+
+  // Length of the special-value text for the given flags: 3 when mantissa is set, otherwise
+  // 8 (exponent set) or 3, plus 1 when sign is set.
+  __device__ inline int special_str_size(const bool sign, const bool exponent, const bool mantissa) {
+    if (mantissa) {
+      return 3;
+    }
+    const int body = exponent ? 8 : 3;
+    return sign + body;
+  }
+
+  __device__ inline uint32_t float_to_bits(const float f) {
+    uint32_t raw = 0;  // receives a byte-wise copy of f
+    memcpy(&raw, &f, sizeof(float));
+    return raw;
+  }
+
+  __device__ inline uint64_t double_to_bits(const double d) {
+    uint64_t raw = 0;  // receives a byte-wise copy of d
+    memcpy(&raw, &d, sizeof(double));
+    return raw;
+  }
+
+  __device__ inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
+    // Full 64 x 64 -> 128-bit multiplication: returns the low 64 bits of a * b and stores the
+    // high 64 bits in *productHi. The original notes that working on 32-bit halves keeps MSVC
+    // from calling the __allmul library function.
+    const uint64_t mask32 = 0xFFFFFFFFull;
+    const uint32_t a_lo = static_cast<uint32_t>(a & mask32);
+    const uint32_t a_hi = static_cast<uint32_t>(a >> 32);
+    const uint32_t b_lo = static_cast<uint32_t>(b & mask32);
+    const uint32_t b_hi = static_cast<uint32_t>(b >> 32);
+
+    // The four 32 x 32 -> 64-bit partial products.
+    const uint64_t lo_lo = static_cast<uint64_t>(a_lo) * b_lo;
+    const uint64_t lo_hi = static_cast<uint64_t>(a_lo) * b_hi;
+    const uint64_t hi_lo = static_cast<uint64_t>(a_hi) * b_lo;
+    const uint64_t hi_hi = static_cast<uint64_t>(a_hi) * b_hi;
+
+    // Fold the two middle products in one at a time: each sum adds a value below 2^32 to a
+    // 32 x 32-bit product, so it cannot overflow 64 bits. The upper halves are carries.
+    const uint64_t first_mid = hi_lo + (lo_lo >> 32);
+    const uint64_t second_mid = lo_hi + (first_mid & mask32);
+
+    // High word: top product plus both carries. Low word: the low half of the second middle
+    // sum placed above the low half of the bottom product.
+    const uint64_t high_word = hi_hi + (first_mid >> 32) + (second_mid >> 32);
+    const uint64_t low_word = (second_mid << 32) | (lo_lo & mask32);
+
+    *productHi = high_word;
+    return low_word;
+  }
+
+  __device__ inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
+    // Returns the low 64 bits of the 128-bit value hi:lo shifted right by dist; only 0 < dist < 64 is handled.
+    assert(dist < 64);
+    assert(dist > 0);
+    return (hi << (64 - dist)) | (lo >> dist);
+  }
+
+  // Constant-divisor helpers. Each one returns the truncated quotient of an unsigned
+  // 64-bit division by the constant in its name. This one divides by 5.
+  __device__ inline uint64_t div5(const uint64_t x) { return x / 5u; }
+
+  // Divides by 10.
+  __device__ inline uint64_t div10(const uint64_t x) { return x / 10u; }
+
+  // Divides by 100.
+  __device__ inline uint64_t div100(const uint64_t x) { return x / 100u; }
+
+  // Divides by 10^8.
+  __device__ inline uint64_t div1e8(const uint64_t x) { return x / 100000000u; }
+
+  // Divides by 10^9.
+  __device__ inline uint64_t div1e9(const uint64_t x) { return x / 1000000000u; }
+
+  // Returns x mod 10^9. The remainder is rebuilt from the div1e9 quotient instead of
+  // using a second division; it is below 10^9, so the narrowing to 32 bits is lossless.
+  __device__ inline uint32_t mod1e9(const uint64_t x) {
+    const uint64_t quotient = div1e9(x);
+    const uint64_t remainder = x - 1000000000u * quotient;
+    return static_cast<uint32_t>(remainder);
+  }
+
+  __device__ inline uint32_t pow5Factor(uint64_t value) {
+    const uint64_t m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64), i.e. the modular inverse of 5
+    const uint64_t n_div_5 = 3689348814741910323u;  // floor(2^64 / 5): the largest quotient a multiple of 5 can give
+    uint32_t count = 0; // number of factors of 5 found so far; this is the return value
+    for (;;) {
+      assert(value != 0);
+      value *= m_inv_5; // equals value / 5 when 5 divides value (arithmetic is modulo 2^64)
+      if (value > n_div_5) // products above n_div_5 come only from values not divisible by 5
+        break;
+      ++count;
+    }
+    return count;
+  }
+
+  // Returns true if value is divisible by 5^p, i.e. value contains at least p factors of 5.
+  __device__ inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) {
+    // Original note: a case distinction on p made no measurable performance difference.
+    return p <= pow5Factor(value);
+  }
+
+  // Returns true if value is divisible by 2^p, i.e. its p lowest bits are all zero.
+  __device__ inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) {
+    assert(value != 0);
+    assert(p < 64);
+    const uint64_t low_bits_mask = (1ull << p) - 1;  // original note: __builtin_ctzll was not faster
+    return (value & low_bits_mask) == 0;
+  }
+
+  __device__ inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
+    // Multiplies m (at most 55 bits per the original note) by the 128-bit value mul[1]:mul[0]
+    // and returns the low 64 bits of the product shifted right by j; needs 0 < j - 64 < 64.
+    uint64_t upper_hi = 0;
+    const uint64_t upper_lo = umul128(m, mul[1], &upper_hi);
+    uint64_t lower_hi = 0;
+    umul128(m, mul[0], &lower_hi);  // the low 64 bits of this partial product are not needed
+    const uint64_t middle = lower_hi + upper_lo;
+    // Unsigned wrap-around here means the addition carried into the top word.
+    if (middle < lower_hi) { ++upper_hi; }
+    return shiftright128(middle, upper_hi, j - 64);
+  }
+
+  __device__ inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j, uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
+    const uint64_t center = 4 * m;  // returned product uses center; *vp uses center + 2, *vm uses center - 1 - mmShift
+    *vp = mulShift64(center + 2, mul, j);
+    *vm = mulShift64(center - 1 - mmShift, mul, j);
+    return mulShift64(center, mul, j);
+  }
+
+  // Computes 5^i in the form required by Ryu, and stores it in result[0] (low word) and result[1] (high word).
+  __device__ inline void double_computePow5(const uint32_t i, uint64_t* const result) {
+    const uint32_t base = i / POW5_TABLE_SIZE; // row of DOUBLE_POW5_SPLIT2 to start from
+    const uint32_t base2 = base * POW5_TABLE_SIZE; // exponent that row corresponds to
+    const uint32_t offset = i - base2; // remaining exponent, 0 <= offset < POW5_TABLE_SIZE
+    const uint64_t* const mul = DOUBLE_POW5_SPLIT2[base];
+    if (offset == 0) {
+      // The table row is already the requested power; copy it unchanged.
+      result[0] = mul[0];
+      result[1] = mul[1];
+      return;
+    }
+    const uint64_t m = DOUBLE_POW5_TABLE[offset]; // 5^offset
+    uint64_t high1;
+    const uint64_t low1 = umul128(m, mul[1], &high1);
+    uint64_t high0;
+    const uint64_t low0 = umul128(m, mul[0], &high0);
+    const uint64_t sum = high0 + low1;
+    if (sum < high0) {
+      ++high1; // overflow into high1
+    }
+    // The product m * (mul[1]:mul[0]) is now held as high1 | sum | low0, most significant word first.
+    const uint32_t delta = pow5bits(i) - pow5bits(base2); // extra bits gained by multiplying with 5^offset
+    result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); // plus a 2-bit per-exponent offset, 16 packed per table word
+    result[1] = shiftright128(sum, high1, delta);
+  }
+
+  // Computes 5^-i in the form required by Ryu, and stores it in result[0] (low word) and result[1] (high word).
+  __device__ inline void double_computeInvPow5(const uint32_t i, uint64_t* const result) {
+    const uint32_t base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE; // i / POW5_TABLE_SIZE rounded up
+    const uint32_t base2 = base * POW5_TABLE_SIZE; // exponent of the table row, base2 >= i
+    const uint32_t offset = base2 - i; // power of 5 to multiply back in, 0 <= offset < POW5_TABLE_SIZE
+    const uint64_t* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2
+    if (offset == 0) {
+      // The table row is already the requested inverse power; copy it unchanged.
+      result[0] = mul[0];
+      result[1] = mul[1];
+      return;
+    }
+    const uint64_t m = DOUBLE_POW5_TABLE[offset]; // 5^offset
+    uint64_t high1;
+    const uint64_t low1 = umul128(m, mul[1], &high1);
+    uint64_t high0;
+    const uint64_t low0 = umul128(m, mul[0] - 1, &high0); // low word is used minus 1 here; 1 is added back to result[0] below
+    const uint64_t sum = high0 + low1;
+    if (sum < high0) {
+      ++high1; // overflow into high1
+    }
+    // The product is now held as high1 | sum | low0, most significant word first.
+    const uint32_t delta = pow5bits(base2) - pow5bits(i);
+    result[0] = shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3); // plus a 2-bit per-exponent offset, 16 packed per table word
+    result[1] = shiftright128(sum, high1, delta);
+  }
+
+  __device__ inline uint32_t mulPow5InvDivPow2(const uint32_t m, const uint32_t q, const int32_t j) {
+    // Inverse multipliers are defined as [2^x / 5^y] + 1. The upper 64 bits taken from the
+    // double lookup table are the correct bits for [2^x / 5^y], so the 1 is added here. This
+    // relies on the 1 already stored in the table never overflowing into the upper 64 bits.
+    uint64_t inverse_words[2];
+    double_computeInvPow5(q, inverse_words);
+    return mulShift32(m, inverse_words[1] + 1, j);
+  }
+
+  __device__ inline uint32_t mulPow5divPow2(const uint32_t m, const uint32_t i, const int32_t j) {
+    uint64_t power_words[2];  // filled by double_computePow5; only the upper word [1] is used
+    double_computePow5(i, power_words);
+    return mulShift32(m, power_words[1], j);
+  }
+
+  __device__ inline uint32_t decimalLength17(const uint64_t v) {
+    // Returns the number of decimal digits in v. Precondition: v has at most 17 digits
+    // (17 digits are sufficient for round-tripping), so 18- to 20-digit values are excluded.
+    // The original notes that this chain is slightly faster than a loop, and that it checks
+    // high-to-low because the average output length is 16.38 digits.
+    assert(v < 100000000000000000L);
+    if (v >= 10000000000000000ull) { return 17; }
+    if (v >= 1000000000000000ull) { return 16; }
+    if (v >= 100000000000000ull) { return 15; }
+    if (v >= 10000000000000ull) { return 14; }
+    if (v >= 1000000000000ull) { return 13; }
+    if (v >= 100000000000ull) { return 12; }
+    if (v >= 10000000000ull) { return 11; }
+    if (v >= 1000000000ull) { return 10; }
+    if (v >= 100000000ull) { return 9; }
+    if (v >= 10000000ull) { return 8; }
+    if (v >= 1000000ull) { return 7; }
+    if (v >= 100000ull) { return 6; }
+    if (v >= 10000ull) { return 5; }
+    if (v >= 1000ull) { return 4; }
+    if (v >= 100ull) { return 3; }
+    if (v >= 10ull) { return 2; }
+    return 1;
+  }
+
+  __device__ inline floating_decimal_64 d2d(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) {
+    int32_t e2; // binary exponent paired with m2
+    uint64_t m2; // binary significand; the implicit leading 1 is added when ieeeExponent != 0
+    if (ieeeExponent == 0) {
+      // We subtract 2 so that the bounds computation has 2 additional bits.
+      // NOTE(review): DOUBLE_BIAS and DOUBLE_MANTISSA_BITS are unsigned, so this is evaluated modulo 2^32 before the conversion to int32_t.
+      e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
+      m2 = ieeeMantissa;
+    } else {
+      e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
+      m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
+    }
+    const bool even = (m2 & 1) == 0;
+    const bool acceptBounds = even;
+
+    // Step 2: Determine the interval of valid decimal representations.
+    const uint64_t mv = 4 * m2;
+    // Implicit bool -> int conversion. True is 1, false is 0.
+    const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
+    // We would compute mp and mm like this:
+    // uint64_t mp = 4 * m2 + 2;
+    // uint64_t mm = mv - 1 - mmShift;
+
+    // Step 3: Convert to a decimal power base using 128-bit arithmetic.
+    uint64_t vr, vp, vm;
+    int32_t e10;
+    bool vmIsTrailingZeros = false;
+    bool vrIsTrailingZeros = false;
+    if (e2 >= 0) {
+      // I tried special-casing q == 0, but there was no effect on performance.
+      // This expression is slightly faster than max(0, log10Pow2(e2) - 1).
+      const uint32_t q = log10Pow2(e2) - (e2 > 3);
+      e10 = (int32_t) q;
+      const int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
+      const int32_t i = -e2 + (int32_t) q + k;
+      uint64_t pow5[2];
+      double_computeInvPow5(q, pow5);
+      vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift);
+
+      if (q <= 21) {
+        // This should use q <= 22, but I think 21 is also safe. Smaller values
+        // may still be safe, but it's more difficult to reason about them.
+        // Only one of mp, mv, and mm can be a multiple of 5, if any.
+        const uint32_t mvMod5 = ((uint32_t) mv) - 5 * ((uint32_t) div5(mv));
+        if (mvMod5 == 0) {
+          vrIsTrailingZeros = multipleOfPowerOf5(mv, q);
+        } else if (acceptBounds) {
+          // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q
+          // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q
+          // <=> true && pow5Factor(mm) >= q, since e2 >= q.
+          vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q);
+        } else {
+          // Same as min(e2 + 1, pow5Factor(mp)) >= q.
+          vp -= multipleOfPowerOf5(mv + 2, q);
+        }
+      }
+    } else {
+      // This expression is slightly faster than max(0, log10Pow5(-e2) - 1).
+      const uint32_t q = log10Pow5(-e2) - (-e2 > 1);
+      e10 = (int32_t) q + e2;
+      const int32_t i = -e2 - (int32_t) q;
+      const int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT;
+      const int32_t j = (int32_t) q - k;
+
+      uint64_t pow5[2];
+      double_computePow5(i, pow5);
+      vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift);
+
+      if (q <= 1) {
+        // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
+        // mv = 4 * m2, so it always has at least two trailing 0 bits.
+        vrIsTrailingZeros = true;
+        if (acceptBounds) {
+          // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
+          vmIsTrailingZeros = mmShift == 1;
+        } else {
+          // mp = mv + 2, so it always has at least one trailing 0 bit.
+          --vp;
+        }
+      } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here.
+        // We want to know if the full product has at least q trailing zeros.
+        // We need to compute min(p2(mv), p5(mv) - e2) >= q
+        // <=> p2(mv) >= q && p5(mv) - e2 >= q
+        // <=> p2(mv) >= q (because -e2 >= q)
+        vrIsTrailingZeros = multipleOfPowerOf2(mv, q);
+      }
+    }
+
+    // Step 4: Find the shortest decimal representation in the interval of valid representations.
+    int32_t removed = 0; // number of decimal digits dropped from vr so far
+    uint8_t lastRemovedDigit = 0;
+    uint64_t output;
+    // On average, we remove ~2 digits.
+    if (vmIsTrailingZeros || vrIsTrailingZeros) {
+      // General case, which happens rarely (~0.7%).
+      for (;;) {
+        const uint64_t vpDiv10 = div10(vp);
+        const uint64_t vmDiv10 = div10(vm);
+        if (vpDiv10 <= vmDiv10) {
+          break;
+        }
+        const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10);
+        const uint64_t vrDiv10 = div10(vr);
+        const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
+        vmIsTrailingZeros &= vmMod10 == 0;
+        vrIsTrailingZeros &= lastRemovedDigit == 0;
+        lastRemovedDigit = (uint8_t) vrMod10;
+        vr = vrDiv10;
+        vp = vpDiv10;
+        vm = vmDiv10;
+        ++removed;
+      }
+
+      if (vmIsTrailingZeros) {
+        for (;;) {
+          const uint64_t vmDiv10 = div10(vm);
+          const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10);
+          if (vmMod10 != 0) {
+            break;
+          }
+          const uint64_t vpDiv10 = div10(vp);
+          const uint64_t vrDiv10 = div10(vr);
+          const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
+          vrIsTrailingZeros &= lastRemovedDigit == 0;
+          lastRemovedDigit = (uint8_t) vrMod10;
+          vr = vrDiv10;
+          vp = vpDiv10;
+          vm = vmDiv10;
+          ++removed;
+        }
+      }
+
+      if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
+        // Round even if the exact number is .....50..0.
+        lastRemovedDigit = 4;
+      }
+      // We need to take vr + 1 if vr is outside bounds or we need to round up.
+      output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
+    } else {
+      // Specialized for the common case (~99.3%). Percentages below are relative to this.
+      bool roundUp = false;
+      const uint64_t vpDiv100 = div100(vp);
+      const uint64_t vmDiv100 = div100(vm);
+      if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%).
+        const uint64_t vrDiv100 = div100(vr);
+        const uint32_t vrMod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrDiv100);
+        roundUp = vrMod100 >= 50;
+        vr = vrDiv100;
+        vp = vpDiv100;
+        vm = vmDiv100;
+        removed += 2;
+      }
+      // Loop iterations below (approximately), without optimization above:
+      // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02%
+      // Loop iterations below (approximately), with optimization above:
+      // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02%
+      for (;;) {
+        const uint64_t vpDiv10 = div10(vp);
+        const uint64_t vmDiv10 = div10(vm);
+        if (vpDiv10 <= vmDiv10) {
+          break;
+        }
+        const uint64_t vrDiv10 = div10(vr);
+        const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
+        roundUp = vrMod10 >= 5;
+        vr = vrDiv10;
+        vp = vpDiv10;
+        vm = vmDiv10;
+        ++removed;
+      }
+
+      // We need to take vr + 1 if vr is outside bounds or we need to round up.
+      output = vr + (vr == vm || roundUp);
+    }
+    const int32_t exp = e10 + removed; // every removed digit raises the decimal exponent by one
+
+    floating_decimal_64 fd;
+    fd.exponent = exp;
+    fd.mantissa = output;
+    return fd;
+  }
+
+  __device__ inline floating_decimal_32 f2d(const uint32_t ieeeMantissa, const uint32_t ieeeExponent) {
+    int32_t e2; // binary exponent paired with m2
+    uint32_t m2; // binary significand; the implicit leading 1 is added when ieeeExponent != 0
+    if (ieeeExponent == 0) {
+      // We subtract 2 so that the bounds computation has 2 additional bits.
+      // NOTE(review): FLOAT_BIAS and FLOAT_MANTISSA_BITS are unsigned, so this is evaluated modulo 2^32 before the conversion to int32_t.
+      e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
+      m2 = ieeeMantissa;
+    } else {
+      e2 = (int32_t) ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
+      m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa;
+    }
+    const bool even = (m2 & 1) == 0;
+    const bool acceptBounds = even;
+
+    // Step 2: Determine the interval of valid decimal representations.
+    const uint32_t mv = 4 * m2;
+    const uint32_t mp = 4 * m2 + 2;
+    // Implicit bool -> int conversion. True is 1, false is 0.
+    const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
+    const uint32_t mm = 4 * m2 - 1 - mmShift;
+
+    // Step 3: Convert to a decimal power base using 64-bit arithmetic.
+    uint32_t vr, vp, vm;
+    int32_t e10;
+    bool vmIsTrailingZeros = false;
+    bool vrIsTrailingZeros = false;
+    uint8_t lastRemovedDigit = 0;
+    if (e2 >= 0) {
+      const uint32_t q = log10Pow2(e2);
+      e10 = (int32_t) q;
+      const int32_t k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
+      const int32_t i = -e2 + (int32_t) q + k;
+      vr = mulPow5InvDivPow2(mv, q, i);
+      vp = mulPow5InvDivPow2(mp, q, i);
+      vm = mulPow5InvDivPow2(mm, q, i);
+      if (q != 0 && (vp - 1) / 10 <= vm / 10) {
+        // We need to know one removed digit even if we are not going to loop below. We could use
+        // q = X - 1 above, except that would require 33 bits for the result, and we've found that
+        // 32-bit arithmetic is faster even on 64-bit machines.
+        const int32_t l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) (q - 1)) - 1;
+        lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t) q - 1 + l) % 10);
+      }
+      if (q <= 9) {
+        // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well.
+        // Only one of mp, mv, and mm can be a multiple of 5, if any.
+        if (mv % 5 == 0) {
+          vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q);
+        } else if (acceptBounds) {
+          vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q);
+        } else {
+          vp -= multipleOfPowerOf5_32(mp, q);
+        }
+      }
+    } else {
+      const uint32_t q = log10Pow5(-e2);
+      e10 = (int32_t) q + e2;
+      const int32_t i = -e2 - (int32_t) q;
+      const int32_t k = pow5bits(i) - FLOAT_POW5_BITCOUNT;
+      int32_t j = (int32_t) q - k;
+      vr = mulPow5divPow2(mv, (uint32_t) i, j);
+      vp = mulPow5divPow2(mp, (uint32_t) i, j);
+      vm = mulPow5divPow2(mm, (uint32_t) i, j);
+      if (q != 0 && (vp - 1) / 10 <= vm / 10) {
+        // As in the e2 >= 0 branch: recompute with one digit more precision to learn the digit that q already removed.
+        j = (int32_t) q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT);
+        lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, (uint32_t) (i + 1), j) % 10);
+      }
+      if (q <= 1) {
+        // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
+        // mv = 4 * m2, so it always has at least two trailing 0 bits.
+        vrIsTrailingZeros = true;
+        if (acceptBounds) {
+          // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
+          vmIsTrailingZeros = mmShift == 1;
+        } else {
+          // mp = mv + 2, so it always has at least one trailing 0 bit.
+          --vp;
+        }
+      } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here.
+        vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1);
+      }
+    }
+
+    // Step 4: Find the shortest decimal representation in the interval of valid representations.
+    int32_t removed = 0; // number of decimal digits dropped from vr so far
+    uint32_t output;
+    if (vmIsTrailingZeros || vrIsTrailingZeros) {
+      // General case, which happens rarely (~4.0%).
+      while (vp / 10 > vm / 10) {
+        vmIsTrailingZeros &= vm % 10 == 0;
+        vrIsTrailingZeros &= lastRemovedDigit == 0;
+        lastRemovedDigit = (uint8_t) (vr % 10);
+        vr /= 10;
+        vp /= 10;
+        vm /= 10;
+        ++removed;
+      }
+      if (vmIsTrailingZeros) {
+        while (vm % 10 == 0) {
+          vrIsTrailingZeros &= lastRemovedDigit == 0;
+          lastRemovedDigit = (uint8_t) (vr % 10);
+          vr /= 10;
+          vp /= 10;
+          vm /= 10;
+          ++removed;
+        }
+      }
+      if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
+        // Round even if the exact number is .....50..0.
+        lastRemovedDigit = 4;
+      }
+      // We need to take vr + 1 if vr is outside bounds or we need to round up.
+      output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
+    } else {
+      // Specialized for the common case (~96.0%). Percentages below are relative to this.
+      // Loop iterations below (approximately):
+      // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01%
+      while (vp / 10 > vm / 10) {
+        lastRemovedDigit = (uint8_t) (vr % 10);
+        vr /= 10;
+        vp /= 10;
+        vm /= 10;
+        ++removed;
+      }
+      // We need to take vr + 1 if vr is outside bounds or we need to round up.
+      output = vr + (vr == vm || lastRemovedDigit >= 5);
+    }
+    const int32_t exp = e10 + removed; // every removed digit raises the decimal exponent by one
+
+    floating_decimal_32 fd;
+    fd.exponent = exp;
+    fd.mantissa = output;
+    return fd;
+  }
+
+  __device__ inline int to_chars(const floating_decimal_64 v, const bool sign, char* const result) {
+    // Step 5: Print the decimal representation into result (no terminating NUL is written) and return its length.
+    int index = 0;
+    if (sign) {
+      result[index++] = '-';
+    }
+
+    uint64_t output = v.mantissa;
+    const uint32_t olength = decimalLength17(output);
+    int32_t exp = v.exponent + (int32_t) olength - 1; // decimal exponent of the leading digit
+    bool scientificNotation = (exp < -3) || (exp >= 7);
+    
+    // Values in the interval [1E-3, 1E7) are printed in plain decimal form; all others use scientific notation.
+    if (scientificNotation) {
+      // Print in the format x.xxxxxE-yy.
+      for (uint32_t i = 0; i < olength - 1; ++i) {
+        const uint32_t c = output % 10; output /= 10;
+        result[index + olength - i] = (char) ('0' + c);
+      }
+      result[index] = '0' + output % 10;
+      result[index + 1] = '.';
+      index += olength + 1;
+      if (olength == 1) {
+        result[index++] = '0';
+      }
+      // Print 'E', the exponent sign, and the exponent, which has at most three digits.
+      result[index++] = 'E';
+      if (exp < 0) {
+        result[index++] = '-';
+        exp = -exp;
+      }
+      if (exp >= 100) {
+          result[index++] = (char) ('0' + exp / 100);
+          exp %= 100;
+          result[index++] = (char) ('0' + exp / 10);
+        } else if (exp >= 10) {
+          result[index++] = (char) ('0' + exp / 10);
+        }
+        result[index++] = (char) ('0' + exp % 10);
+    } else {
+      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
+      if (exp < 0) {
+        // Decimal dot is before any of the digits.
+        result[index++] = '0';
+        result[index++] = '.';
+        for (int i = -1; i > exp; i--) {
+          result[index++] = '0';
+        }
+        int current = index;
+        for (int i = 0; i < olength; i++) {
+          result[current + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+          index++;
+        }
+      } else if (exp + 1 >= olength) {
+        // Decimal dot is after any of the digits.
+        for (int i = 0; i < olength; i++) {
+          result[index + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+        }
+        index += olength;
+        for (int i = olength; i < exp + 1; i++) {
+          result[index++] = '0';
+        }
+        result[index++] = '.';
+        result[index++] = '0';
+      } else {
+        // Decimal dot is somewhere between the digits.
+        int current = index + 1;
+        for (int i = 0; i < olength; i++) {
+          if (olength - i - 1 == exp) {
+            result[current + olength - i - 1] = '.';
+            current--;
+          }
+          result[current + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+        }
+        index += olength + 1;
+      }
+    }
+    return index;
+  }
+
+  // Returns the number of characters the 64-bit to_chars() emits for v with the given sign,
+  // without writing to any buffer.
+  __device__ inline int d2s_size(const floating_decimal_64 v, const bool sign) {
+    const uint32_t digits = decimalLength17(v.mantissa);
+    // Decimal exponent of the most significant digit.
+    const int32_t exp10 = v.exponent + (int32_t) digits - 1;
+    int length = sign ? 1 : 0;
+
+    if (exp10 < -3 || exp10 >= 7) {
+      // Scientific notation: leading digit, '.', the remaining digits (a lone '0' when there are
+      // none) and the 'E' marker.
+      length += (digits == 1) ? 4 : (int) digits + 2;
+      int32_t magnitude = exp10;
+      if (magnitude < 0) {
+        // One extra character for the exponent sign.
+        magnitude = -magnitude;
+        ++length;
+      }
+      length += (magnitude >= 100) ? 3 : ((magnitude >= 10) ? 2 : 1);
+      return length;
+    }
+
+    // Plain notation for values in the interval [1E-3, 1E7).
+    if (exp10 < 0) {
+      // "0." + (-exp10 - 1) zeros + all digits.
+      return length + 1 - exp10 + (int) digits;
+    }
+    if (exp10 + 1 >= (int32_t) digits) {
+      // Integer part of exp10 + 1 characters followed by ".0".
+      return length + exp10 + 3;
+    }
+    // All digits plus a dot placed between them.
+    return length + (int) digits + 1;
+  }
+
+  // Writes the decimal value v.mantissa * 10^v.exponent into result, preceded by '-' when sign is
+  // set, and returns the number of characters written. No terminating NUL is written.
+  // Scientific notation (d.dddE[-]xx) is used when the decimal exponent of the leading digit is
+  // below -3 or at least 7; otherwise plain notation with at least one digit on each side of the
+  // decimal dot is produced. f2s_size() mirrors this logic to compute the length without writing.
+  __device__ inline int to_chars(const floating_decimal_32 v, const bool sign, char* const result) {
+    // Step 5: Print the decimal representation.
+    int index = 0;
+    if (sign) {
+      result[index++] = '-';
+    }
+
+    uint32_t output = v.mantissa;
+    const uint32_t olength = decimalLength9(output);
+    // exp is the decimal exponent of the most significant digit of the mantissa.
+    int32_t exp = v.exponent + olength - 1;
+    bool scientificNotation = (exp < -3) || (exp >= 7);
+
+    if (scientificNotation) {
+      // Print in the format x.xxxxxE-yy.
+      // Digits are peeled off from the least significant end and stored right-to-left into
+      // result[index + 2 .. index + olength]; slot index + 1 is reserved for the dot.
+      for (int i = 0; i < olength - 1; i++) {
+        int c = output % 10; output /= 10;
+        result[index + olength - i] = (char) ('0' + c);
+      }
+      // Only the leading digit is left in output at this point.
+      result[index] = (char) ('0' + output % 10);
+      result[index + 1] = '.';
+      index += olength + 1;
+      if (olength == 1) {
+        // A single-digit mantissa still gets one fractional digit ("d.0").
+        result[index++] = '0';
+      }
+
+      // Print 'E', the exponent sign, and the exponent, which has at most two digits.
+      result[index++] = 'E';
+      if (exp < 0) {
+        result[index++] = '-';
+        exp = -exp;
+      }
+      if (exp >= 10) {
+        result[index++] = (char) ('0' + exp / 10);
+      }
+      result[index++] = (char) ('0' + exp % 10);
+    } else {
+      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
+      if (exp < 0) {
+        // Decimal dot is before any of the digits.
+        result[index++] = '0';
+        result[index++] = '.';
+        // Emit -exp - 1 zeros between the dot and the first significant digit.
+        for (int i = -1; i > exp; i--) {
+          result[index++] = '0';
+        }
+        // Digits are written right-to-left starting from the fixed position 'current'.
+        int current = index;
+        for (int i = 0; i < olength; i++) {
+          result[current + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+          index++;
+        }
+      } else if (exp + 1 >= olength) {
+        // Decimal dot is after any of the digits.
+        for (int i = 0; i < olength; i++) {
+          result[index + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+        }
+        index += olength;
+        // Pad with zeros up to the units position, then append ".0".
+        for (int i = olength; i < exp + 1; i++) {
+          result[index++] = '0';
+        }
+        result[index++] = '.';
+        result[index++] = '0';
+      } else {
+        // Decimal dot is somewhere between the digits.
+        // 'current' starts one slot to the right to leave room for the dot; once the dot has been
+        // placed it is decremented so the remaining (integer-part) digits are not shifted.
+        int current = index + 1;
+        for (int i = 0; i < olength; i++) {
+          if (olength - i - 1 == exp) {
+            result[current + olength - i - 1] = '.';
+            current--;
+          }
+          result[current + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+        }
+        index += olength + 1;
+      }
+    }
+    return index;
+  }
+
+  // Returns the number of characters the 32-bit to_chars() emits for v with the given sign,
+  // without writing to any buffer.
+  __device__ inline int f2s_size(const floating_decimal_32 v, const bool sign) {
+    const uint32_t digits = decimalLength9(v.mantissa);
+    // Decimal exponent of the most significant digit.
+    const int32_t exp10 = v.exponent + (int32_t) digits - 1;
+    int length = sign ? 1 : 0;
+
+    if (exp10 < -3 || exp10 >= 7) {
+      // Scientific notation: leading digit, '.', the remaining digits (a lone '0' when there are
+      // none) and the 'E' marker.
+      length += (digits == 1) ? 4 : (int) digits + 2;
+      int32_t magnitude = exp10;
+      if (magnitude < 0) {
+        // One extra character for the exponent sign.
+        magnitude = -magnitude;
+        ++length;
+      }
+      // The exponent is counted as one or two digits.
+      length += (magnitude >= 10) ? 2 : 1;
+      return length;
+    }
+
+    // Plain notation for values in the interval [1E-3, 1E7).
+    if (exp10 < 0) {
+      // "0." + (-exp10 - 1) zeros + all digits.
+      return length + 1 - exp10 + (int) digits;
+    }
+    if (exp10 + 1 >= (int32_t) digits) {
+      // Integer part of exp10 + 1 characters followed by ".0".
+      return length + exp10 + 3;
+    }
+    // All digits plus a dot placed between them.
+    return length + (int) digits + 1;
+  }
+
+  // Detects doubles whose value is an integer in [1, 2^53). On success the integer is stored in
+  // v->mantissa with v->exponent set to 0 and true is returned; otherwise *v is left untouched
+  // and false is returned. The stored mantissa may still end in decimal zeros.
+  __device__ inline bool d2d_small_int(const uint64_t ieeeMantissa, const uint32_t ieeeExponent,
+    floating_decimal_64* const v) {
+    // Significand with the leading one bit set, and the matching binary exponent.
+    const uint64_t significand = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
+    const int32_t binaryExp = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS;
+
+    // binaryExp > 0: the value is an integer >= 2^53, which is not handled here.
+    // binaryExp < -52: the value is below 1.
+    if (binaryExp > 0 || binaryExp < -52) {
+      return false;
+    }
+
+    // With 0 <= -binaryExp <= 52 the value is an integer exactly when the lowest -binaryExp
+    // bits of the significand are all zero.
+    const uint64_t fractionMask = (1ull << -binaryExp) - 1;
+    if ((significand & fractionMask) != 0) {
+      return false;
+    }
+
+    // Since 2^53 < 10^16 the result always fits the digit count decimalLength17() expects.
+    v->mantissa = significand >> -binaryExp;
+    v->exponent = 0;
+    return true;
+  }
+
+  // Decodes f and converts it to a decimal mantissa/exponent pair.
+  // ieeeSign receives the sign bit. special is set to true for zero, infinity and NaN; in that
+  // case the returned struct carries the raw IEEE mantissa and exponent fields instead of a
+  // decimal value, and the caller is expected to format it separately.
+  __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) {
+    // Step 1: Decode the floating-point number into sign, mantissa and exponent fields.
+    const uint64_t bits = double_to_bits(f);
+    const uint64_t mantissaMask = (1ull << DOUBLE_MANTISSA_BITS) - 1;
+    const uint32_t exponentMask = (1u << DOUBLE_EXPONENT_BITS) - 1u;
+
+    ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0;
+    const uint64_t ieeeMantissa = bits & mantissaMask;
+    const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & exponentMask);
+
+    // Easy cases: an all-ones exponent (infinity / NaN) or an all-zero encoding (+-0).
+    const bool infinityOrNan = ieeeExponent == exponentMask;
+    const bool zero = ieeeExponent == 0 && ieeeMantissa == 0;
+    special = infinityOrNan || zero;
+    if (special) {
+      return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent};
+    }
+
+    floating_decimal_64 v;
+    if (!d2d_small_int(ieeeMantissa, ieeeExponent, &v)) {
+      // General path: full shortest-representation conversion.
+      return d2d(ieeeMantissa, ieeeExponent);
+    }
+
+    // Small integers in [1, 2^53) may end in decimal zeros; move them into the exponent so that
+    // the mantissa holds significant digits only.
+    for (uint64_t q = div10(v.mantissa);
+         ((uint32_t) v.mantissa) == 10 * ((uint32_t) q);
+         q = div10(v.mantissa)) {
+      v.mantissa = q;
+      ++v.exponent;
+    }
+    return v;
+  }
+
+  // Formats the double f into result and returns the number of characters written.
+  // Zero, infinity and NaN are routed to copy_special_str(); everything else to to_chars().
+  __device__ int d2s_buffered_n(double f, char* result) {
+    bool negative = false;
+    bool isSpecial = false;
+    const floating_decimal_64 decimal = d2d(f, negative, isSpecial);
+    return isSpecial ? copy_special_str(result, negative, decimal.exponent, decimal.mantissa)
+                     : to_chars(decimal, negative, result);
+  }
+
+  // Returns the number of characters d2s_buffered_n() would write for value.
+  __device__ int compute_d2s_size(double value) {
+    bool negative = false;
+    bool isSpecial = false;
+    const floating_decimal_64 decimal = d2d(value, negative, isSpecial);
+    return isSpecial ? special_str_size(negative, decimal.exponent, decimal.mantissa)
+                     : d2s_size(decimal, negative);
+  }
+
+  // Decodes f and converts it to a decimal mantissa/exponent pair.
+  // ieeeSign receives the sign bit. special is set to true for zero, infinity and NaN; in that
+  // case the returned struct carries the raw IEEE mantissa and exponent fields instead of a
+  // decimal value, and the caller is expected to format it separately.
+  __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) {
+    // Step 1: Decode the floating-point number into sign, mantissa and exponent fields.
+    const uint32_t bits = float_to_bits(f);
+    const uint32_t mantissaMask = (1u << FLOAT_MANTISSA_BITS) - 1;
+    const uint32_t exponentMask = (1u << FLOAT_EXPONENT_BITS) - 1u;
+
+    ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0;
+    const uint32_t ieeeMantissa = bits & mantissaMask;
+    const uint32_t ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & exponentMask;
+
+    // Easy cases: an all-ones exponent (infinity / NaN) or an all-zero encoding (+-0).
+    const bool infinityOrNan = ieeeExponent == exponentMask;
+    const bool zero = ieeeExponent == 0 && ieeeMantissa == 0;
+    special = infinityOrNan || zero;
+    if (special) {
+      return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent};
+    }
+    return f2d(ieeeMantissa, ieeeExponent);
+  }
+
+  // Formats the float f into result and returns the number of characters written.
+  // Zero, infinity and NaN are routed to copy_special_str(); everything else to to_chars().
+  __device__ int f2s_buffered_n(float f, char* result) {
+    bool negative = false;
+    bool isSpecial = false;
+    const floating_decimal_32 decimal = f2d(f, negative, isSpecial);
+    return isSpecial ? copy_special_str(result, negative, decimal.exponent, decimal.mantissa)
+                     : to_chars(decimal, negative, result);
+  }
+
+  // Returns the number of characters f2s_buffered_n() would write for value.
+  __device__ int compute_f2s_size(float value) {
+    bool negative = false;
+    bool isSpecial = false;
+    const floating_decimal_32 decimal = f2d(value, negative, isSpecial);
+    return isSpecial ? special_str_size(negative, decimal.exponent, decimal.mantissa)
+                     : f2s_size(decimal, negative);
+  }
+
+  // Returns the length of the string float_to_string() would produce for value.
+  // When is_float is set, value is narrowed to float first and sized with the 32-bit path.
+  __device__ int compute_ftos_size(double value, bool is_float) {
+    return is_float ? compute_f2s_size(static_cast<float>(value)) : compute_d2s_size(value);
+  }
+
+  // Writes the textual form of value into output and returns the number of characters written.
+  // When is_float is set, value is narrowed to float first and formatted with the 32-bit path.
+  __device__ int float_to_string(double value, char* output, bool is_float) {
+    return is_float ? f2s_buffered_n(static_cast<float>(value), output)
+                    : d2s_buffered_n(value, output);
+  }
+};
+
+}
+}
+}
\ No newline at end of file
diff --git a/thirdparty/cudf b/thirdparty/cudf
index fa4e8ab1af..87d2a36f04 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit fa4e8ab1af4acfd2c88a619b4d9693f4a5fda168
+Subproject commit 87d2a36f04f431a8c5236d2aee723ec79b9dc5f9

From 4c75bc797c0618705b1b37c0faae271619bedc8c Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 13 Nov 2023 16:52:13 +0800
Subject: [PATCH 12/54] clean up

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/CMakeLists.txt        | 1 +
 src/main/cpp/src/ftos_converter.cu | 4 +---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index 8f90b9078e..a6ac8ac98c 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -168,6 +168,7 @@ add_library(
   src/cast_string_to_float.cu
   src/datetime_rebase.cu
   src/decimal_utils.cu
+  src/ftos_converter.cu
   src/histogram.cu
   src/map_utils.cu
   src/murmur_hash.cu
diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu
index 7d5cf0716f..2d5424319e 100644
--- a/src/main/cpp/src/ftos_converter.cu
+++ b/src/main/cpp/src/ftos_converter.cu
@@ -19,8 +19,6 @@
 #include <cuda/std/cassert>
 #include <cuda/std/cstdint>
 
-using namespace cudf;
-
 namespace spark_rapids_jni {
 
 namespace detail {
@@ -1160,4 +1158,4 @@ struct ftos_converter {
 
 }
 }
-}
\ No newline at end of file
+}

From f1c11e6bace6b79de768cb4d9ad0482b5377c7fd Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 14 Nov 2023 17:53:13 +0800
Subject: [PATCH 13/54] resolve cudf conflicts

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 87d2a36f04..5d09d38bc8 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 87d2a36f04f431a8c5236d2aee723ec79b9dc5f9
+Subproject commit 5d09d38bc8ea44e1bdf1fa29e11a820c7417bac5

From 760799be1b82cc7b5caf4aee63d718d1a48e921d Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 14 Nov 2023 17:57:17 +0800
Subject: [PATCH 14/54] resolve cudf conflicts

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 5d09d38bc8..54c00e2d2f 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 5d09d38bc8ea44e1bdf1fa29e11a820c7417bac5
+Subproject commit 54c00e2d2f6d7049c91594f9670f5b25d587f9f2

From bfba655b7e3dde0daff03c0822b6c152c03bfc5c Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 14 Nov 2023 17:59:09 +0800
Subject: [PATCH 15/54] resolve cudf conflicts

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 54c00e2d2f..5d09d38bc8 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 54c00e2d2f6d7049c91594f9670f5b25d587f9f2
+Subproject commit 5d09d38bc8ea44e1bdf1fa29e11a820c7417bac5

From ad27fee1efd9bc714ebec51d6e8e71258cbbc4e9 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 14 Nov 2023 18:20:51 +0800
Subject: [PATCH 16/54] resolve cudf conflicts

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 5d09d38bc8..5935ef3ce2 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 5d09d38bc8ea44e1bdf1fa29e11a820c7417bac5
+Subproject commit 5935ef3ce26b1eb7136dcaa989a36b15071a9d0d

From 40a4cb8f8da0bbd852dba668e417b1e0921895a2 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 14 Nov 2023 23:17:25 +0800
Subject: [PATCH 17/54] remove cudf changes

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 5935ef3ce2..e982d3736f 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 5935ef3ce26b1eb7136dcaa989a36b15071a9d0d
+Subproject commit e982d3736f095e680298af85bde732d9b5a73122

From 05f55175d26044b196f099d42a9f55c8ab5ecd98 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 14 Nov 2023 23:22:42 +0800
Subject: [PATCH 18/54] remove cudf changes

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index e982d3736f..87d2a36f04 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit e982d3736f095e680298af85bde732d9b5a73122
+Subproject commit 87d2a36f04f431a8c5236d2aee723ec79b9dc5f9

From 8ed59bd538726d55c14de20c4631942beb565be6 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Thu, 16 Nov 2023 09:56:38 +0800
Subject: [PATCH 19/54] add ryu

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/CMakeLists.txt        |    1 +
 src/main/cpp/src/format_float.cu   |  108 ---
 src/main/cpp/src/ftos_converter.cu | 1162 ++++++++++++++++++++++++++++
 3 files changed, 1163 insertions(+), 108 deletions(-)
 create mode 100644 src/main/cpp/src/ftos_converter.cu

diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index 745a9df2a7..fb88705259 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -168,6 +168,7 @@ add_library(
   src/cast_string_to_float.cu
   src/datetime_rebase.cu
   src/decimal_utils.cu
+  src/ftos_converter.cu
   src/histogram.cu
   src/map_utils.cu
   src/murmur_hash.cu
diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu
index 5972d108a9..73c8bb7be1 100644
--- a/src/main/cpp/src/format_float.cu
+++ b/src/main/cpp/src/format_float.cu
@@ -125,114 +125,6 @@ struct ftos_converter {
     return result;
   }
 
-  // __device__ char* format_ll(long long n, char* result, char* dec_ptr, int& dec_pos, int exp10) {
-  //   if (n == 0) {
-  //     *result++ = '0';
-  //     return result;
-  //   }
-  //   int sep_count = 0;
-  //   char buffer[305];  // should be big-enough for significant digits
-  //   char* ptr = buffer;
-  //   while (n > 0) {
-  //       if (sep_count == 3) {
-  //           *ptr++ = ',';
-  //           sep_count = 0;
-  //       }
-  //       *ptr++ = (char)('0' + (n % 10));
-  //       n /= 10;
-  //       sep_count++;
-  //   }
-  //   int len = dec_ptr - dec_str;
-  //   int dec_pos = 0;
-  //   while (exp10--) {
-  //       if (sep_count == 3) {
-  //           *ptr++ = ',';
-  //           sep_count = 0;
-  //       }
-  //       if (dec_pos < len) {
-  //         *ptr++ = dec_str[dec_pos++];
-  //       } else {
-  //         *ptr++ = '0';
-  //       }
-  //       sep_count++;
-  //   }
-  //   while (ptr != buffer) {
-  //       *result++ = *--ptr;  // 54321 -> 12345
-  //   }
-  //   return result;
-  // }
-
-  // /**
-  //  * @brief Dissect a float value into integer, decimal, and exponent components.
-  //  *
-  //  * @return The number of decimal places.
-  //  */
-  // __device__ int dissect_value(double value,
-  //                              int digits,
-  //                              unsigned int& integer,
-  //                              unsigned long long& decimal,
-  //                              int& exp10,
-  //                              bool is_float = false)
-  // {
-  //   // normalize step puts value between lower-limit and upper-limit
-  //   // by adjusting the exponent up or down
-  //   exp10 = 0;
-  //   if (value > upper_limit) {
-  //     int fx = 256;
-  //     for (int idx = 8; idx >= 0; --idx) {
-  //       if (value >= upper10[idx]) {
-  //         value *= lower10[idx];
-  //         exp10 += fx;
-  //       }
-  //       fx = fx >> 1;
-  //     }
-  //   } else if ((value > 0.0) && (value < lower_limit)) {
-  //     int fx = 256;
-  //     for (int idx = 8; idx >= 0; --idx) {
-  //       if (value < blower10[idx]) {
-  //         value *= upper10[idx];
-  //         exp10 -= fx;
-  //       }
-  //       fx = fx >> 1;
-  //     }
-  //   }
-  //   //
-  //   // int decimal_places = significant_digits - (exp10? 2 : 1);
-  //   // unsigned long long max_digits = (exp10? fifteen_digits : sixteen_digits);
-  //   int decimal_places = (is_float? significant_digits_float: significant_digits_double) - 1;
-  //   unsigned long long max_digits = (is_float? eight_digits: sixteen_digits);
-  //   double temp_value = value;
-  //   while (temp_value < 1.0 && temp_value > 0.0) {
-  //     max_digits *= 10;
-  //     temp_value *= 10.0;
-  //     decimal_places++;
-  //   }
-  //   integer                 = (unsigned int)value;
-  //   for (unsigned int i = integer; i >= 10; i /= 10) {
-  //     --decimal_places;
-  //     max_digits /= 10;
-  //   }
-  //   double diff = value - (double)integer;
-  //   double remainder = diff * (double)max_digits;
-  //   decimal          = (unsigned long long)remainder;
-  //   remainder -= (double)decimal;
-  //   decimal += (unsigned long long)(2.0 * remainder); // round up
-  //   if (decimal >= max_digits) {
-  //     decimal = 0;
-  //     ++integer;
-  //     if (exp10 && (integer >= 10)) {
-  //       ++exp10;
-  //       integer = 1;
-  //     }
-  //   }
-  //   //
-  //   while ((decimal % 10) == 0 && (decimal_places > 0)) {
-  //     decimal /= 10;
-  //     --decimal_places;
-  //   }
-  //   return decimal_places;
-  // }
-
   /**
    * @brief Main kernel method for converting float value to char output array.
    *
diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu
new file mode 100644
index 0000000000..55d2e4282c
--- /dev/null
+++ b/src/main/cpp/src/ftos_converter.cu
@@ -0,0 +1,1162 @@
+/* Not a contribution
+ * Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as
+ * NVIDIA-proprietary are not a contribution and subject to the following terms and conditions:
+ *
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
+
+#include <cuda/std/climits>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+#include <cuda/std/cassert>
+#include <cuda/std/cstdint>
+
+namespace spark_rapids_jni {
+
+namespace detail {
+namespace {
+
+// Decimal floating-point value equal to mantissa * 10^exponent (64-bit mantissa variant).
+struct floating_decimal_64 {
+  uint64_t mantissa;
+  // The decimal exponent lies in [-324, 308], so a 16-bit type would also be wide enough.
+  int32_t exponent;
+};
+
+// Decimal floating-point value equal to mantissa * 10^exponent (32-bit mantissa variant).
+struct floating_decimal_32 {
+  uint32_t mantissa;
+  // The decimal exponent lies in [-45, 38], so a 16-bit type would also be wide enough.
+  int32_t exponent;
+};
+
+struct ftos_converter {
+
+  // These tables are generated by PrintDoubleLookupTable.
+  // Bit counts used to scale the power-of-five lookup tables. The float variants are the
+  // double counts minus 64, i.e. 61.
+  static constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125;
+  static constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125;
+  static constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64);
+  static constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64);
+  // IEEE-754 binary64 layout: 52 stored mantissa bits, 11 exponent bits, exponent bias 1023.
+  // NOTE(review): these are unsigned, so expressions mixing them with int32_t values are
+  // evaluated in unsigned arithmetic before being converted back.
+  static constexpr unsigned int DOUBLE_MANTISSA_BITS = 52;
+  static constexpr unsigned int DOUBLE_EXPONENT_BITS = 11;
+  static constexpr unsigned int DOUBLE_BIAS = 1023;
+  // IEEE-754 binary32 layout: 23 stored mantissa bits, 8 exponent bits, exponent bias 127.
+  static constexpr unsigned int FLOAT_MANTISSA_BITS = 23;
+  static constexpr unsigned int FLOAT_EXPONENT_BITS = 8;
+  static constexpr unsigned int FLOAT_BIAS = 127;
+
+
+  // Counts the decimal digits of v (1 for v == 0). v must have at most 9 digits.
+  __device__ inline uint32_t decimalLength9(const uint32_t v) {
+    // Function precondition: v is not a 10-digit number.
+    // (f2s: 9 digits are sufficient for round-tripping.)
+    // (d2fixed: We print 9-digit blocks.)
+    assert(v < 1000000000);
+    uint32_t digits = 1;
+    // threshold is always 10^digits; the digit cap keeps it from growing past 10^9.
+    for (uint32_t threshold = 10; digits < 9 && v >= threshold; threshold *= 10) {
+      ++digits;
+    }
+    return digits;
+  }
+
+  // Base entries for reconstructing scaled inverse powers of five. Each row holds one 128-bit
+  // value as two 64-bit words; row 0 is {1, 2^61}.
+  // NOTE(review): presumably the words are {low, high} and row i approximates
+  // 2^k / 5^(26 * i), refined with POW5_INV_OFFSETS as in upstream Ryu's small tables -
+  // confirm against the code that consumes this table.
+  // NOTE(review): this is a non-static data member, so every ftos_converter object carries its
+  // own copy of the table.
+  const uint64_t DOUBLE_POW5_INV_SPLIT2[15][2] = {
+    {                    1u, 2305843009213693952u },
+    {  5955668970331000884u, 1784059615882449851u },
+    {  8982663654677661702u, 1380349269358112757u },
+    {  7286864317269821294u, 2135987035920910082u },
+    {  7005857020398200553u, 1652639921975621497u },
+    { 17965325103354776697u, 1278668206209430417u },
+    {  8928596168509315048u, 1978643211784836272u },
+    { 10075671573058298858u, 1530901034580419511u },
+    {   597001226353042382u, 1184477304306571148u },
+    {  1527430471115325346u, 1832889850782397517u },
+    { 12533209867169019542u, 1418129833677084982u },
+    {  5577825024675947042u, 2194449627517475473u },
+    { 11006974540203867551u, 1697873161311732311u },
+    { 10313493231639821582u, 1313665730009899186u },
+    { 12701016819766672773u, 2032799256770390445u }
+  };
+
+  // Packed correction data that accompanies DOUBLE_POW5_INV_SPLIT2.
+  // NOTE(review): looks like sixteen 2-bit offsets per 32-bit word, one per exponent, as in
+  // upstream Ryu - confirm against the code that consumes this table.
+  const uint32_t POW5_INV_OFFSETS[19] = {
+    0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555,
+    0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054,
+    0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554,
+    0x00000000
+  };
+
+  // Base entries for reconstructing scaled powers of five. Each row holds one 128-bit value as
+  // two 64-bit words; row 0 is {0, 2^60} and row 1 is {0, 5^26}.
+  // NOTE(review): presumably the words are {low, high} and row i is a normalized
+  // representation of 5^(26 * i), refined with POW5_OFFSETS as in upstream Ryu's small
+  // tables - confirm against the code that consumes this table.
+  // NOTE(review): this is a non-static data member, so every ftos_converter object carries its
+  // own copy of the table.
+  const uint64_t DOUBLE_POW5_SPLIT2[13][2] = {
+    {                    0u, 1152921504606846976u },
+    {                    0u, 1490116119384765625u },
+    {  1032610780636961552u, 1925929944387235853u },
+    {  7910200175544436838u, 1244603055572228341u },
+    { 16941905809032713930u, 1608611746708759036u },
+    { 13024893955298202172u, 2079081953128979843u },
+    {  6607496772837067824u, 1343575221513417750u },
+    { 17332926989895652603u, 1736530273035216783u },
+    { 13037379183483547984u, 2244412773384604712u },
+    {  1605989338741628675u, 1450417759929778918u },
+    {  9630225068416591280u, 1874621017369538693u },
+    {   665883850346957067u, 1211445438634777304u },
+    { 14931890668723713708u, 1565756531257009982u }
+  };
+
+  // Packed correction data that accompanies DOUBLE_POW5_SPLIT2.
+  // NOTE(review): looks like sixteen 2-bit offsets per 32-bit word, one per exponent, as in
+  // upstream Ryu - confirm against the code that consumes this table.
+  const uint32_t POW5_OFFSETS[21] = {
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995,
+    0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540,
+    0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151,
+    0x55559155, 0x51405555, 0x00000105
+  };
+
+  // Exact powers of five: entry i holds 5^i for i = 0 .. POW5_TABLE_SIZE - 1 (5^0 .. 5^25).
+  // The next power, 5^26, is left commented out at the end of the initializer.
+  // NOTE(review): the array is a non-static data member, so every ftos_converter object
+  // carries its own copy.
+  static constexpr uint32_t POW5_TABLE_SIZE = 26;
+  const uint64_t DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {
+  1ull, 5ull, 25ull, 125ull, 625ull, 3125ull, 15625ull, 78125ull, 390625ull,
+  1953125ull, 9765625ull, 48828125ull, 244140625ull, 1220703125ull, 6103515625ull,
+  30517578125ull, 152587890625ull, 762939453125ull, 3814697265625ull,
+  19073486328125ull, 95367431640625ull, 476837158203125ull,
+  2384185791015625ull, 11920928955078125ull, 59604644775390625ull,
+  298023223876953125ull //, 1490116119384765625ull
+  };
+
+  // Returns floor(log_2(5^e)), which is 0 for e == 0; requires 0 <= e <= 3528.
+  __device__ inline int32_t log2pow5(const int32_t e) {
+    // The fixed-point approximation 1217359 / 2^19 of log_2(5) holds up to the point where the
+    // 32-bit multiplication overflows at e = 3529. If the multiplication were done in 64 bits,
+    // it would fail at 5^4004 which is just greater than 2^9297.
+    assert(e >= 0);
+    assert(e <= 3528);
+    const uint32_t scaled = ((uint32_t) e) * 1217359;
+    return (int32_t) (scaled >> 19);
+  }
+
+  // Returns 1 for e == 0 and ceil(log_2(5^e)) otherwise; requires 0 <= e <= 3528.
+  __device__ inline int32_t pow5bits(const int32_t e) {
+    // The fixed-point approximation 1217359 / 2^19 of log_2(5) holds up to the point where the
+    // 32-bit multiplication overflows at e = 3529. If the multiplication were done in 64 bits,
+    // it would fail at 5^4004 which is just greater than 2^9297.
+    assert(e >= 0);
+    assert(e <= 3528);
+    const uint32_t scaled = ((uint32_t) e) * 1217359;
+    return (int32_t) ((scaled >> 19) + 1);
+  }
+
+  // Returns 1 for e == 0 and ceil(log_2(5^e)) otherwise; requires 0 <= e <= 3528.
+  __device__ inline int32_t ceil_log2pow5(const int32_t e) {
+    const int32_t floorBits = log2pow5(e);
+    return floorBits + 1;
+  }
+
+  // Returns floor(log_10(2^e)); requires 0 <= e <= 1650.
+  __device__ inline uint32_t log10Pow2(const int32_t e) {
+    // Uses the fixed-point approximation 78913 / 2^18 of log_10(2). The first value it fails
+    // for is 2^1651, which is just greater than 10^297.
+    assert(e >= 0);
+    assert(e <= 1650);
+    const uint32_t scaled = ((uint32_t) e) * 78913;
+    return scaled >> 18;
+  }
+
+  // Returns floor(log_10(5^e)); requires 0 <= e <= 2620.
+  __device__ inline uint32_t log10Pow5(const int32_t e) {
+    // Uses the fixed-point approximation 732923 / 2^20 of log_10(5). The first value it fails
+    // for is 5^2621, which is just greater than 10^1832.
+    assert(e >= 0);
+    assert(e <= 2620);
+    const uint32_t scaled = ((uint32_t) e) * 732923;
+    return scaled >> 20;
+  }
+
+  // Returns the largest k such that 5^k divides value; value must be non-zero.
+  __device__ inline uint32_t pow5factor_32(uint32_t value) {
+    uint32_t count = 0;
+    assert(value != 0);
+    // Keep dividing out factors of five for as long as the division is exact.
+    while (value % 5 == 0) {
+      value /= 5;
+      ++count;
+      assert(value != 0);
+    }
+    return count;
+  }
+
+  // Tells whether 5^p divides value, by comparing p with the number of factors of five
+  // reported by pow5factor_32().
+  __device__ inline bool multipleOfPowerOf5_32(const uint32_t value, const uint32_t p) {
+    const uint32_t factorsOfFive = pow5factor_32(value);
+    return factorsOfFive >= p;
+  }
+
+  // Returns true if value is divisible by 2^p; requires p < 32.
+  __device__ inline bool multipleOfPowerOf2_32(const uint32_t value, const uint32_t p) {
+    // Shifting a 32-bit operand by 32 or more is undefined behavior, so make the precondition
+    // explicit, as the 64-bit multipleOfPowerOf2() does for p < 64.
+    assert(p < 32);
+    // __builtin_ctz doesn't appear to be faster here.
+    return (value & ((1u << p) - 1)) == 0;
+  }
+
+  // Computes (m * factor) >> shift, keeping the low 32 bits of the result; requires shift > 32
+  // and that the shifted value fits in 32 bits. The low 32 bits of m * (low half of factor)
+  // are dropped before the shift.
+  // Avoiding uint128_t seems to be slightly faster here, although the generated code for
+  // uint128_t looks slightly nicer.
+  __device__ inline uint32_t mulShift32(const uint32_t m, const uint64_t factor, const int32_t shift) {
+    assert(shift > 32);
+
+    // Split the 64-bit factor into halves so every product fits in 64 bits. The casts also
+    // help MSVC to avoid calls to the __allmul library function.
+    const uint32_t lowHalf = (uint32_t)(factor);
+    const uint32_t highHalf = (uint32_t)(factor >> 32);
+    const uint64_t lowProduct = (uint64_t)m * lowHalf;
+    const uint64_t highProduct = (uint64_t)m * highHalf;
+
+    // Fold the carry-out of the low product into the high product, then apply the rest of the
+    // shift.
+    const uint64_t shifted = ((lowProduct >> 32) + highProduct) >> (shift - 32);
+    assert(shifted <= UINT32_MAX);
+    return (uint32_t) shifted;
+  }
+
+  // Writes the text for a special value into result and returns the number of characters
+  // written (no terminating NUL):
+  //   mantissa set               -> "NaN" (never signed)
+  //   exponent set, mantissa not -> "Infinity", prefixed with '-' when sign is set
+  //   neither set                -> "0.0", prefixed with '-' when sign is set
+  __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa) {
+    if (mantissa) {
+      memcpy(result, "NaN", 3);
+      return 3;
+    }
+    const int prefixLength = sign ? 1 : 0;
+    if (prefixLength != 0) {
+      result[0] = '-';
+    }
+    const char* const text = exponent ? "Infinity" : "0.0";
+    const int textLength = exponent ? 8 : 3;
+    memcpy(result + prefixLength, text, textLength);
+    return prefixLength + textLength;
+  }
+
+  // Returns the number of characters copy_special_str() writes for the same arguments:
+  // 3 for "NaN", 8 for "Infinity" and 3 for "0.0", plus 1 for a '-' on the last two when sign
+  // is set.
+  __device__ inline int special_str_size(const bool sign, const bool exponent, const bool mantissa) {
+    if (mantissa) {
+      return 3;
+    }
+    const int textLength = exponent ? 8 : 3;
+    return sign + textLength;
+  }
+
+  // Returns the object representation of f as a 32-bit unsigned integer, copied with memcpy.
+  __device__ inline uint32_t float_to_bits(const float f) {
+    uint32_t raw{};
+    memcpy(&raw, &f, sizeof(f));
+    return raw;
+  }
+
+  // Returns the object representation of d as a 64-bit unsigned integer, copied with memcpy.
+  __device__ inline uint64_t double_to_bits(const double d) {
+    uint64_t raw{};
+    memcpy(&raw, &d, sizeof(d));
+    return raw;
+  }
+
+  // Full 64 x 64 -> 128 bit unsigned multiplication built from four 32 x 32 -> 64 bit partial
+  // products. Returns the low 64 bits of a * b and stores the high 64 bits in *productHi.
+  __device__ inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
+    const uint64_t lowMask = 0xFFFFFFFFull;
+    const uint64_t aLow = a & lowMask;
+    const uint64_t aHigh = a >> 32;
+    const uint64_t bLow = b & lowMask;
+    const uint64_t bHigh = b >> 32;
+
+    // Partial products; each operand is below 2^32, so none of them overflows 64 bits.
+    const uint64_t lowLow = aLow * bLow;
+    const uint64_t lowHigh = aLow * bHigh;
+    const uint64_t highLow = aHigh * bLow;
+    const uint64_t highHigh = aHigh * bHigh;
+
+    // Add the two cross terms one at a time, feeding only 32-bit carries forward so that the
+    // intermediate sums cannot overflow.
+    const uint64_t crossFirst = highLow + (lowLow >> 32);
+    const uint64_t crossSecond = lowHigh + (crossFirst & lowMask);
+
+    *productHi = highHigh + (crossFirst >> 32) + (crossSecond >> 32);
+    return (crossSecond << 32) | (lowLow & lowMask);
+  }
+
+  // Shifts the 128-bit value {hi, lo} right by dist bits and returns the low 64 bits of the
+  // result; requires 0 < dist < 64.
+  __device__ inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
+    // Shift distances of 0 or of 64 and above are not supported.
+    assert(dist < 64);
+    assert(dist > 0);
+    const uint64_t fromHigh = hi << (64 - dist);
+    const uint64_t fromLow = lo >> dist;
+    return fromHigh | fromLow;
+  }
+
+  // Returns x / 5 (unsigned, truncating). d2d uses it to reduce mv modulo 5
+  // via x - 5 * div5(x).
+  __device__ inline uint64_t div5(const uint64_t x) { return x / 5; }
+
+  // Returns x / 10 (unsigned, truncating). The digit-removal loops in d2d
+  // use it to drop one decimal digit at a time.
+  __device__ inline uint64_t div10(const uint64_t x) { return x / 10; }
+
+  // Returns x / 100 (unsigned, truncating). The common-case path of d2d
+  // uses it to drop two decimal digits in one step.
+  __device__ inline uint64_t div100(const uint64_t x) { return x / 100; }
+
+  // Returns x / 10^8 (unsigned, truncating), i.e. x with its eight lowest
+  // decimal digits removed.
+  __device__ inline uint64_t div1e8(const uint64_t x) { return x / 100000000; }
+
+  // Returns x / 10^9 (unsigned, truncating), i.e. x with its nine lowest
+  // decimal digits removed. mod1e9 is built on top of it.
+  __device__ inline uint64_t div1e9(const uint64_t x) { return x / 1000000000; }
+
+  // Returns x mod 10^9, computed as x minus 10^9 times the quotient div1e9(x).
+  // The remainder is below 10^9 and therefore fits the 32-bit return type.
+  __device__ inline uint32_t mod1e9(const uint64_t x) { return static_cast<uint32_t>(x - 1000000000 * div1e9(x)); }
+
+  // Returns the exponent of the largest power of 5 that divides value (value != 0).
+  // Each step multiplies by the inverse of 5 modulo 2^64: the product is the exact
+  // quotient when 5 divides value, and is larger than floor(2^64 / 5) otherwise.
+  __device__ inline uint32_t pow5Factor(uint64_t value) {
+    const uint64_t inverse_of_5 = 14757395258967641293u; // 5 * inverse_of_5 == 1 (mod 2^64)
+    const uint64_t max_quotient = 3689348814741910323u;  // floor(2^64 / 5)
+    uint32_t fives = 0;
+    assert(value != 0);  // the inverse is invertible, so value can never become 0 later
+    for (value *= inverse_of_5; value <= max_quotient; value *= inverse_of_5) {
+      ++fives;
+    }
+    return fives;
+  }
+
+  // Returns true if value is divisible by 5^p, i.e. it has at least p factors of 5.
+  __device__ inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) {
+    const uint32_t fives_in_value = pow5Factor(value);
+    return p <= fives_in_value;
+  }
+
+  // Returns true if value is divisible by 2^p, i.e. its p lowest bits are all zero.
+  __device__ inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) {
+    assert(value != 0);
+    assert(p < 64);  // keeps the shift below within the width of the type
+    const uint64_t low_bits_mask = (1ull << p) - 1;
+    return (value & low_bits_mask) == 0;
+  }
+
+  // Returns bits [j, j + 64) of the product m * mul, where mul[1]:mul[0] is a
+  // 128-bit multiplier with mul[1] as the upper word. j - 64 is handed to
+  // shiftright128, which asserts that it lies strictly between 0 and 64.
+  __device__ inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
+    uint64_t top;
+    const uint64_t upper_lo = umul128(m, mul[1], &top);
+    uint64_t lower_hi;
+    umul128(m, mul[0], &lower_hi);  // the low word of this product is dropped
+    const uint64_t middle = lower_hi + upper_lo;
+    top += (middle < lower_hi);  // propagate the carry out of the middle word
+    return shiftright128(middle, top, j - 64);
+  }
+
+  __device__ inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,
+    uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
+    *vm = mulShift64(4 * m - 1 - mmShift, mul, j);  // scaled lower bound, stored through vm
+    *vp = mulShift64(4 * m + 2, mul, j);            // scaled upper bound, stored through vp
+    return mulShift64(4 * m, mul, j);               // scaled value itself
+  }
+
+  // Computes 5^i in the form required by Ryu, and stores it in the given pointer.
+  // result[0] receives the lower and result[1] the upper 64-bit word. The value is
+  // rebuilt from the DOUBLE_POW5_SPLIT2 row covering i, multiplied by a
+  // DOUBLE_POW5_TABLE entry, shifted right, and adjusted by a 2-bit POW5_OFFSETS field.
+  __device__ inline void double_computePow5(const uint32_t i, uint64_t* const result) {
+    const uint32_t table_row = i / POW5_TABLE_SIZE;
+    const uint32_t row_power = table_row * POW5_TABLE_SIZE;
+    const uint64_t* const coarse = DOUBLE_POW5_SPLIT2[table_row];
+    if (i == row_power) {  // i sits exactly on a stored row: copy it unchanged
+      result[0] = coarse[0];
+      result[1] = coarse[1];
+      return;
+    }
+    const uint64_t fine = DOUBLE_POW5_TABLE[i - row_power];
+    uint64_t top;
+    const uint64_t mid_lo = umul128(fine, coarse[1], &top);
+    uint64_t mid_hi;
+    const uint64_t bottom = umul128(fine, coarse[0], &mid_hi);
+    const uint64_t middle = mid_hi + mid_lo;
+    top += (middle < mid_hi);  // carry; the 192-bit product is now top | middle | bottom
+    const uint32_t shift = pow5bits(i) - pow5bits(row_power);
+    const uint64_t fixup = (POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3;
+    result[0] = shiftright128(bottom, middle, shift) + fixup;
+    result[1] = shiftright128(middle, top, shift);
+  }
+
+  // Computes 5^-i in the form required by Ryu, and stores it in the given pointer.
+  // result[0] receives the lower and result[1] the upper 64-bit word. The value is
+  // rebuilt from the DOUBLE_POW5_INV_SPLIT2 row at or above i, multiplied by a
+  // DOUBLE_POW5_TABLE entry, shifted right, and adjusted by 1 plus a 2-bit POW5_INV_OFFSETS field.
+  __device__ inline void double_computeInvPow5(const uint32_t i, uint64_t* const result) {
+    const uint32_t table_row = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE;
+    const uint32_t row_power = table_row * POW5_TABLE_SIZE;
+    const uint64_t* const coarse = DOUBLE_POW5_INV_SPLIT2[table_row]; // 1/5^row_power
+    if (i == row_power) {  // i sits exactly on a stored row: copy it unchanged
+      result[0] = coarse[0];
+      result[1] = coarse[1];
+      return;
+    }
+    const uint64_t fine = DOUBLE_POW5_TABLE[row_power - i];
+    uint64_t top;
+    const uint64_t mid_lo = umul128(fine, coarse[1], &top);
+    uint64_t mid_hi;
+    const uint64_t bottom = umul128(fine, coarse[0] - 1, &mid_hi);
+    const uint64_t middle = mid_hi + mid_lo;
+    top += (middle < mid_hi);  // carry; the 192-bit product is now top | middle | bottom
+    const uint32_t shift = pow5bits(row_power) - pow5bits(i);
+    const uint64_t fixup = (POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3;
+    result[0] = shiftright128(bottom, middle, shift) + 1 + fixup;
+    result[1] = shiftright128(middle, top, shift);
+  }
+
+  // Float helper on top of the double tables. The inverse multipliers are defined as
+  // [2^x / 5^y] + 1; the upper word from double_computeInvPow5 holds the bits of [2^x / 5^y],
+  // so 1 is added here. This relies on the table's own +1 never carrying into the upper word.
+  __device__ inline uint32_t mulPow5InvDivPow2(const uint32_t m, const uint32_t q, const int32_t j) {
+    uint64_t inverse[2];
+    double_computeInvPow5(q, inverse);
+    return mulShift32(m, inverse[1] + 1, j);
+  }
+
+  __device__ inline uint32_t mulPow5divPow2(const uint32_t m, const uint32_t i, const int32_t j) {
+    uint64_t power[2];  // filled by double_computePow5; only the upper word is used
+    double_computePow5(i, power);
+    return mulShift32(m, power[1], j);
+  }
+
+  // Returns the number of decimal digits of v (1 for v == 0).
+  // Precondition: v has at most 17 digits; 17 digits are sufficient for round-tripping
+  // a double. The thresholds are tested from high to low because, per the original
+  // note, the average output length is 16.38 digits.
+  __device__ inline uint32_t decimalLength17(const uint64_t v) {
+    assert(v < 100000000000000000ull);
+    if (v >= 10000000000000000ull) { return 17; }
+    if (v >= 1000000000000000ull) { return 16; }
+    if (v >= 100000000000000ull) { return 15; }
+    if (v >= 10000000000000ull) { return 14; }
+    if (v >= 1000000000000ull) { return 13; }
+    if (v >= 100000000000ull) { return 12; }
+    if (v >= 10000000000ull) { return 11; }
+    if (v >= 1000000000ull) { return 10; }
+    if (v >= 100000000ull) { return 9; }
+    if (v >= 10000000ull) { return 8; }
+    if (v >= 1000000ull) { return 7; }
+    if (v >= 100000ull) { return 6; }
+    if (v >= 10000ull) { return 5; }
+    if (v >= 1000ull) { return 4; }
+    if (v >= 100ull) { return 3; }
+    if (v >= 10ull) { return 2; }
+    return 1;
+  }
+  // Ryu core for doubles: converts raw IEEE mantissa/exponent fields into the shortest decimal mantissa/exponent pair inside the value's rounding interval. The wrapper d2d(double, ...) filters out zero and the all-ones exponent before calling this.
+  __device__ inline floating_decimal_64 d2d(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) {
+    int32_t e2;  // binary exponent belonging to m2, already lowered by 2 (see below)
+    uint64_t m2; // binary significand; normal numbers get the implicit leading bit OR-ed in
+    if (ieeeExponent == 0) {
+      // We subtract 2 so that the bounds computation has 2 additional bits.
+      e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
+      m2 = ieeeMantissa;
+    } else {
+      e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
+      m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
+    }
+    const bool even = (m2 & 1) == 0;
+    const bool acceptBounds = even;  // interval endpoints are usable only for even significands
+
+    // Step 2: Determine the interval of valid decimal representations.
+    const uint64_t mv = 4 * m2;
+    // Implicit bool -> int conversion. True is 1, false is 0.
+    const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
+    // We would compute mp and mm like this:
+    // uint64_t mp = 4 * m2 + 2;
+    // uint64_t mm = mv - 1 - mmShift;
+
+    // Step 3: Convert to a decimal power base using 128-bit arithmetic.
+    uint64_t vr, vp, vm;  // scaled value, upper bound and lower bound
+    int32_t e10;          // decimal exponent that goes with vr/vp/vm
+    bool vmIsTrailingZeros = false;
+    bool vrIsTrailingZeros = false;
+    if (e2 >= 0) {
+      // I tried special-casing q == 0, but there was no effect on performance.
+      // This expression is slightly faster than max(0, log10Pow2(e2) - 1).
+      const uint32_t q = log10Pow2(e2) - (e2 > 3);
+      e10 = (int32_t) q;
+      const int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
+      const int32_t i = -e2 + (int32_t) q + k;
+      uint64_t pow5[2];
+      double_computeInvPow5(q, pow5);
+      vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift);
+
+      if (q <= 21) {
+        // This should use q <= 22, but I think 21 is also safe. Smaller values
+        // may still be safe, but it's more difficult to reason about them.
+        // Only one of mp, mv, and mm can be a multiple of 5, if any.
+        const uint32_t mvMod5 = ((uint32_t) mv) - 5 * ((uint32_t) div5(mv));
+        if (mvMod5 == 0) {
+          vrIsTrailingZeros = multipleOfPowerOf5(mv, q);
+        } else if (acceptBounds) {
+          // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q
+          // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q
+          // <=> true && pow5Factor(mm) >= q, since e2 >= q.
+          vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q);
+        } else {
+          // Same as min(e2 + 1, pow5Factor(mp)) >= q.
+          vp -= multipleOfPowerOf5(mv + 2, q);
+        }
+      }
+    } else {
+      // This expression is slightly faster than max(0, log10Pow5(-e2) - 1).
+      const uint32_t q = log10Pow5(-e2) - (-e2 > 1);
+      e10 = (int32_t) q + e2;
+      const int32_t i = -e2 - (int32_t) q;
+      const int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT;
+      const int32_t j = (int32_t) q - k;
+
+      uint64_t pow5[2];
+      double_computePow5(i, pow5);
+      vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift);
+
+      if (q <= 1) {
+        // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
+        // mv = 4 * m2, so it always has at least two trailing 0 bits.
+        vrIsTrailingZeros = true;
+        if (acceptBounds) {
+          // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
+          vmIsTrailingZeros = mmShift == 1;
+        } else {
+          // mp = mv + 2, so it always has at least one trailing 0 bit.
+          --vp;
+        }
+      } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here.
+        // We want to know if the full product has at least q trailing zeros.
+        // We need to compute min(p2(mv), p5(mv) - e2) >= q
+        // <=> p2(mv) >= q && p5(mv) - e2 >= q
+        // <=> p2(mv) >= q (because -e2 >= q)
+        vrIsTrailingZeros = multipleOfPowerOf2(mv, q);
+      }
+    }
+
+    // Step 4: Find the shortest decimal representation in the interval of valid representations.
+    int32_t removed = 0;           // decimal digits dropped so far; added to e10 at the end
+    uint8_t lastRemovedDigit = 0;  // most recently dropped digit of vr, used for rounding
+    uint64_t output;
+    // On average, we remove ~2 digits.
+    if (vmIsTrailingZeros || vrIsTrailingZeros) {
+      // General case, which happens rarely (~0.7%).
+      for (;;) {
+        const uint64_t vpDiv10 = div10(vp);
+        const uint64_t vmDiv10 = div10(vm);
+        if (vpDiv10 <= vmDiv10) {
+          break;
+        }
+        const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10);
+        const uint64_t vrDiv10 = div10(vr);
+        const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
+        vmIsTrailingZeros &= vmMod10 == 0;
+        vrIsTrailingZeros &= lastRemovedDigit == 0;
+        lastRemovedDigit = (uint8_t) vrMod10;
+        vr = vrDiv10;
+        vp = vpDiv10;
+        vm = vmDiv10;
+        ++removed;
+      }
+
+      if (vmIsTrailingZeros) {
+        for (;;) {
+          const uint64_t vmDiv10 = div10(vm);
+          const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10);
+          if (vmMod10 != 0) {
+            break;
+          }
+          const uint64_t vpDiv10 = div10(vp);
+          const uint64_t vrDiv10 = div10(vr);
+          const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
+          vrIsTrailingZeros &= lastRemovedDigit == 0;
+          lastRemovedDigit = (uint8_t) vrMod10;
+          vr = vrDiv10;
+          vp = vpDiv10;
+          vm = vmDiv10;
+          ++removed;
+        }
+      }
+
+      if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
+        // Round even if the exact number is .....50..0.
+        lastRemovedDigit = 4;
+      }
+      // We need to take vr + 1 if vr is outside bounds or we need to round up.
+      output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
+    } else {
+      // Specialized for the common case (~99.3%). Percentages below are relative to this.
+      bool roundUp = false;
+      const uint64_t vpDiv100 = div100(vp);
+      const uint64_t vmDiv100 = div100(vm);
+      if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%).
+        const uint64_t vrDiv100 = div100(vr);
+        const uint32_t vrMod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrDiv100);
+        roundUp = vrMod100 >= 50;
+        vr = vrDiv100;
+        vp = vpDiv100;
+        vm = vmDiv100;
+        removed += 2;
+      }
+      // Loop iterations below (approximately), without optimization above:
+      // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02%
+      // Loop iterations below (approximately), with optimization above:
+      // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02%
+      for (;;) {
+        const uint64_t vpDiv10 = div10(vp);
+        const uint64_t vmDiv10 = div10(vm);
+        if (vpDiv10 <= vmDiv10) {
+          break;
+        }
+        const uint64_t vrDiv10 = div10(vr);
+        const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
+        roundUp = vrMod10 >= 5;
+        vr = vrDiv10;
+        vp = vpDiv10;
+        vm = vmDiv10;
+        ++removed;
+      }
+
+      // We need to take vr + 1 if vr is outside bounds or we need to round up.
+      output = vr + (vr == vm || roundUp);
+    }
+    const int32_t exp = e10 + removed;  // every dropped digit raises the decimal exponent by one
+
+    floating_decimal_64 fd;
+    fd.exponent = exp;
+    fd.mantissa = output;
+    return fd;
+  }
+  // Ryu core for floats: converts raw IEEE mantissa/exponent fields into the shortest decimal mantissa/exponent pair inside the value's rounding interval. The wrapper f2d(float, ...) filters out zero and the all-ones exponent before calling this.
+  __device__ inline floating_decimal_32 f2d(const uint32_t ieeeMantissa, const uint32_t ieeeExponent) {
+    int32_t e2;  // binary exponent belonging to m2, already lowered by 2 (see below)
+    uint32_t m2; // binary significand; normal numbers get the implicit leading bit OR-ed in
+    if (ieeeExponent == 0) {
+      // We subtract 2 so that the bounds computation has 2 additional bits.
+      e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
+      m2 = ieeeMantissa;
+    } else {
+      e2 = (int32_t) ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
+      m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa;
+    }
+    const bool even = (m2 & 1) == 0;
+    const bool acceptBounds = even;  // interval endpoints are usable only for even significands
+
+    // Step 2: Determine the interval of valid decimal representations.
+    const uint32_t mv = 4 * m2;
+    const uint32_t mp = 4 * m2 + 2;
+    // Implicit bool -> int conversion. True is 1, false is 0.
+    const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
+    const uint32_t mm = 4 * m2 - 1 - mmShift;
+
+    // Step 3: Convert to a decimal power base using 64-bit arithmetic.
+    uint32_t vr, vp, vm;  // scaled value, upper bound and lower bound
+    int32_t e10;          // decimal exponent that goes with vr/vp/vm
+    bool vmIsTrailingZeros = false;
+    bool vrIsTrailingZeros = false;
+    uint8_t lastRemovedDigit = 0;  // most recently dropped digit of vr, used for rounding
+    if (e2 >= 0) {
+      const uint32_t q = log10Pow2(e2);
+      e10 = (int32_t) q;
+      const int32_t k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
+      const int32_t i = -e2 + (int32_t) q + k;
+      vr = mulPow5InvDivPow2(mv, q, i);
+      vp = mulPow5InvDivPow2(mp, q, i);
+      vm = mulPow5InvDivPow2(mm, q, i);
+      if (q != 0 && (vp - 1) / 10 <= vm / 10) {
+        // We need to know one removed digit even if we are not going to loop below. We could use
+        // q = X - 1 above, except that would require 33 bits for the result, and we've found that
+        // 32-bit arithmetic is faster even on 64-bit machines.
+        const int32_t l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) (q - 1)) - 1;
+        lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t) q - 1 + l) % 10);
+      }
+      if (q <= 9) {
+        // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well.
+        // Only one of mp, mv, and mm can be a multiple of 5, if any.
+        if (mv % 5 == 0) {
+          vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q);
+        } else if (acceptBounds) {
+          vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q);
+        } else {
+          vp -= multipleOfPowerOf5_32(mp, q);
+        }
+      }
+    } else {
+      const uint32_t q = log10Pow5(-e2);
+      e10 = (int32_t) q + e2;
+      const int32_t i = -e2 - (int32_t) q;
+      const int32_t k = pow5bits(i) - FLOAT_POW5_BITCOUNT;
+      int32_t j = (int32_t) q - k;
+      vr = mulPow5divPow2(mv, (uint32_t) i, j);
+      vp = mulPow5divPow2(mp, (uint32_t) i, j);
+      vm = mulPow5divPow2(mm, (uint32_t) i, j);
+      if (q != 0 && (vp - 1) / 10 <= vm / 10) {
+        j = (int32_t) q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT);
+        lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, (uint32_t) (i + 1), j) % 10);
+      }
+      if (q <= 1) {
+        // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
+        // mv = 4 * m2, so it always has at least two trailing 0 bits.
+        vrIsTrailingZeros = true;
+        if (acceptBounds) {
+          // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
+          vmIsTrailingZeros = mmShift == 1;
+        } else {
+          // mp = mv + 2, so it always has at least one trailing 0 bit.
+          --vp;
+        }
+      } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here.
+        vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1);
+      }
+    }
+
+    // Step 4: Find the shortest decimal representation in the interval of valid representations.
+    int32_t removed = 0;  // decimal digits dropped so far; added to e10 at the end
+    uint32_t output;
+    if (vmIsTrailingZeros || vrIsTrailingZeros) {
+      // General case, which happens rarely (~4.0%).
+      while (vp / 10 > vm / 10) {
+        vmIsTrailingZeros &= vm % 10 == 0;
+        vrIsTrailingZeros &= lastRemovedDigit == 0;
+        lastRemovedDigit = (uint8_t) (vr % 10);
+        vr /= 10;
+        vp /= 10;
+        vm /= 10;
+        ++removed;
+      }
+      if (vmIsTrailingZeros) {
+        while (vm % 10 == 0) {
+          vrIsTrailingZeros &= lastRemovedDigit == 0;
+          lastRemovedDigit = (uint8_t) (vr % 10);
+          vr /= 10;
+          vp /= 10;
+          vm /= 10;
+          ++removed;
+        }
+      }
+      if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
+        // Round even if the exact number is .....50..0.
+        lastRemovedDigit = 4;
+      }
+      // We need to take vr + 1 if vr is outside bounds or we need to round up.
+      output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
+    } else {
+      // Specialized for the common case (~96.0%). Percentages below are relative to this.
+      // Loop iterations below (approximately):
+      // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01%
+      while (vp / 10 > vm / 10) {
+        lastRemovedDigit = (uint8_t) (vr % 10);
+        vr /= 10;
+        vp /= 10;
+        vm /= 10;
+        ++removed;
+      }
+      // We need to take vr + 1 if vr is outside bounds or we need to round up.
+      output = vr + (vr == vm || lastRemovedDigit >= 5);
+    }
+    const int32_t exp = e10 + removed;  // every dropped digit raises the decimal exponent by one
+
+    floating_decimal_32 fd;
+    fd.exponent = exp;
+    fd.mantissa = output;
+    return fd;
+  }
+  // Writes the text for decimal v (preceded by '-' when sign is set) into result and returns the number of characters written; no terminating NUL is added. d2s_size computes the same count without writing.
+  __device__ inline int to_chars(const floating_decimal_64 v, const bool sign, char* const result) {
+    // Step 5: Print the decimal representation.
+    int index = 0;  // next write position in result; also the running length
+    if (sign) {
+      result[index++] = '-';
+    }
+
+    uint64_t output = v.mantissa;
+    const uint32_t olength = decimalLength17(output);
+    int32_t exp = v.exponent + (int32_t) olength - 1;  // decimal exponent of the leading digit
+    bool scientificNotation = (exp < -3) || (exp >= 7);
+
+    // Values in the interval [1E-3, 1E7) are special.
+    if (scientificNotation) {
+      // Print in the format x.xxxxxE-yy.
+      for (uint32_t i = 0; i < olength - 1; ++i) {
+        const uint32_t c = output % 10; output /= 10;
+        result[index + olength - i] = (char) ('0' + c);  // right to left; slot index + 1 is left for '.'
+      }
+      result[index] = '0' + output % 10;  // the remaining leading digit
+      result[index + 1] = '.';
+      index += olength + 1;
+      if (olength == 1) {
+        result[index++] = '0';  // a lone digit is printed as "d.0"
+      }
+      // Print 'E', the exponent sign, and the exponent, which has at most three digits.
+      result[index++] = 'E';
+      if (exp < 0) {
+        result[index++] = '-';
+        exp = -exp;
+      }
+      if (exp >= 100) {
+          result[index++] = (char) ('0' + exp / 100);
+          exp %= 100;
+          result[index++] = (char) ('0' + exp / 10);
+        } else if (exp >= 10) {
+          result[index++] = (char) ('0' + exp / 10);
+        }
+        result[index++] = (char) ('0' + exp % 10);  // last exponent digit, written in every case
+    } else {
+      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
+      if (exp < 0) {
+        // Decimal dot is before any of the digits.
+        result[index++] = '0';
+        result[index++] = '.';
+        for (int i = -1; i > exp; i--) {
+          result[index++] = '0';  // -exp - 1 zeros between the dot and the first digit
+        }
+        int current = index;
+        for (int i = 0; i < olength; i++) {
+          result[current + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+          index++;
+        }
+      } else if (exp + 1 >= olength) {
+        // Decimal dot is after any of the digits.
+        for (int i = 0; i < olength; i++) {
+          result[index + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+        }
+        index += olength;
+        for (int i = olength; i < exp + 1; i++) {
+          result[index++] = '0';  // pad the integer part up to exp + 1 digits
+        }
+        result[index++] = '.';
+        result[index++] = '0';
+      } else {
+        // Decimal dot is somewhere between the digits.
+        int current = index + 1;  // fractional digits sit one slot further right to leave room for '.'
+        for (int i = 0; i < olength; i++) {
+          if (olength - i - 1 == exp) {
+            result[current + olength - i - 1] = '.';
+            current--;  // digits left of the dot are no longer shifted
+          }
+          result[current + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+        }
+        index += olength + 1;
+      }
+    }
+    return index;
+  }
+
+  // Returns the number of characters to_chars() writes for the same (v, sign),
+  // without writing anything. Layout mirrors to_chars: scientific notation when the
+  // decimal exponent of the leading digit is < -3 or >= 7, fixed notation otherwise.
+  __device__ inline int d2s_size(const floating_decimal_64 v, const bool sign) {
+    const uint32_t olength = decimalLength17(v.mantissa);  // number of significant digits
+    const int32_t exp = v.exponent + (int32_t) olength - 1;  // decimal exponent of the leading digit
+    int index = sign ? 1 : 0;  // room for '-'
+
+    if ((exp < -3) || (exp >= 7)) {
+      // Mantissa: all digits plus the dot, and a padding '0' for a lone digit ("d.0").
+      index += olength + 1;
+      if (olength == 1) {
+        index++;
+      }
+      // 'E', plus '-' for a negative exponent.
+      index += (exp < 0) ? 2 : 1;
+      // Exponent digits: one, two or three.
+      const int32_t magnitude = (exp < 0) ? -exp : exp;
+      if (magnitude >= 100) {
+        index += 3;
+      } else if (magnitude >= 10) {
+        index += 2;
+      } else {
+        index += 1;
+      }
+      return index;
+    }
+
+    // Fixed notation, following the Java spec for values in [1E-3, 1E7).
+    if (exp < 0) {
+      // "0." + (-exp - 1) zeros + all digits.
+      index += 1 - exp + olength;
+    } else if (exp + 1 >= olength) {
+      // Digits padded with zeros to exp + 1 places, then ".0".
+      index += exp + 3;
+    } else {
+      // All digits with a dot somewhere in between.
+      index += olength + 1;
+    }
+    return index;
+  }
+  // Writes the text for decimal v (preceded by '-' when sign is set) into result and returns the number of characters written; no terminating NUL is added. f2s_size computes the same count without writing.
+  __device__ inline int to_chars(const floating_decimal_32 v, const bool sign, char* const result) {
+    // Step 5: Print the decimal representation.
+    int index = 0;  // next write position in result; also the running length
+    if (sign) {
+      result[index++] = '-';
+    }
+
+    uint32_t output = v.mantissa;
+    const uint32_t olength = decimalLength9(output);
+    int32_t exp = v.exponent + olength - 1;  // decimal exponent of the leading digit
+    bool scientificNotation = (exp < -3) || (exp >= 7);
+
+    if (scientificNotation) {
+      // Print in the format x.xxxxxE-yy.
+      for (int i = 0; i < olength - 1; i++) {
+        int c = output % 10; output /= 10;
+        result[index + olength - i] = (char) ('0' + c);  // right to left; slot index + 1 is left for '.'
+      }
+      result[index] = (char) ('0' + output % 10);  // the remaining leading digit
+      result[index + 1] = '.';
+      index += olength + 1;
+      if (olength == 1) {
+        result[index++] = '0';  // a lone digit is printed as "d.0"
+      }
+
+      // Print 'E', the exponent sign, and the exponent, which has at most two digits.
+      result[index++] = 'E';
+      if (exp < 0) {
+        result[index++] = '-';
+        exp = -exp;
+      }
+      if (exp >= 10) {
+        result[index++] = (char) ('0' + exp / 10);
+      }
+      result[index++] = (char) ('0' + exp % 10);
+    } else {
+      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
+      if (exp < 0) {
+        // Decimal dot is before any of the digits.
+        result[index++] = '0';
+        result[index++] = '.';
+        for (int i = -1; i > exp; i--) {
+          result[index++] = '0';  // -exp - 1 zeros between the dot and the first digit
+        }
+        int current = index;
+        for (int i = 0; i < olength; i++) {
+          result[current + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+          index++;
+        }
+      } else if (exp + 1 >= olength) {
+        // Decimal dot is after any of the digits.
+        for (int i = 0; i < olength; i++) {
+          result[index + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+        }
+        index += olength;
+        for (int i = olength; i < exp + 1; i++) {
+          result[index++] = '0';  // pad the integer part up to exp + 1 digits
+        }
+        result[index++] = '.';
+        result[index++] = '0';
+      } else {
+        // Decimal dot is somewhere between the digits.
+        int current = index + 1;  // fractional digits sit one slot further right to leave room for '.'
+        for (int i = 0; i < olength; i++) {
+          if (olength - i - 1 == exp) {
+            result[current + olength - i - 1] = '.';
+            current--;  // digits left of the dot are no longer shifted
+          }
+          result[current + olength - i - 1] = (char) ('0' + output % 10);
+          output /= 10;
+        }
+        index += olength + 1;
+      }
+    }
+    return index;
+  }
+
+  // Returns the number of characters to_chars() writes for the same (v, sign),
+  // without writing anything. Layout mirrors to_chars: scientific notation when the
+  // decimal exponent of the leading digit is < -3 or >= 7, fixed notation otherwise.
+  // The exponent part is counted with at most two digits.
+  __device__ inline int f2s_size(const floating_decimal_32 v, const bool sign) {
+    const uint32_t olength = decimalLength9(v.mantissa);  // number of significant digits
+    const int32_t exp = v.exponent + olength - 1;  // decimal exponent of the leading digit
+    int index = sign ? 1 : 0;  // room for '-'
+
+    if ((exp < -3) || (exp >= 7)) {
+      // Mantissa: all digits plus the dot, and a padding '0' when there is
+      // only a single digit ("d.0").
+      index += olength + 1;
+      if (olength == 1) {
+        index++;
+      }
+      // 'E', plus '-' for a negative exponent.
+      index += (exp < 0) ? 2 : 1;
+      // Exponent digits: one or two.
+      const int32_t magnitude = (exp < 0) ? -exp : exp;
+      if (magnitude >= 10) {
+        index += 2;
+      } else {
+        index += 1;
+      }
+      return index;
+    }
+
+    // Fixed notation, following the Java spec for values in the
+    // interval [1E-3, 1E7).
+    if (exp < 0) {
+      // Decimal dot is before any of the digits: "0." + (-exp - 1) zeros + digits.
+      index += 1 - exp + olength;
+    } else if (exp + 1 >= olength) {
+      // Decimal dot is after any of the digits: padded digits + ".0".
+      index += exp + 3;
+    } else {
+      // Decimal dot is somewhere between the digits.
+      index += olength + 1;
+    }
+    return index;
+  }
+
+  // Fast path for doubles that are integers in [1, 2^53). On success stores the
+  // integer in v->mantissa with v->exponent = 0 and returns true; otherwise returns
+  // false and leaves *v untouched. The mantissa may still end in decimal zeros.
+  // The implicit leading bit is always added, as for a normal number.
+  __device__ inline bool d2d_small_int(const uint64_t ieeeMantissa, const uint32_t ieeeExponent,
+    floating_decimal_64* const v) {
+    const int32_t e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS;
+
+    // e2 > 0:   f = m2 * 2^e2 >= 2^53 is an integer, but this case is ignored for now.
+    // e2 < -52: f < 1.
+    if (e2 > 0 || e2 < -52) {
+      return false;
+    }
+
+    // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53.
+    // f is an integer exactly when the lower -e2 bits of the significand are 0.
+    const uint64_t m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
+    const int32_t fraction_bits = -e2;
+    const uint64_t fraction_mask = (1ull << fraction_bits) - 1;
+    if ((m2 & fraction_mask) != 0) {
+      return false;
+    }
+
+    // f is an integer in the range [1, 2^53).
+    // Note: mantissa might contain trailing (decimal) 0's.
+    // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17().
+    v->mantissa = m2 >> fraction_bits;
+    v->exponent = 0;
+    return true;
+  }
+
+
+  // Decodes f and converts it to a decimal mantissa/exponent pair.
+  // ieeeSign receives the sign bit. special is set when f is zero or has an all-ones
+  // exponent field; in that case the returned struct carries the raw IEEE mantissa
+  // and exponent fields instead of a decimal value.
+  __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) {
+    // Step 1: Decode the floating-point number into sign, mantissa, and exponent.
+    const uint64_t bits = double_to_bits(f);
+    const uint32_t exponent_mask = (1u << DOUBLE_EXPONENT_BITS) - 1u;
+    ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0;
+    const uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1);
+    const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & exponent_mask);
+
+    // Case distinction; exit early for the easy cases.
+    const bool all_ones_exponent = ieeeExponent == exponent_mask;
+    const bool is_zero = ieeeExponent == 0 && ieeeMantissa == 0;
+    special = all_ones_exponent || is_zero;
+    if (special) {
+      return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent};
+    }
+
+    floating_decimal_64 v;
+    if (!d2d_small_int(ieeeMantissa, ieeeExponent, &v)) {
+      return d2d(ieeeMantissa, ieeeExponent);
+    }
+
+    // For small integers in the range [1, 2^53), v.mantissa might contain trailing
+    // (decimal) zeros. They are moved into the exponent here, which scientific
+    // notation needs; fixed-point output would not require it.
+    for (uint64_t q = div10(v.mantissa); v.mantissa == 10 * q; q = div10(v.mantissa)) {
+      v.mantissa = q;
+      ++v.exponent;
+    }
+    return v;
+  }
+
+
+  // Writes the text for the double f into result and returns the number of characters written.
+  __device__ int d2s_buffered_n(double f, char* result) {
+    bool negative = false;
+    bool is_special = false;
+    const floating_decimal_64 decimal = d2d(f, negative, is_special);
+    return is_special ? copy_special_str(result, negative, decimal.exponent, decimal.mantissa)
+                      : to_chars(decimal, negative, result);
+  }
+
+  // Returns the length of the text for the double value without writing it.
+  __device__ int compute_d2s_size(double value) {
+    bool negative = false;
+    bool is_special = false;
+    const floating_decimal_64 decimal = d2d(value, negative, is_special);
+    return is_special ? special_str_size(negative, decimal.exponent, decimal.mantissa)
+                      : d2s_size(decimal, negative);
+  }
+
+  // Decodes f and converts it to a decimal mantissa/exponent pair. ieeeSign receives
+  // the sign bit. special is set when f is zero or has an all-ones exponent field; the
+  // returned struct then carries the raw IEEE mantissa and exponent fields instead.
+  __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) {
+    const uint32_t bits = float_to_bits(f);
+    const uint32_t exponent_mask = (1u << FLOAT_EXPONENT_BITS) - 1u;
+    ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0;
+    const uint32_t ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1);
+    const uint32_t ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & exponent_mask;
+
+    // Case distinction; exit early for the easy cases.
+    special = (ieeeExponent == exponent_mask) || (ieeeExponent == 0 && ieeeMantissa == 0);
+    if (special) {
+      return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent};
+    }
+    return f2d(ieeeMantissa, ieeeExponent);
+  }
+
+  // Writes the text for the float f into result and returns the number of characters written.
+  __device__ int f2s_buffered_n(float f, char* result) {
+    bool negative = false;
+    bool is_special = false;
+    const floating_decimal_32 decimal = f2d(f, negative, is_special);
+    return is_special ? copy_special_str(result, negative, decimal.exponent, decimal.mantissa)
+                      : to_chars(decimal, negative, result);
+  }
+
+  // Returns the length of the text for the float value without writing it.
+  __device__ int compute_f2s_size(float value) {
+    bool negative = false;
+    bool is_special = false;
+    const floating_decimal_32 decimal = f2d(value, negative, is_special);
+    return is_special ? special_str_size(negative, decimal.exponent, decimal.mantissa)
+                      : f2s_size(decimal, negative);
+  }
+
+  // Returns the output length for value. With is_float set, value is narrowed to
+  // float and sized by the 32-bit routine; otherwise the 64-bit routine is used.
+  __device__ int compute_ftos_size(double value, bool is_float) {
+    if (!is_float) { return compute_d2s_size(value); }
+    const float narrowed = static_cast<float>(value);  // same conversion the call would do implicitly
+    return compute_f2s_size(narrowed);
+  }
+
+  // Writes the text for value into output and returns its length. With is_float set,
+  // value is narrowed to float and formatted by the 32-bit path, else by the 64-bit one.
+  __device__ int float_to_string(double value, char* output, bool is_float) {
+    if (!is_float) { return d2s_buffered_n(value, output); }
+    const float narrowed = static_cast<float>(value);  // same conversion the call would do implicitly
+    return f2s_buffered_n(narrowed, output);
+  }
+
+};
+
+}
+}
+}
\ No newline at end of file

From da2197b826b86b3520150ab4dbddedc9f1f09417 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Thu, 16 Nov 2023 17:43:58 +0800
Subject: [PATCH 20/54] Add copyright and notice

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 NOTICE                                        | 21 +++++++++++++++++++
 src/main/cpp/src/CastStringJni.cpp            |  2 +-
 src/main/cpp/src/cast_string.hpp              |  2 +-
 .../nvidia/spark/rapids/jni/CastStrings.java  |  2 +-
 4 files changed, 24 insertions(+), 3 deletions(-)
 create mode 100644 NOTICE

diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000000..53333b52c5
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,21 @@
+RAPIDS Accelerator JNI For Apache Spark
+Copyright (c) 2022-2023, NVIDIA CORPORATION
+
+--------------------------------------------------------------------------------
+
+This project includes code from ryu (https://github.com/ulfjack/ryu).
+
+ryu
+Copyright (2018) Ulf Adams and contributors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index ff8ee2afd4..093b51188b 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp
index fc2270ca8c..c4f850b47f 100644
--- a/src/main/cpp/src/cast_string.hpp
+++ b/src/main/cpp/src/cast_string.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index 3002e1cdab..022cb93085 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 2c6cdcbfbf220dad145468ee003b6ca3ea6f4116 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Fri, 17 Nov 2023 10:31:26 +0800
Subject: [PATCH 21/54] Fix copyrights and license

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/ftos_converter.cu | 97 ++++++++++++++++++++----------
 thirdparty/cudf                    |  2 +-
 2 files changed, 67 insertions(+), 32 deletions(-)

diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu
index 2d5424319e..1ff5fe8543 100644
--- a/src/main/cpp/src/ftos_converter.cu
+++ b/src/main/cpp/src/ftos_converter.cu
@@ -1,7 +1,24 @@
-/* Not a contribution
- * Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as
- * NVIDIA-proprietary are not a contribution and subject to the following terms and conditions:
- *
+// Copyright 2018 Ulf Adams
+//
+// The contents of this file may be used under the terms of the Apache License,
+// Version 2.0.
+//
+//    (See accompanying file LICENSE-Apache or copy at
+//     http://www.apache.org/licenses/LICENSE-2.0)
+//
+// Alternatively, the contents of this file may be used under the terms of
+// the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE-Boost or copy at
+//     https://www.boost.org/LICENSE_1_0.txt)
+//
+// Unless required by applicable law or agreed to in writing, this software
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
+
+//  Not a contribution
+//  Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as
+//  NVIDIA-proprietary are not a contribution and subject to the following terms and conditions:
+/*
  * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
  *
@@ -24,6 +41,7 @@ namespace spark_rapids_jni {
 namespace detail {
 namespace {
 
+// d2s.c from ryu
 // A floating decimal representing m * 10^e.
 typedef struct floating_decimal_64 {
   uint64_t mantissa;
@@ -32,6 +50,7 @@ typedef struct floating_decimal_64 {
   int32_t exponent;
 } floating_decimal_64;
 
+// f2s.c from ryu
 // A floating decimal representing m * 10^e.
 typedef struct floating_decimal_32 {
   uint32_t mantissa;
@@ -42,6 +61,8 @@ typedef struct floating_decimal_32 {
 
 struct ftos_converter {
 
+  //===== constants from ryu =====
+
   // These tables are generated by PrintDoubleLookupTable.
   static constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125;
   static constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125;
@@ -54,24 +75,6 @@ struct ftos_converter {
   static constexpr unsigned int FLOAT_EXPONENT_BITS = 8;
   static constexpr unsigned int FLOAT_BIAS = 127;
 
-
-  // Returns the number of decimal digits in v, which must not contain more than 9 digits.
-  __device__ inline uint32_t decimalLength9(const uint32_t v) {
-    // Function precondition: v is not a 10-digit number.
-    // (f2s: 9 digits are sufficient for round-tripping.)
-    // (d2fixed: We print 9-digit blocks.)
-    assert(v < 1000000000);
-    if (v >= 100000000) { return 9; }
-    if (v >= 10000000) { return 8; }
-    if (v >= 1000000) { return 7; }
-    if (v >= 100000) { return 6; }
-    if (v >= 10000) { return 5; }
-    if (v >= 1000) { return 4; }
-    if (v >= 100) { return 3; }
-    if (v >= 10) { return 2; }
-    return 1;
-  }
-
   const uint64_t DOUBLE_POW5_INV_SPLIT2[15][2] = {
     {                    1u, 2305843009213693952u },
     {  5955668970331000884u, 1784059615882449851u },
@@ -130,6 +133,25 @@ struct ftos_converter {
   298023223876953125ull //, 1490116119384765625ull
   };
 
+  //===== common.h from ryu =====
+
+  // Counts the decimal digits of v; callers must pass a value with at most 9 digits.
+  __device__ inline uint32_t decimalLength9(const uint32_t v) {
+    // Precondition: v is below 10^9, i.e. it is not a 10-digit number.
+    // (f2s: 9 digits are sufficient for round-tripping.)
+    // (d2fixed: We print 9-digit blocks.)
+    assert(v < 1000000000);
+    // Compare against the thresholds from the largest down; the first one reached wins.
+    return (v >= 100000000) ? 9
+         : (v >= 10000000)  ? 8
+         : (v >= 1000000)   ? 7
+         : (v >= 100000)    ? 6
+         : (v >= 10000)     ? 5
+         : (v >= 1000)      ? 4
+         : (v >= 100)       ? 3
+         : (v >= 10)        ? 2 : 1;
+  }
+
   // Returns e == 0 ? 1 : [log_2(5^e)]; requires 0 <= e <= 3528.
   __device__ inline int32_t log2pow5(const int32_t e) {
     // This approximation works up to the point that the multiplication overflows at e = 3529.
@@ -254,6 +276,8 @@ struct ftos_converter {
     return bits;
   }
 
+  //===== d2s_intrinsics.h from ryu =====
+
   __device__ inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
     // The casts here help MSVC to avoid calls to the __allmul library function.
     const uint32_t aLo = (uint32_t)a;
@@ -363,6 +387,8 @@ struct ftos_converter {
     return mulShift64(4 * m, mul, j);
   }
 
+  //===== d2s_small_table.h from ryu =====
+
   // Computes 5^i in the form required by Ryu, and stores it in the given pointer.
   __device__ inline void double_computePow5(const uint32_t i, uint64_t* const result) {
     const uint32_t base = i / POW5_TABLE_SIZE;
@@ -415,6 +441,8 @@ struct ftos_converter {
     result[1] = shiftright128(sum, high1, delta);
   }
 
+  //===== f2s_intrinsics.h from ryu =====
+
   __device__ inline uint32_t mulPow5InvDivPow2(const uint32_t m, const uint32_t q, const int32_t j) {
     // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double lookup
     // table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely on the
@@ -430,6 +458,8 @@ struct ftos_converter {
     return mulShift32(m, pow5[1], j);
   }
 
+  //===== d2s.c and f2s.c from ryu =====
+
   __device__ inline uint32_t decimalLength17(const uint64_t v) {
     // This is slightly faster than a loop.
     // The average output length is 16.38 digits, so we check high-to-low.
@@ -1094,15 +1124,6 @@ struct ftos_converter {
     return to_chars(v, sign, result);
   }
 
-  __device__ int compute_d2s_size(double value) {
-    bool sign = false, special = false;
-    floating_decimal_64 v = d2d(value, sign, special);
-    if (special) {
-      return special_str_size(sign, v.exponent, v.mantissa);
-    }
-    return d2s_size(v, sign);
-  }
-
   __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) {
     // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
     const uint32_t bits = float_to_bits(f);
@@ -1130,6 +1151,18 @@ struct ftos_converter {
     return to_chars(v, sign, result);
   }
 
+
+  //===== compute float to string size =====
+
+  __device__ int compute_d2s_size(double value) {
+    // Decode once, then size either the special-value text or the regular digits.
+    bool negative = false;
+    bool is_special = false;
+    floating_decimal_64 decoded = d2d(value, negative, is_special);
+    return is_special ? special_str_size(negative, decoded.exponent, decoded.mantissa)
+                      : d2s_size(decoded, negative);
+  }
+
   __device__ int compute_f2s_size(float value) {
     bool sign = false, special = false;
     floating_decimal_32 v = f2d(value, sign, special);
@@ -1139,6 +1172,8 @@ struct ftos_converter {
     return f2s_size(v, sign);
   }
 
+  //===== APIs =====
+
   __device__ int compute_ftos_size(double value, bool is_float) {
     if (is_float) {
         return compute_f2s_size(value);
diff --git a/thirdparty/cudf b/thirdparty/cudf
index 4313cfa9b3..87d2a36f04 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc
+Subproject commit 87d2a36f04f431a8c5236d2aee723ec79b9dc5f9

From 32287554fd016cf77ce737f9d61a7e89724250e4 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Fri, 17 Nov 2023 12:02:13 +0800
Subject: [PATCH 22/54] cudf conflict resolve

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 87d2a36f04..4313cfa9b3 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 87d2a36f04f431a8c5236d2aee723ec79b9dc5f9
+Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc

From d7be0d7cc1f89e489ac6a4a74a4f6c8650500e48 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Fri, 17 Nov 2023 18:08:09 +0800
Subject: [PATCH 23/54] Add format_float kernel

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/format_float.cu    |   2 +-
 src/main/cpp/src/ftos_converter.cu  | 381 +++++++++++++++++++++++++++-
 src/main/cpp/tests/format_float.cpp |  39 ++-
 3 files changed, 409 insertions(+), 13 deletions(-)

diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu
index 2590991629..cdac863553 100644
--- a/src/main/cpp/src/format_float.cu
+++ b/src/main/cpp/src/format_float.cu
@@ -51,7 +51,7 @@ struct format_float_fn {
   {
     ftos_converter fts;
     bool is_float = std::is_same_v<FloatType, float>;
-    return static_cast<size_type>(fts.compute_ftos_size(static_cast<double>(value), digits, is_float));
+    return static_cast<size_type>(fts.compute_format_float_size(static_cast<double>(value), digits, is_float));
   }
 
   __device__ void format_float(size_type idx, int digits)
diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu
index 1ff5fe8543..69197fdf64 100644
--- a/src/main/cpp/src/ftos_converter.cu
+++ b/src/main/cpp/src/ftos_converter.cu
@@ -238,7 +238,7 @@ struct ftos_converter {
 
   }
 
-  __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa) {
+  __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa, const int d=1) {
     if (mantissa) {
       memcpy(result, "NaN", 3);
       return 3;
@@ -250,18 +250,21 @@ struct ftos_converter {
       memcpy(result + sign, "Infinity", 8);
       return sign + 8;
     }
-    memcpy(result + sign, "0.0", 3);
-    return sign + 3;
+    memcpy(result + sign, "0.", 2);
+    for (int i = 0; i < d; i++) {
+      result[sign + 2 + i] = '0';
+    }
+    return sign + 2 + d;
   }
 
-  __device__ inline int special_str_size(const bool sign, const bool exponent, const bool mantissa) {
+  __device__ inline int special_str_size(const bool sign, const bool exponent, const bool mantissa, const int d=1) {
     if (mantissa) {
       return 3;
     }
     if (exponent) {
       return sign + 8;
     }
-    return sign + 3;
+    return sign + 2 + d;
   }
 
   __device__ inline uint32_t float_to_bits(const float f) {
@@ -1189,6 +1192,374 @@ struct ftos_converter {
           return d2s_buffered_n(value, output);
       }
   }
+
+  //===== format float =====
+
+  const uint64_t POW10_TABLE[19] = {  // POW10_TABLE[i] == 10^i for i = 0..18 (all 19 slots initialised)
+  1ull, 10ull, 100ull, 1000ull, 10000ull, 100000ull, 1000000ull, 10000000ull,
+  100000000ull, 1000000000ull, 10000000000ull, 100000000000ull, 1000000000000ull,
+  10000000000000ull, 100000000000000ull, 1000000000000000ull, 10000000000000000ull,
+  100000000000000000ull, 1000000000000000000ull
+  };
+
+  template<typename T>
+  __device__ inline T round_half_even(const T input, const int olength, const int d) {
+    // Keeps the first d of input's olength digits, rounding half to even on the dropped part.
+    if (d > olength) {
+      T scaled = input;
+      for (int pad = d - olength; pad > 0; pad--) { scaled *= 10; }
+      return scaled;
+    }
+    const T divisor = POW10_TABLE[olength - d];
+    const T half = divisor / 2;
+    const T dropped = input % divisor;
+    T kept = input / divisor;
+    // Round up past the midpoint, or on an exact non-zero midpoint when kept is odd.
+    const bool odd_tie = (dropped == half) && (dropped != 0) && (kept % 2 == 1);
+    if (dropped > half || odd_tie) { kept++; }
+    return kept;
+  }
+
+  __device__ inline int to_formated_chars(const floating_decimal_64 v, const bool sign, char* const result, int d=10) {
+    // Writes v with ',' every three integer digits and d fraction digits; returns the end index in result.
+    int index = 0;
+    if (sign) {
+      result[index++] = '-';
+    }
+    uint64_t output = v.mantissa;
+    const uint32_t olength = decimalLength17(output);
+    int32_t exp = v.exponent + (int32_t) olength - 1;  // decimal exponent of the leading digit
+    if (exp < 0) {
+      // Decimal dot is before any of the digits.
+      int index_for_carrier = index;
+      result[index++] = '0';
+      result[index++] = '.';
+      // Zeros between the dot and the first significant digit; at most d of them are printed.
+      const int leading_zeros = -exp - 1;
+      const int printed_zeros = leading_zeros < d ? leading_zeros : d;
+      for (int i = 0; i < printed_zeros; i++) {
+        index_for_carrier = index;
+        result[index++] = '0';
+      }
+      int actural_round = d - printed_zeros;
+      int actural_olength = actural_round < (int) olength ? actural_round : (int) olength;
+      // Round only if digits are dropped: with zeros left over the value is below half a unit of
+      // the last printed place (-> 0); when every digit fits they are printed as-is and padded.
+      uint64_t rounded_output = (leading_zeros > d) ? 0 : output;
+      if (leading_zeros <= d && actural_round < (int) olength) {
+        rounded_output = round_half_even(output, olength, actural_round);
+      }
+      // check if carry
+      if (rounded_output >= POW10_TABLE[actural_olength]) {
+        result[index_for_carrier] = '1';
+        rounded_output -= POW10_TABLE[actural_olength];
+      }
+      for (int i = actural_olength - 1; i >= 0; i--) {
+        result[index + i] = (char) ('0' + rounded_output % 10);
+        rounded_output /= 10;
+      }
+      index += actural_olength;
+      for (int i = actural_olength; i < actural_round; i++) {
+        result[index++] = '0';
+      }
+    } else if (exp + 1 >= olength) {
+      // Decimal dot is after any of the digits.
+      int integer_len = index + exp + 1 + (exp - index) / 3;
+      int sep_cnt = 0;
+      int rev_index = 0;
+      for (int i = olength; i < exp + 1; i++) {
+        result[integer_len - (rev_index++) - 1] = '0';  // zeros below the mantissa digits, filled right to left
+        sep_cnt++;
+        if (sep_cnt == 3) {
+            result[integer_len - (rev_index++) - 1] = ',';
+            sep_cnt = 0;
+        }
+      }
+      for (int i = 0; i < olength; i++) {  // mantissa digits, least significant first
+        if (sep_cnt == 3) {
+          result[integer_len - (rev_index++) - 1] = ',';
+          sep_cnt = 0;
+        }
+        result[integer_len - (rev_index++) - 1] = (char) ('0' + output % 10);
+        sep_cnt++;
+        output /= 10;
+      }
+      index = integer_len;
+      result[index++] = '.';
+      for (int i = 0; i < d; i++) {  // the fraction is all zeros in this branch
+        result[index++] = '0';
+      }
+    } else {  // the decimal dot falls between the digits
+      uint32_t temp_d = d, tailing_zero = 0;  // temp_d: fraction digits taken from the mantissa; tailing_zero: '0' padding
+      if (exp + d > olength) {
+        temp_d = olength - exp;
+        tailing_zero = d - temp_d;
+      }
+      uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1);
+      uint64_t pow10 = POW10_TABLE[temp_d];
+      uint64_t integer = rounded_output / pow10;  // digits in front of the decimal dot
+      uint64_t decimal = rounded_output % pow10;  // digits behind the decimal dot
+      uint32_t integer_len = decimalLength17(integer);
+      uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3;  // digits plus one ',' per full group of three
+      uint32_t sep_cnt = 0;
+      int rev_index = 0;
+      for (int i = 0; i < integer_len; i++) {
+        if (sep_cnt == 3) {
+          result[formated_integer_len - (rev_index++) - 1] = ',';
+          sep_cnt = 0;
+        }
+        result[formated_integer_len - (rev_index++) - 1] = (char) ('0' + integer % 10);
+        sep_cnt++;
+        integer /= 10;
+      }
+      index = formated_integer_len;
+      result[index++] = '.';
+      int current = index;
+      for (int i = 0; i < tailing_zero; i++) {  // rightmost fraction positions are padded with '0'
+        result[current + d - i - 1] = '0';
+        index++;
+      }
+      for (int i = tailing_zero; i < d; i++) {  // remaining positions take the fraction digits, right to left
+        result[current + d - i - 1] = (char) ('0' + decimal % 10);
+        decimal /= 10;
+        index++;
+      }
+    }
+    return index;
+  }
+
+  __device__ inline int format_float_size(const floating_decimal_64 v, const bool sign, int d=10) {
+    // Size-only walk over the same three cases as to_formated_chars: counts characters, writes nothing.
+    int index = 0;  // running character count
+    if (sign) {
+      index++;  // room for the leading '-'
+    }
+    uint64_t output = v.mantissa;
+    const uint32_t olength = decimalLength17(output);  // number of decimal digits in the mantissa
+    int32_t exp = v.exponent + (int32_t) olength - 1;  // decimal exponent of the leading digit
+    if (exp < 0) {
+      // Decimal dot is before any of the digits.
+      int index_for_carrier = index;  // not read anywhere in this function
+      index+=2;
+      int actural_round = d;
+      index += exp + 1;
+      int actural_olength = fmin(int(olength), actural_round);
+      index += actural_olength;
+      actural_round -= actural_olength;
+      if (actural_round > 0) {  // fraction positions left after the counted digits
+        index += actural_round;
+      }
+    } else if (exp + 1 >= olength) {
+      // Decimal dot is after any of the digits.
+      int integer_len = index + exp + 1 + (exp - index) / 3;
+      index = integer_len;
+      index++;
+      index += d;
+    } else {  // the decimal dot falls between the digits
+      uint32_t temp_d = d, tailing_zero = 0;  // tailing_zero is assigned below but never read here
+      if (exp + d > olength) {
+        temp_d = olength - exp;
+        tailing_zero = d - temp_d;
+      }
+      uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1);  // integer_len below is taken from the rounded value
+      uint64_t pow10 = POW10_TABLE[temp_d];
+      uint64_t integer = rounded_output / pow10;  // digits in front of the decimal dot
+      uint32_t integer_len = decimalLength17(integer);
+      uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3;  // digits plus one ',' per full group of three
+      index = formated_integer_len;
+      index++;  // the '.'
+      index += d;  // d fraction digits
+    }
+    return index;
+  }
+
+  __device__ inline int to_formated_chars(const floating_decimal_32 v, const bool sign, char* const result, int d=10) {
+    // Writes v with ',' every three integer digits and d fraction digits; returns the end index in result.
+    int index = 0;
+    if (sign) {
+      result[index++] = '-';
+    }
+    uint32_t output = v.mantissa;
+    const uint32_t olength = decimalLength9(output);
+    int32_t exp = v.exponent + (int32_t) olength - 1;  // decimal exponent of the leading digit
+    if (exp < 0) {
+      // Decimal dot is before any of the digits.
+      int index_for_carrier = index;
+      result[index++] = '0';
+      result[index++] = '.';
+      // Zeros between the dot and the first significant digit; at most d of them are printed.
+      const int leading_zeros = -exp - 1;
+      const int printed_zeros = leading_zeros < d ? leading_zeros : d;
+      for (int i = 0; i < printed_zeros; i++) {
+        index_for_carrier = index;
+        result[index++] = '0';
+      }
+      int actural_round = d - printed_zeros;
+      int actural_olength = actural_round < (int) olength ? actural_round : (int) olength;
+      // Round only if digits are dropped: with zeros left over the value is below half a unit of
+      // the last printed place (-> 0); when every digit fits they are printed as-is and padded.
+      uint64_t rounded_output = (leading_zeros > d) ? 0 : output;
+      if (leading_zeros <= d && actural_round < (int) olength) {
+        rounded_output = round_half_even(output, olength, actural_round);
+      }
+      // check if carry
+      if (rounded_output >= POW10_TABLE[actural_olength]) {
+        result[index_for_carrier] = '1';
+        rounded_output -= POW10_TABLE[actural_olength];
+      }
+      for (int i = actural_olength - 1; i >= 0; i--) {
+        result[index + i] = (char) ('0' + rounded_output % 10);
+        rounded_output /= 10;
+      }
+      index += actural_olength;
+      for (int i = actural_olength; i < actural_round; i++) {
+        result[index++] = '0';
+      }
+    } else if (exp + 1 >= olength) {
+      // Decimal dot is after any of the digits.
+      int integer_len = index + exp + 1 + (exp - index) / 3;
+      int sep_cnt = 0;
+      int rev_index = 0;
+      for (int i = olength; i < exp + 1; i++) {
+        result[integer_len - (rev_index++) - 1] = '0';  // zeros below the mantissa digits, filled right to left
+        sep_cnt++;
+        if (sep_cnt == 3) {
+            result[integer_len - (rev_index++) - 1] = ',';
+            sep_cnt = 0;
+        }
+      }
+      for (int i = 0; i < olength; i++) {  // mantissa digits, least significant first
+        if (sep_cnt == 3) {
+          result[integer_len - (rev_index++) - 1] = ',';
+          sep_cnt = 0;
+        }
+        result[integer_len - (rev_index++) - 1] = (char) ('0' + output % 10);
+        sep_cnt++;
+        output /= 10;
+      }
+      index = integer_len;
+      result[index++] = '.';
+      for (int i = 0; i < d; i++) {  // the fraction is all zeros in this branch
+        result[index++] = '0';
+      }
+    } else {  // the decimal dot falls between the digits
+      uint32_t temp_d = d, tailing_zero = 0;  // temp_d: fraction digits taken from the mantissa; tailing_zero: '0' padding
+      if (exp + d > olength) {
+        temp_d = olength - exp;
+        tailing_zero = d - temp_d;
+      }
+      uint32_t rounded_output = round_half_even(output, olength, exp+temp_d+1);  // NOTE(review): 32-bit; output * 10 may wrap for large 9-digit mantissas - confirm together with format_float_size
+      uint32_t pow10 = POW10_TABLE[temp_d];
+      uint32_t integer = rounded_output / pow10;  // digits in front of the decimal dot
+      uint32_t decimal = rounded_output % pow10;  // digits behind the decimal dot
+      uint32_t integer_len = decimalLength9(integer);
+      uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3;  // digits plus one ',' per full group of three
+      uint32_t sep_cnt = 0;
+      int rev_index = 0;
+      for (int i = 0; i < integer_len; i++) {
+        if (sep_cnt == 3) {
+          result[formated_integer_len - (rev_index++) - 1] = ',';
+          sep_cnt = 0;
+        }
+        result[formated_integer_len - (rev_index++) - 1] = (char) ('0' + integer % 10);
+        sep_cnt++;
+        integer /= 10;
+      }
+      index = formated_integer_len;
+      result[index++] = '.';
+      int current = index;
+      for (int i = 0; i < tailing_zero; i++) {  // rightmost fraction positions are padded with '0'
+        result[current + d - i - 1] = '0';
+        index++;
+      }
+      for (int i = tailing_zero; i < d; i++) {  // remaining positions take the fraction digits, right to left
+        result[current + d - i - 1] = (char) ('0' + decimal % 10);
+        decimal /= 10;
+        index++;
+      }
+    }
+    return index;
+  }
+
+  __device__ inline int format_float_size(const floating_decimal_32 v, const bool sign, int d=10) {
+    // Size-only walk over the same three cases as to_formated_chars: counts characters, writes nothing.
+    int index = 0;  // running character count
+    if (sign) {
+      index++;  // room for the leading '-'
+    }
+    uint64_t output = v.mantissa;  // widened to 64 bits, so round_half_even below is instantiated with T = uint64_t
+    const uint32_t olength = decimalLength9(output);  // number of decimal digits in the mantissa
+    int32_t exp = v.exponent + (int32_t) olength - 1;  // decimal exponent of the leading digit
+    if (exp < 0) {
+      // Decimal dot is before any of the digits.
+      int index_for_carrier = index;  // not read anywhere in this function
+      index+=2;
+      int actural_round = d;
+      index += exp + 1;
+      int actural_olength = fmin(int(olength), actural_round);
+      index += actural_olength;
+      actural_round -= actural_olength;
+      if (actural_round > 0) {  // fraction positions left after the counted digits
+        index += actural_round;
+      }
+    } else if (exp + 1 >= olength) {
+      // Decimal dot is after any of the digits.
+      int integer_len = index + exp + 1 + (exp - index) / 3;
+      index = integer_len;
+      index++;
+      index += d;
+    } else {  // the decimal dot falls between the digits
+      uint32_t temp_d = d, tailing_zero = 0;  // tailing_zero is assigned below but never read here
+      if (exp + d > olength) {
+        temp_d = olength - exp;
+        tailing_zero = d - temp_d;
+      }
+      uint32_t rounded_output = round_half_even(output, olength, exp+temp_d+1);  // 64-bit result stored in 32 bits
+      uint32_t pow10 = POW10_TABLE[temp_d];
+      uint32_t integer = rounded_output / pow10;  // digits in front of the decimal dot
+      uint32_t integer_len = decimalLength9(integer);
+      uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3;  // digits plus one ',' per full group of three
+      index = formated_integer_len;
+      index++;  // the '.'
+      index += d;  // d fraction digits
+    }
+    return index;
+  }  
+
+  __device__ int compute_format_float_size(double value, int d, bool is_float) {
+    // Returns the size reported for `value` formatted with `d` fraction digits.
+    // Special values are sized by special_str_size, everything else by format_float_size.
+    bool negative = false;
+    bool is_special = false;
+    if (!is_float) {
+      // Double input: decode with d2d.
+      floating_decimal_64 dec64 = d2d(value, negative, is_special);
+      return is_special ? special_str_size(negative, dec64.exponent, dec64.mantissa, d)
+                        : format_float_size(dec64, negative, d);
+    }
+    // Float input: the call to f2d converts `value` to float before decoding.
+    floating_decimal_32 dec32 = f2d(value, negative, is_special);
+    return is_special ? special_str_size(negative, dec32.exponent, dec32.mantissa, d)
+                      : format_float_size(dec32, negative, d);
+  }
+
+  __device__ int format_float(double value, int d, char* output, bool is_float) {
+    // Writes `value` into `output` with `d` fraction digits and returns the helper's int result.
+    // Special values go through copy_special_str, everything else through to_formated_chars.
+    bool negative = false;
+    bool is_special = false;
+    if (!is_float) {
+      // Double input: decode with d2d.
+      floating_decimal_64 dec64 = d2d(value, negative, is_special);
+      return is_special ? copy_special_str(output, negative, dec64.exponent, dec64.mantissa, d)
+                        : to_formated_chars(dec64, negative, output, d);
+    }
+    // Float input: the call to f2d converts `value` to float before decoding.
+    floating_decimal_32 dec32 = f2d(value, negative, is_special);
+    return is_special ? copy_special_str(output, negative, dec32.exponent, dec32.mantissa, d)
+                      : to_formated_chars(dec32, negative, output, d);
+  }
+
 };
 
 }
diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp
index 3e03578f4c..459d2e0b7f 100644
--- a/src/main/cpp/tests/format_float.cpp
+++ b/src/main/cpp/tests/format_float.cpp
@@ -43,9 +43,10 @@ TEST_F(FormatFloatTests, FormatFloats32)
                               -4,
                               std::numeric_limits<float>::quiet_NaN(),
                               123456789012.34,
-                              -0.0};
+                              -0.0
+                              };
   std::vector<char const*> h_expected{
-    "100.0", "654,321.25", "-12,761.125", "0.0", "5.0", "-4.0", "NaN", "8.3954222323279E11", "-0.0"};
+    "100.00000", "654,321.25000", "-12,761.12500", "0.00000", "5.00000", "-4.00000", "NaN", "123,456,790,000.00000", "-0.00000"};
 
   cudf::test::fixed_width_column_wrapper<float> floats(
     h_floats.begin(),
@@ -64,20 +65,44 @@ TEST_F(FormatFloatTests, FormatFloats32)
 
 TEST_F(FormatFloatTests, FormatFloats64)
 {
-  std::vector<double> h_floats{100,
+  // std::vector<double> h_floats{100,
+  //                              654321.25,
+  //                              -12761.125,
+  //                              1.123456789123456789,
+  //                              0.000000000000000000123456789123456789,
+  //                              0,
+  //                              5,
+  //                              -4,
+  //                              std::numeric_limits<double>::quiet_NaN(),
+  //                              839542223232.794248339,
+  //                              -0.0};
+  // std::vector<char const*> h_expected{
+  //   "100.0", "654,321.25", "-12,761.125", "1.1234567891234568", "1.234567891234568E-19", 
+  //   "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"};
+
+    std::vector<double> h_floats{100,
                                654321.25,
                                -12761.125,
                                1.123456789123456789,
-                               0.000000000000000000123456789123456789,
                                0,
                                5,
                                -4,
                                std::numeric_limits<double>::quiet_NaN(),
                                839542223232.794248339,
-                               -0.0};
+                               -0.0
+                               };
   std::vector<char const*> h_expected{
-    "100.0", "654,321.25", "-12,761.125", "1.1234567891234568", "1.234567891234568E-19", 
-    "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"};
+    "100.00000", 
+    "654,321.25000", 
+    "-12,761.12500", 
+    "1.12346", 
+    "0.00000", 
+    "5.00000", 
+    "-4.00000", 
+    "NaN", 
+    "839,542,223,232.79420", 
+    "-0.00000"
+    };
 
   cudf::test::fixed_width_column_wrapper<double> floats(
     h_floats.begin(),

From 5397f120cfa4314fa66ec5aaffd3e2b012540108 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Fri, 17 Nov 2023 18:19:05 +0800
Subject: [PATCH 24/54] clean up

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/tests/format_float.cpp | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp
index 459d2e0b7f..4bf1e17c56 100644
--- a/src/main/cpp/tests/format_float.cpp
+++ b/src/main/cpp/tests/format_float.cpp
@@ -65,21 +65,6 @@ TEST_F(FormatFloatTests, FormatFloats32)
 
 TEST_F(FormatFloatTests, FormatFloats64)
 {
-  // std::vector<double> h_floats{100,
-  //                              654321.25,
-  //                              -12761.125,
-  //                              1.123456789123456789,
-  //                              0.000000000000000000123456789123456789,
-  //                              0,
-  //                              5,
-  //                              -4,
-  //                              std::numeric_limits<double>::quiet_NaN(),
-  //                              839542223232.794248339,
-  //                              -0.0};
-  // std::vector<char const*> h_expected{
-  //   "100.0", "654,321.25", "-12,761.125", "1.1234567891234568", "1.234567891234568E-19", 
-  //   "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"};
-
     std::vector<double> h_floats{100,
                                654321.25,
                                -12761.125,

From 8aeeb6b00237d7bbd1e42d3add3e853105d6c46d Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 20 Nov 2023 11:48:39 +0800
Subject: [PATCH 25/54] Fixed two bugs

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/ftos_converter.cu | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu
index 69197fdf64..e11df5faf0 100644
--- a/src/main/cpp/src/ftos_converter.cu
+++ b/src/main/cpp/src/ftos_converter.cu
@@ -1264,7 +1264,7 @@ struct ftos_converter {
       }
     } else if (exp + 1 >= olength) {
       // Decimal dot is after any of the digits.
-      int integer_len = index + exp + 1 + (exp - index) / 3;
+      int integer_len = index + exp + 1 + exp / 3;
       int sep_cnt = 0;
       int rev_index = 0;
       for (int i = olength; i < exp + 1; i++) {
@@ -1343,6 +1343,7 @@ struct ftos_converter {
       index+=2;
       int actural_round = d;
       index += exp + 1;
+      actural_round -= exp + 1;
       int actural_olength = fmin(int(olength), actural_round);
       index += actural_olength;
       actural_round -= actural_olength;
@@ -1351,7 +1352,7 @@ struct ftos_converter {
       }
     } else if (exp + 1 >= olength) {
       // Decimal dot is after any of the digits.
-      int integer_len = index + exp + 1 + (exp - index) / 3;
+      int integer_len = index + exp + 1 + exp / 3;
       index = integer_len;
       index++;
       index += d;
@@ -1417,7 +1418,7 @@ struct ftos_converter {
       }
     } else if (exp + 1 >= olength) {
       // Decimal dot is after any of the digits.
-      int integer_len = index + exp + 1 + (exp - index) / 3;
+      int integer_len = index + exp + 1 + exp / 3;
       int sep_cnt = 0;
       int rev_index = 0;
       for (int i = olength; i < exp + 1; i++) {
@@ -1496,6 +1497,7 @@ struct ftos_converter {
       index+=2;
       int actural_round = d;
       index += exp + 1;
+      actural_round -= exp + 1;
       int actural_olength = fmin(int(olength), actural_round);
       index += actural_olength;
       actural_round -= actural_olength;
@@ -1504,7 +1506,7 @@ struct ftos_converter {
       }
     } else if (exp + 1 >= olength) {
       // Decimal dot is after any of the digits.
-      int integer_len = index + exp + 1 + (exp - index) / 3;
+      int integer_len = index + exp + 1 + exp / 3;
       index = integer_len;
       index++;
       index += d;

From a6578c775af069cd76805ad46df72d7ad235251a Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 20 Nov 2023 12:21:57 +0800
Subject: [PATCH 26/54] Added a failed case back

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/tests/format_float.cpp | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp
index 4bf1e17c56..b630039263 100644
--- a/src/main/cpp/tests/format_float.cpp
+++ b/src/main/cpp/tests/format_float.cpp
@@ -69,6 +69,7 @@ TEST_F(FormatFloatTests, FormatFloats64)
                                654321.25,
                                -12761.125,
                                1.123456789123456789,
+                               0.000000000000000000123456789123456789,
                                0,
                                5,
                                -4,
@@ -77,17 +78,8 @@ TEST_F(FormatFloatTests, FormatFloats64)
                                -0.0
                                };
   std::vector<char const*> h_expected{
-    "100.00000", 
-    "654,321.25000", 
-    "-12,761.12500", 
-    "1.12346", 
-    "0.00000", 
-    "5.00000", 
-    "-4.00000", 
-    "NaN", 
-    "839,542,223,232.79420", 
-    "-0.00000"
-    };
+    "100.00000", "654,321.25000", "-12,761.12500", "1.12346", "0.00000", "0.00000", "5.00000", 
+    "-4.00000", "NaN", "839,542,223,232.79420", "-0.00000"};
 
   cudf::test::fixed_width_column_wrapper<double> floats(
     h_floats.begin(),

From 9b7fb4a531dcb29fe77f5698284b1a6a69f95964 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 20 Nov 2023 18:02:04 +0800
Subject: [PATCH 27/54] Refactor

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/ftos_converter.cu | 75 +++++++-----------------------
 1 file changed, 18 insertions(+), 57 deletions(-)

diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu
index e11df5faf0..e516ba8910 100644
--- a/src/main/cpp/src/ftos_converter.cu
+++ b/src/main/cpp/src/ftos_converter.cu
@@ -1204,6 +1204,7 @@ struct ftos_converter {
 
   template<typename T>
   __device__ inline T round_half_even(const T input, const int olength, const int d) {
+    // "round" a integer to d digits, with the half-even rounding mode.    
     if (d > olength) {
       T num = input;
       for (int i = 0; i < d - olength; i++) {
@@ -1220,8 +1221,7 @@ struct ftos_converter {
     return num;
   }
 
-  __device__ inline int to_formated_chars(const floating_decimal_64 v, const bool sign, char* const result, int d=10) {
-    // Step 5: Print the decimal representation.
+  __device__ inline int to_formated_chars(const floating_decimal_64 v, const bool sign, char* const result, int d) {
     int index = 0;
     if (sign) {
       result[index++] = '-';
@@ -1299,6 +1299,7 @@ struct ftos_converter {
       uint64_t pow10 = POW10_TABLE[temp_d];
       uint64_t integer = rounded_output / pow10;
       uint64_t decimal = rounded_output % pow10;
+      // calculate integer length after format to cover carry case
       uint32_t integer_len = decimalLength17(integer);
       uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3;
       uint32_t sep_cnt = 0;
@@ -1328,8 +1329,7 @@ struct ftos_converter {
     return index;
   }
 
-  __device__ inline int format_float_size(const floating_decimal_64 v, const bool sign, int d=10) {
-    // Step 5: Print the decimal representation.
+  __device__ inline int format_float_size(const floating_decimal_64 v, const bool sign, int d) {
     int index = 0;
     if (sign) {
       index++;
@@ -1338,44 +1338,24 @@ struct ftos_converter {
     const uint32_t olength = decimalLength17(output);
     int32_t exp = v.exponent + (int32_t) olength - 1;
     if (exp < 0) {
-      // Decimal dot is before any of the digits.
-      int index_for_carrier = index;
-      index+=2;
-      int actural_round = d;
-      index += exp + 1;
-      actural_round -= exp + 1;
-      int actural_olength = fmin(int(olength), actural_round);
-      index += actural_olength;
-      actural_round -= actural_olength;
-      if (actural_round > 0) {
-        index += actural_round;
-      }
+      index += 2 + d;
     } else if (exp + 1 >= olength) {
-      // Decimal dot is after any of the digits.
-      int integer_len = index + exp + 1 + exp / 3;
-      index = integer_len;
-      index++;
-      index += d;
+      index += exp + 1 + exp / 3 + 1 + d;
     } else {
-      uint32_t temp_d = d, tailing_zero = 0;
+      uint32_t temp_d = d;
       if (exp + d > olength) {
         temp_d = olength - exp;
-        tailing_zero = d - temp_d;
       }
       uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1);
       uint64_t pow10 = POW10_TABLE[temp_d];
       uint64_t integer = rounded_output / pow10;
       uint32_t integer_len = decimalLength17(integer);
-      uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3;
-      index = formated_integer_len;
-      index++;
-      index += d;
+      index += integer_len + (integer_len - 1) / 3 + 1 + d;
     }
     return index;
   }
 
-  __device__ inline int to_formated_chars(const floating_decimal_32 v, const bool sign, char* const result, int d=10) {
-    // Step 5: Print the decimal representation.
+  __device__ inline int to_formated_chars(const floating_decimal_32 v, const bool sign, char* const result, int d) {
     int index = 0;
     if (sign) {
       result[index++] = '-';
@@ -1453,6 +1433,7 @@ struct ftos_converter {
       uint32_t pow10 = POW10_TABLE[temp_d];
       uint32_t integer = rounded_output / pow10;
       uint32_t decimal = rounded_output % pow10;
+      // calculate integer length after format to cover carry case
       uint32_t integer_len = decimalLength9(integer);
       uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3;
       uint32_t sep_cnt = 0;
@@ -1482,8 +1463,7 @@ struct ftos_converter {
     return index;
   }
 
-  __device__ inline int format_float_size(const floating_decimal_32 v, const bool sign, int d=10) {
-    // Step 5: Print the decimal representation.
+  __device__ inline int format_float_size(const floating_decimal_32 v, const bool sign, int d) {
     int index = 0;
     if (sign) {
       index++;
@@ -1492,38 +1472,19 @@ struct ftos_converter {
     const uint32_t olength = decimalLength9(output);
     int32_t exp = v.exponent + (int32_t) olength - 1;
     if (exp < 0) {
-      // Decimal dot is before any of the digits.
-      int index_for_carrier = index;
-      index+=2;
-      int actural_round = d;
-      index += exp + 1;
-      actural_round -= exp + 1;
-      int actural_olength = fmin(int(olength), actural_round);
-      index += actural_olength;
-      actural_round -= actural_olength;
-      if (actural_round > 0) {
-        index += actural_round;
-      }
+      index += 2 + d;
     } else if (exp + 1 >= olength) {
-      // Decimal dot is after any of the digits.
-      int integer_len = index + exp + 1 + exp / 3;
-      index = integer_len;
-      index++;
-      index += d;
+      index += exp + 1 + exp / 3 + 1 + d;
     } else {
-      uint32_t temp_d = d, tailing_zero = 0;
+      uint32_t temp_d = d;
       if (exp + d > olength) {
         temp_d = olength - exp;
-        tailing_zero = d - temp_d;
       }
-      uint32_t rounded_output = round_half_even(output, olength, exp+temp_d+1);
-      uint32_t pow10 = POW10_TABLE[temp_d];
-      uint32_t integer = rounded_output / pow10;
+      uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1);
+      uint64_t pow10 = POW10_TABLE[temp_d];
+      uint64_t integer = rounded_output / pow10;
       uint32_t integer_len = decimalLength9(integer);
-      uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3;
-      index = formated_integer_len;
-      index++;
-      index += d;
+      index += integer_len + (integer_len - 1) / 3 + 1 + d;
     }
     return index;
   }  

From 41967d91b97398676653c19b0a7c9cb1492f7d88 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 21 Nov 2023 02:03:03 +0800
Subject: [PATCH 28/54] Handle d=0 case

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/ftos_converter.cu | 36 ++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu
index e516ba8910..664feb706e 100644
--- a/src/main/cpp/src/ftos_converter.cu
+++ b/src/main/cpp/src/ftos_converter.cu
@@ -238,7 +238,7 @@ struct ftos_converter {
 
   }
 
-  __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa, const int d=1) {
+  __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa, const int d = 1) {
     if (mantissa) {
       memcpy(result, "NaN", 3);
       return 3;
@@ -250,7 +250,12 @@ struct ftos_converter {
       memcpy(result + sign, "Infinity", 8);
       return sign + 8;
     }
-    memcpy(result + sign, "0.", 2);
+    result[sign] = '0';
+    if (d == 0) {
+      return sign + 1;
+    } else {
+      result[sign + 1] = '.';
+    }
     for (int i = 0; i < d; i++) {
       result[sign + 2 + i] = '0';
     }
@@ -264,6 +269,9 @@ struct ftos_converter {
     if (exponent) {
       return sign + 8;
     }
+    if (d == 0) {
+      return sign + 1;
+    }
     return sign + 2 + d;
   }
 
@@ -1233,6 +1241,9 @@ struct ftos_converter {
       // Decimal dot is before any of the digits.
       int index_for_carrier = index;
       result[index++] = '0';
+      if (d == 0) {
+        return index;
+      }
       result[index++] = '.';
       int actural_round = d;
       for (int i = -1; i > exp; i--) {
@@ -1285,6 +1296,9 @@ struct ftos_converter {
         output /= 10;
       }
       index = integer_len;
+      if (d == 0) {
+        return index;
+      }
       result[index++] = '.';
       for (int i = 0; i < d; i++) {
         result[index++] = '0';
@@ -1314,6 +1328,9 @@ struct ftos_converter {
         integer /= 10;
       }
       index = formated_integer_len;
+      if (d == 0) {
+        return index;
+      }
       result[index++] = '.';
       int current = index;
       for (int i = 0; i < tailing_zero; i++) {
@@ -1352,6 +1369,9 @@ struct ftos_converter {
       uint32_t integer_len = decimalLength17(integer);
       index += integer_len + (integer_len - 1) / 3 + 1 + d;
     }
+    if (d == 0) {
+      index--;
+    }
     return index;
   }
 
@@ -1367,6 +1387,9 @@ struct ftos_converter {
       // Decimal dot is before any of the digits.
       int index_for_carrier = index;
       result[index++] = '0';
+      if (d == 0) {
+        return index;
+      }
       result[index++] = '.';
       int actural_round = d;
       for (int i = -1; i > exp; i--) {
@@ -1419,6 +1442,9 @@ struct ftos_converter {
         output /= 10;
       }
       index = integer_len;
+      if (d == 0) {
+        return index;
+      }
       result[index++] = '.';
       for (int i = 0; i < d; i++) {
         result[index++] = '0';
@@ -1448,6 +1474,9 @@ struct ftos_converter {
         integer /= 10;
       }
       index = formated_integer_len;
+      if (d == 0) {
+        return index;
+      }
       result[index++] = '.';
       int current = index;
       for (int i = 0; i < tailing_zero; i++) {
@@ -1486,6 +1515,9 @@ struct ftos_converter {
       uint32_t integer_len = decimalLength9(integer);
       index += integer_len + (integer_len - 1) / 3 + 1 + d;
     }
+    if (d == 0) {
+      index--;
+    }
     return index;
   }  
 

From dc570cbc3f04067612580144d490485415f1b0be Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 21 Nov 2023 08:56:46 +0800
Subject: [PATCH 29/54] Add nv apache license to ftos_converter

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/ftos_converter.cu | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu
index 1ff5fe8543..2190381260 100644
--- a/src/main/cpp/src/ftos_converter.cu
+++ b/src/main/cpp/src/ftos_converter.cu
@@ -15,19 +15,20 @@
 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.
 
-//  Not a contribution
-//  Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as
-//  NVIDIA-proprietary are not a contribution and subject to the following terms and conditions:
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ * Copyright (c) 2023, NVIDIA CORPORATION.
  *
- * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
- * property and proprietary rights in and to this material, related
- * documentation and any modifications thereto. Any use, reproduction,
- * disclosure or distribution of this material and related documentation
- * without an express license agreement from NVIDIA CORPORATION or
- * its affiliates is strictly prohibited.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 #include <cuda/std/climits>

From 96333ca53e550ba9cdbdfe66d0c538df87e105a1 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 21 Nov 2023 08:56:46 +0800
Subject: [PATCH 30/54] Add nv apache license to ftos_converter

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/ftos_converter.cu | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu
index 664feb706e..835715f3d1 100644
--- a/src/main/cpp/src/ftos_converter.cu
+++ b/src/main/cpp/src/ftos_converter.cu
@@ -15,19 +15,20 @@
 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.
 
-//  Not a contribution
-//  Changes made by NVIDIA CORPORATION & AFFILIATES enabling ryu or otherwise documented as
-//  NVIDIA-proprietary are not a contribution and subject to the following terms and conditions:
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+ * Copyright (c) 2023, NVIDIA CORPORATION.
  *
- * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
- * property and proprietary rights in and to this material, related
- * documentation and any modifications thereto. Any use, reproduction,
- * disclosure or distribution of this material and related documentation
- * without an express license agreement from NVIDIA CORPORATION or
- * its affiliates is strictly prohibited.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 #include <cuda/std/climits>

From c36ce9435f6b620984aa7e0978e0137804544d5e Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 21 Nov 2023 16:02:58 +0800
Subject: [PATCH 31/54] Fix an rounding bug

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/ftos_converter.cu | 6 ++++++
 thirdparty/cudf                    | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu
index 835715f3d1..d226052b46 100644
--- a/src/main/cpp/src/ftos_converter.cu
+++ b/src/main/cpp/src/ftos_converter.cu
@@ -1252,6 +1252,9 @@ struct ftos_converter {
         result[index++] = '0';
         actural_round--;
         if (actural_round == 0) {
+          if (i != exp + 1) {
+            return index;
+          } // else, possible carry
           break;
         }
       }
@@ -1398,6 +1401,9 @@ struct ftos_converter {
         result[index++] = '0';
         actural_round--;
         if (actural_round == 0) {
+          if (i != exp + 1) {
+            return index;
+          } // else, possible carry
           break;
         }
       }
diff --git a/thirdparty/cudf b/thirdparty/cudf
index 4313cfa9b3..8a0a08f34f 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 4313cfa9b3fcff41f67b48ac8797dc015d441ecc
+Subproject commit 8a0a08f34ff804a7329ea640aa1e0a9b188d2162

From 360a77bb87a83ca6bcd96f2637fb477118fdffe8 Mon Sep 17 00:00:00 2001
From: Haoyang Li <ntlihy@gmail.com>
Date: Tue, 21 Nov 2023 23:55:53 +0800
Subject: [PATCH 32/54] Update src/main/cpp/src/ftos_converter.cu

Co-authored-by: Jason Lowe <jlowe@nvidia.com>
---
 src/main/cpp/src/ftos_converter.cu | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu
index 2190381260..0c05373ed9 100644
--- a/src/main/cpp/src/ftos_converter.cu
+++ b/src/main/cpp/src/ftos_converter.cu
@@ -1,21 +1,5 @@
-// Copyright 2018 Ulf Adams
-//
-// The contents of this file may be used under the terms of the Apache License,
-// Version 2.0.
-//
-//    (See accompanying file LICENSE-Apache or copy at
-//     http://www.apache.org/licenses/LICENSE-2.0)
-//
-// Alternatively, the contents of this file may be used under the terms of
-// the Boost Software License, Version 1.0.
-//    (See accompanying file LICENSE-Boost or copy at
-//     https://www.boost.org/LICENSE_1_0.txt)
-//
-// Unless required by applicable law or agreed to in writing, this software
-// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.
-
 /*
+ * Copyright 2018 Ulf Adams
  * Copyright (c) 2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");

From ced33b6cf203d3bd46c6f07b0dc7cd0669156503 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Wed, 22 Nov 2023 14:58:12 +0800
Subject: [PATCH 33/54] address some comments

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 NOTICE                                      |    1 -
 src/main/cpp/CMakeLists.txt                 |    1 -
 src/main/cpp/src/cast_float_to_string.cu    |   59 +-
 src/main/cpp/src/ftos_converter.cu          | 1181 -------------------
 src/main/cpp/src/ftos_converter.cuh         | 1156 ++++++++++++++++++
 src/main/cpp/tests/cast_float_to_string.cpp |    4 +-
 6 files changed, 1185 insertions(+), 1217 deletions(-)
 delete mode 100644 src/main/cpp/src/ftos_converter.cu
 create mode 100644 src/main/cpp/src/ftos_converter.cuh

diff --git a/NOTICE b/NOTICE
index 53333b52c5..a0975c00c8 100644
--- a/NOTICE
+++ b/NOTICE
@@ -5,7 +5,6 @@ Copyright (c) 2022-2023, NVIDIA CORPORATION
 
 This project includes code from ryu (https://github.com/ulfjack/ryu).
 
-ryu
 Copyright (2018) Ulf Adams and contributors.
 
 Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index a6ac8ac98c..8f90b9078e 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -168,7 +168,6 @@ add_library(
   src/cast_string_to_float.cu
   src/datetime_rebase.cu
   src/decimal_utils.cu
-  src/ftos_converter.cu
   src/histogram.cu
   src/map_utils.cu
   src/murmur_hash.cu
diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index eaf0c989b9..e22947ab9e 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -15,6 +15,7 @@
  */
 
 #include "cast_string.hpp"
+#include "ftos_converter.cuh"
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
@@ -28,13 +29,10 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <ftos_converter.cu>
-
-using namespace cudf;
-
 namespace spark_rapids_jni {
 
 namespace detail {
@@ -42,26 +40,25 @@ namespace {
 
 template <typename FloatType>
 struct float_to_string_fn {
-  column_device_view d_floats;
-  size_type* d_offsets;
+  cudf::column_device_view d_floats;
+  cudf::size_type* d_offsets;
   char* d_chars;
 
-  __device__ size_type compute_output_size(FloatType value)
+  __device__ cudf::size_type compute_output_size(FloatType value) const
   {
-    ftos_converter fts;
-    bool is_float = std::is_same_v<FloatType, float>;
-    return static_cast<size_type>(fts.compute_ftos_size(static_cast<double>(value), is_float));
+    bool constexpr is_float = std::is_same_v<FloatType, float>;
+    return static_cast<cudf::size_type>(ftos_converter::compute_ftos_size(static_cast<double>(value), is_float));
   }
 
-  __device__ void float_to_string(size_type idx)
+  __device__ void float_to_string(cudf::size_type idx) const
   {
-    FloatType value = d_floats.element<FloatType>(idx);
-    ftos_converter fts;
-    bool is_float = std::is_same_v<FloatType, float>;
-    fts.float_to_string(static_cast<double>(value), d_chars + d_offsets[idx], is_float);
+    auto const value = d_floats.element<FloatType>(idx);
+    bool constexpr is_float = std::is_same_v<FloatType, float>;
+    auto const output = d_chars + d_offsets[idx];
+    ftos_converter::float_to_string(static_cast<double>(value), is_float, output);
   }
 
-  __device__ void operator()(size_type idx)
+  __device__ void operator()(cudf::size_type idx) const
   {
     if (d_floats.is_null(idx)) {
       if (d_chars == nullptr) { d_offsets[idx] = 0; }
@@ -82,32 +79,28 @@ struct float_to_string_fn {
  */
 struct dispatch_float_to_string_fn {
   template <typename FloatType, std::enable_if_t<std::is_floating_point_v<FloatType>>* = nullptr>
-  std::unique_ptr<column> operator()(column_view const& floats,
+  std::unique_ptr<cudf::column> operator()(cudf::column_view const& floats,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::mr::device_memory_resource* mr)
   {
-    size_type strings_count = floats.size();
-    auto column             = column_device_view::create(floats, stream);
-    auto d_column           = *column;
-
-    // copy the null mask
-    rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr);
+    auto const strings_count = floats.size();
+    auto const input_ptr = cudf::column_device_view::create(floats, stream);
 
     auto [offsets, chars] =
-      cudf::strings::detail::make_strings_children(float_to_string_fn<FloatType>{d_column}, strings_count, stream, mr);
+      cudf::strings::detail::make_strings_children(float_to_string_fn<FloatType>{*input_ptr}, strings_count, stream, mr);
 
     return make_strings_column(strings_count,
                                std::move(offsets),
                                std::move(chars),
                                floats.null_count(),
-                               std::move(null_mask));
+                               std::move(cudf::detail::copy_bitmask(floats, stream, mr)));
   }
 
   // non-float types throw an exception
   template <typename T, std::enable_if_t<not std::is_floating_point_v<T>>* = nullptr>
-  std::unique_ptr<column> operator()(column_view const&,
+  std::unique_ptr<cudf::column> operator()(cudf::column_view const&,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*) const
+                                     rmm::mr::device_memory_resource*)
   {
     CUDF_FAIL("Values for float_to_string function must be a float type.");
   }
@@ -116,12 +109,14 @@ struct dispatch_float_to_string_fn {
 }  // namespace
 
 // This will convert all float column types into a strings column.
-std::unique_ptr<column> float_to_string(column_view const& floats,
+std::unique_ptr<cudf::column> float_to_string(cudf::column_view const& floats,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
-  size_type strings_count = floats.size();
-  if (strings_count == 0) return make_empty_column(type_id::STRING);
+  auto const strings_count = floats.size();
+  if (strings_count == 0) { 
+    return cudf::make_empty_column(cudf::type_id::STRING);
+  }
 
   return type_dispatcher(floats.type(), dispatch_float_to_string_fn{}, floats, stream, mr);
 }
@@ -129,7 +124,7 @@ std::unique_ptr<column> float_to_string(column_view const& floats,
 }  // namespace detail
 
 // external API
-std::unique_ptr<column> float_to_string(column_view const& floats, 
+std::unique_ptr<cudf::column> float_to_string(cudf::column_view const& floats, 
                                       rmm::cuda_stream_view stream, 
                                       rmm::mr::device_memory_resource* mr)
 {
diff --git a/src/main/cpp/src/ftos_converter.cu b/src/main/cpp/src/ftos_converter.cu
deleted file mode 100644
index 0c05373ed9..0000000000
--- a/src/main/cpp/src/ftos_converter.cu
+++ /dev/null
@@ -1,1181 +0,0 @@
-/*
- * Copyright 2018 Ulf Adams
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cuda/std/climits>
-#include <cuda/std/limits>
-#include <cuda/std/type_traits>
-#include <cuda/std/cassert>
-#include <cuda/std/cstdint>
-
-namespace spark_rapids_jni {
-
-namespace detail {
-namespace {
-
-// d2s.c from ryu
-// A floating decimal representing m * 10^e.
-typedef struct floating_decimal_64 {
-  uint64_t mantissa;
-  // Decimal exponent's range is -324 to 308
-  // inclusive, and can fit in a short if needed.
-  int32_t exponent;
-} floating_decimal_64;
-
-// f2s.c from ryu
-// A floating decimal representing m * 10^e.
-typedef struct floating_decimal_32 {
-  uint32_t mantissa;
-  // Decimal exponent's range is -45 to 38
-  // inclusive, and can fit in a short if needed.
-  int32_t exponent;
-} floating_decimal_32;
-
-struct ftos_converter {
-
-  //===== constants from ryu =====
-
-  // These tables are generated by PrintDoubleLookupTable.
-  static constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125;
-  static constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125;
-  static constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64);
-  static constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64);
-  static constexpr unsigned int DOUBLE_MANTISSA_BITS = 52;
-  static constexpr unsigned int DOUBLE_EXPONENT_BITS = 11;
-  static constexpr unsigned int DOUBLE_BIAS = 1023;
-  static constexpr unsigned int FLOAT_MANTISSA_BITS = 23;
-  static constexpr unsigned int FLOAT_EXPONENT_BITS = 8;
-  static constexpr unsigned int FLOAT_BIAS = 127;
-
-  const uint64_t DOUBLE_POW5_INV_SPLIT2[15][2] = {
-    {                    1u, 2305843009213693952u },
-    {  5955668970331000884u, 1784059615882449851u },
-    {  8982663654677661702u, 1380349269358112757u },
-    {  7286864317269821294u, 2135987035920910082u },
-    {  7005857020398200553u, 1652639921975621497u },
-    { 17965325103354776697u, 1278668206209430417u },
-    {  8928596168509315048u, 1978643211784836272u },
-    { 10075671573058298858u, 1530901034580419511u },
-    {   597001226353042382u, 1184477304306571148u },
-    {  1527430471115325346u, 1832889850782397517u },
-    { 12533209867169019542u, 1418129833677084982u },
-    {  5577825024675947042u, 2194449627517475473u },
-    { 11006974540203867551u, 1697873161311732311u },
-    { 10313493231639821582u, 1313665730009899186u },
-    { 12701016819766672773u, 2032799256770390445u }
-  };
-
-  const uint32_t POW5_INV_OFFSETS[19] = {
-    0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555,
-    0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054,
-    0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554,
-    0x00000000
-  };
-
-  const uint64_t DOUBLE_POW5_SPLIT2[13][2] = {
-    {                    0u, 1152921504606846976u },
-    {                    0u, 1490116119384765625u },
-    {  1032610780636961552u, 1925929944387235853u },
-    {  7910200175544436838u, 1244603055572228341u },
-    { 16941905809032713930u, 1608611746708759036u },
-    { 13024893955298202172u, 2079081953128979843u },
-    {  6607496772837067824u, 1343575221513417750u },
-    { 17332926989895652603u, 1736530273035216783u },
-    { 13037379183483547984u, 2244412773384604712u },
-    {  1605989338741628675u, 1450417759929778918u },
-    {  9630225068416591280u, 1874621017369538693u },
-    {   665883850346957067u, 1211445438634777304u },
-    { 14931890668723713708u, 1565756531257009982u }
-  };
-
-  const uint32_t POW5_OFFSETS[21] = {
-    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995,
-    0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540,
-    0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151,
-    0x55559155, 0x51405555, 0x00000105
-  };
-
-  static constexpr uint32_t POW5_TABLE_SIZE = 26;
-  const uint64_t DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {
-  1ull, 5ull, 25ull, 125ull, 625ull, 3125ull, 15625ull, 78125ull, 390625ull,
-  1953125ull, 9765625ull, 48828125ull, 244140625ull, 1220703125ull, 6103515625ull,
-  30517578125ull, 152587890625ull, 762939453125ull, 3814697265625ull,
-  19073486328125ull, 95367431640625ull, 476837158203125ull,
-  2384185791015625ull, 11920928955078125ull, 59604644775390625ull,
-  298023223876953125ull //, 1490116119384765625ull
-  };
-
-  //===== common.h from ryu =====
-
-  // Returns the number of decimal digits in v, which must not contain more than 9 digits.
-  __device__ inline uint32_t decimalLength9(const uint32_t v) {
-    // Function precondition: v is not a 10-digit number.
-    // (f2s: 9 digits are sufficient for round-tripping.)
-    // (d2fixed: We print 9-digit blocks.)
-    assert(v < 1000000000);
-    if (v >= 100000000) { return 9; }
-    if (v >= 10000000) { return 8; }
-    if (v >= 1000000) { return 7; }
-    if (v >= 100000) { return 6; }
-    if (v >= 10000) { return 5; }
-    if (v >= 1000) { return 4; }
-    if (v >= 100) { return 3; }
-    if (v >= 10) { return 2; }
-    return 1;
-  }
-
-  // Returns e == 0 ? 1 : [log_2(5^e)]; requires 0 <= e <= 3528.
-  __device__ inline int32_t log2pow5(const int32_t e) {
-    // This approximation works up to the point that the multiplication overflows at e = 3529.
-    // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater
-    // than 2^9297.
-    assert(e >= 0);
-    assert(e <= 3528);
-    return (int32_t) ((((uint32_t) e) * 1217359) >> 19);
-  }
-
-  // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528.
-  __device__ inline int32_t pow5bits(const int32_t e) {
-    // This approximation works up to the point that the multiplication overflows at e = 3529.
-    // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater
-    // than 2^9297.
-    assert(e >= 0);
-    assert(e <= 3528);
-    return (int32_t) (((((uint32_t) e) * 1217359) >> 19) + 1);
-  }
-
-  // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528.
-  __device__ inline int32_t ceil_log2pow5(const int32_t e) {
-    return log2pow5(e) + 1;
-  }
-
-  // Returns floor(log_10(2^e)); requires 0 <= e <= 1650.
-  __device__ inline uint32_t log10Pow2(const int32_t e) {
-    // The first value this approximation fails for is 2^1651 which is just greater than 10^297.
-    assert(e >= 0);
-    assert(e <= 1650);
-    return (((uint32_t) e) * 78913) >> 18;
-  }
-
-  // Returns floor(log_10(5^e)); requires 0 <= e <= 2620.
-  __device__ inline uint32_t log10Pow5(const int32_t e) {
-    // The first value this approximation fails for is 5^2621 which is just greater than 10^1832.
-    assert(e >= 0);
-    assert(e <= 2620);
-    return (((uint32_t) e) * 732923) >> 20;
-  }
-
-  __device__ inline uint32_t pow5factor_32(uint32_t value) {
-    uint32_t count = 0;
-    for (;;) {
-      assert(value != 0);
-      const uint32_t q = value / 5;
-      const uint32_t r = value % 5;
-      if (r != 0) {
-        break;
-      }
-      value = q;
-      ++count;
-    }
-    return count;
-  }
-
-  // Returns true if value is divisible by 5^p.
-  __device__ inline bool multipleOfPowerOf5_32(const uint32_t value, const uint32_t p) {
-    return pow5factor_32(value) >= p;
-  }
-
-  // Returns true if value is divisible by 2^p.
-  __device__ inline bool multipleOfPowerOf2_32(const uint32_t value, const uint32_t p) {
-    // __builtin_ctz doesn't appear to be faster here.
-    return (value & ((1u << p) - 1)) == 0;
-  }
-
-  // It seems to be slightly faster to avoid uint128_t here, although the
-  // generated code for uint128_t looks slightly nicer.
-  __device__ inline uint32_t mulShift32(const uint32_t m, const uint64_t factor, const int32_t shift) {
-    assert(shift > 32);
-
-    // The casts here help MSVC to avoid calls to the __allmul library
-    // function.
-    const uint32_t factorLo = (uint32_t)(factor);
-    const uint32_t factorHi = (uint32_t)(factor >> 32);
-    const uint64_t bits0 = (uint64_t)m * factorLo;
-    const uint64_t bits1 = (uint64_t)m * factorHi;
-
-    const uint64_t sum = (bits0 >> 32) + bits1;
-    const uint64_t shiftedSum = sum >> (shift - 32);
-    assert(shiftedSum <= UINT32_MAX);
-    return (uint32_t) shiftedSum;
-
-  }
-
-  __device__ inline int copy_special_str(char * const result, const bool sign, const bool exponent, const bool mantissa) {
-    if (mantissa) {
-      memcpy(result, "NaN", 3);
-      return 3;
-    }
-    if (sign) {
-      result[0] = '-';
-    }
-    if (exponent) {
-      memcpy(result + sign, "Infinity", 8);
-      return sign + 8;
-    }
-    memcpy(result + sign, "0.0", 3);
-    return sign + 3;
-  }
-
-  __device__ inline int special_str_size(const bool sign, const bool exponent, const bool mantissa) {
-    if (mantissa) {
-      return 3;
-    }
-    if (exponent) {
-      return sign + 8;
-    }
-    return sign + 3;
-  }
-
-  __device__ inline uint32_t float_to_bits(const float f) {
-    uint32_t bits = 0;
-    memcpy(&bits, &f, sizeof(float));
-    return bits;
-  }
-
-  __device__ inline uint64_t double_to_bits(const double d) {
-    uint64_t bits = 0;
-    memcpy(&bits, &d, sizeof(double));
-    return bits;
-  }
-
-  //===== d2s_intrinsics.h from ryu =====
-
-  __device__ inline uint64_t umul128(const uint64_t a, const uint64_t b, uint64_t* const productHi) {
-    // The casts here help MSVC to avoid calls to the __allmul library function.
-    const uint32_t aLo = (uint32_t)a;
-    const uint32_t aHi = (uint32_t)(a >> 32);
-    const uint32_t bLo = (uint32_t)b;
-    const uint32_t bHi = (uint32_t)(b >> 32);
-
-    const uint64_t b00 = (uint64_t)aLo * bLo;
-    const uint64_t b01 = (uint64_t)aLo * bHi;
-    const uint64_t b10 = (uint64_t)aHi * bLo;
-    const uint64_t b11 = (uint64_t)aHi * bHi;
-
-    const uint32_t b00Lo = (uint32_t)b00;
-    const uint32_t b00Hi = (uint32_t)(b00 >> 32);
-
-    const uint64_t mid1 = b10 + b00Hi;
-    const uint32_t mid1Lo = (uint32_t)(mid1);
-    const uint32_t mid1Hi = (uint32_t)(mid1 >> 32);
-
-    const uint64_t mid2 = b01 + mid1Lo;
-    const uint32_t mid2Lo = (uint32_t)(mid2);
-    const uint32_t mid2Hi = (uint32_t)(mid2 >> 32);
-
-    const uint64_t pHi = b11 + mid1Hi + mid2Hi;
-    const uint64_t pLo = ((uint64_t)mid2Lo << 32) | b00Lo;
-
-    *productHi = pHi;
-    return pLo;
-  }
-
-  __device__ inline uint64_t shiftright128(const uint64_t lo, const uint64_t hi, const uint32_t dist) {
-    // We don't need to handle the case dist >= 64 here (see above).
-    assert(dist < 64);
-    assert(dist > 0);
-    return (hi << (64 - dist)) | (lo >> dist);
-  }
-
-  __device__ inline uint64_t div5(const uint64_t x) {
-    return x / 5;
-  }
-
-  __device__ inline uint64_t div10(const uint64_t x) {
-    return x / 10;
-  }
-
-  __device__ inline uint64_t div100(const uint64_t x) {
-    return x / 100;
-  }
-
-  __device__ inline uint64_t div1e8(const uint64_t x) {
-    return x / 100000000;
-  }
-
-  __device__ inline uint64_t div1e9(const uint64_t x) {
-    return x / 1000000000;
-  }
-
-  __device__ inline uint32_t mod1e9(const uint64_t x) {
-    return (uint32_t) (x - 1000000000 * div1e9(x));
-  }
-
-  __device__ inline uint32_t pow5Factor(uint64_t value) {
-    const uint64_t m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64)
-    const uint64_t n_div_5 = 3689348814741910323u;  // #{ n | n = 0 (mod 2^64) } = 2^64 / 5
-    uint32_t count = 0;
-    for (;;) {
-      assert(value != 0);
-      value *= m_inv_5;
-      if (value > n_div_5)
-        break;
-      ++count;
-    }
-    return count;
-  }
-
-  // Returns true if value is divisible by 5^p.
-  __device__ inline bool multipleOfPowerOf5(const uint64_t value, const uint32_t p) {
-    // I tried a case distinction on p, but there was no performance difference.
-    return pow5Factor(value) >= p;
-  }
-
-  // Returns true if value is divisible by 2^p.
-  __device__ inline bool multipleOfPowerOf2(const uint64_t value, const uint32_t p) {
-    assert(value != 0);
-    assert(p < 64);
-    // __builtin_ctzll doesn't appear to be faster here.
-    return (value & ((1ull << p) - 1)) == 0;
-  }
-
-  __device__ inline uint64_t mulShift64(const uint64_t m, const uint64_t* const mul, const int32_t j) {
-    // m is maximum 55 bits
-    uint64_t high1;                                   // 128
-    const uint64_t low1 = umul128(m, mul[1], &high1); // 64
-    uint64_t high0;                                   // 64
-    umul128(m, mul[0], &high0);                       // 0
-    const uint64_t sum = high0 + low1;
-    if (sum < high0) {
-      ++high1; // overflow into high1
-    }
-    return shiftright128(sum, high1, j - 64);
-  }
-
-  __device__ inline uint64_t mulShiftAll64(const uint64_t m, const uint64_t* const mul, const int32_t j,
-    uint64_t* const vp, uint64_t* const vm, const uint32_t mmShift) {
-    *vp = mulShift64(4 * m + 2, mul, j);
-    *vm = mulShift64(4 * m - 1 - mmShift, mul, j);
-    return mulShift64(4 * m, mul, j);
-  }
-
-  //===== d2s_small_table.h from ryu =====
-
-  // Computes 5^i in the form required by Ryu, and stores it in the given pointer.
-  __device__ inline void double_computePow5(const uint32_t i, uint64_t* const result) {
-    const uint32_t base = i / POW5_TABLE_SIZE;
-    const uint32_t base2 = base * POW5_TABLE_SIZE;
-    const uint32_t offset = i - base2;
-    const uint64_t* const mul = DOUBLE_POW5_SPLIT2[base];
-    if (offset == 0) {
-      result[0] = mul[0];
-      result[1] = mul[1];
-      return;
-    }
-    const uint64_t m = DOUBLE_POW5_TABLE[offset];
-    uint64_t high1;
-    const uint64_t low1 = umul128(m, mul[1], &high1);
-    uint64_t high0;
-    const uint64_t low0 = umul128(m, mul[0], &high0);
-    const uint64_t sum = high0 + low1;
-    if (sum < high0) {
-      ++high1; // overflow into high1
-    }
-    // high1 | sum | low0
-    const uint32_t delta = pow5bits(i) - pow5bits(base2);
-    result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
-    result[1] = shiftright128(sum, high1, delta);
-  }
-
-  // Computes 5^-i in the form required by Ryu, and stores it in the given pointer.
-  __device__ inline void double_computeInvPow5(const uint32_t i, uint64_t* const result) {
-    const uint32_t base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE;
-    const uint32_t base2 = base * POW5_TABLE_SIZE;
-    const uint32_t offset = base2 - i;
-    const uint64_t* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2
-    if (offset == 0) {
-      result[0] = mul[0];
-      result[1] = mul[1];
-      return;
-    }
-    const uint64_t m = DOUBLE_POW5_TABLE[offset];
-    uint64_t high1;
-    const uint64_t low1 = umul128(m, mul[1], &high1);
-    uint64_t high0;
-    const uint64_t low0 = umul128(m, mul[0] - 1, &high0);
-    const uint64_t sum = high0 + low1;
-    if (sum < high0) {
-      ++high1; // overflow into high1
-    }
-    // high1 | sum | low0
-    const uint32_t delta = pow5bits(base2) - pow5bits(i);
-    result[0] = shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
-    result[1] = shiftright128(sum, high1, delta);
-  }
-
-  //===== f2s_intrinsics.h from ryu =====
-
-  __device__ inline uint32_t mulPow5InvDivPow2(const uint32_t m, const uint32_t q, const int32_t j) {
-    // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double lookup
-    // table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely on the
-    // fact that the added 1 that's already stored in the table never overflows into the upper 64 bits.
-    uint64_t pow5[2];
-    double_computeInvPow5(q, pow5);
-    return mulShift32(m, pow5[1] + 1, j);
-  }
-
-  __device__ inline uint32_t mulPow5divPow2(const uint32_t m, const uint32_t i, const int32_t j) {
-    uint64_t pow5[2];
-    double_computePow5(i, pow5);
-    return mulShift32(m, pow5[1], j);
-  }
-
-  //===== d2s.c and f2s.c from ryu =====
-
-  __device__ inline uint32_t decimalLength17(const uint64_t v) {
-    // This is slightly faster than a loop.
-    // The average output length is 16.38 digits, so we check high-to-low.
-    // Function precondition: v is not an 18, 19, or 20-digit number.
-    // (17 digits are sufficient for round-tripping.)
-    assert(v < 100000000000000000L);
-    if (v >= 10000000000000000L) { return 17; }
-    if (v >= 1000000000000000L) { return 16; }
-    if (v >= 100000000000000L) { return 15; }
-    if (v >= 10000000000000L) { return 14; }
-    if (v >= 1000000000000L) { return 13; }
-    if (v >= 100000000000L) { return 12; }
-    if (v >= 10000000000L) { return 11; }
-    if (v >= 1000000000L) { return 10; }
-    if (v >= 100000000L) { return 9; }
-    if (v >= 10000000L) { return 8; }
-    if (v >= 1000000L) { return 7; }
-    if (v >= 100000L) { return 6; }
-    if (v >= 10000L) { return 5; }
-    if (v >= 1000L) { return 4; }
-    if (v >= 100L) { return 3; }
-    if (v >= 10L) { return 2; }
-    return 1;
-  }
-
-  __device__ inline floating_decimal_64 d2d(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) {
-    int32_t e2;
-    uint64_t m2;
-    if (ieeeExponent == 0) {
-      // We subtract 2 so that the bounds computation has 2 additional bits.
-      e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
-      m2 = ieeeMantissa;
-    } else {
-      e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
-      m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
-    }
-    const bool even = (m2 & 1) == 0;
-    const bool acceptBounds = even;
-
-    // Step 2: Determine the interval of valid decimal representations.
-    const uint64_t mv = 4 * m2;
-    // Implicit bool -> int conversion. True is 1, false is 0.
-    const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
-    // We would compute mp and mm like this:
-    // uint64_t mp = 4 * m2 + 2;
-    // uint64_t mm = mv - 1 - mmShift;
-
-    // Step 3: Convert to a decimal power base using 128-bit arithmetic.
-    uint64_t vr, vp, vm;
-    int32_t e10;
-    bool vmIsTrailingZeros = false;
-    bool vrIsTrailingZeros = false;
-    if (e2 >= 0) {
-      // I tried special-casing q == 0, but there was no effect on performance.
-      // This expression is slightly faster than max(0, log10Pow2(e2) - 1).
-      const uint32_t q = log10Pow2(e2) - (e2 > 3);
-      e10 = (int32_t) q;
-      const int32_t k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
-      const int32_t i = -e2 + (int32_t) q + k;
-      uint64_t pow5[2];
-      double_computeInvPow5(q, pow5);
-      vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift);
-
-      if (q <= 21) {
-        // This should use q <= 22, but I think 21 is also safe. Smaller values
-        // may still be safe, but it's more difficult to reason about them.
-        // Only one of mp, mv, and mm can be a multiple of 5, if any.
-        const uint32_t mvMod5 = ((uint32_t) mv) - 5 * ((uint32_t) div5(mv));
-        if (mvMod5 == 0) {
-          vrIsTrailingZeros = multipleOfPowerOf5(mv, q);
-        } else if (acceptBounds) {
-          // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q
-          // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q
-          // <=> true && pow5Factor(mm) >= q, since e2 >= q.
-          vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q);
-        } else {
-          // Same as min(e2 + 1, pow5Factor(mp)) >= q.
-          vp -= multipleOfPowerOf5(mv + 2, q);
-        }
-      }
-    } else {
-      // This expression is slightly faster than max(0, log10Pow5(-e2) - 1).
-      const uint32_t q = log10Pow5(-e2) - (-e2 > 1);
-      e10 = (int32_t) q + e2;
-      const int32_t i = -e2 - (int32_t) q;
-      const int32_t k = pow5bits(i) - DOUBLE_POW5_BITCOUNT;
-      const int32_t j = (int32_t) q - k;
-
-      uint64_t pow5[2];
-      double_computePow5(i, pow5);
-      vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift);
-
-      if (q <= 1) {
-        // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
-        // mv = 4 * m2, so it always has at least two trailing 0 bits.
-        vrIsTrailingZeros = true;
-        if (acceptBounds) {
-          // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
-          vmIsTrailingZeros = mmShift == 1;
-        } else {
-          // mp = mv + 2, so it always has at least one trailing 0 bit.
-          --vp;
-        }
-      } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here.
-        // We want to know if the full product has at least q trailing zeros.
-        // We need to compute min(p2(mv), p5(mv) - e2) >= q
-        // <=> p2(mv) >= q && p5(mv) - e2 >= q
-        // <=> p2(mv) >= q (because -e2 >= q)
-        vrIsTrailingZeros = multipleOfPowerOf2(mv, q);
-      }
-    }
-
-    // Step 4: Find the shortest decimal representation in the interval of valid representations.
-    int32_t removed = 0;
-    uint8_t lastRemovedDigit = 0;
-    uint64_t output;
-    // On average, we remove ~2 digits.
-    if (vmIsTrailingZeros || vrIsTrailingZeros) {
-      // General case, which happens rarely (~0.7%).
-      for (;;) {
-        const uint64_t vpDiv10 = div10(vp);
-        const uint64_t vmDiv10 = div10(vm);
-        if (vpDiv10 <= vmDiv10) {
-          break;
-        }
-        const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10);
-        const uint64_t vrDiv10 = div10(vr);
-        const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
-        vmIsTrailingZeros &= vmMod10 == 0;
-        vrIsTrailingZeros &= lastRemovedDigit == 0;
-        lastRemovedDigit = (uint8_t) vrMod10;
-        vr = vrDiv10;
-        vp = vpDiv10;
-        vm = vmDiv10;
-        ++removed;
-      }
-
-      if (vmIsTrailingZeros) {
-        for (;;) {
-          const uint64_t vmDiv10 = div10(vm);
-          const uint32_t vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10);
-          if (vmMod10 != 0) {
-            break;
-          }
-          const uint64_t vpDiv10 = div10(vp);
-          const uint64_t vrDiv10 = div10(vr);
-          const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
-          vrIsTrailingZeros &= lastRemovedDigit == 0;
-          lastRemovedDigit = (uint8_t) vrMod10;
-          vr = vrDiv10;
-          vp = vpDiv10;
-          vm = vmDiv10;
-          ++removed;
-        }
-      }
-
-      if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
-        // Round even if the exact number is .....50..0.
-        lastRemovedDigit = 4;
-      }
-      // We need to take vr + 1 if vr is outside bounds or we need to round up.
-      output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
-    } else {
-      // Specialized for the common case (~99.3%). Percentages below are relative to this.
-      bool roundUp = false;
-      const uint64_t vpDiv100 = div100(vp);
-      const uint64_t vmDiv100 = div100(vm);
-      if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%).
-        const uint64_t vrDiv100 = div100(vr);
-        const uint32_t vrMod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrDiv100);
-        roundUp = vrMod100 >= 50;
-        vr = vrDiv100;
-        vp = vpDiv100;
-        vm = vmDiv100;
-        removed += 2;
-      }
-      // Loop iterations below (approximately), without optimization above:
-      // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02%
-      // Loop iterations below (approximately), with optimization above:
-      // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02%
-      for (;;) {
-        const uint64_t vpDiv10 = div10(vp);
-        const uint64_t vmDiv10 = div10(vm);
-        if (vpDiv10 <= vmDiv10) {
-          break;
-        }
-        const uint64_t vrDiv10 = div10(vr);
-        const uint32_t vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
-        roundUp = vrMod10 >= 5;
-        vr = vrDiv10;
-        vp = vpDiv10;
-        vm = vmDiv10;
-        ++removed;
-      }
-
-      // We need to take vr + 1 if vr is outside bounds or we need to round up.
-      output = vr + (vr == vm || roundUp);
-    }
-    const int32_t exp = e10 + removed;
-
-    floating_decimal_64 fd;
-    fd.exponent = exp;
-    fd.mantissa = output;
-    return fd;
-  }
-
-  __device__ inline floating_decimal_32 f2d(const uint32_t ieeeMantissa, const uint32_t ieeeExponent) {
-    int32_t e2;
-    uint32_t m2;
-    if (ieeeExponent == 0) {
-      // We subtract 2 so that the bounds computation has 2 additional bits.
-      e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
-      m2 = ieeeMantissa;
-    } else {
-      e2 = (int32_t) ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
-      m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa;
-    }
-    const bool even = (m2 & 1) == 0;
-    const bool acceptBounds = even;
-
-    // Step 2: Determine the interval of valid decimal representations.
-    const uint32_t mv = 4 * m2;
-    const uint32_t mp = 4 * m2 + 2;
-    // Implicit bool -> int conversion. True is 1, false is 0.
-    const uint32_t mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
-    const uint32_t mm = 4 * m2 - 1 - mmShift;
-
-    // Step 3: Convert to a decimal power base using 64-bit arithmetic.
-    uint32_t vr, vp, vm;
-    int32_t e10;
-    bool vmIsTrailingZeros = false;
-    bool vrIsTrailingZeros = false;
-    uint8_t lastRemovedDigit = 0;
-    if (e2 >= 0) {
-      const uint32_t q = log10Pow2(e2);
-      e10 = (int32_t) q;
-      const int32_t k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
-      const int32_t i = -e2 + (int32_t) q + k;
-      vr = mulPow5InvDivPow2(mv, q, i);
-      vp = mulPow5InvDivPow2(mp, q, i);
-      vm = mulPow5InvDivPow2(mm, q, i);
-      if (q != 0 && (vp - 1) / 10 <= vm / 10) {
-        // We need to know one removed digit even if we are not going to loop below. We could use
-        // q = X - 1 above, except that would require 33 bits for the result, and we've found that
-        // 32-bit arithmetic is faster even on 64-bit machines.
-        const int32_t l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) (q - 1)) - 1;
-        lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t) q - 1 + l) % 10);
-      }
-      if (q <= 9) {
-        // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well.
-        // Only one of mp, mv, and mm can be a multiple of 5, if any.
-        if (mv % 5 == 0) {
-          vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q);
-        } else if (acceptBounds) {
-          vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q);
-        } else {
-          vp -= multipleOfPowerOf5_32(mp, q);
-        }
-      }
-    } else {
-      const uint32_t q = log10Pow5(-e2);
-      e10 = (int32_t) q + e2;
-      const int32_t i = -e2 - (int32_t) q;
-      const int32_t k = pow5bits(i) - FLOAT_POW5_BITCOUNT;
-      int32_t j = (int32_t) q - k;
-      vr = mulPow5divPow2(mv, (uint32_t) i, j);
-      vp = mulPow5divPow2(mp, (uint32_t) i, j);
-      vm = mulPow5divPow2(mm, (uint32_t) i, j);
-      if (q != 0 && (vp - 1) / 10 <= vm / 10) {
-        j = (int32_t) q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT);
-        lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, (uint32_t) (i + 1), j) % 10);
-      }
-      if (q <= 1) {
-        // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
-        // mv = 4 * m2, so it always has at least two trailing 0 bits.
-        vrIsTrailingZeros = true;
-        if (acceptBounds) {
-          // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
-          vmIsTrailingZeros = mmShift == 1;
-        } else {
-          // mp = mv + 2, so it always has at least one trailing 0 bit.
-          --vp;
-        }
-      } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here.
-        vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1);
-      }
-    }
-
-    // Step 4: Find the shortest decimal representation in the interval of valid representations.
-    int32_t removed = 0;
-    uint32_t output;
-    if (vmIsTrailingZeros || vrIsTrailingZeros) {
-      // General case, which happens rarely (~4.0%).
-      while (vp / 10 > vm / 10) {
-        vmIsTrailingZeros &= vm % 10 == 0;
-        vrIsTrailingZeros &= lastRemovedDigit == 0;
-        lastRemovedDigit = (uint8_t) (vr % 10);
-        vr /= 10;
-        vp /= 10;
-        vm /= 10;
-        ++removed;
-      }
-      if (vmIsTrailingZeros) {
-        while (vm % 10 == 0) {
-          vrIsTrailingZeros &= lastRemovedDigit == 0;
-          lastRemovedDigit = (uint8_t) (vr % 10);
-          vr /= 10;
-          vp /= 10;
-          vm /= 10;
-          ++removed;
-        }
-      }
-      if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
-        // Round even if the exact number is .....50..0.
-        lastRemovedDigit = 4;
-      }
-      // We need to take vr + 1 if vr is outside bounds or we need to round up.
-      output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
-    } else {
-      // Specialized for the common case (~96.0%). Percentages below are relative to this.
-      // Loop iterations below (approximately):
-      // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01%
-      while (vp / 10 > vm / 10) {
-        lastRemovedDigit = (uint8_t) (vr % 10);
-        vr /= 10;
-        vp /= 10;
-        vm /= 10;
-        ++removed;
-      }
-      // We need to take vr + 1 if vr is outside bounds or we need to round up.
-      output = vr + (vr == vm || lastRemovedDigit >= 5);
-    }
-    const int32_t exp = e10 + removed;
-
-    floating_decimal_32 fd;
-    fd.exponent = exp;
-    fd.mantissa = output;
-    return fd;
-  }
-
-  __device__ inline int to_chars(const floating_decimal_64 v, const bool sign, char* const result) {
-    // Step 5: Print the decimal representation.
-    int index = 0;
-    if (sign) {
-      result[index++] = '-';
-    }
-
-    uint64_t output = v.mantissa;
-    const uint32_t olength = decimalLength17(output);
-    int32_t exp = v.exponent + (int32_t) olength - 1;
-    bool scientificNotation = (exp < -3) || (exp >= 7);
-    
-    // Values in the interval [1E-3, 1E7) are special.
-    if (scientificNotation) {
-      // Print in the format x.xxxxxE-yy.
-      for (uint32_t i = 0; i < olength - 1; ++i) {
-        const uint32_t c = output % 10; output /= 10;
-        result[index + olength - i] = (char) ('0' + c);
-      }
-      result[index] = '0' + output % 10;
-      result[index + 1] = '.';
-      index += olength + 1;
-      if (olength == 1) {
-        result[index++] = '0';
-      }
-      // Print 'E', the exponent sign, and the exponent, which has at most three digits.
-      result[index++] = 'E';
-      if (exp < 0) {
-        result[index++] = '-';
-        exp = -exp;
-      }
-      if (exp >= 100) {
-          result[index++] = (char) ('0' + exp / 100);
-          exp %= 100;
-          result[index++] = (char) ('0' + exp / 10);
-        } else if (exp >= 10) {
-          result[index++] = (char) ('0' + exp / 10);
-        }
-        result[index++] = (char) ('0' + exp % 10);
-    } else {
-      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
-      if (exp < 0) {
-        // Decimal dot is before any of the digits.
-        result[index++] = '0';
-        result[index++] = '.';
-        for (int i = -1; i > exp; i--) {
-          result[index++] = '0';
-        }
-        int current = index;
-        for (int i = 0; i < olength; i++) {
-          result[current + olength - i - 1] = (char) ('0' + output % 10);
-          output /= 10;
-          index++;
-        }
-      } else if (exp + 1 >= olength) {
-        // Decimal dot is after any of the digits.
-        for (int i = 0; i < olength; i++) {
-          result[index + olength - i - 1] = (char) ('0' + output % 10);
-          output /= 10;
-        }
-        index += olength;
-        for (int i = olength; i < exp + 1; i++) {
-          result[index++] = '0';
-        }
-        result[index++] = '.';
-        result[index++] = '0';
-      } else {
-        // Decimal dot is somewhere between the digits.
-        int current = index + 1;
-        for (int i = 0; i < olength; i++) {
-          if (olength - i - 1 == exp) {
-            result[current + olength - i - 1] = '.';
-            current--;
-          }
-          result[current + olength - i - 1] = (char) ('0' + output % 10);
-          output /= 10;
-        }
-        index += olength + 1;
-      }
-    }
-    return index;
-  }
-
-  __device__ inline int d2s_size(const floating_decimal_64 v, const bool sign) {
-    int index = 0;
-    if (sign) {
-      index++;
-    }
-
-    uint64_t output = v.mantissa;
-    const uint32_t olength = decimalLength17(output);
-    int32_t exp = v.exponent + (int32_t) olength - 1;
-    bool scientificNotation = (exp < -3) || (exp >= 7);
-    
-    if (scientificNotation) {
-      index += olength + 1;
-      if (olength == 1) {
-        index++;
-      }
-      // 'E'
-      index++;
-      if (exp < 0) {
-        exp = -exp;
-        index++;
-      }
-      if (exp >= 100) {
-        index += 3;
-      } else if (exp >= 10) {
-        index += 2;
-      } else {
-        index++;
-      }
-    } else {
-      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
-      if (exp < 0) {
-        index += 1 - exp + olength;
-      } else if (exp + 1 >= olength) {
-        index += exp + 3;
-      } else {
-        index += olength + 1;
-      }
-    }
-    return index;
-  }
-
-  __device__ inline int to_chars(const floating_decimal_32 v, const bool sign, char* const result) {
-    // Step 5: Print the decimal representation.
-    int index = 0;
-    if (sign) {
-      result[index++] = '-';
-    }
-
-    uint32_t output = v.mantissa;
-    const uint32_t olength = decimalLength9(output);
-    int32_t exp = v.exponent + olength - 1;
-    bool scientificNotation = (exp < -3) || (exp >= 7);
-
-    if (scientificNotation) {
-      // Print in the format x.xxxxxE-yy.
-      for (int i = 0; i < olength - 1; i++) {
-        int c = output % 10; output /= 10;
-        result[index + olength - i] = (char) ('0' + c);
-      }
-      result[index] = (char) ('0' + output % 10);
-      result[index + 1] = '.';
-      index += olength + 1;
-      if (olength == 1) {
-        result[index++] = '0';
-      }
-
-      // Print 'E', the exponent sign, and the exponent, which has at most two digits.
-      result[index++] = 'E';
-      if (exp < 0) {
-        result[index++] = '-';
-        exp = -exp;
-      }
-      if (exp >= 10) {
-        result[index++] = (char) ('0' + exp / 10);
-      }
-      result[index++] = (char) ('0' + exp % 10);
-    } else {
-      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
-      if (exp < 0) {
-        // Decimal dot is before any of the digits.
-        result[index++] = '0';
-        result[index++] = '.';
-        for (int i = -1; i > exp; i--) {
-          result[index++] = '0';
-        }
-        int current = index;
-        for (int i = 0; i < olength; i++) {
-          result[current + olength - i - 1] = (char) ('0' + output % 10);
-          output /= 10;
-          index++;
-        }
-      } else if (exp + 1 >= olength) {
-        // Decimal dot is after any of the digits.
-        for (int i = 0; i < olength; i++) {
-          result[index + olength - i - 1] = (char) ('0' + output % 10);
-          output /= 10;
-        }
-        index += olength;
-        for (int i = olength; i < exp + 1; i++) {
-          result[index++] = '0';
-        }
-        result[index++] = '.';
-        result[index++] = '0';
-      } else {
-        // Decimal dot is somewhere between the digits.
-        int current = index + 1;
-        for (int i = 0; i < olength; i++) {
-          if (olength - i - 1 == exp) {
-            result[current + olength - i - 1] = '.';
-            current--;
-          }
-          result[current + olength - i - 1] = (char) ('0' + output % 10);
-          output /= 10;
-        }
-        index += olength + 1;
-      }
-    }
-    return index;
-  }
-
-  __device__ inline int f2s_size(const floating_decimal_32 v, const bool sign) {
-    // Step 5: Print the decimal representation.
-    int index = 0;
-    if (sign) {
-      index++;
-    }
-
-    uint32_t output = v.mantissa;
-    const uint32_t olength = decimalLength9(output);
-    int32_t exp = v.exponent + olength - 1;
-    bool scientificNotation = (exp < -3) || (exp >= 7);
-
-    if (scientificNotation) {
-      index += olength + 1;
-      if (olength == 1) {
-        index++;
-      }
-      // 'E'
-      index++;
-      if (exp < 0) {
-        index++;
-        exp = -exp;
-      }
-      if (exp >= 10) {
-        index++;
-      }
-      index++;
-    } else {
-      // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
-      if (exp < 0) {
-        // Decimal dot is before any of the digits.
-        index += 1 - exp + olength;
-      } else if (exp + 1 >= olength) {
-        // Decimal dot is after any of the digits.
-        index += exp + 3;
-      } else {
-        // Decimal dot is somewhere between the digits.
-        index += olength + 1;
-      }
-    }
-    return index;
-  }
-
-  __device__ inline bool d2d_small_int(const uint64_t ieeeMantissa, const uint32_t ieeeExponent,
-    floating_decimal_64* const v) {
-    const uint64_t m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
-    const int32_t e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS;
-
-    if (e2 > 0) {
-      // f = m2 * 2^e2 >= 2^53 is an integer.
-      // Ignore this case for now.
-      return false;
-    }
-
-    if (e2 < -52) {
-      // f < 1.
-      return false;
-    }
-
-    // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53.
-    // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0.
-    const uint64_t mask = (1ull << -e2) - 1;
-    const uint64_t fraction = m2 & mask;
-    if (fraction != 0) {
-      return false;
-    }
-
-    // f is an integer in the range [1, 2^53).
-    // Note: mantissa might contain trailing (decimal) 0's.
-    // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17().
-    v->mantissa = m2 >> -e2;
-    v->exponent = 0;
-    return true;
-  }
-
-  __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) {
-    // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
-    const uint64_t bits = double_to_bits(f);
-
-    // Decode bits into sign, mantissa, and exponent.
-    ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0;
-    const uint64_t ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1);
-    const uint32_t ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1));
-    // Case distinction; exit early for the easy cases.
-    if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) {
-      special = true;
-      return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent};
-    }
-    special = false;
-    floating_decimal_64 v;
-    const bool isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v);
-    if (isSmallInt) {
-      // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros.
-      // For scientific notation we need to move these zeros into the exponent.
-      // (This is not needed for fixed-point notation, so it might be beneficial to trim
-      // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.)
-      for (;;) {
-        const uint64_t q = div10(v.mantissa);
-        const uint32_t r = ((uint32_t) v.mantissa) - 10 * ((uint32_t) q);
-        if (r != 0) {
-          break;
-        }
-        v.mantissa = q;
-        ++v.exponent;
-      }
-    } else {
-      v = d2d(ieeeMantissa, ieeeExponent);
-    }
-    return v;
-  }
-
-  __device__ int d2s_buffered_n(double f, char* result) {
-    bool sign = false, special = false;
-    floating_decimal_64 v = d2d(f, sign, special);
-    if (special) {
-      return copy_special_str(result, sign, v.exponent, v.mantissa);
-    }
-    return to_chars(v, sign, result);
-  }
-
-  __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) {
-    // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
-    const uint32_t bits = float_to_bits(f);
-
-    // Decode bits into sign, mantissa, and exponent.
-    ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0;
-    const uint32_t ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1);
-    const uint32_t ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1);
-
-    // Case distinction; exit early for the easy cases.
-    if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) {
-      special = true;
-      return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent};
-    }
-    special = false;
-    return f2d(ieeeMantissa, ieeeExponent);
-  }
-
-  __device__ int f2s_buffered_n(float f, char* result) {
-    bool sign = false, special = false;
-    floating_decimal_32 v = f2d(f, sign, special);
-    if (special) {
-      return copy_special_str(result, sign, v.exponent, v.mantissa);
-    }
-    return to_chars(v, sign, result);
-  }
-
-
-  //===== compute float to string size =====
-
-  __device__ int compute_d2s_size(double value) {
-    bool sign = false, special = false;
-    floating_decimal_64 v = d2d(value, sign, special);
-    if (special) {
-      return special_str_size(sign, v.exponent, v.mantissa);
-    }
-    return d2s_size(v, sign);
-  }
-
-  __device__ int compute_f2s_size(float value) {
-    bool sign = false, special = false;
-    floating_decimal_32 v = f2d(value, sign, special);
-    if (special) {
-      return special_str_size(sign, v.exponent, v.mantissa);
-    }
-    return f2s_size(v, sign);
-  }
-
-  //===== APIs =====
-
-  __device__ int compute_ftos_size(double value, bool is_float) {
-    if (is_float) {
-        return compute_f2s_size(value);
-    } else {
-        return compute_d2s_size(value);
-    }
-  }
-
-  __device__ int float_to_string(double value, char* output, bool is_float) {
-      if (is_float) {
-          return f2s_buffered_n(value, output);
-      } else {
-          return d2s_buffered_n(value, output);
-      }
-  }
-};
-
-}
-}
-}
diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh
new file mode 100644
index 0000000000..b9905ae567
--- /dev/null
+++ b/src/main/cpp/src/ftos_converter.cuh
@@ -0,0 +1,1156 @@
+/*
+ * Copyright 2018 Ulf Adams
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuda/std/climits>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+#include <cuda/std/cassert>
+#include <cuda/std/cstdint>
+
+namespace spark_rapids_jni::ftos_converter {
+
+namespace {
+
+// d2s.c from ryu
+// A floating decimal representing m * 10^e.
+struct floating_decimal_64 {
+  // Decimal significand m of the represented value m * 10^e.
+  uint64_t mantissa;
+  // Decimal exponent e; its range is -324 to 308 inclusive, so a short would also be wide enough.
+  int32_t exponent;
+};
+
+// f2s.c from ryu
+// A floating decimal representing m * 10^e.
+struct floating_decimal_32 {
+  // Decimal significand m of the represented value m * 10^e.
+  uint32_t mantissa;
+  // Decimal exponent e; its range is -45 to 38 inclusive, so a short would also be wide enough.
+  int32_t exponent;
+};
+
+//===== constants from ryu =====
+
+// Sizes used with the lookup tables below; the tables are generated by PrintDoubleLookupTable.
+constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125;
+constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125;
+constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64);  // 61
+constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64);  // 61
+constexpr unsigned int DOUBLE_MANTISSA_BITS = 52;  // stored mantissa bits of a double
+constexpr unsigned int DOUBLE_EXPONENT_BITS = 11;  // exponent bits of a double
+constexpr unsigned int DOUBLE_BIAS = 1023;  // unsigned: d2d's `1 - DOUBLE_BIAS - ...` wraps before the int32_t store
+constexpr unsigned int FLOAT_MANTISSA_BITS = 23;  // float counterpart of DOUBLE_MANTISSA_BITS
+constexpr unsigned int FLOAT_EXPONENT_BITS = 8;  // float counterpart of DOUBLE_EXPONENT_BITS
+constexpr unsigned int FLOAT_BIAS = 127;  // float counterpart of DOUBLE_BIAS
+
+__constant__  // Row k: {low, high} words of the multiplier for 1/5^(k * POW5_TABLE_SIZE); read by double_computeInvPow5.
+uint64_t const DOUBLE_POW5_INV_SPLIT2[15][2] = {
+  {                    1u, 2305843009213693952u },
+  {  5955668970331000884u, 1784059615882449851u },
+  {  8982663654677661702u, 1380349269358112757u },
+  {  7286864317269821294u, 2135987035920910082u },
+  {  7005857020398200553u, 1652639921975621497u },
+  { 17965325103354776697u, 1278668206209430417u },
+  {  8928596168509315048u, 1978643211784836272u },
+  { 10075671573058298858u, 1530901034580419511u },
+  {   597001226353042382u, 1184477304306571148u },
+  {  1527430471115325346u, 1832889850782397517u },
+  { 12533209867169019542u, 1418129833677084982u },
+  {  5577825024675947042u, 2194449627517475473u },
+  { 11006974540203867551u, 1697873161311732311u },
+  { 10313493231639821582u, 1313665730009899186u },
+  { 12701016819766672773u, 2032799256770390445u }
+};
+
+__constant__  // 2-bit corrections, 16 per word, added to the low result word in double_computeInvPow5.
+uint32_t const POW5_INV_OFFSETS[19] = {
+  0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555,
+  0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054,
+  0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554,
+  0x00000000
+};
+
+__constant__  // Row k: {low, high} words of the multiplier for 5^(k * POW5_TABLE_SIZE); read by double_computePow5.
+uint64_t const DOUBLE_POW5_SPLIT2[13][2] = {
+  {                    0u, 1152921504606846976u },
+  {                    0u, 1490116119384765625u },
+  {  1032610780636961552u, 1925929944387235853u },
+  {  7910200175544436838u, 1244603055572228341u },
+  { 16941905809032713930u, 1608611746708759036u },
+  { 13024893955298202172u, 2079081953128979843u },
+  {  6607496772837067824u, 1343575221513417750u },
+  { 17332926989895652603u, 1736530273035216783u },
+  { 13037379183483547984u, 2244412773384604712u },
+  {  1605989338741628675u, 1450417759929778918u },
+  {  9630225068416591280u, 1874621017369538693u },
+  {   665883850346957067u, 1211445438634777304u },
+  { 14931890668723713708u, 1565756531257009982u }
+};
+
+__constant__  // 2-bit corrections, 16 per word, added to the low result word in double_computePow5.
+uint32_t const POW5_OFFSETS[21] = {
+  0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995,
+  0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540,
+  0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151,
+  0x55559155, 0x51405555, 0x00000105
+};
+
+constexpr uint32_t POW5_TABLE_SIZE = 26;  // number of entries in DOUBLE_POW5_TABLE
+
+__constant__  // DOUBLE_POW5_TABLE[k] == 5^k for k = 0 .. 25.
+uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {
+1ull, 5ull, 25ull, 125ull, 625ull, 3125ull, 15625ull, 78125ull, 390625ull,
+1953125ull, 9765625ull, 48828125ull, 244140625ull, 1220703125ull, 6103515625ull,
+30517578125ull, 152587890625ull, 762939453125ull, 3814697265625ull,
+19073486328125ull, 95367431640625ull, 476837158203125ull,
+2384185791015625ull, 11920928955078125ull, 59604644775390625ull,
+298023223876953125ull //, 1490116119384765625ull
+};
+
+//===== common.h from ryu =====
+
+// Counts the decimal digits of v, which must have at most 9 digits.
+__device__ inline uint32_t decimalLength9(uint32_t const v) {
+  // Precondition: v is not a 10-digit number.
+  // (f2s: 9 digits are sufficient for round-tripping.)
+  // (d2fixed: We print 9-digit blocks.)
+  assert(v < 1000000000);
+  return v >= 100000000 ? 9
+       : v >= 10000000 ? 8
+       : v >= 1000000 ? 7
+       : v >= 100000 ? 6
+       : v >= 10000 ? 5
+       : v >= 1000 ? 4
+       : v >= 100 ? 3
+       : v >= 10 ? 2
+       : 1;
+}
+
+// Computes ceil(log_2(5^e)) for e > 0 and 1 for e == 0; requires 0 <= e <= 3528.
+__device__ inline int32_t pow5bits(int32_t const e) {
+  // The approximation is valid until the 32-bit multiplication overflows at e = 3529; with a
+  // 64-bit multiplication it would first fail at 5^4004, which is just greater than 2^9297.
+  assert(e >= 0);
+  assert(e <= 3528);
+  uint32_t const scaled = static_cast<uint32_t>(e) * 1217359;
+  return static_cast<int32_t>((scaled >> 19) + 1);
+}
+
+// Computes floor(log_10(2^e)) for 0 <= e <= 1650; first fails at 2^1651 (just above 10^297).
+__device__ inline uint32_t log10Pow2(int32_t const e) {
+  assert(e >= 0);
+  assert(e <= 1650);
+  uint32_t const scaled = static_cast<uint32_t>(e) * 78913;
+  return scaled >> 18;
+}
+
+// Computes floor(log_10(5^e)) for 0 <= e <= 2620; first fails at 5^2621 (just above 10^1832).
+__device__ inline uint32_t log10Pow5(int32_t const e) {
+  assert(e >= 0);
+  assert(e <= 2620);
+  uint32_t const scaled = static_cast<uint32_t>(e) * 732923;
+  return scaled >> 20;
+}
+
+__device__ inline uint32_t pow5factor_32(uint32_t value) {
+  // Counts how many times value can be divided by 5 without remainder; value must be non-zero.
+  uint32_t exponent = 0;
+  while (true) {
+    assert(value != 0);
+    if (value % 5 != 0) {
+      // No further factor of 5 left.
+      break;
+    }
+    value /= 5;
+    ++exponent;
+  }
+  return exponent;
+}
+
+// True when 5^p divides value, i.e. value contains at least p factors of 5.
+__device__ inline bool multipleOfPowerOf5_32(uint32_t const value, uint32_t const p) {
+  return p <= pow5factor_32(value);
+}
+
+// True when 2^p divides value, i.e. the low p bits of value are all zero.
+__device__ inline bool multipleOfPowerOf2_32(uint32_t const value, uint32_t const p) {
+  uint32_t const lowBitsMask = (1u << p) - 1;  // __builtin_ctz doesn't appear to be faster here.
+  return (value & lowBitsMask) == 0;
+}
+
+// Returns floor(m * factor / 2^shift), which must fit in 32 bits; requires shift > 32.
+// Avoiding uint128_t seems to be slightly faster, although its generated code looks slightly nicer.
+__device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, int32_t const shift) {
+  assert(shift > 32);
+
+  // Multiply m by each 32-bit half of factor. The casts help MSVC to avoid calls to the
+  // __allmul library function.
+  uint64_t const lowProduct = (uint64_t)m * (uint32_t)(factor);
+  uint64_t const highProduct = (uint64_t)m * (uint32_t)(factor >> 32);
+
+  // Bits 0..31 of the low product cannot reach the result because shift > 32, so they are
+  // dropped before the two partial products are combined.
+  uint64_t const combined = (lowProduct >> 32) + highProduct;
+  uint64_t const shiftedSum = combined >> (shift - 32);
+  // The shifted value is expected to fit in 32 bits before it is narrowed.
+  assert(shiftedSum <= UINT32_MAX);
+  return (uint32_t) shiftedSum;
+}
+
+__device__ inline int copy_special_str(char * const result, bool const sign, bool const exponent, bool const mantissa) {
+  // Writes "NaN", "[-]Infinity" or "[-]0.0" (no terminator) and returns the number of chars written.
+  if (mantissa) {
+    memcpy(result, "NaN", 3);
+    return 3;
+  }
+  int const prefix = sign ? 1 : 0;  // room taken by the optional leading '-'
+  if (sign) { result[0] = '-'; }
+  if (!exponent) {
+    memcpy(result + prefix, "0.0", 3);
+    return prefix + 3;
+  }
+  memcpy(result + prefix, "Infinity", 8);
+  return prefix + 8;
+}
+
+__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa) {
+  // Character count of "NaN", "[-]Infinity" or "[-]0.0", selected by the same flags.
+  if (mantissa) {
+    return 3;  // "NaN" never carries a sign
+  }
+  // A set sign flag adds one char for '-'; "Infinity" has 8 chars and "0.0" has 3.
+  int const body = exponent ? 8 : 3;
+  return sign + body;
+}
+
+__device__ inline uint32_t float_to_bits(float const f) {
+  uint32_t raw = 0;  // receives the unmodified bit pattern of f
+  memcpy(&raw, &f, sizeof(f));
+  return raw;
+}
+
+__device__ inline uint64_t double_to_bits(double const d) {
+  uint64_t raw = 0;  // receives the unmodified bit pattern of d
+  memcpy(&raw, &d, sizeof(d));
+  return raw;
+}
+
+//===== d2s_intrinsics.h from ryu =====
+
+__device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t* const productHi) {
+  // Returns the low 64 bits of a * b and stores the high 64 bits in *productHi. (The casts help MSVC avoid __allmul.)
+  uint32_t const aLo = (uint32_t)a;
+  uint32_t const aHi = (uint32_t)(a >> 32);
+  uint32_t const bLo = (uint32_t)b;
+  uint32_t const bHi = (uint32_t)(b >> 32);
+
+  uint64_t const b00 = (uint64_t)aLo * bLo;  // weight 2^0
+  uint64_t const b01 = (uint64_t)aLo * bHi;  // weight 2^32
+  uint64_t const b10 = (uint64_t)aHi * bLo;  // weight 2^32
+  uint64_t const b11 = (uint64_t)aHi * bHi;  // weight 2^64
+
+  uint32_t const b00Lo = (uint32_t)b00;
+  uint32_t const b00Hi = (uint32_t)(b00 >> 32);
+
+  uint64_t const mid1 = b10 + b00Hi;  // cannot overflow: (2^32 - 1)^2 + (2^32 - 1) < 2^64
+  uint32_t const mid1Lo = (uint32_t)(mid1);
+  uint32_t const mid1Hi = (uint32_t)(mid1 >> 32);
+
+  uint64_t const mid2 = b01 + mid1Lo;  // cannot overflow, for the same reason as mid1
+  uint32_t const mid2Lo = (uint32_t)(mid2);
+  uint32_t const mid2Hi = (uint32_t)(mid2 >> 32);
+
+  uint64_t const pHi = b11 + mid1Hi + mid2Hi;  // upper halves of both middle sums carry into the high word
+  uint64_t const pLo = ((uint64_t)mid2Lo << 32) | b00Lo;
+
+  *productHi = pHi;
+  return pLo;
+}
+
+__device__ inline uint64_t shiftright128(uint64_t const lo, uint64_t const hi, uint32_t const dist) {
+  // Returns the low 64 bits of (hi:lo) >> dist; only 0 < dist < 64 is supported by this helper.
+  assert(dist < 64);
+  assert(dist > 0);
+  return (lo >> dist) | (hi << (64 - dist));
+}
+
+__device__ inline uint64_t div5(uint64_t const x) {
+  return x / 5;  // truncating unsigned division by the constant 5
+}
+
+__device__ inline uint64_t div10(uint64_t const x) {
+  return x / 10;  // truncating unsigned division by the constant 10
+}
+
+__device__ inline uint64_t div100(uint64_t const x) {
+  return x / 100;  // truncating unsigned division by the constant 100
+}
+
+__device__ inline uint32_t pow5Factor(uint64_t value) {
+  uint64_t const m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64)
+  uint64_t const n_div_5 = 3689348814741910323u;  // floor(2^64 / 5): x * m_inv_5 <= n_div_5 iff 5 divides x
+  uint32_t count = 0;
+  for (;;) {
+    assert(value != 0);
+    value *= m_inv_5; // equals value / 5 whenever value is a multiple of 5
+    if (value > n_div_5)
+      break; // the previous value was not a multiple of 5
+    ++count;
+  }
+  return count; // number of factors of 5 in the original value
+}
+
+// True when 5^p divides value, i.e. value contains at least p factors of 5.
+__device__ inline bool multipleOfPowerOf5(uint64_t const value, uint32_t const p) {
+  // I tried a case distinction on p, but there was no performance difference.
+  return p <= pow5Factor(value);
+}
+
+// True when 2^p divides value; requires value != 0 and p < 64.
+__device__ inline bool multipleOfPowerOf2(uint64_t const value, uint32_t const p) {
+  assert(value != 0);
+  assert(p < 64);
+  uint64_t const lowBitsMask = (1ull << p) - 1;  // __builtin_ctzll doesn't appear to be faster here.
+  return (value & lowBitsMask) == 0;
+}
+
+__device__ inline uint64_t mulShift64(uint64_t const m, uint64_t const* const mul, int32_t const j) {
+  // Returns the low 64 bits of (m * (mul[1] * 2^64 + mul[0])) >> j; m is at most 55 bits wide.
+  uint64_t upperHi;                                       // weight 2^128
+  uint64_t const upperLo = umul128(m, mul[1], &upperHi);  // weight 2^64
+  uint64_t lowerHi;                                       // weight 2^64
+  umul128(m, mul[0], &lowerHi);                           // weight 2^0 part is discarded
+  uint64_t const middle = lowerHi + upperLo;
+  if (middle < lowerHi) {
+    ++upperHi; // carry out of the middle word
+  }
+  return shiftright128(middle, upperHi, j - 64);
+}
+
+__device__ inline uint64_t mulShiftAll64(uint64_t const m, uint64_t const* const mul, int32_t const j,
+  uint64_t* const vp, uint64_t* const vm, uint32_t const mmShift) {
+  *vp = mulShift64(4 * m + 2, mul, j);            // scaled upper bound, mp = 4 * m + 2
+  *vm = mulShift64(4 * m - 1 - mmShift, mul, j);  // scaled lower bound, mm = 4 * m - 1 - mmShift
+  return mulShift64(4 * m, mul, j);               // scaled value itself, mv = 4 * m
+}
+
+//===== d2s_small_table.h from ryu =====
+
+// Computes 5^i in the form required by Ryu and stores it as result[0] (low word), result[1] (high word).
+__device__ inline void double_computePow5(uint32_t const i, uint64_t* const result) {
+  uint32_t const base = i / POW5_TABLE_SIZE;  // row of DOUBLE_POW5_SPLIT2 to start from
+  uint32_t const base2 = base * POW5_TABLE_SIZE;  // exponent that row stands for
+  uint32_t const offset = i - base2;  // remaining exponent, 0 <= offset < POW5_TABLE_SIZE
+  uint64_t const* const mul = DOUBLE_POW5_SPLIT2[base];
+  if (offset == 0) {
+    result[0] = mul[0];  // i is a multiple of POW5_TABLE_SIZE: the table row is the answer
+    result[1] = mul[1];
+    return;
+  }
+  uint64_t const m = DOUBLE_POW5_TABLE[offset];  // 5^offset
+  uint64_t high1;
+  uint64_t const low1 = umul128(m, mul[1], &high1);
+  uint64_t high0;
+  uint64_t const low0 = umul128(m, mul[0], &high0);
+  uint64_t const sum = high0 + low1;
+  if (sum < high0) {
+    ++high1; // overflow into high1
+  }
+  // high1 | sum | low0
+  uint32_t const delta = pow5bits(i) - pow5bits(base2);  // right shift applied to the 192-bit product
+  result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);  // plus 2-bit correction
+  result[1] = shiftright128(sum, high1, delta);
+}
+
+// Computes 5^-i in the form required by Ryu and stores it as result[0] (low word), result[1] (high word).
+__device__ inline void double_computeInvPow5(uint32_t const i, uint64_t* const result) {
+  uint32_t const base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE;  // i / POW5_TABLE_SIZE rounded up
+  uint32_t const base2 = base * POW5_TABLE_SIZE;  // exponent that table row stands for, base2 >= i
+  uint32_t const offset = base2 - i;  // 0 <= offset < POW5_TABLE_SIZE
+  uint64_t const* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2
+  if (offset == 0) {
+    result[0] = mul[0];  // i is a multiple of POW5_TABLE_SIZE: the table row is the answer
+    result[1] = mul[1];
+    return;
+  }
+  uint64_t const m = DOUBLE_POW5_TABLE[offset];  // 5^offset, multiplied back onto 1/5^base2
+  uint64_t high1;
+  uint64_t const low1 = umul128(m, mul[1], &high1);
+  uint64_t high0;
+  uint64_t const low0 = umul128(m, mul[0] - 1, &high0);  // low word minus 1; a 1 is added back below
+  uint64_t const sum = high0 + low1;
+  if (sum < high0) {
+    ++high1; // overflow into high1
+  }
+  // high1 | sum | low0
+  uint32_t const delta = pow5bits(base2) - pow5bits(i);  // right shift applied to the 192-bit product
+  result[0] = shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);  // plus 2-bit correction
+  result[1] = shiftright128(sum, high1, delta);
+}
+
+//===== f2s_intrinsics.h from ryu =====
+
+__device__ inline uint32_t mulPow5InvDivPow2(uint32_t const m, uint32_t const q, int32_t const j) {
+  // The inverse multipliers are defined as [2^x / 5^y] + 1. The upper 64 bits of the double lookup
+  // result hold the correct bits for [2^x / 5^y], so the 1 is added here; this relies on the 1
+  // that is already stored in the table never overflowing into the upper 64 bits.
+  uint64_t invPow5[2];
+  double_computeInvPow5(q, invPow5);
+  return mulShift32(m, invPow5[1] + 1, j);
+}
+
+__device__ inline uint32_t mulPow5divPow2(uint32_t const m, uint32_t const i, int32_t const j) {
+  uint64_t split[2];  // filled by double_computePow5; only the high word split[1] is used here
+  double_computePow5(i, split);
+  return mulShift32(m, split[1], j);
+}
+
+//===== d2s.c and f2s.c from ryu =====
+
+__device__ inline uint32_t decimalLength17(uint64_t const v) {
+  // Counts the decimal digits of v with a fixed comparison chain, which is slightly faster
+  // than a loop. The average output length is 16.38 digits, so the checks run high-to-low.
+  // Precondition: v is not an 18, 19, or 20-digit number.
+  // (17 digits are sufficient for round-tripping.)
+  assert(v < 100000000000000000L);
+  return v >= 10000000000000000L ? 17
+       : v >= 1000000000000000L ? 16
+       : v >= 100000000000000L ? 15
+       : v >= 10000000000000L ? 14
+       : v >= 1000000000000L ? 13
+       : v >= 100000000000L ? 12
+       : v >= 10000000000L ? 11
+       : v >= 1000000000L ? 10
+       : v >= 100000000L ? 9
+       : v >= 10000000L ? 8
+       : v >= 1000000L ? 7
+       : v >= 100000L ? 6
+       : v >= 10000L ? 5
+       : v >= 1000L ? 4
+       : v >= 100L ? 3
+       : v >= 10L ? 2
+       : 1;
+}
+
+__device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t const ieeeExponent) {
+  int32_t e2;
+  uint64_t m2;
+  if (ieeeExponent == 0) {
+    // We subtract 2 so that the bounds computation has 2 additional bits.
+    e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
+    m2 = ieeeMantissa;
+  } else {
+    e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
+    m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
+  }
+  bool const even = (m2 & 1) == 0;
+  bool const acceptBounds = even;
+
+  // Step 2: Determine the interval of valid decimal representations.
+  uint64_t const mv = 4 * m2;
+  // Implicit bool -> int conversion. True is 1, false is 0.
+  uint32_t const mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
+  // We would compute mp and mm like this:
+  // uint64_t mp = 4 * m2 + 2;
+  // uint64_t mm = mv - 1 - mmShift;
+
+  // Step 3: Convert to a decimal power base using 128-bit arithmetic.
+  uint64_t vr, vp, vm;
+  int32_t e10;
+  bool vmIsTrailingZeros = false;
+  bool vrIsTrailingZeros = false;
+  if (e2 >= 0) {
+    // I tried special-casing q == 0, but there was no effect on performance.
+    // This expression is slightly faster than max(0, log10Pow2(e2) - 1).
+    uint32_t const q = log10Pow2(e2) - (e2 > 3);
+    e10 = (int32_t) q;
+    int32_t const k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
+    int32_t const i = -e2 + (int32_t) q + k;
+    uint64_t pow5[2];
+    double_computeInvPow5(q, pow5);
+    vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift);
+
+    if (q <= 21) {
+      // This should use q <= 22, but I think 21 is also safe. Smaller values
+      // may still be safe, but it's more difficult to reason about them.
+      // Only one of mp, mv, and mm can be a multiple of 5, if any.
+      uint32_t const mvMod5 = ((uint32_t) mv) - 5 * ((uint32_t) div5(mv));
+      if (mvMod5 == 0) {
+        vrIsTrailingZeros = multipleOfPowerOf5(mv, q);
+      } else if (acceptBounds) {
+        // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q
+        // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q
+        // <=> true && pow5Factor(mm) >= q, since e2 >= q.
+        vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q);
+      } else {
+        // Same as min(e2 + 1, pow5Factor(mp)) >= q.
+        vp -= multipleOfPowerOf5(mv + 2, q);
+      }
+    }
+  } else {
+    // This expression is slightly faster than max(0, log10Pow5(-e2) - 1).
+    uint32_t const q = log10Pow5(-e2) - (-e2 > 1);
+    e10 = (int32_t) q + e2;
+    int32_t const i = -e2 - (int32_t) q;
+    int32_t const k = pow5bits(i) - DOUBLE_POW5_BITCOUNT;
+    int32_t const j = (int32_t) q - k;
+
+    uint64_t pow5[2];
+    double_computePow5(i, pow5);
+    vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift);
+
+    if (q <= 1) {
+      // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
+      // mv = 4 * m2, so it always has at least two trailing 0 bits.
+      vrIsTrailingZeros = true;
+      if (acceptBounds) {
+        // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
+        vmIsTrailingZeros = mmShift == 1;
+      } else {
+        // mp = mv + 2, so it always has at least one trailing 0 bit.
+        --vp;
+      }
+    } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here.
+      // We want to know if the full product has at least q trailing zeros.
+      // We need to compute min(p2(mv), p5(mv) - e2) >= q
+      // <=> p2(mv) >= q && p5(mv) - e2 >= q
+      // <=> p2(mv) >= q (because -e2 >= q)
+      vrIsTrailingZeros = multipleOfPowerOf2(mv, q);
+    }
+  }
+
+  // Step 4: Find the shortest decimal representation in the interval of valid representations.
+  int32_t removed = 0;
+  uint8_t lastRemovedDigit = 0;
+  uint64_t output;
+  // On average, we remove ~2 digits.
+  if (vmIsTrailingZeros || vrIsTrailingZeros) {
+    // General case, which happens rarely (~0.7%).
+    for (;;) {
+      uint64_t const vpDiv10 = div10(vp);
+      uint64_t const vmDiv10 = div10(vm);
+      if (vpDiv10 <= vmDiv10) {
+        break;
+      }
+      uint32_t const vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10);
+      uint64_t const vrDiv10 = div10(vr);
+      uint32_t const vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
+      vmIsTrailingZeros &= vmMod10 == 0;
+      vrIsTrailingZeros &= lastRemovedDigit == 0;
+      lastRemovedDigit = (uint8_t) vrMod10;
+      vr = vrDiv10;
+      vp = vpDiv10;
+      vm = vmDiv10;
+      ++removed;
+    }
+
+    if (vmIsTrailingZeros) {
+      for (;;) {
+        uint64_t const vmDiv10 = div10(vm);
+        uint32_t const vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10);
+        if (vmMod10 != 0) {
+          break;
+        }
+        uint64_t const vpDiv10 = div10(vp);
+        uint64_t const vrDiv10 = div10(vr);
+        uint32_t const vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
+        vrIsTrailingZeros &= lastRemovedDigit == 0;
+        lastRemovedDigit = (uint8_t) vrMod10;
+        vr = vrDiv10;
+        vp = vpDiv10;
+        vm = vmDiv10;
+        ++removed;
+      }
+    }
+
+    if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
+      // Round even if the exact number is .....50..0.
+      lastRemovedDigit = 4;
+    }
+    // We need to take vr + 1 if vr is outside bounds or we need to round up.
+    output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
+  } else {
+    // Specialized for the common case (~99.3%). Percentages below are relative to this.
+    bool roundUp = false;
+    uint64_t const vpDiv100 = div100(vp);
+    uint64_t const vmDiv100 = div100(vm);
+    if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%).
+      uint64_t const vrDiv100 = div100(vr);
+      uint32_t const vrMod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrDiv100);
+      roundUp = vrMod100 >= 50;
+      vr = vrDiv100;
+      vp = vpDiv100;
+      vm = vmDiv100;
+      removed += 2;
+    }
+    // Loop iterations below (approximately), without optimization above:
+    // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02%
+    // Loop iterations below (approximately), with optimization above:
+    // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02%
+    for (;;) {
+      uint64_t const vpDiv10 = div10(vp);
+      uint64_t const vmDiv10 = div10(vm);
+      if (vpDiv10 <= vmDiv10) {
+        break;
+      }
+      uint64_t const vrDiv10 = div10(vr);
+      uint32_t const vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
+      roundUp = vrMod10 >= 5;
+      vr = vrDiv10;
+      vp = vpDiv10;
+      vm = vmDiv10;
+      ++removed;
+    }
+
+    // We need to take vr + 1 if vr is outside bounds or we need to round up.
+    output = vr + (vr == vm || roundUp);
+  }
+  int32_t const exp = e10 + removed;
+
+  floating_decimal_64 fd;
+  fd.exponent = exp;
+  fd.mantissa = output;
+  return fd;
+}
+
+__device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t const ieeeExponent) {  // decoded binary32 fields -> decimal mantissa/exponent pair
+  int32_t e2;
+  uint32_t m2;
+  if (ieeeExponent == 0) {  // zero exponent field: m2 gets no implicit leading 1 bit
+    // We subtract 2 so that the bounds computation has 2 additional bits.
+    e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
+    m2 = ieeeMantissa;
+  } else {
+    e2 = (int32_t) ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
+    m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa;
+  }
+  bool const even = (m2 & 1) == 0;
+  bool const acceptBounds = even;  // the interval end points are only accepted when m2 is even
+
+  // Step 2: Determine the interval of valid decimal representations.
+  uint32_t const mv = 4 * m2;
+  uint32_t const mp = 4 * m2 + 2;
+  // Implicit bool -> int conversion. True is 1, false is 0.
+  uint32_t const mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
+  uint32_t const mm = 4 * m2 - 1 - mmShift;
+
+  // Step 3: Convert to a decimal power base using 64-bit arithmetic.
+  uint32_t vr, vp, vm;
+  int32_t e10;
+  bool vmIsTrailingZeros = false;
+  bool vrIsTrailingZeros = false;
+  uint8_t lastRemovedDigit = 0;  // most recent digit dropped from vr; drives the final rounding decision
+  if (e2 >= 0) {
+    uint32_t const q = log10Pow2(e2);
+    e10 = (int32_t) q;
+    int32_t const k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
+    int32_t const i = -e2 + (int32_t) q + k;
+    vr = mulPow5InvDivPow2(mv, q, i);
+    vp = mulPow5InvDivPow2(mp, q, i);
+    vm = mulPow5InvDivPow2(mm, q, i);
+    if (q != 0 && (vp - 1) / 10 <= vm / 10) {
+      // We need to know one removed digit even if we are not going to loop below. We could use
+      // q = X - 1 above, except that would require 33 bits for the result, and we've found that
+      // 32-bit arithmetic is faster even on 64-bit machines.
+      int32_t const l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) (q - 1)) - 1;
+      lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t) q - 1 + l) % 10);
+    }
+    if (q <= 9) {
+      // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well.
+      // Only one of mp, mv, and mm can be a multiple of 5, if any.
+      if (mv % 5 == 0) {
+        vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q);
+      } else if (acceptBounds) {
+        vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q);
+      } else {
+        vp -= multipleOfPowerOf5_32(mp, q);  // bool result used as 0/1: shrinks vp by one when mp is a multiple of 5^q
+      }
+    }
+  } else {
+    uint32_t const q = log10Pow5(-e2);
+    e10 = (int32_t) q + e2;
+    int32_t const i = -e2 - (int32_t) q;
+    int32_t const k = pow5bits(i) - FLOAT_POW5_BITCOUNT;
+    int32_t j = (int32_t) q - k;
+    vr = mulPow5divPow2(mv, (uint32_t) i, j);
+    vp = mulPow5divPow2(mp, (uint32_t) i, j);
+    vm = mulPow5divPow2(mm, (uint32_t) i, j);
+    if (q != 0 && (vp - 1) / 10 <= vm / 10) {  // same "one removed digit" pre-computation as in the e2 >= 0 branch
+      j = (int32_t) q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT);
+      lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, (uint32_t) (i + 1), j) % 10);
+    }
+    if (q <= 1) {
+      // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
+      // mv = 4 * m2, so it always has at least two trailing 0 bits.
+      vrIsTrailingZeros = true;
+      if (acceptBounds) {
+        // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
+        vmIsTrailingZeros = mmShift == 1;
+      } else {
+        // mp = mv + 2, so it always has at least one trailing 0 bit.
+        --vp;
+      }
+    } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here.
+      vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1);
+    }
+  }
+
+  // Step 4: Find the shortest decimal representation in the interval of valid representations.
+  int32_t removed = 0;  // number of low-order decimal digits dropped so far
+  uint32_t output;
+  if (vmIsTrailingZeros || vrIsTrailingZeros) {
+    // General case, which happens rarely (~4.0%).
+    while (vp / 10 > vm / 10) {
+      vmIsTrailingZeros &= vm % 10 == 0;
+      vrIsTrailingZeros &= lastRemovedDigit == 0;
+      lastRemovedDigit = (uint8_t) (vr % 10);
+      vr /= 10;
+      vp /= 10;
+      vm /= 10;
+      ++removed;
+    }
+    if (vmIsTrailingZeros) {
+      while (vm % 10 == 0) {
+        vrIsTrailingZeros &= lastRemovedDigit == 0;
+        lastRemovedDigit = (uint8_t) (vr % 10);
+        vr /= 10;
+        vp /= 10;
+        vm /= 10;
+        ++removed;
+      }
+    }
+    if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
+      // Round even if the exact number is .....50..0.
+      lastRemovedDigit = 4;
+    }
+    // We need to take vr + 1 if vr is outside bounds or we need to round up.
+    output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
+  } else {
+    // Specialized for the common case (~96.0%). Percentages below are relative to this.
+    // Loop iterations below (approximately):
+    // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01%
+    while (vp / 10 > vm / 10) {
+      lastRemovedDigit = (uint8_t) (vr % 10);
+      vr /= 10;
+      vp /= 10;
+      vm /= 10;
+      ++removed;
+    }
+    // We need to take vr + 1 if vr is outside bounds or we need to round up.
+    output = vr + (vr == vm || lastRemovedDigit >= 5);
+  }
+  int32_t const exp = e10 + removed;  // each digit dropped in step 4 raises the decimal exponent by one
+
+  floating_decimal_32 fd;
+  fd.exponent = exp;
+  fd.mantissa = output;
+  return fd;
+}
+
+__device__ inline int to_chars(floating_decimal_64 const v, bool const sign, char* const result) {  // writes the text of v into result and returns the char count; no terminator is written
+  // Step 5: Print the decimal representation.
+  int index = 0;
+  if (sign) {
+    result[index++] = '-';
+  }
+
+  uint64_t output = v.mantissa;
+  uint32_t const olength = decimalLength17(output);
+  int32_t exp = v.exponent + (int32_t) olength - 1;  // exponent of the leading digit, taking olength as the digit count of output
+  bool scientificNotation = (exp < -3) || (exp >= 7);  // i.e. magnitude below 1E-3 or at least 1E7
+  
+  // Values outside the interval [1E-3, 1E7) are printed in scientific notation.
+  if (scientificNotation) {
+    // Print in the format x.xxxxxE-yy.
+    for (uint32_t i = 0; i < olength - 1; ++i) {  // digits are filled right-to-left, leaving one slot for the '.'
+      uint32_t const c = output % 10; output /= 10;
+      result[index + olength - i] = (char) ('0' + c);
+    }
+    result[index] = '0' + output % 10;
+    result[index + 1] = '.';
+    index += olength + 1;
+    if (olength == 1) {  // single digit: emit "d.0" rather than a bare "d."
+      result[index++] = '0';
+    }
+    // Print 'E', the exponent sign, and the exponent, which has at most three digits.
+    result[index++] = 'E';
+    if (exp < 0) {
+      result[index++] = '-';
+      exp = -exp;
+    }
+    if (exp >= 100) {
+        result[index++] = (char) ('0' + exp / 100);
+        exp %= 100;
+        result[index++] = (char) ('0' + exp / 10);
+      } else if (exp >= 10) {
+        result[index++] = (char) ('0' + exp / 10);
+      }
+      result[index++] = (char) ('0' + exp % 10);  // the last exponent digit is written on every path
+  } else {
+    // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
+    if (exp < 0) {
+      // Decimal dot is before any of the digits.
+      result[index++] = '0';
+      result[index++] = '.';
+      for (int i = -1; i > exp; i--) {  // -exp - 1 zeros between the dot and the first digit
+        result[index++] = '0';
+      }
+      int current = index;
+      for (int i = 0; i < olength; i++) {
+        result[current + olength - i - 1] = (char) ('0' + output % 10);
+        output /= 10;
+        index++;
+      }
+    } else if (exp + 1 >= olength) {
+      // Decimal dot is after any of the digits.
+      for (int i = 0; i < olength; i++) {
+        result[index + olength - i - 1] = (char) ('0' + output % 10);
+        output /= 10;
+      }
+      index += olength;
+      for (int i = olength; i < exp + 1; i++) {  // pad the integer part with zeros up to exp + 1 digits
+        result[index++] = '0';
+      }
+      result[index++] = '.';
+      result[index++] = '0';
+    } else {
+      // Decimal dot is somewhere between the digits.
+      int current = index + 1;  // fractional digits sit one slot to the right to make room for the '.'
+      for (int i = 0; i < olength; i++) {
+        if (olength - i - 1 == exp) {
+          result[current + olength - i - 1] = '.';
+          current--;  // integer digits are written without the one-slot shift
+        }
+        result[current + olength - i - 1] = (char) ('0' + output % 10);
+        output /= 10;
+      }
+      index += olength + 1;
+    }
+  }
+  return index;
+}
+
+__device__ inline int d2s_size(floating_decimal_64 const v, bool const sign) {  // counts the chars that to_chars(floating_decimal_64, ...) writes, without writing them
+  int index = 0;
+  if (sign) {
+    index++;
+  }
+
+  uint64_t output = v.mantissa;
+  uint32_t const olength = decimalLength17(output);
+  int32_t exp = v.exponent + (int32_t) olength - 1;
+  bool scientificNotation = (exp < -3) || (exp >= 7);  // same notation switch as in to_chars
+  
+  if (scientificNotation) {
+    index += olength + 1;  // all digits plus the '.'
+    if (olength == 1) {  // the extra '0' of "d.0"
+      index++;
+    }
+    // 'E'
+    index++;
+    if (exp < 0) {
+      exp = -exp;
+      index++;
+    }
+    if (exp >= 100) {
+      index += 3;
+    } else if (exp >= 10) {
+      index += 2;
+    } else {
+      index++;
+    }
+  } else {
+    // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
+    if (exp < 0) {
+      index += 1 - exp + olength;  // "0." + (-exp - 1) zeros + olength digits
+    } else if (exp + 1 >= olength) {
+      index += exp + 3;  // exp + 1 integer digits + ".0"
+    } else {
+      index += olength + 1;  // all digits plus the '.'
+    }
+  }
+  return index;
+}
+
+__device__ inline int to_chars(floating_decimal_32 const v, bool const sign, char* const result) {  // writes the text of v into result and returns the char count; no terminator is written
+  // Step 5: Print the decimal representation.
+  int index = 0;
+  if (sign) {
+    result[index++] = '-';
+  }
+
+  uint32_t output = v.mantissa;
+  uint32_t const olength = decimalLength9(output);
+  int32_t exp = v.exponent + olength - 1;  // exponent of the leading digit, taking olength as the digit count of output
+  bool scientificNotation = (exp < -3) || (exp >= 7);  // i.e. magnitude below 1E-3 or at least 1E7
+
+  if (scientificNotation) {
+    // Print in the format x.xxxxxE-yy.
+    for (int i = 0; i < olength - 1; i++) {  // digits are filled right-to-left, leaving one slot for the '.'
+      int c = output % 10; output /= 10;
+      result[index + olength - i] = (char) ('0' + c);
+    }
+    result[index] = (char) ('0' + output % 10);
+    result[index + 1] = '.';
+    index += olength + 1;
+    if (olength == 1) {  // single digit: emit "d.0" rather than a bare "d."
+      result[index++] = '0';
+    }
+
+    // Print 'E', the exponent sign, and the exponent, which has at most two digits.
+    result[index++] = 'E';
+    if (exp < 0) {
+      result[index++] = '-';
+      exp = -exp;
+    }
+    if (exp >= 10) {
+      result[index++] = (char) ('0' + exp / 10);
+    }
+    result[index++] = (char) ('0' + exp % 10);
+  } else {
+    // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
+    if (exp < 0) {
+      // Decimal dot is before any of the digits.
+      result[index++] = '0';
+      result[index++] = '.';
+      for (int i = -1; i > exp; i--) {  // -exp - 1 zeros between the dot and the first digit
+        result[index++] = '0';
+      }
+      int current = index;
+      for (int i = 0; i < olength; i++) {
+        result[current + olength - i - 1] = (char) ('0' + output % 10);
+        output /= 10;
+        index++;
+      }
+    } else if (exp + 1 >= olength) {
+      // Decimal dot is after any of the digits.
+      for (int i = 0; i < olength; i++) {
+        result[index + olength - i - 1] = (char) ('0' + output % 10);
+        output /= 10;
+      }
+      index += olength;
+      for (int i = olength; i < exp + 1; i++) {  // pad the integer part with zeros up to exp + 1 digits
+        result[index++] = '0';
+      }
+      result[index++] = '.';
+      result[index++] = '0';
+    } else {
+      // Decimal dot is somewhere between the digits.
+      int current = index + 1;  // fractional digits sit one slot to the right to make room for the '.'
+      for (int i = 0; i < olength; i++) {
+        if (olength - i - 1 == exp) {
+          result[current + olength - i - 1] = '.';
+          current--;  // integer digits are written without the one-slot shift
+        }
+        result[current + olength - i - 1] = (char) ('0' + output % 10);
+        output /= 10;
+      }
+      index += olength + 1;
+    }
+  }
+  return index;
+}
+
+__device__ inline int f2s_size(floating_decimal_32 const v, bool const sign) {
+  // Count the chars that to_chars(floating_decimal_32, ...) writes, without writing them.
+  int index = 0;
+  if (sign) {
+    index++;
+  }
+
+  uint32_t output = v.mantissa;
+  uint32_t const olength = decimalLength9(output);
+  int32_t exp = v.exponent + olength - 1;
+  bool scientificNotation = (exp < -3) || (exp >= 7);  // same notation switch as in to_chars
+
+  if (scientificNotation) {
+    index += olength + 1;  // all digits plus the '.'
+    if (olength == 1) {  // the extra '0' of "d.0"
+      index++;
+    }
+    // 'E'
+    index++;
+    if (exp < 0) {
+      index++;
+      exp = -exp;
+    }
+    if (exp >= 10) {
+      index++;
+    }
+    index++;  // the last exponent digit is always present
+  } else {
+    // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
+    if (exp < 0) {
+      // Decimal dot is before any of the digits.
+      index += 1 - exp + olength;  // "0." + (-exp - 1) zeros + olength digits
+    } else if (exp + 1 >= olength) {
+      // Decimal dot is after any of the digits.
+      index += exp + 3;  // exp + 1 integer digits + ".0"
+    } else {
+      // Decimal dot is somewhere between the digits.
+      index += olength + 1;
+    }
+  }
+  return index;
+}
+
+__device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, uint32_t const ieeeExponent,  // returns true and fills *v only for integers in [1, 2^53)
+  floating_decimal_64* const v) {
+  uint64_t const m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
+  int32_t const e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS;
+
+  if (e2 > 0) {
+    // f = m2 * 2^e2 >= 2^53 is an integer.
+    // Ignore this case for now.
+    return false;
+  }
+
+  if (e2 < -52) {
+    // f < 1.
+    return false;
+  }
+
+  // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53.
+  // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0.
+  uint64_t const mask = (1ull << -e2) - 1;
+  uint64_t const fraction = m2 & mask;
+  if (fraction != 0) {
+    return false;
+  }
+
+  // f is an integer in the range [1, 2^53).
+  // Note: mantissa might contain trailing (decimal) 0's.
+  // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17().
+  v->mantissa = m2 >> -e2;
+  v->exponent = 0;
+  return true;
+}
+
+__device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) {  // outputs: ieeeSign = sign bit of f, special = early-exit case taken
+  // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
+  uint64_t const bits = double_to_bits(f);
+
+  // Decode bits into sign, mantissa, and exponent.
+  ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0;
+  uint64_t const ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1);
+  uint32_t const ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1));
+  // Case distinction; exit early when the exponent field is all ones, or exponent and mantissa are both zero.
+  if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) {
+    special = true;
+    return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent};  // raw IEEE fields, not a decimal value
+  }
+  special = false;
+  floating_decimal_64 v;
+  bool const isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v);
+  if (isSmallInt) {
+    // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros.
+    // For scientific notation we need to move these zeros into the exponent.
+    // (This is not needed for the fixed-point branches of to_chars, so it might be beneficial to
+    // trim trailing zeros in to_chars only when scientific notation is chosen.)
+    for (;;) {
+      uint64_t const q = div10(v.mantissa);
+      uint32_t const r = ((uint32_t) v.mantissa) - 10 * ((uint32_t) q);  // v.mantissa % 10 computed in 32-bit arithmetic
+      if (r != 0) {
+        break;
+      }
+      v.mantissa = q;
+      ++v.exponent;
+    }
+  } else {
+    v = d2d(ieeeMantissa, ieeeExponent);
+  }
+  return v;
+}
+
+__device__ int d2s_buffered_n(double f, char* result) {  // writes the text of f into result and returns the char count
+  bool sign = false, special = false;
+  floating_decimal_64 v = d2d(f, sign, special);
+  if (special) {
+    return copy_special_str(result, sign, v.exponent, v.mantissa);  // v holds the raw IEEE exponent/mantissa fields here
+  }
+  return to_chars(v, sign, result);
+}
+
+__device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) {  // outputs: ieeeSign = sign bit of f, special = early-exit case taken
+  // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
+  uint32_t const bits = float_to_bits(f);
+
+  // Decode bits into sign, mantissa, and exponent.
+  ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0;
+  uint32_t const ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1);
+  uint32_t const ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1);
+
+  // Case distinction; exit early when the exponent field is all ones, or exponent and mantissa are both zero.
+  if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) {
+    special = true;
+    return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent};  // raw IEEE fields, not a decimal value
+  }
+  special = false;
+  return f2d(ieeeMantissa, ieeeExponent);
+}
+
+__device__ int f2s_buffered_n(float f, char* result) {  // writes the text of f into result and returns the char count
+  bool sign = false, special = false;
+  floating_decimal_32 v = f2d(f, sign, special);
+  if (special) {
+    return copy_special_str(result, sign, v.exponent, v.mantissa);  // v holds the raw IEEE exponent/mantissa fields here
+  }
+  return to_chars(v, sign, result);
+}
+
+
+//===== compute float to string size =====
+
+__device__ int compute_d2s_size(double value) {  // sizing counterpart of d2s_buffered_n: same decode, but only counts chars
+  bool sign = false, special = false;
+  floating_decimal_64 v = d2d(value, sign, special);
+  if (special) {
+    return special_str_size(sign, v.exponent, v.mantissa);  // v holds the raw IEEE exponent/mantissa fields here
+  }
+  return d2s_size(v, sign);
+}
+
+__device__ int compute_f2s_size(float value) {  // sizing counterpart of f2s_buffered_n: same decode, but only counts chars
+  bool sign = false, special = false;
+  floating_decimal_32 v = f2d(value, sign, special);
+  if (special) {
+    return special_str_size(sign, v.exponent, v.mantissa);  // v holds the raw IEEE exponent/mantissa fields here
+  }
+  return f2s_size(v, sign);
+}
+
+} // namespace 
+
+//===== APIs =====
+
+__device__ int compute_ftos_size(double value, bool is_float) {  // returns the char count for value; is_float selects the float path over the double path
+  if (is_float) {
+      return compute_f2s_size(value);  // value is implicitly narrowed from double to float here
+  } else {
+      return compute_d2s_size(value);
+  }
+}
+
+__device__ int float_to_string(double value, bool is_float, char* output) {  // writes the text of value into output and returns the char count
+    if (is_float) {
+        return f2s_buffered_n(value, output);  // value is implicitly narrowed from double to float here
+    } else {
+        return d2s_buffered_n(value, output);
+    }
+}
+
+} // namespace spark-rapids-jni::ftos_converter
diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp
index d75741b8a0..0d4f62ac1b 100644
--- a/src/main/cpp/tests/cast_float_to_string.cpp
+++ b/src/main/cpp/tests/cast_float_to_string.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <cast_string.hpp>
+#include "cast_string.hpp"
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
@@ -29,7 +29,7 @@
 
 using namespace cudf;
 
-constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS};
+constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR};
 
 struct FloatToStringTests : public cudf::test::BaseFixture {};
 

From 131e48c3f49d84e8c5ad18d59e5f9ad56b04d86e Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Thu, 23 Nov 2023 16:26:12 +0800
Subject: [PATCH 34/54] cudf conflict

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 168533a8ad..823d3214a9 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 168533a8ad4086bd020be4f7bf9264a08b6d2243
+Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3

From 3c09c49a175df5b89620126b47b85f23014d12a0 Mon Sep 17 00:00:00 2001
From: Haoyang Li <ntlihy@gmail.com>
Date: Thu, 23 Nov 2023 16:28:20 +0800
Subject: [PATCH 35/54] Update src/main/cpp/src/cast_float_to_string.cu

Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
---
 src/main/cpp/src/cast_float_to_string.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index e22947ab9e..c41936c3bb 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -93,7 +93,7 @@ struct dispatch_float_to_string_fn {
                                std::move(offsets),
                                std::move(chars),
                                floats.null_count(),
-                               std::move(cudf::detail::copy_bitmask(floats, stream, mr)));
+                               cudf::detail::copy_bitmask(floats, stream, mr));
   }
 
   // non-float types throw an exception

From 346c1f7e599f9f2c7c6041908526d2fde095bda2 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Thu, 23 Nov 2023 17:17:14 +0800
Subject: [PATCH 36/54] Make it runnable again

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/format_float.cu    | 140 ++++++++++
 src/main/cpp/src/ftos_converter.cuh | 382 +++++++++++++++++++++++++++-
 2 files changed, 517 insertions(+), 5 deletions(-)

diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu
index e69de29bb2..24fd39367b 100644
--- a/src/main/cpp/src/format_float.cu
+++ b/src/main/cpp/src/format_float.cu
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cast_string.hpp"
+#include "ftos_converter.cuh"
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/strings/detail/convert/int_to_string.cuh>
+#include <cudf/strings/detail/converters.hpp>
+#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+namespace spark_rapids_jni {
+
+namespace detail {
+namespace {
+
+template <typename FloatType>
+struct format_float_fn {  // per-row functor: stores sizes while d_chars is nullptr, writes text otherwise
+  cudf::column_device_view d_floats;  // input column, read as FloatType
+  int digits;  // forwarded unchanged to ftos_converter; presumably the number of fractional digits - TODO confirm
+  cudf::size_type* d_offsets;  // receives per-row sizes in the sizing pass; read as start offsets in the write pass
+  char* d_chars;  // output character buffer; nullptr selects the sizing pass
+
+  __device__ cudf::size_type compute_output_size(FloatType value, int digits)  // NOTE(review): this parameter shadows the digits member
+  {
+    bool is_float = std::is_same_v<FloatType, float>;
+    return static_cast<cudf::size_type>(ftos_converter::compute_format_float_size(static_cast<double>(value), digits, is_float));
+  }
+
+  __device__ void format_float(cudf::size_type idx, int digits)  // writes row idx starting at d_chars + d_offsets[idx]
+  {
+    FloatType value = d_floats.element<FloatType>(idx);
+    bool is_float = std::is_same_v<FloatType, float>;
+    ftos_converter::format_float(static_cast<double>(value), digits, d_chars + d_offsets[idx], is_float);
+  }
+
+  __device__ void operator()(cudf::size_type idx)
+  {
+    if (d_floats.is_null(idx)) {
+      if (d_chars == nullptr) { d_offsets[idx] = 0; }  // null rows contribute zero chars
+      return;
+    }
+    if (d_chars != nullptr) {
+      format_float(idx, digits);
+    } else {
+      d_offsets[idx] = compute_output_size(d_floats.element<FloatType>(idx), digits);
+    }
+  }
+};
+
+/**
+ * @brief Type-dispatch functor that formats a floating-point column as a strings column.
+ *
+ * SFINAE on std::is_floating_point_v selects the real implementation; any other type hits CUDF_FAIL.
+ */
+struct dispatch_format_float_fn {
+  template <typename FloatType, std::enable_if_t<std::is_floating_point_v<FloatType>>* = nullptr>
+  std::unique_ptr<cudf::column> operator()(cudf::column_view const& floats,
+                                     int digits,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr) const
+  {
+    cudf::size_type strings_count = floats.size();
+    auto column             = cudf::column_device_view::create(floats, stream);
+    auto d_column           = *column;
+
+    // copy the null mask
+    rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr);
+
+    auto [offsets, chars] =
+      cudf::strings::detail::make_strings_children(format_float_fn<FloatType>{d_column, digits}, strings_count, stream, mr);  // d_offsets/d_chars are left value-initialized here
+
+    return cudf::make_strings_column(strings_count,
+                               std::move(offsets),
+                               std::move(chars),
+                               floats.null_count(),
+                               std::move(null_mask));
+  }
+
+  // non-float types throw an exception
+  template <typename T, std::enable_if_t<not std::is_floating_point_v<T>>* = nullptr>
+  std::unique_ptr<cudf::column> operator()(cudf::column_view const&,
+                                     int,
+                                     rmm::cuda_stream_view,
+                                     rmm::mr::device_memory_resource*) const
+  {
+    CUDF_FAIL("Values for format_float function must be a float type.");
+  }
+};
+
+}  // namespace
+
+// Converts a float column into a strings column by dispatching on floats.type(); non-float types fail.
+std::unique_ptr<cudf::column> format_float(cudf::column_view const& floats,
+                                    int digits,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr)
+{
+  cudf::size_type strings_count = floats.size();
+  if (strings_count == 0) return cudf::make_empty_column(cudf::type_id::STRING);  // empty input: skip the dispatch entirely
+
+  return type_dispatcher(floats.type(), dispatch_format_float_fn{}, floats, digits, stream, mr);
+}
+
+}  // namespace detail
+
+// external API: opens a CUDF_FUNC_RANGE and forwards all arguments to detail::format_float
+std::unique_ptr<cudf::column> format_float(cudf::column_view const& floats, 
+                                      int digits,
+                                      rmm::cuda_stream_view stream, 
+                                      rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::format_float(floats, digits, stream, mr);
+}
+
+}  // namespace spark_rapids_jni
\ No newline at end of file
diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh
index b9905ae567..50c64996be 100644
--- a/src/main/cpp/src/ftos_converter.cuh
+++ b/src/main/cpp/src/ftos_converter.cuh
@@ -211,7 +211,7 @@ __device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, i
 
 }
 
-__device__ inline int copy_special_str(char * const result, bool const sign, bool const exponent, bool const mantissa) {
+__device__ inline int copy_special_str(char * const result, bool const sign, bool const exponent, bool const mantissa, int const d = 1) {
   if (mantissa) {
     memcpy(result, "NaN", 3);
     return 3;
@@ -223,18 +223,29 @@ __device__ inline int copy_special_str(char * const result, bool const sign, boo
     memcpy(result + sign, "Infinity", 8);
     return sign + 8;
   }
-  memcpy(result + sign, "0.0", 3);
-  return sign + 3;
+  result[sign] = '0';
+  if (d == 0) {
+    return sign + 1;
+  } else {
+    result[sign + 1] = '.';
+  }
+  for (int i = 0; i < d; i++) {
+    result[sign + 2 + i] = '0';
+  }
+  return sign + 2 + d;
 }
 
-__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa) {
+__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa, int const d=1) {
   if (mantissa) {
     return 3;
   }
   if (exponent) {
     return sign + 8;
   }
-  return sign + 3;
+  if (d == 0) {
+    return sign + 1;
+  }
+  return sign + 2 + d;
 }
 
 __device__ inline uint32_t float_to_bits(float const f) {
@@ -1153,4 +1164,365 @@ __device__ int float_to_string(double value, bool is_float, char* output) {
     }
 }
 
+//===== format float =====
+
+__constant__
+uint64_t const POW10_TABLE[19] = {  // POW10_TABLE[i] == 10^i for every i in [0, 18]; used as divisor/threshold when rounding
+1ull, 10ull, 100ull, 1000ull, 10000ull, 100000ull, 1000000ull, 10000000ull,
+100000000ull, 1000000000ull, 10000000000ull, 100000000000ull, 1000000000000ull,
+10000000000000ull, 100000000000000ull, 1000000000000000ull, 10000000000000000ull,
+100000000000000000ull, 1000000000000000000ull
+};
+
+template<typename T>
+__device__ inline T round_half_even(T const input, int const olength, int const d) {
+  // "round" a integer to d digits, with the half-even rounding mode.    
+  if (d > olength) {
+    T num = input;
+    for (int i = 0; i < d - olength; i++) {
+      num *= 10;
+    }
+    return num;
+  }
+  T div = POW10_TABLE[olength - d];
+  T mod = input % div;
+  T num = input / div;
+  if (mod > (div / 2) || ((mod == (div / 2) && (num % 2 == 1) && mod != 0))) {
+    num++;  // round up: remainder is above half, or exactly half while the kept digits are odd (ties go to even)
+  }
+  return num;
+}
+
+__device__ inline int to_formated_chars(floating_decimal_64 const v, bool const sign, char* const result, int d) {
+  int index = 0;
+  if (sign) {
+    result[index++] = '-';
+  }
+  uint64_t output = v.mantissa;  // significant decimal digits of the value, held as one integer
+  const uint32_t olength = decimalLength17(output);  // number of digits in `output`
+  int32_t exp = v.exponent + (int32_t) olength - 1;  // power of ten of the leading digit
+  if (exp < 0) {  // |value| < 1: "0." + leading zeros + rounded digits + zero padding
+    // Decimal dot is before any of the digits.
+    int index_for_carrier = index;
+    result[index++] = '0';
+    if (d == 0) {
+      return index;
+    }
+    result[index++] = '.';
+    int actural_round = d;
+    for (int i = -1; i > exp; i--) {
+      index_for_carrier = index;
+      result[index++] = '0';
+      actural_round--;  // this leading zero used up one of the requested places
+      if (actural_round == 0) {
+        if (i != exp + 1) {
+          return index;  // first significant digit is too far right to change the output
+        } // else the digits start right after the last printed place and may round it up
+        break;
+      }
+    }
+    int actural_olength = fmin(int(olength), actural_round);  // digits of `output` that fit in the places left
+    uint64_t rounded_output = round_half_even(output, olength, actural_olength);
+    // Round to the digits that fit (never scale up), so only a real round-up reaches 10^actural_olength.
+    if (rounded_output >= POW10_TABLE[actural_olength]) {
+      result[index_for_carrier] = '1';  // carry goes into the last '0' written before the digits
+      rounded_output -= POW10_TABLE[actural_olength];
+    }
+    int current = index;
+    for (int i = 0; i < actural_olength; i++) {  // emit the kept digits, least significant first
+      result[current + actural_olength - i - 1] = (char) ('0' + rounded_output % 10);
+      rounded_output /= 10;
+      index++;
+    }
+    actural_round -= actural_olength;  // places still owed are filled with zeros
+    if (actural_round > 0) {
+      for (int i = 0; i < actural_round; i++) {
+        result[index++] = '0';
+      }
+    }
+  } else if (exp + 1 >= olength) {
+    // Decimal dot is after all of the digits: an integer, zero-padded down to the units place.
+    int integer_len = index + exp + 1 + exp / 3;  // sign + (exp + 1) digits + one ',' per full group of three
+    int sep_cnt = 0;
+    int rev_index = 0;
+    for (int i = olength; i < exp + 1; i++) {  // zeros below the last significant digit, written right to left
+      result[integer_len - (rev_index++) - 1] = '0';
+      sep_cnt++;
+      if (sep_cnt == 3) {
+          result[integer_len - (rev_index++) - 1] = ',';
+          sep_cnt = 0;
+      }
+    }
+    for (int i = 0; i < olength; i++) {  // then the significant digits, still right to left
+      if (sep_cnt == 3) {
+        result[integer_len - (rev_index++) - 1] = ',';
+        sep_cnt = 0;
+      }
+      result[integer_len - (rev_index++) - 1] = (char) ('0' + output % 10);
+      sep_cnt++;
+      output /= 10;
+    }
+    index = integer_len;
+    if (d == 0) {
+      return index;
+    }
+    result[index++] = '.';
+    for (int i = 0; i < d; i++) {
+      result[index++] = '0';
+    }
+  } else {
+    uint32_t temp_d = d, tailing_zero = 0;
+    if (exp + d > olength) {
+      temp_d = olength - exp;
+      tailing_zero = d - temp_d;
+    }
+    uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1);
+    uint64_t pow10 = POW10_TABLE[temp_d];
+    uint64_t integer = rounded_output / pow10;  // digits left of the decimal dot
+    uint64_t decimal = rounded_output % pow10;  // digits right of the decimal dot
+    // Measure the integer part after rounding: a carry (e.g. 999.x -> 1000) adds a digit and maybe a ','.
+    uint32_t integer_len = decimalLength17(integer);
+    uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3;  // sign + digits + separators
+    uint32_t sep_cnt = 0;
+    int rev_index = 0;
+    for (int i = 0; i < integer_len; i++) {  // integer digits right to left, ',' before every fourth
+      if (sep_cnt == 3) {
+        result[formated_integer_len - (rev_index++) - 1] = ',';
+        sep_cnt = 0;
+      }
+      result[formated_integer_len - (rev_index++) - 1] = (char) ('0' + integer % 10);
+      sep_cnt++;
+      integer /= 10;
+    }
+    index = formated_integer_len;
+    if (d == 0) {
+      return index;
+    }
+    result[index++] = '.';
+    int current = index;
+    for (int i = 0; i < tailing_zero; i++) {
+      result[current + d - i - 1] = '0';
+      index++;
+    }
+    for (int i = tailing_zero; i < d; i++) {
+      result[current + d - i - 1] = (char) ('0' + decimal % 10);
+      decimal /= 10;
+      index++;
+    }
+  }
+  return index;
+}
+
+__device__ inline int format_float_size(floating_decimal_64 const v, bool const sign, int d) {
+  int index = 0;
+  if (sign) {
+    index++;
+  }
+  uint64_t output = v.mantissa;  // same three layout cases as to_formated_chars(floating_decimal_64), but only counting characters
+  const uint32_t olength = decimalLength17(output);
+  int32_t exp = v.exponent + (int32_t) olength - 1;
+  if (exp < 0) {
+    index += 2 + d;
+  } else if (exp + 1 >= olength) {
+    index += exp + 1 + exp / 3 + 1 + d;
+  } else {
+    uint32_t temp_d = d;
+    if (exp + d > olength) {
+      temp_d = olength - exp;
+    }
+    uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1);
+    uint64_t pow10 = POW10_TABLE[temp_d];
+    uint64_t integer = rounded_output / pow10;
+    uint32_t integer_len = decimalLength17(integer);
+    index += integer_len + (integer_len - 1) / 3 + 1 + d;
+  }
+  if (d == 0) {
+    index--;
+  }
+  return index;
+}
+
+__device__ inline int to_formated_chars(floating_decimal_32 const v, bool const sign, char* const result, int d) {
+  int index = 0;
+  if (sign) {
+    result[index++] = '-';
+  }
+  uint32_t output = v.mantissa;  // significant decimal digits of the value, held as one integer
+  uint32_t const olength = decimalLength9(output);  // number of digits in `output`
+  int32_t exp = v.exponent + (int32_t) olength - 1;  // power of ten of the leading digit
+  if (exp < 0) {  // |value| < 1: "0." + leading zeros + rounded digits + zero padding
+    // Decimal dot is before any of the digits.
+    int index_for_carrier = index;
+    result[index++] = '0';
+    if (d == 0) {
+      return index;
+    }
+    result[index++] = '.';
+    int actural_round = d;
+    for (int i = -1; i > exp; i--) {
+      index_for_carrier = index;
+      result[index++] = '0';
+      actural_round--;  // this leading zero used up one of the requested places
+      if (actural_round == 0) {
+        if (i != exp + 1) {
+          return index;  // first significant digit is too far right to change the output
+        } // else the digits start right after the last printed place and may round it up
+        break;
+      }
+    }
+    int actural_olength = fmin(int(olength), actural_round);  // digits of `output` that fit in the places left
+    uint64_t rounded_output = round_half_even(output, olength, actural_olength);
+    // Round to the digits that fit; scaling `output` up here would fake a carry and can overflow uint32_t.
+    if (rounded_output >= POW10_TABLE[actural_olength]) {
+      result[index_for_carrier] = '1';  // carry goes into the last '0' written before the digits
+      rounded_output -= POW10_TABLE[actural_olength];
+    }
+    int current = index;
+    for (int i = 0; i < actural_olength; i++) {  // emit the kept digits, least significant first
+      result[current + actural_olength - i - 1] = (char) ('0' + rounded_output % 10);
+      rounded_output /= 10;
+      index++;
+    }
+    actural_round -= actural_olength;  // places still owed are filled with zeros
+    if (actural_round > 0) {
+      for (int i = 0; i < actural_round; i++) {
+        result[index++] = '0';
+      }
+    }
+  } else if (exp + 1 >= olength) {
+    // Decimal dot is after all of the digits: an integer, zero-padded down to the units place.
+    int integer_len = index + exp + 1 + exp / 3;  // sign + (exp + 1) digits + one ',' per full group of three
+    int sep_cnt = 0;
+    int rev_index = 0;
+    for (int i = olength; i < exp + 1; i++) {  // zeros below the last significant digit, written right to left
+      result[integer_len - (rev_index++) - 1] = '0';
+      sep_cnt++;
+      if (sep_cnt == 3) {
+          result[integer_len - (rev_index++) - 1] = ',';
+          sep_cnt = 0;
+      }
+    }
+    for (int i = 0; i < olength; i++) {  // then the significant digits, still right to left
+      if (sep_cnt == 3) {
+        result[integer_len - (rev_index++) - 1] = ',';
+        sep_cnt = 0;
+      }
+      result[integer_len - (rev_index++) - 1] = (char) ('0' + output % 10);
+      sep_cnt++;
+      output /= 10;
+    }
+    index = integer_len;
+    if (d == 0) {
+      return index;
+    }
+    result[index++] = '.';
+    for (int i = 0; i < d; i++) {
+      result[index++] = '0';
+    }
+  } else {
+    uint32_t temp_d = d, tailing_zero = 0;
+    if (exp + d > olength) {
+      temp_d = olength - exp;
+      tailing_zero = d - temp_d;
+    }
+    uint32_t rounded_output = round_half_even(output, olength, exp+temp_d+1);
+    uint32_t pow10 = POW10_TABLE[temp_d];
+    uint32_t integer = rounded_output / pow10;  // digits left of the decimal dot
+    uint32_t decimal = rounded_output % pow10;  // digits right of the decimal dot
+    // Measure the integer part after rounding: a carry (e.g. 999.x -> 1000) adds a digit and maybe a ','.
+    uint32_t integer_len = decimalLength9(integer);
+    uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3;  // sign + digits + separators
+    uint32_t sep_cnt = 0;
+    int rev_index = 0;
+    for (int i = 0; i < integer_len; i++) {  // integer digits right to left, ',' before every fourth
+      if (sep_cnt == 3) {
+        result[formated_integer_len - (rev_index++) - 1] = ',';
+        sep_cnt = 0;
+      }
+      result[formated_integer_len - (rev_index++) - 1] = (char) ('0' + integer % 10);
+      sep_cnt++;
+      integer /= 10;
+    }
+    index = formated_integer_len;
+    if (d == 0) {
+      return index;
+    }
+    result[index++] = '.';
+    int current = index;
+    for (int i = 0; i < tailing_zero; i++) {
+      result[current + d - i - 1] = '0';
+      index++;
+    }
+    for (int i = tailing_zero; i < d; i++) {
+      result[current + d - i - 1] = (char) ('0' + decimal % 10);
+      decimal /= 10;
+      index++;
+    }
+  }
+  return index;
+}
+
+__device__ inline int format_float_size(floating_decimal_32 const v, bool const sign, int d) {
+  int index = 0;
+  if (sign) {
+    index++;
+  }
+  uint64_t output = v.mantissa;  // widened to 64 bits, so round_half_even below is instantiated with uint64_t
+  uint32_t const olength = decimalLength9(output);
+  int32_t exp = v.exponent + (int32_t) olength - 1;
+  if (exp < 0) {
+    index += 2 + d;
+  } else if (exp + 1 >= olength) {
+    index += exp + 1 + exp / 3 + 1 + d;
+  } else {
+    uint32_t temp_d = d;
+    if (exp + d > olength) {
+      temp_d = olength - exp;
+    }
+    uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1);
+    uint64_t pow10 = POW10_TABLE[temp_d];
+    uint64_t integer = rounded_output / pow10;
+    uint32_t integer_len = decimalLength9(integer);
+    index += integer_len + (integer_len - 1) / 3 + 1 + d;
+  }
+  if (d == 0) {
+    index--;
+  }
+  return index;
+}  
+
+__device__ int compute_format_float_size(double value, int d, bool is_float) {
+  bool sign = false, special = false;
+  if (is_float) {
+    floating_decimal_32 v = f2d(value, sign, special);
+    if (special) {
+      return special_str_size(sign, v.exponent, v.mantissa, d);
+    }
+    return format_float_size(v, sign, d);
+  } else {
+    floating_decimal_64 v = d2d(value, sign, special);
+    if (special) {
+      return special_str_size(sign, v.exponent, v.mantissa, d);
+    }
+    return format_float_size(v, sign, d);
+  }
+}
+
+__device__ int format_float(double value, int d, char* output, bool is_float) {
+  bool sign = false, special = false;
+  if (is_float) {
+    floating_decimal_32 v = f2d(value, sign, special);
+    if (special) {
+      return copy_special_str(output, sign, v.exponent, v.mantissa, d);
+    }
+    return to_formated_chars(v, sign, output, d);
+  } else {
+    floating_decimal_64 v = d2d(value, sign, special);
+    if (special) {
+      return copy_special_str(output, sign, v.exponent, v.mantissa, d);
+    }
+    return to_formated_chars(v, sign, output, d);
+  }
+}
+
 } // namespace spark-rapids-jni::ftos_converter

From 98918ce1a8a8bb45ebd4be0e3634144bb8200368 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Thu, 23 Nov 2023 17:56:27 +0800
Subject: [PATCH 37/54] address some comments

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/CastStringJni.cpp            |   4 +-
 src/main/cpp/src/cast_string.hpp              |   2 +-
 src/main/cpp/src/format_float.cu              |  36 +++---
 src/main/cpp/src/ftos_converter.cuh           | 120 +++++++++---------
 src/main/cpp/tests/cast_decimal_to_string.cpp |   3 +-
 src/main/cpp/tests/cast_string.cpp            |   3 +-
 src/main/cpp/tests/format_float.cpp           |   3 +-
 .../nvidia/spark/rapids/jni/CastStrings.java  |   7 +-
 8 files changed, 91 insertions(+), 87 deletions(-)

diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index 17bfeaf50b..063fabe222 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -110,7 +110,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toFloat(
 }
 
 JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_formatFloat(
-  JNIEnv* env, jclass, jlong input_column, jint d, jint j_dtype)
+  JNIEnv* env, jclass, jlong input_column, jint digits, jint j_dtype)
 {
   JNI_NULL_CHECK(env, input_column, "input column is null", 0);
 
@@ -119,7 +119,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_formatFloat
 
     cudf::column_view cv{*reinterpret_cast<cudf::column_view const*>(input_column)};
     return cudf::jni::release_as_jlong(
-      spark_rapids_jni::format_float(cv, d, cudf::get_default_stream()));
+      spark_rapids_jni::format_float(cv, digits, cudf::get_default_stream()));
   }
   CATCH_CAST_EXCEPTION(env, 0);
 }
diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp
index e194919ffb..5c370f9185 100644
--- a/src/main/cpp/src/cast_string.hpp
+++ b/src/main/cpp/src/cast_string.hpp
@@ -117,7 +117,7 @@ std::unique_ptr<cudf::column> string_to_float(
 
 std::unique_ptr<cudf::column> format_float(
   cudf::column_view const& input,
-  int d,
+  int digits,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu
index 24fd39367b..3d3fedbfb9 100644
--- a/src/main/cpp/src/format_float.cu
+++ b/src/main/cpp/src/format_float.cu
@@ -29,6 +29,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
@@ -40,24 +41,25 @@ namespace {
 template <typename FloatType>
 struct format_float_fn {
   cudf::column_device_view d_floats;
-  int digits;
+  int const digits;
   cudf::size_type* d_offsets;
   char* d_chars;
 
-  __device__ cudf::size_type compute_output_size(FloatType value, int digits)
+  __device__ cudf::size_type compute_output_size(FloatType value, int digits) const
   {
-    bool is_float = std::is_same_v<FloatType, float>;
+    bool constexpr is_float = std::is_same_v<FloatType, float>;
     return static_cast<cudf::size_type>(ftos_converter::compute_format_float_size(static_cast<double>(value), digits, is_float));
   }
 
-  __device__ void format_float(cudf::size_type idx, int digits)
+  __device__ void format_float(cudf::size_type idx, int digits) const
   {
-    FloatType value = d_floats.element<FloatType>(idx);
-    bool is_float = std::is_same_v<FloatType, float>;
-    ftos_converter::format_float(static_cast<double>(value), digits, d_chars + d_offsets[idx], is_float);
+    auto const value = d_floats.element<FloatType>(idx);
+    bool constexpr is_float = std::is_same_v<FloatType, float>;
+    auto const output = d_chars + d_offsets[idx];
+    ftos_converter::format_float(static_cast<double>(value), digits, is_float, output);
   }
 
-  __device__ void operator()(cudf::size_type idx)
+  __device__ void operator()(cudf::size_type idx) const
   {
     if (d_floats.is_null(idx)) {
       if (d_chars == nullptr) { d_offsets[idx] = 0; }
@@ -83,21 +85,17 @@ struct dispatch_format_float_fn {
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr) const
   {
-    cudf::size_type strings_count = floats.size();
-    auto column             = cudf::column_device_view::create(floats, stream);
-    auto d_column           = *column;
-
-    // copy the null mask
-    rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr);
+    auto const strings_count = floats.size();
+    auto const input_ptr = cudf::column_device_view::create(floats, stream);
 
     auto [offsets, chars] =
-      cudf::strings::detail::make_strings_children(format_float_fn<FloatType>{d_column, digits}, strings_count, stream, mr);
+      cudf::strings::detail::make_strings_children(format_float_fn<FloatType>{*input_ptr, digits}, strings_count, stream, mr);
 
     return cudf::make_strings_column(strings_count,
                                std::move(offsets),
                                std::move(chars),
                                floats.null_count(),
-                               std::move(null_mask));
+                               cudf::detail::copy_bitmask(floats, stream, mr));
   }
 
   // non-float types throw an exception
@@ -119,8 +117,10 @@ std::unique_ptr<cudf::column> format_float(cudf::column_view const& floats,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
-  cudf::size_type strings_count = floats.size();
-  if (strings_count == 0) return cudf::make_empty_column(cudf::type_id::STRING);
+  auto const strings_count = floats.size();
+  if (strings_count == 0) {
+    return cudf::make_empty_column(cudf::type_id::STRING);
+  }
 
   return type_dispatcher(floats.type(), dispatch_format_float_fn{}, floats, digits, stream, mr);
 }
diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh
index 50c64996be..a3056d7e46 100644
--- a/src/main/cpp/src/ftos_converter.cuh
+++ b/src/main/cpp/src/ftos_converter.cuh
@@ -211,7 +211,7 @@ __device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, i
 
 }
 
-__device__ inline int copy_special_str(char * const result, bool const sign, bool const exponent, bool const mantissa, int const d = 1) {
+__device__ inline int copy_special_str(char * const result, bool const sign, bool const exponent, bool const mantissa, int const digits = 1) {
   if (mantissa) {
     memcpy(result, "NaN", 3);
     return 3;
@@ -224,28 +224,28 @@ __device__ inline int copy_special_str(char * const result, bool const sign, boo
     return sign + 8;
   }
   result[sign] = '0';
-  if (d == 0) {
+  if (digits == 0) {
     return sign + 1;
   } else {
     result[sign + 1] = '.';
   }
-  for (int i = 0; i < d; i++) {
+  for (int i = 0; i < digits; i++) {
     result[sign + 2 + i] = '0';
   }
-  return sign + 2 + d;
+  return sign + 2 + digits;
 }
 
-__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa, int const d=1) {
+__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa, int const digits = 1) {
   if (mantissa) {
     return 3;
   }
   if (exponent) {
     return sign + 8;
   }
-  if (d == 0) {
+  if (digits == 0) {
     return sign + 1;
   }
-  return sign + 2 + d;
+  return sign + 2 + digits;
 }
 
 __device__ inline uint32_t float_to_bits(float const f) {
@@ -1175,16 +1175,16 @@ uint64_t const POW10_TABLE[19] = {
 };
 
 template<typename T>
-__device__ inline T round_half_even(T const input, int const olength, int const d) {
-  // "round" a integer to d digits, with the half-even rounding mode.    
-  if (d > olength) {
+__device__ inline T round_half_even(T const input, int const olength, int const digits) {
+  // "round" a integer to digits digits, with the half-even rounding mode.    
+  if (digits > olength) {
     T num = input;
-    for (int i = 0; i < d - olength; i++) {
+    for (int i = 0; i < digits - olength; i++) {
       num *= 10;
     }
     return num;
   }
-  T div = POW10_TABLE[olength - d];
+  T div = POW10_TABLE[olength - digits];
   T mod = input % div;
   T num = input / div;
   if (mod > (div / 2) || ((mod == (div / 2) && (num % 2 == 1) && mod != 0))) {
@@ -1193,7 +1193,7 @@ __device__ inline T round_half_even(T const input, int const olength, int const
   return num;
 }
 
-__device__ inline int to_formated_chars(floating_decimal_64 const v, bool const sign, char* const result, int d) {
+__device__ inline int to_formated_chars(floating_decimal_64 const v, bool const sign, char* const result, int digits) {
   int index = 0;
   if (sign) {
     result[index++] = '-';
@@ -1205,11 +1205,11 @@ __device__ inline int to_formated_chars(floating_decimal_64 const v, bool const
     // Decimal dot is before any of the digits.
     int index_for_carrier = index;
     result[index++] = '0';
-    if (d == 0) {
+    if (digits == 0) {
       return index;
     }
     result[index++] = '.';
-    int actural_round = d;
+    int actural_round = digits;
     for (int i = -1; i > exp; i--) {
       index_for_carrier = index;
       result[index++] = '0';
@@ -1263,18 +1263,18 @@ __device__ inline int to_formated_chars(floating_decimal_64 const v, bool const
       output /= 10;
     }
     index = integer_len;
-    if (d == 0) {
+    if (digits == 0) {
       return index;
     }
     result[index++] = '.';
-    for (int i = 0; i < d; i++) {
+    for (int i = 0; i < digits; i++) {
       result[index++] = '0';
     }
   } else {
-    uint32_t temp_d = d, tailing_zero = 0;
-    if (exp + d > olength) {
+    uint32_t temp_d = digits, tailing_zero = 0;
+    if (exp + digits > olength) {
       temp_d = olength - exp;
-      tailing_zero = d - temp_d;
+      tailing_zero = digits - temp_d;
     }
     uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1);
     uint64_t pow10 = POW10_TABLE[temp_d];
@@ -1295,17 +1295,17 @@ __device__ inline int to_formated_chars(floating_decimal_64 const v, bool const
       integer /= 10;
     }
     index = formated_integer_len;
-    if (d == 0) {
+    if (digits == 0) {
       return index;
     }
     result[index++] = '.';
     int current = index;
     for (int i = 0; i < tailing_zero; i++) {
-      result[current + d - i - 1] = '0';
+      result[current + digits - i - 1] = '0';
       index++;
     }
-    for (int i = tailing_zero; i < d; i++) {
-      result[current + d - i - 1] = (char) ('0' + decimal % 10);
+    for (int i = tailing_zero; i < digits; i++) {
+      result[current + digits - i - 1] = (char) ('0' + decimal % 10);
       decimal /= 10;
       index++;
     }
@@ -1313,7 +1313,7 @@ __device__ inline int to_formated_chars(floating_decimal_64 const v, bool const
   return index;
 }
 
-__device__ inline int format_float_size(floating_decimal_64 const v, bool const sign, int d) {
+__device__ inline int format_float_size(floating_decimal_64 const v, bool const sign, int digits) {
   int index = 0;
   if (sign) {
     index++;
@@ -1322,27 +1322,27 @@ __device__ inline int format_float_size(floating_decimal_64 const v, bool const
   const uint32_t olength = decimalLength17(output);
   int32_t exp = v.exponent + (int32_t) olength - 1;
   if (exp < 0) {
-    index += 2 + d;
+    index += 2 + digits;
   } else if (exp + 1 >= olength) {
-    index += exp + 1 + exp / 3 + 1 + d;
+    index += exp + 1 + exp / 3 + 1 + digits;
   } else {
-    uint32_t temp_d = d;
-    if (exp + d > olength) {
+    uint32_t temp_d = digits;
+    if (exp + digits > olength) {
       temp_d = olength - exp;
     }
     uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1);
     uint64_t pow10 = POW10_TABLE[temp_d];
     uint64_t integer = rounded_output / pow10;
     uint32_t integer_len = decimalLength17(integer);
-    index += integer_len + (integer_len - 1) / 3 + 1 + d;
+    index += integer_len + (integer_len - 1) / 3 + 1 + digits;
   }
-  if (d == 0) {
+  if (digits == 0) {
     index--;
   }
   return index;
 }
 
-__device__ inline int to_formated_chars(floating_decimal_32 const v, bool const sign, char* const result, int d) {
+__device__ inline int to_formated_chars(floating_decimal_32 const v, bool const sign, char* const result, int digits) {
   int index = 0;
   if (sign) {
     result[index++] = '-';
@@ -1354,11 +1354,11 @@ __device__ inline int to_formated_chars(floating_decimal_32 const v, bool const
     // Decimal dot is before any of the digits.
     int index_for_carrier = index;
     result[index++] = '0';
-    if (d == 0) {
+    if (digits == 0) {
       return index;
     }
     result[index++] = '.';
-    int actural_round = d;
+    int actural_round = digits;
     for (int i = -1; i > exp; i--) {
       index_for_carrier = index;
       result[index++] = '0';
@@ -1412,18 +1412,18 @@ __device__ inline int to_formated_chars(floating_decimal_32 const v, bool const
       output /= 10;
     }
     index = integer_len;
-    if (d == 0) {
+    if (digits == 0) {
       return index;
     }
     result[index++] = '.';
-    for (int i = 0; i < d; i++) {
+    for (int i = 0; i < digits; i++) {
       result[index++] = '0';
     }
   } else {
-    uint32_t temp_d = d, tailing_zero = 0;
-    if (exp + d > olength) {
+    uint32_t temp_d = digits, tailing_zero = 0;
+    if (exp + digits > olength) {
       temp_d = olength - exp;
-      tailing_zero = d - temp_d;
+      tailing_zero = digits - temp_d;
     }
     uint32_t rounded_output = round_half_even(output, olength, exp+temp_d+1);
     uint32_t pow10 = POW10_TABLE[temp_d];
@@ -1444,17 +1444,17 @@ __device__ inline int to_formated_chars(floating_decimal_32 const v, bool const
       integer /= 10;
     }
     index = formated_integer_len;
-    if (d == 0) {
+    if (digits == 0) {
       return index;
     }
     result[index++] = '.';
     int current = index;
     for (int i = 0; i < tailing_zero; i++) {
-      result[current + d - i - 1] = '0';
+      result[current + digits - i - 1] = '0';
       index++;
     }
-    for (int i = tailing_zero; i < d; i++) {
-      result[current + d - i - 1] = (char) ('0' + decimal % 10);
+    for (int i = tailing_zero; i < digits; i++) {
+      result[current + digits - i - 1] = (char) ('0' + decimal % 10);
       decimal /= 10;
       index++;
     }
@@ -1462,7 +1462,7 @@ __device__ inline int to_formated_chars(floating_decimal_32 const v, bool const
   return index;
 }
 
-__device__ inline int format_float_size(floating_decimal_32 const v, bool const sign, int d) {
+__device__ inline int format_float_size(floating_decimal_32 const v, bool const sign, int digits) {
   int index = 0;
   if (sign) {
     index++;
@@ -1471,57 +1471,57 @@ __device__ inline int format_float_size(floating_decimal_32 const v, bool const
   uint32_t const olength = decimalLength9(output);
   int32_t exp = v.exponent + (int32_t) olength - 1;
   if (exp < 0) {
-    index += 2 + d;
+    index += 2 + digits;
   } else if (exp + 1 >= olength) {
-    index += exp + 1 + exp / 3 + 1 + d;
+    index += exp + 1 + exp / 3 + 1 + digits;
   } else {
-    uint32_t temp_d = d;
-    if (exp + d > olength) {
+    uint32_t temp_d = digits;
+    if (exp + digits > olength) {
       temp_d = olength - exp;
     }
     uint64_t rounded_output = round_half_even(output, olength, exp+temp_d+1);
     uint64_t pow10 = POW10_TABLE[temp_d];
     uint64_t integer = rounded_output / pow10;
     uint32_t integer_len = decimalLength9(integer);
-    index += integer_len + (integer_len - 1) / 3 + 1 + d;
+    index += integer_len + (integer_len - 1) / 3 + 1 + digits;
   }
-  if (d == 0) {
+  if (digits == 0) {
     index--;
   }
   return index;
 }  
 
-__device__ int compute_format_float_size(double value, int d, bool is_float) {
+__device__ int compute_format_float_size(double value, int digits, bool is_float) {
   bool sign = false, special = false;
   if (is_float) {
     floating_decimal_32 v = f2d(value, sign, special);
     if (special) {
-      return special_str_size(sign, v.exponent, v.mantissa, d);
+      return special_str_size(sign, v.exponent, v.mantissa, digits);
     }
-    return format_float_size(v, sign, d);
+    return format_float_size(v, sign, digits);
   } else {
     floating_decimal_64 v = d2d(value, sign, special);
     if (special) {
-      return special_str_size(sign, v.exponent, v.mantissa, d);
+      return special_str_size(sign, v.exponent, v.mantissa, digits);
     }
-    return format_float_size(v, sign, d);
+    return format_float_size(v, sign, digits);
   }
 }
 
-__device__ int format_float(double value, int d, char* output, bool is_float) {
+__device__ int format_float(double value, int digits, bool is_float, char* output) {
   bool sign = false, special = false;
   if (is_float) {
     floating_decimal_32 v = f2d(value, sign, special);
     if (special) {
-      return copy_special_str(output, sign, v.exponent, v.mantissa, d);
+      return copy_special_str(output, sign, v.exponent, v.mantissa, digits);
     }
-    return to_formated_chars(v, sign, output, d);
+    return to_formated_chars(v, sign, output, digits);
   } else {
     floating_decimal_64 v = d2d(value, sign, special);
     if (special) {
-      return copy_special_str(output, sign, v.exponent, v.mantissa, d);
+      return copy_special_str(output, sign, v.exponent, v.mantissa, digits);
     }
-    return to_formated_chars(v, sign, output, d);
+    return to_formated_chars(v, sign, output, digits);
   }
 }
 
diff --git a/src/main/cpp/tests/cast_decimal_to_string.cpp b/src/main/cpp/tests/cast_decimal_to_string.cpp
index 1a93354339..ba1aaf05c8 100644
--- a/src/main/cpp/tests/cast_decimal_to_string.cpp
+++ b/src/main/cpp/tests/cast_decimal_to_string.cpp
@@ -24,9 +24,10 @@
 
 #include <cudf/strings/convert/convert_floats.hpp>
 
-#include <limits>
 #include <rmm/device_uvector.hpp>
 
+#include <limits>
+
 using namespace cudf;
 
 template <typename T>
diff --git a/src/main/cpp/tests/cast_string.cpp b/src/main/cpp/tests/cast_string.cpp
index c736d5971f..1f7aaaad21 100644
--- a/src/main/cpp/tests/cast_string.cpp
+++ b/src/main/cpp/tests/cast_string.cpp
@@ -24,9 +24,10 @@
 
 #include <cudf/strings/convert/convert_floats.hpp>
 
-#include <limits>
 #include <rmm/device_uvector.hpp>
 
+#include <limits>
+
 using namespace cudf;
 
 template <typename T>
diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp
index acafd231c4..9f02b2b0b6 100644
--- a/src/main/cpp/tests/format_float.cpp
+++ b/src/main/cpp/tests/format_float.cpp
@@ -24,9 +24,10 @@
 
 #include <cudf/strings/convert/convert_floats.hpp>
 
-#include <limits>
 #include <rmm/device_uvector.hpp>
 
+#include <limits>
+
 using namespace cudf;
 
 constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR};
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index feda669e32..6a9751fb98 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -85,9 +85,10 @@ public static ColumnVector toDecimal(ColumnView cv, boolean ansiMode, boolean st
    *
    * @param cv the column data to process
    * @return the converted column
+   * @param digits the number of digits to display after the decimal point
    */
-  public static ColumnVector formatFloat(ColumnView cv, int d) {
-    return new ColumnVector(formatFloat(cv.getNativeView(), d));
+  public static ColumnVector formatFloat(ColumnView cv, int digits) {
+    return new ColumnVector(formatFloat(cv.getNativeView(), digits));
   }
 
   /**
@@ -147,7 +148,7 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled
       int precision, int scale);
   private static native long toFloat(long nativeColumnView, boolean ansi_enabled, int dtype);
   private static native long fromDecimal(long nativeColumnView);
-  private static native long formatFloat(long nativeColumnView, int d);
+  private static native long formatFloat(long nativeColumnView, int digits);
   private static native long toIntegersWithBase(long nativeColumnView, int base,
     boolean ansiEnabled, int dtype);
   private static native long fromIntegersWithBase(long nativeColumnView, int base);

From b78e3b35da66956847f222e330f0c33e9aaacc32 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 27 Nov 2023 17:28:45 +0800
Subject: [PATCH 38/54] Address comments: use CUDF_ENABLE_IF, simplify float_to_string tests

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/cast_float_to_string.cu    |  4 +-
 src/main/cpp/tests/cast_float_to_string.cpp | 53 ++++-----------------
 thirdparty/cudf                             |  2 +-
 3 files changed, 13 insertions(+), 46 deletions(-)

diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index c41936c3bb..1c9eb2c4d6 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -78,7 +78,7 @@ struct float_to_string_fn {
  * The template function declaration ensures only float types are allowed.
  */
 struct dispatch_float_to_string_fn {
-  template <typename FloatType, std::enable_if_t<std::is_floating_point_v<FloatType>>* = nullptr>
+  template <typename FloatType, CUDF_ENABLE_IF(std::is_floating_point_v<FloatType>)>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& floats,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
@@ -97,7 +97,7 @@ struct dispatch_float_to_string_fn {
   }
 
   // non-float types throw an exception
-  template <typename T, std::enable_if_t<not std::is_floating_point_v<T>>* = nullptr>
+  template <typename T, CUDF_ENABLE_IF(not std::is_floating_point_v<T>)>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const&,
                                      rmm::cuda_stream_view,
                                      rmm::mr::device_memory_resource*)
diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp
index 0d4f62ac1b..806b3eaad5 100644
--- a/src/main/cpp/tests/cast_float_to_string.cpp
+++ b/src/main/cpp/tests/cast_float_to_string.cpp
@@ -35,61 +35,28 @@ struct FloatToStringTests : public cudf::test::BaseFixture {};
 
 TEST_F(FloatToStringTests, FromFloats32)
 {
-  std::vector<float> h_floats{100,
-                              654321.25,
-                              -12761.125,
-                              0,
-                              5,
-                              -4,
-                              std::numeric_limits<float>::quiet_NaN(),
-                              123456789012.34,
-                              -0.0};
-  std::vector<char const*> h_expected{
-    "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "1.2345679E11", "-0.0"};
-
-  cudf::test::fixed_width_column_wrapper<float> floats(
-    h_floats.begin(),
-    h_floats.end(),
-    thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
+  auto const floats = cudf::test::fixed_width_column_wrapper<float> {
+    100.0f, 654321.25f, -12761.125f, 0.f, 5.0f, -4.0f, std::numeric_limits<float>::quiet_NaN(), 123456789012.34f, -0.0f};
 
   auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream());
 
-  cudf::test::strings_column_wrapper expected(
-    h_expected.begin(),
-    h_expected.end(),
-    thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
+  auto const expected = cudf::test::strings_column_wrapper{
+    "100.0", "654321.25", "-12761.125", "0.0", "5.0", "-4.0", "NaN", "1.2345679E11", "-0.0"};
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
 }
 
 TEST_F(FloatToStringTests, FromFloats64)
 {
-  std::vector<double> h_floats{100,
-                               654321.25,
-                               -12761.125,
-                               1.123456789123456789,
-                               0.000000000000000000123456789123456789,
-                               0,
-                               5,
-                               -4,
-                               std::numeric_limits<double>::quiet_NaN(),
-                               839542223232.794248339,
-                               -0.0};
-  std::vector<char const*> h_expected{
-    "100.0", "654321.25", "-12761.125", "1.1234567891234568", "1.234567891234568E-19", 
-    "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"};
-
-  cudf::test::fixed_width_column_wrapper<double> floats(
-    h_floats.begin(),
-    h_floats.end(),
-    thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
+  auto const floats = cudf::test::fixed_width_column_wrapper<double> {
+    100.0d, 654321.25d, -12761.125d, 1.123456789123456789d, 0.000000000000000000123456789123456789d,
+    0.0d, 5.0d, -4.0d, std::numeric_limits<double>::quiet_NaN(), 839542223232.794248339d, -0.0d};
 
   auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream());
 
-  cudf::test::strings_column_wrapper expected(
-    h_expected.begin(),
-    h_expected.end(),
-    thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
+  auto const expected = cudf::test::strings_column_wrapper{
+    "100.0", "654321.25", "-12761.125", "1.1234567891234568", "1.234567891234568E-19", 
+    "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"};
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
 }
\ No newline at end of file
diff --git a/thirdparty/cudf b/thirdparty/cudf
index 823d3214a9..168533a8ad 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 823d3214a9489e3c496aa31041b5d29f650e94b3
+Subproject commit 168533a8ad4086bd020be4f7bf9264a08b6d2243

From d2cba4f165d2d3feb7bcf362ae81fde05c16557f Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 27 Nov 2023 18:02:08 +0800
Subject: [PATCH 39/54] Address comments: rename formatFloat to fromFloatWithFormat, tidy tests

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 .../cpp/benchmarks/cast_string_to_float.cpp   |  2 +-
 src/main/cpp/src/CastStringJni.cpp            |  2 +-
 src/main/cpp/src/format_float.cu              |  4 +-
 src/main/cpp/tests/cast_decimal_to_string.cpp |  2 +-
 src/main/cpp/tests/cast_string.cpp            |  2 +-
 src/main/cpp/tests/format_float.cpp           | 53 ++++---------------
 .../nvidia/spark/rapids/jni/CastStrings.java  |  6 +--
 7 files changed, 18 insertions(+), 53 deletions(-)

diff --git a/src/main/cpp/benchmarks/cast_string_to_float.cpp b/src/main/cpp/benchmarks/cast_string_to_float.cpp
index d94f9d26a0..32e245aa98 100644
--- a/src/main/cpp/benchmarks/cast_string_to_float.cpp
+++ b/src/main/cpp/benchmarks/cast_string_to_float.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <cast_string.hpp>
+#include "cast_string.hpp"
 
 #include <benchmarks/common/generate_input.hpp>
 
diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index 063fabe222..720f8514fc 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -109,7 +109,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toFloat(
   CATCH_CAST_EXCEPTION(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_formatFloat(
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloatWithFormat(
   JNIEnv* env, jclass, jlong input_column, jint digits, jint j_dtype)
 {
   JNI_NULL_CHECK(env, input_column, "input column is null", 0);
diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu
index 3d3fedbfb9..ec3c934415 100644
--- a/src/main/cpp/src/format_float.cu
+++ b/src/main/cpp/src/format_float.cu
@@ -79,7 +79,7 @@ struct format_float_fn {
  * The template function declaration ensures only float types are allowed.
  */
 struct dispatch_format_float_fn {
-  template <typename FloatType, std::enable_if_t<std::is_floating_point_v<FloatType>>* = nullptr>
+  template <typename FloatType, CUDF_ENABLE_IF(std::is_floating_point_v<FloatType>)>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& floats,
                                      int digits,
                                      rmm::cuda_stream_view stream,
@@ -99,7 +99,7 @@ struct dispatch_format_float_fn {
   }
 
   // non-float types throw an exception
-  template <typename T, std::enable_if_t<not std::is_floating_point_v<T>>* = nullptr>
+  template <typename T, CUDF_ENABLE_IF(not std::is_floating_point_v<T>)>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const&,
                                      int,
                                      rmm::cuda_stream_view,
diff --git a/src/main/cpp/tests/cast_decimal_to_string.cpp b/src/main/cpp/tests/cast_decimal_to_string.cpp
index ba1aaf05c8..05002c373c 100644
--- a/src/main/cpp/tests/cast_decimal_to_string.cpp
+++ b/src/main/cpp/tests/cast_decimal_to_string.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <cast_string.hpp>
+#include "cast_string.hpp"
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
diff --git a/src/main/cpp/tests/cast_string.cpp b/src/main/cpp/tests/cast_string.cpp
index 1f7aaaad21..0a3f221894 100644
--- a/src/main/cpp/tests/cast_string.cpp
+++ b/src/main/cpp/tests/cast_string.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <cast_string.hpp>
+#include "cast_string.hpp"
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp
index 9f02b2b0b6..6d510d83a0 100644
--- a/src/main/cpp/tests/format_float.cpp
+++ b/src/main/cpp/tests/format_float.cpp
@@ -36,63 +36,28 @@ struct FormatFloatTests : public cudf::test::BaseFixture {};
 
 TEST_F(FormatFloatTests, FormatFloats32)
 {
-  std::vector<float> h_floats{100,
-                              654321.25,
-                              -12761.125,
-                              0,
-                              5,
-                              -4,
-                              std::numeric_limits<float>::quiet_NaN(),
-                              123456789012.34,
-                              -0.0
-                              };
-  std::vector<char const*> h_expected{
+  auto const floats = cudf::test::fixed_width_column_wrapper<float> {
+    100.0f, 654321.25f, -12761.125f, 0.0f, 5.0f, -4.0f, std::numeric_limits<float>::quiet_NaN(), 123456789012.34f, -0.0f};
+  
+  auto const expected = cudf::test::strings_column_wrapper{
     "100.00000", "654,321.25000", "-12,761.12500", "0.00000", "5.00000", "-4.00000", "NaN", "123,456,790,000.00000", "-0.00000"};
 
-  cudf::test::fixed_width_column_wrapper<float> floats(
-    h_floats.begin(),
-    h_floats.end(),
-    thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
-
   auto results = spark_rapids_jni::format_float(floats, 5, cudf::get_default_stream());
 
-  cudf::test::strings_column_wrapper expected(
-    h_expected.begin(),
-    h_expected.end(),
-    thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
-
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
 }
 
 TEST_F(FormatFloatTests, FormatFloats64)
 {
-    std::vector<double> h_floats{100,
-                               654321.25,
-                               -12761.125,
-                               1.123456789123456789,
-                               0.000000000000000000123456789123456789,
-                               0,
-                               5,
-                               -4,
-                               std::numeric_limits<double>::quiet_NaN(),
-                               839542223232.794248339,
-                               -0.0
-                               };
-  std::vector<char const*> h_expected{
+  auto const floats = cudf::test::fixed_width_column_wrapper<double> {
+      100.0d, 654321.25d, -12761.125d, 1.123456789123456789d, 0.000000000000000000123456789123456789d,
+      0.0d, 5.0d, -4.0d, std::numeric_limits<double>::quiet_NaN(), 839542223232.794248339d, -0.0d};
+
+  auto const expected = cudf::test::strings_column_wrapper{
     "100.00000", "654,321.25000", "-12,761.12500", "1.12346", "0.00000", "0.00000", "5.00000", 
     "-4.00000", "NaN", "839,542,223,232.79420", "-0.00000"};
 
-  cudf::test::fixed_width_column_wrapper<double> floats(
-    h_floats.begin(),
-    h_floats.end(),
-    thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
-
   auto results = spark_rapids_jni::format_float(floats, 5, cudf::get_default_stream());
 
-  cudf::test::strings_column_wrapper expected(
-    h_expected.begin(),
-    h_expected.end(),
-    thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
-
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
 }
\ No newline at end of file
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index 6a9751fb98..6ce4687b23 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -87,8 +87,8 @@ public static ColumnVector toDecimal(ColumnView cv, boolean ansiMode, boolean st
    * @return the converted column
    * @param digits the number of digits to display after the decimal point
    */
-  public static ColumnVector formatFloat(ColumnView cv, int digits) {
-    return new ColumnVector(formatFloat(cv.getNativeView(), digits));
+  public static ColumnVector fromFloatWithFormat(ColumnView cv, int digits) {
+    return new ColumnVector(fromFloatWithFormat(cv.getNativeView(), digits));
   }
 
   /**
@@ -148,7 +148,7 @@ private static native long toDecimal(long nativeColumnView, boolean ansi_enabled
       int precision, int scale);
   private static native long toFloat(long nativeColumnView, boolean ansi_enabled, int dtype);
   private static native long fromDecimal(long nativeColumnView);
-  private static native long formatFloat(long nativeColumnView, int digits);
+  private static native long fromFloatWithFormat(long nativeColumnView, int digits);
   private static native long toIntegersWithBase(long nativeColumnView, int base,
     boolean ansiEnabled, int dtype);
   private static native long fromIntegersWithBase(long nativeColumnView, int base);

From 04d1c4f10cd95c7122de091d564138ada5aea4bc Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 4 Dec 2023 17:20:22 +0800
Subject: [PATCH 40/54] Apply clang-format to float-to-string sources

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/CastStringJni.cpp          |   9 +-
 src/main/cpp/src/cast_float_to_string.cu    |  36 +-
 src/main/cpp/src/ftos_converter.cuh         | 705 ++++++++++----------
 src/main/cpp/tests/cast_float_to_string.cpp |  42 +-
 thirdparty/cudf                             |   2 +-
 5 files changed, 420 insertions(+), 374 deletions(-)

diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index 093b51188b..933fc15e34 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -109,15 +109,16 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_toFloat(
   CATCH_CAST_EXCEPTION(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloat(
-  JNIEnv* env, jclass, jlong input_column, jint j_dtype)
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromFloat(JNIEnv* env,
+                                                                               jclass,
+                                                                               jlong input_column)
 {
   JNI_NULL_CHECK(env, input_column, "input column is null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
 
-    cudf::column_view cv{*reinterpret_cast<cudf::column_view const*>(input_column)};
+    auto const& cv = *reinterpret_cast<cudf::column_view const*>(input_column);
     return cudf::jni::release_as_jlong(
       spark_rapids_jni::float_to_string(cv, cudf::get_default_stream()));
   }
@@ -133,7 +134,7 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_CastStrings_fromDecimal
   try {
     cudf::jni::auto_set_device(env);
 
-    cudf::column_view cv{*reinterpret_cast<cudf::column_view const*>(input_column)};
+    auto const& cv = *reinterpret_cast<cudf::column_view const*>(input_column);
     return cudf::jni::release_as_jlong(
       spark_rapids_jni::decimal_to_non_ansi_string(cv, cudf::get_default_stream()));
   }
diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index 1c9eb2c4d6..31d3f69d11 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -47,14 +47,15 @@ struct float_to_string_fn {
   __device__ cudf::size_type compute_output_size(FloatType value) const
   {
     bool constexpr is_float = std::is_same_v<FloatType, float>;
-    return static_cast<cudf::size_type>(ftos_converter::compute_ftos_size(static_cast<double>(value), is_float));
+    return static_cast<cudf::size_type>(
+      ftos_converter::compute_ftos_size(static_cast<double>(value), is_float));
   }
 
   __device__ void float_to_string(cudf::size_type idx) const
   {
-    auto const value = d_floats.element<FloatType>(idx);
+    auto const value        = d_floats.element<FloatType>(idx);
     bool constexpr is_float = std::is_same_v<FloatType, float>;
-    auto const output = d_chars + d_offsets[idx];
+    auto const output       = d_chars + d_offsets[idx];
     ftos_converter::float_to_string(static_cast<double>(value), is_float, output);
   }
 
@@ -80,14 +81,16 @@ struct float_to_string_fn {
 struct dispatch_float_to_string_fn {
   template <typename FloatType, CUDF_ENABLE_IF(std::is_floating_point_v<FloatType>)>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& floats,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr)
   {
     auto const strings_count = floats.size();
+    if (strings_count == 0) { return cudf::make_empty_column(cudf::type_id::STRING); }
+
     auto const input_ptr = cudf::column_device_view::create(floats, stream);
 
-    auto [offsets, chars] =
-      cudf::strings::detail::make_strings_children(float_to_string_fn<FloatType>{*input_ptr}, strings_count, stream, mr);
+    auto [offsets, chars] = cudf::strings::detail::make_strings_children(
+      float_to_string_fn<FloatType>{*input_ptr}, strings_count, stream, mr);
 
     return make_strings_column(strings_count,
                                std::move(offsets),
@@ -99,8 +102,8 @@ struct dispatch_float_to_string_fn {
   // non-float types throw an exception
   template <typename T, CUDF_ENABLE_IF(not std::is_floating_point_v<T>)>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const&,
-                                     rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*)
+                                           rmm::cuda_stream_view,
+                                           rmm::mr::device_memory_resource*)
   {
     CUDF_FAIL("Values for float_to_string function must be a float type.");
   }
@@ -110,23 +113,18 @@ struct dispatch_float_to_string_fn {
 
 // This will convert all float column types into a strings column.
 std::unique_ptr<cudf::column> float_to_string(cudf::column_view const& floats,
-                                    rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr)
 {
-  auto const strings_count = floats.size();
-  if (strings_count == 0) { 
-    return cudf::make_empty_column(cudf::type_id::STRING);
-  }
-
   return type_dispatcher(floats.type(), dispatch_float_to_string_fn{}, floats, stream, mr);
 }
 
 }  // namespace detail
 
 // external API
-std::unique_ptr<cudf::column> float_to_string(cudf::column_view const& floats, 
-                                      rmm::cuda_stream_view stream, 
-                                      rmm::mr::device_memory_resource* mr)
+std::unique_ptr<cudf::column> float_to_string(cudf::column_view const& floats,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::float_to_string(floats, stream, mr);
diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh
index b9905ae567..444f790d3c 100644
--- a/src/main/cpp/src/ftos_converter.cuh
+++ b/src/main/cpp/src/ftos_converter.cuh
@@ -15,11 +15,11 @@
  * limitations under the License.
  */
 
+#include <cuda/std/cassert>
 #include <cuda/std/climits>
+#include <cuda/std/cstdint>
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
-#include <cuda/std/cassert>
-#include <cuda/std/cstdint>
 
 namespace spark_rapids_jni::ftos_converter {
 
@@ -47,84 +47,109 @@ typedef struct floating_decimal_32 {
 
 // These tables are generated by PrintDoubleLookupTable.
 constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125;
-constexpr unsigned int DOUBLE_POW5_BITCOUNT = 125;
-constexpr unsigned int FLOAT_POW5_INV_BITCOUNT = (DOUBLE_POW5_INV_BITCOUNT - 64);
-constexpr unsigned int FLOAT_POW5_BITCOUNT = (DOUBLE_POW5_BITCOUNT - 64);
-constexpr unsigned int DOUBLE_MANTISSA_BITS = 52;
-constexpr unsigned int DOUBLE_EXPONENT_BITS = 11;
-constexpr unsigned int DOUBLE_BIAS = 1023;
-constexpr unsigned int FLOAT_MANTISSA_BITS = 23;
-constexpr unsigned int FLOAT_EXPONENT_BITS = 8;
-constexpr unsigned int FLOAT_BIAS = 127;
-
-__constant__
-uint64_t const DOUBLE_POW5_INV_SPLIT2[15][2] = {
-  {                    1u, 2305843009213693952u },
-  {  5955668970331000884u, 1784059615882449851u },
-  {  8982663654677661702u, 1380349269358112757u },
-  {  7286864317269821294u, 2135987035920910082u },
-  {  7005857020398200553u, 1652639921975621497u },
-  { 17965325103354776697u, 1278668206209430417u },
-  {  8928596168509315048u, 1978643211784836272u },
-  { 10075671573058298858u, 1530901034580419511u },
-  {   597001226353042382u, 1184477304306571148u },
-  {  1527430471115325346u, 1832889850782397517u },
-  { 12533209867169019542u, 1418129833677084982u },
-  {  5577825024675947042u, 2194449627517475473u },
-  { 11006974540203867551u, 1697873161311732311u },
-  { 10313493231639821582u, 1313665730009899186u },
-  { 12701016819766672773u, 2032799256770390445u }
-};
-
-__constant__
-uint32_t const POW5_INV_OFFSETS[19] = {
-  0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555,
-  0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054,
-  0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554,
-  0x00000000
-};
-
-__constant__
-uint64_t const DOUBLE_POW5_SPLIT2[13][2] = {
-  {                    0u, 1152921504606846976u },
-  {                    0u, 1490116119384765625u },
-  {  1032610780636961552u, 1925929944387235853u },
-  {  7910200175544436838u, 1244603055572228341u },
-  { 16941905809032713930u, 1608611746708759036u },
-  { 13024893955298202172u, 2079081953128979843u },
-  {  6607496772837067824u, 1343575221513417750u },
-  { 17332926989895652603u, 1736530273035216783u },
-  { 13037379183483547984u, 2244412773384604712u },
-  {  1605989338741628675u, 1450417759929778918u },
-  {  9630225068416591280u, 1874621017369538693u },
-  {   665883850346957067u, 1211445438634777304u },
-  { 14931890668723713708u, 1565756531257009982u }
-};
-
-__constant__
-uint32_t const POW5_OFFSETS[21] = {
-  0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995,
-  0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540,
-  0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151,
-  0x55559155, 0x51405555, 0x00000105
-};
+constexpr unsigned int DOUBLE_POW5_BITCOUNT     = 125;
+constexpr unsigned int FLOAT_POW5_INV_BITCOUNT  = (DOUBLE_POW5_INV_BITCOUNT - 64);
+constexpr unsigned int FLOAT_POW5_BITCOUNT      = (DOUBLE_POW5_BITCOUNT - 64);
+constexpr unsigned int DOUBLE_MANTISSA_BITS     = 52;
+constexpr unsigned int DOUBLE_EXPONENT_BITS     = 11;
+constexpr unsigned int DOUBLE_BIAS              = 1023;
+constexpr unsigned int FLOAT_MANTISSA_BITS      = 23;
+constexpr unsigned int FLOAT_EXPONENT_BITS      = 8;
+constexpr unsigned int FLOAT_BIAS               = 127;
+
+__constant__ uint64_t const DOUBLE_POW5_INV_SPLIT2[15][2] = {
+  {1u, 2305843009213693952u},
+  {5955668970331000884u, 1784059615882449851u},
+  {8982663654677661702u, 1380349269358112757u},
+  {7286864317269821294u, 2135987035920910082u},
+  {7005857020398200553u, 1652639921975621497u},
+  {17965325103354776697u, 1278668206209430417u},
+  {8928596168509315048u, 1978643211784836272u},
+  {10075671573058298858u, 1530901034580419511u},
+  {597001226353042382u, 1184477304306571148u},
+  {1527430471115325346u, 1832889850782397517u},
+  {12533209867169019542u, 1418129833677084982u},
+  {5577825024675947042u, 2194449627517475473u},
+  {11006974540203867551u, 1697873161311732311u},
+  {10313493231639821582u, 1313665730009899186u},
+  {12701016819766672773u, 2032799256770390445u}};
+
+__constant__ uint32_t const POW5_INV_OFFSETS[19] = {0x54544554,
+                                                    0x04055545,
+                                                    0x10041000,
+                                                    0x00400414,
+                                                    0x40010000,
+                                                    0x41155555,
+                                                    0x00000454,
+                                                    0x00010044,
+                                                    0x40000000,
+                                                    0x44000041,
+                                                    0x50454450,
+                                                    0x55550054,
+                                                    0x51655554,
+                                                    0x40004000,
+                                                    0x01000001,
+                                                    0x00010500,
+                                                    0x51515411,
+                                                    0x05555554,
+                                                    0x00000000};
+
+__constant__ uint64_t const DOUBLE_POW5_SPLIT2[13][2] = {
+  {0u, 1152921504606846976u},
+  {0u, 1490116119384765625u},
+  {1032610780636961552u, 1925929944387235853u},
+  {7910200175544436838u, 1244603055572228341u},
+  {16941905809032713930u, 1608611746708759036u},
+  {13024893955298202172u, 2079081953128979843u},
+  {6607496772837067824u, 1343575221513417750u},
+  {17332926989895652603u, 1736530273035216783u},
+  {13037379183483547984u, 2244412773384604712u},
+  {1605989338741628675u, 1450417759929778918u},
+  {9630225068416591280u, 1874621017369538693u},
+  {665883850346957067u, 1211445438634777304u},
+  {14931890668723713708u, 1565756531257009982u}};
+
+__constant__ uint32_t const POW5_OFFSETS[21] = {
+  0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995, 0x55545555,
+  0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540, 0x45555550, 0x40004000,
+  0x96440440, 0x55565565, 0x54454045, 0x40154151, 0x55559155, 0x51405555, 0x00000105};
 
 constexpr uint32_t POW5_TABLE_SIZE = 26;
 
-__constant__
-uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {
-1ull, 5ull, 25ull, 125ull, 625ull, 3125ull, 15625ull, 78125ull, 390625ull,
-1953125ull, 9765625ull, 48828125ull, 244140625ull, 1220703125ull, 6103515625ull,
-30517578125ull, 152587890625ull, 762939453125ull, 3814697265625ull,
-19073486328125ull, 95367431640625ull, 476837158203125ull,
-2384185791015625ull, 11920928955078125ull, 59604644775390625ull,
-298023223876953125ull //, 1490116119384765625ull
+__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {
+  1ull,
+  5ull,
+  25ull,
+  125ull,
+  625ull,
+  3125ull,
+  15625ull,
+  78125ull,
+  390625ull,
+  1953125ull,
+  9765625ull,
+  48828125ull,
+  244140625ull,
+  1220703125ull,
+  6103515625ull,
+  30517578125ull,
+  152587890625ull,
+  762939453125ull,
+  3814697265625ull,
+  19073486328125ull,
+  95367431640625ull,
+  476837158203125ull,
+  2384185791015625ull,
+  11920928955078125ull,
+  59604644775390625ull,
+  298023223876953125ull  //, 1490116119384765625ull
 };
 
 //===== common.h from ryu =====
 
 // Returns the number of decimal digits in v, which must not contain more than 9 digits.
-__device__ inline uint32_t decimalLength9(uint32_t const v) {
+__device__ inline uint32_t decimalLength9(uint32_t const v)
+{
   // Function precondition: v is not a 10-digit number.
   // (f2s: 9 digits are sufficient for round-tripping.)
   // (d2fixed: We print 9-digit blocks.)
@@ -141,40 +166,42 @@ __device__ inline uint32_t decimalLength9(uint32_t const v) {
 }
 
 // Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528.
-__device__ inline int32_t pow5bits(int32_t const e) {
+__device__ inline int32_t pow5bits(int32_t const e)
+{
   // This approximation works up to the point that the multiplication overflows at e = 3529.
   // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater
   // than 2^9297.
   assert(e >= 0);
   assert(e <= 3528);
-  return (int32_t) (((((uint32_t) e) * 1217359) >> 19) + 1);
+  return (int32_t)(((((uint32_t)e) * 1217359) >> 19) + 1);
 }
 
 // Returns floor(log_10(2^e)); requires 0 <= e <= 1650.
-__device__ inline uint32_t log10Pow2(int32_t const e) {
+__device__ inline uint32_t log10Pow2(int32_t const e)
+{
   // The first value this approximation fails for is 2^1651 which is just greater than 10^297.
   assert(e >= 0);
   assert(e <= 1650);
-  return (((uint32_t) e) * 78913) >> 18;
+  return (((uint32_t)e) * 78913) >> 18;
 }
 
 // Returns floor(log_10(5^e)); requires 0 <= e <= 2620.
-__device__ inline uint32_t log10Pow5(int32_t const e) {
+__device__ inline uint32_t log10Pow5(int32_t const e)
+{
   // The first value this approximation fails for is 5^2621 which is just greater than 10^1832.
   assert(e >= 0);
   assert(e <= 2620);
-  return (((uint32_t) e) * 732923) >> 20;
+  return (((uint32_t)e) * 732923) >> 20;
 }
 
-__device__ inline uint32_t pow5factor_32(uint32_t value) {
+__device__ inline uint32_t pow5factor_32(uint32_t value)
+{
   uint32_t count = 0;
   for (;;) {
     assert(value != 0);
     uint32_t const q = value / 5;
     uint32_t const r = value % 5;
-    if (r != 0) {
-      break;
-    }
+    if (r != 0) { break; }
     value = q;
     ++count;
   }
@@ -182,43 +209,47 @@ __device__ inline uint32_t pow5factor_32(uint32_t value) {
 }
 
 // Returns true if value is divisible by 5^p.
-__device__ inline bool multipleOfPowerOf5_32(uint32_t const value, uint32_t const p) {
+__device__ inline bool multipleOfPowerOf5_32(uint32_t const value, uint32_t const p)
+{
   return pow5factor_32(value) >= p;
 }
 
 // Returns true if value is divisible by 2^p.
-__device__ inline bool multipleOfPowerOf2_32(uint32_t const value, uint32_t const p) {
+__device__ inline bool multipleOfPowerOf2_32(uint32_t const value, uint32_t const p)
+{
   // __builtin_ctz doesn't appear to be faster here.
   return (value & ((1u << p) - 1)) == 0;
 }
 
 // It seems to be slightly faster to avoid uint128_t here, although the
 // generated code for uint128_t looks slightly nicer.
-__device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, int32_t const shift) {
+__device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, int32_t const shift)
+{
   assert(shift > 32);
 
   // The casts here help MSVC to avoid calls to the __allmul library
   // function.
   uint32_t const factorLo = (uint32_t)(factor);
   uint32_t const factorHi = (uint32_t)(factor >> 32);
-  uint64_t const bits0 = (uint64_t)m * factorLo;
-  uint64_t const bits1 = (uint64_t)m * factorHi;
+  uint64_t const bits0    = (uint64_t)m * factorLo;
+  uint64_t const bits1    = (uint64_t)m * factorHi;
 
-  uint64_t const sum = (bits0 >> 32) + bits1;
+  uint64_t const sum        = (bits0 >> 32) + bits1;
   uint64_t const shiftedSum = sum >> (shift - 32);
   assert(shiftedSum <= UINT32_MAX);
-  return (uint32_t) shiftedSum;
-
+  return (uint32_t)shiftedSum;
 }
 
-__device__ inline int copy_special_str(char * const result, bool const sign, bool const exponent, bool const mantissa) {
+__device__ inline int copy_special_str(char* const result,
+                                       bool const sign,
+                                       bool const exponent,
+                                       bool const mantissa)
+{
   if (mantissa) {
     memcpy(result, "NaN", 3);
     return 3;
   }
-  if (sign) {
-    result[0] = '-';
-  }
+  if (sign) { result[0] = '-'; }
   if (exponent) {
     memcpy(result + sign, "Infinity", 8);
     return sign + 8;
@@ -227,23 +258,22 @@ __device__ inline int copy_special_str(char * const result, bool const sign, boo
   return sign + 3;
 }
 
-__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa) {
-  if (mantissa) {
-    return 3;
-  }
-  if (exponent) {
-    return sign + 8;
-  }
+__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa)
+{
+  if (mantissa) { return 3; }
+  if (exponent) { return sign + 8; }
   return sign + 3;
 }
 
-__device__ inline uint32_t float_to_bits(float const f) {
+__device__ inline uint32_t float_to_bits(float const f)
+{
   uint32_t bits = 0;
   memcpy(&bits, &f, sizeof(float));
   return bits;
 }
 
-__device__ inline uint64_t double_to_bits(double const d) {
+__device__ inline uint64_t double_to_bits(double const d)
+{
   uint64_t bits = 0;
   memcpy(&bits, &d, sizeof(double));
   return bits;
@@ -251,7 +281,8 @@ __device__ inline uint64_t double_to_bits(double const d) {
 
 //===== d2s_intrinsics.h from ryu =====
 
-__device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t* const productHi) {
+__device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t* const productHi)
+{
   // The casts here help MSVC to avoid calls to the __allmul library function.
   uint32_t const aLo = (uint32_t)a;
   uint32_t const aHi = (uint32_t)(a >> 32);
@@ -266,11 +297,11 @@ __device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t*
   uint32_t const b00Lo = (uint32_t)b00;
   uint32_t const b00Hi = (uint32_t)(b00 >> 32);
 
-  uint64_t const mid1 = b10 + b00Hi;
+  uint64_t const mid1   = b10 + b00Hi;
   uint32_t const mid1Lo = (uint32_t)(mid1);
   uint32_t const mid1Hi = (uint32_t)(mid1 >> 32);
 
-  uint64_t const mid2 = b01 + mid1Lo;
+  uint64_t const mid2   = b01 + mid1Lo;
   uint32_t const mid2Lo = (uint32_t)(mid2);
   uint32_t const mid2Hi = (uint32_t)(mid2 >> 32);
 
@@ -281,68 +312,71 @@ __device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t*
   return pLo;
 }
 
-__device__ inline uint64_t shiftright128(uint64_t const lo, uint64_t const hi, uint32_t const dist) {
+__device__ inline uint64_t shiftright128(uint64_t const lo, uint64_t const hi, uint32_t const dist)
+{
   // We don't need to handle the case dist >= 64 here (see above).
   assert(dist < 64);
   assert(dist > 0);
   return (hi << (64 - dist)) | (lo >> dist);
 }
 
-__device__ inline uint64_t div5(uint64_t const x) {
-  return x / 5;
-}
+__device__ inline uint64_t div5(uint64_t const x) { return x / 5; }
 
-__device__ inline uint64_t div10(uint64_t const x) {
-  return x / 10;
-}
+__device__ inline uint64_t div10(uint64_t const x) { return x / 10; }
 
-__device__ inline uint64_t div100(uint64_t const x) {
-  return x / 100;
-}
+__device__ inline uint64_t div100(uint64_t const x) { return x / 100; }
 
-__device__ inline uint32_t pow5Factor(uint64_t value) {
-  uint64_t const m_inv_5 = 14757395258967641293u; // 5 * m_inv_5 = 1 (mod 2^64)
-  uint64_t const n_div_5 = 3689348814741910323u;  // #{ n | n = 0 (mod 2^64) } = 2^64 / 5
-  uint32_t count = 0;
+__device__ inline uint32_t pow5Factor(uint64_t value)
+{
+  uint64_t const m_inv_5 = 14757395258967641293u;  // 5 * m_inv_5 = 1 (mod 2^64)
+  uint64_t const n_div_5 = 3689348814741910323u;   // #{ n | n = 0 (mod 2^64) } = 2^64 / 5
+  uint32_t count         = 0;
   for (;;) {
     assert(value != 0);
     value *= m_inv_5;
-    if (value > n_div_5)
-      break;
+    if (value > n_div_5) break;
     ++count;
   }
   return count;
 }
 
 // Returns true if value is divisible by 5^p.
-__device__ inline bool multipleOfPowerOf5(uint64_t const value, uint32_t const p) {
+__device__ inline bool multipleOfPowerOf5(uint64_t const value, uint32_t const p)
+{
   // I tried a case distinction on p, but there was no performance difference.
   return pow5Factor(value) >= p;
 }
 
 // Returns true if value is divisible by 2^p.
-__device__ inline bool multipleOfPowerOf2(uint64_t const value, uint32_t const p) {
+__device__ inline bool multipleOfPowerOf2(uint64_t const value, uint32_t const p)
+{
   assert(value != 0);
   assert(p < 64);
   // __builtin_ctzll doesn't appear to be faster here.
   return (value & ((1ull << p) - 1)) == 0;
 }
 
-__device__ inline uint64_t mulShift64(uint64_t const m, uint64_t const* const mul, int32_t const j) {
+__device__ inline uint64_t mulShift64(uint64_t const m, uint64_t const* const mul, int32_t const j)
+{
   // m is maximum 55 bits
-  uint64_t high1;                                   // 128
-  uint64_t const low1 = umul128(m, mul[1], &high1); // 64
-  uint64_t high0;                                   // 64
-  umul128(m, mul[0], &high0);                       // 0
+  uint64_t high1;                                    // 128
+  uint64_t const low1 = umul128(m, mul[1], &high1);  // 64
+  uint64_t high0;                                    // 64
+  umul128(m, mul[0], &high0);                        // 0
   uint64_t const sum = high0 + low1;
   if (sum < high0) {
-    ++high1; // overflow into high1
+    ++high1;  // overflow into high1
   }
   return shiftright128(sum, high1, j - 64);
 }
 
-__device__ inline uint64_t mulShiftAll64(uint64_t const m, uint64_t const* const mul, int32_t const j,
-  uint64_t* const vp, uint64_t* const vm, uint32_t const mmShift) {
+__device__ inline uint64_t mulShiftAll64(uint64_t const m,
+                                         uint64_t const* const mul,
+                                         int32_t const j,
+                                         uint64_t* const vp,
+                                         uint64_t* const vm,
+                                         uint32_t const mmShift)
+{
   *vp = mulShift64(4 * m + 2, mul, j);
   *vm = mulShift64(4 * m - 1 - mmShift, mul, j);
   return mulShift64(4 * m, mul, j);
@@ -351,10 +385,11 @@ __device__ inline uint64_t mulShiftAll64(uint64_t const m, uint64_t const* const
 //===== d2s_small_table.h from ryu =====
 
 // Computes 5^i in the form required by Ryu, and stores it in the given pointer.
-__device__ inline void double_computePow5(uint32_t const i, uint64_t* const result) {
-  uint32_t const base = i / POW5_TABLE_SIZE;
-  uint32_t const base2 = base * POW5_TABLE_SIZE;
-  uint32_t const offset = i - base2;
+__device__ inline void double_computePow5(uint32_t const i, uint64_t* const result)
+{
+  uint32_t const base       = i / POW5_TABLE_SIZE;
+  uint32_t const base2      = base * POW5_TABLE_SIZE;
+  uint32_t const offset     = i - base2;
   uint64_t const* const mul = DOUBLE_POW5_SPLIT2[base];
   if (offset == 0) {
     result[0] = mul[0];
@@ -366,9 +401,9 @@ __device__ inline void double_computePow5(uint32_t const i, uint64_t* const resu
   uint64_t const low1 = umul128(m, mul[1], &high1);
   uint64_t high0;
   uint64_t const low0 = umul128(m, mul[0], &high0);
-  uint64_t const sum = high0 + low1;
+  uint64_t const sum  = high0 + low1;
   if (sum < high0) {
-    ++high1; // overflow into high1
+    ++high1;  // overflow into high1
   }
   // high1 | sum | low0
   uint32_t const delta = pow5bits(i) - pow5bits(base2);
@@ -377,11 +412,12 @@ __device__ inline void double_computePow5(uint32_t const i, uint64_t* const resu
 }
 
 // Computes 5^-i in the form required by Ryu, and stores it in the given pointer.
-__device__ inline void double_computeInvPow5(uint32_t const i, uint64_t* const result) {
-  uint32_t const base = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE;
-  uint32_t const base2 = base * POW5_TABLE_SIZE;
-  uint32_t const offset = base2 - i;
-  uint64_t const* const mul = DOUBLE_POW5_INV_SPLIT2[base]; // 1/5^base2
+__device__ inline void double_computeInvPow5(uint32_t const i, uint64_t* const result)
+{
+  uint32_t const base       = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE;
+  uint32_t const base2      = base * POW5_TABLE_SIZE;
+  uint32_t const offset     = base2 - i;
+  uint64_t const* const mul = DOUBLE_POW5_INV_SPLIT2[base];  // 1/5^base2
   if (offset == 0) {
     result[0] = mul[0];
     result[1] = mul[1];
@@ -392,28 +428,32 @@ __device__ inline void double_computeInvPow5(uint32_t const i, uint64_t* const r
   uint64_t const low1 = umul128(m, mul[1], &high1);
   uint64_t high0;
   uint64_t const low0 = umul128(m, mul[0] - 1, &high0);
-  uint64_t const sum = high0 + low1;
+  uint64_t const sum  = high0 + low1;
   if (sum < high0) {
-    ++high1; // overflow into high1
+    ++high1;  // overflow into high1
   }
   // high1 | sum | low0
   uint32_t const delta = pow5bits(base2) - pow5bits(i);
-  result[0] = shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
+  result[0] =
+    shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
   result[1] = shiftright128(sum, high1, delta);
 }
 
 //===== f2s_intrinsics.h from ryu =====
 
-__device__ inline uint32_t mulPow5InvDivPow2(uint32_t const m, uint32_t const q, int32_t const j) {
-  // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double lookup
-  // table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely on the
-  // fact that the added 1 that's already stored in the table never overflows into the upper 64 bits.
+__device__ inline uint32_t mulPow5InvDivPow2(uint32_t const m, uint32_t const q, int32_t const j)
+{
+  // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double
+  // lookup table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely
+  // on the fact that the added 1 that's already stored in the table never overflows into the upper
+  // 64 bits.
   uint64_t pow5[2];
   double_computeInvPow5(q, pow5);
   return mulShift32(m, pow5[1] + 1, j);
 }
 
-__device__ inline uint32_t mulPow5divPow2(uint32_t const m, uint32_t const i, int32_t const j) {
+__device__ inline uint32_t mulPow5divPow2(uint32_t const m, uint32_t const i, int32_t const j)
+{
   uint64_t pow5[2];
   double_computePow5(i, pow5);
   return mulShift32(m, pow5[1], j);
@@ -421,7 +461,8 @@ __device__ inline uint32_t mulPow5divPow2(uint32_t const m, uint32_t const i, in
 
 //===== d2s.c and f2s.c from ryu =====
 
-__device__ inline uint32_t decimalLength17(uint64_t const v) {
+__device__ inline uint32_t decimalLength17(uint64_t const v)
+{
   // This is slightly faster than a loop.
   // The average output length is 16.38 digits, so we check high-to-low.
   // Function precondition: v is not an 18, 19, or 20-digit number.
@@ -446,7 +487,8 @@ __device__ inline uint32_t decimalLength17(uint64_t const v) {
   return 1;
 }
 
-__device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t const ieeeExponent) {
+__device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t const ieeeExponent)
+{
   int32_t e2;
   uint64_t m2;
   if (ieeeExponent == 0) {
@@ -454,10 +496,10 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t
     e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
     m2 = ieeeMantissa;
   } else {
-    e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
+    e2 = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
     m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
   }
-  bool const even = (m2 & 1) == 0;
+  bool const even         = (m2 & 1) == 0;
   bool const acceptBounds = even;
 
   // Step 2: Determine the interval of valid decimal representations.
@@ -477,9 +519,9 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t
     // I tried special-casing q == 0, but there was no effect on performance.
     // This expression is slightly faster than max(0, log10Pow2(e2) - 1).
     uint32_t const q = log10Pow2(e2) - (e2 > 3);
-    e10 = (int32_t) q;
-    int32_t const k = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
-    int32_t const i = -e2 + (int32_t) q + k;
+    e10              = (int32_t)q;
+    int32_t const k  = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1;
+    int32_t const i  = -e2 + (int32_t)q + k;
     uint64_t pow5[2];
     double_computeInvPow5(q, pow5);
     vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift);
@@ -488,7 +530,7 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t
       // This should use q <= 22, but I think 21 is also safe. Smaller values
       // may still be safe, but it's more difficult to reason about them.
       // Only one of mp, mv, and mm can be a multiple of 5, if any.
-      uint32_t const mvMod5 = ((uint32_t) mv) - 5 * ((uint32_t) div5(mv));
+      uint32_t const mvMod5 = ((uint32_t)mv) - 5 * ((uint32_t)div5(mv));
       if (mvMod5 == 0) {
         vrIsTrailingZeros = multipleOfPowerOf5(mv, q);
       } else if (acceptBounds) {
@@ -504,10 +546,10 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t
   } else {
     // This expression is slightly faster than max(0, log10Pow5(-e2) - 1).
     uint32_t const q = log10Pow5(-e2) - (-e2 > 1);
-    e10 = (int32_t) q + e2;
-    int32_t const i = -e2 - (int32_t) q;
-    int32_t const k = pow5bits(i) - DOUBLE_POW5_BITCOUNT;
-    int32_t const j = (int32_t) q - k;
+    e10              = (int32_t)q + e2;
+    int32_t const i  = -e2 - (int32_t)q;
+    int32_t const k  = pow5bits(i) - DOUBLE_POW5_BITCOUNT;
+    int32_t const j  = (int32_t)q - k;
 
     uint64_t pow5[2];
     double_computePow5(i, pow5);
@@ -524,7 +566,7 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t
         // mp = mv + 2, so it always has at least one trailing 0 bit.
         --vp;
       }
-    } else if (q < 63) { // TODO(ulfjack): Use a tighter bound here.
+    } else if (q < 63) {  // TODO(ulfjack): Use a tighter bound here.
       // We want to know if the full product has at least q trailing zeros.
       // We need to compute min(p2(mv), p5(mv) - e2) >= q
       // <=> p2(mv) >= q && p5(mv) - e2 >= q
@@ -534,7 +576,7 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t
   }
 
   // Step 4: Find the shortest decimal representation in the interval of valid representations.
-  int32_t removed = 0;
+  int32_t removed          = 0;
   uint8_t lastRemovedDigit = 0;
   uint64_t output;
   // On average, we remove ~2 digits.
@@ -543,36 +585,32 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t
     for (;;) {
       uint64_t const vpDiv10 = div10(vp);
       uint64_t const vmDiv10 = div10(vm);
-      if (vpDiv10 <= vmDiv10) {
-        break;
-      }
-      uint32_t const vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10);
+      if (vpDiv10 <= vmDiv10) { break; }
+      uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10);
       uint64_t const vrDiv10 = div10(vr);
-      uint32_t const vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
+      uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10);
       vmIsTrailingZeros &= vmMod10 == 0;
       vrIsTrailingZeros &= lastRemovedDigit == 0;
-      lastRemovedDigit = (uint8_t) vrMod10;
-      vr = vrDiv10;
-      vp = vpDiv10;
-      vm = vmDiv10;
+      lastRemovedDigit = (uint8_t)vrMod10;
+      vr               = vrDiv10;
+      vp               = vpDiv10;
+      vm               = vmDiv10;
       ++removed;
     }
 
     if (vmIsTrailingZeros) {
       for (;;) {
         uint64_t const vmDiv10 = div10(vm);
-        uint32_t const vmMod10 = ((uint32_t) vm) - 10 * ((uint32_t) vmDiv10);
-        if (vmMod10 != 0) {
-          break;
-        }
+        uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10);
+        if (vmMod10 != 0) { break; }
         uint64_t const vpDiv10 = div10(vp);
         uint64_t const vrDiv10 = div10(vr);
-        uint32_t const vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
+        uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10);
         vrIsTrailingZeros &= lastRemovedDigit == 0;
-        lastRemovedDigit = (uint8_t) vrMod10;
-        vr = vrDiv10;
-        vp = vpDiv10;
-        vm = vmDiv10;
+        lastRemovedDigit = (uint8_t)vrMod10;
+        vr               = vrDiv10;
+        vp               = vpDiv10;
+        vm               = vmDiv10;
         ++removed;
       }
     }
@@ -585,16 +623,16 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t
     output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
   } else {
     // Specialized for the common case (~99.3%). Percentages below are relative to this.
-    bool roundUp = false;
+    bool roundUp            = false;
     uint64_t const vpDiv100 = div100(vp);
     uint64_t const vmDiv100 = div100(vm);
-    if (vpDiv100 > vmDiv100) { // Optimization: remove two digits at a time (~86.2%).
+    if (vpDiv100 > vmDiv100) {  // Optimization: remove two digits at a time (~86.2%).
       uint64_t const vrDiv100 = div100(vr);
-      uint32_t const vrMod100 = ((uint32_t) vr) - 100 * ((uint32_t) vrDiv100);
-      roundUp = vrMod100 >= 50;
-      vr = vrDiv100;
-      vp = vpDiv100;
-      vm = vmDiv100;
+      uint32_t const vrMod100 = ((uint32_t)vr) - 100 * ((uint32_t)vrDiv100);
+      roundUp                 = vrMod100 >= 50;
+      vr                      = vrDiv100;
+      vp                      = vpDiv100;
+      vm                      = vmDiv100;
       removed += 2;
     }
     // Loop iterations below (approximately), without optimization above:
@@ -604,15 +642,13 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t
     for (;;) {
       uint64_t const vpDiv10 = div10(vp);
       uint64_t const vmDiv10 = div10(vm);
-      if (vpDiv10 <= vmDiv10) {
-        break;
-      }
+      if (vpDiv10 <= vmDiv10) { break; }
       uint64_t const vrDiv10 = div10(vr);
-      uint32_t const vrMod10 = ((uint32_t) vr) - 10 * ((uint32_t) vrDiv10);
-      roundUp = vrMod10 >= 5;
-      vr = vrDiv10;
-      vp = vpDiv10;
-      vm = vmDiv10;
+      uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10);
+      roundUp                = vrMod10 >= 5;
+      vr                     = vrDiv10;
+      vp                     = vpDiv10;
+      vm                     = vmDiv10;
       ++removed;
     }
 
@@ -627,7 +663,8 @@ __device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t
   return fd;
 }
 
-__device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t const ieeeExponent) {
+__device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t const ieeeExponent)
+{
   int32_t e2;
   uint32_t m2;
   if (ieeeExponent == 0) {
@@ -635,10 +672,10 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t
     e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
     m2 = ieeeMantissa;
   } else {
-    e2 = (int32_t) ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
+    e2 = (int32_t)ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
     m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa;
   }
-  bool const even = (m2 & 1) == 0;
+  bool const even         = (m2 & 1) == 0;
   bool const acceptBounds = even;
 
   // Step 2: Determine the interval of valid decimal representations.
@@ -646,28 +683,28 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t
   uint32_t const mp = 4 * m2 + 2;
   // Implicit bool -> int conversion. True is 1, false is 0.
   uint32_t const mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
-  uint32_t const mm = 4 * m2 - 1 - mmShift;
+  uint32_t const mm      = 4 * m2 - 1 - mmShift;
 
   // Step 3: Convert to a decimal power base using 64-bit arithmetic.
   uint32_t vr, vp, vm;
   int32_t e10;
-  bool vmIsTrailingZeros = false;
-  bool vrIsTrailingZeros = false;
+  bool vmIsTrailingZeros   = false;
+  bool vrIsTrailingZeros   = false;
   uint8_t lastRemovedDigit = 0;
   if (e2 >= 0) {
     uint32_t const q = log10Pow2(e2);
-    e10 = (int32_t) q;
-    int32_t const k = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) q) - 1;
-    int32_t const i = -e2 + (int32_t) q + k;
-    vr = mulPow5InvDivPow2(mv, q, i);
-    vp = mulPow5InvDivPow2(mp, q, i);
-    vm = mulPow5InvDivPow2(mm, q, i);
+    e10              = (int32_t)q;
+    int32_t const k  = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1;
+    int32_t const i  = -e2 + (int32_t)q + k;
+    vr               = mulPow5InvDivPow2(mv, q, i);
+    vp               = mulPow5InvDivPow2(mp, q, i);
+    vm               = mulPow5InvDivPow2(mm, q, i);
     if (q != 0 && (vp - 1) / 10 <= vm / 10) {
       // We need to know one removed digit even if we are not going to loop below. We could use
       // q = X - 1 above, except that would require 33 bits for the result, and we've found that
       // 32-bit arithmetic is faster even on 64-bit machines.
-      int32_t const l = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t) (q - 1)) - 1;
-      lastRemovedDigit = (uint8_t) (mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t) q - 1 + l) % 10);
+      int32_t const l  = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)(q - 1)) - 1;
+      lastRemovedDigit = (uint8_t)(mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t)q - 1 + l) % 10);
     }
     if (q <= 9) {
       // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well.
@@ -682,16 +719,16 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t
     }
   } else {
     uint32_t const q = log10Pow5(-e2);
-    e10 = (int32_t) q + e2;
-    int32_t const i = -e2 - (int32_t) q;
-    int32_t const k = pow5bits(i) - FLOAT_POW5_BITCOUNT;
-    int32_t j = (int32_t) q - k;
-    vr = mulPow5divPow2(mv, (uint32_t) i, j);
-    vp = mulPow5divPow2(mp, (uint32_t) i, j);
-    vm = mulPow5divPow2(mm, (uint32_t) i, j);
+    e10              = (int32_t)q + e2;
+    int32_t const i  = -e2 - (int32_t)q;
+    int32_t const k  = pow5bits(i) - FLOAT_POW5_BITCOUNT;
+    int32_t j        = (int32_t)q - k;
+    vr               = mulPow5divPow2(mv, (uint32_t)i, j);
+    vp               = mulPow5divPow2(mp, (uint32_t)i, j);
+    vm               = mulPow5divPow2(mm, (uint32_t)i, j);
     if (q != 0 && (vp - 1) / 10 <= vm / 10) {
-      j = (int32_t) q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT);
-      lastRemovedDigit = (uint8_t) (mulPow5divPow2(mv, (uint32_t) (i + 1), j) % 10);
+      j                = (int32_t)q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT);
+      lastRemovedDigit = (uint8_t)(mulPow5divPow2(mv, (uint32_t)(i + 1), j) % 10);
     }
     if (q <= 1) {
       // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
@@ -704,7 +741,7 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t
         // mp = mv + 2, so it always has at least one trailing 0 bit.
         --vp;
       }
-    } else if (q < 31) { // TODO(ulfjack): Use a tighter bound here.
+    } else if (q < 31) {  // TODO(ulfjack): Use a tighter bound here.
       vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1);
     }
   }
@@ -717,7 +754,7 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t
     while (vp / 10 > vm / 10) {
       vmIsTrailingZeros &= vm % 10 == 0;
       vrIsTrailingZeros &= lastRemovedDigit == 0;
-      lastRemovedDigit = (uint8_t) (vr % 10);
+      lastRemovedDigit = (uint8_t)(vr % 10);
       vr /= 10;
       vp /= 10;
       vm /= 10;
@@ -726,7 +763,7 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t
     if (vmIsTrailingZeros) {
       while (vm % 10 == 0) {
         vrIsTrailingZeros &= lastRemovedDigit == 0;
-        lastRemovedDigit = (uint8_t) (vr % 10);
+        lastRemovedDigit = (uint8_t)(vr % 10);
         vr /= 10;
         vp /= 10;
         vm /= 10;
@@ -744,7 +781,7 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t
     // Loop iterations below (approximately):
     // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01%
     while (vp / 10 > vm / 10) {
-      lastRemovedDigit = (uint8_t) (vr % 10);
+      lastRemovedDigit = (uint8_t)(vr % 10);
       vr /= 10;
       vp /= 10;
       vm /= 10;
@@ -761,45 +798,43 @@ __device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t
   return fd;
 }
 
-__device__ inline int to_chars(floating_decimal_64 const v, bool const sign, char* const result) {
+__device__ inline int to_chars(floating_decimal_64 const v, bool const sign, char* const result)
+{
   // Step 5: Print the decimal representation.
   int index = 0;
-  if (sign) {
-    result[index++] = '-';
-  }
+  if (sign) { result[index++] = '-'; }
 
-  uint64_t output = v.mantissa;
-  uint32_t const olength = decimalLength17(output);
-  int32_t exp = v.exponent + (int32_t) olength - 1;
+  uint64_t output         = v.mantissa;
+  uint32_t const olength  = decimalLength17(output);
+  int32_t exp             = v.exponent + (int32_t)olength - 1;
   bool scientificNotation = (exp < -3) || (exp >= 7);
-  
+
   // Values in the interval [1E-3, 1E7) are special.
   if (scientificNotation) {
     // Print in the format x.xxxxxE-yy.
     for (uint32_t i = 0; i < olength - 1; ++i) {
-      uint32_t const c = output % 10; output /= 10;
-      result[index + olength - i] = (char) ('0' + c);
+      uint32_t const c = output % 10;
+      output /= 10;
+      result[index + olength - i] = (char)('0' + c);
     }
-    result[index] = '0' + output % 10;
+    result[index]     = '0' + output % 10;
     result[index + 1] = '.';
     index += olength + 1;
-    if (olength == 1) {
-      result[index++] = '0';
-    }
+    if (olength == 1) { result[index++] = '0'; }
     // Print 'E', the exponent sign, and the exponent, which has at most three digits.
     result[index++] = 'E';
     if (exp < 0) {
       result[index++] = '-';
-      exp = -exp;
+      exp             = -exp;
     }
     if (exp >= 100) {
-        result[index++] = (char) ('0' + exp / 100);
-        exp %= 100;
-        result[index++] = (char) ('0' + exp / 10);
-      } else if (exp >= 10) {
-        result[index++] = (char) ('0' + exp / 10);
-      }
-      result[index++] = (char) ('0' + exp % 10);
+      result[index++] = (char)('0' + exp / 100);
+      exp %= 100;
+      result[index++] = (char)('0' + exp / 10);
+    } else if (exp >= 10) {
+      result[index++] = (char)('0' + exp / 10);
+    }
+    result[index++] = (char)('0' + exp % 10);
   } else {
     // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
     if (exp < 0) {
@@ -811,14 +846,14 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha
       }
       int current = index;
       for (int i = 0; i < olength; i++) {
-        result[current + olength - i - 1] = (char) ('0' + output % 10);
+        result[current + olength - i - 1] = (char)('0' + output % 10);
         output /= 10;
         index++;
       }
     } else if (exp + 1 >= olength) {
       // Decimal dot is after any of the digits.
       for (int i = 0; i < olength; i++) {
-        result[index + olength - i - 1] = (char) ('0' + output % 10);
+        result[index + olength - i - 1] = (char)('0' + output % 10);
         output /= 10;
       }
       index += olength;
@@ -835,7 +870,7 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha
           result[current + olength - i - 1] = '.';
           current--;
         }
-        result[current + olength - i - 1] = (char) ('0' + output % 10);
+        result[current + olength - i - 1] = (char)('0' + output % 10);
         output /= 10;
       }
       index += olength + 1;
@@ -844,22 +879,19 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha
   return index;
 }
 
-__device__ inline int d2s_size(floating_decimal_64 const v, bool const sign) {
+__device__ inline int d2s_size(floating_decimal_64 const v, bool const sign)
+{
   int index = 0;
-  if (sign) {
-    index++;
-  }
+  if (sign) { index++; }
 
-  uint64_t output = v.mantissa;
-  uint32_t const olength = decimalLength17(output);
-  int32_t exp = v.exponent + (int32_t) olength - 1;
+  uint64_t output         = v.mantissa;
+  uint32_t const olength  = decimalLength17(output);
+  int32_t exp             = v.exponent + (int32_t)olength - 1;
   bool scientificNotation = (exp < -3) || (exp >= 7);
-  
+
   if (scientificNotation) {
     index += olength + 1;
-    if (olength == 1) {
-      index++;
-    }
+    if (olength == 1) { index++; }
     // 'E'
     index++;
     if (exp < 0) {
@@ -886,41 +918,37 @@ __device__ inline int d2s_size(floating_decimal_64 const v, bool const sign) {
   return index;
 }
 
-__device__ inline int to_chars(floating_decimal_32 const v, bool const sign, char* const result) {
+__device__ inline int to_chars(floating_decimal_32 const v, bool const sign, char* const result)
+{
   // Step 5: Print the decimal representation.
   int index = 0;
-  if (sign) {
-    result[index++] = '-';
-  }
+  if (sign) { result[index++] = '-'; }
 
-  uint32_t output = v.mantissa;
-  uint32_t const olength = decimalLength9(output);
-  int32_t exp = v.exponent + olength - 1;
+  uint32_t output         = v.mantissa;
+  uint32_t const olength  = decimalLength9(output);
+  int32_t exp             = v.exponent + olength - 1;
   bool scientificNotation = (exp < -3) || (exp >= 7);
 
   if (scientificNotation) {
     // Print in the format x.xxxxxE-yy.
     for (int i = 0; i < olength - 1; i++) {
-      int c = output % 10; output /= 10;
-      result[index + olength - i] = (char) ('0' + c);
+      int c = output % 10;
+      output /= 10;
+      result[index + olength - i] = (char)('0' + c);
     }
-    result[index] = (char) ('0' + output % 10);
+    result[index]     = (char)('0' + output % 10);
     result[index + 1] = '.';
     index += olength + 1;
-    if (olength == 1) {
-      result[index++] = '0';
-    }
+    if (olength == 1) { result[index++] = '0'; }
 
     // Print 'E', the exponent sign, and the exponent, which has at most two digits.
     result[index++] = 'E';
     if (exp < 0) {
       result[index++] = '-';
-      exp = -exp;
-    }
-    if (exp >= 10) {
-      result[index++] = (char) ('0' + exp / 10);
+      exp             = -exp;
     }
-    result[index++] = (char) ('0' + exp % 10);
+    if (exp >= 10) { result[index++] = (char)('0' + exp / 10); }
+    result[index++] = (char)('0' + exp % 10);
   } else {
     // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
     if (exp < 0) {
@@ -932,14 +960,14 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha
       }
       int current = index;
       for (int i = 0; i < olength; i++) {
-        result[current + olength - i - 1] = (char) ('0' + output % 10);
+        result[current + olength - i - 1] = (char)('0' + output % 10);
         output /= 10;
         index++;
       }
     } else if (exp + 1 >= olength) {
       // Decimal dot is after any of the digits.
       for (int i = 0; i < olength; i++) {
-        result[index + olength - i - 1] = (char) ('0' + output % 10);
+        result[index + olength - i - 1] = (char)('0' + output % 10);
         output /= 10;
       }
       index += olength;
@@ -956,7 +984,7 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha
           result[current + olength - i - 1] = '.';
           current--;
         }
-        result[current + olength - i - 1] = (char) ('0' + output % 10);
+        result[current + olength - i - 1] = (char)('0' + output % 10);
         output /= 10;
       }
       index += olength + 1;
@@ -965,32 +993,27 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha
   return index;
 }
 
-__device__ inline int f2s_size(floating_decimal_32 const v, bool const sign) {
+__device__ inline int f2s_size(floating_decimal_32 const v, bool const sign)
+{
   // Step 5: Print the decimal representation.
   int index = 0;
-  if (sign) {
-    index++;
-  }
+  if (sign) { index++; }
 
-  uint32_t output = v.mantissa;
-  uint32_t const olength = decimalLength9(output);
-  int32_t exp = v.exponent + olength - 1;
+  uint32_t output         = v.mantissa;
+  uint32_t const olength  = decimalLength9(output);
+  int32_t exp             = v.exponent + olength - 1;
   bool scientificNotation = (exp < -3) || (exp >= 7);
 
   if (scientificNotation) {
     index += olength + 1;
-    if (olength == 1) {
-      index++;
-    }
+    if (olength == 1) { index++; }
     // 'E'
     index++;
     if (exp < 0) {
       index++;
       exp = -exp;
     }
-    if (exp >= 10) {
-      index++;
-    }
+    if (exp >= 10) { index++; }
     index++;
   } else {
     // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
@@ -1008,10 +1031,12 @@ __device__ inline int f2s_size(floating_decimal_32 const v, bool const sign) {
   return index;
 }
 
-__device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, uint32_t const ieeeExponent,
-  floating_decimal_64* const v) {
+__device__ inline bool d2d_small_int(uint64_t const ieeeMantissa,
+                                     uint32_t const ieeeExponent,
+                                     floating_decimal_64* const v)
+{
   uint64_t const m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
-  int32_t const e2 = (int32_t) ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS;
+  int32_t const e2  = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS;
 
   if (e2 > 0) {
     // f = m2 * 2^e2 >= 2^53 is an integer.
@@ -1026,11 +1051,9 @@ __device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, uint32_t const
 
   // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53.
   // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0.
-  uint64_t const mask = (1ull << -e2) - 1;
+  uint64_t const mask     = (1ull << -e2) - 1;
   uint64_t const fraction = m2 & mask;
-  if (fraction != 0) {
-    return false;
-  }
+  if (fraction != 0) { return false; }
 
   // f is an integer in the range [1, 2^53).
   // Note: mantissa might contain trailing (decimal) 0's.
@@ -1040,16 +1063,19 @@ __device__ inline bool d2d_small_int(uint64_t const ieeeMantissa, uint32_t const
   return true;
 }
 
-__device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) {
+__device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special)
+{
   // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
   uint64_t const bits = double_to_bits(f);
 
   // Decode bits into sign, mantissa, and exponent.
-  ieeeSign = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0;
+  ieeeSign                    = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0;
   uint64_t const ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1);
-  uint32_t const ieeeExponent = (uint32_t) ((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1));
+  uint32_t const ieeeExponent =
+    (uint32_t)((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1));
   // Case distinction; exit early for the easy cases.
-  if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) {
+  if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) ||
+      (ieeeExponent == 0 && ieeeMantissa == 0)) {
     special = true;
     return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent};
   }
@@ -1063,10 +1089,8 @@ __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) {
     // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.)
     for (;;) {
       uint64_t const q = div10(v.mantissa);
-      uint32_t const r = ((uint32_t) v.mantissa) - 10 * ((uint32_t) q);
-      if (r != 0) {
-        break;
-      }
+      uint32_t const r = ((uint32_t)v.mantissa) - 10 * ((uint32_t)q);
+      if (r != 0) { break; }
       v.mantissa = q;
       ++v.exponent;
     }
@@ -1076,26 +1100,27 @@ __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special) {
   return v;
 }
 
-__device__ int d2s_buffered_n(double f, char* result) {
+__device__ int d2s_buffered_n(double f, char* result)
+{
   bool sign = false, special = false;
   floating_decimal_64 v = d2d(f, sign, special);
-  if (special) {
-    return copy_special_str(result, sign, v.exponent, v.mantissa);
-  }
+  if (special) { return copy_special_str(result, sign, v.exponent, v.mantissa); }
   return to_chars(v, sign, result);
 }
 
-__device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) {
+__device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special)
+{
   // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
   uint32_t const bits = float_to_bits(f);
 
   // Decode bits into sign, mantissa, and exponent.
-  ieeeSign = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0;
+  ieeeSign                    = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0;
   uint32_t const ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1);
   uint32_t const ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1);
 
   // Case distinction; exit early for the easy cases.
-  if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) || (ieeeExponent == 0 && ieeeMantissa == 0)) {
+  if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) ||
+      (ieeeExponent == 0 && ieeeMantissa == 0)) {
     special = true;
     return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent};
   }
@@ -1103,54 +1128,52 @@ __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special) {
   return f2d(ieeeMantissa, ieeeExponent);
 }
 
-__device__ int f2s_buffered_n(float f, char* result) {
+__device__ int f2s_buffered_n(float f, char* result)
+{
   bool sign = false, special = false;
   floating_decimal_32 v = f2d(f, sign, special);
-  if (special) {
-    return copy_special_str(result, sign, v.exponent, v.mantissa);
-  }
+  if (special) { return copy_special_str(result, sign, v.exponent, v.mantissa); }
   return to_chars(v, sign, result);
 }
 
-
 //===== compute float to string size =====
 
-__device__ int compute_d2s_size(double value) {
+__device__ int compute_d2s_size(double value)
+{
   bool sign = false, special = false;
   floating_decimal_64 v = d2d(value, sign, special);
-  if (special) {
-    return special_str_size(sign, v.exponent, v.mantissa);
-  }
+  if (special) { return special_str_size(sign, v.exponent, v.mantissa); }
   return d2s_size(v, sign);
 }
 
-__device__ int compute_f2s_size(float value) {
+__device__ int compute_f2s_size(float value)
+{
   bool sign = false, special = false;
   floating_decimal_32 v = f2d(value, sign, special);
-  if (special) {
-    return special_str_size(sign, v.exponent, v.mantissa);
-  }
+  if (special) { return special_str_size(sign, v.exponent, v.mantissa); }
   return f2s_size(v, sign);
 }
 
-} // namespace 
+}  // namespace
 
 //===== APIs =====
 
-__device__ int compute_ftos_size(double value, bool is_float) {
+__device__ int compute_ftos_size(double value, bool is_float)
+{
   if (is_float) {
-      return compute_f2s_size(value);
+    return compute_f2s_size(value);
   } else {
-      return compute_d2s_size(value);
+    return compute_d2s_size(value);
   }
 }
 
-__device__ int float_to_string(double value, bool is_float, char* output) {
-    if (is_float) {
-        return f2s_buffered_n(value, output);
-    } else {
-        return d2s_buffered_n(value, output);
-    }
+__device__ int float_to_string(double value, bool is_float, char* output)
+{
+  if (is_float) {
+    return f2s_buffered_n(value, output);
+  } else {
+    return d2s_buffered_n(value, output);
+  }
 }
 
-} // namespace spark-rapids-jni::ftos_converter
+}  // namespace spark_rapids_jni::ftos_converter
diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp
index 806b3eaad5..128b5fd592 100644
--- a/src/main/cpp/tests/cast_float_to_string.cpp
+++ b/src/main/cpp/tests/cast_float_to_string.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/strings/convert/convert_floats.hpp>
@@ -35,8 +34,16 @@ struct FloatToStringTests : public cudf::test::BaseFixture {};
 
 TEST_F(FloatToStringTests, FromFloats32)
 {
-  auto const floats = cudf::test::fixed_width_column_wrapper<float> {
-    100.0f, 654321.25f, -12761.125f, 0.f, 5.0f, -4.0f, std::numeric_limits<float>::quiet_NaN(), 123456789012.34f, -0.0f};
+  auto const floats =
+    cudf::test::fixed_width_column_wrapper<float>{100.0f,
+                                                  654321.25f,
+                                                  -12761.125f,
+                                                  0.f,
+                                                  5.0f,
+                                                  -4.0f,
+                                                  std::numeric_limits<float>::quiet_NaN(),
+                                                  123456789012.34f,
+                                                  -0.0f};
 
   auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream());
 
@@ -48,15 +55,32 @@ TEST_F(FloatToStringTests, FromFloats32)
 
 TEST_F(FloatToStringTests, FromFloats64)
 {
-  auto const floats = cudf::test::fixed_width_column_wrapper<double> {
-    100.0d, 654321.25d, -12761.125d, 1.123456789123456789d, 0.000000000000000000123456789123456789d,
-    0.0d, 5.0d, -4.0d, std::numeric_limits<double>::quiet_NaN(), 839542223232.794248339d, -0.0d};
+  auto const floats =
+    cudf::test::fixed_width_column_wrapper<double>{100.0d,
+                                                   654321.25d,
+                                                   -12761.125d,
+                                                   1.123456789123456789d,
+                                                   0.000000000000000000123456789123456789d,
+                                                   0.0d,
+                                                   5.0d,
+                                                   -4.0d,
+                                                   std::numeric_limits<double>::quiet_NaN(),
+                                                   839542223232.794248339d,
+                                                   -0.0d};
 
   auto results = spark_rapids_jni::float_to_string(floats, cudf::get_default_stream());
 
-  auto const expected = cudf::test::strings_column_wrapper{
-    "100.0", "654321.25", "-12761.125", "1.1234567891234568", "1.234567891234568E-19", 
-    "0.0", "5.0", "-4.0", "NaN", "8.395422232327942E11", "-0.0"};
+  auto const expected = cudf::test::strings_column_wrapper{"100.0",
+                                                           "654321.25",
+                                                           "-12761.125",
+                                                           "1.1234567891234568",
+                                                           "1.234567891234568E-19",
+                                                           "0.0",
+                                                           "5.0",
+                                                           "-4.0",
+                                                           "NaN",
+                                                           "8.395422232327942E11",
+                                                           "-0.0"};
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
 }
\ No newline at end of file
diff --git a/thirdparty/cudf b/thirdparty/cudf
index c8074b5176..168533a8ad 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit c8074b5176a74630101c78c43c24b66141352b24
+Subproject commit 168533a8ad4086bd020be4f7bf9264a08b6d2243

From 388cb500f4ee08f02857b15bd2ff6c6799c66388 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 4 Dec 2023 18:05:59 +0800
Subject: [PATCH 41/54] Address comments

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/cast_float_to_string.cu    | 7 -------
 src/main/cpp/tests/cast_float_to_string.cpp | 2 --
 thirdparty/cudf                             | 2 +-
 3 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/src/main/cpp/src/cast_float_to_string.cu b/src/main/cpp/src/cast_float_to_string.cu
index 31d3f69d11..050aaf742f 100644
--- a/src/main/cpp/src/cast_float_to_string.cu
+++ b/src/main/cpp/src/cast_float_to_string.cu
@@ -18,16 +18,9 @@
 #include "ftos_converter.cuh"
 
 #include <cudf/column/column_device_view.cuh>
-#include <cudf/column/column_factories.hpp>
-#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/strings/detail/convert/int_to_string.cuh>
-#include <cudf/strings/detail/converters.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
-#include <cudf/strings/string_view.cuh>
-#include <cudf/strings/strings_column_view.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp
index 128b5fd592..f9f8cd44a6 100644
--- a/src/main/cpp/tests/cast_float_to_string.cpp
+++ b/src/main/cpp/tests/cast_float_to_string.cpp
@@ -18,8 +18,6 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/strings/convert/convert_floats.hpp>
 
diff --git a/thirdparty/cudf b/thirdparty/cudf
index 168533a8ad..c8074b5176 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 168533a8ad4086bd020be4f7bf9264a08b6d2243
+Subproject commit c8074b5176a74630101c78c43c24b66141352b24

From 54fa73cbed0840a19b0046ef0f670028cf8d1056 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Mon, 4 Dec 2023 18:12:21 +0800
Subject: [PATCH 42/54] Address comments

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/tests/cast_float_to_string.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/main/cpp/tests/cast_float_to_string.cpp b/src/main/cpp/tests/cast_float_to_string.cpp
index f9f8cd44a6..ac2b2c0e24 100644
--- a/src/main/cpp/tests/cast_float_to_string.cpp
+++ b/src/main/cpp/tests/cast_float_to_string.cpp
@@ -19,8 +19,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 
-#include <cudf/strings/convert/convert_floats.hpp>
-
 #include <limits>
 #include <rmm/device_uvector.hpp>
 

From 3d19638d83336f438bb30e142f1ec4567f7c7802 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Thu, 7 Dec 2023 17:26:52 +0800
Subject: [PATCH 43/54] address comments

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 .../cpp/benchmarks/cast_string_to_float.cpp   |  2 +-
 src/main/cpp/src/ftos_converter.cuh           | 54 +++++++++----------
 src/main/cpp/tests/cast_decimal_to_string.cpp |  2 +-
 src/main/cpp/tests/cast_string.cpp            |  2 +-
 4 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/src/main/cpp/benchmarks/cast_string_to_float.cpp b/src/main/cpp/benchmarks/cast_string_to_float.cpp
index 32e245aa98..d94f9d26a0 100644
--- a/src/main/cpp/benchmarks/cast_string_to_float.cpp
+++ b/src/main/cpp/benchmarks/cast_string_to_float.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "cast_string.hpp"
+#include <cast_string.hpp>
 
 #include <benchmarks/common/generate_input.hpp>
 
diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh
index 10b99b4d7e..df1772bc99 100644
--- a/src/main/cpp/src/ftos_converter.cuh
+++ b/src/main/cpp/src/ftos_converter.cuh
@@ -116,34 +116,32 @@ __constant__ uint32_t const POW5_OFFSETS[21] = {
 
 constexpr uint32_t POW5_TABLE_SIZE = 26;
 
-__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {
-  1ull,
-  5ull,
-  25ull,
-  125ull,
-  625ull,
-  3125ull,
-  15625ull,
-  78125ull,
-  390625ull,
-  1953125ull,
-  9765625ull,
-  48828125ull,
-  244140625ull,
-  1220703125ull,
-  6103515625ull,
-  30517578125ull,
-  152587890625ull,
-  762939453125ull,
-  3814697265625ull,
-  19073486328125ull,
-  95367431640625ull,
-  476837158203125ull,
-  2384185791015625ull,
-  11920928955078125ull,
-  59604644775390625ull,
-  298023223876953125ull  //, 1490116119384765625ull
-};
+__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {1ull,
+                                                                  5ull,
+                                                                  25ull,
+                                                                  125ull,
+                                                                  625ull,
+                                                                  3125ull,
+                                                                  15625ull,
+                                                                  78125ull,
+                                                                  390625ull,
+                                                                  1953125ull,
+                                                                  9765625ull,
+                                                                  48828125ull,
+                                                                  244140625ull,
+                                                                  1220703125ull,
+                                                                  6103515625ull,
+                                                                  30517578125ull,
+                                                                  152587890625ull,
+                                                                  762939453125ull,
+                                                                  3814697265625ull,
+                                                                  19073486328125ull,
+                                                                  95367431640625ull,
+                                                                  476837158203125ull,
+                                                                  2384185791015625ull,
+                                                                  11920928955078125ull,
+                                                                  59604644775390625ull,
+                                                                  298023223876953125ull};
 
 //===== common.h from ryu =====
 
diff --git a/src/main/cpp/tests/cast_decimal_to_string.cpp b/src/main/cpp/tests/cast_decimal_to_string.cpp
index 05002c373c..ba1aaf05c8 100644
--- a/src/main/cpp/tests/cast_decimal_to_string.cpp
+++ b/src/main/cpp/tests/cast_decimal_to_string.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "cast_string.hpp"
+#include <cast_string.hpp>
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
diff --git a/src/main/cpp/tests/cast_string.cpp b/src/main/cpp/tests/cast_string.cpp
index 0a3f221894..1f7aaaad21 100644
--- a/src/main/cpp/tests/cast_string.cpp
+++ b/src/main/cpp/tests/cast_string.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "cast_string.hpp"
+#include <cast_string.hpp>
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>

From 8d02a3f6a23255580c2fc51483d78544e8fb6a86 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Fri, 8 Dec 2023 15:01:41 +0800
Subject: [PATCH 44/54] fix build after upmerge

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/ftos_converter.cuh | 1202 +--------------------------
 1 file changed, 12 insertions(+), 1190 deletions(-)

diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh
index 09c58b88fb..fe71df924f 100644
--- a/src/main/cpp/src/ftos_converter.cuh
+++ b/src/main/cpp/src/ftos_converter.cuh
@@ -15,6 +15,8 @@
  * limitations under the License.
  */
 
+#pragma once
+
 #include <cuda/std/cassert>
 #include <cuda/std/climits>
 #include <cuda/std/cstdint>
@@ -1074,7 +1076,7 @@ __device__ inline bool d2d_small_int(uint64_t const ieeeMantissa,
   return true;
 }
 
-__device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special)
+__device__ inline floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special)
 {
   // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
   uint64_t const bits = double_to_bits(f);
@@ -1111,7 +1113,7 @@ __device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special)
   return v;
 }
 
-__device__ int d2s_buffered_n(double f, char* result)
+__device__ inline int d2s_buffered_n(double f, char* result)
 {
   bool sign = false, special = false;
   floating_decimal_64 v = d2d(f, sign, special);
@@ -1119,7 +1121,7 @@ __device__ int d2s_buffered_n(double f, char* result)
   return to_chars(v, sign, result);
 }
 
-__device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special)
+__device__ inline floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special)
 {
   // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
   uint32_t const bits = float_to_bits(f);
@@ -1139,7 +1141,7 @@ __device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special)
   return f2d(ieeeMantissa, ieeeExponent);
 }
 
-__device__ int f2s_buffered_n(float f, char* result)
+__device__ inline int f2s_buffered_n(float f, char* result)
 {
   bool sign = false, special = false;
   floating_decimal_32 v = f2d(f, sign, special);
@@ -1149,7 +1151,7 @@ __device__ int f2s_buffered_n(float f, char* result)
 
 //===== compute float to string size =====
 
-__device__ int compute_d2s_size(double value)
+__device__ inline int compute_d2s_size(double value)
 {
   bool sign = false, special = false;
   floating_decimal_64 v = d2d(value, sign, special);
@@ -1157,7 +1159,7 @@ __device__ int compute_d2s_size(double value)
   return d2s_size(v, sign);
 }
 
-__device__ int compute_f2s_size(float value)
+__device__ inline int compute_f2s_size(float value)
 {
   bool sign = false, special = false;
   floating_decimal_32 v = f2d(value, sign, special);
@@ -1169,7 +1171,7 @@ __device__ int compute_f2s_size(float value)
 
 //===== APIs =====
 
-__device__ int compute_ftos_size(double value, bool is_float)
+__device__ inline int compute_ftos_size(double value, bool is_float)
 {
   if (is_float) {
     return compute_f2s_size(value);
@@ -1178,7 +1180,7 @@ __device__ int compute_ftos_size(double value, bool is_float)
   }
 }
 
-__device__ int float_to_string(double value, bool is_float, char* output)
+__device__ inline int float_to_string(double value, bool is_float, char* output)
 {
   if (is_float) {
     return f2s_buffered_n(value, output);
@@ -1502,7 +1504,7 @@ __device__ inline int format_float_size(floating_decimal_32 const v, bool const
   return index;
 }
 
-__device__ int compute_format_float_size(double value, int digits, bool is_float)
+__device__ inline int compute_format_float_size(double value, int digits, bool is_float)
 {
   bool sign = false, special = false;
   if (is_float) {
@@ -1516,7 +1518,7 @@ __device__ int compute_format_float_size(double value, int digits, bool is_float
   }
 }
 
-__device__ int format_float(double value, int digits, bool is_float, char* output)
+__device__ inline int format_float(double value, int digits, bool is_float, char* output)
 {
   bool sign = false, special = false;
   if (is_float) {
@@ -1531,1183 +1533,3 @@ __device__ int format_float(double value, int digits, bool is_float, char* outpu
 }
 
 }  // namespace spark_rapids_jni::ftos_converter
-
-/*
- * Copyright 2018 Ulf Adams
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cuda/std/cassert>
-#include <cuda/std/climits>
-#include <cuda/std/cstdint>
-#include <cuda/std/limits>
-#include <cuda/std/type_traits>
-
-namespace spark_rapids_jni::ftos_converter {
-
-namespace {
-
-// d2s.c from ryu
-// A floating decimal representing m * 10^e.
-typedef struct floating_decimal_64 {
-  uint64_t mantissa;
-  // Decimal exponent's range is -324 to 308
-  // inclusive, and can fit in a short if needed.
-  int32_t exponent;
-} floating_decimal_64;
-
-// f2s.c from ryu
-// A floating decimal representing m * 10^e.
-typedef struct floating_decimal_32 {
-  uint32_t mantissa;
-  // Decimal exponent's range is -45 to 38
-  // inclusive, and can fit in a short if needed.
-  int32_t exponent;
-} floating_decimal_32;
-
-//===== constants from ryu =====
-
-// These tables are generated by PrintDoubleLookupTable.
-constexpr unsigned int DOUBLE_POW5_INV_BITCOUNT = 125;
-constexpr unsigned int DOUBLE_POW5_BITCOUNT     = 125;
-constexpr unsigned int FLOAT_POW5_INV_BITCOUNT  = (DOUBLE_POW5_INV_BITCOUNT - 64);
-constexpr unsigned int FLOAT_POW5_BITCOUNT      = (DOUBLE_POW5_BITCOUNT - 64);
-constexpr unsigned int DOUBLE_MANTISSA_BITS     = 52;
-constexpr unsigned int DOUBLE_EXPONENT_BITS     = 11;
-constexpr unsigned int DOUBLE_BIAS              = 1023;
-constexpr unsigned int FLOAT_MANTISSA_BITS      = 23;
-constexpr unsigned int FLOAT_EXPONENT_BITS      = 8;
-constexpr unsigned int FLOAT_BIAS               = 127;
-
-__constant__ uint64_t const DOUBLE_POW5_INV_SPLIT2[15][2] = {
-  {1u, 2305843009213693952u},
-  {5955668970331000884u, 1784059615882449851u},
-  {8982663654677661702u, 1380349269358112757u},
-  {7286864317269821294u, 2135987035920910082u},
-  {7005857020398200553u, 1652639921975621497u},
-  {17965325103354776697u, 1278668206209430417u},
-  {8928596168509315048u, 1978643211784836272u},
-  {10075671573058298858u, 1530901034580419511u},
-  {597001226353042382u, 1184477304306571148u},
-  {1527430471115325346u, 1832889850782397517u},
-  {12533209867169019542u, 1418129833677084982u},
-  {5577825024675947042u, 2194449627517475473u},
-  {11006974540203867551u, 1697873161311732311u},
-  {10313493231639821582u, 1313665730009899186u},
-  {12701016819766672773u, 2032799256770390445u}};
-
-__constant__ uint32_t const POW5_INV_OFFSETS[19] = {0x54544554,
-                                                    0x04055545,
-                                                    0x10041000,
-                                                    0x00400414,
-                                                    0x40010000,
-                                                    0x41155555,
-                                                    0x00000454,
-                                                    0x00010044,
-                                                    0x40000000,
-                                                    0x44000041,
-                                                    0x50454450,
-                                                    0x55550054,
-                                                    0x51655554,
-                                                    0x40004000,
-                                                    0x01000001,
-                                                    0x00010500,
-                                                    0x51515411,
-                                                    0x05555554,
-                                                    0x00000000};
-
-__constant__ uint64_t const DOUBLE_POW5_SPLIT2[13][2] = {
-  {0u, 1152921504606846976u},
-  {0u, 1490116119384765625u},
-  {1032610780636961552u, 1925929944387235853u},
-  {7910200175544436838u, 1244603055572228341u},
-  {16941905809032713930u, 1608611746708759036u},
-  {13024893955298202172u, 2079081953128979843u},
-  {6607496772837067824u, 1343575221513417750u},
-  {17332926989895652603u, 1736530273035216783u},
-  {13037379183483547984u, 2244412773384604712u},
-  {1605989338741628675u, 1450417759929778918u},
-  {9630225068416591280u, 1874621017369538693u},
-  {665883850346957067u, 1211445438634777304u},
-  {14931890668723713708u, 1565756531257009982u}};
-
-__constant__ uint32_t const POW5_OFFSETS[21] = {
-  0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995, 0x55545555,
-  0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540, 0x45555550, 0x40004000,
-  0x96440440, 0x55565565, 0x54454045, 0x40154151, 0x55559155, 0x51405555, 0x00000105};
-
-constexpr uint32_t POW5_TABLE_SIZE = 26;
-
-__constant__ uint64_t const DOUBLE_POW5_TABLE[POW5_TABLE_SIZE] = {
-  1ull,
-  5ull,
-  25ull,
-  125ull,
-  625ull,
-  3125ull,
-  15625ull,
-  78125ull,
-  390625ull,
-  1953125ull,
-  9765625ull,
-  48828125ull,
-  244140625ull,
-  1220703125ull,
-  6103515625ull,
-  30517578125ull,
-  152587890625ull,
-  762939453125ull,
-  3814697265625ull,
-  19073486328125ull,
-  95367431640625ull,
-  476837158203125ull,
-  2384185791015625ull,
-  11920928955078125ull,
-  59604644775390625ull,
-  298023223876953125ull  //, 1490116119384765625ull
-};
-
-//===== common.h from ryu =====
-
-// Returns the number of decimal digits in v, which must not contain more than 9 digits.
-__device__ inline uint32_t decimalLength9(uint32_t const v)
-{
-  // Function precondition: v is not a 10-digit number.
-  // (f2s: 9 digits are sufficient for round-tripping.)
-  // (d2fixed: We print 9-digit blocks.)
-  assert(v < 1000000000);
-  if (v >= 100000000) { return 9; }
-  if (v >= 10000000) { return 8; }
-  if (v >= 1000000) { return 7; }
-  if (v >= 100000) { return 6; }
-  if (v >= 10000) { return 5; }
-  if (v >= 1000) { return 4; }
-  if (v >= 100) { return 3; }
-  if (v >= 10) { return 2; }
-  return 1;
-}
-
-// Returns e == 0 ? 1 : ceil(log_2(5^e)); requires 0 <= e <= 3528.
-__device__ inline int32_t pow5bits(int32_t const e)
-{
-  // This approximation works up to the point that the multiplication overflows at e = 3529.
-  // If the multiplication were done in 64 bits, it would fail at 5^4004 which is just greater
-  // than 2^9297.
-  assert(e >= 0);
-  assert(e <= 3528);
-  return (int32_t)(((((uint32_t)e) * 1217359) >> 19) + 1);  // 1217359 / 2^19 ~= log2(5)
-}
-
-// Returns floor(log_10(2^e)); requires 0 <= e <= 1650.
-__device__ inline uint32_t log10Pow2(int32_t const e)
-{
-  // The first value this approximation fails for is 2^1651 which is just greater than 10^297.
-  assert(e >= 0);
-  assert(e <= 1650);
-  return (((uint32_t)e) * 78913) >> 18;  // 78913 / 2^18 ~= log10(2)
-}
-
-// Returns floor(log_10(5^e)); requires 0 <= e <= 2620.
-__device__ inline uint32_t log10Pow5(int32_t const e)
-{
-  // The first value this approximation fails for is 5^2621 which is just greater than 10^1832.
-  assert(e >= 0);
-  assert(e <= 2620);
-  return (((uint32_t)e) * 732923) >> 20;  // 732923 / 2^20 ~= log10(5)
-}
-
-__device__ inline uint32_t pow5factor_32(uint32_t value)
-{  // Returns how many times 5 divides value exactly; value must be nonzero.
-  uint32_t count = 0;
-  for (;;) {
-    assert(value != 0);  // 0 is divisible by 5 forever, so the loop would never exit
-    uint32_t const q = value / 5;
-    uint32_t const r = value % 5;
-    if (r != 0) { break; }  // no further factor of 5
-    value = q;
-    ++count;
-  }
-  return count;
-}
-
-// Returns true if value is divisible by 5^p.
-__device__ inline bool multipleOfPowerOf5_32(uint32_t const value, uint32_t const p)
-{  // Holds iff value carries at least p factors of 5; value must be nonzero (see pow5factor_32).
-  return pow5factor_32(value) >= p;
-}
-
-// Returns true if value is divisible by 2^p.
-__device__ inline bool multipleOfPowerOf2_32(uint32_t const value, uint32_t const p)
-{
-  // __builtin_ctz doesn't appear to be faster here.
-  return (value & ((1u << p) - 1)) == 0;  // low p bits all zero; the shift needs p < 32
-}
-
-// It seems to be slightly faster to avoid uint128_t here, although the
-// generated code for uint128_t looks slightly nicer.
-__device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, int32_t const shift)
-{  // Returns (m * factor) >> shift, built from two 32x32 -> 64-bit partial products.
-  assert(shift > 32);
-
-  // The casts here help MSVC to avoid calls to the __allmul library
-  // function.
-  uint32_t const factorLo = (uint32_t)(factor);
-  uint32_t const factorHi = (uint32_t)(factor >> 32);
-  uint64_t const bits0    = (uint64_t)m * factorLo;
-  uint64_t const bits1    = (uint64_t)m * factorHi;
-
-  uint64_t const sum        = (bits0 >> 32) + bits1;  // == (m * factor) >> 32
-  uint64_t const shiftedSum = sum >> (shift - 32);    // remaining part of the shift
-  assert(shiftedSum <= UINT32_MAX);
-  return (uint32_t)shiftedSum;
-}
-
-__device__ inline int copy_special_str(char* const result,
-                                       bool const sign,
-                                       bool const exponent,
-                                       bool const mantissa)
-{  // Writes "NaN", "[-]Infinity" or "[-]0.0" (no terminator) and returns the byte count.
-  if (mantissa) {  // checked first, so the sign is never printed for NaN
-    memcpy(result, "NaN", 3);
-    return 3;
-  }
-  if (sign) { result[0] = '-'; }  // bool sign also serves as the 0/1 offset below
-  if (exponent) {
-    memcpy(result + sign, "Infinity", 8);
-    return sign + 8;
-  }
-  memcpy(result + sign, "0.0", 3);
-  return sign + 3;
-}
-
-__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa)
-{  // Returns the byte count copy_special_str writes for the same three flags.
-  if (mantissa) { return 3; }         // "NaN"
-  if (exponent) { return sign + 8; }  // "[-]Infinity"
-  return sign + 3;                    // "[-]0.0"
-}
-
-__device__ inline uint32_t float_to_bits(float const f)
-{  // Returns the raw 32-bit pattern of f.
-  uint32_t bits = 0;
-  memcpy(&bits, &f, sizeof(float));  // byte copy instead of a pointer-cast reinterpretation
-  return bits;
-}
-
-__device__ inline uint64_t double_to_bits(double const d)
-{  // Returns the raw 64-bit pattern of d.
-  uint64_t bits = 0;
-  memcpy(&bits, &d, sizeof(double));  // byte copy instead of a pointer-cast reinterpretation
-  return bits;
-}
-
-//===== d2s_intrinsics.h from ryu =====
-
-__device__ inline uint64_t umul128(uint64_t const a, uint64_t const b, uint64_t* const productHi)
-{  // Full 64x64 -> 128-bit multiply: returns the low 64 bits, stores the high 64 in *productHi.
-  // The casts here help MSVC to avoid calls to the __allmul library function.
-  uint32_t const aLo = (uint32_t)a;
-  uint32_t const aHi = (uint32_t)(a >> 32);
-  uint32_t const bLo = (uint32_t)b;
-  uint32_t const bHi = (uint32_t)(b >> 32);
-
-  uint64_t const b00 = (uint64_t)aLo * bLo;  // weight 2^0
-  uint64_t const b01 = (uint64_t)aLo * bHi;  // weight 2^32
-  uint64_t const b10 = (uint64_t)aHi * bLo;  // weight 2^32
-  uint64_t const b11 = (uint64_t)aHi * bHi;  // weight 2^64
-
-  uint32_t const b00Lo = (uint32_t)b00;
-  uint32_t const b00Hi = (uint32_t)(b00 >> 32);
-
-  uint64_t const mid1   = b10 + b00Hi;  // a 32x32 product plus a 32-bit value fits in 64 bits
-  uint32_t const mid1Lo = (uint32_t)(mid1);
-  uint32_t const mid1Hi = (uint32_t)(mid1 >> 32);
-
-  uint64_t const mid2   = b01 + mid1Lo;  // same bound as mid1
-  uint32_t const mid2Lo = (uint32_t)(mid2);
-  uint32_t const mid2Hi = (uint32_t)(mid2 >> 32);
-
-  uint64_t const pHi = b11 + mid1Hi + mid2Hi;  // carries out of both middle sums
-  uint64_t const pLo = ((uint64_t)mid2Lo << 32) | b00Lo;
-
-  *productHi = pHi;
-  return pLo;
-}
-
-__device__ inline uint64_t shiftright128(uint64_t const lo, uint64_t const hi, uint32_t const dist)
-{  // Returns bits [dist, dist + 64) of the 128-bit value hi:lo.
-  // dist must lie strictly between 0 and 64: dist == 0 would shift hi by 64 bits below.
-  assert(dist < 64);
-  assert(dist > 0);
-  return (hi << (64 - dist)) | (lo >> dist);
-}
-
-__device__ inline uint64_t div5(uint64_t const x) { return x / 5; }  // truncating x / 5
-
-__device__ inline uint64_t div10(uint64_t const x) { return x / 10; }  // truncating x / 10
-
-__device__ inline uint64_t div100(uint64_t const x) { return x / 100; }  // truncating x / 100
-
-__device__ inline uint32_t pow5Factor(uint64_t value)
-{  // Counts the factors of 5 in value with modular-inverse multiplies instead of divisions.
-  uint64_t const m_inv_5 = 14757395258967641293u;  // 5 * m_inv_5 = 1 (mod 2^64)
-  uint64_t const n_div_5 = 3689348814741910323u;   // (2^64 - 1) / 5: largest quotient of a multiple
-  uint32_t count         = 0;
-  for (;;) {
-    assert(value != 0);
-    value *= m_inv_5;  // equals value / 5 whenever 5 divides value
-    if (value > n_div_5) break;  // product above the quotient range: value was not a multiple of 5
-    ++count;
-  }
-  return count;
-}
-
-// Returns true if value is divisible by 5^p.
-__device__ inline bool multipleOfPowerOf5(uint64_t const value, uint32_t const p)
-{  // value must be nonzero (pow5Factor asserts it).
-  // I tried a case distinction on p, but there was no performance difference.
-  return pow5Factor(value) >= p;
-}
-
-// Returns true if value is divisible by 2^p.
-__device__ inline bool multipleOfPowerOf2(uint64_t const value, uint32_t const p)
-{
-  assert(value != 0);
-  assert(p < 64);
-  // __builtin_ctzll doesn't appear to be faster here.
-  return (value & ((1ull << p) - 1)) == 0;  // low p bits all zero
-}
-
-__device__ inline uint64_t mulShift64(uint64_t const m, uint64_t const* const mul, int32_t const j)
-{  // Returns the low 64 bits of (m * (mul[1]:mul[0])) >> j; shiftright128 needs 64 < j < 128.
-  // m is maximum 55 bits
-  uint64_t high1;                                    // product bits from 128 up
-  uint64_t const low1 = umul128(m, mul[1], &high1);  // product bits from 64 up
-  uint64_t high0;                                    // product bits from 64 up
-  umul128(m, mul[0], &high0);                        // bits 0..63 are discarded
-  uint64_t const sum = high0 + low1;
-  if (sum < high0) {
-    ++high1;  // overflow into high1
-  }
-  return shiftright128(sum, high1, j - 64);
-}
-
-__device__ inline uint64_t mulShiftAll64(uint64_t const m,
-                                         uint64_t const* const mul,
-                                         int32_t const j,
-                                         uint64_t* const vp,
-                                         uint64_t* const vm,
-                                         uint32_t const mmShift)
-{  // Scales 4m (returned), 4m + 2 (*vp) and 4m - 1 - mmShift (*vm) with the same mul and j.
-  *vp = mulShift64(4 * m + 2, mul, j);
-  *vm = mulShift64(4 * m - 1 - mmShift, mul, j);
-  return mulShift64(4 * m, mul, j);
-}
-
-//===== d2s_small_table.h from ryu =====
-
-// Computes 5^i in the form required by Ryu, and stores it in the given pointer.
-__device__ inline void double_computePow5(uint32_t const i, uint64_t* const result)
-{  // result[0] receives the low 64 bits and result[1] the high 64 bits.
-  uint32_t const base       = i / POW5_TABLE_SIZE;
-  uint32_t const base2      = base * POW5_TABLE_SIZE;  // largest multiple of the table size <= i
-  uint32_t const offset     = i - base2;
-  uint64_t const* const mul = DOUBLE_POW5_SPLIT2[base];
-  if (offset == 0) {  // i is a multiple of POW5_TABLE_SIZE: the table row is used unchanged
-    result[0] = mul[0];
-    result[1] = mul[1];
-    return;
-  }
-  uint64_t const m = DOUBLE_POW5_TABLE[offset];  // 5^offset
-  uint64_t high1;
-  uint64_t const low1 = umul128(m, mul[1], &high1);
-  uint64_t high0;
-  uint64_t const low0 = umul128(m, mul[0], &high0);
-  uint64_t const sum  = high0 + low1;
-  if (sum < high0) {
-    ++high1;  // overflow into high1
-  }
-  // high1 | sum | low0
-  uint32_t const delta = pow5bits(i) - pow5bits(base2);  // extra bits gained by the multiply
-  result[0] = shiftright128(low0, sum, delta) + ((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
-  result[1] = shiftright128(sum, high1, delta);
-}
-
-// Computes 5^-i in the form required by Ryu, and stores it in the given pointer.
-__device__ inline void double_computeInvPow5(uint32_t const i, uint64_t* const result)
-{  // result[0] receives the low 64 bits and result[1] the high 64 bits.
-  uint32_t const base       = (i + POW5_TABLE_SIZE - 1) / POW5_TABLE_SIZE;  // rounds up
-  uint32_t const base2      = base * POW5_TABLE_SIZE;  // smallest multiple of the table size >= i
-  uint32_t const offset     = base2 - i;
-  uint64_t const* const mul = DOUBLE_POW5_INV_SPLIT2[base];  // 1/5^base2
-  if (offset == 0) {  // i is a multiple of POW5_TABLE_SIZE: the table row is used unchanged
-    result[0] = mul[0];
-    result[1] = mul[1];
-    return;
-  }
-  uint64_t const m = DOUBLE_POW5_TABLE[offset];  // 5^offset
-  uint64_t high1;
-  uint64_t const low1 = umul128(m, mul[1], &high1);
-  uint64_t high0;
-  uint64_t const low0 = umul128(m, mul[0] - 1, &high0);  // 1 is added back into result[0] below
-  uint64_t const sum  = high0 + low1;
-  if (sum < high0) {
-    ++high1;  // overflow into high1
-  }
-  // high1 | sum | low0
-  uint32_t const delta = pow5bits(base2) - pow5bits(i);
-  result[0] =
-    shiftright128(low0, sum, delta) + 1 + ((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
-  result[1] = shiftright128(sum, high1, delta);
-}
-
-//===== f2s_intrinsics.h from ryu =====
-
-__device__ inline uint32_t mulPow5InvDivPow2(uint32_t const m, uint32_t const q, int32_t const j)
-{  // Multiplies m by the high word of the 5^-q table value (plus 1) and shifts right by j.
-  // The inverse multipliers are defined as [2^x / 5^y] + 1; the upper 64 bits from the double
-  // lookup table are the correct bits for [2^x / 5^y], so we have to add 1 here. Note that we rely
-  // on the fact that the added 1 that's already stored in the table never overflows into the upper
-  // 64 bits.
-  uint64_t pow5[2];
-  double_computeInvPow5(q, pow5);
-  return mulShift32(m, pow5[1] + 1, j);
-}
-
-__device__ inline uint32_t mulPow5divPow2(uint32_t const m, uint32_t const i, int32_t const j)
-{  // Multiplies m by the high word of the 5^i table value and shifts right by j.
-  uint64_t pow5[2];
-  double_computePow5(i, pow5);
-  return mulShift32(m, pow5[1], j);  // only the high 64 bits of the table value are used
-}
-
-//===== d2s.c and f2s.c from ryu =====
-
-__device__ inline uint32_t decimalLength17(uint64_t const v)
-{  // Returns the number of decimal digits of v, from 1 to 17.
-  // This is slightly faster than a loop.
-  // The average output length is 16.38 digits, so we check high-to-low.
-  // Function precondition: v is not an 18, 19, or 20-digit number.
-  // (17 digits are sufficient for round-tripping.)
-  assert(v < 100000000000000000L);
-  if (v >= 10000000000000000L) { return 17; }
-  if (v >= 1000000000000000L) { return 16; }
-  if (v >= 100000000000000L) { return 15; }
-  if (v >= 10000000000000L) { return 14; }
-  if (v >= 1000000000000L) { return 13; }
-  if (v >= 100000000000L) { return 12; }
-  if (v >= 10000000000L) { return 11; }
-  if (v >= 1000000000L) { return 10; }
-  if (v >= 100000000L) { return 9; }
-  if (v >= 10000000L) { return 8; }
-  if (v >= 1000000L) { return 7; }
-  if (v >= 100000L) { return 6; }
-  if (v >= 10000L) { return 5; }
-  if (v >= 1000L) { return 4; }
-  if (v >= 100L) { return 3; }
-  if (v >= 10L) { return 2; }
-  return 1;  // v < 10, which includes v == 0
-}
-
-__device__ inline floating_decimal_64 d2d(uint64_t const ieeeMantissa, uint32_t const ieeeExponent)
-{  // Converts decoded double fields into a decimal mantissa/exponent pair (steps 2-4 below).
-  int32_t e2;
-  uint64_t m2;
-  if (ieeeExponent == 0) {
-    // We subtract 2 so that the bounds computation has 2 additional bits.
-    e2 = 1 - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
-    m2 = ieeeMantissa;  // zero exponent field: no implicit leading bit is added
-  } else {
-    e2 = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS - 2;
-    m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;  // add the implicit leading bit
-  }
-  bool const even         = (m2 & 1) == 0;
-  bool const acceptBounds = even;
-
-  // Step 2: Determine the interval of valid decimal representations.
-  uint64_t const mv = 4 * m2;
-  // Implicit bool -> int conversion. True is 1, false is 0.
-  uint32_t const mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
-  // We would compute mp and mm like this:
-  // uint64_t mp = 4 * m2 + 2;
-  // uint64_t mm = mv - 1 - mmShift;
-
-  // Step 3: Convert to a decimal power base using 128-bit arithmetic.
-  uint64_t vr, vp, vm;
-  int32_t e10;
-  bool vmIsTrailingZeros = false;
-  bool vrIsTrailingZeros = false;
-  if (e2 >= 0) {
-    // I tried special-casing q == 0, but there was no effect on performance.
-    // This expression is slightly faster than max(0, log10Pow2(e2) - 1).
-    uint32_t const q = log10Pow2(e2) - (e2 > 3);
-    e10              = (int32_t)q;
-    int32_t const k  = DOUBLE_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1;
-    int32_t const i  = -e2 + (int32_t)q + k;
-    uint64_t pow5[2];
-    double_computeInvPow5(q, pow5);
-    vr = mulShiftAll64(m2, pow5, i, &vp, &vm, mmShift);
-
-    if (q <= 21) {
-      // This should use q <= 22, but I think 21 is also safe. Smaller values
-      // may still be safe, but it's more difficult to reason about them.
-      // Only one of mp, mv, and mm can be a multiple of 5, if any.
-      uint32_t const mvMod5 = ((uint32_t)mv) - 5 * ((uint32_t)div5(mv));
-      if (mvMod5 == 0) {
-        vrIsTrailingZeros = multipleOfPowerOf5(mv, q);
-      } else if (acceptBounds) {
-        // Same as min(e2 + (~mm & 1), pow5Factor(mm)) >= q
-        // <=> e2 + (~mm & 1) >= q && pow5Factor(mm) >= q
-        // <=> true && pow5Factor(mm) >= q, since e2 >= q.
-        vmIsTrailingZeros = multipleOfPowerOf5(mv - 1 - mmShift, q);
-      } else {
-        // Same as min(e2 + 1, pow5Factor(mp)) >= q.
-        vp -= multipleOfPowerOf5(mv + 2, q);
-      }
-    }
-  } else {
-    // This expression is slightly faster than max(0, log10Pow5(-e2) - 1).
-    uint32_t const q = log10Pow5(-e2) - (-e2 > 1);
-    e10              = (int32_t)q + e2;
-    int32_t const i  = -e2 - (int32_t)q;
-    int32_t const k  = pow5bits(i) - DOUBLE_POW5_BITCOUNT;
-    int32_t const j  = (int32_t)q - k;
-
-    uint64_t pow5[2];
-    double_computePow5(i, pow5);
-    vr = mulShiftAll64(m2, pow5, j, &vp, &vm, mmShift);
-
-    if (q <= 1) {
-      // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
-      // mv = 4 * m2, so it always has at least two trailing 0 bits.
-      vrIsTrailingZeros = true;
-      if (acceptBounds) {
-        // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
-        vmIsTrailingZeros = mmShift == 1;
-      } else {
-        // mp = mv + 2, so it always has at least one trailing 0 bit.
-        --vp;
-      }
-    } else if (q < 63) {  // TODO(ulfjack): Use a tighter bound here.
-      // We want to know if the full product has at least q trailing zeros.
-      // We need to compute min(p2(mv), p5(mv) - e2) >= q
-      // <=> p2(mv) >= q && p5(mv) - e2 >= q
-      // <=> p2(mv) >= q (because -e2 >= q)
-      vrIsTrailingZeros = multipleOfPowerOf2(mv, q);
-    }
-  }
-
-  // Step 4: Find the shortest decimal representation in the interval of valid representations.
-  int32_t removed          = 0;  // number of decimal digits dropped from vr/vp/vm
-  uint8_t lastRemovedDigit = 0;
-  uint64_t output;
-  // On average, we remove ~2 digits.
-  if (vmIsTrailingZeros || vrIsTrailingZeros) {
-    // General case, which happens rarely (~0.7%).
-    for (;;) {
-      uint64_t const vpDiv10 = div10(vp);
-      uint64_t const vmDiv10 = div10(vm);
-      if (vpDiv10 <= vmDiv10) { break; }
-      uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10);
-      uint64_t const vrDiv10 = div10(vr);
-      uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10);
-      vmIsTrailingZeros &= vmMod10 == 0;
-      vrIsTrailingZeros &= lastRemovedDigit == 0;
-      lastRemovedDigit = (uint8_t)vrMod10;
-      vr               = vrDiv10;
-      vp               = vpDiv10;
-      vm               = vmDiv10;
-      ++removed;
-    }
-
-    if (vmIsTrailingZeros) {
-      for (;;) {
-        uint64_t const vmDiv10 = div10(vm);
-        uint32_t const vmMod10 = ((uint32_t)vm) - 10 * ((uint32_t)vmDiv10);
-        if (vmMod10 != 0) { break; }
-        uint64_t const vpDiv10 = div10(vp);
-        uint64_t const vrDiv10 = div10(vr);
-        uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10);
-        vrIsTrailingZeros &= lastRemovedDigit == 0;
-        lastRemovedDigit = (uint8_t)vrMod10;
-        vr               = vrDiv10;
-        vp               = vpDiv10;
-        vm               = vmDiv10;
-        ++removed;
-      }
-    }
-
-    if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
-      // Round even if the exact number is .....50..0.
-      lastRemovedDigit = 4;
-    }
-    // We need to take vr + 1 if vr is outside bounds or we need to round up.
-    output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
-  } else {
-    // Specialized for the common case (~99.3%). Percentages below are relative to this.
-    bool roundUp            = false;
-    uint64_t const vpDiv100 = div100(vp);
-    uint64_t const vmDiv100 = div100(vm);
-    if (vpDiv100 > vmDiv100) {  // Optimization: remove two digits at a time (~86.2%).
-      uint64_t const vrDiv100 = div100(vr);
-      uint32_t const vrMod100 = ((uint32_t)vr) - 100 * ((uint32_t)vrDiv100);
-      roundUp                 = vrMod100 >= 50;
-      vr                      = vrDiv100;
-      vp                      = vpDiv100;
-      vm                      = vmDiv100;
-      removed += 2;
-    }
-    // Loop iterations below (approximately), without optimization above:
-    // 0: 0.03%, 1: 13.8%, 2: 70.6%, 3: 14.0%, 4: 1.40%, 5: 0.14%, 6+: 0.02%
-    // Loop iterations below (approximately), with optimization above:
-    // 0: 70.6%, 1: 27.8%, 2: 1.40%, 3: 0.14%, 4+: 0.02%
-    for (;;) {
-      uint64_t const vpDiv10 = div10(vp);
-      uint64_t const vmDiv10 = div10(vm);
-      if (vpDiv10 <= vmDiv10) { break; }
-      uint64_t const vrDiv10 = div10(vr);
-      uint32_t const vrMod10 = ((uint32_t)vr) - 10 * ((uint32_t)vrDiv10);
-      roundUp                = vrMod10 >= 5;
-      vr                     = vrDiv10;
-      vp                     = vpDiv10;
-      vm                     = vmDiv10;
-      ++removed;
-    }
-
-    // We need to take vr + 1 if vr is outside bounds or we need to round up.
-    output = vr + (vr == vm || roundUp);
-  }
-  int32_t const exp = e10 + removed;  // every removed digit raises the decimal exponent by one
-
-  floating_decimal_64 fd;
-  fd.exponent = exp;
-  fd.mantissa = output;
-  return fd;
-}
-
-__device__ inline floating_decimal_32 f2d(uint32_t const ieeeMantissa, uint32_t const ieeeExponent)
-{  // Converts decoded float fields into a decimal mantissa/exponent pair (steps 2-4 below).
-  int32_t e2;
-  uint32_t m2;
-  if (ieeeExponent == 0) {
-    // We subtract 2 so that the bounds computation has 2 additional bits.
-    e2 = 1 - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
-    m2 = ieeeMantissa;  // zero exponent field: no implicit leading bit is added
-  } else {
-    e2 = (int32_t)ieeeExponent - FLOAT_BIAS - FLOAT_MANTISSA_BITS - 2;
-    m2 = (1u << FLOAT_MANTISSA_BITS) | ieeeMantissa;  // add the implicit leading bit
-  }
-  bool const even         = (m2 & 1) == 0;
-  bool const acceptBounds = even;
-
-  // Step 2: Determine the interval of valid decimal representations.
-  uint32_t const mv = 4 * m2;
-  uint32_t const mp = 4 * m2 + 2;
-  // Implicit bool -> int conversion. True is 1, false is 0.
-  uint32_t const mmShift = ieeeMantissa != 0 || ieeeExponent <= 1;
-  uint32_t const mm      = 4 * m2 - 1 - mmShift;
-
-  // Step 3: Convert to a decimal power base using 64-bit arithmetic.
-  uint32_t vr, vp, vm;
-  int32_t e10;
-  bool vmIsTrailingZeros   = false;
-  bool vrIsTrailingZeros   = false;
-  uint8_t lastRemovedDigit = 0;
-  if (e2 >= 0) {
-    uint32_t const q = log10Pow2(e2);
-    e10              = (int32_t)q;
-    int32_t const k  = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)q) - 1;
-    int32_t const i  = -e2 + (int32_t)q + k;
-    vr               = mulPow5InvDivPow2(mv, q, i);
-    vp               = mulPow5InvDivPow2(mp, q, i);
-    vm               = mulPow5InvDivPow2(mm, q, i);
-    if (q != 0 && (vp - 1) / 10 <= vm / 10) {
-      // We need to know one removed digit even if we are not going to loop below. We could use
-      // q = X - 1 above, except that would require 33 bits for the result, and we've found that
-      // 32-bit arithmetic is faster even on 64-bit machines.
-      int32_t const l  = FLOAT_POW5_INV_BITCOUNT + pow5bits((int32_t)(q - 1)) - 1;
-      lastRemovedDigit = (uint8_t)(mulPow5InvDivPow2(mv, q - 1, -e2 + (int32_t)q - 1 + l) % 10);
-    }
-    if (q <= 9) {
-      // The largest power of 5 that fits in 24 bits is 5^10, but q <= 9 seems to be safe as well.
-      // Only one of mp, mv, and mm can be a multiple of 5, if any.
-      if (mv % 5 == 0) {
-        vrIsTrailingZeros = multipleOfPowerOf5_32(mv, q);
-      } else if (acceptBounds) {
-        vmIsTrailingZeros = multipleOfPowerOf5_32(mm, q);
-      } else {
-        vp -= multipleOfPowerOf5_32(mp, q);
-      }
-    }
-  } else {
-    uint32_t const q = log10Pow5(-e2);
-    e10              = (int32_t)q + e2;
-    int32_t const i  = -e2 - (int32_t)q;
-    int32_t const k  = pow5bits(i) - FLOAT_POW5_BITCOUNT;
-    int32_t j        = (int32_t)q - k;
-    vr               = mulPow5divPow2(mv, (uint32_t)i, j);
-    vp               = mulPow5divPow2(mp, (uint32_t)i, j);
-    vm               = mulPow5divPow2(mm, (uint32_t)i, j);
-    if (q != 0 && (vp - 1) / 10 <= vm / 10) {  // same one-digit lookahead as in the e2 >= 0 branch
-      j                = (int32_t)q - 1 - (pow5bits(i + 1) - FLOAT_POW5_BITCOUNT);
-      lastRemovedDigit = (uint8_t)(mulPow5divPow2(mv, (uint32_t)(i + 1), j) % 10);
-    }
-    if (q <= 1) {
-      // {vr,vp,vm} is trailing zeros if {mv,mp,mm} has at least q trailing 0 bits.
-      // mv = 4 * m2, so it always has at least two trailing 0 bits.
-      vrIsTrailingZeros = true;
-      if (acceptBounds) {
-        // mm = mv - 1 - mmShift, so it has 1 trailing 0 bit iff mmShift == 1.
-        vmIsTrailingZeros = mmShift == 1;
-      } else {
-        // mp = mv + 2, so it always has at least one trailing 0 bit.
-        --vp;
-      }
-    } else if (q < 31) {  // TODO(ulfjack): Use a tighter bound here.
-      vrIsTrailingZeros = multipleOfPowerOf2_32(mv, q - 1);
-    }
-  }
-
-  // Step 4: Find the shortest decimal representation in the interval of valid representations.
-  int32_t removed = 0;  // number of decimal digits dropped from vr/vp/vm
-  uint32_t output;
-  if (vmIsTrailingZeros || vrIsTrailingZeros) {
-    // General case, which happens rarely (~4.0%).
-    while (vp / 10 > vm / 10) {
-      vmIsTrailingZeros &= vm % 10 == 0;
-      vrIsTrailingZeros &= lastRemovedDigit == 0;
-      lastRemovedDigit = (uint8_t)(vr % 10);
-      vr /= 10;
-      vp /= 10;
-      vm /= 10;
-      ++removed;
-    }
-    if (vmIsTrailingZeros) {
-      while (vm % 10 == 0) {
-        vrIsTrailingZeros &= lastRemovedDigit == 0;
-        lastRemovedDigit = (uint8_t)(vr % 10);
-        vr /= 10;
-        vp /= 10;
-        vm /= 10;
-        ++removed;
-      }
-    }
-    if (vrIsTrailingZeros && lastRemovedDigit == 5 && vr % 2 == 0) {
-      // Round even if the exact number is .....50..0.
-      lastRemovedDigit = 4;
-    }
-    // We need to take vr + 1 if vr is outside bounds or we need to round up.
-    output = vr + ((vr == vm && (!acceptBounds || !vmIsTrailingZeros)) || lastRemovedDigit >= 5);
-  } else {
-    // Specialized for the common case (~96.0%). Percentages below are relative to this.
-    // Loop iterations below (approximately):
-    // 0: 13.6%, 1: 70.7%, 2: 14.1%, 3: 1.39%, 4: 0.14%, 5+: 0.01%
-    while (vp / 10 > vm / 10) {
-      lastRemovedDigit = (uint8_t)(vr % 10);
-      vr /= 10;
-      vp /= 10;
-      vm /= 10;
-      ++removed;
-    }
-    // We need to take vr + 1 if vr is outside bounds or we need to round up.
-    output = vr + (vr == vm || lastRemovedDigit >= 5);
-  }
-  int32_t const exp = e10 + removed;  // every removed digit raises the decimal exponent by one
-
-  floating_decimal_32 fd;
-  fd.exponent = exp;
-  fd.mantissa = output;
-  return fd;
-}
-
-__device__ inline int to_chars(floating_decimal_64 const v, bool const sign, char* const result)
-{  // Writes the text for v into result (no terminator) and returns the number of chars written.
-  // Step 5: Print the decimal representation.
-  int index = 0;
-  if (sign) { result[index++] = '-'; }
-
-  uint64_t output         = v.mantissa;
-  uint32_t const olength  = decimalLength17(output);
-  int32_t exp             = v.exponent + (int32_t)olength - 1;  // exponent of the leading digit
-  bool scientificNotation = (exp < -3) || (exp >= 7);
-
-  // Values outside the interval [1E-3, 1E7) are printed in scientific notation.
-  if (scientificNotation) {
-    // Print in the format x.xxxxxE-yy.
-    for (uint32_t i = 0; i < olength - 1; ++i) {  // all digits but the first, right to left
-      uint32_t const c = output % 10;
-      output /= 10;
-      result[index + olength - i] = (char)('0' + c);
-    }
-    result[index]     = '0' + output % 10;  // leading digit
-    result[index + 1] = '.';
-    index += olength + 1;
-    if (olength == 1) { result[index++] = '0'; }  // single digit d is printed as "d.0"
-    // Print 'E', the exponent sign, and the exponent, which has at most three digits.
-    result[index++] = 'E';
-    if (exp < 0) {
-      result[index++] = '-';
-      exp             = -exp;
-    }
-    if (exp >= 100) {
-      result[index++] = (char)('0' + exp / 100);
-      exp %= 100;
-      result[index++] = (char)('0' + exp / 10);
-    } else if (exp >= 10) {
-      result[index++] = (char)('0' + exp / 10);
-    }
-    result[index++] = (char)('0' + exp % 10);
-  } else {
-    // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
-    if (exp < 0) {
-      // Decimal dot is before any of the digits.
-      result[index++] = '0';
-      result[index++] = '.';
-      for (int i = -1; i > exp; i--) {  // -exp - 1 zeros between the dot and the first digit
-        result[index++] = '0';
-      }
-      int current = index;
-      for (int i = 0; i < olength; i++) {
-        result[current + olength - i - 1] = (char)('0' + output % 10);
-        output /= 10;
-        index++;
-      }
-    } else if (exp + 1 >= olength) {
-      // Decimal dot is after any of the digits.
-      for (int i = 0; i < olength; i++) {
-        result[index + olength - i - 1] = (char)('0' + output % 10);
-        output /= 10;
-      }
-      index += olength;
-      for (int i = olength; i < exp + 1; i++) {  // pad the integer part with zeros
-        result[index++] = '0';
-      }
-      result[index++] = '.';
-      result[index++] = '0';
-    } else {
-      // Decimal dot is somewhere between the digits.
-      int current = index + 1;  // fractional digits sit one slot to the right of the dot
-      for (int i = 0; i < olength; i++) {
-        if (olength - i - 1 == exp) {
-          result[current + olength - i - 1] = '.';
-          current--;  // digits from here on belong to the integer part, left of the dot
-        }
-        result[current + olength - i - 1] = (char)('0' + output % 10);
-        output /= 10;
-      }
-      index += olength + 1;
-    }
-  }
-  return index;
-}
-
-__device__ inline int d2s_size(floating_decimal_64 const v, bool const sign)
-{  // Returns the char count to_chars(floating_decimal_64, ...) writes for the same v and sign.
-  int index = 0;
-  if (sign) { index++; }
-
-  uint64_t output         = v.mantissa;
-  uint32_t const olength  = decimalLength17(output);
-  int32_t exp             = v.exponent + (int32_t)olength - 1;
-  bool scientificNotation = (exp < -3) || (exp >= 7);
-
-  if (scientificNotation) {
-    index += olength + 1;           // digits plus the dot
-    if (olength == 1) { index++; }  // extra '0' after the dot for a single digit
-    // 'E'
-    index++;
-    if (exp < 0) {
-      exp = -exp;
-      index++;  // '-' of the exponent
-    }
-    if (exp >= 100) {
-      index += 3;
-    } else if (exp >= 10) {
-      index += 2;
-    } else {
-      index++;
-    }
-  } else {
-    // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
-    if (exp < 0) {
-      index += 1 - exp + olength;  // "0." + (-exp - 1) zeros + digits
-    } else if (exp + 1 >= olength) {
-      index += exp + 3;  // exp + 1 integer characters + ".0"
-    } else {
-      index += olength + 1;  // digits plus the dot
-    }
-  }
-  return index;
-}
-
-__device__ inline int to_chars(floating_decimal_32 const v, bool const sign, char* const result)
-{  // Writes the text for v into result (no terminator) and returns the number of chars written.
-  // Step 5: Print the decimal representation.
-  int index = 0;
-  if (sign) { result[index++] = '-'; }
-
-  uint32_t output         = v.mantissa;
-  uint32_t const olength  = decimalLength9(output);
-  int32_t exp             = v.exponent + olength - 1;  // exponent of the leading digit
-  bool scientificNotation = (exp < -3) || (exp >= 7);
-
-  if (scientificNotation) {
-    // Print in the format x.xxxxxE-yy.
-    for (int i = 0; i < olength - 1; i++) {  // all digits but the first, right to left
-      int c = output % 10;
-      output /= 10;
-      result[index + olength - i] = (char)('0' + c);
-    }
-    result[index]     = (char)('0' + output % 10);  // leading digit
-    result[index + 1] = '.';
-    index += olength + 1;
-    if (olength == 1) { result[index++] = '0'; }  // single digit d is printed as "d.0"
-
-    // Print 'E', the exponent sign, and the exponent, which has at most two digits.
-    result[index++] = 'E';
-    if (exp < 0) {
-      result[index++] = '-';
-      exp             = -exp;
-    }
-    if (exp >= 10) { result[index++] = (char)('0' + exp / 10); }
-    result[index++] = (char)('0' + exp % 10);
-  } else {
-    // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
-    if (exp < 0) {
-      // Decimal dot is before any of the digits.
-      result[index++] = '0';
-      result[index++] = '.';
-      for (int i = -1; i > exp; i--) {  // -exp - 1 zeros between the dot and the first digit
-        result[index++] = '0';
-      }
-      int current = index;
-      for (int i = 0; i < olength; i++) {
-        result[current + olength - i - 1] = (char)('0' + output % 10);
-        output /= 10;
-        index++;
-      }
-    } else if (exp + 1 >= olength) {
-      // Decimal dot is after any of the digits.
-      for (int i = 0; i < olength; i++) {
-        result[index + olength - i - 1] = (char)('0' + output % 10);
-        output /= 10;
-      }
-      index += olength;
-      for (int i = olength; i < exp + 1; i++) {  // pad the integer part with zeros
-        result[index++] = '0';
-      }
-      result[index++] = '.';
-      result[index++] = '0';
-    } else {
-      // Decimal dot is somewhere between the digits.
-      int current = index + 1;  // fractional digits sit one slot to the right of the dot
-      for (int i = 0; i < olength; i++) {
-        if (olength - i - 1 == exp) {
-          result[current + olength - i - 1] = '.';
-          current--;  // digits from here on belong to the integer part, left of the dot
-        }
-        result[current + olength - i - 1] = (char)('0' + output % 10);
-        output /= 10;
-      }
-      index += olength + 1;
-    }
-  }
-  return index;
-}
-
-__device__ inline int f2s_size(floating_decimal_32 const v, bool const sign)
-{  // Returns the char count to_chars(floating_decimal_32, ...) writes for the same v and sign.
-  // Follows the same branches as the printing code, but only counts characters.
-  int index = 0;
-  if (sign) { index++; }
-
-  uint32_t output         = v.mantissa;
-  uint32_t const olength  = decimalLength9(output);
-  int32_t exp             = v.exponent + olength - 1;
-  bool scientificNotation = (exp < -3) || (exp >= 7);
-
-  if (scientificNotation) {
-    index += olength + 1;           // digits plus the dot
-    if (olength == 1) { index++; }  // extra '0' after the dot for a single digit
-    // 'E'
-    index++;
-    if (exp < 0) {
-      index++;  // '-' of the exponent
-      exp = -exp;
-    }
-    if (exp >= 10) { index++; }
-    index++;
-  } else {
-    // Otherwise follow the Java spec for values in the interval [1E-3, 1E7).
-    if (exp < 0) {
-      // Decimal dot is before any of the digits.
-      index += 1 - exp + olength;
-    } else if (exp + 1 >= olength) {
-      // Decimal dot is after any of the digits.
-      index += exp + 3;
-    } else {
-      // Decimal dot is somewhere between the digits.
-      index += olength + 1;
-    }
-  }
-  return index;
-}
-
-__device__ inline bool d2d_small_int(uint64_t const ieeeMantissa,
-                                     uint32_t const ieeeExponent,
-                                     floating_decimal_64* const v)
-{  // Returns true and fills *v (exponent 0) when the value is an integer in [1, 2^53).
-  uint64_t const m2 = (1ull << DOUBLE_MANTISSA_BITS) | ieeeMantissa;
-  int32_t const e2  = (int32_t)ieeeExponent - DOUBLE_BIAS - DOUBLE_MANTISSA_BITS;
-
-  if (e2 > 0) {
-    // f = m2 * 2^e2 >= 2^53 is an integer.
-    // Ignore this case for now.
-    return false;
-  }
-
-  if (e2 < -52) {
-    // f < 1.
-    return false;
-  }
-
-  // Since 2^52 <= m2 < 2^53 and 0 <= -e2 <= 52: 1 <= f = m2 / 2^-e2 < 2^53.
-  // Test if the lower -e2 bits of the significand are 0, i.e. whether the fraction is 0.
-  uint64_t const mask     = (1ull << -e2) - 1;
-  uint64_t const fraction = m2 & mask;
-  if (fraction != 0) { return false; }
-
-  // f is an integer in the range [1, 2^53).
-  // Note: mantissa might contain trailing (decimal) 0's.
-  // Note: since 2^53 < 10^16, there is no need to adjust decimalLength17().
-  v->mantissa = m2 >> -e2;
-  v->exponent = 0;
-  return true;
-}
-
-__device__ floating_decimal_64 d2d(double f, bool& ieeeSign, bool& special)
-{  // Decodes f, reports the sign bit via ieeeSign, and sets special for zero and all-ones exponent.
-  // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
-  uint64_t const bits = double_to_bits(f);
-
-  // Decode bits into sign, mantissa, and exponent.
-  ieeeSign                    = ((bits >> (DOUBLE_MANTISSA_BITS + DOUBLE_EXPONENT_BITS)) & 1) != 0;
-  uint64_t const ieeeMantissa = bits & ((1ull << DOUBLE_MANTISSA_BITS) - 1);
-  uint32_t const ieeeExponent =
-    (uint32_t)((bits >> DOUBLE_MANTISSA_BITS) & ((1u << DOUBLE_EXPONENT_BITS) - 1));
-  // Case distinction; exit early for the easy cases.
-  if (ieeeExponent == ((1u << DOUBLE_EXPONENT_BITS) - 1u) ||
-      (ieeeExponent == 0 && ieeeMantissa == 0)) {
-    special = true;
-    return floating_decimal_64{ieeeMantissa, (int32_t)ieeeExponent};  // raw fields, not decimal
-  }
-  special = false;
-  floating_decimal_64 v;
-  bool const isSmallInt = d2d_small_int(ieeeMantissa, ieeeExponent, &v);
-  if (isSmallInt) {
-    // For small integers in the range [1, 2^53), v.mantissa might contain trailing (decimal) zeros.
-    // For scientific notation we need to move these zeros into the exponent.
-    // (This is not needed for fixed-point notation, so it might be beneficial to trim
-    // trailing zeros in to_chars only if needed - once fixed-point notation output is implemented.)
-    for (;;) {
-      uint64_t const q = div10(v.mantissa);
-      uint32_t const r = ((uint32_t)v.mantissa) - 10 * ((uint32_t)q);
-      if (r != 0) { break; }  // last digit is nonzero: nothing more to strip
-      v.mantissa = q;
-      ++v.exponent;
-    }
-  } else {
-    v = d2d(ieeeMantissa, ieeeExponent);
-  }
-  return v;
-}
-
-__device__ int d2s_buffered_n(double f, char* result)
-{
-  bool sign = false, special = false;
-  floating_decimal_64 v = d2d(f, sign, special);
-  if (special) { return copy_special_str(result, sign, v.exponent, v.mantissa); }
-  return to_chars(v, sign, result);
-}
-
-__device__ floating_decimal_32 f2d(float f, bool& ieeeSign, bool& special)
-{
-  // Step 1: Decode the floating-point number, and unify normalized and subnormal cases.
-  uint32_t const bits = float_to_bits(f);
-
-  // Decode bits into sign, mantissa, and exponent.
-  ieeeSign                    = ((bits >> (FLOAT_MANTISSA_BITS + FLOAT_EXPONENT_BITS)) & 1) != 0;
-  uint32_t const ieeeMantissa = bits & ((1u << FLOAT_MANTISSA_BITS) - 1);
-  uint32_t const ieeeExponent = (bits >> FLOAT_MANTISSA_BITS) & ((1u << FLOAT_EXPONENT_BITS) - 1);
-
-  // Case distinction; exit early for the easy cases.
-  if (ieeeExponent == ((1u << FLOAT_EXPONENT_BITS) - 1u) ||
-      (ieeeExponent == 0 && ieeeMantissa == 0)) {
-    special = true;
-    return floating_decimal_32{ieeeMantissa, (int32_t)ieeeExponent};
-  }
-  special = false;
-  return f2d(ieeeMantissa, ieeeExponent);
-}
-
-__device__ int f2s_buffered_n(float f, char* result)
-{
-  bool sign = false, special = false;
-  floating_decimal_32 v = f2d(f, sign, special);
-  if (special) { return copy_special_str(result, sign, v.exponent, v.mantissa); }
-  return to_chars(v, sign, result);
-}
-
-//===== compute float to string size =====
-
-__device__ int compute_d2s_size(double value)
-{
-  bool sign = false, special = false;
-  floating_decimal_64 v = d2d(value, sign, special);
-  if (special) { return special_str_size(sign, v.exponent, v.mantissa); }
-  return d2s_size(v, sign);
-}
-
-__device__ int compute_f2s_size(float value)
-{
-  bool sign = false, special = false;
-  floating_decimal_32 v = f2d(value, sign, special);
-  if (special) { return special_str_size(sign, v.exponent, v.mantissa); }
-  return f2s_size(v, sign);
-}
-
-}  // namespace
-
-//===== APIs =====
-
-__device__ int compute_ftos_size(double value, bool is_float)
-{
-  if (is_float) {
-    return compute_f2s_size(value);
-  } else {
-    return compute_d2s_size(value);
-  }
-}
-
-__device__ int float_to_string(double value, bool is_float, char* output)
-{
-  if (is_float) {
-    return f2s_buffered_n(value, output);
-  } else {
-    return d2s_buffered_n(value, output);
-  }
-}
-
-}  // namespace spark_rapids_jni::ftos_converter

From 62ff4f7dce335b644e4536aeb2f5482193474ecf Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Tue, 12 Dec 2023 13:24:56 +0800
Subject: [PATCH 45/54] move inf/nan replacement to kernel

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/ftos_converter.cuh | 69 ++++++++++++++++++++---------
 src/main/cpp/tests/format_float.cpp |  4 +-
 2 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh
index fe71df924f..e684f73921 100644
--- a/src/main/cpp/src/ftos_converter.cuh
+++ b/src/main/cpp/src/ftos_converter.cuh
@@ -243,8 +243,7 @@ __device__ inline uint32_t mulShift32(uint32_t const m, uint64_t const factor, i
 __device__ inline int copy_special_str(char* const result,
                                        bool const sign,
                                        bool const exponent,
-                                       bool const mantissa,
-                                       int const digits = 1)
+                                       bool const mantissa)
 {
   if (mantissa) {
     memcpy(result, "NaN", 3);
@@ -255,27 +254,15 @@ __device__ inline int copy_special_str(char* const result,
     memcpy(result + sign, "Infinity", 8);
     return sign + 8;
   }
-  result[sign] = '0';
-  if (digits == 0) {
-    return sign + 1;
-  } else {
-    result[sign + 1] = '.';
-  }
-  for (int i = 0; i < digits; i++) {
-    result[sign + 2 + i] = '0';
-  }
-  return sign + 2 + digits;
+  memcpy(result + sign, "0.0", 3);
+  return sign + 3;
 }
 
-__device__ inline int special_str_size(bool const sign,
-                                       bool const exponent,
-                                       bool const mantissa,
-                                       int const digits = 1)
+__device__ inline int special_str_size(bool const sign, bool const exponent, bool const mantissa)
 {
   if (mantissa) { return 3; }
   if (exponent) { return sign + 8; }
-  if (digits == 0) { return sign + 1; }
-  return sign + 2 + digits;
+  return sign + 3;
 }
 
 __device__ inline uint32_t float_to_bits(float const f)
@@ -1504,16 +1491,54 @@ __device__ inline int format_float_size(floating_decimal_32 const v, bool const
   return index;
 }
 
+__device__ inline int copy_format_special_str(char* const result,
+                                              bool const sign,
+                                              bool const exponent,
+                                              bool const mantissa,
+                                              int const digits = 1)
+{
+  if (mantissa) {
+    memcpy(result, "\xEF\xBF\xBD", 3);  // U+FFFD, replacement character, NaN
+    return 3;
+  }
+  if (sign) { result[0] = '-'; }
+  if (exponent) {
+    memcpy(result + sign, "\xE2\x88\x9E", 3);  // U+221E, infinity symbol
+    return sign + 3;
+  }
+  result[sign] = '0';
+  if (digits == 0) {
+    return sign + 1;
+  } else {
+    result[sign + 1] = '.';
+  }
+  for (int i = 0; i < digits; i++) {
+    result[sign + 2 + i] = '0';
+  }
+  return sign + 2 + digits;
+}
+
+__device__ inline int special_format_str_size(bool const sign,
+                                              bool const exponent,
+                                              bool const mantissa,
+                                              int const digits = 1)
+{
+  if (mantissa) { return 3; }
+  if (exponent) { return sign + 3; }
+  if (digits == 0) { return sign + 1; }
+  return sign + 2 + digits;
+}
+
 __device__ inline int compute_format_float_size(double value, int digits, bool is_float)
 {
   bool sign = false, special = false;
   if (is_float) {
     floating_decimal_32 v = f2d(value, sign, special);
-    if (special) { return special_str_size(sign, v.exponent, v.mantissa, digits); }
+    if (special) { return special_format_str_size(sign, v.exponent, v.mantissa, digits); }
     return format_float_size(v, sign, digits);
   } else {
     floating_decimal_64 v = d2d(value, sign, special);
-    if (special) { return special_str_size(sign, v.exponent, v.mantissa, digits); }
+    if (special) { return special_format_str_size(sign, v.exponent, v.mantissa, digits); }
     return format_float_size(v, sign, digits);
   }
 }
@@ -1523,11 +1548,11 @@ __device__ inline int format_float(double value, int digits, bool is_float, char
   bool sign = false, special = false;
   if (is_float) {
     floating_decimal_32 v = f2d(value, sign, special);
-    if (special) { return copy_special_str(output, sign, v.exponent, v.mantissa, digits); }
+    if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); }
     return to_formated_chars(v, sign, output, digits);
   } else {
     floating_decimal_64 v = d2d(value, sign, special);
-    if (special) { return copy_special_str(output, sign, v.exponent, v.mantissa, digits); }
+    if (special) { return copy_format_special_str(output, sign, v.exponent, v.mantissa, digits); }
     return to_formated_chars(v, sign, output, digits);
   }
 }
diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp
index 5d4e79eebf..c8e5d84260 100644
--- a/src/main/cpp/tests/format_float.cpp
+++ b/src/main/cpp/tests/format_float.cpp
@@ -48,7 +48,7 @@ TEST_F(FormatFloatTests, FormatFloats32)
                                                            "0.00000",
                                                            "5.00000",
                                                            "-4.00000",
-                                                           "NaN",
+                                                           "\xEF\xBF\xBD",
                                                            "123,456,790,000.00000",
                                                            "-0.00000"};
 
@@ -80,7 +80,7 @@ TEST_F(FormatFloatTests, FormatFloats64)
                                                            "0.00000",
                                                            "5.00000",
                                                            "-4.00000",
-                                                           "NaN",
+                                                           "\xEF\xBF\xBD",
                                                            "839,542,223,232.79420",
                                                            "-0.00000"};
 

From 10bfe094fd526fec275c229b28cb4daf3615c64d Mon Sep 17 00:00:00 2001
From: Haoyang Li <ntlihy@gmail.com>
Date: Wed, 13 Dec 2023 16:09:32 +0800
Subject: [PATCH 46/54] Apply suggestions from code review

Co-authored-by: Mike Wilson <hyperbolic2346@users.noreply.github.com>
Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
---
 src/main/cpp/src/cast_string.hpp                         | 4 ++--
 src/main/cpp/src/format_float.cu                         | 9 +++++----
 .../java/com/nvidia/spark/rapids/jni/CastStrings.java    | 2 +-
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/main/cpp/src/cast_string.hpp b/src/main/cpp/src/cast_string.hpp
index 84df3f71b1..43ec36e576 100644
--- a/src/main/cpp/src/cast_string.hpp
+++ b/src/main/cpp/src/cast_string.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -117,7 +117,7 @@ std::unique_ptr<cudf::column> string_to_float(
 
 std::unique_ptr<cudf::column> format_float(
   cudf::column_view const& input,
-  int digits,
+  int const digits,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu
index cdcb75fc9a..b9bbb26cb2 100644
--- a/src/main/cpp/src/format_float.cu
+++ b/src/main/cpp/src/format_float.cu
@@ -23,6 +23,7 @@
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/utilities/type_dispatcher.hpp>
 
+
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
@@ -75,7 +76,7 @@ struct format_float_fn {
 struct dispatch_format_float_fn {
   template <typename FloatType, CUDF_ENABLE_IF(std::is_floating_point_v<FloatType>)>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& floats,
-                                           int digits,
+                                           int const digits,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr) const
   {
@@ -97,7 +98,7 @@ struct dispatch_format_float_fn {
   // non-float types throw an exception
   template <typename T, CUDF_ENABLE_IF(not std::is_floating_point_v<T>)>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const&,
-                                           int,
+                                           int const,
                                            rmm::cuda_stream_view,
                                            rmm::mr::device_memory_resource*) const
   {
@@ -109,7 +110,7 @@ struct dispatch_format_float_fn {
 
 // This will convert all float column types into a strings column.
 std::unique_ptr<cudf::column> format_float(cudf::column_view const& floats,
-                                           int digits,
+                                           int const digits,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
 {
@@ -120,7 +121,7 @@ std::unique_ptr<cudf::column> format_float(cudf::column_view const& floats,
 
 // external API
 std::unique_ptr<cudf::column> format_float(cudf::column_view const& floats,
-                                           int digits,
+                                           int const digits,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
 {
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index 93c3c0f21a..cd6f62371b 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From e264ba90d4d86a12f12a0777a5cd14b2183c60e0 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Wed, 13 Dec 2023 11:49:28 +0330
Subject: [PATCH 47/54] address comments

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/CastStringJni.cpp |  1 -
 src/main/cpp/src/format_float.cu   | 11 +++++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/main/cpp/src/CastStringJni.cpp b/src/main/cpp/src/CastStringJni.cpp
index 2e73d0c4ab..b7d898a0c8 100644
--- a/src/main/cpp/src/CastStringJni.cpp
+++ b/src/main/cpp/src/CastStringJni.cpp
@@ -1,5 +1,4 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu
index b9bbb26cb2..f0310c9336 100644
--- a/src/main/cpp/src/format_float.cu
+++ b/src/main/cpp/src/format_float.cu
@@ -23,7 +23,6 @@
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
@@ -35,23 +34,23 @@ namespace {
 template <typename FloatType>
 struct format_float_fn {
   cudf::column_device_view d_floats;
-  int const digits;
+  int digits;
   cudf::size_type* d_offsets;
   char* d_chars;
 
-  __device__ cudf::size_type compute_output_size(FloatType value, int digits) const
+  __device__ cudf::size_type compute_output_size(FloatType value, int digits_) const
   {
     bool constexpr is_float = std::is_same_v<FloatType, float>;
     return static_cast<cudf::size_type>(
-      ftos_converter::compute_format_float_size(static_cast<double>(value), digits, is_float));
+      ftos_converter::compute_format_float_size(static_cast<double>(value), digits_, is_float));
   }
 
-  __device__ void format_float(cudf::size_type idx, int digits) const
+  __device__ void format_float(cudf::size_type idx, int digits_) const
   {
     auto const value        = d_floats.element<FloatType>(idx);
     bool constexpr is_float = std::is_same_v<FloatType, float>;
     auto const output       = d_chars + d_offsets[idx];
-    ftos_converter::format_float(static_cast<double>(value), digits, is_float, output);
+    ftos_converter::format_float(static_cast<double>(value), digits_, is_float, output);
   }
 
   __device__ void operator()(cudf::size_type idx) const

From eab61eb08714564612b4460345136a59ad2e97f6 Mon Sep 17 00:00:00 2001
From: Haoyang Li <ntlihy@gmail.com>
Date: Fri, 15 Dec 2023 09:01:47 +0800
Subject: [PATCH 48/54] Apply suggestions from code review

Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
---
 src/main/cpp/src/format_float.cu    | 4 ++--
 src/main/cpp/tests/format_float.cpp | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu
index f0310c9336..e1f3dd9662 100644
--- a/src/main/cpp/src/format_float.cu
+++ b/src/main/cpp/src/format_float.cu
@@ -38,14 +38,14 @@ struct format_float_fn {
   cudf::size_type* d_offsets;
   char* d_chars;
 
-  __device__ cudf::size_type compute_output_size(FloatType value, int digits_) const
+  __device__ cudf::size_type compute_output_size(FloatType value) const
   {
     bool constexpr is_float = std::is_same_v<FloatType, float>;
     return static_cast<cudf::size_type>(
       ftos_converter::compute_format_float_size(static_cast<double>(value), digits_, is_float));
   }
 
-  __device__ void format_float(cudf::size_type idx, int digits_) const
+  __device__ void format_float(cudf::size_type idx) const
   {
     auto const value        = d_floats.element<FloatType>(idx);
     bool constexpr is_float = std::is_same_v<FloatType, float>;
diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp
index c8e5d84260..9aab3d566c 100644
--- a/src/main/cpp/tests/format_float.cpp
+++ b/src/main/cpp/tests/format_float.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 
-#include <rmm/device_uvector.hpp>
 
 #include <limits>
 

From 9892cae00ecec99c8fe1afcc689a3c661f252a1a Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Fri, 15 Dec 2023 09:24:32 +0800
Subject: [PATCH 49/54] address comments

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/src/format_float.cu                          | 8 ++++----
 .../java/com/nvidia/spark/rapids/jni/CastStrings.java     | 2 +-
 thirdparty/cudf                                           | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu
index e1f3dd9662..d4b8ca8f16 100644
--- a/src/main/cpp/src/format_float.cu
+++ b/src/main/cpp/src/format_float.cu
@@ -42,7 +42,7 @@ struct format_float_fn {
   {
     bool constexpr is_float = std::is_same_v<FloatType, float>;
     return static_cast<cudf::size_type>(
-      ftos_converter::compute_format_float_size(static_cast<double>(value), digits_, is_float));
+      ftos_converter::compute_format_float_size(static_cast<double>(value), digits, is_float));
   }
 
   __device__ void format_float(cudf::size_type idx) const
@@ -50,7 +50,7 @@ struct format_float_fn {
     auto const value        = d_floats.element<FloatType>(idx);
     bool constexpr is_float = std::is_same_v<FloatType, float>;
     auto const output       = d_chars + d_offsets[idx];
-    ftos_converter::format_float(static_cast<double>(value), digits_, is_float, output);
+    ftos_converter::format_float(static_cast<double>(value), digits, is_float, output);
   }
 
   __device__ void operator()(cudf::size_type idx) const
@@ -60,9 +60,9 @@ struct format_float_fn {
       return;
     }
     if (d_chars != nullptr) {
-      format_float(idx, digits);
+      format_float(idx);
     } else {
-      d_offsets[idx] = compute_output_size(d_floats.element<FloatType>(idx), digits);
+      d_offsets[idx] = compute_output_size(d_floats.element<FloatType>(idx));
     }
   }
 };
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
index cd6f62371b..2b2267f034 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/CastStrings.java
@@ -84,8 +84,8 @@ public static ColumnVector toDecimal(ColumnView cv, boolean ansiMode, boolean st
    * Convert a float column to a formatted string column.
    *
    * @param cv the column data to process
-   * @return the converted column
    * @param digits the number of digits to display after the decimal point
+   * @return the converted column
    */
   public static ColumnVector fromFloatWithFormat(ColumnView cv, int digits) {
     return new ColumnVector(fromFloatWithFormat(cv.getNativeView(), digits));
diff --git a/thirdparty/cudf b/thirdparty/cudf
index 248aa2c887..420dc5d787 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 248aa2c8873c12e41f1e6ea2660740a0a4ddaf68
+Subproject commit 420dc5d787d4571c00266364f1a253e5ccffb094

From 8bf5b1c6477f6cd1b72e4affe7ab48b7336e1ec4 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Fri, 15 Dec 2023 15:57:15 +0800
Subject: [PATCH 50/54] cudf

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 420dc5d787..cee642916c 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 420dc5d787d4571c00266364f1a253e5ccffb094
+Subproject commit cee642916cfc3b8df73e819bf3bc50f1b9fc684f

From 81ba4a0a56925954ae91715870cc4f5a8b7d0fff Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Fri, 15 Dec 2023 15:58:29 +0800
Subject: [PATCH 51/54] cudf

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index cee642916c..2cb8f3da3a 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit cee642916cfc3b8df73e819bf3bc50f1b9fc684f
+Subproject commit 2cb8f3da3a3dd539301f90dcfbccadaf06963fd2

From efb27369475333d0facd4a4aa10fef12b6c4e47c Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Fri, 15 Dec 2023 16:01:43 +0800
Subject: [PATCH 52/54] format

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 src/main/cpp/tests/format_float.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/main/cpp/tests/format_float.cpp b/src/main/cpp/tests/format_float.cpp
index 9aab3d566c..b9d77593db 100644
--- a/src/main/cpp/tests/format_float.cpp
+++ b/src/main/cpp/tests/format_float.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 
-
 #include <limits>
 
 using namespace cudf;

From 0505d71745e6e8cf56dd7befc89b506016fd2406 Mon Sep 17 00:00:00 2001
From: Haoyang Li <haoyangl@nvidia.com>
Date: Fri, 15 Dec 2023 23:24:08 +0800
Subject: [PATCH 53/54] cudf reset

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
---
 thirdparty/cudf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thirdparty/cudf b/thirdparty/cudf
index 2cb8f3da3a..248aa2c887 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit 2cb8f3da3a3dd539301f90dcfbccadaf06963fd2
+Subproject commit 248aa2c8873c12e41f1e6ea2660740a0a4ddaf68

From 20415e7c739eec9f7d5ef261e039363fb162aff2 Mon Sep 17 00:00:00 2001
From: Haoyang Li <ntlihy@gmail.com>
Date: Sat, 16 Dec 2023 05:10:04 +0800
Subject: [PATCH 54/54] Apply suggestions from code review

Co-authored-by: Mike Wilson <hyperbolic2346@users.noreply.github.com>
---
 src/main/cpp/src/format_float.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/main/cpp/src/format_float.cu b/src/main/cpp/src/format_float.cu
index d4b8ca8f16..d9ecbe8206 100644
--- a/src/main/cpp/src/format_float.cu
+++ b/src/main/cpp/src/format_float.cu
@@ -38,14 +38,14 @@ struct format_float_fn {
   cudf::size_type* d_offsets;
   char* d_chars;
 
-  __device__ cudf::size_type compute_output_size(FloatType value) const
+  __device__ cudf::size_type compute_output_size(FloatType const value) const
   {
     bool constexpr is_float = std::is_same_v<FloatType, float>;
     return static_cast<cudf::size_type>(
       ftos_converter::compute_format_float_size(static_cast<double>(value), digits, is_float));
   }
 
-  __device__ void format_float(cudf::size_type idx) const
+  __device__ void format_float(cudf::size_type const idx) const
   {
     auto const value        = d_floats.element<FloatType>(idx);
     bool constexpr is_float = std::is_same_v<FloatType, float>;
@@ -53,7 +53,7 @@ struct format_float_fn {
     ftos_converter::format_float(static_cast<double>(value), digits, is_float, output);
   }
 
-  __device__ void operator()(cudf::size_type idx) const
+  __device__ void operator()(cudf::size_type const idx) const
   {
     if (d_floats.is_null(idx)) {
       if (d_chars == nullptr) { d_offsets[idx] = 0; }