From 49a73eb67dad6ffbbd3747f6e59888d256df4162 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 6 Dec 2023 00:02:48 +0530
Subject: [PATCH 01/21] Add mixed_types_as_string reader option

---
 cpp/include/cudf/io/json.hpp          | 28 +++++++++++++++++++++++++++
 python/cudf/cudf/_lib/cpp/io/json.pxd |  5 +++++
 python/cudf/cudf/_lib/json.pyx        |  4 +++-
 python/cudf/cudf/io/json.py           |  2 ++
 4 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 472d42b1db5..7738b15243d 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -98,6 +98,8 @@ class json_reader_options {
 
   // Read the file as a json object per line
   bool _lines = false;
+  // Read the mixed types as string column
+  bool _mixed_types_as_string = false;
 
   // Bytes to skip from the start
   size_t _byte_range_offset = 0;
@@ -225,6 +227,13 @@ class json_reader_options {
    */
   bool is_enabled_lines() const { return _lines; }
 
+  /**
+   * @brief Whether columns containing mixed types are read as a string column.
+   *
+   * @return `true` if mixed types are read as a string column
+   */
+  bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; }
+
   /**
    * @brief Whether to parse dates as DD/MM versus MM/DD.
    *
@@ -302,6 +311,13 @@ class json_reader_options {
    */
   void enable_lines(bool val) { _lines = val; }
 
+  /**
+   * @brief Set whether to read mixed types as a string column.
+   *
+   * @param val `true` to read mixed types as a string column, `false` to disable the option
+   */
+  void enable_mixed_types_as_string(bool val) { _mixed_types_as_string = val; }
+
   /**
    * @brief Set whether to parse dates as DD/MM versus MM/DD.
    *
@@ -437,6 +453,18 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set whether to read mixed types as a string column.
+   *
+   * @param val `true` to read mixed types as a string column, `false` to disable the option
+   * @return this for chaining
+   */
+  json_reader_options_builder& mixed_types_as_string(bool val)
+  {
+    options._mixed_types_as_string = val;
+    return *this;
+  }
+
   /**
    * @brief Set whether to parse dates as DD/MM versus MM/DD.
    *
diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd
index ad618cc4ed6..965a0b5bc23 100644
--- a/python/cudf/cudf/_lib/cpp/io/json.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/json.pxd
@@ -27,6 +27,7 @@ cdef extern from "cudf/io/json.hpp" \
         size_type get_byte_range_offset() except +
         size_type get_byte_range_size() except +
         bool is_enabled_lines() except +
+        bool is_enabled_mixed_types_as_string() except +
         bool is_enabled_dayfirst() except +
         bool is_enabled_experimental() except +
 
@@ -39,6 +40,7 @@ cdef extern from "cudf/io/json.hpp" \
         void set_byte_range_offset(size_type offset) except +
         void set_byte_range_size(size_type size) except +
         void enable_lines(bool val) except +
+        void enable_mixed_types_as_string(bool val) except +
         void enable_dayfirst(bool val) except +
         void enable_experimental(bool val) except +
         void enable_keep_quotes(bool val) except +
@@ -74,6 +76,9 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& lines(
             bool val
         ) except +
+        json_reader_options_builder& mixed_types_as_string(
+            bool val
+        ) except +
         json_reader_options_builder& dayfirst(
             bool val
         ) except +
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 437c3ef6ec4..b6124fcbced 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -53,7 +53,8 @@ cpdef read_json(object filepaths_or_buffers,
                 object compression,
                 object byte_range,
                 bool legacy,
-                bool keep_quotes):
+                bool keep_quotes,
+                bool mixed_types_as_string):
     """
     Cython function to call into libcudf API, see `read_json`.
 
@@ -131,6 +132,7 @@ cpdef read_json(object filepaths_or_buffers,
         opts.set_dtypes(c_dtypes_schema_map)
 
     opts.enable_keep_quotes(keep_quotes)
+    opts.enable_mixed_types_as_string(mixed_types_as_string)
     # Read JSON
     cdef cudf_io_types.table_with_metadata c_result
 
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index ae2f0203642..0088c59f8c3 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -25,6 +25,7 @@ def read_json(
     byte_range=None,
     keep_quotes=False,
     storage_options=None,
+    mixed_types_as_string=False,
     *args,
     **kwargs,
 ):
@@ -116,6 +117,7 @@ def read_json(
             byte_range,
             engine == "cudf_legacy",
             keep_quotes,
+            mixed_types_as_string,
         )
     else:
         warnings.warn(

From 6bc0819b4c6c16be72e7d789e434ccfe1406501c Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 6 Dec 2023 00:13:48 +0530
Subject: [PATCH 02/21] Extract correct Struct, List node range end

---
 cpp/src/io/json/json_column.cu |  10 +++
 cpp/src/io/json/json_tree.cu   | 152 ++++++++++++++++++++++++++++++++-
 2 files changed, 161 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 5d7fb9d6b43..fd97b325ebb 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -276,6 +276,16 @@ reduce_to_column_tree(tree_meta_t& tree,
       return is_non_list_parent(parent_col_id);
     });
 
+  // For Struct and List (to avoid copying entire strings when mixed type as string is enabled)
+  thrust::transform_if(
+    rmm::exec_policy(stream),
+    col_range_begin.begin(),
+    col_range_begin.end(),
+    column_categories.begin(),
+    col_range_end.begin(),
+    [] __device__(auto i) { return i + 1; },
+    [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; });
+
   return std::tuple{tree_meta_t{std::move(column_categories),
                                 std::move(parent_col_ids),
                                 std::move(column_levels),
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index da5b0eedfbd..1b0729b3052 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -58,8 +58,38 @@
 #include <limits>
 
 namespace cudf::io::json {
+// Debug print helpers: stringify tokens / integers and dump a host container to stdout
+[[maybe_unused]] auto to_token_str = [](PdaTokenT token) -> std::string {
+  switch (token) {
+    case token_t::StructBegin: return " {";
+    case token_t::StructEnd: return " }";
+    case token_t::ListBegin: return " [";
+    case token_t::ListEnd: return " ]";
+    case token_t::FieldNameBegin: return "FB";
+    case token_t::FieldNameEnd: return "FE";
+    case token_t::StringBegin: return "SB";
+    case token_t::StringEnd: return "SE";
+    case token_t::ErrorBegin: return "er";
+    case token_t::ValueBegin: return "VB";
+    case token_t::ValueEnd: return "VE";
+    case token_t::StructMemberBegin: return " <";
+    case token_t::StructMemberEnd: return " >";
+    case token_t::LineEnd: return ";";
+    default: return ".";
+  }
+};
+auto to_int    = [](auto v) { return std::to_string(static_cast<int>(v)); };
+auto print_vec = [](auto const& cpu, auto const name, auto converter) {
+  for (auto const& v : cpu)
+    printf("%3s,", converter(v).c_str());
+  std::cout << name << std::endl;
+};
 namespace detail {
 
+void print_tree(host_span<SymbolT const> input,
+                tree_meta_t const& d_gpu_tree,
+                rmm::cuda_stream_view stream);
+
 // The node that a token represents
 struct token_to_node {
   __device__ auto operator()(PdaTokenT const token) -> NodeT
@@ -132,6 +162,14 @@ struct node_ranges {
   }
 };
 
+struct is_nested_end {  // predicate: is the token at index `i` a StructEnd or ListEnd?
+  PdaTokenT const* tokens;  // token stream, indexed by token id
+  __device__ auto operator()(NodeIndexT i) -> bool
+  {
+    return tokens[i] == token_t::StructEnd or tokens[i] == token_t::ListEnd;
+  }
+};
+
 /**
  * @brief Returns stable sorted keys and its sorted order
  *
@@ -293,9 +331,9 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
   // Node parent ids:
   // previous push node_id transform, stable sort by level, segmented scan with Max, reorder.
   rmm::device_uvector<NodeIndexT> parent_node_ids(num_nodes, stream, mr);
+  rmm::device_uvector<NodeIndexT> node_token_ids(num_nodes, stream);  // needed for SE, LE later
   // This block of code is generalized logical stack algorithm. TODO: make this a separate function.
   {
-    rmm::device_uvector<NodeIndexT> node_token_ids(num_nodes, stream);
     cudf::detail::copy_if_safe(thrust::make_counting_iterator<NodeIndexT>(0),
                                thrust::make_counting_iterator<NodeIndexT>(0) + num_tokens,
                                tokens.begin(),
@@ -376,6 +414,118 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
     stream);
   CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch");
 
+  // TODO do this only if mixed type as string flag is enabled.
+  // Fixes here for struct, list nodes with correct range_end. How?
+  // Extract, struct, list - begin & end separately, then push, pop, levels, scan, similar propagate
+  // (segmented scan), then scatter.
+  {
+    // Whether the token pushes onto the parent node stack
+    auto const is_nested = [] __device__(PdaTokenT const token) -> bool {
+      switch (token) {
+        case token_t::StructBegin:
+        case token_t::StructEnd:
+        case token_t::ListBegin:
+        case token_t::ListEnd: return true;
+        default: return false;
+      };
+    };
+    auto const num_nested =
+      thrust::count_if(rmm::exec_policy(stream), tokens.begin(), tokens.end(), is_nested);
+    rmm::device_uvector<TreeDepthT> token_levels(num_nested, stream);
+    rmm::device_uvector<NodeIndexT> token_id(num_nested, stream);         // 4B*2=8B, or 2B+
+    rmm::device_uvector<NodeIndexT> parent_node_ids(num_nested, stream);  // 4B*2=8B, or 2B+
+    auto const push_pop_it = thrust::make_transform_iterator(
+      tokens.begin(), [] __device__(PdaTokenT const token) -> size_type {
+        int const is_begin = token == token_t::StructBegin or token == token_t::ListBegin;
+        int const is_end   = token == token_t::StructEnd or token == token_t::ListEnd;
+        return is_begin - is_end;
+      });
+    // copy_if only struct/list, stable sort by level,
+    // corresponding node indices?,
+    // then scatter to node_range_end for struct/list end.
+    cudf::detail::copy_if_safe(push_pop_it,
+                               push_pop_it + num_tokens,
+                               tokens.begin(),
+                               token_levels.begin(),
+                               is_nested,
+                               stream);
+    cudf::detail::copy_if_safe(thrust::make_counting_iterator<NodeIndexT>(0),
+                               thrust::make_counting_iterator<NodeIndexT>(0) + num_tokens,
+                               tokens.begin(),
+                               token_id.begin(),
+                               is_nested,
+                               stream);
+
+    print_vec(cudf::detail::make_std_vector_async(token_levels, stream), "ntoken_levels", to_int);
+
+    thrust::exclusive_scan(
+      rmm::exec_policy(stream), token_levels.begin(), token_levels.end(), token_levels.begin());
+
+    print_vec(cudf::detail::make_std_vector_async(token_levels, stream), "ntoken_levels", to_int);
+
+    rmm::device_uvector<TreeDepthT> ntokens(num_nested, stream);
+    cudf::detail::copy_if_safe(
+      tokens.begin(), tokens.end(), tokens.begin(), ntokens.begin(), is_nested, stream);
+    print_vec(cudf::detail::make_std_vector_async(ntokens, stream), "ntokens", to_token_str);
+    print_vec(cudf::detail::make_std_vector_async(token_id, stream), "ntoken_id", to_int);
+    //
+    auto const first_childs_parent_token_id2 =
+      [tokens_gpu = tokens.begin(), token_id = token_id.begin()] __device__(auto i) -> NodeIndexT {
+      if (i <= 0) { return -1; }
+      auto id = token_id[i - 1];  // token indices.
+      if (tokens_gpu[id] == token_t::StructBegin or tokens_gpu[id] == token_t::ListBegin) {
+        return token_id[i - 1];
+      } else {
+        return -1;
+      }
+    };
+
+    // copied L+S tokens, and their token ids, their token levels.
+    // first child parent token ids
+    // propagate to siblings
+    // parent token id for all ends -> similar binary search here to find its node id.
+    // scatter to that location.
+    thrust::transform(
+      rmm::exec_policy(stream),
+      thrust::make_counting_iterator<NodeIndexT>(0),
+      thrust::make_counting_iterator<NodeIndexT>(0) + num_nested,
+      parent_node_ids.begin(),
+      [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id2] __device__(
+        NodeIndexT const tid) -> NodeIndexT {
+        auto const pid = first_childs_parent_token_id2(tid);
+        // return pid;
+        return pid < 0
+                 ? parent_node_sentinel
+                 : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) -
+                     node_ids_gpu;
+        // parent_node_sentinel is -1, useful for segmented max operation below
+      });
+
+    print_vec(
+      cudf::detail::make_std_vector_async(parent_node_ids, stream), "nparent_node_ids", to_int);
+    // propagate to siblings.
+    propagate_parent_to_siblings(
+      cudf::device_span<TreeDepthT const>{token_levels.data(), token_levels.size()},
+      parent_node_ids,
+      stream);
+    print_vec(
+      cudf::detail::make_std_vector_async(parent_node_ids, stream), "nparent_node_ids", to_int);
+
+    // scatter to node_range_end for all nested end tokens. (if it's end)
+    auto token_indices_it =
+      thrust::make_permutation_iterator(token_indices.begin(), token_id.begin());
+    // add +1 to include end symbol.
+    auto nested_node_range_end_it = thrust::make_transform_output_iterator(
+      node_range_end.begin(), [] __device__(auto i) { return i + 1; });
+    auto stencil = thrust::make_transform_iterator(token_id.begin(), is_nested_end{tokens.begin()});
+    thrust::scatter_if(rmm::exec_policy(stream),
+                       token_indices_it,
+                       token_indices_it + num_nested,
+                       parent_node_ids.begin(),
+                       stencil,
+                       nested_node_range_end_it);
+  }
+
   return {std::move(node_categories),
           std::move(parent_node_ids),
           std::move(node_levels),

From aa03a95307f934009760d7a5528d7154f49bbef3 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 6 Dec 2023 00:14:39 +0530
Subject: [PATCH 03/21] Force mixed types as string

---
 cpp/src/io/json/json_column.cu | 108 +++++++++++++++++++++++++++++----
 1 file changed, 96 insertions(+), 12 deletions(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index fd97b325ebb..06af050cc17 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -427,11 +427,14 @@ void make_device_json_column(device_span<SymbolT const> input,
                              device_json_column& root,
                              bool is_array_of_arrays,
                              bool is_enabled_lines,
+                             bool is_mixed_type_as_string_enabled,
                              rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   auto num_nodes = col_ids.size();
+  // TODO think about replacing all col_ids which are children of string column to ignore? (useful
+  // to reduce unique_col_id count for map types).
   rmm::device_uvector<NodeIndexT> sorted_col_ids(col_ids.size(), stream);  // make a copy
   thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin());
 
@@ -475,6 +478,8 @@ void make_device_json_column(device_span<SymbolT const> input,
   auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream);
   std::vector<std::string> column_names = copy_strings_to_host(
     input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream);
+  // for(auto str: column_names) std::cout<<str<<"---\n";
+  // TODO preprocess input schema and try to avoid copying map type's field names by skipping them
   // array of arrays column names
   if (is_array_of_arrays) {
     TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2;
@@ -539,8 +544,14 @@ void make_device_json_column(device_span<SymbolT const> input,
   std::map<std::pair<NodeIndexT, std::string>, NodeIndexT> mapped_columns;
   // find column_ids which are values, but should be ignored in validity
   std::vector<uint8_t> ignore_vals(num_columns, 0);
+  std::vector<uint8_t> is_mixed_string_column(num_columns, 0);
+  std::vector<NodeIndexT> remapped_col_id(num_columns, -1);
   columns.try_emplace(parent_node_sentinel, std::ref(root));
 
+  // TODO for map types support
+  // TODO go through input schema, and force string columns to be string.
+  // ignore their children too during below processing.
+
   for (auto const this_col_id : unique_col_ids) {
     if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) {
       continue;
@@ -558,6 +569,11 @@ void make_device_json_column(device_span<SymbolT const> input,
       auto field_name_col_id = parent_col_id;
       parent_col_id          = column_parent_ids[parent_col_id];
       name                   = column_names[field_name_col_id];
+    } else if (is_mixed_string_column[parent_col_id] == 1) {
+      // if parent is mixed string column, ignore this column.
+      is_mixed_string_column[this_col_id] = 1;
+      ignore_vals[this_col_id]            = 1;
+      continue;
     } else {
       CUDF_FAIL("Unexpected parent column category");
     }
@@ -569,6 +585,38 @@ void make_device_json_column(device_span<SymbolT const> input,
     auto& parent_col = it->second.get();
     bool replaced    = false;
     if (mapped_columns.count({parent_col_id, name}) > 0) {
+      /**/
+      // TODO if mixed type is enabled.
+      // make both of them as str, merge them how?
+      // all its child columns should be ignored from parsing. (is adding to ignore_vals enough?)
+      // is key_value column going to slow anyway? because of host copy?
+      if (is_mixed_type_as_string_enabled) {
+        // VAL/STR or STRUCT or LIST
+        is_mixed_string_column[this_col_id] = 1;
+        auto old_col_id                     = mapped_columns[{parent_col_id, name}];
+        remapped_col_id[this_col_id]        = old_col_id;
+        // if old col type (not cat) is string/val, keep it.
+        // else replace with string.
+        column_categories[this_col_id] = NC_STR;
+        auto& col                      = columns.at(old_col_id).get();
+        if (col.type != json_col_t::StringColumn) {
+          column_categories[old_col_id] = NC_STR;
+          // TODO: old_col_id or this_col_id ? affects max_rowoffsets, need more tests.
+          initialize_json_columns(old_col_id, col);
+          // TODO all its children (which are already inserted) should be ignored.
+        }
+        columns.try_emplace(this_col_id, columns.at(old_col_id));
+        continue;
+      }
+      // old new new
+      // VAL SCT LST
+      // VAL LST SCT
+      // SCT LST VAL
+      // SCT VAL LST
+      // LST VAL SCT
+      // LST SCT VAL
+      /**/
+
       if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) {
         ignore_vals[this_col_id] = 1;
         continue;
@@ -601,6 +649,20 @@ void make_device_json_column(device_span<SymbolT const> input,
     columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name)));
     mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id);
   }
+  // debug prints
+  for (auto i = 0ul; i < num_columns; i++)
+    printf("%3lu ", i);
+  printf(" col_id\n");
+  print_vec(column_categories, "column_categories", to_int);
+  print_vec(ignore_vals, "ignore_vals", to_int);
+  print_vec(is_mixed_string_column, "is_mixed_string_column", to_int);
+  for (auto i = 0ul; i < num_columns; i++)
+    printf("%3lu ", columns.count(i));
+  printf(" columns\n");
+  for (auto const& [key, value] : mapped_columns) {
+    std::cout << key.first << "+" << key.second << ":" << value << "\n";
+  }
+
   // restore unique_col_ids order
   std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
     return thrust::get<1>(a) < thrust::get<1>(b);
@@ -609,7 +671,8 @@ void make_device_json_column(device_span<SymbolT const> input,
   std::vector<json_column_data> columns_data(num_columns);
   for (auto& [col_id, col_ref] : columns) {
     if (col_id == parent_node_sentinel) continue;
-    auto& col            = col_ref.get();
+    auto& col = col_ref.get();
+    // if(ignore_vals[col_id]) continue;
     columns_data[col_id] = json_column_data{col.string_offsets.data(),
                                             col.string_lengths.data(),
                                             col.child_offsets.data(),
@@ -620,20 +683,30 @@ void make_device_json_column(device_span<SymbolT const> input,
     ignore_vals, stream, rmm::mr::get_current_device_resource());
   auto d_columns_data = cudf::detail::make_device_uvector_async(
     columns_data, stream, rmm::mr::get_current_device_resource());
+  if (is_mixed_type_as_string_enabled)  // push host-side category overrides to the device
+    CUDF_CUDA_TRY(cudaMemcpyAsync(d_column_tree.node_categories.begin(),
+                                  column_categories.data(),
+                                  column_categories.size() * sizeof(column_categories[0]),
+                                  cudaMemcpyDefault,
+                                  stream.value()));
 
   // 3. scatter string offsets to respective columns, set validity bits
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::counting_iterator<size_type>(0),
     num_nodes,
-    [node_categories = tree.node_categories.begin(),
-     col_ids         = col_ids.begin(),
-     row_offsets     = row_offsets.begin(),
-     range_begin     = tree.node_range_begin.begin(),
-     range_end       = tree.node_range_end.begin(),
-     d_ignore_vals   = d_ignore_vals.begin(),
-     d_columns_data  = d_columns_data.begin()] __device__(size_type i) {
-      switch (node_categories[i]) {
+    [node_categories   = tree.node_categories.begin(),
+     column_categories = d_column_tree.node_categories.begin(),
+     col_ids           = col_ids.begin(),
+     row_offsets       = row_offsets.begin(),
+     range_begin       = tree.node_range_begin.begin(),
+     range_end         = tree.node_range_end.begin(),
+     d_ignore_vals     = d_ignore_vals.begin(),
+     d_columns_data    = d_columns_data.begin()] __device__(size_type i) {
+      if (d_ignore_vals[col_ids[i]]) return;
+      auto const node_category = column_categories[col_ids[i]];
+      // switch (node_categories[i]) {
+      switch (node_category) {
         case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
         case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
         case NC_STR: [[fallthrough]];
@@ -646,6 +719,7 @@ void make_device_json_column(device_span<SymbolT const> input,
         default: break;
       }
     });
+  std::cout << "after for_each_n\n";
 
   // 4. scatter List offset
   // copy_if only node's whose parent is list, (node_id, parent_col_id)
@@ -670,10 +744,14 @@ void make_device_json_column(device_span<SymbolT const> input,
       num_nodes,
     thrust::make_counting_iterator<size_type>(0),
     thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()),
-    [node_categories = tree.node_categories.begin(),
-     parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) {
+    [  // node_categories = tree.node_categories.begin(),
+      parent_node_ids   = tree.parent_node_ids.begin(),
+      column_categories = d_column_tree.node_categories.begin(),
+      col_ids           = col_ids.begin()] __device__(size_type node_id) {
       auto parent_node_id = parent_node_ids[node_id];
-      return parent_node_id != parent_node_sentinel and node_categories[parent_node_id] == NC_LIST;
+      return parent_node_id != parent_node_sentinel and
+             column_categories[col_ids[parent_node_id]] == NC_LIST;
+      // node_categories[parent_node_id] == NC_LIST;
     });
 
   auto const num_list_children =
@@ -705,6 +783,7 @@ void make_device_json_column(device_span<SymbolT const> input,
           row_offsets[node_id] + 1;
       }
     });
+  std::cout << "after list for_each_n\n";
 
   // 5. scan on offsets.
   for (auto& [id, col_ref] : columns) {
@@ -909,6 +988,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
     return get_tree_representation(
       tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource());
   }();  // IILE used to free memory of token data.
+#define NJP_DEBUG_PRINT
 #ifdef NJP_DEBUG_PRINT
   auto h_input = cudf::detail::make_host_vector_async(d_input, stream);
   print_tree(h_input, gpu_tree, stream);
@@ -935,6 +1015,9 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                                   stream,
                                   rmm::mr::get_current_device_resource());
 
+  print_vec(cudf::detail::make_std_vector_async(gpu_col_id, stream), "gpu_col_id", to_int);
+  print_vec(
+    cudf::detail::make_std_vector_async(gpu_row_offsets, stream), "gpu_row_offsets", to_int);
   device_json_column root_column(stream, mr);
   root_column.type = json_col_t::ListColumn;
   root_column.child_offsets.resize(2, stream);
@@ -951,6 +1034,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                           root_column,
                           is_array_of_arrays,
                           options.is_enabled_lines(),
+                          options.is_enabled_mixed_types_as_string(),
                           stream,
                           mr);
 

From 3341109875c08d438ec3c25462bf3483e665e834 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 6 Dec 2023 00:16:06 +0530
Subject: [PATCH 04/21] Add simple mixed type testcase

---
 cpp/tests/io/json_test.cpp | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index a2db2d69984..fa4c0cb9f0b 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -18,6 +18,7 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/debug_utilities.hpp>
 #include <cudf_test/default_stream.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
@@ -2090,4 +2091,30 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars)
     float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}, c_validity.cbegin()});
 }
 
+TEST_F(JsonReaderTest, Mixed)
+{
+  // Column "a" is a list, a float and a struct across rows: expect a single string column.
+  std::string json_string = R"( [{"a":[123], "b":1.0}, {"a":1.1}, {"b":2.1, "a": {"0": 123}}])";
+
+  // TODO Force to string via schema
+  // std::map<std::string, cudf::io::schema_element> dtype_schema{
+  //   {"a",
+  //    {
+  //      data_type{cudf::type_id::LIST},
+  //      {{"element", {data_type{cudf::type_id::STRUCT}, {{"0", {dtype<float>()}}}}}},
+  //    }},
+  //   {"b", {dtype<int32_t>()}},
+  // };
+
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{json_string.data(), json_string.size()})
+      // .dtypes(dtype_schema)
+      .mixed_types_as_string(true)
+      .lines(false);
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+  EXPECT_EQ(result.tbl->view().column(0).type().id(), cudf::type_id::STRING);
+  cudf::test::print(result.tbl->view().column(0));
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 12040a5de3300c38b709f122b7842e8f1f159a68 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 7 Dec 2023 23:38:37 +0530
Subject: [PATCH 05/21] add is_strict_nested_boundaries

---
 cpp/src/io/json/json_column.cu  |  7 +++++--
 cpp/src/io/json/json_tree.cu    | 13 ++++++++-----
 cpp/src/io/json/nested_json.hpp |  2 ++
 cpp/tests/io/json_tree.cpp      | 15 ++++++++-------
 4 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 25962f0a0be..4524f311245 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -983,8 +983,11 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
     const auto [tokens_gpu, token_indices_gpu] =
       get_token_stream(d_input, options, stream, rmm::mr::get_current_device_resource());
     // gpu tree generation
-    return get_tree_representation(
-      tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource());
+    return get_tree_representation(tokens_gpu,
+                                   token_indices_gpu,
+                                   options.is_enabled_mixed_types_as_string(),
+                                   stream,
+                                   rmm::mr::get_current_device_resource());
   }();  // IILE used to free memory of token data.
 #define NJP_DEBUG_PRINT
 #ifdef NJP_DEBUG_PRINT
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index 1b0729b3052..0a4acc2c251 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -248,6 +248,7 @@ void propagate_parent_to_siblings(cudf::device_span<TreeDepthT const> node_level
 // Generates a tree representation of the given tokens, token_indices.
 tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
                                     device_span<SymbolOffsetT const> token_indices,
+                                    bool is_strict_nested_boundaries,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
@@ -414,11 +415,13 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
     stream);
   CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch");
 
-  // TODO do this only if mixed type as string flag is enabled.
-  // Fixes here for struct, list nodes with correct range_end. How?
-  // Extract, struct, list - begin & end separately, then push, pop, levels, scan, similar propagate
-  // (segmented scan), then scatter.
-  {
+  // Extract Struct, List range_end
+  // 1. Extract Struct, List - begin & end separately, their token ids
+  // 2. push, pop to get levels
+  // 3. copy first child's parent token_id, also translate to node_id
+  // 4. propagate to siblings using levels, parent token id. (segmented scan)
+  // 5. scatter to node_range_end for all nested end tokens. (if it's end)
+  if (is_strict_nested_boundaries) {
     // Whether the token pushes onto the parent node stack
     auto const is_nested = [] __device__(PdaTokenT const token) -> bool {
       switch (token) {
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index 8d89f4ff927..5323b818f61 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -216,6 +216,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> pr
  *
  * @param tokens Vector of token types in the json string
  * @param token_indices The indices within the input string corresponding to each token
+ * @param is_strict_nested_boundaries Whether to extract node end of nested types strictly
  * @param stream The CUDA stream to which kernels are dispatched
  * @param mr Optional, resource with which to allocate
  * @return A tree representation of the input JSON string as vectors of node type, parent index,
@@ -223,6 +224,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> pr
  */
 tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
                                     device_span<SymbolOffsetT const> token_indices,
+                                    bool is_strict_nested_boundaries,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr);
 
diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp
index 56e2404b683..ece7d5242b0 100644
--- a/cpp/tests/io/json_tree.cpp
+++ b/cpp/tests/io/json_tree.cpp
@@ -594,7 +594,7 @@ TEST_F(JsonTest, TreeRepresentation)
 
   // Get the JSON's tree representation
   auto gpu_tree = cuio_json::detail::get_tree_representation(
-    tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource());
+    tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource());
   // host tree generation
   auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream);
   compare_trees(cpu_tree, gpu_tree);
@@ -682,7 +682,7 @@ TEST_F(JsonTest, TreeRepresentation2)
 
   // Get the JSON's tree representation
   auto gpu_tree = cuio_json::detail::get_tree_representation(
-    tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource());
+    tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource());
   // host tree generation
   auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream);
   compare_trees(cpu_tree, gpu_tree);
@@ -757,7 +757,7 @@ TEST_F(JsonTest, TreeRepresentation3)
 
   // Get the JSON's tree representation
   auto gpu_tree = cuio_json::detail::get_tree_representation(
-    tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource());
+    tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource());
   // host tree generation
   auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream);
   compare_trees(cpu_tree, gpu_tree);
@@ -783,9 +783,10 @@ TEST_F(JsonTest, TreeRepresentationError)
 
   // Get the JSON's tree representation
   // This JSON is invalid and will raise an exception.
-  EXPECT_THROW(cuio_json::detail::get_tree_representation(
-                 tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()),
-               cudf::logic_error);
+  EXPECT_THROW(
+    cuio_json::detail::get_tree_representation(
+      tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()),
+    cudf::logic_error);
 }
 
 /**
@@ -874,7 +875,7 @@ TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal)
     records_orient_tree_traversal_cpu(input, cpu_tree, is_array_of_arrays, json_lines, stream);
   // gpu tree generation
   auto gpu_tree = cuio_json::detail::get_tree_representation(
-    tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource());
+    tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource());
 
 #if LIBCUDF_JSON_DEBUG_DUMP
   printf("BEFORE traversal (gpu_tree):\n");

From f8521f603404284e8ee57a0e636bbc345c6f4f38 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 7 Dec 2023 23:40:11 +0530
Subject: [PATCH 06/21] add more test cases for MixedTypes

---
 cpp/tests/io/json_test.cpp | 85 ++++++++++++++++++++++++++++----------
 1 file changed, 64 insertions(+), 21 deletions(-)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 0aa2b0c212b..dfa2ebb6223 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -2049,30 +2049,73 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars)
     float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}, c_validity.cbegin()});
 }
 
-TEST_F(JsonReaderTest, Mixed)
+TEST_F(JsonReaderTest, MixedTypes)
 {
-  // std::string json_string = R"( [{"a":[123], "b":1.0}, {"b":1.1, "c": {"0": 123}}, {"b":2.1}])";
-  std::string json_string = R"( [{"a":[123], "b":1.0}, {"a":1.1}, {"b":2.1, "a": {"0": 123}}])";
-
-  // TODO Force to string via schema
-  // std::map<std::string, cudf::io::schema_element> dtype_schema{
-  //   {"a",
-  //    {
-  //      data_type{cudf::type_id::LIST},
-  //      {{"element", {data_type{cudf::type_id::STRUCT}, {{"0", {dtype<float>()}}}}}},
-  //    }},
-  //   {"b", {dtype<int32_t>()}},
-  // };
+  {
+    std::string json_string = R"({ "foo": [1,2,3], "bar": 123 }
+                               { "foo": { "a": 1 }, "bar": 456 })";
 
-  cudf::io::json_reader_options in_options =
-    cudf::io::json_reader_options::builder(
-      cudf::io::source_info{json_string.data(), json_string.size()})
-      // .dtypes(dtype_schema)
-      .mixed_types_as_string(true)
-      .lines(false);
+    cudf::io::json_reader_options in_options =
+      cudf::io::json_reader_options::builder(
+        cudf::io::source_info{json_string.data(), json_string.size()})
+        .mixed_types_as_string(true)
+        .lines(true);
 
-  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
-  cudf::test::print(result.tbl->view().column(0));
+    cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+    EXPECT_EQ(result.tbl->num_columns(), 2);
+    EXPECT_EQ(result.tbl->num_rows(), 2);
+    EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING);
+    EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::INT64);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0),
+                                   cudf::test::strings_column_wrapper({"[1,2,3]", "{ \"a\": 1 }"}));
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1),
+                                   cudf::test::fixed_width_column_wrapper<int64_t>({123, 456}));
+  }
+
+  auto test_fn = [](std::string_view json_string, cudf::column_view expected) {
+    cudf::io::json_reader_options in_options =
+      cudf::io::json_reader_options::builder(
+        cudf::io::source_info{json_string.data(), json_string.size()})
+        .mixed_types_as_string(true)
+        .lines(true);
+
+    cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected);
+  };
+
+  // test cases.
+  test_fn(R"(
+{ "a": "123" }
+{ "a": 123 }
+)",
+          cudf::test::strings_column_wrapper({"123", "123"}));
+
+  test_fn(R"(
+{ "a": [1,2,3] }
+{ "a": { "b": 1 } }
+)",
+          cudf::test::strings_column_wrapper({"[1,2,3]", "{ \"b\": 1 }"}));
+
+  test_fn(R"(
+{ "a": "fox" }
+{ "a": { "b": 1 } }
+)",
+          cudf::test::strings_column_wrapper({"fox", "{ \"b\": 1 }"}));
+
+  test_fn(R"(
+{ "a": [1,2,3] }
+{ "a": "fox" }
+)",
+          cudf::test::strings_column_wrapper({"[1,2,3]", "fox"}));
+
+  test_fn(R"(
+{ "a": [1,2,3] }
+{ "a": [true,false,true] }
+{ "a": ["a", "b", "c"] }
+)",
+          cudf::test::lists_column_wrapper<cudf::string_view>{
+            {"1", "2", "3"}, {"true", "false", "true"}, {"a", "b", "c"}});
 }
 
 CUDF_TEST_PROGRAM_MAIN()

From 377ac3d580b318143c920d62a592f0f7fb76df7d Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Sat, 9 Dec 2023 01:34:20 +0530
Subject: [PATCH 07/21] bug fix for category update of old col_id

---
 cpp/src/io/json/json_column.cu | 75 ++++++++++++++++++++++++++--------
 cpp/tests/io/json_test.cpp     | 15 +++++++
 2 files changed, 73 insertions(+), 17 deletions(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 4524f311245..dbfc2e453d7 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -530,6 +530,19 @@ void make_device_json_column(device_span<SymbolT const> input,
     col.type = to_json_col_type(column_categories[i]);
   };
 
+  auto reinitialize_as_string = [&](auto i, auto& col) {
+    col.string_offsets.resize(max_row_offsets[i] + 1, stream);
+    col.string_lengths.resize(max_row_offsets[i] + 1, stream);
+    init_to_zero(col.string_offsets);
+    init_to_zero(col.string_lengths);
+    col.num_rows = max_row_offsets[i] + 1;
+    col.validity =
+      cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr);
+    col.type = json_col_t::StringColumn;
+    col.child_columns.clear();  // their references should be deleted too.
+    col.column_order.clear();
+  };
+
   // 2. generate nested columns tree and its device_memory
   // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order.
   auto h_range_col_id_it =
@@ -569,18 +582,24 @@ void make_device_json_column(device_span<SymbolT const> input,
       auto field_name_col_id = parent_col_id;
       parent_col_id          = column_parent_ids[parent_col_id];
       name                   = column_names[field_name_col_id];
-    } else if (is_mixed_string_column[parent_col_id] == 1) {
+    } else {
+      std::cout << "col_id:" << this_col_id << ", pcid:" << parent_col_id << "\n\n\n";
+      CUDF_FAIL("Unexpected parent column category");
+    }
+
+    print_vec(is_mixed_string_column, "is_mixed_string_column", to_int);
+    if (is_mixed_string_column[parent_col_id] == 1) {
       // if parent is mixed string column, ignore this column.
       is_mixed_string_column[this_col_id] = 1;
       ignore_vals[this_col_id]            = 1;
       continue;
-    } else {
-      CUDF_FAIL("Unexpected parent column category");
     }
     // If the child is already found,
     // replace if this column is a nested column and the existing was a value column
     // ignore this column if this column is a value column and the existing was a nested column
     auto it = columns.find(parent_col_id);
+    // if(it == columns.end())
+    //   std::cout<<"col_id:"<<this_col_id<<", pcid:"<<parent_col_id<<"\n\n\n";
     CUDF_EXPECTS(it != columns.end(), "Parent column not found");
     auto& parent_col = it->second.get();
     bool replaced    = false;
@@ -597,14 +616,13 @@ void make_device_json_column(device_span<SymbolT const> input,
         remapped_col_id[this_col_id]        = old_col_id;
         // if old col type (not cat) is string/val, keep it.
         // else replace with string.
-        column_categories[this_col_id] = NC_STR;
-        auto& col                      = columns.at(old_col_id).get();
+        auto& col = columns.at(old_col_id).get();
         if (col.type != json_col_t::StringColumn) {
-          column_categories[old_col_id] = NC_STR;
           // TODO: old_col_id or this_col_id ? affects max_rowoffsets, need more tests.
-          initialize_json_columns(old_col_id, col);
-          // TODO all its children (which are already inserted) should be ignored.
+          reinitialize_as_string(old_col_id, col);
+          // all its children (which are already inserted) are ignored below.
         }
+        is_mixed_string_column[old_col_id] = 1;
         columns.try_emplace(this_col_id, columns.at(old_col_id));
         continue;
       }
@@ -653,7 +671,7 @@ void make_device_json_column(device_span<SymbolT const> input,
   for (auto i = 0ul; i < num_columns; i++)
     printf("%3lu ", i);
   printf(" col_id\n");
-  print_vec(column_categories, "column_categories", to_int);
+  print_vec(column_categories, "column_categories", to_cat);
   print_vec(ignore_vals, "ignore_vals", to_int);
   print_vec(is_mixed_string_column, "is_mixed_string_column", to_int);
   for (auto i = 0ul; i < num_columns; i++)
@@ -663,6 +681,27 @@ void make_device_json_column(device_span<SymbolT const> input,
     std::cout << key.first << "+" << key.second << ":" << value << "\n";
   }
 
+  if (is_mixed_type_as_string_enabled) {
+    // ignore all children of mixed type columns
+    for (auto const this_col_id : unique_col_ids) {
+      auto parent_col_id = column_parent_ids[this_col_id];
+      if (parent_col_id != parent_node_sentinel and is_mixed_string_column[parent_col_id] == 1) {
+        is_mixed_string_column[this_col_id] = 1;
+        ignore_vals[this_col_id]            = 1;
+        columns.erase(this_col_id);
+      }
+      // Convert only mixed type columns as string (so to copy)
+      if (parent_col_id != parent_node_sentinel and is_mixed_string_column[parent_col_id] == 0 and
+          is_mixed_string_column[this_col_id] == 1)
+        column_categories[this_col_id] = NC_STR;
+    }
+    cudaMemcpyAsync(d_column_tree.node_categories.begin(),
+                    column_categories.data(),
+                    column_categories.size() * sizeof(column_categories[0]),
+                    cudaMemcpyDefault,
+                    stream.value());
+  }
+
   // restore unique_col_ids order
   std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
     return thrust::get<1>(a) < thrust::get<1>(b);
@@ -679,16 +718,13 @@ void make_device_json_column(device_span<SymbolT const> input,
                                             static_cast<bitmask_type*>(col.validity.data())};
   }
 
+  // print_vec(column_categories, "column_categories", to_cat);
+  // print_vec(is_mixed_string_column, "is_mixed_string_column", to_int);
+  // print_vec(ignore_vals, "ignore_vals", to_int);
   auto d_ignore_vals = cudf::detail::make_device_uvector_async(
     ignore_vals, stream, rmm::mr::get_current_device_resource());
   auto d_columns_data = cudf::detail::make_device_uvector_async(
     columns_data, stream, rmm::mr::get_current_device_resource());
-  if (is_mixed_type_as_string_enabled)
-    cudaMemcpyAsync(d_column_tree.node_categories.begin(),
-                    column_categories.data(),
-                    column_categories.size() * sizeof(column_categories[0]),
-                    cudaMemcpyDefault,
-                    stream.value());
 
   // 3. scatter string offsets to respective columns, set validity bits
   thrust::for_each_n(
@@ -719,7 +755,7 @@ void make_device_json_column(device_span<SymbolT const> input,
         default: break;
       }
     });
-  std::cout << "after for_each_n\n";
+  std::cout << "after str for_each_n\n";
 
   // 4. scatter List offset
   // copy_if only node's whose parent is list, (node_id, parent_col_id)
@@ -745,14 +781,17 @@ void make_device_json_column(device_span<SymbolT const> input,
     thrust::make_counting_iterator<size_type>(0),
     thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()),
     [  // node_categories = tree.node_categories.begin(),
+      d_ignore_vals     = d_ignore_vals.begin(),
       parent_node_ids   = tree.parent_node_ids.begin(),
       column_categories = d_column_tree.node_categories.begin(),
       col_ids           = col_ids.begin()] __device__(size_type node_id) {
       auto parent_node_id = parent_node_ids[node_id];
       return parent_node_id != parent_node_sentinel and
-             column_categories[col_ids[parent_node_id]] == NC_LIST;
+             column_categories[col_ids[parent_node_id]] == NC_LIST and
+             (!d_ignore_vals[col_ids[parent_node_id]]);
       // node_categories[parent_node_id] == NC_LIST;
     });
+  std::cout << "after copy_if\n";
 
   auto const num_list_children =
     list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin());
@@ -760,6 +799,8 @@ void make_device_json_column(device_span<SymbolT const> input,
                              parent_col_ids.begin(),
                              parent_col_ids.begin() + num_list_children,
                              node_ids.begin());
+  std::cout << "after stable_sort_by_key\n";
+  std::cout << num_list_children << "\n";
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<size_type>(0),
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index dfa2ebb6223..0863f561735 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -2116,6 +2116,21 @@ TEST_F(JsonReaderTest, MixedTypes)
 )",
           cudf::test::lists_column_wrapper<cudf::string_view>{
             {"1", "2", "3"}, {"true", "false", "true"}, {"a", "b", "c"}});
+  {
+    std::string json_string = R"(
+{ "var1": true }
+{ "var1": [{ "var0": true, "var1": "hello", "var2": null }, null, [true, null, null]] }
+  )";
+
+    cudf::io::json_reader_options in_options =
+      cudf::io::json_reader_options::builder(
+        cudf::io::source_info{json_string.data(), json_string.size()})
+        .mixed_types_as_string(true)
+        .lines(true);
+
+    cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+    cudf::test::print(result.tbl->get_column(0));
+  }
 }
 
 CUDF_TEST_PROGRAM_MAIN()

From d779c073fa7b6fe45e82c1e1d6217e2ebdca1d71 Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove73@gmail.com>
Date: Tue, 12 Dec 2023 18:39:38 -0700
Subject: [PATCH 08/21] Java bindings for mixed types as strings (@andygrove)

* java bindings
* tests
* change default for mixedTypesAsStrings to false for backwards compatibility
---
 .../main/java/ai/rapids/cudf/JSONOptions.java | 19 ++++++++++
 java/src/main/java/ai/rapids/cudf/Table.java  | 16 +++++----
 java/src/main/native/src/TableJni.cpp         | 13 ++++---
 .../test/java/ai/rapids/cudf/TableTest.java   | 36 +++++++++++++++++++
 java/src/test/resources/mixed_types_1.json    |  2 ++
 java/src/test/resources/mixed_types_2.json    |  2 ++
 6 files changed, 77 insertions(+), 11 deletions(-)
 create mode 100644 java/src/test/resources/mixed_types_1.json
 create mode 100644 java/src/test/resources/mixed_types_2.json

diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index f98687df5fa..2ee3be03379 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -30,12 +30,14 @@ public final class JSONOptions extends ColumnFilterOptions {
   private final boolean dayFirst;
   private final boolean lines;
   private final boolean recoverWithNull;
+  private final boolean mixedTypesAsStrings;
 
   private JSONOptions(Builder builder) {
     super(builder);
     dayFirst = builder.dayFirst;
     lines = builder.lines;
     recoverWithNull = builder.recoverWithNull;
+    mixedTypesAsStrings = builder.mixedTypesAsStrings;
   }
 
   public boolean isDayFirst() {
@@ -51,6 +53,10 @@ public boolean isRecoverWithNull() {
     return recoverWithNull;
   }
 
+  public boolean isMixedTypesAsStrings() {
+    return mixedTypesAsStrings;
+  }
+
   @Override
   String[] getIncludeColumnNames() {
     throw new UnsupportedOperationException("JSON reader didn't support column prune");
@@ -66,6 +72,8 @@ public static final class Builder  extends ColumnFilterOptions.Builder<JSONOptio
 
     private boolean recoverWithNull = false;
 
+    private boolean mixedTypesAsStrings = false;
+
     /**
      * Whether to parse dates as DD/MM versus MM/DD
      * @param dayFirst true: DD/MM, false, MM/DD
@@ -101,6 +109,17 @@ public Builder withRecoverWithNull(boolean recoverWithNull) {
       return this;
     }
 
+    /**
+     * Specify how to handle columns that contain mixed types.
+     *
+     * @param mixedTypesAsStrings true: return unparsed JSON, false: throw exception
+     * @return builder for chaining
+     */
+    public Builder withMixedTypesAsStrings(boolean mixedTypesAsStrings) {
+      this.mixedTypesAsStrings = mixedTypesAsStrings;
+      return this;
+    }
+
     @Override
     public Builder includeColumn(String... names) {
       throw new UnsupportedOperationException("JSON reader didn't support column prune");
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 3bd1e3f25a7..8c1c069cfbf 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -250,16 +250,18 @@ private static native long readJSON(String[] columnNames,
                                         int[] dTypeIds, int[] dTypeScales,
                                         String filePath, long address, long length,
                                         boolean dayFirst, boolean lines,
-                                        boolean recoverWithNulls) throws CudfException;
+                                        boolean recoverWithNulls,
+                                        boolean mixedTypesAsStrings) throws CudfException;
 
   private static native long readJSONFromDataSource(String[] columnNames,
                                       int[] dTypeIds, int[] dTypeScales,
                                       boolean dayFirst, boolean lines,
                                       boolean recoverWithNulls,
+                                      boolean mixedTypesAsStrings,
                                       long dsHandle) throws CudfException;
 
   private static native long readAndInferJSON(long address, long length,
-      boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException;
+      boolean dayFirst, boolean lines, boolean recoverWithNulls, boolean mixedTypesAsStrings) throws CudfException;
 
   /**
    * Read in Parquet formatted data.
@@ -1095,7 +1097,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
             readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(),
                     path.getAbsolutePath(),
                     0, 0,
-                    opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()))) {
+                    opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
+                    opts.isMixedTypesAsStrings()))) {
 
       return gatherJSONColumns(schema, twm);
     }
@@ -1147,7 +1150,8 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
     assert len <= buffer.length - offset;
     assert offset >= 0 && offset < buffer.length;
     return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len,
-        opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull()));
+        opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
+        opts.isMixedTypesAsStrings()));
   }
 
   /**
@@ -1170,7 +1174,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
     try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(),
             schema.getTypeIds(), schema.getTypeScales(), null,
             buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(),
-            opts.isRecoverWithNull()))) {
+            opts.isRecoverWithNull(), opts.isMixedTypesAsStrings()))) {
       return gatherJSONColumns(schema, twm);
     }
   }
@@ -1186,7 +1190,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
     long dsHandle = DataSourceHelper.createWrapperDataSource(ds);
     try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getColumnNames(),
             schema.getTypeIds(), schema.getTypeScales(), opts.isDayFirst(), opts.isLines(),
-            opts.isRecoverWithNull(), dsHandle))) {
+            opts.isRecoverWithNull(), opts.isMixedTypesAsStrings(), dsHandle))) {
       return gatherJSONColumns(schema, twm);
     } finally {
       DataSourceHelper.destroyWrapperDataSource(dsHandle);
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index fad19bdf895..16a46122a46 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1393,7 +1393,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
     JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
-    jboolean recover_with_null) {
+    jboolean recover_with_null, jboolean mixed_types_as_string) {
 
   JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
   if (buffer_length <= 0) {
@@ -1412,7 +1412,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
     cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
                                                      .dayfirst(static_cast<bool>(day_first))
                                                      .lines(static_cast<bool>(lines))
-                                                     .recovery_mode(recovery_mode);
+                                                     .recovery_mode(recovery_mode)
+                                                     .mixed_types_as_string(mixed_types_as_string);
 
     auto result =
         std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
@@ -1470,7 +1471,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
     JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
-    jboolean day_first, jboolean lines, jboolean recover_with_null, jlong ds_handle) {
+    jboolean day_first, jboolean lines, jboolean recover_with_null, jboolean mixed_types_as_string,
+    jlong ds_handle) {
 
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
 
@@ -1537,7 +1539,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
     JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
     jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
-    jboolean recover_with_null) {
+    jboolean recover_with_null, jboolean mixed_types_as_string) {
 
   bool read_buffer = true;
   if (buffer == 0) {
@@ -1587,7 +1589,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
     cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
                                                      .dayfirst(static_cast<bool>(day_first))
                                                      .lines(static_cast<bool>(lines))
-                                                     .recovery_mode(recovery_mode);
+                                                     .recovery_mode(recovery_mode)
+                                                     .mixed_types_as_string(mixed_types_as_string);
 
     if (!n_col_names.is_null() && data_types.size() > 0) {
       if (n_col_names.size() != n_types.size()) {
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index b0dd4122b0e..23a4568e216 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -87,6 +87,8 @@ public class TableTest extends CudfTestBase {
   private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv");
   private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json");
   private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json");
+  private static final File TEST_MIXED_TYPE_1_JSON = TestUtils.getResourceAsFile("mixed_types_1.json");
+  private static final File TEST_MIXED_TYPE_2_JSON = TestUtils.getResourceAsFile("mixed_types_2.json");
 
   private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder()
       .column(DType.INT32, "A")
@@ -327,6 +329,40 @@ void testReadJSONFile() {
     }
   }
 
+  @Test
+  void testReadMixedType1JSONFile() {
+    Schema schema = Schema.builder()
+            .column(DType.STRING, "a")
+            .build();
+    JSONOptions opts = JSONOptions.builder()
+            .withLines(true)
+            .withMixedTypesAsStrings(true)
+            .build();
+    try (Table expected = new Table.TestBuilder()
+            .column("123", "123" )
+            .build();
+         Table table = Table.readJSON(schema, opts, TEST_MIXED_TYPE_1_JSON)) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
+  @Test
+  void testReadMixedType2JSONFile() {
+    Schema schema = Schema.builder()
+            .column(DType.STRING, "a")
+            .build();
+    JSONOptions opts = JSONOptions.builder()
+            .withLines(true)
+            .withMixedTypesAsStrings(true)
+            .build();
+    try (Table expected = new Table.TestBuilder()
+            .column("[1,2,3]", "{ \"b\": 1 }" )
+            .build();
+         Table table = Table.readJSON(schema, opts, TEST_MIXED_TYPE_2_JSON)) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
   @Test
   void testReadJSONFromDataSource() throws IOException {
     Schema schema = Schema.builder()
diff --git a/java/src/test/resources/mixed_types_1.json b/java/src/test/resources/mixed_types_1.json
new file mode 100644
index 00000000000..288b06957e3
--- /dev/null
+++ b/java/src/test/resources/mixed_types_1.json
@@ -0,0 +1,2 @@
+{ "a": "123" }
+{ "a": 123 }
\ No newline at end of file
diff --git a/java/src/test/resources/mixed_types_2.json b/java/src/test/resources/mixed_types_2.json
new file mode 100644
index 00000000000..4f1a9d0e3d9
--- /dev/null
+++ b/java/src/test/resources/mixed_types_2.json
@@ -0,0 +1,2 @@
+{ "a": [1,2,3] }
+{ "a": { "b": 1 } }
\ No newline at end of file

From ed64288c5a26a5b8f4f6f3edefd0d34843bc7374 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Thu, 14 Dec 2023 18:11:02 +0530
Subject: [PATCH 09/21] newline at eof style fix.

---
 java/src/test/resources/mixed_types_1.json | 2 +-
 java/src/test/resources/mixed_types_2.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/java/src/test/resources/mixed_types_1.json b/java/src/test/resources/mixed_types_1.json
index 288b06957e3..21d625bbf2a 100644
--- a/java/src/test/resources/mixed_types_1.json
+++ b/java/src/test/resources/mixed_types_1.json
@@ -1,2 +1,2 @@
 { "a": "123" }
-{ "a": 123 }
\ No newline at end of file
+{ "a": 123 }
diff --git a/java/src/test/resources/mixed_types_2.json b/java/src/test/resources/mixed_types_2.json
index 4f1a9d0e3d9..becad2d0db7 100644
--- a/java/src/test/resources/mixed_types_2.json
+++ b/java/src/test/resources/mixed_types_2.json
@@ -1,2 +1,2 @@
 { "a": [1,2,3] }
-{ "a": { "b": 1 } }
\ No newline at end of file
+{ "a": { "b": 1 } }

From 7b537e4977c18bf839978ce785a43045349c45d3 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 4 Jan 2024 15:44:43 +0530
Subject: [PATCH 10/21] copyright year

---
 cpp/include/cudf/io/json.hpp                  |  2 +-
 cpp/src/io/json/json_column.cu                | 49 ++++++++++++++++++-
 cpp/src/io/json/json_tree.cu                  |  2 +-
 cpp/src/io/json/nested_json.hpp               |  2 +-
 cpp/tests/io/json_test.cpp                    |  4 +-
 cpp/tests/io/json_tree.cpp                    |  2 +-
 .../main/java/ai/rapids/cudf/JSONOptions.java |  2 +-
 java/src/main/java/ai/rapids/cudf/Table.java  |  2 +-
 java/src/main/native/src/TableJni.cpp         |  2 +-
 .../test/java/ai/rapids/cudf/TableTest.java   |  2 +-
 python/cudf/cudf/_lib/cpp/io/json.pxd         |  2 +-
 python/cudf/cudf/_lib/json.pyx                |  2 +-
 python/cudf/cudf/io/json.py                   |  2 +-
 13 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 7738b15243d..67acee363da 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 33f05a077e8..3bd946e0ffc 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -51,6 +51,7 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <vector>
 
 namespace cudf::io::json::detail {
 
@@ -393,6 +394,44 @@ std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
   return to_host(d_column_names->view());
 }
 
+using variant_dtype = std::variant<std::vector<data_type>,
+               std::map<std::string, data_type>,
+               std::map<std::string, schema_element>>;
+
+// pass base column name 
+// children level. get thier col ids, then extract their names.
+// [1, 2]\n[1, 2]                  1
+// [[1, 2], [1, 2]]                2
+// {a: 1, b: 2}\n {a: 1, b: 2}     1
+// [{a: 1, b: 2}, {a: 1, b: 2}]    2
+void map_types_to_strings(variant_dtype const& var_dtype, std::vector<NodeIndexT> const& base_column_indices, std::vector<std::string> const& column_names) {
+  // Get level 0 or 1 names.
+  // create a map of schema always with name?
+      std::optional<schema_element> child_schema_element = std::visit(
+      cudf::detail::visitor_overload{
+        // TODO processing required here only. (extract base col names and construct schema)
+        [column_index](std::vector<data_type> const& user_dtypes) -> std::optional<schema_element> {
+          return (static_cast<std::size_t>(column_index) < user_dtypes.size())
+                   ? std::optional<schema_element>{{user_dtypes[column_index]}}
+                   : std::optional<schema_element>{};
+        },
+        // TODO just transform and return it.
+        [col_name](
+          std::map<std::string, data_type> const& user_dtypes) -> std::optional<schema_element> {
+          return (user_dtypes.find(col_name) != std::end(user_dtypes))
+                   ? std::optional<schema_element>{{user_dtypes.find(col_name)->second}}
+                   : std::optional<schema_element>{};
+        },
+        // TODO just return it.
+        [col_name](std::map<std::string, schema_element> const& user_dtypes)
+          -> std::optional<schema_element> {
+          return (user_dtypes.find(col_name) != std::end(user_dtypes))
+                   ? user_dtypes.find(col_name)->second
+                   : std::optional<schema_element>{};
+        }},
+      options.get_dtypes());
+}
+
 /**
  * @brief Holds member data pointers of `d_json_column`
  *
@@ -429,6 +468,7 @@ void make_device_json_column(device_span<SymbolT const> input,
                              bool is_array_of_arrays,
                              bool is_enabled_lines,
                              bool is_mixed_type_as_string_enabled,
+                             variant_dtype const& var_dtype,
                              rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
 {
@@ -566,6 +606,10 @@ void make_device_json_column(device_span<SymbolT const> input,
   // TODO go through input schema, and force string columns to be string.
   // ignore their children too during below processing.
 
+  // get schema, find max depth, reserve that depth in the vector<std::string> path;
+  // construct path for each
+  // Find all struct nodes, and build its path, and check if it is present in schema as string.
+
   for (auto const this_col_id : unique_col_ids) {
     if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) {
       continue;
@@ -589,7 +633,7 @@ void make_device_json_column(device_span<SymbolT const> input,
     }
 
     print_vec(is_mixed_string_column, "is_mixed_string_column", to_int);
-    if (is_mixed_string_column[parent_col_id] == 1) {
+    if (parent_col_id != parent_node_sentinel && is_mixed_string_column[parent_col_id] == 1) {
       // if parent is mixed string column, ignore this column.
       is_mixed_string_column[this_col_id] = 1;
       ignore_vals[this_col_id]            = 1;
@@ -1079,6 +1123,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                           is_array_of_arrays,
                           options.is_enabled_lines(),
                           options.is_enabled_mixed_types_as_string(),
+                          options.get_dtypes(),
                           stream,
                           mr);
 
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index f3c1da96dda..d68eede536a 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index 5323b818f61..c13daf9b9f5 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 0863f561735..aa388a773f5 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -2052,6 +2052,7 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars)
 TEST_F(JsonReaderTest, MixedTypes)
 {
   {
+    // Simple test for mixed types
     std::string json_string = R"({ "foo": [1,2,3], "bar": 123 }
                                { "foo": { "a": 1 }, "bar": 456 })";
 
@@ -2073,6 +2074,7 @@ TEST_F(JsonReaderTest, MixedTypes)
                                    cudf::test::fixed_width_column_wrapper<int64_t>({123, 456}));
   }
 
+  // Testing function for mixed types in JSON (for spark json reader)
   auto test_fn = [](std::string_view json_string, cudf::column_view expected) {
     cudf::io::json_reader_options in_options =
       cudf::io::json_reader_options::builder(
diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp
index ece7d5242b0..4147d85b3fc 100644
--- a/cpp/tests/io/json_tree.cpp
+++ b/cpp/tests/io/json_tree.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index 2ee3be03379..ae964f897f9 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ *  Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 8c4693e2607..300c540b8c0 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ *  Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 37bc93e52ce..c43a99a58d5 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 6c0a6947581..99fb5532332 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ *  Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd
index 965a0b5bc23..b916c2b7ad9 100644
--- a/python/cudf/cudf/_lib/cpp/io/json.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/json.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint8_t
 from libcpp cimport bool
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index b6124fcbced..e1f13df9d26 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 # cython: boundscheck = False
 
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 0088c59f8c3..35d91f9c062 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import warnings
 from collections import abc

From 0dbc9f56400d33a7e1d90bd04923090b5f38e6ba Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 10 Jan 2024 04:38:27 +0530
Subject: [PATCH 11/21] undo mixed type code

---
 cpp/src/io/json/json_column.cu | 45 ----------------------------------
 cpp/src/io/json/json_tree.cu   |  5 ++--
 2 files changed, 3 insertions(+), 47 deletions(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 3bd946e0ffc..9d51dee1d90 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -51,7 +51,6 @@
 
 #include <algorithm>
 #include <cstdint>
-#include <vector>
 
 namespace cudf::io::json::detail {
 
@@ -394,44 +393,6 @@ std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
   return to_host(d_column_names->view());
 }
 
-using variant_dtype = std::variant<std::vector<data_type>,
-               std::map<std::string, data_type>,
-               std::map<std::string, schema_element>>;
-
-// pass base column name 
-// children level. get thier col ids, then extract their names.
-// [1, 2]\n[1, 2]                  1
-// [[1, 2], [1, 2]]                2
-// {a: 1, b: 2}\n {a: 1, b: 2}     1
-// [{a: 1, b: 2}, {a: 1, b: 2}]    2
-void map_types_to_strings(variant_dtype const& var_dtype, std::vector<NodeIndexT> const& base_column_indices, std::vector<std::string> const& column_names) {
-  // Get level 0 or 1 names.
-  // create a map of schema always with name?
-      std::optional<schema_element> child_schema_element = std::visit(
-      cudf::detail::visitor_overload{
-        // TODO processing required here only. (extract base col names and construct schema)
-        [column_index](std::vector<data_type> const& user_dtypes) -> std::optional<schema_element> {
-          return (static_cast<std::size_t>(column_index) < user_dtypes.size())
-                   ? std::optional<schema_element>{{user_dtypes[column_index]}}
-                   : std::optional<schema_element>{};
-        },
-        // TODO just transform and return it.
-        [col_name](
-          std::map<std::string, data_type> const& user_dtypes) -> std::optional<schema_element> {
-          return (user_dtypes.find(col_name) != std::end(user_dtypes))
-                   ? std::optional<schema_element>{{user_dtypes.find(col_name)->second}}
-                   : std::optional<schema_element>{};
-        },
-        // TODO just return it.
-        [col_name](std::map<std::string, schema_element> const& user_dtypes)
-          -> std::optional<schema_element> {
-          return (user_dtypes.find(col_name) != std::end(user_dtypes))
-                   ? user_dtypes.find(col_name)->second
-                   : std::optional<schema_element>{};
-        }},
-      options.get_dtypes());
-}
-
 /**
  * @brief Holds member data pointers of `d_json_column`
  *
@@ -468,7 +429,6 @@ void make_device_json_column(device_span<SymbolT const> input,
                              bool is_array_of_arrays,
                              bool is_enabled_lines,
                              bool is_mixed_type_as_string_enabled,
-                             variant_dtype const& var_dtype,
                              rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
 {
@@ -606,10 +566,6 @@ void make_device_json_column(device_span<SymbolT const> input,
   // TODO go through input schema, and force string columns to be string.
   // ignore their children too during below processing.
 
-  // get schema, find max depth, reserve that depth in the vector<std::string> path;
-  // construct path for each
-  // Find all struct nodes, and build its path, and check if it is present in schema as string.
-
   for (auto const this_col_id : unique_col_ids) {
     if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) {
       continue;
@@ -1123,7 +1079,6 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                           is_array_of_arrays,
                           options.is_enabled_lines(),
                           options.is_enabled_mixed_types_as_string(),
-                          options.get_dtypes(),
                           stream,
                           mr);
 
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index d68eede536a..1c6b60bfbf1 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -442,11 +442,12 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
     rmm::device_uvector<NodeIndexT> token_id(num_nested, stream);         // 4B*2=8B, or 2B+
     rmm::device_uvector<NodeIndexT> parent_node_ids(num_nested, stream);  // 4B*2=8B, or 2B+
     auto const push_pop_it = thrust::make_transform_iterator(
-      tokens.begin(), [] __device__(PdaTokenT const token) -> size_type {
+      tokens.begin(),
+      cuda::proclaim_return_type<cudf::size_type>([] __device__(PdaTokenT const token) {
         int const is_begin = token == token_t::StructBegin or token == token_t::ListBegin;
         int const is_end   = token == token_t::StructEnd or token == token_t::ListEnd;
         return is_begin - is_end;
-      });
+      }));
     // copy_if only struct/list, stable sort by level,
     // corresponding node indices?,
     // then scatter to node_range_end for struct/list end.

From c09b776fcf8b4595de492ade7eff63129a08170b Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 10 Jan 2024 06:27:28 +0530
Subject: [PATCH 12/21] remove debug prints

---
 cpp/src/io/json/json_column.cu | 37 ----------------------------
 cpp/src/io/json/json_tree.cu   | 45 +---------------------------------
 2 files changed, 1 insertion(+), 81 deletions(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 9d51dee1d90..9e8a0a6bf9e 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -434,8 +434,6 @@ void make_device_json_column(device_span<SymbolT const> input,
 {
   CUDF_FUNC_RANGE();
   auto num_nodes = col_ids.size();
-  // TODO think about replacing all col_ids which are children of string column to ignore? (useful
-  // to reduce unique_col_id count for map types).
   rmm::device_uvector<NodeIndexT> sorted_col_ids(col_ids.size(), stream);  // make a copy
   thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin());
 
@@ -479,8 +477,6 @@ void make_device_json_column(device_span<SymbolT const> input,
   auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream);
   std::vector<std::string> column_names = copy_strings_to_host(
     input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream);
-  // for(auto str: column_names) std::cout<<str<<"---\n";
-  // TODO preprocess input schema and try to avoid copying map type's field names by skipping them
   // array of arrays column names
   if (is_array_of_arrays) {
     TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2;
@@ -562,10 +558,6 @@ void make_device_json_column(device_span<SymbolT const> input,
   std::vector<NodeIndexT> remapped_col_id(num_columns, -1);
   columns.try_emplace(parent_node_sentinel, std::ref(root));
 
-  // TODO for map types support
-  // TODO go through input schema, and force string columns to be string.
-  // ignore their children too during below processing.
-
   for (auto const this_col_id : unique_col_ids) {
     if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) {
       continue;
@@ -584,11 +576,9 @@ void make_device_json_column(device_span<SymbolT const> input,
       parent_col_id          = column_parent_ids[parent_col_id];
       name                   = column_names[field_name_col_id];
     } else {
-      std::cout << "col_id:" << this_col_id << ", pcid:" << parent_col_id << "\n\n\n";
       CUDF_FAIL("Unexpected parent column category");
     }
 
-    print_vec(is_mixed_string_column, "is_mixed_string_column", to_int);
     if (parent_col_id != parent_node_sentinel && is_mixed_string_column[parent_col_id] == 1) {
       // if parent is mixed string column, ignore this column.
       is_mixed_string_column[this_col_id] = 1;
@@ -599,8 +589,6 @@ void make_device_json_column(device_span<SymbolT const> input,
     // replace if this column is a nested column and the existing was a value column
     // ignore this column if this column is a value column and the existing was a nested column
     auto it = columns.find(parent_col_id);
-    // if(it == columns.end())
-    //   std::cout<<"col_id:"<<this_col_id<<", pcid:"<<parent_col_id<<"\n\n\n";
     CUDF_EXPECTS(it != columns.end(), "Parent column not found");
     auto& parent_col = it->second.get();
     bool replaced    = false;
@@ -668,19 +656,6 @@ void make_device_json_column(device_span<SymbolT const> input,
     columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name)));
     mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id);
   }
-  // debug prints
-  for (auto i = 0ul; i < num_columns; i++)
-    printf("%3lu ", i);
-  printf(" col_id\n");
-  print_vec(column_categories, "column_categories", to_cat);
-  print_vec(ignore_vals, "ignore_vals", to_int);
-  print_vec(is_mixed_string_column, "is_mixed_string_column", to_int);
-  for (auto i = 0ul; i < num_columns; i++)
-    printf("%3lu ", columns.count(i));
-  printf(" columns\n");
-  for (auto const& [key, value] : mapped_columns) {
-    std::cout << key.first << "+" << key.second << ":" << value << "\n";
-  }
 
   if (is_mixed_type_as_string_enabled) {
     // ignore all children of mixed type columns
@@ -719,9 +694,6 @@ void make_device_json_column(device_span<SymbolT const> input,
                                             static_cast<bitmask_type*>(col.validity.data())};
   }
 
-  // print_vec(column_categories, "column_categories", to_cat);
-  // print_vec(is_mixed_string_column, "is_mixed_string_column", to_int);
-  // print_vec(ignore_vals, "ignore_vals", to_int);
   auto d_ignore_vals = cudf::detail::make_device_uvector_async(
     ignore_vals, stream, rmm::mr::get_current_device_resource());
   auto d_columns_data = cudf::detail::make_device_uvector_async(
@@ -756,7 +728,6 @@ void make_device_json_column(device_span<SymbolT const> input,
         default: break;
       }
     });
-  std::cout << "after str for_each_n\n";
 
   // 4. scatter List offset
   // copy_if only node's whose parent is list, (node_id, parent_col_id)
@@ -793,7 +764,6 @@ void make_device_json_column(device_span<SymbolT const> input,
              (!d_ignore_vals[col_ids[parent_node_id]]);
       // node_categories[parent_node_id] == NC_LIST;
     });
-  std::cout << "after copy_if\n";
 
   auto const num_list_children =
     list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin());
@@ -801,8 +771,6 @@ void make_device_json_column(device_span<SymbolT const> input,
                              parent_col_ids.begin(),
                              parent_col_ids.begin() + num_list_children,
                              node_ids.begin());
-  std::cout << "after stable_sort_by_key\n";
-  std::cout << num_list_children << "\n";
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<size_type>(0),
@@ -826,7 +794,6 @@ void make_device_json_column(device_span<SymbolT const> input,
           row_offsets[node_id] + 1;
       }
     });
-  std::cout << "after list for_each_n\n";
 
   // 5. scan on offsets.
   for (auto& [id, col_ref] : columns) {
@@ -1032,7 +999,6 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                                    stream,
                                    rmm::mr::get_current_device_resource());
   }();  // IILE used to free memory of token data.
-#define NJP_DEBUG_PRINT
 #ifdef NJP_DEBUG_PRINT
   auto h_input = cudf::detail::make_host_vector_async(d_input, stream);
   print_tree(h_input, gpu_tree, stream);
@@ -1059,9 +1025,6 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                                   stream,
                                   rmm::mr::get_current_device_resource());
 
-  print_vec(cudf::detail::make_std_vector_async(gpu_col_id, stream), "gpu_col_id", to_int);
-  print_vec(
-    cudf::detail::make_std_vector_async(gpu_row_offsets, stream), "gpu_row_offsets", to_int);
   device_json_column root_column(stream, mr);
   root_column.type = json_col_t::ListColumn;
   root_column.child_offsets.resize(2, stream);
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index 1c6b60bfbf1..a298b51f55c 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -60,38 +60,8 @@
 #include <limits>
 
 namespace cudf::io::json {
-// Debug print helpers
-[[maybe_unused]] auto to_token_str = [](PdaTokenT token) -> std::string {
-  switch (token) {
-    case token_t::StructBegin: return " {";
-    case token_t::StructEnd: return " }";
-    case token_t::ListBegin: return " [";
-    case token_t::ListEnd: return " ]";
-    case token_t::FieldNameBegin: return "FB";
-    case token_t::FieldNameEnd: return "FE";
-    case token_t::StringBegin: return "SB";
-    case token_t::StringEnd: return "SE";
-    case token_t::ErrorBegin: return "er";
-    case token_t::ValueBegin: return "VB";
-    case token_t::ValueEnd: return "VE";
-    case token_t::StructMemberBegin: return " <";
-    case token_t::StructMemberEnd: return " >";
-    case token_t::LineEnd: return ";";
-    default: return ".";
-  }
-};
-auto to_int    = [](auto v) { return std::to_string(static_cast<int>(v)); };
-auto print_vec = [](auto const& cpu, auto const name, auto converter) {
-  for (auto const& v : cpu)
-    printf("%3s,", converter(v).c_str());
-  std::cout << name << std::endl;
-};
 namespace detail {
 
-void print_tree(host_span<SymbolT const> input,
-                tree_meta_t const& d_gpu_tree,
-                rmm::cuda_stream_view stream);
-
 // The node that a token represents
 struct token_to_node {
   __device__ auto operator()(PdaTokenT const token) -> NodeT
@@ -464,19 +434,10 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
                                is_nested,
                                stream);
 
-    print_vec(cudf::detail::make_std_vector_async(token_levels, stream), "ntoken_levels", to_int);
-
     thrust::exclusive_scan(
       rmm::exec_policy(stream), token_levels.begin(), token_levels.end(), token_levels.begin());
 
-    print_vec(cudf::detail::make_std_vector_async(token_levels, stream), "ntoken_levels", to_int);
-
-    rmm::device_uvector<TreeDepthT> ntokens(num_nested, stream);
-    cudf::detail::copy_if_safe(
-      tokens.begin(), tokens.end(), tokens.begin(), ntokens.begin(), is_nested, stream);
-    print_vec(cudf::detail::make_std_vector_async(ntokens, stream), "ntokens", to_token_str);
-    print_vec(cudf::detail::make_std_vector_async(token_id, stream), "ntoken_id", to_int);
-    //
+    // Get parent of first child of struct/list begin.
     auto const first_childs_parent_token_id2 =
       [tokens_gpu = tokens.begin(), token_id = token_id.begin()] __device__(auto i) -> NodeIndexT {
       if (i <= 0) { return -1; }
@@ -509,15 +470,11 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
         // parent_node_sentinel is -1, useful for segmented max operation below
       });
 
-    print_vec(
-      cudf::detail::make_std_vector_async(parent_node_ids, stream), "nparent_node_ids", to_int);
     // propagate to siblings.
     propagate_parent_to_siblings(
       cudf::device_span<TreeDepthT const>{token_levels.data(), token_levels.size()},
       parent_node_ids,
       stream);
-    print_vec(
-      cudf::detail::make_std_vector_async(parent_node_ids, stream), "nparent_node_ids", to_int);
 
     // scatter to node_range_end for all nested end tokens. (if it's end)
     auto token_indices_it =

From 5837ca399552606955fe5eeeac8b8cf67071289a Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 10 Jan 2024 06:28:07 +0530
Subject: [PATCH 13/21] cleanup code and comments

---
 cpp/src/io/json/json_column.cu | 45 +++++++++++-----------------------
 cpp/src/io/json/json_tree.cu   | 30 +++++++++++------------
 2 files changed, 28 insertions(+), 47 deletions(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 9e8a0a6bf9e..2c6c655b72c 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -555,7 +555,6 @@ void make_device_json_column(device_span<SymbolT const> input,
   // find column_ids which are values, but should be ignored in validity
   std::vector<uint8_t> ignore_vals(num_columns, 0);
   std::vector<uint8_t> is_mixed_string_column(num_columns, 0);
-  std::vector<NodeIndexT> remapped_col_id(num_columns, -1);
   columns.try_emplace(parent_node_sentinel, std::ref(root));
 
   for (auto const this_col_id : unique_col_ids) {
@@ -593,36 +592,24 @@ void make_device_json_column(device_span<SymbolT const> input,
     auto& parent_col = it->second.get();
     bool replaced    = false;
     if (mapped_columns.count({parent_col_id, name}) > 0) {
-      /**/
-      // TODO if mixed type is enabled.
-      // make both of them as str, merge them how?
-      // all its child columns should be ignored from parsing. (is adding to ignore_vals enough?)
-      // is key_value column going to slow anyway? because of host copy?
+      // If mixed type is enabled, make both of them as str, merge them.
+      // all its child columns will be ignored from parsing.
       if (is_mixed_type_as_string_enabled) {
         // VAL/STR or STRUCT or LIST
+        auto old_col_id = mapped_columns[{parent_col_id, name}];
+
         is_mixed_string_column[this_col_id] = 1;
-        auto old_col_id                     = mapped_columns[{parent_col_id, name}];
-        remapped_col_id[this_col_id]        = old_col_id;
-        // if old col type (not cat) is string/val, keep it.
-        // else replace with string.
+        is_mixed_string_column[old_col_id]  = 1;
+        // if old col type (not cat) is not string/val, replace with string.
         auto& col = columns.at(old_col_id).get();
         if (col.type != json_col_t::StringColumn) {
           // TODO: old_col_id or this_col_id ? affects max_rowoffsets, need more tests.
           reinitialize_as_string(old_col_id, col);
-          // all its children (which are already inserted) are ignored below.
+          // all its children (which are already inserted) are ignored later.
         }
-        is_mixed_string_column[old_col_id] = 1;
         columns.try_emplace(this_col_id, columns.at(old_col_id));
         continue;
       }
-      // old new new
-      // VAL SCT LST
-      // VAL LST SCT
-      // SCT LST VAL
-      // SCT VAL LST
-      // LST VAL SCT
-      // LST SCT VAL
-      /**/
 
       if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) {
         ignore_vals[this_col_id] = 1;
@@ -666,7 +653,7 @@ void make_device_json_column(device_span<SymbolT const> input,
         ignore_vals[this_col_id]            = 1;
         columns.erase(this_col_id);
       }
-      // Convert only mixed type columns as string (so to copy)
+      // Convert only mixed type columns as string (so to copy), but not its children
       if (parent_col_id != parent_node_sentinel and is_mixed_string_column[parent_col_id] == 0 and
           is_mixed_string_column[this_col_id] == 1)
         column_categories[this_col_id] = NC_STR;
@@ -686,8 +673,7 @@ void make_device_json_column(device_span<SymbolT const> input,
   std::vector<json_column_data> columns_data(num_columns);
   for (auto& [col_id, col_ref] : columns) {
     if (col_id == parent_node_sentinel) continue;
-    auto& col = col_ref.get();
-    // if(ignore_vals[col_id]) continue;
+    auto& col            = col_ref.get();
     columns_data[col_id] = json_column_data{col.string_offsets.data(),
                                             col.string_lengths.data(),
                                             col.child_offsets.data(),
@@ -704,8 +690,7 @@ void make_device_json_column(device_span<SymbolT const> input,
     rmm::exec_policy(stream),
     thrust::counting_iterator<size_type>(0),
     num_nodes,
-    [node_categories   = tree.node_categories.begin(),
-     column_categories = d_column_tree.node_categories.begin(),
+    [column_categories = d_column_tree.node_categories.begin(),
      col_ids           = col_ids.begin(),
      row_offsets       = row_offsets.begin(),
      range_begin       = tree.node_range_begin.begin(),
@@ -714,7 +699,6 @@ void make_device_json_column(device_span<SymbolT const> input,
      d_columns_data    = d_columns_data.begin()] __device__(size_type i) {
       if (d_ignore_vals[col_ids[i]]) return;
       auto const node_category = column_categories[col_ids[i]];
-      // switch (node_categories[i]) {
       switch (node_category) {
         case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
         case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
@@ -753,11 +737,10 @@ void make_device_json_column(device_span<SymbolT const> input,
       num_nodes,
     thrust::make_counting_iterator<size_type>(0),
     thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()),
-    [  // node_categories = tree.node_categories.begin(),
-      d_ignore_vals     = d_ignore_vals.begin(),
-      parent_node_ids   = tree.parent_node_ids.begin(),
-      column_categories = d_column_tree.node_categories.begin(),
-      col_ids           = col_ids.begin()] __device__(size_type node_id) {
+    [d_ignore_vals     = d_ignore_vals.begin(),
+     parent_node_ids   = tree.parent_node_ids.begin(),
+     column_categories = d_column_tree.node_categories.begin(),
+     col_ids           = col_ids.begin()] __device__(size_type node_id) {
       auto parent_node_id = parent_node_ids[node_id];
       return parent_node_id != parent_node_sentinel and
              column_categories[col_ids[parent_node_id]] == NC_LIST and
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index a298b51f55c..9391ec23c13 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -389,14 +389,14 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
     stream);
   CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch");
 
-  // Extract Struct, List range_end
+  // Extract Struct, List range_end:
   // 1. Extract Struct, List - begin & end separately, their token ids
   // 2. push, pop to get levels
   // 3. copy first child's parent token_id, also translate to node_id
   // 4. propagate to siblings using levels, parent token id. (segmented scan)
-  // 5. scatter to node_range_end for all nested end tokens. (if it's end)
+  // 5. scatter to node_range_end for only nested end tokens.
   if (is_strict_nested_boundaries) {
-    // Whether the token pushes onto the parent node stack
+    // Whether the token is nested
     auto const is_nested = [] __device__(PdaTokenT const token) -> bool {
       switch (token) {
         case token_t::StructBegin:
@@ -409,8 +409,8 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
     auto const num_nested =
       thrust::count_if(rmm::exec_policy(stream), tokens.begin(), tokens.end(), is_nested);
     rmm::device_uvector<TreeDepthT> token_levels(num_nested, stream);
-    rmm::device_uvector<NodeIndexT> token_id(num_nested, stream);         // 4B*2=8B, or 2B+
-    rmm::device_uvector<NodeIndexT> parent_node_ids(num_nested, stream);  // 4B*2=8B, or 2B+
+    rmm::device_uvector<NodeIndexT> token_id(num_nested, stream);
+    rmm::device_uvector<NodeIndexT> parent_node_ids(num_nested, stream);
     auto const push_pop_it = thrust::make_transform_iterator(
       tokens.begin(),
       cuda::proclaim_return_type<cudf::size_type>([] __device__(PdaTokenT const token) {
@@ -418,9 +418,7 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
         int const is_end   = token == token_t::StructEnd or token == token_t::ListEnd;
         return is_begin - is_end;
       }));
-    // copy_if only struct/list, stable sort by level,
-    // corresponding node indices?,
-    // then scatter to node_range_end for struct/list end.
+    // copy_if only struct/list's token levels, token ids, tokens.
     cudf::detail::copy_if_safe(push_pop_it,
                                push_pop_it + num_tokens,
                                tokens.begin(),
@@ -450,10 +448,8 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
     };
 
     // copied L+S tokens, and their token ids, their token levels.
-    // first child parent token ids
-    // propagate to siblings
-    // parent token id for all ends -> similar binary search here to find its node id.
-    // scatter to that location.
+    // initialize first child parent token ids
+    // translate token ids to node id using similar binary search.
     thrust::transform(
       rmm::exec_policy(stream),
       thrust::make_counting_iterator<NodeIndexT>(0),
@@ -476,12 +472,14 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
       parent_node_ids,
       stream);
 
-    // scatter to node_range_end for all nested end tokens. (if it's end)
+    // scatter to node_range_end for only nested end tokens.
     auto token_indices_it =
       thrust::make_permutation_iterator(token_indices.begin(), token_id.begin());
-    // add +1 to include end symbol.
-    auto nested_node_range_end_it = thrust::make_transform_output_iterator(
-      node_range_end.begin(), [] __device__(auto i) { return i + 1; });
+    auto nested_node_range_end_it =
+      thrust::make_transform_output_iterator(node_range_end.begin(), [] __device__(auto i) {
+        // add +1 to include end symbol.
+        return i + 1;
+      });
     auto stencil = thrust::make_transform_iterator(token_id.begin(), is_nested_end{tokens.begin()});
     thrust::scatter_if(rmm::exec_policy(stream),
                        token_indices_it,

From 2ba5e9b77b164ff1061d0b913517ca263a84df47 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Thu, 11 Jan 2024 15:05:06 +0530
Subject: [PATCH 14/21] testcase when the MixedTypesAsStrings feature is
 disabled

Co-authored-by: Andy Grove <andygrove73@gmail.com>
---
 java/src/test/java/ai/rapids/cudf/TableTest.java | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 99fb5532332..119169b6458 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -328,7 +328,18 @@ void testReadJSONFile() {
       assertTablesAreEqual(expected, table);
     }
   }
-
+  @Test
+  void testReadMixedType2JSONFileFeatureDisabled() {
+    Schema schema = Schema.builder()
+            .column(DType.STRING, "a")
+            .build();
+    JSONOptions opts = JSONOptions.builder()
+            .withLines(true)
+            .withMixedTypesAsStrings(false)
+            .build();
+    assertThrows(CudfException.class, () ->
+      Table.readJSON(schema, opts, TEST_MIXED_TYPE_2_JSON));
+  }
   @Test
   void testReadMixedType1JSONFile() {
     Schema schema = Schema.builder()

From a3c1fe2b0882a245acc8937c57a805eab95d34a5 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 11 Jan 2024 17:21:59 +0530
Subject: [PATCH 15/21] update mixed string, enable test for data source json

---
 java/src/main/native/src/TableJni.cpp            | 3 ++-
 java/src/test/java/ai/rapids/cudf/TableTest.java | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index c43a99a58d5..85780589afb 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1506,7 +1506,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
     cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source)
                                                      .dayfirst(static_cast<bool>(day_first))
                                                      .lines(static_cast<bool>(lines))
-                                                     .recovery_mode(recovery_mode);
+                                                     .recovery_mode(recovery_mode)
+                                                     .mixed_types_as_string(mixed_types_as_string);
 
     if (!n_col_names.is_null() && data_types.size() > 0) {
       if (n_col_names.size() != n_types.size()) {
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 119169b6458..f98476d9786 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -358,7 +358,7 @@ void testReadMixedType1JSONFile() {
   }
 
   @Test
-  void testReadMixedType2JSONFile() {
+  void testReadMixedType2JSONFile() throws IOException {
     Schema schema = Schema.builder()
             .column(DType.STRING, "a")
             .build();
@@ -369,7 +369,8 @@ void testReadMixedType2JSONFile() {
     try (Table expected = new Table.TestBuilder()
             .column("[1,2,3]", "{ \"b\": 1 }" )
             .build();
-         Table table = Table.readJSON(schema, opts, TEST_MIXED_TYPE_2_JSON)) {
+         MultiBufferDataSource source = sourceFrom(TEST_MIXED_TYPE_2_JSON);
+         Table table = Table.readJSON(schema, opts, source)) {
       assertTablesAreEqual(expected, table);
     }
   }

From e4da81e2b78c6f64f064938d0c8eeed553076d1d Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Thu, 11 Jan 2024 21:31:13 +0530
Subject: [PATCH 16/21] add line to separate tests

Co-authored-by: Andy Grove <andygrove73@gmail.com>
---
 java/src/test/java/ai/rapids/cudf/TableTest.java | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index f98476d9786..73002644858 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -328,6 +328,7 @@ void testReadJSONFile() {
       assertTablesAreEqual(expected, table);
     }
   }
+
   @Test
   void testReadMixedType2JSONFileFeatureDisabled() {
     Schema schema = Schema.builder()
@@ -340,6 +341,7 @@ void testReadMixedType2JSONFileFeatureDisabled() {
     assertThrows(CudfException.class, () ->
       Table.readJSON(schema, opts, TEST_MIXED_TYPE_2_JSON));
   }
+
   @Test
   void testReadMixedType1JSONFile() {
     Schema schema = Schema.builder()

From df4eb7d31c5c7a4640bde90801e7032308c14401 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Sat, 13 Jan 2024 03:16:01 +0530
Subject: [PATCH 17/21] addressed review comments (@elstehle)

---
 cpp/src/io/json/json_tree.cu | 37 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index 9391ec23c13..7ef3707332b 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -414,34 +414,27 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
     auto const push_pop_it = thrust::make_transform_iterator(
       tokens.begin(),
       cuda::proclaim_return_type<cudf::size_type>([] __device__(PdaTokenT const token) {
-        int const is_begin = token == token_t::StructBegin or token == token_t::ListBegin;
-        int const is_end   = token == token_t::StructEnd or token == token_t::ListEnd;
+        size_type const is_begin = token == token_t::StructBegin or token == token_t::ListBegin;
+        size_type const is_end   = token == token_t::StructEnd or token == token_t::ListEnd;
         return is_begin - is_end;
       }));
     // copy_if only struct/list's token levels, token ids, tokens.
-    cudf::detail::copy_if_safe(push_pop_it,
-                               push_pop_it + num_tokens,
-                               tokens.begin(),
-                               token_levels.begin(),
-                               is_nested,
-                               stream);
-    cudf::detail::copy_if_safe(thrust::make_counting_iterator<NodeIndexT>(0),
-                               thrust::make_counting_iterator<NodeIndexT>(0) + num_tokens,
-                               tokens.begin(),
-                               token_id.begin(),
-                               is_nested,
-                               stream);
+    auto zipped_in_it =
+      thrust::make_zip_iterator(push_pop_it, thrust::make_counting_iterator<NodeIndexT>(0));
+    auto zipped_out_it = thrust::make_zip_iterator(token_levels.begin(), token_id.begin());
+    cudf::detail::copy_if_safe(
+      zipped_in_it, zipped_in_it + num_tokens, tokens.begin(), zipped_out_it, is_nested, stream);
 
     thrust::exclusive_scan(
       rmm::exec_policy(stream), token_levels.begin(), token_levels.end(), token_levels.begin());
 
     // Get parent of first child of struct/list begin.
-    auto const first_childs_parent_token_id2 =
+    auto const nested_first_childs_parent_token_id =
       [tokens_gpu = tokens.begin(), token_id = token_id.begin()] __device__(auto i) -> NodeIndexT {
       if (i <= 0) { return -1; }
-      auto id = token_id[i - 1];  // token indices.
+      auto id = token_id[i - 1];  // current token's predecessor
       if (tokens_gpu[id] == token_t::StructBegin or tokens_gpu[id] == token_t::ListBegin) {
-        return token_id[i - 1];
+        return id;
       } else {
         return -1;
       }
@@ -455,10 +448,12 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
       thrust::make_counting_iterator<NodeIndexT>(0),
       thrust::make_counting_iterator<NodeIndexT>(0) + num_nested,
       parent_node_ids.begin(),
-      [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id2] __device__(
-        NodeIndexT const tid) -> NodeIndexT {
-        auto const pid = first_childs_parent_token_id2(tid);
-        // return pid;
+      [node_ids_gpu = node_token_ids.begin(),
+       num_nodes,
+       nested_first_childs_parent_token_id] __device__(NodeIndexT const tid) -> NodeIndexT {
+        auto const pid = nested_first_childs_parent_token_id(tid);
+        // Token ids that are converted to nodes are stored in node_ids_gpu in order,
+        // so the index of a token id in node_ids_gpu is its node index.
         return pid < 0
                  ? parent_node_sentinel
                  : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) -

From 6ddac345f988096f9378f511e9f1734829758258 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Tue, 16 Jan 2024 20:13:00 +0530
Subject: [PATCH 18/21] Apply suggestions from code review

Co-authored-by: Elias Stehle <3958403+elstehle@users.noreply.github.com>
---
 cpp/src/io/json/json_column.cu | 1 -
 cpp/src/io/json/json_tree.cu   | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 2c6c655b72c..11683eb0586 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -745,7 +745,6 @@ void make_device_json_column(device_span<SymbolT const> input,
       return parent_node_id != parent_node_sentinel and
              column_categories[col_ids[parent_node_id]] == NC_LIST and
              (!d_ignore_vals[col_ids[parent_node_id]]);
-      // node_categories[parent_node_id] == NC_LIST;
     });
 
   auto const num_list_children =
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index 7ef3707332b..82a60327a3b 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -135,7 +135,7 @@ struct node_ranges {
 };
 
 struct is_nested_end {
-  SymbolT const* tokens;
+  PdaTokenT const* tokens;
   __device__ auto operator()(NodeIndexT i) -> bool
   {
     return tokens[i] == token_t::StructEnd or tokens[i] == token_t::ListEnd;

From ea40c558d49a9e74ee3a06580095230a010efc7c Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 19 Jan 2024 01:45:28 +0530
Subject: [PATCH 19/21] address review comments (bdice), add test cases for max
 row offset test

---
 cpp/include/cudf/io/json.hpp                  | 14 ++++-----
 cpp/src/io/json/json_column.cu                | 27 ++++++++---------
 cpp/src/io/json/json_tree.cu                  | 14 +++++----
 cpp/tests/io/json_test.cpp                    | 30 +++++++++++++++++--
 .../main/java/ai/rapids/cudf/JSONOptions.java |  2 +-
 5 files changed, 58 insertions(+), 29 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 67acee363da..2a39a539cc7 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -98,7 +98,7 @@ class json_reader_options {
 
   // Read the file as a json object per line
   bool _lines = false;
-  // Read the mixed types as string column
+  // Parse mixed types as a string column
   bool _mixed_types_as_string = false;
 
   // Bytes to skip from the start
@@ -228,9 +228,9 @@ class json_reader_options {
   bool is_enabled_lines() const { return _lines; }
 
   /**
-   * @brief Whether to read the mixed types as string column.
+   * @brief Whether to parse mixed types as a string column.
    *
-   * @return `true` if reading the mixed types as string column
+   * @return `true` if mixed types are parsed as a string column
    */
   bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; }
 
@@ -312,9 +312,9 @@ class json_reader_options {
   void enable_lines(bool val) { _lines = val; }
 
   /**
-   * @brief Set whether to read the mixed types as string column.
+   * @brief Set whether to parse mixed types as a string column.
    *
-   * @param val Boolean value to enable/disable the option to read the mixed types as string column
+   * @param val Boolean value to enable/disable parsing mixed types as a string column
    */
   void enable_mixed_types_as_string(bool val) { _mixed_types_as_string = val; }
 
@@ -454,9 +454,9 @@ class json_reader_options_builder {
   }
 
   /**
-   * @brief Set whether to read the mixed types as string column.
+   * @brief Set whether to parse mixed types as a string column.
    *
-   * @param val Boolean value to enable/disable the option to read the mixed types as string column
+   * @param val Boolean value to enable/disable parsing mixed types as a string column
    * @return this for chaining
    */
   json_reader_options_builder& mixed_types_as_string(bool val)
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 11683eb0586..65d2fb7f28b 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -554,7 +554,7 @@ void make_device_json_column(device_span<SymbolT const> input,
   std::map<std::pair<NodeIndexT, std::string>, NodeIndexT> mapped_columns;
   // find column_ids which are values, but should be ignored in validity
   std::vector<uint8_t> ignore_vals(num_columns, 0);
-  std::vector<uint8_t> is_mixed_string_column(num_columns, 0);
+  std::vector<uint8_t> is_mixed_type_column(num_columns, 0);
   columns.try_emplace(parent_node_sentinel, std::ref(root));
 
   for (auto const this_col_id : unique_col_ids) {
@@ -578,10 +578,10 @@ void make_device_json_column(device_span<SymbolT const> input,
       CUDF_FAIL("Unexpected parent column category");
     }
 
-    if (parent_col_id != parent_node_sentinel && is_mixed_string_column[parent_col_id] == 1) {
+    if (parent_col_id != parent_node_sentinel && is_mixed_type_column[parent_col_id] == 1) {
       // if parent is mixed string column, ignore this column.
-      is_mixed_string_column[this_col_id] = 1;
-      ignore_vals[this_col_id]            = 1;
+      is_mixed_type_column[this_col_id] = 1;
+      ignore_vals[this_col_id]          = 1;
       continue;
     }
     // If the child is already found,
@@ -592,18 +592,17 @@ void make_device_json_column(device_span<SymbolT const> input,
     auto& parent_col = it->second.get();
     bool replaced    = false;
     if (mapped_columns.count({parent_col_id, name}) > 0) {
-      // If mixed type is enabled, make both of them as str, merge them.
-      // all its child columns will be ignored from parsing.
+      // If mixed type as string is enabled, make both of them strings and merge them.
+      // All child columns will be ignored when parsing.
       if (is_mixed_type_as_string_enabled) {
         // VAL/STR or STRUCT or LIST
         auto old_col_id = mapped_columns[{parent_col_id, name}];
 
-        is_mixed_string_column[this_col_id] = 1;
-        is_mixed_string_column[old_col_id]  = 1;
+        is_mixed_type_column[this_col_id] = 1;
+        is_mixed_type_column[old_col_id]  = 1;
         // if old col type (not cat) is not string/val, replace with string.
         auto& col = columns.at(old_col_id).get();
         if (col.type != json_col_t::StringColumn) {
-          // TODO: old_col_id or this_col_id ? affects max_rowoffsets, need more tests.
           reinitialize_as_string(old_col_id, col);
           // all its children (which are already inserted) are ignored later.
         }
@@ -648,14 +647,14 @@ void make_device_json_column(device_span<SymbolT const> input,
     // ignore all children of mixed type columns
     for (auto const this_col_id : unique_col_ids) {
       auto parent_col_id = column_parent_ids[this_col_id];
-      if (parent_col_id != parent_node_sentinel and is_mixed_string_column[parent_col_id] == 1) {
-        is_mixed_string_column[this_col_id] = 1;
-        ignore_vals[this_col_id]            = 1;
+      if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) {
+        is_mixed_type_column[this_col_id] = 1;
+        ignore_vals[this_col_id]          = 1;
         columns.erase(this_col_id);
       }
       // Convert only mixed type columns as string (so to copy), but not its children
-      if (parent_col_id != parent_node_sentinel and is_mixed_string_column[parent_col_id] == 0 and
-          is_mixed_string_column[this_col_id] == 1)
+      if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and
+          is_mixed_type_column[this_col_id] == 1)
         column_categories[this_col_id] = NC_STR;
     }
     cudaMemcpyAsync(d_column_tree.node_categories.begin(),
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index 82a60327a3b..01c7b869217 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -413,11 +413,15 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
     rmm::device_uvector<NodeIndexT> parent_node_ids(num_nested, stream);
     auto const push_pop_it = thrust::make_transform_iterator(
       tokens.begin(),
-      cuda::proclaim_return_type<cudf::size_type>([] __device__(PdaTokenT const token) {
-        size_type const is_begin = token == token_t::StructBegin or token == token_t::ListBegin;
-        size_type const is_end   = token == token_t::StructEnd or token == token_t::ListEnd;
-        return is_begin - is_end;
-      }));
+      cuda::proclaim_return_type<cudf::size_type>(
+        [] __device__(PdaTokenT const token) -> size_type {
+          if (token == token_t::StructBegin or token == token_t::ListBegin) {
+            return 1;
+          } else if (token == token_t::StructEnd or token == token_t::ListEnd) {
+            return -1;
+          }
+          return 0;
+        }));
     // copy_if only struct/list's token levels, token ids, tokens.
     auto zipped_in_it =
       thrust::make_zip_iterator(push_pop_it, thrust::make_counting_iterator<NodeIndexT>(0));
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index e0de52c3870..22c2f0de924 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -18,7 +18,6 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/debug_utilities.hpp>
 #include <cudf_test/default_stream.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/random.hpp>
@@ -2133,8 +2132,35 @@ TEST_F(JsonReaderTest, MixedTypes)
         .lines(true);
 
     cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
-    cudf::test::print(result.tbl->get_column(0));
   }
+
+  // Test to confirm whether reinitializing a non-string column as a string affects
+  // max_rowoffsets. max_rowoffsets is generated based on the parent col id, so even if
+  // mixed types are present, their row offsets will be correct.
+  using LCW     = cudf::test::lists_column_wrapper<cudf::string_view>;
+  using valid_t = std::vector<cudf::valid_type>;
+
+  cudf::test::lists_column_wrapper expected_list{
+    {
+      cudf::test::lists_column_wrapper({LCW({"1", "2", "3"}), LCW({"4", "5", "6"})}),
+      cudf::test::lists_column_wrapper({LCW()}),
+      cudf::test::lists_column_wrapper({LCW()}),  // null
+      cudf::test::lists_column_wrapper({LCW()}),  // null
+      cudf::test::lists_column_wrapper({LCW({"{\"c\": -1}"}), LCW({"5"})}),
+      cudf::test::lists_column_wrapper({LCW({"7"}), LCW({"8", "9"})}),
+      cudf::test::lists_column_wrapper({LCW()}),  // null
+    },
+    valid_t{1, 1, 0, 0, 1, 1, 0}.begin()};
+  test_fn(R"(
+{"b": [ [1, 2, 3], [ 4, 5, 6] ]}
+{"b": [[]]}
+{}
+{}
+{"b": [ [ {"c": -1} ], [ 5 ] ]}
+{"b": [ [7], [8, 9]]}
+{}
+)",
+          expected_list);
 }
 
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index ae964f897f9..523d594f8ba 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -113,7 +113,7 @@ public Builder withRecoverWithNull(boolean recoverWithNull) {
      * Specify how to handle columns that contain mixed types.
      *
      * @param mixedTypesAsStrings true: return unparsed JSON, false: throw exception
-     * @@return builder for chaining
+     * @return builder for chaining
      */
     public Builder withMixedTypesAsStrings(boolean mixedTypesAsStrings) {
       this.mixedTypesAsStrings = mixedTypesAsStrings;

From 49058081a854b01e1ada44e829c8283ab3acc961 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 19 Jan 2024 08:53:50 +0530
Subject: [PATCH 20/21] renaming arguments

---
 cpp/src/io/json/json_column.cu | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 09440e81ceb..c61cc533a81 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -417,6 +417,7 @@ struct json_column_data {
  * @param root Root node of the `d_json_column` tree
  * @param is_array_of_arrays Whether the tree is an array of arrays
  * @param is_enabled_lines Whether the input is a line-delimited JSON
+ * @param is_enabled_mixed_types_as_string Whether to enable reading mixed types as string
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the device memory
  * of child_offets and validity members of `d_json_column`
@@ -428,7 +429,7 @@ void make_device_json_column(device_span<SymbolT const> input,
                              device_json_column& root,
                              bool is_array_of_arrays,
                              bool is_enabled_lines,
-                             bool is_mixed_type_as_string_enabled,
+                             bool is_enabled_mixed_types_as_string,
                              rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
 {
@@ -579,7 +580,7 @@ void make_device_json_column(device_span<SymbolT const> input,
     }
 
     if (parent_col_id != parent_node_sentinel && is_mixed_type_column[parent_col_id] == 1) {
-      // if parent is mixed string column, ignore this column.
+      // if parent is mixed type column, ignore this column.
       is_mixed_type_column[this_col_id] = 1;
       ignore_vals[this_col_id]          = 1;
       continue;
@@ -594,7 +595,7 @@ void make_device_json_column(device_span<SymbolT const> input,
     if (mapped_columns.count({parent_col_id, name}) > 0) {
       // If mixed type as string is enabled, make both of them strings and merge them.
       // All child columns will be ignored when parsing.
-      if (is_mixed_type_as_string_enabled) {
+      if (is_enabled_mixed_types_as_string) {
         // VAL/STR or STRUCT or LIST
         auto old_col_id = mapped_columns[{parent_col_id, name}];
 
@@ -643,7 +644,7 @@ void make_device_json_column(device_span<SymbolT const> input,
     mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id);
   }
 
-  if (is_mixed_type_as_string_enabled) {
+  if (is_enabled_mixed_types_as_string) {
     // ignore all children of mixed type columns
     for (auto const this_col_id : unique_col_ids) {
       auto parent_col_id = column_parent_ids[this_col_id];

From d2e06911381d00755bc6ccb148623bb5aa1ea5f2 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 22 Jan 2024 18:00:36 +0530
Subject: [PATCH 21/21] address review comments, rename, fix reinit condition

---
 cpp/src/io/json/json_column.cu |  4 ++--
 cpp/src/io/json/json_tree.cu   | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index c61cc533a81..b1dc2c9dd7f 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -601,9 +601,9 @@ void make_device_json_column(device_span<SymbolT const> input,
 
         is_mixed_type_column[this_col_id] = 1;
         is_mixed_type_column[old_col_id]  = 1;
-        // if old col type (not cat) is not string/val, replace with string.
+        // if old col type (not cat) is list or struct, replace with string.
         auto& col = columns.at(old_col_id).get();
-        if (col.type != json_col_t::StringColumn) {
+        if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) {
           reinitialize_as_string(old_col_id, col);
           // all its children (which are already inserted) are ignored later.
         }
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index 01c7b869217..275907c19c9 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -192,16 +192,16 @@ std::pair<rmm::device_uvector<KeyType>, rmm::device_uvector<IndexType>> stable_s
 }
 
 /**
- * @brief Propagate parent node to siblings from first sibling.
+ * @brief Propagate parent node from first sibling to other siblings.
  *
  * @param node_levels Node levels of each node
  * @param parent_node_ids parent node ids initialized for first child of each push node,
  *                       and other siblings are initialized to -1.
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
-void propagate_parent_to_siblings(cudf::device_span<TreeDepthT const> node_levels,
-                                  cudf::device_span<NodeIndexT> parent_node_ids,
-                                  rmm::cuda_stream_view stream)
+void propagate_first_sibling_to_other(cudf::device_span<TreeDepthT const> node_levels,
+                                      cudf::device_span<NodeIndexT> parent_node_ids,
+                                      rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
   auto [sorted_node_levels, sorted_order] = stable_sorted_key_order<size_type>(node_levels, stream);
@@ -354,7 +354,7 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
       });
   }
   // Propagate parent node to siblings from first sibling - inplace.
-  propagate_parent_to_siblings(
+  propagate_first_sibling_to_other(
     cudf::device_span<TreeDepthT const>{node_levels.data(), node_levels.size()},
     parent_node_ids,
     stream);
@@ -465,8 +465,8 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
         // parent_node_sentinel is -1, useful for segmented max operation below
       });
 
-    // propagate to siblings.
-    propagate_parent_to_siblings(
+    // propagate parent node from first sibling to other siblings - inplace.
+    propagate_first_sibling_to_other(
       cudf::device_span<TreeDepthT const>{token_levels.data(), token_levels.size()},
       parent_node_ids,
       stream);