From 49a73eb67dad6ffbbd3747f6e59888d256df4162 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 6 Dec 2023 00:02:48 +0530 Subject: [PATCH 01/21] Add mixed_types_as_string reader option --- cpp/include/cudf/io/json.hpp | 28 +++++++++++++++++++++++++++ python/cudf/cudf/_lib/cpp/io/json.pxd | 5 +++++ python/cudf/cudf/_lib/json.pyx | 4 +++- python/cudf/cudf/io/json.py | 2 ++ 4 files changed, 38 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 472d42b1db5..7738b15243d 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -98,6 +98,8 @@ class json_reader_options { // Read the file as a json object per line bool _lines = false; + // Read the mixed types as string column + bool _mixed_types_as_string = false; // Bytes to skip from the start size_t _byte_range_offset = 0; @@ -225,6 +227,13 @@ class json_reader_options { */ bool is_enabled_lines() const { return _lines; } + /** + * @brief Whether to read the mixed types as string column. + * + * @return `true` if reading the mixed types as string column + */ + bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; } + /** * @brief Whether to parse dates as DD/MM versus MM/DD. * @@ -302,6 +311,13 @@ class json_reader_options { */ void enable_lines(bool val) { _lines = val; } + /** + * @brief Set whether to read the mixed types as string column. + * + * @param val Boolean value to enable/disable the option to read the mixed types as string column + */ + void enable_mixed_types_as_string(bool val) { _mixed_types_as_string = val; } + /** * @brief Set whether to parse dates as DD/MM versus MM/DD. * @@ -437,6 +453,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether to read the mixed types as string column. + * + * @param val Boolean value to enable/disable the option to read the mixed types as string column + * @return this for chaining + */ + json_reader_options_builder& mixed_types_as_string(bool val) + { + options._mixed_types_as_string = val; + return *this; + } + /** * @brief Set whether to parse dates as DD/MM versus MM/DD. * diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index ad618cc4ed6..965a0b5bc23 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -27,6 +27,7 @@ cdef extern from "cudf/io/json.hpp" \ size_type get_byte_range_offset() except + size_type get_byte_range_size() except + bool is_enabled_lines() except + + bool is_enabled_mixed_types_as_string() except + bool is_enabled_dayfirst() except + bool is_enabled_experimental() except + @@ -39,6 +40,7 @@ cdef extern from "cudf/io/json.hpp" \ void set_byte_range_offset(size_type offset) except + void set_byte_range_size(size_type size) except + void enable_lines(bool val) except + + void enable_mixed_types_as_string(bool val) except + void enable_dayfirst(bool val) except + void enable_experimental(bool val) except + void enable_keep_quotes(bool val) except + @@ -74,6 +76,9 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& lines( bool val ) except + + json_reader_options_builder& mixed_types_as_string( + bool val + ) except + json_reader_options_builder& dayfirst( bool val ) except + diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 437c3ef6ec4..b6124fcbced 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -53,7 +53,8 @@ cpdef read_json(object filepaths_or_buffers, object compression, object byte_range, bool legacy, - bool keep_quotes): + bool keep_quotes, + bool mixed_types_as_string): """ Cython function to call into libcudf API, see `read_json`. @@ -131,6 +132,7 @@ cpdef read_json(object filepaths_or_buffers, opts.set_dtypes(c_dtypes_schema_map) opts.enable_keep_quotes(keep_quotes) + opts.enable_mixed_types_as_string(mixed_types_as_string) # Read JSON cdef cudf_io_types.table_with_metadata c_result diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index ae2f0203642..0088c59f8c3 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -25,6 +25,7 @@ def read_json( byte_range=None, keep_quotes=False, storage_options=None, + mixed_types_as_string=False, *args, **kwargs, ): @@ -116,6 +117,7 @@ def read_json( byte_range, engine == "cudf_legacy", keep_quotes, + mixed_types_as_string, ) else: warnings.warn( From 6bc0819b4c6c16be72e7d789e434ccfe1406501c Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 6 Dec 2023 00:13:48 +0530 Subject: [PATCH 02/21] Extract correct Struct, List node range end --- cpp/src/io/json/json_column.cu | 10 +++ cpp/src/io/json/json_tree.cu | 152 ++++++++++++++++++++++++++++++++- 2 files changed, 161 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 5d7fb9d6b43..fd97b325ebb 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -276,6 +276,16 @@ reduce_to_column_tree(tree_meta_t& tree, return is_non_list_parent(parent_col_id); }); + // For Struct and List (to avoid copying entire strings when mixed type as string is enabled) + thrust::transform_if( + rmm::exec_policy(stream), + col_range_begin.begin(), + col_range_begin.end(), + column_categories.begin(), + col_range_end.begin(), + [] __device__(auto i) { return i + 1; }, + [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; }); + return std::tuple{tree_meta_t{std::move(column_categories), std::move(parent_col_ids), std::move(column_levels), diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index da5b0eedfbd..1b0729b3052 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -58,8 +58,38 @@ #include namespace cudf::io::json { +// Debug print helpers +[[maybe_unused]] auto to_token_str = [](PdaTokenT token) -> std::string { + switch (token) { + case token_t::StructBegin: return " {"; + case token_t::StructEnd: return " }"; + case token_t::ListBegin: return " ["; + case token_t::ListEnd: return " ]"; + case token_t::FieldNameBegin: return "FB"; + case token_t::FieldNameEnd: return "FE"; + case token_t::StringBegin: return "SB"; + case token_t::StringEnd: return "SE"; + case token_t::ErrorBegin: return "er"; + case token_t::ValueBegin: return "VB"; + case token_t::ValueEnd: return "VE"; + case token_t::StructMemberBegin: return " <"; + case token_t::StructMemberEnd: return " >"; + case token_t::LineEnd: return ";"; + default: return "."; + } +}; +auto to_int = [](auto v) { return std::to_string(static_cast(v)); }; +auto print_vec = [](auto const& cpu, auto const name, auto converter) { + for (auto const& v : cpu) + printf("%3s,", converter(v).c_str()); + std::cout << name << std::endl; +}; namespace detail { +void print_tree(host_span input, + tree_meta_t const& d_gpu_tree, + rmm::cuda_stream_view stream); + // The node that a token represents struct token_to_node { __device__ auto operator()(PdaTokenT const token) -> NodeT @@ -132,6 +162,14 @@ struct node_ranges { } }; +struct is_nested_end { + SymbolT const* tokens; + __device__ auto operator()(NodeIndexT i) -> bool + { + return tokens[i] == token_t::StructEnd or tokens[i] == token_t::ListEnd; + } +}; + /** * @brief Returns stable sorted keys and its sorted order * @@ -293,9 +331,9 @@ tree_meta_t get_tree_representation(device_span tokens, // Node parent ids: // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. rmm::device_uvector parent_node_ids(num_nodes, stream, mr); + rmm::device_uvector node_token_ids(num_nodes, stream); // needed for SE, LE later // This block of code is generalized logical stack algorithm. TODO: make this a separate function. { - rmm::device_uvector node_token_ids(num_nodes, stream); cudf::detail::copy_if_safe(thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_tokens, tokens.begin(), @@ -376,6 +414,118 @@ tree_meta_t get_tree_representation(device_span tokens, stream); CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); + // TODO do this only if mixed type as string flag is enabled. + // Fixes here for struct, list nodes with correct range_end. How? + // Extract, struct, list - begin & end separately, then push, pop, levels, scan, similar propagate + // (segmented scan), then scatter. + { + // Whether the token pushes onto the parent node stack + auto const is_nested = [] __device__(PdaTokenT const token) -> bool { + switch (token) { + case token_t::StructBegin: + case token_t::StructEnd: + case token_t::ListBegin: + case token_t::ListEnd: return true; + default: return false; + }; + }; + auto const num_nested = + thrust::count_if(rmm::exec_policy(stream), tokens.begin(), tokens.end(), is_nested); + rmm::device_uvector token_levels(num_nested, stream); + rmm::device_uvector token_id(num_nested, stream); // 4B*2=8B, or 2B+ + rmm::device_uvector parent_node_ids(num_nested, stream); // 4B*2=8B, or 2B+ + auto const push_pop_it = thrust::make_transform_iterator( + tokens.begin(), [] __device__(PdaTokenT const token) -> size_type { + int const is_begin = token == token_t::StructBegin or token == token_t::ListBegin; + int const is_end = token == token_t::StructEnd or token == token_t::ListEnd; + return is_begin - is_end; + }); + // copy_if only struct/list, stable sort by level, + // corresponding node indices?, + // then scatter to node_range_end for struct/list end. + cudf::detail::copy_if_safe(push_pop_it, + push_pop_it + num_tokens, + tokens.begin(), + token_levels.begin(), + is_nested, + stream); + cudf::detail::copy_if_safe(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_tokens, + tokens.begin(), + token_id.begin(), + is_nested, + stream); + + print_vec(cudf::detail::make_std_vector_async(token_levels, stream), "ntoken_levels", to_int); + + thrust::exclusive_scan( + rmm::exec_policy(stream), token_levels.begin(), token_levels.end(), token_levels.begin()); + + print_vec(cudf::detail::make_std_vector_async(token_levels, stream), "ntoken_levels", to_int); + + rmm::device_uvector ntokens(num_nested, stream); + cudf::detail::copy_if_safe( + tokens.begin(), tokens.end(), tokens.begin(), ntokens.begin(), is_nested, stream); + print_vec(cudf::detail::make_std_vector_async(ntokens, stream), "ntokens", to_token_str); + print_vec(cudf::detail::make_std_vector_async(token_id, stream), "ntoken_id", to_int); + // + auto const first_childs_parent_token_id2 = + [tokens_gpu = tokens.begin(), token_id = token_id.begin()] __device__(auto i) -> NodeIndexT { + if (i <= 0) { return -1; } + auto id = token_id[i - 1]; // token indices. + if (tokens_gpu[id] == token_t::StructBegin or tokens_gpu[id] == token_t::ListBegin) { + return token_id[i - 1]; + } else { + return -1; + } + }; + + // copied L+S tokens, and their token ids, their token levels. + // first child parent token ids + // propagate to siblings + // parent token id for all ends -> similar binary search here to find its node id. + // scatter to that location. + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_nested, + parent_node_ids.begin(), + [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id2] __device__( + NodeIndexT const tid) -> NodeIndexT { + auto const pid = first_childs_parent_token_id2(tid); + // return pid; + return pid < 0 + ? parent_node_sentinel + : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + // parent_node_sentinel is -1, useful for segmented max operation below + }); + + print_vec( + cudf::detail::make_std_vector_async(parent_node_ids, stream), "nparent_node_ids", to_int); + // propagate to siblings. + propagate_parent_to_siblings( + cudf::device_span{token_levels.data(), token_levels.size()}, + parent_node_ids, + stream); + print_vec( + cudf::detail::make_std_vector_async(parent_node_ids, stream), "nparent_node_ids", to_int); + + // scatter to node_range_end for all nested end tokens. (if it's end) + auto token_indices_it = + thrust::make_permutation_iterator(token_indices.begin(), token_id.begin()); + // add +1 to include end symbol. + auto nested_node_range_end_it = thrust::make_transform_output_iterator( + node_range_end.begin(), [] __device__(auto i) { return i + 1; }); + auto stencil = thrust::make_transform_iterator(token_id.begin(), is_nested_end{tokens.begin()}); + thrust::scatter_if(rmm::exec_policy(stream), + token_indices_it, + token_indices_it + num_nested, + parent_node_ids.begin(), + stencil, + nested_node_range_end_it); + } + return {std::move(node_categories), std::move(parent_node_ids), std::move(node_levels), From aa03a95307f934009760d7a5528d7154f49bbef3 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 6 Dec 2023 00:14:39 +0530 Subject: [PATCH 03/21] Force mixed types as string --- cpp/src/io/json/json_column.cu | 108 +++++++++++++++++++++++++++++---- 1 file changed, 96 insertions(+), 12 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index fd97b325ebb..06af050cc17 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -427,11 +427,14 @@ void make_device_json_column(device_span input, device_json_column& root, bool is_array_of_arrays, bool is_enabled_lines, + bool is_mixed_type_as_string_enabled, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); auto num_nodes = col_ids.size(); + // TODO think about replacing all col_ids which are children of string column to ignore? (useful + // to reduce unique_col_id count for map types). rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); @@ -475,6 +478,8 @@ void make_device_json_column(device_span input, auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream); std::vector column_names = copy_strings_to_host( input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); + // for(auto str: column_names) std::cout< input, std::map, NodeIndexT> mapped_columns; // find column_ids which are values, but should be ignored in validity std::vector ignore_vals(num_columns, 0); + std::vector is_mixed_string_column(num_columns, 0); + std::vector remapped_col_id(num_columns, -1); columns.try_emplace(parent_node_sentinel, std::ref(root)); + // TODO for map types support + // TODO go through input schema, and force string columns to be string. + // ignore their children too during below processing. + for (auto const this_col_id : unique_col_ids) { if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { continue; @@ -558,6 +569,11 @@ void make_device_json_column(device_span input, auto field_name_col_id = parent_col_id; parent_col_id = column_parent_ids[parent_col_id]; name = column_names[field_name_col_id]; + } else if (is_mixed_string_column[parent_col_id] == 1) { + // if parent is mixed string column, ignore this column. + is_mixed_string_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; + continue; } else { CUDF_FAIL("Unexpected parent column category"); } @@ -569,6 +585,38 @@ void make_device_json_column(device_span input, auto& parent_col = it->second.get(); bool replaced = false; if (mapped_columns.count({parent_col_id, name}) > 0) { + /**/ + // TODO if mixed type is enabled. + // make both of them as str, merge them how? + // all its child columns should be ignored from parsing. (is adding to ignore_vals enough?) + // is key_value column going to slow anyway? because of host copy? + if (is_mixed_type_as_string_enabled) { + // VAL/STR or STRUCT or LIST + is_mixed_string_column[this_col_id] = 1; + auto old_col_id = mapped_columns[{parent_col_id, name}]; + remapped_col_id[this_col_id] = old_col_id; + // if old col type (not cat) is string/val, keep it. + // else replace with string. + column_categories[this_col_id] = NC_STR; + auto& col = columns.at(old_col_id).get(); + if (col.type != json_col_t::StringColumn) { + column_categories[old_col_id] = NC_STR; + // TODO: old_col_id or this_col_id ? affects max_rowoffsets, need more tests. + initialize_json_columns(old_col_id, col); + // TODO all its children (which are already inserted) should be ignored. + } + columns.try_emplace(this_col_id, columns.at(old_col_id)); + continue; + } + // old new new + // VAL SCT LST + // VAL LST SCT + // SCT LST VAL + // SCT VAL LST + // LST VAL SCT + // LST SCT VAL + /**/ + if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { ignore_vals[this_col_id] = 1; continue; @@ -601,6 +649,20 @@ void make_device_json_column(device_span input, columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); } + // debug prints + for (auto i = 0ul; i < num_columns; i++) + printf("%3lu ", i); + printf(" col_id\n"); + print_vec(column_categories, "column_categories", to_int); + print_vec(ignore_vals, "ignore_vals", to_int); + print_vec(is_mixed_string_column, "is_mixed_string_column", to_int); + for (auto i = 0ul; i < num_columns; i++) + printf("%3lu ", columns.count(i)); + printf(" columns\n"); + for (auto const& [key, value] : mapped_columns) { + std::cout << key.first << "+" << key.second << ":" << value << "\n"; + } + // restore unique_col_ids order std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { return thrust::get<1>(a) < thrust::get<1>(b); @@ -609,7 +671,8 @@ void make_device_json_column(device_span input, std::vector columns_data(num_columns); for (auto& [col_id, col_ref] : columns) { if (col_id == parent_node_sentinel) continue; - auto& col = col_ref.get(); + auto& col = col_ref.get(); + // if(ignore_vals[col_id]) continue; columns_data[col_id] = json_column_data{col.string_offsets.data(), col.string_lengths.data(), col.child_offsets.data(), @@ -620,20 +683,30 @@ void make_device_json_column(device_span input, ignore_vals, stream, rmm::mr::get_current_device_resource()); auto d_columns_data = cudf::detail::make_device_uvector_async( columns_data, stream, rmm::mr::get_current_device_resource()); + if (is_mixed_type_as_string_enabled) + cudaMemcpyAsync(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudaMemcpyDefault, + stream.value()); // 3. scatter string offsets to respective columns, set validity bits thrust::for_each_n( rmm::exec_policy(stream), thrust::counting_iterator(0), num_nodes, - [node_categories = tree.node_categories.begin(), - col_ids = col_ids.begin(), - row_offsets = row_offsets.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - d_ignore_vals = d_ignore_vals.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - switch (node_categories[i]) { + [node_categories = tree.node_categories.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + if (d_ignore_vals[col_ids[i]]) return; + auto const node_category = column_categories[col_ids[i]]; + // switch (node_categories[i]) { + switch (node_category) { case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; case NC_STR: [[fallthrough]]; @@ -646,6 +719,7 @@ void make_device_json_column(device_span input, default: break; } }); + std::cout << "after for_each_n\n"; // 4. scatter List offset // copy_if only node's whose parent is list, (node_id, parent_col_id) @@ -670,10 +744,14 @@ void make_device_json_column(device_span input, num_nodes, thrust::make_counting_iterator(0), thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [node_categories = tree.node_categories.begin(), - parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + [ // node_categories = tree.node_categories.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { auto parent_node_id = parent_node_ids[node_id]; - return parent_node_id != parent_node_sentinel and node_categories[parent_node_id] == NC_LIST; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST; + // node_categories[parent_node_id] == NC_LIST; }); auto const num_list_children = @@ -705,6 +783,7 @@ void make_device_json_column(device_span input, row_offsets[node_id] + 1; } }); + std::cout << "after list for_each_n\n"; // 5. scan on offsets. for (auto& [id, col_ref] : columns) { @@ -909,6 +988,7 @@ table_with_metadata device_parse_nested_json(device_span d_input, return get_tree_representation( tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); }(); // IILE used to free memory of token data. +#define NJP_DEBUG_PRINT #ifdef NJP_DEBUG_PRINT auto h_input = cudf::detail::make_host_vector_async(d_input, stream); print_tree(h_input, gpu_tree, stream); @@ -935,6 +1015,9 @@ table_with_metadata device_parse_nested_json(device_span d_input, stream, rmm::mr::get_current_device_resource()); + print_vec(cudf::detail::make_std_vector_async(gpu_col_id, stream), "gpu_col_id", to_int); + print_vec( + cudf::detail::make_std_vector_async(gpu_row_offsets, stream), "gpu_row_offsets", to_int); device_json_column root_column(stream, mr); root_column.type = json_col_t::ListColumn; root_column.child_offsets.resize(2, stream); @@ -951,6 +1034,7 @@ table_with_metadata device_parse_nested_json(device_span d_input, root_column, is_array_of_arrays, options.is_enabled_lines(), + options.is_enabled_mixed_types_as_string(), stream, mr); From 3341109875c08d438ec3c25462bf3483e665e834 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 6 Dec 2023 00:16:06 +0530 Subject: [PATCH 04/21] Add simple mixed type testcase --- cpp/tests/io/json_test.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index a2db2d69984..fa4c0cb9f0b 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -2090,4 +2091,30 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars) float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}, c_validity.cbegin()}); } +TEST_F(JsonReaderTest, Mixed) +{ + // std::string json_string = R"( [{"a":[123], "b":1.0}, {"b":1.1, "c": {"0": 123}}, {"b":2.1}])"; + std::string json_string = R"( [{"a":[123], "b":1.0}, {"a":1.1}, {"b":2.1, "a": {"0": 123}}])"; + + // TODO Force to string via schema + // std::map dtype_schema{ + // {"a", + // { + // data_type{cudf::type_id::LIST}, + // {{"element", {data_type{cudf::type_id::STRUCT}, {{"0", {dtype()}}}}}}, + // }}, + // {"b", {dtype()}}, + // }; + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + // .dtypes(dtype_schema) + .mixed_types_as_string(true) + .lines(false); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + cudf::test::print(result.tbl->view().column(0)); +} + CUDF_TEST_PROGRAM_MAIN() From 12040a5de3300c38b709f122b7842e8f1f159a68 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 7 Dec 2023 23:38:37 +0530 Subject: [PATCH 05/21] add is_strict_nested_boundaries --- cpp/src/io/json/json_column.cu | 7 +++++-- cpp/src/io/json/json_tree.cu | 13 ++++++++----- cpp/src/io/json/nested_json.hpp | 2 ++ cpp/tests/io/json_tree.cpp | 15 ++++++++------- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 25962f0a0be..4524f311245 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -983,8 +983,11 @@ table_with_metadata device_parse_nested_json(device_span d_input, const auto [tokens_gpu, token_indices_gpu] = get_token_stream(d_input, options, stream, rmm::mr::get_current_device_resource()); // gpu tree generation - return get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + return get_tree_representation(tokens_gpu, + token_indices_gpu, + options.is_enabled_mixed_types_as_string(), + stream, + rmm::mr::get_current_device_resource()); }(); // IILE used to free memory of token data. #define NJP_DEBUG_PRINT #ifdef NJP_DEBUG_PRINT diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 1b0729b3052..0a4acc2c251 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -248,6 +248,7 @@ void propagate_parent_to_siblings(cudf::device_span node_level // Generates a tree representation of the given tokens, token_indices. tree_meta_t get_tree_representation(device_span tokens, device_span token_indices, + bool is_strict_nested_boundaries, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -414,11 +415,13 @@ tree_meta_t get_tree_representation(device_span tokens, stream); CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); - // TODO do this only if mixed type as string flag is enabled. - // Fixes here for struct, list nodes with correct range_end. How? - // Extract, struct, list - begin & end separately, then push, pop, levels, scan, similar propagate - // (segmented scan), then scatter. - { + // Extract Struct, List range_end + // 1. Extract Struct, List - begin & end separately, their token ids + // 2. push, pop to get levels + // 3. copy first child's parent token_id, also translate to node_id + // 4. propagate to siblings using levels, parent token id. (segmented scan) + // 5. scatter to node_range_end for all nested end tokens. (if it's end) + if (is_strict_nested_boundaries) { // Whether the token pushes onto the parent node stack auto const is_nested = [] __device__(PdaTokenT const token) -> bool { switch (token) { diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 8d89f4ff927..5323b818f61 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -216,6 +216,7 @@ std::pair, rmm::device_uvector> pr * * @param tokens Vector of token types in the json string * @param token_indices The indices within the input string corresponding to each token + * @param is_strict_nested_boundaries Whether to extract node end of nested types strictly * @param stream The CUDA stream to which kernels are dispatched * @param mr Optional, resource with which to allocate * @return A tree representation of the input JSON string as vectors of node type, parent index, @@ -223,6 +224,7 @@ std::pair, rmm::device_uvector> pr */ tree_meta_t get_tree_representation(device_span tokens, device_span token_indices, + bool is_strict_nested_boundaries, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index 56e2404b683..ece7d5242b0 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -594,7 +594,7 @@ TEST_F(JsonTest, TreeRepresentation) // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()); // host tree generation auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream); compare_trees(cpu_tree, gpu_tree); @@ -682,7 +682,7 @@ TEST_F(JsonTest, TreeRepresentation2) // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()); // host tree generation auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream); compare_trees(cpu_tree, gpu_tree); @@ -757,7 +757,7 @@ TEST_F(JsonTest, TreeRepresentation3) // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()); // host tree generation auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream); compare_trees(cpu_tree, gpu_tree); @@ -783,9 +783,10 @@ TEST_F(JsonTest, TreeRepresentationError) // Get the JSON's tree representation // This JSON is invalid and will raise an exception. - EXPECT_THROW(cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW( + cuio_json::detail::get_tree_representation( + tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()), + cudf::logic_error); } /** @@ -874,7 +875,7 @@ TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal) records_orient_tree_traversal_cpu(input, cpu_tree, is_array_of_arrays, json_lines, stream); // gpu tree generation auto gpu_tree = cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()); #if LIBCUDF_JSON_DEBUG_DUMP printf("BEFORE traversal (gpu_tree):\n"); From f8521f603404284e8ee57a0e636bbc345c6f4f38 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 7 Dec 2023 23:40:11 +0530 Subject: [PATCH 06/21] add more test cases for MixedTypes --- cpp/tests/io/json_test.cpp | 85 ++++++++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 21 deletions(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 0aa2b0c212b..dfa2ebb6223 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2049,30 +2049,73 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars) float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}, c_validity.cbegin()}); } -TEST_F(JsonReaderTest, Mixed) +TEST_F(JsonReaderTest, MixedTypes) { - // std::string json_string = R"( [{"a":[123], "b":1.0}, {"b":1.1, "c": {"0": 123}}, {"b":2.1}])"; - std::string json_string = R"( [{"a":[123], "b":1.0}, {"a":1.1}, {"b":2.1, "a": {"0": 123}}])"; - - // TODO Force to string via schema - // std::map dtype_schema{ - // {"a", - // { - // data_type{cudf::type_id::LIST}, - // {{"element", {data_type{cudf::type_id::STRUCT}, {{"0", {dtype()}}}}}}, - // }}, - // {"b", {dtype()}}, - // }; + { + std::string json_string = R"({ "foo": [1,2,3], "bar": 123 } + { "foo": { "a": 1 }, "bar": 456 })"; - cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder( - cudf::io::source_info{json_string.data(), json_string.size()}) - // .dtypes(dtype_schema) - .mixed_types_as_string(true) - .lines(false); + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .mixed_types_as_string(true) + .lines(true); - cudf::io::table_with_metadata result = cudf::io::read_json(in_options); - cudf::test::print(result.tbl->view().column(0)); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 2); + EXPECT_EQ(result.tbl->num_rows(), 2); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::INT64); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + cudf::test::strings_column_wrapper({"[1,2,3]", "{ \"a\": 1 }"})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), + cudf::test::fixed_width_column_wrapper({123, 456})); + } + + auto test_fn = [](std::string_view json_string, cudf::column_view expected) { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .mixed_types_as_string(true) + .lines(true); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected); + }; + + // test cases. + test_fn(R"( +{ "a": "123" } +{ "a": 123 } +)", + cudf::test::strings_column_wrapper({"123", "123"})); + + test_fn(R"( +{ "a": [1,2,3] } +{ "a": { "b": 1 } } +)", + cudf::test::strings_column_wrapper({"[1,2,3]", "{ \"b\": 1 }"})); + + test_fn(R"( +{ "a": "fox" } +{ "a": { "b": 1 } } +)", + cudf::test::strings_column_wrapper({"fox", "{ \"b\": 1 }"})); + + test_fn(R"( +{ "a": [1,2,3] } +{ "a": "fox" } +)", + cudf::test::strings_column_wrapper({"[1,2,3]", "fox"})); + + test_fn(R"( +{ "a": [1,2,3] } +{ "a": [true,false,true] } +{ "a": ["a", "b", "c"] } +)", + cudf::test::lists_column_wrapper{ + {"1", "2", "3"}, {"true", "false", "true"}, {"a", "b", "c"}}); } CUDF_TEST_PROGRAM_MAIN() From 377ac3d580b318143c920d62a592f0f7fb76df7d Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 9 Dec 2023 01:34:20 +0530 Subject: [PATCH 07/21] bug fix for categeroy update of old col_id --- cpp/src/io/json/json_column.cu | 75 ++++++++++++++++++++++++++-------- cpp/tests/io/json_test.cpp | 15 +++++++ 2 files changed, 73 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 4524f311245..dbfc2e453d7 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -530,6 +530,19 @@ void make_device_json_column(device_span input, col.type = to_json_col_type(column_categories[i]); }; + auto reinitialize_as_string = [&](auto i, auto& col) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = json_col_t::StringColumn; + col.child_columns.clear(); // their references should be deleted too. + col.column_order.clear(); + }; + // 2. generate nested columns tree and its device_memory // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. auto h_range_col_id_it = @@ -569,18 +582,24 @@ void make_device_json_column(device_span input, auto field_name_col_id = parent_col_id; parent_col_id = column_parent_ids[parent_col_id]; name = column_names[field_name_col_id]; - } else if (is_mixed_string_column[parent_col_id] == 1) { + } else { + std::cout << "col_id:" << this_col_id << ", pcid:" << parent_col_id << "\n\n\n"; + CUDF_FAIL("Unexpected parent column category"); + } + + print_vec(is_mixed_string_column, "is_mixed_string_column", to_int); + if (is_mixed_string_column[parent_col_id] == 1) { // if parent is mixed string column, ignore this column. is_mixed_string_column[this_col_id] = 1; ignore_vals[this_col_id] = 1; continue; - } else { - CUDF_FAIL("Unexpected parent column category"); } // If the child is already found, // replace if this column is a nested column and the existing was a value column // ignore this column if this column is a value column and the existing was a nested column auto it = columns.find(parent_col_id); + // if(it == columns.end()) + // std::cout<<"col_id:"<second.get(); bool replaced = false; @@ -597,14 +616,13 @@ void make_device_json_column(device_span input, remapped_col_id[this_col_id] = old_col_id; // if old col type (not cat) is string/val, keep it. // else replace with string. - column_categories[this_col_id] = NC_STR; - auto& col = columns.at(old_col_id).get(); + auto& col = columns.at(old_col_id).get(); if (col.type != json_col_t::StringColumn) { - column_categories[old_col_id] = NC_STR; // TODO: old_col_id or this_col_id ? affects max_rowoffsets, need more tests. - initialize_json_columns(old_col_id, col); - // TODO all its children (which are already inserted) should be ignored. + reinitialize_as_string(old_col_id, col); + // all its children (which are already inserted) are ignored below. } + is_mixed_string_column[old_col_id] = 1; columns.try_emplace(this_col_id, columns.at(old_col_id)); continue; } @@ -653,7 +671,7 @@ void make_device_json_column(device_span input, for (auto i = 0ul; i < num_columns; i++) printf("%3lu ", i); printf(" col_id\n"); - print_vec(column_categories, "column_categories", to_int); + print_vec(column_categories, "column_categories", to_cat); print_vec(ignore_vals, "ignore_vals", to_int); print_vec(is_mixed_string_column, "is_mixed_string_column", to_int); for (auto i = 0ul; i < num_columns; i++) @@ -663,6 +681,27 @@ void make_device_json_column(device_span input, std::cout << key.first << "+" << key.second << ":" << value << "\n"; } + if (is_mixed_type_as_string_enabled) { + // ignore all children of mixed type columns + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and is_mixed_string_column[parent_col_id] == 1) { + is_mixed_string_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; + columns.erase(this_col_id); + } + // Convert only mixed type columns as string (so to copy) + if (parent_col_id != parent_node_sentinel and is_mixed_string_column[parent_col_id] == 0 and + is_mixed_string_column[this_col_id] == 1) + column_categories[this_col_id] = NC_STR; + } + cudaMemcpyAsync(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudaMemcpyDefault, + stream.value()); + } + // restore unique_col_ids order std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { return thrust::get<1>(a) < thrust::get<1>(b); @@ -679,16 +718,13 @@ void make_device_json_column(device_span input, static_cast(col.validity.data())}; } + // print_vec(column_categories, "column_categories", to_cat); + // print_vec(is_mixed_string_column, "is_mixed_string_column", to_int); + // print_vec(ignore_vals, "ignore_vals", to_int); auto d_ignore_vals = cudf::detail::make_device_uvector_async( ignore_vals, stream, rmm::mr::get_current_device_resource()); auto d_columns_data = cudf::detail::make_device_uvector_async( columns_data, stream, rmm::mr::get_current_device_resource()); - if (is_mixed_type_as_string_enabled) - cudaMemcpyAsync(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudaMemcpyDefault, - stream.value()); // 3. scatter string offsets to respective columns, set validity bits thrust::for_each_n( @@ -719,7 +755,7 @@ void make_device_json_column(device_span input, default: break; } }); - std::cout << "after for_each_n\n"; + std::cout << "after str for_each_n\n"; // 4. scatter List offset // copy_if only node's whose parent is list, (node_id, parent_col_id) @@ -745,14 +781,17 @@ void make_device_json_column(device_span input, thrust::make_counting_iterator(0), thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), [ // node_categories = tree.node_categories.begin(), + d_ignore_vals = d_ignore_vals.begin(), parent_node_ids = tree.parent_node_ids.begin(), column_categories = d_column_tree.node_categories.begin(), col_ids = col_ids.begin()] __device__(size_type node_id) { auto parent_node_id = parent_node_ids[node_id]; return parent_node_id != parent_node_sentinel and - column_categories[col_ids[parent_node_id]] == NC_LIST; + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); // node_categories[parent_node_id] == NC_LIST; }); + std::cout << "after copy_if\n"; auto const num_list_children = list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); @@ -760,6 +799,8 @@ void make_device_json_column(device_span input, parent_col_ids.begin(), parent_col_ids.begin() + num_list_children, node_ids.begin()); + std::cout << "after stable_sort_by_key\n"; + std::cout << num_list_children << "\n"; thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index dfa2ebb6223..0863f561735 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2116,6 +2116,21 @@ TEST_F(JsonReaderTest, MixedTypes) )", cudf::test::lists_column_wrapper{ {"1", "2", "3"}, {"true", "false", "true"}, {"a", "b", "c"}}); + { + std::string json_string = R"( +{ "var1": true } +{ "var1": [{ "var0": true, "var1": "hello", "var2": null }, null, [true, null, null]] } + )"; + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .mixed_types_as_string(true) + .lines(true); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + cudf::test::print(result.tbl->get_column(0)); + } } CUDF_TEST_PROGRAM_MAIN() From d779c073fa7b6fe45e82c1e1d6217e2ebdca1d71 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 12 Dec 2023 18:39:38 -0700 Subject: [PATCH 08/21] Java bindings for mixed types as strings (@andygrove) * java bindings * tests * change default for mixedTypesAsStrings to false for backwards compatibility --- .../main/java/ai/rapids/cudf/JSONOptions.java | 19 ++++++++++ java/src/main/java/ai/rapids/cudf/Table.java | 16 +++++---- java/src/main/native/src/TableJni.cpp | 13 ++++--- .../test/java/ai/rapids/cudf/TableTest.java | 36 +++++++++++++++++++ java/src/test/resources/mixed_types_1.json | 2 ++ java/src/test/resources/mixed_types_2.json | 2 ++ 6 files changed, 77 insertions(+), 11 deletions(-) create mode 100644 java/src/test/resources/mixed_types_1.json create mode 100644 java/src/test/resources/mixed_types_2.json diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index f98687df5fa..2ee3be03379 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -30,12 +30,14 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean dayFirst; private final boolean lines; private final boolean recoverWithNull; + private final boolean mixedTypesAsStrings; private JSONOptions(Builder builder) { super(builder); dayFirst = builder.dayFirst; lines = builder.lines; recoverWithNull = builder.recoverWithNull; + mixedTypesAsStrings = builder.mixedTypesAsStrings; } public boolean isDayFirst() { @@ -51,6 +53,10 @@ public boolean isRecoverWithNull() { return recoverWithNull; } + public boolean isMixedTypesAsStrings() { + return mixedTypesAsStrings; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -66,6 +72,8 @@ public static final class Builder extends ColumnFilterOptions.Builder= 0 && offset < buffer.length; return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len, - opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull())); + opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(), + opts.isMixedTypesAsStrings())); } /** @@ -1170,7 +1174,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), null, buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(), - opts.isRecoverWithNull()))) { + opts.isRecoverWithNull(), opts.isMixedTypesAsStrings()))) { return gatherJSONColumns(schema, twm); } } @@ -1186,7 +1190,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), opts.isDayFirst(), opts.isLines(), - opts.isRecoverWithNull(), dsHandle))) { + opts.isRecoverWithNull(), opts.isMixedTypesAsStrings(), dsHandle))) { return gatherJSONColumns(schema, twm); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index fad19bdf895..16a46122a46 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1393,7 +1393,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null) { + jboolean recover_with_null, jboolean mixed_types_as_string) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1412,7 +1412,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines)) - .recovery_mode(recovery_mode); + .recovery_mode(recovery_mode) + .mixed_types_as_string(mixed_types_as_string); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1470,7 +1471,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, - jboolean day_first, jboolean lines, jboolean recover_with_null, jlong ds_handle) { + jboolean day_first, jboolean lines, jboolean recover_with_null, jboolean mixed_types_as_string, + jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1537,7 +1539,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null) { + jboolean recover_with_null, jboolean mixed_types_as_string) { bool read_buffer = true; if (buffer == 0) { @@ -1587,7 +1589,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines)) - .recovery_mode(recovery_mode); + .recovery_mode(recovery_mode) + .mixed_types_as_string(mixed_types_as_string); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index b0dd4122b0e..23a4568e216 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -87,6 +87,8 @@ public class TableTest extends CudfTestBase { private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv"); private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json"); private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json"); + private static final File TEST_MIXED_TYPE_1_JSON = TestUtils.getResourceAsFile("mixed_types_1.json"); + private static final File TEST_MIXED_TYPE_2_JSON = TestUtils.getResourceAsFile("mixed_types_2.json"); private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder() .column(DType.INT32, "A") @@ -327,6 +329,40 @@ void testReadJSONFile() { } } + @Test + void testReadMixedType1JSONFile() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("123", "123" ) + .build(); + Table table = Table.readJSON(schema, opts, TEST_MIXED_TYPE_1_JSON)) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testReadMixedType2JSONFile() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("[1,2,3]", "{ \"b\": 1 }" ) + .build(); + Table table = Table.readJSON(schema, opts, TEST_MIXED_TYPE_2_JSON)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadJSONFromDataSource() throws IOException { Schema schema = Schema.builder() diff --git a/java/src/test/resources/mixed_types_1.json b/java/src/test/resources/mixed_types_1.json new file mode 100644 index 00000000000..288b06957e3 --- /dev/null +++ b/java/src/test/resources/mixed_types_1.json @@ -0,0 +1,2 @@ +{ "a": "123" } +{ "a": 123 } \ No newline at end of file diff --git a/java/src/test/resources/mixed_types_2.json b/java/src/test/resources/mixed_types_2.json new file mode 100644 index 00000000000..4f1a9d0e3d9 --- /dev/null +++ b/java/src/test/resources/mixed_types_2.json @@ -0,0 +1,2 @@ +{ "a": [1,2,3] } +{ "a": { "b": 1 } } \ No newline at end of file From ed64288c5a26a5b8f4f6f3edefd0d34843bc7374 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 14 Dec 2023 18:11:02 +0530 Subject: [PATCH 09/21] newline at eof style fix. --- java/src/test/resources/mixed_types_1.json | 2 +- java/src/test/resources/mixed_types_2.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/test/resources/mixed_types_1.json b/java/src/test/resources/mixed_types_1.json index 288b06957e3..21d625bbf2a 100644 --- a/java/src/test/resources/mixed_types_1.json +++ b/java/src/test/resources/mixed_types_1.json @@ -1,2 +1,2 @@ { "a": "123" } -{ "a": 123 } \ No newline at end of file +{ "a": 123 } diff --git a/java/src/test/resources/mixed_types_2.json b/java/src/test/resources/mixed_types_2.json index 4f1a9d0e3d9..becad2d0db7 100644 --- a/java/src/test/resources/mixed_types_2.json +++ b/java/src/test/resources/mixed_types_2.json @@ -1,2 +1,2 @@ { "a": [1,2,3] } -{ "a": { "b": 1 } } \ No newline at end of file +{ "a": { "b": 1 } } From 7b537e4977c18bf839978ce785a43045349c45d3 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 4 Jan 2024 15:44:43 +0530 Subject: [PATCH 10/21] copyright year --- cpp/include/cudf/io/json.hpp | 2 +- cpp/src/io/json/json_column.cu | 49 ++++++++++++++++++- cpp/src/io/json/json_tree.cu | 2 +- cpp/src/io/json/nested_json.hpp | 2 +- cpp/tests/io/json_test.cpp | 4 +- cpp/tests/io/json_tree.cpp | 2 +- .../main/java/ai/rapids/cudf/JSONOptions.java | 2 +- java/src/main/java/ai/rapids/cudf/Table.java | 2 +- java/src/main/native/src/TableJni.cpp | 2 +- .../test/java/ai/rapids/cudf/TableTest.java | 2 +- python/cudf/cudf/_lib/cpp/io/json.pxd | 2 +- python/cudf/cudf/_lib/json.pyx | 2 +- python/cudf/cudf/io/json.py | 2 +- 13 files changed, 61 insertions(+), 14 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 7738b15243d..67acee363da 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 33f05a077e8..3bd946e0ffc 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,6 +51,7 @@ #include #include +#include namespace cudf::io::json::detail { @@ -393,6 +394,44 @@ std::vector copy_strings_to_host(device_span input, return to_host(d_column_names->view()); } +using variant_dtype = std::variant, + std::map, + std::map>; + +// pass base column name +// children level. get thier col ids, then extract their names. +// [1, 2]\n[1, 2] 1 +// [[1, 2], [1, 2]] 2 +// {a: 1, b: 2}\n {a: 1, b: 2} 1 +// [{a: 1, b: 2}, {a: 1, b: 2}] 2 +void map_types_to_strings(variant_dtype const& var_dtype, std::vector const& base_column_indices, std::vector const& column_names) { + // Get level 0 or 1 names. + // create a map of schema always with name? + std::optional child_schema_element = std::visit( + cudf::detail::visitor_overload{ + // TODO processing required here only. (extract base col names and construct schema) + [column_index](std::vector const& user_dtypes) -> std::optional { + return (static_cast(column_index) < user_dtypes.size()) + ? std::optional{{user_dtypes[column_index]}} + : std::optional{}; + }, + // TODO just transform and return it. + [col_name]( + std::map const& user_dtypes) -> std::optional { + return (user_dtypes.find(col_name) != std::end(user_dtypes)) + ? std::optional{{user_dtypes.find(col_name)->second}} + : std::optional{}; + }, + // TODO just return it. + [col_name](std::map const& user_dtypes) + -> std::optional { + return (user_dtypes.find(col_name) != std::end(user_dtypes)) + ? user_dtypes.find(col_name)->second + : std::optional{}; + }}, + options.get_dtypes()); +} + /** * @brief Holds member data pointers of `d_json_column` * @@ -429,6 +468,7 @@ void make_device_json_column(device_span input, bool is_array_of_arrays, bool is_enabled_lines, bool is_mixed_type_as_string_enabled, + variant_dtype const& var_dtype, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -566,6 +606,10 @@ void make_device_json_column(device_span input, // TODO go through input schema, and force string columns to be string. // ignore their children too during below processing. + // get schema, find max depth, reserve that depth in the vector path; + // construct path for each + // Find all struct nodes, and build its path, and check if it is present in schema as string. + for (auto const this_col_id : unique_col_ids) { if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { continue; @@ -589,7 +633,7 @@ void make_device_json_column(device_span input, } print_vec(is_mixed_string_column, "is_mixed_string_column", to_int); - if (is_mixed_string_column[parent_col_id] == 1) { + if (parent_col_id != parent_node_sentinel && is_mixed_string_column[parent_col_id] == 1) { // if parent is mixed string column, ignore this column. is_mixed_string_column[this_col_id] = 1; ignore_vals[this_col_id] = 1; @@ -1079,6 +1123,7 @@ table_with_metadata device_parse_nested_json(device_span d_input, is_array_of_arrays, options.is_enabled_lines(), options.is_enabled_mixed_types_as_string(), + options.get_dtypes(), stream, mr); diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index f3c1da96dda..d68eede536a 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 5323b818f61..c13daf9b9f5 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 0863f561735..aa388a773f5 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -2052,6 +2052,7 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars) TEST_F(JsonReaderTest, MixedTypes) { { + // Simple test for mixed types std::string json_string = R"({ "foo": [1,2,3], "bar": 123 } { "foo": { "a": 1 }, "bar": 456 })"; @@ -2073,6 +2074,7 @@ TEST_F(JsonReaderTest, MixedTypes) cudf::test::fixed_width_column_wrapper({123, 456})); } + // Testing function for mixed types in JSON (for spark json reader) auto test_fn = [](std::string_view json_string, cudf::column_view expected) { cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder( diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index ece7d5242b0..4147d85b3fc 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 2ee3be03379..ae964f897f9 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 8c4693e2607..300c540b8c0 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 37bc93e52ce..c43a99a58d5 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 6c0a6947581..99fb5532332 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index 965a0b5bc23..b916c2b7ad9 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport uint8_t from libcpp cimport bool diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index b6124fcbced..e1f13df9d26 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # cython: boundscheck = False diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 0088c59f8c3..35d91f9c062 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import warnings from collections import abc From 0dbc9f56400d33a7e1d90bd04923090b5f38e6ba Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 10 Jan 2024 04:38:27 +0530 Subject: [PATCH 11/21] undo mixed type code --- cpp/src/io/json/json_column.cu | 45 ---------------------------------- cpp/src/io/json/json_tree.cu | 5 ++-- 2 files changed, 3 insertions(+), 47 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 3bd946e0ffc..9d51dee1d90 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -51,7 +51,6 @@ #include #include -#include namespace cudf::io::json::detail { @@ -394,44 +393,6 @@ std::vector copy_strings_to_host(device_span input, return to_host(d_column_names->view()); } -using variant_dtype = std::variant, - std::map, - std::map>; - -// pass base column name -// children level. get thier col ids, then extract their names. -// [1, 2]\n[1, 2] 1 -// [[1, 2], [1, 2]] 2 -// {a: 1, b: 2}\n {a: 1, b: 2} 1 -// [{a: 1, b: 2}, {a: 1, b: 2}] 2 -void map_types_to_strings(variant_dtype const& var_dtype, std::vector const& base_column_indices, std::vector const& column_names) { - // Get level 0 or 1 names. - // create a map of schema always with name? - std::optional child_schema_element = std::visit( - cudf::detail::visitor_overload{ - // TODO processing required here only. (extract base col names and construct schema) - [column_index](std::vector const& user_dtypes) -> std::optional { - return (static_cast(column_index) < user_dtypes.size()) - ? std::optional{{user_dtypes[column_index]}} - : std::optional{}; - }, - // TODO just transform and return it. - [col_name]( - std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? std::optional{{user_dtypes.find(col_name)->second}} - : std::optional{}; - }, - // TODO just return it. - [col_name](std::map const& user_dtypes) - -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? user_dtypes.find(col_name)->second - : std::optional{}; - }}, - options.get_dtypes()); -} - /** * @brief Holds member data pointers of `d_json_column` * @@ -468,7 +429,6 @@ void make_device_json_column(device_span input, bool is_array_of_arrays, bool is_enabled_lines, bool is_mixed_type_as_string_enabled, - variant_dtype const& var_dtype, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -606,10 +566,6 @@ void make_device_json_column(device_span input, // TODO go through input schema, and force string columns to be string. // ignore their children too during below processing. - // get schema, find max depth, reserve that depth in the vector path; - // construct path for each - // Find all struct nodes, and build its path, and check if it is present in schema as string. - for (auto const this_col_id : unique_col_ids) { if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { continue; @@ -1123,7 +1079,6 @@ table_with_metadata device_parse_nested_json(device_span d_input, is_array_of_arrays, options.is_enabled_lines(), options.is_enabled_mixed_types_as_string(), - options.get_dtypes(), stream, mr); diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index d68eede536a..1c6b60bfbf1 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -442,11 +442,12 @@ tree_meta_t get_tree_representation(device_span tokens, rmm::device_uvector token_id(num_nested, stream); // 4B*2=8B, or 2B+ rmm::device_uvector parent_node_ids(num_nested, stream); // 4B*2=8B, or 2B+ auto const push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [] __device__(PdaTokenT const token) -> size_type { + tokens.begin(), + cuda::proclaim_return_type([] __device__(PdaTokenT const token) { int const is_begin = token == token_t::StructBegin or token == token_t::ListBegin; int const is_end = token == token_t::StructEnd or token == token_t::ListEnd; return is_begin - is_end; - }); + })); // copy_if only struct/list, stable sort by level, // corresponding node indices?, // then scatter to node_range_end for struct/list end. From c09b776fcf8b4595de492ade7eff63129a08170b Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 10 Jan 2024 06:27:28 +0530 Subject: [PATCH 12/21] remove debug prints --- cpp/src/io/json/json_column.cu | 37 ---------------------------- cpp/src/io/json/json_tree.cu | 45 +--------------------------------- 2 files changed, 1 insertion(+), 81 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 9d51dee1d90..9e8a0a6bf9e 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -434,8 +434,6 @@ void make_device_json_column(device_span input, { CUDF_FUNC_RANGE(); auto num_nodes = col_ids.size(); - // TODO think about replacing all col_ids which are children of string column to ignore? (useful - // to reduce unique_col_id count for map types). rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); @@ -479,8 +477,6 @@ void make_device_json_column(device_span input, auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream); std::vector column_names = copy_strings_to_host( input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); - // for(auto str: column_names) std::cout< input, std::vector remapped_col_id(num_columns, -1); columns.try_emplace(parent_node_sentinel, std::ref(root)); - // TODO for map types support - // TODO go through input schema, and force string columns to be string. - // ignore their children too during below processing. - for (auto const this_col_id : unique_col_ids) { if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { continue; @@ -584,11 +576,9 @@ void make_device_json_column(device_span input, parent_col_id = column_parent_ids[parent_col_id]; name = column_names[field_name_col_id]; } else { - std::cout << "col_id:" << this_col_id << ", pcid:" << parent_col_id << "\n\n\n"; CUDF_FAIL("Unexpected parent column category"); } - print_vec(is_mixed_string_column, "is_mixed_string_column", to_int); if (parent_col_id != parent_node_sentinel && is_mixed_string_column[parent_col_id] == 1) { // if parent is mixed string column, ignore this column. is_mixed_string_column[this_col_id] = 1; @@ -599,8 +589,6 @@ void make_device_json_column(device_span input, // replace if this column is a nested column and the existing was a value column // ignore this column if this column is a value column and the existing was a nested column auto it = columns.find(parent_col_id); - // if(it == columns.end()) - // std::cout<<"col_id:"<second.get(); bool replaced = false; @@ -668,19 +656,6 @@ void make_device_json_column(device_span input, columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); } - // debug prints - for (auto i = 0ul; i < num_columns; i++) - printf("%3lu ", i); - printf(" col_id\n"); - print_vec(column_categories, "column_categories", to_cat); - print_vec(ignore_vals, "ignore_vals", to_int); - print_vec(is_mixed_string_column, "is_mixed_string_column", to_int); - for (auto i = 0ul; i < num_columns; i++) - printf("%3lu ", columns.count(i)); - printf(" columns\n"); - for (auto const& [key, value] : mapped_columns) { - std::cout << key.first << "+" << key.second << ":" << value << "\n"; - } if (is_mixed_type_as_string_enabled) { // ignore all children of mixed type columns @@ -719,9 +694,6 @@ void make_device_json_column(device_span input, static_cast(col.validity.data())}; } - // print_vec(column_categories, "column_categories", to_cat); - // print_vec(is_mixed_string_column, "is_mixed_string_column", to_int); - // print_vec(ignore_vals, "ignore_vals", to_int); auto d_ignore_vals = cudf::detail::make_device_uvector_async( ignore_vals, stream, rmm::mr::get_current_device_resource()); auto d_columns_data = cudf::detail::make_device_uvector_async( @@ -756,7 +728,6 @@ void make_device_json_column(device_span input, default: break; } }); - std::cout << "after str for_each_n\n"; // 4. scatter List offset // copy_if only node's whose parent is list, (node_id, parent_col_id) @@ -793,7 +764,6 @@ void make_device_json_column(device_span input, (!d_ignore_vals[col_ids[parent_node_id]]); // node_categories[parent_node_id] == NC_LIST; }); - std::cout << "after copy_if\n"; auto const num_list_children = list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); @@ -801,8 +771,6 @@ void make_device_json_column(device_span input, parent_col_ids.begin(), parent_col_ids.begin() + num_list_children, node_ids.begin()); - std::cout << "after stable_sort_by_key\n"; - std::cout << num_list_children << "\n"; thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -826,7 +794,6 @@ void make_device_json_column(device_span input, row_offsets[node_id] + 1; } }); - std::cout << "after list for_each_n\n"; // 5. scan on offsets. for (auto& [id, col_ref] : columns) { @@ -1032,7 +999,6 @@ table_with_metadata device_parse_nested_json(device_span d_input, stream, rmm::mr::get_current_device_resource()); }(); // IILE used to free memory of token data. -#define NJP_DEBUG_PRINT #ifdef NJP_DEBUG_PRINT auto h_input = cudf::detail::make_host_vector_async(d_input, stream); print_tree(h_input, gpu_tree, stream); @@ -1059,9 +1025,6 @@ table_with_metadata device_parse_nested_json(device_span d_input, stream, rmm::mr::get_current_device_resource()); - print_vec(cudf::detail::make_std_vector_async(gpu_col_id, stream), "gpu_col_id", to_int); - print_vec( - cudf::detail::make_std_vector_async(gpu_row_offsets, stream), "gpu_row_offsets", to_int); device_json_column root_column(stream, mr); root_column.type = json_col_t::ListColumn; root_column.child_offsets.resize(2, stream); diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 1c6b60bfbf1..a298b51f55c 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -60,38 +60,8 @@ #include namespace cudf::io::json { -// Debug print helpers -[[maybe_unused]] auto to_token_str = [](PdaTokenT token) -> std::string { - switch (token) { - case token_t::StructBegin: return " {"; - case token_t::StructEnd: return " }"; - case token_t::ListBegin: return " ["; - case token_t::ListEnd: return " ]"; - case token_t::FieldNameBegin: return "FB"; - case token_t::FieldNameEnd: return "FE"; - case token_t::StringBegin: return "SB"; - case token_t::StringEnd: return "SE"; - case token_t::ErrorBegin: return "er"; - case token_t::ValueBegin: return "VB"; - case token_t::ValueEnd: return "VE"; - case token_t::StructMemberBegin: return " <"; - case token_t::StructMemberEnd: return " >"; - case token_t::LineEnd: return ";"; - default: return "."; - } -}; -auto to_int = [](auto v) { return std::to_string(static_cast(v)); }; -auto print_vec = [](auto const& cpu, auto const name, auto converter) { - for (auto const& v : cpu) - printf("%3s,", converter(v).c_str()); - std::cout << name << std::endl; -}; namespace detail { -void print_tree(host_span input, - tree_meta_t const& d_gpu_tree, - rmm::cuda_stream_view stream); - // The node that a token represents struct token_to_node { __device__ auto operator()(PdaTokenT const token) -> NodeT @@ -464,19 +434,10 @@ tree_meta_t get_tree_representation(device_span tokens, is_nested, stream); - print_vec(cudf::detail::make_std_vector_async(token_levels, stream), "ntoken_levels", to_int); - thrust::exclusive_scan( rmm::exec_policy(stream), token_levels.begin(), token_levels.end(), token_levels.begin()); - print_vec(cudf::detail::make_std_vector_async(token_levels, stream), "ntoken_levels", to_int); - - rmm::device_uvector ntokens(num_nested, stream); - cudf::detail::copy_if_safe( - tokens.begin(), tokens.end(), tokens.begin(), ntokens.begin(), is_nested, stream); - print_vec(cudf::detail::make_std_vector_async(ntokens, stream), "ntokens", to_token_str); - print_vec(cudf::detail::make_std_vector_async(token_id, stream), "ntoken_id", to_int); - // + // Get parent of first child of struct/list begin. auto const first_childs_parent_token_id2 = [tokens_gpu = tokens.begin(), token_id = token_id.begin()] __device__(auto i) -> NodeIndexT { if (i <= 0) { return -1; } @@ -509,15 +470,11 @@ tree_meta_t get_tree_representation(device_span tokens, // parent_node_sentinel is -1, useful for segmented max operation below }); - print_vec( - cudf::detail::make_std_vector_async(parent_node_ids, stream), "nparent_node_ids", to_int); // propagate to siblings. propagate_parent_to_siblings( cudf::device_span{token_levels.data(), token_levels.size()}, parent_node_ids, stream); - print_vec( - cudf::detail::make_std_vector_async(parent_node_ids, stream), "nparent_node_ids", to_int); // scatter to node_range_end for all nested end tokens. (if it's end) auto token_indices_it = From 5837ca399552606955fe5eeeac8b8cf67071289a Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 10 Jan 2024 06:28:07 +0530 Subject: [PATCH 13/21] cleanup code and comments --- cpp/src/io/json/json_column.cu | 45 +++++++++++----------------------- cpp/src/io/json/json_tree.cu | 30 +++++++++++------------ 2 files changed, 28 insertions(+), 47 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 9e8a0a6bf9e..2c6c655b72c 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -555,7 +555,6 @@ void make_device_json_column(device_span input, // find column_ids which are values, but should be ignored in validity std::vector ignore_vals(num_columns, 0); std::vector is_mixed_string_column(num_columns, 0); - std::vector remapped_col_id(num_columns, -1); columns.try_emplace(parent_node_sentinel, std::ref(root)); for (auto const this_col_id : unique_col_ids) { @@ -593,36 +592,24 @@ void make_device_json_column(device_span input, auto& parent_col = it->second.get(); bool replaced = false; if (mapped_columns.count({parent_col_id, name}) > 0) { - /**/ - // TODO if mixed type is enabled. - // make both of them as str, merge them how? - // all its child columns should be ignored from parsing. (is adding to ignore_vals enough?) - // is key_value column going to slow anyway? because of host copy? + // If mixed type is enabled, make both of them as str, merge them. + // all its child columns will be ignored from parsing. if (is_mixed_type_as_string_enabled) { // VAL/STR or STRUCT or LIST + auto old_col_id = mapped_columns[{parent_col_id, name}]; + is_mixed_string_column[this_col_id] = 1; - auto old_col_id = mapped_columns[{parent_col_id, name}]; - remapped_col_id[this_col_id] = old_col_id; - // if old col type (not cat) is string/val, keep it. - // else replace with string. + is_mixed_string_column[old_col_id] = 1; + // if old col type (not cat) is not string/val, replace with string. auto& col = columns.at(old_col_id).get(); if (col.type != json_col_t::StringColumn) { // TODO: old_col_id or this_col_id ? affects max_rowoffsets, need more tests. reinitialize_as_string(old_col_id, col); - // all its children (which are already inserted) are ignored below. + // all its children (which are already inserted) are ignored later. } - is_mixed_string_column[old_col_id] = 1; columns.try_emplace(this_col_id, columns.at(old_col_id)); continue; } - // old new new - // VAL SCT LST - // VAL LST SCT - // SCT LST VAL - // SCT VAL LST - // LST VAL SCT - // LST SCT VAL - /**/ if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { ignore_vals[this_col_id] = 1; @@ -666,7 +653,7 @@ void make_device_json_column(device_span input, ignore_vals[this_col_id] = 1; columns.erase(this_col_id); } - // Convert only mixed type columns as string (so to copy) + // Convert only mixed type columns as string (so to copy), but not its children if (parent_col_id != parent_node_sentinel and is_mixed_string_column[parent_col_id] == 0 and is_mixed_string_column[this_col_id] == 1) column_categories[this_col_id] = NC_STR; @@ -686,8 +673,7 @@ void make_device_json_column(device_span input, std::vector columns_data(num_columns); for (auto& [col_id, col_ref] : columns) { if (col_id == parent_node_sentinel) continue; - auto& col = col_ref.get(); - // if(ignore_vals[col_id]) continue; + auto& col = col_ref.get(); columns_data[col_id] = json_column_data{col.string_offsets.data(), col.string_lengths.data(), col.child_offsets.data(), @@ -704,8 +690,7 @@ void make_device_json_column(device_span input, rmm::exec_policy(stream), thrust::counting_iterator(0), num_nodes, - [node_categories = tree.node_categories.begin(), - column_categories = d_column_tree.node_categories.begin(), + [column_categories = d_column_tree.node_categories.begin(), col_ids = col_ids.begin(), row_offsets = row_offsets.begin(), range_begin = tree.node_range_begin.begin(), @@ -714,7 +699,6 @@ void make_device_json_column(device_span input, d_columns_data = d_columns_data.begin()] __device__(size_type i) { if (d_ignore_vals[col_ids[i]]) return; auto const node_category = column_categories[col_ids[i]]; - // switch (node_categories[i]) { switch (node_category) { case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; @@ -753,11 +737,10 @@ void make_device_json_column(device_span input, num_nodes, thrust::make_counting_iterator(0), thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [ // node_categories = tree.node_categories.begin(), - d_ignore_vals = d_ignore_vals.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin()] __device__(size_type node_id) { + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { auto parent_node_id = parent_node_ids[node_id]; return parent_node_id != parent_node_sentinel and column_categories[col_ids[parent_node_id]] == NC_LIST and diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index a298b51f55c..9391ec23c13 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -389,14 +389,14 @@ tree_meta_t get_tree_representation(device_span tokens, stream); CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); - // Extract Struct, List range_end + // Extract Struct, List range_end: // 1. Extract Struct, List - begin & end separately, their token ids // 2. push, pop to get levels // 3. copy first child's parent token_id, also translate to node_id // 4. propagate to siblings using levels, parent token id. (segmented scan) - // 5. scatter to node_range_end for all nested end tokens. (if it's end) + // 5. scatter to node_range_end for only nested end tokens. if (is_strict_nested_boundaries) { - // Whether the token pushes onto the parent node stack + // Whether the token is nested auto const is_nested = [] __device__(PdaTokenT const token) -> bool { switch (token) { case token_t::StructBegin: @@ -409,8 +409,8 @@ tree_meta_t get_tree_representation(device_span tokens, auto const num_nested = thrust::count_if(rmm::exec_policy(stream), tokens.begin(), tokens.end(), is_nested); rmm::device_uvector token_levels(num_nested, stream); - rmm::device_uvector token_id(num_nested, stream); // 4B*2=8B, or 2B+ - rmm::device_uvector parent_node_ids(num_nested, stream); // 4B*2=8B, or 2B+ + rmm::device_uvector token_id(num_nested, stream); + rmm::device_uvector parent_node_ids(num_nested, stream); auto const push_pop_it = thrust::make_transform_iterator( tokens.begin(), cuda::proclaim_return_type([] __device__(PdaTokenT const token) { @@ -418,9 +418,7 @@ tree_meta_t get_tree_representation(device_span tokens, int const is_end = token == token_t::StructEnd or token == token_t::ListEnd; return is_begin - is_end; })); - // copy_if only struct/list, stable sort by level, - // corresponding node indices?, - // then scatter to node_range_end for struct/list end. + // copy_if only struct/list's token levels, token ids, tokens. cudf::detail::copy_if_safe(push_pop_it, push_pop_it + num_tokens, tokens.begin(), @@ -450,10 +448,8 @@ tree_meta_t get_tree_representation(device_span tokens, }; // copied L+S tokens, and their token ids, their token levels. - // first child parent token ids - // propagate to siblings - // parent token id for all ends -> similar binary search here to find its node id. - // scatter to that location. + // initialize first child parent token ids + // translate token ids to node id using similar binary search. thrust::transform( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -476,12 +472,14 @@ tree_meta_t get_tree_representation(device_span tokens, parent_node_ids, stream); - // scatter to node_range_end for all nested end tokens. (if it's end) + // scatter to node_range_end for only nested end tokens. auto token_indices_it = thrust::make_permutation_iterator(token_indices.begin(), token_id.begin()); - // add +1 to include end symbol. - auto nested_node_range_end_it = thrust::make_transform_output_iterator( - node_range_end.begin(), [] __device__(auto i) { return i + 1; }); + auto nested_node_range_end_it = + thrust::make_transform_output_iterator(node_range_end.begin(), [] __device__(auto i) { + // add +1 to include end symbol. + return i + 1; + }); auto stencil = thrust::make_transform_iterator(token_id.begin(), is_nested_end{tokens.begin()}); thrust::scatter_if(rmm::exec_policy(stream), token_indices_it, From 2ba5e9b77b164ff1061d0b913517ca263a84df47 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 11 Jan 2024 15:05:06 +0530 Subject: [PATCH 14/21] testcase when the MixedTypesAsStrings feature is disabled Co-authored-by: Andy Grove --- java/src/test/java/ai/rapids/cudf/TableTest.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 99fb5532332..119169b6458 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -328,7 +328,18 @@ void testReadJSONFile() { assertTablesAreEqual(expected, table); } } - + @Test + void testReadMixedType2JSONFileFeatureDisabled() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(false) + .build(); + assertThrows(CudfException.class, () -> + Table.readJSON(schema, opts, TEST_MIXED_TYPE_2_JSON)); + } @Test void testReadMixedType1JSONFile() { Schema schema = Schema.builder() From a3c1fe2b0882a245acc8937c57a805eab95d34a5 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 11 Jan 2024 17:21:59 +0530 Subject: [PATCH 15/21] update mixed string, enable test for data source json --- java/src/main/native/src/TableJni.cpp | 3 ++- java/src/test/java/ai/rapids/cudf/TableTest.java | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index c43a99a58d5..85780589afb 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1506,7 +1506,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines)) - .recovery_mode(recovery_mode); + .recovery_mode(recovery_mode) + .mixed_types_as_string(mixed_types_as_string); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 119169b6458..f98476d9786 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -358,7 +358,7 @@ void testReadMixedType1JSONFile() { } @Test - void testReadMixedType2JSONFile() { + void testReadMixedType2JSONFile() throws IOException { Schema schema = Schema.builder() .column(DType.STRING, "a") .build(); @@ -369,7 +369,8 @@ void testReadMixedType2JSONFile() { try (Table expected = new Table.TestBuilder() .column("[1,2,3]", "{ \"b\": 1 }" ) .build(); - Table table = Table.readJSON(schema, opts, TEST_MIXED_TYPE_2_JSON)) { + MultiBufferDataSource source = sourceFrom(TEST_MIXED_TYPE_2_JSON); + Table table = Table.readJSON(schema, opts, source)) { assertTablesAreEqual(expected, table); } } From e4da81e2b78c6f64f064938d0c8eeed553076d1d Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 11 Jan 2024 21:31:13 +0530 Subject: [PATCH 16/21] add line to separate tests Co-authored-by: Andy Grove --- java/src/test/java/ai/rapids/cudf/TableTest.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index f98476d9786..73002644858 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -328,6 +328,7 @@ void testReadJSONFile() { assertTablesAreEqual(expected, table); } } + @Test void testReadMixedType2JSONFileFeatureDisabled() { Schema schema = Schema.builder() @@ -340,6 +341,7 @@ void testReadMixedType2JSONFileFeatureDisabled() { assertThrows(CudfException.class, () -> Table.readJSON(schema, opts, TEST_MIXED_TYPE_2_JSON)); } + @Test void testReadMixedType1JSONFile() { Schema schema = Schema.builder() From df4eb7d31c5c7a4640bde90801e7032308c14401 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 13 Jan 2024 03:16:01 +0530 Subject: [PATCH 17/21] addressed review comments (@elstehle) --- cpp/src/io/json/json_tree.cu | 37 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 9391ec23c13..7ef3707332b 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -414,34 +414,27 @@ tree_meta_t get_tree_representation(device_span tokens, auto const push_pop_it = thrust::make_transform_iterator( tokens.begin(), cuda::proclaim_return_type([] __device__(PdaTokenT const token) { - int const is_begin = token == token_t::StructBegin or token == token_t::ListBegin; - int const is_end = token == token_t::StructEnd or token == token_t::ListEnd; + size_type const is_begin = token == token_t::StructBegin or token == token_t::ListBegin; + size_type const is_end = token == token_t::StructEnd or token == token_t::ListEnd; return is_begin - is_end; })); // copy_if only struct/list's token levels, token ids, tokens. - cudf::detail::copy_if_safe(push_pop_it, - push_pop_it + num_tokens, - tokens.begin(), - token_levels.begin(), - is_nested, - stream); - cudf::detail::copy_if_safe(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + num_tokens, - tokens.begin(), - token_id.begin(), - is_nested, - stream); + auto zipped_in_it = + thrust::make_zip_iterator(push_pop_it, thrust::make_counting_iterator(0)); + auto zipped_out_it = thrust::make_zip_iterator(token_levels.begin(), token_id.begin()); + cudf::detail::copy_if_safe( + zipped_in_it, zipped_in_it + num_tokens, tokens.begin(), zipped_out_it, is_nested, stream); thrust::exclusive_scan( rmm::exec_policy(stream), token_levels.begin(), token_levels.end(), token_levels.begin()); // Get parent of first child of struct/list begin. - auto const first_childs_parent_token_id2 = + auto const nested_first_childs_parent_token_id = [tokens_gpu = tokens.begin(), token_id = token_id.begin()] __device__(auto i) -> NodeIndexT { if (i <= 0) { return -1; } - auto id = token_id[i - 1]; // token indices. + auto id = token_id[i - 1]; // current token's predecessor if (tokens_gpu[id] == token_t::StructBegin or tokens_gpu[id] == token_t::ListBegin) { - return token_id[i - 1]; + return id; } else { return -1; } @@ -455,10 +448,12 @@ tree_meta_t get_tree_representation(device_span tokens, thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_nested, parent_node_ids.begin(), - [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id2] __device__( - NodeIndexT const tid) -> NodeIndexT { - auto const pid = first_childs_parent_token_id2(tid); - // return pid; + [node_ids_gpu = node_token_ids.begin(), + num_nodes, + nested_first_childs_parent_token_id] __device__(NodeIndexT const tid) -> NodeIndexT { + auto const pid = nested_first_childs_parent_token_id(tid); + // token_ids which are converted to nodes, are stored in node_ids_gpu in order + // so finding index of token_id in node_ids_gpu will return its node index. return pid < 0 ? parent_node_sentinel : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - From 6ddac345f988096f9378f511e9f1734829758258 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 16 Jan 2024 20:13:00 +0530 Subject: [PATCH 18/21] Apply suggestions from code review Co-authored-by: Elias Stehle <3958403+elstehle@users.noreply.github.com> --- cpp/src/io/json/json_column.cu | 1 - cpp/src/io/json/json_tree.cu | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 2c6c655b72c..11683eb0586 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -745,7 +745,6 @@ void make_device_json_column(device_span input, return parent_node_id != parent_node_sentinel and column_categories[col_ids[parent_node_id]] == NC_LIST and (!d_ignore_vals[col_ids[parent_node_id]]); - // node_categories[parent_node_id] == NC_LIST; }); auto const num_list_children = diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 7ef3707332b..82a60327a3b 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -135,7 +135,7 @@ struct node_ranges { }; struct is_nested_end { - SymbolT const* tokens; + PdaTokenT const* tokens; __device__ auto operator()(NodeIndexT i) -> bool { return tokens[i] == token_t::StructEnd or tokens[i] == token_t::ListEnd; From ea40c558d49a9e74ee3a06580095230a010efc7c Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 19 Jan 2024 01:45:28 +0530 Subject: [PATCH 19/21] address review comments (bdice), add test cases for max row offset test --- cpp/include/cudf/io/json.hpp | 14 ++++----- cpp/src/io/json/json_column.cu | 27 ++++++++--------- cpp/src/io/json/json_tree.cu | 14 +++++---- cpp/tests/io/json_test.cpp | 30 +++++++++++++++++-- .../main/java/ai/rapids/cudf/JSONOptions.java | 2 +- 5 files changed, 58 insertions(+), 29 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 67acee363da..2a39a539cc7 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -98,7 +98,7 @@ class json_reader_options { // Read the file as a json object per line bool _lines = false; - // Read the mixed types as string column + // Parse mixed types as a string column bool _mixed_types_as_string = false; // Bytes to skip from the start @@ -228,9 +228,9 @@ class json_reader_options { bool is_enabled_lines() const { return _lines; } /** - * @brief Whether to read the mixed types as string column. + * @brief Whether to parse mixed types as a string column. * - * @return `true` if reading the mixed types as string column + * @return `true` if mixed types are parsed as a string column */ bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; } @@ -312,9 +312,9 @@ class json_reader_options { void enable_lines(bool val) { _lines = val; } /** - * @brief Set whether to read the mixed types as string column. + * @brief Set whether to parse mixed types as a string column. * - * @param val Boolean value to enable/disable the option to read the mixed types as string column + * @param val Boolean value to enable/disable parsing mixed types as a string column */ void enable_mixed_types_as_string(bool val) { _mixed_types_as_string = val; } @@ -454,9 +454,9 @@ class json_reader_options_builder { } /** - * @brief Set whether to read the mixed types as string column. + * @brief Set whether to parse mixed types as a string column. * - * @param val Boolean value to enable/disable the option to read the mixed types as string column + * @param val Boolean value to enable/disable parsing mixed types as a string column * @return this for chaining */ json_reader_options_builder& mixed_types_as_string(bool val) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 11683eb0586..65d2fb7f28b 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -554,7 +554,7 @@ void make_device_json_column(device_span input, std::map, NodeIndexT> mapped_columns; // find column_ids which are values, but should be ignored in validity std::vector ignore_vals(num_columns, 0); - std::vector is_mixed_string_column(num_columns, 0); + std::vector is_mixed_type_column(num_columns, 0); columns.try_emplace(parent_node_sentinel, std::ref(root)); for (auto const this_col_id : unique_col_ids) { @@ -578,10 +578,10 @@ void make_device_json_column(device_span input, CUDF_FAIL("Unexpected parent column category"); } - if (parent_col_id != parent_node_sentinel && is_mixed_string_column[parent_col_id] == 1) { + if (parent_col_id != parent_node_sentinel && is_mixed_type_column[parent_col_id] == 1) { // if parent is mixed string column, ignore this column. - is_mixed_string_column[this_col_id] = 1; - ignore_vals[this_col_id] = 1; + is_mixed_type_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; continue; } // If the child is already found, @@ -592,18 +592,17 @@ void make_device_json_column(device_span input, auto& parent_col = it->second.get(); bool replaced = false; if (mapped_columns.count({parent_col_id, name}) > 0) { - // If mixed type is enabled, make both of them as str, merge them. - // all its child columns will be ignored from parsing. + // If mixed type as string is enabled, make both of them strings and merge them. + // All child columns will be ignored when parsing. if (is_mixed_type_as_string_enabled) { // VAL/STR or STRUCT or LIST auto old_col_id = mapped_columns[{parent_col_id, name}]; - is_mixed_string_column[this_col_id] = 1; - is_mixed_string_column[old_col_id] = 1; + is_mixed_type_column[this_col_id] = 1; + is_mixed_type_column[old_col_id] = 1; // if old col type (not cat) is not string/val, replace with string. auto& col = columns.at(old_col_id).get(); if (col.type != json_col_t::StringColumn) { - // TODO: old_col_id or this_col_id ? affects max_rowoffsets, need more tests. reinitialize_as_string(old_col_id, col); // all its children (which are already inserted) are ignored later. } @@ -648,14 +647,14 @@ void make_device_json_column(device_span input, // ignore all children of mixed type columns for (auto const this_col_id : unique_col_ids) { auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and is_mixed_string_column[parent_col_id] == 1) { - is_mixed_string_column[this_col_id] = 1; - ignore_vals[this_col_id] = 1; + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { + is_mixed_type_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; columns.erase(this_col_id); } // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and is_mixed_string_column[parent_col_id] == 0 and - is_mixed_string_column[this_col_id] == 1) + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and + is_mixed_type_column[this_col_id] == 1) column_categories[this_col_id] = NC_STR; } cudaMemcpyAsync(d_column_tree.node_categories.begin(), diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 82a60327a3b..01c7b869217 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -413,11 +413,15 @@ tree_meta_t get_tree_representation(device_span tokens, rmm::device_uvector parent_node_ids(num_nested, stream); auto const push_pop_it = thrust::make_transform_iterator( tokens.begin(), - cuda::proclaim_return_type([] __device__(PdaTokenT const token) { - size_type const is_begin = token == token_t::StructBegin or token == token_t::ListBegin; - size_type const is_end = token == token_t::StructEnd or token == token_t::ListEnd; - return is_begin - is_end; - })); + cuda::proclaim_return_type( + [] __device__(PdaTokenT const token) -> size_type { + if (token == token_t::StructBegin or token == token_t::ListBegin) { + return 1; + } else if (token == token_t::StructEnd or token == token_t::ListEnd) { + return -1; + } + return 0; + })); // copy_if only struct/list's token levels, token ids, tokens. auto zipped_in_it = thrust::make_zip_iterator(push_pop_it, thrust::make_counting_iterator(0)); diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index e0de52c3870..22c2f0de924 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -2133,8 +2132,35 @@ TEST_F(JsonReaderTest, MixedTypes) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); - cudf::test::print(result.tbl->get_column(0)); } + + // test to confirm if reinitialize a non-string column as string affects max_rowoffsets. + // max_rowoffsets is generated based on parent col id, + // so, even if mixed types are present, their row offset will be correct. + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + + cudf::test::lists_column_wrapper expected_list{ + { + cudf::test::lists_column_wrapper({LCW({"1", "2", "3"}), LCW({"4", "5", "6"})}), + cudf::test::lists_column_wrapper({LCW()}), + cudf::test::lists_column_wrapper({LCW()}), // null + cudf::test::lists_column_wrapper({LCW()}), // null + cudf::test::lists_column_wrapper({LCW({"{\"c\": -1}"}), LCW({"5"})}), + cudf::test::lists_column_wrapper({LCW({"7"}), LCW({"8", "9"})}), + cudf::test::lists_column_wrapper({LCW()}), // null + }, + valid_t{1, 1, 0, 0, 1, 1, 0}.begin()}; + test_fn(R"( +{"b": [ [1, 2, 3], [ 4, 5, 6] ]} +{"b": [[]]} +{} +{} +{"b": [ [ {"c": -1} ], [ 5 ] ]} +{"b": [ [7], [8, 9]]} +{} +)", + expected_list); } CUDF_TEST_PROGRAM_MAIN() diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index ae964f897f9..523d594f8ba 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -113,7 +113,7 @@ public Builder withRecoverWithNull(boolean recoverWithNull) { * Specify how to handle columns that contain mixed types. * * @param mixedTypesAsStrings true: return unparsed JSON, false: throw exception - * @@return builder for chaining + * @return builder for chaining */ public Builder withMixedTypesAsStrings(boolean mixedTypesAsStrings) { this.mixedTypesAsStrings = mixedTypesAsStrings; From 49058081a854b01e1ada44e829c8283ab3acc961 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 19 Jan 2024 08:53:50 +0530 Subject: [PATCH 20/21] renaming arguments --- cpp/src/io/json/json_column.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 09440e81ceb..c61cc533a81 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -417,6 +417,7 @@ struct json_column_data { * @param root Root node of the `d_json_column` tree * @param is_array_of_arrays Whether the tree is an array of arrays * @param is_enabled_lines Whether the input is a line-delimited JSON + * @param is_enabled_mixed_types_as_string Whether to enable reading mixed types as string * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the device memory * of child_offets and validity members of `d_json_column` @@ -428,7 +429,7 @@ void make_device_json_column(device_span input, device_json_column& root, bool is_array_of_arrays, bool is_enabled_lines, - bool is_mixed_type_as_string_enabled, + bool is_enabled_mixed_types_as_string, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -579,7 +580,7 @@ void make_device_json_column(device_span input, } if (parent_col_id != parent_node_sentinel && is_mixed_type_column[parent_col_id] == 1) { - // if parent is mixed string column, ignore this column. + // if parent is mixed type column, ignore this column. is_mixed_type_column[this_col_id] = 1; ignore_vals[this_col_id] = 1; continue; @@ -594,7 +595,7 @@ void make_device_json_column(device_span input, if (mapped_columns.count({parent_col_id, name}) > 0) { // If mixed type as string is enabled, make both of them strings and merge them. // All child columns will be ignored when parsing. - if (is_mixed_type_as_string_enabled) { + if (is_enabled_mixed_types_as_string) { // VAL/STR or STRUCT or LIST auto old_col_id = mapped_columns[{parent_col_id, name}]; @@ -643,7 +644,7 @@ void make_device_json_column(device_span input, mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); } - if (is_mixed_type_as_string_enabled) { + if (is_enabled_mixed_types_as_string) { // ignore all children of mixed type columns for (auto const this_col_id : unique_col_ids) { auto parent_col_id = column_parent_ids[this_col_id]; From d2e06911381d00755bc6ccb148623bb5aa1ea5f2 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 22 Jan 2024 18:00:36 +0530 Subject: [PATCH 21/21] address review comments, rename, fix reinit condition --- cpp/src/io/json/json_column.cu | 4 ++-- cpp/src/io/json/json_tree.cu | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index c61cc533a81..b1dc2c9dd7f 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -601,9 +601,9 @@ void make_device_json_column(device_span input, is_mixed_type_column[this_col_id] = 1; is_mixed_type_column[old_col_id] = 1; - // if old col type (not cat) is not string/val, replace with string. + // if old col type (not cat) is list or struct, replace with string. auto& col = columns.at(old_col_id).get(); - if (col.type != json_col_t::StringColumn) { + if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { reinitialize_as_string(old_col_id, col); // all its children (which are already inserted) are ignored later. } diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 01c7b869217..275907c19c9 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -192,16 +192,16 @@ std::pair, rmm::device_uvector> stable_s } /** - * @brief Propagate parent node to siblings from first sibling. + * @brief Propagate parent node from first sibling to other siblings. * * @param node_levels Node levels of each node * @param parent_node_ids parent node ids initialized for first child of each push node, * and other siblings are initialized to -1. * @param stream CUDA stream used for device memory operations and kernel launches. */ -void propagate_parent_to_siblings(cudf::device_span node_levels, - cudf::device_span parent_node_ids, - rmm::cuda_stream_view stream) +void propagate_first_sibling_to_other(cudf::device_span node_levels, + cudf::device_span parent_node_ids, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); auto [sorted_node_levels, sorted_order] = stable_sorted_key_order(node_levels, stream); @@ -354,7 +354,7 @@ tree_meta_t get_tree_representation(device_span tokens, }); } // Propagate parent node to siblings from first sibling - inplace. - propagate_parent_to_siblings( + propagate_first_sibling_to_other( cudf::device_span{node_levels.data(), node_levels.size()}, parent_node_ids, stream); @@ -465,8 +465,8 @@ tree_meta_t get_tree_representation(device_span tokens, // parent_node_sentinel is -1, useful for segmented max operation below }); - // propagate to siblings. - propagate_parent_to_siblings( + // propagate parent node from first sibling to other siblings - inplace. + propagate_first_sibling_to_other( cudf::device_span{token_levels.data(), token_levels.size()}, parent_node_ids, stream);