Use more pylibcudf.io.types enums in cudf._lib (#17237)

If we consider the `pylibcudf.libcudf` namespace to eventually be more "private", this PR replaces usage of that namespace, specifically when accessing enums, with their public counterparts.

Authors:
- Matthew Roeschke (https://github.com/mroeschke)

Approvers:
- Lawrence Mitchell (https://github.com/wence-)

URL: #17237
rapidsai · Nov 4, 2024 · a2001dd · a2001dd
1 parent 076ad58
commit a2001dd
Show file tree

Hide file tree

Showing 5 changed files with 111 additions and 128 deletions.
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
@@ -28,7 +28,7 @@ from pylibcudf.libcudf.io.csv cimport (
     write_csv as cpp_write_csv,
 )
 from pylibcudf.libcudf.io.data_sink cimport data_sink
-from pylibcudf.libcudf.io.types cimport compression_type, sink_info
+from pylibcudf.libcudf.io.types cimport sink_info
 from pylibcudf.libcudf.table.table_view cimport table_view
 
 from cudf._lib.io.utils cimport make_sink_info
@@ -148,13 +148,13 @@ def read_csv(
         byte_range = (0, 0)
 
     if compression is None:
-        c_compression = compression_type.NONE
+        c_compression = plc.io.types.CompressionType.NONE
     else:
         compression_map = {
-            "infer": compression_type.AUTO,
-            "gzip": compression_type.GZIP,
-            "bz2": compression_type.BZIP2,
-            "zip": compression_type.ZIP,
+            "infer": plc.io.types.CompressionType.AUTO,
+            "gzip": plc.io.types.CompressionType.GZIP,
+            "bz2": plc.io.types.CompressionType.BZIP2,
+            "zip": plc.io.types.CompressionType.ZIP,
         }
         c_compression = compression_map[compression]
 

diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
@@ -9,10 +9,6 @@ from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
 
-cimport pylibcudf.libcudf.io.types as cudf_io_types
-from pylibcudf.io.types cimport compression_type
-from pylibcudf.libcudf.io.json cimport json_recovery_mode_t
-from pylibcudf.libcudf.io.types cimport compression_type
 from pylibcudf.libcudf.types cimport data_type, type_id
 from pylibcudf.types cimport DataType
 
@@ -24,15 +20,6 @@ from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io
 import pylibcudf as plc
 
 
-cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines):
-    if on_bad_lines.lower() == "error":
-        return json_recovery_mode_t.FAIL
-    elif on_bad_lines.lower() == "recover":
-        return json_recovery_mode_t.RECOVER_WITH_NULL
-    else:
-        raise TypeError(f"Invalid parameter for {on_bad_lines=}")
-
-
 cpdef read_json(object filepaths_or_buffers,
                 object dtype,
                 bool lines,
@@ -41,7 +28,7 @@ cpdef read_json(object filepaths_or_buffers,
                 bool keep_quotes,
                 bool mixed_types_as_string,
                 bool prune_columns,
-                object on_bad_lines):
+                str on_bad_lines):
     """
     Cython function to call into libcudf API, see `read_json`.
 
@@ -64,19 +51,24 @@ cpdef read_json(object filepaths_or_buffers,
             filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode()
 
     # Setup arguments
-    cdef cudf_io_types.compression_type c_compression
-
     if compression is not None:
         if compression == 'gzip':
-            c_compression = cudf_io_types.compression_type.GZIP
+            c_compression = plc.io.types.CompressionType.GZIP
         elif compression == 'bz2':
-            c_compression = cudf_io_types.compression_type.BZIP2
+            c_compression = plc.io.types.CompressionType.BZIP2
         elif compression == 'zip':
-            c_compression = cudf_io_types.compression_type.ZIP
+            c_compression = plc.io.types.CompressionType.ZIP
         else:
-            c_compression = cudf_io_types.compression_type.AUTO
+            c_compression = plc.io.types.CompressionType.AUTO
+    else:
+        c_compression = plc.io.types.CompressionType.NONE
+
+    if on_bad_lines.lower() == "error":
+        c_on_bad_lines = plc.io.types.JSONRecoveryMode.FAIL
+    elif on_bad_lines.lower() == "recover":
+        c_on_bad_lines = plc.io.types.JSONRecoveryMode.RECOVER_WITH_NULL
     else:
-        c_compression = cudf_io_types.compression_type.NONE
+        raise TypeError(f"Invalid parameter for {on_bad_lines=}")
 
     processed_dtypes = None
 
@@ -108,7 +100,7 @@ cpdef read_json(object filepaths_or_buffers,
             keep_quotes = keep_quotes,
             mixed_types_as_string = mixed_types_as_string,
             prune_columns = prune_columns,
-            recovery_mode = _get_json_recovery_mode(on_bad_lines)
+            recovery_mode = c_on_bad_lines
         )
         df = cudf.DataFrame._from_data(
             *_data_from_columns(
@@ -130,7 +122,7 @@ cpdef read_json(object filepaths_or_buffers,
             keep_quotes = keep_quotes,
             mixed_types_as_string = mixed_types_as_string,
             prune_columns = prune_columns,
-            recovery_mode = _get_json_recovery_mode(on_bad_lines)
+            recovery_mode = c_on_bad_lines
         )
 
         df = cudf.DataFrame._from_data(

diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
@@ -15,7 +15,6 @@ try:
 except ImportError:
     import json
 
-cimport pylibcudf.libcudf.io.types as cudf_io_types
 cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view
 from pylibcudf.libcudf.io.data_sink cimport data_sink
 from pylibcudf.libcudf.io.orc cimport (
@@ -26,7 +25,6 @@ from pylibcudf.libcudf.io.orc cimport (
 )
 from pylibcudf.libcudf.io.types cimport (
     column_in_metadata,
-    compression_type,
     sink_info,
     table_input_metadata,
 )
@@ -137,22 +135,23 @@ cpdef read_orc(object filepaths_or_buffers,
     return data, index
 
 
-cdef compression_type _get_comp_type(object compression):
+def _get_comp_type(object compression):
     if compression is None or compression is False:
-        return compression_type.NONE
+        return plc.io.types.CompressionType.NONE
 
     compression = str(compression).upper()
     if compression == "SNAPPY":
-        return compression_type.SNAPPY
+        return plc.io.types.CompressionType.SNAPPY
     elif compression == "ZLIB":
-        return compression_type.ZLIB
+        return plc.io.types.CompressionType.ZLIB
     elif compression == "ZSTD":
-        return compression_type.ZSTD
+        return plc.io.types.CompressionType.ZSTD
     elif compression == "LZ4":
-        return compression_type.LZ4
+        return plc.io.types.CompressionType.LZ4
     else:
         raise ValueError(f"Unsupported `compression` type {compression}")
 
+
 cdef tuple _get_index_from_metadata(
         vector[map[string, string]] user_data,
         object names,
@@ -210,19 +209,20 @@ cdef tuple _get_index_from_metadata(
         range_idx
     )
 
-cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics):
+
+def _get_orc_stat_freq(str statistics):
     """
     Convert ORC statistics terms to CUDF convention:
       - ORC "STRIPE"   == CUDF "ROWGROUP"
       - ORC "ROWGROUP" == CUDF "PAGE"
     """
     statistics = str(statistics).upper()
     if statistics == "NONE":
-        return cudf_io_types.statistics_freq.STATISTICS_NONE
+        return plc.io.types.StatisticsFreq.STATISTICS_NONE
     elif statistics == "STRIPE":
-        return cudf_io_types.statistics_freq.STATISTICS_ROWGROUP
+        return plc.io.types.StatisticsFreq.STATISTICS_ROWGROUP
     elif statistics == "ROWGROUP":
-        return cudf_io_types.statistics_freq.STATISTICS_PAGE
+        return plc.io.types.StatisticsFreq.STATISTICS_PAGE
     else:
         raise ValueError(f"Unsupported `statistics_freq` type {statistics}")
 
@@ -232,7 +232,7 @@ def write_orc(
     table,
     object path_or_buf,
     object compression="snappy",
-    object statistics="ROWGROUP",
+    str statistics="ROWGROUP",
     object stripe_size_bytes=None,
     object stripe_size_rows=None,
     object row_index_stride=None,
@@ -246,7 +246,6 @@ def write_orc(
     --------
     cudf.read_orc
     """
-    cdef compression_type compression_ = _get_comp_type(compression)
     cdef unique_ptr[data_sink] data_sink_c
     cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)
     cdef table_input_metadata tbl_meta
@@ -289,7 +288,7 @@ def write_orc(
             sink_info_c, tv
         ).metadata(tbl_meta)
         .key_value_metadata(move(user_data))
-        .compression(compression_)
+        .compression(_get_comp_type(compression))
         .enable_statistics(_get_orc_stat_freq(statistics))
         .build()
     )
@@ -330,8 +329,8 @@ cdef class ORCWriter:
     cdef unique_ptr[orc_chunked_writer] writer
     cdef sink_info sink
     cdef unique_ptr[data_sink] _data_sink
-    cdef cudf_io_types.statistics_freq stat_freq
-    cdef compression_type comp_type
+    cdef str statistics
+    cdef object compression
     cdef object index
     cdef table_input_metadata tbl_meta
     cdef object cols_as_map_type
@@ -343,15 +342,15 @@ cdef class ORCWriter:
                   object path,
                   object index=None,
                   object compression="snappy",
-                  object statistics="ROWGROUP",
+                  str statistics="ROWGROUP",
                   object cols_as_map_type=None,
                   object stripe_size_bytes=None,
                   object stripe_size_rows=None,
                   object row_index_stride=None):
 
         self.sink = make_sink_info(path, self._data_sink)
-        self.stat_freq = _get_orc_stat_freq(statistics)
-        self.comp_type = _get_comp_type(compression)
+        self.statistics = statistics
+        self.compression = compression
         self.index = index
         self.cols_as_map_type = cols_as_map_type \
             if cols_as_map_type is None else set(cols_as_map_type)
@@ -429,8 +428,8 @@ cdef class ORCWriter:
                 chunked_orc_writer_options.builder(self.sink)
                 .metadata(self.tbl_meta)
                 .key_value_metadata(move(user_data))
-                .compression(self.comp_type)
-                .enable_statistics(self.stat_freq)
+                .compression(_get_comp_type(self.compression))
+                .enable_statistics(_get_orc_stat_freq(self.statistics))
                 .build()
             )
         if self.stripe_size_bytes is not None: