[Datasets] Update docs for drop_columns and fix typos (#26317)
We added the drop_columns() API to Datasets in #26200, so this updates the documentation in doc/source/data/examples/nyc_taxi_basic_processing.ipynb to use the new API (a before/after sketch follows the change summary below). In addition, this fixes some minor typos found while proofreading the Datasets documentation.
c21 authored Jul 8, 2022
1 parent ea94cda commit 4e674b6
Showing 6 changed files with 13 additions and 13 deletions.
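For context, a minimal before/after sketch of the main change (not part of the commit; it assumes ds is a Dataset already loaded from the NYC taxi Parquet data, as in the notebook):

# Before: drop columns with a lambda inside map_batches.
ds = ds.map_batches(lambda df: df.drop(columns=["store_and_fwd_flag", "mta_tax"]))

# After: the dedicated drop_columns() API added in #26200.
ds = ds.drop_columns(["store_and_fwd_flag", "mta_tax"])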
2 changes: 1 addition & 1 deletion doc/source/data/creating-datasets.rst
@@ -390,7 +390,7 @@ Supported File Formats
treating the outermost ndarray dimension as the row dimension. See our
:ref:`tensor data guide <datasets_tensor_support>` for more information on working
with tensors in Datasets. Although this simple example demonstrates reading a single
-file, note that Datasets can also read directories of JSON files, with one tensor
+file, note that Datasets can also read directories of NumPy files, with one tensor
block created per file.

.. literalinclude:: ./doc_code/creating_datasets.py
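A hedged illustration of reading a directory of NumPy files (not part of the diff; the /tmp/npy_dir path and file names are made up for this sketch):

import os

import numpy as np
import ray

# Write two small .npy files into one directory (illustrative paths).
os.makedirs("/tmp/npy_dir", exist_ok=True)
np.save("/tmp/npy_dir/a.npy", np.arange(8).reshape(4, 2))
np.save("/tmp/npy_dir/b.npy", np.arange(8, 16).reshape(4, 2))

# Reading the directory creates one tensor block per file, with the outermost
# ndarray dimension treated as the row dimension.
ds = ray.data.read_numpy("/tmp/npy_dir")
print(ds.count())  # -> 8 rows total across the two files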
2 changes: 1 addition & 1 deletion doc/source/data/dataset-ml-preprocessing.rst
@@ -24,7 +24,7 @@ ML pipeline completely within Ray without requiring data to be materialized to e
:width: 650px
:align: center

-See the :ref:`ML preprocessing docs <datasets-ml-preprocessing>` for information on how to use Datasets as the
+See below for information on how to use Datasets as the
last-mile bridge to model training and inference, and see :ref:`the Talks section <data-talks>` for more
Datasets ML use cases and benchmarks.

16 changes: 8 additions & 8 deletions doc/source/data/doc_code/saving_datasets.py
@@ -32,13 +32,13 @@

# Write out just one file.
ds.repartition(1).write_csv("/tmp/one_csv")
-# -> /tmp/one_parquet/d757569dfb2845589b0ccbcb263e8cc3_000000.csv
+# -> /tmp/one_csv/d757569dfb2845589b0ccbcb263e8cc3_000000.csv

# Write out multiple files.
ds.repartition(3).write_csv("/tmp/multi_csv")
-# -> /tmp/multi_parquet/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000000.csv
-# -> /tmp/multi_parquet/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000001.csv
-# -> /tmp/multi_parquet/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000002.csv
+# -> /tmp/multi_csv/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000000.csv
+# -> /tmp/multi_csv/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000001.csv
+# -> /tmp/multi_csv/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000002.csv
# __write_csv_end__
# fmt: on

@@ -53,13 +53,13 @@

# Write out just one file.
ds.repartition(1).write_json("/tmp/one_json")
-# -> /tmp/one_parquet/ab693fde13634f4c8cdaef1db9595ac1_000000.json
+# -> /tmp/one_json/ab693fde13634f4c8cdaef1db9595ac1_000000.json

# Write out multiple files.
ds.repartition(3).write_json("/tmp/multi_json")
-# -> /tmp/multi_parquet/f467636b3c41420bb109505ab56c6eae_000000.json
-# -> /tmp/multi_parquet/f467636b3c41420bb109505ab56c6eae_000001.json
-# -> /tmp/multi_parquet/f467636b3c41420bb109505ab56c6eae_000002.json
+# -> /tmp/multi_json/f467636b3c41420bb109505ab56c6eae_000000.json
+# -> /tmp/multi_json/f467636b3c41420bb109505ab56c6eae_000001.json
+# -> /tmp/multi_json/f467636b3c41420bb109505ab56c6eae_000002.json
# __write_json_end__
# fmt: on

2 changes: 1 addition & 1 deletion doc/source/data/examples/nyc_taxi_basic_processing.ipynb
@@ -569,7 +569,7 @@
],
"source": [
"# Drop some columns.\n",
"ds = ds.map_batches(lambda df: df.drop(columns=[\"store_and_fwd_flag\", \"mta_tax\"]))"
"ds = ds.drop_columns([\"store_and_fwd_flag\", \"mta_tax\"])"
]
},
{
2 changes: 1 addition & 1 deletion doc/source/data/performance-tips.rst
@@ -99,7 +99,7 @@ just two of the five columns of Iris dataset.
Parquet Row Pruning
~~~~~~~~~~~~~~~~~~~

-Similarly, you can pass in a filter to ``ray.data.read_parquet()`` (selection pushdown)
+Similarly, you can pass in a filter to ``ray.data.read_parquet()`` (filter pushdown)
which will be applied at the file scan so only rows that match the filter predicate
will be returned.
For example, use ``ray.data.read_parquet("example://iris.parquet", filter=pa.dataset.field("sepal.length") > 5.0)``
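For reference, a runnable sketch of the filter-pushdown call described above (it reuses the example://iris.parquet file referenced in these docs; not part of the diff):

import pyarrow as pa
import pyarrow.dataset  # noqa: F401  # makes pa.dataset available for the filter expression
import ray

# Only rows with sepal.length > 5.0 come back from the file scan.
ds = ray.data.read_parquet(
    "example://iris.parquet",
    filter=pa.dataset.field("sepal.length") > 5.0,
)
ds.show(3)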
2 changes: 1 addition & 1 deletion python/ray/data/dataset.py
@@ -561,7 +561,7 @@ def drop_columns(
compute: The compute strategy, either "tasks" (default) to use Ray
tasks, or ActorPoolStrategy(min, max) to use an autoscaling actor pool.
ray_remote_args: Additional resource requirements to request from
-ray (e.g., num_gpus=1 to request GPUs for the map tasks
+ray (e.g., num_gpus=1 to request GPUs for the map tasks).
"""

return self.map_batches(
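A hedged usage sketch of the compute and ray_remote_args parameters documented above (assuming, as the underlying map_batches call shown above suggests, that ray_remote_args are forwarded as extra keyword arguments; the Iris columns are borrowed from examples elsewhere in these docs):

import ray

ds = ray.data.read_parquet("example://iris.parquet")

# Drop a column with the default task-based compute strategy, requesting one
# GPU per map task (mirroring the num_gpus example in the docstring).
ds = ds.drop_columns(["sepal.width"], compute="tasks", num_gpus=1)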
