From 4e674b6ad363a083e3f5ba02ef792a1456bebf48 Mon Sep 17 00:00:00 2001
From: Cheng Su
Date: Thu, 7 Jul 2022 17:17:33 -0700
Subject: [PATCH] [Datasets] Update docs for drop_columns and fix typos (#26317)

We added the drop_columns() API to Datasets in #26200, so this change
updates doc/source/data/examples/nyc_taxi_basic_processing.ipynb to use
the new API. In addition, it fixes some minor typos found while
proofreading the Datasets documentation.
---
 doc/source/data/creating-datasets.rst        |  2 +-
 doc/source/data/dataset-ml-preprocessing.rst |  2 +-
 doc/source/data/doc_code/saving_datasets.py  | 16 ++++++++--------
 .../examples/nyc_taxi_basic_processing.ipynb |  2 +-
 doc/source/data/performance-tips.rst         |  2 +-
 python/ray/data/dataset.py                   |  2 +-
 6 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/doc/source/data/creating-datasets.rst b/doc/source/data/creating-datasets.rst
index d7ec4437ceef..959fec24a94d 100644
--- a/doc/source/data/creating-datasets.rst
+++ b/doc/source/data/creating-datasets.rst
@@ -390,7 +390,7 @@ Supported File Formats
 treating the outermost ndarray dimension as the row dimension. See our
 :ref:`tensor data guide ` for more information on working
 with tensors in Datasets. Although this simple example demonstrates reading a single
-file, note that Datasets can also read directories of JSON files, with one tensor
+file, note that Datasets can also read directories of NumPy files, with one tensor
 block created per file.

 .. literalinclude:: ./doc_code/creating_datasets.py
diff --git a/doc/source/data/dataset-ml-preprocessing.rst b/doc/source/data/dataset-ml-preprocessing.rst
index 1f090ec0bf27..004ce302f8af 100644
--- a/doc/source/data/dataset-ml-preprocessing.rst
+++ b/doc/source/data/dataset-ml-preprocessing.rst
@@ -24,7 +24,7 @@ ML pipeline completely within Ray without requiring data to be materialized to e
    :width: 650px
    :align: center

-See the :ref:`ML preprocessing docs ` for information on how to use Datasets as the
+See below for information on how to use Datasets as the
 last-mile bridge to model training and inference, and see :ref:`the Talks section ` for more
 Datasets ML use cases and benchmarks.

diff --git a/doc/source/data/doc_code/saving_datasets.py b/doc/source/data/doc_code/saving_datasets.py
index 77dafdce8d8c..1b16ec560ed9 100644
--- a/doc/source/data/doc_code/saving_datasets.py
+++ b/doc/source/data/doc_code/saving_datasets.py
@@ -32,13 +32,13 @@

 # Write out just one file.
 ds.repartition(1).write_csv("/tmp/one_csv")
-# -> /tmp/one_parquet/d757569dfb2845589b0ccbcb263e8cc3_000000.csv
+# -> /tmp/one_csv/d757569dfb2845589b0ccbcb263e8cc3_000000.csv

 # Write out multiple files.
 ds.repartition(3).write_csv("/tmp/multi_csv")
-# -> /tmp/multi_parquet/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000000.csv
-# -> /tmp/multi_parquet/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000001.csv
-# -> /tmp/multi_parquet/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000002.csv
+# -> /tmp/multi_csv/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000000.csv
+# -> /tmp/multi_csv/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000001.csv
+# -> /tmp/multi_csv/2b529dc5d8eb45e5ad03e69fb7ad8bc0_000002.csv
 # __write_csv_end__
 # fmt: on

@@ -53,13 +53,13 @@

 # Write out just one file.
 ds.repartition(1).write_json("/tmp/one_json")
-# -> /tmp/one_parquet/ab693fde13634f4c8cdaef1db9595ac1_000000.json
+# -> /tmp/one_json/ab693fde13634f4c8cdaef1db9595ac1_000000.json

 # Write out multiple files.
 ds.repartition(3).write_json("/tmp/multi_json")
-# -> /tmp/multi_parquet/f467636b3c41420bb109505ab56c6eae_000000.json
-# -> /tmp/multi_parquet/f467636b3c41420bb109505ab56c6eae_000001.json
-# -> /tmp/multi_parquet/f467636b3c41420bb109505ab56c6eae_000002.json
+# -> /tmp/multi_json/f467636b3c41420bb109505ab56c6eae_000000.json
+# -> /tmp/multi_json/f467636b3c41420bb109505ab56c6eae_000001.json
+# -> /tmp/multi_json/f467636b3c41420bb109505ab56c6eae_000002.json
 # __write_json_end__
 # fmt: on

diff --git a/doc/source/data/examples/nyc_taxi_basic_processing.ipynb b/doc/source/data/examples/nyc_taxi_basic_processing.ipynb
index ffd164b390e5..5d86b3c0f8f8 100644
--- a/doc/source/data/examples/nyc_taxi_basic_processing.ipynb
+++ b/doc/source/data/examples/nyc_taxi_basic_processing.ipynb
@@ -569,7 +569,7 @@
    ],
    "source": [
     "# Drop some columns.\n",
-    "ds = ds.map_batches(lambda df: df.drop(columns=[\"store_and_fwd_flag\", \"mta_tax\"]))"
+    "ds = ds.drop_columns([\"store_and_fwd_flag\", \"mta_tax\"])"
    ]
   },
   {
diff --git a/doc/source/data/performance-tips.rst b/doc/source/data/performance-tips.rst
index ba51447742ba..8a389f1f3d9c 100644
--- a/doc/source/data/performance-tips.rst
+++ b/doc/source/data/performance-tips.rst
@@ -99,7 +99,7 @@ just two of the five columns of Iris dataset.
 Parquet Row Pruning
 ~~~~~~~~~~~~~~~~~~~

-Similarly, you can pass in a filter to ``ray.data.read_parquet()`` (selection pushdown)
+Similarly, you can pass in a filter to ``ray.data.read_parquet()`` (filter pushdown)
 which will be applied at the file scan so only rows that match the filter predicate
 will be returned.
 For example, use ``ray.data.read_parquet("example://iris.parquet", filter=pa.dataset.field("sepal.length") > 5.0``
diff --git a/python/ray/data/dataset.py b/python/ray/data/dataset.py
index 3864fb5d1cc2..b862e0b05bec 100644
--- a/python/ray/data/dataset.py
+++ b/python/ray/data/dataset.py
@@ -561,7 +561,7 @@ def drop_columns(
             compute: The compute strategy, either "tasks" (default) to use Ray
                 tasks, or ActorPoolStrategy(min, max) to use an autoscaling actor
                 pool.
             ray_remote_args: Additional resource requirements to request from
-                ray (e.g., num_gpus=1 to request GPUs for the map tasks
+                ray (e.g., num_gpus=1 to request GPUs for the map tasks).
         """
         return self.map_batches(
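
Reviewer note: below is a minimal, self-contained sketch (not part of the patch) of the API change documented above, comparing the old map_batches-based column drop with the new drop_columns() call. It assumes a local Ray installation with the Datasets library; the toy DataFrame and its column names are illustrative stand-ins for the NYC taxi data used in the notebook.

# Sketch only: compares the old and new ways to drop columns from a Dataset.
import pandas as pd
import ray

# Illustrative stand-in for the NYC taxi data (hypothetical toy values).
df = pd.DataFrame(
    {
        "trip_distance": [1.2, 3.4],
        "store_and_fwd_flag": ["N", "Y"],
        "mta_tax": [0.5, 0.5],
    }
)
ds = ray.data.from_pandas(df)

# Old pattern: drop columns through a pandas lambda inside map_batches().
ds_old = ds.map_batches(
    lambda batch: batch.drop(columns=["store_and_fwd_flag", "mta_tax"]),
    batch_format="pandas",
)

# New pattern: the dedicated drop_columns() API added in #26200.
ds_new = ds.drop_columns(["store_and_fwd_flag", "mta_tax"])

print(ds_new.schema())  # -> only trip_distance remains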
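Similarly, a minimal sketch (not part of the patch) of the column selection and filter pushdown that the performance-tips.rst hunk describes, assuming the ``example://`` sample data bundled with Ray is available locally; the closing parenthesis missing from the snippet quoted in the doc is added here.

# Sketch only: column selection and filter pushdown in read_parquet().
import pyarrow.dataset as pads  # the doc refers to this as pa.dataset
import ray

ds = ray.data.read_parquet(
    "example://iris.parquet",
    columns=["sepal.length", "variety"],      # column selection: read 2 of 5 columns
    filter=pads.field("sepal.length") > 5.0,  # filter pushdown: prune rows at scan time
)
print(ds.take(3))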