From ab052b3a017939305843dd5f62db9b8d8144ad35 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Mon, 5 Sep 2022 16:47:22 -0700 Subject: [PATCH] [docs] Document using a different separator for read_csv (#27850) See discussion in https://github.com/ray-project/ray/issues/27738 Co-authored-by: matthewdeng Signed-off-by: Zhi Lin --- python/ray/data/examples/data/iris.tsv | 151 +++++++++++++++++++++++++ python/ray/data/read_api.py | 12 +- 2 files changed, 162 insertions(+), 1 deletion(-) create mode 100644 python/ray/data/examples/data/iris.tsv diff --git a/python/ray/data/examples/data/iris.tsv b/python/ray/data/examples/data/iris.tsv new file mode 100644 index 000000000000..271b9fde67c8 --- /dev/null +++ b/python/ray/data/examples/data/iris.tsv @@ -0,0 +1,151 @@ +sepal.length sepal.width petal.length petal.width variety +5.1 3.5 1.4 0.2 setosa +4.9 3.0 1.4 0.2 setosa +4.7 3.2 1.3 0.2 setosa +4.6 3.1 1.5 0.2 setosa +5.0 3.6 1.4 0.2 setosa +5.4 3.9 1.7 0.4 setosa +4.6 3.4 1.4 0.3 setosa +5.0 3.4 1.5 0.2 setosa +4.4 2.9 1.4 0.2 setosa +4.9 3.1 1.5 0.1 setosa +5.4 3.7 1.5 0.2 setosa +4.8 3.4 1.6 0.2 setosa +4.8 3.0 1.4 0.1 setosa +4.3 3.0 1.1 0.1 setosa +5.8 4.0 1.2 0.2 setosa +5.7 4.4 1.5 0.4 setosa +5.4 3.9 1.3 0.4 setosa +5.1 3.5 1.4 0.3 setosa +5.7 3.8 1.7 0.3 setosa +5.1 3.8 1.5 0.3 setosa +5.4 3.4 1.7 0.2 setosa +5.1 3.7 1.5 0.4 setosa +4.6 3.6 1.0 0.2 setosa +5.1 3.3 1.7 0.5 setosa +4.8 3.4 1.9 0.2 setosa +5.0 3.0 1.6 0.2 setosa +5.0 3.4 1.6 0.4 setosa +5.2 3.5 1.5 0.2 setosa +5.2 3.4 1.4 0.2 setosa +4.7 3.2 1.6 0.2 setosa +4.8 3.1 1.6 0.2 setosa +5.4 3.4 1.5 0.4 setosa +5.2 4.1 1.5 0.1 setosa +5.5 4.2 1.4 0.2 setosa +4.9 3.1 1.5 0.2 setosa +5.0 3.2 1.2 0.2 setosa +5.5 3.5 1.3 0.2 setosa +4.9 3.6 1.4 0.1 setosa +4.4 3.0 1.3 0.2 setosa +5.1 3.4 1.5 0.2 setosa +5.0 3.5 1.3 0.3 setosa +4.5 2.3 1.3 0.3 setosa +4.4 3.2 1.3 0.2 setosa +5.0 3.5 1.6 0.6 setosa +5.1 3.8 1.9 0.4 setosa +4.8 3.0 1.4 0.3 setosa +5.1 3.8 1.6 0.2 setosa +4.6 3.2 1.4 0.2 setosa +5.3 3.7 1.5 0.2 
setosa +5.0 3.3 1.4 0.2 setosa +7.0 3.2 4.7 1.4 versicolor +6.4 3.2 4.5 1.5 versicolor +6.9 3.1 4.9 1.5 versicolor +5.5 2.3 4.0 1.3 versicolor +6.5 2.8 4.6 1.5 versicolor +5.7 2.8 4.5 1.3 versicolor +6.3 3.3 4.7 1.6 versicolor +4.9 2.4 3.3 1.0 versicolor +6.6 2.9 4.6 1.3 versicolor +5.2 2.7 3.9 1.4 versicolor +5.0 2.0 3.5 1.0 versicolor +5.9 3.0 4.2 1.5 versicolor +6.0 2.2 4.0 1.0 versicolor +6.1 2.9 4.7 1.4 versicolor +5.6 2.9 3.6 1.3 versicolor +6.7 3.1 4.4 1.4 versicolor +5.6 3.0 4.5 1.5 versicolor +5.8 2.7 4.1 1.0 versicolor +6.2 2.2 4.5 1.5 versicolor +5.6 2.5 3.9 1.1 versicolor +5.9 3.2 4.8 1.8 versicolor +6.1 2.8 4.0 1.3 versicolor +6.3 2.5 4.9 1.5 versicolor +6.1 2.8 4.7 1.2 versicolor +6.4 2.9 4.3 1.3 versicolor +6.6 3.0 4.4 1.4 versicolor +6.8 2.8 4.8 1.4 versicolor +6.7 3.0 5.0 1.7 versicolor +6.0 2.9 4.5 1.5 versicolor +5.7 2.6 3.5 1.0 versicolor +5.5 2.4 3.8 1.1 versicolor +5.5 2.4 3.7 1.0 versicolor +5.8 2.7 3.9 1.2 versicolor +6.0 2.7 5.1 1.6 versicolor +5.4 3.0 4.5 1.5 versicolor +6.0 3.4 4.5 1.6 versicolor +6.7 3.1 4.7 1.5 versicolor +6.3 2.3 4.4 1.3 versicolor +5.6 3.0 4.1 1.3 versicolor +5.5 2.5 4.0 1.3 versicolor +5.5 2.6 4.4 1.2 versicolor +6.1 3.0 4.6 1.4 versicolor +5.8 2.6 4.0 1.2 versicolor +5.0 2.3 3.3 1.0 versicolor +5.6 2.7 4.2 1.3 versicolor +5.7 3.0 4.2 1.2 versicolor +5.7 2.9 4.2 1.3 versicolor +6.2 2.9 4.3 1.3 versicolor +5.1 2.5 3.0 1.1 versicolor +5.7 2.8 4.1 1.3 versicolor +6.3 3.3 6.0 2.5 virginica +5.8 2.7 5.1 1.9 virginica +7.1 3.0 5.9 2.1 virginica +6.3 2.9 5.6 1.8 virginica +6.5 3.0 5.8 2.2 virginica +7.6 3.0 6.6 2.1 virginica +4.9 2.5 4.5 1.7 virginica +7.3 2.9 6.3 1.8 virginica +6.7 2.5 5.8 1.8 virginica +7.2 3.6 6.1 2.5 virginica +6.5 3.2 5.1 2.0 virginica +6.4 2.7 5.3 1.9 virginica +6.8 3.0 5.5 2.1 virginica +5.7 2.5 5.0 2.0 virginica +5.8 2.8 5.1 2.4 virginica +6.4 3.2 5.3 2.3 virginica +6.5 3.0 5.5 1.8 virginica +7.7 3.8 6.7 2.2 virginica +7.7 2.6 6.9 2.3 virginica +6.0 2.2 5.0 1.5 virginica +6.9 3.2 5.7 2.3 virginica 
+5.6 2.8 4.9 2.0 virginica +7.7 2.8 6.7 2.0 virginica +6.3 2.7 4.9 1.8 virginica +6.7 3.3 5.7 2.1 virginica +7.2 3.2 6.0 1.8 virginica +6.2 2.8 4.8 1.8 virginica +6.1 3.0 4.9 1.8 virginica +6.4 2.8 5.6 2.1 virginica +7.2 3.0 5.8 1.6 virginica +7.4 2.8 6.1 1.9 virginica +7.9 3.8 6.4 2.0 virginica +6.4 2.8 5.6 2.2 virginica +6.3 2.8 5.1 1.5 virginica +6.1 2.6 5.6 1.4 virginica +7.7 3.0 6.1 2.3 virginica +6.3 3.4 5.6 2.4 virginica +6.4 3.1 5.5 1.8 virginica +6.0 3.0 4.8 1.8 virginica +6.9 3.1 5.4 2.1 virginica +6.7 3.1 5.6 2.4 virginica +6.9 3.1 5.1 2.3 virginica +5.8 2.7 5.1 1.9 virginica +6.8 3.2 5.9 2.3 virginica +6.7 3.3 5.7 2.5 virginica +6.7 3.0 5.2 2.3 virginica +6.3 2.5 5.0 1.9 virginica +6.5 3.0 5.2 2.0 virginica +6.2 3.4 5.4 2.3 virginica +5.9 3.0 5.1 1.8 virginica diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index 506a89b72f8e..694f89a47c0b 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -548,7 +548,7 @@ def read_csv( ] = CSVDatasource.file_extension_filter(), **arrow_csv_args, ) -> Dataset[ArrowRow]: - """Create an Arrow dataset from csv files. + r"""Create an Arrow dataset from csv files. Examples: >>> import ray @@ -562,6 +562,16 @@ def read_csv( >>> ray.data.read_csv( # doctest: +SKIP ... ["s3://bucket/path1", "s3://bucket/path2"]) + >>> # Read files that use a different delimiter. The partition_filter=None is needed here + >>> # because by default read_csv only reads .csv files. For more uses of ParseOptions see + >>> # https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html # noqa: E501 + >>> from pyarrow import csv + >>> parse_options = csv.ParseOptions(delimiter="\t") + >>> ray.data.read_csv( # doctest: +SKIP + ... "example://iris.tsv", + ... parse_options=parse_options, + ... partition_filter=None) + >>> # Convert a date column with a custom format from a CSV file. 
>>> # For more uses of ConvertOptions see >>> # https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html # noqa: #501