From f6eb66e86428984d1d38d14109527d24ecf03f12 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 9 May 2022 22:58:53 -0700 Subject: [PATCH 01/23] Add files --- .../datasource/image_folder_datasource.py | 77 +++++++++++++++++++ python/ray/data/impl/pandas_block.py | 9 ++- 2 files changed, 83 insertions(+), 3 deletions(-) create mode 100644 python/ray/data/datasource/image_folder_datasource.py diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py new file mode 100644 index 000000000000..991ce42e012e --- /dev/null +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -0,0 +1,77 @@ +import pathlib +from typing import Any, Callable, Dict, List, Optional, Union + +import imageio as iio +import numpy as np +from pyarrow.fs import FileSelector, FileType +from ray.data.block import Block +from ray.data.datasource.binary_datasource import BinaryDatasource +from ray.data.datasource.datasource import ReadTask +from ray.data.datasource.file_based_datasource import _resolve_paths_and_filesystem +from ray.data.datasource.file_meta_provider import ( + BaseFileMetadataProvider, + DefaultFileMetadataProvider, + FastFileMetadataProvider, +) +from ray.data.datasource.partitioning import PathPartitionFilter + +IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"] + + +class ImageFolderDatasource(BinaryDatasource): + def prepare_read( + self, + parallelism: int, + paths: Union[str, List[str]], + filesystem: Optional["pyarrow.fs.FileSystem"] = None, + schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None, + open_stream_args: Optional[Dict[str, Any]] = None, + meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(), + partition_filter: PathPartitionFilter = None, + # TODO(ekl) deprecate this once read fusion is available. + _block_udf: Optional[Callable[[Block], Block]] = None, + **reader_args, + ) -> List[ReadTask]: + paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem) + assert len(paths) == 1 + self.root = paths[0] + + paths, _ = meta_provider.expand_paths(paths, filesystem) + paths = [path for path in paths if _is_image_file(path)] + + return super().prepare_read( + parallelism=parallelism, + paths=paths, + filesystem=filesystem, + schema=schema, + open_stream_args=open_stream_args, + meta_provider=FastFileMetadataProvider(), + partition_filter=partition_filter, + _block_udf=_block_udf, + ) + + def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args): + import pandas as pd + + records = super()._read_file(f, path, include_paths=True) + assert len(records) == 1 + path, data = records[0] + + image = iio.imread(data) + label = _get_class_from_path(path, self.root) + + return pd.DataFrame({"image": [np.array(image)], "label": [label]}) + + +def _is_image_file(path: str) -> bool: + return any(path.lower().endswith(extension) for extension in IMAGE_EXTENSIONS) + + +def _get_class_from_path(path: str, root: str) -> str: + # The class is the name of the first directory after the root. For example, if + # the root is "/data/imagenet/train" and the path is + # "/data/imagenet/train/n01443537/images/n01443537_0.JPEG", then the class is + # "n01443537". + path, root = pathlib.PurePath(path), pathlib.PurePath(root) + assert root in path.parents + return path.parts[len(root.parts) :][0] \ No newline at end of file diff --git a/python/ray/data/impl/pandas_block.py b/python/ray/data/impl/pandas_block.py index 07bbdc1eb50a..308a60b05cf4 100644 --- a/python/ray/data/impl/pandas_block.py +++ b/python/ray/data/impl/pandas_block.py @@ -49,9 +49,12 @@ def __getitem__(self, key: str) -> Any: return None item = col.iloc[0] try: - # Try to interpret this as a numpy-type value. - # See https://stackoverflow.com/questions/9452775/converting-numpy-dtypes-to-native-python-types. # noqa: E501 - return item.item() + if item.size == 1: + # Try to interpret this as a numpy-type value. + # See https://stackoverflow.com/questions/9452775/converting-numpy-dtypes-to-native-python-types. # noqa: E501 + return item.item() + else: + return item except AttributeError: # Fallback to the original form. return item From 8fb8d75d79a59b403d28ce7839437f743d80830b Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 11 May 2022 01:35:14 -0700 Subject: [PATCH 02/23] Add files --- python/ray/data/datasource/__init__.py | 2 + .../datasource/image_folder_datasource.py | 51 +++++++++++++++++- .../ray/data/tests/image-folder/cat/123.png | Bin 0 -> 2109 bytes .../data/tests/image-folder/cat/not-an-image | 0 .../ray/data/tests/image-folder/dog/xxx.png | Bin 0 -> 2370 bytes .../ray/data/tests/image-folder/not-an-image | 0 python/ray/data/tests/test_dataset_formats.py | 21 ++++++++ 7 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 python/ray/data/tests/image-folder/cat/123.png create mode 100644 python/ray/data/tests/image-folder/cat/not-an-image create mode 100644 python/ray/data/tests/image-folder/dog/xxx.png create mode 100644 python/ray/data/tests/image-folder/not-an-image diff --git a/python/ray/data/datasource/__init__.py b/python/ray/data/datasource/__init__.py index e51390b1b953..149127dc2ce5 100644 --- a/python/ray/data/datasource/__init__.py +++ b/python/ray/data/datasource/__init__.py @@ -22,6 +22,7 @@ FileMetadataProvider, ParquetMetadataProvider, ) +from ray.data.datasource.image_folder_datasource import ImageFolderDatasource from ray.data.datasource.json_datasource import JSONDatasource from ray.data.datasource.numpy_datasource import NumpyDatasource from ray.data.datasource.parquet_base_datasource import ParquetBaseDatasource @@ -48,6 +49,7 @@ "FastFileMetadataProvider", "FileBasedDatasource", "FileMetadataProvider", + "ImageFolderDatasource", "JSONDatasource", "NumpyDatasource", "ParquetBaseDatasource", diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py index 991ce42e012e..49b02918d7b4 100644 --- a/python/ray/data/datasource/image_folder_datasource.py +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -19,6 +19,35 @@ class ImageFolderDatasource(BinaryDatasource): + """A datasource that allows you to load datasets like ImageNet. + + This datasource works with any dataset where the images are arranged in this way: + + ``` + root/dog/xxx.png + root/dog/xxy.png + root/dog/[...]/xxz.png + + root/cat/123.png + root/cat/nsdf3.png + root/cat/[...]/asd932_.png + ``` + + Examples: + >>> import ray + >>> from ray.data.datasource import ImageFolderDatasource + >>> + >>> ds = ray.data.read_datasource( + ... ImageFolderDatasource(), + ... paths=["s3://tiny-imagenet/train"] + ... ) + >>> TODO + + Raises: + ValueError: if more than one path is provided. You should only provide the path + to the dataset root. + """ + def prepare_read( self, parallelism: int, @@ -32,8 +61,26 @@ def prepare_read( _block_udf: Optional[Callable[[Block], Block]] = None, **reader_args, ) -> List[ReadTask]: + if len(paths) > 1: + raise ValueError( + "`ImageFolderDatasource` expects 1 path representing the dataset " + f"root, but it got {len(paths)} paths instead. To fix this error, " + "pass in a single-element list containing the dataset root (for " + 'example, `paths=["s3://imagenet/train"]`)' + ) + + try: + import imageio + except ImportError: + raise ValueError( + "`ImageFolderDatasource` depends on 'imageio', but 'imageio' couldn't " + "be imported. You can install 'imageio' by running " + "`pip install imageio`." + ) + + # We call `_resolve_paths_and_filesystem` so that the dataset root is formatted + # in the same way as the paths passed to `_get_class_from_path`. paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem) - assert len(paths) == 1 self.root = paths[0] paths, _ = meta_provider.expand_paths(paths, filesystem) @@ -74,4 +121,4 @@ def _get_class_from_path(path: str, root: str) -> str: # "n01443537". path, root = pathlib.PurePath(path), pathlib.PurePath(root) assert root in path.parents - return path.parts[len(root.parts) :][0] \ No newline at end of file + return path.parts[len(root.parts) :][0] diff --git a/python/ray/data/tests/image-folder/cat/123.png b/python/ray/data/tests/image-folder/cat/123.png new file mode 100644 index 0000000000000000000000000000000000000000..90797d74701c0b771b7a708fa834a5489801d4a8 GIT binary patch literal 2109 zcmV-D2*US?P)nCm2P7mQlAQ!Qw&ZSmH?_-EtJdGlImZ|rfDq3WyazA+)9-$Lc7B$}{m}Qj%iI3l zUs1lexw-xKPyg`Qzx}gPeDv$zUcL7#v>+xA7m;>)eEa9$+p9ljEzMJdeQ0oSb=1L)E$tu^GC?qF}diweg^~Gly&YtRV zY4bW9p5<{TLXwiIswPntO{%K*bD745d3C6uwC_IOucztiFeu#|hKq}S=MQ@Sg}S3m zl*fm=7Y|?kN6b$p4_;er$F$qa?h?oMkZFK*VF znOU>uwt1_%y1YEi`!2N&I6m<(^j%&{wY4IPu!?%L1~W7BRzNLJ)4a^};c-5lE^giz zim(6kUrVsp*;@?{_b?c_!9#=ju~?7`;^F@7Nax*f)&gZYPPy|4QsIC#Gq3Dhn5X^Z z5BH=J9{qF{OsBiQKosE-7C{8j2m~Xb%+wGCO;WespYp1$O>%iv>M9SfrRya3L2x+g z7R#(juP+W9x>X*cfQANSK2pZ@9L&D|S|)4VE?yGJ;j05-A!Ay_|n=hnlA!*IBM zwjM)O-uvY*sCipw5muc!4tCt7qRM@Pw;%+c^B?4@GlO`h8U<+%tl_vDlU;J!4)$Q&@&Q7`~8qBNtDw@4EE6L5%IQEBYfl!;H zHE{p#FF!T}td&x{mQvPQs=1@qFjIQ^jx5~jY;5fU-f~C7bS3>rse>K2tqe>NzP6G? zx_$r&NvFxp%CNtF|C0|r2<+CVMXOfB9A@R|aoegUJc6luZKv|~8@alE)?HoXd?}si zFm4oRB_aT3717?RgHGe2_ej31?dh~^-x7r~TV-- z!`1QlOeKL7mr^|McY^N%-|hfO&`2^tUxLe#Cf z854DVM=f-kG7}4vB-w1ce>m1P-#xVd`t-B)^z_jOH>AGbO_Doj-%5G;;#;q$ZoEy2 z3{;JriODG%AOSWK08Y%Bg+hmpnJ6)(&i1*m>pGWVccqdD?c48u==W3i^LO_bI}!Eh z`r-1X+u#1|gN&`%;kYg$Tkbm*0RRHvL`DF@3D6`?L`1r7wjAHgC*S7f;(Gk>j$_uERRQ`uk0Fzi5wzxwEt^XC^5vlGTn)8i}msCH=DAtVrB3Xqd-&52mU*&?i! zd9Dw~;|s>gDalZr3eu9`gA+@}z6*npr7jsaSl!AsZ4cKHR^&IX_(AoWH7DL*0y*>I_pxIo(s9 zxbIr=B$Bx@HpkjHvPY{J4V$OM_7S8mXCBpM3$3J;RUC4xY_Hxzz8EiWv;#c%)1?lR zmlkb>D!Qp!MeH_QOZz@Jf+#3}AOwTmo$xw^sSt=cL`&4L8eYx0);KLS@!Q*OG?Sdi zZXCNjG+3xmNAu&7Dl;Ky*M)?yK?(x9*T8}vTsl+)E22c0TXnDDEgVE5%%2%s$ z`ek0`rzH&sl9W1~PmlL^cdC@TwA<$*LyncHnwfLB^-l+;*M-+zrXc0Id^mx>%@)tQUsQaten zcmzj81XK?(lq#j7wQ`HHY*A|-`&R7!hnLf!M1;ldO(jkHfjnBzg$?kIn3F^U zZB|68pamG=0XRr6uCCT~-PUzoS1Z-lrMI>8ZWRgw8ig2TNWq&^HcRRy8KkoJ& zlr%L6RnvJU;Vo&6#`r%3b=w#w%qb;=hYPAEP82Q{kRIMr(k#<%7%vXFAACEVW?PO& zn)W>+JJQo8HRfH4Fbe!X5$d&hs{~5Sp&F5iLR^)S*Aid~(LU$>PNyC~S-fth-nMn| zL^KR7dJpv3DVT!YBVZmt5IHG)V;De0k;D-ZgfOOH3vv%3O3Hnr%m}we3=tOdfFlg4 zhD>7Q5!NEeBO=0y0~COUZG>PFqJS__50UT?W)}*l#Izgp*ry}`AOLr_;Kp6;4w1Sp negujGKp?_BKm;atc;LSQWg4{P2b{s^00000NkvXXu0mjf4loqX literal 0 HcmV?d00001 diff --git a/python/ray/data/tests/image-folder/cat/not-an-image b/python/ray/data/tests/image-folder/cat/not-an-image new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/ray/data/tests/image-folder/dog/xxx.png b/python/ray/data/tests/image-folder/dog/xxx.png new file mode 100644 index 0000000000000000000000000000000000000000..1613ff63ad408cb4e16cea0306b388add050c20c GIT binary patch literal 2370 zcmV-I3BC4-P)gb&b8J;2xCl=rVgC<9soJ#gbczkJUKZ5 z07|gd>gm&G-DZ?BguuDVOa)<7 zsZ`RLnbXr#?>)hYQwrWwLRb(+mC9@~Gb2QR2`$Uwyyr3q;v}C=h9|us3@Vk`@iey{ z+U)}X->BD%EZ^Thh~rq>z4i5VA!Vo2ky0w90GzW%D_tl>-?@2XVQD!CB7`v^gmX?Q zjg$E2zqqya?)&rebKT=Z#+d~eW%*3Cc45A0jh&lo20^%f?K)bMuVRUbma3 zsrMdZf)Me5vRpAC1f!G?f)H}fdGEE>FVxU#U271XT$mVdFLELh*BCwQKeELWoU#D#uyzDqe*a{5XE%=H%a<-Q&H<8ipYg zQhM)|QV3zjxKbJ+R1|t+O>($zo)s@o!Wql~6gb;!;ef!PzD2^$m7-LGQ)+S97>;0v*wR){O7@TVSeuU)8{J}muHe>(C@vmcI6Mh|KsAq;>L}e1l&o#S8Al}GXPApjE%-) z=bUp609)&dqCm5=jb?LcdHL(Fzkc-S(dtU;@bGALW%cN&Q?EC|ARe8c7kOT-RoC^T5C#~ z2Lyn$HXncdt5&Nu7z|##cv0j90E9u1rfDDpqjedyY^V3WC<;a?Ej3CA0YH^XMJW{o zfz~<O#DTUc1A)oKVK&bT%9lTUtk zczB308I4AN`~35RgM%oFcXxL;H@6T%Q54i_HKo*KGO5*SQj#)(kdl7z-aFN5RR|FT zp_H<;31PfGe<6%xOsNb)F6G=@(^)f}j6VDPi*Nt&Po*_swBBfLZf+(?97RzjO^3r_ z|KyZV7KI__3;^;xCna0>>r*OfGMRL{$CJqv0Ic`MT0$v8es^!@!Gi}`o>^U|yWL(1KhC+75@XCt2b6=zdvC2p2pO%9kB*Wko=(PboWyC8rfH+u zG~emvrG?JXp;mdbIk$ClgCN*``uL=GeDB`9Pe1*W(dKQfHr~AEtdUY$YZ*dNG5{ek z#(3{BLLNbpX9(O;yDfrn{n~XxFu_C$acSjZKAl)&DwPTb9}3alfA!TDfB*M?9`y$U z@BQOvPcE&t(l99B%1S08gaDBD4iI335%?mT7TE-YZ@=1KSX`>4@o+da#t?)yZd?~a zYNfi}j(7I3{i@x5Wt?rTv?`U^XV0Eyc{W$C8DoSHOlxh7vDRYzx>gu@?~PIfA;IYW z%a<2dFGs;jc@twul2l4TDl^mZ_0$AxtoM0D!?@kmvc?*;&0_XN*!x`~4H=+{{d+ zsB#BuR-_LMg@sVN|VFtJUg{KmOQ1=^});wr$rH{wAixM=OoR}do11fU4G(Ts&KQ&DIU9{e<$%K&v)3IXgjnaCwSiRg~Dk80UE|r38QyEaeu< oC}oUt&M9SiQ5a*4F_X#Uf06rv)J1*L5dZ)H07*qoM6N<$g3e-d(EtDd literal 0 HcmV?d00001 diff --git a/python/ray/data/tests/image-folder/not-an-image b/python/ray/data/tests/image-folder/not-an-image new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py index 18c1fbbfafb5..f35281672522 100644 --- a/python/ray/data/tests/test_dataset_formats.py +++ b/python/ray/data/tests/test_dataset_formats.py @@ -25,6 +25,7 @@ DefaultFileMetadataProvider, DefaultParquetMetadataProvider, FastFileMetadataProvider, + ImageFolderDatasource, PathPartitionFilter, PathPartitionEncoder, PartitionStyle, @@ -2520,6 +2521,26 @@ def get_node_id(): assert node_ids == {bar_node_id} +def test_image_folder_datasource(ray_start_regular_shared): + root = os.path.join(__file__, "image-folder") + ds = ray.data.read_datasource(ImageFolderDatasource(), paths=[root]) + + assert ds.count() == 2 + + df = ds.to_pandas() + assert set(df["label"]) == {"cat", "dog"} + assert all(isinstance(array, np.ndarray) for array in df["image"]) + assert all(array.shape == (32, 32, 3) for array in df["image"]) + + +def test_image_folder_datasource_raises_value_error(ray_start_regular_shared): + # `ImageFolderDatasource` should raise an error if more than one path is passed. + with pytest.raises(ValueError): + ray.data.read_datasource( + ImageFolderDatasource(), paths=["imagenet/train", "imagenet/test"] + ) + + if __name__ == "__main__": import sys From 083c9bc8d7b01be1ab22d2341e5e727a16d86527 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 11 May 2022 01:50:20 -0700 Subject: [PATCH 03/23] Fix stuff --- .../datasource/image_folder_datasource.py | 22 ++++++++++++++----- python/ray/data/tests/test_dataset_formats.py | 2 +- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py index 49b02918d7b4..a752d2014844 100644 --- a/python/ray/data/datasource/image_folder_datasource.py +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -1,9 +1,8 @@ import pathlib -from typing import Any, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union import imageio as iio import numpy as np -from pyarrow.fs import FileSelector, FileType from ray.data.block import Block from ray.data.datasource.binary_datasource import BinaryDatasource from ray.data.datasource.datasource import ReadTask @@ -15,6 +14,9 @@ ) from ray.data.datasource.partitioning import PathPartitionFilter +if TYPE_CHECKING: + import pyarrow + IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"] @@ -33,15 +35,23 @@ class ImageFolderDatasource(BinaryDatasource): root/cat/[...]/asd932_.png ``` + Datasets read with `ImageFolderDatasource` contain two columns: 'image' and + 'label'. The 'image' column contains `ndarray`s of shape (H, W, C), and the + `label` column contain strings corresponding to class. + Examples: >>> import ray >>> from ray.data.datasource import ImageFolderDatasource >>> - >>> ds = ray.data.read_datasource( + >>> ds = ray.data.read_datasource( # doctest: +SKIP ... ImageFolderDatasource(), ... paths=["s3://tiny-imagenet/train"] ... ) - >>> TODO + >>> sample = ds.take(1)[0] # doctest: +SKIP + >>> sample["image"].shape # doctest: +SKIP + (469, 387, 3) + >>> sample["label"] # doctest: +SKIP + 'n01443537' Raises: ValueError: if more than one path is provided. You should only provide the path @@ -70,14 +80,14 @@ def prepare_read( ) try: - import imageio + import imageio # noqa: F401 except ImportError: raise ValueError( "`ImageFolderDatasource` depends on 'imageio', but 'imageio' couldn't " "be imported. You can install 'imageio' by running " "`pip install imageio`." ) - + # We call `_resolve_paths_and_filesystem` so that the dataset root is formatted # in the same way as the paths passed to `_get_class_from_path`. paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem) diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py index f35281672522..a3f31d508f2b 100644 --- a/python/ray/data/tests/test_dataset_formats.py +++ b/python/ray/data/tests/test_dataset_formats.py @@ -2522,7 +2522,7 @@ def get_node_id(): def test_image_folder_datasource(ray_start_regular_shared): - root = os.path.join(__file__, "image-folder") + root = os.path.join(os.path.dirname(__file__), "image-folder") ds = ray.data.read_datasource(ImageFolderDatasource(), paths=[root]) assert ds.count() == 2 From 4f0b8cec1d4fd27af3df15da7270dac87d5a91d1 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 11 May 2022 01:51:11 -0700 Subject: [PATCH 04/23] Rename file --- .../tests/image-folder/dog/{xxx.png => xyz.PnG} | Bin 1 file changed, 0 insertions(+), 0 deletions(-) rename python/ray/data/tests/image-folder/dog/{xxx.png => xyz.PnG} (100%) diff --git a/python/ray/data/tests/image-folder/dog/xxx.png b/python/ray/data/tests/image-folder/dog/xyz.PnG similarity index 100% rename from python/ray/data/tests/image-folder/dog/xxx.png rename to python/ray/data/tests/image-folder/dog/xyz.PnG From cd6e25e7c10bda0bb1f97642077c4b1244f95c98 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 11 May 2022 01:51:38 -0700 Subject: [PATCH 05/23] Rename file --- .../tests/image-folder/dog/{xyz.PnG => xxx.PNG} | Bin 1 file changed, 0 insertions(+), 0 deletions(-) rename python/ray/data/tests/image-folder/dog/{xyz.PnG => xxx.PNG} (100%) diff --git a/python/ray/data/tests/image-folder/dog/xyz.PnG b/python/ray/data/tests/image-folder/dog/xxx.PNG similarity index 100% rename from python/ray/data/tests/image-folder/dog/xyz.PnG rename to python/ray/data/tests/image-folder/dog/xxx.PNG From 98b07f7e3e72a07a3a2942bf2f121e6b9079b402 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 11 May 2022 01:53:39 -0700 Subject: [PATCH 06/23] Update image_folder_datasource.py --- python/ray/data/datasource/image_folder_datasource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py index a752d2014844..1f6204e35019 100644 --- a/python/ray/data/datasource/image_folder_datasource.py +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -37,7 +37,7 @@ class ImageFolderDatasource(BinaryDatasource): Datasets read with `ImageFolderDatasource` contain two columns: 'image' and 'label'. The 'image' column contains `ndarray`s of shape (H, W, C), and the - `label` column contain strings corresponding to class. + `label` column contains strings corresponding to class. Examples: >>> import ray From 6f7b2eb8431447716e04319c459e226283773840 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 11 May 2022 02:15:24 -0700 Subject: [PATCH 07/23] Update docs --- doc/Makefile | 2 +- doc/source/data/package-ref.rst | 2 ++ .../datasource/image_folder_datasource.py | 28 +++++++++---------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/doc/Makefile b/doc/Makefile index 0b4fab525153..a38073ab911b 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -52,7 +52,7 @@ clean: rm -rf $(BUILDDIR)/* html: - $(SPHINXBUILD) -W -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." diff --git a/doc/source/data/package-ref.rst b/doc/source/data/package-ref.rst index bc769b36d0d3..0c1fc446a9ce 100644 --- a/doc/source/data/package-ref.rst +++ b/doc/source/data/package-ref.rst @@ -135,6 +135,8 @@ Built-in Datasources .. autoclass:: ray.data.datasource.FileBasedDatasource :members: +.. autoclass:: ray.data.datasource.ImageFolderDatasource + .. autoclass:: ray.data.datasource.JSONDatasource :members: diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py index 1f6204e35019..03f9bfa86b0d 100644 --- a/python/ray/data/datasource/image_folder_datasource.py +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -21,23 +21,23 @@ class ImageFolderDatasource(BinaryDatasource): - """A datasource that allows you to load datasets like ImageNet. + """A datasource that lets you read datasets like `ImageNet `_. - This datasource works with any dataset where the images are arranged in this way: + This datasource works with any dataset where images are arranged in this way: - ``` - root/dog/xxx.png - root/dog/xxy.png - root/dog/[...]/xxz.png + .. code-block:: - root/cat/123.png - root/cat/nsdf3.png - root/cat/[...]/asd932_.png - ``` + root/dog/xxx.png + root/dog/xxy.png + root/dog/[...]/xxz.png - Datasets read with `ImageFolderDatasource` contain two columns: 'image' and - 'label'. The 'image' column contains `ndarray`s of shape (H, W, C), and the - `label` column contains strings corresponding to class. + root/cat/123.png + root/cat/nsdf3.png + root/cat/[...]/asd932_.png + + Datasets read with ``ImageFolderDatasource`` contain two columns: ``'image'`` and + ``'label'``. The ``'image'`` column contains ``ndarray`` objects of shape + :math:`(H, W, C)`, and the ``label`` column contains strings corresponding to labels. Examples: >>> import ray @@ -45,7 +45,7 @@ class ImageFolderDatasource(BinaryDatasource): >>> >>> ds = ray.data.read_datasource( # doctest: +SKIP ... ImageFolderDatasource(), - ... paths=["s3://tiny-imagenet/train"] + ... paths=["/data/imagenet/train"] ... ) >>> sample = ds.take(1)[0] # doctest: +SKIP >>> sample["image"].shape # doctest: +SKIP From db05d3583be5a2cb2c7f4c592f865843938932ef Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 11 May 2022 02:19:34 -0700 Subject: [PATCH 08/23] Update file_meta_provider.py --- python/ray/data/datasource/file_meta_provider.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/ray/data/datasource/file_meta_provider.py b/python/ray/data/datasource/file_meta_provider.py index ad1bdf62d104..9e03d46630d2 100644 --- a/python/ray/data/datasource/file_meta_provider.py +++ b/python/ray/data/datasource/file_meta_provider.py @@ -196,11 +196,6 @@ def expand_paths( paths: List[str], filesystem: "pyarrow.fs.FileSystem", ) -> Tuple[List[str], List[Optional[int]]]: - logger.warning( - f"Skipping expansion of {len(paths)} path(s). If your paths contain " - f"directories or if file size collection is required, try rerunning this " - f"read with `meta_provider=DefaultFileMetadataProvider()`." - ) import numpy as np return paths, np.empty(len(paths), dtype=object) From 52ae3c8b31d72d41473e6123bf468a4134baa261 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 11 May 2022 02:27:34 -0700 Subject: [PATCH 09/23] Update image_folder_datasource.py --- python/ray/data/datasource/image_folder_datasource.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py index 03f9bfa86b0d..e25ec84b4641 100644 --- a/python/ray/data/datasource/image_folder_datasource.py +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -37,7 +37,8 @@ class ImageFolderDatasource(BinaryDatasource): Datasets read with ``ImageFolderDatasource`` contain two columns: ``'image'`` and ``'label'``. The ``'image'`` column contains ``ndarray`` objects of shape - :math:`(H, W, C)`, and the ``label`` column contains strings corresponding to labels. + :math:`(H, W, C)`, and the ``label`` column contains strings corresponding to + labels. Examples: >>> import ray From 813f5de94969b87e4931f225bf9e55f2d81c8d70 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 11 May 2022 02:28:31 -0700 Subject: [PATCH 10/23] Update Makefile --- doc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/Makefile b/doc/Makefile index a38073ab911b..0b4fab525153 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -52,7 +52,7 @@ clean: rm -rf $(BUILDDIR)/* html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + $(SPHINXBUILD) -W -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." From d68fe1d26181b2295de1244d862d5108f138a6c6 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 18 May 2022 01:58:52 -0700 Subject: [PATCH 11/23] Update python/ray/data/datasource/image_folder_datasource.py Co-authored-by: matthewdeng --- python/ray/data/datasource/image_folder_datasource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py index e25ec84b4641..a8bdfaa33fbf 100644 --- a/python/ray/data/datasource/image_folder_datasource.py +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -72,7 +72,7 @@ def prepare_read( _block_udf: Optional[Callable[[Block], Block]] = None, **reader_args, ) -> List[ReadTask]: - if len(paths) > 1: + if len(paths) != 1: raise ValueError( "`ImageFolderDatasource` expects 1 path representing the dataset " f"root, but it got {len(paths)} paths instead. To fix this error, " From 45989c2715bd929b931b6ff4846e66d9881e8959 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 18 May 2022 02:09:32 -0700 Subject: [PATCH 12/23] Re-add warning --- python/ray/data/datasource/file_meta_provider.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/ray/data/datasource/file_meta_provider.py b/python/ray/data/datasource/file_meta_provider.py index 9e03d46630d2..ad1bdf62d104 100644 --- a/python/ray/data/datasource/file_meta_provider.py +++ b/python/ray/data/datasource/file_meta_provider.py @@ -196,6 +196,11 @@ def expand_paths( paths: List[str], filesystem: "pyarrow.fs.FileSystem", ) -> Tuple[List[str], List[Optional[int]]]: + logger.warning( + f"Skipping expansion of {len(paths)} path(s). If your paths contain " + f"directories or if file size collection is required, try rerunning this " + f"read with `meta_provider=DefaultFileMetadataProvider()`." + ) import numpy as np return paths, np.empty(len(paths), dtype=object) From 2f95235afb09539459ac9511e038646566516392 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Thu, 14 Jul 2022 16:50:54 -0700 Subject: [PATCH 13/23] Update implementation --- .../datasource/image_folder_datasource.py | 78 +++++++++--------- .../ray/data/tests/image-folder/cat/foo.jpg | Bin 0 -> 923 bytes python/ray/data/tests/test_dataset_formats.py | 13 ++- 3 files changed, 49 insertions(+), 42 deletions(-) create mode 100644 python/ray/data/tests/image-folder/cat/foo.jpg diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py index a8bdfaa33fbf..169a51e45bd1 100644 --- a/python/ray/data/datasource/image_folder_datasource.py +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -1,27 +1,24 @@ import pathlib -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, List, Optional, Union -import imageio as iio import numpy as np -from ray.data.block import Block from ray.data.datasource.binary_datasource import BinaryDatasource -from ray.data.datasource.datasource import ReadTask -from ray.data.datasource.file_based_datasource import _resolve_paths_and_filesystem -from ray.data.datasource.file_meta_provider import ( - BaseFileMetadataProvider, - DefaultFileMetadataProvider, - FastFileMetadataProvider, +from ray.data.datasource.datasource import Reader +from ray.data.datasource.file_based_datasource import ( + _resolve_paths_and_filesystem, + FileExtensionFilter, ) from ray.data.datasource.partitioning import PathPartitionFilter if TYPE_CHECKING: import pyarrow + from ray.data.block import T -IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"] +IMAGE_EXTENSIONS = ["png", "jpg", "jpeg", "tiff", "bmp", "gif"] class ImageFolderDatasource(BinaryDatasource): - """A datasource that lets you read datasets like `ImageNet `_. + """A datasource that lets you read datasets like `ImageNet `_. # noqa: E501 This datasource works with any dataset where images are arranged in this way: @@ -36,7 +33,7 @@ class ImageFolderDatasource(BinaryDatasource): root/cat/[...]/asd932_.png Datasets read with ``ImageFolderDatasource`` contain two columns: ``'image'`` and - ``'label'``. The ``'image'`` column contains ``ndarray`` objects of shape + ``'label'``. The ``'image'`` column contains ``ndarray`` objects of shape :math:`(H, W, C)`, and the ``label`` column contains strings corresponding to labels. @@ -59,25 +56,19 @@ class ImageFolderDatasource(BinaryDatasource): to the dataset root. """ - def prepare_read( + def create_reader( self, - parallelism: int, paths: Union[str, List[str]], filesystem: Optional["pyarrow.fs.FileSystem"] = None, - schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None, - open_stream_args: Optional[Dict[str, Any]] = None, - meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(), partition_filter: PathPartitionFilter = None, - # TODO(ekl) deprecate this once read fusion is available. - _block_udf: Optional[Callable[[Block], Block]] = None, - **reader_args, - ) -> List[ReadTask]: + **kwargs, + ) -> "Reader[T]": if len(paths) != 1: raise ValueError( "`ImageFolderDatasource` expects 1 path representing the dataset " - f"root, but it got {len(paths)} paths instead. To fix this error, " - "pass in a single-element list containing the dataset root (for " - 'example, `paths=["s3://imagenet/train"]`)' + f"root, but it got {len(paths)} paths instead. To fix this " + "error, pass in a single-element list containing the dataset root " + '(for example, `paths=["s3://imagenet/train"]`)' ) try: @@ -89,27 +80,35 @@ def prepare_read( "`pip install imageio`." ) + if partition_filter is None: + partition_filter = FileExtensionFilter(file_extensions=IMAGE_EXTENSIONS) + # We call `_resolve_paths_and_filesystem` so that the dataset root is formatted # in the same way as the paths passed to `_get_class_from_path`. paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem) self.root = paths[0] - paths, _ = meta_provider.expand_paths(paths, filesystem) - paths = [path for path in paths if _is_image_file(path)] + from pyarrow.fs import FileType, FileSelector + + labels = [ + file_info.base_name + for file_info in filesystem.get_file_info(FileSelector(self.root)) + if file_info.type is FileType.Directory + ] + labels.sort() # Sort labels so that targets are consistent. + self.label_to_target = {label: labels.index(label) for label in labels} - return super().prepare_read( - parallelism=parallelism, + return super().create_reader( paths=paths, filesystem=filesystem, - schema=schema, - open_stream_args=open_stream_args, - meta_provider=FastFileMetadataProvider(), partition_filter=partition_filter, - _block_udf=_block_udf, + **kwargs, ) def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args): + import imageio as iio import pandas as pd + from ray.data.extensions import TensorArray records = super()._read_file(f, path, include_paths=True) assert len(records) == 1 @@ -117,12 +116,15 @@ def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args): image = iio.imread(data) label = _get_class_from_path(path, self.root) - - return pd.DataFrame({"image": [np.array(image)], "label": [label]}) - - -def _is_image_file(path: str) -> bool: - return any(path.lower().endswith(extension) for extension in IMAGE_EXTENSIONS) + target = self.label_to_target[label] + + return pd.DataFrame( + { + "image": TensorArray([np.array(image)]), + "label": [label], + "target": [target], + } + ) def _get_class_from_path(path: str, root: str) -> str: diff --git a/python/ray/data/tests/image-folder/cat/foo.jpg b/python/ray/data/tests/image-folder/cat/foo.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9df96a2de1ff6fdb3f1fdc7c1cee07bc6e49daff GIT binary patch literal 923 zcmex=_1P|rX?qqI0P zFI~aY%U!`Mz|~!$%)&rZN1?DZF(yimrOX!XqN1l2cOC z(lau%ic3n%$}1|Xnp;}i+B-VCCQY6)b=ve9GiNPYykzOJeA&aSFc^aar4&0M~|O8efIpt%U2&ieg5+G+xH(oe}VkP$iNKo7TjlO z{t^WGi;0DWnS~wXFGi+vAZ8Y1VO2C_6LJh>Pb?HxGHT=yahkYr<3UbkKb$@|Xn~>={b-^Sx1pn zGrSIp{$p`D=`+h^U!#o=WO(W<{xfiGU0w0$*tLs&b7${7??16gf2X?WvD1%h%RI}u zH#FOqD#!3&TJ2P7zRphk=C;rI)ix>pQ!IEG|7!i4`(SJ1#jl)4ep@N6WBm1>q1APH z#miaO1gq2D8=k0Ce&}wKQg*hiy4GE-+jYuPjW;P-w;ZmR+|*6_wDw4g&}j*khT@*W zm}zq#{INZ7MR|(Ejmn(WpSD|`-!GH2SAy}{D93NZ#bCMzD!Oy foV@)?xQjX8mZ!af+qG7x_f6Srbzy4b{QoxrB28ZI literal 0 HcmV?d00001 diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py index aca805d13662..894565591194 100644 --- a/python/ray/data/tests/test_dataset_formats.py +++ b/python/ray/data/tests/test_dataset_formats.py @@ -42,6 +42,7 @@ _deserialize_pieces_with_retry, ) from ray.data.tests.conftest import * # noqa +from ray.data.extensions import TensorDtype from ray.tests.conftest import * # noqa from ray.types import ObjectRef @@ -2858,12 +2859,16 @@ def test_image_folder_datasource(ray_start_regular_shared): root = os.path.join(os.path.dirname(__file__), "image-folder") ds = ray.data.read_datasource(ImageFolderDatasource(), paths=[root]) - assert ds.count() == 2 + assert ds.count() == 3 df = ds.to_pandas() - assert set(df["label"]) == {"cat", "dog"} - assert all(isinstance(array, np.ndarray) for array in df["image"]) - assert all(array.shape == (32, 32, 3) for array in df["image"]) + assert type(df["image"].dtype) is TensorDtype + assert all(tensor.to_numpy().shape == (32, 32, 3) for tensor in df["image"]) + + df = df.sort_values("label") + # Targets should be assigned alphabetically to labels. + assert df["label"].tolist() == ["cat", "cat", "dog"] + assert df["target"].tolist() == [0, 0, 1] def test_image_folder_datasource_raises_value_error(ray_start_regular_shared): From 3c9e3c03977b0c8261a08e05a0ac338edd52492c Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Thu, 14 Jul 2022 17:05:12 -0700 Subject: [PATCH 14/23] Add read API --- python/ray/data/__init__.py | 1 + .../datasource/image_folder_datasource.py | 38 +--------------- python/ray/data/read_api.py | 45 +++++++++++++++++++ python/ray/data/tests/test_dataset_formats.py | 13 +----- 4 files changed, 49 insertions(+), 48 deletions(-) diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py index d9770fc099f6..295503eb0f4c 100644 --- a/python/ray/data/__init__.py +++ b/python/ray/data/__init__.py @@ -33,6 +33,7 @@ read_parquet, read_parquet_bulk, read_text, + read_image_folder, ) # Register custom Arrow JSON ReadOptions serializer after worker has initialized. diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py index 169a51e45bd1..2233c6b73d68 100644 --- a/python/ray/data/datasource/image_folder_datasource.py +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -18,43 +18,7 @@ class ImageFolderDatasource(BinaryDatasource): - """A datasource that lets you read datasets like `ImageNet `_. # noqa: E501 - - This datasource works with any dataset where images are arranged in this way: - - .. code-block:: - - root/dog/xxx.png - root/dog/xxy.png - root/dog/[...]/xxz.png - - root/cat/123.png - root/cat/nsdf3.png - root/cat/[...]/asd932_.png - - Datasets read with ``ImageFolderDatasource`` contain two columns: ``'image'`` and - ``'label'``. The ``'image'`` column contains ``ndarray`` objects of shape - :math:`(H, W, C)`, and the ``label`` column contains strings corresponding to - labels. - - Examples: - >>> import ray - >>> from ray.data.datasource import ImageFolderDatasource - >>> - >>> ds = ray.data.read_datasource( # doctest: +SKIP - ... ImageFolderDatasource(), - ... paths=["/data/imagenet/train"] - ... ) - >>> sample = ds.take(1)[0] # doctest: +SKIP - >>> sample["image"].shape # doctest: +SKIP - (469, 387, 3) - >>> sample["label"] # doctest: +SKIP - 'n01443537' - - Raises: - ValueError: if more than one path is provided. You should only provide the path - to the dataset root. - """ + """A datasource that lets you read datasets like `ImageNet `_.""" # noqa: E501 def create_reader( self, diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index 3b6d5118a1e9..b4b0615017bd 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -24,6 +24,7 @@ DefaultFileMetadataProvider, DefaultParquetMetadataProvider, FastFileMetadataProvider, + ImageFolderDatasource, JSONDatasource, NumpyDatasource, ParquetBaseDatasource, @@ -1069,6 +1070,50 @@ def convert(ds: "datasets.Dataset") -> Dataset[ArrowRow]: ) +@PublicAPI +def read_image_folder(root: str, *, parallelism: int = -1): + """Read a dataset structured like `ImageNet `_. + + This function works with any dataset where images are arranged in this way: + + .. code-block:: + + root/dog/xxx.png + root/dog/xxy.png + root/dog/[...]/xxz.png + + root/cat/123.png + root/cat/nsdf3.png + root/cat/[...]/asd932_.png + + Datasets read with this function contain three columns: ``'image'``, ``'label'`` and + ``'target'``. + + * The ``'image'`` column contains ``ndarray`` objects of shape :math:`(H, W, C)` + * The ``'label'`` column contains strings representing class names. + * The ``'target'`` column contain integer targets corresponding to class. + + Arguments: + path: Path to the directory root. + parallelism: The user-requested parallelism, or -1 for autodetection. + + Examples: + >>> import ray + >>> + >>> ds = ray.data.read_image_folder("/data/imagenet/train") + >>> sample = ds.take(1)[0] # doctest: +SKIP + >>> sample["image"].shape # doctest: +SKIP + (469, 387, 3) + >>> sample["label"] # doctest: +SKIP + 'n01443537' + >>> sample["target] # doctest: +SKIP + 71 + """ + return read_datasource( + ImageFolderDatasource(), paths=[root], parallelism=parallelism + ) + + def _df_to_block(df: "pandas.DataFrame") -> Block[ArrowRow]: stats = BlockExecStats.builder() import pyarrow as pa diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py index 894565591194..7b087842ab82 100644 --- a/python/ray/data/tests/test_dataset_formats.py +++ b/python/ray/data/tests/test_dataset_formats.py @@ -26,7 +26,6 @@ DefaultParquetMetadataProvider, DummyOutputDatasource, FastFileMetadataProvider, - ImageFolderDatasource, PartitionStyle, PathPartitionEncoder, PathPartitionFilter, @@ -2855,9 +2854,9 @@ def get_node_id(): assert node_ids == {bar_node_id} -def test_image_folder_datasource(ray_start_regular_shared): +def test_read_image_folder(ray_start_regular_shared): root = os.path.join(os.path.dirname(__file__), "image-folder") - ds = ray.data.read_datasource(ImageFolderDatasource(), paths=[root]) + ds = ray.data.read_image_folder(root) assert ds.count() == 3 @@ -2871,14 +2870,6 @@ def test_image_folder_datasource(ray_start_regular_shared): assert df["target"].tolist() == [0, 0, 1] -def test_image_folder_datasource_raises_value_error(ray_start_regular_shared): - # `ImageFolderDatasource` should raise an error if more than one path is passed. - with pytest.raises(ValueError): - ray.data.read_datasource( - ImageFolderDatasource(), paths=["imagenet/train", "imagenet/test"] - ) - - def test_read_text_remote_args(ray_start_cluster, tmp_path): cluster = ray_start_cluster cluster.add_node( From cdaca5991162e452dd2b5ba5d735d9815b793eb8 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Thu, 14 Jul 2022 17:37:14 -0700 Subject: [PATCH 15/23] Fix error in documentation --- python/ray/data/read_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index ab34c8ffc09b..79b0415119fb 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -1093,7 +1093,7 @@ def read_image_folder(root: str, *, parallelism: int = -1): Datasets read with this function contain three columns: ``'image'``, ``'label'`` and ``'target'``. - * The ``'image'`` column contains ``ndarray`` objects of shape :math:`(H, W, C)` + * The ``'image'`` column is of type ``TensorDtype`` and contains tensors of shape :math:`(H, W, C)`. * The ``'label'`` column contains strings representing class names. * The ``'target'`` column contain integer targets corresponding to class. From 7c0a31754495b5e86d82d1e70f9841c543e0d076 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 15 Jul 2022 11:16:02 -0700 Subject: [PATCH 16/23] Update documentation and add test --- doc/source/data/package-ref.rst | 1 + python/ray/data/read_api.py | 28 +++++++++++---- python/ray/data/tests/test_dataset_formats.py | 34 +++++++++++++++++++ 3 files changed, 56 insertions(+), 7 deletions(-) diff --git a/doc/source/data/package-ref.rst b/doc/source/data/package-ref.rst index 39e0c9c6f3e6..c5606fb8f444 100644 --- a/doc/source/data/package-ref.rst +++ b/doc/source/data/package-ref.rst @@ -16,6 +16,7 @@ Creating Datasets .. autofunction:: ray.data.read_numpy .. autofunction:: ray.data.read_text .. autofunction:: ray.data.read_binary_files +.. autofunction:: ray.data.read_image_folder .. autofunction:: ray.data.read_datasource .. autofunction:: ray.data.from_items .. autofunction:: ray.data.from_arrow diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index 79b0415119fb..3c962f0fc2b0 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -1090,12 +1090,12 @@ def read_image_folder(root: str, *, parallelism: int = -1): root/cat/nsdf3.png root/cat/[...]/asd932_.png - Datasets read with this function contain three columns: ``'image'``, ``'label'`` and - ``'target'``. + Datasets read with this function contain two columns: ``'image'`` and ``'label'``. - * The ``'image'`` column is of type ``TensorDtype`` and contains tensors of shape :math:`(H, W, C)`. - * The ``'label'`` column contains strings representing class names. - * The ``'target'`` column contain integer targets corresponding to class. + * The ``'image'`` column is of type + :py:class:`~ray.air.util.tensor_extensions.pandas.TensorDtype` and contains + tensors of shape :math:`(H, W, C)`. + * The ``'label'`` column contains strings representing class names (e.g., 'cat'). Arguments: path: Path to the directory root. @@ -1105,12 +1105,26 @@ def read_image_folder(root: str, *, parallelism: int = -1): >>> import ray >>> >>> ds = ray.data.read_image_folder("/data/imagenet/train") + >>> >>> sample = ds.take(1)[0] # doctest: +SKIP - >>> sample["image"].shape # doctest: +SKIP + >>> sample["image"].to_numpy().shape # doctest: +SKIP (469, 387, 3) >>> sample["label"] # doctest: +SKIP 'n01443537' - >>> sample["target] # doctest: +SKIP + + To convert class labels to integer-valued targets, use + :py:class:`~ray.data.preprocessors.OrdinalEncoder`. + + >>> import ray + >>> from ray.data.preprocessors import OrdinalEncoder + >>> + >>> ds = ray.data.read_image_folder("/data/imagenet/train") + >>> oe = OrdinalEncoder(columns=["label"]) + >>> + >>> ds = oe.fit_transform(ds) + >>> + >>> sample = ds.take(1)[0] + >>> sample["label"] 71 """ return read_datasource( diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py index 990a6de651cf..13e54618b351 100644 --- a/python/ray/data/tests/test_dataset_formats.py +++ b/python/ray/data/tests/test_dataset_formats.py @@ -40,6 +40,7 @@ _SerializedPiece, _deserialize_pieces_with_retry, ) +from ray.data.preprocessors import BatchMapper from ray.data.tests.conftest import * # noqa from ray.data.extensions import TensorDtype from ray.tests.conftest import * # noqa @@ -2871,6 +2872,39 @@ def test_read_image_folder(ray_start_regular_shared): assert df["target"].tolist() == [0, 0, 1] +def test_read_image_folder_e2e(ray_start_regular_shared): + from ray.air.util.tensor_extensions.pandas import TensorArray + from ray.train.torch import to_air_checkpoint, TorchPredictor + from ray.train.batch_predictor import BatchPredictor + + from torchvision import transforms + from torchvision.models import resnet18 + + dataset = ray.data.read_image_folder("image-folder") + + def preprocess(df): + # We convert the `TensorArrayElement` to a NumPy array because `ToTensor` + # expects a NumPy array or PIL image. `ToTensor` is necessary because Torch + # expects images to have shape (C, H, W), and `ToTensor` changes the shape of + # the data from (H, W, C) to (C, H, W). + preprocess = transforms.Compose( + [ + lambda ray_tensor: ray_tensor.to_numpy(), + transforms.ToTensor(), + ] + ) + df["image"] = TensorArray([preprocess(image) for image in df["image"]]) + return df + + preprocessor = BatchMapper(preprocess) + + model = resnet18(pretrained=True) + checkpoint = to_air_checkpoint(model=model, preprocessor=preprocessor) + + predictor = BatchPredictor.from_checkpoint(checkpoint, TorchPredictor) + predictor.predict(dataset, feature_columns=["image"]) + + def test_read_text_remote_args(ray_start_cluster, tmp_path): cluster = ray_start_cluster cluster.add_node( From e07d1f8570ecdfdcd5199769d2ee97d743bcedc5 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 15 Jul 2022 11:22:41 -0700 Subject: [PATCH 17/23] Remove `target` column --- .../ray/data/datasource/image_folder_datasource.py | 12 ------------ python/ray/data/tests/test_dataset_formats.py | 6 +----- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py index 2233c6b73d68..52c27de16ac9 100644 --- a/python/ray/data/datasource/image_folder_datasource.py +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -52,16 +52,6 @@ def create_reader( paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem) self.root = paths[0] - from pyarrow.fs import FileType, FileSelector - - labels = [ - file_info.base_name - for file_info in filesystem.get_file_info(FileSelector(self.root)) - if file_info.type is FileType.Directory - ] - labels.sort() # Sort labels so that targets are consistent. - self.label_to_target = {label: labels.index(label) for label in labels} - return super().create_reader( paths=paths, filesystem=filesystem, @@ -80,13 +70,11 @@ def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args): image = iio.imread(data) label = _get_class_from_path(path, self.root) - target = self.label_to_target[label] return pd.DataFrame( { "image": TensorArray([np.array(image)]), "label": [label], - "target": [target], } ) diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py index 13e54618b351..3e02a424559b 100644 --- a/python/ray/data/tests/test_dataset_formats.py +++ b/python/ray/data/tests/test_dataset_formats.py @@ -2863,14 +2863,10 @@ def test_read_image_folder(ray_start_regular_shared): assert ds.count() == 3 df = ds.to_pandas() + assert sorted(df["label"]) == ["cat", "cat", "dog"] assert type(df["image"].dtype) is TensorDtype assert all(tensor.to_numpy().shape == (32, 32, 3) for tensor in df["image"]) - df = df.sort_values("label") - # Targets should be assigned alphabetically to labels. - assert df["label"].tolist() == ["cat", "cat", "dog"] - assert df["target"].tolist() == [0, 0, 1] - def test_read_image_folder_e2e(ray_start_regular_shared): from ray.air.util.tensor_extensions.pandas import TensorArray From 298c1442c371eec645bb6681269c8e5db8bf8ae3 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 15 Jul 2022 11:25:48 -0700 Subject: [PATCH 18/23] Change error type from `ValueError` to `ImportError` --- python/ray/data/datasource/image_folder_datasource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py index 52c27de16ac9..89cf50a7f82f 100644 --- a/python/ray/data/datasource/image_folder_datasource.py +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -38,7 +38,7 @@ def create_reader( try: import imageio # noqa: F401 except ImportError: - raise ValueError( + raise ImportError( "`ImageFolderDatasource` depends on 'imageio', but 'imageio' couldn't " "be imported. You can install 'imageio' by running " "`pip install imageio`." From dc9c6e217627ed78e26b3df57096844063648e2b Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 15 Jul 2022 11:53:24 -0700 Subject: [PATCH 19/23] Remove `read_image_folder` --- doc/source/data/package-ref.rst | 1 - python/ray/data/__init__.py | 1 - .../datasource/image_folder_datasource.py | 55 ++++++++++++++++- python/ray/data/read_api.py | 59 ------------------- python/ray/data/tests/test_dataset_formats.py | 18 ++++-- 5 files changed, 68 insertions(+), 66 deletions(-) diff --git a/doc/source/data/package-ref.rst b/doc/source/data/package-ref.rst index c5606fb8f444..39e0c9c6f3e6 100644 --- a/doc/source/data/package-ref.rst +++ b/doc/source/data/package-ref.rst @@ -16,7 +16,6 @@ Creating Datasets .. autofunction:: ray.data.read_numpy .. autofunction:: ray.data.read_text .. autofunction:: ray.data.read_binary_files -.. autofunction:: ray.data.read_image_folder .. autofunction:: ray.data.read_datasource .. autofunction:: ray.data.from_items .. autofunction:: ray.data.from_arrow diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py index 295503eb0f4c..d9770fc099f6 100644 --- a/python/ray/data/__init__.py +++ b/python/ray/data/__init__.py @@ -33,7 +33,6 @@ read_parquet, read_parquet_bulk, read_text, - read_image_folder, ) # Register custom Arrow JSON ReadOptions serializer after worker has initialized. diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py index 89cf50a7f82f..9aeb500e616e 100644 --- a/python/ray/data/datasource/image_folder_datasource.py +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -18,7 +18,60 @@ class ImageFolderDatasource(BinaryDatasource): - """A datasource that lets you read datasets like `ImageNet `_.""" # noqa: E501 + """A datasource that lets you read datasets like `ImageNet `_. + + This datasource works with any dataset where images are arranged in this way: + + .. code-block:: + + root/dog/xxx.png + root/dog/xxy.png + root/dog/[...]/xxz.png + + root/cat/123.png + root/cat/nsdf3.png + root/cat/[...]/asd932_.png + + Datasets read with this datasource contain two columns: ``'image'`` and ``'label'``. + + * The ``'image'`` column is of type + :py:class:`~ray.air.util.tensor_extensions.pandas.TensorDtype` and contains + tensors of shape :math:`(H, W, C)`. + * The ``'label'`` column contains strings representing class names (e.g., 'cat'). + + Examples: + >>> import ray + >>> from ray.data.datasource import ImageFolderDatasource + >>> + >>> ds = ray.data.read_datasource( + ... ImageFolderDatasource(), + ... paths=["/data/imagenet/train"] + ... ) + >>> + >>> sample = ds.take(1)[0] # doctest: +SKIP + >>> sample["image"].to_numpy().shape # doctest: +SKIP + (469, 387, 3) + >>> sample["label"] # doctest: +SKIP + 'n01443537' + + To convert class labels to integer-valued targets, use + :py:class:`~ray.data.preprocessors.OrdinalEncoder`. + + >>> import ray + >>> from ray.data.preprocessors import OrdinalEncoder + >>> + >>> ds = ray.data.read_datasource( + ... ImageFolderDatasource(), + ... paths=["/data/imagenet/train"] + ... ) + >>> oe = OrdinalEncoder(columns=["label"]) + >>> + >>> ds = oe.fit_transform(ds) + >>> + >>> sample = ds.take(1)[0] + >>> sample["label"] + 71 + """ # noqa: E501 def create_reader( self, diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index 3c962f0fc2b0..b1350fbd08b4 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -27,7 +27,6 @@ DefaultFileMetadataProvider, DefaultParquetMetadataProvider, FastFileMetadataProvider, - ImageFolderDatasource, JSONDatasource, NumpyDatasource, ParquetBaseDatasource, @@ -1074,64 +1073,6 @@ def convert(ds: "datasets.Dataset") -> Dataset[ArrowRow]: ) -@PublicAPI -def read_image_folder(root: str, *, parallelism: int = -1): - """Read a dataset structured like `ImageNet `_. - - This function works with any dataset where images are arranged in this way: - - .. code-block:: - - root/dog/xxx.png - root/dog/xxy.png - root/dog/[...]/xxz.png - - root/cat/123.png - root/cat/nsdf3.png - root/cat/[...]/asd932_.png - - Datasets read with this function contain two columns: ``'image'`` and ``'label'``. - - * The ``'image'`` column is of type - :py:class:`~ray.air.util.tensor_extensions.pandas.TensorDtype` and contains - tensors of shape :math:`(H, W, C)`. - * The ``'label'`` column contains strings representing class names (e.g., 'cat'). - - Arguments: - path: Path to the directory root. - parallelism: The user-requested parallelism, or -1 for autodetection. - - Examples: - >>> import ray - >>> - >>> ds = ray.data.read_image_folder("/data/imagenet/train") - >>> - >>> sample = ds.take(1)[0] # doctest: +SKIP - >>> sample["image"].to_numpy().shape # doctest: +SKIP - (469, 387, 3) - >>> sample["label"] # doctest: +SKIP - 'n01443537' - - To convert class labels to integer-valued targets, use - :py:class:`~ray.data.preprocessors.OrdinalEncoder`. - - >>> import ray - >>> from ray.data.preprocessors import OrdinalEncoder - >>> - >>> ds = ray.data.read_image_folder("/data/imagenet/train") - >>> oe = OrdinalEncoder(columns=["label"]) - >>> - >>> ds = oe.fit_transform(ds) - >>> - >>> sample = ds.take(1)[0] - >>> sample["label"] - 71 - """ - return read_datasource( - ImageFolderDatasource(), paths=[root], parallelism=parallelism - ) - - def _df_to_block(df: "pandas.DataFrame") -> Block[ArrowRow]: stats = BlockExecStats.builder() import pyarrow as pa diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py index 3e02a424559b..4f7b49244beb 100644 --- a/python/ray/data/tests/test_dataset_formats.py +++ b/python/ray/data/tests/test_dataset_formats.py @@ -26,6 +26,7 @@ DefaultParquetMetadataProvider, DummyOutputDatasource, FastFileMetadataProvider, + ImageFolderDatasource, PartitionStyle, PathPartitionEncoder, PathPartitionFilter, @@ -2856,9 +2857,9 @@ def get_node_id(): assert node_ids == {bar_node_id} -def test_read_image_folder(ray_start_regular_shared): +def test_image_folder_datasource(ray_start_regular_shared): root = os.path.join(os.path.dirname(__file__), "image-folder") - ds = ray.data.read_image_folder(root) + ds = ray.data.read_datasource(ImageFolderDatasource(), paths=[root]) assert ds.count() == 3 @@ -2868,7 +2869,15 @@ def test_read_image_folder(ray_start_regular_shared): assert all(tensor.to_numpy().shape == (32, 32, 3) for tensor in df["image"]) -def test_read_image_folder_e2e(ray_start_regular_shared): +def test_image_folder_datasource_raises_value_error(ray_start_regular_shared): + # `ImageFolderDatasource` should raise an error if more than one path is passed. + with pytest.raises(ValueError): + ray.data.read_datasource( + ImageFolderDatasource(), paths=["imagenet/train", "imagenet/test"] + ) + + +def test_image_folder_datasource_e2e(ray_start_regular_shared): from ray.air.util.tensor_extensions.pandas import TensorArray from ray.train.torch import to_air_checkpoint, TorchPredictor from ray.train.batch_predictor import BatchPredictor @@ -2876,7 +2885,8 @@ def test_read_image_folder_e2e(ray_start_regular_shared): from torchvision import transforms from torchvision.models import resnet18 - dataset = ray.data.read_image_folder("image-folder") + root = os.path.join(os.path.dirname(__file__), "image-folder") + dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[root]) def preprocess(df): # We convert the `TensorArrayElement` to a NumPy array because `ToTensor` From 1d4d9205ddd2d46c44ab707ea03a09109209559e Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 15 Jul 2022 13:04:18 -0700 Subject: [PATCH 20/23] Add API annotation --- python/ray/data/datasource/image_folder_datasource.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py index 9aeb500e616e..21b004191595 100644 --- a/python/ray/data/datasource/image_folder_datasource.py +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -17,6 +17,7 @@ IMAGE_EXTENSIONS = ["png", "jpg", "jpeg", "tiff", "bmp", "gif"] +@DeveloperAPI class ImageFolderDatasource(BinaryDatasource): """A datasource that lets you read datasets like `ImageNet `_. From 0f5e089b15e1fdb368dc5286d085654831c428b4 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 15 Jul 2022 13:05:11 -0700 Subject: [PATCH 21/23] Add missing `DeveloperAPI` import --- python/ray/data/datasource/image_folder_datasource.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py index 21b004191595..7ff08afce337 100644 --- a/python/ray/data/datasource/image_folder_datasource.py +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -9,6 +9,7 @@ FileExtensionFilter, ) from ray.data.datasource.partitioning import PathPartitionFilter +from ray.util.annotations import DeveloperAPI if TYPE_CHECKING: import pyarrow From 18dd5662811dc31791447a94a102951bcd910394 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Fri, 15 Jul 2022 15:14:08 -0700 Subject: [PATCH 22/23] Skip doctests --- .../ray/data/datasource/image_folder_datasource.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py index 7ff08afce337..dcf6843cf75d 100644 --- a/python/ray/data/datasource/image_folder_datasource.py +++ b/python/ray/data/datasource/image_folder_datasource.py @@ -45,7 +45,7 @@ class ImageFolderDatasource(BinaryDatasource): >>> import ray >>> from ray.data.datasource import ImageFolderDatasource >>> - >>> ds = ray.data.read_datasource( + >>> ds = ray.data.read_datasource( # doctest: +SKIP ... ImageFolderDatasource(), ... paths=["/data/imagenet/train"] ... ) @@ -62,16 +62,16 @@ class ImageFolderDatasource(BinaryDatasource): >>> import ray >>> from ray.data.preprocessors import OrdinalEncoder >>> - >>> ds = ray.data.read_datasource( + >>> ds = ray.data.read_datasource( # doctest: +SKIP ... ImageFolderDatasource(), ... paths=["/data/imagenet/train"] ... ) - >>> oe = OrdinalEncoder(columns=["label"]) + >>> oe = OrdinalEncoder(columns=["label"]) # doctest: +SKIP >>> - >>> ds = oe.fit_transform(ds) + >>> ds = oe.fit_transform(ds) # doctest: +SKIP >>> - >>> sample = ds.take(1)[0] - >>> sample["label"] + >>> sample = ds.take(1)[0] # doctest: +SKIP + >>> sample["label"] # doctest: +SKIP 71 """ # noqa: E501 From 70a47d7fa6566a85f1fcac7ed40971dc1bc0e3ac Mon Sep 17 00:00:00 2001 From: Richard Liaw Date: Fri, 15 Jul 2022 21:06:36 -0700 Subject: [PATCH 23/23] update Signed-off-by: Richard Liaw --- python/ray/data/tests/test_dataset_formats.py | 108 +++++++++--------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py index 5c994a00ca63..d277c16ef393 100644 --- a/python/ray/data/tests/test_dataset_formats.py +++ b/python/ray/data/tests/test_dataset_formats.py @@ -2732,6 +2732,60 @@ def test_torch_datasource_value_error(ray_start_regular_shared, local_path): ) +def test_image_folder_datasource(ray_start_regular_shared): + root = os.path.join(os.path.dirname(__file__), "image-folder") + ds = ray.data.read_datasource(ImageFolderDatasource(), paths=[root]) + + assert ds.count() == 3 + + df = ds.to_pandas() + assert sorted(df["label"]) == ["cat", "cat", "dog"] + assert type(df["image"].dtype) is TensorDtype + assert all(tensor.to_numpy().shape == (32, 32, 3) for tensor in df["image"]) + + +def test_image_folder_datasource_raises_value_error(ray_start_regular_shared): + # `ImageFolderDatasource` should raise an error if more than one path is passed. + with pytest.raises(ValueError): + ray.data.read_datasource( + ImageFolderDatasource(), paths=["imagenet/train", "imagenet/test"] + ) + + +def test_image_folder_datasource_e2e(ray_start_regular_shared): + from ray.air.util.tensor_extensions.pandas import TensorArray + from ray.train.torch import to_air_checkpoint, TorchPredictor + from ray.train.batch_predictor import BatchPredictor + + from torchvision import transforms + from torchvision.models import resnet18 + + root = os.path.join(os.path.dirname(__file__), "image-folder") + dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[root]) + + def preprocess(df): + # We convert the `TensorArrayElement` to a NumPy array because `ToTensor` + # expects a NumPy array or PIL image. `ToTensor` is necessary because Torch + # expects images to have shape (C, H, W), and `ToTensor` changes the shape of + # the data from (H, W, C) to (C, H, W). + preprocess = transforms.Compose( + [ + lambda ray_tensor: ray_tensor.to_numpy(), + transforms.ToTensor(), + ] + ) + df["image"] = TensorArray([preprocess(image) for image in df["image"]]) + return df + + preprocessor = BatchMapper(preprocess) + + model = resnet18(pretrained=True) + checkpoint = to_air_checkpoint(model=model, preprocessor=preprocessor) + + predictor = BatchPredictor.from_checkpoint(checkpoint, TorchPredictor) + predictor.predict(dataset, feature_columns=["image"]) + + # NOTE: The last test using the shared ray_start_regular_shared cluster must use the # shutdown_only fixture so the shared cluster is shut down, otherwise the below # test_write_datasource_ray_remote_args test, which uses a cluster_utils cluster, will @@ -2867,60 +2921,6 @@ def get_node_id(): assert node_ids == {bar_node_id} -def test_image_folder_datasource(ray_start_regular_shared): - root = os.path.join(os.path.dirname(__file__), "image-folder") - ds = ray.data.read_datasource(ImageFolderDatasource(), paths=[root]) - - assert ds.count() == 3 - - df = ds.to_pandas() - assert sorted(df["label"]) == ["cat", "cat", "dog"] - assert type(df["image"].dtype) is TensorDtype - assert all(tensor.to_numpy().shape == (32, 32, 3) for tensor in df["image"]) - - -def test_image_folder_datasource_raises_value_error(ray_start_regular_shared): - # `ImageFolderDatasource` should raise an error if more than one path is passed. - with pytest.raises(ValueError): - ray.data.read_datasource( - ImageFolderDatasource(), paths=["imagenet/train", "imagenet/test"] - ) - - -def test_image_folder_datasource_e2e(ray_start_regular_shared): - from ray.air.util.tensor_extensions.pandas import TensorArray - from ray.train.torch import to_air_checkpoint, TorchPredictor - from ray.train.batch_predictor import BatchPredictor - - from torchvision import transforms - from torchvision.models import resnet18 - - root = os.path.join(os.path.dirname(__file__), "image-folder") - dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[root]) - - def preprocess(df): - # We convert the `TensorArrayElement` to a NumPy array because `ToTensor` - # expects a NumPy array or PIL image. `ToTensor` is necessary because Torch - # expects images to have shape (C, H, W), and `ToTensor` changes the shape of - # the data from (H, W, C) to (C, H, W). - preprocess = transforms.Compose( - [ - lambda ray_tensor: ray_tensor.to_numpy(), - transforms.ToTensor(), - ] - ) - df["image"] = TensorArray([preprocess(image) for image in df["image"]]) - return df - - preprocessor = BatchMapper(preprocess) - - model = resnet18(pretrained=True) - checkpoint = to_air_checkpoint(model=model, preprocessor=preprocessor) - - predictor = BatchPredictor.from_checkpoint(checkpoint, TorchPredictor) - predictor.predict(dataset, feature_columns=["image"]) - - def test_read_text_remote_args(ray_start_cluster, tmp_path): cluster = ray_start_cluster cluster.add_node(