From f6eb66e86428984d1d38d14109527d24ecf03f12 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Mon, 9 May 2022 22:58:53 -0700
Subject: [PATCH 01/23] Add ImageFolderDatasource; return multi-element cells as-is in pandas_block

---
 .../datasource/image_folder_datasource.py     | 77 +++++++++++++++++++
 python/ray/data/impl/pandas_block.py          |  9 ++-
 2 files changed, 83 insertions(+), 3 deletions(-)
 create mode 100644 python/ray/data/datasource/image_folder_datasource.py

diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
new file mode 100644
index 000000000000..991ce42e012e
--- /dev/null
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -0,0 +1,77 @@
+import pathlib
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import imageio as iio
+import numpy as np
+from pyarrow.fs import FileSelector, FileType
+from ray.data.block import Block
+from ray.data.datasource.binary_datasource import BinaryDatasource
+from ray.data.datasource.datasource import ReadTask
+from ray.data.datasource.file_based_datasource import _resolve_paths_and_filesystem
+from ray.data.datasource.file_meta_provider import (
+    BaseFileMetadataProvider,
+    DefaultFileMetadataProvider,
+    FastFileMetadataProvider,
+)
+from ray.data.datasource.partitioning import PathPartitionFilter
+
+IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"]
+
+
+class ImageFolderDatasource(BinaryDatasource):
+    def prepare_read(
+        self,
+        parallelism: int,
+        paths: Union[str, List[str]],
+        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
+        schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
+        open_stream_args: Optional[Dict[str, Any]] = None,
+        meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
+        partition_filter: PathPartitionFilter = None,
+        # TODO(ekl) deprecate this once read fusion is available.
+        _block_udf: Optional[Callable[[Block], Block]] = None,
+        **reader_args,
+    ) -> List[ReadTask]:
+        paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
+        assert len(paths) == 1
+        self.root = paths[0]
+
+        paths, _ = meta_provider.expand_paths(paths, filesystem)
+        paths = [path for path in paths if _is_image_file(path)]
+
+        return super().prepare_read(
+            parallelism=parallelism,
+            paths=paths,
+            filesystem=filesystem,
+            schema=schema,
+            open_stream_args=open_stream_args,
+            meta_provider=FastFileMetadataProvider(),
+            partition_filter=partition_filter,
+            _block_udf=_block_udf,
+        )
+
+    def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args):
+        import pandas as pd
+
+        records = super()._read_file(f, path, include_paths=True)
+        assert len(records) == 1
+        path, data = records[0]
+
+        image = iio.imread(data)
+        label = _get_class_from_path(path, self.root)
+
+        return pd.DataFrame({"image": [np.array(image)], "label": [label]})
+
+
+def _is_image_file(path: str) -> bool:
+    return any(path.lower().endswith(extension) for extension in IMAGE_EXTENSIONS)
+
+
+def _get_class_from_path(path: str, root: str) -> str:
+    # The class is the name of the first directory after the root. For example, if
+    # the root is "/data/imagenet/train" and the path is
+    # "/data/imagenet/train/n01443537/images/n01443537_0.JPEG", then the class is
+    # "n01443537".
+    path, root = pathlib.PurePath(path), pathlib.PurePath(root)
+    assert root in path.parents
+    return path.parts[len(root.parts) :][0]
\ No newline at end of file
diff --git a/python/ray/data/impl/pandas_block.py b/python/ray/data/impl/pandas_block.py
index 07bbdc1eb50a..308a60b05cf4 100644
--- a/python/ray/data/impl/pandas_block.py
+++ b/python/ray/data/impl/pandas_block.py
@@ -49,9 +49,12 @@ def __getitem__(self, key: str) -> Any:
             return None
         item = col.iloc[0]
         try:
-            # Try to interpret this as a numpy-type value.
-            # See https://stackoverflow.com/questions/9452775/converting-numpy-dtypes-to-native-python-types.  # noqa: E501
-            return item.item()
+            if item.size == 1:
+                # Try to interpret this as a numpy-type value.
+                # See https://stackoverflow.com/questions/9452775/converting-numpy-dtypes-to-native-python-types.  # noqa: E501
+                return item.item()
+            else:
+                return item
         except AttributeError:
             # Fallback to the original form.
             return item

From 8fb8d75d79a59b403d28ce7839437f743d80830b Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Wed, 11 May 2022 01:35:14 -0700
Subject: [PATCH 02/23] Export ImageFolderDatasource; add docstring, input checks, and tests

---
 python/ray/data/datasource/__init__.py        |   2 +
 .../datasource/image_folder_datasource.py     |  51 +++++++++++++++++-
 .../ray/data/tests/image-folder/cat/123.png   | Bin 0 -> 2109 bytes
 .../data/tests/image-folder/cat/not-an-image  |   0
 .../ray/data/tests/image-folder/dog/xxx.png   | Bin 0 -> 2370 bytes
 .../ray/data/tests/image-folder/not-an-image  |   0
 python/ray/data/tests/test_dataset_formats.py |  21 ++++++++
 7 files changed, 72 insertions(+), 2 deletions(-)
 create mode 100644 python/ray/data/tests/image-folder/cat/123.png
 create mode 100644 python/ray/data/tests/image-folder/cat/not-an-image
 create mode 100644 python/ray/data/tests/image-folder/dog/xxx.png
 create mode 100644 python/ray/data/tests/image-folder/not-an-image

diff --git a/python/ray/data/datasource/__init__.py b/python/ray/data/datasource/__init__.py
index e51390b1b953..149127dc2ce5 100644
--- a/python/ray/data/datasource/__init__.py
+++ b/python/ray/data/datasource/__init__.py
@@ -22,6 +22,7 @@
     FileMetadataProvider,
     ParquetMetadataProvider,
 )
+from ray.data.datasource.image_folder_datasource import ImageFolderDatasource
 from ray.data.datasource.json_datasource import JSONDatasource
 from ray.data.datasource.numpy_datasource import NumpyDatasource
 from ray.data.datasource.parquet_base_datasource import ParquetBaseDatasource
@@ -48,6 +49,7 @@
     "FastFileMetadataProvider",
     "FileBasedDatasource",
     "FileMetadataProvider",
+    "ImageFolderDatasource",
     "JSONDatasource",
     "NumpyDatasource",
     "ParquetBaseDatasource",
diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
index 991ce42e012e..49b02918d7b4 100644
--- a/python/ray/data/datasource/image_folder_datasource.py
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -19,6 +19,35 @@
 
 
 class ImageFolderDatasource(BinaryDatasource):
+    """A datasource that allows you to load datasets like ImageNet.
+
+    This datasource works with any dataset where the images are arranged in this way:
+
+    ```
+    root/dog/xxx.png
+    root/dog/xxy.png
+    root/dog/[...]/xxz.png
+
+    root/cat/123.png
+    root/cat/nsdf3.png
+    root/cat/[...]/asd932_.png
+    ```
+
+    Examples:
+        >>> import ray
+        >>> from ray.data.datasource import ImageFolderDatasource
+        >>>
+        >>> ds = ray.data.read_datasource(
+        ...     ImageFolderDatasource(),
+        ...     paths=["s3://tiny-imagenet/train"]
+        ... )
+        >>> TODO
+
+    Raises:
+        ValueError: if more than one path is provided. You should only provide the path
+            to the dataset root.
+    """
+
     def prepare_read(
         self,
         parallelism: int,
@@ -32,8 +61,26 @@ def prepare_read(
         _block_udf: Optional[Callable[[Block], Block]] = None,
         **reader_args,
     ) -> List[ReadTask]:
+        if len(paths) > 1:
+            raise ValueError(
+                "`ImageFolderDatasource` expects 1 path representing the dataset "
+                f"root, but it got {len(paths)} paths instead. To fix this error, "
+                "pass in a single-element list containing the dataset root (for "
+                'example, `paths=["s3://imagenet/train"]`)'
+            )
+
+        try:
+            import imageio
+        except ImportError:
+            raise ValueError(
+                "`ImageFolderDatasource` depends on 'imageio', but 'imageio' couldn't "
+                "be imported. You can install 'imageio' by running "
+                "`pip install imageio`."
+            )
+            
+        # We call `_resolve_paths_and_filesystem` so that the dataset root is formatted
+        # in the same way as the paths passed to `_get_class_from_path`.
         paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
-        assert len(paths) == 1
         self.root = paths[0]
 
         paths, _ = meta_provider.expand_paths(paths, filesystem)
@@ -74,4 +121,4 @@ def _get_class_from_path(path: str, root: str) -> str:
     # "n01443537".
     path, root = pathlib.PurePath(path), pathlib.PurePath(root)
     assert root in path.parents
-    return path.parts[len(root.parts) :][0]
\ No newline at end of file
+    return path.parts[len(root.parts) :][0]
diff --git a/python/ray/data/tests/image-folder/cat/123.png b/python/ray/data/tests/image-folder/cat/123.png
new file mode 100644
index 0000000000000000000000000000000000000000..90797d74701c0b771b7a708fa834a5489801d4a8
GIT binary patch
literal 2109
zcmV-D2*US?P)<h;3K|Lk000e1NJLTq001BW001Be0ssI2{21+{00007bV*G`2iON6
z04pP0^dleu00;3&L_t(YiA~bkj;+^KhT(5CbN=00tJ=NGDLaX+5G-OWWC;$UNJu%#
zO>nCm2P7mQlAQ!Qw&ZSmH?_-EtJdGlImZ|rfDq3WyazA+)9-$Lc7B$}{m}Qj%iI3l
zUs1lexw-xKPyg`Qzx}gPeDv$zUcL7#v>+xA7m;>)eEa9$+p9ljEz<XJ)APSR``h0+
z30aLmgd^PD+=8nWw<Z9&uMc<j>MJdeQ0oSb=1L)E$tu^GC?qF}diweg^~Gly&YtRV
zY4bW9p5<{TLXwiIswPntO{%K*bD745d3C6uwC_IOucztiFeu#|hKq}S=MQ@Sg}S3m
zl*fm=7Y|?kN6b$p4_;er$F$qa?h<hN-*^KeA{_2yvcEjL7{(~psx`2AYxi$q+x~30
zIqT2Q#}rkus+js}%W;15{o@bc)MYJO*|rs}^2JZCKK_lQUI`*#CL$03ieT5f({lIY
z@v<MSVM<+9v2`0?eR;Uv4`bhKtF2hA{V-T_^E&PNoKiKfX6~NGJX}9>?oMkZFK*VF
znOU>uwt1_%y1YEi`!2N&I6m<(^j%&{wY4IPu!?%L1~W7BRzNLJ)4a^};c-5lE^giz
zim(6kUrVsp*;@?{_b?c_!9#=ju~?7`;^F@7Nax*f)&gZYPPy|4QsIC#Gq3Dhn5X^Z
z5BH=J9{qF{OsBiQKosE-7C{8j2m~Xb%+wGCO;WespYp1$O>%iv>M9SfrRya3L2x+g
z7R#(juP+W<p1)YPw)g4sS<M~sQP|wQg*k{22qzFAIwBYmLg&vf+6H$M*nYPMVL9Dp
zn=%QJXwcAQv>9x>X*cfQANSK2pZ@9L&D|S|)4VE?yGJ;j05-A!Ay_|n=hnlA!*IBM
zwjM)O-uvY*sCipw5m<o#XF7A4au;L35G|{|^V9dV>uc!4tCt7qRM@Pw;%+c^B?4<W
z^u53N#KXkP+G2lwIgF#Nr7g!{Qtk#b0-6W85ui*$npCrZIi(N({vV!-zx|irThvTa
zYN@4KYi>@GlO`h8U<+%tl_vDlU;J!4)$Q&@&Q7`~8qBNtDw@4EE6L5%IQEBYfl!;H
zHE{p#FF!T}td&x{mQvPQs=1@qFjIQ^jx5~jY;5fU-f~C7bS3>rse>K2tqe>NzP6G?
zx_$r&NvFxp%CNtF|C0|r2<+CVMXOfB9A@R|aoegUJc6luZKv|~8@alE)?HoXd?}si
zFm4oRB_aT3717?RgHGe2_ej31?dh~^-x7r~TV-<tIT2S<+id7|x*vkz(X91JR29^X
zqjVyqiQ%f+cd*&4Cf1bm@-#QteA;r~QQa6lTWK%@hr??@SOh$1eL5Zux*L?4I1=>-
z!`1Ql<f=M&Eoj?mN;rG|i~uCj#3}Kgzx>OeKL7mr^|McY^N%-|hfO&`2^tUxLe#Cf
z854DVM=f-kG7}4vB-w1ce>m1P-#xVd`t-B)^z_jOH>AGbO_Doj-%5G;;#;q$ZoEy2
z3{;JriODG%AOSWK08Y%Bg+hmpnJ6)(&i1*m>pGWVccqdD?c48u==W3i^LO_bI}!Eh
z`r-1X+u#1|gN&`%;kYg$Tkbm*0RRHvL`DF@3D6`?L`1r7wjAHgC*S7f;(Gk><G=gr
z%g>j$_uERRQ`uk0Fzi5wzxwEt^XC^5vlGTn)8i}msCH=DAtVrB3Xqd-&52mU*&?i!
zd9Dw~<Kt;dd@gwXlXuT<fBxIoFJ8CDAJ-O3*>;|s>gDalZr3eu9`gA+<aameYlwq;
zi(E}hNR@7jzIpN>@}z6*npr7jsaSl!AsZ4cKHR^&IX_(AoWH7DL*0y*>I_pxIo(s9
zxbIr=B$Bx@HpkjHvPY{J4V$OM_7S8mXCBpM3$3J;RUC4xY_Hxzz8EiWv;#c%)1?lR
zmlkb>D!Qp!MeH_QOZz@Jf+#3}AOwTmo$xw^sSt=cL`&4L8eYx0);KLS@!Q*OG?Sdi
zZXCNjG+3xmNAu&7Dl;Ky*M)?yK?(x9*T8}vTsl+)E22c0TXnDDEgVE5%%<I1>2%s$
z`ek0`rzH&sl9W1~PmlL^cdC@TwA<$*LyncHnwfLB^-<SOES!x(Ppj2hx2-Nqo7X5M
zoJuvJq%~xzEq7Gz=k@+&67S9~JMCY7^ZnE5G)x`HtLH{p+zkQfW>l+<WPvaQ13E8J
zYqT1*Q8)<X76`Oj-W&_qZig<@HvjMyKfR*oH>;*M-+zrXc0Id^mx>%@)tQUsQaten
zcmzj81XK?<wys>(lq#j7wQ`HHY*A|-`&R7!hnLf!M1;ldO(jkHfjnBzg$?kIn3F^U
zZB|68pamG=0XRr6uCCT~-PUzoS1Z-lrMI>8ZWRgw8ig<Bl>2TNWq&^HcRRy8KkoJ&
zlr%L6RnvJU;Vo&6#`r%3b=w#w%qb;=hYPAEP82Q{kRIMr(k#<%7%vXFAACEVW?PO&
zn)W>+JJQo8HRfH4Fbe!X5$d&hs{~5Sp&F5iLR^)S*Aid~(LU$>PNyC~S-fth-nMn|
zL^KR7dJpv3DVT!YBVZmt5IHG)V;De0k;D-ZgfOOH3vv%3O3Hnr%m}we3=tOdfFlg4
zhD>7Q5!NEeBO=0y0~COUZG>PFqJS__50UT?W)}*l#Izgp*ry}`AOLr_;Kp6;4w1Sp
negujGKp?_BKm;atc;LSQWg4{P2b{s^00000NkvXXu0mjf4loqX

literal 0
HcmV?d00001

diff --git a/python/ray/data/tests/image-folder/cat/not-an-image b/python/ray/data/tests/image-folder/cat/not-an-image
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/ray/data/tests/image-folder/dog/xxx.png b/python/ray/data/tests/image-folder/dog/xxx.png
new file mode 100644
index 0000000000000000000000000000000000000000..1613ff63ad408cb4e16cea0306b388add050c20c
GIT binary patch
literal 2370
zcmV-I3BC4-P)<h;3K|Lk000e1NJLTq001BW001Be0ssI2{21+{00007bV*G`2iON5
z7bGQ<+DJ+O00{O;L_t(Yi8Yp4b6i&u$IpK5s?ltcwRj|3v1P}yl!XLH0<H=V6nug3
z4S+93-k^AiNk~$afGKadNa977WE@!|jb=t8jb^#`p0#<n_&lHLKBxQt?_c9fi;Xyr
z5yliD3{bDvS65f9wa)u62qu#Wro=dlF{X@Om|w`U+*;f1b`io{Pypy09eDsrIRePC
z%vy^G!IWW)l~Qc3*=RPKoC|^gJTw}Ot5>gb&b8J;2xCl=rVgC<9soJ#gbczkJUKZ5
z07|gd>gm&G<MD_Q2Hrd8wAR*pLMcL!Wf_}m)E5^n2*EL;-g`=Ezu)Ja6G9kcgb+fA
zlu|1-oldne9#9xX0AQ^Zj511p_Wt{a?Zd(8sdtVLVvGUg0lfDx98zXf;jQ6;6hX+i
zV3cBv%jlF+06+lXJtI^wE`@X$PbQPQ_wFss&o5tCP(|KoG~zI{X`0T=SZfi+-WhAO
zF@}I=&Kkk_Y^5T@h%(6-MW~GF5ki0v0OVOFrL4`)YU?`vzBc->-DZ?BguuDVOa)<7
zsZ`RLnbXr#?>)hYQwrWwLRb(+mC9@~Gb2QR2`$Uwyyr3q;v}C=h9|us3@Vk`@iey{
z+U)}X->BD%EZ^Thh~rq>z4i5VA!Vo2ky0w90GzW%D_tl>-?@2XVQD!CB7`v^gmX?Q
zjg$E2zqqya?)&rebKT=Z#+d~eW%*3Cc45A0jh&lo20^%f?K<P)hbK>)bMuVRUbma3
zsrMdZf)Me5vRpAC1f!G?f)H}fdGEE><H^K2o1|&LX|-DY{`>FVxU#<a_PbG(2u5DK
z*vs?c&Ye4xEIT^x<f<sN>U271XT$mVdFLELh*BCwQKeELWoU#D#uy<K1OXvrG&=v`
z`yaNqw>zDqe*a{5XE%=H%a<-Q&H<B^OKWev`xE2g+3rrebF_4Ec{CkAe)5D;LMf$`
zdhg3FdhZG6T)aL~$}kM2lyMw?cKfrhzWnmx!-se8-Ytq^cV}lj9&4?Qb<Mf?#l@wq
zt+)Co{qc0V+-eO@PqQprTU#qNO$aFy7-L8h$59k9#sGj2QWOOMXswSs$BjmVQhIRk
zDoK<5{rx-ty7S+cFUMJ?jI|!7d4AM6o@CR5!}iYJ-qO-i7={QTLP#9PaU7><8ipYg
zQhM)|QV3zjxKbJ+R1|t+<HlQCn}c5W%GLGi?9A|d^zgy`!P)t*fAibc^2Nz`ytlg-
z$MLOOx8~>O>($zo)s@o!Wql~6gb;!;ef!PzD2^$m7-LGQ)+S97>;0v*wR){O7@T<U
z(`+mQxwpG>VSeuU)8{J}muHe>(C@vmcI6Mh|KsAq;>L}e1l&o#S8Al}GXPApjE%-)
z=bUp609)&dqCm5=jb?LcdHL(Fzkc-S(dtU;@bGALW%cN&Q?EC|ARe8c7kOT-R<B*Z
zvAVK48jTyZdOkkeeZE~jUkG8XwbnZ4h;t4A7-N(Y@4a)b*XxC0*lM++C>oC^T5C#~
z2Lyn$HXncdt5&Nu7z|##cv0j90E9u1rfDDpqjedyY^V3WC<;a?Ej3CA0YH^XMJW{o
zfz~<<!&|q0DFt6$X?^p}x2n(|e(+JN)$-oozkh#pe$E)v#w1B114$`O)6_brwJxR4
z7{eGd&N(K;Ip;kv#)S}!F(Cv(sANMFM&tAIzx?^H7cZ`?u3lD3J%0T7_U+q25We@*
z^CHg~<CL;63_U;)NY0to1^_tcQV7O5FP$tz5Cnk`f>O#DTUc1A)oKVK&bT%9lTUtk
zczB308I4AN`~35RgM%oFcXxL;H@6T%Q54i_HKo*KGO5*SQj#)(kdl7z-aFN5RR|FT
zp_H<;31PfGe<6%xOsNb)F6G=@(^)f}j6VDPi*Nt&Po*_swBBfLZf+(?97RzjO^3r_
z|KyZV7KI__3;^;xCna0>>r*OfGMRL{$CJqv0Ic`MT0$v8es^!@!Gi}`o>^<Vy<VCm
z+NhJkKxs8N8y>U|yWL(1KhC+75@XCt2b6=zdvC2p2pO%9kB*Wko=(PboWyC8rfH+u
zG~emvrG?JXp;mdbIk$ClgCN*``uL=GeDB`9Pe1*W(dKQfHr~AEtdUY$YZ*dNG5{ek
z#(3{BLLNbpX9(O;yDfrn{n~XxFu_C$acSjZKAl)&DwPTb9}3alfA!TDfB*M?9`y$U
z@BQOvPcE&t(l99B%1S08gaDBD4iI335%?mT7TE-YZ@=1KSX`>4@o+da#t?)yZd?~a
zYNfi}j(7I3{i@x5Wt?rTv?`U^XV0Eyc{W$C8DoSHOlxh7vDRYzx>gu@?~PIfA;IYW
z%a<2dFGs;jc@twul2l4TDl^mZ_<T5Ilo+i78OSiKHyh|VDJu~K!R+iT!x)u(D7%d@
z_TD@2)mjx{)a~`Uz3zB)J{^y&QPyaJX%t3MO3u0SV4U*^&W6L$G^=8=_k4!{Vw5Fu
z97Ykwm<b_E`bZflrLDCX<0y)5Y-~8^j=NpXx!`;<KA((7TI(cE!Y~Sg03n3D2STj(
zS&=*I2q26|C7$7w%ODU!5JFhVol>0$AxtoM0D!?@kmvc?*;&0_XN*!x`~4H=+{{d+
z<hJ*oGDa9HapJ7U7%8QMWMLGP3zF8FS!2p|j8X=G2qr*4DYd=5?Y)npC{9u#1f^^|
z9>sB#BuR-_LMg@sVN|VFtJUg{KmOQ1=^});wr<vIRizXF7-LAe`II9NV_g0NA!Low
zT64}XEG#T7ElsCW0BAOvjIk11j8e)dW2{=O-n@BpcD5n}+uXeI(MKOfQCK3g^Z|0t
zO{Y_Y2<QC&cJt1Gcisyjn$4zGy5H|bQ51w>$rH{wAixM=OoR}do11fU4G(T<afwpu
zjROy5VVF`X%QB@@Q7Fzu5J-#(AaLNFb3qV{hQofZzx(`|6tXA^1h~vWDMhb2NC;VN
zwE#TEM3+>s&KQ&DIU9{e<$%K&v)3IXgjnaCwS<sf=hy>iRg~Dk80UE|r38QyEaeu<
oC}oUt&M9SiQ5a*4F_X#Uf06rv)J1*L5dZ)H07*qoM6N<$g3e-d(EtDd

literal 0
HcmV?d00001

diff --git a/python/ray/data/tests/image-folder/not-an-image b/python/ray/data/tests/image-folder/not-an-image
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py
index 18c1fbbfafb5..f35281672522 100644
--- a/python/ray/data/tests/test_dataset_formats.py
+++ b/python/ray/data/tests/test_dataset_formats.py
@@ -25,6 +25,7 @@
     DefaultFileMetadataProvider,
     DefaultParquetMetadataProvider,
     FastFileMetadataProvider,
+    ImageFolderDatasource,
     PathPartitionFilter,
     PathPartitionEncoder,
     PartitionStyle,
@@ -2520,6 +2521,26 @@ def get_node_id():
     assert node_ids == {bar_node_id}
 
 
+def test_image_folder_datasource(ray_start_regular_shared):
+    root = os.path.join(__file__, "image-folder")
+    ds = ray.data.read_datasource(ImageFolderDatasource(), paths=[root])
+
+    assert ds.count() == 2
+
+    df = ds.to_pandas()
+    assert set(df["label"]) == {"cat", "dog"}
+    assert all(isinstance(array, np.ndarray) for array in df["image"])
+    assert all(array.shape == (32, 32, 3) for array in df["image"])
+
+
+def test_image_folder_datasource_raises_value_error(ray_start_regular_shared):
+    # `ImageFolderDatasource` should raise an error if more than one path is passed.
+    with pytest.raises(ValueError):
+        ray.data.read_datasource(
+            ImageFolderDatasource(), paths=["imagenet/train", "imagenet/test"]
+        )
+
+
 if __name__ == "__main__":
     import sys
 

From 083c9bc8d7b01be1ab22d2341e5e727a16d86527 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Wed, 11 May 2022 01:50:20 -0700
Subject: [PATCH 03/23] Fix test root path, fill in doctest example, drop unused imports

---
 .../datasource/image_folder_datasource.py     | 22 ++++++++++++++-----
 python/ray/data/tests/test_dataset_formats.py |  2 +-
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
index 49b02918d7b4..a752d2014844 100644
--- a/python/ray/data/datasource/image_folder_datasource.py
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -1,9 +1,8 @@
 import pathlib
-from typing import Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
 import imageio as iio
 import numpy as np
-from pyarrow.fs import FileSelector, FileType
 from ray.data.block import Block
 from ray.data.datasource.binary_datasource import BinaryDatasource
 from ray.data.datasource.datasource import ReadTask
@@ -15,6 +14,9 @@
 )
 from ray.data.datasource.partitioning import PathPartitionFilter
 
+if TYPE_CHECKING:
+    import pyarrow
+
 IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"]
 
 
@@ -33,15 +35,23 @@ class ImageFolderDatasource(BinaryDatasource):
     root/cat/[...]/asd932_.png
     ```
 
+    Datasets read with `ImageFolderDatasource` contain two columns: 'image' and
+    'label'. The 'image' column contains `ndarray`s of shape (H, W, C), and the
+    `label` column contain strings corresponding to class.
+
     Examples:
         >>> import ray
         >>> from ray.data.datasource import ImageFolderDatasource
         >>>
-        >>> ds = ray.data.read_datasource(
+        >>> ds = ray.data.read_datasource(  # doctest: +SKIP
         ...     ImageFolderDatasource(),
         ...     paths=["s3://tiny-imagenet/train"]
         ... )
-        >>> TODO
+        >>> sample = ds.take(1)[0]  # doctest: +SKIP
+        >>> sample["image"].shape  # doctest: +SKIP
+        (469, 387, 3)
+        >>> sample["label"]  # doctest: +SKIP
+        'n01443537'
 
     Raises:
         ValueError: if more than one path is provided. You should only provide the path
@@ -70,14 +80,14 @@ def prepare_read(
             )
 
         try:
-            import imageio
+            import imageio  # noqa: F401
         except ImportError:
             raise ValueError(
                 "`ImageFolderDatasource` depends on 'imageio', but 'imageio' couldn't "
                 "be imported. You can install 'imageio' by running "
                 "`pip install imageio`."
             )
-            
+
         # We call `_resolve_paths_and_filesystem` so that the dataset root is formatted
         # in the same way as the paths passed to `_get_class_from_path`.
         paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py
index f35281672522..a3f31d508f2b 100644
--- a/python/ray/data/tests/test_dataset_formats.py
+++ b/python/ray/data/tests/test_dataset_formats.py
@@ -2522,7 +2522,7 @@ def get_node_id():
 
 
 def test_image_folder_datasource(ray_start_regular_shared):
-    root = os.path.join(__file__, "image-folder")
+    root = os.path.join(os.path.dirname(__file__), "image-folder")
     ds = ray.data.read_datasource(ImageFolderDatasource(), paths=[root])
 
     assert ds.count() == 2

From 4f0b8cec1d4fd27af3df15da7270dac87d5a91d1 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Wed, 11 May 2022 01:51:11 -0700
Subject: [PATCH 04/23] Rename test image dog/xxx.png to dog/xyz.PnG

---
 .../tests/image-folder/dog/{xxx.png => xyz.PnG}     | Bin
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename python/ray/data/tests/image-folder/dog/{xxx.png => xyz.PnG} (100%)

diff --git a/python/ray/data/tests/image-folder/dog/xxx.png b/python/ray/data/tests/image-folder/dog/xyz.PnG
similarity index 100%
rename from python/ray/data/tests/image-folder/dog/xxx.png
rename to python/ray/data/tests/image-folder/dog/xyz.PnG

From cd6e25e7c10bda0bb1f97642077c4b1244f95c98 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Wed, 11 May 2022 01:51:38 -0700
Subject: [PATCH 05/23] Rename test image dog/xyz.PnG to dog/xxx.PNG

---
 .../tests/image-folder/dog/{xyz.PnG => xxx.PNG}     | Bin
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename python/ray/data/tests/image-folder/dog/{xyz.PnG => xxx.PNG} (100%)

diff --git a/python/ray/data/tests/image-folder/dog/xyz.PnG b/python/ray/data/tests/image-folder/dog/xxx.PNG
similarity index 100%
rename from python/ray/data/tests/image-folder/dog/xyz.PnG
rename to python/ray/data/tests/image-folder/dog/xxx.PNG

From 98b07f7e3e72a07a3a2942bf2f121e6b9079b402 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Wed, 11 May 2022 01:53:39 -0700
Subject: [PATCH 06/23] Fix grammar in ImageFolderDatasource docstring

---
 python/ray/data/datasource/image_folder_datasource.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
index a752d2014844..1f6204e35019 100644
--- a/python/ray/data/datasource/image_folder_datasource.py
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -37,7 +37,7 @@ class ImageFolderDatasource(BinaryDatasource):
 
     Datasets read with `ImageFolderDatasource` contain two columns: 'image' and
     'label'. The 'image' column contains `ndarray`s of shape (H, W, C), and the
-    `label` column contain strings corresponding to class.
+    `label` column contains strings corresponding to class.
 
     Examples:
         >>> import ray

From 6f7b2eb8431447716e04319c459e226283773840 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Wed, 11 May 2022 02:15:24 -0700
Subject: [PATCH 07/23] Update docs

---
 doc/Makefile                                  |  2 +-
 doc/source/data/package-ref.rst               |  2 ++
 .../datasource/image_folder_datasource.py     | 28 +++++++++----------
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/doc/Makefile b/doc/Makefile
index 0b4fab525153..a38073ab911b 100644
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -52,7 +52,7 @@ clean:
 	rm -rf $(BUILDDIR)/*
 
 html:
-	$(SPHINXBUILD) -W -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 	@echo
 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 
diff --git a/doc/source/data/package-ref.rst b/doc/source/data/package-ref.rst
index bc769b36d0d3..0c1fc446a9ce 100644
--- a/doc/source/data/package-ref.rst
+++ b/doc/source/data/package-ref.rst
@@ -135,6 +135,8 @@ Built-in Datasources
 .. autoclass:: ray.data.datasource.FileBasedDatasource
     :members:
 
+.. autoclass:: ray.data.datasource.ImageFolderDatasource
+
 .. autoclass:: ray.data.datasource.JSONDatasource
     :members:
 
diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
index 1f6204e35019..03f9bfa86b0d 100644
--- a/python/ray/data/datasource/image_folder_datasource.py
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -21,23 +21,23 @@
 
 
 class ImageFolderDatasource(BinaryDatasource):
-    """A datasource that allows you to load datasets like ImageNet.
+    """A datasource that lets you read datasets like `ImageNet <https://www.image-net.org/>`_.
 
-    This datasource works with any dataset where the images are arranged in this way:
+    This datasource works with any dataset where images are arranged in this way:
 
-    ```
-    root/dog/xxx.png
-    root/dog/xxy.png
-    root/dog/[...]/xxz.png
+    .. code-block::
 
-    root/cat/123.png
-    root/cat/nsdf3.png
-    root/cat/[...]/asd932_.png
-    ```
+        root/dog/xxx.png
+        root/dog/xxy.png
+        root/dog/[...]/xxz.png
 
-    Datasets read with `ImageFolderDatasource` contain two columns: 'image' and
-    'label'. The 'image' column contains `ndarray`s of shape (H, W, C), and the
-    `label` column contains strings corresponding to class.
+        root/cat/123.png
+        root/cat/nsdf3.png
+        root/cat/[...]/asd932_.png
+
+    Datasets read with ``ImageFolderDatasource`` contain two columns: ``'image'`` and
+    ``'label'``. The ``'image'`` column contains ``ndarray`` objects of shape 
+    :math:`(H, W, C)`, and the ``label`` column contains strings corresponding to labels.
 
     Examples:
         >>> import ray
@@ -45,7 +45,7 @@ class ImageFolderDatasource(BinaryDatasource):
         >>>
         >>> ds = ray.data.read_datasource(  # doctest: +SKIP
         ...     ImageFolderDatasource(),
-        ...     paths=["s3://tiny-imagenet/train"]
+        ...     paths=["/data/imagenet/train"]
         ... )
         >>> sample = ds.take(1)[0]  # doctest: +SKIP
         >>> sample["image"].shape  # doctest: +SKIP

From db05d3583be5a2cb2c7f4c592f865843938932ef Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Wed, 11 May 2022 02:19:34 -0700
Subject: [PATCH 08/23] Remove skipped-path-expansion warning from file_meta_provider.py

---
 python/ray/data/datasource/file_meta_provider.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/python/ray/data/datasource/file_meta_provider.py b/python/ray/data/datasource/file_meta_provider.py
index ad1bdf62d104..9e03d46630d2 100644
--- a/python/ray/data/datasource/file_meta_provider.py
+++ b/python/ray/data/datasource/file_meta_provider.py
@@ -196,11 +196,6 @@ def expand_paths(
         paths: List[str],
         filesystem: "pyarrow.fs.FileSystem",
     ) -> Tuple[List[str], List[Optional[int]]]:
-        logger.warning(
-            f"Skipping expansion of {len(paths)} path(s). If your paths contain "
-            f"directories or if file size collection is required, try rerunning this "
-            f"read with `meta_provider=DefaultFileMetadataProvider()`."
-        )
         import numpy as np
 
         return paths, np.empty(len(paths), dtype=object)

From 52ae3c8b31d72d41473e6123bf468a4134baa261 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Wed, 11 May 2022 02:27:34 -0700
Subject: [PATCH 09/23] Wrap overlong line in ImageFolderDatasource docstring

---
 python/ray/data/datasource/image_folder_datasource.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
index 03f9bfa86b0d..e25ec84b4641 100644
--- a/python/ray/data/datasource/image_folder_datasource.py
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -37,7 +37,8 @@ class ImageFolderDatasource(BinaryDatasource):
 
     Datasets read with ``ImageFolderDatasource`` contain two columns: ``'image'`` and
     ``'label'``. The ``'image'`` column contains ``ndarray`` objects of shape 
-    :math:`(H, W, C)`, and the ``label`` column contains strings corresponding to labels.
+    :math:`(H, W, C)`, and the ``label`` column contains strings corresponding to
+    labels.
 
     Examples:
         >>> import ray

From 813f5de94969b87e4931f225bf9e55f2d81c8d70 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Wed, 11 May 2022 02:28:31 -0700
Subject: [PATCH 10/23] Restore -W flag for the docs html build in doc/Makefile

---
 doc/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/Makefile b/doc/Makefile
index a38073ab911b..0b4fab525153 100644
--- a/doc/Makefile
+++ b/doc/Makefile
@@ -52,7 +52,7 @@ clean:
 	rm -rf $(BUILDDIR)/*
 
 html:
-	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	$(SPHINXBUILD) -W -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 	@echo
 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 

From d68fe1d26181b2295de1244d862d5108f138a6c6 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Wed, 18 May 2022 01:58:52 -0700
Subject: [PATCH 11/23] Update
 python/ray/data/datasource/image_folder_datasource.py

Co-authored-by: matthewdeng <matthew.j.deng@gmail.com>
---
 python/ray/data/datasource/image_folder_datasource.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
index e25ec84b4641..a8bdfaa33fbf 100644
--- a/python/ray/data/datasource/image_folder_datasource.py
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -72,7 +72,7 @@ def prepare_read(
         _block_udf: Optional[Callable[[Block], Block]] = None,
         **reader_args,
     ) -> List[ReadTask]:
-        if len(paths) > 1:
+        if len(paths) != 1:
             raise ValueError(
                 "`ImageFolderDatasource` expects 1 path representing the dataset "
                 f"root, but it got {len(paths)} paths instead. To fix this error, "

From 45989c2715bd929b931b6ff4846e66d9881e8959 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Wed, 18 May 2022 02:09:32 -0700
Subject: [PATCH 12/23] Re-add warning

---
 python/ray/data/datasource/file_meta_provider.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/ray/data/datasource/file_meta_provider.py b/python/ray/data/datasource/file_meta_provider.py
index 9e03d46630d2..ad1bdf62d104 100644
--- a/python/ray/data/datasource/file_meta_provider.py
+++ b/python/ray/data/datasource/file_meta_provider.py
@@ -196,6 +196,11 @@ def expand_paths(
         paths: List[str],
         filesystem: "pyarrow.fs.FileSystem",
     ) -> Tuple[List[str], List[Optional[int]]]:
+        logger.warning(
+            f"Skipping expansion of {len(paths)} path(s). If your paths contain "
+            f"directories or if file size collection is required, try rerunning this "
+            f"read with `meta_provider=DefaultFileMetadataProvider()`."
+        )
         import numpy as np
 
         return paths, np.empty(len(paths), dtype=object)

From 2f95235afb09539459ac9511e038646566516392 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Thu, 14 Jul 2022 16:50:54 -0700
Subject: [PATCH 13/23] Update implementation

---
 .../datasource/image_folder_datasource.py     |  78 +++++++++---------
 .../ray/data/tests/image-folder/cat/foo.jpg   | Bin 0 -> 923 bytes
 python/ray/data/tests/test_dataset_formats.py |  13 ++-
 3 files changed, 49 insertions(+), 42 deletions(-)
 create mode 100644 python/ray/data/tests/image-folder/cat/foo.jpg

diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
index a8bdfaa33fbf..169a51e45bd1 100644
--- a/python/ray/data/datasource/image_folder_datasource.py
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -1,27 +1,24 @@
 import pathlib
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union
 
-import imageio as iio
 import numpy as np
-from ray.data.block import Block
 from ray.data.datasource.binary_datasource import BinaryDatasource
-from ray.data.datasource.datasource import ReadTask
-from ray.data.datasource.file_based_datasource import _resolve_paths_and_filesystem
-from ray.data.datasource.file_meta_provider import (
-    BaseFileMetadataProvider,
-    DefaultFileMetadataProvider,
-    FastFileMetadataProvider,
+from ray.data.datasource.datasource import Reader
+from ray.data.datasource.file_based_datasource import (
+    _resolve_paths_and_filesystem,
+    FileExtensionFilter,
 )
 from ray.data.datasource.partitioning import PathPartitionFilter
 
 if TYPE_CHECKING:
     import pyarrow
+    from ray.data.block import T
 
-IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"]
+IMAGE_EXTENSIONS = ["png", "jpg", "jpeg", "tiff", "bmp", "gif"]
 
 
 class ImageFolderDatasource(BinaryDatasource):
-    """A datasource that lets you read datasets like `ImageNet <https://www.image-net.org/>`_.
+    """A datasource that lets you read datasets like `ImageNet <https://www.image-net.org/>`_.  # noqa: E501
 
     This datasource works with any dataset where images are arranged in this way:
 
@@ -36,7 +33,7 @@ class ImageFolderDatasource(BinaryDatasource):
         root/cat/[...]/asd932_.png
 
     Datasets read with ``ImageFolderDatasource`` contain two columns: ``'image'`` and
-    ``'label'``. The ``'image'`` column contains ``ndarray`` objects of shape 
+    ``'label'``. The ``'image'`` column contains ``ndarray`` objects of shape
     :math:`(H, W, C)`, and the ``label`` column contains strings corresponding to
     labels.
 
@@ -59,25 +56,19 @@ class ImageFolderDatasource(BinaryDatasource):
             to the dataset root.
     """
 
-    def prepare_read(
+    def create_reader(
         self,
-        parallelism: int,
         paths: Union[str, List[str]],
         filesystem: Optional["pyarrow.fs.FileSystem"] = None,
-        schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
-        open_stream_args: Optional[Dict[str, Any]] = None,
-        meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
         partition_filter: PathPartitionFilter = None,
-        # TODO(ekl) deprecate this once read fusion is available.
-        _block_udf: Optional[Callable[[Block], Block]] = None,
-        **reader_args,
-    ) -> List[ReadTask]:
+        **kwargs,
+    ) -> "Reader[T]":
         if len(paths) != 1:
             raise ValueError(
                 "`ImageFolderDatasource` expects 1 path representing the dataset "
-                f"root, but it got {len(paths)} paths instead. To fix this error, "
-                "pass in a single-element list containing the dataset root (for "
-                'example, `paths=["s3://imagenet/train"]`)'
+                f"root, but it got {len(paths)} paths instead. To fix this "
+                "error, pass in a single-element list containing the dataset root "
+                '(for example, `paths=["s3://imagenet/train"]`)'
             )
 
         try:
@@ -89,27 +80,35 @@ def prepare_read(
                 "`pip install imageio`."
             )
 
+        if partition_filter is None:
+            partition_filter = FileExtensionFilter(file_extensions=IMAGE_EXTENSIONS)
+
         # We call `_resolve_paths_and_filesystem` so that the dataset root is formatted
         # in the same way as the paths passed to `_get_class_from_path`.
         paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
         self.root = paths[0]
 
-        paths, _ = meta_provider.expand_paths(paths, filesystem)
-        paths = [path for path in paths if _is_image_file(path)]
+        from pyarrow.fs import FileType, FileSelector
+
+        labels = [
+            file_info.base_name
+            for file_info in filesystem.get_file_info(FileSelector(self.root))
+            if file_info.type is FileType.Directory
+        ]
+        labels.sort()  # Sort labels so that targets are consistent.
+        self.label_to_target = {label: labels.index(label) for label in labels}
 
-        return super().prepare_read(
-            parallelism=parallelism,
+        return super().create_reader(
             paths=paths,
             filesystem=filesystem,
-            schema=schema,
-            open_stream_args=open_stream_args,
-            meta_provider=FastFileMetadataProvider(),
             partition_filter=partition_filter,
-            _block_udf=_block_udf,
+            **kwargs,
         )
 
     def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args):
+        import imageio as iio
         import pandas as pd
+        from ray.data.extensions import TensorArray
 
         records = super()._read_file(f, path, include_paths=True)
         assert len(records) == 1
@@ -117,12 +116,15 @@ def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args):
 
         image = iio.imread(data)
         label = _get_class_from_path(path, self.root)
-
-        return pd.DataFrame({"image": [np.array(image)], "label": [label]})
-
-
-def _is_image_file(path: str) -> bool:
-    return any(path.lower().endswith(extension) for extension in IMAGE_EXTENSIONS)
+        target = self.label_to_target[label]
+
+        return pd.DataFrame(
+            {
+                "image": TensorArray([np.array(image)]),
+                "label": [label],
+                "target": [target],
+            }
+        )
 
 
 def _get_class_from_path(path: str, root: str) -> str:
diff --git a/python/ray/data/tests/image-folder/cat/foo.jpg b/python/ray/data/tests/image-folder/cat/foo.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9df96a2de1ff6fdb3f1fdc7c1cee07bc6e49daff
GIT binary patch
literal 923
zcmex=<NpH&0WUXCHwH#VMur3+WcdG&!P+^<)iK0B$VwqSMK`M;HC>_1P|rX?qqI0P
zFI~aY%U!`Mz|~!$%)&rZN1?DZF(<R6Qo&Zi%#`c@Z3brsZVnDE4t8!XPA*;^ZeBqd
zAwhnAK_y8k5gAPtZ7mHIb#+}M2XkEmTLX1<3m;2cXIBqT4{bC5P(Qa22X_xwkRgmb
zyu5<^f{H>yimrO<dafjc{|6WZIT#ce6qp&67?=bZnFSgDA7PMZU|?hgdKKhbC}3n_
zW?^Mx=iubx1}fMpz`(@F%*@2X%*x8b0#scKlxJWOWEE00bYv3_Ok`Io6ftU?xR68H
zY2!iBpo<?=jFXC*IJv~cB_yR()zmdKwM<OS%q=XfoLyYq+&w(Kf<r>X!XqN1l2cOC
z(lau%ic3n%$}1|Xnp;}i+B-VCCQY6)b=ve9GiNPYykzOJ<ttXM+O&Dg)@|E&?A&$e
z@R6g(j-NPr>eA&aSFc^aar4&0M~|O8efIpt%U2&ieg5+G+xH(oe}VkP$iNKo7TjlO
z{t^WGi;0DWnS~wXFGi+vAZ8Y1VO2C_6LJh>Pb?HxGHT=yahkYr<3Ubk<Dd_sNktdA
z#8gZks(u7{4eT@GJk~^(&)^<I`0Ew}4>Kb$@|Xn~>={b-^Sx1<Uob0T(eXPy48>pn
zGrSIp{$p`D=`+h^U!#o=WO(W<{xfiGU0w0$*tLs&b7${7??16gf2X?WvD1%h%RI}u
zH#FOqD#!3&TJ2P7zRphk=C;rI)ix>pQ!IEG|7!i4`(SJ1#jl)4ep@N6WBm1>q1APH
z#miaO1gq2D8=k0Ce&}wKQg*hiy4GE-+jYuPjW;P-w;ZmR+|*6_wDw4g&}j*khT@*W
zm}zq#{INZ7MR|(Ejmn(WpSD|`-!GH2SAy}{<jI%WMc%lcE4ul1>D93NZ#bCMzD!Oy
foV@)?xQjX8mZ!af+qG7x_f6Srbzy4b{QoxrB28ZI

literal 0
HcmV?d00001

diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py
index aca805d13662..894565591194 100644
--- a/python/ray/data/tests/test_dataset_formats.py
+++ b/python/ray/data/tests/test_dataset_formats.py
@@ -42,6 +42,7 @@
     _deserialize_pieces_with_retry,
 )
 from ray.data.tests.conftest import *  # noqa
+from ray.data.extensions import TensorDtype
 from ray.tests.conftest import *  # noqa
 from ray.types import ObjectRef
 
@@ -2858,12 +2859,16 @@ def test_image_folder_datasource(ray_start_regular_shared):
     root = os.path.join(os.path.dirname(__file__), "image-folder")
     ds = ray.data.read_datasource(ImageFolderDatasource(), paths=[root])
 
-    assert ds.count() == 2
+    assert ds.count() == 3
 
     df = ds.to_pandas()
-    assert set(df["label"]) == {"cat", "dog"}
-    assert all(isinstance(array, np.ndarray) for array in df["image"])
-    assert all(array.shape == (32, 32, 3) for array in df["image"])
+    assert type(df["image"].dtype) is TensorDtype
+    assert all(tensor.to_numpy().shape == (32, 32, 3) for tensor in df["image"])
+
+    df = df.sort_values("label")
+    # Targets should be assigned alphabetically to labels.
+    assert df["label"].tolist() == ["cat", "cat", "dog"]
+    assert df["target"].tolist() == [0, 0, 1]
 
 
 def test_image_folder_datasource_raises_value_error(ray_start_regular_shared):

From 3c9e3c03977b0c8261a08e05a0ac338edd52492c Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Thu, 14 Jul 2022 17:05:12 -0700
Subject: [PATCH 14/23] Add read API

---
 python/ray/data/__init__.py                   |  1 +
 .../datasource/image_folder_datasource.py     | 38 +---------------
 python/ray/data/read_api.py                   | 45 +++++++++++++++++++
 python/ray/data/tests/test_dataset_formats.py | 13 +-----
 4 files changed, 49 insertions(+), 48 deletions(-)

diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py
index d9770fc099f6..295503eb0f4c 100644
--- a/python/ray/data/__init__.py
+++ b/python/ray/data/__init__.py
@@ -33,6 +33,7 @@
     read_parquet,
     read_parquet_bulk,
     read_text,
+    read_image_folder,
 )
 
 # Register custom Arrow JSON ReadOptions serializer after worker has initialized.
diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
index 169a51e45bd1..2233c6b73d68 100644
--- a/python/ray/data/datasource/image_folder_datasource.py
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -18,43 +18,7 @@
 
 
 class ImageFolderDatasource(BinaryDatasource):
-    """A datasource that lets you read datasets like `ImageNet <https://www.image-net.org/>`_.  # noqa: E501
-
-    This datasource works with any dataset where images are arranged in this way:
-
-    .. code-block::
-
-        root/dog/xxx.png
-        root/dog/xxy.png
-        root/dog/[...]/xxz.png
-
-        root/cat/123.png
-        root/cat/nsdf3.png
-        root/cat/[...]/asd932_.png
-
-    Datasets read with ``ImageFolderDatasource`` contain two columns: ``'image'`` and
-    ``'label'``. The ``'image'`` column contains ``ndarray`` objects of shape
-    :math:`(H, W, C)`, and the ``label`` column contains strings corresponding to
-    labels.
-
-    Examples:
-        >>> import ray
-        >>> from ray.data.datasource import ImageFolderDatasource
-        >>>
-        >>> ds = ray.data.read_datasource(  # doctest: +SKIP
-        ...     ImageFolderDatasource(),
-        ...     paths=["/data/imagenet/train"]
-        ... )
-        >>> sample = ds.take(1)[0]  # doctest: +SKIP
-        >>> sample["image"].shape  # doctest: +SKIP
-        (469, 387, 3)
-        >>> sample["label"]  # doctest: +SKIP
-        'n01443537'
-
-    Raises:
-        ValueError: if more than one path is provided. You should only provide the path
-            to the dataset root.
-    """
+    """A datasource that lets you read datasets like `ImageNet <https://www.image-net.org/>`_."""  # noqa: E501
 
     def create_reader(
         self,
diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py
index 3b6d5118a1e9..b4b0615017bd 100644
--- a/python/ray/data/read_api.py
+++ b/python/ray/data/read_api.py
@@ -24,6 +24,7 @@
     DefaultFileMetadataProvider,
     DefaultParquetMetadataProvider,
     FastFileMetadataProvider,
+    ImageFolderDatasource,
     JSONDatasource,
     NumpyDatasource,
     ParquetBaseDatasource,
@@ -1069,6 +1070,50 @@ def convert(ds: "datasets.Dataset") -> Dataset[ArrowRow]:
         )
 
 
+@PublicAPI
+def read_image_folder(root: str, *, parallelism: int = -1):
+    """Read a dataset structured like `ImageNet <https://www.image-net.org/>`_.
+
+    This function works with any dataset where images are arranged in this way:
+
+    .. code-block::
+
+        root/dog/xxx.png
+        root/dog/xxy.png
+        root/dog/[...]/xxz.png
+
+        root/cat/123.png
+        root/cat/nsdf3.png
+        root/cat/[...]/asd932_.png
+
+    Datasets read with this function contain three columns: ``'image'``, ``'label'`` and
+    ``'target'``.
+
+    * The ``'image'`` column contains ``ndarray`` objects of shape :math:`(H, W, C)`
+    * The ``'label'`` column contains strings representing class names.
+    * The ``'target'`` column contain integer targets corresponding to class.
+
+    Arguments:
+        path: Path to the directory root.
+        parallelism: The user-requested parallelism, or -1 for autodetection.
+
+    Examples:
+        >>> import ray
+        >>>
+        >>> ds = ray.data.read_image_folder("/data/imagenet/train")
+        >>> sample = ds.take(1)[0]  # doctest: +SKIP
+        >>> sample["image"].shape  # doctest: +SKIP
+        (469, 387, 3)
+        >>> sample["label"]  # doctest: +SKIP
+        'n01443537'
+        >>> sample["target]  # doctest: +SKIP
+        71
+    """
+    return read_datasource(
+        ImageFolderDatasource(), paths=[root], parallelism=parallelism
+    )
+
+
 def _df_to_block(df: "pandas.DataFrame") -> Block[ArrowRow]:
     stats = BlockExecStats.builder()
     import pyarrow as pa
diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py
index 894565591194..7b087842ab82 100644
--- a/python/ray/data/tests/test_dataset_formats.py
+++ b/python/ray/data/tests/test_dataset_formats.py
@@ -26,7 +26,6 @@
     DefaultParquetMetadataProvider,
     DummyOutputDatasource,
     FastFileMetadataProvider,
-    ImageFolderDatasource,
     PartitionStyle,
     PathPartitionEncoder,
     PathPartitionFilter,
@@ -2855,9 +2854,9 @@ def get_node_id():
     assert node_ids == {bar_node_id}
 
 
-def test_image_folder_datasource(ray_start_regular_shared):
+def test_read_image_folder(ray_start_regular_shared):
     root = os.path.join(os.path.dirname(__file__), "image-folder")
-    ds = ray.data.read_datasource(ImageFolderDatasource(), paths=[root])
+    ds = ray.data.read_image_folder(root)
 
     assert ds.count() == 3
 
@@ -2871,14 +2870,6 @@ def test_image_folder_datasource(ray_start_regular_shared):
     assert df["target"].tolist() == [0, 0, 1]
 
 
-def test_image_folder_datasource_raises_value_error(ray_start_regular_shared):
-    # `ImageFolderDatasource` should raise an error if more than one path is passed.
-    with pytest.raises(ValueError):
-        ray.data.read_datasource(
-            ImageFolderDatasource(), paths=["imagenet/train", "imagenet/test"]
-        )
-
-
 def test_read_text_remote_args(ray_start_cluster, tmp_path):
     cluster = ray_start_cluster
     cluster.add_node(

From cdaca5991162e452dd2b5ba5d735d9815b793eb8 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Thu, 14 Jul 2022 17:37:14 -0700
Subject: [PATCH 15/23] Fix error in documentation

---
 python/ray/data/read_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py
index ab34c8ffc09b..79b0415119fb 100644
--- a/python/ray/data/read_api.py
+++ b/python/ray/data/read_api.py
@@ -1093,7 +1093,7 @@ def read_image_folder(root: str, *, parallelism: int = -1):
     Datasets read with this function contain three columns: ``'image'``, ``'label'`` and
     ``'target'``.
 
-    * The ``'image'`` column contains ``ndarray`` objects of shape :math:`(H, W, C)`
+    * The ``'image'`` column is of type ``TensorDtype`` and contains tensors of shape :math:`(H, W, C)`.
     * The ``'label'`` column contains strings representing class names.
     * The ``'target'`` column contain integer targets corresponding to class.
 

From 7c0a31754495b5e86d82d1e70f9841c543e0d076 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Fri, 15 Jul 2022 11:16:02 -0700
Subject: [PATCH 16/23] Update documentation and add test

---
 doc/source/data/package-ref.rst               |  1 +
 python/ray/data/read_api.py                   | 28 +++++++++++----
 python/ray/data/tests/test_dataset_formats.py | 34 +++++++++++++++++++
 3 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/doc/source/data/package-ref.rst b/doc/source/data/package-ref.rst
index 39e0c9c6f3e6..c5606fb8f444 100644
--- a/doc/source/data/package-ref.rst
+++ b/doc/source/data/package-ref.rst
@@ -16,6 +16,7 @@ Creating Datasets
 .. autofunction:: ray.data.read_numpy
 .. autofunction:: ray.data.read_text
 .. autofunction:: ray.data.read_binary_files
+.. autofunction:: ray.data.read_image_folder
 .. autofunction:: ray.data.read_datasource
 .. autofunction:: ray.data.from_items
 .. autofunction:: ray.data.from_arrow
diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py
index 79b0415119fb..3c962f0fc2b0 100644
--- a/python/ray/data/read_api.py
+++ b/python/ray/data/read_api.py
@@ -1090,12 +1090,12 @@ def read_image_folder(root: str, *, parallelism: int = -1):
         root/cat/nsdf3.png
         root/cat/[...]/asd932_.png
 
-    Datasets read with this function contain three columns: ``'image'``, ``'label'`` and
-    ``'target'``.
+    Datasets read with this function contain two columns: ``'image'`` and ``'label'``.
 
-    * The ``'image'`` column is of type ``TensorDtype`` and contains tensors of shape :math:`(H, W, C)`.
-    * The ``'label'`` column contains strings representing class names.
-    * The ``'target'`` column contain integer targets corresponding to class.
+    * The ``'image'`` column is of type
+      :py:class:`~ray.air.util.tensor_extensions.pandas.TensorDtype` and contains
+      tensors of shape :math:`(H, W, C)`.
+    * The ``'label'`` column contains strings representing class names (e.g., 'cat').
 
     Arguments:
         path: Path to the directory root.
@@ -1105,12 +1105,26 @@ def read_image_folder(root: str, *, parallelism: int = -1):
         >>> import ray
         >>>
         >>> ds = ray.data.read_image_folder("/data/imagenet/train")
+        >>>
         >>> sample = ds.take(1)[0]  # doctest: +SKIP
-        >>> sample["image"].shape  # doctest: +SKIP
+        >>> sample["image"].to_numpy().shape  # doctest: +SKIP
         (469, 387, 3)
         >>> sample["label"]  # doctest: +SKIP
         'n01443537'
-        >>> sample["target]  # doctest: +SKIP
+
+        To convert class labels to integer-valued targets, use
+        :py:class:`~ray.data.preprocessors.OrdinalEncoder`.
+
+        >>> import ray
+        >>> from ray.data.preprocessors import OrdinalEncoder
+        >>>
+        >>> ds = ray.data.read_image_folder("/data/imagenet/train")
+        >>> oe = OrdinalEncoder(columns=["label"])
+        >>>
+        >>> ds = oe.fit_transform(ds)
+        >>>
+        >>> sample = ds.take(1)[0]
+        >>> sample["label"]
         71
     """
     return read_datasource(
diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py
index 990a6de651cf..13e54618b351 100644
--- a/python/ray/data/tests/test_dataset_formats.py
+++ b/python/ray/data/tests/test_dataset_formats.py
@@ -40,6 +40,7 @@
     _SerializedPiece,
     _deserialize_pieces_with_retry,
 )
+from ray.data.preprocessors import BatchMapper
 from ray.data.tests.conftest import *  # noqa
 from ray.data.extensions import TensorDtype
 from ray.tests.conftest import *  # noqa
@@ -2871,6 +2872,39 @@ def test_read_image_folder(ray_start_regular_shared):
     assert df["target"].tolist() == [0, 0, 1]
 
 
+def test_read_image_folder_e2e(ray_start_regular_shared):
+    from ray.air.util.tensor_extensions.pandas import TensorArray
+    from ray.train.torch import to_air_checkpoint, TorchPredictor
+    from ray.train.batch_predictor import BatchPredictor
+
+    from torchvision import transforms
+    from torchvision.models import resnet18
+
+    dataset = ray.data.read_image_folder("image-folder")
+
+    def preprocess(df):
+        # We convert the `TensorArrayElement` to a NumPy array because `ToTensor`
+        # expects a NumPy array or PIL image. `ToTensor` is necessary because Torch
+        # expects images to have shape (C, H, W), and `ToTensor` changes the shape of
+        # the data from (H, W, C) to (C, H, W).
+        preprocess = transforms.Compose(
+            [
+                lambda ray_tensor: ray_tensor.to_numpy(),
+                transforms.ToTensor(),
+            ]
+        )
+        df["image"] = TensorArray([preprocess(image) for image in df["image"]])
+        return df
+
+    preprocessor = BatchMapper(preprocess)
+
+    model = resnet18(pretrained=True)
+    checkpoint = to_air_checkpoint(model=model, preprocessor=preprocessor)
+
+    predictor = BatchPredictor.from_checkpoint(checkpoint, TorchPredictor)
+    predictor.predict(dataset, feature_columns=["image"])
+
+
 def test_read_text_remote_args(ray_start_cluster, tmp_path):
     cluster = ray_start_cluster
     cluster.add_node(

From e07d1f8570ecdfdcd5199769d2ee97d743bcedc5 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Fri, 15 Jul 2022 11:22:41 -0700
Subject: [PATCH 17/23] Remove `target` column

---
 .../ray/data/datasource/image_folder_datasource.py   | 12 ------------
 python/ray/data/tests/test_dataset_formats.py        |  6 +-----
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
index 2233c6b73d68..52c27de16ac9 100644
--- a/python/ray/data/datasource/image_folder_datasource.py
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -52,16 +52,6 @@ def create_reader(
         paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
         self.root = paths[0]
 
-        from pyarrow.fs import FileType, FileSelector
-
-        labels = [
-            file_info.base_name
-            for file_info in filesystem.get_file_info(FileSelector(self.root))
-            if file_info.type is FileType.Directory
-        ]
-        labels.sort()  # Sort labels so that targets are consistent.
-        self.label_to_target = {label: labels.index(label) for label in labels}
-
         return super().create_reader(
             paths=paths,
             filesystem=filesystem,
@@ -80,13 +70,11 @@ def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args):
 
         image = iio.imread(data)
         label = _get_class_from_path(path, self.root)
-        target = self.label_to_target[label]
 
         return pd.DataFrame(
             {
                 "image": TensorArray([np.array(image)]),
                 "label": [label],
-                "target": [target],
             }
         )
 
diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py
index 13e54618b351..3e02a424559b 100644
--- a/python/ray/data/tests/test_dataset_formats.py
+++ b/python/ray/data/tests/test_dataset_formats.py
@@ -2863,14 +2863,10 @@ def test_read_image_folder(ray_start_regular_shared):
     assert ds.count() == 3
 
     df = ds.to_pandas()
+    assert sorted(df["label"]) == ["cat", "cat", "dog"]
     assert type(df["image"].dtype) is TensorDtype
     assert all(tensor.to_numpy().shape == (32, 32, 3) for tensor in df["image"])
 
-    df = df.sort_values("label")
-    # Targets should be assigned alphabetically to labels.
-    assert df["label"].tolist() == ["cat", "cat", "dog"]
-    assert df["target"].tolist() == [0, 0, 1]
-
 
 def test_read_image_folder_e2e(ray_start_regular_shared):
     from ray.air.util.tensor_extensions.pandas import TensorArray

From 298c1442c371eec645bb6681269c8e5db8bf8ae3 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Fri, 15 Jul 2022 11:25:48 -0700
Subject: [PATCH 18/23] Change error type from `ValueError` to `ImportError`

---
 python/ray/data/datasource/image_folder_datasource.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
index 52c27de16ac9..89cf50a7f82f 100644
--- a/python/ray/data/datasource/image_folder_datasource.py
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -38,7 +38,7 @@ def create_reader(
         try:
             import imageio  # noqa: F401
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "`ImageFolderDatasource` depends on 'imageio', but 'imageio' couldn't "
                 "be imported. You can install 'imageio' by running "
                 "`pip install imageio`."

From dc9c6e217627ed78e26b3df57096844063648e2b Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Fri, 15 Jul 2022 11:53:24 -0700
Subject: [PATCH 19/23] Remove `read_image_folder`

---
 doc/source/data/package-ref.rst               |  1 -
 python/ray/data/__init__.py                   |  1 -
 .../datasource/image_folder_datasource.py     | 55 ++++++++++++++++-
 python/ray/data/read_api.py                   | 59 -------------------
 python/ray/data/tests/test_dataset_formats.py | 18 ++++--
 5 files changed, 68 insertions(+), 66 deletions(-)

diff --git a/doc/source/data/package-ref.rst b/doc/source/data/package-ref.rst
index c5606fb8f444..39e0c9c6f3e6 100644
--- a/doc/source/data/package-ref.rst
+++ b/doc/source/data/package-ref.rst
@@ -16,7 +16,6 @@ Creating Datasets
 .. autofunction:: ray.data.read_numpy
 .. autofunction:: ray.data.read_text
 .. autofunction:: ray.data.read_binary_files
-.. autofunction:: ray.data.read_image_folder
 .. autofunction:: ray.data.read_datasource
 .. autofunction:: ray.data.from_items
 .. autofunction:: ray.data.from_arrow
diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py
index 295503eb0f4c..d9770fc099f6 100644
--- a/python/ray/data/__init__.py
+++ b/python/ray/data/__init__.py
@@ -33,7 +33,6 @@
     read_parquet,
     read_parquet_bulk,
     read_text,
-    read_image_folder,
 )
 
 # Register custom Arrow JSON ReadOptions serializer after worker has initialized.
diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
index 89cf50a7f82f..9aeb500e616e 100644
--- a/python/ray/data/datasource/image_folder_datasource.py
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -18,7 +18,60 @@
 
 
 class ImageFolderDatasource(BinaryDatasource):
-    """A datasource that lets you read datasets like `ImageNet <https://www.image-net.org/>`_."""  # noqa: E501
+    """A datasource that lets you read datasets like `ImageNet <https://www.image-net.org/>`_.
+
+    This datasource works with any dataset where images are arranged in this way:
+
+    .. code-block::
+
+        root/dog/xxx.png
+        root/dog/xxy.png
+        root/dog/[...]/xxz.png
+
+        root/cat/123.png
+        root/cat/nsdf3.png
+        root/cat/[...]/asd932_.png
+
+    Datasets read with this datasource contain two columns: ``'image'`` and ``'label'``.
+
+    * The ``'image'`` column is of type
+      :py:class:`~ray.air.util.tensor_extensions.pandas.TensorDtype` and contains
+      tensors of shape :math:`(H, W, C)`.
+    * The ``'label'`` column contains strings representing class names (e.g., 'cat').
+
+    Examples:
+        >>> import ray
+        >>> from ray.data.datasource import ImageFolderDatasource
+        >>>
+        >>> ds = ray.data.read_datasource(
+        ...     ImageFolderDatasource(),
+        ...     paths=["/data/imagenet/train"]
+        ... )
+        >>>
+        >>> sample = ds.take(1)[0]  # doctest: +SKIP
+        >>> sample["image"].to_numpy().shape  # doctest: +SKIP
+        (469, 387, 3)
+        >>> sample["label"]  # doctest: +SKIP
+        'n01443537'
+
+        To convert class labels to integer-valued targets, use
+        :py:class:`~ray.data.preprocessors.OrdinalEncoder`.
+
+        >>> import ray
+        >>> from ray.data.preprocessors import OrdinalEncoder
+        >>>
+        >>> ds = ray.data.read_datasource(
+        ...     ImageFolderDatasource(),
+        ...     paths=["/data/imagenet/train"]
+        ... )
+        >>> oe = OrdinalEncoder(columns=["label"])
+        >>>
+        >>> ds = oe.fit_transform(ds)
+        >>>
+        >>> sample = ds.take(1)[0]
+        >>> sample["label"]
+        71
+    """  # noqa: E501
 
     def create_reader(
         self,
diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py
index 3c962f0fc2b0..b1350fbd08b4 100644
--- a/python/ray/data/read_api.py
+++ b/python/ray/data/read_api.py
@@ -27,7 +27,6 @@
     DefaultFileMetadataProvider,
     DefaultParquetMetadataProvider,
     FastFileMetadataProvider,
-    ImageFolderDatasource,
     JSONDatasource,
     NumpyDatasource,
     ParquetBaseDatasource,
@@ -1074,64 +1073,6 @@ def convert(ds: "datasets.Dataset") -> Dataset[ArrowRow]:
         )
 
 
-@PublicAPI
-def read_image_folder(root: str, *, parallelism: int = -1):
-    """Read a dataset structured like `ImageNet <https://www.image-net.org/>`_.
-
-    This function works with any dataset where images are arranged in this way:
-
-    .. code-block::
-
-        root/dog/xxx.png
-        root/dog/xxy.png
-        root/dog/[...]/xxz.png
-
-        root/cat/123.png
-        root/cat/nsdf3.png
-        root/cat/[...]/asd932_.png
-
-    Datasets read with this function contain two columns: ``'image'`` and ``'label'``.
-
-    * The ``'image'`` column is of type
-      :py:class:`~ray.air.util.tensor_extensions.pandas.TensorDtype` and contains
-      tensors of shape :math:`(H, W, C)`.
-    * The ``'label'`` column contains strings representing class names (e.g., 'cat').
-
-    Arguments:
-        path: Path to the directory root.
-        parallelism: The user-requested parallelism, or -1 for autodetection.
-
-    Examples:
-        >>> import ray
-        >>>
-        >>> ds = ray.data.read_image_folder("/data/imagenet/train")
-        >>>
-        >>> sample = ds.take(1)[0]  # doctest: +SKIP
-        >>> sample["image"].to_numpy().shape  # doctest: +SKIP
-        (469, 387, 3)
-        >>> sample["label"]  # doctest: +SKIP
-        'n01443537'
-
-        To convert class labels to integer-valued targets, use
-        :py:class:`~ray.data.preprocessors.OrdinalEncoder`.
-
-        >>> import ray
-        >>> from ray.data.preprocessors import OrdinalEncoder
-        >>>
-        >>> ds = ray.data.read_image_folder("/data/imagenet/train")
-        >>> oe = OrdinalEncoder(columns=["label"])
-        >>>
-        >>> ds = oe.fit_transform(ds)
-        >>>
-        >>> sample = ds.take(1)[0]
-        >>> sample["label"]
-        71
-    """
-    return read_datasource(
-        ImageFolderDatasource(), paths=[root], parallelism=parallelism
-    )
-
-
 def _df_to_block(df: "pandas.DataFrame") -> Block[ArrowRow]:
     stats = BlockExecStats.builder()
     import pyarrow as pa
diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py
index 3e02a424559b..4f7b49244beb 100644
--- a/python/ray/data/tests/test_dataset_formats.py
+++ b/python/ray/data/tests/test_dataset_formats.py
@@ -26,6 +26,7 @@
     DefaultParquetMetadataProvider,
     DummyOutputDatasource,
     FastFileMetadataProvider,
+    ImageFolderDatasource,
     PartitionStyle,
     PathPartitionEncoder,
     PathPartitionFilter,
@@ -2856,9 +2857,9 @@ def get_node_id():
     assert node_ids == {bar_node_id}
 
 
-def test_read_image_folder(ray_start_regular_shared):
+def test_image_folder_datasource(ray_start_regular_shared):
     root = os.path.join(os.path.dirname(__file__), "image-folder")
-    ds = ray.data.read_image_folder(root)
+    ds = ray.data.read_datasource(ImageFolderDatasource(), paths=[root])
 
     assert ds.count() == 3
 
@@ -2868,7 +2869,15 @@ def test_read_image_folder(ray_start_regular_shared):
     assert all(tensor.to_numpy().shape == (32, 32, 3) for tensor in df["image"])
 
 
-def test_read_image_folder_e2e(ray_start_regular_shared):
+def test_image_folder_datasource_raises_value_error(ray_start_regular_shared):
+    # `ImageFolderDatasource` should raise an error if more than one path is passed.
+    with pytest.raises(ValueError):
+        ray.data.read_datasource(
+            ImageFolderDatasource(), paths=["imagenet/train", "imagenet/test"]
+        )
+
+
+def test_image_folder_datasource_e2e(ray_start_regular_shared):
     from ray.air.util.tensor_extensions.pandas import TensorArray
     from ray.train.torch import to_air_checkpoint, TorchPredictor
     from ray.train.batch_predictor import BatchPredictor
@@ -2876,7 +2885,8 @@ def test_read_image_folder_e2e(ray_start_regular_shared):
     from torchvision import transforms
     from torchvision.models import resnet18
 
-    dataset = ray.data.read_image_folder("image-folder")
+    root = os.path.join(os.path.dirname(__file__), "image-folder")
+    dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[root])
 
     def preprocess(df):
         # We convert the `TensorArrayElement` to a NumPy array because `ToTensor`

From 1d4d9205ddd2d46c44ab707ea03a09109209559e Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Fri, 15 Jul 2022 13:04:18 -0700
Subject: [PATCH 20/23] Add API annotation

---
 python/ray/data/datasource/image_folder_datasource.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
index 9aeb500e616e..21b004191595 100644
--- a/python/ray/data/datasource/image_folder_datasource.py
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -17,6 +17,7 @@
 IMAGE_EXTENSIONS = ["png", "jpg", "jpeg", "tiff", "bmp", "gif"]
 
 
+@DeveloperAPI
 class ImageFolderDatasource(BinaryDatasource):
     """A datasource that lets you read datasets like `ImageNet <https://www.image-net.org/>`_.
 

From 0f5e089b15e1fdb368dc5286d085654831c428b4 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Fri, 15 Jul 2022 13:05:11 -0700
Subject: [PATCH 21/23] Add missing `DeveloperAPI` import

---
 python/ray/data/datasource/image_folder_datasource.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
index 21b004191595..7ff08afce337 100644
--- a/python/ray/data/datasource/image_folder_datasource.py
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -9,6 +9,7 @@
     FileExtensionFilter,
 )
 from ray.data.datasource.partitioning import PathPartitionFilter
+from ray.util.annotations import DeveloperAPI
 
 if TYPE_CHECKING:
     import pyarrow

From 18dd5662811dc31791447a94a102951bcd910394 Mon Sep 17 00:00:00 2001
From: Balaji Veeramani <bveeramani@berkeley.edu>
Date: Fri, 15 Jul 2022 15:14:08 -0700
Subject: [PATCH 22/23] Skip doctests

---
 .../ray/data/datasource/image_folder_datasource.py   | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/ray/data/datasource/image_folder_datasource.py b/python/ray/data/datasource/image_folder_datasource.py
index 7ff08afce337..dcf6843cf75d 100644
--- a/python/ray/data/datasource/image_folder_datasource.py
+++ b/python/ray/data/datasource/image_folder_datasource.py
@@ -45,7 +45,7 @@ class ImageFolderDatasource(BinaryDatasource):
         >>> import ray
         >>> from ray.data.datasource import ImageFolderDatasource
         >>>
-        >>> ds = ray.data.read_datasource(
+        >>> ds = ray.data.read_datasource(  # doctest: +SKIP
         ...     ImageFolderDatasource(),
         ...     paths=["/data/imagenet/train"]
         ... )
@@ -62,16 +62,16 @@ class ImageFolderDatasource(BinaryDatasource):
         >>> import ray
         >>> from ray.data.preprocessors import OrdinalEncoder
         >>>
-        >>> ds = ray.data.read_datasource(
+        >>> ds = ray.data.read_datasource(  # doctest: +SKIP
         ...     ImageFolderDatasource(),
         ...     paths=["/data/imagenet/train"]
         ... )
-        >>> oe = OrdinalEncoder(columns=["label"])
+        >>> oe = OrdinalEncoder(columns=["label"])  # doctest: +SKIP
         >>>
-        >>> ds = oe.fit_transform(ds)
+        >>> ds = oe.fit_transform(ds)  # doctest: +SKIP
         >>>
-        >>> sample = ds.take(1)[0]
-        >>> sample["label"]
+        >>> sample = ds.take(1)[0]  # doctest: +SKIP
+        >>> sample["label"]  # doctest: +SKIP
         71
     """  # noqa: E501
 

From 70a47d7fa6566a85f1fcac7ed40971dc1bc0e3ac Mon Sep 17 00:00:00 2001
From: Richard Liaw <rliaw@berkeley.edu>
Date: Fri, 15 Jul 2022 21:06:36 -0700
Subject: [PATCH 23/23] Move image folder datasource tests above the shutdown_only test

Signed-off-by: Richard Liaw <rliaw@berkeley.edu>
---
 python/ray/data/tests/test_dataset_formats.py | 108 +++++++++---------
 1 file changed, 54 insertions(+), 54 deletions(-)

diff --git a/python/ray/data/tests/test_dataset_formats.py b/python/ray/data/tests/test_dataset_formats.py
index 5c994a00ca63..d277c16ef393 100644
--- a/python/ray/data/tests/test_dataset_formats.py
+++ b/python/ray/data/tests/test_dataset_formats.py
@@ -2732,6 +2732,60 @@ def test_torch_datasource_value_error(ray_start_regular_shared, local_path):
         )
 
 
+def test_image_folder_datasource(ray_start_regular_shared):
+    root = os.path.join(os.path.dirname(__file__), "image-folder")
+    ds = ray.data.read_datasource(ImageFolderDatasource(), paths=[root])
+
+    assert ds.count() == 3
+
+    df = ds.to_pandas()
+    assert sorted(df["label"]) == ["cat", "cat", "dog"]
+    assert type(df["image"].dtype) is TensorDtype
+    assert all(tensor.to_numpy().shape == (32, 32, 3) for tensor in df["image"])
+
+
+def test_image_folder_datasource_raises_value_error(ray_start_regular_shared):
+    # `ImageFolderDatasource` should raise an error if more than one path is passed.
+    with pytest.raises(ValueError):
+        ray.data.read_datasource(
+            ImageFolderDatasource(), paths=["imagenet/train", "imagenet/test"]
+        )
+
+
+def test_image_folder_datasource_e2e(ray_start_regular_shared):
+    from ray.air.util.tensor_extensions.pandas import TensorArray
+    from ray.train.torch import to_air_checkpoint, TorchPredictor
+    from ray.train.batch_predictor import BatchPredictor
+
+    from torchvision import transforms
+    from torchvision.models import resnet18
+
+    root = os.path.join(os.path.dirname(__file__), "image-folder")
+    dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[root])
+
+    def preprocess(df):
+        # We convert the `TensorArrayElement` to a NumPy array because `ToTensor`
+        # expects a NumPy array or PIL image. `ToTensor` is necessary because Torch
+        # expects images to have shape (C, H, W), and `ToTensor` changes the shape of
+        # the data from (H, W, C) to (C, H, W).
+        preprocess = transforms.Compose(
+            [
+                lambda ray_tensor: ray_tensor.to_numpy(),
+                transforms.ToTensor(),
+            ]
+        )
+        df["image"] = TensorArray([preprocess(image) for image in df["image"]])
+        return df
+
+    preprocessor = BatchMapper(preprocess)
+
+    model = resnet18(pretrained=True)
+    checkpoint = to_air_checkpoint(model=model, preprocessor=preprocessor)
+
+    predictor = BatchPredictor.from_checkpoint(checkpoint, TorchPredictor)
+    predictor.predict(dataset, feature_columns=["image"])
+
+
 # NOTE: The last test using the shared ray_start_regular_shared cluster must use the
 # shutdown_only fixture so the shared cluster is shut down, otherwise the below
 # test_write_datasource_ray_remote_args test, which uses a cluster_utils cluster, will
@@ -2867,60 +2921,6 @@ def get_node_id():
     assert node_ids == {bar_node_id}
 
 
-def test_image_folder_datasource(ray_start_regular_shared):
-    root = os.path.join(os.path.dirname(__file__), "image-folder")
-    ds = ray.data.read_datasource(ImageFolderDatasource(), paths=[root])
-
-    assert ds.count() == 3
-
-    df = ds.to_pandas()
-    assert sorted(df["label"]) == ["cat", "cat", "dog"]
-    assert type(df["image"].dtype) is TensorDtype
-    assert all(tensor.to_numpy().shape == (32, 32, 3) for tensor in df["image"])
-
-
-def test_image_folder_datasource_raises_value_error(ray_start_regular_shared):
-    # `ImageFolderDatasource` should raise an error if more than one path is passed.
-    with pytest.raises(ValueError):
-        ray.data.read_datasource(
-            ImageFolderDatasource(), paths=["imagenet/train", "imagenet/test"]
-        )
-
-
-def test_image_folder_datasource_e2e(ray_start_regular_shared):
-    from ray.air.util.tensor_extensions.pandas import TensorArray
-    from ray.train.torch import to_air_checkpoint, TorchPredictor
-    from ray.train.batch_predictor import BatchPredictor
-
-    from torchvision import transforms
-    from torchvision.models import resnet18
-
-    root = os.path.join(os.path.dirname(__file__), "image-folder")
-    dataset = ray.data.read_datasource(ImageFolderDatasource(), paths=[root])
-
-    def preprocess(df):
-        # We convert the `TensorArrayElement` to a NumPy array because `ToTensor`
-        # expects a NumPy array or PIL image. `ToTensor` is necessary because Torch
-        # expects images to have shape (C, H, W), and `ToTensor` changes the shape of
-        # the data from (H, W, C) to (C, H, W).
-        preprocess = transforms.Compose(
-            [
-                lambda ray_tensor: ray_tensor.to_numpy(),
-                transforms.ToTensor(),
-            ]
-        )
-        df["image"] = TensorArray([preprocess(image) for image in df["image"]])
-        return df
-
-    preprocessor = BatchMapper(preprocess)
-
-    model = resnet18(pretrained=True)
-    checkpoint = to_air_checkpoint(model=model, preprocessor=preprocessor)
-
-    predictor = BatchPredictor.from_checkpoint(checkpoint, TorchPredictor)
-    predictor.predict(dataset, feature_columns=["image"])
-
-
 def test_read_text_remote_args(ray_start_cluster, tmp_path):
     cluster = ray_start_cluster
     cluster.add_node(