Skip to content

Commit

Permalink
[Datasets] Add read benchmark for images (#28724)
Browse files Browse the repository at this point in the history
This PR adds a data benchmark for reading images in various settings: different numbers of images, image sizes, modes, and formats. See `read_benchmark.py:run_images_benchmark()` for details.
  • Loading branch information
c21 authored Sep 29, 2022
1 parent 3685260 commit 6dbc116
Show file tree
Hide file tree
Showing 3 changed files with 127 additions and 1 deletion.
109 changes: 109 additions & 0 deletions release/nightly_tests/dataset/read_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import os
import random
import shutil
import tempfile
from typing import List, Optional, Tuple

from PIL import Image

import ray
from ray.data.dataset import Dataset
from ray.data.datasource import ImageFolderDatasource

from benchmark import Benchmark


def read_images(
    root: str, size: Optional[Tuple[int, int]] = None, mode: Optional[str] = None
) -> Dataset:
    """Read the images under ``root`` via ``ImageFolderDatasource``.

    Args:
        root: Location of the image folder; passed through as ``root``.
        size: Optional 2-tuple passed through unchanged as ``size``
            (presumably the target image size -- confirm against the
            datasource).
        mode: Optional string passed through unchanged as ``mode``
            (presumably a Pillow image mode -- confirm against the
            datasource).

    Returns:
        Whatever ``ray.data.read_datasource`` returns for these arguments.
    """
    datasource = ImageFolderDatasource()
    read_args = {"root": root, "size": size, "mode": mode}
    return ray.data.read_datasource(datasource, **read_args)


def generate_images(
    num_images: int, sizes: List[Tuple[int, int]], modes: List[str], formats: List[str]
) -> str:
    """Write ``num_images`` random-noise images into a new temporary directory.

    For every image a size, a file format and a mode are drawn (in that
    order) from the given lists with the ``random`` module, so seeding
    ``random`` makes those choices reproducible. Pixel values come from
    ``os.urandom`` and are therefore not reproducible.

    Args:
        num_images: Number of image files to create.
        sizes: Candidate ``(width, height)`` tuples.
        modes: Candidate Pillow image modes.
        formats: Candidate file extensions; used as the file name suffix.

    Returns:
        Path of the temporary directory holding the generated files. The
        caller is responsible for deleting it.

    Raises:
        ValueError: If ``modes`` contains a mode this function does not know.
    """
    # Number of bands (values per pixel) for each supported mode. "I" and "F"
    # are single-band modes (one 32-bit value per pixel), so they take one
    # value per pixel like "1", "L" and "P".
    bands_by_mode = {
        "1": 1,
        "L": 1,
        "P": 1,
        "I": 1,
        "F": 1,
        "RGB": 3,
        "YCbCr": 3,
        "LAB": 3,
        "HSV": 3,
        "RGBA": 4,
        "CMYK": 4,
    }

    # Validate every mode up front, before anything is created on disk.
    dimensions = []
    for mode in modes:
        if mode not in bands_by_mode:
            raise ValueError(f"Found unknown image mode: {mode}.")
        dimensions.append(bands_by_mode[mode])

    images_dir = tempfile.mkdtemp()

    try:
        for image_idx in range(num_images):
            # Keep this draw order (size, format, mode) so a seeded run
            # produces the same sequence of choices.
            size = random.choice(sizes)
            file_format = random.choice(formats)
            mode_idx = random.randrange(len(modes))
            mode = modes[mode_idx]

            width, height = size
            # One run of random bytes per band, each value in 0..255.
            pixels_per_dimension = [
                os.urandom(width * height) for _ in range(dimensions[mode_idx])
            ]

            image = Image.new(mode, size)
            if len(pixels_per_dimension) == 1:
                image.putdata(pixels_per_dimension[0])
            else:
                # Multi-band images take one tuple per pixel.
                image.putdata(list(zip(*pixels_per_dimension)))
            image.save(os.path.join(images_dir, f"{image_idx}.{file_format}"))
    except BaseException:
        # Do not leave a half-filled temporary directory behind on failure.
        shutil.rmtree(images_dir, ignore_errors=True)
        raise

    return images_dir


def run_images_benchmark(benchmark: Benchmark):
    """Run the image-reading benchmark cases through ``benchmark.run``.

    Three sets of synthetic images are generated locally and read back with
    ``read_images`` under different ``size``/``mode`` arguments; afterwards a
    fixed S3 dataset is read. The locally generated directories are removed
    even when generation or a benchmark case raises.

    Args:
        benchmark: Object whose ``run(name, fn, **kwargs)`` method executes
            one benchmark case.
    """
    # Set global random seed.
    random.seed(42)

    test_input = []
    try:
        # Register each directory as soon as it exists so that a failure in a
        # later generation step still cleans up the earlier ones.
        test_input.append(generate_images(100, [(256, 256)], ["RGB"], ["jpg"]))
        test_input.append(generate_images(100, [(2048, 2048)], ["RGB"], ["jpg"]))
        test_input.append(
            generate_images(
                1000, [(64, 64), (256, 256)], ["RGB", "L"], ["jpg", "jpeg", "png"]
            )
        )

        # NOTE(review): the case names below spell "rbg" (sic). They are kept
        # unchanged because they are emitted at runtime -- confirm whether
        # anything consumes these names before renaming them.
        benchmark.run("images-100-256-rbg-jpg", read_images, root=test_input[0])
        benchmark.run("images-100-2048-rbg-jpg", read_images, root=test_input[1])
        benchmark.run(
            "images-100-2048-to-256-rbg-jpg",
            read_images,
            root=test_input[1],
            size=(256, 256),
        )
        benchmark.run(
            "images-1000-mix",
            read_images,
            root=test_input[2],
            size=(256, 256),
            mode="RGB",
        )
    finally:
        for root in test_input:
            shutil.rmtree(root)

    # TODO(chengsu): run benchmark on 20G and 100G imagenet data.
    benchmark.run(
        "images-imagenet-1g",
        read_images,
        root="s3://air-example-data-2/1G-image-data-synthetic-raw",
    )


if __name__ == "__main__":
    # Initialize Ray with its default arguments before running any case.
    ray.init()

    # Collect every case under the "read" benchmark, then write the results.
    read_benchmark = Benchmark("read")
    run_images_benchmark(read_benchmark)
    read_benchmark.write_result()
19 changes: 18 additions & 1 deletion release/release_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4213,7 +4213,7 @@
team: data
cluster:
cluster_env: app_config.yaml
cluster_compute: aggregate_benchmark_compute.yaml
cluster_compute: single_node_benchmark_compute.yaml

run:
timeout: 1800
Expand All @@ -4222,6 +4222,23 @@
type: sdk_command
file_manager: sdk

- name: read_benchmark_single_node
group: core-dataset-tests
working_dir: nightly_tests/dataset

frequency: multi
team: data
cluster:
cluster_env: app_config.yaml
cluster_compute: single_node_benchmark_compute.yaml

run:
timeout: 1800
script: python read_benchmark.py

type: sdk_command
file_manager: sdk

- name: pipelined_training_50_gb
group: core-dataset-tests
working_dir: nightly_tests/dataset
Expand Down

0 comments on commit 6dbc116

Please sign in to comment.