From 49fe680fc948a651745bfef2326d1ff28cb36087 Mon Sep 17 00:00:00 2001
From: Cheng Su
Date: Thu, 29 Sep 2022 15:57:58 -0700
Subject: [PATCH] [Datasets] Add read benchmark for images (#28724)

This PR adds a data benchmark for reading images in various settings:
different numbers of images, image sizes, modes, and formats. See
`read_benchmark.py:run_images_benchmark()` for details.

Signed-off-by: Weichen Xu
---
 .../nightly_tests/dataset/read_benchmark.py   | 150 ++++++++++++++++++
 ...aml => single_node_benchmark_compute.yaml} |   0
 release/release_tests.yaml                    |  19 ++-
 3 files changed, 168 insertions(+), 1 deletion(-)
 create mode 100644 release/nightly_tests/dataset/read_benchmark.py
 rename release/nightly_tests/dataset/{aggregate_benchmark_compute.yaml => single_node_benchmark_compute.yaml} (100%)

diff --git a/release/nightly_tests/dataset/read_benchmark.py b/release/nightly_tests/dataset/read_benchmark.py
new file mode 100644
index 000000000000..566613c7c5d3
--- /dev/null
+++ b/release/nightly_tests/dataset/read_benchmark.py
@@ -0,0 +1,150 @@
+import os
+import random
+import shutil
+import tempfile
+from typing import List, Optional, Tuple
+
+from PIL import Image
+
+import ray
+from ray.data.dataset import Dataset
+from ray.data.datasource import ImageFolderDatasource
+
+from benchmark import Benchmark
+
+
+def read_images(
+    root: str, size: Optional[Tuple[int, int]] = None, mode: Optional[str] = None
+) -> Dataset:
+    """Read the images under ``root`` into a Dataset.
+
+    Args:
+        root: Local directory or remote URI that holds the images.
+        size: Optional value forwarded to ImageFolderDatasource as ``size``.
+        mode: Optional value forwarded to ImageFolderDatasource as ``mode``.
+
+    Returns:
+        The Dataset returned by ``ray.data.read_datasource``.
+    """
+    return ray.data.read_datasource(
+        ImageFolderDatasource(), root=root, size=size, mode=mode
+    )
+
+
+def generate_images(
+    num_images: int, sizes: List[Tuple[int, int]], modes: List[str], formats: List[str]
+) -> str:
+    """Write randomly generated images into a new temporary directory.
+
+    The size, mode and file extension of every image are drawn independently
+    from the given candidates; pixel values are random bytes.
+
+    Args:
+        num_images: Number of image files to write.
+        sizes: Candidate ``(width, height)`` sizes.
+        modes: Candidate Pillow image modes, e.g. "RGB" or "L".
+        formats: Candidate file extensions, e.g. "jpg" or "png".
+
+    Returns:
+        Path of the temporary directory holding the images. The caller is
+        responsible for deleting it.
+
+    Raises:
+        ValueError: If ``modes`` contains a mode not handled here.
+    """
+    # Number of bands (values per pixel) of each requested mode. "I" and "F"
+    # are single-band modes, so they take one value per pixel, not four.
+    dimensions = []
+    for mode in modes:
+        if mode in ["1", "L", "P", "I", "F"]:
+            dimension = 1
+        elif mode in ["RGB", "YCbCr", "LAB", "HSV"]:
+            dimension = 3
+        elif mode in ["RGBA", "CMYK"]:
+            dimension = 4
+        else:
+            raise ValueError(f"Found unknown image mode: {mode}.")
+        dimensions.append(dimension)
+
+    images_dir = tempfile.mkdtemp()
+
+    for image_idx in range(num_images):
+        size = random.choice(sizes)
+        file_format = random.choice(formats)
+        # Draw one index so the mode and its band count stay in sync.
+        mode_idx = random.randrange(len(modes))
+        mode = modes[mode_idx]
+        dimension = dimensions[mode_idx]
+
+        width, height = size
+        file_name = f"{images_dir}/{image_idx}.{file_format}"
+        # One random byte per pixel for every band.
+        pixels_per_dimension = [os.urandom(width * height) for _ in range(dimension)]
+
+        image = Image.new(mode, size)
+        if dimension == 1:
+            image.putdata(pixels_per_dimension[0])
+        else:
+            # Multi-band modes take one tuple of band values per pixel.
+            image.putdata(list(zip(*pixels_per_dimension)))
+        image.save(file_name)
+
+    return images_dir
+
+
+def run_images_benchmark(benchmark: Benchmark):
+    """Run the image reading benchmarks through ``benchmark.run``.
+
+    Locally generated images are deleted again, even if a benchmark fails.
+    """
+    # Set global random seed.
+    random.seed(42)
+
+    test_input = []
+    try:
+        test_input.append(generate_images(100, [(256, 256)], ["RGB"], ["jpg"]))
+        test_input.append(generate_images(100, [(2048, 2048)], ["RGB"], ["jpg"]))
+        test_input.append(
+            generate_images(
+                1000, [(64, 64), (256, 256)], ["RGB", "L"], ["jpg", "jpeg", "png"]
+            )
+        )
+
+        # NOTE(review): "rbg" looks like a typo for "rgb"; the names are kept
+        # unchanged because they identify the runs passed to ``benchmark``.
+        benchmark.run("images-100-256-rbg-jpg", read_images, root=test_input[0])
+        benchmark.run("images-100-2048-rbg-jpg", read_images, root=test_input[1])
+        benchmark.run(
+            "images-100-2048-to-256-rbg-jpg",
+            read_images,
+            root=test_input[1],
+            size=(256, 256),
+        )
+        benchmark.run(
+            "images-1000-mix",
+            read_images,
+            root=test_input[2],
+            size=(256, 256),
+            mode="RGB",
+        )
+    finally:
+        # Clean up the generated directories even when a run raises.
+        for root in test_input:
+            shutil.rmtree(root)
+
+    # TODO(chengsu): run benchmark on 20G and 100G imagenet data.
+    benchmark.run(
+        "images-imagenet-1g",
+        read_images,
+        root="s3://air-example-data-2/1G-image-data-synthetic-raw",
+    )
+
+
+if __name__ == "__main__":
+    ray.init()
+
+    benchmark = Benchmark("read")
+
+    run_images_benchmark(benchmark)
+
+    benchmark.write_result()
diff --git a/release/nightly_tests/dataset/aggregate_benchmark_compute.yaml b/release/nightly_tests/dataset/single_node_benchmark_compute.yaml
similarity index 100%
rename from release/nightly_tests/dataset/aggregate_benchmark_compute.yaml
rename to release/nightly_tests/dataset/single_node_benchmark_compute.yaml
diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index f1a731c0eeca..b6c7f607b602 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -4213,7 +4213,7 @@
   team: data
   cluster:
     cluster_env: app_config.yaml
-    cluster_compute: aggregate_benchmark_compute.yaml
+    cluster_compute: single_node_benchmark_compute.yaml
 
   run:
     timeout: 1800
@@ -4222,6 +4222,23 @@
     type: sdk_command
     file_manager: sdk
 
+- name: read_benchmark_single_node
+  group: core-dataset-tests
+  working_dir: nightly_tests/dataset
+
+  frequency: multi
+  team: data
+  cluster:
+    cluster_env: app_config.yaml
+    cluster_compute: single_node_benchmark_compute.yaml
+
+  run:
+    timeout: 1800
+    script: python read_benchmark.py
+
+    type: sdk_command
+    file_manager: sdk
+
 - name: pipelined_training_50_gb
   group: core-dataset-tests
   working_dir: nightly_tests/dataset