[Datasets] [Arrow 7+ Support - 3/N] Add support for Arrow 8, 9, 10, and nightly. (#29999)

This PR adds support for Arrow 8, 9, 10, and nightly in Ray. It is the third PR in a set of stacked PRs making up the mono-PR for Arrow 7+ support (#29161), and is stacked on top of a PR fixing task cancellation in Ray Core (#29984) and a PR adding support for Arrow 7 (#29993). The last two commits are the relevant commits for review.

Summary of Changes

This PR:
- For Arrow 9+, adds allow_bucket_creation=true to S3 URIs for the Ray Core Storage API and for the Datasets S3 write API (#29815, "[Datasets] In Arrow 9+, creating S3 buckets requires explicit opt-in.").
- For Arrow 9+, creates an ExtensionScalar subclass for tensor extension types that returns an ndarray view from .as_py() (#29816, "[Datasets] For Arrow 8+, tensor column element access returns an ExtensionScalar.").
- For Arrow 8.*, manually converts the ExtensionScalar to an ndarray for tensor extension types, since the ExtensionScalar type exists but isn't subclassable in Arrow 8 (#29816, "[Datasets] For Arrow 8+, tensor column element access returns an ExtensionScalar.").
- For Arrow 10+, matches on other potential error messages when encountering permission issues when interacting with S3 (#29994, "[Datasets] In Arrow 10+, S3 errors raised due to permission issues can vary beyond our current pattern matching").
- Adds CI jobs for Arrow 8, 9, 10, and nightly.
- Removes the pyarrow version upper bound.
ray-project · Nov 9, 2022 · 06d5dc3 · 06d5dc3
1 parent 45ffe6e
commit 06d5dc3
Show file tree

Hide file tree

Showing 14 changed files with 435 additions and 116 deletions.
diff --git a/.buildkite/pipeline.ml.yml b/.buildkite/pipeline.ml.yml
@@ -269,6 +269,46 @@
     # Dask tests and examples.
     - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=-client python/ray/util/dask/...
 
+# Dataset test job that installs dependencies with ARROW_VERSION=nightly, then runs
+# the python/ray/data tests and the python/ray/air tests filtered to the ray_data tag.
+- label: "Dataset tests (Arrow nightly)"
+  conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_PYTHON_AFFECTED", "RAY_CI_DATA_AFFECTED"]
+  instance_size: medium
+  commands:
+    # On exit, upload build info only when BUILDKITE_PULL_REQUEST is "false".
+    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+    - DATA_PROCESSING_TESTING=1 ARROW_VERSION=nightly ./ci/env/install-dependencies.sh
+    - ./ci/env/env_info.sh
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only python/ray/data/...
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=ray_data python/ray/air/...
+
+# Dataset test job that installs dependencies with ARROW_VERSION=10.*, then runs
+# the python/ray/data tests and the python/ray/air tests filtered to the ray_data tag.
+- label: "Dataset tests (Arrow 10)"
+  conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_PYTHON_AFFECTED", "RAY_CI_DATA_AFFECTED"]
+  instance_size: medium
+  commands:
+    # On exit, upload build info only when BUILDKITE_PULL_REQUEST is "false".
+    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+    - DATA_PROCESSING_TESTING=1 ARROW_VERSION=10.* ./ci/env/install-dependencies.sh
+    - ./ci/env/env_info.sh
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only python/ray/data/...
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=ray_data python/ray/air/...
+
+# Dataset test job that installs dependencies with ARROW_VERSION=9.*, then runs
+# the python/ray/data tests and the python/ray/air tests filtered to the ray_data tag.
+- label: "Dataset tests (Arrow 9)"
+  conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_PYTHON_AFFECTED", "RAY_CI_DATA_AFFECTED"]
+  instance_size: medium
+  commands:
+    # On exit, upload build info only when BUILDKITE_PULL_REQUEST is "false".
+    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+    - DATA_PROCESSING_TESTING=1 ARROW_VERSION=9.* ./ci/env/install-dependencies.sh
+    - ./ci/env/env_info.sh
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only python/ray/data/...
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=ray_data python/ray/air/...
+
+# Dataset test job that installs dependencies with ARROW_VERSION=8.*, then runs
+# the python/ray/data tests and the python/ray/air tests filtered to the ray_data tag.
+- label: "Dataset tests (Arrow 8)"
+  conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_PYTHON_AFFECTED", "RAY_CI_DATA_AFFECTED"]
+  instance_size: medium
+  commands:
+    # On exit, upload build info only when BUILDKITE_PULL_REQUEST is "false".
+    - cleanup() { if [ "${BUILDKITE_PULL_REQUEST}" = "false" ]; then ./ci/build/upload_build_info.sh; fi }; trap cleanup EXIT
+    - DATA_PROCESSING_TESTING=1 ARROW_VERSION=8.* ./ci/env/install-dependencies.sh
+    - ./ci/env/env_info.sh
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only python/ray/data/...
+    - bazel test --config=ci $(./ci/run/bazel_export_options) --build_tests_only --test_tag_filters=ray_data python/ray/air/...
+
 - label: "Dataset tests (Arrow 7)"
   conditions: ["NO_WHEELS_REQUIRED", "RAY_CI_PYTHON_AFFECTED", "RAY_CI_DATA_AFFECTED"]
   instance_size: medium

diff --git a/python/ray/_private/storage.py b/python/ray/_private/storage.py
@@ -6,6 +6,7 @@
 from typing import TYPE_CHECKING, List, Optional
 
 from ray._private.client_mode_hook import client_mode_hook
+from ray._private.utils import _add_creatable_buckets_param_if_s3_uri
 
 if TYPE_CHECKING:
     import pyarrow.fs
@@ -368,6 +369,9 @@ def _init_filesystem(create_valid_file: bool = False, check_valid_file: bool = T
         fs_creator = _load_class(parsed_uri.netloc)
         _filesystem, _storage_prefix = fs_creator(parsed_uri.path)
     else:
+        # Arrow's S3FileSystem doesn't allow creating buckets by default, so we add a
+        # query arg enabling bucket creation if an S3 URI is provided.
+        _storage_uri = _add_creatable_buckets_param_if_s3_uri(_storage_uri)
         _filesystem, _storage_prefix = pyarrow.fs.FileSystem.from_uri(_storage_uri)
 
     if os.name == "nt":

diff --git a/python/ray/_private/utils.py b/python/ray/_private/utils.py
@@ -15,6 +15,7 @@
 import tempfile
 import threading
 import time
+from urllib.parse import urlencode, unquote, urlparse, parse_qsl, urlunparse
 import uuid
 import warnings
 from inspect import signature
@@ -1599,6 +1600,73 @@ def split_address(address: str) -> Tuple[str, str]:
     return (module_string, inner_address)
 
 
+def _add_url_query_params(url: str, params: Dict[str, str]) -> str:
+    """Add params to the provided url as query parameters.
+
+    If url already contains query parameters, they will be merged with params, with the
+    existing query parameters overriding any in params with the same parameter name.
+
+    bool and dict values are serialized with ``json.dumps`` before being URL-encoded
+    (e.g. ``True`` becomes ``true``). The ``params`` dict passed in by the caller is
+    left unmodified.
+
+    Args:
+        url: The URL to add query parameters to.
+        params: The query parameters to add.
+
+    Returns:
+        URL with params added as query parameters.
+    """
+    # Unquote URL first so we don't lose existing args.
+    url = unquote(url)
+    # Parse URL.
+    parsed_url = urlparse(url)
+    # Merge into a fresh dict so the caller's ``params`` is never mutated. Query
+    # parameters already present in the URL are applied last and therefore win.
+    merged_params = {**params, **dict(parse_qsl(parsed_url.query))}
+    # bool and dict values should be converted to json-friendly values.
+    merged_params = {
+        key: json.dumps(value) if isinstance(value, (bool, dict)) else value
+        for key, value in merged_params.items()
+    }
+
+    # Convert URL arguments to proper query string.
+    encoded_params = urlencode(merged_params, doseq=True)
+    # Replace query string in parsed URL with updated query string.
+    parsed_url = parsed_url._replace(query=encoded_params)
+    # Convert back to URL.
+    return urlunparse(parsed_url)
+
+
+def _add_creatable_buckets_param_if_s3_uri(uri: str) -> str:
+    """Append ``allow_bucket_creation=true`` to ``uri`` when it is an S3 URL.
+
+    With pyarrow >= 9.0.0, ``S3FileSystem.create_dir()`` only creates S3 buckets when
+    this query parameter is present. The URI is returned unchanged when its scheme
+    is not ``s3`` or when the installed pyarrow is older than 9.0.0. If no pyarrow
+    version can be determined, the parameter is still added for S3 URLs.
+
+    Args:
+        uri: The URI to which the query parameter is added, if it's an S3 URL.
+
+    Returns:
+        The URI with the allow_bucket_creation=true query parameter added if it is
+        an S3 URL (and pyarrow is not older than 9.0.0); otherwise uri unchanged.
+    """
+    from pkg_resources._vendor.packaging.version import parse as parse_version
+
+    installed = _get_pyarrow_version()
+    if installed is not None and parse_version(installed) < parse_version("9.0.0"):
+        # Older pyarrow creates buckets without an explicit opt-in.
+        return uri
+    if urlparse(uri).scheme != "s3":
+        # Only S3 URLs take the bucket creation parameter.
+        return uri
+    return _add_url_query_params(uri, {"allow_bucket_creation": True})
+
+
 def _get_pyarrow_version() -> Optional[str]:
     """Get the version of the installed pyarrow package, returned as a tuple of ints.
     Returns None if the package is not found.

diff --git a/python/ray/air/tests/test_tensor_extension.py b/python/ray/air/tests/test_tensor_extension.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 import pandas as pd
+from pkg_resources._vendor.packaging.version import parse as parse_version
 import pyarrow as pa
 import pytest
 
@@ -12,6 +13,7 @@
     ArrowVariableShapedTensorType,
 )
 from ray.air.util.tensor_extensions.pandas import TensorArray, TensorDtype
+from ray._private.utils import _get_pyarrow_version
 
 
 def test_tensor_array_validation():
@@ -343,8 +345,20 @@ def test_arrow_tensor_array_getitem(chunked):
     if chunked:
         t_arr = pa.chunked_array(t_arr)
 
-    for idx in range(outer_dim):
-        np.testing.assert_array_equal(t_arr[idx], arr[idx])
+    pyarrow_version = parse_version(_get_pyarrow_version())
+    if (
+        chunked
+        and pyarrow_version >= parse_version("8.0.0")
+        and pyarrow_version < parse_version("9.0.0")
+    ):
+        for idx in range(outer_dim):
+            item = t_arr[idx]
+            assert isinstance(item, pa.ExtensionScalar)
+            item = item.type._extension_scalar_to_ndarray(item)
+            np.testing.assert_array_equal(item, arr[idx])
+    else:
+        for idx in range(outer_dim):
+            np.testing.assert_array_equal(t_arr[idx], arr[idx])
 
     # Test __iter__.
     for t_subarr, subarr in zip(t_arr, arr):
@@ -368,8 +382,19 @@ def test_arrow_tensor_array_getitem(chunked):
 
     np.testing.assert_array_equal(t_arr2_npy, arr[1:])
 
-    for idx in range(1, outer_dim):
-        np.testing.assert_array_equal(t_arr2[idx - 1], arr[idx])
+    if (
+        chunked
+        and pyarrow_version >= parse_version("8.0.0")
+        and pyarrow_version < parse_version("9.0.0")
+    ):
+        for idx in range(1, outer_dim):
+            item = t_arr2[idx - 1]
+            assert isinstance(item, pa.ExtensionScalar)
+            item = item.type._extension_scalar_to_ndarray(item)
+            np.testing.assert_array_equal(item, arr[idx])
+    else:
+        for idx in range(1, outer_dim):
+            np.testing.assert_array_equal(t_arr2[idx - 1], arr[idx])
 
 
 @pytest.mark.parametrize("chunked", [False, True])
@@ -387,8 +412,20 @@ def test_arrow_variable_shaped_tensor_array_getitem(chunked):
     if chunked:
         t_arr = pa.chunked_array(t_arr)
 
-    for idx in range(outer_dim):
-        np.testing.assert_array_equal(t_arr[idx], arr[idx])
+    pyarrow_version = parse_version(_get_pyarrow_version())
+    if (
+        chunked
+        and pyarrow_version >= parse_version("8.0.0")
+        and pyarrow_version < parse_version("9.0.0")
+    ):
+        for idx in range(outer_dim):
+            item = t_arr[idx]
+            assert isinstance(item, pa.ExtensionScalar)
+            item = item.type._extension_scalar_to_ndarray(item)
+            np.testing.assert_array_equal(item, arr[idx])
+    else:
+        for idx in range(outer_dim):
+            np.testing.assert_array_equal(t_arr[idx], arr[idx])
 
     # Test __iter__.
     for t_subarr, subarr in zip(t_arr, arr):
@@ -414,8 +451,19 @@ def test_arrow_variable_shaped_tensor_array_getitem(chunked):
     for t_subarr, subarr in zip(t_arr2_npy, arr[1:]):
         np.testing.assert_array_equal(t_subarr, subarr)
 
-    for idx in range(1, outer_dim):
-        np.testing.assert_array_equal(t_arr2[idx - 1], arr[idx])
+    if (
+        chunked
+        and pyarrow_version >= parse_version("8.0.0")
+        and pyarrow_version < parse_version("9.0.0")
+    ):
+        for idx in range(1, outer_dim):
+            item = t_arr2[idx - 1]
+            assert isinstance(item, pa.ExtensionScalar)
+            item = item.type._extension_scalar_to_ndarray(item)
+            np.testing.assert_array_equal(item, arr[idx])
+    else:
+        for idx in range(1, outer_dim):
+            np.testing.assert_array_equal(t_arr2[idx - 1], arr[idx])
 
 
 @pytest.mark.parametrize(