[Core] Support Arrow zerocopy serialization in object store (ray-proj…

…ect#35110) Support Arrow in object store with zerocopy and improve performance. We made a benchmark under the dataset [NYC TAXI FARE](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/data), which has 8 columns and 55423855 rows in csv, 5.4G on disk. Here are the results: | Java to Java | Java Write(ms) | Java Read(ms) | | :-----: | :----: | :----: | | Before | 23,637 | 3,162 | | After | 23,320 | 226 | | Java to Python | Java Write(ms) | Python Read(ms) | | :---: | :---: | :---: | | Before | 28,771 | 2,645 | | After | 25,864 | 8 | | Python to Java | Python Write(ms) | Java Read(ms) | | :---: | :---: | :---: | | Before | 10,597 | 3,386 | | After | 5,271 | 3,251 | | Python to Python | Python Write(ms) | Python Read(ms) | | :---: | :---: | :---: | | Before | 9,113 | 988 | | After | 5,636 | 66 | Benchmark code: ```python import ray, raydp, time from pyarrow import csv import sys file_path = "FilePath_/train.csv" # file_path = "FilePath_/train_tiny.csv" if __name__ == '__main__': ray.init() write, read = sys.argv[1], sys.argv[2] assert write in ("java", "python") and read in ("java", "python"), "Illegal arguments. Please use java or python" spark = raydp.init_spark('benchmark', 10, 5, '2G', configs={"spark.default.parallelism": 50}) if write == "java": df = spark.read.format("csv").option("header", "true") \ .option("inferSchema", "true") \ .load(f"file://{file_path}") print(df.count()) start = time.time() blocks, _ = raydp.spark.dataset._save_spark_df_to_object_store(df, False) end = time.time() ds = ray.data.from_arrow_refs(blocks) elif write == "python": table = csv.read_csv(file_path) start = time.time() ds = ray.data.from_arrow(table) end = time.time() print(ds.num_blocks()) ds = ds.repartition(50) print(f"{write} writing takes {end - start} seconds.") if read == "java": start = time.time() df = ds.to_spark(spark) end = time.time() print(df.count()) elif read == "python": start = time.time() ray.get(ds.get_internal_block_refs()) end = time.time() print(f"{read} reading takes {end - start} seconds.") raydp.stop_spark() ray.shutdown() ```
lmco · Jun 1, 2023 · 158c2bf · 158c2bf
1 parent 1332133
commit 158c2bf
Show file tree

Hide file tree

Showing 11 changed files with 289 additions and 4 deletions.
diff --git a/java/BUILD.bazel b/java/BUILD.bazel
@@ -91,6 +91,9 @@ define_java_module(
         "@maven//:commons_io_commons_io",
         "@maven//:de_ruedigermoeller_fst",
         "@maven//:net_java_dev_jna_jna",
+        "@maven//:org_apache_arrow_arrow_memory_core",
+        "@maven//:org_apache_arrow_arrow_memory_unsafe",
+        "@maven//:org_apache_arrow_arrow_vector",
         "@maven//:org_apache_commons_commons_lang3",
         "@maven//:org_apache_logging_log4j_log4j_api",
         "@maven//:org_apache_logging_log4j_log4j_core",
@@ -117,6 +120,7 @@ define_java_module(
         "@maven//:com_sun_xml_bind_jaxb_impl",
         "@maven//:commons_io_commons_io",
         "@maven//:javax_xml_bind_jaxb_api",
+        "@maven//:org_apache_arrow_arrow_vector",
         "@maven//:org_apache_commons_commons_lang3",
         "@maven//:org_apache_logging_log4j_log4j_api",
         "@maven//:org_apache_logging_log4j_log4j_core",

diff --git a/java/dependencies.bzl b/java/dependencies.bzl
@@ -27,6 +27,9 @@ def gen_java_deps():
             "org.slf4j:slf4j-api:1.7.25",
             "com.lmax:disruptor:3.3.4",
             "net.java.dev.jna:jna:5.8.0",
+            "org.apache.arrow:arrow-memory-core:5.0.0",
+            "org.apache.arrow:arrow-memory-unsafe:5.0.0",
+            "org.apache.arrow:arrow-vector:5.0.0",
             "org.apache.httpcomponents.client5:httpclient5:5.0.3",
             "org.apache.httpcomponents.core5:httpcore5:5.0.2",
             "org.apache.httpcomponents.client5:httpclient5-fluent:5.0.3",

diff --git a/java/runtime/src/main/java/io/ray/runtime/object/ObjectSerializer.java b/java/runtime/src/main/java/io/ray/runtime/object/ObjectSerializer.java
@@ -14,13 +14,15 @@
 import io.ray.runtime.generated.Common.ErrorType;
 import io.ray.runtime.serializer.RayExceptionSerializer;
 import io.ray.runtime.serializer.Serializer;
+import io.ray.runtime.util.ArrowUtil;
 import io.ray.runtime.util.IdUtil;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
+import org.apache.arrow.vector.VectorSchemaRoot;
 import org.apache.commons.lang3.tuple.Pair;
 
 /**
@@ -49,6 +51,7 @@ public class ObjectSerializer {
   private static final byte[] TASK_EXECUTION_EXCEPTION_META =
       String.valueOf(ErrorType.TASK_EXECUTION_EXCEPTION.getNumber()).getBytes();
 
+  public static final byte[] OBJECT_METADATA_TYPE_ARROW = "ARROW".getBytes();
   public static final byte[] OBJECT_METADATA_TYPE_CROSS_LANGUAGE = "XLANG".getBytes();
   public static final byte[] OBJECT_METADATA_TYPE_JAVA = "JAVA".getBytes();
   public static final byte[] OBJECT_METADATA_TYPE_PYTHON = "PYTHON".getBytes();
@@ -80,7 +83,9 @@ public static Object deserialize(
 
     if (meta != null && meta.length > 0) {
       // If meta is not null, deserialize the object from meta.
-      if (Bytes.indexOf(meta, OBJECT_METADATA_TYPE_RAW) == 0) {
+      if (Bytes.indexOf(meta, OBJECT_METADATA_TYPE_ARROW) == 0) {
+        return ArrowUtil.deserialize(data);
+      } else if (Bytes.indexOf(meta, OBJECT_METADATA_TYPE_RAW) == 0) {
         if (objectType == ByteBuffer.class) {
           return ByteBuffer.wrap(data);
         }
@@ -136,6 +141,10 @@ public static NativeRayObject serialize(Object object) {
       // If the object is a byte array, skip serializing it and use a special metadata to
       // indicate it's raw binary. So that this object can also be read by Python.
       return new NativeRayObject((byte[]) object, OBJECT_METADATA_TYPE_RAW);
+    } else if (object instanceof VectorSchemaRoot) {
+      // serialize arrow data using IPC Stream format
+      byte[] bytes = ArrowUtil.serialize((VectorSchemaRoot) object);
+      return new NativeRayObject(bytes, OBJECT_METADATA_TYPE_ARROW);
     } else if (object instanceof ByteBuffer) {
       // Serialize ByteBuffer to raw bytes.
       ByteBuffer buffer = (ByteBuffer) object;

diff --git a/java/runtime/src/main/java/io/ray/runtime/util/ArrowUtil.java b/java/runtime/src/main/java/io/ray/runtime/util/ArrowUtil.java
@@ -0,0 +1,61 @@
+package io.ray.runtime.util;
+
+import io.ray.api.exception.RayException;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.nio.channels.Channels;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.VectorLoader;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.ipc.ArrowStreamWriter;
+import org.apache.arrow.vector.ipc.ReadChannel;
+import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
+import org.apache.arrow.vector.ipc.message.MessageChannelReader;
+import org.apache.arrow.vector.ipc.message.MessageResult;
+import org.apache.arrow.vector.ipc.message.MessageSerializer;
+import org.apache.arrow.vector.types.pojo.Schema;
+
+/** Helper method for serialize and deserialize arrow data. */
+public class ArrowUtil {
+  public static final RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
+
+  /**
+   * Deserialize data in byte array to arrow data format.
+   *
+   * @return The vector schema root of arrow.
+   */
+  public static VectorSchemaRoot deserialize(byte[] data) {
+    try (MessageChannelReader reader =
+        new MessageChannelReader(
+            new ReadChannel(Channels.newChannel(new ByteArrayInputStream(data))), rootAllocator)) {
+      MessageResult result = reader.readNext();
+      Schema schema = MessageSerializer.deserializeSchema(result.getMessage());
+      VectorSchemaRoot root = VectorSchemaRoot.create(schema, rootAllocator);
+      VectorLoader loader = new VectorLoader(root);
+      result = reader.readNext();
+      ArrowRecordBatch batch =
+          MessageSerializer.deserializeRecordBatch(result.getMessage(), result.getBodyBuffer());
+      loader.load(batch);
+      return root;
+    } catch (Exception e) {
+      throw new RayException("Failed to deserialize Arrow data", e.getCause());
+    }
+  }
+
+  /**
+   * Serialize data from arrow data format to byte array.
+   *
+   * @return The byte array of data.
+   */
+  public static byte[] serialize(VectorSchemaRoot root) {
+    try (ByteArrayOutputStream sink = new ByteArrayOutputStream();
+        ArrowStreamWriter writer = new ArrowStreamWriter(root, null, sink)) {
+      writer.start();
+      writer.writeBatch();
+      writer.end();
+      return sink.toByteArray();
+    } catch (Exception e) {
+      throw new RayException("Failed to serialize Arrow data", e.getCause());
+    }
+  }
+}
diff --git a/java/test/src/main/java/io/ray/test/CrossLanguageObjectStoreTest.java b/java/test/src/main/java/io/ray/test/CrossLanguageObjectStoreTest.java
@@ -0,0 +1,92 @@
+package io.ray.test;
+
+import io.ray.api.ObjectRef;
+import io.ray.api.Ray;
+import io.ray.api.function.PyFunction;
+import io.ray.runtime.util.ArrowUtil;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+import org.apache.arrow.vector.BigIntVector;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.commons.io.FileUtils;
+import org.testng.Assert;
+import org.testng.annotations.BeforeClass;
+import org.testng.annotations.Test;
+
+@Test(groups = {"cluster"})
+public class CrossLanguageObjectStoreTest extends BaseTest {
+
+  private static final String PYTHON_MODULE = "test_cross_language_invocation";
+  private static final int vecSize = 5;
+
+  @BeforeClass
+  public void beforeClass() {
+    // Delete and re-create the temp dir.
+    File tempDir =
+        new File(
+            System.getProperty("java.io.tmpdir")
+                + File.separator
+                + "ray_cross_language_object_store_test");
+    FileUtils.deleteQuietly(tempDir);
+    tempDir.mkdirs();
+    tempDir.deleteOnExit();
+
+    // Write the test Python file to the temp dir.
+    InputStream in =
+        CrossLanguageObjectStoreTest.class.getResourceAsStream(
+            File.separator + PYTHON_MODULE + ".py");
+    File pythonFile = new File(tempDir.getAbsolutePath() + File.separator + PYTHON_MODULE + ".py");
+    try {
+      FileUtils.copyInputStreamToFile(in, pythonFile);
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+
+    System.setProperty(
+        "ray.job.code-search-path",
+        System.getProperty("java.class.path") + File.pathSeparator + tempDir.getAbsolutePath());
+  }
+
+  @Test
+  public void testPythonPutAndJavaGet() {
+    ObjectRef<VectorSchemaRoot> res =
+        Ray.task(PyFunction.of(PYTHON_MODULE, "py_put_into_object_store", VectorSchemaRoot.class))
+            .remote();
+    VectorSchemaRoot root = res.get();
+    BigIntVector newVector = (BigIntVector) root.getVector(0);
+    for (int i = 0; i < vecSize; i++) {
+      Assert.assertEquals(i, newVector.get(i));
+    }
+  }
+
+  @Test
+  public void testJavaPutAndPythonGet() {
+    BigIntVector vector = new BigIntVector("ArrowBigIntVector", ArrowUtil.rootAllocator);
+    vector.setValueCount(vecSize);
+    for (int i = 0; i < vecSize; i++) {
+      vector.setSafe(i, i);
+    }
+    List<Field> fields = Arrays.asList(vector.getField());
+    List<FieldVector> vectors = Arrays.asList(vector);
+    VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors);
+    ObjectRef<VectorSchemaRoot> obj = Ray.put(root);
+
+    ObjectRef<VectorSchemaRoot> res =
+        Ray.task(
+                PyFunction.of(
+                    PYTHON_MODULE, "py_object_store_get_and_check", VectorSchemaRoot.class),
+                obj)
+            .remote();
+
+    VectorSchemaRoot newRoot = res.get();
+    BigIntVector newVector = (BigIntVector) newRoot.getVector(0);
+    for (int i = 0; i < vecSize; i++) {
+      Assert.assertEquals(i, newVector.get(i));
+    }
+  }
+}
diff --git a/java/test/src/main/java/io/ray/test/ObjectStoreTest.java b/java/test/src/main/java/io/ray/test/ObjectStoreTest.java
@@ -6,8 +6,14 @@
 import io.ray.api.Ray;
 import io.ray.api.exception.RayTaskException;
 import io.ray.api.exception.UnreconstructableException;
+import io.ray.runtime.util.ArrowUtil;
+import java.util.Arrays;
 import java.util.List;
 import java.util.stream.Collectors;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.IntVector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.types.pojo.Field;
 import org.testng.Assert;
 import org.testng.annotations.Test;
 
@@ -75,6 +81,25 @@ public void testGetMultipleObjects() {
     Assert.assertEquals(ints, Ray.get(refs));
   }
 
+  @Test
+  public void testArrowObjects() {
+    final int vecSize = 10;
+    IntVector vector = new IntVector("ArrowIntVector", ArrowUtil.rootAllocator);
+    vector.setValueCount(vecSize);
+    for (int i = 0; i < vecSize; i++) {
+      vector.setSafe(i, i);
+    }
+    List<Field> fields = Arrays.asList(vector.getField());
+    List<FieldVector> vectors = Arrays.asList(vector);
+    VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors);
+    ObjectRef<VectorSchemaRoot> obj = Ray.put(root);
+    VectorSchemaRoot newRoot = obj.get();
+    IntVector newVector = (IntVector) newRoot.getVector(0);
+    for (int i = 0; i < vecSize; i++) {
+      Assert.assertEquals(i, newVector.get(i));
+    }
+  }
+
   @Test(groups = {"cluster"})
   public void testOwnerAssignWhenPut() throws Exception {
     // This test should align with test_owner_assign_when_put in Python

diff --git a/java/test/src/main/resources/test_cross_language_invocation.py b/java/test/src/main/resources/test_cross_language_invocation.py
@@ -3,6 +3,8 @@
 
 import asyncio
 
+import pyarrow as pa
+
 import ray
 
 
@@ -189,3 +191,35 @@ def py_func_call_java_overloaded_method():
     result = ray.get([ref1, ref2])
     assert result == ["first", "firstsecond"]
     return True
+
+
+@ray.remote
+def py_put_into_object_store():
+    column_values = [0, 1, 2, 3, 4]
+    column_array = pa.array(column_values)
+    table = pa.Table.from_arrays([column_array], names=["ArrowBigIntVector"])
+    return table
+
+
+@ray.remote
+def py_object_store_get_and_check(table):
+    column_values = [0, 1, 2, 3, 4]
+    column_array = pa.array(column_values)
+    expected_table = pa.Table.from_arrays([column_array], names=["ArrowBigIntVector"])
+
+    for column_name in table.column_names:
+        column1 = table[column_name]
+        column2 = expected_table[column_name]
+
+        indices = pa.compute.equal(column1, column2).to_pylist()
+        differing_rows = [i for i, index in enumerate(indices) if not index]
+
+        if differing_rows:
+            print(f"Differences in column '{column_name}':")
+            for row in differing_rows:
+                value1 = column1[row].as_py()
+                value2 = column2[row].as_py()
+                print(f"Row {row}: {value1} != {value2}")
+            raise RuntimeError("Check failed, two tables are not equal!")
+
+    return table
diff --git a/python/ray/_private/ray_constants.py b/python/ray/_private/ray_constants.py
@@ -301,6 +301,8 @@ def env_set_by_user(key):
 OBJECT_METADATA_TYPE_PYTHON = b"PYTHON"
 # A constant used as object metadata to indicate the object is raw bytes.
 OBJECT_METADATA_TYPE_RAW = b"RAW"
+# A constant used as object metadata to indicate the object is arrow data.
+OBJECT_METADATA_TYPE_ARROW = b"ARROW"
 
 # A constant used as object metadata to indicate the object is an actor handle.
 # This value should be synchronized with the Java definition in

diff --git a/python/ray/_private/serialization.py b/python/ray/_private/serialization.py
@@ -8,6 +8,7 @@
 import ray.cloudpickle as pickle
 from ray._private import ray_constants
 from ray._raylet import (
+    ArrowSerializedObject,
     MessagePackSerializedObject,
     MessagePackSerializer,
     ObjectRefGenerator,
@@ -47,6 +48,11 @@
 from ray.util import serialization_addons
 from ray.util import inspect_serializability
 
+try:
+    import pyarrow as pa
+except ImportError:
+    pa = None
+
 logger = logging.getLogger(__name__)
 
 
@@ -270,6 +276,12 @@ def _deserialize_object(self, data, metadata, object_ref):
                 if data is None:
                     return b""
                 return data.to_pybytes()
+            elif metadata_fields[0] == ray_constants.OBJECT_METADATA_TYPE_ARROW:
+                assert (
+                    pa is not None
+                ), "pyarrow should be imported while deserializing arrow objects"
+                reader = pa.BufferReader(data)
+                return pa.ipc.open_stream(reader).read_all()
             elif metadata_fields[0] == ray_constants.OBJECT_METADATA_TYPE_ACTOR_HANDLE:
                 obj = self._deserialize_msgpack_data(data, metadata_fields)
                 return _actor_handle_deserializer(obj)
@@ -461,5 +473,12 @@ def serialize(self, value):
             # use a special metadata to indicate it's raw binary. So
             # that this object can also be read by Java.
             return RawSerializedObject(value)
-        else:
-            return self._serialize_to_msgpack(value)
+
+        # Check whether arrow is installed. If so, use Arrow IPC format
+        # to serialize this object, then it can also be read by Java.
+        if pa is not None and (
+            isinstance(value, pa.Table) or isinstance(value, pa.RecordBatch)
+        ):
+            return ArrowSerializedObject(value)
+
+        return self._serialize_to_msgpack(value)
diff --git a/python/ray/data/_internal/arrow_ops/transform_pyarrow.py b/python/ray/data/_internal/arrow_ops/transform_pyarrow.py
@@ -278,7 +278,9 @@ def combine_chunks(table: "pyarrow.Table") -> "pyarrow.Table":
     cols = table.columns
     new_cols = []
     for col in cols:
-        if _is_column_extension_type(col):
+        if col.num_chunks == 0:
+            arr = pyarrow.chunked_array([], type=col.type)
+        elif _is_column_extension_type(col):
             # Extension arrays don't support concatenation.
             arr = _concatenate_extension_column(col)
         else: