forked from ray-project/ray
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Core] Support Arrow zerocopy serialization in object store (ray-project#35110)

Support Arrow in object store with zerocopy and improve performance.

We made a benchmark under the dataset [NYC TAXI FARE](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/data), which has 8 columns and 55423855 rows in csv, 5.4G on disk. Here are the results:

| Java to Java | Java Write(ms) | Java Read(ms) |
| :-----: | :----: | :----: |
| Before | 23,637 | 3,162 |
| After | 23,320 | 226 |

| Java to Python | Java Write(ms) | Python Read(ms) |
| :---: | :---: | :---: |
| Before | 28,771 | 2,645 |
| After | 25,864 | 8 |

| Python to Java | Python Write(ms) | Java Read(ms) |
| :---: | :---: | :---: |
| Before | 10,597 | 3,386 |
| After | 5,271 | 3,251 |

| Python to Python | Python Write(ms) | Python Read(ms) |
| :---: | :---: | :---: |
| Before | 9,113 | 988 |
| After | 5,636 | 66 |

Benchmark code:

```python
import ray, raydp, time
from pyarrow import csv
import sys

file_path = "FilePath_/train.csv"
# file_path = "FilePath_/train_tiny.csv"

if __name__ == '__main__':
    ray.init()
    write, read = sys.argv[1], sys.argv[2]
    assert write in ("java", "python") and read in ("java", "python"), "Illegal arguments. Please use java or python"
    spark = raydp.init_spark('benchmark', 10, 5, '2G', configs={"spark.default.parallelism": 50})

    if write == "java":
        df = spark.read.format("csv").option("header", "true") \
            .option("inferSchema", "true") \
            .load(f"file://{file_path}")
        print(df.count())
        start = time.time()
        blocks, _ = raydp.spark.dataset._save_spark_df_to_object_store(df, False)
        end = time.time()
        ds = ray.data.from_arrow_refs(blocks)
    elif write == "python":
        table = csv.read_csv(file_path)
        start = time.time()
        ds = ray.data.from_arrow(table)
        end = time.time()
    print(ds.num_blocks())
    ds = ds.repartition(50)
    print(f"{write} writing takes {end - start} seconds.")

    if read == "java":
        start = time.time()
        df = ds.to_spark(spark)
        end = time.time()
        print(df.count())
    elif read == "python":
        start = time.time()
        ray.get(ds.get_internal_block_refs())
        end = time.time()
    print(f"{read} reading takes {end - start} seconds.")

    raydp.stop_spark()
    ray.shutdown()
```
- Loading branch information
Showing
11 changed files
with
289 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
61 changes: 61 additions & 0 deletions
61
java/runtime/src/main/java/io/ray/runtime/util/ArrowUtil.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
package io.ray.runtime.util; | ||
|
||
import io.ray.api.exception.RayException; | ||
import java.io.ByteArrayInputStream; | ||
import java.io.ByteArrayOutputStream; | ||
import java.nio.channels.Channels; | ||
import org.apache.arrow.memory.RootAllocator; | ||
import org.apache.arrow.vector.VectorLoader; | ||
import org.apache.arrow.vector.VectorSchemaRoot; | ||
import org.apache.arrow.vector.ipc.ArrowStreamWriter; | ||
import org.apache.arrow.vector.ipc.ReadChannel; | ||
import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; | ||
import org.apache.arrow.vector.ipc.message.MessageChannelReader; | ||
import org.apache.arrow.vector.ipc.message.MessageResult; | ||
import org.apache.arrow.vector.ipc.message.MessageSerializer; | ||
import org.apache.arrow.vector.types.pojo.Schema; | ||
|
||
/** Helper method for serialize and deserialize arrow data. */ | ||
public class ArrowUtil { | ||
public static final RootAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE); | ||
|
||
/** | ||
* Deserialize data in byte array to arrow data format. | ||
* | ||
* @return The vector schema root of arrow. | ||
*/ | ||
public static VectorSchemaRoot deserialize(byte[] data) { | ||
try (MessageChannelReader reader = | ||
new MessageChannelReader( | ||
new ReadChannel(Channels.newChannel(new ByteArrayInputStream(data))), rootAllocator)) { | ||
MessageResult result = reader.readNext(); | ||
Schema schema = MessageSerializer.deserializeSchema(result.getMessage()); | ||
VectorSchemaRoot root = VectorSchemaRoot.create(schema, rootAllocator); | ||
VectorLoader loader = new VectorLoader(root); | ||
result = reader.readNext(); | ||
ArrowRecordBatch batch = | ||
MessageSerializer.deserializeRecordBatch(result.getMessage(), result.getBodyBuffer()); | ||
loader.load(batch); | ||
return root; | ||
} catch (Exception e) { | ||
throw new RayException("Failed to deserialize Arrow data", e.getCause()); | ||
} | ||
} | ||
|
||
/** | ||
* Serialize data from arrow data format to byte array. | ||
* | ||
* @return The byte array of data. | ||
*/ | ||
public static byte[] serialize(VectorSchemaRoot root) { | ||
try (ByteArrayOutputStream sink = new ByteArrayOutputStream(); | ||
ArrowStreamWriter writer = new ArrowStreamWriter(root, null, sink)) { | ||
writer.start(); | ||
writer.writeBatch(); | ||
writer.end(); | ||
return sink.toByteArray(); | ||
} catch (Exception e) { | ||
throw new RayException("Failed to serialize Arrow data", e.getCause()); | ||
} | ||
} | ||
} |
92 changes: 92 additions & 0 deletions
92
java/test/src/main/java/io/ray/test/CrossLanguageObjectStoreTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
package io.ray.test; | ||
|
||
import io.ray.api.ObjectRef; | ||
import io.ray.api.Ray; | ||
import io.ray.api.function.PyFunction; | ||
import io.ray.runtime.util.ArrowUtil; | ||
import java.io.File; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
import org.apache.arrow.vector.BigIntVector; | ||
import org.apache.arrow.vector.FieldVector; | ||
import org.apache.arrow.vector.VectorSchemaRoot; | ||
import org.apache.arrow.vector.types.pojo.Field; | ||
import org.apache.commons.io.FileUtils; | ||
import org.testng.Assert; | ||
import org.testng.annotations.BeforeClass; | ||
import org.testng.annotations.Test; | ||
|
||
@Test(groups = {"cluster"}) | ||
public class CrossLanguageObjectStoreTest extends BaseTest { | ||
|
||
private static final String PYTHON_MODULE = "test_cross_language_invocation"; | ||
private static final int vecSize = 5; | ||
|
||
@BeforeClass | ||
public void beforeClass() { | ||
// Delete and re-create the temp dir. | ||
File tempDir = | ||
new File( | ||
System.getProperty("java.io.tmpdir") | ||
+ File.separator | ||
+ "ray_cross_language_object_store_test"); | ||
FileUtils.deleteQuietly(tempDir); | ||
tempDir.mkdirs(); | ||
tempDir.deleteOnExit(); | ||
|
||
// Write the test Python file to the temp dir. | ||
InputStream in = | ||
CrossLanguageObjectStoreTest.class.getResourceAsStream( | ||
File.separator + PYTHON_MODULE + ".py"); | ||
File pythonFile = new File(tempDir.getAbsolutePath() + File.separator + PYTHON_MODULE + ".py"); | ||
try { | ||
FileUtils.copyInputStreamToFile(in, pythonFile); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
|
||
System.setProperty( | ||
"ray.job.code-search-path", | ||
System.getProperty("java.class.path") + File.pathSeparator + tempDir.getAbsolutePath()); | ||
} | ||
|
||
@Test | ||
public void testPythonPutAndJavaGet() { | ||
ObjectRef<VectorSchemaRoot> res = | ||
Ray.task(PyFunction.of(PYTHON_MODULE, "py_put_into_object_store", VectorSchemaRoot.class)) | ||
.remote(); | ||
VectorSchemaRoot root = res.get(); | ||
BigIntVector newVector = (BigIntVector) root.getVector(0); | ||
for (int i = 0; i < vecSize; i++) { | ||
Assert.assertEquals(i, newVector.get(i)); | ||
} | ||
} | ||
|
||
@Test | ||
public void testJavaPutAndPythonGet() { | ||
BigIntVector vector = new BigIntVector("ArrowBigIntVector", ArrowUtil.rootAllocator); | ||
vector.setValueCount(vecSize); | ||
for (int i = 0; i < vecSize; i++) { | ||
vector.setSafe(i, i); | ||
} | ||
List<Field> fields = Arrays.asList(vector.getField()); | ||
List<FieldVector> vectors = Arrays.asList(vector); | ||
VectorSchemaRoot root = new VectorSchemaRoot(fields, vectors); | ||
ObjectRef<VectorSchemaRoot> obj = Ray.put(root); | ||
|
||
ObjectRef<VectorSchemaRoot> res = | ||
Ray.task( | ||
PyFunction.of( | ||
PYTHON_MODULE, "py_object_store_get_and_check", VectorSchemaRoot.class), | ||
obj) | ||
.remote(); | ||
|
||
VectorSchemaRoot newRoot = res.get(); | ||
BigIntVector newVector = (BigIntVector) newRoot.getVector(0); | ||
for (int i = 0; i < vecSize; i++) { | ||
Assert.assertEquals(i, newVector.get(i)); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.