From b8daefa982dd4f2af50e9bf7d4e3a1a2d43fd9d7 Mon Sep 17 00:00:00 2001 From: Judah Rand <17158624+judahrand@users.noreply.github.com> Date: Tue, 21 Dec 2021 17:11:49 +0000 Subject: [PATCH] Improve serialization performance (#2165) Signed-off-by: Judah Rand <17158624+judahrand@users.noreply.github.com> --- sdk/python/feast/infra/provider.py | 26 ++++++++++++++++++-------- sdk/python/feast/type_map.py | 4 ++-- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index 8f72a48bd3..797486acd0 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -301,8 +301,14 @@ def _convert_arrow_to_proto( feature_view: FeatureView, join_keys: List[str], ) -> List[Tuple[EntityKeyProto, Dict[str, ValueProto], datetime, Optional[datetime]]]: + # Avoid ChunkedArrays which guarentees `zero_copy_only` availiable. + if isinstance(table, pyarrow.Table): + table = table.to_batches()[0] + # Handle join keys - join_key_values = {k: table.column(k).to_pylist() for k in join_keys} + join_key_values = { + k: table.column(k).to_numpy(zero_copy_only=False) for k in join_keys + } entity_keys = [ EntityKeyProto( join_keys=join_keys, @@ -317,7 +323,7 @@ def _convert_arrow_to_proto( feature_dict = { feature.name: [ python_value_to_proto_value(val, feature.dtype) - for val in table.column(feature.name).to_pylist() + for val in table.column(feature.name).to_numpy(zero_copy_only=False) ] for feature in feature_view.features } @@ -326,18 +332,22 @@ def _convert_arrow_to_proto( # Convert event_timestamps event_timestamps = [ _coerce_datetime(val) - for val in table.column( - feature_view.batch_source.event_timestamp_column - ).to_pylist() + for val in pandas.to_datetime( + table.column(feature_view.batch_source.event_timestamp_column).to_numpy( + zero_copy_only=False + ) + ) ] # Convert created_timestamps if they exist if feature_view.batch_source.created_timestamp_column: created_timestamps = [ _coerce_datetime(val) - for val in table.column( - feature_view.batch_source.created_timestamp_column - ).to_pylist() + for val in pandas.to_datetime( + table.column( + feature_view.batch_source.created_timestamp_column + ).to_numpy(zero_copy_only=False) + ) ] else: created_timestamps = [None] * table.num_rows diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index df8b2c4f05..f985f413ea 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -207,7 +207,7 @@ def _type_err(item, dtype): "double_list_val", [np.float64, np.float32, float], ), - ValueType.INT32_LIST: (Int32List, "int32_list_val", [np.int32, int]), + ValueType.INT32_LIST: (Int32List, "int32_list_val", [np.int64, np.int32, int]), ValueType.INT64_LIST: (Int64List, "int64_list_val", [np.int64, np.int32, int]), ValueType.UNIX_TIMESTAMP_LIST: ( Int64List, @@ -234,7 +234,7 @@ def _type_err(item, dtype): ValueType.DOUBLE: ("double_val", lambda x: x, {float, np.float64}), ValueType.STRING: ("string_val", lambda x: str(x), None), ValueType.BYTES: ("bytes_val", lambda x: x, {bytes}), - ValueType.BOOL: ("bool_val", lambda x: x, {bool}), + ValueType.BOOL: ("bool_val", lambda x: x, {bool, np.bool_}), }