From 92a3441b4ff4b680ba044264e01bfc4888c81145 Mon Sep 17 00:00:00 2001
From: Alex Mirrington <alex.mirrington@rokt.com>
Date: Mon, 24 Jun 2024 19:40:28 +1000
Subject: [PATCH] Fix OnDemandFeatureView type inference for array types

Signed-off-by: Alex Mirrington <alex.mirrington@rokt.com>
---
 .../transformation/pandas_transformation.py   |  26 +-
 .../transformation/python_transformation.py   |   4 +-
 .../substrait_transformation.py               |   4 +-
 sdk/python/feast/type_map.py                  |   1 +
 .../test_on_demand_pandas_transformation.py   | 252 +++++++++++++++++-
 .../test_on_demand_python_transformation.py   | 204 +++++++++++++-
 6 files changed, 477 insertions(+), 14 deletions(-)

diff --git a/sdk/python/feast/transformation/pandas_transformation.py b/sdk/python/feast/transformation/pandas_transformation.py
index e9dab721608..3c8304beb54 100644
--- a/sdk/python/feast/transformation/pandas_transformation.py
+++ b/sdk/python/feast/transformation/pandas_transformation.py
@@ -40,15 +40,25 @@ def infer_features(self, random_input: dict[str, list[Any]]) -> list[Field]:
         df = pd.DataFrame.from_dict(random_input)
         output_df: pd.DataFrame = self.transform(df)
 
-        return [
-            Field(
-                name=f,
-                dtype=from_value_type(
-                    python_type_to_feast_value_type(f, type_name=str(dt))
-                ),
+        fields = []
+        for feature_name, feature_type in zip(output_df.columns, output_df.dtypes):
+            feature_value = output_df[feature_name].tolist()
+            if len(feature_value) <= 0:
+                raise TypeError(
+                    f"Failed to infer type for feature '{feature_name}' with value "
+                    + f"'{feature_value}' since no items were returned by the UDF."
+                )
+            fields.append(
+                Field(
+                    name=feature_name,
+                    dtype=from_value_type(
+                        python_type_to_feast_value_type(
+                            feature_name, value=feature_value[0], type_name=str(feature_type)
+                        )
+                    ),
+                )
             )
-            for f, dt in zip(output_df.columns, output_df.dtypes)
-        ]
+        return fields
 
     def __eq__(self, other):
         if not isinstance(other, PandasTransformation):
diff --git a/sdk/python/feast/transformation/python_transformation.py b/sdk/python/feast/transformation/python_transformation.py
index 2a9c7db8763..dafe3eddf1d 100644
--- a/sdk/python/feast/transformation/python_transformation.py
+++ b/sdk/python/feast/transformation/python_transformation.py
@@ -44,7 +44,9 @@ def infer_features(self, random_input: dict[str, list[Any]]) -> list[Field]:
             Field(
                 name=f,
                 dtype=from_value_type(
-                    python_type_to_feast_value_type(f, type_name=type(dt[0]).__name__)
+                    python_type_to_feast_value_type(
+                        f, value=dt[0], type_name=type(dt[0]).__name__
+                    )
                 ),
             )
             for f, dt in output_dict.items()
diff --git a/sdk/python/feast/transformation/substrait_transformation.py b/sdk/python/feast/transformation/substrait_transformation.py
index 17c40cf0a16..1e0680faf9e 100644
--- a/sdk/python/feast/transformation/substrait_transformation.py
+++ b/sdk/python/feast/transformation/substrait_transformation.py
@@ -64,7 +64,9 @@ def infer_features(self, random_input: dict[str, list[Any]]) -> list[Field]:
             Field(
                 name=f,
                 dtype=from_value_type(
-                    python_type_to_feast_value_type(f, type_name=str(dt))
+                    python_type_to_feast_value_type(
+                        f, value=output_df[f].tolist()[0], type_name=str(dt)
+                    )
                 ),
             )
             for f, dt in zip(output_df.columns, output_df.dtypes)
diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py
index a0859f2f7ad..6ba61fc8c5f 100644
--- a/sdk/python/feast/type_map.py
+++ b/sdk/python/feast/type_map.py
@@ -155,6 +155,7 @@ def python_type_to_feast_value_type(
         "uint16": ValueType.INT32,
         "uint8": ValueType.INT32,
         "int8": ValueType.INT32,
+        "bool_": ValueType.BOOL,  # np.bool_
         "bool": ValueType.BOOL,
         "boolean": ValueType.BOOL,
         "timedelta": ValueType.UNIX_TIMESTAMP,
diff --git a/sdk/python/tests/unit/test_on_demand_pandas_transformation.py b/sdk/python/tests/unit/test_on_demand_pandas_transformation.py
index c5f066dd83d..faf7a06938d 100644
--- a/sdk/python/tests/unit/test_on_demand_pandas_transformation.py
+++ b/sdk/python/tests/unit/test_on_demand_pandas_transformation.py
@@ -1,15 +1,31 @@
 import os
+import re
 import tempfile
 from datetime import datetime, timedelta
 
 import pandas as pd
+import pytest
 
-from feast import Entity, FeatureStore, FeatureView, FileSource, RepoConfig
+from feast import (
+    Entity,
+    FeatureStore,
+    FeatureView,
+    FileSource,
+    RepoConfig,
+    RequestSource,
+)
 from feast.driver_test_data import create_driver_hourly_stats_df
 from feast.field import Field
 from feast.infra.online_stores.sqlite import SqliteOnlineStoreConfig
 from feast.on_demand_feature_view import on_demand_feature_view
-from feast.types import Float32, Float64, Int64
+from feast.types import (
+    Array,
+    Bool,
+    Float32,
+    Float64,
+    Int64,
+    String,
+)
 
 
 def test_pandas_transformation():
@@ -91,3 +107,235 @@ def pandas_view(inputs: pd.DataFrame) -> pd.DataFrame:
         assert online_response["conv_rate_plus_acc"].equals(
             online_response["conv_rate"] + online_response["acc_rate"]
         )
+
+
+def test_pandas_transformation_returning_all_data_types():
+    with tempfile.TemporaryDirectory() as data_dir:
+        store = FeatureStore(
+            config=RepoConfig(
+                project="test_on_demand_python_transformation",
+                registry=os.path.join(data_dir, "registry.db"),
+                provider="local",
+                entity_key_serialization_version=2,
+                online_store=SqliteOnlineStoreConfig(
+                    path=os.path.join(data_dir, "online.db")
+                ),
+            )
+        )
+
+        # Generate test data.
+        end_date = datetime.now().replace(microsecond=0, second=0, minute=0)
+        start_date = end_date - timedelta(days=15)
+
+        driver_entities = [1001, 1002, 1003, 1004, 1005]
+        driver_df = create_driver_hourly_stats_df(driver_entities, start_date, end_date)
+        driver_stats_path = os.path.join(data_dir, "driver_stats.parquet")
+        driver_df.to_parquet(path=driver_stats_path, allow_truncated_timestamps=True)
+
+        driver = Entity(name="driver", join_keys=["driver_id"])
+
+        driver_stats_source = FileSource(
+            name="driver_hourly_stats_source",
+            path=driver_stats_path,
+            timestamp_field="event_timestamp",
+            created_timestamp_column="created",
+        )
+
+        driver_stats_fv = FeatureView(
+            name="driver_hourly_stats",
+            entities=[driver],
+            ttl=timedelta(days=0),
+            schema=[
+                Field(name="conv_rate", dtype=Float32),
+                Field(name="acc_rate", dtype=Float32),
+                Field(name="avg_daily_trips", dtype=Int64),
+            ],
+            online=True,
+            source=driver_stats_source,
+        )
+
+        request_source = RequestSource(
+            name="request_source",
+            schema=[
+                Field(name="avg_daily_trip_rank_thresholds", dtype=Array(Int64)),
+                Field(name="avg_daily_trip_rank_names", dtype=Array(String)),
+            ],
+        )
+
+        @on_demand_feature_view(
+            sources=[request_source, driver_stats_fv],
+            schema=[
+                Field(name="highest_achieved_rank", dtype=String),
+                Field(name="avg_daily_trips_plus_one", dtype=Int64),
+                Field(name="conv_rate_plus_acc", dtype=Float64),
+                Field(name="is_highest_rank", dtype=Bool),
+                Field(name="achieved_ranks", dtype=Array(String)),
+                Field(name="trips_until_next_rank_int", dtype=Array(Int64)),
+                Field(name="trips_until_next_rank_float", dtype=Array(Float64)),
+                Field(name="achieved_ranks_mask", dtype=Array(Bool)),
+            ],
+            mode="pandas",
+        )
+        def pandas_view(inputs: pd.DataFrame) -> pd.DataFrame:
+            df = pd.DataFrame()
+            df["conv_rate_plus_acc"] = inputs["conv_rate"] + inputs["acc_rate"]
+            df["avg_daily_trips_plus_one"] = inputs["avg_daily_trips"] + 1
+
+            df["trips_until_next_rank_int"] = inputs[
+                ["avg_daily_trips", "avg_daily_trip_rank_thresholds"]
+            ].apply(
+                lambda x: [max(threshold - x.iloc[0], 0) for threshold in x.iloc[1]],
+                axis=1,
+            )
+            df["trips_until_next_rank_float"] = df["trips_until_next_rank_int"].map(
+                lambda values: [float(value) for value in values]
+            )
+            df["achieved_ranks_mask"] = df["trips_until_next_rank_int"].map(
+                lambda values: [value <= 0 for value in values]
+            )
+
+            temp = pd.concat(
+                [df[["achieved_ranks_mask"]], inputs[["avg_daily_trip_rank_names"]]],
+                axis=1,
+            )
+            df["achieved_ranks"] = temp.apply(
+                lambda x: [
+                    rank if achieved else "Locked"
+                    for achieved, rank in zip(x.iloc[0], x.iloc[1])
+                ],
+                axis=1,
+            )
+            df["highest_achieved_rank"] = (
+                df["achieved_ranks"]
+                .map(
+                    lambda ranks: str(
+                        ([rank for rank in ranks if rank != "Locked"][-1:] or ["None"])[
+                            0
+                        ]
+                    )
+                )
+                .astype("string")
+            )
+            df["is_highest_rank"] = df["achieved_ranks"].map(
+                lambda ranks: ranks[-1] != "Locked"
+            )
+            return df
+
+        store.apply([driver, driver_stats_source, driver_stats_fv, pandas_view])
+
+        entity_rows = [
+            {
+                "driver_id": 1001,
+                "avg_daily_trip_rank_thresholds": [100, 250, 500, 1000],
+                "avg_daily_trip_rank_names": ["Bronze", "Silver", "Gold", "Platinum"],
+            }
+        ]
+        store.write_to_online_store(
+            feature_view_name="driver_hourly_stats", df=driver_df
+        )
+
+        online_response = store.get_online_features(
+            entity_rows=entity_rows,
+            features=[
+                "driver_hourly_stats:conv_rate",
+                "driver_hourly_stats:acc_rate",
+                "driver_hourly_stats:avg_daily_trips",
+                "pandas_view:avg_daily_trips_plus_one",
+                "pandas_view:conv_rate_plus_acc",
+                "pandas_view:trips_until_next_rank_int",
+                "pandas_view:trips_until_next_rank_float",
+                "pandas_view:achieved_ranks_mask",
+                "pandas_view:achieved_ranks",
+                "pandas_view:highest_achieved_rank",
+                "pandas_view:is_highest_rank",
+            ],
+        ).to_df()
+        # We use to_df here to ensure we use the pandas backend, but convert to a dict for comparisons
+        result = online_response.to_dict(orient="records")[0]
+
+        # Type assertions
+        # Materialized view
+        assert type(result["conv_rate"]) == float
+        assert type(result["acc_rate"]) == float
+        assert type(result["avg_daily_trips"]) == int
+        # On-demand view
+        assert type(result["avg_daily_trips_plus_one"]) == int
+        assert type(result["conv_rate_plus_acc"]) == float
+        assert type(result["highest_achieved_rank"]) == str
+        assert type(result["is_highest_rank"]) == bool
+
+        assert type(result["trips_until_next_rank_int"]) == list
+        assert all([type(e) == int for e in result["trips_until_next_rank_int"]])
+
+        assert type(result["trips_until_next_rank_float"]) == list
+        assert all([type(e) == float for e in result["trips_until_next_rank_float"]])
+
+        assert type(result["achieved_ranks"]) == list
+        assert all([type(e) == str for e in result["achieved_ranks"]])
+
+        assert type(result["achieved_ranks_mask"]) == list
+        assert all([type(e) == bool for e in result["achieved_ranks_mask"]])
+
+        # Value assertions
+        expected_trips_until_next_rank = [
+            max(threshold - result["avg_daily_trips"], 0)
+            for threshold in entity_rows[0]["avg_daily_trip_rank_thresholds"]
+        ]
+        expected_mask = [value <= 0 for value in expected_trips_until_next_rank]
+        expected_ranks = [
+            rank if achieved else "Locked"
+            for achieved, rank in zip(
+                expected_mask, entity_rows[0]["avg_daily_trip_rank_names"]
+            )
+        ]
+        highest_rank = (
+            [rank for rank in expected_ranks if rank != "Locked"][-1:] or ["None"]
+        )[0]
+
+        assert result["conv_rate_plus_acc"] == result["conv_rate"] + result["acc_rate"]
+        assert result["avg_daily_trips_plus_one"] == result["avg_daily_trips"] + 1
+        assert result["highest_achieved_rank"] == highest_rank
+        assert result["is_highest_rank"] == (expected_ranks[-1] != "Locked")
+
+        assert result["trips_until_next_rank_int"] == expected_trips_until_next_rank
+        assert result["trips_until_next_rank_float"] == [
+            float(value) for value in expected_trips_until_next_rank
+        ]
+        assert result["achieved_ranks_mask"] == expected_mask
+        assert result["achieved_ranks"] == expected_ranks
+
+
+def test_invalid_pandas_transformation_raises_type_error_on_apply():
+    with tempfile.TemporaryDirectory() as data_dir:
+        store = FeatureStore(
+            config=RepoConfig(
+                project="test_on_demand_python_transformation",
+                registry=os.path.join(data_dir, "registry.db"),
+                provider="local",
+                entity_key_serialization_version=2,
+                online_store=SqliteOnlineStoreConfig(
+                    path=os.path.join(data_dir, "online.db")
+                ),
+            )
+        )
+
+        request_source = RequestSource(
+            name="request_source",
+            schema=[
+                Field(name="driver_name", dtype=String),
+            ],
+        )
+
+        @on_demand_feature_view(
+            sources=[request_source],
+            schema=[Field(name="driver_name_lower", dtype=String)],
+            mode="pandas",
+        )
+        def pandas_view(inputs: pd.DataFrame) -> pd.DataFrame:
+            return pd.DataFrame({"driver_name_lower": []})
+
+        with pytest.raises(
+            TypeError,
+            match=re.escape("Failed to infer type for feature 'driver_name_lower' with value '[]' since no items were returned by the UDF.")
+        ):
+            store.apply([request_source, pandas_view])
diff --git a/sdk/python/tests/unit/test_on_demand_python_transformation.py b/sdk/python/tests/unit/test_on_demand_python_transformation.py
index 72e9b53a101..793b7cc09b8 100644
--- a/sdk/python/tests/unit/test_on_demand_python_transformation.py
+++ b/sdk/python/tests/unit/test_on_demand_python_transformation.py
@@ -7,12 +7,19 @@
 import pandas as pd
 import pytest
 
-from feast import Entity, FeatureStore, FeatureView, FileSource, RepoConfig
+from feast import (
+    Entity,
+    FeatureStore,
+    FeatureView,
+    FileSource,
+    RepoConfig,
+    RequestSource,
+)
 from feast.driver_test_data import create_driver_hourly_stats_df
 from feast.field import Field
 from feast.infra.online_stores.sqlite import SqliteOnlineStoreConfig
 from feast.on_demand_feature_view import on_demand_feature_view
-from feast.types import Float32, Float64, Int64
+from feast.types import Array, Bool, Float32, Float64, Int64, String
 
 
 class TestOnDemandPythonTransformation(unittest.TestCase):
@@ -248,3 +255,196 @@ def test_python_docs_demo(self):
             + online_python_response["acc_rate"][0]
             == online_python_response["conv_rate_plus_val2_python"][0]
         )
+
+
+class TestOnDemandPythonTransformationAllDataTypes(unittest.TestCase):
+    def setUp(self):
+        with tempfile.TemporaryDirectory() as data_dir:
+            self.store = FeatureStore(
+                config=RepoConfig(
+                    project="test_on_demand_python_transformation",
+                    registry=os.path.join(data_dir, "registry.db"),
+                    provider="local",
+                    entity_key_serialization_version=2,
+                    online_store=SqliteOnlineStoreConfig(
+                        path=os.path.join(data_dir, "online.db")
+                    ),
+                )
+            )
+
+            # Generate test data.
+            end_date = datetime.now().replace(microsecond=0, second=0, minute=0)
+            start_date = end_date - timedelta(days=15)
+
+            driver_entities = [1001, 1002, 1003, 1004, 1005]
+            driver_df = create_driver_hourly_stats_df(
+                driver_entities, start_date, end_date
+            )
+            driver_stats_path = os.path.join(data_dir, "driver_stats.parquet")
+            driver_df.to_parquet(
+                path=driver_stats_path, allow_truncated_timestamps=True
+            )
+
+            driver = Entity(name="driver", join_keys=["driver_id"])
+
+            driver_stats_source = FileSource(
+                name="driver_hourly_stats_source",
+                path=driver_stats_path,
+                timestamp_field="event_timestamp",
+                created_timestamp_column="created",
+            )
+
+            driver_stats_fv = FeatureView(
+                name="driver_hourly_stats",
+                entities=[driver],
+                ttl=timedelta(days=0),
+                schema=[
+                    Field(name="conv_rate", dtype=Float32),
+                    Field(name="acc_rate", dtype=Float32),
+                    Field(name="avg_daily_trips", dtype=Int64),
+                ],
+                online=True,
+                source=driver_stats_source,
+            )
+
+            request_source = RequestSource(
+                name="request_source",
+                schema=[
+                    Field(name="avg_daily_trip_rank_thresholds", dtype=Array(Int64)),
+                    Field(name="avg_daily_trip_rank_names", dtype=Array(String)),
+                ],
+            )
+
+            @on_demand_feature_view(
+                sources=[request_source, driver_stats_fv],
+                schema=[
+                    Field(name="highest_achieved_rank", dtype=String),
+                    Field(name="avg_daily_trips_plus_one", dtype=Int64),
+                    Field(name="conv_rate_plus_acc", dtype=Float64),
+                    Field(name="is_highest_rank", dtype=Bool),
+                    Field(name="achieved_ranks", dtype=Array(String)),
+                    Field(name="trips_until_next_rank_int", dtype=Array(Int64)),
+                    Field(name="trips_until_next_rank_float", dtype=Array(Float64)),
+                    Field(name="achieved_ranks_mask", dtype=Array(Bool)),
+                ],
+                mode="python",
+            )
+            def python_view(inputs: dict[str, Any]) -> dict[str, Any]:
+                output = {}
+                trips_until_next_rank = [
+                    [max(threshold - row[1], 0) for threshold in row[0]]
+                    for row in zip(
+                        inputs["avg_daily_trip_rank_thresholds"],
+                        inputs["avg_daily_trips"],
+                    )
+                ]
+                mask = [[value <= 0 for value in row] for row in trips_until_next_rank]
+                ranks = [
+                    [rank if mask else "Locked" for mask, rank in zip(*row)]
+                    for row in zip(mask, inputs["avg_daily_trip_rank_names"])
+                ]
+                highest_rank = [
+                    ([rank for rank in row if rank != "Locked"][-1:] or ["None"])[0]
+                    for row in ranks
+                ]
+
+                output["conv_rate_plus_acc"] = [
+                    sum(row) for row in zip(inputs["conv_rate"], inputs["acc_rate"])
+                ]
+                output["avg_daily_trips_plus_one"] = [
+                    row + 1 for row in inputs["avg_daily_trips"]
+                ]
+                output["highest_achieved_rank"] = highest_rank
+                output["is_highest_rank"] = [row[-1] != "Locked" for row in ranks]
+
+                output["trips_until_next_rank_int"] = trips_until_next_rank
+                output["trips_until_next_rank_float"] = [
+                    [float(value) for value in row] for row in trips_until_next_rank
+                ]
+                output["achieved_ranks_mask"] = mask
+                output["achieved_ranks"] = ranks
+                return output
+
+            self.store.apply(
+                [driver, driver_stats_source, driver_stats_fv, python_view]
+            )
+            self.store.write_to_online_store(
+                feature_view_name="driver_hourly_stats", df=driver_df
+            )
+
+    def test_python_transformation_returning_all_data_types(self):
+        entity_rows = [
+            {
+                "driver_id": 1001,
+                "avg_daily_trip_rank_thresholds": [100, 250, 500, 1000],
+                "avg_daily_trip_rank_names": ["Bronze", "Silver", "Gold", "Platinum"],
+            }
+        ]
+        online_response = self.store.get_online_features(
+            entity_rows=entity_rows,
+            features=[
+                "driver_hourly_stats:conv_rate",
+                "driver_hourly_stats:acc_rate",
+                "driver_hourly_stats:avg_daily_trips",
+                "python_view:avg_daily_trips_plus_one",
+                "python_view:conv_rate_plus_acc",
+                "python_view:trips_until_next_rank_int",
+                "python_view:trips_until_next_rank_float",
+                "python_view:achieved_ranks_mask",
+                "python_view:achieved_ranks",
+                "python_view:highest_achieved_rank",
+                "python_view:is_highest_rank",
+            ],
+        ).to_dict()
+        result = {name: value[0] for name, value in online_response.items()}
+
+        # Type assertions
+        # Materialized view
+        assert type(result["conv_rate"]) == float
+        assert type(result["acc_rate"]) == float
+        assert type(result["avg_daily_trips"]) == int
+        # On-demand view
+        assert type(result["avg_daily_trips_plus_one"]) == int
+        assert type(result["conv_rate_plus_acc"]) == float
+        assert type(result["highest_achieved_rank"]) == str
+        assert type(result["is_highest_rank"]) == bool
+
+        assert type(result["trips_until_next_rank_int"]) == list
+        assert all([type(e) == int for e in result["trips_until_next_rank_int"]])
+
+        assert type(result["trips_until_next_rank_float"]) == list
+        assert all([type(e) == float for e in result["trips_until_next_rank_float"]])
+
+        assert type(result["achieved_ranks"]) == list
+        assert all([type(e) == str for e in result["achieved_ranks"]])
+
+        assert type(result["achieved_ranks_mask"]) == list
+        assert all([type(e) == bool for e in result["achieved_ranks_mask"]])
+
+        # Value assertions
+        expected_trips_until_next_rank = [
+            max(threshold - result["avg_daily_trips"], 0)
+            for threshold in entity_rows[0]["avg_daily_trip_rank_thresholds"]
+        ]
+        expected_mask = [value <= 0 for value in expected_trips_until_next_rank]
+        expected_ranks = [
+            rank if achieved else "Locked"
+            for achieved, rank in zip(
+                expected_mask, entity_rows[0]["avg_daily_trip_rank_names"]
+            )
+        ]
+        highest_rank = (
+            [rank for rank in expected_ranks if rank != "Locked"][-1:] or ["None"]
+        )[0]
+
+        assert result["conv_rate_plus_acc"] == result["conv_rate"] + result["acc_rate"]
+        assert result["avg_daily_trips_plus_one"] == result["avg_daily_trips"] + 1
+        assert result["highest_achieved_rank"] == highest_rank
+        assert result["is_highest_rank"] == (expected_ranks[-1] != "Locked")
+
+        assert result["trips_until_next_rank_int"] == expected_trips_until_next_rank
+        assert result["trips_until_next_rank_float"] == [
+            float(value) for value in expected_trips_until_next_rank
+        ]
+        assert result["achieved_ranks_mask"] == expected_mask
+        assert result["achieved_ranks"] == expected_ranks