Retriever change pandas hash #1

Merged · 6 commits · Oct 27, 2023

Changes from 5 commits
2 changes: 1 addition & 1 deletion docs/source/python_api/mlflow.metrics.rst
@@ -110,7 +110,7 @@ your retrieval model. The function should take a Pandas DataFrame containing inp
ground-truth relevant doc IDs, and return a DataFrame with a column of retrieved relevant doc IDs.

A "doc ID" is a string that uniquely identifies a document. All doc IDs should be entered as a
tuple of doc ID strings.
list of doc ID strings.

Parameters:

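For reference, a minimal sketch of a retrieval function matching the updated wording, returning each row's doc IDs as a list of strings. The column and function names here are illustrative, not taken from the docs page:

import pandas as pd

def retrieve(df: pd.DataFrame) -> pd.DataFrame:
    # one list of doc ID strings per input row
    return pd.DataFrame({"retrieved_context": [["doc1", "doc3", "doc2"]] * len(df)})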
34 changes: 13 additions & 21 deletions mlflow/metrics/metric_definitions.py
@@ -38,24 +38,18 @@ def _validate_text_data(data, metric_name, col_specifier):
return True


def _validate_and_fix_text_tuple_data(data, metric_name, column_name):
"""Validates that the data is a pandas Series of a tuple of strings and is non-empty"""
def _validate_list_str_data(data, metric_name, col_specifier):
"""Validates that the data is a list of lists of strings and is non-empty"""
if data is None or len(data) == 0:
return False

for index, value in data.items():
if not isinstance(value, tuple) or not all(isinstance(val, str) for val in value):
# Single entry tuples are automatically unpacked by Pandas.
# So if the entry is a string, put it back into a tuple.
if isinstance(value, str):
data[index] = (value,)
else:
_logger.warning(
f"Cannot calculate metric '{metric_name}' for non-tuple[str] inputs. "
f"Row #{index} of column '{column_name}' has a non-tuple[str] value of:"
f"{value}. Skipping metric logging."
)
return False
if not isinstance(value, list) or not all(isinstance(val, str) for val in value):
_logger.warning(
f"Cannot calculate metric '{metric_name}' for non-list[str] inputs. "

Review comment: Instead of list[str] can we say "list of string"?

f"Non-list[str] found for {col_specifier} on row {index}. Skipping metric logging."
)
return False

return True
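Roughly, the new validator accepts a pandas Series whose entries are all lists of strings and rejects anything else with a warning, rather than repacking bare strings into tuples as the old helper did. Illustrative values, not from the diff:

import pandas as pd

valid = pd.Series([["doc1", "doc2"], ["doc3"]])   # every entry is a list of strings -> passes
invalid = pd.Series([("doc1", "doc2"), "doc3"])   # tuple and bare-string entries -> warning, returns False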

@@ -344,9 +338,9 @@ def _f1_score_eval_fn(

def _precision_at_k_eval_fn(k):
def _fn(predictions, targets):
if not _validate_and_fix_text_tuple_data(
predictions, "precision_at_k", "predictions"
) or not _validate_and_fix_text_tuple_data(targets, "precision_at_k", "targets"):
if not _validate_list_str_data(
predictions, "precision_at_k", predictions_col_specifier
) or not _validate_list_str_data(targets, "precision_at_k", targets_col_specifier):
return

scores = []
@@ -367,11 +361,9 @@ def _fn(predictions, targets):

def _recall_at_k_eval_fn(k):
def _fn(predictions, targets):
if not _validate_and_fix_text_tuple_data(
if not _validate_list_str_data(
predictions, "precision_at_k", predictions_col_specifier
) or not _validate_and_fix_text_tuple_data(
targets, "precision_at_k", targets_col_specifier
):
) or not _validate_list_str_data(targets, "precision_at_k", targets_col_specifier):
return

scores = []
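The full bodies of the eval functions are collapsed above. As a hedged sketch, the per-row arithmetic is standard precision/recall at k over doc ID lists, consistent with the values asserted in tests/metrics/test_metric_definitions.py below; the edge-case handling shown here is an assumption, not copied from the hidden code:

def precision_at_k_row(retrieved, ground_truth, k):
    # proportion of distinct retrieved doc IDs within the top k that are relevant
    top_k = set(retrieved[:k])
    if not top_k:
        return 0.0  # assumption: empty retrieval scores 0.0
    return len(top_k & set(ground_truth)) / len(top_k)

def recall_at_k_row(retrieved, ground_truth, k):
    # proportion of ground-truth doc IDs recovered within the top k
    if not ground_truth:
        return 1.0 if not retrieved else 0.0
    return len(set(retrieved[:k]) & set(ground_truth)) / len(set(ground_truth))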
5 changes: 5 additions & 0 deletions mlflow/models/evaluation/base.py
@@ -413,6 +413,11 @@ def _hash_array_like_element_as_bytes(v):

data = data.applymap(_hash_array_like_element_as_bytes)
return _hash_uint64_ndarray_as_bytes(pd.util.hash_pandas_object(data))
elif isinstance(data, np.ndarray) and len(data) > 0 and isinstance(data[0], list):
# serialize each list to a JSON string before hashing
# because lists themselves are not hashable
hashable = np.array([json.dumps(val) for val in data])
return _hash_ndarray_as_bytes(hashable)
elif isinstance(data, np.ndarray):
return _hash_ndarray_as_bytes(data)
elif isinstance(data, list):
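A rough, self-contained sketch of the hashing idea this hunk adds. The surrounding _hash_* helpers are collapsed above, so pd.util.hash_array stands in for them here:

import json
import numpy as np
import pandas as pd

data = np.array([["doc1", "doc2"], ["doc3"]], dtype=object)

# lists are not hashable, so serialize each row's list to a JSON string first
serialized = np.array([json.dumps(val) for val in data], dtype=object)

# pandas can then hash the string array deterministically
row_hashes = pd.util.hash_array(serialized)
print(row_hashes)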
23 changes: 9 additions & 14 deletions tests/evaluate/test_default_evaluator.py
@@ -3220,8 +3220,6 @@ def validate_retriever_logged_data(logged_data, k=3):
columns = {
"question",
"retrieved_context",
# TODO: fix the logged data to name the model output column "retrieved_context"
# Right now, it's hard-coded "outputs", which is not ideal
f"precision_at_{k}/score",
f"recall_at_{k}/score",
"ground_truth",
@@ -3237,10 +3235,10 @@ def validate_retriever_logged_data(logged_data, k=3):


def test_evaluate_retriever():
X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc2")] * 3})
X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [["doc1", "doc2"]] * 3})

def fn(X):
return pd.DataFrame({"retrieved_context": [("doc1", "doc3", "doc2")] * len(X)})
return pd.DataFrame({"retrieved_context": [["doc1", "doc3", "doc2"]] * len(X)})

with mlflow.start_run() as run:
results = mlflow.evaluate(
@@ -3310,9 +3308,9 @@ def fn(X):

# test with multiple chunks from same doc
def fn2(X):
return pd.DataFrame({"retrieved_context": [("doc1", "doc1", "doc3")] * len(X)})
return pd.DataFrame({"retrieved_context": [["doc1", "doc1", "doc3"]] * len(X)})

X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc3")] * 3})
X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [["doc1", "doc3"]] * 3})

with mlflow.start_run() as run:
results = mlflow.evaluate(
@@ -3338,7 +3336,7 @@ def fn2(X):

# test with empty retrieved doc
def fn3(X):
return pd.DataFrame({"output": [()] * len(X)})
return pd.DataFrame({"output": [[]] * len(X)})

with mlflow.start_run() as run:
mlflow.evaluate(
@@ -3364,7 +3362,7 @@ def fn3(X):

# test with single retrieved doc
def fn4(X):
return pd.DataFrame({"output": [("doc1")] * len(X)})
return pd.DataFrame({"output": [["doc1"]] * len(X)})

with mlflow.start_run() as run:
mlflow.evaluate(
@@ -3389,7 +3387,7 @@ def fn4(X):
}

# test with single ground truth doc
X_1 = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1")] * 3})
X_1 = pd.DataFrame({"question": [["q1?"]] * 3, "ground_truth": [["doc1"]] * 3})

with mlflow.start_run() as run:
mlflow.evaluate(
@@ -3415,10 +3413,10 @@ def fn4(X):


def test_evaluate_retriever_builtin_metrics_no_model_type():
X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc2")] * 3})
X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [["doc1", "doc2"]] * 3})

def fn(X):
return {"retrieved_context": [("doc1", "doc3", "doc2")] * len(X)}
return {"retrieved_context": [["doc1", "doc3", "doc2"]] * len(X)}

with mlflow.start_run() as run:
results = mlflow.evaluate(
@@ -3551,6 +3549,3 @@ def test_target_prediction_col_mapping():
"correctness/v1/variance": 0.0,
"correctness/v1/p90": 3.0,
}


# bbqiu add tests for recall
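Putting the test changes in context, a condensed usage sketch of evaluating a retriever with list-valued doc ID columns. The exact mlflow.evaluate arguments and the evaluator_config key "retriever_k" are assumptions inferred from these tests, since the actual calls are collapsed above:

import mlflow
import pandas as pd

eval_data = pd.DataFrame(
    {
        "question": ["q1?"] * 3,
        "ground_truth": [["doc1", "doc2"]] * 3,  # doc IDs as lists of strings
    }
)

def retriever(df):
    # hypothetical retriever returning doc IDs as lists of strings
    return pd.DataFrame({"retrieved_context": [["doc1", "doc3", "doc2"]] * len(df)})

with mlflow.start_run():
    results = mlflow.evaluate(
        retriever,
        eval_data,
        model_type="retriever",
        targets="ground_truth",
        evaluator_config={"retriever_k": 3},  # assumed config key for k
    )
    print(results.metrics)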
10 changes: 4 additions & 6 deletions tests/metrics/test_metric_definitions.py
@@ -33,8 +33,6 @@
ari_grade_level(),
exact_match(),
flesch_kincaid_grade_level(),
precision_at_k(3),
recall_at_k(3),
rouge1(),
rouge2(),
rougeL(),
@@ -245,8 +243,8 @@ def test_binary_f1_score():


def test_precision_at_k():
predictions = pd.Series([("a", "b"), ("c", "d"), ("e"), ("f", "g")])
targets = pd.Series([("a", "b"), ("c", "b"), ("e"), ("c")])
predictions = pd.Series([["a", "b"], ["c", "d"], ["e"], ["f", "g"]])
targets = pd.Series([["a", "b"], ["c", "b"], ["e"], ["c"]])
result = precision_at_k(4).eval_fn(predictions, targets)

assert result.scores == [1.0, 0.5, 1.0, 0.0]
@@ -258,8 +256,8 @@ def test_recall_at_k():


def test_recall_at_k():
predictions = pd.Series([("a", "b"), ("c", "d", "e"), (), ("f", "g"), ("a", "a", "a")])
targets = pd.Series([("a", "b", "c", "d"), ("c", "b", "a", "d"), (), (), ("a", "c")])
predictions = pd.Series([["a", "b"], ["c", "d", "e"], [], ["f", "g"], ["a", "a", "a"]])
targets = pd.Series([["a", "b", "c", "d"], ["c", "b", "a", "d"], [], [], ["a", "c"]])
result = recall_at_k(4).eval_fn(predictions, targets)

assert result.scores == [0.5, 0.5, 1.0, 0.0, 0.5]