From d5cf8f66fd2f3b2c37aae664df8a9b77982db2d1 Mon Sep 17 00:00:00 2001 From: Bryan Qiu Date: Thu, 26 Oct 2023 18:01:46 -0700 Subject: [PATCH 1/6] temp Signed-off-by: Bryan Qiu --- mlflow/data/digest_utils.py | 8 +++- mlflow/data/pandas_dataset.py | 3 ++ mlflow/models/evaluation/base.py | 29 ++++++++++++- tests/evaluate/test_default_evaluator.py | 52 +++++++++++++++++++++++- 4 files changed, 88 insertions(+), 4 deletions(-) diff --git a/mlflow/data/digest_utils.py b/mlflow/data/digest_utils.py index bbaa02dc20041..f95b1ccf57a7f 100644 --- a/mlflow/data/digest_utils.py +++ b/mlflow/data/digest_utils.py @@ -28,10 +28,16 @@ def compute_pandas_digest(df) -> str: desired_columns = string_columns.union(numeric_columns) trimmed_df = trimmed_df[desired_columns] + values = pd.util.hash_pandas_object(trimmed_df).values + + for val in values: + if isinstance(val, list): + # turn into a tuple here so it's hashable + pass return get_normalized_md5_digest( [ - pd.util.hash_pandas_object(trimmed_df).values, + values, np.int64(len(df)), ] + [str(x).encode() for x in df.columns] diff --git a/mlflow/data/pandas_dataset.py b/mlflow/data/pandas_dataset.py index 3c765168d169a..d79fef813a1d8 100644 --- a/mlflow/data/pandas_dataset.py +++ b/mlflow/data/pandas_dataset.py @@ -153,6 +153,9 @@ def to_evaluation_dataset(self, path=None, feature_names=None) -> EvaluationData Converts the dataset to an EvaluationDataset for model evaluation. Required for use with mlflow.evaluate(). """ + + print("bbqiu", self._df) + return EvaluationDataset( data=self._df, targets=self._targets, diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index b9236468982fa..ae73a2fab6966 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -377,12 +377,15 @@ def tables(self) -> Dict[str, "pd.DataFrame"]: def _hash_uint64_ndarray_as_bytes(array): + print("skirm12", array) assert len(array.shape) == 1 # see struct pack format string https://docs.python.org/3/library/struct.html#format-strings return struct.pack(f">{array.size}Q", *array) def _hash_ndarray_as_bytes(nd_array): + print("skirm11", nd_array) + print(type(nd_array)) return _hash_uint64_ndarray_as_bytes( pd.util.hash_array(nd_array.flatten(order="C")) ) + _hash_uint64_ndarray_as_bytes(np.array(nd_array.shape, dtype="uint64")) @@ -393,7 +396,9 @@ def _hash_array_like_obj_as_bytes(data): Helper method to convert pandas dataframe/numpy array/list into bytes for MD5 calculation purpose. """ + print("skirm", data) if isinstance(data, pd.DataFrame): + print("skirm2", data) # add checking `'pyspark' in sys.modules` to avoid importing pyspark when user # run code not related to pyspark. 
if "pyspark" in sys.modules: @@ -403,21 +408,31 @@ def _hash_array_like_obj_as_bytes(data): def _hash_array_like_element_as_bytes(v): if spark_vector_type is not None: + print("skirm3", data) if isinstance(v, spark_vector_type): + print("skirm4", data) return _hash_ndarray_as_bytes(v.toArray()) if isinstance(v, np.ndarray): + print("skirm5", data) return _hash_ndarray_as_bytes(v) if isinstance(v, list): + print("skirm6", data) return _hash_ndarray_as_bytes(np.array(v)) + print("skirm7", data) return v data = data.applymap(_hash_array_like_element_as_bytes) return _hash_uint64_ndarray_as_bytes(pd.util.hash_pandas_object(data)) elif isinstance(data, np.ndarray): + print("skirm8", data) + print(type(data)) return _hash_ndarray_as_bytes(data) elif isinstance(data, list): + print("skirm9", data) return _hash_ndarray_as_bytes(np.array(data)) else: + print("skirm10", data) + print(type(data)) raise ValueError("Unsupported data type.") @@ -431,6 +446,8 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): len_bytes = _hash_uint64_ndarray_as_bytes(np.array([len(data)], dtype="uint64")) md5_gen.update(len_bytes) if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: + print("skirm15", data) + print(type(data)) md5_gen.update(_hash_array_like_obj_as_bytes(data)) else: head_rows = data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] @@ -574,12 +591,12 @@ def __init__( data = data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() if has_targets: - self._labels_data = data[targets].to_numpy() + self._labels_data = self._list_to_numpy(data[targets].to_numpy()) self._targets_name = targets self._has_predictions = predictions is not None if self._has_predictions: - self._predictions_data = data[predictions].to_numpy() + self._predictions_data = self._list_to_numpy(data[predictions].to_numpy()) self._predictions_name = predictions if feature_names is not None: @@ -611,6 +628,7 @@ def __init__( if self._labels_data is not None: _gen_md5_for_arraylike_obj(md5_gen, self._labels_data) if self._predictions_data is not None: + print("skirm 16", self._predictions_data) _gen_md5_for_arraylike_obj(md5_gen, self._predictions_data) md5_gen.update(",".join(list(map(str, self._feature_names))).encode("UTF-8")) @@ -704,6 +722,11 @@ def _metadata(self): metadata["path"] = self.path return metadata + def _list_to_numpy(self, data): + if isinstance(data, (list, np.ndarray)): + return np.array([self._list_to_numpy(lst) for lst in data]) + return data + def _log_dataset_tag(self, client, run_id, model_uuid): """ Log dataset metadata as a tag "mlflow.datasets", if the tag already exists, it will @@ -1668,6 +1691,8 @@ def pred_sample(eval_df, _builtin_metrics, _artifacts_dir): from mlflow.pyfunc import PyFuncModel, _load_model_or_server, _ServedPyFuncModel from mlflow.utils import env_manager as _EnvManager + print("bbqiu2", data) + if evaluator_config is not None: col_mapping = evaluator_config.get("col_mapping", {}) diff --git a/tests/evaluate/test_default_evaluator.py b/tests/evaluate/test_default_evaluator.py index 951c923601c03..b49911b2f8375 100644 --- a/tests/evaluate/test_default_evaluator.py +++ b/tests/evaluate/test_default_evaluator.py @@ -3553,4 +3553,54 @@ def test_target_prediction_col_mapping(): } -# bbqiu add tests for recall +def test_temp(): + data = pd.DataFrame( + { + "questions": [ + "What is MLflow?", + "What is Databricks?", + "How to serve a model on Databricks?", + "How to enable MLflow Autologging for my workspace by default?", + ], + "retrieved_context": [ + [ + 
"https://docs.databricks.com/en/mlflow/index.html", + "https://docs.databricks.com/en/mlflow/quick-start.html", + ], + [ + "https://docs.databricks.com/en/introduction/index.html", + "https://docs.databricks.com/en/getting-started/overview.html", + ], + [ + "https://docs.databricks.com/en/machine-learning/model-serving/index.html", + "https://docs.databricks.com/en/machine-learning/model-serving/model-serving-intro.html", + ], + [], + ], + "ground_truth_context": [ + ["https://docs.databricks.com/en/mlflow/index.html"], + ["https://docs.databricks.com/en/introduction/index.html"], + [ + "https://docs.databricks.com/en/machine-learning/model-serving/index.html", + "https://docs.databricks.com/en/machine-learning/model-serving/llm-optimized-model-serving.html", + ], + ["https://docs.databricks.com/en/mlflow/databricks-autologging.html"], + ], + } + ) + + with mlflow.start_run() as run: + mlflow.evaluate( + data=data, + model_type="retriever", + targets="ground_truth_context", + predictions="retrieved_context", + evaluators="default", + evaluator_config={"k": 3}, + ) + run = mlflow.get_run(run.info.run_id) + assert run.data.metrics == { + "precision_at_k/v1/mean": 1, + "precision_at_k/v1/variance": 0, + "precision_at_k/v1/p90": 1, + } From 80dab16ec65bde944d2449fb194647989e088009 Mon Sep 17 00:00:00 2001 From: Bryan Qiu Date: Thu, 26 Oct 2023 18:02:25 -0700 Subject: [PATCH 2/6] revert Signed-off-by: Bryan Qiu --- mlflow/data/digest_utils.py | 8 +--- mlflow/data/pandas_dataset.py | 3 -- mlflow/models/evaluation/base.py | 29 +------------ tests/evaluate/test_default_evaluator.py | 53 ------------------------ 4 files changed, 3 insertions(+), 90 deletions(-) diff --git a/mlflow/data/digest_utils.py b/mlflow/data/digest_utils.py index f95b1ccf57a7f..bbaa02dc20041 100644 --- a/mlflow/data/digest_utils.py +++ b/mlflow/data/digest_utils.py @@ -28,16 +28,10 @@ def compute_pandas_digest(df) -> str: desired_columns = string_columns.union(numeric_columns) trimmed_df = trimmed_df[desired_columns] - values = pd.util.hash_pandas_object(trimmed_df).values - - for val in values: - if isinstance(val, list): - # turn into a tuple here so it's hashable - pass return get_normalized_md5_digest( [ - values, + pd.util.hash_pandas_object(trimmed_df).values, np.int64(len(df)), ] + [str(x).encode() for x in df.columns] diff --git a/mlflow/data/pandas_dataset.py b/mlflow/data/pandas_dataset.py index d79fef813a1d8..3c765168d169a 100644 --- a/mlflow/data/pandas_dataset.py +++ b/mlflow/data/pandas_dataset.py @@ -153,9 +153,6 @@ def to_evaluation_dataset(self, path=None, feature_names=None) -> EvaluationData Converts the dataset to an EvaluationDataset for model evaluation. Required for use with mlflow.evaluate(). 
""" - - print("bbqiu", self._df) - return EvaluationDataset( data=self._df, targets=self._targets, diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index ae73a2fab6966..b9236468982fa 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -377,15 +377,12 @@ def tables(self) -> Dict[str, "pd.DataFrame"]: def _hash_uint64_ndarray_as_bytes(array): - print("skirm12", array) assert len(array.shape) == 1 # see struct pack format string https://docs.python.org/3/library/struct.html#format-strings return struct.pack(f">{array.size}Q", *array) def _hash_ndarray_as_bytes(nd_array): - print("skirm11", nd_array) - print(type(nd_array)) return _hash_uint64_ndarray_as_bytes( pd.util.hash_array(nd_array.flatten(order="C")) ) + _hash_uint64_ndarray_as_bytes(np.array(nd_array.shape, dtype="uint64")) @@ -396,9 +393,7 @@ def _hash_array_like_obj_as_bytes(data): Helper method to convert pandas dataframe/numpy array/list into bytes for MD5 calculation purpose. """ - print("skirm", data) if isinstance(data, pd.DataFrame): - print("skirm2", data) # add checking `'pyspark' in sys.modules` to avoid importing pyspark when user # run code not related to pyspark. if "pyspark" in sys.modules: @@ -408,31 +403,21 @@ def _hash_array_like_obj_as_bytes(data): def _hash_array_like_element_as_bytes(v): if spark_vector_type is not None: - print("skirm3", data) if isinstance(v, spark_vector_type): - print("skirm4", data) return _hash_ndarray_as_bytes(v.toArray()) if isinstance(v, np.ndarray): - print("skirm5", data) return _hash_ndarray_as_bytes(v) if isinstance(v, list): - print("skirm6", data) return _hash_ndarray_as_bytes(np.array(v)) - print("skirm7", data) return v data = data.applymap(_hash_array_like_element_as_bytes) return _hash_uint64_ndarray_as_bytes(pd.util.hash_pandas_object(data)) elif isinstance(data, np.ndarray): - print("skirm8", data) - print(type(data)) return _hash_ndarray_as_bytes(data) elif isinstance(data, list): - print("skirm9", data) return _hash_ndarray_as_bytes(np.array(data)) else: - print("skirm10", data) - print(type(data)) raise ValueError("Unsupported data type.") @@ -446,8 +431,6 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): len_bytes = _hash_uint64_ndarray_as_bytes(np.array([len(data)], dtype="uint64")) md5_gen.update(len_bytes) if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: - print("skirm15", data) - print(type(data)) md5_gen.update(_hash_array_like_obj_as_bytes(data)) else: head_rows = data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] @@ -591,12 +574,12 @@ def __init__( data = data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() if has_targets: - self._labels_data = self._list_to_numpy(data[targets].to_numpy()) + self._labels_data = data[targets].to_numpy() self._targets_name = targets self._has_predictions = predictions is not None if self._has_predictions: - self._predictions_data = self._list_to_numpy(data[predictions].to_numpy()) + self._predictions_data = data[predictions].to_numpy() self._predictions_name = predictions if feature_names is not None: @@ -628,7 +611,6 @@ def __init__( if self._labels_data is not None: _gen_md5_for_arraylike_obj(md5_gen, self._labels_data) if self._predictions_data is not None: - print("skirm 16", self._predictions_data) _gen_md5_for_arraylike_obj(md5_gen, self._predictions_data) md5_gen.update(",".join(list(map(str, self._feature_names))).encode("UTF-8")) @@ -722,11 +704,6 @@ def _metadata(self): metadata["path"] = self.path return metadata - def 
_list_to_numpy(self, data): - if isinstance(data, (list, np.ndarray)): - return np.array([self._list_to_numpy(lst) for lst in data]) - return data - def _log_dataset_tag(self, client, run_id, model_uuid): """ Log dataset metadata as a tag "mlflow.datasets", if the tag already exists, it will @@ -1691,8 +1668,6 @@ def pred_sample(eval_df, _builtin_metrics, _artifacts_dir): from mlflow.pyfunc import PyFuncModel, _load_model_or_server, _ServedPyFuncModel from mlflow.utils import env_manager as _EnvManager - print("bbqiu2", data) - if evaluator_config is not None: col_mapping = evaluator_config.get("col_mapping", {}) diff --git a/tests/evaluate/test_default_evaluator.py b/tests/evaluate/test_default_evaluator.py index b49911b2f8375..a2623a4b97a65 100644 --- a/tests/evaluate/test_default_evaluator.py +++ b/tests/evaluate/test_default_evaluator.py @@ -3551,56 +3551,3 @@ def test_target_prediction_col_mapping(): "correctness/v1/variance": 0.0, "correctness/v1/p90": 3.0, } - - -def test_temp(): - data = pd.DataFrame( - { - "questions": [ - "What is MLflow?", - "What is Databricks?", - "How to serve a model on Databricks?", - "How to enable MLflow Autologging for my workspace by default?", - ], - "retrieved_context": [ - [ - "https://docs.databricks.com/en/mlflow/index.html", - "https://docs.databricks.com/en/mlflow/quick-start.html", - ], - [ - "https://docs.databricks.com/en/introduction/index.html", - "https://docs.databricks.com/en/getting-started/overview.html", - ], - [ - "https://docs.databricks.com/en/machine-learning/model-serving/index.html", - "https://docs.databricks.com/en/machine-learning/model-serving/model-serving-intro.html", - ], - [], - ], - "ground_truth_context": [ - ["https://docs.databricks.com/en/mlflow/index.html"], - ["https://docs.databricks.com/en/introduction/index.html"], - [ - "https://docs.databricks.com/en/machine-learning/model-serving/index.html", - "https://docs.databricks.com/en/machine-learning/model-serving/llm-optimized-model-serving.html", - ], - ["https://docs.databricks.com/en/mlflow/databricks-autologging.html"], - ], - } - ) - - with mlflow.start_run() as run: - mlflow.evaluate( - data=data, - model_type="retriever", - targets="ground_truth_context", - predictions="retrieved_context", - evaluators="default", - evaluator_config={"k": 3}, - ) - run = mlflow.get_run(run.info.run_id) - assert run.data.metrics == { - "precision_at_k/v1/mean": 1, - "precision_at_k/v1/variance": 0, - "precision_at_k/v1/p90": 1, - } From aeb76dfb5f4e6a5c869a52dc3414833e8024fb10 Mon Sep 17 00:00:00 2001 From: Bryan Qiu Date: Thu, 26 Oct 2023 20:06:19 -0700 Subject: [PATCH 3/6] switch to lists Signed-off-by: Bryan Qiu --- docs/source/python_api/mlflow.metrics.rst | 2 +- mlflow/metrics/metric_definitions.py | 34 +++++++++-------------- mlflow/models/evaluation/base.py | 5 ++++ tests/evaluate/test_default_evaluator.py | 20 ++++++------- 4 files changed, 28 insertions(+), 33 deletions(-) diff --git a/docs/source/python_api/mlflow.metrics.rst b/docs/source/python_api/mlflow.metrics.rst index 8a4d6f525afc9..893589c671720 100644 --- a/docs/source/python_api/mlflow.metrics.rst +++ b/docs/source/python_api/mlflow.metrics.rst @@ -110,7 +110,7 @@ your retrieval model. The function should take a Pandas DataFrame containing inp ground-truth relevant doc IDs, and return a DataFrame with a column of retrieved relevant doc IDs. A "doc ID" is a string that uniquely identifies a document. All doc IDs should be entered as a -tuple of doc ID strings. +list of doc ID strings. 
Parameters: diff --git a/mlflow/metrics/metric_definitions.py b/mlflow/metrics/metric_definitions.py index 9211e13927af1..649f9cf699d0d 100644 --- a/mlflow/metrics/metric_definitions.py +++ b/mlflow/metrics/metric_definitions.py @@ -38,24 +38,18 @@ def _validate_text_data(data, metric_name, col_specifier): return True -def _validate_and_fix_text_tuple_data(data, metric_name, column_name): - """Validates that the data is a pandas Series of a tuple of strings and is non-empty""" +def _validate_list_str_data(data, metric_name, col_specifier): + """Validates that the data is a list of lists of strings and is non-empty""" if data is None or len(data) == 0: return False for index, value in data.items(): - if not isinstance(value, tuple) or not all(isinstance(val, str) for val in value): - # Single entry tuples are automatically unpacked by Pandas. - # So if the entry is a string, put it back into a tuple. - if isinstance(value, str): - data[index] = (value,) - else: - _logger.warning( - f"Cannot calculate metric '{metric_name}' for non-tuple[str] inputs. " - f"Row #{index} of column '{column_name}' has a non-tuple[str] value of:" - f"{value}. Skipping metric logging." - ) - return False + if not isinstance(value, list) or not all(isinstance(val, str) for val in value): + _logger.warning( + f"Cannot calculate metric '{metric_name}' for non-list[str] inputs. " + f"Non-list[str] found for {col_specifier} on row {index}. Skipping metric logging." + ) + return False return True @@ -344,9 +338,9 @@ def _f1_score_eval_fn( def _precision_at_k_eval_fn(k): def _fn(predictions, targets): - if not _validate_and_fix_text_tuple_data( - predictions, "precision_at_k", "predictions" - ) or not _validate_and_fix_text_tuple_data(targets, "precision_at_k", "targets"): + if not _validate_list_str_data( + predictions, "precision_at_k", predictions_col_specifier + ) or not _validate_list_str_data(targets, "precision_at_k", targets_col_specifier): return scores = [] @@ -367,11 +361,9 @@ def _fn(predictions, targets): def _recall_at_k_eval_fn(k): def _fn(predictions, targets): - if not _validate_and_fix_text_tuple_data( + if not _validate_list_str_data( predictions, "precision_at_k", predictions_col_specifier - ) or not _validate_and_fix_text_tuple_data( - targets, "precision_at_k", targets_col_specifier - ): + ) or not _validate_list_str_data(targets, "precision_at_k", targets_col_specifier): return scores = [] diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index b9236468982fa..2d2dfa79de63c 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -413,6 +413,11 @@ def _hash_array_like_element_as_bytes(v): data = data.applymap(_hash_array_like_element_as_bytes) return _hash_uint64_ndarray_as_bytes(pd.util.hash_pandas_object(data)) + elif isinstance(data, np.ndarray) and len(data) > 0 and isinstance(data[0], list): + # convert numpy array of lists into numpy array of numpy arrays + # because lists are not hashable + hashable = np.array(json.dumps(val) for val in data) + return _hash_ndarray_as_bytes(hashable) elif isinstance(data, np.ndarray): return _hash_ndarray_as_bytes(data) elif isinstance(data, list): diff --git a/tests/evaluate/test_default_evaluator.py b/tests/evaluate/test_default_evaluator.py index a2623a4b97a65..23ba6da4cb113 100644 --- a/tests/evaluate/test_default_evaluator.py +++ b/tests/evaluate/test_default_evaluator.py @@ -3220,8 +3220,6 @@ def validate_retriever_logged_data(logged_data, k=3): columns = { "question", "retrieved_context", - 
# TODO: fix the logged data to name the model output column "retrieved_context" - # Right now, it's hard-coded "outputs", which is not ideal f"precision_at_{k}/score", f"recall_at_{k}/score", "ground_truth", @@ -3237,10 +3235,10 @@ def validate_retriever_logged_data(logged_data, k=3): def test_evaluate_retriever(): - X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc2")] * 3}) + X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [["doc1", "doc2"]] * 3}) def fn(X): - return pd.DataFrame({"retrieved_context": [("doc1", "doc3", "doc2")] * len(X)}) + return pd.DataFrame({"retrieved_context": [["doc1", "doc3", "doc2"]] * len(X)}) with mlflow.start_run() as run: results = mlflow.evaluate( @@ -3310,9 +3308,9 @@ def fn(X): # test with multiple chunks from same doc def fn2(X): - return pd.DataFrame({"retrieved_context": [("doc1", "doc1", "doc3")] * len(X)}) + return pd.DataFrame({"retrieved_context": [["doc1", "doc1", "doc3"]] * len(X)}) - X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc3")] * 3}) + X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [["doc1", "doc3"]] * 3}) with mlflow.start_run() as run: results = mlflow.evaluate( @@ -3338,7 +3336,7 @@ def fn2(X): # test with empty retrieved doc def fn3(X): - return pd.DataFrame({"output": [()] * len(X)}) + return pd.DataFrame({"output": [[]] * len(X)}) with mlflow.start_run() as run: mlflow.evaluate( @@ -3364,7 +3362,7 @@ def fn3(X): # test with single retrieved doc def fn4(X): - return pd.DataFrame({"output": [("doc1")] * len(X)}) + return pd.DataFrame({"output": [["doc1"]] * len(X)}) with mlflow.start_run() as run: mlflow.evaluate( @@ -3389,7 +3387,7 @@ def fn4(X): } # test with single ground truth doc - X_1 = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1")] * 3}) + X_1 = pd.DataFrame({"question": [["q1?"]] * 3, "ground_truth": [["doc1"]] * 3}) with mlflow.start_run() as run: mlflow.evaluate( @@ -3415,10 +3413,10 @@ def fn4(X): def test_evaluate_retriever_builtin_metrics_no_model_type(): - X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc2")] * 3}) + X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [["doc1", "doc2"]] * 3}) def fn(X): - return {"retrieved_context": [("doc1", "doc3", "doc2")] * len(X)} + return {"retrieved_context": [["doc1", "doc3", "doc2"]] * len(X)} with mlflow.start_run() as run: results = mlflow.evaluate( From 57108066dde6080f8de020e58d9d607103299866 Mon Sep 17 00:00:00 2001 From: Bryan Qiu Date: Thu, 26 Oct 2023 21:18:39 -0700 Subject: [PATCH 4/6] fix tests Signed-off-by: Bryan Qiu --- tests/metrics/test_metric_definitions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/metrics/test_metric_definitions.py b/tests/metrics/test_metric_definitions.py index cf2da6ca2abab..34163aff759b7 100644 --- a/tests/metrics/test_metric_definitions.py +++ b/tests/metrics/test_metric_definitions.py @@ -245,8 +245,8 @@ def test_binary_f1_score(): def test_precision_at_k(): - predictions = pd.Series([("a", "b"), ("c", "d"), ("e"), ("f", "g")]) - targets = pd.Series([("a", "b"), ("c", "b"), ("e"), ("c")]) + predictions = pd.Series([["a", "b"], ["c", "d"], ["e"], ["f", "g"]]) + targets = pd.Series([["a", "b"], ["c", "b"], ["e"], ["c"]]) result = precision_at_k(4).eval_fn(predictions, targets) assert result.scores == [1.0, 0.5, 1.0, 0.0] @@ -258,8 +258,8 @@ def test_precision_at_k(): def test_recall_at_k(): - predictions = pd.Series([("a", "b"), ("c", "d", "e"), (), ("f", "g"), ("a", "a", "a")]) - 
targets = pd.Series([("a", "b", "c", "d"), ("c", "b", "a", "d"), (), (), ("a", "c")]) + predictions = pd.Series([["a", "b"], ["c", "d", "e"], [], ["f", "g"], ["a", "a", "a"]]) + targets = pd.Series([["a", "b", "c", "d"], ["c", "b", "a", "d"], [], [], ["a", "c"]]) result = recall_at_k(4).eval_fn(predictions, targets) assert result.scores == [0.5, 0.5, 1.0, 0.0, 0.5] From 908d187adf88054fa27d176db27a172dde64f37f Mon Sep 17 00:00:00 2001 From: Bryan Qiu Date: Thu, 26 Oct 2023 21:20:10 -0700 Subject: [PATCH 5/6] fix tests Signed-off-by: Bryan Qiu --- tests/metrics/test_metric_definitions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/metrics/test_metric_definitions.py b/tests/metrics/test_metric_definitions.py index 34163aff759b7..d75fa2a8a2eef 100644 --- a/tests/metrics/test_metric_definitions.py +++ b/tests/metrics/test_metric_definitions.py @@ -33,8 +33,6 @@ ari_grade_level(), exact_match(), flesch_kincaid_grade_level(), - precision_at_k(3), - recall_at_k(3), rouge1(), rouge2(), rougeL(), From d96d59dbdfd89d32d6e57b033962cbbf66526546 Mon Sep 17 00:00:00 2001 From: Bryan Qiu Date: Thu, 26 Oct 2023 23:11:24 -0700 Subject: [PATCH 6/6] address comments Signed-off-by: Bryan Qiu --- mlflow/metrics/metric_definitions.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mlflow/metrics/metric_definitions.py b/mlflow/metrics/metric_definitions.py index 649f9cf699d0d..2a491f34086fc 100644 --- a/mlflow/metrics/metric_definitions.py +++ b/mlflow/metrics/metric_definitions.py @@ -46,8 +46,9 @@ def _validate_list_str_data(data, metric_name, col_specifier): for index, value in data.items(): if not isinstance(value, list) or not all(isinstance(val, str) for val in value): _logger.warning( - f"Cannot calculate metric '{metric_name}' for non-list[str] inputs. " - f"Non-list[str] found for {col_specifier} on row {index}. Skipping metric logging." + f"Cannot calculate metric '{metric_name}' for non-list of string inputs. " + f"Non-list of strings found for {col_specifier} on row {index}. Skipping metric " + f"logging." ) return False
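
A minimal usage sketch of the retriever evaluation this series ends up with, drawn from the test data and mlflow.evaluate() call exercised in patches 1 and 3 (doc IDs entered as lists of strings, no model, default evaluator, evaluator_config={"k": 3}); the exact metric keys logged on the run are the ones asserted in those tests and may vary across MLflow versions:

    import mlflow
    import pandas as pd

    # Doc IDs are plain lists of strings, matching the docs change in patch 3.
    eval_data = pd.DataFrame(
        {
            "questions": ["What is MLflow?", "What is Databricks?"],
            "retrieved_context": [
                [
                    "https://docs.databricks.com/en/mlflow/index.html",
                    "https://docs.databricks.com/en/mlflow/quick-start.html",
                ],
                ["https://docs.databricks.com/en/introduction/index.html"],
            ],
            "ground_truth_context": [
                ["https://docs.databricks.com/en/mlflow/index.html"],
                ["https://docs.databricks.com/en/introduction/index.html"],
            ],
        }
    )

    with mlflow.start_run() as run:
        # Static evaluation: no model is passed, the predictions column is scored
        # against the targets column by the built-in precision/recall at k metrics.
        mlflow.evaluate(
            data=eval_data,
            model_type="retriever",
            targets="ground_truth_context",
            predictions="retrieved_context",
            evaluators="default",
            evaluator_config={"k": 3},
        )
        metrics = mlflow.get_run(run.info.run_id).data.metrics
        print(metrics)  # aggregate precision/recall-at-k values logged on the run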
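
Patch 3 also adds a branch to _hash_array_like_obj_as_bytes for numpy arrays whose elements are lists, JSON-encoding each row before hashing. Below is a self-contained sketch of that idea, with the two helper hashers copied from mlflow/models/evaluation/base.py and a hypothetical wrapper name; it builds the encoded column with a list comprehension (the hunk above uses a generator expression) so that np.array receives the materialized JSON strings rather than a generator object:

    import json
    import struct

    import numpy as np
    import pandas as pd

    def _hash_uint64_ndarray_as_bytes(array):
        assert len(array.shape) == 1
        # ">" is big-endian, "Q" is uint64; one field per array element.
        return struct.pack(f">{array.size}Q", *array)

    def _hash_ndarray_as_bytes(nd_array):
        return _hash_uint64_ndarray_as_bytes(
            pd.util.hash_array(nd_array.flatten(order="C"))
        ) + _hash_uint64_ndarray_as_bytes(np.array(nd_array.shape, dtype="uint64"))

    def hash_list_column_as_bytes(data):
        # Each row is a list of doc-ID strings; JSON-encode it so the column
        # becomes a 1-D array of plain strings that pd.util.hash_array accepts.
        hashable = np.array([json.dumps(val) for val in data], dtype=object)
        return _hash_ndarray_as_bytes(hashable)

    retrieved = np.array([["doc1", "doc2"], ["doc1"], []], dtype=object)
    digest_bytes = hash_list_column_as_bytes(retrieved)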