From d5cf8f66fd2f3b2c37aae664df8a9b77982db2d1 Mon Sep 17 00:00:00 2001 From: Bryan Qiu Date: Thu, 26 Oct 2023 18:01:46 -0700 Subject: [PATCH 1/6] temp Signed-off-by: Bryan Qiu --- mlflow/data/digest_utils.py | 8 +++- mlflow/data/pandas_dataset.py | 3 ++ mlflow/models/evaluation/base.py | 29 ++++++++++++- tests/evaluate/test_default_evaluator.py | 52 +++++++++++++++++++++++- 4 files changed, 88 insertions(+), 4 deletions(-) diff --git a/mlflow/data/digest_utils.py b/mlflow/data/digest_utils.py index bbaa02dc20041..f95b1ccf57a7f 100644 --- a/mlflow/data/digest_utils.py +++ b/mlflow/data/digest_utils.py @@ -28,10 +28,16 @@ def compute_pandas_digest(df) -> str: desired_columns = string_columns.union(numeric_columns) trimmed_df = trimmed_df[desired_columns] + values = pd.util.hash_pandas_object(trimmed_df).values + + for val in values: + if isinstance(val, list): + # turn into a tuple here so it's hashable + pass return get_normalized_md5_digest( [ - pd.util.hash_pandas_object(trimmed_df).values, + values, np.int64(len(df)), ] + [str(x).encode() for x in df.columns] diff --git a/mlflow/data/pandas_dataset.py b/mlflow/data/pandas_dataset.py index 3c765168d169a..d79fef813a1d8 100644 --- a/mlflow/data/pandas_dataset.py +++ b/mlflow/data/pandas_dataset.py @@ -153,6 +153,9 @@ def to_evaluation_dataset(self, path=None, feature_names=None) -> EvaluationData Converts the dataset to an EvaluationDataset for model evaluation. Required for use with mlflow.evaluate(). """ + + print("bbqiu", self._df) + return EvaluationDataset( data=self._df, targets=self._targets, diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index b9236468982fa..ae73a2fab6966 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -377,12 +377,15 @@ def tables(self) -> Dict[str, "pd.DataFrame"]: def _hash_uint64_ndarray_as_bytes(array): + print("skirm12", array) assert len(array.shape) == 1 # see struct pack format string https://docs.python.org/3/library/struct.html#format-strings return struct.pack(f">{array.size}Q", *array) def _hash_ndarray_as_bytes(nd_array): + print("skirm11", nd_array) + print(type(nd_array)) return _hash_uint64_ndarray_as_bytes( pd.util.hash_array(nd_array.flatten(order="C")) ) + _hash_uint64_ndarray_as_bytes(np.array(nd_array.shape, dtype="uint64")) @@ -393,7 +396,9 @@ def _hash_array_like_obj_as_bytes(data): Helper method to convert pandas dataframe/numpy array/list into bytes for MD5 calculation purpose. """ + print("skirm", data) if isinstance(data, pd.DataFrame): + print("skirm2", data) # add checking `'pyspark' in sys.modules` to avoid importing pyspark when user # run code not related to pyspark. 
if "pyspark" in sys.modules: @@ -403,21 +408,31 @@ def _hash_array_like_obj_as_bytes(data): def _hash_array_like_element_as_bytes(v): if spark_vector_type is not None: + print("skirm3", data) if isinstance(v, spark_vector_type): + print("skirm4", data) return _hash_ndarray_as_bytes(v.toArray()) if isinstance(v, np.ndarray): + print("skirm5", data) return _hash_ndarray_as_bytes(v) if isinstance(v, list): + print("skirm6", data) return _hash_ndarray_as_bytes(np.array(v)) + print("skirm7", data) return v data = data.applymap(_hash_array_like_element_as_bytes) return _hash_uint64_ndarray_as_bytes(pd.util.hash_pandas_object(data)) elif isinstance(data, np.ndarray): + print("skirm8", data) + print(type(data)) return _hash_ndarray_as_bytes(data) elif isinstance(data, list): + print("skirm9", data) return _hash_ndarray_as_bytes(np.array(data)) else: + print("skirm10", data) + print(type(data)) raise ValueError("Unsupported data type.") @@ -431,6 +446,8 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): len_bytes = _hash_uint64_ndarray_as_bytes(np.array([len(data)], dtype="uint64")) md5_gen.update(len_bytes) if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: + print("skirm15", data) + print(type(data)) md5_gen.update(_hash_array_like_obj_as_bytes(data)) else: head_rows = data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] @@ -574,12 +591,12 @@ def __init__( data = data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() if has_targets: - self._labels_data = data[targets].to_numpy() + self._labels_data = self._list_to_numpy(data[targets].to_numpy()) self._targets_name = targets self._has_predictions = predictions is not None if self._has_predictions: - self._predictions_data = data[predictions].to_numpy() + self._predictions_data = self._list_to_numpy(data[predictions].to_numpy()) self._predictions_name = predictions if feature_names is not None: @@ -611,6 +628,7 @@ def __init__( if self._labels_data is not None: _gen_md5_for_arraylike_obj(md5_gen, self._labels_data) if self._predictions_data is not None: + print("skirm 16", self._predictions_data) _gen_md5_for_arraylike_obj(md5_gen, self._predictions_data) md5_gen.update(",".join(list(map(str, self._feature_names))).encode("UTF-8")) @@ -704,6 +722,11 @@ def _metadata(self): metadata["path"] = self.path return metadata + def _list_to_numpy(self, data): + if isinstance(data, (list, np.ndarray)): + return np.array([self._list_to_numpy(lst) for lst in data]) + return data + def _log_dataset_tag(self, client, run_id, model_uuid): """ Log dataset metadata as a tag "mlflow.datasets", if the tag already exists, it will @@ -1668,6 +1691,8 @@ def pred_sample(eval_df, _builtin_metrics, _artifacts_dir): from mlflow.pyfunc import PyFuncModel, _load_model_or_server, _ServedPyFuncModel from mlflow.utils import env_manager as _EnvManager + print("bbqiu2", data) + if evaluator_config is not None: col_mapping = evaluator_config.get("col_mapping", {}) diff --git a/tests/evaluate/test_default_evaluator.py b/tests/evaluate/test_default_evaluator.py index 951c923601c03..b49911b2f8375 100644 --- a/tests/evaluate/test_default_evaluator.py +++ b/tests/evaluate/test_default_evaluator.py @@ -3553,4 +3553,54 @@ def test_target_prediction_col_mapping(): } -# bbqiu add tests for recall +def test_temp(): + data = pd.DataFrame( + { + "questions": [ + "What is MLflow?", + "What is Databricks?", + "How to serve a model on Databricks?", + "How to enable MLflow Autologging for my workspace by default?", + ], + "retrieved_context": [ + [ + 
"https://docs.databricks.com/en/mlflow/index.html", + "https://docs.databricks.com/en/mlflow/quick-start.html", + ], + [ + "https://docs.databricks.com/en/introduction/index.html", + "https://docs.databricks.com/en/getting-started/overview.html", + ], + [ + "https://docs.databricks.com/en/machine-learning/model-serving/index.html", + "https://docs.databricks.com/en/machine-learning/model-serving/model-serving-intro.html", + ], + [], + ], + "ground_truth_context": [ + ["https://docs.databricks.com/en/mlflow/index.html"], + ["https://docs.databricks.com/en/introduction/index.html"], + [ + "https://docs.databricks.com/en/machine-learning/model-serving/index.html", + "https://docs.databricks.com/en/machine-learning/model-serving/llm-optimized-model-serving.html", + ], + ["https://docs.databricks.com/en/mlflow/databricks-autologging.html"], + ], + } + ) + + with mlflow.start_run() as run: + mlflow.evaluate( + data=data, + model_type="retriever", + targets="ground_truth_context", + predictions="retrieved_context", + evaluators="default", + evaluator_config={"k": 3}, + ) + run = mlflow.get_run(run.info.run_id) + assert run.data.metrics == { + "precision_at_k/v1/mean": 1, + "precision_at_k/v1/variance": 0, + "precision_at_k/v1/p90": 1, + } From 80dab16ec65bde944d2449fb194647989e088009 Mon Sep 17 00:00:00 2001 From: Bryan Qiu Date: Thu, 26 Oct 2023 18:02:25 -0700 Subject: [PATCH 2/6] revert Signed-off-by: Bryan Qiu --- mlflow/data/digest_utils.py | 8 +--- mlflow/data/pandas_dataset.py | 3 -- mlflow/models/evaluation/base.py | 29 +------------ tests/evaluate/test_default_evaluator.py | 53 ------------------------ 4 files changed, 3 insertions(+), 90 deletions(-) diff --git a/mlflow/data/digest_utils.py b/mlflow/data/digest_utils.py index f95b1ccf57a7f..bbaa02dc20041 100644 --- a/mlflow/data/digest_utils.py +++ b/mlflow/data/digest_utils.py @@ -28,16 +28,10 @@ def compute_pandas_digest(df) -> str: desired_columns = string_columns.union(numeric_columns) trimmed_df = trimmed_df[desired_columns] - values = pd.util.hash_pandas_object(trimmed_df).values - - for val in values: - if isinstance(val, list): - # turn into a tuple here so it's hashable - pass return get_normalized_md5_digest( [ - values, + pd.util.hash_pandas_object(trimmed_df).values, np.int64(len(df)), ] + [str(x).encode() for x in df.columns] diff --git a/mlflow/data/pandas_dataset.py b/mlflow/data/pandas_dataset.py index d79fef813a1d8..3c765168d169a 100644 --- a/mlflow/data/pandas_dataset.py +++ b/mlflow/data/pandas_dataset.py @@ -153,9 +153,6 @@ def to_evaluation_dataset(self, path=None, feature_names=None) -> EvaluationData Converts the dataset to an EvaluationDataset for model evaluation. Required for use with mlflow.evaluate(). 
""" - - print("bbqiu", self._df) - return EvaluationDataset( data=self._df, targets=self._targets, diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index ae73a2fab6966..b9236468982fa 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -377,15 +377,12 @@ def tables(self) -> Dict[str, "pd.DataFrame"]: def _hash_uint64_ndarray_as_bytes(array): - print("skirm12", array) assert len(array.shape) == 1 # see struct pack format string https://docs.python.org/3/library/struct.html#format-strings return struct.pack(f">{array.size}Q", *array) def _hash_ndarray_as_bytes(nd_array): - print("skirm11", nd_array) - print(type(nd_array)) return _hash_uint64_ndarray_as_bytes( pd.util.hash_array(nd_array.flatten(order="C")) ) + _hash_uint64_ndarray_as_bytes(np.array(nd_array.shape, dtype="uint64")) @@ -396,9 +393,7 @@ def _hash_array_like_obj_as_bytes(data): Helper method to convert pandas dataframe/numpy array/list into bytes for MD5 calculation purpose. """ - print("skirm", data) if isinstance(data, pd.DataFrame): - print("skirm2", data) # add checking `'pyspark' in sys.modules` to avoid importing pyspark when user # run code not related to pyspark. if "pyspark" in sys.modules: @@ -408,31 +403,21 @@ def _hash_array_like_obj_as_bytes(data): def _hash_array_like_element_as_bytes(v): if spark_vector_type is not None: - print("skirm3", data) if isinstance(v, spark_vector_type): - print("skirm4", data) return _hash_ndarray_as_bytes(v.toArray()) if isinstance(v, np.ndarray): - print("skirm5", data) return _hash_ndarray_as_bytes(v) if isinstance(v, list): - print("skirm6", data) return _hash_ndarray_as_bytes(np.array(v)) - print("skirm7", data) return v data = data.applymap(_hash_array_like_element_as_bytes) return _hash_uint64_ndarray_as_bytes(pd.util.hash_pandas_object(data)) elif isinstance(data, np.ndarray): - print("skirm8", data) - print(type(data)) return _hash_ndarray_as_bytes(data) elif isinstance(data, list): - print("skirm9", data) return _hash_ndarray_as_bytes(np.array(data)) else: - print("skirm10", data) - print(type(data)) raise ValueError("Unsupported data type.") @@ -446,8 +431,6 @@ def _gen_md5_for_arraylike_obj(md5_gen, data): len_bytes = _hash_uint64_ndarray_as_bytes(np.array([len(data)], dtype="uint64")) md5_gen.update(len_bytes) if len(data) < EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH * 2: - print("skirm15", data) - print(type(data)) md5_gen.update(_hash_array_like_obj_as_bytes(data)) else: head_rows = data[: EvaluationDataset.NUM_SAMPLE_ROWS_FOR_HASH] @@ -591,12 +574,12 @@ def __init__( data = data.limit(EvaluationDataset.SPARK_DATAFRAME_LIMIT).toPandas() if has_targets: - self._labels_data = self._list_to_numpy(data[targets].to_numpy()) + self._labels_data = data[targets].to_numpy() self._targets_name = targets self._has_predictions = predictions is not None if self._has_predictions: - self._predictions_data = self._list_to_numpy(data[predictions].to_numpy()) + self._predictions_data = data[predictions].to_numpy() self._predictions_name = predictions if feature_names is not None: @@ -628,7 +611,6 @@ def __init__( if self._labels_data is not None: _gen_md5_for_arraylike_obj(md5_gen, self._labels_data) if self._predictions_data is not None: - print("skirm 16", self._predictions_data) _gen_md5_for_arraylike_obj(md5_gen, self._predictions_data) md5_gen.update(",".join(list(map(str, self._feature_names))).encode("UTF-8")) @@ -722,11 +704,6 @@ def _metadata(self): metadata["path"] = self.path return metadata - def 
_list_to_numpy(self, data): - if isinstance(data, (list, np.ndarray)): - return np.array([self._list_to_numpy(lst) for lst in data]) - return data - def _log_dataset_tag(self, client, run_id, model_uuid): """ Log dataset metadata as a tag "mlflow.datasets", if the tag already exists, it will @@ -1691,8 +1668,6 @@ def pred_sample(eval_df, _builtin_metrics, _artifacts_dir): from mlflow.pyfunc import PyFuncModel, _load_model_or_server, _ServedPyFuncModel from mlflow.utils import env_manager as _EnvManager - print("bbqiu2", data) - if evaluator_config is not None: col_mapping = evaluator_config.get("col_mapping", {}) diff --git a/tests/evaluate/test_default_evaluator.py b/tests/evaluate/test_default_evaluator.py index b49911b2f8375..a2623a4b97a65 100644 --- a/tests/evaluate/test_default_evaluator.py +++ b/tests/evaluate/test_default_evaluator.py @@ -3551,56 +3551,3 @@ def test_target_prediction_col_mapping(): "correctness/v1/variance": 0.0, "correctness/v1/p90": 3.0, } - - -def test_temp(): - data = pd.DataFrame( - { - "questions": [ - "What is MLflow?", - "What is Databricks?", - "How to serve a model on Databricks?", - "How to enable MLflow Autologging for my workspace by default?", - ], - "retrieved_context": [ - [ - "https://docs.databricks.com/en/mlflow/index.html", - "https://docs.databricks.com/en/mlflow/quick-start.html", - ], - [ - "https://docs.databricks.com/en/introduction/index.html", - "https://docs.databricks.com/en/getting-started/overview.html", - ], - [ - "https://docs.databricks.com/en/machine-learning/model-serving/index.html", - "https://docs.databricks.com/en/machine-learning/model-serving/model-serving-intro.html", - ], - [], - ], - "ground_truth_context": [ - ["https://docs.databricks.com/en/mlflow/index.html"], - ["https://docs.databricks.com/en/introduction/index.html"], - [ - "https://docs.databricks.com/en/machine-learning/model-serving/index.html", - "https://docs.databricks.com/en/machine-learning/model-serving/llm-optimized-model-serving.html", - ], - ["https://docs.databricks.com/en/mlflow/databricks-autologging.html"], - ], - } - ) - - with mlflow.start_run() as run: - mlflow.evaluate( - data=data, - model_type="retriever", - targets="ground_truth_context", - predictions="retrieved_context", - evaluators="default", - evaluator_config={"k": 3}, - ) - run = mlflow.get_run(run.info.run_id) - assert run.data.metrics == { - "precision_at_k/v1/mean": 1, - "precision_at_k/v1/variance": 0, - "precision_at_k/v1/p90": 1, - } From aeb76dfb5f4e6a5c869a52dc3414833e8024fb10 Mon Sep 17 00:00:00 2001 From: Bryan Qiu Date: Thu, 26 Oct 2023 20:06:19 -0700 Subject: [PATCH 3/6] switch to lists Signed-off-by: Bryan Qiu --- docs/source/python_api/mlflow.metrics.rst | 2 +- mlflow/metrics/metric_definitions.py | 34 +++++++++-------------- mlflow/models/evaluation/base.py | 5 ++++ tests/evaluate/test_default_evaluator.py | 20 ++++++------- 4 files changed, 28 insertions(+), 33 deletions(-) diff --git a/docs/source/python_api/mlflow.metrics.rst b/docs/source/python_api/mlflow.metrics.rst index 8a4d6f525afc9..893589c671720 100644 --- a/docs/source/python_api/mlflow.metrics.rst +++ b/docs/source/python_api/mlflow.metrics.rst @@ -110,7 +110,7 @@ your retrieval model. The function should take a Pandas DataFrame containing inp ground-truth relevant doc IDs, and return a DataFrame with a column of retrieved relevant doc IDs. A "doc ID" is a string that uniquely identifies a document. All doc IDs should be entered as a -tuple of doc ID strings. +list of doc ID strings. 
Parameters: diff --git a/mlflow/metrics/metric_definitions.py b/mlflow/metrics/metric_definitions.py index 9211e13927af1..649f9cf699d0d 100644 --- a/mlflow/metrics/metric_definitions.py +++ b/mlflow/metrics/metric_definitions.py @@ -38,24 +38,18 @@ def _validate_text_data(data, metric_name, col_specifier): return True -def _validate_and_fix_text_tuple_data(data, metric_name, column_name): - """Validates that the data is a pandas Series of a tuple of strings and is non-empty""" +def _validate_list_str_data(data, metric_name, col_specifier): + """Validates that the data is a list of lists of strings and is non-empty""" if data is None or len(data) == 0: return False for index, value in data.items(): - if not isinstance(value, tuple) or not all(isinstance(val, str) for val in value): - # Single entry tuples are automatically unpacked by Pandas. - # So if the entry is a string, put it back into a tuple. - if isinstance(value, str): - data[index] = (value,) - else: - _logger.warning( - f"Cannot calculate metric '{metric_name}' for non-tuple[str] inputs. " - f"Row #{index} of column '{column_name}' has a non-tuple[str] value of:" - f"{value}. Skipping metric logging." - ) - return False + if not isinstance(value, list) or not all(isinstance(val, str) for val in value): + _logger.warning( + f"Cannot calculate metric '{metric_name}' for non-list[str] inputs. " + f"Non-list[str] found for {col_specifier} on row {index}. Skipping metric logging." + ) + return False return True @@ -344,9 +338,9 @@ def _f1_score_eval_fn( def _precision_at_k_eval_fn(k): def _fn(predictions, targets): - if not _validate_and_fix_text_tuple_data( - predictions, "precision_at_k", "predictions" - ) or not _validate_and_fix_text_tuple_data(targets, "precision_at_k", "targets"): + if not _validate_list_str_data( + predictions, "precision_at_k", predictions_col_specifier + ) or not _validate_list_str_data(targets, "precision_at_k", targets_col_specifier): return scores = [] @@ -367,11 +361,9 @@ def _fn(predictions, targets): def _recall_at_k_eval_fn(k): def _fn(predictions, targets): - if not _validate_and_fix_text_tuple_data( + if not _validate_list_str_data( predictions, "precision_at_k", predictions_col_specifier - ) or not _validate_and_fix_text_tuple_data( - targets, "precision_at_k", targets_col_specifier - ): + ) or not _validate_list_str_data(targets, "precision_at_k", targets_col_specifier): return scores = [] diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index b9236468982fa..2d2dfa79de63c 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -413,6 +413,11 @@ def _hash_array_like_element_as_bytes(v): data = data.applymap(_hash_array_like_element_as_bytes) return _hash_uint64_ndarray_as_bytes(pd.util.hash_pandas_object(data)) + elif isinstance(data, np.ndarray) and len(data) > 0 and isinstance(data[0], list): + # convert numpy array of lists into numpy array of numpy arrays + # because lists are not hashable + hashable = np.array(json.dumps(val) for val in data) + return _hash_ndarray_as_bytes(hashable) elif isinstance(data, np.ndarray): return _hash_ndarray_as_bytes(data) elif isinstance(data, list): diff --git a/tests/evaluate/test_default_evaluator.py b/tests/evaluate/test_default_evaluator.py index a2623a4b97a65..23ba6da4cb113 100644 --- a/tests/evaluate/test_default_evaluator.py +++ b/tests/evaluate/test_default_evaluator.py @@ -3220,8 +3220,6 @@ def validate_retriever_logged_data(logged_data, k=3): columns = { "question", "retrieved_context", - 
# TODO: fix the logged data to name the model output column "retrieved_context" - # Right now, it's hard-coded "outputs", which is not ideal f"precision_at_{k}/score", f"recall_at_{k}/score", "ground_truth", @@ -3237,10 +3235,10 @@ def validate_retriever_logged_data(logged_data, k=3): def test_evaluate_retriever(): - X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc2")] * 3}) + X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [["doc1", "doc2"]] * 3}) def fn(X): - return pd.DataFrame({"retrieved_context": [("doc1", "doc3", "doc2")] * len(X)}) + return pd.DataFrame({"retrieved_context": [["doc1", "doc3", "doc2"]] * len(X)}) with mlflow.start_run() as run: results = mlflow.evaluate( @@ -3310,9 +3308,9 @@ def fn(X): # test with multiple chunks from same doc def fn2(X): - return pd.DataFrame({"retrieved_context": [("doc1", "doc1", "doc3")] * len(X)}) + return pd.DataFrame({"retrieved_context": [["doc1", "doc1", "doc3"]] * len(X)}) - X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc3")] * 3}) + X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [["doc1", "doc3"]] * 3}) with mlflow.start_run() as run: results = mlflow.evaluate( @@ -3338,7 +3336,7 @@ def fn2(X): # test with empty retrieved doc def fn3(X): - return pd.DataFrame({"output": [()] * len(X)}) + return pd.DataFrame({"output": [[]] * len(X)}) with mlflow.start_run() as run: mlflow.evaluate( @@ -3364,7 +3362,7 @@ def fn3(X): # test with single retrieved doc def fn4(X): - return pd.DataFrame({"output": [("doc1")] * len(X)}) + return pd.DataFrame({"output": [["doc1"]] * len(X)}) with mlflow.start_run() as run: mlflow.evaluate( @@ -3389,7 +3387,7 @@ def fn4(X): } # test with single ground truth doc - X_1 = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1")] * 3}) + X_1 = pd.DataFrame({"question": [["q1?"]] * 3, "ground_truth": [["doc1"]] * 3}) with mlflow.start_run() as run: mlflow.evaluate( @@ -3415,10 +3413,10 @@ def fn4(X): def test_evaluate_retriever_builtin_metrics_no_model_type(): - X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc2")] * 3}) + X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [["doc1", "doc2"]] * 3}) def fn(X): - return {"retrieved_context": [("doc1", "doc3", "doc2")] * len(X)} + return {"retrieved_context": [["doc1", "doc3", "doc2"]] * len(X)} with mlflow.start_run() as run: results = mlflow.evaluate( From 57108066dde6080f8de020e58d9d607103299866 Mon Sep 17 00:00:00 2001 From: Bryan Qiu Date: Thu, 26 Oct 2023 21:18:39 -0700 Subject: [PATCH 4/6] fix tests Signed-off-by: Bryan Qiu --- tests/metrics/test_metric_definitions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/metrics/test_metric_definitions.py b/tests/metrics/test_metric_definitions.py index cf2da6ca2abab..34163aff759b7 100644 --- a/tests/metrics/test_metric_definitions.py +++ b/tests/metrics/test_metric_definitions.py @@ -245,8 +245,8 @@ def test_binary_f1_score(): def test_precision_at_k(): - predictions = pd.Series([("a", "b"), ("c", "d"), ("e"), ("f", "g")]) - targets = pd.Series([("a", "b"), ("c", "b"), ("e"), ("c")]) + predictions = pd.Series([["a", "b"], ["c", "d"], ["e"], ["f", "g"]]) + targets = pd.Series([["a", "b"], ["c", "b"], ["e"], ["c"]]) result = precision_at_k(4).eval_fn(predictions, targets) assert result.scores == [1.0, 0.5, 1.0, 0.0] @@ -258,8 +258,8 @@ def test_precision_at_k(): def test_recall_at_k(): - predictions = pd.Series([("a", "b"), ("c", "d", "e"), (), ("f", "g"), ("a", "a", "a")]) - 
targets = pd.Series([("a", "b", "c", "d"), ("c", "b", "a", "d"), (), (), ("a", "c")]) + predictions = pd.Series([["a", "b"], ["c", "d", "e"], [], ["f", "g"], ["a", "a", "a"]]) + targets = pd.Series([["a", "b", "c", "d"], ["c", "b", "a", "d"], [], [], ["a", "c"]]) result = recall_at_k(4).eval_fn(predictions, targets) assert result.scores == [0.5, 0.5, 1.0, 0.0, 0.5] From 908d187adf88054fa27d176db27a172dde64f37f Mon Sep 17 00:00:00 2001 From: Bryan Qiu Date: Thu, 26 Oct 2023 21:20:10 -0700 Subject: [PATCH 5/6] fix tests Signed-off-by: Bryan Qiu --- tests/metrics/test_metric_definitions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/metrics/test_metric_definitions.py b/tests/metrics/test_metric_definitions.py index 34163aff759b7..d75fa2a8a2eef 100644 --- a/tests/metrics/test_metric_definitions.py +++ b/tests/metrics/test_metric_definitions.py @@ -33,8 +33,6 @@ ari_grade_level(), exact_match(), flesch_kincaid_grade_level(), - precision_at_k(3), - recall_at_k(3), rouge1(), rouge2(), rougeL(), From d96d59dbdfd89d32d6e57b033962cbbf66526546 Mon Sep 17 00:00:00 2001 From: Bryan Qiu Date: Thu, 26 Oct 2023 23:11:24 -0700 Subject: [PATCH 6/6] address comments Signed-off-by: Bryan Qiu --- mlflow/metrics/metric_definitions.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mlflow/metrics/metric_definitions.py b/mlflow/metrics/metric_definitions.py index 649f9cf699d0d..2a491f34086fc 100644 --- a/mlflow/metrics/metric_definitions.py +++ b/mlflow/metrics/metric_definitions.py @@ -46,8 +46,9 @@ def _validate_list_str_data(data, metric_name, col_specifier): for index, value in data.items(): if not isinstance(value, list) or not all(isinstance(val, str) for val in value): _logger.warning( - f"Cannot calculate metric '{metric_name}' for non-list[str] inputs. " - f"Non-list[str] found for {col_specifier} on row {index}. Skipping metric logging." + f"Cannot calculate metric '{metric_name}' for non-list of string inputs. " + f"Non-list of strings found for {col_specifier} on row {index}. Skipping metric " + f"logging." ) return False
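
A minimal usage sketch of the retriever evaluation this series ends up with, drawn from the test data and mlflow.evaluate() call exercised in patches 1 and 3 (doc IDs entered as lists of strings, no model, default evaluator, evaluator_config={"k": 3}); the exact metric keys logged on the run are the ones asserted in those tests and may vary across MLflow versions:

    import mlflow
    import pandas as pd

    # Doc IDs are plain lists of strings, matching the docs change in patch 3.
    eval_data = pd.DataFrame(
        {
            "questions": ["What is MLflow?", "What is Databricks?"],
            "retrieved_context": [
                [
                    "https://docs.databricks.com/en/mlflow/index.html",
                    "https://docs.databricks.com/en/mlflow/quick-start.html",
                ],
                ["https://docs.databricks.com/en/introduction/index.html"],
            ],
            "ground_truth_context": [
                ["https://docs.databricks.com/en/mlflow/index.html"],
                ["https://docs.databricks.com/en/introduction/index.html"],
            ],
        }
    )

    with mlflow.start_run() as run:
        # Static evaluation: no model is passed, the predictions column is scored
        # against the targets column by the built-in precision/recall at k metrics.
        mlflow.evaluate(
            data=eval_data,
            model_type="retriever",
            targets="ground_truth_context",
            predictions="retrieved_context",
            evaluators="default",
            evaluator_config={"k": 3},
        )
        metrics = mlflow.get_run(run.info.run_id).data.metrics
        print(metrics)  # aggregate precision/recall-at-k values logged on the run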
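
Patch 3 also adds a branch to _hash_array_like_obj_as_bytes for numpy arrays whose elements are lists, JSON-encoding each row before hashing. Below is a self-contained sketch of that idea, with the two helper hashers copied from mlflow/models/evaluation/base.py and a hypothetical wrapper name; it builds the encoded column with a list comprehension (the hunk above uses a generator expression) so that np.array receives the materialized JSON strings rather than a generator object:

    import json
    import struct

    import numpy as np
    import pandas as pd

    def _hash_uint64_ndarray_as_bytes(array):
        assert len(array.shape) == 1
        # ">" is big-endian, "Q" is uint64; one field per array element.
        return struct.pack(f">{array.size}Q", *array)

    def _hash_ndarray_as_bytes(nd_array):
        return _hash_uint64_ndarray_as_bytes(
            pd.util.hash_array(nd_array.flatten(order="C"))
        ) + _hash_uint64_ndarray_as_bytes(np.array(nd_array.shape, dtype="uint64"))

    def hash_list_column_as_bytes(data):
        # Each row is a list of doc-ID strings; JSON-encode it so the column
        # becomes a 1-D array of plain strings that pd.util.hash_array accepts.
        hashable = np.array([json.dumps(val) for val in data], dtype=object)
        return _hash_ndarray_as_bytes(hashable)

    retrieved = np.array([["doc1", "doc2"], ["doc1"], []], dtype=object)
    digest_bytes = hash_list_column_as_bytes(retrieved)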