
Commit

Merge pull request #1 from bbqiu/retriever-change-pandas-hash
Retriever change pandas hash
bbqiu committed Oct 27, 2023
2 parents 6d06cbf + d96d59d commit b5963d3
Showing 5 changed files with 33 additions and 39 deletions.
2 changes: 1 addition & 1 deletion docs/source/python_api/mlflow.metrics.rst
@@ -110,7 +110,7 @@ your retrieval model. The function should take a Pandas DataFrame containing inp
 ground-truth relevant doc IDs, and return a DataFrame with a column of retrieved relevant doc IDs.
 
 A "doc ID" is a string that uniquely identifies a document. All doc IDs should be entered as a
-tuple of doc ID strings.
+list of doc ID strings.
 
 Parameters:
 
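To make the documented contract concrete, here is a minimal sketch of a retriever-style eval function that returns doc IDs as a list of strings per row; the column name retrieved_context and the fixed doc IDs mirror the tests later in this commit and are illustrative only:

import pandas as pd


def retrieve(questions: pd.DataFrame) -> pd.DataFrame:
    # Each row's retrieved doc IDs are a list of strings, per the updated docs.
    return pd.DataFrame({"retrieved_context": [["doc1", "doc3", "doc2"]] * len(questions)})


print(retrieve(pd.DataFrame({"question": ["q1?"]})))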
35 changes: 14 additions & 21 deletions mlflow/metrics/metric_definitions.py
@@ -38,24 +38,19 @@ def _validate_text_data(data, metric_name, col_specifier):
     return True
 
 
-def _validate_and_fix_text_tuple_data(data, metric_name, column_name):
-    """Validates that the data is a pandas Series of a tuple of strings and is non-empty"""
+def _validate_list_str_data(data, metric_name, col_specifier):
+    """Validates that the data is a list of lists of strings and is non-empty"""
     if data is None or len(data) == 0:
         return False
 
     for index, value in data.items():
-        if not isinstance(value, tuple) or not all(isinstance(val, str) for val in value):
-            # Single entry tuples are automatically unpacked by Pandas.
-            # So if the entry is a string, put it back into a tuple.
-            if isinstance(value, str):
-                data[index] = (value,)
-            else:
-                _logger.warning(
-                    f"Cannot calculate metric '{metric_name}' for non-tuple[str] inputs. "
-                    f"Row #{index} of column '{column_name}' has a non-tuple[str] value of:"
-                    f"{value}. Skipping metric logging."
-                )
-                return False
+        if not isinstance(value, list) or not all(isinstance(val, str) for val in value):
+            _logger.warning(
+                f"Cannot calculate metric '{metric_name}' for non-list of string inputs. "
+                f"Non-list of strings found for {col_specifier} on row {index}. Skipping metric "
+                f"logging."
+            )
+            return False
 
     return True
 
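The tuple-rewrapping workaround removed above was likely needed because a single-element entry written as ("doc1") is just a parenthesized string in Python, so it reaches the validator as a str rather than a tuple. A quick illustrative check of the list-based contract the new validator enforces (a sketch, not part of this diff):

import pandas as pd

# ("doc1") is a plain string; only ("doc1",) is a one-element tuple.
assert ("doc1") == "doc1"

# One-element lists are unambiguous, which is what the switch to lists buys.
targets = pd.Series([["doc1", "doc2"], ["doc1"]])
assert all(isinstance(v, list) and all(isinstance(s, str) for s in v) for v in targets)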
@@ -344,9 +339,9 @@ def _f1_score_eval_fn(
 
 def _precision_at_k_eval_fn(k):
     def _fn(predictions, targets):
-        if not _validate_and_fix_text_tuple_data(
-            predictions, "precision_at_k", "predictions"
-        ) or not _validate_and_fix_text_tuple_data(targets, "precision_at_k", "targets"):
+        if not _validate_list_str_data(
+            predictions, "precision_at_k", predictions_col_specifier
+        ) or not _validate_list_str_data(targets, "precision_at_k", targets_col_specifier):
             return
 
         scores = []
@@ -367,11 +362,9 @@ def _fn(predictions, targets):
 
 def _recall_at_k_eval_fn(k):
     def _fn(predictions, targets):
-        if not _validate_and_fix_text_tuple_data(
+        if not _validate_list_str_data(
             predictions, "precision_at_k", predictions_col_specifier
-        ) or not _validate_and_fix_text_tuple_data(
-            targets, "precision_at_k", targets_col_specifier
-        ):
+        ) or not _validate_list_str_data(targets, "precision_at_k", targets_col_specifier):
             return
 
         scores = []
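For readers skimming the diff: precision@k is the fraction of the top-k retrieved doc IDs that appear in the ground truth, and recall@k is the fraction of unique ground-truth doc IDs covered by the top-k retrieved IDs. A standalone sketch of those semantics (not MLflow's implementation; the empty-input conventions are inferred from the test assertions later in this commit):

def precision_at_k(retrieved: list, relevant: list, k: int) -> float:
    top_k = retrieved[:k]
    if not top_k:
        return 1.0  # convention assumed here; not exercised by the visible tests
    hits = sum(1 for doc in top_k if doc in relevant)
    return hits / len(top_k)


def recall_at_k(retrieved: list, relevant: list, k: int) -> float:
    if not relevant:
        # Matches the assertions in test_recall_at_k: 1.0 when nothing was
        # retrieved either, 0.0 when docs were retrieved despite empty ground truth.
        return 1.0 if not retrieved else 0.0
    hits = set(retrieved[:k]) & set(relevant)
    return len(hits) / len(set(relevant))


assert precision_at_k(["c", "d"], ["c", "b"], k=4) == 0.5
assert recall_at_k(["a", "a", "a"], ["a", "c"], k=4) == 0.5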
5 changes: 5 additions & 0 deletions mlflow/models/evaluation/base.py
@@ -413,6 +413,11 @@ def _hash_array_like_element_as_bytes(v):
 
         data = data.applymap(_hash_array_like_element_as_bytes)
         return _hash_uint64_ndarray_as_bytes(pd.util.hash_pandas_object(data))
+    elif isinstance(data, np.ndarray) and len(data) > 0 and isinstance(data[0], list):
+        # convert the numpy array of lists into a numpy array of JSON strings,
+        # because lists are not hashable
+        hashable = np.array([json.dumps(val) for val in data])
+        return _hash_ndarray_as_bytes(hashable)
     elif isinstance(data, np.ndarray):
         return _hash_ndarray_as_bytes(data)
     elif isinstance(data, list):
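The new branch above makes list-valued columns hashable by serializing each list to a JSON string before hashing. A small illustrative sketch of the idea outside MLflow (the md5 digest is only a stand-in for the module's private hashing helpers):

import hashlib
import json

import numpy as np

# An object ndarray whose elements are Python lists, e.g. a column of doc IDs.
data = np.empty(2, dtype=object)
data[0] = ["doc1", "doc2"]
data[1] = ["doc3"]

# Lists are unhashable; a list comprehension (not a bare generator, which
# np.array would wrap as a single object) yields an array of JSON strings.
hashable = np.array([json.dumps(val) for val in data])

digest = hashlib.md5("".join(hashable).encode("utf-8")).hexdigest()
print(hashable, digest)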
20 changes: 9 additions & 11 deletions tests/evaluate/test_default_evaluator.py
@@ -3220,8 +3218,6 @@ def validate_retriever_logged_data(logged_data, k=3):
     columns = {
         "question",
         "retrieved_context",
-        # TODO: fix the logged data to name the model output column "retrieved_context"
-        # Right now, it's hard-coded "outputs", which is not ideal
         f"precision_at_{k}/score",
         f"recall_at_{k}/score",
         "ground_truth",
@@ -3237,10 +3235,10 @@ def validate_retriever_logged_data(logged_data, k=3):
 
 
 def test_evaluate_retriever():
-    X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc2")] * 3})
+    X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [["doc1", "doc2"]] * 3})
 
     def fn(X):
-        return pd.DataFrame({"retrieved_context": [("doc1", "doc3", "doc2")] * len(X)})
+        return pd.DataFrame({"retrieved_context": [["doc1", "doc3", "doc2"]] * len(X)})
 
     with mlflow.start_run() as run:
         results = mlflow.evaluate(
@@ -3310,9 +3308,9 @@ def fn(X):
 
     # test with multiple chunks from same doc
     def fn2(X):
-        return pd.DataFrame({"retrieved_context": [("doc1", "doc1", "doc3")] * len(X)})
+        return pd.DataFrame({"retrieved_context": [["doc1", "doc1", "doc3"]] * len(X)})
 
-    X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc3")] * 3})
+    X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [["doc1", "doc3"]] * 3})
 
     with mlflow.start_run() as run:
         results = mlflow.evaluate(
@@ -3338,7 +3336,7 @@ def fn2(X):
 
     # test with empty retrieved doc
     def fn3(X):
-        return pd.DataFrame({"output": [()] * len(X)})
+        return pd.DataFrame({"output": [[]] * len(X)})
 
     with mlflow.start_run() as run:
         mlflow.evaluate(
@@ -3364,7 +3362,7 @@ def fn3(X):
 
     # test with single retrieved doc
     def fn4(X):
-        return pd.DataFrame({"output": [("doc1")] * len(X)})
+        return pd.DataFrame({"output": [["doc1"]] * len(X)})
 
     with mlflow.start_run() as run:
         mlflow.evaluate(
@@ -3389,7 +3387,7 @@ def fn4(X):
     }
 
     # test with single ground truth doc
-    X_1 = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1")] * 3})
+    X_1 = pd.DataFrame({"question": [["q1?"]] * 3, "ground_truth": [["doc1"]] * 3})
 
     with mlflow.start_run() as run:
         mlflow.evaluate(
@@ -3415,10 +3413,10 @@ def fn4(X):
 
 
 def test_evaluate_retriever_builtin_metrics_no_model_type():
-    X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc2")] * 3})
+    X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [["doc1", "doc2"]] * 3})
 
     def fn(X):
-        return {"retrieved_context": [("doc1", "doc3", "doc2")] * len(X)}
+        return {"retrieved_context": [["doc1", "doc3", "doc2"]] * len(X)}
 
     with mlflow.start_run() as run:
         results = mlflow.evaluate(
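For context, the tests above drive these metrics through mlflow.evaluate with the retriever model type. A hedged sketch of that flow using the same data shapes; passing a plain callable as the model and the retriever_k evaluator-config key are assumptions drawn from the surrounding tests rather than something shown verbatim in this diff:

import mlflow
import pandas as pd

X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [["doc1", "doc2"]] * 3})


def fn(X):
    # Retrieved doc IDs are lists of strings, matching the docs change above.
    return pd.DataFrame({"retrieved_context": [["doc1", "doc3", "doc2"]] * len(X)})


with mlflow.start_run():
    results = mlflow.evaluate(
        model=fn,
        data=X,
        targets="ground_truth",
        model_type="retriever",  # enables precision_at_k / recall_at_k
        evaluator_config={"retriever_k": 3},  # assumed config key for choosing k
    )
print(results.metrics)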
10 changes: 4 additions & 6 deletions tests/metrics/test_metric_definitions.py
@@ -33,8 +33,6 @@
         ari_grade_level(),
         exact_match(),
         flesch_kincaid_grade_level(),
-        precision_at_k(3),
-        recall_at_k(3),
         rouge1(),
         rouge2(),
         rougeL(),
@@ -245,8 +243,8 @@ def test_binary_f1_score():
 
 
 def test_precision_at_k():
-    predictions = pd.Series([("a", "b"), ("c", "d"), ("e"), ("f", "g")])
-    targets = pd.Series([("a", "b"), ("c", "b"), ("e"), ("c")])
+    predictions = pd.Series([["a", "b"], ["c", "d"], ["e"], ["f", "g"]])
+    targets = pd.Series([["a", "b"], ["c", "b"], ["e"], ["c"]])
     result = precision_at_k(4).eval_fn(predictions, targets)
 
     assert result.scores == [1.0, 0.5, 1.0, 0.0]
@@ -258,8 +256,8 @@ def test_precision_at_k():
 
 
 def test_recall_at_k():
-    predictions = pd.Series([("a", "b"), ("c", "d", "e"), (), ("f", "g"), ("a", "a", "a")])
-    targets = pd.Series([("a", "b", "c", "d"), ("c", "b", "a", "d"), (), (), ("a", "c")])
+    predictions = pd.Series([["a", "b"], ["c", "d", "e"], [], ["f", "g"], ["a", "a", "a"]])
+    targets = pd.Series([["a", "b", "c", "d"], ["c", "b", "a", "d"], [], [], ["a", "c"]])
     result = recall_at_k(4).eval_fn(predictions, targets)
 
     assert result.scores == [0.5, 0.5, 1.0, 0.0, 0.5]
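A direct usage example of the metric objects on the edge cases that test_recall_at_k pins down (empty retrieved lists and empty ground truth); the import path is assumed from the mlflow.metrics docs page touched above:

import pandas as pd

from mlflow.metrics import recall_at_k  # assumed import path

predictions = pd.Series([[], ["f", "g"]])
targets = pd.Series([[], []])

result = recall_at_k(4).eval_fn(predictions, targets)
print(result.scores)  # [1.0, 0.0], per the assertions in test_recall_at_k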
