Refactor ranking metric map to be the same as Spark's #2004

Merged on Nov 3, 2023 (39 commits)

Commits
530ee2a  Announcement LF (miguelgfierro, Sep 28, 2023)
f2301f7  Update email (miguelgfierro, Sep 29, 2023)
505f864  Update README.md (anargyri, Sep 28, 2023)
6756701  security (miguelgfierro, Sep 29, 2023)
462f9b2  license and contribution notice (miguelgfierro, Sep 29, 2023)
20a62ff  update author link (miguelgfierro, Sep 29, 2023)
9bdb3f9  Add new code of conduct from LF (miguelgfierro, Sep 29, 2023)
27f7674  Replacing references GRU4Rec to GRU (miguelgfierro, Sep 27, 2023)
2025f95  Replacing references GRU4Rec to GRU (miguelgfierro, Sep 27, 2023)
3ceb844  Replacing references GRU4Rec in config files (miguelgfierro, Sep 27, 2023)
c150872  Update references (miguelgfierro, Sep 28, 2023)
6aad8fd  Delete conda.md (miguelgfierro, Sep 29, 2023)
ccc9a67  refactor map_at_k and map to be the same as Spark's (loomlike, Sep 28, 2023)
52d8308  list of test failing to fix (loomlike, Oct 3, 2023)
64521a5  Update readme LF feedback @wutaomsft (miguelgfierro, Oct 4, 2023)
36e22c6  Update NEWS.md (miguelgfierro, Oct 5, 2023)
6106740  Update README.md (miguelgfierro, Oct 5, 2023)
094c68a  Fix test errors, Refactor column check utils to be simpler (loomlike, Oct 6, 2023)
b76efe4  Rename ranking tests to be _at_k suffixed (loomlike, Oct 6, 2023)
15b6a01  Change test names in the test group (loomlike, Oct 11, 2023)
ba6f466  add comment to mocked fn in a test (loomlike, Oct 13, 2023)
33420ec  :memo: (miguelgfierro, Oct 6, 2023)
cdfe8a3  remove unused input (miguelgfierro, Oct 6, 2023)
388ec8d  :memo: (miguelgfierro, Oct 6, 2023)
a693b95  no need to output the logs twice (miguelgfierro, Oct 6, 2023)
4c0c72b  packages (miguelgfierro, Oct 6, 2023)
437ba13  skipping flaky test (miguelgfierro, Oct 6, 2023)
99f6863  Issue with TF (miguelgfierro, Oct 12, 2023)
bdeb52e  Comment out the PR gate affected tests with the upgrade to TF>2.10.1 (miguelgfierro, Oct 13, 2023)
fa9eb69  Comment out the nightly builds affected tests with the upgrade to TF>… (miguelgfierro, Oct 13, 2023)
d954fbc  :bug: (miguelgfierro, Oct 13, 2023)
e14aafa  Comment out the nightly builds affected tests with the upgrade to TF>… (miguelgfierro, Oct 13, 2023)
9e11963  revert the breaking tests with TF 2.10.1 (miguelgfierro, Oct 13, 2023)
9428cf3  temporary pin to TF=2.8.4 (miguelgfierro, Oct 13, 2023)
dba7edf  Update security tests (miguelgfierro, Oct 13, 2023)
d22f1da  Update expected values to not use fixture (loomlike, Oct 30, 2023)
86f3744  list of test failing to fix (loomlike, Oct 3, 2023)
f818efd  Merge branch 'staging' into jumin/evaluation (loomlike, Oct 31, 2023)
f2b342e  Fix missing fixture error (loomlike, Nov 1, 2023)
13 changes: 4 additions & 9 deletions recommenders/datasets/pandas_df_utils.py
@@ -360,19 +360,14 @@ def has_columns(df, columns):

     Args:
         df (pandas.DataFrame): DataFrame
-        columns (list(str): columns to check for
+        columns (iterable(str)): columns to check for

     Returns:
         bool: True if DataFrame has specified columns.
     """
-
-    result = True
-    for column in columns:
-        if column not in df.columns:
-            logger.error("Missing column: {} in DataFrame".format(column))
-            result = False
-
-    return result
+    if not isinstance(columns, set):
+        columns = set(columns)
+    return columns.issubset(df.columns)


 def has_same_base_dtype(df_1, df_2, columns=None):
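For reference, a minimal sketch of how the refactored check behaves. The helper body is copied from the new version above; the example DataFrame and column names are made up:

```python
import pandas as pd


def has_columns(df, columns):
    """Set-based column check, as in the refactored helper above."""
    if not isinstance(columns, set):
        columns = set(columns)
    return columns.issubset(df.columns)


df = pd.DataFrame({"userID": [1], "itemID": [10], "prediction": [0.5]})
print(has_columns(df, ["userID", "itemID"]))  # True
print(has_columns(df, {"userID", "rating"}))  # False
```

The behavioral difference is that missing columns are no longer logged one by one with logger.error; callers now only get the boolean, plus the new exceptions introduced in the evaluation module below.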
179 changes: 114 additions & 65 deletions recommenders/evaluation/python_evaluation.py
@@ -32,6 +32,14 @@
 )


+class ColumnMismatchError(Exception):
+    pass
+
+
+class ColumnTypeMismatchError(Exception):
+    pass
+
+
 def _check_column_dtypes(func):
     """Checks columns of DataFrame inputs

@@ -53,7 +61,6 @@ def check_column_dtypes_wrapper(
         rating_pred,
         col_user=DEFAULT_USER_COL,
         col_item=DEFAULT_ITEM_COL,
-        col_rating=DEFAULT_RATING_COL,
         col_prediction=DEFAULT_PREDICTION_COL,
         *args,
         **kwargs
@@ -68,22 +75,24 @@ def check_column_dtypes_wrapper(
             col_rating (str): column name for rating
             col_prediction (str): column name for prediction
         """
-
-        if not has_columns(rating_true, [col_user, col_item, col_rating]):
-            raise ValueError("Missing columns in true rating DataFrame")
-        if not has_columns(rating_pred, [col_user, col_item, col_prediction]):
-            raise ValueError("Missing columns in predicted rating DataFrame")
-        if not has_same_base_dtype(
-            rating_true, rating_pred, columns=[col_user, col_item]
-        ):
-            raise ValueError("Columns in provided DataFrames are not the same datatype")
+        # Some ranking metrics don't have the rating column, so we don't need to check.
+        expected_true_columns = {col_user, col_item}
+        if "col_rating" in kwargs:
+            expected_true_columns.add(kwargs["col_rating"])
+        if not has_columns(rating_true, expected_true_columns):
+            raise ColumnMismatchError("Missing columns in true rating DataFrame")
+
+        if not has_columns(rating_pred, {col_user, col_item, col_prediction}):
+            raise ColumnMismatchError("Missing columns in predicted rating DataFrame")
+
+        if not has_same_base_dtype(rating_true, rating_pred, columns=[col_user, col_item]):
+            raise ColumnTypeMismatchError("Columns in provided DataFrames are not the same datatype")

         return func(
             rating_true=rating_true,
             rating_pred=rating_pred,
             col_user=col_user,
             col_item=col_item,
-            col_rating=col_rating,
             col_prediction=col_prediction,
             *args,
             **kwargs
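To illustrate the new contract, here is a hedged sketch (the data and values are invented, and it assumes the refactored module is importable as shown in this PR): ranking metrics no longer need a rating column in the truth DataFrame, and missing columns now surface as ColumnMismatchError rather than ValueError.

```python
import pandas as pd

from recommenders.evaluation.python_evaluation import ColumnMismatchError, precision_at_k

# Truth data with no rating column: acceptable for ranking metrics under the new checks
truth = pd.DataFrame({"userID": [1, 1, 2], "itemID": [1, 2, 1]})
preds = pd.DataFrame(
    {"userID": [1, 1, 2], "itemID": [2, 3, 1], "prediction": [0.9, 0.7, 0.8]}
)

print(precision_at_k(truth, preds, k=2))

try:
    # Drop a required column to trigger the new exception
    precision_at_k(truth.drop(columns=["itemID"]), preds, k=2)
except ColumnMismatchError as err:
    print(err)  # Missing columns in true rating DataFrame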
@@ -350,11 +359,11 @@ def merge_ranking_true_pred(
     rating_pred,
     col_user,
     col_item,
-    col_rating,
     col_prediction,
     relevancy_method,
     k=DEFAULT_K,
     threshold=DEFAULT_THRESHOLD,
+    **_,
 ):
     """Filter truth and prediction data frames on common users

@@ -363,7 +372,6 @@
         rating_pred (pandas.DataFrame): Predicted DataFrame
         col_user (str): column name for user
         col_item (str): column name for item
-        col_rating (str): column name for rating
         col_prediction (str): column name for prediction
         relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
             top k items are directly provided, so there is no need to compute the relevancy operation.
@@ -424,7 +432,7 @@ def precision_at_k(
     relevancy_method="top_k",
     k=DEFAULT_K,
     threshold=DEFAULT_THRESHOLD,
-    **kwargs
+    **_,
 ):
     """Precision at K.

@@ -440,7 +448,6 @@
         rating_pred (pandas.DataFrame): Predicted DataFrame
         col_user (str): column name for user
         col_item (str): column name for item
-        col_rating (str): column name for rating
         col_prediction (str): column name for prediction
         relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
             top k items are directly provided, so there is no need to compute the relevancy operation.
@@ -450,13 +457,11 @@
     Returns:
         float: precision at k (min=0, max=1)
     """
-    col_rating = _get_rating_column(relevancy_method, **kwargs)
     df_hit, df_hit_count, n_users = merge_ranking_true_pred(
         rating_true=rating_true,
         rating_pred=rating_pred,
         col_user=col_user,
         col_item=col_item,
-        col_rating=col_rating,
         col_prediction=col_prediction,
         relevancy_method=relevancy_method,
         k=k,
@@ -478,7 +483,7 @@ def recall_at_k(
     relevancy_method="top_k",
     k=DEFAULT_K,
     threshold=DEFAULT_THRESHOLD,
-    **kwargs
+    **_,
 ):
     """Recall at K.

@@ -487,7 +492,6 @@
         rating_pred (pandas.DataFrame): Predicted DataFrame
         col_user (str): column name for user
         col_item (str): column name for item
-        col_rating (str): column name for rating
         col_prediction (str): column name for prediction
         relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
             top k items are directly provided, so there is no need to compute the relevancy operation.
@@ -498,13 +502,11 @@
         float: recall at k (min=0, max=1). The maximum value is 1 even when fewer than
         k items exist for a user in rating_true.
     """
-    col_rating = _get_rating_column(relevancy_method, **kwargs)
     df_hit, df_hit_count, n_users = merge_ranking_true_pred(
         rating_true=rating_true,
         rating_pred=rating_pred,
         col_user=col_user,
         col_item=col_item,
-        col_rating=col_rating,
         col_prediction=col_prediction,
         relevancy_method=relevancy_method,
         k=k,
@@ -522,13 +524,14 @@ def ndcg_at_k(
     rating_pred,
     col_user=DEFAULT_USER_COL,
     col_item=DEFAULT_ITEM_COL,
+    col_rating=DEFAULT_RATING_COL,
     col_prediction=DEFAULT_PREDICTION_COL,
     relevancy_method="top_k",
     k=DEFAULT_K,
     threshold=DEFAULT_THRESHOLD,
     score_type="binary",
     discfun_type="loge",
-    **kwargs
+    **_,
 ):
     """Normalized Discounted Cumulative Gain (nDCG).

@@ -553,13 +556,11 @@
     Returns:
         float: nDCG at k (min=0, max=1).
     """
-    col_rating = _get_rating_column(relevancy_method, **kwargs)
     df_hit, _, _ = merge_ranking_true_pred(
         rating_true=rating_true,
         rating_pred=rating_pred,
         col_user=col_user,
         col_item=col_item,
-        col_rating=col_rating,
         col_prediction=col_prediction,
         relevancy_method=relevancy_method,
         k=k,
@@ -616,7 +617,8 @@ def ndcg_at_k(
     return df_user["ndcg"].mean()


-def map_at_k(
+@lru_cache_df(maxsize=1)
+def _get_reciprocal_rank(
     rating_true,
     rating_pred,
     col_user=DEFAULT_USER_COL,
@@ -625,9 +627,43 @@ def map_at_k(
     relevancy_method="top_k",
     k=DEFAULT_K,
     threshold=DEFAULT_THRESHOLD,
-    **kwargs
 ):
-    """Mean Average Precision at k
+    df_hit, df_hit_count, n_users = merge_ranking_true_pred(
+        rating_true=rating_true,
+        rating_pred=rating_pred,
+        col_user=col_user,
+        col_item=col_item,
+        col_prediction=col_prediction,
+        relevancy_method=relevancy_method,
+        k=k,
+        threshold=threshold,
+    )
+
+    if df_hit.shape[0] == 0:
+        return None, n_users
+
+    # calculate reciprocal rank of items for each user and sum them up
+    df_hit_sorted = df_hit.copy()
+    df_hit_sorted["rr"] = (
+        df_hit_sorted.groupby(col_user).cumcount() + 1
+    ) / df_hit_sorted["rank"]
+    df_hit_sorted = df_hit_sorted.groupby(col_user).agg({"rr": "sum"}).reset_index()
+
+    return pd.merge(df_hit_sorted, df_hit_count, on=col_user), n_users
+
+
+def map(
+    rating_true,
+    rating_pred,
+    col_user=DEFAULT_USER_COL,
+    col_item=DEFAULT_ITEM_COL,
+    col_prediction=DEFAULT_PREDICTION_COL,
+    relevancy_method="top_k",
+    k=DEFAULT_K,
+    threshold=DEFAULT_THRESHOLD,
+    **_,
+):
+    """Mean Average Precision for top k prediction items

     The implementation of MAP is referenced from Spark MLlib evaluation metrics.
     https://spark.apache.org/docs/2.3.0/mllib-evaluation-metrics.html#ranking-systems
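An aside on the `_get_reciprocal_rank` body in the hunk above: despite the column name, `rr` accumulates the precision at each hit position (hits so far divided by rank), so dividing a user's `rr` sum by a normalizer yields that user's average precision. A hand-sized reproduction with made-up data:

```python
import pandas as pd

# One user; the model's top-5 list hits relevant items at ranks 1 and 3,
# and the ground truth contains 3 relevant items in total.
df_hit = pd.DataFrame({"userID": [1, 1], "itemID": ["A", "B"], "rank": [1, 3]})

rr = (df_hit.groupby("userID").cumcount() + 1) / df_hit["rank"]
print(rr.tolist())   # [1.0, 0.666...]: precision at ranks 1 and 3
print(rr.sum() / 3)  # 0.555...: this user's average precision (3 relevant items)
```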
@@ -636,52 +672,85 @@
     http://web.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf

     Note:
-        1. The evaluation function is named as 'MAP is at k' because the evaluation class takes top k items for
-            the prediction items. The naming is different from Spark.
-
-        2. The MAP is meant to calculate Avg. Precision for the relevant items, so it is normalized by the number of
+        The MAP is meant to calculate Avg. Precision for the relevant items, so it is normalized by the number of
             relevant items in the ground truth data, instead of k.

     Args:
         rating_true (pandas.DataFrame): True DataFrame
         rating_pred (pandas.DataFrame): Predicted DataFrame
         col_user (str): column name for user
         col_item (str): column name for item
-        col_rating (str): column name for rating
         col_prediction (str): column name for prediction
         relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
             top k items are directly provided, so there is no need to compute the relevancy operation.
         k (int): number of top k items per user
         threshold (float): threshold of top items per user (optional)

     Returns:
-        float: MAP at k (min=0, max=1).
+        float: MAP (min=0, max=1)
     """
-    col_rating = _get_rating_column(relevancy_method, **kwargs)
-    df_hit, df_hit_count, n_users = merge_ranking_true_pred(
+    df_merge, n_users = _get_reciprocal_rank(
         rating_true=rating_true,
         rating_pred=rating_pred,
         col_user=col_user,
         col_item=col_item,
-        col_rating=col_rating,
         col_prediction=col_prediction,
         relevancy_method=relevancy_method,
         k=k,
         threshold=threshold,
     )

-    if df_hit.shape[0] == 0:
+    if df_merge is None:
         return 0.0
+    else:
+        return (df_merge["rr"] / df_merge["actual"]).sum() / n_users

-    # calculate reciprocal rank of items for each user and sum them up
-    df_hit_sorted = df_hit.copy()
-    df_hit_sorted["rr"] = (
-        df_hit_sorted.groupby(col_user).cumcount() + 1
-    ) / df_hit_sorted["rank"]
-    df_hit_sorted = df_hit_sorted.groupby(col_user).agg({"rr": "sum"}).reset_index()
-
-    df_merge = pd.merge(df_hit_sorted, df_hit_count, on=col_user)
-    return (df_merge["rr"] / df_merge["actual"]).sum() / n_users
+
+def map_at_k(
+    rating_true,
+    rating_pred,
+    col_user=DEFAULT_USER_COL,
+    col_item=DEFAULT_ITEM_COL,
+    col_prediction=DEFAULT_PREDICTION_COL,
+    relevancy_method="top_k",
+    k=DEFAULT_K,
+    threshold=DEFAULT_THRESHOLD,
+    **_,
+):
+    """Mean Average Precision at k
+
+    The implementation of MAP@k is referenced from Spark MLlib evaluation metrics.
+    https://github.com/apache/spark/blob/b938ff9f520fd4e4997938284ffa0aba9ea271fc/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala#L99
+
+    Args:
+        rating_true (pandas.DataFrame): True DataFrame
+        rating_pred (pandas.DataFrame): Predicted DataFrame
+        col_user (str): column name for user
+        col_item (str): column name for item
+        col_prediction (str): column name for prediction
+        relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
+            top k items are directly provided, so there is no need to compute the relevancy operation.
+        k (int): number of top k items per user
+        threshold (float): threshold of top items per user (optional)
+
+    Returns:
+        float: MAP@k (min=0, max=1)
+    """
+    df_merge, n_users = _get_reciprocal_rank(
+        rating_true=rating_true,
+        rating_pred=rating_pred,
+        col_user=col_user,
+        col_item=col_item,
+        col_prediction=col_prediction,
+        relevancy_method=relevancy_method,
+        k=k,
+        threshold=threshold,
+    )
+
+    if df_merge is None:
+        return 0.0
+    else:
+        return (df_merge["rr"] / df_merge["actual"].apply(lambda x: min(x, k))).sum() / n_users


 def get_top_k_items(
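The only difference between the two metrics above is the normalizer: `map` divides each user's `rr` sum by the full number of relevant items (the `actual` column), while `map_at_k` divides by `min(actual, k)`, following the Spark MAP@k implementation referenced in its docstring. A made-up single-user case where the two disagree, assuming the refactored module is importable:

```python
import pandas as pd

import recommenders.evaluation.python_evaluation as ev

# One user with 3 relevant items; the model is scored on its top k=2 and gets both slots right.
truth = pd.DataFrame({"userID": [1, 1, 1], "itemID": ["A", "B", "C"]})
preds = pd.DataFrame({"userID": [1, 1], "itemID": ["A", "B"], "prediction": [0.9, 0.8]})

print(ev.map(truth, preds, k=2))       # 2/3 ~= 0.667: rr sum of 2 normalized by all 3 relevant items
print(ev.map_at_k(truth, preds, k=2))  # 1.0: the same rr sum normalized by min(3, 2)
```

Calling the metric through the module, as above, also sidesteps shadowing Python's built-in `map` when importing it.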
@@ -736,26 +805,6 @@ def get_top_k_items(
     }


-def _get_rating_column(relevancy_method: str, **kwargs) -> str:
-    r"""Helper utility to simplify the arguments of eval metrics
-    Attemtps to address https://github.com/microsoft/recommenders/issues/1737.
-
-    Args:
-        relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
-            top k items are directly provided, so there is no need to compute the relevancy operation.
-
-    Returns:
-        str: rating column name.
-    """
-    if relevancy_method != "top_k":
-        if "col_rating" not in kwargs:
-            raise ValueError("Expected an argument `col_rating` but wasn't found.")
-        col_rating = kwargs.get("col_rating")
-    else:
-        col_rating = kwargs.get("col_rating", DEFAULT_RATING_COL)
-    return col_rating
-
-
 # diversity metrics
 def _check_column_dtypes_diversity_serendipity(func):
     """Checks columns of DataFrame inputs