-
Notifications
You must be signed in to change notification settings - Fork 0
/
RecS_baseline.py
executable file
·251 lines (186 loc) · 9.36 KB
/
RecS_baseline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import pandas as pd
import numpy as np
import os
import random
import copy
import json
import glob
class RecS_baseline_class():
    """Item-item collaborative-filtering baseline recommender.

    Item similarities are adjusted-cosine similarities computed on
    mean-centered (per-user) ratings; predictions are similarity-weighted
    averages of the user's original ratings over the top-10 neighbors.

    Attributes:
        current_df (dataframe): ratings with columns 'userId', 'itemId', 'rating'
        sim_matrix (dataframe): long-format similarity table with columns
            ['item1', 'item2', 'sim'], one row per ordered co-purchased pair
    """
    def __init__(self, current_df, current_items_array, save_prefix="Test", load_precomputed_matrix=None):
        """
        Args:
            current_df (dataframe): ratings in dataframe format
            current_items_array (array): array with all items itemId's
            save_prefix (str): (optional) prefix for saving files during the training
            load_precomputed_matrix (str): (optional) path to precomputed similarity matrix
        """
        self.__save_prefix__ = save_prefix
        self.current_df = current_df
        self.__items_array__ = current_items_array
        # either train from scratch or load a previously saved similarity matrix
        if load_precomputed_matrix is None:
            self.__train__()
        else:
            self.sim_matrix = pd.read_csv(load_precomputed_matrix, index_col=None, header=0)

    def __train__(self):
        """Compute the whole item-item similarity matrix and persist it to disk."""
        self.__prepare_data__()
        matrix_columns = ['item1', 'item2', 'sim']
        # BUGFIX: the original re-created self.sim_matrix inside
        # __compute_similarities__ and then appended the returned frame to
        # itself, so every iteration discarded the previous items'
        # similarities and duplicated the last item's rows.  Collect the
        # per-item frames and concatenate once.  (DataFrame.append was also
        # removed in pandas 2.0.)
        partial_frames = [self.__compute_similarities__(j) for j in self.__items_array__]
        if partial_frames:
            self.sim_matrix = pd.concat(partial_frames, ignore_index=True)
        else:
            self.sim_matrix = pd.DataFrame(columns=matrix_columns)
        # clean the space: adjusted ratings are only needed during training
        self.adjusted_ratings = None
        # save results; make sure the output directory exists first, and
        # save without the index so the load path round-trips cleanly
        os.makedirs("./out", exist_ok=True)
        self.sim_matrix.to_csv("./out/" + self.__save_prefix__ + "_similarities.csv", index=False)

    def predictRating(self, userId, itemId):
        """
        Method for returning prediction of user's rating for item
        Args:
            userId (str): unique user id
            itemId (str): unique item id
        Returns:
            float: predicted rating; 0.0 when no prediction is possible
                (item already purchased, no positively similar neighbors,
                or no usable neighbor ratings)
        """
        items_of_user = self.current_df.loc[(self.current_df["userId"] == userId)]
        # check whether user already bought this item. if so, don't compute
        if itemId in items_of_user["itemId"].values:
            print("User ", str(userId), " already purchased item ", str(itemId), ". Please choose another item.")
            return 0.0
        # candidate neighbors: items the user already rated that are
        # positively similar to the target item
        L = self.sim_matrix.loc[(self.sim_matrix["item1"] == itemId)]
        L = L[L['item2'].isin(items_of_user["itemId"].values)]
        L = L.loc[pd.to_numeric(L["sim"]) > 0.0]
        if L.shape[0] == 0:
            # we can't predict anything
            return 0.0
        # sort the similarities first, then keep the 10 most similar ones
        L_new = L.sort_values(by=["sim"], ascending=False).head(10)
        # compute prediction. don't use mean free ratings, but the original ones
        pred_nom = 0.0
        pred_denom = 0.0
        for l in L_new["item2"]:
            rat_l = self.current_df.loc[
                (self.current_df["itemId"] == l) & (self.current_df["userId"] == userId)
            ]["rating"].values[0]
            if rat_l > 0.0:
                sim_l = float(L_new.loc[L_new["item2"] == l]["sim"].values[0])
                pred_nom += sim_l * rat_l
                pred_denom += abs(sim_l)
        # BUGFIX: the original divided unconditionally, raising
        # ZeroDivisionError when every neighbor rating was <= 0
        if pred_denom == 0.0:
            return 0.0
        return pred_nom / pred_denom

    def __prepare_data__(self):
        """Generate the mean-free adjusted ratings dataframe.

        Adds 'mean_rating' (per-user mean) and 'rating_adjusted'
        (= rating - mean_rating) columns to a copy of current_df,
        stored in self.adjusted_ratings.
        """
        # COMPAT: restrict the groupby-mean to the 'rating' column; calling
        # .mean() on the whole frame fails in pandas >= 2.0 when non-numeric
        # columns (e.g. string itemId) are present
        mean_user_rating = (
            self.current_df.groupby("userId", as_index=False, sort=False)["rating"]
            .mean()
            .rename(columns={'rating': 'mean_rating'})
        )
        self.adjusted_ratings = pd.merge(self.current_df, mean_user_rating, on='userId', how='left', sort=False)
        self.adjusted_ratings['rating_adjusted'] = (
            self.adjusted_ratings['rating'] - self.adjusted_ratings['mean_rating']
        )

    def __compute_similarities__(self, item):
        """Compute adjusted-cosine similarity between `item` and every item
        co-purchased with it in the training set.

        Args:
            item (str): unique item id
        Returns:
            dataframe: rows ['item1', 'item2', 'sim'] for this item only
                (no longer clobbers self.sim_matrix — see __train__)
        """
        matrix_columns = ['item1', 'item2', 'sim']
        users_who_rated_item = self.adjusted_ratings.loc[self.adjusted_ratings["itemId"] == item]
        distinct_users = np.unique(users_who_rated_item['userId'])
        # each item-item pair that was purchased together is stored as a
        # plain record; building a list of dicts is far cheaper than
        # appending Series to a DataFrame one by one
        records = []
        # for all users who bought the item, find all the other items bought together
        for user in distinct_users:
            items_of_user = self.adjusted_ratings.loc[
                (self.adjusted_ratings["userId"] == user) & (self.adjusted_ratings["itemId"] != item)]
            # how our item was rated by this user:
            rating1 = users_who_rated_item.loc[users_who_rated_item["userId"] == user][
                "rating_adjusted"].values[0]
            # look at other items that this user bought and how they were rated
            for other_item, rating2 in zip(items_of_user["itemId"], items_of_user["rating_adjusted"]):
                records.append({'userId': user, 'item1': item, 'item2': other_item,
                                'rating1': rating1, 'rating2': rating2})
        record_1_2 = pd.DataFrame(records, columns=['userId', 'item1', 'item2', 'rating1', 'rating2'])
        # hoist the target item's norm out of the loop (loop-invariant)
        item_norm = float(np.sqrt(np.square(
            self.adjusted_ratings.loc[self.adjusted_ratings["itemId"] == item]["rating_adjusted"].values).sum()))
        sim_rows = []
        distinct_others = np.unique(record_1_2['item2']) if not record_1_2.empty else []
        for other in distinct_others:
            # get info of the other item
            paired_1_2 = record_1_2.loc[record_1_2['item2'] == other]
            # numerator: sum of products of mean-centered co-ratings
            sim_value_numerator = float((paired_1_2['rating1'] * paired_1_2['rating2']).sum())
            # for the denominator we use ALL ratings of each item (not only
            # the co-ratings) to avoid spurious 1.0 similarities
            other_norm = float(np.sqrt(np.square(
                self.adjusted_ratings.loc[self.adjusted_ratings["itemId"] == other][
                    "rating_adjusted"].values).sum()))
            sim_value_denominator = item_norm * other_norm
            if sim_value_denominator == 0:
                sim_value_denominator = 1e-8
            # adjusted weird cosine similarity, clipped so numeric noise
            # (1.0000000002 etc.) stays inside [-1, 1]
            sim_value = min(1.0, max(-1.0, sim_value_numerator / sim_value_denominator))
            sim_rows.append({'item1': item, 'item2': other, 'sim': sim_value})
        return pd.DataFrame(sim_rows, columns=matrix_columns)

    def __get_neighbors__(self, userId):
        """Calculate recommendation candidates for a user.

        Candidates are the (up to 10) most positively similar items to each
        item the user already purchased, minus the purchased items.

        Args:
            userId (str): unique user id
        Returns:
            list: itemIds from the user's neighborhood area
        """
        n_arr = []
        items_of_user = self.current_df.loc[(self.current_df["userId"] == userId)]
        # set for O(1) membership tests instead of scanning the array
        purchased = set(items_of_user["itemId"].values)
        for itemId in items_of_user["itemId"].values:
            L = self.sim_matrix.loc[(self.sim_matrix["item1"] == itemId)]
            L = L.loc[pd.to_numeric(L["sim"]) > 0.0]
            L_new = L.sort_values(by=["sim"], ascending=False).head(10)
            for l in L_new["item2"]:
                if (l not in n_arr) and (l not in purchased):
                    n_arr.append(l)
        return n_arr

    def predictTopKRecommendations(self, userId, k):
        """
        Method for returning list of best k recommendations for userId
        Args:
            userId (str): unique user id
            k (int): number of items to be recommended
        Returns:
            array: array of triples (itemId, predicted_rating, 0), sorted by
                predicted rating in descending order
        """
        # because it is very time consuming we predict ratings only for
        # "neighboring" items for each user, not for full-catalog minus
        # train set (more details in the report)
        all_predictions = []
        for item in self.__get_neighbors__(userId):
            predicted_rating = float(self.predictRating(userId, item))
            if predicted_rating > 0.0:
                all_predictions.append([item, predicted_rating, 0])
        # sort by predicted rating only (the original triple key
        # (x[1], x[1], x[1]) was redundant)
        all_predictions.sort(key=lambda x: x[1], reverse=True)
        return all_predictions[:k]