diff --git a/pom.xml b/pom.xml index 8169ff7..dc52941 100644 --- a/pom.xml +++ b/pom.xml @@ -10,9 +10,11 @@ big-data http://maven.apache.org - - UTF-8 - + + UTF-8 + 7 + 7 + diff --git a/src/main/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java b/src/main/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java new file mode 100644 index 0000000..ee20d4b --- /dev/null +++ b/src/main/java/nearsoft/academy/bigdata/recommendation/MovieRecommender.java @@ -0,0 +1,154 @@ +package nearsoft.academy.bigdata.recommendation; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.model.file.FileDataModel; +import org.apache.mahout.cf.taste.impl.neighborhood.ThresholdUserNeighborhood; +import org.apache.mahout.cf.taste.impl.recommender.GenericUserBasedRecommender; +import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood; +import org.apache.mahout.cf.taste.recommender.RecommendedItem; +import org.apache.mahout.cf.taste.recommender.UserBasedRecommender; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; + +import java.io.*; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.zip.GZIPInputStream; + +public class MovieRecommender { + HashMap users = new HashMap(); + HashMap items = new HashMap(); + Integer totalReviews = 0; + DataModel model = null; + UserBasedRecommender recommender = null; + String[] itemCodes = new String[1000000]; + + public MovieRecommender(String s) throws Exception { + String ROOT_PATH = System.getProperty("user.dir"); + + File rawData = new File(ROOT_PATH + "/src/main/resources/" + s); + File csvData = new File(ROOT_PATH + "/src/main/resources/data.csv"); + if (csvData.exists() && !csvData.isDirectory()) { + BufferedReader csvDataReader = new BufferedReader(new FileReader(ROOT_PATH + "/src/main/resources/data.csv")); + BufferedReader csvUsersReader = new BufferedReader(new FileReader(ROOT_PATH + "/src/main/resources/users.csv")); + BufferedReader csvItemsReader = new BufferedReader(new FileReader(ROOT_PATH + "/src/main/resources/items.csv")); + String row = null; + // Count the number of reviews + while ((row = csvDataReader.readLine()) != null) { + totalReviews++; + } + csvDataReader.close(); + // Create the user hash map + while ((row = csvUsersReader.readLine()) != null) { + String[] data = row.split(","); + users.put(data[0],Integer.valueOf(data[1])); + } + csvUsersReader.close(); + // Create the item hash map + while ((row = csvItemsReader.readLine()) != null) { + String[] data = row.split(","); + items.put(data[0], Integer.valueOf(data[1])); + itemCodes[Integer.valueOf(data[1])] = data[0]; + } + csvItemsReader.close(); + } + else if(rawData.exists() && !rawData.isDirectory()) { + // Read the file + InputStream fileStream = new FileInputStream(ROOT_PATH + "/src/main/resources/" + s); + InputStream gzipStream = new GZIPInputStream(fileStream); + Reader decoder = new InputStreamReader(gzipStream, "UTF-8"); + BufferedReader buffered = new BufferedReader(decoder); + + // Prepare the writer for the CSVs, we need 3 + // data: file used by the mahout recommender + // users: file that maps each userId by their string ID + // products: file that maps each productId by their string ID + FileWriter csvDataWriter = new FileWriter(ROOT_PATH + "/src/main/resources/data.csv"); + FileWriter csvUsersWriter = new FileWriter(ROOT_PATH + "/src/main/resources/users.csv"); + FileWriter csvItemsWriter = new FileWriter(ROOT_PATH + "/src/main/resources/items.csv"); + + // Go line by line and construct each line for the recommender + String line = null; + int userCounter = 1; + int itemCounter = 1; + int itemId = 0; + int userId = 0; + while ((line = buffered.readLine()) != null) { + if (line.contains("product/productId")){ + line = line.replace("product/productId: ", ""); + if (items.get(line) == null){ + items.put(line, itemCounter); + itemCounter++; + } + itemId = items.get(line); + itemCodes[itemId] = line; + } + else if (line.contains("review/userId")) { + line = line.replace("review/userId: ", ""); + if (users.get(line) == null){ + users.put(line, userCounter); + userCounter++; + } + userId = users.get(line); + } + else if (line.contains("review/score")){ + line = line.replace("review/score: ", ""); + csvDataWriter.append(String.valueOf(userId) + "," + String.valueOf(itemId) + "," + line + "\n"); + totalReviews++; + } + } + // Close the data file + csvDataWriter.flush(); + csvDataWriter.close(); + // Create the user hash csv file + for (String key: users.keySet()) { + csvUsersWriter.append(key + "," + users.get(key) + "\n"); + } + csvUsersWriter.flush(); + csvUsersWriter.close(); + // Create the item hash csv file + for (String key: items.keySet()) { + csvItemsWriter.append(key + "," + items.get(key) + "\n"); + } + csvItemsWriter.flush(); + csvItemsWriter.close(); + } + else { + throw new Exception("No such file exists"); + } + + // With the file either being read or created, create the recommendation model + model = new FileDataModel(new File(ROOT_PATH + "/src/main/resources/data.csv")); + UserSimilarity similarity = new PearsonCorrelationSimilarity(model); + UserNeighborhood neighborhood = new ThresholdUserNeighborhood(0.1, similarity, model); + recommender = new GenericUserBasedRecommender(model, neighborhood, similarity); + } + + public int getTotalReviews() + { + return totalReviews; + } + + public int getTotalProducts() { + return items.size(); + } + + public int getTotalUsers() { + return users.size(); + } + + public List getRecommendationsForUser(String userCode) throws TasteException { + int userId = users.get(userCode); + String[] recommendationIds = new String[3]; + int i = 0; + List recommendations = recommender.recommend(userId, 3); + for (RecommendedItem recommendation : recommendations) { + recommendationIds[i] = itemCodes[(int) recommendation.getItemID()]; + i++; + } + List top3 = Arrays.asList(recommendationIds); + return top3; + } +} diff --git a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java index 0d0b1fe..5bae3b0 100644 --- a/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java +++ b/src/test/java/nearsoft/academy/bigdata/recommendation/MovieRecommenderTest.java @@ -12,14 +12,16 @@ public class MovieRecommenderTest { @Test - public void testDataInfo() throws IOException, TasteException { + public void testDataInfo() throws Exception { //download movies.txt.gz from // http://snap.stanford.edu/data/web-Movies.html - MovieRecommender recommender = new MovieRecommender("/path/to/movies.txt.gz"); + + MovieRecommender recommender = new MovieRecommender("movies.txt.gz"); assertEquals(7911684, recommender.getTotalReviews()); assertEquals(253059, recommender.getTotalProducts()); assertEquals(889176, recommender.getTotalUsers()); + List recommendations = recommender.getRecommendationsForUser("A141HP4LYPWMSR"); assertThat(recommendations, hasItem("B0002O7Y8U")); assertThat(recommendations, hasItem("B00004CQTF"));