From 771fc343e3c84ebae982dd17438050ce6ec03849 Mon Sep 17 00:00:00 2001 From: Takuya Kitazawa Date: Sun, 13 Feb 2022 05:56:17 -0800 Subject: [PATCH 1/4] Ensure `exdir` exists before unzipping --- src/datasets.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/datasets.jl b/src/datasets.jl index abfdff0..5760003 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -48,6 +48,7 @@ function unzip(path::String, exdir::Union{String, Nothing}=nothing) if exdir == nothing exdir = dirname(path) end + get_data_home(exdir) zip_reader = ZipFile.Reader(path) for file in zip_reader.files out_path = joinpath(exdir, file.name) From fcfcd8c6f7160da1b33ef924d796da6de9462fa7 Mon Sep 17 00:00:00 2001 From: Takuya Kitazawa Date: Sun, 13 Feb 2022 05:56:57 -0800 Subject: [PATCH 2/4] Cleanse `load_movielens_latest` docstring/variables --- src/datasets.jl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/datasets.jl b/src/datasets.jl index 5760003..5490e64 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -126,7 +126,7 @@ end """ - load_movielens_latest([path=nothing]) + load_movielens_latest([path=nothing]) -> DataAccessor `path` points to a locally saved [MovieLens Latest (Small)](https://files.grouplens.org/datasets/movielens/ml-latest-small-README.html). Read user-item-rating triples in the folder, and convert them into a `DataAccessor` instance. @@ -134,10 +134,6 @@ Read user-item-rating triples in the folder, and convert them into a `DataAccess Download and decompress a corresponding zip file, if `path` is not given or the specified folder does not exist. 
""" function load_movielens_latest(path::Union{String, Nothing}=nothing) - n_user = 610 - n_item = 9742 - R = matrix(n_user, n_item) - if path == nothing || !isdir(path) zip_path = path if zip_path != nothing From 76947dd93361e513560c18b1059dc6de9c22e9ca Mon Sep 17 00:00:00 2001 From: Takuya Kitazawa Date: Sun, 13 Feb 2022 05:57:44 -0800 Subject: [PATCH 3/4] Add `load_lastfm` that loads user-artist listening frequency data --- src/datasets.jl | 48 ++++++++++++++++++++++++++++++++++++++++++- test/test_datasets.jl | 9 ++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/src/datasets.jl b/src/datasets.jl index 5490e64..c934487 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -1,4 +1,4 @@ -export get_data_home, download_file, unzip, load_movielens_100k, load_movielens_latest, load_amazon_review +export get_data_home, download_file, unzip, load_movielens_100k, load_movielens_latest, load_amazon_review, load_lastfm """ get_data_home([data_home=nothing]) -> String @@ -279,3 +279,49 @@ function load_amazon_review(path::Union{String, Nothing}=nothing; category::Stri end DataAccessor(events, n_user, n_item) end + +""" + load_lastfm([path=nothing]) -> DataAccessor + +`path` points to a locally saved [HetRec 2011 Last.FM dataset](https://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-readme.txt) +Each row has a tuple of (user, artist, # of listenings). 
+""" +function load_lastfm(path::Union{String, Nothing}=nothing) + if path == nothing || !isdir(path) + zip_path = path + if zip_path != nothing + zip_path = joinpath(dirname(zip_path), "hetrec2011-lastfm-2k.zip") + end + zip_path = download_file("https://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-2k.zip", zip_path) + path = unzip(zip_path, path) + end + + events = Array{Event, 1}() + n_user, n_item = 0, 0 + user_ids, item_ids = Dict{Integer, Integer}(), Dict{Integer, Integer}() + open(joinpath(path, "user_artists.dat"), "r") do io + for (index, line) in enumerate(eachline(io)) + if index == 1 + continue + end + l = split(line, "\t") + user, item, cnt = parse(Int, l[1]), parse(Int, l[2]), parse(Int, l[3]) + if haskey(user_ids, user) + u = user_ids[user] + else + n_user += 1 + u = n_user + user_ids[user] = n_user + end + if haskey(item_ids, item) + i = item_ids[item] + else + n_item += 1 + i = n_item + item_ids[item] = n_item + end + push!(events, Event(u, i, cnt)) + end + end + DataAccessor(events, n_user, n_item) +end diff --git a/test/test_datasets.jl b/test/test_datasets.jl index e87e44e..dea5ae1 100644 --- a/test/test_datasets.jl +++ b/test/test_datasets.jl @@ -91,9 +91,18 @@ function test_load_amazon_review() @test data.R[1, 1] == 5.0 end +function test_load_lastfm() + path = tempname() + println("-- Testing download and read Last.FM user-artist listening frequency dataset at: $path") + + data = load_lastfm(path) + @test data.R[1, 1] == 13883 +end + test_get_data_home() test_download_file() test_unzip() test_load_movielens_100k() test_load_movielens_latest() test_load_amazon_review() +test_load_lastfm() From 209605121b4c255950ffa09eb92e6fb6a9064bc6 Mon Sep 17 00:00:00 2001 From: Takuya Kitazawa Date: Sun, 13 Feb 2022 06:46:29 -0800 Subject: [PATCH 4/4] Conditionally trigger unit tests that download 3rd-party data --- README.md | 10 ++++++++-- test/test_datasets.jl | 13 ++++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git 
a/README.md b/README.md index ad3e130..8a750ec 100644 --- a/README.md +++ b/README.md @@ -32,8 +32,14 @@ Change the code and test locally: ``` $ julia julia> using Pkg; Pkg.activate(@__DIR__); Pkg.instantiate() -# hit `]` -(Recommendation) pkg> test +julia> Pkg.test("Recommendation") +julia> Pkg.test("Recommendation", test_args=["download"]) +``` + +Note that unit tests for dataset loaders (e.g., `load_movielens_latest()`) are conditionally triggered as follows, so that CI does not make excessive download requests to the external sites: + +``` +julia> Pkg.test("Recommendation", test_args=["download"]) ``` Build documentation contents: diff --git a/test/test_datasets.jl b/test/test_datasets.jl index dea5ae1..e1331a2 100644 --- a/test/test_datasets.jl +++ b/test/test_datasets.jl @@ -100,9 +100,12 @@ function test_load_lastfm() end test_get_data_home() -test_download_file() test_unzip() -test_load_movielens_100k() -test_load_movielens_latest() -test_load_amazon_review() -test_load_lastfm() + +if "download" in ARGS + test_download_file() + test_load_movielens_100k() + test_load_movielens_latest() + test_load_amazon_review() + test_load_lastfm() +end