Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add load_lastfm data loader to get the user-artist listening frequency data #43

Merged
merged 4 commits into from
Feb 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,14 @@ Change the code and test locally:
```
$ julia
julia> using Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()
# hit `]`
(Recommendation) pkg> test
julia> Pkg.test("Recommendation")
julia> Pkg.test("Recommendation", test_args=["download"])
```

Note that unit tests for dataset loaders (e.g., `load_movielens_latest()`) are triggered only when the `download` test argument is passed, as shown below, so that CI does not make excessive download requests to external sites:

```
julia> Pkg.test("Recommendation", test_args=["download"])
```

Build documentation contents:
Expand Down
55 changes: 49 additions & 6 deletions src/datasets.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
export get_data_home, download_file, unzip, load_movielens_100k, load_movielens_latest, load_amazon_review
export get_data_home, download_file, unzip, load_movielens_100k, load_movielens_latest, load_amazon_review, load_lastfm

"""
get_data_home([data_home=nothing]) -> String
Expand Down Expand Up @@ -48,6 +48,7 @@ function unzip(path::String, exdir::Union{String, Nothing}=nothing)
if exdir == nothing
exdir = dirname(path)
end
get_data_home(exdir)
zip_reader = ZipFile.Reader(path)
for file in zip_reader.files
out_path = joinpath(exdir, file.name)
Expand Down Expand Up @@ -125,18 +126,14 @@ end


"""
load_movielens_latest([path=nothing])
load_movielens_latest([path=nothing]) -> DataAccessor

`path` points to a locally saved [MovieLens Latest (Small)](https://files.grouplens.org/datasets/movielens/ml-latest-small-README.html).
Read user-item-rating triples in the folder, and convert them into a `DataAccessor` instance.

Download and decompress a corresponding zip file, if `path` is not given or the specified folder does not exist.
"""
function load_movielens_latest(path::Union{String, Nothing}=nothing)
n_user = 610
n_item = 9742
R = matrix(n_user, n_item)

if path == nothing || !isdir(path)
zip_path = path
if zip_path != nothing
Expand Down Expand Up @@ -282,3 +279,49 @@ function load_amazon_review(path::Union{String, Nothing}=nothing; category::Stri
end
DataAccessor(events, n_user, n_item)
end

"""
load_lastfm([path=nothing]) -> DataAccessor

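`path` points to a locally saved [HetRec 2011 Last.FM dataset](https://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-readme.txt).
Read (user, artist, # of listenings) triples in the folder, and convert them into a `DataAccessor` instance.

Download and decompress a corresponding zip file, if `path` is not given or the specified folder does not exist.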
"""
function load_lastfm(path::Union{String, Nothing}=nothing)
    # Fetch and extract the archive when no usable local folder is available.
    if path === nothing || !isdir(path)
        zip_path = path
        if zip_path !== nothing
            # Store the zip file next to the requested extraction folder.
            zip_path = joinpath(dirname(zip_path), "hetrec2011-lastfm-2k.zip")
        end
        zip_path = download_file("https://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-2k.zip", zip_path)
        path = unzip(zip_path, path)
    end

    events = Event[]
    # Map raw dataset IDs to consecutive 1-based indices in order of first appearance.
    user_ids, item_ids = Dict{Int, Int}(), Dict{Int, Int}()
    open(joinpath(path, "user_artists.dat"), "r") do io
        # The first line is a column header, not data.
        for line in Iterators.drop(eachline(io), 1)
            # Tolerate blank lines (e.g., a trailing newline at the end of the file).
            isempty(strip(line)) && continue
            fields = split(line, '\t')
            user, item, cnt = parse(Int, fields[1]), parse(Int, fields[2]), parse(Int, fields[3])
            # An unseen ID receives the next free index; a known ID keeps its index.
            u = get!(user_ids, user, length(user_ids) + 1)
            i = get!(item_ids, item, length(item_ids) + 1)
            push!(events, Event(u, i, cnt))
        end
    end
    # Every distinct raw ID owns exactly one index, so the dict sizes are the counts.
    DataAccessor(events, length(user_ids), length(item_ids))
end
20 changes: 16 additions & 4 deletions test/test_datasets.jl
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,21 @@ function test_load_amazon_review()
@test data.R[1, 1] == 5.0
end

# Download the Last.FM dataset into a fresh temporary location and check that
# the loaded user-artist matrix holds the expected listening count at [1, 1].
function test_load_lastfm()
    path = tempname()
    println("-- Testing download and read Last.FM user-artist listening frequency dataset at: $path")

    # `path` does not exist yet, so the loader takes its download-and-unzip branch.
    @test load_lastfm(path).R[1, 1] == 13883
end

test_get_data_home()
test_download_file()
test_unzip()
test_load_movielens_100k()
test_load_movielens_latest()
test_load_amazon_review()

# Tests that download from external sites run only on explicit request, e.g.
# Pkg.test("Recommendation", test_args=["download"]).
if "download" ∈ ARGS
    for run_test in (
        test_download_file,
        test_load_movielens_100k,
        test_load_movielens_latest,
        test_load_amazon_review,
        test_load_lastfm,
    )
        run_test()
    end
end