Skip to content

Commit

Permalink
Merge pull request #43 from takuti/lastfm
Browse files Browse the repository at this point in the history
Add `load_lastfm` data loader to get the user-artist listening frequency data
  • Loading branch information
takuti authored Feb 13, 2022
2 parents 42fcd36 + 2096051 commit 75dfc22
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 12 deletions.
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,14 @@ Change the code and test locally:
```
$ julia
julia> using Pkg; Pkg.activate(@__DIR__); Pkg.instantiate()
# hit `]`
(Recommendation) pkg> test
julia> Pkg.test("Recommendation")
julia> Pkg.test("Recommendation", test_args=["download"])
```

Note that unit tests for dataset loaders (e.g., `load_movielens_latest()`) are triggered only when the `download` test argument is passed, as follows, so that CI does not make excessive download requests to external sites:

```
julia> Pkg.test("Recommendation", test_args=["download"])
```

Build documentation contents:
Expand Down
55 changes: 49 additions & 6 deletions src/datasets.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
export get_data_home, download_file, unzip, load_movielens_100k, load_movielens_latest, load_amazon_review
export get_data_home, download_file, unzip, load_movielens_100k, load_movielens_latest, load_amazon_review, load_lastfm

"""
get_data_home([data_home=nothing]) -> String
Expand Down Expand Up @@ -48,6 +48,7 @@ function unzip(path::String, exdir::Union{String, Nothing}=nothing)
if exdir == nothing
exdir = dirname(path)
end
get_data_home(exdir)
zip_reader = ZipFile.Reader(path)
for file in zip_reader.files
out_path = joinpath(exdir, file.name)
Expand Down Expand Up @@ -125,18 +126,14 @@ end


"""
load_movielens_latest([path=nothing])
load_movielens_latest([path=nothing]) -> DataAccessor
`path` points to a locally saved [MovieLens Latest (Small)](https://files.grouplens.org/datasets/movielens/ml-latest-small-README.html).
Read user-item-rating triples in the folder, and convert them into a `DataAccessor` instance.
Download and decompress a corresponding zip file, if `path` is not given or the specified folder does not exist.
"""
function load_movielens_latest(path::Union{String, Nothing}=nothing)
n_user = 610
n_item = 9742
R = matrix(n_user, n_item)

if path == nothing || !isdir(path)
zip_path = path
if zip_path != nothing
Expand Down Expand Up @@ -282,3 +279,49 @@ function load_amazon_review(path::Union{String, Nothing}=nothing; category::Stri
end
DataAccessor(events, n_user, n_item)
end

"""
    load_lastfm([path=nothing]) -> DataAccessor

`path` points to a locally saved [HetRec 2011 Last.FM dataset](https://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-readme.txt).

Read `user_artists.dat` in the folder, and convert it into a `DataAccessor` instance. The first line of the file is skipped as a header; each remaining row is a tab-separated tuple of (user, artist, # of listenings). Raw user and artist IDs are re-numbered to consecutive integers starting from 1, in order of first appearance.

Download and decompress the corresponding zip file, if `path` is not given or the specified folder does not exist.
"""
function load_lastfm(path::Union{String, Nothing}=nothing)
    if path === nothing || !isdir(path)
        zip_path = path
        if zip_path !== nothing
            # Save the archive next to the requested folder.
            zip_path = joinpath(dirname(zip_path), "hetrec2011-lastfm-2k.zip")
        end
        zip_path = download_file("https://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-2k.zip", zip_path)
        path = unzip(zip_path, path)
    end

    events = Array{Event, 1}()
    # Raw ID => consecutive 1-based index. Concrete `Int` parameters keep the
    # dictionaries type-stable (the parsed IDs are always `Int`).
    user_ids, item_ids = Dict{Int, Int}(), Dict{Int, Int}()
    open(joinpath(path, "user_artists.dat"), "r") do io
        for (index, line) in enumerate(eachline(io))
            # Skip the header row, and tolerate blank lines (e.g., a trailing newline).
            if index == 1 || isempty(strip(line))
                continue
            end
            l = split(line, '\t')
            user, item, cnt = parse(Int, l[1]), parse(Int, l[2]), parse(Int, l[3])
            # An unseen ID gets the next free index; the default is computed
            # before insertion, so it equals the number of known IDs plus one.
            u = get!(user_ids, user, length(user_ids) + 1)
            i = get!(item_ids, item, length(item_ids) + 1)
            push!(events, Event(u, i, cnt))
        end
    end
    # The number of distinct users/items equals the size of each ID mapping.
    DataAccessor(events, length(user_ids), length(item_ids))
end
20 changes: 16 additions & 4 deletions test/test_datasets.jl
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,21 @@ function test_load_amazon_review()
@test data.R[1, 1] == 5.0
end

# Download the Last.FM dataset into a fresh temporary location, load it, and
# check the listening count stored for the first (user, artist) pair.
function test_load_lastfm()
    path = tempname()
    println("-- Testing download and read Last.FM user-artist listening frequency dataset at: $path")

    accessor = load_lastfm(path)
    first_count = accessor.R[1, 1]
    @test first_count == 13883
end

# Run the dataset tests.
# NOTE(review): several of the calls below also appear inside the `"download"`
# guard further down, so they would run twice when that argument is passed. This
# looks like removed diff lines rendered next to their replacements — confirm
# against the actual file.
test_get_data_home()
test_download_file()
test_unzip()
test_load_movielens_100k()
test_load_movielens_latest()
test_load_amazon_review()

# Run these tests only when "download" is among the command-line arguments,
# e.g. via `Pkg.test("Recommendation", test_args=["download"])`.
if "download" in ARGS
test_download_file()
test_load_movielens_100k()
test_load_movielens_latest()
test_load_amazon_review()
test_load_lastfm()
end

0 comments on commit 75dfc22

Please sign in to comment.