Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move default location of data to the user home folder #652

Merged
merged 40 commits into from
Jul 21, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
3e00a8c
Change default location of data_home
flefebv Jul 18, 2023
b786dc9
Removed the `clear_data_home` function
flefebv Jul 18, 2023
9feaef7
Add changes to CHANGES.rst
flefebv Jul 18, 2023
4ea3570
Apply suggestions from code review
flefebv Jul 18, 2023
5b22b55
update _utils.py and test_utils.py to use pathlib instead os.path
flefebv Jul 19, 2023
3f6b8b1
Propagate changes
flefebv Jul 19, 2023
d3254fd
Update skrub/datasets/tests/test_utils.py
flefebv Jul 19, 2023
2e63488
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
16189e5
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
78ba357
Update skrub/datasets/tests/test_utils.py
flefebv Jul 19, 2023
c686850
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
01e8227
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
eeeb2ca
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
c0251bc
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
ccb9f56
Update skrub/datasets/tests/test_utils.py
flefebv Jul 19, 2023
79bac36
Update skrub/datasets/tests/test_utils.py
flefebv Jul 19, 2023
56bbb15
allow data_home to be a string
flefebv Jul 19, 2023
c538354
Improved test for `get_data_home`
flefebv Jul 19, 2023
45afc07
Apply suggestions from code review
LilianBoulard Jul 19, 2023
7fd24e8
Simplified test for `get_data_home`
flefebv Jul 19, 2023
435b2e7
change `data_home` for `data_directory` when needed
flefebv Jul 19, 2023
9a0722d
Merge branch 'main' of github.com:flefebv/skrub into data_home
flefebv Jul 19, 2023
0b031b2
Fixes bug with user directory
flefebv Jul 19, 2023
64d46c1
Correct type hint
flefebv Jul 19, 2023
19c661f
Update skrub/datasets/_utils.py
GaelVaroquaux Jul 20, 2023
7886313
Apply suggestions from code review
LilianBoulard Jul 20, 2023
3fafe8d
Merge branch 'main' into pr_652
GaelVaroquaux Jul 21, 2023
5764738
make black happy
GaelVaroquaux Jul 21, 2023
eaa8c7d
Make test work on my machine
GaelVaroquaux Jul 21, 2023
dee3b98
Refactor tests
GaelVaroquaux Jul 21, 2023
34e29ae
make black happy
GaelVaroquaux Jul 21, 2023
e987fc0
make black happy
GaelVaroquaux Jul 21, 2023
f81593b
Tests working on all platforms
GaelVaroquaux Jul 21, 2023
5ce6e68
Tests more robust
GaelVaroquaux Jul 21, 2023
9f640dc
Last commit was an error
GaelVaroquaux Jul 21, 2023
5d67623
Try to make tests more robust
GaelVaroquaux Jul 21, 2023
feab5f9
tired
GaelVaroquaux Jul 21, 2023
19471d8
wired
GaelVaroquaux Jul 21, 2023
8f9559f
Try something
GaelVaroquaux Jul 21, 2023
c7dbeab
Cleaner code
GaelVaroquaux Jul 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ Minor changes
by converting them to string before type inference.
:pr:`623`by :user:`Leo Grinsztajn <LeoGrin>`

* Moved the default storage location of data to the user's home folder.
:pr:`652` by :user:`Felix Lefebvre <flefebv>`

Before skrub: dirty_cat
========================

Expand Down
43 changes: 38 additions & 5 deletions skrub/datasets/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,39 @@
from pathlib import Path


def get_data_dir(name: str | None = None) -> Path:
def get_data_home(data_home=None) -> str:
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
"""Returns the path of the skrub data directory.

This folder is used by some large dataset loaders to avoid downloading the
data several times.

By default the data directory is set to a folder named 'skrub_data' in the
user home folder.

Alternatively, it can be set programmatically by giving an explicit folder
path. The '~' symbol is expanded to the user home folder.

If the folder does not already exist, it is automatically created.

Parameters
----------
data_home : str, default=None
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
The path to skrub data directory. If `None`, the default path
is `~/skrub_data`.

Returns
-------
data_home: str or path-like, default=None
flefebv marked this conversation as resolved.
Show resolved Hide resolved
flefebv marked this conversation as resolved.
Show resolved Hide resolved
The path to skrub data directory.
flefebv marked this conversation as resolved.
Show resolved Hide resolved
"""
if data_home is None:
data_home = os.path.join("~", "skrub_data")
data_home = os.path.expanduser(data_home)
os.makedirs(data_home, exist_ok=True)
return data_home
flefebv marked this conversation as resolved.
Show resolved Hide resolved


def get_data_dir(name: str | None = None, data_home: str | None = None) -> Path:
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
"""
Returns the directory in which skrub looks for data.

Expand All @@ -11,13 +43,14 @@ def get_data_dir(name: str | None = None) -> Path:

Parameters
----------
data_home : str, default=None
The path to skrub data directory. If `None`, the default path
is `~/skrub_data`.
name: str, optional
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
Subdirectory name. If omitted, the root data directory is returned.
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
"""
# Note: we stick to os.path instead of pathlib.Path because
# it's easier to test, and the functionality is the same.
module_path = Path(os.path.dirname(__file__)).resolve()
data_dir = module_path / "data"
data_home = get_data_home(data_home)
data_dir = Path(data_home).resolve()
if name is not None:
data_dir = data_dir / name
return data_dir
58 changes: 49 additions & 9 deletions skrub/datasets/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,58 @@
import os
import shutil
import tempfile
from pathlib import Path
from unittest import mock

from skrub.datasets._utils import get_data_dir, get_data_home

@mock.patch("os.path.dirname")
def test_get_data_dir(mock_os_path_dirname):

def test_get_data_dir():
Copy link
Member

@glemaitre glemaitre Jul 19, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def test_get_data_dir():
@pytest.mark.parametrize("data_home_type", ["string", "path"])
@pytest.mark.parametrize("subdirectory_name", [None, "tests"])
def test_get_data_dir_existing_folder(data_home_type, subdirectory_name):

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can parametrize with subdirectory_name and it will avoid repeating code below

"""
Tests function ``get_data_dir()``.
"""
flefebv marked this conversation as resolved.
Show resolved Hide resolved
from skrub.datasets._utils import get_data_dir
with tempfile.TemporaryDirectory() as tmpdirname:
expected_return_value_default = Path(tmpdirname)
assert get_data_dir(data_home=tmpdirname) == expected_return_value_default

expected_return_value_custom = expected_return_value_default / "tests"
assert (
get_data_dir(name="tests", data_home=tmpdirname)
== expected_return_value_custom
)


def test_get_data_home_str():
"""
Test function for ``get_data_home()`` when `data_home` is a string.
flefebv marked this conversation as resolved.
Show resolved Hide resolved
"""
with tempfile.TemporaryDirectory() as tmpdirname:
# get_data_home will point to a pre-existing folder
assert os.path.exists(tmpdirname)
data_home = get_data_home(data_home=tmpdirname)
assert data_home == tmpdirname
assert os.path.exists(data_home)

# if the folder is missing it will be created again
shutil.rmtree(data_home)
assert not os.path.exists(data_home)
assert not os.path.exists(tmpdirname)
data_home = get_data_home(data_home=tmpdirname)
assert os.path.exists(data_home)


expected_return_value_default = Path("/user/directory/data").absolute()
def test_get_data_home_None():
flefebv marked this conversation as resolved.
Show resolved Hide resolved
"""
Test function for ``get_data_home()`` with `data_home` set to `None`.
"""
flefebv marked this conversation as resolved.
Show resolved Hide resolved
# get path of the folder 'skrub_data' in the user home folder
flefebv marked this conversation as resolved.
Show resolved Hide resolved
user_path = os.path.join("~", "skrub_data")
user_path = os.path.expanduser(user_path)

mock_os_path_dirname.return_value = expected_return_value_default.parent
assert get_data_dir() == expected_return_value_default
# if the folder does not exist, create it, and clear it
flefebv marked this conversation as resolved.
Show resolved Hide resolved
if not os.path.exists(user_path):
data_home = get_data_home(data_home=None)
assert data_home == user_path
assert os.path.exists(data_home)

expected_return_value_custom = expected_return_value_default / "tests"
assert get_data_dir("tests") == expected_return_value_custom
shutil.rmtree(data_home)
assert not os.path.exists(data_home)