Skip to content

Commit

Permalink
Move default location of data to the user home folder (#652)
Browse files Browse the repository at this point in the history
* Change default location of data_home

* Removed the `clear_data_home` function

* Add changes to CHANGES.rst

* Apply suggestions from code review

Co-authored-by: Jérémie du Boisberranger <[email protected]>

* update _utils.py and test_utils.py to use pathlib instead os.path

* Propagate changes

* Update skrub/datasets/tests/test_utils.py

Co-authored-by: Guillaume Lemaitre <[email protected]>

* Update skrub/datasets/_utils.py

Co-authored-by: Guillaume Lemaitre <[email protected]>

* Update skrub/datasets/_utils.py

Co-authored-by: Guillaume Lemaitre <[email protected]>

* Update skrub/datasets/tests/test_utils.py

Co-authored-by: Guillaume Lemaitre <[email protected]>

* Update skrub/datasets/_utils.py

Co-authored-by: Lilian <[email protected]>

* Update skrub/datasets/_utils.py

Co-authored-by: Lilian <[email protected]>

* Update skrub/datasets/_utils.py

Co-authored-by: Lilian <[email protected]>

* Update skrub/datasets/_utils.py

Co-authored-by: Lilian <[email protected]>

* Update skrub/datasets/tests/test_utils.py

Co-authored-by: Guillaume Lemaitre <[email protected]>

* Update skrub/datasets/tests/test_utils.py

Co-authored-by: Guillaume Lemaitre <[email protected]>

* allow data_home to be a string

* Improved test for `get_data_home`

* Apply suggestions from code review

Co-authored-by: Guillaume Lemaitre <[email protected]>

* Simplified test for `get_data_home`

* change `data_home` for `data_directory` when needed

* Fixes bug with user directory

* Correct type hint

* Update skrub/datasets/_utils.py

Co-authored-by: Lilian <[email protected]>

* Apply suggestions from code review

* Make test work on my machine

* Refactor tests

* Tests working on all platforms

* Tests more robust


---------

Co-authored-by: Jérémie du Boisberranger <[email protected]>
Co-authored-by: Guillaume Lemaitre <[email protected]>
Co-authored-by: Lilian <[email protected]>
Co-authored-by: Gael Varoquaux <[email protected]>
  • Loading branch information
5 people authored Jul 21, 2023
1 parent c47d7ea commit dd35027
Show file tree
Hide file tree
Showing 7 changed files with 170 additions and 66 deletions.
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ Minor changes
by converting them to string before type inference.
:pr:`623` by :user:`Leo Grinsztajn <LeoGrin>`

* Moved the default storage location of data to the user's home folder.
:pr:`652` by :user:`Felix Lefebvre <flefebv>` and
:user:`Gael Varoquaux <GaelVaroquaux>`

* Fixed bug when using :class:`TableVectorizer`'s `transform` method on
categorical columns with missing values.
:pr:`644` by :user:`Leo Grinsztajn <LeoGrin>`
Expand Down
13 changes: 9 additions & 4 deletions benchmarks/bench_fuzzy_join_sparse_vs_dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@
"""

import math
from pathlib import Path
from utils import default_parser, find_result, monitor
from utils.join import evaluate, fetch_big_data
from argparse import ArgumentParser
import numbers
import warnings
from argparse import ArgumentParser
from collections.abc import Iterable
from time import perf_counter
from typing import Literal
Expand All @@ -29,8 +32,6 @@
)
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from utils import default_parser, find_result, monitor
from utils.join import evaluate, fetch_big_data


def _numeric_encoding(
Expand Down Expand Up @@ -525,8 +526,12 @@ def benchmark(
dataset_name: str,
analyzer: Literal["char_wb", "char", "word"],
ngram_range: tuple,
data_home: Path | str | None = None,
data_directory: str | None = "benchmarks_data",
):
left_table, right_table, gt = fetch_big_data(dataset_name)
left_table, right_table, gt = fetch_big_data(
dataset_name=dataset_name, data_home=data_home, data_directory=data_directory
)

start_time = perf_counter()
joined_fj = fuzzy_join(
Expand Down
39 changes: 33 additions & 6 deletions benchmarks/utils/join.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import pandas as pd
from pathlib import Path

from skrub.datasets._utils import get_data_dir


def get_local_data(dataset_name: str, data_directory: str = None):
def get_local_data(
dataset_name: str,
data_home: Path | str | None = None,
data_directory: str | None = None,
):
"""Get the path to the local datasets."""
if data_directory is None:
data_directory = get_data_dir("benchmarks_data")
data_directory = get_data_dir(data_directory, data_home)
left_path = str(data_directory) + f"/left_{dataset_name}.parquet"
right_path = str(data_directory) + f"/right_{dataset_name}.parquet"
gt_path = str(data_directory) + f"/gt_{dataset_name}.parquet"
Expand All @@ -21,7 +25,10 @@ def get_local_data(dataset_name: str, data_directory: str = None):


def fetch_data(
dataset_name: str, save: bool = True
dataset_name: str,
save: bool = True,
data_home: Path | str | None = None,
data_directory: str | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Fetch datasets from https://github.com/Yeye-He/Auto-Join/tree/master/autojoin-Benchmark # noqa
Expand All @@ -33,6 +40,13 @@ def fetch_data(
save: bool, default=True
Whether to save the datasets locally.
data_home: Path or str, optional
The path to the root data directory.
By default, will point to the skrub data directory.
data_directory: str, optional
The name of the subdirectory in which data is stored.
Returns
-------
left: pd.DataFrame
Expand All @@ -44,7 +58,9 @@ def fetch_data(
gt: pd.DataFrame
Ground truth dataset.
"""
left_path, right_path, gt_path, file_paths = get_local_data(dataset_name)
left_path, right_path, gt_path, file_paths = get_local_data(
dataset_name, data_home, data_directory
)
if len(file_paths) == 0:
repository = "Yeye-He/Auto-Join"
dataset_name = dataset_name.replace(" ", "%20")
Expand All @@ -70,6 +86,8 @@ def fetch_big_data(
dataset_name: str,
data_type: str = "Dirty",
save: bool = True,
data_home: Path | str | None = None,
data_directory: str | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Fetch datasets from https://github.com/anhaidgroup/deepmatcher/blob/master/Datasets.md # noqa
Expand All @@ -85,6 +103,13 @@ def fetch_big_data(
save: bool, default=True
Whether to save the datasets locally.
data_home: Path or str, optional
The path to the root data directory.
By default, will point to the skrub data directory.
data_directory: str, optional
The name of the subdirectory in which data is stored.
Returns
-------
left: pd.DataFrame
Expand All @@ -97,7 +122,9 @@ def fetch_big_data(
Ground truth dataset.
"""
link = "https://pages.cs.wisc.edu/~anhai/data1/deepmatcher_data/"
left_path, right_path, gt_path, file_paths = get_local_data(dataset_name)
left_path, right_path, gt_path, file_paths = get_local_data(
dataset_name, data_home, data_directory
)
if len(file_paths) == 0:
test_idx = pd.read_csv(f"{link}/{data_type}/{dataset_name}/exp_data/test.csv")
train_idx = pd.read_csv(f"{link}/{data_type}/{dataset_name}/exp_data/train.csv")
Expand Down
65 changes: 35 additions & 30 deletions skrub/datasets/_fetching.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,15 +152,16 @@ def _fetch_openml_dataset(
dataset_id: int,
data_directory: Path | None = None,
) -> dict[str, Any]:
"""Gets a dataset from OpenML (https://www.openml.org).
"""
Gets a dataset from OpenML (https://www.openml.org).
Parameters
----------
dataset_id : int
The ID of the dataset to fetch.
data_directory : Path, optional
A directory to save the data to.
By default, the skrub data directory.
data_directory : pathlib.Path, optional
The directory where the dataset is stored.
By default, a subdirectory "openml" in the skrub data directory.
Returns
-------
Expand All @@ -176,7 +177,7 @@ def _fetch_openml_dataset(
saved as a CSV file.
"""
if data_directory is None:
data_directory = get_data_dir()
data_directory = get_data_dir(name="openml")

# Make path absolute
data_directory = data_directory.resolve()
Expand Down Expand Up @@ -242,9 +243,9 @@ def _fetch_world_bank_data(
----------
indicator_id : str
The ID of the indicator's dataset to fetch.
data_directory : Path, optional
A directory to save the data to.
By default, the skrub data directory.
data_directory : pathlib.Path, optional
The directory where the dataset is stored.
By default, a subdirectory "world_bank" in the skrub data directory.
Returns
-------
Expand All @@ -260,7 +261,7 @@ def _fetch_world_bank_data(
saved as a CSV file.
"""
if data_directory is None:
data_directory = get_data_dir()
data_directory = get_data_dir(name="world_bank")

csv_path = (data_directory / f"{indicator_id}.csv").resolve()
data_directory.mkdir(parents=True, exist_ok=True)
Expand Down Expand Up @@ -326,8 +327,8 @@ def _fetch_figshare(
figshare_id : str
The ID of the dataset to fetch.
data_directory : pathlib.Path, optional
A directory to save the data to.
By default, the skrub data directory.
The directory where the dataset is stored.
By default, a subdirectory "figshare" in the skrub data directory.
Returns
-------
Expand All @@ -347,7 +348,8 @@ def _fetch_figshare(
pyarrow installed to run correctly.
"""
if data_directory is None:
data_directory = get_data_dir()
data_directory = get_data_dir(name="figshare")

parquet_path = (data_directory / f"figshare_{figshare_id}.parquet").resolve()
data_directory.mkdir(parents=True, exist_ok=True)
url = f"https://ndownloader.figshare.com/files/{figshare_id}"
Expand Down Expand Up @@ -666,7 +668,7 @@ def fetch_employee_salaries(
load_dataframe: bool = True,
drop_linked: bool = True,
drop_irrelevant: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the employee salaries dataset (regression), available at https://openml.org/d/42125
Expand All @@ -685,6 +687,9 @@ def fetch_employee_salaries(
Drops column "full_name", which is usually irrelevant to the
statistical analysis.
data_directory: pathlib.Path or str, optional
The directory where the dataset is stored.
Returns
-------
DatasetAll
Expand All @@ -704,7 +709,7 @@ def fetch_employee_salaries(
"na_values": ["?"],
},
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)
if load_dataframe:
if drop_linked:
Expand All @@ -720,7 +725,7 @@ def fetch_employee_salaries(
def fetch_road_safety(
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the road safety dataset (classification), available at https://openml.org/d/42803
Expand All @@ -747,14 +752,14 @@ def fetch_road_safety(
"na_values": ["?"],
},
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)


def fetch_medical_charge(
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the medical charge dataset (regression), available at https://openml.org/d/42720
Expand Down Expand Up @@ -786,14 +791,14 @@ def fetch_medical_charge(
"escapechar": "\\",
},
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)


def fetch_midwest_survey(
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the midwest survey dataset (classification), available at https://openml.org/d/42805
Expand All @@ -818,14 +823,14 @@ def fetch_midwest_survey(
"escapechar": "\\",
},
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)


def fetch_open_payments(
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the open payments dataset (classification), available at https://openml.org/d/42738
Expand All @@ -852,14 +857,14 @@ def fetch_open_payments(
"na_values": ["?"],
},
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)


def fetch_traffic_violations(
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the traffic violations dataset (classification), available at https://openml.org/d/42132
Expand Down Expand Up @@ -888,14 +893,14 @@ def fetch_traffic_violations(
"na_values": ["?"],
},
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)


def fetch_drug_directory(
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the drug directory dataset (classification), available at https://openml.org/d/43044
Expand All @@ -921,15 +926,15 @@ def fetch_drug_directory(
"escapechar": "\\",
},
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)


def fetch_world_bank_indicator(
indicator_id: str,
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches a dataset of an indicator from the World Bank open data platform.
Expand All @@ -952,15 +957,15 @@ def fetch_world_bank_indicator(
dataset_id=indicator_id,
target=None,
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)


def fetch_figshare(
figshare_id: str,
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches a table from figshare.
Expand All @@ -978,5 +983,5 @@ def fetch_figshare(
dataset_id=figshare_id,
target=None,
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)
Loading

0 comments on commit dd35027

Please sign in to comment.