Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move default location of data to the user home folder #652

Merged
merged 40 commits into from
Jul 21, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
3e00a8c
Change default location of data_home
flefebv Jul 18, 2023
b786dc9
Removed the `clear_data_home` function
flefebv Jul 18, 2023
9feaef7
Add changes to CHANGES.rst
flefebv Jul 18, 2023
4ea3570
Apply suggestions from code review
flefebv Jul 18, 2023
5b22b55
update _utils.py and test_utils.py to use pathlib instead os.path
flefebv Jul 19, 2023
3f6b8b1
Propagate changes
flefebv Jul 19, 2023
d3254fd
Update skrub/datasets/tests/test_utils.py
flefebv Jul 19, 2023
2e63488
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
16189e5
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
78ba357
Update skrub/datasets/tests/test_utils.py
flefebv Jul 19, 2023
c686850
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
01e8227
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
eeeb2ca
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
c0251bc
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
ccb9f56
Update skrub/datasets/tests/test_utils.py
flefebv Jul 19, 2023
79bac36
Update skrub/datasets/tests/test_utils.py
flefebv Jul 19, 2023
56bbb15
allow data_home to be a string
flefebv Jul 19, 2023
c538354
Improved test for `get_data_home`
flefebv Jul 19, 2023
45afc07
Apply suggestions from code review
LilianBoulard Jul 19, 2023
7fd24e8
Simplified test for `get_data_home`
flefebv Jul 19, 2023
435b2e7
change `data_home` for `data_directory` when needed
flefebv Jul 19, 2023
9a0722d
Merge branch 'main' of github.com:flefebv/skrub into data_home
flefebv Jul 19, 2023
0b031b2
Fixes bug with user directory
flefebv Jul 19, 2023
64d46c1
Correct type hint
flefebv Jul 19, 2023
19c661f
Update skrub/datasets/_utils.py
GaelVaroquaux Jul 20, 2023
7886313
Apply suggestions from code review
LilianBoulard Jul 20, 2023
3fafe8d
Merge branch 'main' into pr_652
GaelVaroquaux Jul 21, 2023
5764738
make black happy
GaelVaroquaux Jul 21, 2023
eaa8c7d
Make test work on my machine
GaelVaroquaux Jul 21, 2023
dee3b98
Refactor tests
GaelVaroquaux Jul 21, 2023
34e29ae
make black happy
GaelVaroquaux Jul 21, 2023
e987fc0
make black happy
GaelVaroquaux Jul 21, 2023
f81593b
Tests working on all platforms
GaelVaroquaux Jul 21, 2023
5ce6e68
Tests more robust
GaelVaroquaux Jul 21, 2023
9f640dc
Last commit was an error
GaelVaroquaux Jul 21, 2023
5d67623
Try to make tests more robust
GaelVaroquaux Jul 21, 2023
feab5f9
tired
GaelVaroquaux Jul 21, 2023
19471d8
wired
GaelVaroquaux Jul 21, 2023
8f9559f
Try something
GaelVaroquaux Jul 21, 2023
c7dbeab
Cleaner code
GaelVaroquaux Jul 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion benchmarks/bench_fuzzy_join_sparse_vs_dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"""

import math
from pathlib import Path
from utils import default_parser, find_result, monitor
from utils.join import evaluate, fetch_big_data
from argparse import ArgumentParser
Expand Down Expand Up @@ -526,8 +527,12 @@ def benchmark(
dataset_name: str,
analyzer: Literal["char_wb", "char", "word"],
ngram_range: tuple,
data_home: Path | str = None,
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
data_directory: str = "benchmarks_data",
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
):
left_table, right_table, gt = fetch_big_data(dataset_name)
left_table, right_table, gt = fetch_big_data(
dataset_name=dataset_name, data_home=data_home, data_directory=data_directory
)

start_time = perf_counter()
joined_fj = fuzzy_join(
Expand Down
37 changes: 31 additions & 6 deletions benchmarks/utils/join.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import pandas as pd
from pathlib import Path
from skrub.datasets._utils import get_data_dir


def get_local_data(dataset_name: str, data_directory: str = None):
def get_local_data(
dataset_name: str, data_home: Path | str = None, data_directory: str = None
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
):
"""Get the path to the local datasets."""
if data_directory is None:
data_directory = get_data_dir("benchmarks_data")
data_directory = get_data_dir(data_directory, data_home)
left_path = str(data_directory) + f"/left_{dataset_name}.parquet"
right_path = str(data_directory) + f"/right_{dataset_name}.parquet"
gt_path = str(data_directory) + f"/gt_{dataset_name}.parquet"
Expand All @@ -20,7 +22,10 @@ def get_local_data(dataset_name: str, data_directory: str = None):


def fetch_data(
dataset_name: str, save: bool = True
dataset_name: str,
save: bool = True,
data_home: Path | str = None,
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
data_directory: str = None,
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Fetch datasets from https://github.com/Yeye-He/Auto-Join/tree/master/autojoin-Benchmark

Expand All @@ -32,6 +37,13 @@ def fetch_data(
save: bool, default=true
Whether to save the datasets locally.

data_home: Path or str, optional
The path to the root data directory.
By default, will point to the skrub data directory.

data_directory: str, default=None
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
The name of the subdirectory in which data is stored.

Returns
-------
left: pd.DataFrame
Expand All @@ -43,7 +55,9 @@ def fetch_data(
gt: pd.DataFrame
Ground truth dataset.
"""
left_path, right_path, gt_path, file_paths = get_local_data(dataset_name)
left_path, right_path, gt_path, file_paths = get_local_data(
dataset_name, data_home, data_directory
)
if len(file_paths) == 0:
repository = "Yeye-He/Auto-Join"
dataset_name = dataset_name.replace(" ", "%20")
Expand All @@ -69,6 +83,8 @@ def fetch_big_data(
dataset_name: str,
data_type: str = "Dirty",
save: bool = True,
data_home: Path | str = None,
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
data_directory: str = None,
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Fetch datasets from https://github.com/anhaidgroup/deepmatcher/blob/master/Datasets.md

Expand All @@ -84,6 +100,13 @@ def fetch_big_data(
save: bool, default=true
Whether to save the datasets locally.

data_home: Path or str, optional
The path to the root data directory.
By default, will point to the skrub data directory.

data_directory: str, default=None
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
The name of the subdirectory in which data is stored.

Returns
-------
left: pd.DataFrame
Expand All @@ -96,7 +119,9 @@ def fetch_big_data(
Ground truth dataset.
"""
link = "https://pages.cs.wisc.edu/~anhai/data1/deepmatcher_data/"
left_path, right_path, gt_path, file_paths = get_local_data(dataset_name)
left_path, right_path, gt_path, file_paths = get_local_data(
dataset_name, data_home, data_directory
)
if len(file_paths) == 0:
test_idx = pd.read_csv(f"{link}/{data_type}/{dataset_name}/exp_data/test.csv")
train_idx = pd.read_csv(f"{link}/{data_type}/{dataset_name}/exp_data/train.csv")
Expand Down
Loading