Skip to content

Commit

Permalink
Move default location of data to the user home folder (#652)
Browse files Browse the repository at this point in the history
* Change default location of data_home

* Removed the `clear_data_home` function

* Add changes to CHANGES.rst

* Apply suggestions from code review

Co-authored-by: Jérémie du Boisberranger <[email protected]>

* update _utils.py and test_utils.py to use pathlib instead os.path

* Propagate changes

* Update skrub/datasets/tests/test_utils.py

Co-authored-by: Guillaume Lemaitre <[email protected]>

* Update skrub/datasets/_utils.py

Co-authored-by: Guillaume Lemaitre <[email protected]>

* Update skrub/datasets/_utils.py

Co-authored-by: Guillaume Lemaitre <[email protected]>

* Update skrub/datasets/tests/test_utils.py

Co-authored-by: Guillaume Lemaitre <[email protected]>

* Update skrub/datasets/_utils.py

Co-authored-by: Lilian <[email protected]>

* Update skrub/datasets/_utils.py

Co-authored-by: Lilian <[email protected]>

* Update skrub/datasets/_utils.py

Co-authored-by: Lilian <[email protected]>

* Update skrub/datasets/_utils.py

Co-authored-by: Lilian <[email protected]>

* Update skrub/datasets/tests/test_utils.py

Co-authored-by: Guillaume Lemaitre <[email protected]>

* Update skrub/datasets/tests/test_utils.py

Co-authored-by: Guillaume Lemaitre <[email protected]>

* allow data_home to be a string

* Improved test for `get_data_home`

* Apply suggestions from code review

Co-authored-by: Guillaume Lemaitre <[email protected]>

* Simplified test for `get_data_home`

* change `data_home` for `data_directory` when needed

* Fixes bug with user directory

* Correct type hint

* Update skrub/datasets/_utils.py

Co-authored-by: Lilian <[email protected]>

* Apply suggestions from code review

* Make test work on my machine

* Refactor tests

* Tests working on all platforms

* Tests more robust


---------

Co-authored-by: Jérémie du Boisberranger <[email protected]>
Co-authored-by: Guillaume Lemaitre <[email protected]>
Co-authored-by: Lilian <[email protected]>
Co-authored-by: Gael Varoquaux <[email protected]>
  • Loading branch information
5 people authored Jul 21, 2023
1 parent c47d7ea commit dd35027
Show file tree
Hide file tree
Showing 7 changed files with 170 additions and 66 deletions.
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ Minor changes
by converting them to string before type inference.
:pr:`623` by :user:`Leo Grinsztajn <LeoGrin>`

* Moved the default storage location of data to the user's home folder.
:pr:`652` by :user:`Felix Lefebvre <flefebv>` and
:user:`Gael Varoquaux <GaelVaroquaux>`

* Fixed bug when using :class:`TableVectorizer`'s `transform` method on
categorical columns with missing values.
:pr:`644` by :user:`Leo Grinsztajn <LeoGrin>`
Expand Down
13 changes: 9 additions & 4 deletions benchmarks/bench_fuzzy_join_sparse_vs_dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@
"""

import math
from pathlib import Path
from utils import default_parser, find_result, monitor
from utils.join import evaluate, fetch_big_data
from argparse import ArgumentParser
import numbers
import warnings
from argparse import ArgumentParser
from collections.abc import Iterable
from time import perf_counter
from typing import Literal
Expand All @@ -29,8 +32,6 @@
)
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from utils import default_parser, find_result, monitor
from utils.join import evaluate, fetch_big_data


def _numeric_encoding(
Expand Down Expand Up @@ -525,8 +526,12 @@ def benchmark(
dataset_name: str,
analyzer: Literal["char_wb", "char", "word"],
ngram_range: tuple,
data_home: Path | str | None = None,
data_directory: str | None = "benchmarks_data",
):
left_table, right_table, gt = fetch_big_data(dataset_name)
left_table, right_table, gt = fetch_big_data(
dataset_name=dataset_name, data_home=data_home, data_directory=data_directory
)

start_time = perf_counter()
joined_fj = fuzzy_join(
Expand Down
39 changes: 33 additions & 6 deletions benchmarks/utils/join.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import pandas as pd
from pathlib import Path

from skrub.datasets._utils import get_data_dir


def get_local_data(dataset_name: str, data_directory: str = None):
def get_local_data(
dataset_name: str,
data_home: Path | str | None = None,
data_directory: str | None = None,
):
"""Get the path to the local datasets."""
if data_directory is None:
data_directory = get_data_dir("benchmarks_data")
data_directory = get_data_dir(data_directory, data_home)
left_path = str(data_directory) + f"/left_{dataset_name}.parquet"
right_path = str(data_directory) + f"/right_{dataset_name}.parquet"
gt_path = str(data_directory) + f"/gt_{dataset_name}.parquet"
Expand All @@ -21,7 +25,10 @@ def get_local_data(dataset_name: str, data_directory: str = None):


def fetch_data(
dataset_name: str, save: bool = True
dataset_name: str,
save: bool = True,
data_home: Path | str | None = None,
data_directory: str | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Fetch datasets from https://github.com/Yeye-He/Auto-Join/tree/master/autojoin-Benchmark # noqa
Expand All @@ -33,6 +40,13 @@ def fetch_data(
save: bool, default=True
Whether to save the datasets locally.
data_home: Path or str, optional
The path to the root data directory.
By default, will point to the skrub data directory.
data_directory: str, optional
The name of the subdirectory in which data is stored.
Returns
-------
left: pd.DataFrame
Expand All @@ -44,7 +58,9 @@ def fetch_data(
gt: pd.DataFrame
Ground truth dataset.
"""
left_path, right_path, gt_path, file_paths = get_local_data(dataset_name)
left_path, right_path, gt_path, file_paths = get_local_data(
dataset_name, data_home, data_directory
)
if len(file_paths) == 0:
repository = "Yeye-He/Auto-Join"
dataset_name = dataset_name.replace(" ", "%20")
Expand All @@ -70,6 +86,8 @@ def fetch_big_data(
dataset_name: str,
data_type: str = "Dirty",
save: bool = True,
data_home: Path | str | None = None,
data_directory: str | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Fetch datasets from https://github.com/anhaidgroup/deepmatcher/blob/master/Datasets.md # noqa
Expand All @@ -85,6 +103,13 @@ def fetch_big_data(
save: bool, default=True
Whether to save the datasets locally.
data_home: Path or str, optional
The path to the root data directory.
By default, will point to the skrub data directory.
data_directory: str, optional
The name of the subdirectory in which data is stored.
Returns
-------
left: pd.DataFrame
Expand All @@ -97,7 +122,9 @@ def fetch_big_data(
Ground truth dataset.
"""
link = "https://pages.cs.wisc.edu/~anhai/data1/deepmatcher_data/"
left_path, right_path, gt_path, file_paths = get_local_data(dataset_name)
left_path, right_path, gt_path, file_paths = get_local_data(
dataset_name, data_home, data_directory
)
if len(file_paths) == 0:
test_idx = pd.read_csv(f"{link}/{data_type}/{dataset_name}/exp_data/test.csv")
train_idx = pd.read_csv(f"{link}/{data_type}/{dataset_name}/exp_data/train.csv")
Expand Down
65 changes: 35 additions & 30 deletions skrub/datasets/_fetching.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,15 +152,16 @@ def _fetch_openml_dataset(
dataset_id: int,
data_directory: Path | None = None,
) -> dict[str, Any]:
"""Gets a dataset from OpenML (https://www.openml.org).
"""
Gets a dataset from OpenML (https://www.openml.org).
Parameters
----------
dataset_id : int
The ID of the dataset to fetch.
data_directory : Path, optional
A directory to save the data to.
By default, the skrub data directory.
data_directory : pathlib.Path, optional
The directory where the dataset is stored.
By default, a subdirectory "openml" in the skrub data directory.
Returns
-------
Expand All @@ -176,7 +177,7 @@ def _fetch_openml_dataset(
saved as a CSV file.
"""
if data_directory is None:
data_directory = get_data_dir()
data_directory = get_data_dir(name="openml")

# Make path absolute
data_directory = data_directory.resolve()
Expand Down Expand Up @@ -242,9 +243,9 @@ def _fetch_world_bank_data(
----------
indicator_id : str
The ID of the indicator's dataset to fetch.
data_directory : Path, optional
A directory to save the data to.
By default, the skrub data directory.
data_directory : pathlib.Path, optional
The directory where the dataset is stored.
By default, a subdirectory "world_bank" in the skrub data directory.
Returns
-------
Expand All @@ -260,7 +261,7 @@ def _fetch_world_bank_data(
saved as a CSV file.
"""
if data_directory is None:
data_directory = get_data_dir()
data_directory = get_data_dir(name="world_bank")

csv_path = (data_directory / f"{indicator_id}.csv").resolve()
data_directory.mkdir(parents=True, exist_ok=True)
Expand Down Expand Up @@ -326,8 +327,8 @@ def _fetch_figshare(
figshare_id : str
The ID of the dataset to fetch.
data_directory : pathlib.Path, optional
A directory to save the data to.
By default, the skrub data directory.
The directory where the dataset is stored.
By default, a subdirectory "figshare" in the skrub data directory.
Returns
-------
Expand All @@ -347,7 +348,8 @@ def _fetch_figshare(
pyarrow installed to run correctly.
"""
if data_directory is None:
data_directory = get_data_dir()
data_directory = get_data_dir(name="figshare")

parquet_path = (data_directory / f"figshare_{figshare_id}.parquet").resolve()
data_directory.mkdir(parents=True, exist_ok=True)
url = f"https://ndownloader.figshare.com/files/{figshare_id}"
Expand Down Expand Up @@ -666,7 +668,7 @@ def fetch_employee_salaries(
load_dataframe: bool = True,
drop_linked: bool = True,
drop_irrelevant: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the employee salaries dataset (regression), available at https://openml.org/d/42125
Expand All @@ -685,6 +687,9 @@ def fetch_employee_salaries(
Drops column "full_name", which is usually irrelevant to the
statistical analysis.
data_directory: pathlib.Path or str, optional
The directory where the dataset is stored.
Returns
-------
DatasetAll
Expand All @@ -704,7 +709,7 @@ def fetch_employee_salaries(
"na_values": ["?"],
},
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)
if load_dataframe:
if drop_linked:
Expand All @@ -720,7 +725,7 @@ def fetch_employee_salaries(
def fetch_road_safety(
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the road safety dataset (classification), available at https://openml.org/d/42803
Expand All @@ -747,14 +752,14 @@ def fetch_road_safety(
"na_values": ["?"],
},
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)


def fetch_medical_charge(
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the medical charge dataset (regression), available at https://openml.org/d/42720
Expand Down Expand Up @@ -786,14 +791,14 @@ def fetch_medical_charge(
"escapechar": "\\",
},
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)


def fetch_midwest_survey(
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the midwest survey dataset (classification), available at https://openml.org/d/42805
Expand All @@ -818,14 +823,14 @@ def fetch_midwest_survey(
"escapechar": "\\",
},
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)


def fetch_open_payments(
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the open payments dataset (classification), available at https://openml.org/d/42738
Expand All @@ -852,14 +857,14 @@ def fetch_open_payments(
"na_values": ["?"],
},
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)


def fetch_traffic_violations(
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the traffic violations dataset (classification), available at https://openml.org/d/42132
Expand Down Expand Up @@ -888,14 +893,14 @@ def fetch_traffic_violations(
"na_values": ["?"],
},
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)


def fetch_drug_directory(
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the drug directory dataset (classification), available at https://openml.org/d/43044
Expand All @@ -921,15 +926,15 @@ def fetch_drug_directory(
"escapechar": "\\",
},
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)


def fetch_world_bank_indicator(
indicator_id: str,
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches a dataset of an indicator from the World Bank open data platform.
Expand All @@ -952,15 +957,15 @@ def fetch_world_bank_indicator(
dataset_id=indicator_id,
target=None,
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)


def fetch_figshare(
figshare_id: str,
*,
load_dataframe: bool = True,
directory: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches a table from figshare.
Expand All @@ -978,5 +983,5 @@ def fetch_figshare(
dataset_id=figshare_id,
target=None,
load_dataframe=load_dataframe,
data_directory=directory,
data_directory=data_directory,
)
Loading

0 comments on commit dd35027

Please sign in to comment.