Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move default location of data to the user home folder #652

Merged
merged 40 commits into from
Jul 21, 2023
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
3e00a8c
Change default location of data_home
flefebv Jul 18, 2023
b786dc9
Removed the `clear_data_home` function
flefebv Jul 18, 2023
9feaef7
Add changes to CHANGES.rst
flefebv Jul 18, 2023
4ea3570
Apply suggestions from code review
flefebv Jul 18, 2023
5b22b55
update _utils.py and test_utils.py to use pathlib instead os.path
flefebv Jul 19, 2023
3f6b8b1
Propagate changes
flefebv Jul 19, 2023
d3254fd
Update skrub/datasets/tests/test_utils.py
flefebv Jul 19, 2023
2e63488
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
16189e5
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
78ba357
Update skrub/datasets/tests/test_utils.py
flefebv Jul 19, 2023
c686850
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
01e8227
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
eeeb2ca
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
c0251bc
Update skrub/datasets/_utils.py
flefebv Jul 19, 2023
ccb9f56
Update skrub/datasets/tests/test_utils.py
flefebv Jul 19, 2023
79bac36
Update skrub/datasets/tests/test_utils.py
flefebv Jul 19, 2023
56bbb15
allow data_home to be a string
flefebv Jul 19, 2023
c538354
Improved test for `get_data_home`
flefebv Jul 19, 2023
45afc07
Apply suggestions from code review
LilianBoulard Jul 19, 2023
7fd24e8
Simplified test for `get_data_home`
flefebv Jul 19, 2023
435b2e7
change `data_home` for `data_directory` when needed
flefebv Jul 19, 2023
9a0722d
Merge branch 'main' of github.com:flefebv/skrub into data_home
flefebv Jul 19, 2023
0b031b2
Fixes bug with user directory
flefebv Jul 19, 2023
64d46c1
Correct type hint
flefebv Jul 19, 2023
19c661f
Update skrub/datasets/_utils.py
GaelVaroquaux Jul 20, 2023
7886313
Apply suggestions from code review
LilianBoulard Jul 20, 2023
3fafe8d
Merge branch 'main' into pr_652
GaelVaroquaux Jul 21, 2023
5764738
make black happy
GaelVaroquaux Jul 21, 2023
eaa8c7d
Make test work on my machine
GaelVaroquaux Jul 21, 2023
dee3b98
Refactor tests
GaelVaroquaux Jul 21, 2023
34e29ae
make black happy
GaelVaroquaux Jul 21, 2023
e987fc0
make black happy
GaelVaroquaux Jul 21, 2023
f81593b
Tests working on all platforms
GaelVaroquaux Jul 21, 2023
5ce6e68
Tests more robust
GaelVaroquaux Jul 21, 2023
9f640dc
Last commit was an error
GaelVaroquaux Jul 21, 2023
5d67623
Try to make tests more robust
GaelVaroquaux Jul 21, 2023
feab5f9
tired
GaelVaroquaux Jul 21, 2023
19471d8
wired
GaelVaroquaux Jul 21, 2023
8f9559f
Try something
GaelVaroquaux Jul 21, 2023
c7dbeab
Cleaner code
GaelVaroquaux Jul 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,13 @@ Minor changes
* Moved the default storage location of data to the user's home folder.
:pr:`652` by :user:`Felix Lefebvre <flefebv>`

* Fixed bug when using :class:`TableVectorizer`'s `transform` method on
categorical columns with missing values.
:pr:`644` by :user:`Leo Grinsztajn <LeoGrin>`

* :class:`TableVectorizer` never outputs a sparse matrix by default. This can be changed by
increasing the `sparse_threshold` parameter. :pr:`646` by :user:`Leo Grinsztajn <LeoGrin>`

Before skrub: dirty_cat
========================

Expand Down
4 changes: 2 additions & 2 deletions benchmarks/bench_fuzzy_join_sparse_vs_dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,8 +527,8 @@ def benchmark(
dataset_name: str,
analyzer: Literal["char_wb", "char", "word"],
ngram_range: tuple,
data_home: Path | str = None,
data_directory: str = "benchmarks_data",
data_home: Path | str | None = None,
data_directory: str | None = "benchmarks_data",
):
left_table, right_table, gt = fetch_big_data(
dataset_name=dataset_name, data_home=data_home, data_directory=data_directory
Expand Down
14 changes: 7 additions & 7 deletions benchmarks/utils/join.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


def get_local_data(
dataset_name: str, data_home: Path | str = None, data_directory: str = None
dataset_name: str, data_home: Path | str | None = None, data_directory: str | None = None
):
"""Get the path to the local datasets."""
data_directory = get_data_dir(data_directory, data_home)
Expand All @@ -24,8 +24,8 @@ def get_local_data(
def fetch_data(
dataset_name: str,
save: bool = True,
data_home: Path | str = None,
data_directory: str = None,
data_home: Path | str | None = None,
data_directory: str | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Fetch datasets from https://github.com/Yeye-He/Auto-Join/tree/master/autojoin-Benchmark

Expand All @@ -41,7 +41,7 @@ def fetch_data(
The path to the root data directory.
By default, will point to the skrub data directory.

data_directory: str, default=None
data_directory: str, optional
The name of the subdirectory in which data is stored.

Returns
Expand Down Expand Up @@ -83,8 +83,8 @@ def fetch_big_data(
dataset_name: str,
data_type: str = "Dirty",
save: bool = True,
data_home: Path | str = None,
data_directory: str = None,
data_home: Path | str | None = None,
data_directory: str | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
"""Fetch datasets from https://github.com/anhaidgroup/deepmatcher/blob/master/Datasets.md

Expand All @@ -104,7 +104,7 @@ def fetch_big_data(
The path to the root data directory.
By default, will point to the skrub data directory.

data_directory: str, default=None
data_directory: str, optional
The name of the subdirectory in which data is stored.

Returns
Expand Down
92 changes: 50 additions & 42 deletions skrub/datasets/_fetching.py
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -139,17 +139,18 @@ class DatasetInfoOnly:

def _fetch_openml_dataset(
dataset_id: int,
data_home: Path | str | None = None,
data_directory: Path | None = None,
) -> dict[str, Any]:
"""Gets a dataset from OpenML (https://www.openml.org).
"""
Gets a dataset from OpenML (https://www.openml.org).

Parameters
----------
dataset_id : int
The ID of the dataset to fetch.
data_home : Path or str, optional
The path to the root data directory.
By default, the skrub data directory.
data_directory : Path or str, optional
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
The directory where the dataset is stored.
By default, a subdirectory "openml" in the skrub data directory.

Returns
-------
Expand All @@ -164,7 +165,8 @@ def _fetch_openml_dataset(
The local path leading to the dataset,
saved as a CSV file.
"""
data_directory = get_data_dir("openml", data_home)
if data_directory is None:
data_directory = get_data_dir(name="openml")

# Make path absolute
data_directory = data_directory.resolve()
Expand Down Expand Up @@ -222,17 +224,17 @@ def _fetch_openml_dataset(

def _fetch_world_bank_data(
indicator_id: str,
data_home: Path | str | None = None,
data_directory: Path | None = None,
) -> dict[str, Any]:
"""Gets a dataset from World Bank open data platform (https://data.worldbank.org/).

Parameters
----------
indicator_id : str
The ID of the indicator's dataset to fetch.
data_home : Path or str, optional
The path to the root data directory.
By default, the skrub data directory.
data_directory : Path or str, optional
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
The directory where the dataset is stored.
By default, a subdirectory "world_bank" in the skrub data directory.

Returns
-------
Expand All @@ -247,7 +249,8 @@ def _fetch_world_bank_data(
The local path leading to the dataset,
saved as a CSV file.
"""
data_directory = get_data_dir("world_bank", data_home)
if data_directory is None:
data_directory = get_data_dir(name="world_bank")

csv_path = (data_directory / f"{indicator_id}.csv").resolve()
data_directory.mkdir(parents=True, exist_ok=True)
Expand Down Expand Up @@ -304,17 +307,17 @@ def _fetch_world_bank_data(

def _fetch_figshare(
figshare_id: str,
data_home: Path | str | None = None,
data_directory: Path | None = None,
) -> dict[str, Any]:
"""Fetch a dataset from figshare using the download ID number.

Parameters
----------
figshare_id : str
The ID of the dataset to fetch.
data_home : Path or str, optional
The path to the root data directory.
By default, the skrub data directory.
data_directory : Path or str, optional
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
The directory where the dataset is stored.
By default, a subdirectory "figshare" in the skrub data directory.

Returns
-------
Expand All @@ -333,7 +336,9 @@ def _fetch_figshare(
The files are read and returned in parquet format, this function needs
pyarrow installed to run correctly.
"""
data_directory = get_data_dir("figshare", data_home)
if data_directory is None:
data_directory = get_data_dir(name="figshare")

parquet_path = (data_directory / f"figshare_{figshare_id}.parquet").resolve()
data_directory.mkdir(parents=True, exist_ok=True)
url = f"https://ndownloader.figshare.com/files/{figshare_id}"
Expand Down Expand Up @@ -371,6 +376,7 @@ def _fetch_figshare(
"pyarrow", extra="pyarrow is required for parquet support."
)
from pyarrow.parquet import ParquetFile

try:
filehandle, _ = urllib.request.urlretrieve(url)
df = ParquetFile(filehandle)
Expand Down Expand Up @@ -555,7 +561,7 @@ def _fetch_dataset_as_dataclass(
dataset_id: int | str,
target: str | None,
load_dataframe: bool,
data_home: Path | str | None = None,
data_directory: Path | str | None = None,
read_csv_kwargs: dict | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches a dataset from a source, and returns it as a dataclass.
Expand All @@ -567,7 +573,7 @@ def _fetch_dataset_as_dataclass(
pass `load_dataframe=False`.

To save/load the dataset to/from a specific directory,
pass `data_home`. If `None`, uses the default skrub
pass `data_directory`. If `None`, uses the default skrub
data directory.

If the dataset doesn't have a target (unsupervised learning or inapplicable),
Expand All @@ -581,12 +587,15 @@ def _fetch_dataset_as_dataclass(
:obj:`DatasetInfoOnly`
If `load_dataframe=False`
"""
if isinstance(data_directory, str):
data_directory = Path(data_directory)

if source == "openml":
info = _fetch_openml_dataset(dataset_id, data_home)
info = _fetch_openml_dataset(dataset_id, data_directory)
elif source == "world_bank":
info = _fetch_world_bank_data(dataset_id, data_home)
info = _fetch_world_bank_data(dataset_id, data_directory)
elif source == "figshare":
info = _fetch_figshare(dataset_id, data_home)
info = _fetch_figshare(dataset_id, data_directory)
else:
raise ValueError(f"Unknown source {source!r}")

Expand Down Expand Up @@ -635,7 +644,7 @@ def fetch_employee_salaries(
load_dataframe: bool = True,
drop_linked: bool = True,
drop_irrelevant: bool = True,
data_home: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the employee salaries dataset (regression), available at https://openml.org/d/42125

Expand All @@ -654,9 +663,8 @@ def fetch_employee_salaries(
Drops column "full_name", which is usually irrelevant to the
statistical analysis.

data_home: Path or str, default=None
The path to the root data directory.
By default, will point to the skrub data directory.
data_directory: Path or str, optional
LilianBoulard marked this conversation as resolved.
Show resolved Hide resolved
The directory where the dataset is stored.

Returns
-------
Expand All @@ -677,7 +685,7 @@ def fetch_employee_salaries(
"na_values": ["?"],
},
load_dataframe=load_dataframe,
data_home=data_home,
data_directory=data_directory,
)
if load_dataframe:
if drop_linked:
Expand All @@ -693,7 +701,7 @@ def fetch_employee_salaries(
def fetch_road_safety(
*,
load_dataframe: bool = True,
data_home: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the road safety dataset (classification), available at https://openml.org/d/42803

Expand All @@ -720,14 +728,14 @@ def fetch_road_safety(
"na_values": ["?"],
},
load_dataframe=load_dataframe,
data_home=data_home,
data_directory=data_directory,
)


def fetch_medical_charge(
*,
load_dataframe: bool = True,
data_home: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the medical charge dataset (regression), available at https://openml.org/d/42720

Expand Down Expand Up @@ -759,14 +767,14 @@ def fetch_medical_charge(
"escapechar": "\\",
},
load_dataframe=load_dataframe,
data_home=data_home,
data_directory=data_directory,
)


def fetch_midwest_survey(
*,
load_dataframe: bool = True,
data_home: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the midwest survey dataset (classification), available at https://openml.org/d/42805

Expand All @@ -791,14 +799,14 @@ def fetch_midwest_survey(
"escapechar": "\\",
},
load_dataframe=load_dataframe,
data_home=data_home,
data_directory=data_directory,
)


def fetch_open_payments(
*,
load_dataframe: bool = True,
data_home: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the open payments dataset (classification), available at https://openml.org/d/42738

Expand All @@ -825,14 +833,14 @@ def fetch_open_payments(
"na_values": ["?"],
},
load_dataframe=load_dataframe,
data_home=data_home,
data_directory=data_directory,
)


def fetch_traffic_violations(
*,
load_dataframe: bool = True,
data_home: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the traffic violations dataset (classification), available at https://openml.org/d/42132

Expand Down Expand Up @@ -861,14 +869,14 @@ def fetch_traffic_violations(
"na_values": ["?"],
},
load_dataframe=load_dataframe,
data_home=data_home,
data_directory=data_directory,
)


def fetch_drug_directory(
*,
load_dataframe: bool = True,
data_home: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches the drug directory dataset (classification), available at https://openml.org/d/43044

Expand All @@ -894,15 +902,15 @@ def fetch_drug_directory(
"escapechar": "\\",
},
load_dataframe=load_dataframe,
data_home=data_home,
data_directory=data_directory,
)


def fetch_world_bank_indicator(
indicator_id: str,
*,
load_dataframe: bool = True,
data_home: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches a dataset of an indicator from the World Bank open data platform.

Expand All @@ -925,15 +933,15 @@ def fetch_world_bank_indicator(
dataset_id=indicator_id,
target=None,
load_dataframe=load_dataframe,
data_home=data_home,
data_directory=data_directory,
)


def fetch_figshare(
figshare_id: str,
*,
load_dataframe: bool = True,
data_home: Path | str | None = None,
data_directory: Path | str | None = None,
) -> DatasetAll | DatasetInfoOnly:
"""Fetches a table from figshare.

Expand All @@ -951,5 +959,5 @@ def fetch_figshare(
dataset_id=figshare_id,
target=None,
load_dataframe=load_dataframe,
data_home=data_home,
data_directory=data_directory,
)
Loading