merantix-momentum · AlpAribal · Sep 6, 2022 · Aug 18, 2022 · Aug 18, 2022 · Aug 18, 2022
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+include src/squirrel_datasets_core/py.typed
diff --git a/src/squirrel_datasets_core/__init__.py b/src/squirrel_datasets_core/__init__.py
@@ -1 +1 @@
-__version__ = "0.1.10"
+__version__ = "0.1.11"
diff --git a/src/squirrel_datasets_core/datasets/conceptual_captions/driver.py b/src/squirrel_datasets_core/datasets/conceptual_captions/driver.py
@@ -9,8 +9,7 @@
 
 import numpy as np
 import requests
-from PIL import Image
-
+from PIL import Image, UnidentifiedImageError
 from squirrel.driver import IterDriver
 from squirrel.iterstream.source import IterableSource
 
@@ -79,6 +78,9 @@ def _map_fn(record: t.Dict[str, str]) -> t.Dict[str, t.Union[str, np.ndarray]]:
         except (urllib.error.HTTPError, urllib.error.URLError) as e:
             logger.info(f"Cannot access url {url} due to Error {e}")
             record["error"] = True
+        except UnidentifiedImageError as e:
+            logger.info(f"Cannot open image at {url} due to Error {e}")
+            record["error"] = True
 
         return record
 

diff --git a/src/squirrel_datasets_core/py.typed b/src/squirrel_datasets_core/py.typed
@@ -0,0 +1,3 @@
+This marker file declares that the package supports type checking. For details, you can refer to:
+- PEP561: https://www.python.org/dev/peps/pep-0561/
+- mypy docs: https://mypy.readthedocs.io/en/stable/installed_packages.html
diff --git a/test/conftest.py b/test/conftest.py
@@ -9,9 +9,16 @@
 """
 
 import pytest
+from squirrel.catalog import Catalog
 
 
 @pytest.fixture()
 def unit_test_fixture() -> str:
     """Fixture that is only used in the unit test scope."""
     return "unit test string"
+
+
+@pytest.fixture()
+def plugin_catalog() -> Catalog:
+    """Create catalog from plugins."""
+    return Catalog.from_plugins()
diff --git a/test/test_datasets/test_allenai.py b/test/test_datasets/test_allenai.py
@@ -4,9 +4,11 @@
 from typing import Iterator, Tuple
 
 import pytest
-from squirrel_datasets_core.datasets.allenai_c4 import C4DatasetDriver
+from squirrel.catalog import Catalog
 
 from mock_utils import create_random_dict, save_gzip
+from squirrel_datasets_core.datasets.allenai_c4 import C4DatasetDriver
+from squirrel_datasets_core.datasets.allenai_c4.constants import C4_MULTILINGUAL_CONFIG
 
 
 def mock_allenai_data(tmp_path: Path) -> Iterator[Tuple[str, str, int, Path]]:
@@ -52,3 +54,15 @@ def test_allenai(tmp_path: Path) -> None:
     )
     with pytest.raises(ValueError):
         driver.select("en").get_iter().collect()
+
+
+@pytest.mark.skip(reason="Dataset is on public storage.")
+def test_allenai_public_data(plugin_catalog: Catalog) -> None:
+    """Test loading a single language from the C4 corpus."""
+    driver: C4DatasetDriver = plugin_catalog["c4"].get_driver()
+    assert sorted(driver.available_languages) == sorted(C4_MULTILINGUAL_CONFIG.keys())
+
+    TAKE = 10
+    it = driver.select("af", "valid").get_iter(shuffle_key_buffer=1, shuffle_item_buffer=1, prefetch_buffer=1)
+    data = it.take(TAKE).tqdm().collect()
+    assert len(data) == TAKE
diff --git a/test/test_datasets/test_cc100.py b/test/test_datasets/test_cc100.py
@@ -2,9 +2,11 @@
 
 import numpy as np
 import pytest
-from squirrel_datasets_core.datasets.cc100 import CC100Driver
+from squirrel.catalog import Catalog
 
 from mock_utils import create_random_str, save_xz
+from squirrel_datasets_core.datasets.cc100 import CC100Driver
+from squirrel_datasets_core.datasets.cc100.constants import CC_100_CONFIG
 
 
 def mock_cc100_data(samples: int, tmp_path: Path) -> Path:
@@ -47,3 +49,15 @@ def test_cc100(tmp_path: Path) -> None:
 
     with pytest.raises(KeyError):
         driver.select("en").get_iter().collect()
+
+
+@pytest.mark.skip(reason="Dataset is on public storage.")
+def test_cc100_public_data(plugin_catalog: Catalog) -> None:
+    """Test loading a single language from the CC100 corpus."""
+    driver: CC100Driver = plugin_catalog["cc100"].get_driver()
+    assert sorted(driver.available_languages) == sorted(CC_100_CONFIG.keys())
+
+    TAKE = 10
+    it = driver.select("as").get_iter(shuffle_key_buffer=1, shuffle_item_buffer=1, prefetch_buffer=1)
+    data = it.take(TAKE).tqdm().collect()
+    assert len(data) == TAKE
diff --git a/test/test_datasets/test_conceptual_captions.py b/test/test_datasets/test_conceptual_captions.py
@@ -4,11 +4,13 @@
 from unittest.mock import patch
 
 import numpy as np
+import pytest
 import requests
 from PIL import Image
-from squirrel_datasets_core.datasets.conceptual_captions.driver import CC12MDriver
+from squirrel.catalog import Catalog
 
 from mock_utils import create_random_str
+from squirrel_datasets_core.datasets.conceptual_captions.driver import CC12MDriver
 
 SHAPE = (10, 10, 3)
 
@@ -87,3 +89,14 @@ def test_conceptual_captions_driver() -> None:
         assert sample["url"] is not None
         assert not sample["error"]
         assert sample["image"].shape == SHAPE
+
+
+@pytest.mark.skip(reason="Dataset is on public storage.")
+def test_conceptual_captions_public_data(plugin_catalog: Catalog) -> None:
+    """Test the conceptual captions loader"""
+    driver: CC12MDriver = plugin_catalog["conceptual-captions-12m"].get_driver()
+    TAKE = 10
+    it = driver.get_iter(shuffle_key_buffer=1, shuffle_item_buffer=1, prefetch_buffer=1).take(TAKE)
+    data = it.collect()
+    assert sorted(data[0].keys()) == sorted(["url", "error", "image", "caption"])
+    assert len(data) == TAKE
diff --git a/test/test_datasets/test_huggingface.py b/test/test_datasets/test_huggingface.py
@@ -0,0 +1,39 @@
+from typing import Tuple
+
+import pytest
+from squirrel.catalog import Catalog
+
+from squirrel_datasets_core.driver.huggingface import HuggingfaceDriver
+
+TAKE = 10
+
+
+@pytest.mark.skip(reason="Dataset is on public storage.")
+def test_mnist_public_data(plugin_catalog: Catalog) -> None:
+    """Test loading MNIST hosted by Hugging Face"""
+    driver: HuggingfaceDriver = plugin_catalog["mnist"].get_driver()
+    data = driver.get_iter(split="train").take(TAKE).collect()
+    assert len(data) == TAKE
+
+
+@pytest.mark.parametrize("cifar_set", ["cifar10", "cifar100"])
+@pytest.mark.parametrize("split", ["train", "test"])
+@pytest.mark.skip(reason="Dataset is on public storage.")
+def test_cifar_public_data(plugin_catalog: Catalog, cifar_set: str, split: str) -> None:
+    """Test loading CIFAR10 hosted by Hugging Face."""
+    # version==1 is huggingface
+    driver: HuggingfaceDriver = plugin_catalog[(cifar_set, 1)].get_driver()
+    data = driver.get_iter(split=split).take(TAKE).collect()
+    assert len(data) == TAKE
+
+
+@pytest.mark.parametrize(
+    "catalog_key", [("wikitext-103-raw", 1), ("wikitext-103", 1), ("wikitext-2-raw", 1), ("wikitext-2", 1)]
+)
+@pytest.mark.parametrize("split", ["train", "test"])
+@pytest.mark.skip(reason="Dataset is on public storage.")
+def test_wikitext_public_data(plugin_catalog: Catalog, catalog_key: Tuple[str, int], split: str) -> None:
+    """Test loading wikitext datasets hosted by Hugging Face."""
+    driver: HuggingfaceDriver = plugin_catalog[catalog_key].get_driver()
+    data = driver.get_iter(split=split).take(TAKE).collect()
+    assert len(data) == TAKE
diff --git a/test/test_datasets/test_torchvision.py b/test/test_datasets/test_torchvision.py
@@ -0,0 +1,34 @@
+import pytest
+from squirrel.catalog import Catalog
+
+TAKE = 1
+
+
+@pytest.mark.skip(reason="Dataset is on public storage.")
+def test_caltech_public_data(plugin_catalog: Catalog) -> None:
+    """Test loading Caltech101 via torchvision."""
+    try:
+        plugin_catalog["caltech"].get_driver().get_iter().take(TAKE).join()
+    except RuntimeError as e:
+        if str(e).startswith("The daily quota of the file 101_ObjectCategories.tar.gz is exceeded"):
+            pytest.skip(msg=f"Failed to download data due to quota limit. original error: {str(e)}")
+        else:
+            raise e
+
+
+@pytest.mark.parametrize("cifar_set", ["cifar10", "cifar100"])
+@pytest.mark.parametrize("split", ["train", "test"])
+@pytest.mark.skip(reason="Dataset is on public storage.")
+def test_cifar_public_data(plugin_catalog: Catalog, cifar_set: str, split: str) -> None:
+    """Test loading CIFAR-10 via torchvision."""
+    # version==2 is torchvision
+    version = 2
+    # torchvision.dataset.CIFAR10 and CIFAR100 use `train: bool` instead of `split: str`
+    plugin_catalog[cifar_set][version].get_driver().get_iter(train=(split == "train")).take(TAKE).join()
+
+
+@pytest.mark.parametrize("split", ["byclass", "bymerge", "balanced", "letters", "digits", "mnist"])
+@pytest.mark.skip(reason="Dataset is on public storage.")
+def test_emnist_public_data(plugin_catalog: Catalog, split: str) -> None:
+    """Test loading EMNIST via torchvision."""
+    plugin_catalog["emnist"].get_driver().get_iter(split=split).take(1).join()