Add Spark Example Notebook + install with [all] #15

Merged · 8 commits · Mar 14, 2022
4 changes: 2 additions & 2 deletions README.md
@@ -39,11 +39,11 @@ git clone https://github.com/merantix-momentum/squirrel-datasets-core.git
```
Then you can install both packages by
```shell
-pip install -e squirrel-core
+pip install -e "squirrel-core[all]"
```
and
```shell
-pip install -e squirrel-core-datasets
+pip install -e "squirrel-datasets-core[all]"
```

In the documentation, you may also see some requirements to install the two packages first; please follow the
5 changes: 3 additions & 2 deletions docs/source/add_dataset.rst
@@ -10,7 +10,8 @@ Preprocessing
-------------
For the first task, i.e. preprocessing, we recommend using `Apache Spark`_. The scenario is that quite often you would
like to work with data stored in Google Cloud Storage and finish your batch processing job on a Kubernetes cluster. We use
-`PySpark`_ for defining the preprocessing logic in Python.
+`PySpark`_ for defining the preprocessing logic in Python. You can find a tutorial on how to use Spark for preprocessing under
+:code:`examples/08.Spark_Preprocessing.ipynb`.

Data Loading
------------
@@ -26,7 +27,7 @@ register the dataset into a catalog plugin.
#. Define your preprocessing logic.

- Create a new directory under :code:`squirrel_datasets_core/datasets` named after your dataset, e.g. "example_dataset".
-  Write your preprocessing scripts under a new ``preprocessing.py`` file in it.
+  Write your preprocessing scripts under a new ``preprocessing.py`` file in it if needed.

#. Define your loading logic.

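The tutorial referenced above is added later in this PR. For orientation, the core of the flow it teaches looks roughly like the following. This is a condensed sketch, assuming the `squirrel.store` and `squirrel.serialization` import paths, and with the partition-to-shard mapping simplified; the real helper, `save_composable_to_shards`, controls the shard count explicitly.

```python
# A minimal sketch of the Spark sharding pattern described above, pieced
# together from the save_shards.py fragment later in this diff. The shard
# assignment via foreachPartition and the import paths are assumptions,
# not the helper's literal implementation.
from pyspark.sql import SparkSession
from squirrel.serialization import MessagepackSerializer
from squirrel.store import SquirrelStore


def shard_with_spark(src_it, out_url, hooks, session: SparkSession) -> None:
    """Distribute samples over Spark, apply hooks, and write messagepack shards."""
    store = SquirrelStore(out_url, serializer=MessagepackSerializer())
    pipe = session.sparkContext.parallelize(list(src_it))
    for hook in hooks or []:
        pipe = pipe.map(hook)  # apply each transformation to every sample
    # write each Spark partition as one shard of the store (simplified)
    pipe.foreachPartition(lambda samples: store.set(list(samples)))
```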
9 changes: 1 addition & 8 deletions examples/01.Getting_Started.ipynb
@@ -1,12 +1,5 @@
{
"cells": [
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/merantix-momentum/squirrel-datasets-core/blob/main/examples/Squirrel_Tutorial_Getting_Started.ipynb)\n"
-]
-},
{
"cell_type": "code",
"execution_count": null,
@@ -139,4 +132,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
\ No newline at end of file
+}
10 changes: 1 addition & 9 deletions examples/02.Catalog.ipynb
@@ -1,13 +1,5 @@
{
"cells": [
-{
-"cell_type": "markdown",
-"id": "69f9aa77",
-"metadata": {},
-"source": [
-"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/merantix-momentum/squirrel-datasets-core/blob/main/examples/02.Catalog.ipynb)"
-]
-},
{
"cell_type": "markdown",
"id": "a669ba1a",
@@ -252,4 +244,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
\ No newline at end of file
+}
8 changes: 0 additions & 8 deletions examples/03.Pytorch_Model_Training.ipynb
@@ -1,13 +1,5 @@
{
"cells": [
-{
-"cell_type": "markdown",
-"id": "6a943e60",
-"metadata": {},
-"source": [
-"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/merantix-momentum/squirrel-datasets-core/blob/main/examples/Squirrel_Tutorial_Pytorch_Model_Training.ipynb)"
-]
-},
{
"cell_type": "markdown",
"id": "74e59921",
7 changes: 0 additions & 7 deletions examples/04.PytorchLightning_Model_Training.ipynb
@@ -1,12 +1,5 @@
{
"cells": [
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/merantix-momentum/squirrel-datasets-core/blob/main/examples/Squirrel_Tutorial_PytorchLightning_Model_Training.ipynb)\n"
-]
-},
{
"cell_type": "markdown",
"metadata": {
8 changes: 0 additions & 8 deletions examples/05.Plugins.ipynb
@@ -1,13 +1,5 @@
{
"cells": [
-{
-"cell_type": "markdown",
-"id": "69f9aa77",
-"metadata": {},
-"source": [
-"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/merantix-momentum/squirrel-datasets-core/blob/main/examples/Squirrel_Tutorial_Plugins.ipynb)"
-]
-},
{
"cell_type": "code",
"execution_count": null,
10 changes: 1 addition & 9 deletions examples/06.SquirrelStore_with_Spark.ipynb
@@ -1,13 +1,5 @@
{
"cells": [
-{
-"cell_type": "markdown",
-"id": "0f2d7553",
-"metadata": {},
-"source": [
-"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/merantix-momentum/squirrel-datasets-core/blob/main/examples/06.SquirrelStore_with_Spark.ipynb)"
-]
-},
{
"cell_type": "code",
"execution_count": null,
@@ -241,4 +233,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
\ No newline at end of file
+}
10 changes: 1 addition & 9 deletions examples/07.Performance_Guideline.ipynb
@@ -1,13 +1,5 @@
{
"cells": [
-{
-"cell_type": "markdown",
-"id": "495e66a3",
-"metadata": {},
-"source": [
-"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/merantix-momentum/squirrel-datasets-core/blob/main/examples/07.Performance_Guideline.ipynb)"
-]
-},
{
"cell_type": "code",
"execution_count": null,
@@ -342,7 +334,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.8.12"
}
},
"nbformat": 4,
157 changes: 157 additions & 0 deletions examples/08.Spark_Preprocessing.ipynb
@@ -0,0 +1,157 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This tutorial is about demonstrating the preprocessing capabilities with spark.\n",
"To run this tutorial, please make sure that [Apache Spark](https://spark.apache.org/) along with [pyspark](https://spark.apache.org/docs/latest/api/python/getting_started/install.html) is installed.\n",
"Installation instructions for spark can be found for example [here for Ubuntu](https://phoenixnap.com/kb/install-spark-on-ubuntu) or [here for Mac](https://medium.com/beeranddiapers/installing-apache-spark-on-mac-os-ce416007d79f)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" import squirrel\n",
" import squirrel_datasets_core\n",
" import numpy as np\n",
" import matplotlib.pyplot as plt\n",
"except:\n",
" !pip install -q --ignore-requires-python --upgrade squirrel-datasets-core numpy matplotlib # noqa\n",
" import squirrel\n",
" import squirrel_datasets_core\n",
" import matplotlib.pyplot as plt\n",
"\n",
"print(squirrel.__version__)\n",
"print(squirrel_datasets_core.__version__)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Any squirrel `Composable` can be used as input to a preprocessing pipeline and processed. \n",
"For this example we will use the `TorchvisionDriver` and the `CIFAR10` dataset. \n",
"The `get_spark` method can be used to easily get access to a spark session."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from squirrel.catalog import Catalog\n",
"from squirrel_datasets_core.spark import get_spark\n",
"\n",
"it = Catalog.from_plugins()[\"cifar10\"].get_driver().get_iter()\n",
"spark_session = get_spark(\"preprocess-cifar\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The method `save_composable_to_shards` is used for processing the data and saving it to the fast messagepack format. \n",
"With the `hooks` parameter a list of functions can be specified to transform the data. Here we simply convert the PIL Image from the `TorchvisionDriver` to a numpy array as an example. \n",
"The output can be saved to the local disk as in this case and also to a Google Cloud bucket directly. \n",
"The number of shards for the messagepack format should be specified as well. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from squirrel_datasets_core.preprocessing.save_shards import save_composable_to_shards\n",
"\n",
"local_store = \"cifar_local\"\n",
"num_shards = 10\n",
"\n",
"def map_image_to_np(sample):\n",
" return np.array(sample[0]), sample[1]\n",
"\n",
"save_composable_to_shards(it, spark_session, local_store, num_shards, hooks=[map_image_to_np])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As the data has been processed and saved locally, it can now be loaded using the squirrel `MessagepackDriver`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from squirrel.driver.msgpack import MessagepackDriver\n",
"\n",
"it_msgpack = MessagepackDriver(local_store).get_iter()\n",
"sample = it.take(1).collect()[0]\n",
"\n",
"plt.title(f'Class: {sample[1]}')\n",
"plt.imshow(sample[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the following, we can compare the loading speed for the full dataset using the `TorchvisionDriver` to the `MessagepackDriver`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# measure time to load full dataset (it/s) with torchvision driver (default in squirrel catalog)\n",
"it.tqdm().collect()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# measure time to load full dataset (it/s) with messagepack driver from local store\n",
"it_msgpack.tqdm().collect()"
]
}
],
"metadata": {
"interpreter": {
"hash": "338cfae0a1873dbc3e6a1266258c61ca663f4d905ce797b41b2835118552379a"
},
"kernelspec": {
"display_name": "Python 3.8.12 ('sq-env')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
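The notebook above writes its shards to local disk, but its prose notes that output can also go directly to a Google Cloud bucket. Below is a hedged sketch of that variant; the bucket URL is a hypothetical placeholder, and working GCS credentials plus an fsspec/gcsfs backend are assumed to be configured.

```python
# Hypothetical variant of the notebook's preprocessing call: write shards
# straight to a Google Cloud Storage bucket instead of local disk. The bucket
# URL is a placeholder, and GCS credentials (e.g. via gcsfs) are assumed.
import numpy as np

from squirrel.catalog import Catalog
from squirrel_datasets_core.preprocessing.save_shards import save_composable_to_shards
from squirrel_datasets_core.spark import get_spark


def map_image_to_np(sample):
    # same hook as in the notebook: PIL image -> numpy array
    return np.array(sample[0]), sample[1]


it = Catalog.from_plugins()["cifar10"].get_driver().get_iter()
spark_session = get_spark("preprocess-cifar-gcs")
gcs_store = "gs://my-bucket/cifar10-msgpack"  # hypothetical bucket path

save_composable_to_shards(it, spark_session, gcs_store, 10, hooks=[map_image_to_np])
```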
6 changes: 3 additions & 3 deletions src/squirrel_datasets_core/preprocessing/save_shards.py
@@ -57,6 +57,9 @@ def save_composable_to_shards(
"""
if num_samples is not None:
src_it = src_it.take(num_samples)

if hooks is None:
hooks = []

store = SquirrelStore(out_url, serializer=MessagepackSerializer())
pipe = session.sparkContext.parallelize(src_it)
@@ -90,9 +93,6 @@ def save_source_to_shards(
        iter_kwargs (Dict): Keyword arguments passed to :py:meth:`Driver.get_iter`.
        hooks (Optional[List[Callable[[Dict], Dict]]], optional): Methods to map to the samples before creating shards.
    """
-    if hooks is None:
-        hooks = []
-
    # get raw data
    cat = Catalog.from_plugins()
    d = cat[cfg.identifier][cfg.version]
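One practical consequence of moving the `hooks` default into `save_composable_to_shards`, as this diff does: the function can now be called directly without passing any hooks, instead of relying on `save_source_to_shards` to substitute the empty list. A minimal sketch, with an illustrative store name and shard count:

```python
# With the default moved as shown above, save_composable_to_shards can be
# called without hooks; it substitutes an empty hook list itself. The store
# name and shard count below are illustrative.
from squirrel.catalog import Catalog
from squirrel_datasets_core.preprocessing.save_shards import save_composable_to_shards
from squirrel_datasets_core.spark import get_spark

it = Catalog.from_plugins()["cifar10"].get_driver().get_iter()
session = get_spark("shards-without-hooks")

save_composable_to_shards(it, session, "cifar_raw_shards", 10)  # no hooks needed
```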