merantix-momentum · AlirezaSohofi · Jun 10, 2022 · Jun 9, 2022 · Jun 9, 2022
diff --git a/examples/07.SquirrelStore_with_Spark.ipynb b/examples/07.SquirrelStore_with_Spark.ipynb
@@ -192,6 +192,57 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "9b150640",
+   "metadata": {},
+   "source": [
+    "# Reading big SquirrelStores into Spark\n",
+    "\n",
+    "When loading big datasets, the call `spark.sparkContext.parallelize(driver.get_iter())` no longer works, because the Spark driver will try to distribute the data itself. Instead, we should distribute only the keys and load the data on the executors. Squirrel makes this easy thanks to the design of `SquirrelStore`, which exposes `keys()` and `get()` methods."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "304e9d9e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_image_sample(x):\n",
+    "    return {\"img\": np.random.random((30, 30, 3)), \"label\": x}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9f2b017c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from squirrel.iterstream import IterableSource\n",
+    "\n",
+    "tmpdir3 = tempfile.TemporaryDirectory()\n",
+    "N_SHARDS = 10\n",
+    "\n",
+    "store = SquirrelStore(tmpdir3.name, serializer=MessagepackSerializer())\n",
+    "IterableSource(range(100)).map(create_image_sample).batched(10).map(store.set).join()  # save some data in the store"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "179fbbf0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rdd = (\n",
+    "    spark.sparkContext.parallelize(store.keys())  # distribute keys between spark executors\n",
+    "    .map(lambda key: list(store.get(key)))  # load the shard on each executor\n",
+    "    .flatMap(lambda x: x)  # flattens each shard to get individual items\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "150adb08",
@@ -208,8 +259,17 @@
    "outputs": [],
    "source": [
     "tmpdir.cleanup()\n",
-    "tmpdir2.cleanup()"
+    "tmpdir2.cleanup()\n",
+    "tmpdir3.cleanup()"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d7feb9e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -228,7 +288,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.12"
+   "version": "3.8.10"
   }
  },
  "nbformat": 4,