From 9610d293c310d2d90503d8c97502718db70de4c7 Mon Sep 17 00:00:00 2001 From: Andrew Tolopko Date: Fri, 16 Jun 2023 12:56:30 -0400 Subject: [PATCH 1/5] add pytorch notebook; minor pytorch api docstring updates --- .../experimental/ml/pytorch.py | 8 +- .../notebooks/experimental/pytorch.ipynb | 675 ++++++++++++++++++ docs/notebooks/experimental/pytorch.ipynb | 1 + 3 files changed, 680 insertions(+), 4 deletions(-) create mode 100644 api/python/notebooks/experimental/pytorch.ipynb create mode 120000 docs/notebooks/experimental/pytorch.ipynb diff --git a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py index 46da101ff..6afb76262 100644 --- a/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py +++ b/api/python/cellxgene_census/src/cellxgene_census/experimental/ml/pytorch.py @@ -211,7 +211,7 @@ def obs_batch(self) -> pd.DataFrame: class ExperimentDataPipe(pipes.IterDataPipe[Dataset[ObsDatum]]): # type: ignore """ An iterable-style PyTorch ``DataPipe`` that reads obs and X data from a SOMA Experiment, and returns an iterator of - tuples of PyTorch ``Tensor``s: + tuples of PyTorch ``Tensor`` objects. >>> (tensor([0., 0., 0., 0., 0., 1., 0., 0., 0.]), # X data tensor([2415, 0, 0], dtype=torch.int32)) # obs data, encoded @@ -459,9 +459,9 @@ def experiment_dataloader( **dataloader_kwargs: Any, ) -> DataLoader: """ - Factory method for PyTorch ``DataLoader``. Provides a safer, more convenient interface for instantiating a - ``DataLoader`` that works with the ``ExperimentDataPipe``, since not all of ``DataLoader``'s params can be - used (``batch_size``, ``sampler``, ``batch_sampler``, ``collate_fn``). + Factory method for PyTorch ``DataLoader``. 
This method can be used to safely instantiate a + ``DataLoader`` that works with the ``ExperimentDataPipe``, since not all of the ``DataLoader`` constructor params + can be used (``batch_size``, ``sampler``, ``batch_sampler``, ``collate_fn``). Returns: PyTorch ``DataLoader``. diff --git a/api/python/notebooks/experimental/pytorch.ipynb b/api/python/notebooks/experimental/pytorch.ipynb new file mode 100644 index 000000000..b3776631b --- /dev/null +++ b/api/python/notebooks/experimental/pytorch.ipynb @@ -0,0 +1,675 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9c8899e7", + "metadata": {}, + "source": [ + "## Training a PyTorch Model\n", + "\n", + "This tutorial shows how to train a Logistic Regression model in PyTorch using the Census API's `experimental.ml.ExperimentDataPipe` class. This is intended only to demonstrate the use of the `ExperimentDataPipe`, and not as an example of how to train a biologically useful model.\n", + "\n", + "This tutorial assumes a basic familiarity with PyTorch and the Census API. See the [`Querying and fetching the single-cell data and cell/gene metadata`](../api_demo/census_query_extract.ipynb) notebook tutorial for a quick primer on Census API usage.\n", + "\n", + "**Contents**\n", + "* [Open the Census](#Open-the-Census)\n", + "* [Create a DataLoader](#Create-a-DataLoader)\n", + "* [Define the model](#Define-the-model)\n", + "* [Train the model](#Train-the-model)\n", + "* [Make predictions with the model](#Make-predictions-with-the-model)\n" + ] + }, + { + "cell_type": "markdown", + "id": "f874fb88", + "metadata": {}, + "source": [ + "## Open the Census\n", + "\n", + "First, obtain a handle to the Census data, in the usual manner:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "c3dd549f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The \"stable\" release is currently 2023-05-15. 
Specify 'census_version=\"2023-05-15\"' in future calls to open_soma() to ensure data consistency.\n" + ] + } + ], + "source": [ + "import cellxgene_census\n", + "\n", + "census = cellxgene_census.open_soma()" + ] + }, + { + "cell_type": "markdown", + "id": "580b29f2", + "metadata": {}, + "source": [ + "## Create a DataLoader\n", + "\n", + "To train a model in PyTorch using this `census` data object, first instantiate an `ExperimentDataPipe` as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "54896e6f", + "metadata": {}, + "outputs": [], + "source": [ + "import cellxgene_census.experimental.ml as census_ml\n", + "import tiledbsoma as soma\n", + "\n", + "experiment_datapipe = census_ml.ExperimentDataPipe(\n", + " census[\"census_data\"][\"homo_sapiens\"],\n", + " measurement_name=\"RNA\",\n", + " X_name=\"raw\",\n", + " obs_query=soma.AxisQuery(value_filter=\"tissue_general == 'tongue' and is_primary_data == True\"),\n", + " obs_column_names=[\"cell_type\"],\n", + " batch_size=16,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6c7c17c3", + "metadata": {}, + "source": [ + "This class provides an implementation of PyTorch's `DataPipe` interface, which defines a common mechanism for wrapping and accessing training data from any underlying source. The `ExperimentDataPipe` class encapsulates the details of querying and retrieving Census data from a single SOMA `Experiment` and returning it to the caller as PyTorch Tensors. 
Most importantly, it retrieves the data lazily from the Census in batches, avoiding having to load the entire training dataset into memory at once.\n", + "\n", + "The constructor only requires a single parameter, `experiment`, which is a `soma.Experiment` containing the data of the organism to be used for training.\n", + "\n", + "To retrieve a subset of the Experiment's data, along either the `obs` or `var` axes, you may specify query filters via the `obs_query` and `var_query` parameters, which are both `soma.AxisQuery` objects.\n", + "\n", + "The values for the prediction label(s) that you intend to use for training are specified via the `obs_column_names` array. \n", + "\n", + "Finally, the `batch_size` allows you to specify the number of cells to return in each training iteration step. You may exclude this parameter if you want single rows (`batch_size=1`).\n", + "\n", + "(Note: PyTorch also provides `DataSet` as a legacy interface for wrapping and accessing training data sources, but a `DataPipe` can be used interchangeably.)" + ] + }, + { + "cell_type": "markdown", + "id": "84ac17d2", + "metadata": {}, + "source": [ + "You can inspect the shape of the full dataset, without causing the full dataset to be loaded:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "70a2ddbe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(15020, 60664)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "experiment_datapipe.shape" + ] + }, + { + "cell_type": "markdown", + "id": "5251109a", + "metadata": {}, + "source": [ + "### Splitting the dataset\n", + "\n", + "You may split the overall dataset into the typical training, validation, and test sets by using the PyTorch [`RandomSplitter`](https://pytorch.org/data/main/generated/torchdata.datapipes.iter.RandomSplitter.html#torchdata.datapipes.iter.RandomSplitter) `DataPipe`. 
Using PyTorch's functional form for chaining `DataPipe`s, this is done as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "133f594f", + "metadata": {}, + "outputs": [], + "source": [ + "train_datapipe, validation_datapipe, test_datapipe = experiment_datapipe.random_split(\n", + " weights={\"train\": 0.8, \"validation\": 0.1, \"test\": 0.1}, seed=1\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "48411fe7", + "metadata": {}, + "source": [ + "### Shuffling the training dataset\n", + "\n", + "Commonly, you will want to randomize the ordering of the training data for each training epoch. You can use PyTorch [`Shuffler`](https://pytorch.org/data/main/generated/torchdata.datapipes.iter.Shuffler.html) `DataPipe`, chaining it to the earlier `train_datapipe` `DataPipe`. Again, using PyTorch's functional form for chaining the `DataPipe`s, we have:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7dfe3c75", + "metadata": {}, + "outputs": [], + "source": [ + "shuffled_train_datapipe = train_datapipe.shuffle()" + ] + }, + { + "cell_type": "markdown", + "id": "6c5d57bf", + "metadata": {}, + "source": [ + "Note that `shuffle()` operates on a fixed size of training rows, which is 10,000 by default, and can be specified via the `buffer_size` param. Increasing this value may be useful for improving the heterogeneity of cells appearing within a given shuffling \"window\", as low heterogeneity may result in overfitting in the model. Note, however, that larger `buffer_size` values will require more memory. \n", + "\n", + "If the training data is known to fit fully within available memory, you can shuffle the entire dataset using `experiment_datapipe.shuffle(len(experiment_datapipe))`."
+ ] + }, + { + "cell_type": "markdown", + "id": "a825bccf", + "metadata": {}, + "source": [ + "### Create the DataLoader\n", + "\n", + "With the full set of DataPipe operations chained together, we can now instantiate a PyTorch [`DataLoader`](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) on the training data. " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "39d30df2", + "metadata": {}, + "outputs": [], + "source": [ + "experiment_dataloader = census_ml.experiment_dataloader(shuffled_train_datapipe)" + ] + }, + { + "cell_type": "markdown", + "id": "8a3cbe3f", + "metadata": {}, + "source": [ + "Alternately, you can instantiate a `DataLoader` object directly via its constructor. However, many of the parameters are not usable with iterable-style DataPipes, which is the case for `ExperimentDataPipe`. In particular, the `batch_size`, `sampler`, `batch_sampler`, `collate_fn` parameters should not be specified. Using `experiment_dataloader` helps enforce correct usage." 
+ ] + }, + { + "cell_type": "markdown", + "id": "fb9e93b6", + "metadata": {}, + "source": [ + "## Define the model\n", + "\n", + "With the training data retrieval code now in place, we can move on to defining a simple logistic regression model, using PyTorch's `torch.nn.Linear` class:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6b792b4b", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import torch\n", + "\n", + "\n", + "class LogisticRegression(torch.nn.Module):\n", + " def __init__(self, input_dim, output_dim):\n", + " super(LogisticRegression, self).__init__()\n", + " self.linear = torch.nn.Linear(input_dim, output_dim)\n", + "\n", + " def forward(self, x):\n", + " outputs = torch.sigmoid(self.linear(x))\n", + " return outputs" + ] + }, + { + "cell_type": "markdown", + "id": "0e1752ef", + "metadata": {}, + "source": [ + "Next, we define a function to train the model for a single epoch:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b744cd21", + "metadata": {}, + "outputs": [], + "source": [ + "def train_epoch(model, train_dataloader, loss_fn, optimizer, device):\n", + " model.train()\n", + " train_loss = 0\n", + " train_correct = 0\n", + " train_total = 0\n", + "\n", + " for batch in train_dataloader:\n", + " optimizer.zero_grad()\n", + " X_batch, y_batch = batch\n", + "\n", + " X_batch = X_batch.to(device)\n", + "\n", + " # Perform prediction\n", + " outputs = model(X_batch)\n", + "\n", + " # Determine the predicted label\n", + " probabilities = torch.nn.functional.softmax(outputs, 1)\n", + " predictions = torch.argmax(probabilities, axis=1).cpu()\n", + "\n", + " # Compute the loss and perform back propagation\n", + "\n", + " # Extract the cell_type labels, which are in the second column\n", + " y_batch = y_batch[:, 1]\n", + " y_batch = y_batch.to(device).cpu()\n", + "\n", + " train_correct += (predictions == y_batch).sum().item()\n", + " train_total += len(predictions)\n", + "\n", +
" loss = loss_fn(outputs, y_batch.long())\n", + " train_loss += loss.item()\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " train_loss /= train_total\n", + " train_accuracy = train_correct / train_total\n", + " return train_loss, train_accuracy" + ] + }, + { + "cell_type": "markdown", + "id": "a0a9fd7e", + "metadata": {}, + "source": [ + "Note the line, `X_batch, y_batch = batch`. Since the `train_dataloader` was configured with `batch_size=16`, these variables will hold tensors of rank 2. The `X_batch` tensor will appear, for example, as:\n", + "\n", + "```\n", + "tensor([[0., 0., 0., ..., 1., 0., 0.],\n", + " [0., 0., 2., ..., 0., 3., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " ...,\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 1., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 8.]])\n", + " \n", + "```\n", + "\n", + "For `batch_size=1`, the tensors will be of rank 1. The `X_batch` tensor will appear, for example, as:\n", + "\n", + "```\n", + "tensor([0., 0., 0., ..., 1., 0., 0.])\n", + "```\n", + " \n", + "Secondly, note the line, `y_batch = y_batch[:, 1]`. This line is extracting the user-specified `obs` `cell_type` training labels from the second column of the `y_batch` rank 2 Tensor. For example, this would take a `y_batch` tensor that looks like:\n", + "```\n", + "tensor([[42496620, 1],\n", + " [42496621, 1],\n", + " [42496622, 3],\n", + " ...,\n", + " [42496633, 2],\n", + " [42496634, 1],\n", + " [42496635, 4]], dtype=torch.int32)\n", + " \n", + "```\n", + "and return:\n", + "```\n", + "tensor([1, 1, 3, ..., 2, 1, 4])\n", + "\n", + "```\n", + "Note that cell type values are integer-encoded values, which can be decoded using `experiment_datapipe.obs_encoders()` (more on this below).\n" + ] + }, + { + "cell_type": "markdown", + "id": "79f8b731", + "metadata": {}, + "source": [ + "## Train the model\n", + "\n", + "Finally, we are ready to train the model. 
Here we instantiate the model, a loss function, and an optimization method and then iterate through the desired number of training epochs. Note how the `train_dataloader` is passed into `train_epoch`, where for each epoch it will provide a new iterator through the training dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "733ec2fb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1: Train Loss: 0.1156352 Accuracy 0.4472\n", + "Epoch 2: Train Loss: 0.1096967 Accuracy 0.7261\n", + "Epoch 3: Train Loss: 0.1082206 Accuracy 0.7729\n", + "Epoch 4: Train Loss: 0.1071653 Accuracy 0.8268\n", + "Epoch 5: Train Loss: 0.1067613 Accuracy 0.8481\n" + ] + } + ], + "source": [ + "RANDOM_SEED = 12345\n", + "np.random.seed(RANDOM_SEED)\n", + "torch.manual_seed(RANDOM_SEED)\n", + "\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "\n", + "# The size of the input dimension is the number of genes\n", + "input_dim = experiment_datapipe.shape[1]\n", + "\n", + "# The size of the output dimension is the number of distinct cell_type values\n", + "cell_type_encoder = experiment_datapipe.obs_encoders()[\"cell_type\"]\n", + "output_dim = len(cell_type_encoder.classes_)\n", + "\n", + "model = LogisticRegression(input_dim, output_dim).to(device)\n", + "loss_fn = torch.nn.CrossEntropyLoss()\n", + "optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)\n", + "\n", + "for epoch in range(5):\n", + " train_loss, train_accuracy = train_epoch(model, experiment_dataloader, loss_fn, optimizer, device)\n", + " print(f\"Epoch {epoch + 1}: Train Loss: {train_loss:.7f} Accuracy {train_accuracy:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "5e0ffb48", + "metadata": {}, + "source": [ + "## Make predictions with the model\n", + "\n", + "To make predictions with the model, we first create a new `DataLoader` using the `test_datapipe`, which provides the \"test\" 
split of the original dataset. For this example, we will only make predictions on a single batch of data from the test split." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d3e33edc", + "metadata": {}, + "outputs": [], + "source": [ + "experiment_dataloader = census_ml.experiment_dataloader(test_datapipe)\n", + "\n", + "X_batch, y_batch = next(iter(experiment_dataloader))" + ] + }, + { + "cell_type": "markdown", + "id": "19fabd54", + "metadata": {}, + "source": [ + "Next, we invoke the model on the `X_batch` input data and extract the predictions:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "00e12182", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([5, 1, 5, 5, 5, 5, 5, 5, 1, 5, 5, 1, 5, 5, 1, 1])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.eval()\n", + "\n", + "outputs = model(X_batch)\n", + "\n", + "probabilities = torch.nn.functional.softmax(outputs, 1)\n", + "predictions = torch.argmax(probabilities, axis=1).cpu()\n", + "\n", + "predictions" + ] + }, + { + "cell_type": "markdown", + "id": "7cb88a5f", + "metadata": {}, + "source": [ + "The predictions are returned as the encoded values of `cell_type` label. To recover the original cell type labels as strings, we decode using the `cell_type_encoder`. Here, we obtain the encoder from `experiment_datapipe.obs_encoders()`.\n", + "\n", + "At inference time, if the model inputs are not obtained via an `ExperimentDataPipe`, one could pickle the encoder at training time and save it along with the model. Then, at inference time it can be unpickled and used as shown below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1cfff865", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['epithelial cell', 'basal cell', 'epithelial cell',\n", + " 'epithelial cell', 'epithelial cell', 'epithelial cell',\n", + " 'epithelial cell', 'epithelial cell', 'basal cell',\n", + " 'epithelial cell', 'epithelial cell', 'basal cell',\n", + " 'epithelial cell', 'epithelial cell', 'basal cell', 'basal cell'],\n", + " dtype=object)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cell_type_encoder = experiment_datapipe.obs_encoders()[\"cell_type\"]\n", + "\n", + "predicted_cell_types = cell_type_encoder.inverse_transform(predictions)\n", + "\n", + "predicted_cell_types" + ] + }, + { + "cell_type": "markdown", + "id": "16010d09", + "metadata": {}, + "source": [ + "Finally, we create a Pandas DataFrame to examine the predictions:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "f4ac8087", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
actual cell typepredicted cell type
0epithelial cellepithelial cell
1epithelial cellbasal cell
2epithelial cellepithelial cell
3epithelial cellepithelial cell
4epithelial cellepithelial cell
5epithelial cellepithelial cell
6epithelial cellepithelial cell
7epithelial cellepithelial cell
8epithelial cellbasal cell
9epithelial cellepithelial cell
10epithelial cellepithelial cell
11epithelial cellbasal cell
12epithelial cellepithelial cell
13epithelial cellepithelial cell
14epithelial cellbasal cell
15epithelial cellbasal cell
\n", + "
" + ], + "text/plain": [ + " actual cell type predicted cell type\n", + "0 epithelial cell epithelial cell\n", + "1 epithelial cell basal cell\n", + "2 epithelial cell epithelial cell\n", + "3 epithelial cell epithelial cell\n", + "4 epithelial cell epithelial cell\n", + "5 epithelial cell epithelial cell\n", + "6 epithelial cell epithelial cell\n", + "7 epithelial cell epithelial cell\n", + "8 epithelial cell basal cell\n", + "9 epithelial cell epithelial cell\n", + "10 epithelial cell epithelial cell\n", + "11 epithelial cell basal cell\n", + "12 epithelial cell epithelial cell\n", + "13 epithelial cell epithelial cell\n", + "14 epithelial cell basal cell\n", + "15 epithelial cell basal cell" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "pd.DataFrame(\n", + " {\n", + " \"actual cell type\": cell_type_encoder.inverse_transform(y_batch[:, 1].numpy()),\n", + " \"predicted cell type\": predicted_cell_types,\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c2fd35fa", + "metadata": {}, + "source": [ + "As we can see, the predictions are poor. It is left as an exercise to the reader to find a better performing model."
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/notebooks/experimental/pytorch.ipynb b/docs/notebooks/experimental/pytorch.ipynb new file mode 120000 index 000000000..7df8308b4 --- /dev/null +++ b/docs/notebooks/experimental/pytorch.ipynb @@ -0,0 +1 @@ +../../../api/python/notebooks/experimental/pytorch.ipynb \ No newline at end of file From 0a2816ff850bde8104fec8e7f9e950ae2f754b22 Mon Sep 17 00:00:00 2001 From: Andrew Tolopko Date: Fri, 16 Jun 2023 14:04:38 -0400 Subject: [PATCH 2/5] CR feedback --- .../notebooks/experimental/pytorch.ipynb | 187 ++---------------- 1 file changed, 13 insertions(+), 174 deletions(-) diff --git a/api/python/notebooks/experimental/pytorch.ipynb b/api/python/notebooks/experimental/pytorch.ipynb index b3776631b..726f3590b 100644 --- a/api/python/notebooks/experimental/pytorch.ipynb +++ b/api/python/notebooks/experimental/pytorch.ipynb @@ -69,8 +69,10 @@ "import cellxgene_census.experimental.ml as census_ml\n", "import tiledbsoma as soma\n", "\n", + "experiment = census[\"census_data\"][\"homo_sapiens\"]\n", + "\n", "experiment_datapipe = census_ml.ExperimentDataPipe(\n", - " census[\"census_data\"][\"homo_sapiens\"],\n", + " experiment,\n", " measurement_name=\"RNA\",\n", " X_name=\"raw\",\n", " obs_query=soma.AxisQuery(value_filter=\"tissue_general == 'tongue' and is_primary_data == True\"),\n", @@ -92,7 +94,7 @@ "\n", "The values for the prediction label(s) that you intend to use for training are specified via the `obs_column_names` array. 
\n", "\n", - "Finally, the `batch_size` allows you to specify the number of cells to return in each training iteration step. You may exclude this parameter if you want single rows (`batch_size=1`).\n", + "Finally, the `batch_size` allows you to specify the number of obs rows (cells) to return in each training iteration step. You may exclude this parameter if you want single rows (`batch_size=1`).\n", "\n", "(Note: PyTorch also provides `DataSet` as a legacy interface for wrapping and accessing training data sources, but a `DataPipe` can be used interchangeably.)" ] @@ -345,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "733ec2fb", "metadata": {}, "outputs": [ @@ -356,8 +358,7 @@ "Epoch 1: Train Loss: 0.1156352 Accuracy 0.4472\n", "Epoch 2: Train Loss: 0.1096967 Accuracy 0.7261\n", "Epoch 3: Train Loss: 0.1082206 Accuracy 0.7729\n", - "Epoch 4: Train Loss: 0.1071653 Accuracy 0.8268\n", - "Epoch 5: Train Loss: 0.1067613 Accuracy 0.8481\n" + "Epoch 4: Train Loss: 0.1071653 Accuracy 0.8268\n" ] } ], @@ -396,7 +397,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "d3e33edc", "metadata": {}, "outputs": [], @@ -416,21 +417,10 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "00e12182", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([5, 1, 5, 5, 5, 5, 5, 5, 1, 5, 5, 1, 5, 5, 1, 1])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "model.eval()\n", "\n", @@ -454,26 +444,10 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "1cfff865", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['epithelial cell', 'basal cell', 'epithelial cell',\n", - " 'epithelial cell', 'epithelial cell', 'epithelial cell',\n", - " 'epithelial cell', 'epithelial cell', 'basal cell',\n", - " 'epithelial cell', 
'epithelial cell', 'basal cell',\n", - " 'epithelial cell', 'epithelial cell', 'basal cell', 'basal cell'],\n", - " dtype=object)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "cell_type_encoder = experiment_datapipe.obs_encoders()[\"cell_type\"]\n", "\n", @@ -492,145 +466,10 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "f4ac8087", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
actual cell typepredicted cell type
0epithelial cellepithelial cell
1epithelial cellbasal cell
2epithelial cellepithelial cell
3epithelial cellepithelial cell
4epithelial cellepithelial cell
5epithelial cellepithelial cell
6epithelial cellepithelial cell
7epithelial cellepithelial cell
8epithelial cellbasal cell
9epithelial cellepithelial cell
10epithelial cellepithelial cell
11epithelial cellbasal cell
12epithelial cellepithelial cell
13epithelial cellepithelial cell
14epithelial cellbasal cell
15epithelial cellbasal cell
\n", - "
" - ], - "text/plain": [ - " actual cell type predicted cell type\n", - "0 epithelial cell epithelial cell\n", - "1 epithelial cell basal cell\n", - "2 epithelial cell epithelial cell\n", - "3 epithelial cell epithelial cell\n", - "4 epithelial cell epithelial cell\n", - "5 epithelial cell epithelial cell\n", - "6 epithelial cell epithelial cell\n", - "7 epithelial cell epithelial cell\n", - "8 epithelial cell basal cell\n", - "9 epithelial cell epithelial cell\n", - "10 epithelial cell epithelial cell\n", - "11 epithelial cell basal cell\n", - "12 epithelial cell epithelial cell\n", - "13 epithelial cell epithelial cell\n", - "14 epithelial cell basal cell\n", - "15 epithelial cell basal cell" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "\n", From 57053f710b54a0aa70aa8bcbfcd9e0533f1931c3 Mon Sep 17 00:00:00 2001 From: pablo-gar Date: Fri, 16 Jun 2023 15:36:11 -0600 Subject: [PATCH 3/5] Update api/python/notebooks/experimental/pytorch.ipynb --- api/python/notebooks/experimental/pytorch.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/python/notebooks/experimental/pytorch.ipynb b/api/python/notebooks/experimental/pytorch.ipynb index 726f3590b..2f78a8e91 100644 --- a/api/python/notebooks/experimental/pytorch.ipynb +++ b/api/python/notebooks/experimental/pytorch.ipynb @@ -86,7 +86,7 @@ "id": "6c7c17c3", "metadata": {}, "source": [ - "This class provides an implementation of PyTorch's `DataPipe` interface, which defines a common mechanism for wrapping and accessing training data from any underlying source. The `ExperimentDataPipe` class encapsulates the details of querying and retrieving Census data from a single SOMA `Experiment` and returning it to the caller as PyTorch Tensors. 
Most importantly, it retrieves the data lazily from the Census in batches, avoiding having to load the entire training dataset into memory at once.\n", + "This class provides an implementation of PyTorch's [`DataPipe` interface](https://pytorch.org/data/main/torchdata.datapipes.iter.html), which defines a common mechanism for wrapping and accessing training data from any underlying source. The `ExperimentDataPipe` class encapsulates the details of querying and retrieving Census data from a single SOMA `Experiment` and returning it to the caller as PyTorch Tensors. Most importantly, it retrieves the data lazily from the Census in batches, avoiding having to load the entire training dataset into memory at once.\n", "\n", "The constructor only requires a single parameter, `experiment`, which is a `soma.Experiment` containing the data of the organism to be used for training.\n", "\n", From b546e812d1252a8621c40d9c5e85d4f75e13f0b6 Mon Sep 17 00:00:00 2001 From: pablo-gar Date: Fri, 16 Jun 2023 15:36:20 -0600 Subject: [PATCH 4/5] Update api/python/notebooks/experimental/pytorch.ipynb --- api/python/notebooks/experimental/pytorch.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/python/notebooks/experimental/pytorch.ipynb b/api/python/notebooks/experimental/pytorch.ipynb index 2f78a8e91..5af72e253 100644 --- a/api/python/notebooks/experimental/pytorch.ipynb +++ b/api/python/notebooks/experimental/pytorch.ipynb @@ -9,7 +9,7 @@ "\n", "This tutorial shows how to train a Logistic Regression model in PyTorch using the Census API's `experimental.ml.ExperimentDataPipe` class. This is intended only to demonstrate the use of the `ExperimentDataPipe`, and not as an example of how to train a biologically useful model.\n", "\n", - "This tutorial assumes a basic familiarity with PyTorch and the Census API. 
See the [`Querying and fetching the single-cell data and cell/gene metadata`](../api_demo/census_query_extract.ipynb) notebook tutorial for a quick primer on Census API usage.\n", + "This tutorial assumes a basic familiarity with PyTorch and the Census API. See the [Querying and fetching the single-cell data and cell/gene metadata](https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_query_extract.html) notebook tutorial for a quick primer on Census API usage.\n", "\n", "**Contents**\n", "* [Open the Census](#Open-the-Census)\n", From c1667fdedff18cbe168a7ae0c167b91ae4863f68 Mon Sep 17 00:00:00 2001 From: Pablo E Garcia-Nieto Date: Fri, 16 Jun 2023 16:18:27 -0600 Subject: [PATCH 5/5] run full notebook --- .../notebooks/experimental/pytorch.ipynb | 188 ++++++++++++++++-- 1 file changed, 175 insertions(+), 13 deletions(-) diff --git a/api/python/notebooks/experimental/pytorch.ipynb b/api/python/notebooks/experimental/pytorch.ipynb index 5af72e253..54b0fd873 100644 --- a/api/python/notebooks/experimental/pytorch.ipynb +++ b/api/python/notebooks/experimental/pytorch.ipynb @@ -347,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "733ec2fb", "metadata": {}, "outputs": [ @@ -355,10 +355,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Epoch 1: Train Loss: 0.1156352 Accuracy 0.4472\n", - "Epoch 2: Train Loss: 0.1096967 Accuracy 0.7261\n", - "Epoch 3: Train Loss: 0.1082206 Accuracy 0.7729\n", - "Epoch 4: Train Loss: 0.1071653 Accuracy 0.8268\n" + "Epoch 1: Train Loss: 0.1156322 Accuracy 0.4494\n", + "Epoch 2: Train Loss: 0.1095988 Accuracy 0.7357\n", + "Epoch 3: Train Loss: 0.1089615 Accuracy 0.7300\n", + "Epoch 4: Train Loss: 0.1080885 Accuracy 0.7275\n", + "Epoch 5: Train Loss: 0.1101440 Accuracy 0.4220\n" ] } ], @@ -397,7 +398,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "d3e33edc", "metadata": {}, "outputs": [], @@ -417,10 +418,21 @@ }, { 
"cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "00e12182", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "model.eval()\n", "\n", @@ -444,10 +456,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "1cfff865", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array(['basal cell', 'basal cell', 'basal cell', 'basal cell',\n", + " 'basal cell', 'basal cell', 'basal cell', 'basal cell',\n", + " 'basal cell', 'basal cell', 'basal cell', 'basal cell',\n", + " 'basal cell', 'basal cell', 'basal cell', 'basal cell'],\n", + " dtype=object)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cell_type_encoder = experiment_datapipe.obs_encoders()[\"cell_type\"]\n", "\n", @@ -466,10 +493,145 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "f4ac8087", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
actual cell typepredicted cell type
0epithelial cellbasal cell
1epithelial cellbasal cell
2epithelial cellbasal cell
3epithelial cellbasal cell
4epithelial cellbasal cell
5epithelial cellbasal cell
6epithelial cellbasal cell
7epithelial cellbasal cell
8epithelial cellbasal cell
9epithelial cellbasal cell
10epithelial cellbasal cell
11epithelial cellbasal cell
12epithelial cellbasal cell
13epithelial cellbasal cell
14epithelial cellbasal cell
15epithelial cellbasal cell
\n", + "
" + ], + "text/plain": [ + " actual cell type predicted cell type\n", + "0 epithelial cell basal cell\n", + "1 epithelial cell basal cell\n", + "2 epithelial cell basal cell\n", + "3 epithelial cell basal cell\n", + "4 epithelial cell basal cell\n", + "5 epithelial cell basal cell\n", + "6 epithelial cell basal cell\n", + "7 epithelial cell basal cell\n", + "8 epithelial cell basal cell\n", + "9 epithelial cell basal cell\n", + "10 epithelial cell basal cell\n", + "11 epithelial cell basal cell\n", + "12 epithelial cell basal cell\n", + "13 epithelial cell basal cell\n", + "14 epithelial cell basal cell\n", + "15 epithelial cell basal cell" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "\n", @@ -506,7 +668,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.10" } }, "nbformat": 4,