
Commit

Merge branch 'torch/compute_loss_dtype' of github.com:edknv/models into torch/compute_loss_dtype
edknv committed Jul 3, 2023
2 parents 8e18051 + 7d252bf commit ddd5bb4
Showing 30 changed files with 654 additions and 168 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cpu-horovod.yml
@@ -72,4 +72,4 @@ jobs:
if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then
extra_pytest_markers="and changed"
fi
-EXTRA_PYTEST_MARKERS="$extra_pytest_markers" MERLIN_BRANCH="$merlin_branch" COMPARE_BRANCH=${{ github.base_ref }} tox -e py38-horovod-cpu
+EXTRA_PYTEST_MARKERS="$extra_pytest_markers" MERLIN_BRANCH="$merlin_branch" COMPARE_BRANCH=${{ github.base_ref }} tox -e horovod-cpu
2 changes: 1 addition & 1 deletion .github/workflows/cpu-nvtabular.yml
@@ -64,4 +64,4 @@ jobs:
- name: Run tests
run: |
merlin_branch="${{ steps.get-branch-name.outputs.branch }}"
-MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e py38-nvtabular-cpu
+MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e nvtabular-cpu
2 changes: 1 addition & 1 deletion .github/workflows/cpu-systems.yml
@@ -64,4 +64,4 @@ jobs:
- name: Run tests
run: |
merlin_branch="${{ steps.get-branch-name.outputs.branch }}"
-MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e py38-systems-cpu
+MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e systems-cpu
2 changes: 1 addition & 1 deletion .github/workflows/cpu-t4r.yml
@@ -60,4 +60,4 @@ jobs:
- name: Run tests
run: |
merlin_branch="${{ steps.get-branch-name.outputs.branch }}"
-MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e py38-transformers4rec-cpu
+MERLIN_BRANCH="$merlin_branch" GIT_COMMIT=$(git rev-parse HEAD) tox -e transformers4rec-cpu
2 changes: 1 addition & 1 deletion .github/workflows/gpu-multi.yml
@@ -56,4 +56,4 @@ jobs:
if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then
extra_pytest_markers="and changed"
fi
-cd ${{ github.workspace }}; EXTRA_PYTEST_MARKERS=$extra_pytest_markers MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e py38-multi-gpu
+cd ${{ github.workspace }}; EXTRA_PYTEST_MARKERS=$extra_pytest_markers MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e multi-gpu
4 changes: 2 additions & 2 deletions .github/workflows/gpu.yml
@@ -34,7 +34,7 @@ jobs:
if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then
extra_pytest_markers="and changed"
fi
-cd ${{ github.workspace }}; PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e py38-gpu
+cd ${{ github.workspace }}; PYTEST_MARKERS="unit and not (examples or integration or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e gpu
tests-examples:
runs-on: 1GPU
@@ -55,4 +55,4 @@ jobs:
if [[ "${{ github.ref }}" != 'refs/heads/main' ]]; then
extra_pytest_markers="and changed"
fi
-cd ${{ github.workspace }}; PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e py38-gpu
+cd ${{ github.workspace }}; PYTEST_MARKERS="(examples or notebook) $extra_pytest_markers" MERLIN_BRANCH=$branch COMPARE_BRANCH=${{ github.base_ref }} tox -e gpu
2 changes: 1 addition & 1 deletion README.md
@@ -111,7 +111,7 @@ You can find more details and information about a low-level API in our overview

### Notebook Examples and Tutorials

-View the example notebooks in the [documentation](https://nvidia-merlin.github.io/models/stable/examples/README.html) to help you become familiar with Merlin Models.
+View the example notebooks in the [documentation](https://nvidia-merlin.github.io/models/stable/examples/) to help you become familiar with Merlin Models.

The same notebooks are available in the `examples` directory from the [Merlin Models](https://github.com/NVIDIA-Merlin/models) GitHub repository.

4 changes: 2 additions & 2 deletions merlin/models/tf/core/tabular.py
@@ -1,5 +1,5 @@
import abc
-import collections
+import collections.abc
import copy
from typing import Dict, List, Optional, Sequence, Union, overload

@@ -600,7 +600,7 @@ def get_config(self):
def select_by_tag(self, tags: Tags) -> Optional["Filter"]:
if isinstance(self.feature_names, Tags):
schema = self.schema.select_by_tag(self.feature_names).select_by_tag(tags)
-elif isinstance(self.feature_names, collections.Sequence):
+elif isinstance(self.feature_names, collections.abc.Sequence):
schema = self.schema.select_by_name(self.feature_names).select_by_tag(tags)
else:
raise RuntimeError(
4 changes: 2 additions & 2 deletions merlin/models/tf/inputs/embedding.py
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-import collections
+import collections.abc
import inspect
from copy import deepcopy
from dataclasses import dataclass
@@ -268,7 +268,7 @@ def select_by_tag(self, tags: Union[Tags, Sequence[Tags]]) -> Optional["Embeddin
-------
An EmbeddingTable if the tags match. If no features match, it returns None.
"""
-if not isinstance(tags, collections.Sequence):
+if not isinstance(tags, collections.abc.Sequence):
tags = [tags]

selected_schema = self.schema.select_by_tag(tags)
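Note on the two `collections` changes above: the container ABCs were removed from the top-level `collections` namespace in Python 3.10, so `isinstance(x, collections.Sequence)` raises `AttributeError` there. A minimal standalone sketch of the behaviour the fix relies on (illustrative, not repository code):

```python
import collections.abc

# On Python 3.10+ the old aliases are gone:
#   isinstance(x, collections.Sequence)  ->  AttributeError
# The abstract base classes live in collections.abc instead.
feature_names = ["user_id", "item_id"]
print(isinstance(feature_names, collections.abc.Sequence))  # True
print(isinstance("user_id", collections.abc.Sequence))      # True: strings are sequences too
```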
2 changes: 2 additions & 0 deletions merlin/models/torch/__init__.py
@@ -17,6 +17,7 @@
from merlin.models.torch import schema
from merlin.models.torch.batch import Batch, Sequence
from merlin.models.torch.block import Block, ParallelBlock
+from merlin.models.torch.blocks.dlrm import DLRMBlock
from merlin.models.torch.blocks.mlp import MLPBlock
from merlin.models.torch.inputs.embedding import EmbeddingTable, EmbeddingTables
from merlin.models.torch.inputs.select import SelectFeatures, SelectKeys
@@ -51,4 +52,5 @@
"Concat",
"Stack",
"schema",
"DLRMBlock",
]
29 changes: 26 additions & 3 deletions merlin/models/torch/block.py
@@ -17,7 +17,7 @@
import inspect
import textwrap
from copy import deepcopy
-from typing import Dict, Optional, TypeVar, Union
+from typing import Dict, Optional, Tuple, TypeVar, Union

import torch
from torch import nn
@@ -27,11 +27,12 @@
from merlin.models.torch.container import BlockContainer, BlockContainerDict
from merlin.models.torch.link import Link, LinkType
from merlin.models.torch.registry import registry
+from merlin.models.torch.utils.traversal_utils import TraversableMixin, leaf
from merlin.models.utils.registry import RegistryMixin
from merlin.schema import Schema


-class Block(BlockContainer, RegistryMixin):
+class Block(BlockContainer, RegistryMixin, TraversableMixin):
"""A base-class that calls it's modules sequentially.
Parameters
@@ -113,6 +114,15 @@ def copy(self) -> "Block":
"""
return deepcopy(self)

+@torch.jit.ignore
+def select(self, selection: schema.Selection) -> "Block":
+    return _select_block(self, selection)
+
+@torch.jit.ignore
+def extract(self, selection: schema.Selection) -> Tuple[nn.Module, nn.Module]:
+    selected = self.select(selection)
+    return _extract_block(self, selection, selected), selected


class ParallelBlock(Block):
"""A base-class that calls its modules in parallel.
@@ -338,6 +348,19 @@ def replace(self, pre=None, branches=None, post=None) -> "ParallelBlock":

return output

+def leaf(self) -> nn.Module:
+    if self.pre:
+        raise ValueError("Cannot call leaf() on a ParallelBlock with a pre-processing stage")
+
+    if len(self.branches) != 1:
+        raise ValueError("Cannot call leaf() on a ParallelBlock with multiple branches")
+
+    first = list(self.branches.values())[0]
+    if hasattr(first, "leaf"):
+        return first.leaf()
+
+    return leaf(first)

def __getitem__(self, idx: Union[slice, int, str]):
if isinstance(idx, str) and idx in self.branches:
return self.branches[idx]
@@ -541,7 +564,7 @@ def _extract_parallel(main, selection, route, name=None):


@schema.extract.register(BlockContainer)
-def _(main, selection, route, name=None):
+def _extract_block(main, selection, route, name=None):
if isinstance(main, ParallelBlock):
return _extract_parallel(main, selection, route=route, name=name)

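The new `select` and `extract` hooks delegate to the schema-selection helpers registered in this module (`_select_block` and `_extract_block`, the latter renamed above from the anonymous `_`). A hedged usage sketch; `block` and its schema are assumptions for illustration, not taken from the repository's tests:

```python
from merlin.schema import Tags

# Assumed: `block` is a merlin.models.torch Block built from a schema that
# contains both USER- and ITEM-tagged columns.
user_branch = block.select(Tags.USER)             # sub-block handling the USER features

# extract() returns the block with that route removed, plus the selected part.
remainder, user_branch = block.extract(Tags.USER)
```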
141 changes: 141 additions & 0 deletions merlin/models/torch/blocks/dlrm.py
@@ -0,0 +1,141 @@
from typing import Dict, Optional

import torch
from torch import nn

from merlin.models.torch.block import Block
from merlin.models.torch.inputs.embedding import EmbeddingTables
from merlin.models.torch.inputs.tabular import TabularInputBlock
from merlin.models.torch.link import Link
from merlin.models.torch.transforms.agg import MaybeAgg, Stack
from merlin.models.utils.doc_utils import docstring_parameter
from merlin.schema import Schema, Tags

_DLRM_REF = """
References
----------
.. [1] Naumov, Maxim, et al. "Deep learning recommendation model for
   personalization and recommendation systems." arXiv preprint arXiv:1906.00091 (2019).
"""


@docstring_parameter(dlrm_reference=_DLRM_REF)
class DLRMInputBlock(TabularInputBlock):
    """Input block for DLRM model.

    Parameters
    ----------
    schema : Schema, optional
        The schema to use for selection. Default is None.
    dim : int
        The dimensionality of the output vectors.
    bottom_block : Block
        Block to pass the continuous features to.
        Note that, the output dimensionality of this block must be equal to ``dim``.

    {dlrm_reference}

    Raises
    ------
    ValueError
        If no categorical input is provided in the schema.
    """

    def __init__(self, schema: Schema, dim: int, bottom_block: Block):
        super().__init__(schema)
        self.add_route(Tags.CATEGORICAL, EmbeddingTables(dim, seq_combiner="mean"))
        self.add_route(Tags.CONTINUOUS, bottom_block)

        if "categorical" not in self:
            raise ValueError("DLRMInputBlock must have a categorical input")


@docstring_parameter(dlrm_reference=_DLRM_REF)
class DLRMInteraction(nn.Module):
    """
    This class defines the forward interaction operation as proposed
    in the DLRM
    `paper https://arxiv.org/pdf/1906.00091.pdf`_ [1]_.

    This forward operation performs elementwise multiplication
    followed by a reduction sum (equivalent to a dot product) of all embedding pairs.

    {dlrm_reference}
    """

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        if not hasattr(self, "triu_indices"):
            self.register_buffer(
                "triu_indices", torch.triu_indices(inputs.shape[1], inputs.shape[1], offset=1)
            )

        interactions = torch.bmm(inputs, torch.transpose(inputs, 1, 2))
        interactions_flat = interactions[:, self.triu_indices[0], self.triu_indices[1]]

        return interactions_flat


class ShortcutConcatContinuous(Link):
    """
    A shortcut connection that concatenates
    continuous input features and intermediate outputs.

    When there's no continuous input, the intermediate output is returned.
    """

    def forward(self, inputs: Dict[str, torch.Tensor]) -> torch.Tensor:
        intermediate_output = self.output(inputs)

        if "continuous" in inputs:
            return torch.cat((inputs["continuous"], intermediate_output), dim=1)

        return intermediate_output


@docstring_parameter(dlrm_reference=_DLRM_REF)
class DLRMBlock(Block):
    """Builds the DLRM architecture, as proposed in the following
    `paper https://arxiv.org/pdf/1906.00091.pdf`_ [1]_.

    Parameters
    ----------
    schema : Schema, optional
        The schema to use for selection. Default is None.
    dim : int
        The dimensionality of the output vectors.
    bottom_block : Block
        Block to pass the continuous features to.
        Note that, the output dimensionality of this block must be equal to ``dim``.
    top_block : Block, optional
        An optional upper-level block of the model.
    interaction : nn.Module, optional
        Interaction module for DLRM.
        If not provided, DLRMInteraction will be used by default.

    {dlrm_reference}

    Raises
    ------
    ValueError
        If no categorical input is provided in the schema.
    """

    def __init__(
        self,
        schema: Schema,
        dim: int,
        bottom_block: Block,
        top_block: Optional[Block] = None,
        interaction: Optional[nn.Module] = None,
    ):
        super().__init__(DLRMInputBlock(schema, dim, bottom_block))

        self.append(
            Block(MaybeAgg(Stack(dim=1)), interaction or DLRMInteraction()),
            link=ShortcutConcatContinuous(),
        )

        if top_block:
            self.append(top_block)
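`DLRMInteraction` is self-contained enough to exercise directly: for `n` embedded features it keeps the strict upper triangle of the `n x n` dot-product matrix, i.e. `n * (n - 1) / 2` values per example. A small sketch of the output shape:

```python
import torch

from merlin.models.torch.blocks.dlrm import DLRMInteraction

interaction = DLRMInteraction()
x = torch.rand(8, 4, 16)     # (batch, n_features=4, embedding dim=16)
print(interaction(x).shape)  # torch.Size([8, 6]): 4 * 3 / 2 pairwise dot products
```

And a hedged sketch of assembling the full block; the schema and layer sizes are assumptions (the schema would normally come from a Merlin dataset and must contain both categorical and continuous columns), and the bottom block must end in `dim` units:

```python
import merlin.models.torch as mm

dlrm = mm.DLRMBlock(
    schema,                               # assumed merlin.schema.Schema
    dim=64,
    bottom_block=mm.MLPBlock([128, 64]),  # projects continuous features to dim=64
    top_block=mm.MLPBlock([128, 64, 32]),
)
```

`ShortcutConcatContinuous` then concatenates the bottom block's continuous output back onto the interaction output before the optional top block, and simply passes the interaction output through when the schema has no continuous features.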
2 changes: 1 addition & 1 deletion merlin/models/torch/models/base.py
@@ -108,7 +108,7 @@ def configure_optimizers(self):

def model_outputs(self) -> List[ModelOutput]:
"""Finds all instances of `ModelOutput` in the model."""
-return module_utils.find_all_instances(self, ModelOutput)
+return self.find(ModelOutput)

def first(self) -> nn.Module:
"""Returns the first block in the model."""
16 changes: 7 additions & 9 deletions merlin/models/torch/outputs/classification.py
@@ -36,24 +36,22 @@ class BinaryOutput(ModelOutput):
The metrics used for evaluation. Default includes Accuracy, AUROC, Precision, and Recall.
"""

+DEFAULT_LOSS_CLS = nn.BCEWithLogitsLoss
+DEFAULT_METRICS_CLS = (Accuracy, AUROC, Precision, Recall)

def __init__(
self,
schema: Optional[ColumnSchema] = None,
-loss: nn.Module = nn.BCEWithLogitsLoss(),
-metrics: Sequence[Metric] = (
-    Accuracy(task="binary"),
-    AUROC(task="binary"),
-    Precision(task="binary"),
-    Recall(task="binary"),
-),
+loss: Optional[nn.Module] = None,
+metrics: Sequence[Metric] = (),
):
"""Initializes a BinaryOutput object."""
super().__init__(
nn.LazyLinear(1),
nn.Sigmoid(),
schema=schema,
-loss=loss,
-metrics=metrics,
+loss=loss or self.DEFAULT_LOSS_CLS(),
+metrics=metrics or [m(task="binary") for m in self.DEFAULT_METRICS_CLS],
)

def setup_schema(self, target: Optional[Union[ColumnSchema, Schema]]):
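The signature change above replaces module instances in default arguments with class-level `DEFAULT_LOSS_CLS` / `DEFAULT_METRICS_CLS` factories. A standalone sketch of the pitfall this avoids (plain PyTorch, not repository code): a default argument is evaluated once at definition time, so every output head would otherwise share the same loss and metric objects.

```python
from typing import Optional

import torch.nn as nn


class SharedDefault:
    def __init__(self, loss: nn.Module = nn.BCEWithLogitsLoss()):
        self.loss = loss  # the same module object is reused by every instance


class FreshDefault:
    DEFAULT_LOSS_CLS = nn.BCEWithLogitsLoss

    def __init__(self, loss: Optional[nn.Module] = None):
        self.loss = loss or self.DEFAULT_LOSS_CLS()  # a new module per instance


assert SharedDefault().loss is SharedDefault().loss    # shared
assert FreshDefault().loss is not FreshDefault().loss  # independent
```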
11 changes: 7 additions & 4 deletions merlin/models/torch/outputs/regression.py
@@ -36,18 +36,21 @@ class RegressionOutput(ModelOutput):
The metrics used for evaluation. Default is MeanSquaredError.
"""

+DEFAULT_LOSS_CLS = nn.MSELoss
+DEFAULT_METRICS_CLS = (MeanSquaredError,)

def __init__(
self,
schema: Optional[ColumnSchema] = None,
-loss: nn.Module = nn.MSELoss(),
-metrics: Sequence[Metric] = (MeanSquaredError(),),
+loss: Optional[nn.Module] = None,
+metrics: Sequence[Metric] = (),
):
"""Initializes a RegressionOutput object."""
super().__init__(
nn.LazyLinear(1),
schema=schema,
-loss=loss,
-metrics=metrics,
+loss=loss or self.DEFAULT_LOSS_CLS(),
+metrics=metrics or [m() for m in self.DEFAULT_METRICS_CLS],
)

def setup_schema(self, target: Optional[Union[ColumnSchema, Schema]]):
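With both output classes on the same pattern, a hedged usage sketch (import paths follow the files above; the comments describe what the defaults resolve to):

```python
from torch import nn

from merlin.models.torch.outputs.classification import BinaryOutput
from merlin.models.torch.outputs.regression import RegressionOutput

click = BinaryOutput()       # BCEWithLogitsLoss + binary Accuracy/AUROC/Precision/Recall
rating = RegressionOutput()  # MSELoss + MeanSquaredError

# Explicit overrides behave as before:
rating_l1 = RegressionOutput(loss=nn.L1Loss())
```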
5 changes: 5 additions & 0 deletions merlin/models/torch/router.py
@@ -88,6 +88,9 @@
"""

routing_module = schema.select(self.selectable, selection)
+if not routing_module:
+    return self

if module is not None:
schema.setup_schema(module, routing_module.schema)

@@ -100,6 +103,8 @@
branch = module
else:
if self.prepend_routing_module:
+if not routing_module:
+    return self
branch = routing_module
else:
raise ValueError("Must provide a module.")
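The early return makes `add_route` a no-op when the selection matches nothing in the schema, which is what lets `DLRMInputBlock` route `Tags.CONTINUOUS` unconditionally even for purely categorical schemas. A hedged sketch; the schema and MLP sizes are assumptions for illustration:

```python
from merlin.models.torch.blocks.mlp import MLPBlock
from merlin.models.torch.inputs.tabular import TabularInputBlock
from merlin.schema import Tags

# Assumed: `categorical_only_schema` is a Schema with no CONTINUOUS columns.
inputs = TabularInputBlock(categorical_only_schema)

# With the early return above, this returns `inputs` unchanged instead of
# creating a branch for a selection that matches nothing.
inputs.add_route(Tags.CONTINUOUS, MLPBlock([64]))
```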
(The remaining 13 changed files in this commit are not shown here.)

0 comments on commit ddd5bb4
