Skip to content

Commit

Permalink
Transformer.predict: do not broadcast to listeners (#345)
Browse files Browse the repository at this point in the history
* Transformer.predict: do not broadcast to listeners

The output of a transformer is passed through in two different ways:

- Prediction: the data is passed through the `Doc._.trf_data` attribute.
- Training: the data is broadcast directly to the transformer's
  listeners.

However, the `Transformer.predict` method breaks the strict separation between
training and prediction by also broadcasting transformer outputs to its
listeners.

This breaks down when we are training a model with an unfrozen
transformer that is also listed in `annotating_components`. The
transformer will first (as part of its update step) broadcast the tensors and
backprop function to its listeners. However, then when acting as an annotating
component, it would immediately override its own output and clear the backprop
function. As a result, gradients will not flow into the transformer.

This change removes the broadcast from the `predict` method. If a listener does
not receive a batch, it will attempt to get the transformer output from the `Doc`
instances. This makes it possible to train a pipeline with a frozen transformer.

* Apply suggestions from code review

Co-authored-by: Sofie Van Landeghem <[email protected]>

* Require spaCy 3.5.0

* Use spaCy error code

* Fix missing import

---------

Co-authored-by: Sofie Van Landeghem <[email protected]>
  • Loading branch information
danieldk and svlandeg authored Jan 30, 2023
1 parent 9e40fc9 commit e66c73d
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 13 deletions.
2 changes: 1 addition & 1 deletion azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ jobs:
condition: and(startsWith(variables['imageName'], 'ubuntu'), eq(variables['python.version'], '3.9'))
- script: |
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.4.0/en_core_web_trf-3.4.0-py3-none-any.whl --no-deps
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.5.0/en_core_web_trf-3.5.0-py3-none-any.whl --no-deps
python -c "import spacy; nlp = spacy.load('en_core_web_trf'); doc = nlp('test')"
displayName: 'Test backwards compatibility for v1.1 models'
condition: and(startsWith(variables['imageName'], 'ubuntu'), eq(variables['python.version'], '3.9'))
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
spacy>=3.4.0,<4.0.0
spacy>=3.5.0,<4.0.0
numpy>=1.15.0
transformers>=3.4.0,<4.27.0
torch>=1.6.0
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ zip_safe = false
include_package_data = true
python_requires = >=3.6
install_requires =
spacy>=3.4.0,<4.0.0
spacy>=3.5.0,<4.0.0
numpy>=1.15.0
transformers>=3.4.0,<4.27.0
torch>=1.6.0
Expand Down
26 changes: 20 additions & 6 deletions spacy_transformers/layers/listener.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Optional, Callable, List
from thinc.api import Model
from spacy.errors import Errors
from spacy.tokens import Doc
from ..data_classes import TransformerData

Expand Down Expand Up @@ -58,16 +59,29 @@ def verify_inputs(self, inputs):

def forward(model: TransformerListener, docs, is_train):
if is_train:
model.verify_inputs(docs)
return model._outputs, model.backprop_and_clear
# This might occur during training when the transformer layer is frozen / hasn't been updated.
# In that case, it should be set to "annotating" so we can retrieve the embeddings from the doc.
if model._batch_id is None:
outputs = []
for doc in docs:
if doc._.trf_data is None:
raise ValueError(Errors.E203.format(name="transformer"))
else:
outputs.append(doc._.trf_data)
return outputs, _empty_backprop
else:
model.verify_inputs(docs)
return model._outputs, model.backprop_and_clear
else:
width = model.get_dim("nO")
outputs = []
for doc in docs:
if doc._.trf_data is None:
outputs.append(
TransformerData.zeros(len(doc), width, xp=model.ops.xp)
)
outputs.append(TransformerData.zeros(len(doc), width, xp=model.ops.xp))
else:
outputs.append(doc._.trf_data)
return outputs, lambda d_data: []
return outputs, _empty_backprop


def _empty_backprop(dX):
return []
3 changes: 0 additions & 3 deletions spacy_transformers/pipeline_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,6 @@ def predict(self, docs: Iterable[Doc]) -> FullTransformerBatch:
activations = FullTransformerBatch.empty(len(docs))
else:
activations = self.model.predict(docs)
batch_id = TransformerListener.get_batch_id(docs)
for listener in self.listeners:
listener.receive(batch_id, activations.doc_data, None)
return activations

def set_annotations(
Expand Down
26 changes: 25 additions & 1 deletion spacy_transformers/tests/test_pipeline_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,9 +459,33 @@ def test_frozen_listener():
# train further with frozen listener
for i in range(2):
losses = {}
nlp.update(examples, sgd=optimizer, losses=losses, exclude=["transformer"])
nlp.update(
examples,
sgd=optimizer,
losses=losses,
exclude=["transformer"],
annotates=["transformer"],
)
doc = nlp(text)

# only tagger was updated
assert nlp.get_pipe("transformer").to_bytes() == transformer_bytes
assert nlp.get_pipe("tagger").to_bytes() != tagger_bytes


def test_no_update_listener_in_predict():
orig_config = Config().from_str(cfg_string)
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
listener = nlp.get_pipe("tagger").model.get_ref("tok2vec").get_ref("listener")
transformer = nlp.get_pipe("transformer")

text = "This is awesome"
examples = [Example.from_dict(nlp.make_doc(text), {"tags": ["A", "B", "C"]})]
docs = [eg.predicted for eg in examples]
nlp.initialize(lambda: examples)

transformer.update(examples)
assert listener._backprop is not None

transformer.predict(docs)
assert listener._backprop is not None

0 comments on commit e66c73d

Please sign in to comment.