Get rid of expensive Blockwise(Reshape)
ricardoV94 committed Jul 4, 2024
1 parent 47cd634 commit 2e95049
Showing 5 changed files with 215 additions and 91 deletions.
64 changes: 55 additions & 9 deletions pytensor/tensor/rewriting/blockwise.py
@@ -1,15 +1,19 @@
from pytensor import Variable
from pytensor.compile.mode import optdb
from pytensor.graph import Constant, node_rewriter
from pytensor.graph.replace import vectorize_node
from pytensor.graph.rewriting.basic import copy_stack_trace, out2in
from pytensor.tensor.basic import Alloc, ARange, alloc, shape_padleft
from pytensor.tensor.blockwise import Blockwise
from pytensor.tensor.elemwise import DimShuffle
from pytensor.tensor.math import Dot
from pytensor.tensor.rewriting.basic import (
register_canonicalize,
register_specialize,
register_stabilize,
)
from pytensor.tensor.rewriting.uncanonicalize import local_dimshuffle_alloc
from pytensor.tensor.shape import Reshape
from pytensor.tensor.subtensor import AdvancedIncSubtensor, AdvancedSubtensor, Subtensor


@@ -70,7 +74,7 @@ def local_eager_useless_unbatched_blockwise(fgraph, node):
Dot | Alloc | ARange | Subtensor | AdvancedSubtensor | AdvancedIncSubtensor,
):
# Many Dot-related rewrites (eg, all of BlasOpt) happen before specialize
# These other Ops can't always be trivially vectored at runtime,
# These other Ops can't always be trivially vectorized at runtime,
# since their inputs may imply non-rectangular shapes.
return local_useless_unbatched_blockwise.fn(fgraph, node)

@@ -86,6 +90,18 @@ def _squeeze_left(x, stop_at_dim: int | None = None):
return x.squeeze(axis=tuple(range(squeeze_ndim)))


def alloc_or_expand_dims_of_alloc(var: Variable) -> bool:
return var.owner and (
isinstance(var.owner.op, Alloc)
or (
isinstance(var.owner.op, DimShuffle)
and var.owner.inputs[0].owner
and isinstance(var.owner.inputs[0].owner.op, Alloc)
)
)


@register_canonicalize("shape_unsafe")
@register_specialize("shape_unsafe")
@node_rewriter([Blockwise])
def local_blockwise_alloc(fgraph, node):
@@ -97,19 +113,25 @@ def local_blockwise_alloc(fgraph, node):
BOp(matrix, alloc(vector, 10, 5)) -> BOp(matrix, vector)
"""

if not any(isinstance(inp.owner.op, Alloc) for inp in node.inputs if inp.owner):
return None

op: Blockwise = node.op # type: ignore

batch_ndim = op.batch_ndim(node)
if not batch_ndim:
return None

if not any(alloc_or_expand_dims_of_alloc(var) for var in node.inputs):
return None

new_inputs = []
batch_shapes = []
can_push_any_alloc = False
for inp, inp_sig in zip(node.inputs, op.inputs_sig):
if inp.owner and isinstance(inp.owner.op, DimShuffle):
# Convert DimShuffle of Alloc to Alloc
new_inp = local_dimshuffle_alloc.transform(None, inp.owner)
if new_inp:
[inp] = new_inp

if inp.owner and isinstance(inp.owner.op, Alloc):
# Push batch dims from Alloc
value, *shape = inp.owner.inputs
@@ -167,17 +189,15 @@ def local_blockwise_alloc(fgraph, node):
missing_ndim = old_out_type.ndim - new_out_type.ndim
batch_shape = ([1] * missing_ndim + list(new_outs[0].shape))[:batch_ndim]
for i, batch_dims in enumerate(zip(*batch_shapes)): # Transpose shape tuples
if old_out_type.broadcastable[i]:
continue
for batch_dim in batch_dims:
if batch_dim == 1:
continue
batch_shape[i] = batch_dim
if isinstance(batch_dim, Constant):
# Give preference to Constants
batch_shape[i] = batch_dim
break
elif old_out_type.broadcastable[i]:
# Only use non Constant shapes if absolutely necessary
# Otherwise, we use the shape of the non-alloc output
batch_shape[i] = batch_dim

copy_stack_trace(node.outputs, new_outs)
new_outs = [
@@ -190,3 +210,29 @@ def local_blockwise_alloc(fgraph, node):
]
copy_stack_trace(node.outputs, new_outs)
return new_outs
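As a rough illustration (not part of this commit), the docstring's transformation `BOp(matrix, alloc(vector, 10, 5)) -> BOp(matrix, vector)` could be exercised like this; the `MatrixPinv` core op and the variable names are picked only for demonstration:

```python
from pytensor.graph import rewrite_graph
from pytensor.tensor import alloc, tensor
from pytensor.tensor.blockwise import Blockwise
from pytensor.tensor.nlinalg import MatrixPinv

# A Blockwise op whose only batch dimension comes from an Alloc
batched_pinv = Blockwise(MatrixPinv(hermitian=False))
v = tensor("v", shape=(3, 3))
out = batched_pinv(alloc(v, 5, 3, 3))

# Under the "shape_unsafe" canonicalize/specialize rewrites, the Alloc is
# pushed out of the Blockwise: the core op should run once on `v`, with the
# result broadcast/allocated back to the (5, 3, 3) batch shape.
out_opt = rewrite_graph(out, include=("canonicalize", "specialize"))
```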


@register_canonicalize
@register_specialize
@node_rewriter([Blockwise])
def local_blockwise_reshape(fgraph, node):
"""Rewrite away square Blockwise reshapes.
Reshape is tricky to vectorize eagerly, because a graph like
`x.reshape([x.shape[0] * x.shape[1], -1])` has many operations
that must be vectorized before we arrive at the reshape operation.
For the square Reshape case, we must wait for all the intermediate
operations to be lifted as Allocs
"""
if not isinstance(node.op.core_op, Reshape):
return None

x, output_shape = node.inputs
batch_ndim = node.op.batch_ndim(node)
if all(output_shape.type.broadcastable[:batch_ndim]):
batched_shape = x.shape[:batch_ndim]
core_reshape = _squeeze_left(output_shape, batch_ndim)
new_out = x.reshape([*tuple(batched_shape), *tuple(core_reshape)])
copy_stack_trace(node.outputs[0], new_out)
return [new_out]
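For a quick sense of the end-to-end effect, here is a sketch condensed from the new test added further down (variable names differ; the rewrite passes are the same ones the test requests):

```python
import pytensor.tensor as pt
from pytensor.graph import rewrite_graph, vectorize_graph
from pytensor.tensor.blockwise import Blockwise
from pytensor.tensor.shape import Reshape

x = pt.tensor("x", shape=(None, None, None))
y = x.reshape([x.shape[0] * x.shape[1], -1])

# Vectorizing with an extra leading dimension yields Blockwise(Reshape)
batched_x = pt.tensor("batched_x", shape=(None, None, None, None))
batched_y = vectorize_graph(y, {x: batched_x})
assert isinstance(batched_y.owner.op, Blockwise)

# The new rewrite reduces it to an ordinary Reshape on the batched input
rewritten = rewrite_graph(batched_y, include=("canonicalize", "specialize"), clone=True)
assert isinstance(rewritten.owner.op, Reshape)
```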
121 changes: 56 additions & 65 deletions pytensor/tensor/rewriting/shape.py
@@ -32,7 +32,6 @@
from pytensor.tensor.rewriting.basic import (
register_canonicalize,
register_specialize,
register_stabilize,
register_useless,
topo_constant_folding,
)
@@ -749,51 +748,43 @@ def apply(self, fgraph):
pytensor.compile.mode.optdb.register("UnShapeOpt", UnShapeOptimizer(), position=10)


def local_reshape_chain(op):
@node_rewriter([op])
def f(fgraph, node):
"""
Reshape(Reshape(shape1),shape2) -> Reshape(shape2)
"""
if not check_chain(node, op, op):
return False

# TODO: this can permit a failing program to run by eliminating
# the lower reshape
rval = node.op(node.inputs[0].owner.inputs[0], node.inputs[1])

# Copy over stacktrace from previous output node, as any error
# in new computational graph would have been caused by last op
# in the old computational graph.
copy_stack_trace(node.outputs, rval)

# It might happen that the desired output of this node has a
# broadcastable pattern that does not match that of 'rval'. This is
# when originally, we were able to figure out that one of the
# dimensions of the reshape is one, but some other transformation
# replaced the shape by one for which this cannot be guessed.
# We should try to figure out why we lost the information about this
# constant value... but in the meantime, better not apply this
# rewrite.
if rval.type.ndim == node.outputs[0].type.ndim and all(
s1 == s2
for s1, s2 in zip(rval.type.shape, node.outputs[0].type.shape)
if s1 == 1 or s2 == 1
):
return [rval]
else:
return False

return f
@register_canonicalize("shape_unsafe")
@register_specialize("shape_unsafe")
@node_rewriter([Reshape])
def local_reshape_chain(fgraph, node):
"""
Reshape(Reshape(x, shape1),shape2) -> Reshape(x, shape2)
"""
if not check_chain(node, Reshape, Reshape):
return False

register_canonicalize(local_reshape_chain(Reshape), name="local_reshape_chain")
rval = node.op(node.inputs[0].owner.inputs[0], node.inputs[1])

# Copy over stacktrace from previous output node, as any error
# in new computational graph would have been caused by last op
# in the old computational graph.
copy_stack_trace(node.outputs, rval)

# It might happen that the desired output of this node has a
# broadcastable pattern that does not match that of 'rval'. This is
# when originally, we were able to figure out that one of the
# dimensions of the reshape is one, but some other transformation
# replaced the shape by one for which this cannot be guessed.
# We should try to figure out why we lost the information about this
# constant value... but in the meantime, better not apply this
# rewrite.
if rval.type.ndim == node.outputs[0].type.ndim and all(
s1 == s2
for s1, s2 in zip(rval.type.shape, node.outputs[0].type.shape)
if s1 == 1 or s2 == 1
):
return [rval]
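The chain collapse in the docstring amounts to something like the following toy sketch (shapes arbitrary, not from this commit):

```python
import pytensor.tensor as pt
from pytensor.graph import rewrite_graph

x = pt.matrix("x")
y = x.reshape((4, 6)).reshape((2, 12))

# After canonicalization only the outer reshape should survive:
# Reshape(Reshape(x, (4, 6)), (2, 12)) -> Reshape(x, (2, 12))
y_opt = rewrite_graph(y, include=("canonicalize",))
```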


@register_useless
@register_canonicalize
@register_stabilize
@register_useless("shape_unsafe")
@register_canonicalize("shape_unsafe")
@register_specialize("shape_unsafe")
@node_rewriter([Reshape])
def local_useless_reshape(fgraph, node):
"""Remove two kinds of useless `Reshape`.
@@ -802,24 +793,17 @@ def local_useless_reshape(fgraph, node):
- Remove `Reshape` when reshaping to the shape of the input.
"""
inp = node.inputs[0]
output = node.outputs[0]
output_shape = node.inputs[1]
inp, output_shape = node.inputs
[output] = node.outputs

if inp.type.ndim != output.type.ndim:
return False

# Simple case: both input and output have a single dimension.
# TODO FIXME XXX: This could hide errors if the user provides inconsistent
# shapes.
if (
inp.type.ndim == 1
and output.type.ndim == 1
and all(
s1 == s2
for s1, s2 in zip(inp.type.shape, output.type.shape)
if s1 == 1 or s2 == 1
)
and inp.type.broadcastable == output.type.broadcastable
):
return [inp]

@@ -832,8 +816,15 @@ def local_useless_reshape(fgraph, node):

# Match Reshape(x, [x.shape[0], ..., x.shape[-1]]), accounting for
# broadcastable and constant dimensions
if output_shape.owner and isinstance(output_shape.owner.op, MakeVector):
output_shape_is = output_shape.owner.inputs
if isinstance(output_shape, Constant) or (
output_shape.owner and isinstance(output_shape.owner.op, MakeVector)
):
if isinstance(output_shape, Constant):
output_shape_is = [
as_tensor_variable(dim, ndim=0) for dim in output_shape.data
]
else:
output_shape_is = output_shape.owner.inputs

shape_feature = getattr(fgraph, "shape_feature", None)

@@ -865,9 +856,9 @@ def local_useless_reshape(fgraph, node):
shape_match[dim] = True
continue

# Match 1 if input.type.shape[dim] == 1
# Match constant if input.type.shape[dim] == constant
cst_outshp_i = extract_constant(outshp_i, only_process_constants=1)
if inp.type.shape[dim] == 1 and cst_outshp_i == 1:
if inp.type.shape[dim] == cst_outshp_i:
shape_match[dim] = True
continue

@@ -881,17 +872,18 @@ def local_useless_reshape(fgraph, node):
if shape_feature:
inpshp_i = shape_feature.get_shape(inp, dim)
if inpshp_i == outshp_i or (
extract_constant(inpshp_i, only_process_constants=1)
== extract_constant(outshp_i, only_process_constants=1)
extract_constant(inpshp_i, only_process_constants=True)
== extract_constant(outshp_i, only_process_constants=True)
):
shape_match[dim] = True
continue

if all(shape_match) and nb_m1 <= 1:
if nb_m1 <= 1 and all(shape_match):
return [inp]

if (nb_m1 == 0) and (shape_match.count(False) == output.type.ndim - 1):
return [inp]

# TODO later: if all the shapes except one match, we may want to
# consider it useless as well, like we do in the 1-dim case.
return False
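A toy sketch of the "reshape to the input's own shape" case, exercising the new constant-dimension match (`inp.type.shape[dim] == cst_outshp_i`); it assumes only the standard tensor creation helpers and is not part of the diff:

```python
import pytensor.tensor as pt
from pytensor.graph import rewrite_graph

x = pt.tensor("x", shape=(None, 3))
# Reshape to x's own shape: one symbolic dim, one constant dim
y = x.reshape((x.shape[0], 3))

# Canonicalization should recognize the no-op and return `x` itself,
# dropping the Reshape entirely.
y_opt = rewrite_graph(y)
```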


@@ -910,9 +902,8 @@ def local_reshape_to_dimshuffle(fgraph, node):
-> DimShuffle{x,0,x,1,x,x}(Reshape(x, (m, n)))
"""
op = node.op
inp = node.inputs[0]
output = node.outputs[0]
output_shape = node.inputs[1]
inp, output_shape = node.inputs
[output] = node.outputs

dimshuffle_new_order = []
new_output_shape = []
@@ -944,7 +935,7 @@ def local_reshape_to_dimshuffle(fgraph, node):


@register_canonicalize
@register_stabilize
@register_specialize
@node_rewriter([Reshape])
def local_reshape_lift(fgraph, node):
"""
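And a rough sketch of the `local_reshape_to_dimshuffle` canonicalization touched above, following the shape pattern shown in its docstring (illustrative only, not from this commit):

```python
import pytensor.tensor as pt
from pytensor.graph import rewrite_graph

x = pt.matrix("x")
m, n = x.shape[0], x.shape[1]
y = x.reshape((1, m, 1, n, 1, 1))

# Canonicalization re-expresses the broadcastable dims as a DimShuffle,
# i.e. DimShuffle{x,0,x,1,x,x}(Reshape(x, (m, n))); the inner Reshape may
# then itself be removed as useless.
y_opt = rewrite_graph(y)
```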
31 changes: 29 additions & 2 deletions tests/tensor/rewriting/test_blockwise.py
@@ -1,14 +1,17 @@
from functools import partial

from pytensor import function
from pytensor.graph import FunctionGraph, rewrite_graph
import numpy as np

from pytensor import Mode, function
from pytensor.graph import FunctionGraph, rewrite_graph, vectorize_graph
from pytensor.graph.basic import equal_computations
from pytensor.scalar import log as scalar_log
from pytensor.tensor import add, alloc, matrix, tensor, tensor3
from pytensor.tensor.blockwise import Blockwise
from pytensor.tensor.elemwise import Elemwise
from pytensor.tensor.nlinalg import MatrixPinv
from pytensor.tensor.rewriting.blockwise import local_useless_blockwise
from pytensor.tensor.shape import Reshape


def test_useless_blockwise_of_elemwise():
@@ -118,3 +121,27 @@ def test_blockwise_alloc():
out = vector_add(x, alloc(y, 5))
expected_out = out
assert equal([rewrite(out)], [expected_out])


def test_blockwise_reshape():
x = tensor("x", shape=(None, None, None))
y = x.reshape([x.shape[0] * x.shape[1], -1])

new_x = tensor("x", shape=(None, None, None, None))
new_y = vectorize_graph(y, {x: new_x})
assert not isinstance(new_y.owner.op, Reshape)
assert isinstance(new_y.owner.op, Blockwise) and isinstance(
new_y.owner.op.core_op, Reshape
)

rewritten_y = rewrite_graph(
new_y, include=("canonicalize", "specialize"), clone=True
)
assert isinstance(rewritten_y.owner.op, Reshape)

no_rewrites = Mode(linker="py", optimizer=None)
test_x = np.arange(5 * 4 * 3 * 2).reshape(5, 4, 3, 2)
np.testing.assert_allclose(
new_y.eval({"x": test_x}, mode=no_rewrites),
rewritten_y.eval({"x": test_x}, mode=no_rewrites),
)