From 3c4f73de723bd7e95570e25a0070c40ea53b4e76 Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Wed, 17 Jul 2024 12:53:04 -0700 Subject: [PATCH 01/72] Add IfElse --- pytensor/link/pytorch/dispatch/basic.py | 13 +++++++++++++ tests/link/pytorch/test_basic.py | 20 +++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/pytensor/link/pytorch/dispatch/basic.py b/pytensor/link/pytorch/dispatch/basic.py index c71e1606bf..03dc8b2362 100644 --- a/pytensor/link/pytorch/dispatch/basic.py +++ b/pytensor/link/pytorch/dispatch/basic.py @@ -5,6 +5,7 @@ from pytensor.compile.ops import DeepCopyOp from pytensor.graph.fg import FunctionGraph +from pytensor.ifelse import IfElse from pytensor.link.utils import fgraph_to_python from pytensor.raise_op import CheckAndRaise from pytensor.tensor.basic import Alloc, AllocEmpty, ARange, Eye, Join, MakeVector @@ -124,6 +125,7 @@ def eye(N, M, k): return eye + @pytorch_funcify.register(MakeVector) def pytorch_funcify_MakeVector(op, **kwargs): torch_dtype = getattr(torch, op.dtype) @@ -132,3 +134,14 @@ def makevector(*x): return torch.tensor(x, dtype=torch_dtype) return makevector + + +@pytorch_funcify.register(IfElse) +def pytorch_funcify_IfElse(op, **kwargs): + n_outs = op.n_outs + assert n_outs == 1 + + def ifelse(cond, *args, n_outs=n_outs): + return torch.where(cond, *args) + + return ifelse diff --git a/tests/link/pytorch/test_basic.py b/tests/link/pytorch/test_basic.py index 27c1b1bd6a..d49ea1ab1e 100644 --- a/tests/link/pytorch/test_basic.py +++ b/tests/link/pytorch/test_basic.py @@ -11,7 +11,8 @@ from pytensor.configdefaults import config from pytensor.graph.basic import Apply from pytensor.graph.fg import FunctionGraph -from pytensor.graph.op import Op +from pytensor.graph.op import Op, get_test_value +from pytensor.ifelse import ifelse from pytensor.raise_op import CheckAndRaise from pytensor.tensor import alloc, arange, as_tensor, empty, eye from pytensor.tensor.type import matrix, scalar, vector @@ -301,3 +302,20 @@ def test_pytorch_MakeVector(): x_fg = FunctionGraph([], [x]) compare_pytorch_and_py(x_fg, []) + + +def test_pytorch_ifelse(): + true_vals = np.r_[1, 2, 3] + false_vals = np.r_[-1, -2, -3] + + x = ifelse(np.array(True), true_vals, false_vals) + x_fg = FunctionGraph([], [x]) + + compare_pytorch_and_py(x_fg, []) + + a = scalar("a") + a.tag.test_value = np.array(0.2, dtype=config.floatX) + x = ifelse(a < 0.5, true_vals, false_vals) + x_fg = FunctionGraph([a], [x]) # I.e. 
False + + compare_pytorch_and_py(x_fg, [get_test_value(i) for i in x_fg.inputs]) From bfb97eaea7bbe39b9ba72471fa46a429e7402d03 Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Wed, 17 Jul 2024 13:14:35 -0700 Subject: [PATCH 02/72] Remove space --- pytensor/link/pytorch/dispatch/basic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytensor/link/pytorch/dispatch/basic.py b/pytensor/link/pytorch/dispatch/basic.py index 03dc8b2362..0039406907 100644 --- a/pytensor/link/pytorch/dispatch/basic.py +++ b/pytensor/link/pytorch/dispatch/basic.py @@ -125,7 +125,6 @@ def eye(N, M, k): return eye - @pytorch_funcify.register(MakeVector) def pytorch_funcify_MakeVector(op, **kwargs): torch_dtype = getattr(torch, op.dtype) From 6ad1c5cf5b710be0fe2e22371761ec6dfe107f7b Mon Sep 17 00:00:00 2001 From: Pham Nguyen Hung <97870091+HangenYuu@users.noreply.github.com> Date: Thu, 18 Jul 2024 17:58:59 +0700 Subject: [PATCH 03/72] Implement Dot and BatchedDot in PyTensor (#878) --- pytensor/link/__init__.py | 1 + pytensor/link/pytorch/dispatch/__init__.py | 5 +++- pytensor/link/pytorch/dispatch/blas.py | 14 +++++++++++ pytensor/link/pytorch/dispatch/math.py | 12 +++++++++ tests/link/pytorch/test_blas.py | 24 ++++++++++++++++++ tests/link/pytorch/test_math.py | 29 ++++++++++++++++++++++ 6 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 pytensor/link/pytorch/dispatch/blas.py create mode 100644 pytensor/link/pytorch/dispatch/math.py create mode 100644 tests/link/pytorch/test_blas.py create mode 100644 tests/link/pytorch/test_math.py diff --git a/pytensor/link/__init__.py b/pytensor/link/__init__.py index e69de29bb2..c8c236a854 100644 --- a/pytensor/link/__init__.py +++ b/pytensor/link/__init__.py @@ -0,0 +1 @@ +from pytensor.link.pytorch.linker import PytorchLinker diff --git a/pytensor/link/pytorch/dispatch/__init__.py b/pytensor/link/pytorch/dispatch/__init__.py index 017e57df64..fa47908d74 100644 --- a/pytensor/link/pytorch/dispatch/__init__.py +++ b/pytensor/link/pytorch/dispatch/__init__.py @@ -2,9 +2,12 @@ from pytensor.link.pytorch.dispatch.basic import pytorch_funcify, pytorch_typify # # Load dispatch specializations +import pytensor.link.pytorch.dispatch.blas import pytensor.link.pytorch.dispatch.scalar import pytensor.link.pytorch.dispatch.elemwise +import pytensor.link.pytorch.dispatch.math import pytensor.link.pytorch.dispatch.extra_ops -import pytensor.link.pytorch.dispatch.sort import pytensor.link.pytorch.dispatch.shape +import pytensor.link.pytorch.dispatch.sort + # isort: on diff --git a/pytensor/link/pytorch/dispatch/blas.py b/pytensor/link/pytorch/dispatch/blas.py new file mode 100644 index 0000000000..5691551998 --- /dev/null +++ b/pytensor/link/pytorch/dispatch/blas.py @@ -0,0 +1,14 @@ +import torch + +from pytensor.link.pytorch.dispatch import pytorch_funcify +from pytensor.tensor.blas import BatchedDot + + +@pytorch_funcify.register(BatchedDot) +def pytorch_funcify_BatchedDot(op, **kwargs): + def batched_dot(a, b): + if a.shape[0] != b.shape[0]: + raise TypeError("Shapes must match in the 0-th dimension") + return torch.bmm(a, b) + + return batched_dot diff --git a/pytensor/link/pytorch/dispatch/math.py b/pytensor/link/pytorch/dispatch/math.py new file mode 100644 index 0000000000..4275424f0a --- /dev/null +++ b/pytensor/link/pytorch/dispatch/math.py @@ -0,0 +1,12 @@ +import torch + +from pytensor.link.pytorch.dispatch import pytorch_funcify +from pytensor.tensor.math import Dot + + +@pytorch_funcify.register(Dot) +def pytorch_funcify_Dot(op, **kwargs): + def dot(x, y): + 
return torch.matmul(x, y) + + return dot diff --git a/tests/link/pytorch/test_blas.py b/tests/link/pytorch/test_blas.py new file mode 100644 index 0000000000..35f7dd7b6a --- /dev/null +++ b/tests/link/pytorch/test_blas.py @@ -0,0 +1,24 @@ +import numpy as np +import pytest + +from pytensor.configdefaults import config +from pytensor.graph.fg import FunctionGraph +from pytensor.tensor import blas as pt_blas +from pytensor.tensor.type import tensor3 +from tests.link.pytorch.test_basic import compare_pytorch_and_py + + +def test_pytorch_BatchedDot(): + # tensor3 . tensor3 + a = tensor3("a") + a_test = np.linspace(-1, 1, 10 * 5 * 3).astype(config.floatX).reshape((10, 5, 3)) + b = tensor3("b") + b_test = np.linspace(1, -1, 10 * 3 * 2).astype(config.floatX).reshape((10, 3, 2)) + out = pt_blas.BatchedDot()(a, b) + fgraph = FunctionGraph([a, b], [out]) + pytensor_pytorch_fn, _ = compare_pytorch_and_py(fgraph, [a_test, b_test]) + + # A dimension mismatch should raise a TypeError for compatibility + inputs = [a_test[:-1], b_test] + with pytest.raises(TypeError): + pytensor_pytorch_fn(*inputs) diff --git a/tests/link/pytorch/test_math.py b/tests/link/pytorch/test_math.py new file mode 100644 index 0000000000..affca4ad32 --- /dev/null +++ b/tests/link/pytorch/test_math.py @@ -0,0 +1,29 @@ +import numpy as np + +from pytensor.configdefaults import config +from pytensor.graph.fg import FunctionGraph +from pytensor.tensor.type import matrix, scalar, vector +from tests.link.pytorch.test_basic import compare_pytorch_and_py + + +def test_pytorch_dot(): + y = vector("y") + y_test = np.r_[1.0, 2.0].astype(config.floatX) + x = vector("x") + x_test = np.r_[3.0, 4.0].astype(config.floatX) + A = matrix("A") + A_test = np.array([[6, 3], [3, 0]], dtype=config.floatX) + alpha = scalar("alpha") + alpha_test = np.array(3.0, dtype=config.floatX) + beta = scalar("beta") + beta_test = np.array(5.0, dtype=config.floatX) + + # 2D * 2D + out = A.dot(A * alpha) + beta * A + fgraph = FunctionGraph([A, alpha, beta], [out]) + compare_pytorch_and_py(fgraph, [A_test, alpha_test, beta_test]) + + # 1D * 2D and 1D * 1D + out = y.dot(alpha * A).dot(x) + beta * y + fgraph = FunctionGraph([y, x, A, alpha, beta], [out]) + compare_pytorch_and_py(fgraph, [y_test, x_test, A_test, alpha_test, beta_test]) From cac9febfdae5387ea9272bc7b73d4d9149a01e63 Mon Sep 17 00:00:00 2001 From: Jesse Grabowski <48652735+jessegrabowski@users.noreply.github.com> Date: Thu, 18 Jul 2024 09:08:56 -0500 Subject: [PATCH 04/72] Add `OpFromGraph` wrapper around `alloc_diag` (#915) * Add `OpFromGraph` wrapper around `alloc_diag` * Remove depreciated `AllocDiag` `Op`, rename `AllocDiag2 -> AllocDiag` * Set `inline = False` * Add rewrite to inline all `OpFromGraph` `Op`s * Add `is_zero_offset` helper to `Eye` * Add `is_left_expand_dims` and `is_right_expand_dims` attributes to `DimShuffle` * Seed `test_local_lift_through_linalg` test --- pytensor/compile/builders.py | 32 +----- pytensor/link/jax/dispatch/basic.py | 24 +++++ pytensor/tensor/basic.py | 141 +++++++++----------------- pytensor/tensor/elemwise.py | 8 ++ pytensor/tensor/rewriting/__init__.py | 1 + pytensor/tensor/rewriting/linalg.py | 119 ++++++++++++++-------- pytensor/tensor/rewriting/ofg.py | 68 +++++++++++++ tests/link/jax/test_basic.py | 19 +++- tests/tensor/rewriting/test_linalg.py | 17 +++- tests/tensor/rewriting/test_ofg.py | 22 ++++ 10 files changed, 278 insertions(+), 173 deletions(-) create mode 100644 pytensor/tensor/rewriting/ofg.py create mode 100644 tests/tensor/rewriting/test_ofg.py diff 
--git a/pytensor/compile/builders.py b/pytensor/compile/builders.py index 91588a5ecc..759c9b09bb 100644 --- a/pytensor/compile/builders.py +++ b/pytensor/compile/builders.py @@ -8,7 +8,6 @@ from pytensor.compile.function import function from pytensor.compile.function.pfunc import rebuild_collect_shared -from pytensor.compile.mode import optdb from pytensor.compile.sharedvalue import SharedVariable from pytensor.configdefaults import config from pytensor.gradient import DisconnectedType, Rop, grad @@ -24,7 +23,6 @@ from pytensor.graph.null_type import NullType from pytensor.graph.op import HasInnerGraph, Op from pytensor.graph.replace import clone_replace -from pytensor.graph.rewriting.basic import in2out, node_rewriter from pytensor.graph.utils import MissingInputError @@ -575,7 +573,7 @@ def lop_overrides(inps, grads): for inp_grad in input_grads if not isinstance(inp_grad.type, DisconnectedType | NullType) ] - lop_op = type(self)( + lop_op = OpFromGraph( inputs=inner_inputs + connected_inner_outputs + connected_output_grads, outputs=connected_input_grads, inline=self.is_inline, @@ -669,7 +667,7 @@ def _build_and_cache_rop_op(self): for out_grad in output_grads if not isinstance(out_grad.type, DisconnectedType | NullType) ] - rop_op = type(self)( + rop_op = OpFromGraph( inputs=inner_inputs + eval_points, outputs=filtered_output_grads, inline=self.is_inline, @@ -852,29 +850,3 @@ def perform(self, node, inputs, outputs): assert len(variables) == len(outputs) for output, variable in zip(outputs, variables): output[0] = variable - - -@node_rewriter([OpFromGraph]) -def inline_ofg_expansion(fgraph, node): - """ - This optimization expands internal graph of OpFromGraph. - Only performed if node.op.is_inline == True - Doing so can improve optimization at the cost of compilation speed. - """ - op = node.op - if not isinstance(op, OpFromGraph): - return False - if not op.is_inline: - return False - return clone_replace(op.inner_outputs, dict(zip(op.inner_inputs, node.inputs))) - - -# We want to run this before the first merge optimizer -# and before the first scan optimizer. 
-optdb.register( - "inline_ofg_expansion", - in2out(inline_ofg_expansion), - "fast_compile", - "fast_run", - position=-0.01, -) diff --git a/pytensor/link/jax/dispatch/basic.py b/pytensor/link/jax/dispatch/basic.py index b35759f837..bd559ee716 100644 --- a/pytensor/link/jax/dispatch/basic.py +++ b/pytensor/link/jax/dispatch/basic.py @@ -1,10 +1,13 @@ import warnings +from collections.abc import Callable from functools import singledispatch import jax import jax.numpy as jnp import numpy as np +from pytensor.compile import JAX +from pytensor.compile.builders import OpFromGraph from pytensor.compile.ops import DeepCopyOp, ViewOp from pytensor.configdefaults import config from pytensor.graph.fg import FunctionGraph @@ -114,3 +117,24 @@ def viewop(x): return x return viewop + + +@jax_funcify.register(OpFromGraph) +def jax_funcify_OpFromGraph(ofg: OpFromGraph, node=None, **kwargs) -> Callable: + _ = kwargs.pop("storage_map", None) + + # Apply inner rewrites + JAX.optimizer(ofg.fgraph) + fgraph_fn = jax_funcify(ofg.fgraph, **kwargs) + + if len(ofg.fgraph.outputs) == 1: + + def opfromgraph(*inputs): + return fgraph_fn(*inputs)[0] + + else: + + def opfromgraph(*inputs): + return fgraph_fn(*inputs) + + return opfromgraph diff --git a/pytensor/tensor/basic.py b/pytensor/tensor/basic.py index 014ae80e4c..119c44c647 100644 --- a/pytensor/tensor/basic.py +++ b/pytensor/tensor/basic.py @@ -21,6 +21,7 @@ import pytensor.scalar.sharedvar from pytensor import compile, config, printing from pytensor import scalar as ps +from pytensor.compile.builders import OpFromGraph from pytensor.gradient import DisconnectedType, grad_undefined from pytensor.graph import RewriteDatabaseQuery from pytensor.graph.basic import Apply, Constant, Variable, equal_computations @@ -1334,6 +1335,25 @@ def infer_shape(self, fgraph, node, in_shapes): def grad(self, inp, grads): return [grad_undefined(self, i, inp[i]) for i in range(3)] + @staticmethod + def is_offset_zero(node) -> bool: + """ + Test if an Eye Op has a diagonal offset of zero + + Parameters + ---------- + node + Eye node to test + + Returns + ------- + is_offset_zero: bool + True if the offset is zero (``k = 0``). + """ + + offset = node.inputs[-1] + return isinstance(offset, Constant) and offset.data.item() == 0 + def eye(n, m=None, k=0, dtype=None): """Return a 2-D array with ones on the diagonal and zeros elsewhere. @@ -3749,109 +3769,37 @@ def trace(a, offset=0, axis1=0, axis2=1): return diagonal(a, offset=offset, axis1=axis1, axis2=axis2).sum(-1) -class AllocDiag(Op): - """An `Op` that copies a vector to the diagonal of a zero-ed matrix.""" +class AllocDiag(OpFromGraph): + """ + Wrapper Op for alloc_diag graphs + """ - __props__ = ("offset", "axis1", "axis2") + __props__ = ("axis1", "axis2") - def __init__(self, offset=0, axis1=0, axis2=1): - """ - Parameters - ---------- - offset: int - Offset of the diagonal from the main diagonal defined by `axis1` - and `axis2`. Can be positive or negative. Defaults to main - diagonal (i.e. 0). - axis1: int - Axis to be used as the first axis of the 2-D sub-arrays to which - the diagonals will be allocated. Defaults to first axis (i.e. 0). - axis2: int - Axis to be used as the second axis of the 2-D sub-arrays to which - the diagonals will be allocated. Defaults to second axis (i.e. 1). - """ - warnings.warn( - "AllocDiag is deprecated. 
Use `alloc_diag` instead", - FutureWarning, - ) - self.offset = offset - if axis1 < 0 or axis2 < 0: - raise NotImplementedError("AllocDiag does not support negative axis") - if axis1 == axis2: - raise ValueError("axis1 and axis2 cannot be the same") + def __init__(self, *args, axis1, axis2, offset, **kwargs): self.axis1 = axis1 self.axis2 = axis2 + self.offset = offset - def make_node(self, diag): - diag = as_tensor_variable(diag) - if diag.type.ndim < 1: - raise ValueError( - "AllocDiag needs an input with 1 or more dimensions", diag.type - ) - return Apply( - self, - [diag], - [diag.type.clone(shape=(None,) * (diag.ndim + 1))()], - ) - - def perform(self, node, inputs, outputs): - (x,) = inputs - (z,) = outputs - - axis1 = np.minimum(self.axis1, self.axis2) - axis2 = np.maximum(self.axis1, self.axis2) - offset = self.offset - - # Create array with one extra dimension for resulting matrix - result_shape = x.shape[:-1] + (x.shape[-1] + abs(offset),) * 2 - result = np.zeros(result_shape, dtype=x.dtype) - - # Create slice for diagonal in final 2 axes - idxs = np.arange(x.shape[-1]) - diagonal_slice = (len(result_shape) - 2) * [slice(None)] + [ - idxs + np.maximum(0, -offset), - idxs + np.maximum(0, offset), - ] - - # Fill in final 2 axes with x - result[tuple(diagonal_slice)] = x - - if len(x.shape) > 1: - # Re-order axes so they correspond to diagonals at axis1, axis2 - axes = list(range(len(x.shape[:-1]))) - last_idx = axes[-1] - axes = axes[:axis1] + [last_idx + 1] + axes[axis1:] - axes = axes[:axis2] + [last_idx + 2] + axes[axis2:] - result = result.transpose(axes) - - z[0] = result - - def grad(self, inputs, gout): - (gz,) = gout - return [diagonal(gz, offset=self.offset, axis1=self.axis1, axis2=self.axis2)] - - def infer_shape(self, fgraph, nodes, shapes): - (x_shape,) = shapes - axis1 = np.minimum(self.axis1, self.axis2) - axis2 = np.maximum(self.axis1, self.axis2) + super().__init__(*args, **kwargs, strict=True) - result_shape = list(x_shape[:-1]) - diag_shape = x_shape[-1] + abs(self.offset) - result_shape = result_shape[:axis1] + [diag_shape] + result_shape[axis1:] - result_shape = result_shape[:axis2] + [diag_shape] + result_shape[axis2:] - return [tuple(result_shape)] + @staticmethod + def is_offset_zero(node) -> bool: + """ + Test if an AllocDiag Op has a diagonal offset of zero - def __setstate__(self, state): - if "view_map" in state: - del state["view_map"] + Parameters + ---------- + node + AllocDiag node to test - self.__dict__.update(state) + Returns + ------- + is_offset_zero: bool + True if the offset is zero (``k = 0``). 
+ """ - if "offset" not in state: - self.offset = 0 - if "axis1" not in state: - self.axis1 = 0 - if "axis2" not in state: - self.axis2 = 1 + return node.op.offset == 0 def alloc_diag(diag, offset=0, axis1=0, axis2=1): @@ -3862,6 +3810,7 @@ def alloc_diag(diag, offset=0, axis1=0, axis2=1): from pytensor.tensor import set_subtensor diag = as_tensor_variable(diag) + axis1, axis2 = normalize_axis_tuple((axis1, axis2), ndim=diag.type.ndim + 1) if axis1 > axis2: axis1, axis2 = axis2, axis1 @@ -3888,7 +3837,9 @@ def alloc_diag(diag, offset=0, axis1=0, axis2=1): axes = axes[:axis2] + [last_idx + 2] + axes[axis2:] result = result.transpose(axes) - return result + return AllocDiag( + inputs=[diag], outputs=[result], axis1=axis1, axis2=axis2, offset=offset + )(diag) def diag(v, k=0): diff --git a/pytensor/tensor/elemwise.py b/pytensor/tensor/elemwise.py index 971be19f46..de966f1a78 100644 --- a/pytensor/tensor/elemwise.py +++ b/pytensor/tensor/elemwise.py @@ -185,6 +185,14 @@ def __init__(self, input_broadcastable, new_order): self.augment = sorted(i for i, x in enumerate(new_order) if x == "x") self.drop = drop + input_ndim = len(input_broadcastable) + self.is_left_expand_dims = self.augment and ( + input_ndim == 0 or new_order[-input_ndim:] == list(range(input_ndim)) + ) + self.is_right_expand_dims = self.augment and new_order[:input_ndim] == list( + range(input_ndim) + ) + if self.inplace: self.view_map = {0: [0]} diff --git a/pytensor/tensor/rewriting/__init__.py b/pytensor/tensor/rewriting/__init__.py index 617eab04fa..168b636041 100644 --- a/pytensor/tensor/rewriting/__init__.py +++ b/pytensor/tensor/rewriting/__init__.py @@ -10,6 +10,7 @@ import pytensor.tensor.rewriting.jax import pytensor.tensor.rewriting.linalg import pytensor.tensor.rewriting.math +import pytensor.tensor.rewriting.ofg import pytensor.tensor.rewriting.shape import pytensor.tensor.rewriting.special import pytensor.tensor.rewriting.subtensor diff --git a/pytensor/tensor/rewriting/linalg.py b/pytensor/tensor/rewriting/linalg.py index 38ed9a51d5..5f2e8cf388 100644 --- a/pytensor/tensor/rewriting/linalg.py +++ b/pytensor/tensor/rewriting/linalg.py @@ -5,12 +5,16 @@ from pytensor import Variable from pytensor.graph import Apply, FunctionGraph from pytensor.graph.rewriting.basic import ( - PatternNodeRewriter, copy_stack_trace, node_rewriter, ) from pytensor.scalar.basic import Mul -from pytensor.tensor.basic import ARange, Eye, TensorVariable, alloc, diagonal +from pytensor.tensor.basic import ( + AllocDiag, + Eye, + TensorVariable, + diagonal, +) from pytensor.tensor.blas import Dot22 from pytensor.tensor.blockwise import Blockwise from pytensor.tensor.elemwise import DimShuffle, Elemwise @@ -41,7 +45,6 @@ solve, solve_triangular, ) -from pytensor.tensor.subtensor import advanced_set_subtensor logger = logging.getLogger(__name__) @@ -402,30 +405,68 @@ def _find_diag_from_eye_mul(potential_mul_input): eye_input = [ mul_input for mul_input in inputs_to_mul - if mul_input.owner and isinstance(mul_input.owner.op, Eye) + if mul_input.owner + and ( + isinstance(mul_input.owner.op, Eye) + or + # This whole condition checks if there is an Eye hiding inside a DimShuffle. + # This arises from batched elementwise multiplication between a tensor and an eye, e.g.: + # tensor(shape=(None, 3, 3) * eye(3). This is still potentially valid for diag rewrites. 
+ ( + isinstance(mul_input.owner.op, DimShuffle) + and ( + mul_input.owner.op.is_left_expand_dims + or mul_input.owner.op.is_right_expand_dims + ) + and mul_input.owner.inputs[0].owner is not None + and isinstance(mul_input.owner.inputs[0].owner.op, Eye) + ) + ) ] - # Check if 1's are being put on the main diagonal only (k = 0) - if eye_input and getattr(eye_input[0].owner.inputs[-1], "data", -1).item() != 0: + if not eye_input: return None - # If the broadcast pattern of eye_input is not (False, False), we do not get a diagonal matrix and thus, dont need to apply the rewrite - if eye_input and eye_input[0].broadcastable[-2:] != (False, False): + eye_input = eye_input[0] + # If eye_input is an Eye Op (it's not wrapped in a DimShuffle), check it doesn't have an offset + if isinstance(eye_input.owner.op, Eye) and ( + not Eye.is_offset_zero(eye_input.owner) + or eye_input.broadcastable[-2:] != (False, False) + ): return None + # Otherwise, an Eye was found but it is wrapped in a DimShuffle (i.e. there was some broadcasting going on). + # We have to look inside DimShuffle to decide if the rewrite can be applied + if isinstance(eye_input.owner.op, DimShuffle) and ( + eye_input.owner.op.is_left_expand_dims + or eye_input.owner.op.is_right_expand_dims + ): + inner_eye = eye_input.owner.inputs[0] + # We can only rewrite when the Eye is on the main diagonal (the offset is zero) and the identity isn't + # degenerate + if not Eye.is_offset_zero(inner_eye.owner) or inner_eye.broadcastable[-2:] != ( + False, + False, + ): + return None + # Get all non Eye inputs (scalars/matrices/vectors) - non_eye_inputs = list(set(inputs_to_mul) - set(eye_input)) + non_eye_inputs = list(set(inputs_to_mul) - {eye_input}) return eye_input, non_eye_inputs @register_canonicalize("shape_unsafe") @register_stabilize("shape_unsafe") @node_rewriter([det]) -def rewrite_det_diag_from_eye_mul(fgraph, node): +def rewrite_det_diag_to_prod_diag(fgraph, node): """ - This rewrite takes advantage of the fact that for a diagonal matrix, the determinant value is the product of its diagonal elements. + This rewrite takes advantage of the fact that for a diagonal matrix, the determinant value is the product of its + diagonal elements. - The presence of a diagonal matrix is detected by inspecting the graph. This rewrite can identify diagonal matrices that arise as the result of elementwise multiplication with an identity matrix. Specialized computation is used to make this rewrite as efficient as possible, depending on whether the multiplication was with a scalar, vector or a matrix. + The presence of a diagonal matrix is detected by inspecting the graph. This rewrite can identify diagonal matrices + that arise as the result of elementwise multiplication with an identity matrix. Specialized computation is used to + make this rewrite as efficient as possible, depending on whether the multiplication was with a scalar, + vector or a matrix. 
Parameters ---------- @@ -439,53 +480,45 @@ def rewrite_det_diag_from_eye_mul(fgraph, node): list of Variable, optional List of optimized variables, or None if no optimization was performed """ - potential_mul_input = node.inputs[0] - eye_non_eye_inputs = _find_diag_from_eye_mul(potential_mul_input) - if eye_non_eye_inputs is None: + inputs = node.inputs[0] + + # Check for use of pt.diag first + if ( + inputs.owner + and isinstance(inputs.owner.op, AllocDiag) + and AllocDiag.is_offset_zero(inputs.owner) + ): + diag_input = inputs.owner.inputs[0] + det_val = diag_input.prod(axis=-1) + return [det_val] + + # Check if the input is an elemwise multiply with identity matrix -- this also results in a diagonal matrix + inputs_or_none = _find_diag_from_eye_mul(inputs) + if inputs_or_none is None: return None - eye_input, non_eye_inputs = eye_non_eye_inputs + + eye_input, non_eye_inputs = inputs_or_none # Dealing with only one other input if len(non_eye_inputs) != 1: return None - useful_eye, useful_non_eye = eye_input[0], non_eye_inputs[0] + eye_input, non_eye_input = eye_input[0], non_eye_inputs[0] # Checking if original x was scalar/vector/matrix - if useful_non_eye.type.broadcastable[-2:] == (True, True): + if non_eye_input.type.broadcastable[-2:] == (True, True): # For scalar - det_val = useful_non_eye.squeeze(axis=(-1, -2)) ** (useful_eye.shape[0]) - elif useful_non_eye.type.broadcastable[-2:] == (False, False): + det_val = non_eye_input.squeeze(axis=(-1, -2)) ** (eye_input.shape[0]) + elif non_eye_input.type.broadcastable[-2:] == (False, False): # For Matrix - det_val = useful_non_eye.diagonal(axis1=-1, axis2=-2).prod(axis=-1) + det_val = non_eye_input.diagonal(axis1=-1, axis2=-2).prod(axis=-1) else: # For vector - det_val = useful_non_eye.prod(axis=(-1, -2)) + det_val = non_eye_input.prod(axis=(-1, -2)) det_val = det_val.astype(node.outputs[0].type.dtype) return [det_val] -arange = ARange("int64") -det_diag_from_diag = PatternNodeRewriter( - ( - det, - ( - advanced_set_subtensor, - (alloc, 0, "sh1", "sh2"), - "x", - (arange, 0, "stop", 1), - (arange, 0, "stop", 1), - ), - ), - (prod, "x"), - name="det_diag_from_diag", - allow_multiple_clients=True, -) -register_canonicalize(det_diag_from_diag) -register_stabilize(det_diag_from_diag) -register_specialize(det_diag_from_diag) - - @register_canonicalize @register_stabilize @register_specialize diff --git a/pytensor/tensor/rewriting/ofg.py b/pytensor/tensor/rewriting/ofg.py new file mode 100644 index 0000000000..265f3ff2e8 --- /dev/null +++ b/pytensor/tensor/rewriting/ofg.py @@ -0,0 +1,68 @@ +from pytensor import clone_replace +from pytensor.compile import optdb +from pytensor.compile.builders import OpFromGraph +from pytensor.graph import node_rewriter +from pytensor.graph.rewriting.basic import copy_stack_trace, in2out +from pytensor.tensor.basic import AllocDiag +from pytensor.tensor.rewriting.basic import register_specialize + + +@node_rewriter([OpFromGraph]) +def inline_ofg_expansion(fgraph, node): + """ + This optimization expands internal graph of OpFromGraph. + Only performed if node.op.is_inline == True + Doing so can improve optimization at the cost of compilation speed. + """ + op = node.op + if not op.is_inline: + return False + + new_out = clone_replace(op.inner_outputs, dict(zip(op.inner_inputs, node.inputs))) + copy_stack_trace(op.inner_outputs, new_out) + + return new_out + + +# We want to run this before the first merge optimizer +# and before the first scan optimizer. 
+optdb.register( + "inline_ofg_expansion", + in2out(inline_ofg_expansion), + "fast_compile", + "fast_run", + position=-0.01, +) + + +@register_specialize("inline_ofg") +@node_rewriter([AllocDiag]) +def late_inline_OpFromGraph(fgraph, node): + """ + Inline `OpFromGraph` nodes. + + OpFromGraph nodes are used to compactly represent the output of a function graph. Certain `Ops`, like, einsum, + diag, and kron, are implemented using pytensor `Op`s. As a result, their outputs are not a single `Op`, but a + graph. To allow rewrites to easily spot and manipulate these "composite functions", we use the `OpFromGraph` node. + This node is a thin wrapper around the output graph. It is not, however, meant to be included in the final + program, because it hides the inner graph from certain optimizations. + + This rewrite specifies that all `OpFromGraph` nodes should be replaced by their inner graphs by setting the + `inplace=True` flag. + + Parameters + ---------- + fgraph: FunctionGraph + The function graph being rewritten + node: Apply + Node of the function graph to be optimized + + Returns + ------- + + """ + op = node.op + new_out = clone_replace(op.inner_outputs, dict(zip(op.inner_inputs, node.inputs))) + copy_stack_trace(op.inner_outputs, new_out) + + return new_out diff --git a/tests/link/jax/test_basic.py b/tests/link/jax/test_basic.py index 76c8b4b329..5cd2bd54c6 100644 --- a/tests/link/jax/test_basic.py +++ b/tests/link/jax/test_basic.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pytensor.compile.builders import OpFromGraph from pytensor.compile.function import function from pytensor.compile.mode import get_mode from pytensor.compile.sharedvalue import SharedVariable, shared @@ -13,7 +14,7 @@ from pytensor.graph.op import Op, get_test_value from pytensor.ifelse import ifelse from pytensor.raise_op import assert_op -from pytensor.tensor.type import dscalar, scalar, vector +from pytensor.tensor.type import dscalar, matrices, scalar, vector @pytest.fixture(scope="module", autouse=True) @@ -209,3 +210,19 @@ def test_jax_checkandraise(): def set_test_value(x, v): x.tag.test_value = v return x + + +def test_OpFromGraph(): + x, y, z = matrices("xyz") + ofg_1 = OpFromGraph([x, y], [x + y], inline=False) + ofg_2 = OpFromGraph([x, y], [x * y, x - y], inline=False) + + o1, o2 = ofg_2(y, z) + out = ofg_1(x, o1) + o2 + out_fg = FunctionGraph([x, y, z], [out]) + + xv = np.ones((2, 2), dtype=config.floatX) + yv = np.ones((2, 2), dtype=config.floatX) * 3 + zv = np.ones((2, 2), dtype=config.floatX) * 5 + + compare_jax_and_py(out_fg, [xv, yv, zv]) diff --git a/tests/tensor/rewriting/test_linalg.py b/tests/tensor/rewriting/test_linalg.py index d59e3cc88f..0bc064fe65 100644 --- a/tests/tensor/rewriting/test_linalg.py +++ b/tests/tensor/rewriting/test_linalg.py @@ -362,6 +362,8 @@ def test_invalid_batched_a(self): ids=["block_diag", "kron"], ) def test_local_lift_through_linalg(constructor, f_op, f, g_op, g): + rng = np.random.default_rng(sum(map(ord, "lift_through_linalg"))) + if pytensor.config.floatX.endswith("32"): pytest.skip("Test is flaky at half precision") @@ -371,6 +373,7 @@ def test_local_lift_through_linalg(constructor, f_op, f, g_op, g): f1 = pytensor.function( [A, B], X, mode=get_default_mode().including("local_lift_through_linalg") ) + f2 = pytensor.function( [A, B], X, mode=get_default_mode().excluding("local_lift_through_linalg") ) @@ -386,9 +389,7 @@ def test_local_lift_through_linalg(constructor, f_op, f, g_op, g): assert len(f_ops) == 2 assert len(g_ops) == 1 - test_vals = [ - 
np.random.normal(size=(3,) * A.ndim).astype(config.floatX) for _ in range(2) - ] + test_vals = [rng.normal(size=(3,) * A.ndim).astype(config.floatX) for _ in range(2)] test_vals = [x @ np.swapaxes(x, -1, -2) for x in test_vals] np.testing.assert_allclose(f1(*test_vals), f2(*test_vals), atol=1e-8) @@ -403,13 +404,18 @@ def test_det_diag_from_eye_mul(shape): # Initializing x based on scalar/vector/matrix x = pt.tensor("x", shape=shape) y = pt.eye(7) * x + # Calculating determinant value using pt.linalg.det z_det = pt.linalg.det(y) # REWRITE TEST f_rewritten = function([x], z_det, mode="FAST_RUN") nodes = f_rewritten.maker.fgraph.apply_nodes - assert not any(isinstance(node.op, Det) for node in nodes) + + assert not any( + isinstance(node.op, Det) or isinstance(getattr(node.op, "core_op", None), Det) + for node in nodes + ) # NUMERIC VALUE TEST if len(shape) == 0: @@ -418,6 +424,7 @@ def test_det_diag_from_eye_mul(shape): x_test = np.random.rand(*shape).astype(config.floatX) else: x_test = np.random.rand(*shape).astype(config.floatX) + x_test_matrix = np.eye(7) * x_test det_val = np.linalg.det(x_test_matrix) rewritten_val = f_rewritten(x_test) @@ -459,6 +466,7 @@ def test_dont_apply_det_diag_rewrite_for_1_1(): x_diag = pt.eye(1, 1) * x y = pt.linalg.det(x_diag) f_rewritten = function([x], y, mode="FAST_RUN") + nodes = f_rewritten.maker.fgraph.apply_nodes assert any(isinstance(node.op, Det) for node in nodes) @@ -468,6 +476,7 @@ def test_dont_apply_det_diag_rewrite_for_1_1(): x_test_matrix = np.eye(1, 1) * x_test det_val = np.linalg.det(x_test_matrix) rewritten_val = f_rewritten(x_test) + assert_allclose( det_val, rewritten_val, diff --git a/tests/tensor/rewriting/test_ofg.py b/tests/tensor/rewriting/test_ofg.py new file mode 100644 index 0000000000..6304939562 --- /dev/null +++ b/tests/tensor/rewriting/test_ofg.py @@ -0,0 +1,22 @@ +import pytest + +import pytensor +import pytensor.tensor as pt +from pytensor import config +from pytensor.compile.builders import OpFromGraph + + +@pytest.mark.skipif( + config.mode == "FAST_COMPILE", + reason="Rewrite is not applied in FAST_COMPILE mode", +) +def test_alloc_diag_inlined(): + x = pt.tensor("x", shape=(None,)) + + z = pt.diag(x) + assert isinstance(z.owner.op, OpFromGraph) + + f = pytensor.function([x], z) + nodes = f.maker.fgraph.apply_nodes + + assert not any(isinstance(node.op, OpFromGraph) for node in nodes) From ad27dc752798dfb75ed8d1c0a9b840859a8a898d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 18:15:50 +0200 Subject: [PATCH 05/72] Bump actions/upload-artifact from 3 to 4 (#560) * Bump actions/upload-artifact from 3 to 4 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 3 to 4. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] * Migrate pypi to GHA artifacts v4 * Bump download-artifact to v4 * Eliminate undefined matrix.python-version variable * Upload/download each platform separately * Use pattern arg to download-artifact --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Ben Mares --- .github/workflows/pypi.yml | 41 ++++++++++++++++++++++++++------------ .github/workflows/test.yml | 9 +++++---- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index d129c0c32a..ca37e422d0 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -30,12 +30,13 @@ jobs: - name: Build SDist run: pipx run build --sdist - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: + name: sdist path: dist/*.tar.gz build_wheels: - name: Build ${{ matrix.python-version }} wheels on ${{ matrix.platform }} + name: Build wheels for ${{ matrix.platform }} runs-on: ${{ matrix.platform }} strategy: matrix: @@ -51,8 +52,9 @@ jobs: - name: Build wheels uses: pypa/cibuildwheel@v2.19.2 - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: + name: wheels-${{ matrix.platform }} path: ./wheelhouse/*.whl check_dist: @@ -60,10 +62,17 @@ jobs: needs: [make_sdist,build_wheels] runs-on: ubuntu-22.04 steps: - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: - name: artifact + name: sdist path: dist + + - uses: actions/download-artifact@v4 + with: + pattern: wheels-* + path: dist + merge-multiple: true + - name: Check SDist run: | mkdir -p test-sdist @@ -83,12 +92,18 @@ jobs: runs-on: ubuntu-latest if: github.event_name == 'release' && github.event.action == 'published' steps: - - uses: actions/download-artifact@v3 - with: - name: artifact - path: dist + - uses: actions/download-artifact@v4 + with: + name: sdist + path: dist - - uses: pypa/gh-action-pypi-publish@v1.9.0 - with: - user: __token__ - password: ${{ secrets.pypi_password }} + - uses: actions/download-artifact@v4 + with: + pattern: wheels-* + path: dist + merge-multiple: true + + - uses: pypa/gh-action-pypi-publish@v1.9.0 + with: + user: __token__ + password: ${{ secrets.pypi_password }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index eb36a61386..674bc52c7b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -187,9 +187,9 @@ jobs: FLOAT32: ${{ matrix.float32 }} - name: Upload coverage file - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: coverage + name: coverage-${{ steps.matrix-id.outputs.id }} path: coverage/coverage-${{ steps.matrix-id.outputs.id }}.xml benchmarks: @@ -273,10 +273,11 @@ jobs: python -m pip install -U coverage>=5.1 coveralls - name: Download coverage file - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: - name: coverage + pattern: coverage-* path: coverage + merge-multiple: true - name: Upload coverage to Codecov uses: codecov/codecov-action@v4 From f489cf4bd712c6878e4dc970a650136abab21473 Mon Sep 17 00:00:00 2001 From: Tanish Date: Fri, 19 Jul 2024 16:20:37 +0530 Subject: [PATCH 06/72] Added rewrite for matrix inv(inv(x)) -> x (#893) --- pytensor/tensor/rewriting/linalg.py | 42 +++++++++++++++++++++++++++ tests/tensor/rewriting/test_linalg.py | 14 +++++++++ 2 files changed, 56 insertions(+) diff --git a/pytensor/tensor/rewriting/linalg.py b/pytensor/tensor/rewriting/linalg.py 
index 5f2e8cf388..1de6dbb373 100644 --- a/pytensor/tensor/rewriting/linalg.py +++ b/pytensor/tensor/rewriting/linalg.py @@ -569,3 +569,45 @@ def svd_uv_merge(fgraph, node): or len(fgraph.clients[cl.outputs[2]]) > 0 ): return [cl.outputs[1]] + + +@register_canonicalize +@register_stabilize +@node_rewriter([Blockwise]) +def rewrite_inv_inv(fgraph, node): + """ + This rewrite takes advantage of the fact that if there are two consecutive inverse operations (inv(inv(input))), we get back our original input without having to compute inverse once. + + Here, we check for direct inverse operations (inv/pinv) and allows for any combination of these "inverse" nodes to be simply rewritten. + + Parameters + ---------- + fgraph: FunctionGraph + Function graph being optimized + node: Apply + Node of the function graph to be optimized + + Returns + ------- + list of Variable, optional + List of optimized variables, or None if no optimization was performed + """ + valid_inverses = (MatrixInverse, MatrixPinv) + # Check if its a valid inverse operation (either inv/pinv) + # In case the outer operation is an inverse, it directly goes to the next step of finding inner operation + # If the outer operation is not a valid inverse, we do not apply this rewrite + if not isinstance(node.op.core_op, valid_inverses): + return None + + potential_inner_inv = node.inputs[0].owner + if potential_inner_inv is None or potential_inner_inv.op is None: + return None + + # Check if inner op is blockwise and and possible inv + if not ( + potential_inner_inv + and isinstance(potential_inner_inv.op, Blockwise) + and isinstance(potential_inner_inv.op.core_op, valid_inverses) + ): + return None + return [potential_inner_inv.inputs[0]] diff --git a/tests/tensor/rewriting/test_linalg.py b/tests/tensor/rewriting/test_linalg.py index 0bc064fe65..7353a82be0 100644 --- a/tests/tensor/rewriting/test_linalg.py +++ b/tests/tensor/rewriting/test_linalg.py @@ -10,6 +10,7 @@ from pytensor import tensor as pt from pytensor.compile import get_default_mode from pytensor.configdefaults import config +from pytensor.graph.rewriting.utils import rewrite_graph from pytensor.tensor import swapaxes from pytensor.tensor.blockwise import Blockwise from pytensor.tensor.elemwise import DimShuffle @@ -554,3 +555,16 @@ def test_svd_uv_merge(): assert node.op.compute_uv svd_counter += 1 assert svd_counter == 1 + + +@pytest.mark.parametrize("inv_op_1", ["inv", "pinv"]) +@pytest.mark.parametrize("inv_op_2", ["inv", "pinv"]) +def test_inv_inv_rewrite(inv_op_1, inv_op_2): + def get_pt_function(x, op_name): + return getattr(pt.linalg, op_name)(x) + + x = pt.matrix("x") + op1 = get_pt_function(x, inv_op_1) + op2 = get_pt_function(op1, inv_op_2) + rewritten_out = rewrite_graph(op2) + assert rewritten_out == x From 981688c367ec56aab620f319ebf954ea5c36cd9c Mon Sep 17 00:00:00 2001 From: Jesse Grabowski <48652735+jessegrabowski@users.noreply.github.com> Date: Fri, 19 Jul 2024 08:08:35 -0500 Subject: [PATCH 07/72] Implement `pad` (#748) * Add `pt.pad` * Refactor linspace, logspace, and geomspace to match numpy implementation * Add `pt.flip` * Move `flip` to `tensor/subtensor.py`, add docstring * Move `slice_at_axis` to `tensor/subtensor` and expose it in `pytensor.tensor` --- pytensor/link/jax/dispatch/__init__.py | 1 + pytensor/link/jax/dispatch/pad.py | 53 ++ pytensor/tensor/__init__.py | 1 + pytensor/tensor/extra_ops.py | 366 ++++++++++++- pytensor/tensor/pad.py | 690 +++++++++++++++++++++++++ pytensor/tensor/subtensor.py | 115 +++++ tests/link/jax/test_pad.py | 63 +++ 
tests/link/numba/test_pad.py | 68 +++ tests/tensor/test_extra_ops.py | 53 +- tests/tensor/test_pad.py | 224 ++++++++ tests/tensor/test_subtensor.py | 38 ++ 11 files changed, 1632 insertions(+), 40 deletions(-) create mode 100644 pytensor/link/jax/dispatch/pad.py create mode 100644 pytensor/tensor/pad.py create mode 100644 tests/link/jax/test_pad.py create mode 100644 tests/link/numba/test_pad.py create mode 100644 tests/tensor/test_pad.py diff --git a/pytensor/link/jax/dispatch/__init__.py b/pytensor/link/jax/dispatch/__init__.py index 1d8ae33104..f4098416b8 100644 --- a/pytensor/link/jax/dispatch/__init__.py +++ b/pytensor/link/jax/dispatch/__init__.py @@ -6,6 +6,7 @@ import pytensor.link.jax.dispatch.blockwise import pytensor.link.jax.dispatch.elemwise import pytensor.link.jax.dispatch.extra_ops +import pytensor.link.jax.dispatch.pad import pytensor.link.jax.dispatch.math import pytensor.link.jax.dispatch.nlinalg import pytensor.link.jax.dispatch.random diff --git a/pytensor/link/jax/dispatch/pad.py b/pytensor/link/jax/dispatch/pad.py new file mode 100644 index 0000000000..6d40d20cc1 --- /dev/null +++ b/pytensor/link/jax/dispatch/pad.py @@ -0,0 +1,53 @@ +import jax.numpy as jnp +import numpy as np + +from pytensor.link.jax.dispatch import jax_funcify +from pytensor.tensor.pad import Pad + + +@jax_funcify.register(Pad) +def jax_funcify_pad(op, **kwargs): + pad_mode = op.pad_mode + reflect_type = op.reflect_type + has_stat_length = op.has_stat_length + + if pad_mode == "constant": + + def constant_pad(x, pad_width, constant_values): + return jnp.pad(x, pad_width, mode=pad_mode, constant_values=constant_values) + + return constant_pad + + elif pad_mode == "linear_ramp": + + def lr_pad(x, pad_width, end_values): + # JAX does not allow a dynamic input if end_values is non-scalar + if not isinstance(end_values, int | float): + end_values = tuple(np.array(end_values)) + return jnp.pad(x, pad_width, mode=pad_mode, end_values=end_values) + + return lr_pad + + elif pad_mode in ["maximum", "minimum", "mean"] and has_stat_length: + + def stat_pad(x, pad_width, stat_length): + # JAX does not allow a dynamic input here, need to cast to tuple + return jnp.pad( + x, pad_width, mode=pad_mode, stat_length=tuple(np.array(stat_length)) + ) + + return stat_pad + + elif pad_mode in ["reflect", "symmetric"]: + + def loop_pad(x, pad_width): + return jnp.pad(x, pad_width, mode=pad_mode, reflect_type=reflect_type) + + return loop_pad + + else: + + def pad(x, pad_width): + return jnp.pad(x, pad_width, mode=pad_mode) + + return pad diff --git a/pytensor/tensor/__init__.py b/pytensor/tensor/__init__.py index 3dfa1b4b7a..81cabfa6bd 100644 --- a/pytensor/tensor/__init__.py +++ b/pytensor/tensor/__init__.py @@ -130,6 +130,7 @@ def _get_vector_length_Constant(op: Op | Variable, var: Constant) -> int: from pytensor.tensor.extra_ops import * from pytensor.tensor.io import * from pytensor.tensor.math import * +from pytensor.tensor.pad import pad from pytensor.tensor.shape import ( reshape, shape, diff --git a/pytensor/tensor/extra_ops.py b/pytensor/tensor/extra_ops.py index b1eaf4f001..cf809a55ef 100644 --- a/pytensor/tensor/extra_ops.py +++ b/pytensor/tensor/extra_ops.py @@ -1,3 +1,4 @@ +import warnings from collections.abc import Collection, Iterable import numpy as np @@ -20,14 +21,24 @@ from pytensor.raise_op import Assert from pytensor.scalar import int32 as int_t from pytensor.scalar import upcast -from pytensor.tensor import as_tensor_variable +from pytensor.tensor import TensorLike, as_tensor_variable from 
pytensor.tensor import basic as ptb from pytensor.tensor.basic import alloc, second from pytensor.tensor.exceptions import NotScalarConstantError from pytensor.tensor.math import abs as pt_abs from pytensor.tensor.math import all as pt_all from pytensor.tensor.math import eq as pt_eq -from pytensor.tensor.math import ge, lt, maximum, minimum, prod, switch +from pytensor.tensor.math import ( + ge, + gt, + log, + lt, + maximum, + minimum, + prod, + sign, + switch, +) from pytensor.tensor.math import max as pt_max from pytensor.tensor.math import sum as pt_sum from pytensor.tensor.shape import specify_broadcastable @@ -1584,27 +1595,346 @@ def broadcast_shape_iter( return tuple(result_dims) -def geomspace(start, end, steps, base=10.0): - from pytensor.tensor.math import log +def _check_deprecated_inputs(stop, end, num, steps): + if end is not None: + warnings.warn( + "The 'end' parameter is deprecated and will be removed in a future version. Use 'stop' instead.", + DeprecationWarning, + ) + stop = end + if steps is not None: + warnings.warn( + "The 'steps' parameter is deprecated and will be removed in a future version. Use 'num' instead.", + DeprecationWarning, + ) + num = steps + + return stop, num + + +def _linspace_core( + start: TensorVariable, + stop: TensorVariable, + num: int, + endpoint=True, + retstep=False, + axis=0, +) -> TensorVariable | tuple[TensorVariable, TensorVariable]: + div = (num - 1) if endpoint else num + delta = stop - start + samples = ptb.shape_padright(ptb.arange(0, num), delta.ndim) + + step = delta / div + samples = switch(gt(div, 0), samples * delta / div + start, samples * delta + start) + if endpoint: + samples = switch(gt(num, 1), set_subtensor(samples[-1, ...], stop), samples) + + if axis != 0: + samples = ptb.moveaxis(samples, 0, axis) + + if retstep: + return samples, step + + return samples + + +def _broadcast_base_with_inputs(start, stop, base, axis): + """ + Broadcast the base tensor with the start and stop tensors if base is not a scalar. This is important because it + may change how the axis argument is interpreted in the final output. + + Parameters + ---------- + start: TensorVariable + The start value(s) of the sequence(s). + stop: TensorVariable + The end value(s) of the sequence(s) + base: TensorVariable + The log base value(s) of the sequence(s) + axis: int + The axis along which to generate samples. + + Returns + ------- + start: TensorVariable + The start value(s) of the sequence(s), broadcast with the base tensor if necessary. + stop: TensorVariable + The end value(s) of the sequence(s), broadcast with the base tensor if necessary. + base: TensorVariable + The log base value(s) of the sequence(s), broadcast with the start and stop tensors if necessary. + """ + base = ptb.as_tensor_variable(base) + if base.ndim > 0: + ndmax = len(broadcast_shape(start, stop, base)) + start, stop, base = ( + ptb.shape_padleft(a, ndmax - a.ndim) for a in (start, stop, base) + ) + base = ptb.expand_dims(base, axis=(axis,)) + + return start, stop, base + + +def linspace( + start: TensorLike, + stop: TensorLike, + num: TensorLike = 50, + endpoint: bool = True, + retstep: bool = False, + dtype: str | None = None, + axis: int = 0, + end: TensorLike | None = None, + steps: TensorLike | None = None, +) -> TensorVariable | tuple[TensorVariable, TensorVariable]: + """ + Return evenly spaced numbers over a specified interval. + + Returns `num` evenly spaced samples, calculated over the interval [`start`, `stop`]. 
+ + The endpoint of the interval can optionally be excluded. + + Parameters + ---------- + start: int, float, or TensorVariable + The starting value of the sequence. + + stop: int, float or TensorVariable + The end value of the sequence, unless `endpoint` is set to False. + In that case, the sequence consists of all but the last of `num + 1` evenly spaced samples, such that `stop` is excluded. + + num: int + Number of samples to generate. Must be non-negative. - start = ptb.as_tensor_variable(start) - end = ptb.as_tensor_variable(end) - return base ** linspace(log(start) / log(base), log(end) / log(base), steps) + endpoint: bool + Whether to include the endpoint in the range. + retstep: bool + If true, returns both the samples and an array of steps between samples. -def logspace(start, end, steps, base=10.0): - start = ptb.as_tensor_variable(start) - end = ptb.as_tensor_variable(end) - return base ** linspace(start, end, steps) + dtype: str, optional + dtype of the output tensor(s). If None, the dtype is inferred from that of the values provided to the `start` + and `end` arguments. + axis: int + Axis along which to generate samples. Ignored if both `start` and `end` have dimension 0. By default, axis=0 + will insert the samples on a new left-most dimension. To insert samples on a right-most dimension, use axis=-1. + + end: int, float or TensorVariable + .. warning:: + The "end" parameter is deprecated and will be removed in a future version. Use "stop" instead. + The end value of the sequence, unless `endpoint` is set to False. + In that case, the sequence consists of all but the last of `num + 1` evenly spaced samples, such that `end` is + excluded. + + steps: float, int, or TensorVariable + .. warning:: + The "steps" parameter is deprecated and will be removed in a future version. Use "num" instead. + + Number of samples to generate. Must be non-negative + + Returns + ------- + samples: TensorVariable + Tensor containing `num` evenly-spaced values between [start, stop]. The range is inclusive if `endpoint` is True. + + step: TensorVariable + Tensor containing the spacing between samples. Only returned if `retstep` is True. + """ + if dtype is None: + dtype = pytensor.config.floatX + end, num = _check_deprecated_inputs(stop, end, num, steps) + start, stop = broadcast_arrays(start, stop) + + ls = _linspace_core( + start=start, + stop=stop, + num=num, + endpoint=endpoint, + retstep=retstep, + axis=axis, + ) + + return ls.astype(dtype) + + +def geomspace( + start: TensorLike, + stop: TensorLike, + num: int = 50, + base: float = 10.0, + endpoint: bool = True, + dtype: str | None = None, + axis: int = 0, + end: TensorLike | None = None, + steps: TensorLike | None = None, +) -> TensorVariable: + """ + Return numbers spaced evenly on a log scale (a geometric progression). + + This is similar to logspace, but with endpoints specified directly. Each output sample is a constant multiple of + the previous. + + Parameters + ---------- + Returns `num` evenly spaced samples, calculated over the interval [`start`, `stop`]. + + The endpoint of the interval can optionally be excluded. + + Parameters + ---------- + start: int, float, or TensorVariable + The starting value of the sequence. + + stop: int, float or TensorVariable + The end value of the sequence, unless `endpoint` is set to False. + In that case, the sequence consists of all but the last of `num + 1` evenly spaced samples, such that `stop` is excluded. + + num: int + Number of samples to generate. Must be non-negative. 
+ + base: float + The base of the log space. + + endpoint: bool + Whether to include the endpoint in the range. + + dtype: str, optional + dtype of the output tensor(s). If None, the dtype is inferred from that of the values provided to the `start` + and `end` arguments. + + axis: int + Axis along which to generate samples. Ignored if both `start` and `end` have dimension 0. By default, axis=0 + will insert the samples on a new left-most dimension. To insert samples on a right-most dimension, use axis=-1. + + end: int, float or TensorVariable + .. warning:: + The "end" parameter is deprecated and will be removed in a future version. Use "stop" instead. + The end value of the sequence, unless `endpoint` is set to False. + In that case, the sequence consists of all but the last of `num + 1` evenly spaced samples, such that `end` is + excluded. + + steps: float, int, or TensorVariable + .. warning:: + The "steps" parameter is deprecated and will be removed in a future version. Use "num" instead. + + Number of samples to generate. Must be non-negative + + Returns + ------- + samples: TensorVariable + Tensor containing `num` evenly-spaced (in log space) values between [start, stop]. The range is inclusive if + `endpoint` is True. + """ + if dtype is None: + dtype = pytensor.config.floatX + stop, num = _check_deprecated_inputs(stop, end, num, steps) + start, stop = broadcast_arrays(start, stop) + start, stop, base = _broadcast_base_with_inputs(start, stop, base, axis) + + out_sign = sign(start) + log_start, log_stop = ( + log(start * out_sign) / log(base), + log(stop * out_sign) / log(base), + ) + result = _linspace_core( + start=log_start, + stop=log_stop, + num=num, + endpoint=endpoint, + axis=0, + retstep=False, + ) + result = base**result + + result = switch(gt(num, 0), set_subtensor(result[0, ...], start), result) + if endpoint: + result = switch(gt(num, 1), set_subtensor(result[-1, ...], stop), result) + + result = result * out_sign + + if axis != 0: + result = ptb.moveaxis(result, 0, axis) + + return result.astype(dtype) + + +def logspace( + start: TensorLike, + stop: TensorLike, + num: int = 50, + base: float = 10.0, + endpoint: bool = True, + dtype: str | None = None, + axis: int = 0, + end: TensorLike | None = None, + steps: TensorLike | None = None, +) -> TensorVariable: + """ + Return numbers spaced evenly on a log scale. + + In linear space, the sequence starts at ``base ** start`` (base to the power of start) and ends with ``base ** stop`` + (see ``endpoint`` below). + + Parameters + ---------- + start: int, float, or TensorVariable + ``base ** start`` is the starting value of the sequence + + stop: int, float or TensorVariable + ``base ** stop`` is the endpoint of the sequence, unless ``endopoint`` is set to False. + In that case, ``num + 1`` values are spaced over the interval in log-space, and the first ``num`` are returned. + + num: int, default = 50 + Number of samples to generate. + + base: float, default = 10.0 + The base of the log space. The step size between the elements in ``log(samples) / log(base)`` + (or ``log_base(samples)`` is uniform. + + endpoint: bool + Whether to include the endpoint in the range. + + dtype: str, optional + dtype of the output tensor(s). If None, the dtype is inferred from that of the values provided to the `start` + and `stop` arguments. + + axis: int + Axis along which to generate samples. Ignored if both `start` and `end` have dimension 0. By default, axis=0 + will insert the samples on a new left-most dimension. 
To insert samples on a right-most dimension, use axis=-1. + + end: int float or TensorVariable + .. warning:: + The "end" parameter is deprecated and will be removed in a future version. Use "stop" instead. + The end value of the sequence, unless `endpoint` is set to False. + In that case, the sequence consists of all but the last of `num + 1` evenly spaced samples, such that `end` is + excluded. + + steps: int or TensorVariable + .. warning:: + The "steps" parameter is deprecated and will be removed in a future version. Use "num" instead. + Number of samples to generate. Must be non-negative + + Returns + ------- + samples: TensorVariable + Tensor containing `num` evenly-spaced (in log-pace) values between [start, stop]. The range is inclusive if + `endpoint` is True. + """ + if dtype is None: + dtype = pytensor.config.floatX + stop, num = _check_deprecated_inputs(stop, end, num, steps) + start, stop = broadcast_arrays(start, stop) + start, stop, base = _broadcast_base_with_inputs(start, stop, base, axis) + + ls = _linspace_core( + start=start, + stop=stop, + num=num, + endpoint=endpoint, + axis=axis, + retstep=False, + ) -def linspace(start, end, steps): - start = ptb.as_tensor_variable(start) - end = ptb.as_tensor_variable(end) - arr = ptb.arange(steps) - arr = ptb.shape_padright(arr, max(start.ndim, end.ndim)) - multiplier = (end - start) / (steps - 1) - return start + arr * multiplier + return (base**ls).astype(dtype) def broadcast_to( diff --git a/pytensor/tensor/pad.py b/pytensor/tensor/pad.py new file mode 100644 index 0000000000..91aef44004 --- /dev/null +++ b/pytensor/tensor/pad.py @@ -0,0 +1,690 @@ +from collections.abc import Callable +from functools import partial +from typing import Literal, cast + +from pytensor.compile.builders import OpFromGraph +from pytensor.ifelse import ifelse +from pytensor.scan import scan +from pytensor.tensor import TensorLike +from pytensor.tensor.basic import ( + TensorVariable, + as_tensor, + concatenate, + expand_dims, + moveaxis, + switch, + zeros, +) +from pytensor.tensor.extra_ops import broadcast_to, linspace +from pytensor.tensor.math import divmod as pt_divmod +from pytensor.tensor.math import eq, gt, mean, minimum +from pytensor.tensor.math import max as pt_max +from pytensor.tensor.math import min as pt_min +from pytensor.tensor.shape import specify_broadcastable +from pytensor.tensor.subtensor import flip, set_subtensor, slice_at_axis + + +PadMode = Literal[ + "constant", + "edge", + "linear_ramp", + "maximum", + "minimum", + "mean", + "median", + "wrap", + "symmetric", + "reflect", +] +stat_funcs = {"maximum": pt_max, "minimum": pt_min, "mean": mean} + +allowed_kwargs = { + "edge": [], + "wrap": [], + "constant": ["constant_values"], + "linear_ramp": ["end_values"], + "maximum": ["stat_length"], + "mean": ["stat_length"], + "median": ["stat_length"], + "minimum": ["stat_length"], + "reflect": ["reflect_type"], + "symmetric": ["reflect_type"], +} + + +def _get_edges( + padded: TensorVariable, axis: int, width_pair: tuple[TensorVariable, TensorVariable] +) -> tuple[TensorVariable, TensorVariable]: + """ + Retrieve edge values from empty-padded array in given dimension. + + Copied from numpy.lib.arraypad._get_edges + https://github.com/numpy/numpy/blob/300096d384046eee479b0c7a70f79e308da52bff/numpy/lib/_arraypad_impl.py#L154 + + Parameters + ---------- + padded : TensorVariable + Empty-padded array. + axis : int + Dimension in which the edges are considered. 
+ width_pair : (TensorVariable, TensorVariable) + Pair of widths that mark the pad area on both sides in the given + dimension. + + Returns + ------- + left_edge, right_edge : TensorVariable + Edge values of the valid area in `padded` in the given dimension. Its + shape will always match `padded` except for the dimension given by + `axis` which will have a length of 1. + """ + left_index = width_pair[0] + left_slice = slice_at_axis(slice(left_index, left_index + 1), axis) + left_edge = padded[left_slice] + + right_index = padded.shape[axis] - width_pair[1] + right_slice = slice_at_axis(slice(right_index - 1, right_index), axis) + right_edge = padded[right_slice] + + return left_edge, right_edge + + +def _symbolic_pad( + x: TensorVariable, pad_width: TensorVariable +) -> tuple[TensorVariable, tuple[slice, ...], TensorVariable]: + pad_width = broadcast_to(pad_width, as_tensor((x.ndim, 2))) + new_shape = as_tensor( + [pad_width[i][0] + size + pad_width[i][1] for i, size in enumerate(x.shape)] + ) + original_area_slice = tuple( + slice(pad_width[i][0], pad_width[i][0] + size) for i, size in enumerate(x.shape) + ) + padded: TensorVariable = set_subtensor(zeros(new_shape)[original_area_slice], x) + return padded, original_area_slice, pad_width + + +def _get_padding_slices( + dim_shape: TensorVariable, + width_pair: tuple[TensorVariable, TensorVariable], + axis: int, +) -> tuple[tuple[slice, ...], tuple[slice, ...]]: + left_slice = slice_at_axis(slice(None, width_pair[0]), axis) + right_slice = slice_at_axis(slice(dim_shape - width_pair[1], None), axis) + + return left_slice, right_slice + + +def _constant_pad( + x: TensorVariable, pad_width: TensorVariable, constant_values: TensorVariable +) -> TensorVariable: + padded, area_slice, pad_width = _symbolic_pad(x, pad_width) + values = broadcast_to(constant_values, as_tensor((padded.ndim, 2))) + + for axis in range(padded.ndim): + width_pair = pad_width[axis] + value_pair = values[axis] + dim_shape = padded.shape[axis] + + left_slice, right_slice = _get_padding_slices(dim_shape, width_pair, axis) + padded = set_subtensor(padded[left_slice], value_pair[0]) + padded = set_subtensor(padded[right_slice], value_pair[1]) + + return padded + + +def _edge_pad(x: TensorVariable, pad_width: TensorVariable) -> TensorVariable: + padded, area_slice, pad_width = _symbolic_pad(x, pad_width) + for axis in range(padded.ndim): + width_pair = pad_width[axis] + dim_shape = padded.shape[axis] + + left_edge, right_edge = _get_edges(padded, axis, width_pair) + left_slice, right_slice = _get_padding_slices(dim_shape, width_pair, axis) + + padded = set_subtensor(padded[left_slice], left_edge) + padded = set_subtensor(padded[right_slice], right_edge) + + return padded + + +def _get_stats( + padded: TensorVariable, + axis: int, + width_pair: TensorVariable, + length_pair: tuple[TensorVariable, TensorVariable] | tuple[None, None], + stat_func: Callable, +): + """ + Calculate statistic for the empty-padded array in given dimension. + + Copied from numpy.lib.arraypad._get_stats + https://github.com/numpy/numpy/blob/300096d384046eee479b0c7a70f79e308da52bff/numpy/lib/_arraypad_impl.py#L230 + + Parameters + ---------- + padded : TensorVariable + Empty-padded array. + axis : int + Dimension in which the statistic is calculated. + width_pair : (TensorVariable, TensorVariable) + Pair of widths that mark the pad area on both sides in the given dimension. 
+ length_pair : 2-element sequence of None or TensorVariable + Gives the number of values in valid area from each side that is taken into account when calculating the + statistic. If None the entire valid area in `padded` is considered. + stat_func : function + Function to compute statistic. The expected signature is + ``stat_func(x: TensorVariable, axis: int, keepdims: bool) -> TensorVariable``. + + Returns + ------- + left_stat, right_stat : TensorVariable + Calculated statistic for both sides of `padded`. + """ + # Calculate indices of the edges of the area with original values + left_index = width_pair[0] + right_index = padded.shape[axis] - width_pair[1] + # as well as its length + max_length = right_index - left_index + + # Limit stat_lengths to max_length + left_length, right_length = length_pair + + # Calculate statistic for the left side + left_length = ( + minimum(left_length, max_length) if left_length is not None else max_length + ) + left_slice = slice_at_axis(slice(left_index, left_index + left_length), axis) + left_chunk = padded[left_slice] + left_stat = stat_func(left_chunk, axis=axis, keepdims=True) + if left_length is None and right_length is None: + # We could also return early in the more general case of left_length == right_length, but we don't necessarily + # know these shapes. + # TODO: Add rewrite to simplify in this case + return left_stat, left_stat + + # Calculate statistic for the right side + right_length = ( + minimum(right_length, max_length) if right_length is not None else max_length + ) + right_slice = slice_at_axis(slice(right_index - right_length, right_index), axis) + right_chunk = padded[right_slice] + right_stat = stat_func(right_chunk, axis=axis, keepdims=True) + + return left_stat, right_stat + + +def _stat_pad( + x: TensorVariable, + pad_width: TensorVariable, + stat_func: Callable, + stat_length: TensorVariable | None, +): + padded, area_slice, pad_width = _symbolic_pad(x, pad_width) + if stat_length is None: + stat_length = [[None, None]] * padded.ndim # type: ignore + else: + stat_length = broadcast_to(stat_length, as_tensor((padded.ndim, 2))) + + for axis in range(padded.ndim): + width_pair = pad_width[axis] + length_pair = stat_length[axis] # type: ignore + dim_shape = padded.shape[axis] + + left_stat, right_stat = _get_stats( + padded, axis, width_pair, length_pair, stat_func + ) + left_slice, right_slice = _get_padding_slices(dim_shape, width_pair, axis) + padded = set_subtensor(padded[left_slice], left_stat) + padded = set_subtensor(padded[right_slice], right_stat) + + return padded + + +def _linear_ramp_pad( + x: TensorVariable, pad_width: TensorVariable, end_values: TensorVariable | int = 0 +) -> TensorVariable: + padded, area_slice, pad_width = _symbolic_pad(x, pad_width) + end_values = as_tensor(end_values) + end_values = broadcast_to(end_values, as_tensor((padded.ndim, 2))) + + for axis in range(padded.ndim): + width_pair = pad_width[axis] + end_value_pair = end_values[axis] + edge_pair = _get_edges(padded, axis, width_pair) + dim_shape = padded.shape[axis] + left_slice, right_slice = _get_padding_slices(dim_shape, width_pair, axis) + + left_ramp, right_ramp = ( + linspace( + start=end_value, + stop=specify_broadcastable(edge, axis).squeeze(axis), + num=width, + endpoint=False, + dtype=padded.dtype, + axis=axis, + ) + for end_value, edge, width in zip(end_value_pair, edge_pair, width_pair) + ) + + # Reverse the direction of the ramp for the "right" side + right_ramp = right_ramp[slice_at_axis(slice(None, None, -1), axis)] # type: 
ignore + + padded = set_subtensor(padded[left_slice], left_ramp) + padded = set_subtensor(padded[right_slice], right_ramp) + + return padded + + +def _wrap_pad(x: TensorVariable, pad_width: TensorVariable) -> TensorVariable: + pad_width = broadcast_to(pad_width, as_tensor((x.ndim, 2))) + + for axis in range(x.ndim): + size = x.shape[axis] + + # Compute how many complete copies of the input will be padded on this dimension, along with the amount of + # overflow on the final copy + repeats, (left_remainder, right_remainder) = pt_divmod(pad_width[axis], size) + + # In the next step we will generate extra copies of the input, and then trim them down to the correct size. + left_trim = size - left_remainder + right_trim = size - right_remainder + + # The total number of copies needed is always the sum of the number of complete copies to add, plus the original + # input itself, plus the two edge copies that will be trimmed down. + total_repeats = repeats.sum() + 3 + + # Create a batch dimension and clone the input the required number of times + parts = expand_dims(x, (0,)).repeat(total_repeats, axis=0) + + # Move the batch dimension to the active dimension + parts = moveaxis(parts, 0, axis) + + # Ravel the active dimension while preserving the shapes of the inactive dimensions. This will expand the + # active dimension to have the correctly padded shape, plus excess to be trimmed + new_shape = [-1 if i == axis else x.shape[i] for i in range(x.ndim)] + x = parts.reshape(new_shape) + + # Trim the excess on the active dimension + trim_slice = slice_at_axis(slice(left_trim, -right_trim), axis) + x = x[trim_slice] + + return x + + +def _build_padding_one_direction(array, array_flipped, repeats, *, inner_func, axis): + [_, parts], _ = scan( + inner_func, + non_sequences=[array, array_flipped], + outputs_info=[0, None], + n_steps=repeats, + ) + + parts = moveaxis(parts, 0, axis) + new_shape = [-1 if i == axis else array.shape[i] for i in range(array.ndim)] + padding = parts.reshape(new_shape) + + return padding + + +def _symmetric_pad(x, pad_width): + def _symmetric_inner(i, x, x_flipped, padding_left): + return i + 1, ifelse(eq(i % 2, int(padding_left)), x_flipped, x) + + pad_width = broadcast_to(pad_width, as_tensor((x.ndim, 2))) + + for axis in range(x.ndim): + x_flipped = flip(x, axis=axis) + original_size = x.shape[axis] + + repeats, remainders = pt_divmod(pad_width[axis], original_size) + has_remainder = gt(remainders, 0) + repeats = repeats + has_remainder + + left_padding = _build_padding_one_direction( + x, + x_flipped, + repeats[0], + axis=axis, + inner_func=partial(_symmetric_inner, padding_left=True), + ) + right_padding = _build_padding_one_direction( + x, + x_flipped, + repeats[1], + axis=axis, + inner_func=partial(_symmetric_inner, padding_left=False), + ) + + x = concatenate([flip(left_padding, axis), x, right_padding], axis=axis) + + (left_trim, right_trim) = switch( + has_remainder, original_size - remainders, remainders + ) + right_trim = x.shape[axis] - right_trim + + trim_slice = slice_at_axis(slice(left_trim, right_trim), axis) + x = x[trim_slice] + + return x + + +def _reflect_pad(x, pad_width): + def _reflect_inner(i, x, x_flipped, padding_left): + return i + 1, ifelse(eq(i % 2, int(padding_left)), x_flipped, x) + + pad_width = broadcast_to(pad_width, as_tensor((x.ndim, 2))) + for axis in range(x.ndim): + trimmed_size = x.shape[axis] - 1 + + trim_slice = slice_at_axis(slice(None, -1), axis) + x_trimmed = x[trim_slice] + x_flipped = flip(x, axis=axis)[trim_slice] + + repeats, 
remainders = pt_divmod(pad_width[axis], trimmed_size) + repeats = repeats + 1 + + left_padding = _build_padding_one_direction( + x_trimmed, + x_flipped, + repeats[0], + axis=axis, + inner_func=partial(_reflect_inner, padding_left=True), + ) + right_padding = _build_padding_one_direction( + x_trimmed, + x_flipped, + repeats[1], + axis=axis, + inner_func=partial(_reflect_inner, padding_left=False), + ) + + left_trim = slice_at_axis(slice(trimmed_size - remainders[0] - 1, -1), axis) + right_trim = slice_at_axis( + slice(1, right_padding.shape[axis] - trimmed_size + remainders[1] + 1), axis + ) + + x = concatenate( + [flip(left_padding, axis)[left_trim], x, right_padding[right_trim]], + axis=axis, + ) + return x + + +class Pad(OpFromGraph): + """ + Wrapper Op for Pad graphs + """ + + def __init__( + self, inputs, outputs, pad_mode, reflect_type=None, has_stat_length=False + ): + self.pad_mode = pad_mode + self.reflect_type = reflect_type + self.has_stat_length = has_stat_length + + super().__init__(inputs=inputs, outputs=outputs) + + +def pad( + x: TensorLike, pad_width: TensorLike, mode: PadMode = "constant", **kwargs +) -> TensorVariable: + """ + Pad an array. + + Parameters + ---------- + array : array_like of rank N + The array to pad. + + pad_width : sequence, array_like, or int + Number of values padded to the edges of each axis. + ``((before_1, after_1), ... (before_N, after_N))`` unique pad widths + for each axis. + ``(before, after)`` or ``((before, after),)`` yields same before + and after pad for each axis. + ``(pad,)`` or ``int`` is a shortcut for before = after = pad width + for all axes. + + mode : str or function, optional + One of the following string values or a user supplied function. + + 'constant' (default) + Pads with a constant value. + 'edge' + Pads with the edge values of array. + 'linear_ramp' + Pads with the linear ramp between end_value and the + array edge value. + 'maximum' + Pads with the maximum value of all or part of the + vector along each axis. + 'mean' + Pads with the mean value of all or part of the + vector along each axis. + 'minimum' + Pads with the minimum value of all or part of the + vector along each axis. + 'reflect' + Pads with the reflection of the vector mirrored on + the first and last values of the vector along each + axis. + 'symmetric' + Pads with the reflection of the vector mirrored + along the edge of the array. + 'wrap' + Pads with the wrap of the vector along the axis. + The first values are used to pad the end and the + end values are used to pad the beginning. + + stat_length : sequence or int, optional + Used in 'maximum', 'mean', and 'minimum'. Number of + values at edge of each axis used to calculate the statistic value. + + ``((before_1, after_1), ... (before_N, after_N))`` unique statistic + lengths for each axis. + + ``(before, after)`` or ``((before, after),)`` yields same before + and after statistic lengths for each axis. + + ``(stat_length,)`` or ``int`` is a shortcut for + ``before = after = statistic`` length for all axes. + + Default is ``None``, to use the entire axis. + + constant_values : sequence or scalar, optional + Used in 'constant'. The values to set the padded values for each + axis. + + ``((before_1, after_1), ... (before_N, after_N))`` unique pad constants + for each axis. + + ``(before, after)`` or ``((before, after),)`` yields same before + and after constants for each axis. + + ``(constant,)`` or ``constant`` is a shortcut for + ``before = after = constant`` for all axes. + + Default is 0. 
+ + end_values : sequence or scalar, optional + Used in 'linear_ramp'. The values used for the ending value of the + linear_ramp and that will form the edge of the padded array. + + ``((before_1, after_1), ... (before_N, after_N))`` unique end values + for each axis. + + ``(before, after)`` or ``((before, after),)`` yields same before + and after end values for each axis. + + ``(constant,)`` or ``constant`` is a shortcut for + ``before = after = constant`` for all axes. + + Default is 0. + + reflect_type : str, optional + Only 'even' is currently accepted. Used in 'reflect', and 'symmetric'. The 'even' style is the + default with an unaltered reflection around the edge value. + + Returns + ------- + pad : ndarray + Padded array of rank equal to `array` with shape increased + according to `pad_width`. + + Examples + -------- + + .. testcode:: + + import pytensor.tensor as pt + a = [1, 2, 3, 4, 5] + print(pt.pad(a, (2, 3), 'constant', constant_values=(4, 6)).eval()) + + .. testoutput:: + + [4. 4. 1. 2. 3. 4. 5. 6. 6. 6.] + + .. testcode:: + + print(pt.pad(a, (2, 3), 'edge').eval()) + + .. testoutput:: + + [1. 1. 1. 2. 3. 4. 5. 5. 5. 5.] + + .. testcode:: + + print(pt.pad(a, (2, 3), 'linear_ramp', end_values=(5, -4)).eval()) + + .. testoutput:: + + [ 5. 3. 1. 2. 3. 4. 5. 2. -1. -4.] + + .. testcode:: + + print(pt.pad(a, (2,), 'maximum').eval()) + + .. testoutput:: + + [5. 5. 1. 2. 3. 4. 5. 5. 5.] + + .. testcode:: + + print(pt.pad(a, (2,), 'mean').eval()) + + .. testoutput:: + + [3. 3. 1. 2. 3. 4. 5. 3. 3.] + + .. testcode:: + + a = [[1, 2], [3, 4]] + print(pt.pad(a, ((3, 2), (2, 3)), 'minimum').eval()) + + .. testoutput:: + + [[1. 1. 1. 2. 1. 1. 1.] + [1. 1. 1. 2. 1. 1. 1.] + [1. 1. 1. 2. 1. 1. 1.] + [1. 1. 1. 2. 1. 1. 1.] + [3. 3. 3. 4. 3. 3. 3.] + [1. 1. 1. 2. 1. 1. 1.] + [1. 1. 1. 2. 1. 1. 1.]] + + .. testcode:: + + a = [1, 2, 3, 4, 5] + print(pt.pad(a, (2, 3), 'reflect').eval()) + + .. testoutput:: + + [3 2 1 2 3 4 5 4 3 2] + + .. testcode:: + + print(pt.pad(a, (2, 3), 'symmetric').eval()) + + .. testoutput:: + + [2 1 1 2 3 4 5 5 4 3] + + .. testcode:: + + print(pt.pad(a, (2, 3), 'wrap').eval()) + + .. testoutput:: + + [4 5 1 2 3 4 5 1 2 3] + + """ + if any(value not in allowed_kwargs[mode] for value in kwargs.keys()): + raise ValueError( + f"Invalid keyword arguments for mode '{mode}': {kwargs.keys()}" + ) + x = as_tensor(x, name="x") + pad_width = as_tensor(pad_width, name="pad_width") + inputs = [x, pad_width] + attrs = {} + + if mode == "constant": + constant_values = as_tensor( + kwargs.pop("constant_values", 0), name="constant_values" + ) + inputs += [constant_values] + outputs = _constant_pad(x, pad_width, constant_values) + + elif mode == "edge": + outputs = _edge_pad(x, pad_width) + + elif mode in ["maximum", "minimum", "mean", "median"]: + if mode == "median": + # TODO: Revisit this after we implement a quantile function. 
+ # See https://github.com/pymc-devs/pytensor/issues/53 + raise NotImplementedError("Median padding not implemented") + stat_func = cast(Callable, stat_funcs[mode]) + stat_length = kwargs.get("stat_length") + if stat_length is not None: + attrs.update({"has_stat_length": True}) + stat_length = as_tensor(stat_length, name="stat_length") + inputs += [stat_length] + + outputs = _stat_pad(x, pad_width, stat_func, stat_length) + + elif mode == "linear_ramp": + end_values = kwargs.pop("end_values", 0) + end_values = as_tensor(end_values) + + inputs += [end_values] + outputs = _linear_ramp_pad(x, pad_width, end_values) + + elif mode == "wrap": + outputs = _wrap_pad(x, pad_width) + + elif mode == "symmetric": + reflect_type = kwargs.pop("reflect_type", "even") + if reflect_type == "odd": + raise NotImplementedError( + "Odd reflection not implemented. If you need this feature, please open an " + "issue at https://github.com/pymc-devs/pytensor/issues" + ) + attrs.update({"reflect_type": reflect_type}) + outputs = _symmetric_pad(x, pad_width) + + elif mode == "reflect": + reflect_type = kwargs.pop("reflect_type", "even") + if reflect_type == "odd": + raise NotImplementedError( + "Odd reflection not implemented. If you need this feature, please open an " + "issue at https://github.com/pymc-devs/pytensor/issues" + ) + attrs.update({"reflect_type": reflect_type}) + outputs = _reflect_pad(x, pad_width) + + else: + raise ValueError(f"Invalid mode: {mode}") + + op = Pad(inputs=inputs, outputs=[outputs], pad_mode=mode, **attrs)(*inputs) + return cast(TensorVariable, op) + + +__all__ = ["pad", "flip"] diff --git a/pytensor/tensor/subtensor.py b/pytensor/tensor/subtensor.py index a21f2d7dcc..41b4c6bd5a 100644 --- a/pytensor/tensor/subtensor.py +++ b/pytensor/tensor/subtensor.py @@ -3013,8 +3013,123 @@ def _get_vector_length_Subtensor(op, var): raise ValueError(f"Length of {var} cannot be determined") +def slice_at_axis(sl: slice, axis: int) -> tuple[slice, ...]: + """ + Construct tuple of slices to slice an array in the given dimension. + + Copied from numpy.lib.arraypad._slice_at_axis + https://github.com/numpy/numpy/blob/300096d384046eee479b0c7a70f79e308da52bff/numpy/lib/_arraypad_impl.py#L33 + + Parameters + ---------- + sl : slice + The slice for the given dimension. + axis : int + The axis to which `sl` is applied. All other dimensions are left + "unsliced". + + Returns + ------- + sl : tuple of slices + A tuple with slices matching `shape` in length. + + Examples + -------- + + .. testcode:: + + import pytensor.tensor as pt + + s = pt.slice_at_axis(slice(None, 1), 1) + print(s) + + .. testoutput:: + + (slice(None, None, None), slice(None, 1, None), Ellipsis) + + .. testcode:: + + x = pt.tensor('x', shape=(None, None, None)) + x_sliced = x[s] + + f = pytensor.function([x], x_sliced) + x = np.arange(27).reshape(3, 3, 3) + print(f(x)) + + .. testoutput:: + [[[ 0. 1. 2.]] + + [[ 9. 10. 11.]] + + [[18. 19. 20.]]] + + """ + if axis >= 0: + return (slice(None),) * axis + (sl,) + (...,) # type: ignore + else: + # If axis = -1 we want zero right padding (and so on), so subtract one + axis = abs(axis) - 1 + return (...,) + (sl,) + (slice(None),) * axis # type: ignore + + +def flip( + arr: TensorVariable, axis: int | tuple[int] | TensorVariable | None = None +) -> TensorVariable: + """ + Reverse the order of elements in an tensor along the given axis. + + Parameters + ---------- + arr: TensorVariable + Input tensor. + + axis: int | tuple[int] | TensorVariable, optional + Axis or axes along which to flip over. 
The default is to flip over all of the axes of the input tensor. + + Returns + ------- + arr: TensorVariable + A view of `arr` with the entries of axis reversed. + + Examples + -------- + + .. testcode:: + + import pytensor + import pytensor.tensor as pt + + x = pt.tensor('x', shape=(None, None)) + x_flipped = pt.flip(x, axis=0) + + f = pytensor.function([x], x_flipped) + x = [[1, 2], [3, 4]] + print(f(x)) + + .. testoutput:: + [[3. 4.] + [1. 2.]] + + """ + if axis is None: + index = ((slice(None, None, -1)),) * arr.ndim + else: + if isinstance(axis, int): + axis = (axis,) + index = tuple( + [ + slice(None, None, -1) if i in axis else slice(None, None, None) + for i in range(arr.ndim) + ] + ) + + return cast(TensorVariable, arr[index]) + + __all__ = [ "take", + "flip", + "slice_at_axis", "inc_subtensor", "set_subtensor", ] diff --git a/tests/link/jax/test_pad.py b/tests/link/jax/test_pad.py new file mode 100644 index 0000000000..2321645741 --- /dev/null +++ b/tests/link/jax/test_pad.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest + +import pytensor.tensor as pt +from pytensor import config +from pytensor.graph import FunctionGraph +from pytensor.tensor.pad import PadMode +from tests.link.jax.test_basic import compare_jax_and_py + + +jax = pytest.importorskip("jax") +floatX = config.floatX +RTOL = ATOL = 1e-6 if floatX.endswith("64") else 1e-3 + + +@pytest.mark.parametrize( + "mode, kwargs", + [ + ("constant", {"constant_values": 0}), + ("constant", {"constant_values": (1, 2)}), + ("edge", {}), + ("linear_ramp", {"end_values": 0}), + ("linear_ramp", {"end_values": (1, 2)}), + ("reflect", {"reflect_type": "even"}), + ("wrap", {}), + ("symmetric", {"reflect_type": "even"}), + ("mean", {"stat_length": None}), + ("mean", {"stat_length": (10, 2)}), + ("maximum", {"stat_length": None}), + ("maximum", {"stat_length": (10, 2)}), + ("minimum", {"stat_length": None}), + ("minimum", {"stat_length": (10, 2)}), + ], + ids=[ + "constant_default", + "constant_tuple", + "edge", + "linear_ramp_default", + "linear_ramp_tuple", + "reflect", + "wrap", + "symmetric", + "mean_default", + "mean_tuple", + "maximum_default", + "maximum_tuple", + "minimum_default", + "minimum_tuple", + ], +) +def test_jax_pad(mode: PadMode, kwargs): + x_pt = pt.tensor("x", shape=(3, 3)) + x = np.random.normal(size=(3, 3)) + + res = pt.pad(x_pt, mode=mode, pad_width=3, **kwargs) + res_fg = FunctionGraph([x_pt], [res]) + + compare_jax_and_py( + res_fg, + [x], + assert_fn=lambda x, y: np.testing.assert_allclose(x, y, rtol=RTOL, atol=ATOL), + py_mode="FAST_RUN", + ) diff --git a/tests/link/numba/test_pad.py b/tests/link/numba/test_pad.py new file mode 100644 index 0000000000..11877594d7 --- /dev/null +++ b/tests/link/numba/test_pad.py @@ -0,0 +1,68 @@ +import numpy as np +import pytest + +import pytensor.tensor as pt +from pytensor import config +from pytensor.graph import FunctionGraph +from pytensor.tensor.pad import PadMode +from tests.link.numba.test_basic import compare_numba_and_py + + +floatX = config.floatX +RTOL = ATOL = 1e-6 if floatX.endswith("64") else 1e-3 + + +@pytest.mark.parametrize( + "mode, kwargs", + [ + ("constant", {"constant_values": 0}), + ("constant", {"constant_values": (1, 2)}), + pytest.param( + "edge", + {}, + marks=pytest.mark.skip( + "This is causing a segfault in NUMBA mode, but I have no idea why" + ), + ), + ("linear_ramp", {"end_values": 0}), + ("linear_ramp", {"end_values": (1, 2)}), + ("reflect", {"reflect_type": "even"}), + ("wrap", {}), + ("symmetric", {"reflect_type": "even"}), + ("mean", 
{"stat_length": None}), + ("mean", {"stat_length": (10, 2)}), + ("maximum", {"stat_length": None}), + ("maximum", {"stat_length": (10, 2)}), + ("minimum", {"stat_length": None}), + ("minimum", {"stat_length": (10, 2)}), + ], + ids=[ + "constant_default", + "constant_tuple", + "edge", + "linear_ramp_default", + "linear_ramp_tuple", + "reflect", + "wrap", + "symmetric", + "mean_default", + "mean_tuple", + "maximum_default", + "maximum_tuple", + "minimum_default", + "minimum_tuple", + ], +) +def test_numba_pad(mode: PadMode, kwargs): + x_pt = pt.tensor("x", shape=(3, 3)) + x = np.random.normal(size=(3, 3)) + + res = pt.pad(x_pt, mode=mode, pad_width=3, **kwargs) + res_fg = FunctionGraph([x_pt], [res]) + + compare_numba_and_py( + res_fg, + [x], + assert_fn=lambda x, y: np.testing.assert_allclose(x, y, rtol=RTOL, atol=ATOL), + py_mode="FAST_RUN", + ) diff --git a/tests/tensor/test_extra_ops.py b/tests/tensor/test_extra_ops.py index 4376ab1d32..3b3cc5ec7f 100644 --- a/tests/tensor/test_extra_ops.py +++ b/tests/tensor/test_extra_ops.py @@ -35,9 +35,6 @@ diff, fill_diagonal, fill_diagonal_offset, - geomspace, - linspace, - logspace, ravel_multi_index, repeat, searchsorted, @@ -1281,25 +1278,37 @@ def test_broadcast_arrays(): @pytest.mark.parametrize( - "start, stop, num_samples", + "op", + ["linspace", "logspace", "geomspace"], + ids=["linspace", "logspace", "geomspace"], +) +@pytest.mark.parametrize("dtype", [None, "int", "float"], ids=[None, "int", "float"]) +@pytest.mark.parametrize( + "start, stop, num_samples, endpoint, axis", [ - (1, 10, 50), - (np.array([5, 6]), np.array([[10, 10], [10, 10]]), 25), - (1, np.array([5, 6]), 30), + (1, 10, 50, True, 0), + (1, 10, 1, True, 0), + (np.array([5, 6]), np.array([[10, 10], [10, 10]]), 25, True, 0), + (np.array([5, 6]), np.array([[10, 10], [10, 10]]), 25, True, 1), + (np.array([5, 6]), np.array([[10, 10], [10, 10]]), 25, False, -1), + (1, np.array([5, 6]), 30, True, 0), + (1, np.array([5, 6]), 30, False, -1), ], ) -def test_space_ops(start, stop, num_samples): - z = linspace(start, stop, num_samples) - pytensor_res = function(inputs=[], outputs=z)() - numpy_res = np.linspace(start, stop, num=num_samples) - assert np.allclose(pytensor_res, numpy_res) - - z = logspace(start, stop, num_samples) - pytensor_res = function(inputs=[], outputs=z)() - numpy_res = np.logspace(start, stop, num=num_samples) - assert np.allclose(pytensor_res, numpy_res) - - z = geomspace(start, stop, num_samples) - pytensor_res = function(inputs=[], outputs=z)() - numpy_res = np.geomspace(start, stop, num=num_samples) - assert np.allclose(pytensor_res, numpy_res) +def test_space_ops(op, dtype, start, stop, num_samples, endpoint, axis): + pt_func = getattr(pt, op) + np_func = getattr(np, op) + dtype = dtype + config.floatX[-2:] if dtype is not None else dtype + z = pt_func(start, stop, num_samples, endpoint=endpoint, axis=axis, dtype=dtype) + + numpy_res = np_func( + start, stop, num=num_samples, endpoint=endpoint, dtype=dtype, axis=axis + ) + pytensor_res = function(inputs=[], outputs=z, mode="FAST_COMPILE")() + + np.testing.assert_allclose( + pytensor_res, + numpy_res, + atol=1e-6 if config.floatX.endswith("64") else 1e-4, + rtol=1e-6 if config.floatX.endswith("64") else 1e-4, + ) diff --git a/tests/tensor/test_pad.py b/tests/tensor/test_pad.py new file mode 100644 index 0000000000..54df4a12e1 --- /dev/null +++ b/tests/tensor/test_pad.py @@ -0,0 +1,224 @@ +from typing import Literal + +import numpy as np +import pytest + +import pytensor +from pytensor.tensor.pad import PadMode, 
pad + + +floatX = pytensor.config.floatX +RTOL = ATOL = 1e-8 if floatX.endswith("64") else 1e-4 + + +def test_unknown_mode_raises(): + x = np.random.normal(size=(3, 3)).astype(floatX) + with pytest.raises(ValueError, match="Invalid mode: unknown"): + pad(x, 1, mode="unknown") + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 3, 3)], ids=["1d", "2d square", "3d square"] +) +@pytest.mark.parametrize("constant", [0, 0.0], ids=["int", "float"]) +@pytest.mark.parametrize( + "pad_width", + [10, (10, 0), (0, 10)], + ids=["symmetrical", "asymmetrical_left", "asymmetric_right"], +) +def test_constant_pad( + size: tuple, constant: int | float, pad_width: int | tuple[int, ...] +): + x = np.random.normal(size=size).astype(floatX) + expected = np.pad(x, pad_width, mode="constant", constant_values=constant) + z = pad(x, pad_width, mode="constant", constant_values=constant) + assert z.owner.op.pad_mode == "constant" + + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 5, 5)], ids=["1d", "2d square", "3d square"] +) +@pytest.mark.parametrize( + "pad_width", + [10, (10, 0), (0, 10)], + ids=["symmetrical", "asymmetrical_left", "asymmetric_right"], +) +def test_edge_pad(size: tuple, pad_width: int | tuple[int, ...]): + x = np.random.normal(size=size).astype(floatX) + expected = np.pad(x, pad_width, mode="edge") + z = pad(x, pad_width, mode="edge") + assert z.owner.op.pad_mode == "edge" + + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 5, 5)], ids=["1d", "2d square", "3d square"] +) +@pytest.mark.parametrize( + "pad_width", + [10, (10, 0), (0, 10)], + ids=["symmetrical", "asymmetrical_left", "asymmetric_right"], +) +@pytest.mark.parametrize("end_values", [0, -1], ids=["0", "-1"]) +def test_linear_ramp_pad( + size: tuple, + pad_width: int | tuple[int, ...], + end_values: int | float | tuple[int | float, ...], +): + x = np.random.normal(size=size).astype(floatX) + expected = np.pad(x, pad_width, mode="linear_ramp", end_values=end_values) + z = pad(x, pad_width, mode="linear_ramp", end_values=end_values) + assert z.owner.op.pad_mode == "linear_ramp" + + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 5, 5)], ids=["1d", "2d square", "3d square"] +) +@pytest.mark.parametrize( + "pad_width", + [10, (10, 0), (0, 10)], + ids=["symmetrical", "asymmetrical_left", "asymmetric_right"], +) +@pytest.mark.parametrize("stat", ["mean", "minimum", "maximum"]) +@pytest.mark.parametrize("stat_length", [None, 2]) +def test_stat_pad( + size: tuple, + pad_width: int | tuple[int, ...], + stat: PadMode, + stat_length: int | None, +): + x = np.random.normal(size=size).astype(floatX) + expected = np.pad(x, pad_width, mode=stat, stat_length=stat_length) + z = pad(x, pad_width, mode=stat, stat_length=stat_length) + assert z.owner.op.pad_mode == stat + + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 5, 5)], ids=["1d", "2d square", "3d square"] +) +@pytest.mark.parametrize( + "pad_width", + [10, (10, 0), (0, 10)], + ids=["symmetrical", "asymmetrical_left", "asymmetric_right"], +) +def test_wrap_pad(size: tuple, 
pad_width: int | tuple[int, ...]): + x = np.random.normal(size=size).astype(floatX) + expected = np.pad(x, pad_width, mode="wrap") + z = pad(x, pad_width, mode="wrap") + assert z.owner.op.pad_mode == "wrap" + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 5, 5)], ids=["1d", "2d square", "3d square"] +) +@pytest.mark.parametrize( + "pad_width", + [10, (10, 0), (0, 10)], + ids=["symmetrical", "asymmetrical_left", "asymmetric_right"], +) +@pytest.mark.parametrize( + "reflect_type", + ["even", pytest.param("odd", marks=pytest.mark.xfail(raises=NotImplementedError))], + ids=["even", "odd"], +) +def test_symmetric_pad( + size, + pad_width, + reflect_type: Literal["even", "odd"], +): + x = np.random.normal(size=size).astype(floatX) + expected = np.pad(x, pad_width, mode="symmetric", reflect_type=reflect_type) + z = pad(x, pad_width, mode="symmetric", reflect_type=reflect_type) + assert z.owner.op.pad_mode == "symmetric" + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 5, 5)], ids=["1d", "2d square", "3d square"] +) +@pytest.mark.parametrize( + "pad_width", + [10, (10, 0), (0, 10)], + ids=["symmetrical", "asymmetrical_left", "asymmetric_right"], +) +@pytest.mark.parametrize( + "reflect_type", + ["even", pytest.param("odd", marks=pytest.mark.xfail(raises=NotImplementedError))], + ids=["even", "odd"], +) +def test_reflect_pad( + size, + pad_width, + reflect_type: Literal["even", "odd"], +): + x = np.random.normal(size=size).astype(floatX) + expected = np.pad(x, pad_width, mode="reflect", reflect_type=reflect_type) + z = pad(x, pad_width, mode="reflect", reflect_type=reflect_type) + assert z.owner.op.pad_mode == "reflect" + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) + + +@pytest.mark.parametrize( + "mode", + [ + "constant", + "edge", + "linear_ramp", + "wrap", + "symmetric", + "reflect", + "mean", + "maximum", + "minimum", + ], +) +@pytest.mark.parametrize("padding", ["symmetric", "asymmetric"]) +def test_nd_padding(mode, padding): + rng = np.random.default_rng() + n = rng.integers(3, 5) + if padding == "symmetric": + pad_width = [(i, i) for i in rng.integers(1, 5, size=n)] + stat_length = [(i, i) for i in rng.integers(1, 5, size=n)] + else: + pad_width = rng.integers(1, 5, size=(n, 2)).tolist() + stat_length = rng.integers(1, 5, size=(n, 2)).tolist() + + test_kwargs = { + "constant": {"constant_values": 0}, + "linear_ramp": {"end_values": 0}, + "maximum": {"stat_length": stat_length}, + "mean": {"stat_length": stat_length}, + "minimum": {"stat_length": stat_length}, + "reflect": {"reflect_type": "even"}, + "symmetric": {"reflect_type": "even"}, + } + + x = np.random.normal(size=(2,) * n).astype(floatX) + kwargs = test_kwargs.get(mode, {}) + expected = np.pad(x, pad_width, mode=mode, **kwargs) + z = pad(x, pad_width, mode=mode, **kwargs) + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) diff --git a/tests/tensor/test_subtensor.py b/tests/tensor/test_subtensor.py index 427287dcfd..d02880f543 100644 --- a/tests/tensor/test_subtensor.py +++ b/tests/tensor/test_subtensor.py @@ -37,11 +37,13 @@ advanced_subtensor1, as_index_literal, basic_shape, + flip, get_canonical_form_slice, inc_subtensor, 
index_vars_to_types, indexed_result_shape, set_subtensor, + slice_at_axis, take, ) from pytensor.tensor.type import ( @@ -2902,3 +2904,39 @@ def test_vectorize_adv_subtensor( vectorize_pt(x_test, idx_test), vectorize_np(x_test, idx_test), ) + + +def test_slice_at_axis(): + x = ptb.tensor("x", shape=(3, 4, 5)) + x_sliced = x[slice_at_axis(slice(None, 1), axis=0)] + assert x_sliced.type.shape == (1, 4, 5) + + # Negative axis + x_sliced = x[slice_at_axis(slice(None, 1), axis=-2)] + assert x_sliced.type.shape == (3, 1, 5) + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 5, 5)], ids=["1d", "2d square", "3d square"] +) +def test_flip(size: tuple[int]): + from itertools import combinations + + ATOL = RTOL = 1e-8 if config.floatX == "float64" else 1e-4 + + x = np.random.normal(size=size).astype(config.floatX) + x_pt = pytensor.tensor.tensor(shape=size, name="x") + expected = np.flip(x, axis=None) + z = flip(x_pt, axis=None) + f = pytensor.function([x_pt], z, mode="FAST_COMPILE") + np.testing.assert_allclose(expected, f(x), atol=ATOL, rtol=RTOL) + + # Test all combinations of axes + flip_options = [ + axes for i in range(1, x.ndim + 1) for axes in combinations(range(x.ndim), r=i) + ] + for axes in flip_options: + expected = np.flip(x, axis=list(axes)) + z = flip(x_pt, axis=list(axes)) + f = pytensor.function([x_pt], z, mode="FAST_COMPILE") + np.testing.assert_allclose(expected, f(x), atol=ATOL, rtol=RTOL) From a601a27c6c1ddfc0c965bf664a5aa33caeb24788 Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Sun, 21 Jul 2024 16:01:12 -0700 Subject: [PATCH 08/72] Update away from torch.where --- pytensor/link/pytorch/dispatch/basic.py | 8 +++++--- tests/link/pytorch/test_basic.py | 16 ++++++---------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/pytensor/link/pytorch/dispatch/basic.py b/pytensor/link/pytorch/dispatch/basic.py index 0039406907..5e5bc4a41b 100644 --- a/pytensor/link/pytorch/dispatch/basic.py +++ b/pytensor/link/pytorch/dispatch/basic.py @@ -138,9 +138,11 @@ def makevector(*x): @pytorch_funcify.register(IfElse) def pytorch_funcify_IfElse(op, **kwargs): n_outs = op.n_outs - assert n_outs == 1 - def ifelse(cond, *args, n_outs=n_outs): - return torch.where(cond, *args) + def ifelse(cond, ifpath, elsepath, n_outs=n_outs): + if cond: + return ifpath + else: + return elsepath return ifelse diff --git a/tests/link/pytorch/test_basic.py b/tests/link/pytorch/test_basic.py index d49ea1ab1e..3905055935 100644 --- a/tests/link/pytorch/test_basic.py +++ b/tests/link/pytorch/test_basic.py @@ -308,14 +308,10 @@ def test_pytorch_ifelse(): true_vals = np.r_[1, 2, 3] false_vals = np.r_[-1, -2, -3] - x = ifelse(np.array(True), true_vals, false_vals) - x_fg = FunctionGraph([], [x]) - - compare_pytorch_and_py(x_fg, []) - - a = scalar("a") - a.tag.test_value = np.array(0.2, dtype=config.floatX) - x = ifelse(a < 0.5, true_vals, false_vals) - x_fg = FunctionGraph([a], [x]) # I.e. False + for test_value, cond in [(0.2, 0.5), (0.5, 0.4)]: + a = scalar("a") + a.tag.test_value = np.array(test_value, dtype=config.floatX) + x = ifelse(a < cond, true_vals, false_vals) + x_fg = FunctionGraph([a], [x]) # I.e. 
False - compare_pytorch_and_py(x_fg, [get_test_value(i) for i in x_fg.inputs]) + compare_pytorch_and_py(x_fg, [get_test_value(i) for i in x_fg.inputs]) From aab9faeee740e23db08323446bef1ad888d30422 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 17:29:36 +0000 Subject: [PATCH 09/72] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.2 → v0.5.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.2...v0.5.4) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8aee60b767..4b34d53b80 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: )$ - id: check-merge-conflict - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.2 + rev: v0.5.4 hooks: - id: ruff args: ["--fix", "--output-format=full"] From 739d97ddece3643aec20aa4d316007535ec1ddbb Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 16:52:20 -0400 Subject: [PATCH 10/72] Removed unused config options --- doc/library/config.rst | 43 ------------------- pytensor/configdefaults.py | 87 -------------------------------------- pytensor/configparser.py | 6 ++- 3 files changed, 4 insertions(+), 132 deletions(-) diff --git a/doc/library/config.rst b/doc/library/config.rst index 60f0f7e307..dac7e2c810 100644 --- a/doc/library/config.rst +++ b/doc/library/config.rst @@ -103,14 +103,6 @@ import ``pytensor`` and print the config variable, as in: String value: either ``'cpu'`` -.. attribute:: force_device - - Bool value: either ``True`` or ``False`` - - Default: ``False`` - - This flag's value cannot be modified during the program execution. - .. attribute:: print_active_device Bool value: either ``True`` or ``False`` @@ -139,16 +131,6 @@ import ``pytensor`` and print the config variable, as in: equal to ``float64`` is created. This can be used to help find upcasts to ``float64`` in user code. -.. attribute:: deterministic - - String value: either ``'default'``, ``'more'`` - - Default: ``'default'`` - - If ``more``, sometimes PyTensor will select :class:`Op` implementations that - are more "deterministic", but slower. See the ``dnn.conv.algo*`` - flags for more cases. - .. attribute:: allow_gc Bool value: either ``True`` or ``False`` @@ -412,16 +394,6 @@ import ``pytensor`` and print the config variable, as in: ignore it (i.e. ``'ignore'``). We suggest never using ``'ignore'`` except during testing. -.. attribute:: assert_no_cpu_op - - String value: ``'ignore'`` or ``'warn'`` or ``'raise'`` or ``'pdb'`` - - Default: ``'ignore'`` - - If there is a CPU :class:`Op` in the computational graph, depending on its value, - this flag can either raise a warning, an exception or drop into the frame - with ``pdb``. - .. attribute:: on_shape_error String value: ``'warn'`` or ``'raise'`` @@ -797,18 +769,3 @@ import ``pytensor`` and print the config variable, as in: The verbosity level of the meta-rewriter: ``0`` for silent, ``1`` to only warn when PyTensor cannot meta-rewrite an :class:`Op`, ``2`` for full output (e.g. timings and the rewrites selected). - - -.. attribute:: config.metaopt__optimizer_excluding - - Default: ``""`` - - A list of rewrite tags that we don't want included in the meta-rewriter. - Multiple tags are separate by ``':'``. - -.. 
attribute:: config.metaopt__optimizer_including - - Default: ``""`` - - A list of rewriter tags to be included during meta-rewriting. - Multiple tags are separate by ``':'``. diff --git a/pytensor/configdefaults.py b/pytensor/configdefaults.py index f3a8b4a146..44b3f8ad99 100644 --- a/pytensor/configdefaults.py +++ b/pytensor/configdefaults.py @@ -260,15 +260,6 @@ def add_basic_configvars(): ), ) - config.add( - "deterministic", - "If `more`, sometimes we will select some implementation that " - "are more deterministic, but slower. Also see " - "the dnn.conv.algo* flags to cover more cases.", - EnumStr("default", ["more"]), - in_c_key=False, - ) - config.add( "device", ("Default device for computations. only cpu is supported for now"), @@ -276,13 +267,6 @@ def add_basic_configvars(): in_c_key=False, ) - config.add( - "force_device", - "Raise an error if we can't use the specified device", - BoolParam(False, mutable=False), - in_c_key=False, - ) - config.add( "conv__assert_shape", "If True, AbstractConv* ops will verify that user-provided" @@ -299,14 +283,6 @@ def add_basic_configvars(): in_c_key=False, ) - # This flag determines whether or not to raise error/warning message if - # there is a CPU Op in the computational graph. - config.add( - "assert_no_cpu_op", - "Raise an error/warning if there is a CPU op in the computational graph.", - EnumStr("ignore", ["warn", "raise", "pdb"], mutable=True), - in_c_key=False, - ) config.add( "unpickle_function", ( @@ -1043,20 +1019,6 @@ def add_metaopt_configvars(): in_c_key=False, ) - config.add( - "metaopt__optimizer_excluding", - ("exclude optimizers with these tags. Separate tags with ':'."), - StrParam(""), - in_c_key=False, - ) - - config.add( - "metaopt__optimizer_including", - ("include optimizers with these tags. Separate tags with ':'."), - StrParam(""), - in_c_key=False, - ) - def add_vm_configvars(): config.add( @@ -1295,55 +1257,6 @@ def add_caching_dir_configvars(): ) -# Those are the options provided by PyTensor to choose algorithms at runtime. -SUPPORTED_DNN_CONV_ALGO_RUNTIME = ( - "guess_once", - "guess_on_shape_change", - "time_once", - "time_on_shape_change", -) - -# Those are the supported algorithm by PyTensor, -# The tests will reference those lists. -SUPPORTED_DNN_CONV_ALGO_FWD = ( - "small", - "none", - "large", - "fft", - "fft_tiling", - "winograd", - "winograd_non_fused", - *SUPPORTED_DNN_CONV_ALGO_RUNTIME, -) - -SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ( - "none", - "deterministic", - "fft", - "fft_tiling", - "winograd", - "winograd_non_fused", - *SUPPORTED_DNN_CONV_ALGO_RUNTIME, -) - -SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ( - "none", - "deterministic", - "fft", - "small", - "winograd_non_fused", - "fft_tiling", - *SUPPORTED_DNN_CONV_ALGO_RUNTIME, -) - -SUPPORTED_DNN_CONV_PRECISION = ( - "as_input_f32", - "as_input", - "float16", - "float32", - "float64", -) - # Eventually, the instance of `PyTensorConfigParser` should be created right here, # where it is also populated with settings. 
config = _create_default_config() diff --git a/pytensor/configparser.py b/pytensor/configparser.py index 1656558668..1199485d74 100644 --- a/pytensor/configparser.py +++ b/pytensor/configparser.py @@ -75,6 +75,7 @@ class PyTensorConfigParser: pickle_test_value: bool cast_policy: str device: str + conv__assert_shape: bool print_global_stats: bool unpickle_function: bool # add_compile_configvars @@ -86,6 +87,7 @@ class PyTensorConfigParser: optimizer_verbose: bool on_opt_error: str nocleanup: bool + on_unused_input: str gcc__cxxflags: str cmodule__warn_no_version: bool cmodule__remove_gxx_opt: bool @@ -93,6 +95,7 @@ class PyTensorConfigParser: cmodule__preload_cache: bool cmodule__age_thresh_use: int cmodule__debug: bool + compile__wait: int compile__timeout: int # add_tensor_configvars tensor__cmp_sloppy: int @@ -143,6 +146,7 @@ class PyTensorConfigParser: optdb__max_use_ratio: float cycle_detection: str check_stack_trace: str + # add_metaopt_configvars metaopt__verbose: int # add_vm_configvars profile: bool @@ -177,7 +181,6 @@ def __init__( self._pytensor_cfg = pytensor_cfg self._pytensor_raw_cfg = pytensor_raw_cfg self._config_var_dict: dict = {} - super().__init__() def __str__(self, print_doc=True): sio = StringIO() @@ -375,7 +378,6 @@ def __init__( # more appropriate user-provided default value. # Calling `filter` here may actually be harmful if the default value is # invalid and causes a crash or has unwanted side effects. - super().__init__() @property def default(self): From b9f2dde10a45c054b05e99590d1023551034d41f Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 16:56:01 -0400 Subject: [PATCH 11/72] Remove add_experimental_configvars --- pytensor/configdefaults.py | 5 ----- pytensor/configparser.py | 1 - 2 files changed, 6 deletions(-) diff --git a/pytensor/configdefaults.py b/pytensor/configdefaults.py index 44b3f8ad99..42d3912ccf 100644 --- a/pytensor/configdefaults.py +++ b/pytensor/configdefaults.py @@ -585,10 +585,6 @@ def add_traceback_configvars(): ) -def add_experimental_configvars(): - return - - def add_error_and_warning_configvars(): ### # To disable some warning about old bug that are fixed now. @@ -1266,7 +1262,6 @@ def add_caching_dir_configvars(): add_compile_configvars() add_tensor_configvars() add_traceback_configvars() -add_experimental_configvars() add_error_and_warning_configvars() add_testvalue_and_checking_configvars() add_multiprocessing_configvars() diff --git a/pytensor/configparser.py b/pytensor/configparser.py index 1199485d74..815053b6e9 100644 --- a/pytensor/configparser.py +++ b/pytensor/configparser.py @@ -104,7 +104,6 @@ class PyTensorConfigParser: # add_traceback_configvars traceback__limit: int traceback__compile_limit: int - # add_experimental_configvars # add_error_and_warning_configvars warn__ignore_bug_before: int exception_verbosity: str From f9f5c5b9897026216225126fffd60bcc3ab6bcad Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 17:06:12 -0400 Subject: [PATCH 12/72] Remove default in_c_key and change for cast_policy --- pytensor/configdefaults.py | 1 + pytensor/configparser.py | 4 +--- tests/test_config.py | 5 +++++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pytensor/configdefaults.py b/pytensor/configdefaults.py index 42d3912ccf..7fd6f951c7 100644 --- a/pytensor/configdefaults.py +++ b/pytensor/configdefaults.py @@ -258,6 +258,7 @@ def add_basic_configvars(): # was expected, so it is currently not available. 
# numpy, ), + in_c_key=False, ) config.add( diff --git a/pytensor/configparser.py b/pytensor/configparser.py index 815053b6e9..40e84f518a 100644 --- a/pytensor/configparser.py +++ b/pytensor/configparser.py @@ -214,9 +214,7 @@ def get_config_hash(self): ) ) - def add( - self, name: str, doc: str, configparam: "ConfigParam", in_c_key: bool = True - ): + def add(self, name: str, doc: str, configparam: "ConfigParam", in_c_key: bool): """Add a new variable to PyTensorConfigParser. This method performs some of the work of initializing `ConfigParam` instances. diff --git a/tests/test_config.py b/tests/test_config.py index 65705c6988..4a512085f4 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -98,6 +98,7 @@ def test_config_hash(): "test__config_hash", "A config var from a test case.", configparser.StrParam("test_default"), + in_c_key=True, ) h0 = root.get_config_hash() @@ -160,6 +161,7 @@ def test_config_context(): "test__config_context", "A config var from a test case.", configparser.StrParam("test_default"), + in_c_key=False, ) assert hasattr(root, "test__config_context") assert root.test__config_context == "test_default" @@ -181,6 +183,7 @@ def test_invalid_configvar_access(): "test__on_test_instance", "This config setting was added to the test instance.", configparser.IntParam(5), + in_c_key=False, ) assert hasattr(root_test, "test__on_test_instance") # While the property _actually_ exists on all instances, @@ -197,6 +200,7 @@ def test_invalid_configvar_access(): "test__on_test_instance", "This config setting was already added to another instance.", configparser.IntParam(5), + in_c_key=False, ) @@ -248,6 +252,7 @@ def test_config_pickling(): "test__lambda_kills_pickling", "Lambda functions cause pickling problems.", configparser.IntParam(5, lambda i: i > 0), + in_c_key=False, ) with pytest.raises(AttributeError, match="Can't pickle local object"): pickle.dump(root, io.BytesIO()) From 158a7d0101b5f75387951c4991cb5252c868bcfc Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 17:24:53 -0400 Subject: [PATCH 13/72] Fix typo in docstring --- pytensor/configparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytensor/configparser.py b/pytensor/configparser.py index 40e84f518a..6ecbe051d4 100644 --- a/pytensor/configparser.py +++ b/pytensor/configparser.py @@ -281,7 +281,7 @@ def fetch_val_for_key(self, key, delete_key: bool = False): The (decreasing) priority order is: - PYTENSOR_FLAGS - - ~./pytensorrc + - ~/.pytensorrc """ From 7a0175af76b1c56f12cbc158638c3d81e4621805 Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 17:47:42 -0400 Subject: [PATCH 14/72] Simplify _ChangeFlagDecorator --- pytensor/configparser.py | 10 +++------- tests/link/c/test_type.py | 2 +- tests/tensor/test_blas.py | 2 +- tests/test_config.py | 2 +- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/pytensor/configparser.py b/pytensor/configparser.py index 6ecbe051d4..c38f131c61 100644 --- a/pytensor/configparser.py +++ b/pytensor/configparser.py @@ -32,11 +32,7 @@ class ConfigAccessViolation(AttributeError): class _ChangeFlagsDecorator: - def __init__(self, *args, _root=None, **kwargs): - # the old API supported passing a dict as the first argument: - if args: - assert len(args) == 1 and isinstance(args[0], dict) - kwargs = dict(**args[0], **kwargs) + def __init__(self, _root=None, **kwargs): self.confs = {k: _root._config_var_dict[k] for k in kwargs} self.new_vals = kwargs self._root = _root @@ -310,14 +306,14 @@ def 
fetch_val_for_key(self, key, delete_key: bool = False): except (NoOptionError, NoSectionError): raise KeyError(key) - def change_flags(self, *args, **kwargs) -> _ChangeFlagsDecorator: + def change_flags(self, **kwargs) -> _ChangeFlagsDecorator: """ Use this as a decorator or context manager to change the value of PyTensor config variables. Useful during tests. """ - return _ChangeFlagsDecorator(*args, _root=self, **kwargs) + return _ChangeFlagsDecorator(_root=self, **kwargs) def warn_unused_flags(self): for key in self._flags_dict: diff --git a/tests/link/c/test_type.py b/tests/link/c/test_type.py index 84287e1607..0ebd249bf4 100644 --- a/tests/link/c/test_type.py +++ b/tests/link/c/test_type.py @@ -287,6 +287,6 @@ def test_op_with_cenumtype(self): assert val_billion == val_million * 1000 assert val_two_billions == val_billion * 2 - @pytensor.config.change_flags(**{"cmodule__debug": True}) + @pytensor.config.change_flags(cmodule__debug=True) def test_op_with_cenumtype_debug(self): self.test_op_with_cenumtype() diff --git a/tests/tensor/test_blas.py b/tests/tensor/test_blas.py index c2479edba9..34a1d1bcf9 100644 --- a/tests/tensor/test_blas.py +++ b/tests/tensor/test_blas.py @@ -514,7 +514,7 @@ def compute_ref( C = self.get_value(C, transpose_C, slice_C) return alpha * np.dot(A, B) + beta * C - @config.change_flags({"blas__ldflags": ""}) + @config.change_flags(blas__ldflags="") def run_gemm( self, dtype, diff --git a/tests/test_config.py b/tests/test_config.py index 4a512085f4..47a4e24035 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -168,7 +168,7 @@ def test_config_context(): with root.change_flags(test__config_context="new_value"): assert root.test__config_context == "new_value" - with root.change_flags({"test__config_context": "new_value2"}): + with root.change_flags(test__config_context="new_value2"): assert root.test__config_context == "new_value2" assert root.test__config_context == "new_value" assert root.test__config_context == "test_default" From d9ed1e2c4c2259fc88a6bd02c40363f2686be31c Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 18:32:55 -0400 Subject: [PATCH 15/72] Fix typo amblibm -> amdlibm --- doc/library/config.rst | 2 +- pytensor/compile/profiling.py | 8 ++++---- pytensor/configdefaults.py | 2 +- pytensor/configparser.py | 2 +- pytensor/scalar/basic.py | 12 ++++++------ 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/doc/library/config.rst b/doc/library/config.rst index dac7e2c810..80fe090118 100644 --- a/doc/library/config.rst +++ b/doc/library/config.rst @@ -355,7 +355,7 @@ import ``pytensor`` and print the config variable, as in: When ``True``, ignore the first call to an PyTensor function while profiling. -.. attribute:: config.lib__amblibm +.. attribute:: config.lib__amdlibm Bool value: either ``True`` or ``False`` diff --git a/pytensor/compile/profiling.py b/pytensor/compile/profiling.py index a361ac5087..9d93431753 100644 --- a/pytensor/compile/profiling.py +++ b/pytensor/compile/profiling.py @@ -1566,26 +1566,26 @@ def exp_float32_op(op): printed_tip = True # tip 2 - if not config.lib__amblibm and any( + if not config.lib__amdlibm and any( amdlibm_speed_up(a.op) for (fgraph, a) in self.apply_time ): print( " - Try installing amdlibm and set the PyTensor flag " - "lib__amblibm=True. This speeds up only some Elemwise " + "lib__amdlibm=True. 
This speeds up only some Elemwise " "operation.", file=file, ) printed_tip = True # tip 3 - if not config.lib__amblibm and any( + if not config.lib__amdlibm and any( exp_float32_op(a.op) and a.inputs[0].dtype == "float32" for (fgraph, a) in self.apply_time ): print( " - With the default gcc libm, exp in float32 is slower " "than in float64! Try PyTensor flag floatX=float64, or " - "install amdlibm and set the pytensor flags lib__amblibm=True", + "install amdlibm and set the pytensor flags lib__amdlibm=True", file=file, ) printed_tip = True diff --git a/pytensor/configdefaults.py b/pytensor/configdefaults.py index 7fd6f951c7..f0cd279fa2 100644 --- a/pytensor/configdefaults.py +++ b/pytensor/configdefaults.py @@ -547,7 +547,7 @@ def add_tensor_configvars(): # http://developer.amd.com/CPU/LIBRARIES/LIBM/Pages/default.aspx config.add( - "lib__amblibm", + "lib__amdlibm", "Use amd's amdlibm numerical library", BoolParam(False), # Added elsewhere in the c key only when needed. diff --git a/pytensor/configparser.py b/pytensor/configparser.py index c38f131c61..5042c15d76 100644 --- a/pytensor/configparser.py +++ b/pytensor/configparser.py @@ -95,7 +95,7 @@ class PyTensorConfigParser: compile__timeout: int # add_tensor_configvars tensor__cmp_sloppy: int - lib__amblibm: bool + lib__amdlibm: bool tensor__insert_inplace_optimizer_validate_nb: int # add_traceback_configvars traceback__limit: int diff --git a/pytensor/scalar/basic.py b/pytensor/scalar/basic.py index 763323cdb2..d4c41d5cb5 100644 --- a/pytensor/scalar/basic.py +++ b/pytensor/scalar/basic.py @@ -356,18 +356,18 @@ def c_headers(self, c_compiler=None, **kwargs): # we declare them here and they will be re-used by TensorType l.append("") l.append("") - if config.lib__amblibm and c_compiler.supports_amdlibm: + if config.lib__amdlibm and c_compiler.supports_amdlibm: l += [""] return l def c_libraries(self, c_compiler=None, **kwargs): l = [] - if config.lib__amblibm and c_compiler and c_compiler.supports_amdlibm: + if config.lib__amdlibm and c_compiler and c_compiler.supports_amdlibm: l += ["amdlibm"] return l def c_compile_args(self, c_compiler=None, **kwargs): - if config.lib__amblibm and c_compiler and c_compiler.supports_amdlibm: + if config.lib__amdlibm and c_compiler and c_compiler.supports_amdlibm: return ["-DREPLACE_WITH_AMDLIBM"] else: return [] @@ -1245,7 +1245,7 @@ class UnaryScalarOp(ScalarOp): def c_code_contiguous(self, node, name, inputs, outputs, sub): (x,) = inputs (z,) = outputs - if not config.lib__amblibm or node.inputs[0].type != node.outputs[0].type: + if not config.lib__amdlibm or node.inputs[0].type != node.outputs[0].type: raise MethodNotDefined() dtype = node.inputs[0].type.dtype_specs()[1] @@ -1260,7 +1260,7 @@ def c_code_contiguous(self, node, name, inputs, outputs, sub): """ def c_code_contiguous_raw(self, dtype, n, i, o): - if not config.lib__amblibm: + if not config.lib__amdlibm: raise MethodNotDefined() if dtype.startswith("npy_"): dtype = dtype[4:] @@ -2296,7 +2296,7 @@ def L_op(self, inputs, outputs, gout): def c_code_contiguous(self, node, name, inputs, outputs, sub): (x, y) = inputs (z,) = outputs - if not config.lib__amblibm: + if not config.lib__amdlibm: raise MethodNotDefined() # We compare the dtype AND the broadcast flag From 9f4b89d73d70f67c6d77205c6e841408b560288a Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 18:39:23 -0400 Subject: [PATCH 16/72] Remove unused ContextsParam --- pytensor/configparser.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git 
a/pytensor/configparser.py b/pytensor/configparser.py index 5042c15d76..e587782e40 100644 --- a/pytensor/configparser.py +++ b/pytensor/configparser.py @@ -538,22 +538,6 @@ def __str__(self): return f"{self.name} ({self.default})" -class ContextsParam(ConfigParam): - def __init__(self): - super().__init__("", apply=self._apply, mutable=False) - - def _apply(self, val): - if val == "": - return val - for v in val.split(";"): - s = v.split("->") - if len(s) != 2: - raise ValueError(f"Malformed context map: {v}") - if s[0] == "cpu" or s[0].startswith("cuda") or s[0].startswith("opencl"): - raise ValueError(f"Cannot use {s[0]} as context name") - return val - - def parse_config_string( config_string: str, issue_warnings: bool = True ) -> dict[str, str]: From d45546084c1041af63cfcca43ab8d32ff04a2472 Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 18:46:51 -0400 Subject: [PATCH 17/72] Simplify config.add(linker) --- pytensor/configdefaults.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/pytensor/configdefaults.py b/pytensor/configdefaults.py index f0cd279fa2..0353c58fcd 100644 --- a/pytensor/configdefaults.py +++ b/pytensor/configdefaults.py @@ -371,23 +371,11 @@ def add_compile_configvars(): if rc == 0 and config.cxx != "": # Keep the default linker the same as the one for the mode FAST_RUN - config.add( - "linker", - "Default linker used if the pytensor flags mode is Mode", - EnumStr( - "cvm", ["c|py", "py", "c", "c|py_nogc", "vm", "vm_nogc", "cvm_nogc"] - ), - in_c_key=False, - ) + linker_options = ["c|py", "py", "c", "c|py_nogc", "vm", "vm_nogc", "cvm_nogc"] else: # g++ is not present or the user disabled it, # linker should default to python only. - config.add( - "linker", - "Default linker used if the pytensor flags mode is Mode", - EnumStr("vm", ["py", "vm_nogc"]), - in_c_key=False, - ) + linker_options = ["py", "vm_nogc"] if type(config).cxx.is_default: # If the user provided an empty value for cxx, do not warn. _logger.warning( @@ -397,6 +385,13 @@ def add_compile_configvars(): "To remove this warning, set PyTensor flags cxx to an empty string." ) + config.add( + "linker", + "Default linker used if the pytensor flags mode is Mode", + EnumStr("cvm", linker_options), + in_c_key=False, + ) + # Keep the default value the same as the one for the mode FAST_RUN config.add( "allow_gc", From 367351f321f90e1bc38393562e8c9c1e59a9606e Mon Sep 17 00:00:00 2001 From: Pham Nguyen Hung <97870091+HangenYuu@users.noreply.github.com> Date: Thu, 25 Jul 2024 12:43:23 +0700 Subject: [PATCH 18/72] Fixed dead wiki links (#950) * Fixed dead wiki links * Fixed dead wiki links * Deleted old documentation at doc/sandbox. 
--- doc/introduction.rst | 4 +- doc/links.rst | 14 +- doc/sandbox/ccodegen.rst | 255 ----------------- doc/sandbox/compilation.rst | 18 -- doc/sandbox/debugging_with_stepmode.rst | 75 ----- doc/sandbox/elemwise_compiler.rst | 86 ------ doc/sandbox/function.rst | 9 - doc/sandbox/functional.rst | 7 - doc/sandbox/how_to_make_ops.rst | 295 -------------------- doc/sandbox/index.rst | 11 - doc/sandbox/index2.rst | 15 - doc/sandbox/interactive_debugger.rst | 56 ---- doc/sandbox/logistic_regression_example.rst | 77 ----- doc/sandbox/performance.rst | 23 -- doc/sandbox/randomnumbers.rst | 245 ---------------- doc/sandbox/rethinkccodegen.rst | 124 -------- doc/sandbox/sandbox.rst | 161 ----------- doc/sandbox/software.rst | 19 -- doc/sandbox/sparse.rst | 147 ---------- doc/sandbox/tensoroptools.rst | 9 - 20 files changed, 9 insertions(+), 1641 deletions(-) delete mode 100644 doc/sandbox/ccodegen.rst delete mode 100644 doc/sandbox/compilation.rst delete mode 100644 doc/sandbox/debugging_with_stepmode.rst delete mode 100644 doc/sandbox/elemwise_compiler.rst delete mode 100644 doc/sandbox/function.rst delete mode 100644 doc/sandbox/functional.rst delete mode 100644 doc/sandbox/how_to_make_ops.rst delete mode 100644 doc/sandbox/index.rst delete mode 100644 doc/sandbox/index2.rst delete mode 100644 doc/sandbox/interactive_debugger.rst delete mode 100644 doc/sandbox/logistic_regression_example.rst delete mode 100644 doc/sandbox/performance.rst delete mode 100644 doc/sandbox/randomnumbers.rst delete mode 100644 doc/sandbox/rethinkccodegen.rst delete mode 100644 doc/sandbox/sandbox.rst delete mode 100644 doc/sandbox/software.rst delete mode 100644 doc/sandbox/sparse.rst delete mode 100644 doc/sandbox/tensoroptools.rst diff --git a/doc/introduction.rst b/doc/introduction.rst index cfbfeaf90f..5c7a837fa9 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -157,9 +157,9 @@ to extend PyTensor, please feel free to ask. install tutorial/index -.. _LISA: https://mila.umontreal.ca/ +.. _LISA: https://mila.quebec/en .. _Greek mathematician: http://en.wikipedia.org/wiki/Theano_(mathematician) -.. _numpy: http://numpy.scipy.org/ +.. _numpy: https://numpy.org/ .. _BLAS: http://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms .. _sympy: http://www.sympy.org/ diff --git a/doc/links.rst b/doc/links.rst index 8d2689fed1..ec22e14f12 100644 --- a/doc/links.rst +++ b/doc/links.rst @@ -39,18 +39,18 @@ This is a sort of memo for developers and would-be developers. .. _git: http://git-scm.com/ .. _pytest: http://docs.pytest.org/en/latest/ -.. _numpy: http://numpy.scipy.org/ +.. _numpy: https://numpy.org/ .. _python: http://www.python.org .. _scipy: http://scipy.org/ .. _autodiff: http://www.autodiff.org -.. _boost.python: http://www.boost.org/doc/libs/1_38_0/libs/python/doc/index.html +.. _boost.python: https://www.boost.org/doc/libs/1_85_0/libs/python/doc/html/index.html .. _cython: http://www.cython.org/ .. _liboil: http://liboil.freedesktop.org/wiki/ .. _llvm: http://llvm.org/ -.. _networkx: http://networkx.lanl.gov/ -.. _pypy: http://codespeak.net/pypy/dist/pypy/doc/ +.. _networkx: https://networkx.org/ +.. _pypy: https://doc.pypy.org/en/latest/ .. _swig: http://www.swig.org/ -.. _unpython: http://code.google.com/p/unpython/ -.. _pycppad: http://www.seanet.com/~bradbell/pycppad/index.xml -.. _shedskin: http://shed-skin.blogspot.com/ +.. _unpython: https://code.google.com/archive/p/unpython/ +.. _pycppad: https://github.com/Simple-Robotics/pycppad +.. 
_shedskin: https://shedskin.github.io/shedskin/ diff --git a/doc/sandbox/ccodegen.rst b/doc/sandbox/ccodegen.rst deleted file mode 100644 index 1d9730b97d..0000000000 --- a/doc/sandbox/ccodegen.rst +++ /dev/null @@ -1,255 +0,0 @@ -'''C code is actually generated this way. Could be refreshed as developer documentation. Olivier to review. 20080904.''' - -Here is a proposal on the interface to generate C code: - -What will be passed to C -======================== - -For each ResultBase, the C code gets a variable called storage_ which contains a PyObject* pointing to a 1-element list (a sort of cell). That is the "channel" via which C and Python can communicate data. Of course, the C code will not manipulate that directly. At every execution of the C function, the PyObject* inside the storage is extracted and given the name py_ (its reference count is handled automatically). - - -Extracting the data for use with C -================================== - -In ResultBase, we have several methods to generate C code for particular purposes. They should return templated strings of C code (see below) but should not actually fill the template. The caller will fill it. - -List of template variables you can use: - * '''%(name)s:''' Will be filled in by a mangled name representing this ResultBase. - * '''%(fail)s:''' This can be inserted in the code to make the current function fail. It will proceed to cleanup everything that needs to be cleaned up. This cannot be used in any cleanup routine (and hence it is forbidden for a cleanup routine to fail!) If a code block uses %(fail)s, its corresponding cleanup block will be called first, so make sure that the cleanup can be done properly at any point where you use %(fail)s, even if you didn't allocate or INCREF everything yet. - -List of methods in ResultBase: - -'''c_declare:''' This method returns code that declares one or more variables ''without'' initializing them. These are the variables that all C code using this ResultBase will use to manipulate the data. The code should ''only'' declare variables and typedefs (no #defines, but a future extension might address this). Example: if we have a ResultBase representing a double, c_declare may simply return "double %(name)s;". ''All'' variables declared should contain the %(name)s template, but they may prefix or suffix it. - -'''c_init:''' This method returns code that initializes (zeros/sets to NULL, typically) the variables declared in c_declare. - -'''c_extract:''' This method should manipulate py_ to set the values of the variables declared by c_declare. For example, if we have a ResultBase representing a double, c_extract might return "%(name)s = PyFloat_AsDouble(py_%(name)s);" (plus error checking!). If something is wrong with the data provided from Python, c_extract should set an informative error message and insert %(fail)s. - -'''c_sync:''' This method should adjust the py_ variable using the values of the variables declared by c_declare. For example, if we have a ResultBase representing a double, c_sync might return "Py_XDECREF(py_%(name)s); py_%(name)s = PyFloat_FromDouble(%(name)s);". The result will then be made accessible from Python. c_sync is not allowed to fail, though it is not really cleanup code. - -'''c_cleanup:''' This method should clean up all the variables declared by c_declare. - -.. warning:: - - This page describes usage of c_init and c_extract as of version 0.4.0 (and - previous versions). 
This will change in the future, to allow c_code to - use preallocated memory buffers of the outputs. - -Important notes: - * ''Either'' c_init or c_extract will be called. The former for temporary variables and outputs, the latter for inputs. If the former is used, py_ will be set to Py_None regardless of what is in storage_. - * c_sync will only be called on the outputs, not on inputs or temporaries. - * c_cleanup will ''always'' be called. If c_sync decides to relay some data to Python (thus ousting it from the op's scope), it should NULL any pointers that c_cleanup is not allowed to free. - - -Manipulating the data from C -============================ - -The Op class has in turn several methods that generate C code. As for ResultBase, they should return templated strings of C code (see below) but should not actually fill the template. The caller will fill it. - -List of template variables you can use: - * '''%()s:''' See c_var_names. These will be substituted for mangled names. - * '''%(fail)s:''' This can be inserted in the code to make the current function fail. It will proceed to cleanup everything that needs to be cleaned up. This cannot be used in any cleanup routine (and hence it is forbidden for a cleanup routine to fail!). If a code block uses %(fail)s, its corresponding cleanup block will be called first, so make sure that the cleanup can be done properly at any point where you use %(fail)s, even if you didn't allocate or INCREF everything yet. - -'''c_var_names''': This method should return two lists, one list of strings representing the input names and one list of strings representing the output names. The actual names might be mangled by the compiler. In the template strings returned by the next few methods, you can use the names defined here. For example, if op.c_var_names() returns [['x', 'y'], ['z']], then "%(x)s" in op's templates will be the same as "%(name)s" in op.inputs[0]'s templates. This means that all the variables declared by the inputs and outputs can easily be used in the op's templates. - -'''c_validate_update''': This method should return code that ensures that the inputs are valid for processing by this Op (checking shapes, bounds, etc.). If anything is invalid, it should set an informative error message and use %(fail)s. Then, it should prepare the outputs: for example, if the output is a tensor, allocate a tensor, resize it appropriately and place it in the appropriate variable (see c_var_names). - -'''c_validate_update_cleanup''': This method should clean up any temporary storage used by c_validate_update. It is not forbidden to do it in c_validate_update itself, but this can come in handy. - -'''c_code''': This is the meat of the Op that actually calculates the function. If an error occurs in the process, it may use %(fail)s. It should work in place on the variables declared by its inputs and outputs and rely on their c_sync routines to relay the results to Python. - -'''c_code_cleanup''': This cleans up any temporary structures allocated by c_code. - -'''c_is_simple (field)''': Class field. Defaults to False. It is basically a compiler hint that this class represents a builtin C type or a small struct, so we can optimize its access. - - -Important notes: - * There might be provisions in the future to skip the validate_update step if the Op can guarantee that the inputs are valid and the outputs are set up properly. - * It is not forbidden to just put the validate_update code in c_code. 
Some situations might require it, but it helps organization to segregate them. - - -Failure -======= - -Besides cleanup code, all code has access to the %(fail)s template. For three code blocks, the generated C code will pretty much look like this: - -.. code-block:: cpp - - int failure = 0; - { - - { - - { - - label3: - - } - label2: - - } - label1: - - } - return failure; - -And %(fail)s in the nth code block will take the value "{failure = n; goto label;}". This means only the blocks executed up to the failure point are cleaned up and the return value indicates which block failed, which is handy for debugging. - -When compiling an Op, we want to sync the outputs so we can get the results from Python. In case of failure, we will not necessarily want to sync. Because of that, typical code will look like this: - -.. code-block:: cpp - - int failure = 0; - - - { - - { - - { - - label3: - - } - label2: - if (!failure) - - - } - label1: - - } - return failure; - -Furthermore, is not necessary to extract the output because we mean to overwrite it anyway. In that case, will be a no-op, but of course we may still need to clean up or sync what will put in the declared outputs. - - -Example ResultBase -================== - -The following ResultBase represents a double (we only care about the C part). - -.. code-block:: python - - class Double(ResultBase): - # - def c_declare(self): - return "double %(name)s;" - def c_init(self): - return "%(name)s = 0.0;" - def c_extract(self): - return "%(name)s = PyFloat_AsDouble(py_%(name)s);" - def c_cleanup(self): - return "" # nothing to do - def c_sync(self): - return "Py_XDECREF(py_%(name)s); py_%(name)s = PyFloat_FromDouble(%(name)s);" - - -Example Op -========== - -The following ResultBase represents addition of two nonnegative doubles (we only care about the C part). - -.. code-block:: python - - class Add(COp): - # - def c_var_names(self): - return "[['x', 'y'], ['z']]" - def c_validate_update(self): - return "if (%(x)s < 0 || %(y)s < 0) %(fail)s" # fail if x or y is negative - def c_validate_update_cleanup(self): - return "" # nothing to do - def c_code(self): - return "%(z)s = %(x)s + %(y)s;" - def c_code_cleanup(self): - return "" # nothing to do - -Generating a C function -======================= - -For the example Op, the generated C function will typically look like this: - -.. code-block:: cpp - - void add(PyObject* storage_x, PyObject* storage_y, PyObject* storage_z) { - PyObject* py_x = PyList_GET_ITEM(storage_x, 0); Py_XINCREF(py_x); // automatic - PyObject* py_y = PyList_GET_ITEM(storage_y, 0); Py_XINCREF(py_y); // automatic - PyObject* py_z = Py_None; // we don't care what's currently in storage_z - - failure = 0 - double x; // x.c_declare - double y; // y.c_declare - double z; // z.c_declare - { - x = PyFloat_AsDouble(py_x); // x.c_extract - { - y = PyFloat_AsDouble(py_y); // y.c_extract - { - # we don't need to use z.c_extract - { - if (x < 0 || y < 0) { // add.validate_update - // This is automatically inserted in place of %(fail)s - failure = 4; - goto label_add_validate_update_cleanup; - } - { - z = x + y; // add.c_code - label_add_code_cleanup: - } - label_add_validate_update_cleanup: - } - label_z_sync_or_cleanup: - if (!failure) { - Py_XDECREF(py_z); // z.c_sync - py_z = PyFloat_FromDouble(z); // z.c_sync, the result is now available from Python! 
- PyList_SET_ITEM(storage_z, 0, py_z); // always done after _.c_sync - } - Py_XDECREF(py_z); // always done after _.c_cleanup - } - label_y_cleanup: - Py_XDECREF(py_y); // always done after _.c_cleanup - } - label_x_cleanup: - Py_XDECREF(py_x); // always done after _.c_cleanup - } - return failure; - } - -Generating a C struct -===================== - -To accelerate processing a tad, a struct can be generated instead of a function. The struct will keep pointers to the storage where to fetch inputs and store outputs, but it will also store fields declared by outputs and temporaries' c_declare methods. - -Here is a sketch of the struct equivalent of the previous function: - -.. code-block:: cpp - - struct add { - PyObject* storage_x; - PyObject* storage_y; - PyObject* storage_z; - double z; // z.c_declare - - void init(PyObject* storage_x, PyObject* storage_y, PyObject* storage_z) { - // - // - } - - void cleanup(void) { - // - } - - void run(void) { - // - } - - add() { this->init(); } - ~add() { this->cleanup(); } - }; - -Advantages of using a struct: - * Can be run several times even if we provide the storage only once. - * Output variables or temporary variables can reuse what they allocated the last time. This is not particularly useful with doubles (in fact it might be detrimental), but if z was a large tensor it might be interesting to recycle the memory over thousands of runs of the Op. - -No struct members will be made if a result's c_is_simple field is True. They will be allocated on the stack instead. diff --git a/doc/sandbox/compilation.rst b/doc/sandbox/compilation.rst deleted file mode 100644 index fad7d71ef9..0000000000 --- a/doc/sandbox/compilation.rst +++ /dev/null @@ -1,18 +0,0 @@ - -.. _compilation: - -======================= -Compilation and Linking -======================= - -.. index:: - single: Linker - -.. _linker: - -Linker -====== - -WRITEME - - diff --git a/doc/sandbox/debugging_with_stepmode.rst b/doc/sandbox/debugging_with_stepmode.rst deleted file mode 100644 index fba3a63e71..0000000000 --- a/doc/sandbox/debugging_with_stepmode.rst +++ /dev/null @@ -1,75 +0,0 @@ - -.. _sandbox_debugging_step_mode: - -Debugging with a customized so-called StepMode -============================================== - -One convenient trick I've found for debugging my programs that are running with pytensor is to -use what I call a 'StepMode'. There is no such StepMode in the standard library because the -purpose of it is to hack it to investigate what your own particular program is doing. - - -.. code-block:: python - - from pytensor.link import WrapLinkerMany - from pytensor.configdefaults import config - from pytensor.compile.mode import (Mode, register_mode, predefined_modes, predefined_linkers, - predefined_optimizers) - - class StepMode(Mode): - def __init__(self, linker=None, optimizer='default'): - if linker is None: - linker = config.linker - if optimizer is 'default': - optimizer = config.optimizer - def blah(i, node, th): - # This function will be run for each node in your compiled program. - # here you can inspect all the values as they are computed, - # ... you can even change them ! - - # 'i' is the execution position in the serialized graph - # node is the symbolic Apply instance - # th is a callable thing that will compute the node. 
- - print i, node, len(th.inputs) - - # the symbolic inputs of the node are in node.inputs - # the j'th non-symbolic input of the node is in th.inputs[j][0] - - th() # call the function to actually 'run' the graph - - # the symbolic outputs of the node are in node.outputs - # the j'th non-symbolic output of the node is in th.outputs[j][0] - - print type(th.outputs[0][0]) - - if i == 39: - print 'this node is weird...', th.outputs[0][0] - - - self.provided_linker = linker - self.provided_optimizer = optimizer - if isinstance(linker, basestring) or linker is None: - linker = predefined_linkers[linker] - - self.linker = WrapLinkerMany([linker], [blah]) - - if isinstance(optimizer, basestring) or optimizer is None: - optimizer = predefined_optimizers[optimizer] - self._optimizer = optimizer - - - -The way to use it is like this: - -.. code-block:: python - - fn = function(inputs, outputs, mode=StepMode()) - -When you call fn, your function in the stepmode will be called for each node in the compiled -program. You can print out some or all of the values, you can change them in mid-execution. -You can see where bizarre values are first occurring in your computations. It's a very -powerful way to understand your program's execution. - -Remember, if you give names your variables then printing nodes will give you a better idea of -where in the calculations you are. diff --git a/doc/sandbox/elemwise_compiler.rst b/doc/sandbox/elemwise_compiler.rst deleted file mode 100644 index 8c7825b7c4..0000000000 --- a/doc/sandbox/elemwise_compiler.rst +++ /dev/null @@ -1,86 +0,0 @@ -.. _sandbox_elemwise: - -========================== -:class:`Elemwise` compiler -========================== - -.. todo:: Stale specification page. Upgrade this to provide useful developer doc. 2008.09.04 - -Definitions -=========== - -The element-wise compiler takes inputs {{{(in0, in1, in2, ...)}}}, outputs {{{(out0, out1, out2, ...)}}}, broadcast modes {{{(mod0, mod1, mod2, ...)}}} where each mode corresponds to an output as well as {{{order}}} which determines if we broadcast/accumulate over the first or last dimensions (the looping order, basically, but some operations are only valid for one particular order!). - -The broadcast mode serves to calculate the rank of the corresponding output and how to map each input element to an output element: - - * {{{broadcast}}} - * output.rank = max(input.rank) - * the inputs of lesser rank are broadcasted over missing dimensions - * if {{{order == f}}} ([3, 5], [5]) => [3, 5] or ([7, 8, 9], [8, 9]) => [7, 8, 9] - * if {{{order == c}}} ([3, 5], [3]) => [3, 5] or ([7, 8, 9], [7, 8]) => [7, 8, 9] - * {{{(accumulate, Accumulator)}}} - * output.rank = min(input.rank) - * for the inputs of greater rank, we use Accumulator (sum, product, etc.) to accumulate over the first dimensions - - * e.g. {{{if Accumulator == sum, order == c, x.rank == 2, y.rank == 1 and z = f(x, y) then z[i] = f(sum_j(x[i, j]), y[i])}}} - - * if {{{order == f}}} ([3, 5], [5]) => [5] or ([7, 8, 9], [8, 9]) => [8, 9] - * if {{{order == c}}} ([3, 5], [3]) => [3] or ([7, 8, 9], [7, 8]) => [7, 8] - -{{{order == c}}} is equivalent to transposing the outputs of an {{{order == f}}} operation on transposed inputs. - -This does not cover all cases of broadcasting, but I believe they cover enough. Other cases of broadcasting can be emulated with proper transposition and/or slicing. - * Could you give some examples of what kinds of broadcasting are and are not covered by your proposed implementation? 
- - * For rank <= 2, I think only operations of the form {{{add(ones(3,1), ones(1,3)))}}} are missing. I actually didn't think of that one before now. - * In general, it only handles f(shape(head, ...), shape(head, ...), ...) and f(shape(..., tail), shape(..., tail), ...) - * Maybe I could add a general case later... the thing is that I think the ones I am considering here are easier to streamline. - -Point of clarification: the order discussed here corresponds to a set of broadcasting rules, and is independent from the storage order. The 'f' order corresponds to numpy's broadcasting rules, while the 'c' order is something new and different (TODO VERIFY!) - -Question: does it make sense to apply the order to the loop, or is this broadcast order something which will be local to each input argument. What happens when the elemwise compiler deals with more complex subgraphs with multiple inputs and outputs? - -The loop -======== - -Here is the loop for {{{order == c}}}. Check for errors! - -.. code-block:: cpp - - - - i1 = -1 - while (++i1 < dim1) { - i2 = -1 - rank_N-1_accumulator = init - while (++i2 < dim2) { - ... - iN = -1 - while (++iN < dimN) { - - - - } - ... - } - - - } - -When {{{order == f}}}, the iterators ''ideally'' (but not necessarily) iterate in FORTRAN order, i.e. the while loops are on {{{dimN..dim1}}} instead of {{{dim1..dimN}}}. - -{{{order}}} does __not__ represent the {{{C/F_CONTIGUOUS}}} flags of the inputs or outputs. Depending on combinations of those parameters, different loops will be used. If {{{order == f and C_CONTIGUOUS(array)}}}, for example, the loop will be on {{{dim1..dimN}}} and the matrices of lesser rank will need to be looped over several times. - -An rewrite should look at the operations in the graph and figure out whether to allocate C_CONTIGUOUS (ideal for {{{order == c}}}) or F_CONTIGUOUS (ideal for {{{order == f}}}) arrays. - -Gradient -======== - -The input ranks become the output ranks and gradients of the same rank as the outputs are added to the input list. If an output was given mode {{{broadcast}}}, then all inputs used to calculate it had to be broadcasted to that shape, so we must sum over the broadcasted dimensions on the gradient. The mode that we give to those inputs is therefore {{{(accumulate, sum)}}}. Inversely, if an output was given mode {{{(accumulate, sum)}}}, then all inputs used to calculate it had to be summed over those dimensions. Therefore, we give them mode {{{broadcast}}} in grad. Other accumulators than sum might prove more difficult. For example, the ith gradient for product is grad*product/x_i. Not sure how to handle that automatically. - * I don't exactly follow this paragraph, but I think I catch the general idea and it seems to me like it will work very well. - - * In a nutshell for {{{broadcast}}} I calculate the gradient as normal assuming the shape is broadcasted and then I sum over what I had to broadcast. - - * Could you explain why the accumulator gradient (e.g. product) can be trickier? - - * I thought about it and I figured that the general case is {{{g_accum[N-i+1], g_m[i] = grad_fn(accum[i-1], m[i], g_accum[N-i])}}} where {{{g_accum}}} is the accumulated gradient wrt the accumulator {{{accum}}}. It can be short-circuited in sum and product's case: for sum, grad_fn is the identity on its last argument so {{{g_m[i] == g_accum[i] == g_accum[0] == g_z for all i}}}. 
In product's case, {{{accum[i-1] == product(m[1:i-1]) and g_accum[N-i] == g_z * product(m[i+1:N])}}}, multiply them together and you obtain {{{g_z * product(m)/m[i]}}} where obviously we only need to compute {{{product(m)}}} once. It's worth handling those two special cases, for the general case I don't know. diff --git a/doc/sandbox/function.rst b/doc/sandbox/function.rst deleted file mode 100644 index f5a0a29f0d..0000000000 --- a/doc/sandbox/function.rst +++ /dev/null @@ -1,9 +0,0 @@ - -.. _function: - -================== -function interface -================== - -WRITEME - diff --git a/doc/sandbox/functional.rst b/doc/sandbox/functional.rst deleted file mode 100644 index 97d4d65b52..0000000000 --- a/doc/sandbox/functional.rst +++ /dev/null @@ -1,7 +0,0 @@ - -========== -Functional -========== - -Want to know about PyTensor's `function design -`? diff --git a/doc/sandbox/how_to_make_ops.rst b/doc/sandbox/how_to_make_ops.rst deleted file mode 100644 index 9fd92e0d04..0000000000 --- a/doc/sandbox/how_to_make_ops.rst +++ /dev/null @@ -1,295 +0,0 @@ -.. _how_to_make_ops: - -################# -How to Make Ops -################# - - -Parametrization -=============== - -An Op class can represent one or a wide variety of functions depending on how you choose to parametrize it. The parameters of an Op do not show up in the structure of the computation graph - they are local to the Op. [*What does the last sentence mean? What is its effect?*] When an Op's ``make_node`` function is called on an Op instance with a list of inputs, the computation that is performed depends on the type and value of those inputs and on the internal parameters of the Op. - -It is not always obvious what should be a parameter and what should be an input. For example, a generic indexing Op could take a list and an index as graph inputs, whereas a specific indexing Op could have an index parameter, so you could have a specialized Op instance to fetch the nth element of a list, where n is known statically. [*Could you give some advice about the relative tradeoffs of having something as a parameter and something as an input?*] - -Examples of parameterized Ops in pytensor: - ``Broadcast(, )`` - upgrades an op that works on scalars so it works on tensors. Can work inplace or not. - ``Reduce(, )`` - reduces the specified axes using the provided scalar op. - ``Add()`` - adds scalars and puts the variable in a scalar whose type is inferred from the input types using ``output_type_inferrer(*inputs)`` - ``Composite()`` - makes a single Op out of a graph of scalar operations. - -[*These examples are a little abstract. I'm not sure what are the inputs and what are the parameters. Maybe also give like something that has a random seed.*] - -Ideas: - ``MyOp()`` - prints debugging information in perform or the C implementation if debug is True. - ``MyOp()`` - always use the python implementation if allow C is False (raise an exception in c_code) - -``__eq__``, ``__ne__`` and ``__hash__`` ---------------------------------------------- - -In order for certain rewrites to apply (such as the merging of duplicate -calculations by `MergeOptimizer`), it is necessary for `Op`\s that do the same -thing to compare equal. If `Op` instances are generated by a function call -(for example) then it can happen that several different `Op` instances do the -same thing; in that case you will have to override `Op.__eq__`, `Op.__ne__`, and -`Op.__hash__` for the `MergeOptimizer` to recognize them as equal. 
- -Recall: the contract for any ``__hash__`` is that ``a == b`` implies ``hash(a) == hash(b)``. - -:meth:`Op.make_node` -==================== - -The :meth:`Op.make_node` method is expected to have the following signature: - -.. code-block:: python - - make_node(self, *inputs) - -``inputs`` may be a list of anything that the user wants to provide as symbolic -input (symbolic: standing for the actual values that will be passed when the -graph is compiled into an executable function). [*The PyTensor intro should -describe symbolic in greater depth, and we should link to that from here.*] This -may or may not include Variable instances (but if you want the inputs of this Op -to sometimes be outputs of another Op, then the inputs should be Variable -instances). [*What else could they be? Constant, Values, ...*] The return value -should be an instance of [GraphStructures Apply] (see the example below). Here -are the tasks typically handled in ``make_node``. - - * Check that the inputs are valid (type checking, etc.). [*Since we don't actually have values, what can we do besides type checking?*] - * If needed, wrap the inputs in Variable instances with the proper type. - * Make the Variable instances that will serve as the outputs of the node. - * ``return Apply(self, , )`` - -The ``inputs`` and ``outputs`` arguments to ``Apply`` must be lists of -`Variable` instances (or instances of subclasses of ``Variable``). The inputs -given to `Apply` do not have to be the same as the inputs passed to -`make_node`, but it is recommended that the order corresponds. [*why?*] The -behavior of `make_node` should not depend on the structure of the graph of -[*or?*] its inputs: it may look at the type and type fields of its inputs, but -not at their owner field, because modifications to the graph structure do not -use `make_node`. - -Example: - -.. code-block:: python - - from pytensor.scalar import * - - class Add(Op): - #... - def make_node(self, x, y): - # note 1: constant, int64 and ScalarType are defined in pytensor.scalar - # note 2: constant(x) is equivalent to Constant(type=int64, data=x) - # note 3: the call int64() is equivalent to Variable(type=int64, None) or Variable(type=ScalarType(dtype = 'int64'), None) - if isinstance(x, int): - x = constant(x) - elif not isinstance(x, Variable) or not x.type == int64: - raise TypeError("expected an int64 ScalarType") - if isinstance(y, int): - y = constant(y) - elif not isinstance(y, Variable) or not x.type == int64: - raise TypeError("expected an int64 ScalarType") - inputs = [x, y] - outputs = [int64()] - node = Apply(op = self, inputs = inputs, outputs = outputs) - return node - #... - - add = Add() # I make an instance of Add - node1 = add.make_node(int64(), int64()) # I make a node with two Variable inputs - node2 = add.make_node(1, 2) # this works too - node3 = add.make_node(int64(), 79) # this works three - node4 = add.make_node(float64(), int64()) # this raises a TypeError - -[*What type is an instance of Add? It's an Apply? But that's not a Variable, and cannot be used as input for another Op.*] - -Two Apply nodes ``node1`` and ``node2`` are *assumed* by the compiler to represent the same behavior if: - 1. ``node1.op == node2.op`` - 1. ``all(input1.type == input2.type for input1, input2 in zip(node1.inputs, node2.inputs))`` - 1. ``all(output1.type == output2.type for output1, output2 in zip(node1.outputs, node2.outputs))`` - -It is considered an *error* to have conditions 1 and 2 but not condition 3. 
A corollary to those conditions is that repeated calls to ``make_node`` with the same inputs should produce equivalent nodes. - -``__call__`` ----------------- - -In ``Op``, ``__call__`` is defined in terms of ``make_node``. Instead of returning a node, it returns the output Variables directly, which is practical from a UI standpoint. Here is pseudocode: - -.. code-block:: python - - if len(outputs) is 1: - __call__(*inputs) <=> make_node(*inputs).outputs[0] - else: - __call__(*inputs) <=> make_node(*inputs).outputs - -It is not necessary or recommended to override ``__call__`` unless you want to hide some outputs from view (see hidden outputs section). - -perform -======= - -The ``perform`` method is expected to have the following signature: - -`` -perform(self, node, inputs, output_storage) -`` - -Where: - * *node*: a pointer to an Apply instance - ``node`` is assumed to be produced by a previous call to ``self.make_node``. - * *inputs*: *not* the same as ``node.inputs`` - it is a list of values. [*i.e. actually data, not just symbolic stuff?*] - * *output_storage*: *not* the same as ``node.outputs`` - it is a list of lists of length 1 where the variables of the computation must be put. - -[*Can you explain better how inputs is not node.inputs and output_storage is not node.outputs?*] - -[*Would it be better to call inputs as 'inputs_storage'?*] - -Here is an example of a properly defined ``perform``: - -.. code-block:: python - - class Add(Op): - ... - def perform(self, node, inputs, output_storage): - # this does z = x + y - x, y = inputs # extract the two inputs - z, = output_storage # extract the one storage (the comma after z is not optional) - z[0] = x + y # we must put the variable in z[0] - ... - - add = Add() # I make an instance of Add - node = add.make_node(int64(), int64()) # I make a node with two integer inputs - storage = [None] # I make my storage as a 1-element list with None - add.perform(node, (3, 7), (storage, )) # I provide the node, two inputs and storage for one output - print storage[0] # prints 10 - -[*Why is node never used in the perform function? Why is self never used?*] - -[*What does the comma after z do? Why is it not optional?*] - -The ``node`` parameter is not always needed, but might come in handy sometimes [*when?*]. There are as many entries in ``output_storage`` as there are in ``node.outputs`` and each entry is a list of length 1. The outputs must be computed from the inputs and put in those lists. The lists in ``output_storage`` must not be resized - the only allowed operation is to set or read their first element. [*Since these instructions correspond to more general principles, could you state the principles of the contract more generally and put it __above__ the example?*] - -reusing outputs ---------------- - -The output storage in ``output_storage`` might not be empty. In fact, whatever the op allocates to store the computation and puts in the storage *might* still be there the second time around. [*huh?*] This is an intended feature and it is acceptable for ``perform`` to *reuse* what is in the output storage if it is worth it. For example, if ``perform`` must add two ``1000x1000`` matrices into a new matrix of the same size and that there is already a ``1000x1000`` matrix in the corresponding output storage, it may reuse it and thus save a lot in memory and allocation time. It may also freely discard what is already there. - -Note that it is not *guaranteed* that the outputs will stick around. 
Indeed, the linker may, at its discretion, clean them up. It is not guaranteed either (though it will usually be the case) that the contents of the output storage was allocated by a previous call to ``perform``. It *is* however guaranteed that the contents are either ``None`` or a structure of the proper type which it can use. - -If the contents of the storage are ``None``, *new* storage is expected for that output (typical case is that we "gave" the output to the user so we don't own it anymore). Therefore, it is not acceptable to have a private cache of previously allocated storage unless you know what you are doing. - -Advanced note: for an Op with multiple outputs, it is possible that some of them can be reused and some others not. If an Op with multiple outputs shares storage between them, e.g. the first output is a view of the second, if the first output is reset to ``None``, the second should *not* be reused, even if it's available, because a fresh output is expected for the first. It is not recommended in general to share storage between outputs unless one of them is hidden (see hidden outputs section), because the engine does not know how to handle that situation safely. - -grad -==== - -``grad`` is an PyTensor-specific [*as opposed to?*] function - it does not interface with core rewrite and compilation facilities, but it provides a useful interface to differentiation. Its expected signature is: - -.. code-block:: python - - grad(self, inputs, output_gradients) - - -where: - * ``inputs`` is a list of Variable instances. It is assumed to be the ``inputs`` field of a node produced by ``make_node``. - * ``output_gradients`` is a list of Variable instances. They have the same properties as the outputs of the node, but are filled with gradient values. - -Essentially, the semantics are: - -.. code-block:: python - - # Not completely sure about this, James should doublecheck -jpt and ob - def grad(self, (x, ), (gz, )): - return [gz * (dz/dx)] - def grad(self, (x, y), (gz, )): - return gz*(dz/dx), gz*(dz/dy) - def grad(self, (x, y), (gz, gw)): - # In this situation you want two return values that have the shape of x and y respectively - return gz*dz/dx + gw*dw/dx, gz*dz/dy + gw*dw/dy - -More specifically, -``grad`` must return a list or tuple of input gradients, as many as there are inputs. Let C be a Variable (currently assumed to be a scalar) that depends through an PyTensor symbolic expression on the node outputs. Then each output_gradients[i] represents symbolically dC/doutputs[i]. The returned input gradients should represent symbolically dC/dinputs[i]. - -Example: - -.. code-block:: python - - class Mul(Op): - ... - def grad(self, inputs, output_gradients): - x, y = inputs - gz, = output_gradients # here again, the comma is not optional - return mul(gz, y), mul(gz, x) - ... - mul = Mul() - -If the op is not differentiable wrt one of its inputs, the gradient for that input should be ``None``; if the op is not differentiable with respect to any of its inputs, it should return something equivalent to -``[None] * len(inputs)``. If ``grad`` is not implemented for any op in a graph, then the symbolic gradient engine will complain (with an attribute exception). - - - -If the op only has one input, be careful to still return a list or tuple: - * fine: ``return gx,`` - * fine: ``return [gx]`` - * not fine: ``return gx`` - -The [http://www.iro.umontreal.ca/~pift6266/A06/cours/gradient.pdf principle] behide this is explaned in section 2. 
- -Destroyers and viewers -====================== - -Destroyers ----------- - -An Op may change the contents of its inputs. For example, ``z = add_inplace(x, y)`` will increment ``x`` with ``y``, erasing the previous contents of ``x``. ``z`` represents ``x`` after it was incremented. However, the engine needs to be told about all this so it can guarantee that ``add_inplace`` will only be executed as soon as we don't need ``x`` anywhere else. - -This is done by setting the ``destroy_map`` field of the op. ``destroy_map`` must be a dictionary which associates an output index or ``None`` to a list of input indices that are destroyed by that output. For example, ``add_inplace.destroy_map == {0: [0]``} because the first input is overwritten by the first output. If it was ``y`` that was overwritten, then ``destroy_map`` would be ``{0: [1]``}, because the second input is overwritten by the first output. In a nutshell, to each output must correspond the list of inputs that were changed and share storage with that output. Use ``None`` if the inputs were only destroyed to do temporary calculations, etc. and are not reused as the output storage. - -Viewers -------- - -Similarly, an Op might not modify the inputs, but return an output which shares state with one or several of its inputs. For example, ``transpose`` can be done efficiently by viewing the same data as the original with modified dimensions and strides. That is fine, but the compiler needs to be told. - -This is done by setting the ``view_map`` field of the op. It works like the ``destroy_map`` field: to an output index is associated the list of inputs that it shares state with. For example, ``transpose.view_map == {0: [0]``} because its first output uses the same data as its first input. ``view_map`` is conservative: if there is any probability that an output will be the view of an input, that input must be in the view list of that output. - -Important note: currently, an output can only be the view of one input. This is limiting, as an 'if' or 'switch' op would need to declare its output as a view of both its then and else branches, but for the time being the framework is not powerful enough to handle it. A future version should address this issue. - -Hidden outputs (as a form of op state) -====================================== - -For performance purposes, an ``op`` might want to have a hidden internal state. - -Example: if we expect to call the op repeatedly on incrementally bigger inputs, we might want private output storage that's a lot bigger than needed and take incrementally bigger views on it, to save allocation overhead. In order to do this, we can have two outputs: one that we will return normally and will contain the answer and the other that will be the (larger) container. In this case, the advanced note in the 'reusing outputs' section applies. Furthermore, ``__call__`` should be overridden to only return the first output instead of both of them. Here is what the example's ``perform`` and ``__call__`` would look like: - -.. code-block:: python - - class Add(Op): - """ - Use a hidden buffer to prevent unnecessary reallocation of memory. - """ - default_output = 0 - def make_node(self, x, y): - return Apply(self, [x,y], [x.type.make_variable(), x.type.make_variable()]) - - def perform(self, node, (x, y), (z, stor)): - if z[0] is None or stor[0] is None: - stor[0] = numpy.ndarray(x.size * 2) - else: - if x.size > stor[0].size: - stor[0].resize(x.size * 2, refcheck = 0) - z[0] = stor[0][:x.size] - numpy.add(x, y, z[0]) - ... 
- -Another example: for a FFTW Op, we would like to cache FFTW's plan along -with the inputs it was computed on, so we can reuse it if the inputs -are similar to the previous ones. - -It is also possible but potentially more complicated to use "private -inputs" to do the same thing: inputs cannot be set, though their contents -can be modified, so a wrapper would be needed and the input must be -marked as 'destroyed' by the Op using the 'destroy_map' field. diff --git a/doc/sandbox/index.rst b/doc/sandbox/index.rst deleted file mode 100644 index afbff2cb5e..0000000000 --- a/doc/sandbox/index.rst +++ /dev/null @@ -1,11 +0,0 @@ -:orphan: - -========================================================= -Sandbox, this documentation may or may not be out-of-date -========================================================= - -.. toctree:: - :glob: - - * - diff --git a/doc/sandbox/index2.rst b/doc/sandbox/index2.rst deleted file mode 100644 index 8b1c02b948..0000000000 --- a/doc/sandbox/index2.rst +++ /dev/null @@ -1,15 +0,0 @@ - -.. _advanced: - -==================================== -Advanced Topics (under construction) -==================================== - -.. toctree:: - :maxdepth: 2 - - compilation - ccodegen - function - debugging_with_stepmode - diff --git a/doc/sandbox/interactive_debugger.rst b/doc/sandbox/interactive_debugger.rst deleted file mode 100644 index c72fd3f206..0000000000 --- a/doc/sandbox/interactive_debugger.rst +++ /dev/null @@ -1,56 +0,0 @@ -==================== -Interactive Debugger -==================== - -'''Seed of discussion for what an interactive debugging tool might look like. 2009.03.27.''' - -== Interactive debugger ( #352 ) == - -The interactive debugger should allow the user to go step by step in a graph to debug it. It should allow setting breakpoints on arbitrary Ops or subgraphs. If we can group ops by the user's function that defined them, we could have a logical grouping of the graph into subgraphs. - -The debugger should save the inputs at each step so the user loses no info through inplace operations. Ideally, the debugger should be a normal python shell enriched with commands to control the flow and all the inputs should be made available so the user can use numpy interactively on them. - -Command wishlist - * py_perform (perform the current operation using the python implementation) - * c_perform (perform the current operation using the C implementation) - * perform (use the Linker's preference) - * get_inputs (get the inputs of the current op) - * set_inputs (set the inputs of the current op) - * get_outputs (get the outputs of the current op) - * set_outputs (set the outputs of the current op (bypasses its perform)) - * next (perform and go to the next breakpoint) - * breakpoint (set a breakpoint on the current Op or subgraph) - * step (perform and go to the next Op or subgraph) - * step_in (go to the first Op inside the current subgraph) - * step_out (exit the subgraph containing this Op) - * Of course, normal python shell functionality! - * The global context where the debugger was called (so the user can define his own helper functions, etc.) - -A good, simple way to do it would be to have those commands as methods of a structure that would be returned by a DebugLinker. 
This would allow an interactive session like the following: - -{{{ ->>> a, b, c = Tensor(), Tensor(), Tensor() ->>> d = b * c ->>> e = a + d ->>> debug = make_function(DebugLinker(FunctionGraph([a, b, c], [e]))) ->>> debug.set_breakpoint(d) ->>> debug.debug(10, 20, 30) # a, b, c = 10, 20, 30 -Now at: Mul(b, c) -Context: d = b * c ->>> debug.get_inputs() # we are at the node d = b * c -[20, 30] ->>> debug.get_outputs() -[None] ->>> debug.py_perform() ->>> debug.get_outputs() -[600] ->>> debug.step() -Now at: Add(a, Mul) -Context: e = a + d ->>> debug.get_inputs() -[30, 600] ->>> debug.step() -Finished. -[630] ->>> -}}} diff --git a/doc/sandbox/logistic_regression_example.rst b/doc/sandbox/logistic_regression_example.rst deleted file mode 100644 index 1631dcce1e..0000000000 --- a/doc/sandbox/logistic_regression_example.rst +++ /dev/null @@ -1,77 +0,0 @@ -.. _logistic_regression_example: - -State example -============= - -In this example, we'll look at a complete logistic regression model, with -training by gradient descent. - -BUT, YOU GOTTA RUN THIS CODE AND MAKE SURE IT STILL WORKS NICELY, HEY? - -.. code-block:: python - - def build_logistic_regression_model(n_in, n_out, l2_coef=30.0) - # DECLARE SOME VARIABLES - - import pytensor.tensor as pt - - x = pt.matrix() #our points, one point per row - y = pt.matrix() #store our labels as place codes (label 3 of 5 is vector [00100]) - - w = pt.matrix() #the linear transform to apply to our input points - b = pt.vector() #a vector of biases, which make our transform affine instead of linear - - stepsize = pt.scalar('stepsize') # a stepsize for gradient descent - - # REGRESSION MODEL AND COSTS TO MINIMIZE - - prediction = pt.softmax(pt.dot(x, w) + b) - cross_entropy = pt.sum(y * pt.log(prediction), axis=1) - cost = pt.sum(cross_entropy) + l2_coef * pt.sum(pt.sum(w*w)) - - # GET THE GRADIENTS NECESSARY TO FIT OUR PARAMETERS - - grad_w, grad_b = pt.grad(cost, [w, b]) - - # - # GET THE GRADIENTS NECESSARY TO FIT OUR PARAMETERS - - update_fn = pytensor.function( - inputs = [x, y, stepsize, - In(w, - name='w', - value=numpy.zeros((n_in, n_out)), - update=w - stepsize * grad_w, - mutable=True, - strict=True) - In(b, - name='b', - value=numpy.zeros(n_out), - update=b - lr * grad_b, - mutable=True, - strict=True) - ], - outputs = cost, - mode = 'EXPENSIVE_OPTIMIZATIONS') - - apply_fn = pytensor.function( - inputs = [x, In(w, value=update_fn.storage[w]), In(b, value=update_fn.storage[b])], - outputs = [prediction]) - - return update_fn, apply_fn - - #USUALLY THIS WOULD BE IN A DIFFERENT FUNCTION/CLASS - #FIT SOME DUMMY DATA: 100 points with 10 attributes and 3 potential labels - - up_fn, app_fn = build_logistic_regression_model(n_in=10, n_out=3, l2_coef=30.0) - - x_data = numpy.random.standard_normal((100, 10)) - y_data = numpy.random.standard_normal((100, 3)) - y_data = _asarray(y_data == numpy.max(y_data, axis=1), dtype='int64') - - print "Model Training ..." 
- for iteration in range(1000): - print " iter", iteration, "cost", update_fn(x_data, y_data, stepsize=0.0001) - - print "Model Predictions" - print apply_fn(x_data) diff --git a/doc/sandbox/performance.rst b/doc/sandbox/performance.rst deleted file mode 100644 index a62b00b345..0000000000 --- a/doc/sandbox/performance.rst +++ /dev/null @@ -1,23 +0,0 @@ - -=========== -Performance -=========== - -PyTensor uses several tricks to obtain good performance: - * common sub-expression elimination - * [custom generated] C code for many operations - * pre-allocation of temporary storage - * loop fusion (which gcc normally can't do) - -On my neural net experiments for my course projects, I was getting around 10x -speed improvements over basic numpy by using pytensor. -[More specific speed tests would be nice.] - - -With a little work, PyTensor could also implement more sophisticated -rewrites: - - * automatic ordering of matrix multiplications - * profile-based memory layout decisions (e.g. row-major vs. col-major) - * gcc intrinsics to use MMX, SSE2 parallelism for faster element-wise arithmetic - * conditional expressions diff --git a/doc/sandbox/randomnumbers.rst b/doc/sandbox/randomnumbers.rst deleted file mode 100644 index fcdded1c2b..0000000000 --- a/doc/sandbox/randomnumbers.rst +++ /dev/null @@ -1,245 +0,0 @@ -.. _sandbox_randnb: - -============== -Random Numbers -============== - -''' This has been implemented (#182). 20090327.''' - -= Random Numbers = - -== Requirements == - - -PyTensor functions sometimes need random numbers. -Random operations are not as simple as other operations such as ones_like, or pow(), because the output must be different when we call the same function repeatedly. CompileFunction's new default-valued, updatable input variables make this possible. At the same time we need random streams to be repeatable, and easy to work with. So the basic requirements of our random number mechanism are: - - 1. Internal random number generators must be used in a clear manner, and be accessible to the caller after a function has been compiled. - 1. A random-number-producing Op (from now on: {{{RandomOp}}}) should generally produce exactly the same stream of random numbers regardless of any other {{{RandomOp}}} instances in its own graph, and any other times the graph was compiled. - 1. A {{{RandomOp}}}'s stream should be isolated from other {{{RandomOp}}} instances in a compiled graph, so that it is possible to adjust any one {{{RandomOp}}} independently from the others. - 1. It should be easy to put the {{{RandomOp}}}s in a graph into a state from which their outputs are all independent. - 1. It should be easy to save the current state of the {{{RandomOp}}}s in a graph. - 1. It should be easy to re-instate a previous state of the {{{RandomOp}}}s in a graph. - -== Basic Technical Spec == - -One option would be to skirt the issue by requiring users to pass all the random numbers we might need as input. -However, it is not always simple to know how many random numbers will be required because the shape of a random matrix might be computed within the graph. -The solution proposed here is to pass one or more random number generators as input to {{{pytensor.function}}}. - -Sharing a random number generator between different {{{RandomOp}}} instances makes it difficult to producing the same stream regardless of other ops in graph, and to keep {{{RandomOps}}} isolated. -Therefore, each {{{RandomOp}}} instance in a graph will have its very own random number generator. 
-That random number generator is an input to the function. -In typical usage, we will use the new features of function inputs ({{{value}}}, {{{update}}}) to pass and update the rng for each {{{RandomOp}}}. -By passing RNGs as inputs, it is possible to use the normal methods of accessing function inputs to access each {{{RandomOp}}}'s rng. -In this approach it there is no pre-existing mechanism to work with the combined random number state of an entire graph. -So the proposal is to provide the missing functionality (the last three requirements) via auxiliary functions: {{{seed, getstate, setstate}}}. - -== Syntax == - -.. code-block:: python - - #!python - # create a random generator, providing a default seed to condition how RandomOp instances are produced. - from pytensor.compile.function import function - - - r = MetaRandom(metaseed=872364) - - # create a different random generator - rr = MetaRandom(metaseed=99) - - # create an Op to produce a stream of random numbers. - # This generates random numbers uniformly between 0.0 and 1.0 excluded - # u will remember that it was made from r. - u = r.uniform(shape=(3,4,5), low=0.0, high=1.0) - - # create a second Op for more random numbers - # v will remember that it was made from r. - v = r.uniform(shape=(8,), low=-1.0, high=0.0) - - # create a third Op with a different underlying random state - # w will remember that it was made from rr. - w = rr.uniform(shape=(), low=-10., high=10.) - - # compile a function to draw random numbers - # note: un-named state inputs will be added automatically. - # note: it is not necessary to draw samples for u, even though - # u was created by r before v. - fn_v = function([], [v]) - - # this prints some representation of v's rng in fn_v. - # The .rng property works for Result instances produced by MetaRandom. - print fn_v.state[v.rng] - - # compile a function to draw each of u, v, w - # note: un-named state inputs will be added automatically - # note: This function (especially its internal state) is independent from fn_v. - fn_uvw = function([], [u,v,w]) - - # N.B. The random number streams of fn_v and fn_uvw are independent. - assert fn_v.state[v.rng] != fn_uvw.state[v.rng] - - fn_v() # returns random numbers A (according to metaseed 872364) - fn_v() # returns different random numbers B - - # note that v's stream here is identical to the one in fn_v() - fn_uvw() # returns random numbers C, A, E - - #explicitly re-seed v's random stream in fn_v - r.seed(fn_v, 872364) - fn_v() # returns random numbers A (as above) - fn_v() # returns random numbers B (as above) - - #re-seed w's random stream in fn_uvw, but not u's or v's - rr.seed(fn_uvw, 99) - fn_uvw() # returns random numbers D, B, E - - -== {{{MetaRandom}}} == - -The {{{MetaRandom}}} class is the proposed interface for getting {{{RandomOp}}} instances. -There are some syntactic similarities in the way {{{MetaRandom}}} is used to construct graphs, and the way {{{numpy.RandomState}}} appears in a corresponding procedural implementation. But since pytensor is symbolic the meaning of {{{MetaRandom}}} is quite different. - -As with {{{numpy.RandomState}}} though, a global instance of {{{MetaRandom}}} will be instantiated at import time for the scripter's convenience. - -A {{{MetaRandom}}} instance will remember every {{{Result}}} that it returns during its lifetime. -When calling functions like {{{seed, setstate}}}, this list is consulted so that only the streams associated with Results returned by {{{self}}} are modified. 
-The use of multiple {{{MetaRandom}}} objects in a single function is mostly for debugging (e.g., when you want to synchronize two sets of random number streams). - -The typical case is that only one (global) {{{MetaRandom}}} object is used to produce all the random streams in a function, so seeding (once) will reset the entire function. - -.. code-block:: python - - class MetaRandom(obj): - def __init__(self, metaseed=): ... # new functions will be initialized so that seed(fn, ) has no effect on output. - - def __contains__(self, Result): ... # True if Result was returned by a call to self. - def results(self): ... # Iterate over returned Result instances in creation order. - - def seed(self, fn, bits): ... # See below. - def getstate(self, fn): ... # See below. - def setstate(self, fn, state): ... # See below. - - def uniform(...): ... # return a Result of an Apply of a RandomOp. - # The return value is also stored internally for __contains__ and results(). - def normal(...): ... - def bernoulli(...): ... - ... - - -=== {{{MetaRandom.getstate}}} === - -.. code-block:: python - - def getstate(self, fn): ... - - ''return'':: - list, set, dict, instance... something to store the random number generators associated with every one of {{{self}}}'s members in {{{fn}}} - -=== {{{MetaRandom.setstate}}} === - -Re-install the random number generators in {{{rstates}}} to the {{{randomobj}}} members in {{{fn}} - -.. code-block:: python - - def setstate(self, fn, rstates): .... - - ''fn:: - a CompileFunction instance, generally with some Apply instances inside that are members of {{{self}}}. - ''rstates'':: - a structure returned by a previous call to {{{getstate}}} - ''return'':: - nothing - - -=== {{{MetaRandom.seed}}} === - -.. code-block:: python - - def seed(self, fn, bits): .... - - ''fn:: - a CompileFunction instance, generally with some Apply instances inside that are members of {{{self}}}. - ''bits'':: - Something to use as a seed. Typically an integer or list of integers. - ''return'':: - None - -Set the states of self's members in fn in a deterministic way based on bits. -Each member of self should generate independent samples after this call. - -Seed is like a dynamically-computed setstate. If the user runs -.. code-block:: python - - r.seed(fn, 99) - state_99 = r.getstate(fn) - -then any time afterward both {{{r.setstate(fn, state_99)}}} and {{{r.seed(fn, 99)}}} will put {{{fn}}} into the same state. - - - -= Potential Other syntax = - - -.. code-block:: python - - #!python - # create a random state - from pytensor.compile.function import function - - - r = RandomState(name = 'r') - - # create a different random state - rr = RandomState(name = 'rr') - - # create an Op to produce a stream of random numbers. - # That stream is a function of r's seed. - # This generates random numbers uniformly between 0.0 and 1.0 excluded - u = r.uniform(shape=(3,4,5), 0.0, 1.0) - - # create a second Op for more random numbers - # This stream is seeded using a different function of r's seed. - # u and v should be independent - v = r.uniform(shape=(8,), -1.0, 0.0) - - # create a third Op with a different underlying random state - w = rr.uniform(shape=(), -10., 10.) - - # compile a function to draw random numbers - # note: it is not necessary to draw samples for u. 
- # we provide the seed for the RandomState r in the inputs list as a "Type 4" input - fn_v = function([(r, 872364)], [v]) - - # compile a function to draw each of u, v, w - # we provide the seeds for the RandomStates r and rr in the inputs list as "Type 4" inputs - # note: the random state for r here is seeded independently from the one in fn_v, which means - # random number generation of fn_v and fn_uvw will not interfere. Since the seed is the - # same, it means they will produce the same sequence of tensors for the output v. - fn_uvw = function([(r, 872364), (rr, 99)], [u,v,w]) - - - fn_v() # returns random numbers A - fn_v() # returns different random numbers B - - # note that v's stream here is identical to the one in fn_v() - fn_uvw() # returns random numbers C, A, E - - #re-seed v's random stream in fn - fn_v.r = 872364 - - ### Is this state readable? What should we do here: - print fn_v.r - - fn() # returns random numbers A - - ### Is this state well-defined? - ### Does there even exist a number such that fn_v.r = N would have no effect on the rng states? - print fn_v.r - - fn() # returns random numbers B - - #re-seed w's random stream, but not u's or v's - fn_uvw.rr = 99 - fn_uvw() # returns random numbers D, B, E diff --git a/doc/sandbox/rethinkccodegen.rst b/doc/sandbox/rethinkccodegen.rst deleted file mode 100644 index 462f424452..0000000000 --- a/doc/sandbox/rethinkccodegen.rst +++ /dev/null @@ -1,124 +0,0 @@ -'''An open proposal. This is still relevant. 20080904''' - -====================== -New C code generation? -====================== - -Issues -====== - -There are several issues with the current way C code is generated: - * Ops cannot declare their own persistent variables. - * Reliance on weave, but most of weave's features go unused. - * There could easily be conflicts between support code from different Ops/Results. - * It is currently impossible to specialize support code based on the self. - * Caching of the generated code for graphs is greatly suboptimal. - -Structure -========= - -Currently, the general structure of the generated C code is approximately as follows: - -.. code-block:: c - - - - - - struct my_computation { - - - init() { } - cleanup { } - run { } - }; - - - PyObject* instantiate(PyObject* args) { - - - - } - - -The module produced via that method then has to be used as such:: - - obj = module.instantiate(error_storage, input_storage, output_storage, orphan_storage) - cutils.run_cthunk(obj) - - -We would like to get rid of weave dependencies, avoid name conflicts with the support code and have a nicer user interface for the produced module. The proposed new structure is as follows: - -.. code-block:: c - - - - struct op1 { - - - init() { } - cleanup { } - run() { } - }; - - struct op2 { }; - ... - struct opN { }; - - struct driver { - op1 o1; op2 o2; ... opN oN; - - - init() { } - cleanup() { } - run() { - - o1.run(input1, input2); - o2.run(o1.output1); - ... 
- oN.run(...); - - } - } - - PyObject* (PyObject* inputs) { - - - driver.run() - - - } - - PyObject* _driver(PyObject* storage) { - - - } - - and _driver> - -Gains: - * support code can be put inside a struct and become private to the Op - * we can export several functions that can be used directly, eg ``z = module.add(1, 2)`` - * this won't do filtering like ``Result.filter`` so the usefulness is limited by that - * the sequence of operations might be clearer to read - * we can use more descriptive names in each Op struct representing its input names (if we can find them using the inspect module) without worrying about name conflicts - -Losses: - * maybe gcc can't optimize it as well? - * make functions static and inline as much as possible - - -Caching -======= - -The current way of caching is from a hash of the generated code. That is inefficient because code has to be generated each time, which might be a costly process. Furthermore, usage of hashing in sets make it difficult to ensure a consistent ordering of Ops in graphs where several orderings are valid, so the generated C code is potentially different each time. Here is a proposal for a better way to compute the hash: - * Result_hash = Result version + Result desc - * Op_hash = Op version + Op desc + input/output hashes - * FunctionGraph_hash = FunctionGraph version + combination of the Op hashes and their traversal order wrt a consistent traversal method - -The version could be set explicitly via a ``__version__`` field or it could simply be equal to the file's last modification date. We could also have a ``__nocache__`` field indicating that code produced by the Op or Result cannot be cached. - -It should also be easier to bypass the cache (eg an option to CLinker to regenerate the code). - - - diff --git a/doc/sandbox/sandbox.rst b/doc/sandbox/sandbox.rst deleted file mode 100644 index 4ab3e78182..0000000000 --- a/doc/sandbox/sandbox.rst +++ /dev/null @@ -1,161 +0,0 @@ -Basically, this file contains stuff that should be documented, but is not. - -Feel free to contribute things that you want documented, as well as to add -or correct documentation. - - -====================================== -How do you define the grad function? -====================================== - -Let's talk about defining the :meth:`Op.grad` function in an :class:`Op`, using an -illustrative example. - -In Poisson regression (Ranzato and Szummer, 2008), the target *t* is -integer data, which we predict using a continuous output *o*. -In the negative log likelihood of the Poisson regressor, there is a term: - -.. math:: - - \log(t!) - -Let's say we write a logfactorial :class:`Op`. We then compute the gradient - -You should define gradient, even if it is undefined. -[give log factorial example] - -If an :class:`Op` does not define ``grad``, but this :class:`Op` does not appear in the path when -you compute the gradient, then there is no problem. - -If an :class:`Op` does not define ``grad``, and this :class:`Op` *does* appear in the path when -you compute the gradient, **WRITEME**. 
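A minimal sketch of such an Op, modeled on the ``XlogX`` example further below. The ``LogFactorial`` class here is purely illustrative (it is not existing PyTensor code), although ``scipy.special.gammaln`` and ``pytensor.scalar.math.psi`` are real helpers:

.. code-block:: python

    from scipy.special import gammaln

    from pytensor import scalar
    from pytensor.scalar.math import psi
    from pytensor.tensor.elemwise import Elemwise


    class LogFactorial(scalar.UnaryScalarOp):
        """Compute log(x!) as log(Gamma(x + 1))."""

        @staticmethod
        def st_impl(x):
            return gammaln(x + 1.0)

        def impl(self, x):
            return LogFactorial.st_impl(x)

        def grad(self, inp, grads):
            (x,) = inp
            (gz,) = grads
            # For continuous x, d/dx log(x!) = digamma(x + 1).
            # If x is only meaningful as an integer, one could instead return
            # pytensor.gradient.grad_undefined(self, 0, x) here.
            return [gz * psi(x + 1)]


    scalar_logfactorial = LogFactorial(scalar.upgrade_to_float, name="scalar_logfactorial")
    logfactorial = Elemwise(scalar_logfactorial, name="logfactorial")

Defining ``grad`` this way keeps the graph differentiable when the input is treated as continuous; the ``grad_undefined`` alternative noted in the comment corresponds to case 4 in the list below.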
- -Gradients for a particular variable can be one of four kinds: -1) forgot to implement it - -You will get an exception of the following form:: - - pytensor.graph.utils.MethodNotDefined: ('grad', , 'LogFactorial') - -2) a symbolic variable -3) None / zero -4) undefined mathematically - -currently, there is no way for a ``grad()`` method to distinguish between cases 3 -and 4 -but the distinction is important because graphs with type-3 gradients are ok -to run, whereas graphs with type-4 gradients are not. -so I suggested that Joseph return a type-4 gradient by defining an :class:`Op` with no -perform method. -the idea would be that this would suit the graph-construction phase, but would -prevent linking. -how does that sound to you? - -**This documentation is useful when we show users how to write :class:`Op`\s.** - -====================================== -What is staticmethod, st_impl? -====================================== - -``st_impl`` is an optional method in an :class:`Op`. -``@staticmethod`` is a Python decorator for a class method that does not -implicitly take the class instance as a first argument. Hence, st_impl -can be used for :class:`Op` implementations when no information from the :class:`Op` -instance is needed. This can be useful for testing an implementation. -See the ``XlogX`` class below for an example. - -**This documentation is useful when we show users how to write :class:`Op`\s. -Olivier says this behavior should be discouraged but I feel that st_impl -should be encouraged where possible.** - -============================================================ -how do we write scalar ops and upgrade them to tensor ops? -============================================================ - -**Olivier says that** :class:`~pytensor.tensor.xlogx.XlogX` **gives a good example. In fact, I would -like to beef xlogx up into our running example for demonstrating how to -write an :class:`Op`:** - -.. code-block:: python - - class XlogX(scalar.UnaryScalarOp): - """ - Compute X * log(X), with special case 0 log(0) = 0. - """ - @staticmethod - def st_impl(x): - if x == 0.0: - return 0.0 - return x * numpy.log(x) - def impl(self, x): - return XlogX.st_impl(x) - def grad(self, inp, grads): - x, = inp - gz, = grads - return [gz * (1 + scalar.log(x))] - def c_code(self, node, name, inp, out, sub): - x, = inp - z, = out - if node.inputs[0].type in [scalar.float32, scalar.float64]: - return """%(z)s = - %(x)s == 0.0 - ? 0.0 - : %(x)s * log(%(x)s);""" % locals() - raise NotImplementedError('only floatingpoint is implemented') - scalar_xlogx = XlogX(scalar.upgrade_to_float, name='scalar_xlogx') - xlogx = pytensor.tensor.elemwise.Elemwise(scalar_xlogx, name='xlogx') - -**It is also necessary to talk about UnaryScalarOp vs. BinaryOp.** - -UnaryScalarOp is the same as scalar.ScalarOp with member variable nin=1. -**give an example of this** - -======================================================= -How to use the `PrintOp` -======================================================= - -** This is also useful in the How to write an :class:`Op` tutorial. ** - -======================================================= -Mammouth -======================================================= - -**This is internal documentation. 
Guillaume can you make sure to hit these points:** - -export PYTENSOR_BLAS_LDFLAGS='-lmkl -liomp5 -fopenmp' - -**Do we want the following:** - -export OMP_NUM_THREADS=2 - -======================================================= -Type checking -======================================================= - - * Are there functions for doing type checking? - like dtype of this matrix is an int-type (not just int32 - or int64) - "if isinstance(item, int):" is the preferred way to do it in - python now, so mimic this - If the type is wrong, what exception should be raised? - -====================================== -More simple numpy stuff -====================================== - - * If we have a matrix with only one row, how do we convert it to a vector? - ``x.reshape(x.size)`` - You can also use ``resize`` but there is not reason to ''resize'' - * How do you convert the type of a numpy array? - ``pytensor._asarray(x, dtype = 'int32')`` - Note that using ``numpy.asarray`` is potentially dangerous, due to - a problem in numpy where the type may not be properly set (see - numpy's Track ticket #870). - - -========================================= -How to reuse (overwrite) a storage tensor -========================================= - -``pytensor.compile.io.Out(gw1, borrow = True)`` for that value in -``pytensor.compile.function.function`` diff --git a/doc/sandbox/software.rst b/doc/sandbox/software.rst deleted file mode 100644 index 12ccc68108..0000000000 --- a/doc/sandbox/software.rst +++ /dev/null @@ -1,19 +0,0 @@ -=============== -Others software -=============== - -Other software to look at and maybe recommend to users: - -* [http://www.pytables.org/moin PyTables] - This is looking really - promising for dataset storage and experiment logging... This might - actually be useful for large data sets. -* [http://matplotlib.sourceforge.net/ MatPlotLib] - visualization tools - (plot curves interactively, like matlab's figure window) -* [http://www.pythonware.com/products/pil/ PIL] - Python Image Library: - write your matrices out in png! (Kinda a weird recommendation, I think) -* [http://www.logilab.org/857 pylint] - Syntax checker for python to - help beautify your code. (We'd be hypocrites to recommend this :) -* [http://www.winpdb.org/ Winpdb] - A Platform Independent Python - Debugger. (Except it doesn't really help you debug PyTensor graphs) -* [http://wiki.python.org/moin/IntegratedDevelopmentEnvironments Python - Integrated Development Environments] - for all your coding needs diff --git a/doc/sandbox/sparse.rst b/doc/sandbox/sparse.rst deleted file mode 100644 index 27ccb8c449..0000000000 --- a/doc/sandbox/sparse.rst +++ /dev/null @@ -1,147 +0,0 @@ -.. _sparse: - -=============== -Sparse matrices -=============== - -scipy.sparse ------------- - -Note that you want SciPy >= 0.7.2 - -.. warning:: - - In SciPy 0.6, `scipy.csc_matrix.dot` has a bug with singleton - dimensions. There may be more bugs. It also has inconsistent - implementation of sparse matrices. - - We do not test against SciPy versions below 0.7.2. - -We describe the details of the compressed sparse matrix types. - `scipy.sparse.csc_matrix` - should be used if there are more rows than column (``shape[0] > shape[1]``). - `scipy.sparse.csr_matrix` - should be used if there are more columns than rows (``shape[0] < shape[1]``). - `scipy.sparse.lil_matrix` - is faster if we are modifying the array. After initial inserts, - we can then convert to the appropriate sparse matrix format. 
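A minimal sketch of the construct-then-convert pattern recommended above (the variable names are illustrative; only standard ``scipy.sparse`` calls are used):

.. code-block:: python

    import scipy.sparse

    # Build incrementally in LIL format, which is cheap for inserts ...
    m = scipy.sparse.lil_matrix((5, 10))
    m[0, 0] = 10.0
    m[2, 3] = 30.0
    m[4, 0] = 20.0

    # ... then convert: this matrix has more columns than rows, so CSR is the
    # appropriate compressed format for subsequent arithmetic.
    m_csr = m.tocsr()

``m.tocsc()`` would be the analogous conversion for a matrix with more rows than columns.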
- -The following types also exist: - `dok_matrix` - Dictionary of Keys format. From their doc: This is an efficient structure for constructing sparse matrices incrementally. - `coo_matrix` - Coordinate format. From their lil doc: consider using the COO format when constructing large matrices. - -There seems to be a new format planned for SciPy 0.7.x: - `bsr_matrix` - Block Compressed Row (BSR). From their doc: The Block Compressed Row - (BSR) format is very similar to the Compressed Sparse Row (CSR) - format. BSR is appropriate for sparse matrices with dense sub matrices - like the last example below. Block matrices often arise in vector-valued - finite element discretizations. In such cases, BSR is considerably more - efficient than CSR and CSC for many sparse arithmetic operations. - `dia_matrix` - Sparse matrix with DIAgonal storage - -There are four member variables that comprise a compressed matrix ``sp`` (for at least csc, csr and bsr): - - ``sp.shape`` - gives the shape of the matrix. - ``sp.data`` - gives the values of the non-zero entries. For CSC, these should - be in order from (I think, not sure) reading down in columns, - starting at the leftmost column until we reach the rightmost - column. - ``sp.indices`` - gives the location of the non-zero entry. For CSC, this is the - row location. - ``sp.indptr`` - gives the other location of the non-zero entry. For CSC, there are - as many values of indptr as there are ``columns + 1`` in the matrix. - ``sp.indptr[k] = x`` and ``indptr[k+1] = y`` means that column - ``k`` contains ``sp.data[x:y]``, i.e. the ``x``-th through the y-1th non-zero values. - -See the example below for details. - -.. code-block:: python - - >>> import scipy.sparse - >>> sp = scipy.sparse.csc_matrix((5, 10)) - >>> sp[4, 0] = 20 - SparseEfficiencyWarning: changing the sparsity structure of a csc_matrix is expensive. lil_matrix is more efficient. - SparseEfficiencyWarning) - >>> sp[0, 0] = 10 - >>> sp[2, 3] = 30 - >>> sp.todense() - matrix([[ 10., 0., 0., 0., 0., 0., 0., 0., 0., 0.], - [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], - [ 0., 0., 0., 30., 0., 0., 0., 0., 0., 0.], - [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], - [ 20., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]) - >>> print sp - (0, 0) 10.0 - (4, 0) 20.0 - (2, 3) 30.0 - >>> sp.shape - (5, 10) - >>> sp.data - array([ 10., 20., 30.]) - >>> sp.indices - array([0, 4, 2], dtype=int32) - >>> sp.indptr - array([0, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3], dtype=int32) - -Several things should be learned from the above example: - -* We actually use the wrong sparse matrix type. In fact, it is the - *rows* that are sparse, not the columns. So, it would have been - better to use ``sp = scipy.sparse.csr_matrix((5, 10))``. -* We should have actually created the matrix as a `lil_matrix`, - which is more efficient for inserts. Afterwards, we should convert - to the appropriate compressed format. -* ``sp.indptr[0] = 0`` and ``sp.indptr[1] = 2``, which means that - column 0 contains ``sp.data[0:2]``, i.e. the first two non-zero values. -* ``sp.indptr[3] = 2`` and ``sp.indptr[4] = 3``, which means that column - three contains ``sp.data[2:3]``, i.e. the third non-zero value. - -TODO: Rewrite this documentation to do things in a smarter way. - -Speed ------ - -For faster sparse code: - * Construction: lil_format is fast for many inserts. - * Operators: "Since conversions to and from the COO format are - quite fast, you can use this approach to efficiently implement lots - computations on sparse matrices." 
(Nathan Bell on scipy mailing list) - -Misc ----- -The sparse equivalent of `dmatrix` is `csc_matrix` and `csr_matrix`. - -:class:`~pytensor.sparse.basic.Dot` vs. :class:`~pytensor.sparse.basic.StructuredDot` -------------------------------------------------------------------------------------- - -Often when you use a sparse matrix it is because there is a meaning to the -structure of non-zeros. The gradient on terms outside that structure -has no meaning, so it is computationally efficient not to compute them. - -`StructuredDot` is when you want the gradient to have zeroes corresponding to -the sparse entries in the matrix. - -`TrueDot` and `Structured` dot have different gradients -but their perform functions should be the same. - -The gradient of `TrueDot` can have non-zeros where the sparse matrix had zeros. -The gradient of `StructuredDot` can't. - -Suppose you have ``dot(x,w)`` where ``x`` and ``w`` are square matrices. -If ``w`` is dense, like ``standard_normal((5,5))`` and ``x`` is of full rank (though -potentially sparse, like a diagonal matrix of ones) then the output will -be dense too. -What's important is the density of the gradient on the output. -If the gradient on the output is dense, and ``w`` is dense (as we said it was) -then the ``True`` gradient on ``x`` will be dense. -If our dot is a `TrueDot`, then it will say that the gradient on ``x`` is dense. -If our dot is a `StructuredDot`, then it will say the gradient on ``x`` is only -defined on the diagonal and ignore the gradients on the off-diagonal. diff --git a/doc/sandbox/tensoroptools.rst b/doc/sandbox/tensoroptools.rst deleted file mode 100644 index 132924142f..0000000000 --- a/doc/sandbox/tensoroptools.rst +++ /dev/null @@ -1,9 +0,0 @@ - -.. _tensoroptools: - -================ -Tensor Op Tools -================ - -WRITEME - describe how to use Elemwise here - From 58fec45d00717ecebbcd4a70e08e1ed0b99e362c Mon Sep 17 00:00:00 2001 From: Diego Sandoval <46681084+twaclaw@users.noreply.github.com> Date: Fri, 26 Jul 2024 16:06:27 +0200 Subject: [PATCH 19/72] Implement nlinalg Ops in PyTorch (#920) --- pytensor/link/pytorch/dispatch/__init__.py | 2 +- pytensor/link/pytorch/dispatch/nlinalg.py | 103 +++++++++++++++++++ tests/link/pytorch/test_nlinalg.py | 111 +++++++++++++++++++++ 3 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 pytensor/link/pytorch/dispatch/nlinalg.py create mode 100644 tests/link/pytorch/test_nlinalg.py diff --git a/pytensor/link/pytorch/dispatch/__init__.py b/pytensor/link/pytorch/dispatch/__init__.py index fa47908d74..0295a12e8e 100644 --- a/pytensor/link/pytorch/dispatch/__init__.py +++ b/pytensor/link/pytorch/dispatch/__init__.py @@ -9,5 +9,5 @@ import pytensor.link.pytorch.dispatch.extra_ops import pytensor.link.pytorch.dispatch.shape import pytensor.link.pytorch.dispatch.sort - +import pytensor.link.pytorch.dispatch.nlinalg # isort: on diff --git a/pytensor/link/pytorch/dispatch/nlinalg.py b/pytensor/link/pytorch/dispatch/nlinalg.py new file mode 100644 index 0000000000..91690489e9 --- /dev/null +++ b/pytensor/link/pytorch/dispatch/nlinalg.py @@ -0,0 +1,103 @@ +import torch + +from pytensor.link.pytorch.dispatch import pytorch_funcify +from pytensor.tensor.nlinalg import ( + SVD, + Det, + Eig, + Eigh, + KroneckerProduct, + MatrixInverse, + MatrixPinv, + QRFull, + SLogDet, +) + + +@pytorch_funcify.register(SVD) +def pytorch_funcify_SVD(op, **kwargs): + full_matrices = op.full_matrices + compute_uv = op.compute_uv + + def svd(x): + U, S, V = torch.linalg.svd(x, 
full_matrices=full_matrices) + if compute_uv: + return U, S, V + return S + + return svd + + +@pytorch_funcify.register(Det) +def pytorch_funcify_Det(op, **kwargs): + def det(x): + return torch.linalg.det(x) + + return det + + +@pytorch_funcify.register(SLogDet) +def pytorch_funcify_SLogDet(op, **kwargs): + def slogdet(x): + return torch.linalg.slogdet(x) + + return slogdet + + +@pytorch_funcify.register(Eig) +def pytorch_funcify_Eig(op, **kwargs): + def eig(x): + return torch.linalg.eig(x) + + return eig + + +@pytorch_funcify.register(Eigh) +def pytorch_funcify_Eigh(op, **kwargs): + uplo = op.UPLO + + def eigh(x, uplo=uplo): + return torch.linalg.eigh(x, UPLO=uplo) + + return eigh + + +@pytorch_funcify.register(MatrixInverse) +def pytorch_funcify_MatrixInverse(op, **kwargs): + def matrix_inverse(x): + return torch.linalg.inv(x) + + return matrix_inverse + + +@pytorch_funcify.register(QRFull) +def pytorch_funcify_QRFull(op, **kwargs): + mode = op.mode + if mode == "raw": + raise NotImplementedError("raw mode not implemented in PyTorch") + + def qr_full(x): + Q, R = torch.linalg.qr(x, mode=mode) + if mode == "r": + return R + return Q, R + + return qr_full + + +@pytorch_funcify.register(MatrixPinv) +def pytorch_funcify_Pinv(op, **kwargs): + hermitian = op.hermitian + + def pinv(x): + return torch.linalg.pinv(x, hermitian=hermitian) + + return pinv + + +@pytorch_funcify.register(KroneckerProduct) +def pytorch_funcify_KroneckerProduct(op, **kwargs): + def _kron(x, y): + return torch.kron(x, y) + + return _kron diff --git a/tests/link/pytorch/test_nlinalg.py b/tests/link/pytorch/test_nlinalg.py new file mode 100644 index 0000000000..7d69ac0500 --- /dev/null +++ b/tests/link/pytorch/test_nlinalg.py @@ -0,0 +1,111 @@ +import numpy as np +import pytest + +from pytensor.compile.function import function +from pytensor.configdefaults import config +from pytensor.graph.fg import FunctionGraph +from pytensor.tensor import nlinalg as pt_nla +from pytensor.tensor.type import matrix +from tests.link.pytorch.test_basic import compare_pytorch_and_py + + +@pytest.fixture +def matrix_test(): + rng = np.random.default_rng(213234) + + M = rng.normal(size=(3, 3)) + test_value = M.dot(M.T).astype(config.floatX) + + x = matrix("x") + return (x, test_value) + + +@pytest.mark.parametrize( + "func", + (pt_nla.eig, pt_nla.eigh, pt_nla.slogdet, pt_nla.inv, pt_nla.det), +) +def test_lin_alg_no_params(func, matrix_test): + x, test_value = matrix_test + + out = func(x) + out_fg = FunctionGraph([x], out if isinstance(out, list) else [out]) + + def assert_fn(x, y): + np.testing.assert_allclose(x, y, rtol=1e-3) + + compare_pytorch_and_py(out_fg, [test_value], assert_fn=assert_fn) + + +@pytest.mark.parametrize( + "mode", + ( + "complete", + "reduced", + "r", + pytest.param("raw", marks=pytest.mark.xfail(raises=NotImplementedError)), + ), +) +def test_qr(mode, matrix_test): + x, test_value = matrix_test + outs = pt_nla.qr(x, mode=mode) + out_fg = FunctionGraph([x], outs if isinstance(outs, list) else [outs]) + compare_pytorch_and_py(out_fg, [test_value]) + + +@pytest.mark.parametrize("compute_uv", [True, False]) +@pytest.mark.parametrize("full_matrices", [True, False]) +def test_svd(compute_uv, full_matrices, matrix_test): + x, test_value = matrix_test + + out = pt_nla.svd(x, full_matrices=full_matrices, compute_uv=compute_uv) + out_fg = FunctionGraph([x], out if isinstance(out, list) else [out]) + + compare_pytorch_and_py(out_fg, [test_value]) + + +def test_pinv(): + x = matrix("x") + x_inv = pt_nla.pinv(x) + + fgraph = 
FunctionGraph([x], [x_inv]) + x_np = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=config.floatX) + compare_pytorch_and_py(fgraph, [x_np]) + + +@pytest.mark.parametrize("hermitian", [False, True]) +def test_pinv_hermitian(hermitian): + A = matrix("A", dtype="complex128") + A_h_test = np.c_[[3, 3 + 2j], [3 - 2j, 2]] + A_not_h_test = A_h_test + 0 + 1j + + A_inv = pt_nla.pinv(A, hermitian=hermitian) + torch_fn = function([A], A_inv, mode="PYTORCH") + + assert np.allclose(torch_fn(A_h_test), np.linalg.pinv(A_h_test, hermitian=False)) + assert np.allclose(torch_fn(A_h_test), np.linalg.pinv(A_h_test, hermitian=True)) + + assert ( + np.allclose( + torch_fn(A_not_h_test), np.linalg.pinv(A_not_h_test, hermitian=False) + ) + is not hermitian + ) + + assert ( + np.allclose( + torch_fn(A_not_h_test), np.linalg.pinv(A_not_h_test, hermitian=True) + ) + is hermitian + ) + + +def test_kron(): + x = matrix("x") + y = matrix("y") + z = pt_nla.kron(x, y) + + fgraph = FunctionGraph([x, y], [z]) + x_np = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=config.floatX) + y_np = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=config.floatX) + + compare_pytorch_and_py(fgraph, [x_np, y_np]) From 7fd8cbd1d83ef6a2b21008f6c9b6e45a6d7a12cc Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Wed, 17 Jul 2024 13:08:28 -0700 Subject: [PATCH 20/72] Update for m1 --- environment.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/environment.yml b/environment.yml index 95bb58c06c..033765302f 100644 --- a/environment.yml +++ b/environment.yml @@ -7,7 +7,7 @@ name: pytensor-dev channels: - conda-forge dependencies: - - python>=3.10 + - python=3.10 - compilers - numpy>=1.17.0,<2 - scipy>=0.14,<1.14.0 @@ -18,9 +18,7 @@ dependencies: - cons - pydeprecate # Intel BLAS - - mkl - - mkl-service - - libblas=*=*mkl + - libblas=*=*accelerate # numba backend - numba>=0.57 # For testing From a5587a7d3041cd6e742a7f8c231cd87b6c172022 Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Sun, 21 Jul 2024 16:06:37 -0700 Subject: [PATCH 21/72] Add new env file --- environment-osx-arm64.yml | 53 +++++++++++++++++++++++++++++++++++++++ environment.yml | 6 +++-- 2 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 environment-osx-arm64.yml diff --git a/environment-osx-arm64.yml b/environment-osx-arm64.yml new file mode 100644 index 0000000000..033765302f --- /dev/null +++ b/environment-osx-arm64.yml @@ -0,0 +1,53 @@ +# To use: +# +# $ conda env create -f environment.yml # `mamba` works too for this command +# $ conda activate pytensor-dev +# +name: pytensor-dev +channels: + - conda-forge +dependencies: + - python=3.10 + - compilers + - numpy>=1.17.0,<2 + - scipy>=0.14,<1.14.0 + - filelock>=3.15 + - etuples + - logical-unification + - miniKanren + - cons + - pydeprecate + # Intel BLAS + - libblas=*=*accelerate + # numba backend + - numba>=0.57 + # For testing + - coveralls + - diff-cover + - mypy + - types-setuptools + - pytest + - pytest-cov + - pytest-xdist + - pytest-benchmark + - pytest-mock + - pip: + - pytest-sphinx + # For building docs + - sphinx>=5.1.0,<6 + - sphinx_rtd_theme + - pygments + - pydot + - ipython + - pymc-sphinx-theme + - sphinx-design + # code style + - ruff + # developer tools + - pandas # required to run mypy script + - pre-commit + - packaging + # optional + - cython + - graphviz + - pydot diff --git a/environment.yml b/environment.yml index 033765302f..95bb58c06c 100644 --- a/environment.yml +++ b/environment.yml @@ -7,7 +7,7 @@ name: pytensor-dev channels: - conda-forge dependencies: - - python=3.10 + - 
python>=3.10 - compilers - numpy>=1.17.0,<2 - scipy>=0.14,<1.14.0 @@ -18,7 +18,9 @@ dependencies: - cons - pydeprecate # Intel BLAS - - libblas=*=*accelerate + - mkl + - mkl-service + - libblas=*=*mkl # numba backend - numba>=0.57 # For testing From a09fa7588cf5524cca06e0fbf79177f8e7b50d5a Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Sun, 21 Jul 2024 16:07:57 -0700 Subject: [PATCH 22/72] Update comment --- environment-osx-arm64.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment-osx-arm64.yml b/environment-osx-arm64.yml index 033765302f..1ab4c81a36 100644 --- a/environment-osx-arm64.yml +++ b/environment-osx-arm64.yml @@ -17,7 +17,7 @@ dependencies: - miniKanren - cons - pydeprecate - # Intel BLAS + # Apple BLAS - libblas=*=*accelerate # numba backend - numba>=0.57 From d6254afaa9447580c5627b7c39929e8041fa4892 Mon Sep 17 00:00:00 2001 From: Thomas Wiecki Date: Mon, 22 Jul 2024 11:55:55 +0200 Subject: [PATCH 23/72] Update environment-osx-arm64.yml Co-authored-by: Ben Mares --- environment-osx-arm64.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment-osx-arm64.yml b/environment-osx-arm64.yml index 1ab4c81a36..0d624aa55c 100644 --- a/environment-osx-arm64.yml +++ b/environment-osx-arm64.yml @@ -7,7 +7,7 @@ name: pytensor-dev channels: - conda-forge dependencies: - - python=3.10 + - python=>3.10 - compilers - numpy>=1.17.0,<2 - scipy>=0.14,<1.14.0 From 23427a0a375bfca82bd85e35a2f247759fbdcff1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Jul 2024 17:32:59 +0000 Subject: [PATCH 24/72] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.4 → v0.5.5](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.4...v0.5.5) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b34d53b80..118a371e78 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: )$ - id: check-merge-conflict - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.4 + rev: v0.5.5 hooks: - id: ruff args: ["--fix", "--output-format=full"] From f25a624a42849374807368d6b056c254a4ea2044 Mon Sep 17 00:00:00 2001 From: Jesse Grabowski Date: Fri, 19 Apr 2024 17:57:34 +0200 Subject: [PATCH 25/72] Implement Einsum Co-authored-by: Adrian Seyboldt Co-authored-by: Jesse Grabowski <48652735+jessegrabowski@users.noreply.github.com> Co-authored-by: Ricardo Vieira <28983449+ricardov94@users.noreply.github.com> Co-authored-by: Rob Zinkov --- pytensor/link/jax/dispatch/__init__.py | 1 + pytensor/link/jax/dispatch/einsum.py | 20 + pytensor/tensor/__init__.py | 1 + pytensor/tensor/basic.py | 34 +- pytensor/tensor/einsum.py | 760 +++++++++++++++++++++++ pytensor/tensor/functional.py | 2 +- pytensor/tensor/rewriting/__init__.py | 3 +- pytensor/tensor/rewriting/basic.py | 121 ++++ pytensor/tensor/rewriting/blockwise.py | 133 ++-- pytensor/tensor/rewriting/einsum.py | 53 ++ pytensor/tensor/rewriting/ofg.py | 27 +- pytensor/tensor/rewriting/shape.py | 120 ++-- pytensor/tensor/shape.py | 6 +- tests/link/jax/test_einsum.py | 38 ++ tests/tensor/rewriting/test_blockwise.py | 37 +- tests/tensor/rewriting/test_einsum.py | 39 ++ tests/tensor/rewriting/test_shape.py | 46 ++ tests/tensor/test_basic.py | 4 +- tests/tensor/test_einsum.py | 263 ++++++++ 
tests/tensor/test_shape.py | 12 +- 20 files changed, 1569 insertions(+), 151 deletions(-) create mode 100644 pytensor/link/jax/dispatch/einsum.py create mode 100644 pytensor/tensor/einsum.py create mode 100644 pytensor/tensor/rewriting/einsum.py create mode 100644 tests/link/jax/test_einsum.py create mode 100644 tests/tensor/rewriting/test_einsum.py create mode 100644 tests/tensor/test_einsum.py diff --git a/pytensor/link/jax/dispatch/__init__.py b/pytensor/link/jax/dispatch/__init__.py index f4098416b8..00976f221c 100644 --- a/pytensor/link/jax/dispatch/__init__.py +++ b/pytensor/link/jax/dispatch/__init__.py @@ -4,6 +4,7 @@ # Load dispatch specializations import pytensor.link.jax.dispatch.blas import pytensor.link.jax.dispatch.blockwise +import pytensor.link.jax.dispatch.einsum import pytensor.link.jax.dispatch.elemwise import pytensor.link.jax.dispatch.extra_ops import pytensor.link.jax.dispatch.pad diff --git a/pytensor/link/jax/dispatch/einsum.py b/pytensor/link/jax/dispatch/einsum.py new file mode 100644 index 0000000000..3080f6964f --- /dev/null +++ b/pytensor/link/jax/dispatch/einsum.py @@ -0,0 +1,20 @@ +import jax.numpy as jnp + +from pytensor.link.jax.dispatch import jax_funcify +from pytensor.tensor.einsum import Einsum + + +@jax_funcify.register(Einsum) +def jax_funcify_Einsum(op, **kwargs): + """Dispatch einsum to JAX. + + This dispatch is triggered only when we couldn't optimize einsum at the PyTensor level. + This happens when some of the dimension lengths are unknown. This is never a problem in JAX, + as it always compiles a function per runtime input shape. + """ + subscripts = op.subscripts + + def einsum(*operands): + return jnp.einsum(subscripts, *operands, optimize="optimal") + + return einsum diff --git a/pytensor/tensor/__init__.py b/pytensor/tensor/__init__.py index 81cabfa6bd..7385f02478 100644 --- a/pytensor/tensor/__init__.py +++ b/pytensor/tensor/__init__.py @@ -151,6 +151,7 @@ def _get_vector_length_Constant(op: Op | Variable, var: Constant) -> int: # isort: off +from pytensor.tensor.einsum import einsum from pytensor.tensor.functional import vectorize # isort: on diff --git a/pytensor/tensor/basic.py b/pytensor/tensor/basic.py index 119c44c647..9eaa04c522 100644 --- a/pytensor/tensor/basic.py +++ b/pytensor/tensor/basic.py @@ -1700,21 +1700,22 @@ def do_constant_folding(self, fgraph, node): return False for client, idx in clients: - if isinstance(client.op, Output): + client_op = client.op + if isinstance(client_op, Output): # If the output is a constant, it will have to be deepcopied # each time the function is called. So we do not fold. return False - # Allow alloc to be lifted out of Elemwise before constant folding it - elif isinstance(client.op, Elemwise): - return None + # Op's through which Alloc can be lifted + elif isinstance(client_op, Elemwise | DimShuffle | Alloc | Join): + return False # Same for Blockwise, unless it has no batch_dims - elif isinstance(client.op, Blockwise) and client.op.batch_ndim(client): - return None + elif isinstance(client_op, Blockwise) and client.op.batch_ndim(client): + return False elif ( # The following ops work inplace of their input id 0. 
idx == 0 and isinstance( - client.op, + client_op, pytensor.tensor.subtensor.IncSubtensor | pytensor.tensor.subtensor.AdvancedIncSubtensor1 | pytensor.tensor.subtensor.AdvancedIncSubtensor @@ -2035,10 +2036,15 @@ def transpose(x, axes=None): _x = as_tensor_variable(x) if axes is None: - axes = list(range((_x.type.ndim - 1), -1, -1)) + axes = tuple(range((_x.type.ndim - 1), -1, -1)) + + if tuple(axes) == tuple(range(len(axes))): + # No-op + return _x + ret = DimShuffle(tuple(s == 1 for s in _x.type.shape), axes)(_x) - if _x.name and axes == list(range((_x.type.ndim - 1), -1, -1)): + if _x.name and axes == tuple(range((_x.type.ndim - 1), -1, -1)): ret.name = _x.name + ".T" return ret @@ -3950,6 +3956,10 @@ def moveaxis( source = normalize_axis_tuple(source, a.ndim, "source") destination = normalize_axis_tuple(destination, a.ndim, "destination") + if source == destination: + # It's a no-op + return a + if len(source) != len(destination): raise ValueError( "`source` and `destination` arguments must have the same number of elements" @@ -4260,9 +4270,7 @@ def atleast_Nd( atleast_3d = partial(atleast_Nd, n=3) -def expand_dims( - a: np.ndarray | TensorVariable, axis: tuple[int, ...] -) -> TensorVariable: +def expand_dims(a: np.ndarray | TensorVariable, axis: Sequence[int]) -> TensorVariable: """Expand the shape of an array. Insert a new axis that will appear at the `axis` position in the expanded @@ -4281,7 +4289,7 @@ def expand_dims( """ a = as_tensor(a) - if not isinstance(axis, tuple | list): + if not isinstance(axis, Sequence): axis = (axis,) out_ndim = len(axis) + a.ndim diff --git a/pytensor/tensor/einsum.py b/pytensor/tensor/einsum.py new file mode 100644 index 0000000000..79151a91a2 --- /dev/null +++ b/pytensor/tensor/einsum.py @@ -0,0 +1,760 @@ +import collections +import warnings +from collections.abc import Sequence +from functools import partial, reduce +from itertools import pairwise +from typing import cast + +import numpy as np +from numpy.core.einsumfunc import _find_contraction, _parse_einsum_input # type: ignore +from numpy.core.numeric import ( # type: ignore + normalize_axis_index, + normalize_axis_tuple, +) + +from pytensor.compile.builders import OpFromGraph +from pytensor.tensor import TensorLike +from pytensor.tensor.basic import ( + arange, + as_tensor, + expand_dims, + get_vector_length, + moveaxis, + stack, + transpose, + where, +) +from pytensor.tensor.extra_ops import broadcast_to +from pytensor.tensor.functional import vectorize +from pytensor.tensor.math import and_, eq, tensordot +from pytensor.tensor.shape import shape_padright +from pytensor.tensor.variable import TensorVariable + + +PATH = tuple[tuple[int] | tuple[int, int], ...] + + +class Einsum(OpFromGraph): + """ + Wrapper Op for Einsum graphs + + Notes + ----- + The `optimized` prop indicates whether the inner graph was optimized, which can only be done when all shapes are + statically known. This is now determined at graph creation time only. We could introduce a rewrite that tries to + optimize the graph if static shapes become known later (e.g., after use of `clone_replace` or shape inference during + rewrites). + + Also, once the graph is optimized, it could be inlined for potential further optimization that consider the rest of + the graph. + + This prop is different from the `optimize` kwarg in numpy that determines what kind (if any) of optimization is + desired. We haven't decided whether we want to provide this functionality. 
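As an illustrative sketch (assuming the ``pt.einsum`` helper wraps its graph in this Op when building the output), the prop can be inspected on the returned variable:

.. code-block:: python

    import pytensor.tensor as pt

    x = pt.tensor("x", shape=(5, 3))  # all shapes statically known
    y = pt.tensor("y", shape=(3, 2))
    out = pt.einsum("ij,jk->ik", x, y)
    print(out.owner.op.optimized)  # True: contraction path chosen at graph creation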
+ """ + + __props__ = ("subscripts", "path", "optimized") + + def __init__(self, *args, subscripts: str, path: PATH, optimized: bool, **kwargs): + self.subscripts = subscripts + self.path = path + self.optimized = optimized + super().__init__(*args, **kwargs, strict=True) + + +def _iota(shape: TensorVariable, axis: int) -> TensorVariable: + """ + Create an array with values increasing along the specified axis. + + Iota is a multidimensional generalization of the `arange` function. The returned array is filled with whole numbers + increasing along the specified axis. + + Parameters + ---------- + shape: TensorVariable + The shape of the array to be created. + axis: int + The axis along which to fill the array with increasing values. + + Returns + ------- + TensorVariable + An array with values increasing along the specified axis. + + Examples + -------- + In the simplest case where ``shape`` is 1d, the output will be equivalent to ``pt.arange``: + + .. testcode:: + + import pytensor.tensor as pt + from pytensor.tensor.einsum import _iota + + shape = pt.as_tensor((5,)) + print(_iota(shape, 0).eval()) + + .. testoutput:: + + [0 1 2 3 4] + + In higher dimensions, it will look like many concatenated `arange`: + + .. testcode:: + + shape = pt.as_tensor((5, 5)) + print(_iota(shape, 1).eval()) + + .. testoutput:: + + [[0 1 2 3 4] + [0 1 2 3 4] + [0 1 2 3 4] + [0 1 2 3 4] + [0 1 2 3 4]] + + Setting ``axis=0`` above would result in the transpose of the output. + """ + len_shape = get_vector_length(shape) + axis = normalize_axis_index(axis, len_shape) + values = arange(shape[axis]) + return broadcast_to(shape_padright(values, len_shape - axis - 1), shape) + + +def _delta(shape: TensorVariable, axes: Sequence[int]) -> TensorVariable: + """ + Create a Kroncker delta tensor. + + The Kroncker delta function is defined: + + .. math:: + + \\delta(i, j) = \begin{cases} 1 & \text{if} \\quad i = j \\ 0 & \text{otherwise} \\end{cases} + + To create a Kronecker tensor, the delta function is applied elementwise to the axes specified. The result is a + tensor of booleans, with ``True`` where the axis indices coincide, and ``False`` otherwise. See below for examples. + + Parameters + ---------- + shape: TensorVariable + The shape of the tensor to be created. Note that `_delta` is not defined for 1d tensors, because there is no + second axis against which to compare. + axes: sequence of int + Axes whose indices should be compared. Note that `_delta` is not defined for a single axis, because there is no + second axis against which to compare. + + Examples + -------- + An easy case to understand is when the shape is square and the number of axes is equal to the number of dimensions. + This will result in a generalized identity tensor, with ``True`` along the main diagonal: + + .. testcode:: + + from pytensor.tensor.einsum import _delta + print(_delta((5, 5), (0, 1)).eval()) + + .. testoutput:: + + [[ True False False False False] + [False True False False False] + [False False True False False] + [False False False True False] + [False False False False True]] + + In the case where the shape is not square, the result will be a tensor with ``True`` along the main diagonal and + ``False`` elsewhere: + + .. testcode:: + + from pytensor.tensor.einsum import _delta + print(_delta((3, 2), (0, 1)).eval()) + + .. 
testoutput:: + + [[ True False] + [False True] + [False False]] + + When there are more than two dimensions in the shape, axes can be only a subset of them, leading to different + arragements of True and False values. For example for a 3d batch of matrices, choosing axes (0, 2) will lead to + True values on the column corresponding to the batch index of each matrix: + + .. testcode:: + + from pytensor.tensor.einsum import _delta + print(_delta((3, 3, 3), (0, 2)).eval()) + + .. testoutput:: + + [[[ True False False] + [ True False False] + [ True False False]] + + [[False True False] + [False True False] + [False True False]] + + [[False False True] + [False False True] + [False False True]]] + """ + if len(axes) == 1: + raise ValueError("Need at least two axes to create a delta tensor") + base_shape = stack([shape[axis] for axis in axes]) + iotas = [_iota(base_shape, i) for i in range(len(axes))] + eyes = [eq(i1, i2) for i1, i2 in pairwise(iotas)] + result = reduce(and_, eyes) + non_axes = [i for i in range(len(tuple(shape))) if i not in axes] + return broadcast_to(expand_dims(result, non_axes), shape) + + +def _general_dot( + vars: tuple[TensorVariable, TensorVariable], + axes: Sequence[Sequence[int]], # Should be length 2, + batch_axes: Sequence[Sequence[int]], # Should be length 2, +) -> TensorVariable: + """ + Generalized dot product between two tensors. + + Ultimately ``_general_dot`` is a call to `tensor_dot`, performing a multiply-and-sum ("dot") operation between two + tensors, along a requested dimension. This function further generalizes this operation by allowing arbitrary + batch dimensions to be specified for each tensor. + + + Parameters + ---------- + vars: tuple[TensorVariable, TensorVariable] + The tensors to be ``tensor_dot``ed + axes: Sequence[Sequence[int]] + The axes along which to perform the dot product. Should be a sequence of two sequences, one for each tensor. + batch_axes: Sequence[Sequence[int]] + The batch axes for each tensor. Should be a sequence of two sequences, one for each tensor. + + Returns + ------- + TensorVariable + The result of the ``tensor_dot`` product. + + Examples + -------- + Perform a batched dot product between two 3d tensors: + + .. testcode:: + + import pytensor.tensor as pt + from pytensor.tensor.einsum import _general_dot + import numpy as np + + A = pt.tensor(shape=(3, 4, 5)) + B = pt.tensor(shape=(3, 5, 2)) + + result = _general_dot((A, B), axes=[[2], [1]], batch_axes=[[0], [0]]) + + A_val = np.empty((3, 4, 5)) + B_val = np.empty((3, 5, 2)) + print(tuple(result.shape.eval({A:A_val, B:B_val}))) + + .. testoutput:: + + (3, 4, 2) + """ + # Shortcut for non batched case + if not batch_axes[0] and not batch_axes[1]: + return tensordot(*vars, axes=axes) + + # Normalize axes, thankfully numpy helper does not sort axis! 
+ axes = [ + normalize_axis_tuple(var_axes, var.ndim) + for var, var_axes in zip(vars, axes, strict=True) + ] + batch_axes = [ + normalize_axis_tuple(var_axes, var.ndim) + for var, var_axes in zip(vars, batch_axes, strict=True) + ] + n_batch_axes = [len(var_batch_axes) for var_batch_axes in batch_axes] + + # Move batch axes to the left and recode reduction axes + new_vars = list(vars) + new_axes = list(axes) + for i, (var, var_axes, var_batch_axes, var_n_batch_axes) in enumerate( + zip(vars, axes, batch_axes, n_batch_axes, strict=True) + ): + if var_batch_axes == tuple(range(var_n_batch_axes)): + # Already on left to right order + continue + + new_var_batch_axes = tuple(range(var_n_batch_axes)) + new_var = moveaxis(var, var_batch_axes, new_var_batch_axes) + + new_var_axes = [] + for var_axis in var_axes: + batch_axes_to_the_right = len( + [batch_axis for batch_axis in var_batch_axes if batch_axis > var_axis] + ) + new_var_axes.append(var_axis + batch_axes_to_the_right) + + new_vars[i] = new_var + new_axes[i] = new_var_axes + + lhs, rhs = new_vars + lhs_axes, rhs_axes = new_axes + lhs_n_batch_axes, rhs_n_batch_axes = n_batch_axes + + # Create signature of tensordot + lhs_signature = [f"l{i}" for i in range(lhs.type.ndim)] + rhs_signature = [f"r{i}" for i in range(rhs.type.ndim)] + # Aligned axes get the same dimension name + for i, (lhs_axis, rhs_axis) in enumerate(zip(lhs_axes, rhs_axes)): + lhs_signature[lhs_axis] = rhs_signature[rhs_axis] = f"a{i}" + # Trim away the batch ndims + lhs_signature = lhs_signature[lhs_n_batch_axes:] + rhs_signature = rhs_signature[rhs_n_batch_axes:] + out_signature = [ + lhs_dim for lhs_dim in lhs_signature if not lhs_dim.startswith("a") + ] + [rhs_dim for rhs_dim in rhs_signature if not rhs_dim.startswith("a")] + signature = f"({','.join(lhs_signature)}),({','.join(rhs_signature)})->({','.join(out_signature)})" + # Adjust axes for core case + core_lhs_axes = tuple(np.array(lhs_axes) - lhs_n_batch_axes) + core_rhs_axes = tuple(np.array(rhs_axes) - rhs_n_batch_axes) + + if signature == "(),()->()": + # Just a multiplication + out = lhs * rhs + else: + out = vectorize( + partial(tensordot, axes=[core_lhs_axes, core_rhs_axes]), signature=signature + )(lhs, rhs) + + return cast(TensorVariable, out) + + +def _contraction_list_from_path( + subscripts: str, operands: Sequence[TensorVariable], path: PATH +): + """ + Generate a list of contraction steps based on the provided einsum path. + + Code adapted from einsum_opt: https://github.com/dgasmith/opt_einsum/blob/94c62a05d5ebcedd30f59c90b9926de967ed10b5/opt_einsum/contract.py#L369 + + When all shapes are known, the linked einsum_opt implementation is preferred. This implementation is used when + some or all shapes are not known. As a result, contraction will (always?) be done left-to-right, pushing intermediate + results to the end of the stack. + + Parameters + ---------- + subscripts: str + Einsum signature string describing the computation to be performed. + + operands: Sequence[TensorLike] + Tensors described by the subscripts. + + path: tuple[tuple[int] | tuple[int, int]] + A list of tuples, where each tuple describes the indices of the operands to be contracted, sorted in the order + they should be contracted. + + Returns + ------- + contraction_list: list + A list of tuples, where each tuple describes a contraction step. 
Each tuple contains the following elements: + - contraction_inds: tuple[int] + The indices of the operands to be contracted + - idx_removed: str + The indices of the contracted indices (those removed from the einsum string at this step) + - einsum_str: str + The einsum string for the contraction step + - remaining: None + The remaining indices. Included to match the output of opt_einsum.contract_path, but not used. + - do_blas: None + Whether to use blas to perform this step. Included to match the output of opt_einsum.contract_path, + but not used. + """ + fake_operands = [ + np.zeros([1 if dim == 1 else 0 for dim in x.type.shape]) for x in operands + ] + input_subscripts, output_subscript, operands = _parse_einsum_input( + (subscripts, *fake_operands) + ) + + # Build a few useful list and sets + input_list = input_subscripts.split(",") + input_sets = [set(x) for x in input_list] + output_set = set(output_subscript) + + # Build contraction tuple (positions, gemm, einsum_str, remaining) + contraction_list = [] + for cnum, contract_inds in enumerate(path): + # Make sure we remove inds from right to left + contract_inds = cast( + tuple[int] | tuple[int, int], tuple(sorted(contract_inds, reverse=True)) + ) + + contract_tuple = _find_contraction(contract_inds, input_sets, output_set) + out_inds, input_sets, idx_removed, idx_contract = contract_tuple + + tmp_inputs = [input_list.pop(x) for x in contract_inds] + + # Last contraction + if (cnum - len(path)) == -1: + idx_result = output_subscript + else: + # use tensordot order to minimize transpositions + all_input_inds = "".join(tmp_inputs) + idx_result = "".join(sorted(out_inds, key=all_input_inds.find)) + + input_list.append(idx_result) + einsum_str = ",".join(tmp_inputs) + "->" + idx_result + + # We only need the first three inputs to build the forward graph + contraction = (contract_inds, idx_removed, einsum_str, None, None) + contraction_list.append(contraction) + + return contraction_list + + +def einsum(subscripts: str, *operands: "TensorLike", optimize=None) -> TensorVariable: + """ + Multiplication and summation of tensors using the Einstein summation convention. + + Code adapted from JAX: https://github.com/google/jax/blob/534d32a24d7e1efdef206188bb11ae48e9097092/jax/_src/numpy/lax_numpy.py#L5283 + + Einsum allows the user to specify a wide range of operations on tensors using the Einstein summation convention. Using + this notation, many common linear algebraic operations can be succinctly described on higher order tensors. + + Parameters + ---------- + subscripts: str + Einsum signature string describing the computation to be performed. + + operands: sequence of TensorVariable + Tensors to be multiplied and summed. + + Returns + ------- + TensorVariable + The result of the einsum operation. + + See Also + -------- + pytensor.tensor.tensordot: Generalized dot product between two tensors + pytensor.tensor.dot: Matrix multiplication between two tensors + numpy.einsum: The numpy implementation of einsum + + Examples + -------- + Inputs to `pt.einsum` are a string describing the operation to be performed (the "subscripts"), and a sequence of + tensors to be operated on. The string must follow the following rules: + + 1. The string gives inputs and (optionally) outputs. Inputs and outputs are separated by "->". + 2. The input side of the string is a comma-separated list of indices. For each comma-separated index string, there + must be a corresponding tensor in the input sequence. + 3. 
For each index string, the number of dimensions in the corresponding tensor must match the number of characters + in the index string. + 4. Indices are arbitrary strings of characters. If an index appears multiple times in the input side, it must have + the same shape in each input. + 5. The indices on the output side must be a subset of the indices on the input side -- you cannot introduce new + indices in the output. + 6. Elipses ("...") can be used to elide multiple indices. This is useful when you have a large number of "batch" + dimensions that are not implicated in the operation. + + Finally, two rules about these indicies govern how computation is carried out: + + 1. Repeated indices on the input side indicate how the tensor should be "aligned" for multiplication. + 2. Indices that appear on the input side but not the output side are summed over. + + The operation of these rules is best understood via examples: + + Example 1: Matrix multiplication + + .. code-block:: python + + import pytensor as pt + A = pt.matrix("A") + B = pt.matrix("B") + C = pt.einsum("ij, jk -> ik", A, B) + + This computation is equivalent to :code:`C = A @ B`. Notice that the ``j`` index is repeated on the input side of the + signature, and does not appear on the output side. This indicates that the ``j`` dimension of the first tensor should be + multiplied with the ``j`` dimension of the second tensor, and the resulting tensor's ``j`` dimension should be summed + away. + + Example 2: Batched matrix multiplication + + .. code-block:: python + + import pytensor as pt + A = pt.tensor("A", shape=(None, 4, 5)) + B = pt.tensor("B", shape=(None, 5, 6)) + C = pt.einsum("bij, bjk -> bik", A, B) + + This computation is also equivalent to :code:`C = A @ B` because of Pytensor's built-in broadcasting rules, but + the einsum signature is more explicit about the batch dimensions. The ``b`` and ``j`` indices are repeated on the + input side. Unlike ``j``, the ``b`` index is also present on the output side, indicating that the batch dimension + should **not** be summed away. As a result, multiplication will be performed over the ``b, j`` dimensions, and then + the ``j`` dimension will be summed over. The resulting tensor will have shape ``(None, 4, 6)``. + + Example 3: Batched matrix multiplication with elipses + + .. code-block:: python + + import pytensor as pt + A = pt.tensor("A", shape=(4, None, None, None, 5)) + B = pt.tensor("B", shape=(5, None, None, None, 6)) + C = pt.einsum("i...j, j...k -> ...ik", A, B) + + This case is the same as above, but inputs ``A`` and ``B`` have multiple batch dimensions. To avoid writing out all + of the batch dimensions (which we do not care about), we can use ellipses to elide over these dimensions. Notice + also that we are not required to "sort" the input dimensions in any way. In this example, we are doing a dot + between the last dimension A and the first dimension of B, which is perfectly valid. + + Example 4: Outer product + + .. code-block:: python + + import pytensor as pt + x = pt.tensor("x", shape=(3,)) + y = pt.tensor("y", shape=(4,)) + z = pt.einsum("i, j -> ij", x, y) + + This computation is equivalent to :code:`pt.outer(x, y)`. Notice that no indices are repeated on the input side, + and the output side has two indices. Since there are no indices to align on, the einsum operation will simply + multiply the two tensors elementwise, broadcasting dimensions ``i`` and ``j``. + + Example 5: Convolution + + .. 
code-block:: python + + import pytensor.tensor as pt + x = pt.tensor("x", shape=(None, None, None, None, None, None)) + w = pt.tensor("w", shape=(None, None, None, None)) + y = pt.einsum("bchwkt,fckt->bfhw", x, w) + + Given a batch of images ``x`` with dimensions ``(batch, channel, height, width, kernel_size, kernel_size)`` + and a filter ``w``, with dimensions ``(num_filters, channels, kernel_size, kernel_size)``, this einsum operation + computes the convolution of ``x`` with ``w``. Multiplication is aligned on the channel and the two kernel + dimensions, which are then summed over. The resulting tensor has shape + ``(batch, num_filters, height, width)``, reflecting the fact that information from each channel has been mixed + together. + """ + + if optimize is not None: + raise NotImplementedError( + "Optimize kwarg is not implemented in PyTensor. " + "By default, PyTensor will always optimize the graph if the inputs have static shapes.\n" + "If you need this functionality open an issue in https://github.com/pymc-devs/pytensor/issues to let us know. " + ) + + # TODO: Is this doing something clever about unknown shapes? + # contract_path = _poly_einsum_handlers.get(ty, _default_poly_einsum_handler) + tensor_operands = [as_tensor(operand) for operand in operands] + shapes = [operand.type.shape for operand in tensor_operands] + + path: PATH + if any(None in shape for shape in shapes): + # Case 1: At least one of the operands has an unknown shape. In this case, we can't use opt_einsum to optimize + # the contraction order, so we just use a default path of (1,0) contractions. This will work left-to-right, + # pushing intermediate results to the end of the stack. + # We use (1,0) and not (0,1) because that's what opt_einsum tends to prefer, and so the Op signatures will + # match more often + + # If shapes become known later we will likely want to rebuild the Op (unless we inline it) + if len(tensor_operands) == 1: + path = ((0,),) + else: + # By default, we try right to left because we assume that most graphs + # have a lower dimensional rightmost operand + path = tuple(pairwise(reversed(range(len(tensor_operands))))) + contraction_list = _contraction_list_from_path( + subscripts, tensor_operands, path + ) + + # If there are only 1 or 2 operands, there is no optimization to be done? + optimized = len(tensor_operands) <= 2 + else: + # Case 2: All operands have known shapes. In this case, we can use opt_einsum to compute the optimal + # contraction order. 
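+        # With einsum_call=True (not part of the public API), einsum_path also returns the per-step contraction list, whose tuples mirror the ones built by _contraction_list_from_path above.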
+ _, contraction_list = np.einsum_path( + subscripts, + # Numpy einsum_path requires arrays even though only the shapes matter + # It's not trivial to duck-type our way around because of internal call to `asanyarray` + *[np.empty(shape) for shape in shapes], + einsum_call=True, # Not part of public API + optimize="optimal", + ) # type: ignore + path = tuple(contraction[0] for contraction in contraction_list) + optimized = True + + def removechars(s, chars): + return s.translate(str.maketrans(dict.fromkeys(chars))) + + def sum_uniques( + operand: TensorVariable, names: str, uniques: list[str] + ) -> tuple[TensorVariable, str]: + """Reduce unique indices (those that appear only once) in a given contraction step via summing.""" + if uniques: + axes = [names.index(name) for name in uniques] + operand = operand.sum(axes) + names = removechars(names, uniques) + return operand, names + + def sum_repeats( + operand: TensorVariable, + names: str, + counts: collections.Counter, + keep_names: str, + ) -> tuple[TensorVariable, str]: + """Reduce repeated indices in a given contraction step via summation against an identity matrix.""" + + for name, count in counts.items(): + if count > 1: + axes = [i for i, n in enumerate(names) if n == name] + eye = _delta(operand.shape, axes) + operand = where(eye, operand, operand.zeros_like()) + if name not in keep_names: + operand = operand.sum(axes) + names = names.replace(name, "") + else: + operand = operand.sum(axes[:-1]) + names = names.replace(name, "", count - 1) + return operand, names + + def filter_singleton_dims(operand, names, other_operand, other_names): + op_bcast = operand.type.broadcastable + other_bcast = other_operand.type.broadcastable + keep = [ + (not op_bcast[i]) or (j == -1) or other_bcast[j] + for i, j in enumerate(map(other_names.find, names)) + ] + keep_axes = [i for i, keep_axis in enumerate(keep) if keep_axis] + squeeze_axes = [i for i, keep_axis in enumerate(keep) if not keep_axis] + if squeeze_axes: + # TODO: We could modify the subscripts to avoid the problem? + warnings.warn( + "The same einsum subscript is used for a broadcastable and non-broadcastable dimension. " + "This can result in a suboptimal contraction path." + ) + return operand.squeeze(squeeze_axes), "".join(names[i] for i in keep_axes) + + einsum_operands = list(tensor_operands) # So we can pop + for operand_indices, contracted_names, einstr, _, _ in contraction_list: + contracted_names = sorted(contracted_names) + assert len(contracted_names) == len( + set(contracted_names) + ), "The set was needed!" + + input_str, result_names = einstr.split("->") + input_names = input_str.split(",") + + # switch on the number of operands to be processed in this loop iteration. + # every case here sets 'operand' and 'names'. + if len(operand_indices) == 1: + operand = einsum_operands.pop(operand_indices[0]) + (names,) = input_names + counts = collections.Counter(names) + + # sum out unique contracted indices with a single reduce-sum + uniques = [name for name in contracted_names if counts[name] == 1] + operand, names = sum_uniques(operand, names, uniques) + + # for every repeated index, do a contraction against an identity matrix + operand, names = sum_repeats(operand, names, counts, result_names) + + elif len(operand_indices) == 2: + lhs, rhs = map(einsum_operands.pop, operand_indices) + lhs_names, rhs_names = input_names + + # handle cases where one side of a contracting or batch dimension is 1 + # but its counterpart is not. 
+ lhs, lhs_names = filter_singleton_dims(lhs, lhs_names, rhs, rhs_names) + rhs, rhs_names = filter_singleton_dims(rhs, rhs_names, lhs, lhs_names) + + lhs_counts = collections.Counter(lhs_names) + rhs_counts = collections.Counter(rhs_names) + + # sum out unique contracted indices in lhs and rhs + lhs_uniques = [ + name + for name in contracted_names + if lhs_counts[name] == 1 and rhs_counts[name] == 0 + ] + lhs, lhs_names = sum_uniques(lhs, lhs_names, lhs_uniques) + + rhs_uniques = [ + name + for name in contracted_names + if rhs_counts[name] == 1 and lhs_counts[name] == 0 + ] + rhs, rhs_names = sum_uniques(rhs, rhs_names, rhs_uniques) + + # for every repeated index, contract against an identity matrix + lhs, lhs_names = sum_repeats( + lhs, lhs_names, lhs_counts, result_names + rhs_names + ) + rhs, rhs_names = sum_repeats( + rhs, rhs_names, rhs_counts, result_names + lhs_names + ) + + lhs_or_rhs_names = set(lhs_names) | set(rhs_names) + contracted_names = [x for x in contracted_names if x in lhs_or_rhs_names] + lhs_and_rhs_names = set(lhs_names) & set(rhs_names) + batch_names = [x for x in result_names if x in lhs_and_rhs_names] + + if batch_names: + lhs_batch, rhs_batch = tuple( + zip(*[(lhs_names.find(n), rhs_names.find(n)) for n in batch_names]) + ) + else: + lhs_batch = rhs_batch = () + + # contract using dot_general + batch_names_str = "".join(batch_names) + if contracted_names: + lhs_cont, rhs_cont = tuple( + zip( + *[ + (lhs_names.index(n), rhs_names.index(n)) + for n in contracted_names + ] + ) + ) + else: + lhs_cont = rhs_cont = () + deleted_names = batch_names_str + "".join(contracted_names) + remaining_lhs_names = removechars(lhs_names, deleted_names) + remaining_rhs_names = removechars(rhs_names, deleted_names) + # Try both orders of lhs and rhs, in the hope that one of them means we + # don't need an explicit transpose. opt_einsum likes to contract from + # right to left, so we expect (rhs,lhs) to have the best chance of not + # needing a transpose. 
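+            # _general_dot orders its output axes as (batch axes, remaining axes of the first operand, remaining axes of the second operand), so both operand orders are compared against the desired result below.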
+ names = batch_names_str + remaining_rhs_names + remaining_lhs_names + if names == result_names: + operand = _general_dot( + (rhs, lhs), (rhs_cont, lhs_cont), (rhs_batch, lhs_batch) + ) + else: + names = batch_names_str + remaining_lhs_names + remaining_rhs_names + operand = _general_dot( + (lhs, rhs), + axes=(lhs_cont, rhs_cont), + batch_axes=(lhs_batch, rhs_batch), + ) + else: + raise ValueError( + f"Each step of einsum must have 1 or 2 operands, got {len(operand_indices)}" + ) + + # the resulting 'operand' with axis labels 'names' should be a permutation of the desired result + assert len(names) == len(result_names) == len(set(names)) + assert set(names) == set(result_names) + if names != result_names: + perm = tuple(names.index(name) for name in result_names) + operand = transpose(operand, perm) + einsum_operands.append(operand) # used in next iteration + + [einsum_result] = einsum_operands + + out = Einsum( + subscripts=subscripts, + inputs=list(tensor_operands), + outputs=[einsum_result], + path=tuple(path), + optimized=optimized, + )(*tensor_operands) + return cast(TensorVariable, out) diff --git a/pytensor/tensor/functional.py b/pytensor/tensor/functional.py index e7a5371b02..05e11f2643 100644 --- a/pytensor/tensor/functional.py +++ b/pytensor/tensor/functional.py @@ -1,8 +1,8 @@ from collections.abc import Callable from pytensor.graph import vectorize_graph -from pytensor.tensor import TensorVariable from pytensor.tensor.utils import _parse_gufunc_signature +from pytensor.tensor.variable import TensorVariable def vectorize(func: Callable, signature: str | None = None) -> Callable: diff --git a/pytensor/tensor/rewriting/__init__.py b/pytensor/tensor/rewriting/__init__.py index 168b636041..fc5c528f2d 100644 --- a/pytensor/tensor/rewriting/__init__.py +++ b/pytensor/tensor/rewriting/__init__.py @@ -3,10 +3,9 @@ import pytensor.tensor.rewriting.blas_c import pytensor.tensor.rewriting.blas_scipy import pytensor.tensor.rewriting.blockwise +import pytensor.tensor.rewriting.einsum import pytensor.tensor.rewriting.elemwise import pytensor.tensor.rewriting.extra_ops - -# Register JAX specializations import pytensor.tensor.rewriting.jax import pytensor.tensor.rewriting.linalg import pytensor.tensor.rewriting.math diff --git a/pytensor/tensor/rewriting/basic.py b/pytensor/tensor/rewriting/basic.py index 4a7570dad3..6a038cab15 100644 --- a/pytensor/tensor/rewriting/basic.py +++ b/pytensor/tensor/rewriting/basic.py @@ -52,6 +52,7 @@ TensorFromScalar, alloc, as_tensor_variable, + atleast_Nd, cast, extract_constant, fill, @@ -1219,3 +1220,123 @@ def local_merge_alloc(fgraph, node): register_canonicalize(RemovalNodeRewriter(tensor_copy), name="remove_tensor_copy") + + +@register_specialize +@node_rewriter([DimShuffle]) +def local_dimshuffle_alloc(fgraph, node): + """ + Lift DimShuffle through Alloc + + dimshuffle{x, 0, 1}(alloc([3 4], 3, 2) => alloc([3 4], 1, 3, 2) + """ + alloc_out = node.inputs[0] + alloc_node = alloc_out.owner + if not (alloc_node and isinstance(alloc_node.op, Alloc)): + return + + ds_op = node.op + value, *alloc_shape = alloc_node.inputs + + # Add implicit dimensions of value + value = atleast_Nd(value, n=len(alloc_shape)) + + # Dimshuffle value and alloc_shape + ds_value = value.dimshuffle(ds_op.new_order) + ds_alloc_shape = [alloc_shape[i] for i in ds_op.shuffle] + for dim in ds_op.augment: + ds_alloc_shape.insert(dim, 1) + + return [alloc(ds_value, *ds_alloc_shape)] + + +@register_specialize("shape_unsafe") +@node_rewriter([Join]) +def local_join_of_alloc(fgraph, node): + 
"""Rewrite a Join of Alloc nodes to an Alloc of the Join nodes.""" + axis, *tensors = node.inputs + + if len(tensors) < 2: + # Let other rewrite handle the useless Join + return + + if not isinstance(axis, Constant): + return + + core_tensors = [] + alloc_shapes = [] + for tensor in tensors: + if tensor.owner is None: + return + + # tensor = expand_dims_to_alloc(tensor) + if not isinstance(tensor.owner.op, Alloc): + return + + value, *shape = tensor.owner.inputs + # Introduce explicit batch dims + value = atleast_Nd(value, n=len(shape)) + core_tensors.append(value) + alloc_shapes.append(shape) + + # Find which allocated dimensions can be lifted + # Axis can never be lifted + # Non-axis allocated dimensions can be lifted if they are all broadcastable + [out] = node.outputs + axis = axis.data + + broadcasted_dims = list( + zip( + *( + [ + bef and not aft + for bef, aft in zip( + core_tensor.type.broadcastable, + tensor.type.broadcastable, + strict=True, + ) + ] + for core_tensor, tensor in zip(core_tensors, tensors, strict=True) + ) + ) + ) + + lifteable_alloc_dims = { + dim + for dim in range(out.type.ndim) + if dim != axis and all(broadcasted_dims[dim]) + } + + if not lifteable_alloc_dims: + return + + # Lift the allocated dimensions + new_tensors = [] + for core_tensor, alloc_shape in zip(core_tensors, alloc_shapes): + pre_join_shape = [ + 1 if i in lifteable_alloc_dims else alloc_dim + for i, alloc_dim in enumerate(alloc_shape) + ] + new_tensor = alloc(core_tensor, *pre_join_shape) + copy_stack_trace(tensor, new_tensor) + new_tensors.append(new_tensor) + + new_join = node.op(axis, *new_tensors) + copy_stack_trace(node.outputs[0], new_join) + + # Reintroduce the lifted dims + post_join_shape = [] + for i, alloc_dims in enumerate(zip(*alloc_shapes)): + if i == axis: + # The alloc dim along the axis is the sum of all the pre-join alloc dims + post_join_shape.append(add(*alloc_dims)) + else: + # Otherwise the shapes should all match. We prioritize constants if any + for best_alloc_dim in alloc_dims: + if isinstance(best_alloc_dim, Constant): + break + post_join_shape.append(best_alloc_dim) + + new_out = alloc(new_join, *post_join_shape) + copy_stack_trace(node.outputs[0], new_out) + return [new_out] diff --git a/pytensor/tensor/rewriting/blockwise.py b/pytensor/tensor/rewriting/blockwise.py index 0bed304c29..7220824c58 100644 --- a/pytensor/tensor/rewriting/blockwise.py +++ b/pytensor/tensor/rewriting/blockwise.py @@ -10,6 +10,7 @@ register_specialize, register_stabilize, ) +from pytensor.tensor.shape import Reshape from pytensor.tensor.subtensor import AdvancedIncSubtensor, AdvancedSubtensor, Subtensor @@ -67,10 +68,16 @@ def local_useless_unbatched_blockwise(fgraph, node): def local_eager_useless_unbatched_blockwise(fgraph, node): if isinstance( node.op.core_op, - Dot | Alloc | ARange | Subtensor | AdvancedSubtensor | AdvancedIncSubtensor, + Dot + | Alloc + | ARange + | Subtensor + | AdvancedSubtensor + | AdvancedIncSubtensor + | Reshape, ): # Many Dot-related rewrites (eg, all of BlasOpt) happen before specialize - # These other Ops can't always be trivially vectored at runtime, + # These other Ops can't always be trivially vectorized at runtime, # since their inputs may imply non-rectangular shapes. 
return local_useless_unbatched_blockwise.fn(fgraph, node) @@ -97,62 +104,67 @@ def local_blockwise_alloc(fgraph, node): BOp(matrix, alloc(vector, 10, 5)) -> BOp(matrix, vector) """ - if not any(isinstance(inp.owner.op, Alloc) for inp in node.inputs if inp.owner): - return None - op: Blockwise = node.op # type: ignore batch_ndim = op.batch_ndim(node) if not batch_ndim: return None + if not any(var.owner and isinstance(var.owner.op, Alloc) for var in node.inputs): + return None + new_inputs = [] batch_shapes = [] can_push_any_alloc = False for inp, inp_sig in zip(node.inputs, op.inputs_sig): - if inp.owner and isinstance(inp.owner.op, Alloc): - # Push batch dims from Alloc - value, *shape = inp.owner.inputs - - # Check what to do with the value of the Alloc - squeezed_value = _squeeze_left(value, batch_ndim) - missing_ndim = len(shape) - value.type.ndim - if ( - (((1,) * missing_ndim + value.type.broadcastable)[batch_ndim:]) - != inp.type.broadcastable[batch_ndim:] - ): - # We still need an Alloc for the core dims - core_shape = shape[batch_ndim:] - # And the batch dims of the squeezed value - squeezed_value_batch_ndim = squeezed_value.type.ndim - len(core_shape) - batch_shape = [ - 1 if broadcastable else dim - for broadcastable, dim in zip( - squeezed_value.type.broadcastable[:squeezed_value_batch_ndim], - tuple(squeezed_value.shape)[:squeezed_value_batch_ndim], + if not all(inp.type.broadcastable[:batch_ndim]): + if inp.owner and isinstance(inp.owner.op, Alloc): + # Push batch dims from Alloc + value, *shape = inp.owner.inputs + + # Check what to do with the value of the Alloc + squeezed_value = _squeeze_left(value, batch_ndim) + missing_ndim = len(shape) - value.type.ndim + if ( + (((1,) * missing_ndim + value.type.broadcastable)[batch_ndim:]) + != inp.type.broadcastable[batch_ndim:] + ): + # We still need an Alloc for the core dims + core_shape = shape[batch_ndim:] + # And the batch dims of the squeezed value + squeezed_value_batch_ndim = squeezed_value.type.ndim - len( + core_shape ) - ] - squeezed_value = alloc(squeezed_value, *batch_shape, *core_shape) - if squeezed_value.type.broadcastable == inp.type.broadcastable: - # We can't change anything about this Alloc input - new_inputs.append(inp) - continue - - # We can push batch dims of this Alloc input - batch_shapes.append( - tuple( - 1 if broadcastable else dim - for broadcastable, dim in zip( - inp.type.broadcastable, shape[:batch_ndim] + batch_shape = [ + 1 if broadcastable else dim + for broadcastable, dim in zip( + squeezed_value.type.broadcastable[ + :squeezed_value_batch_ndim + ], + tuple(squeezed_value.shape)[:squeezed_value_batch_ndim], + ) + ] + squeezed_value = alloc(squeezed_value, *batch_shape, *core_shape) + if squeezed_value.type.broadcastable == inp.type.broadcastable: + # We can't change anything about this Alloc input + new_inputs.append(inp) + continue + + # We can push batch dims of this Alloc input + batch_shapes.append( + tuple( + 1 if broadcastable else dim + for broadcastable, dim in zip( + inp.type.broadcastable, shape[:batch_ndim] + ) ) ) - ) - new_inputs.append(squeezed_value) - can_push_any_alloc = True + new_inputs.append(squeezed_value) + can_push_any_alloc = True + continue - else: - # Nothing to do with this input other than removing dummy batch dims - new_inputs.append(_squeeze_left(inp, batch_ndim)) + # Nothing to do with this input other than removing dummy batch dims + new_inputs.append(_squeeze_left(inp, batch_ndim)) if not can_push_any_alloc: return None @@ -167,17 +179,15 @@ def 
local_blockwise_alloc(fgraph, node): missing_ndim = old_out_type.ndim - new_out_type.ndim batch_shape = ([1] * missing_ndim + list(new_outs[0].shape))[:batch_ndim] for i, batch_dims in enumerate(zip(*batch_shapes)): # Transpose shape tuples + if old_out_type.broadcastable[i]: + continue for batch_dim in batch_dims: if batch_dim == 1: continue + batch_shape[i] = batch_dim if isinstance(batch_dim, Constant): # Give preference to Constants - batch_shape[i] = batch_dim break - elif old_out_type.broadcastable[i]: - # Only use non Constant shapes if absolutely necessary - # Otherwise, we use the shape of the non-alloc output - batch_shape[i] = batch_dim copy_stack_trace(node.outputs, new_outs) new_outs = [ @@ -190,3 +200,28 @@ def local_blockwise_alloc(fgraph, node): ] copy_stack_trace(node.outputs, new_outs) return new_outs + + +@register_specialize +@node_rewriter([Blockwise]) +def local_blockwise_reshape(fgraph, node): + """Rewrite away square Blockwise reshapes. + + Reshape is tricky to vectorize eagerly, because a graph like + `x.reshape([x.shape[0] * x.shape[1], -1])` has many operations + that must be vectorized before we arrive at the reshape operation. + + For the square Reshape case, we must wait for all the intermediate + operations to be lifted as Allocs. + """ + if not isinstance(node.op.core_op, Reshape): + return None + + x, output_shape = node.inputs + batch_ndim = node.op.batch_ndim(node) + if all(output_shape.type.broadcastable[:batch_ndim]): + batched_shape = x.shape[:batch_ndim] + core_reshape = _squeeze_left(output_shape, batch_ndim) + new_out = x.reshape([*tuple(batched_shape), *tuple(core_reshape)]) + copy_stack_trace(node.outputs[0], new_out) + return [new_out] diff --git a/pytensor/tensor/rewriting/einsum.py b/pytensor/tensor/rewriting/einsum.py new file mode 100644 index 0000000000..5e9fe2d026 --- /dev/null +++ b/pytensor/tensor/rewriting/einsum.py @@ -0,0 +1,53 @@ +from typing import cast + +from pytensor.graph import Apply, FunctionGraph, node_rewriter +from pytensor.graph.rewriting.basic import copy_stack_trace +from pytensor.tensor.einsum import Einsum, einsum +from pytensor.tensor.rewriting.basic import register_specialize +from pytensor.tensor.rewriting.ofg import inline_ofg_node +from pytensor.tensor.variable import TensorVariable + + +@register_specialize +@node_rewriter([Einsum]) +def optimize_einsum_inner_graph( + fgraph: FunctionGraph, node: Apply +) -> list[TensorVariable] | None: + """Try to optimize an einsum that was not optimizable at definition time. + + This can happen when users replace a graph without rebuilding + + Or when during the course of rewrites more specialized static shapes are found + """ + op: Einsum = node.op + + if op.optimized: + # Already optimized + return None + + operands = node.inputs + if any(None in operand.type.shape for operand in operands): + return None + + new_out = einsum(op.subscripts, *operands) + assert new_out.owner.op.optimized + + copy_stack_trace(node.outputs[0], new_out) + return [new_out] + + +@register_specialize +@node_rewriter([Einsum]) +def inline_optimized_einsum( + fgraph: FunctionGraph, node: Apply +) -> list[TensorVariable] | None: + """Inline einsums that are already optimized. + + This allows the inner graph to be optimized with the rest of the graph, now that we got the ordering right. 
+ """ + op: Einsum = node.op + + if not op.optimized: + return None + + return cast(list[TensorVariable], inline_ofg_node(node)) diff --git a/pytensor/tensor/rewriting/ofg.py b/pytensor/tensor/rewriting/ofg.py index 265f3ff2e8..2c4dfc4f70 100644 --- a/pytensor/tensor/rewriting/ofg.py +++ b/pytensor/tensor/rewriting/ofg.py @@ -1,12 +1,24 @@ -from pytensor import clone_replace +from typing import cast + +from pytensor import Variable, clone_replace from pytensor.compile import optdb from pytensor.compile.builders import OpFromGraph -from pytensor.graph import node_rewriter +from pytensor.graph import Apply, node_rewriter from pytensor.graph.rewriting.basic import copy_stack_trace, in2out from pytensor.tensor.basic import AllocDiag from pytensor.tensor.rewriting.basic import register_specialize +def inline_ofg_node(node: Apply) -> list[Variable]: + op = node.op + assert isinstance(op, OpFromGraph) + inlined_outs = clone_replace( + op.inner_outputs, dict(zip(op.inner_inputs, node.inputs)) + ) + copy_stack_trace(op.inner_outputs, inlined_outs) + return cast(list[Variable], inlined_outs) + + @node_rewriter([OpFromGraph]) def inline_ofg_expansion(fgraph, node): """ @@ -18,10 +30,7 @@ def inline_ofg_expansion(fgraph, node): if not op.is_inline: return False - new_out = clone_replace(op.inner_outputs, dict(zip(op.inner_inputs, node.inputs))) - copy_stack_trace(op.inner_outputs, new_out) - - return new_out + return inline_ofg_node(node) # We want to run this before the first merge optimizer @@ -61,8 +70,4 @@ def late_inline_OpFromGraph(fgraph, node): ------- """ - op = node.op - new_out = clone_replace(op.inner_outputs, dict(zip(op.inner_inputs, node.inputs))) - copy_stack_trace(op.inner_outputs, new_out) - - return new_out + return inline_ofg_node(node) diff --git a/pytensor/tensor/rewriting/shape.py b/pytensor/tensor/rewriting/shape.py index 1426a7d993..afa94d4e1f 100644 --- a/pytensor/tensor/rewriting/shape.py +++ b/pytensor/tensor/rewriting/shape.py @@ -749,51 +749,43 @@ def apply(self, fgraph): pytensor.compile.mode.optdb.register("UnShapeOpt", UnShapeOptimizer(), position=10) -def local_reshape_chain(op): - @node_rewriter([op]) - def f(fgraph, node): - """ - Reshape(Reshape(shape1),shape2) -> Reshape(shape2) - - """ - if not check_chain(node, op, op): - return False - - # TODO: this can permit a failing program to run by eliminating - # the lower reshape - rval = node.op(node.inputs[0].owner.inputs[0], node.inputs[1]) - - # Copy over stacktrace from previous output node, as any error - # in new computational graph would have been caused by last op - # in the old computational graph. - copy_stack_trace(node.outputs, rval) - - # It might happen that the desired output of this node has a - # broadcastable pattern that does not match that of 'rval'. This is - # when originally, we were able to figure out that one of the - # dimensions of the reshape is one, but some other transformation - # replaced the shape by one for which this cannot be guessed. - # We should try to figure out why we lost the information about this - # constant value... but in the meantime, better not apply this - # rewrite. 
- if rval.type.ndim == node.outputs[0].type.ndim and all( - s1 == s2 - for s1, s2 in zip(rval.type.shape, node.outputs[0].type.shape) - if s1 == 1 or s2 == 1 - ): - return [rval] - else: - return False - - return f +@register_canonicalize("shape_unsafe") +@register_specialize("shape_unsafe") +@node_rewriter([Reshape]) +def local_reshape_chain(fgraph, node): + """ + Reshape(Reshape(x, shape1),shape2) -> Reshape(x, shape2) + """ + if not check_chain(node, Reshape, Reshape): + return False -register_canonicalize(local_reshape_chain(Reshape), name="local_reshape_chain") + rval = node.op(node.inputs[0].owner.inputs[0], node.inputs[1]) + + # Copy over stacktrace from previous output node, as any error + # in new computational graph would have been caused by last op + # in the old computational graph. + copy_stack_trace(node.outputs, rval) + + # It might happen that the desired output of this node has a + # broadcastable pattern that does not match that of 'rval'. This is + # when originally, we were able to figure out that one of the + # dimensions of the reshape is one, but some other transformation + # replaced the shape by one for which this cannot be guessed. + # We should try to figure out why we lost the information about this + # constant value... but in the meantime, better not apply this + # rewrite. + if rval.type.ndim == node.outputs[0].type.ndim and all( + s1 == s2 + for s1, s2 in zip(rval.type.shape, node.outputs[0].type.shape) + if s1 == 1 or s2 == 1 + ): + return [rval] -@register_useless -@register_canonicalize -@register_stabilize +@register_useless("shape_unsafe") +@register_canonicalize("shape_unsafe") +@register_specialize("shape_unsafe") @node_rewriter([Reshape]) def local_useless_reshape(fgraph, node): """Remove two kinds of useless `Reshape`. @@ -802,24 +794,17 @@ def local_useless_reshape(fgraph, node): - Remove `Reshape` when reshaping to the shape of the input. """ - inp = node.inputs[0] - output = node.outputs[0] - output_shape = node.inputs[1] + inp, output_shape = node.inputs + [output] = node.outputs if inp.type.ndim != output.type.ndim: return False # Simple case: both input and output have a single dimension. - # TODO FIXME XXX: This could hide errors if the user provides inconsistent - # shapes. 
if ( inp.type.ndim == 1 and output.type.ndim == 1 - and all( - s1 == s2 - for s1, s2 in zip(inp.type.shape, output.type.shape) - if s1 == 1 or s2 == 1 - ) + and inp.type.broadcastable == output.type.broadcastable ): return [inp] @@ -832,8 +817,15 @@ def local_useless_reshape(fgraph, node): # Match Reshape(x, [x.shape[0], ..., x.shape[-1]]), accounting for # broadcastable and constant dimensions - if output_shape.owner and isinstance(output_shape.owner.op, MakeVector): - output_shape_is = output_shape.owner.inputs + if isinstance(output_shape, Constant) or ( + output_shape.owner and isinstance(output_shape.owner.op, MakeVector) + ): + if isinstance(output_shape, Constant): + output_shape_is = [ + as_tensor_variable(dim, ndim=0) for dim in output_shape.data + ] + else: + output_shape_is = output_shape.owner.inputs shape_feature = getattr(fgraph, "shape_feature", None) @@ -865,9 +857,9 @@ def local_useless_reshape(fgraph, node): shape_match[dim] = True continue - # Match 1 if input.type.shape[dim] == 1 + # Match constant if input.type.shape[dim] == constant cst_outshp_i = extract_constant(outshp_i, only_process_constants=1) - if inp.type.shape[dim] == 1 and cst_outshp_i == 1: + if inp.type.shape[dim] == cst_outshp_i: shape_match[dim] = True continue @@ -881,17 +873,18 @@ def local_useless_reshape(fgraph, node): if shape_feature: inpshp_i = shape_feature.get_shape(inp, dim) if inpshp_i == outshp_i or ( - extract_constant(inpshp_i, only_process_constants=1) - == extract_constant(outshp_i, only_process_constants=1) + extract_constant(inpshp_i, only_process_constants=True) + == extract_constant(outshp_i, only_process_constants=True) ): shape_match[dim] = True continue - if all(shape_match) and nb_m1 <= 1: + if nb_m1 <= 1 and all(shape_match): + return [inp] + + if (nb_m1 == 0) and (shape_match.count(False) == output.type.ndim - 1): return [inp] - # TODO later: if all the shapes except one match, we may want to - # consider it useless as well, like we do in the 1-dim case. 
return False @@ -910,9 +903,8 @@ def local_reshape_to_dimshuffle(fgraph, node): -> DimShuffle{x,0,x,1,x,x}(Reshape(x, (m, n))) """ op = node.op - inp = node.inputs[0] - output = node.outputs[0] - output_shape = node.inputs[1] + inp, output_shape = node.inputs + [output] = node.outputs dimshuffle_new_order = [] new_output_shape = [] @@ -944,7 +936,7 @@ def local_reshape_to_dimshuffle(fgraph, node): @register_canonicalize -@register_stabilize +@register_specialize @node_rewriter([Reshape]) def local_reshape_lift(fgraph, node): """ diff --git a/pytensor/tensor/shape.py b/pytensor/tensor/shape.py index 236c34b442..614258dcae 100644 --- a/pytensor/tensor/shape.py +++ b/pytensor/tensor/shape.py @@ -842,13 +842,13 @@ def c_code(self, node, name, inputs, outputs, sub): @_vectorize_node.register(Reshape) def _vectorize_reshape(op, node, x, shape): + from pytensor.tensor.blockwise import vectorize_node_fallback + old_x, old_shape = node.inputs batched_ndims = x.type.ndim - old_x.type.ndim if as_tensor_variable(shape).type.ndim != 1: - raise NotImplementedError( - "It is not possible to vectorize the shape argument of Reshape" - ) + return vectorize_node_fallback(op, node, x, shape) if len(tuple(old_shape)) == len(tuple(shape)): new_shape = [*x.shape[:batched_ndims], *shape] diff --git a/tests/link/jax/test_einsum.py b/tests/link/jax/test_einsum.py new file mode 100644 index 0000000000..9a55670c64 --- /dev/null +++ b/tests/link/jax/test_einsum.py @@ -0,0 +1,38 @@ +import numpy as np +import pytest + +import pytensor +import pytensor.tensor as pt + + +jax = pytest.importorskip("jax") + + +def test_jax_einsum(): + subscripts = "ij, jk, kl -> il" + x = np.random.rand(3, 5) + y = np.random.rand(5, 2) + z = np.random.rand(2, 4) + + shapes = ((3, 5), (5, 2), (2, 4)) + x_pt, y_pt, z_pt = ( + pt.tensor(name, shape=shape) for name, shape in zip("xyz", shapes) + ) + out = pt.einsum(subscripts, x_pt, y_pt, z_pt) + f = pytensor.function([x_pt, y_pt, z_pt], out, mode="JAX") + + np.testing.assert_allclose(f(x, y, z), np.einsum(subscripts, x, y, z)) + + +@pytest.mark.xfail(raises=NotImplementedError) +def test_ellipsis_einsum(): + subscripts = "...i,...i->..." 
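+    # "...i,...i->..." is a batched inner product: the shared trailing axis is contracted while the leading (ellipsis) axes are kept.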
+ x = np.random.rand(2, 5) + y = np.random.rand(2, 5) + + x_pt = pt.tensor("x", shape=x.shape) + y_pt = pt.tensor("y", shape=y.shape) + out = pt.einsum(subscripts, x_pt, y_pt) + f = pytensor.function([x_pt, y_pt], out, mode="JAX") + + np.testing.assert_allclose(f(x, y), np.einsum(subscripts, x, y)) diff --git a/tests/tensor/rewriting/test_blockwise.py b/tests/tensor/rewriting/test_blockwise.py index d5ea6e2b4e..a17ad18a1f 100644 --- a/tests/tensor/rewriting/test_blockwise.py +++ b/tests/tensor/rewriting/test_blockwise.py @@ -1,7 +1,9 @@ from functools import partial -from pytensor import function -from pytensor.graph import FunctionGraph, rewrite_graph +import numpy as np + +from pytensor import Mode, config, function +from pytensor.graph import FunctionGraph, rewrite_graph, vectorize_graph from pytensor.graph.basic import equal_computations from pytensor.scalar import log as scalar_log from pytensor.tensor import add, alloc, matrix, tensor, tensor3 @@ -9,6 +11,7 @@ from pytensor.tensor.elemwise import Elemwise from pytensor.tensor.nlinalg import MatrixPinv from pytensor.tensor.rewriting.blockwise import local_useless_blockwise +from pytensor.tensor.shape import Reshape def test_useless_blockwise_of_elemwise(): @@ -45,7 +48,7 @@ def test_blockwise_alloc(): rewrite = partial( rewrite_graph, include=("ShapeOpt", "specialize"), - exclude=("local_useless_unbatched_blockwise",), + exclude=("local_useless_unbatched_blockwise", "local_dimshuffle_alloc"), ) vector_add = Blockwise(core_op=add, signature="(x),(x)->(x)") @@ -104,7 +107,9 @@ def test_blockwise_alloc(): y = tensor("y", shape=()) out = vector_add(alloc(x, 3, 1, 5), alloc(y, 7, 5)) expected_out = alloc(vector_add(alloc(x, 5), alloc(y, 5)), 3, 7, 5) - assert equal([rewrite(out)], [expected_out]) + assert equal( + [rewrite(out)], [expected_out] + ), None # pytensor.dprint([expected_out, rewrite(out)], print_type=True) x = tensor("x", shape=(5,)) y = tensor("y", shape=()) @@ -118,3 +123,27 @@ def test_blockwise_alloc(): out = vector_add(x, alloc(y, 5)) expected_out = out assert equal([rewrite(out)], [expected_out]) + + +def test_blockwise_reshape(): + x = tensor("x", shape=(None, None, None)) + y = x.reshape([x.shape[0] * x.shape[1], -1]) + + new_x = tensor("x", shape=(None, None, None, None)) + new_y = vectorize_graph(y, {x: new_x}) + assert not isinstance(new_y.owner.op, Reshape) + assert isinstance(new_y.owner.op, Blockwise) and isinstance( + new_y.owner.op.core_op, Reshape + ) + + rewritten_y = rewrite_graph( + new_y, include=("canonicalize", "specialize"), clone=True + ) + assert isinstance(rewritten_y.owner.op, Reshape) + + no_rewrites = Mode(linker="py", optimizer=None) + test_x = np.arange(5 * 4 * 3 * 2).reshape(5, 4, 3, 2).astype(config.floatX) + np.testing.assert_allclose( + new_y.eval({"x": test_x}, mode=no_rewrites), + rewritten_y.eval({"x": test_x}, mode=no_rewrites), + ) diff --git a/tests/tensor/rewriting/test_einsum.py b/tests/tensor/rewriting/test_einsum.py new file mode 100644 index 0000000000..73e4372aaa --- /dev/null +++ b/tests/tensor/rewriting/test_einsum.py @@ -0,0 +1,39 @@ +from functools import partial + +from pytensor.graph import ancestors, rewrite_graph +from pytensor.tensor import einsum, specify_shape, tensor +from pytensor.tensor.einsum import Einsum + + +specialize_rewrite = partial(rewrite_graph, include=("specialize",), clone=True) + + +def test_einsum_optimization(): + a = tensor("a", shape=(None, None)) + b = tensor("b", shape=(None, None)) + c = tensor("c", shape=(None, None)) + + dynamic_shape_einsum = 
einsum("ij,ij,jk->ik", a, b, c) + assert not dynamic_shape_einsum.owner.op.optimized + + rewritten_out = specialize_rewrite(dynamic_shape_einsum) + assert isinstance(rewritten_out.owner.op, Einsum) + + a = specify_shape(a, (2, 3)) + b = specify_shape(b, (2, 3)) + c = specify_shape(c, (3, 5)) + + static_shape_einsum = dynamic_shape_einsum.owner.clone_with_new_inputs( + [a, b, c] + ).default_output() + assert not static_shape_einsum.owner.op.optimized + + rewritten_out = specialize_rewrite(static_shape_einsum) + # Einsum was inlined because it was optimized + assert not isinstance(rewritten_out.owner.op, Einsum) + # Sanity check that it's not buried in the graph + assert not any( + isinstance(var.owner.op, Einsum) + for var in ancestors([rewritten_out]) + if var.owner + ) diff --git a/tests/tensor/rewriting/test_shape.py b/tests/tensor/rewriting/test_shape.py index f4c529a0d2..bbfd829070 100644 --- a/tests/tensor/rewriting/test_shape.py +++ b/tests/tensor/rewriting/test_shape.py @@ -337,6 +337,52 @@ def test_m1(self): topo = f2.maker.fgraph.toposort() assert not any(isinstance(n.op, Reshape) for n in topo) + def test_constant_shape(self): + # Where reshape is a constant that matches the shape + x = matrix(shape=(2, 3)) + shape = pt.as_tensor(np.array([2, 3])) + out = reshape(x, shape) + new_out = rewrite_graph(out) + assert new_out is x + + x = matrix(shape=(2, 3)) + shape = pt.as_tensor(np.array([-1, 3])) + out = reshape(x, shape) + new_out = rewrite_graph(out) + assert new_out is x + + x = matrix(shape=(None, 3)) + shape = pt.as_tensor(np.array([-1, 3])) + out = reshape(x, shape) + new_out = rewrite_graph(out) + assert new_out is x + + x = matrix(shape=(None, 3)) + shape = pt.as_tensor(np.array([2, 3])) + out = reshape(x, shape) + new_out = rewrite_graph(out) + # This could be rewritten as a specify_shape(x, (2, 3)) + assert new_out is not x + + x = matrix(shape=(2, 3)) + shape = pt.as_tensor(np.array([3, 2])) + out = reshape(x, shape) + new_out = rewrite_graph(out) + assert new_out is not x + + def test_all_but_one_match(self): + x = matrix(shape=(None, None)) + shape = [x.shape[0], 3] + out = reshape(x, shape) + new_out = rewrite_graph(out) + assert equal_computations([new_out], [specify_shape(x, (None, 3))]) + + # Rewrite does not apply if there's also a -1 + shape = [-1, 3] + out = reshape(x, shape) + new_out = rewrite_graph(out) + assert new_out is out + class TestLocalReshapeToDimshuffle: def setup_method(self): diff --git a/tests/tensor/test_basic.py b/tests/tensor/test_basic.py index 49c8e9c38c..58d4de2481 100644 --- a/tests/tensor/test_basic.py +++ b/tests/tensor/test_basic.py @@ -3847,8 +3847,10 @@ def test_transpose(): assert np.all(t2d == np.transpose(x2v, [0, 1])) assert np.all(t3d == np.transpose(x3v, [0, 2, 1])) + # Check we don't introduce useless transpose + assert ptb.transpose(x1) is x1 + # Check that we create a name. 
- assert ptb.transpose(x1).name == "x1.T" assert ptb.transpose(x2).name == "x2.T" assert ptb.transpose(x3).name == "x3.T" assert ptb.transpose(dmatrix()).name is None diff --git a/tests/tensor/test_einsum.py b/tests/tensor/test_einsum.py new file mode 100644 index 0000000000..9131cda056 --- /dev/null +++ b/tests/tensor/test_einsum.py @@ -0,0 +1,263 @@ +from functools import partial +from string import ascii_lowercase + +import numpy as np +import pytest + +import pytensor +import pytensor.tensor as pt +from pytensor import Mode, config, function +from pytensor.graph import FunctionGraph +from pytensor.graph.op import HasInnerGraph +from pytensor.tensor.blockwise import Blockwise +from pytensor.tensor.einsum import _delta, _general_dot, _iota, einsum +from pytensor.tensor.shape import Reshape + + +# Fail for unexpected warnings in this file +pytestmark = pytest.mark.filterwarnings("error") + +floatX = pytensor.config.floatX +ATOL = RTOL = 1e-8 if floatX == "float64" else 1e-4 + + +def assert_no_blockwise_in_graph(fgraph: FunctionGraph, core_op=None) -> None: + for node in fgraph.apply_nodes: + if isinstance(node.op, Blockwise): + if core_op is None: + raise AssertionError + assert not isinstance(node.op.core_op, core_op) + + if isinstance(node.op, HasInnerGraph): + # InnerGraph Ops can be rewritten without modifying the original fgraph + if hasattr(node.op, "_fn"): + inner_fgraph = node.op._fn.maker.fgraph + else: + inner_fgraph = node.op.fgraph + assert_no_blockwise_in_graph(inner_fgraph, core_op=core_op) + + +def test_iota(): + mode = Mode(linker="py", optimizer=None) + np.testing.assert_allclose( + _iota((4, 8), 0).eval(mode=mode), + [ + [0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 2], + [3, 3, 3, 3, 3, 3, 3, 3], + ], + ) + + np.testing.assert_allclose( + _iota((4, 8), 1).eval(mode=mode), + [ + [0, 1, 2, 3, 4, 5, 6, 7], + [0, 1, 2, 3, 4, 5, 6, 7], + [0, 1, 2, 3, 4, 5, 6, 7], + [0, 1, 2, 3, 4, 5, 6, 7], + ], + ) + + +def test_delta(): + mode = Mode(linker="py", optimizer=None) + np.testing.assert_allclose( + _delta((2, 2), (0, 1)).eval(mode=mode), + [[1.0, 0.0], [0.0, 1.0]], + ) + + np.testing.assert_allclose( + _delta((2, 2, 2), (0, 1)).eval(mode=mode), + [[[1, 1], [0, 0]], [[0, 0], [1, 1]]], + ) + + +def test_general_dot(): + rng = np.random.default_rng(45) + signature = "(l0,a0,a1,l1),(a1,r0,r1,a0)->(l0,l1,r0,r1)" + tensordot_axes = [(-3, -2), (-1, -4)] + + # X has two batch dims + # Y has one batch dim + x = pt.tensor("x", shape=(5, 4, 2, 11, 13, 3)) + y = pt.tensor("y", shape=(4, 13, 5, 7, 11)) + out = _general_dot((x, y), tensordot_axes, [(0, 1), (0,)]) + + fn = pytensor.function([x, y], out) + # fn.dprint(print_type=True) + if config.mode != "FAST_COMPILE": + assert_no_blockwise_in_graph(fn.maker.fgraph, Reshape) + + np_batched_tensordot = np.vectorize( + partial(np.tensordot, axes=tensordot_axes), signature=signature + ) + x_test = rng.normal(size=x.type.shape).astype(floatX) + y_test = rng.normal(size=y.type.shape).astype(floatX) + np.testing.assert_allclose( + fn(x_test, y_test), np_batched_tensordot(x_test, y_test), atol=ATOL, rtol=RTOL + ) + + +@pytest.mark.parametrize("static_shape_known", [True, False]) +@pytest.mark.parametrize( + "signature", + [ + "ij", + "ji", + "ii->i", + "ii", + "ij->", + "ij->j", + "ij->i", + "ij,ij->ij", + "ij,ji->ij", + "ij,ji->ji", + "ij,jk", + "kj,ji", + "ij,kj->ik", + "ik,kj->ikj", + "ij,kl->ijkl", + "ij,jk,kl->il", + "kl,ij,jk->il", + "oij,imj,mjkn,lnk,plk->op", + ], +) +def 
test_einsum_signatures(static_shape_known, signature): + letters_to_dims = dict(zip("ijklmnop", [2, 3, 5, 7, 11, 13, 17, 19], strict=True)) + + inputs = signature.split("->")[0].split(",") + + shapes = [tuple(letters_to_dims[letter] for letter in inp) for inp in inputs] + if static_shape_known: + static_shapes = shapes + else: + static_shapes = [[None] * len(shape) for shape in shapes] + + operands = [ + pt.tensor(name, shape=static_shape) + for name, static_shape in zip(ascii_lowercase, static_shapes) + ] + out = pt.einsum(signature, *operands) + assert out.owner.op.optimized == static_shape_known or len(operands) <= 2 + + rng = np.random.default_rng(37) + test_values = [rng.normal(size=shape).astype(floatX) for shape in shapes] + np_out = np.einsum(signature, *test_values) + + fn = function(operands, out) + pt_out = fn(*test_values) + + # print(); fn.dprint(print_type=True) + + if config.mode != "FAST_COMPILE": + assert_no_blockwise_in_graph(fn.maker.fgraph) + np.testing.assert_allclose(pt_out, np_out, atol=ATOL, rtol=RTOL) + + +def test_batch_dim(): + shapes = ( + (7, 3, 5), + (5, 2), + ) + x, y = (pt.tensor(name, shape=shape) for name, shape in zip("xy", shapes)) + out = pt.einsum("mij,jk->mik", x, y) + + assert out.type.shape == (7, 3, 2) + + +def test_einsum_conv(): + # Adapted example from https://medium.com/latinxinai/vectorized-convolution-operation-using-numpy-b122fd52fba3 + rng = np.random.default_rng(125) + batch_size = 32 + channels = 3 + height = 8 + width = 8 + kernel_size = 2 + num_filters = 15 + conv_signature = "bchwkt,fckt->bfhw" + windowed_input = rng.random( + size=(batch_size, channels, height, width, kernel_size, kernel_size) + ).astype(floatX) + weights = rng.random(size=(num_filters, channels, kernel_size, kernel_size)).astype( + floatX + ) + result = einsum(conv_signature, windowed_input, weights).eval() + + assert result.shape == (32, 15, 8, 8) + np.testing.assert_allclose( + result, + np.einsum("bchwkt,fckt->bfhw", windowed_input, weights), + atol=ATOL, + rtol=RTOL, + ) + + +def test_ellipsis(): + rng = np.random.default_rng(159) + x = pt.tensor("x", shape=(3, 5, 7, 11)) + y = pt.tensor("y", shape=(3, 5, 11, 13)) + x_test = rng.normal(size=x.type.shape).astype(floatX) + y_test = rng.normal(size=y.type.shape).astype(floatX) + expected_out = np.matmul(x_test, y_test) + + with pytest.raises(ValueError): + pt.einsum("mp,pn->mn", x, y) + + out = pt.einsum("...mp,...pn->...mn", x, y) + np.testing.assert_allclose( + out.eval({x: x_test, y: y_test}), expected_out, atol=ATOL, rtol=RTOL + ) + + # Put batch axes in the middle + new_x = pt.moveaxis(x, -2, 0) + new_y = pt.moveaxis(y, -2, 0) + out = pt.einsum("m...p,p...n->m...n", new_x, new_y) + np.testing.assert_allclose( + out.eval({x: x_test, y: y_test}), + expected_out.transpose(-2, 0, 1, -1), + atol=ATOL, + rtol=RTOL, + ) + + out = pt.einsum("m...p,p...n->mn", new_x, new_y) + np.testing.assert_allclose( + out.eval({x: x_test, y: y_test}), expected_out.sum((0, 1)), atol=ATOL, rtol=RTOL + ) + + +def test_broadcastable_dims(): + # Test that einsum handles broadcasting dims correctly. There are two points: + # 1. Numpy einsum allows the same subscript for degenerate and full dimensions + # There is some stale discussion on whether this should be a bug or not, but for now it is not: + # https://github.com/numpy/numpy/issues/11548 + + # 2. Using the same letter for dimensions that are and aren't broadcastable + # can lead to suboptimal paths. 
We check we issue a warning for the following example: + # https://github.com/dgasmith/opt_einsum/issues/220 + rng = np.random.default_rng(222) + a = pt.tensor("a", shape=(32, 32, 32)) + b = pt.tensor("b", shape=(1000, 32)) + c = pt.tensor("c", shape=(1, 32)) + + a_test = rng.normal(size=a.type.shape).astype(floatX) + b_test = rng.normal(size=b.type.shape).astype(floatX) + c_test = rng.normal(size=c.type.shape).astype(floatX) + + # Note b is used for both 1 and 32 + with pytest.warns( + UserWarning, match="This can result in a suboptimal contraction path" + ): + suboptimal_out = pt.einsum("ijk,bj,bk->i", a, b, c) + assert not [set(p) for p in suboptimal_out.owner.op.path] == [{0, 2}, {0, 1}] + + # If we use a distinct letter we get the optimal path + optimal_out = pt.einsum("ijk,bj,ck->i", a, b, c) + assert [set(p) for p in optimal_out.owner.op.path] == [{0, 2}, {0, 1}] + + suboptimal_eval = suboptimal_out.eval({a: a_test, b: b_test, c: c_test}) + optimal_eval = optimal_out.eval({a: a_test, b: b_test, c: c_test}) + np_eval = np.einsum("ijk,bj,bk->i", a_test, b_test, c_test) + atol = 1e-12 if config.floatX == "float64" else 1e-2 + np.testing.assert_allclose(suboptimal_eval, np_eval, atol=atol) + np.testing.assert_allclose(optimal_eval, np_eval, atol=atol) diff --git a/tests/tensor/test_shape.py b/tests/tensor/test_shape.py index 7fa8133c4e..f9434c9f60 100644 --- a/tests/tensor/test_shape.py +++ b/tests/tensor/test_shape.py @@ -14,7 +14,7 @@ from pytensor.misc.safe_asarray import _asarray from pytensor.scalar.basic import ScalarConstant from pytensor.tensor import as_tensor_variable, broadcast_to, get_vector_length, row -from pytensor.tensor.basic import MakeVector, as_tensor, constant +from pytensor.tensor.basic import MakeVector, constant, stack from pytensor.tensor.elemwise import DimShuffle, Elemwise from pytensor.tensor.rewriting.shape import ShapeFeature from pytensor.tensor.shape import ( @@ -801,8 +801,14 @@ def test_reshape(self): [vect_out] = vectorize_node(node, mat, new_shape).outputs assert equal_computations([vect_out], [reshape(mat, new_shape)]) - with pytest.raises(NotImplementedError): - vectorize_node(node, vec, broadcast_to(as_tensor([5, 2, x]), (2, 3))) + new_shape = stack([[-1, x], [x - 1, -1]], axis=0) + print(new_shape.type) + [vect_out] = vectorize_node(node, vec, new_shape).outputs + vec_test_value = np.arange(6) + np.testing.assert_allclose( + vect_out.eval({x: 3, vec: vec_test_value}), + np.broadcast_to(vec_test_value.reshape(2, 3), (2, 2, 3)), + ) with pytest.raises( ValueError, From b65d08c7681a6fa6d8986d8eea4770d65ea5c87b Mon Sep 17 00:00:00 2001 From: Ricardo Vieira Date: Sun, 4 Aug 2024 16:38:43 +0200 Subject: [PATCH 26/72] Skip tri test in latest version of JAX Related to https://github.com/google/jax/issues/22751 --- tests/link/jax/test_tensor_basic.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/link/jax/test_tensor_basic.py b/tests/link/jax/test_tensor_basic.py index 1a7f787a3a..afa4191b9d 100644 --- a/tests/link/jax/test_tensor_basic.py +++ b/tests/link/jax/test_tensor_basic.py @@ -218,6 +218,10 @@ def test_tri(): compare_jax_and_py(fgraph, []) +@pytest.mark.skipif( + jax.__version__ == "0.4.31", + reason="https://github.com/google/jax/issues/22751", +) def test_tri_nonconcrete(): """JAX cannot JIT-compile `jax.numpy.tri` when arguments are not concrete values.""" From da91dc7f3713a45485a36263578ac2874cce7479 Mon Sep 17 00:00:00 2001 From: abhishekshah5486 Date: Tue, 6 Aug 2024 01:00:34 +0530 Subject: [PATCH 27/72] Corrected the reference 
from 'an PyTensor' to 'a PyTensor' in the contributing guidelines. --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1d3c8c875f..c3b8b1fff2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,7 +21,7 @@ For issues a minimal working example (MWE) is strongly recommended when relevant (fixing a typo in the documentation does not require a MWE). For discussions, MWEs are generally required. All MWEs must be implemented using PyTensor. Please do not submit MWEs if they are not implemented in PyTensor. In certain cases, -pseudocode may be acceptable, but an PyTensor implementation is always preferable. +pseudocode may be acceptable, but a PyTensor implementation is always preferable. ## Quick links From 0ae3cfef5f646afc3433c599f4da8bb01586d132 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 17:39:49 +0000 Subject: [PATCH 28/72] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.5 → v0.5.6](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.5...v0.5.6) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 118a371e78..c0e45f6e15 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: )$ - id: check-merge-conflict - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.5 + rev: v0.5.6 hooks: - id: ruff args: ["--fix", "--output-format=full"] From 48450b07760018bad64df165b502b1d9f07b2777 Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Fri, 9 Aug 2024 16:45:36 -0700 Subject: [PATCH 29/72] Fix test to allow for n_outs>1 --- pytensor/link/pytorch/dispatch/basic.py | 6 +++--- tests/link/pytorch/test_basic.py | 10 ++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pytensor/link/pytorch/dispatch/basic.py b/pytensor/link/pytorch/dispatch/basic.py index 5e5bc4a41b..291ad40a65 100644 --- a/pytensor/link/pytorch/dispatch/basic.py +++ b/pytensor/link/pytorch/dispatch/basic.py @@ -139,10 +139,10 @@ def makevector(*x): def pytorch_funcify_IfElse(op, **kwargs): n_outs = op.n_outs - def ifelse(cond, ifpath, elsepath, n_outs=n_outs): + def ifelse(cond, *true_and_false, n_outs=n_outs): if cond: - return ifpath + return torch.stack(true_and_false[:n_outs]) else: - return elsepath + return torch.stack(true_and_false[n_outs:]) return ifelse diff --git a/tests/link/pytorch/test_basic.py b/tests/link/pytorch/test_basic.py index 3905055935..8393f695c3 100644 --- a/tests/link/pytorch/test_basic.py +++ b/tests/link/pytorch/test_basic.py @@ -305,13 +305,15 @@ def test_pytorch_MakeVector(): def test_pytorch_ifelse(): - true_vals = np.r_[1, 2, 3] - false_vals = np.r_[-1, -2, -3] + p1_vals = np.r_[1, 2, 3] + p2_vals = np.r_[-1, -2, -3] for test_value, cond in [(0.2, 0.5), (0.5, 0.4)]: a = scalar("a") a.tag.test_value = np.array(test_value, dtype=config.floatX) - x = ifelse(a < cond, true_vals, false_vals) - x_fg = FunctionGraph([a], [x]) # I.e. 
False + x = ifelse( + a < cond, tuple(np.r_[p1_vals, p2_vals]), tuple(np.r_[p2_vals, p1_vals]) + ) + x_fg = FunctionGraph([a], x) compare_pytorch_and_py(x_fg, [get_test_value(i) for i in x_fg.inputs]) From fd27b6abee2d2c718ad3c7d24dad228b042b620b Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Fri, 9 Aug 2024 16:52:13 -0700 Subject: [PATCH 30/72] Remove test value --- tests/link/pytorch/test_basic.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/link/pytorch/test_basic.py b/tests/link/pytorch/test_basic.py index 8393f695c3..73b098182b 100644 --- a/tests/link/pytorch/test_basic.py +++ b/tests/link/pytorch/test_basic.py @@ -11,7 +11,7 @@ from pytensor.configdefaults import config from pytensor.graph.basic import Apply from pytensor.graph.fg import FunctionGraph -from pytensor.graph.op import Op, get_test_value +from pytensor.graph.op import Op from pytensor.ifelse import ifelse from pytensor.raise_op import CheckAndRaise from pytensor.tensor import alloc, arange, as_tensor, empty, eye @@ -310,10 +310,9 @@ def test_pytorch_ifelse(): for test_value, cond in [(0.2, 0.5), (0.5, 0.4)]: a = scalar("a") - a.tag.test_value = np.array(test_value, dtype=config.floatX) x = ifelse( a < cond, tuple(np.r_[p1_vals, p2_vals]), tuple(np.r_[p2_vals, p1_vals]) ) x_fg = FunctionGraph([a], x) - compare_pytorch_and_py(x_fg, [get_test_value(i) for i in x_fg.inputs]) + compare_pytorch_and_py(x_fg, np.array(test_value, dtype=config.floatX)) From 7fffec618bde96c9403b263d18d7c3d3dd869189 Mon Sep 17 00:00:00 2001 From: Thomas Wiecki Date: Sat, 10 Aug 2024 12:53:53 +0200 Subject: [PATCH 31/72] Pickle error message changed (#966) --- tests/test_config.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_config.py b/tests/test_config.py index 47a4e24035..73c1408e03 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -254,7 +254,10 @@ def test_config_pickling(): configparser.IntParam(5, lambda i: i > 0), in_c_key=False, ) - with pytest.raises(AttributeError, match="Can't pickle local object"): + with pytest.raises( + AttributeError, + match="Can't (pickle|get) local object 'test_config_pickling..'", + ): pickle.dump(root, io.BytesIO()) From 29183c72bdae1ee7cf1e06c90675d3d7d61bb18b Mon Sep 17 00:00:00 2001 From: Thomas Wiecki Date: Sat, 10 Aug 2024 23:53:39 +0200 Subject: [PATCH 32/72] Add building of pyodide universal wheels (#918) * Add building of pyodide universal wheels * precommit * Fix precommit. Readd comment. 
* Fix precommit2 * Minor improvement to ext_modules conditional definition * Bump Python version so that tomllib is included This way versioneer can read pyproject.toml * Add wheel package to build dependencies * Update .github/workflows/pypi.yml * Revert unnecessary * ruff --------- Co-authored-by: Ben Mares --- .github/workflows/pypi.yml | 30 ++++++++++++++++++++++++++++++ setup.py | 25 ++++++++++++++++++------- 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index ca37e422d0..af3ea8b93c 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -57,6 +57,31 @@ jobs: name: wheels-${{ matrix.platform }} path: ./wheelhouse/*.whl + build_universal_wheel: + name: Build universal wheel for Pyodide + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: pip install numpy versioneer wheel + + - name: Build universal wheel + run: | + PYODIDE=1 python setup.py bdist_wheel --universal + + - uses: actions/upload-artifact@v4 + with: + name: universal_wheel + path: dist/*.whl + check_dist: name: Check dist needs: [make_sdist,build_wheels] @@ -103,6 +128,11 @@ jobs: path: dist merge-multiple: true + - uses: actions/download-artifact@v4 + with: + name: universal_wheel + path: dist + - uses: pypa/gh-action-pypi-publish@v1.9.0 with: user: __token__ diff --git a/setup.py b/setup.py index 3f8eb225d8..09202a658c 100755 --- a/setup.py +++ b/setup.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +import os + import numpy import versioneer from setuptools import Extension, setup @@ -11,17 +13,26 @@ NAME: str = dist.get_name() # type: ignore +# Check if building for Pyodide +is_pyodide = os.getenv("PYODIDE", "0") == "1" + +if is_pyodide: + # For pyodide we build a universal wheel that must be pure-python + # so we must omit the cython-version of scan. + ext_modules = [] +else: + ext_modules = [ + Extension( + name="pytensor.scan.scan_perform", + sources=["pytensor/scan/scan_perform.pyx"], + include_dirs=[numpy.get_include()], + ), + ] if __name__ == "__main__": setup( name=NAME, version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), - ext_modules=[ - Extension( - name="pytensor.scan.scan_perform", - sources=["pytensor/scan/scan_perform.pyx"], - include_dirs=[numpy.get_include()], - ), - ], + ext_modules=ext_modules, ) From 4d0103bd01af9eba3643403f68325e633e9304e5 Mon Sep 17 00:00:00 2001 From: Krupakar Reddy <137398727+Krupakar-Reddy-S@users.noreply.github.com> Date: Mon, 12 Aug 2024 17:04:43 +0530 Subject: [PATCH 33/72] Removed types examples and introduced tensor (#968) --- doc/tutorial/adding.rst | 44 +++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/doc/tutorial/adding.rst b/doc/tutorial/adding.rst index d558217dc7..0262b60edf 100644 --- a/doc/tutorial/adding.rst +++ b/doc/tutorial/adding.rst @@ -4,6 +4,31 @@ Baby Steps - Algebra ==================== +Understanding Tensors +=========================== + +Before diving into PyTensor, it's essential to understand the fundamental +data structure it operates on: the *tensor*. A *tensor* is a multi-dimensional +array that serves as the foundation for symbolic computations. + +tensors can represent anything from a single number (scalar) to +complex multi-dimensional arrays. 
Each tensor has a type that dictates its
+dimensionality and the kind of data it holds.
+
+For example, the following code creates a symbolic scalar and a symbolic matrix:
+
+>>> x = pt.scalar('x')
+>>> y = pt.matrix('y')
+
+Here, `scalar` refers to a tensor with zero dimensions, while `matrix` refers
+to a tensor with two dimensions. The same principles apply to tensors of other
+dimensions.
+
+More information about tensors and their associated operations can be
+found here: :ref:`tensor `.
+
+
+
 Adding two Scalars
 ==================
 
@@ -173,25 +198,6 @@ It is possible to add scalars to matrices, vectors to matrices,
 scalars to vectors, etc. The behavior of these operations is defined
 by :ref:`broadcasting `.
 
-The following types are available:
-
-* **byte**: ``bscalar, bvector, bmatrix, brow, bcol, btensor3, btensor4, btensor5, btensor6, btensor7``
-* **16-bit integers**: ``wscalar, wvector, wmatrix, wrow, wcol, wtensor3, wtensor4, wtensor5, wtensor6, wtensor7``
-* **32-bit integers**: ``iscalar, ivector, imatrix, irow, icol, itensor3, itensor4, itensor5, itensor6, itensor7``
-* **64-bit integers**: ``lscalar, lvector, lmatrix, lrow, lcol, ltensor3, ltensor4, ltensor5, ltensor6, ltensor7``
-* **float**: ``fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4, ftensor5, ftensor6, ftensor7``
-* **double**: ``dscalar, dvector, dmatrix, drow, dcol, dtensor3, dtensor4, dtensor5, dtensor6, dtensor7``
-* **complex**: ``cscalar, cvector, cmatrix, crow, ccol, ctensor3, ctensor4, ctensor5, ctensor6, ctensor7``
-
-The previous list is not exhaustive and a guide to all types compatible
-with NumPy arrays may be found here: :ref:`tensor creation`.
-
-.. note::
-
-   You, the user---not the system architecture---have to choose whether your
-   program will use 32- or 64-bit integers (``i`` prefix vs. the ``l`` prefix)
-   and floats (``f`` prefix vs. the ``d`` prefix).
- Exercise From f62401a04adaf80bcb2e22f63da1088042704190 Mon Sep 17 00:00:00 2001 From: ferres Date: Tue, 13 Aug 2024 11:54:32 +0300 Subject: [PATCH 34/72] maintanance: unpin scipy fix: cast to elemwise outputs to their respective dtypes fix: Relax scipy dependency, should work in both cases style: black wrap with asarray fix: make elemwise test check against dtype in the graph fix scalar issues Update pytensor/scalar/basic.py Co-authored-by: Ricardo Vieira <28983449+ricardoV94@users.noreply.github.com> fix test add a clarifying comment to checking nan fix: bool is deprecated in numpy deps: bound scipy version improve test --- environment-osx-arm64.yml | 2 +- environment.yml | 2 +- pyproject.toml | 2 +- pytensor/scalar/basic.py | 17 ++++++++++++++--- pytensor/tensor/elemwise.py | 22 ++-------------------- tests/scalar/test_loop.py | 17 ++++++++++++----- tests/tensor/utils.py | 8 +++++--- 7 files changed, 36 insertions(+), 34 deletions(-) diff --git a/environment-osx-arm64.yml b/environment-osx-arm64.yml index 0d624aa55c..13a68faaaa 100644 --- a/environment-osx-arm64.yml +++ b/environment-osx-arm64.yml @@ -10,7 +10,7 @@ dependencies: - python=>3.10 - compilers - numpy>=1.17.0,<2 - - scipy>=0.14,<1.14.0 + - scipy>=1,<2 - filelock>=3.15 - etuples - logical-unification diff --git a/environment.yml b/environment.yml index 95bb58c06c..4b213fd851 100644 --- a/environment.yml +++ b/environment.yml @@ -10,7 +10,7 @@ dependencies: - python>=3.10 - compilers - numpy>=1.17.0,<2 - - scipy>=0.14,<1.14.0 + - scipy>=1,<2 - filelock>=3.15 - etuples - logical-unification diff --git a/pyproject.toml b/pyproject.toml index 81a1285da8..bebba8a7de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ keywords = [ ] dependencies = [ "setuptools>=59.0.0", - "scipy>=0.14,<1.14", + "scipy>=1,<2", "numpy>=1.17.0,<2", "filelock>=3.15", "etuples", diff --git a/pytensor/scalar/basic.py b/pytensor/scalar/basic.py index d4c41d5cb5..d6fcfc0723 100644 --- a/pytensor/scalar/basic.py +++ b/pytensor/scalar/basic.py @@ -1140,14 +1140,25 @@ def output_types(self, types): else: raise NotImplementedError(f"Cannot calculate the output types for {self}") + @staticmethod + def _cast_scalar(x, dtype): + if hasattr(x, "astype"): + return x.astype(dtype) + elif dtype == "bool": + return np.bool_(x) + else: + return getattr(np, dtype)(x) + def perform(self, node, inputs, output_storage): if self.nout == 1: - output_storage[0][0] = self.impl(*inputs) + dtype = node.outputs[0].dtype + output_storage[0][0] = self._cast_scalar(self.impl(*inputs), dtype) else: variables = from_return_values(self.impl(*inputs)) assert len(variables) == len(output_storage) - for storage, variable in zip(output_storage, variables): - storage[0] = variable + for out, storage, variable in zip(node.outputs, output_storage, variables): + dtype = out.dtype + storage[0] = self._cast_scalar(variable, dtype) def impl(self, *inputs): raise MethodNotDefined("impl", type(self), self.__class__.__name__) diff --git a/pytensor/tensor/elemwise.py b/pytensor/tensor/elemwise.py index de966f1a78..1b0d433dda 100644 --- a/pytensor/tensor/elemwise.py +++ b/pytensor/tensor/elemwise.py @@ -767,34 +767,16 @@ def perform(self, node, inputs, output_storage): for i, (variable, storage, nout) in enumerate( zip(variables, output_storage, node.outputs) ): - if getattr(variable, "dtype", "") == "object": - # Since numpy 1.6, function created with numpy.frompyfunc - # always return an ndarray with dtype object - variable = np.asarray(variable, dtype=nout.dtype) + storage[0] = 
variable = np.asarray(variable, dtype=nout.dtype) if i in self.inplace_pattern: odat = inputs[self.inplace_pattern[i]] odat[...] = variable storage[0] = odat - # Sometimes NumPy return a Python type. - # Some PyTensor op return a different dtype like floor, ceil, - # trunc, eq, ... - elif not isinstance(variable, np.ndarray) or variable.dtype != nout.dtype: - variable = np.asarray(variable, nout.dtype) - # The next line is needed for numpy 1.9. Otherwise - # there are tests that fail in DebugMode. - # Normally we would call pytensor.misc._asarray, but it - # is faster to inline the code. We know that the dtype - # are the same string, just different typenum. - if np.dtype(nout.dtype).num != variable.dtype.num: - variable = variable.view(dtype=nout.dtype) - storage[0] = variable # numpy.real return a view! - elif not variable.flags.owndata: + if not variable.flags.owndata: storage[0] = variable.copy() - else: - storage[0] = variable @staticmethod def _check_runtime_broadcast(node, inputs): diff --git a/tests/scalar/test_loop.py b/tests/scalar/test_loop.py index 88f1a588fd..88d14c6e43 100644 --- a/tests/scalar/test_loop.py +++ b/tests/scalar/test_loop.py @@ -212,12 +212,17 @@ def test_inner_composite(mode): y16 = op(n_steps, x16) assert y16.type.dtype == "float16" - fn32 = function([n_steps, x16], y16, mode=mode) + fn16 = function([n_steps, x16], y16, mode=mode) + out16 = fn16(n_steps=3, x16=np.array(4.73, dtype="float16")) np.testing.assert_allclose( - fn32(n_steps=9, x16=np.array(4.73, dtype="float16")), - 4.73 + 9, + out16, + 4.73 + 3, rtol=1e-3, ) + out16overflow = fn16(n_steps=9, x16=np.array(4.73, dtype="float16")) + assert out16overflow.dtype == "float16" + # with this dtype overflow happens + assert np.isnan(out16overflow) @mode @@ -243,8 +248,10 @@ def test_inner_loop(mode): y16 = outer_loop_op(n_steps, x16, n_steps) assert y16.type.dtype == "float16" - fn32 = function([n_steps, x16], y16, mode=mode) + fn16 = function([n_steps, x16], y16, mode=mode) + out16 = fn16(n_steps=3, x16=np.array(2.5, dtype="float16")) + assert out16.dtype == "float16" np.testing.assert_allclose( - fn32(n_steps=3, x16=np.array(2.5, dtype="float16")), + out16, 3**2 + 2.5, ) diff --git a/tests/tensor/utils.py b/tests/tensor/utils.py index 2f97d0e18f..85c48a42dd 100644 --- a/tests/tensor/utils.py +++ b/tests/tensor/utils.py @@ -508,15 +508,17 @@ def test_good(self): if not isinstance(expecteds, list | tuple): expecteds = (expecteds,) - for i, (variable, expected) in enumerate(zip(variables, expecteds)): + for i, (variable, expected, out_symbol) in enumerate( + zip(variables, expecteds, node.outputs) + ): condition = ( - variable.dtype != expected.dtype + variable.dtype != out_symbol.type.dtype or variable.shape != expected.shape or not np.allclose(variable, expected, atol=eps, rtol=eps) ) assert not condition, ( f"Test {self.op}::{testname}: Output {i} gave the wrong" - f" value. With inputs {inputs}, expected {expected} (dtype {expected.dtype})," + f" value. With inputs {inputs}, expected {expected} (dtype {out_symbol.type.dtype})," f" got {variable} (dtype {variable.dtype}). 
eps={eps:f}" f" np.allclose returns {np.allclose(variable, expected, atol=eps, rtol=eps)} {np.allclose(variable, expected)}" ) From dd8895df89a60259aaa73269ba23c16c5130afc2 Mon Sep 17 00:00:00 2001 From: ferres Date: Wed, 14 Aug 2024 19:38:23 +0300 Subject: [PATCH 35/72] mypy: fix graph.py --- pytensor/gradient.py | 30 ++++++++++++++++++++++++++++-- pytensor/graph/basic.py | 5 +++-- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/pytensor/gradient.py b/pytensor/gradient.py index abf80bff43..6b3a1a4b62 100644 --- a/pytensor/gradient.py +++ b/pytensor/gradient.py @@ -4,7 +4,7 @@ import warnings from collections.abc import Callable, Mapping, MutableSequence, Sequence from functools import partial, reduce -from typing import TYPE_CHECKING, Literal, TypeVar, Union +from typing import TYPE_CHECKING, Literal, TypeVar, Union, overload import numpy as np @@ -414,6 +414,32 @@ def Lop( return as_list_or_tuple(using_list, using_tuple, ret) +@overload +def grad( + cost: Variable | None, + wrt: Variable | Sequence[Variable], + consider_constant: Sequence[Variable] | None = ..., + disconnected_inputs: Literal["ignore", "warn", "raise"] = ..., + add_names: bool = ..., + known_grads: Mapping[Variable, Variable] | None = ..., + return_disconnected: Literal["zero", "disconnected"] = ..., + null_gradients: Literal["raise", "return"] = ..., +) -> Variable | None | Sequence[Variable]: ... + + +@overload +def grad( + cost: Variable | None, + wrt: Variable | Sequence[Variable], + consider_constant: Sequence[Variable] | None = ..., + disconnected_inputs: Literal["ignore", "warn", "raise"] = ..., + add_names: bool = ..., + known_grads: Mapping[Variable, Variable] | None = ..., + return_disconnected: Literal["none"] = ..., + null_gradients: Literal["raise", "return"] = ..., +) -> Variable | None | Sequence[Variable | None]: ... + + def grad( cost: Variable | None, wrt: Variable | Sequence[Variable], @@ -423,7 +449,7 @@ def grad( known_grads: Mapping[Variable, Variable] | None = None, return_disconnected: Literal["none", "zero", "disconnected"] = "zero", null_gradients: Literal["raise", "return"] = "raise", -) -> Variable | None | Sequence[Variable | None]: +) -> Variable | None | Sequence[Variable | None] | Sequence[Variable]: """ Return symbolic gradients of one cost with respect to one or more variables. 
diff --git a/pytensor/graph/basic.py b/pytensor/graph/basic.py index 2ffd101c23..057341909c 100644 --- a/pytensor/graph/basic.py +++ b/pytensor/graph/basic.py @@ -1313,8 +1313,9 @@ def clone_get_equiv( outputs: Reversible[Variable], copy_inputs: bool = True, copy_orphans: bool = True, - memo: dict[Union[Apply, Variable, "Op"], Union[Apply, Variable, "Op"]] - | None = None, + memo: ( + dict[Union[Apply, Variable, "Op"], Union[Apply, Variable, "Op"]] | None + ) = None, clone_inner_graphs: bool = False, **kwargs, ) -> dict[Union[Apply, Variable, "Op"], Union[Apply, Variable, "Op"]]: From a3f0a4eadba5b1fa6b0a81f6c4307fc2792281c0 Mon Sep 17 00:00:00 2001 From: ferres Date: Wed, 14 Aug 2024 19:44:12 +0300 Subject: [PATCH 36/72] mypy: fix graph/basic.py --- pytensor/graph/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytensor/graph/basic.py b/pytensor/graph/basic.py index 057341909c..ed1ad6b6c2 100644 --- a/pytensor/graph/basic.py +++ b/pytensor/graph/basic.py @@ -710,7 +710,7 @@ def clone(self, **kwargs): return cp -class NominalVariable(AtomicVariable[_TypeType]): +class NominalVariable(Generic[_TypeType, _IdType], AtomicVariable[_TypeType]): """A variable that enables alpha-equivalent comparisons.""" __instances__: dict[tuple["Type", Hashable], "NominalVariable"] = {} From 79232b26c2c516fa8b29b1eb1bf234ea0df08b3e Mon Sep 17 00:00:00 2001 From: Pham Nguyen Hung <97870091+HangenYuu@users.noreply.github.com> Date: Thu, 18 Jul 2024 17:58:59 +0700 Subject: [PATCH 37/72] Implement Dot and BatchedDot in PyTensor (#878) --- pytensor/link/__init__.py | 1 + pytensor/link/pytorch/dispatch/__init__.py | 5 +++- pytensor/link/pytorch/dispatch/blas.py | 14 +++++++++++ pytensor/link/pytorch/dispatch/math.py | 12 +++++++++ tests/link/pytorch/test_blas.py | 24 ++++++++++++++++++ tests/link/pytorch/test_math.py | 29 ++++++++++++++++++++++ 6 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 pytensor/link/pytorch/dispatch/blas.py create mode 100644 pytensor/link/pytorch/dispatch/math.py create mode 100644 tests/link/pytorch/test_blas.py create mode 100644 tests/link/pytorch/test_math.py diff --git a/pytensor/link/__init__.py b/pytensor/link/__init__.py index e69de29bb2..c8c236a854 100644 --- a/pytensor/link/__init__.py +++ b/pytensor/link/__init__.py @@ -0,0 +1 @@ +from pytensor.link.pytorch.linker import PytorchLinker diff --git a/pytensor/link/pytorch/dispatch/__init__.py b/pytensor/link/pytorch/dispatch/__init__.py index 017e57df64..fa47908d74 100644 --- a/pytensor/link/pytorch/dispatch/__init__.py +++ b/pytensor/link/pytorch/dispatch/__init__.py @@ -2,9 +2,12 @@ from pytensor.link.pytorch.dispatch.basic import pytorch_funcify, pytorch_typify # # Load dispatch specializations +import pytensor.link.pytorch.dispatch.blas import pytensor.link.pytorch.dispatch.scalar import pytensor.link.pytorch.dispatch.elemwise +import pytensor.link.pytorch.dispatch.math import pytensor.link.pytorch.dispatch.extra_ops -import pytensor.link.pytorch.dispatch.sort import pytensor.link.pytorch.dispatch.shape +import pytensor.link.pytorch.dispatch.sort + # isort: on diff --git a/pytensor/link/pytorch/dispatch/blas.py b/pytensor/link/pytorch/dispatch/blas.py new file mode 100644 index 0000000000..5691551998 --- /dev/null +++ b/pytensor/link/pytorch/dispatch/blas.py @@ -0,0 +1,14 @@ +import torch + +from pytensor.link.pytorch.dispatch import pytorch_funcify +from pytensor.tensor.blas import BatchedDot + + +@pytorch_funcify.register(BatchedDot) +def pytorch_funcify_BatchedDot(op, **kwargs): + 
def batched_dot(a, b): + if a.shape[0] != b.shape[0]: + raise TypeError("Shapes must match in the 0-th dimension") + return torch.bmm(a, b) + + return batched_dot diff --git a/pytensor/link/pytorch/dispatch/math.py b/pytensor/link/pytorch/dispatch/math.py new file mode 100644 index 0000000000..4275424f0a --- /dev/null +++ b/pytensor/link/pytorch/dispatch/math.py @@ -0,0 +1,12 @@ +import torch + +from pytensor.link.pytorch.dispatch import pytorch_funcify +from pytensor.tensor.math import Dot + + +@pytorch_funcify.register(Dot) +def pytorch_funcify_Dot(op, **kwargs): + def dot(x, y): + return torch.matmul(x, y) + + return dot diff --git a/tests/link/pytorch/test_blas.py b/tests/link/pytorch/test_blas.py new file mode 100644 index 0000000000..35f7dd7b6a --- /dev/null +++ b/tests/link/pytorch/test_blas.py @@ -0,0 +1,24 @@ +import numpy as np +import pytest + +from pytensor.configdefaults import config +from pytensor.graph.fg import FunctionGraph +from pytensor.tensor import blas as pt_blas +from pytensor.tensor.type import tensor3 +from tests.link.pytorch.test_basic import compare_pytorch_and_py + + +def test_pytorch_BatchedDot(): + # tensor3 . tensor3 + a = tensor3("a") + a_test = np.linspace(-1, 1, 10 * 5 * 3).astype(config.floatX).reshape((10, 5, 3)) + b = tensor3("b") + b_test = np.linspace(1, -1, 10 * 3 * 2).astype(config.floatX).reshape((10, 3, 2)) + out = pt_blas.BatchedDot()(a, b) + fgraph = FunctionGraph([a, b], [out]) + pytensor_pytorch_fn, _ = compare_pytorch_and_py(fgraph, [a_test, b_test]) + + # A dimension mismatch should raise a TypeError for compatibility + inputs = [a_test[:-1], b_test] + with pytest.raises(TypeError): + pytensor_pytorch_fn(*inputs) diff --git a/tests/link/pytorch/test_math.py b/tests/link/pytorch/test_math.py new file mode 100644 index 0000000000..affca4ad32 --- /dev/null +++ b/tests/link/pytorch/test_math.py @@ -0,0 +1,29 @@ +import numpy as np + +from pytensor.configdefaults import config +from pytensor.graph.fg import FunctionGraph +from pytensor.tensor.type import matrix, scalar, vector +from tests.link.pytorch.test_basic import compare_pytorch_and_py + + +def test_pytorch_dot(): + y = vector("y") + y_test = np.r_[1.0, 2.0].astype(config.floatX) + x = vector("x") + x_test = np.r_[3.0, 4.0].astype(config.floatX) + A = matrix("A") + A_test = np.array([[6, 3], [3, 0]], dtype=config.floatX) + alpha = scalar("alpha") + alpha_test = np.array(3.0, dtype=config.floatX) + beta = scalar("beta") + beta_test = np.array(5.0, dtype=config.floatX) + + # 2D * 2D + out = A.dot(A * alpha) + beta * A + fgraph = FunctionGraph([A, alpha, beta], [out]) + compare_pytorch_and_py(fgraph, [A_test, alpha_test, beta_test]) + + # 1D * 2D and 1D * 1D + out = y.dot(alpha * A).dot(x) + beta * y + fgraph = FunctionGraph([y, x, A, alpha, beta], [out]) + compare_pytorch_and_py(fgraph, [y_test, x_test, A_test, alpha_test, beta_test]) From 143ded672c42793ce02d0241561b9fc629d859d7 Mon Sep 17 00:00:00 2001 From: Jesse Grabowski <48652735+jessegrabowski@users.noreply.github.com> Date: Thu, 18 Jul 2024 09:08:56 -0500 Subject: [PATCH 38/72] Add `OpFromGraph` wrapper around `alloc_diag` (#915) * Add `OpFromGraph` wrapper around `alloc_diag` * Remove depreciated `AllocDiag` `Op`, rename `AllocDiag2 -> AllocDiag` * Set `inline = False` * Add rewrite to inline all `OpFromGraph` `Op`s * Add `is_zero_offset` helper to `Eye` * Add `is_left_expand_dims` and `is_right_expand_dims` attributes to `DimShuffle` * Seed `test_local_lift_through_linalg` test --- pytensor/compile/builders.py | 32 +----- 
pytensor/link/jax/dispatch/basic.py | 24 +++++ pytensor/tensor/basic.py | 141 +++++++++----------------- pytensor/tensor/elemwise.py | 8 ++ pytensor/tensor/rewriting/__init__.py | 1 + pytensor/tensor/rewriting/linalg.py | 119 ++++++++++++++-------- pytensor/tensor/rewriting/ofg.py | 68 +++++++++++++ tests/link/jax/test_basic.py | 19 +++- tests/tensor/rewriting/test_linalg.py | 17 +++- tests/tensor/rewriting/test_ofg.py | 22 ++++ 10 files changed, 278 insertions(+), 173 deletions(-) create mode 100644 pytensor/tensor/rewriting/ofg.py create mode 100644 tests/tensor/rewriting/test_ofg.py diff --git a/pytensor/compile/builders.py b/pytensor/compile/builders.py index 91588a5ecc..759c9b09bb 100644 --- a/pytensor/compile/builders.py +++ b/pytensor/compile/builders.py @@ -8,7 +8,6 @@ from pytensor.compile.function import function from pytensor.compile.function.pfunc import rebuild_collect_shared -from pytensor.compile.mode import optdb from pytensor.compile.sharedvalue import SharedVariable from pytensor.configdefaults import config from pytensor.gradient import DisconnectedType, Rop, grad @@ -24,7 +23,6 @@ from pytensor.graph.null_type import NullType from pytensor.graph.op import HasInnerGraph, Op from pytensor.graph.replace import clone_replace -from pytensor.graph.rewriting.basic import in2out, node_rewriter from pytensor.graph.utils import MissingInputError @@ -575,7 +573,7 @@ def lop_overrides(inps, grads): for inp_grad in input_grads if not isinstance(inp_grad.type, DisconnectedType | NullType) ] - lop_op = type(self)( + lop_op = OpFromGraph( inputs=inner_inputs + connected_inner_outputs + connected_output_grads, outputs=connected_input_grads, inline=self.is_inline, @@ -669,7 +667,7 @@ def _build_and_cache_rop_op(self): for out_grad in output_grads if not isinstance(out_grad.type, DisconnectedType | NullType) ] - rop_op = type(self)( + rop_op = OpFromGraph( inputs=inner_inputs + eval_points, outputs=filtered_output_grads, inline=self.is_inline, @@ -852,29 +850,3 @@ def perform(self, node, inputs, outputs): assert len(variables) == len(outputs) for output, variable in zip(outputs, variables): output[0] = variable - - -@node_rewriter([OpFromGraph]) -def inline_ofg_expansion(fgraph, node): - """ - This optimization expands internal graph of OpFromGraph. - Only performed if node.op.is_inline == True - Doing so can improve optimization at the cost of compilation speed. - """ - op = node.op - if not isinstance(op, OpFromGraph): - return False - if not op.is_inline: - return False - return clone_replace(op.inner_outputs, dict(zip(op.inner_inputs, node.inputs))) - - -# We want to run this before the first merge optimizer -# and before the first scan optimizer. 
-optdb.register( - "inline_ofg_expansion", - in2out(inline_ofg_expansion), - "fast_compile", - "fast_run", - position=-0.01, -) diff --git a/pytensor/link/jax/dispatch/basic.py b/pytensor/link/jax/dispatch/basic.py index b35759f837..bd559ee716 100644 --- a/pytensor/link/jax/dispatch/basic.py +++ b/pytensor/link/jax/dispatch/basic.py @@ -1,10 +1,13 @@ import warnings +from collections.abc import Callable from functools import singledispatch import jax import jax.numpy as jnp import numpy as np +from pytensor.compile import JAX +from pytensor.compile.builders import OpFromGraph from pytensor.compile.ops import DeepCopyOp, ViewOp from pytensor.configdefaults import config from pytensor.graph.fg import FunctionGraph @@ -114,3 +117,24 @@ def viewop(x): return x return viewop + + +@jax_funcify.register(OpFromGraph) +def jax_funcify_OpFromGraph(ofg: OpFromGraph, node=None, **kwargs) -> Callable: + _ = kwargs.pop("storage_map", None) + + # Apply inner rewrites + JAX.optimizer(ofg.fgraph) + fgraph_fn = jax_funcify(ofg.fgraph, **kwargs) + + if len(ofg.fgraph.outputs) == 1: + + def opfromgraph(*inputs): + return fgraph_fn(*inputs)[0] + + else: + + def opfromgraph(*inputs): + return fgraph_fn(*inputs) + + return opfromgraph diff --git a/pytensor/tensor/basic.py b/pytensor/tensor/basic.py index 014ae80e4c..119c44c647 100644 --- a/pytensor/tensor/basic.py +++ b/pytensor/tensor/basic.py @@ -21,6 +21,7 @@ import pytensor.scalar.sharedvar from pytensor import compile, config, printing from pytensor import scalar as ps +from pytensor.compile.builders import OpFromGraph from pytensor.gradient import DisconnectedType, grad_undefined from pytensor.graph import RewriteDatabaseQuery from pytensor.graph.basic import Apply, Constant, Variable, equal_computations @@ -1334,6 +1335,25 @@ def infer_shape(self, fgraph, node, in_shapes): def grad(self, inp, grads): return [grad_undefined(self, i, inp[i]) for i in range(3)] + @staticmethod + def is_offset_zero(node) -> bool: + """ + Test if an Eye Op has a diagonal offset of zero + + Parameters + ---------- + node + Eye node to test + + Returns + ------- + is_offset_zero: bool + True if the offset is zero (``k = 0``). + """ + + offset = node.inputs[-1] + return isinstance(offset, Constant) and offset.data.item() == 0 + def eye(n, m=None, k=0, dtype=None): """Return a 2-D array with ones on the diagonal and zeros elsewhere. @@ -3749,109 +3769,37 @@ def trace(a, offset=0, axis1=0, axis2=1): return diagonal(a, offset=offset, axis1=axis1, axis2=axis2).sum(-1) -class AllocDiag(Op): - """An `Op` that copies a vector to the diagonal of a zero-ed matrix.""" +class AllocDiag(OpFromGraph): + """ + Wrapper Op for alloc_diag graphs + """ - __props__ = ("offset", "axis1", "axis2") + __props__ = ("axis1", "axis2") - def __init__(self, offset=0, axis1=0, axis2=1): - """ - Parameters - ---------- - offset: int - Offset of the diagonal from the main diagonal defined by `axis1` - and `axis2`. Can be positive or negative. Defaults to main - diagonal (i.e. 0). - axis1: int - Axis to be used as the first axis of the 2-D sub-arrays to which - the diagonals will be allocated. Defaults to first axis (i.e. 0). - axis2: int - Axis to be used as the second axis of the 2-D sub-arrays to which - the diagonals will be allocated. Defaults to second axis (i.e. 1). - """ - warnings.warn( - "AllocDiag is deprecated. 
Use `alloc_diag` instead", - FutureWarning, - ) - self.offset = offset - if axis1 < 0 or axis2 < 0: - raise NotImplementedError("AllocDiag does not support negative axis") - if axis1 == axis2: - raise ValueError("axis1 and axis2 cannot be the same") + def __init__(self, *args, axis1, axis2, offset, **kwargs): self.axis1 = axis1 self.axis2 = axis2 + self.offset = offset - def make_node(self, diag): - diag = as_tensor_variable(diag) - if diag.type.ndim < 1: - raise ValueError( - "AllocDiag needs an input with 1 or more dimensions", diag.type - ) - return Apply( - self, - [diag], - [diag.type.clone(shape=(None,) * (diag.ndim + 1))()], - ) - - def perform(self, node, inputs, outputs): - (x,) = inputs - (z,) = outputs - - axis1 = np.minimum(self.axis1, self.axis2) - axis2 = np.maximum(self.axis1, self.axis2) - offset = self.offset - - # Create array with one extra dimension for resulting matrix - result_shape = x.shape[:-1] + (x.shape[-1] + abs(offset),) * 2 - result = np.zeros(result_shape, dtype=x.dtype) - - # Create slice for diagonal in final 2 axes - idxs = np.arange(x.shape[-1]) - diagonal_slice = (len(result_shape) - 2) * [slice(None)] + [ - idxs + np.maximum(0, -offset), - idxs + np.maximum(0, offset), - ] - - # Fill in final 2 axes with x - result[tuple(diagonal_slice)] = x - - if len(x.shape) > 1: - # Re-order axes so they correspond to diagonals at axis1, axis2 - axes = list(range(len(x.shape[:-1]))) - last_idx = axes[-1] - axes = axes[:axis1] + [last_idx + 1] + axes[axis1:] - axes = axes[:axis2] + [last_idx + 2] + axes[axis2:] - result = result.transpose(axes) - - z[0] = result - - def grad(self, inputs, gout): - (gz,) = gout - return [diagonal(gz, offset=self.offset, axis1=self.axis1, axis2=self.axis2)] - - def infer_shape(self, fgraph, nodes, shapes): - (x_shape,) = shapes - axis1 = np.minimum(self.axis1, self.axis2) - axis2 = np.maximum(self.axis1, self.axis2) + super().__init__(*args, **kwargs, strict=True) - result_shape = list(x_shape[:-1]) - diag_shape = x_shape[-1] + abs(self.offset) - result_shape = result_shape[:axis1] + [diag_shape] + result_shape[axis1:] - result_shape = result_shape[:axis2] + [diag_shape] + result_shape[axis2:] - return [tuple(result_shape)] + @staticmethod + def is_offset_zero(node) -> bool: + """ + Test if an AllocDiag Op has a diagonal offset of zero - def __setstate__(self, state): - if "view_map" in state: - del state["view_map"] + Parameters + ---------- + node + AllocDiag node to test - self.__dict__.update(state) + Returns + ------- + is_offset_zero: bool + True if the offset is zero (``k = 0``). 
+ """ - if "offset" not in state: - self.offset = 0 - if "axis1" not in state: - self.axis1 = 0 - if "axis2" not in state: - self.axis2 = 1 + return node.op.offset == 0 def alloc_diag(diag, offset=0, axis1=0, axis2=1): @@ -3862,6 +3810,7 @@ def alloc_diag(diag, offset=0, axis1=0, axis2=1): from pytensor.tensor import set_subtensor diag = as_tensor_variable(diag) + axis1, axis2 = normalize_axis_tuple((axis1, axis2), ndim=diag.type.ndim + 1) if axis1 > axis2: axis1, axis2 = axis2, axis1 @@ -3888,7 +3837,9 @@ def alloc_diag(diag, offset=0, axis1=0, axis2=1): axes = axes[:axis2] + [last_idx + 2] + axes[axis2:] result = result.transpose(axes) - return result + return AllocDiag( + inputs=[diag], outputs=[result], axis1=axis1, axis2=axis2, offset=offset + )(diag) def diag(v, k=0): diff --git a/pytensor/tensor/elemwise.py b/pytensor/tensor/elemwise.py index 971be19f46..de966f1a78 100644 --- a/pytensor/tensor/elemwise.py +++ b/pytensor/tensor/elemwise.py @@ -185,6 +185,14 @@ def __init__(self, input_broadcastable, new_order): self.augment = sorted(i for i, x in enumerate(new_order) if x == "x") self.drop = drop + input_ndim = len(input_broadcastable) + self.is_left_expand_dims = self.augment and ( + input_ndim == 0 or new_order[-input_ndim:] == list(range(input_ndim)) + ) + self.is_right_expand_dims = self.augment and new_order[:input_ndim] == list( + range(input_ndim) + ) + if self.inplace: self.view_map = {0: [0]} diff --git a/pytensor/tensor/rewriting/__init__.py b/pytensor/tensor/rewriting/__init__.py index 617eab04fa..168b636041 100644 --- a/pytensor/tensor/rewriting/__init__.py +++ b/pytensor/tensor/rewriting/__init__.py @@ -10,6 +10,7 @@ import pytensor.tensor.rewriting.jax import pytensor.tensor.rewriting.linalg import pytensor.tensor.rewriting.math +import pytensor.tensor.rewriting.ofg import pytensor.tensor.rewriting.shape import pytensor.tensor.rewriting.special import pytensor.tensor.rewriting.subtensor diff --git a/pytensor/tensor/rewriting/linalg.py b/pytensor/tensor/rewriting/linalg.py index 38ed9a51d5..5f2e8cf388 100644 --- a/pytensor/tensor/rewriting/linalg.py +++ b/pytensor/tensor/rewriting/linalg.py @@ -5,12 +5,16 @@ from pytensor import Variable from pytensor.graph import Apply, FunctionGraph from pytensor.graph.rewriting.basic import ( - PatternNodeRewriter, copy_stack_trace, node_rewriter, ) from pytensor.scalar.basic import Mul -from pytensor.tensor.basic import ARange, Eye, TensorVariable, alloc, diagonal +from pytensor.tensor.basic import ( + AllocDiag, + Eye, + TensorVariable, + diagonal, +) from pytensor.tensor.blas import Dot22 from pytensor.tensor.blockwise import Blockwise from pytensor.tensor.elemwise import DimShuffle, Elemwise @@ -41,7 +45,6 @@ solve, solve_triangular, ) -from pytensor.tensor.subtensor import advanced_set_subtensor logger = logging.getLogger(__name__) @@ -402,30 +405,68 @@ def _find_diag_from_eye_mul(potential_mul_input): eye_input = [ mul_input for mul_input in inputs_to_mul - if mul_input.owner and isinstance(mul_input.owner.op, Eye) + if mul_input.owner + and ( + isinstance(mul_input.owner.op, Eye) + or + # This whole condition checks if there is an Eye hiding inside a DimShuffle. + # This arises from batched elementwise multiplication between a tensor and an eye, e.g.: + # tensor(shape=(None, 3, 3) * eye(3). This is still potentially valid for diag rewrites. 
+ ( + isinstance(mul_input.owner.op, DimShuffle) + and ( + mul_input.owner.op.is_left_expand_dims + or mul_input.owner.op.is_right_expand_dims + ) + and mul_input.owner.inputs[0].owner is not None + and isinstance(mul_input.owner.inputs[0].owner.op, Eye) + ) + ) ] - # Check if 1's are being put on the main diagonal only (k = 0) - if eye_input and getattr(eye_input[0].owner.inputs[-1], "data", -1).item() != 0: + if not eye_input: return None - # If the broadcast pattern of eye_input is not (False, False), we do not get a diagonal matrix and thus, dont need to apply the rewrite - if eye_input and eye_input[0].broadcastable[-2:] != (False, False): + eye_input = eye_input[0] + # If eye_input is an Eye Op (it's not wrapped in a DimShuffle), check it doesn't have an offset + if isinstance(eye_input.owner.op, Eye) and ( + not Eye.is_offset_zero(eye_input.owner) + or eye_input.broadcastable[-2:] != (False, False) + ): return None + # Otherwise, an Eye was found but it is wrapped in a DimShuffle (i.e. there was some broadcasting going on). + # We have to look inside DimShuffle to decide if the rewrite can be applied + if isinstance(eye_input.owner.op, DimShuffle) and ( + eye_input.owner.op.is_left_expand_dims + or eye_input.owner.op.is_right_expand_dims + ): + inner_eye = eye_input.owner.inputs[0] + # We can only rewrite when the Eye is on the main diagonal (the offset is zero) and the identity isn't + # degenerate + if not Eye.is_offset_zero(inner_eye.owner) or inner_eye.broadcastable[-2:] != ( + False, + False, + ): + return None + # Get all non Eye inputs (scalars/matrices/vectors) - non_eye_inputs = list(set(inputs_to_mul) - set(eye_input)) + non_eye_inputs = list(set(inputs_to_mul) - {eye_input}) return eye_input, non_eye_inputs @register_canonicalize("shape_unsafe") @register_stabilize("shape_unsafe") @node_rewriter([det]) -def rewrite_det_diag_from_eye_mul(fgraph, node): +def rewrite_det_diag_to_prod_diag(fgraph, node): """ - This rewrite takes advantage of the fact that for a diagonal matrix, the determinant value is the product of its diagonal elements. + This rewrite takes advantage of the fact that for a diagonal matrix, the determinant value is the product of its + diagonal elements. - The presence of a diagonal matrix is detected by inspecting the graph. This rewrite can identify diagonal matrices that arise as the result of elementwise multiplication with an identity matrix. Specialized computation is used to make this rewrite as efficient as possible, depending on whether the multiplication was with a scalar, vector or a matrix. + The presence of a diagonal matrix is detected by inspecting the graph. This rewrite can identify diagonal matrices + that arise as the result of elementwise multiplication with an identity matrix. Specialized computation is used to + make this rewrite as efficient as possible, depending on whether the multiplication was with a scalar, + vector or a matrix. 
Parameters ---------- @@ -439,53 +480,45 @@ def rewrite_det_diag_from_eye_mul(fgraph, node): list of Variable, optional List of optimized variables, or None if no optimization was performed """ - potential_mul_input = node.inputs[0] - eye_non_eye_inputs = _find_diag_from_eye_mul(potential_mul_input) - if eye_non_eye_inputs is None: + inputs = node.inputs[0] + + # Check for use of pt.diag first + if ( + inputs.owner + and isinstance(inputs.owner.op, AllocDiag) + and AllocDiag.is_offset_zero(inputs.owner) + ): + diag_input = inputs.owner.inputs[0] + det_val = diag_input.prod(axis=-1) + return [det_val] + + # Check if the input is an elemwise multiply with identity matrix -- this also results in a diagonal matrix + inputs_or_none = _find_diag_from_eye_mul(inputs) + if inputs_or_none is None: return None - eye_input, non_eye_inputs = eye_non_eye_inputs + + eye_input, non_eye_inputs = inputs_or_none # Dealing with only one other input if len(non_eye_inputs) != 1: return None - useful_eye, useful_non_eye = eye_input[0], non_eye_inputs[0] + eye_input, non_eye_input = eye_input[0], non_eye_inputs[0] # Checking if original x was scalar/vector/matrix - if useful_non_eye.type.broadcastable[-2:] == (True, True): + if non_eye_input.type.broadcastable[-2:] == (True, True): # For scalar - det_val = useful_non_eye.squeeze(axis=(-1, -2)) ** (useful_eye.shape[0]) - elif useful_non_eye.type.broadcastable[-2:] == (False, False): + det_val = non_eye_input.squeeze(axis=(-1, -2)) ** (eye_input.shape[0]) + elif non_eye_input.type.broadcastable[-2:] == (False, False): # For Matrix - det_val = useful_non_eye.diagonal(axis1=-1, axis2=-2).prod(axis=-1) + det_val = non_eye_input.diagonal(axis1=-1, axis2=-2).prod(axis=-1) else: # For vector - det_val = useful_non_eye.prod(axis=(-1, -2)) + det_val = non_eye_input.prod(axis=(-1, -2)) det_val = det_val.astype(node.outputs[0].type.dtype) return [det_val] -arange = ARange("int64") -det_diag_from_diag = PatternNodeRewriter( - ( - det, - ( - advanced_set_subtensor, - (alloc, 0, "sh1", "sh2"), - "x", - (arange, 0, "stop", 1), - (arange, 0, "stop", 1), - ), - ), - (prod, "x"), - name="det_diag_from_diag", - allow_multiple_clients=True, -) -register_canonicalize(det_diag_from_diag) -register_stabilize(det_diag_from_diag) -register_specialize(det_diag_from_diag) - - @register_canonicalize @register_stabilize @register_specialize diff --git a/pytensor/tensor/rewriting/ofg.py b/pytensor/tensor/rewriting/ofg.py new file mode 100644 index 0000000000..265f3ff2e8 --- /dev/null +++ b/pytensor/tensor/rewriting/ofg.py @@ -0,0 +1,68 @@ +from pytensor import clone_replace +from pytensor.compile import optdb +from pytensor.compile.builders import OpFromGraph +from pytensor.graph import node_rewriter +from pytensor.graph.rewriting.basic import copy_stack_trace, in2out +from pytensor.tensor.basic import AllocDiag +from pytensor.tensor.rewriting.basic import register_specialize + + +@node_rewriter([OpFromGraph]) +def inline_ofg_expansion(fgraph, node): + """ + This optimization expands internal graph of OpFromGraph. + Only performed if node.op.is_inline == True + Doing so can improve optimization at the cost of compilation speed. + """ + op = node.op + if not op.is_inline: + return False + + new_out = clone_replace(op.inner_outputs, dict(zip(op.inner_inputs, node.inputs))) + copy_stack_trace(op.inner_outputs, new_out) + + return new_out + + +# We want to run this before the first merge optimizer +# and before the first scan optimizer. 
+optdb.register( + "inline_ofg_expansion", + in2out(inline_ofg_expansion), + "fast_compile", + "fast_run", + position=-0.01, +) + + +@register_specialize("inline_ofg") +@node_rewriter([AllocDiag]) +def late_inline_OpFromGraph(fgraph, node): + """ + Inline `OpFromGraph` nodes. + + OpFromGraph nodes are used to compactly represent the output of a function graph. Certain `Ops`, like, einsum, + diag, and kron, are implemented using pytensor `Op`s. As a result, their outputs are not a single `Op`, but a + graph. To allow rewrites to easily spot and manipulate these "composite functions", we use the `OpFromGraph` node. + This node is a thin wrapper around the output graph. It is not, however, meant to be included in the final + program, because it hides the inner graph from certain optimizations. + + This rewrite specifies that all `OpFromGraph` nodes should be replaced by their inner graphs by setting the + `inplace=True` flag. + + Parameters + ---------- + fgraph: FunctionGraph + The function graph being rewritten + node: Apply + Node of the function graph to be optimized + + Returns + ------- + + """ + op = node.op + new_out = clone_replace(op.inner_outputs, dict(zip(op.inner_inputs, node.inputs))) + copy_stack_trace(op.inner_outputs, new_out) + + return new_out diff --git a/tests/link/jax/test_basic.py b/tests/link/jax/test_basic.py index 76c8b4b329..5cd2bd54c6 100644 --- a/tests/link/jax/test_basic.py +++ b/tests/link/jax/test_basic.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pytensor.compile.builders import OpFromGraph from pytensor.compile.function import function from pytensor.compile.mode import get_mode from pytensor.compile.sharedvalue import SharedVariable, shared @@ -13,7 +14,7 @@ from pytensor.graph.op import Op, get_test_value from pytensor.ifelse import ifelse from pytensor.raise_op import assert_op -from pytensor.tensor.type import dscalar, scalar, vector +from pytensor.tensor.type import dscalar, matrices, scalar, vector @pytest.fixture(scope="module", autouse=True) @@ -209,3 +210,19 @@ def test_jax_checkandraise(): def set_test_value(x, v): x.tag.test_value = v return x + + +def test_OpFromGraph(): + x, y, z = matrices("xyz") + ofg_1 = OpFromGraph([x, y], [x + y], inline=False) + ofg_2 = OpFromGraph([x, y], [x * y, x - y], inline=False) + + o1, o2 = ofg_2(y, z) + out = ofg_1(x, o1) + o2 + out_fg = FunctionGraph([x, y, z], [out]) + + xv = np.ones((2, 2), dtype=config.floatX) + yv = np.ones((2, 2), dtype=config.floatX) * 3 + zv = np.ones((2, 2), dtype=config.floatX) * 5 + + compare_jax_and_py(out_fg, [xv, yv, zv]) diff --git a/tests/tensor/rewriting/test_linalg.py b/tests/tensor/rewriting/test_linalg.py index d59e3cc88f..0bc064fe65 100644 --- a/tests/tensor/rewriting/test_linalg.py +++ b/tests/tensor/rewriting/test_linalg.py @@ -362,6 +362,8 @@ def test_invalid_batched_a(self): ids=["block_diag", "kron"], ) def test_local_lift_through_linalg(constructor, f_op, f, g_op, g): + rng = np.random.default_rng(sum(map(ord, "lift_through_linalg"))) + if pytensor.config.floatX.endswith("32"): pytest.skip("Test is flaky at half precision") @@ -371,6 +373,7 @@ def test_local_lift_through_linalg(constructor, f_op, f, g_op, g): f1 = pytensor.function( [A, B], X, mode=get_default_mode().including("local_lift_through_linalg") ) + f2 = pytensor.function( [A, B], X, mode=get_default_mode().excluding("local_lift_through_linalg") ) @@ -386,9 +389,7 @@ def test_local_lift_through_linalg(constructor, f_op, f, g_op, g): assert len(f_ops) == 2 assert len(g_ops) == 1 - test_vals = [ - 
np.random.normal(size=(3,) * A.ndim).astype(config.floatX) for _ in range(2) - ] + test_vals = [rng.normal(size=(3,) * A.ndim).astype(config.floatX) for _ in range(2)] test_vals = [x @ np.swapaxes(x, -1, -2) for x in test_vals] np.testing.assert_allclose(f1(*test_vals), f2(*test_vals), atol=1e-8) @@ -403,13 +404,18 @@ def test_det_diag_from_eye_mul(shape): # Initializing x based on scalar/vector/matrix x = pt.tensor("x", shape=shape) y = pt.eye(7) * x + # Calculating determinant value using pt.linalg.det z_det = pt.linalg.det(y) # REWRITE TEST f_rewritten = function([x], z_det, mode="FAST_RUN") nodes = f_rewritten.maker.fgraph.apply_nodes - assert not any(isinstance(node.op, Det) for node in nodes) + + assert not any( + isinstance(node.op, Det) or isinstance(getattr(node.op, "core_op", None), Det) + for node in nodes + ) # NUMERIC VALUE TEST if len(shape) == 0: @@ -418,6 +424,7 @@ def test_det_diag_from_eye_mul(shape): x_test = np.random.rand(*shape).astype(config.floatX) else: x_test = np.random.rand(*shape).astype(config.floatX) + x_test_matrix = np.eye(7) * x_test det_val = np.linalg.det(x_test_matrix) rewritten_val = f_rewritten(x_test) @@ -459,6 +466,7 @@ def test_dont_apply_det_diag_rewrite_for_1_1(): x_diag = pt.eye(1, 1) * x y = pt.linalg.det(x_diag) f_rewritten = function([x], y, mode="FAST_RUN") + nodes = f_rewritten.maker.fgraph.apply_nodes assert any(isinstance(node.op, Det) for node in nodes) @@ -468,6 +476,7 @@ def test_dont_apply_det_diag_rewrite_for_1_1(): x_test_matrix = np.eye(1, 1) * x_test det_val = np.linalg.det(x_test_matrix) rewritten_val = f_rewritten(x_test) + assert_allclose( det_val, rewritten_val, diff --git a/tests/tensor/rewriting/test_ofg.py b/tests/tensor/rewriting/test_ofg.py new file mode 100644 index 0000000000..6304939562 --- /dev/null +++ b/tests/tensor/rewriting/test_ofg.py @@ -0,0 +1,22 @@ +import pytest + +import pytensor +import pytensor.tensor as pt +from pytensor import config +from pytensor.compile.builders import OpFromGraph + + +@pytest.mark.skipif( + config.mode == "FAST_COMPILE", + reason="Rewrite is not applied in FAST_COMPILE mode", +) +def test_alloc_diag_inlined(): + x = pt.tensor("x", shape=(None,)) + + z = pt.diag(x) + assert isinstance(z.owner.op, OpFromGraph) + + f = pytensor.function([x], z) + nodes = f.maker.fgraph.apply_nodes + + assert not any(isinstance(node.op, OpFromGraph) for node in nodes) From 8c30780af61a3c0fa4c4cf818594c643925a9949 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 18 Jul 2024 18:15:50 +0200 Subject: [PATCH 39/72] Bump actions/upload-artifact from 3 to 4 (#560) * Bump actions/upload-artifact from 3 to 4 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 3 to 4. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] * Migrate pypi to GHA artifacts v4 * Bump download-artifact to v4 * Eliminate undefined matrix.python-version variable * Upload/download each platform separately * Use pattern arg to download-artifact --------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Ben Mares --- .github/workflows/pypi.yml | 41 ++++++++++++++++++++++++++------------ .github/workflows/test.yml | 9 +++++---- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index d129c0c32a..ca37e422d0 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -30,12 +30,13 @@ jobs: - name: Build SDist run: pipx run build --sdist - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: + name: sdist path: dist/*.tar.gz build_wheels: - name: Build ${{ matrix.python-version }} wheels on ${{ matrix.platform }} + name: Build wheels for ${{ matrix.platform }} runs-on: ${{ matrix.platform }} strategy: matrix: @@ -51,8 +52,9 @@ jobs: - name: Build wheels uses: pypa/cibuildwheel@v2.19.2 - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: + name: wheels-${{ matrix.platform }} path: ./wheelhouse/*.whl check_dist: @@ -60,10 +62,17 @@ jobs: needs: [make_sdist,build_wheels] runs-on: ubuntu-22.04 steps: - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: - name: artifact + name: sdist path: dist + + - uses: actions/download-artifact@v4 + with: + pattern: wheels-* + path: dist + merge-multiple: true + - name: Check SDist run: | mkdir -p test-sdist @@ -83,12 +92,18 @@ jobs: runs-on: ubuntu-latest if: github.event_name == 'release' && github.event.action == 'published' steps: - - uses: actions/download-artifact@v3 - with: - name: artifact - path: dist + - uses: actions/download-artifact@v4 + with: + name: sdist + path: dist - - uses: pypa/gh-action-pypi-publish@v1.9.0 - with: - user: __token__ - password: ${{ secrets.pypi_password }} + - uses: actions/download-artifact@v4 + with: + pattern: wheels-* + path: dist + merge-multiple: true + + - uses: pypa/gh-action-pypi-publish@v1.9.0 + with: + user: __token__ + password: ${{ secrets.pypi_password }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index eb36a61386..674bc52c7b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -187,9 +187,9 @@ jobs: FLOAT32: ${{ matrix.float32 }} - name: Upload coverage file - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: coverage + name: coverage-${{ steps.matrix-id.outputs.id }} path: coverage/coverage-${{ steps.matrix-id.outputs.id }}.xml benchmarks: @@ -273,10 +273,11 @@ jobs: python -m pip install -U coverage>=5.1 coveralls - name: Download coverage file - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: - name: coverage + pattern: coverage-* path: coverage + merge-multiple: true - name: Upload coverage to Codecov uses: codecov/codecov-action@v4 From 297bdd45a6362c82ae58fe8ffef865ec35f33295 Mon Sep 17 00:00:00 2001 From: Tanish Date: Fri, 19 Jul 2024 16:20:37 +0530 Subject: [PATCH 40/72] Added rewrite for matrix inv(inv(x)) -> x (#893) --- pytensor/tensor/rewriting/linalg.py | 42 +++++++++++++++++++++++++++ tests/tensor/rewriting/test_linalg.py | 14 +++++++++ 2 files changed, 56 insertions(+) diff --git a/pytensor/tensor/rewriting/linalg.py b/pytensor/tensor/rewriting/linalg.py 
index 5f2e8cf388..1de6dbb373 100644 --- a/pytensor/tensor/rewriting/linalg.py +++ b/pytensor/tensor/rewriting/linalg.py @@ -569,3 +569,45 @@ def svd_uv_merge(fgraph, node): or len(fgraph.clients[cl.outputs[2]]) > 0 ): return [cl.outputs[1]] + + +@register_canonicalize +@register_stabilize +@node_rewriter([Blockwise]) +def rewrite_inv_inv(fgraph, node): + """ + This rewrite takes advantage of the fact that if there are two consecutive inverse operations (inv(inv(input))), we get back our original input without having to compute inverse once. + + Here, we check for direct inverse operations (inv/pinv) and allows for any combination of these "inverse" nodes to be simply rewritten. + + Parameters + ---------- + fgraph: FunctionGraph + Function graph being optimized + node: Apply + Node of the function graph to be optimized + + Returns + ------- + list of Variable, optional + List of optimized variables, or None if no optimization was performed + """ + valid_inverses = (MatrixInverse, MatrixPinv) + # Check if its a valid inverse operation (either inv/pinv) + # In case the outer operation is an inverse, it directly goes to the next step of finding inner operation + # If the outer operation is not a valid inverse, we do not apply this rewrite + if not isinstance(node.op.core_op, valid_inverses): + return None + + potential_inner_inv = node.inputs[0].owner + if potential_inner_inv is None or potential_inner_inv.op is None: + return None + + # Check if inner op is blockwise and and possible inv + if not ( + potential_inner_inv + and isinstance(potential_inner_inv.op, Blockwise) + and isinstance(potential_inner_inv.op.core_op, valid_inverses) + ): + return None + return [potential_inner_inv.inputs[0]] diff --git a/tests/tensor/rewriting/test_linalg.py b/tests/tensor/rewriting/test_linalg.py index 0bc064fe65..7353a82be0 100644 --- a/tests/tensor/rewriting/test_linalg.py +++ b/tests/tensor/rewriting/test_linalg.py @@ -10,6 +10,7 @@ from pytensor import tensor as pt from pytensor.compile import get_default_mode from pytensor.configdefaults import config +from pytensor.graph.rewriting.utils import rewrite_graph from pytensor.tensor import swapaxes from pytensor.tensor.blockwise import Blockwise from pytensor.tensor.elemwise import DimShuffle @@ -554,3 +555,16 @@ def test_svd_uv_merge(): assert node.op.compute_uv svd_counter += 1 assert svd_counter == 1 + + +@pytest.mark.parametrize("inv_op_1", ["inv", "pinv"]) +@pytest.mark.parametrize("inv_op_2", ["inv", "pinv"]) +def test_inv_inv_rewrite(inv_op_1, inv_op_2): + def get_pt_function(x, op_name): + return getattr(pt.linalg, op_name)(x) + + x = pt.matrix("x") + op1 = get_pt_function(x, inv_op_1) + op2 = get_pt_function(op1, inv_op_2) + rewritten_out = rewrite_graph(op2) + assert rewritten_out == x From a4e014e66c40f1514ccf62bade10d5c226fbb908 Mon Sep 17 00:00:00 2001 From: Jesse Grabowski <48652735+jessegrabowski@users.noreply.github.com> Date: Fri, 19 Jul 2024 08:08:35 -0500 Subject: [PATCH 41/72] Implement `pad` (#748) * Add `pt.pad` * Refactor linspace, logspace, and geomspace to match numpy implementation * Add `pt.flip` * Move `flip` to `tensor/subtensor.py`, add docstring * Move `slice_at_axis` to `tensor/subtensor` and expose it in `pytensor.tensor` --- pytensor/link/jax/dispatch/__init__.py | 1 + pytensor/link/jax/dispatch/pad.py | 53 ++ pytensor/tensor/__init__.py | 1 + pytensor/tensor/extra_ops.py | 366 ++++++++++++- pytensor/tensor/pad.py | 690 +++++++++++++++++++++++++ pytensor/tensor/subtensor.py | 115 +++++ tests/link/jax/test_pad.py | 63 +++ 
tests/link/numba/test_pad.py | 68 +++ tests/tensor/test_extra_ops.py | 53 +- tests/tensor/test_pad.py | 224 ++++++++ tests/tensor/test_subtensor.py | 38 ++ 11 files changed, 1632 insertions(+), 40 deletions(-) create mode 100644 pytensor/link/jax/dispatch/pad.py create mode 100644 pytensor/tensor/pad.py create mode 100644 tests/link/jax/test_pad.py create mode 100644 tests/link/numba/test_pad.py create mode 100644 tests/tensor/test_pad.py diff --git a/pytensor/link/jax/dispatch/__init__.py b/pytensor/link/jax/dispatch/__init__.py index 1d8ae33104..f4098416b8 100644 --- a/pytensor/link/jax/dispatch/__init__.py +++ b/pytensor/link/jax/dispatch/__init__.py @@ -6,6 +6,7 @@ import pytensor.link.jax.dispatch.blockwise import pytensor.link.jax.dispatch.elemwise import pytensor.link.jax.dispatch.extra_ops +import pytensor.link.jax.dispatch.pad import pytensor.link.jax.dispatch.math import pytensor.link.jax.dispatch.nlinalg import pytensor.link.jax.dispatch.random diff --git a/pytensor/link/jax/dispatch/pad.py b/pytensor/link/jax/dispatch/pad.py new file mode 100644 index 0000000000..6d40d20cc1 --- /dev/null +++ b/pytensor/link/jax/dispatch/pad.py @@ -0,0 +1,53 @@ +import jax.numpy as jnp +import numpy as np + +from pytensor.link.jax.dispatch import jax_funcify +from pytensor.tensor.pad import Pad + + +@jax_funcify.register(Pad) +def jax_funcify_pad(op, **kwargs): + pad_mode = op.pad_mode + reflect_type = op.reflect_type + has_stat_length = op.has_stat_length + + if pad_mode == "constant": + + def constant_pad(x, pad_width, constant_values): + return jnp.pad(x, pad_width, mode=pad_mode, constant_values=constant_values) + + return constant_pad + + elif pad_mode == "linear_ramp": + + def lr_pad(x, pad_width, end_values): + # JAX does not allow a dynamic input if end_values is non-scalar + if not isinstance(end_values, int | float): + end_values = tuple(np.array(end_values)) + return jnp.pad(x, pad_width, mode=pad_mode, end_values=end_values) + + return lr_pad + + elif pad_mode in ["maximum", "minimum", "mean"] and has_stat_length: + + def stat_pad(x, pad_width, stat_length): + # JAX does not allow a dynamic input here, need to cast to tuple + return jnp.pad( + x, pad_width, mode=pad_mode, stat_length=tuple(np.array(stat_length)) + ) + + return stat_pad + + elif pad_mode in ["reflect", "symmetric"]: + + def loop_pad(x, pad_width): + return jnp.pad(x, pad_width, mode=pad_mode, reflect_type=reflect_type) + + return loop_pad + + else: + + def pad(x, pad_width): + return jnp.pad(x, pad_width, mode=pad_mode) + + return pad diff --git a/pytensor/tensor/__init__.py b/pytensor/tensor/__init__.py index 3dfa1b4b7a..81cabfa6bd 100644 --- a/pytensor/tensor/__init__.py +++ b/pytensor/tensor/__init__.py @@ -130,6 +130,7 @@ def _get_vector_length_Constant(op: Op | Variable, var: Constant) -> int: from pytensor.tensor.extra_ops import * from pytensor.tensor.io import * from pytensor.tensor.math import * +from pytensor.tensor.pad import pad from pytensor.tensor.shape import ( reshape, shape, diff --git a/pytensor/tensor/extra_ops.py b/pytensor/tensor/extra_ops.py index b1eaf4f001..cf809a55ef 100644 --- a/pytensor/tensor/extra_ops.py +++ b/pytensor/tensor/extra_ops.py @@ -1,3 +1,4 @@ +import warnings from collections.abc import Collection, Iterable import numpy as np @@ -20,14 +21,24 @@ from pytensor.raise_op import Assert from pytensor.scalar import int32 as int_t from pytensor.scalar import upcast -from pytensor.tensor import as_tensor_variable +from pytensor.tensor import TensorLike, as_tensor_variable from 
pytensor.tensor import basic as ptb from pytensor.tensor.basic import alloc, second from pytensor.tensor.exceptions import NotScalarConstantError from pytensor.tensor.math import abs as pt_abs from pytensor.tensor.math import all as pt_all from pytensor.tensor.math import eq as pt_eq -from pytensor.tensor.math import ge, lt, maximum, minimum, prod, switch +from pytensor.tensor.math import ( + ge, + gt, + log, + lt, + maximum, + minimum, + prod, + sign, + switch, +) from pytensor.tensor.math import max as pt_max from pytensor.tensor.math import sum as pt_sum from pytensor.tensor.shape import specify_broadcastable @@ -1584,27 +1595,346 @@ def broadcast_shape_iter( return tuple(result_dims) -def geomspace(start, end, steps, base=10.0): - from pytensor.tensor.math import log +def _check_deprecated_inputs(stop, end, num, steps): + if end is not None: + warnings.warn( + "The 'end' parameter is deprecated and will be removed in a future version. Use 'stop' instead.", + DeprecationWarning, + ) + stop = end + if steps is not None: + warnings.warn( + "The 'steps' parameter is deprecated and will be removed in a future version. Use 'num' instead.", + DeprecationWarning, + ) + num = steps + + return stop, num + + +def _linspace_core( + start: TensorVariable, + stop: TensorVariable, + num: int, + endpoint=True, + retstep=False, + axis=0, +) -> TensorVariable | tuple[TensorVariable, TensorVariable]: + div = (num - 1) if endpoint else num + delta = stop - start + samples = ptb.shape_padright(ptb.arange(0, num), delta.ndim) + + step = delta / div + samples = switch(gt(div, 0), samples * delta / div + start, samples * delta + start) + if endpoint: + samples = switch(gt(num, 1), set_subtensor(samples[-1, ...], stop), samples) + + if axis != 0: + samples = ptb.moveaxis(samples, 0, axis) + + if retstep: + return samples, step + + return samples + + +def _broadcast_base_with_inputs(start, stop, base, axis): + """ + Broadcast the base tensor with the start and stop tensors if base is not a scalar. This is important because it + may change how the axis argument is interpreted in the final output. + + Parameters + ---------- + start: TensorVariable + The start value(s) of the sequence(s). + stop: TensorVariable + The end value(s) of the sequence(s) + base: TensorVariable + The log base value(s) of the sequence(s) + axis: int + The axis along which to generate samples. + + Returns + ------- + start: TensorVariable + The start value(s) of the sequence(s), broadcast with the base tensor if necessary. + stop: TensorVariable + The end value(s) of the sequence(s), broadcast with the base tensor if necessary. + base: TensorVariable + The log base value(s) of the sequence(s), broadcast with the start and stop tensors if necessary. + """ + base = ptb.as_tensor_variable(base) + if base.ndim > 0: + ndmax = len(broadcast_shape(start, stop, base)) + start, stop, base = ( + ptb.shape_padleft(a, ndmax - a.ndim) for a in (start, stop, base) + ) + base = ptb.expand_dims(base, axis=(axis,)) + + return start, stop, base + + +def linspace( + start: TensorLike, + stop: TensorLike, + num: TensorLike = 50, + endpoint: bool = True, + retstep: bool = False, + dtype: str | None = None, + axis: int = 0, + end: TensorLike | None = None, + steps: TensorLike | None = None, +) -> TensorVariable | tuple[TensorVariable, TensorVariable]: + """ + Return evenly spaced numbers over a specified interval. + + Returns `num` evenly spaced samples, calculated over the interval [`start`, `stop`]. 
+ + The endpoint of the interval can optionally be excluded. + + Parameters + ---------- + start: int, float, or TensorVariable + The starting value of the sequence. + + stop: int, float or TensorVariable + The end value of the sequence, unless `endpoint` is set to False. + In that case, the sequence consists of all but the last of `num + 1` evenly spaced samples, such that `stop` is excluded. + + num: int + Number of samples to generate. Must be non-negative. - start = ptb.as_tensor_variable(start) - end = ptb.as_tensor_variable(end) - return base ** linspace(log(start) / log(base), log(end) / log(base), steps) + endpoint: bool + Whether to include the endpoint in the range. + retstep: bool + If true, returns both the samples and an array of steps between samples. -def logspace(start, end, steps, base=10.0): - start = ptb.as_tensor_variable(start) - end = ptb.as_tensor_variable(end) - return base ** linspace(start, end, steps) + dtype: str, optional + dtype of the output tensor(s). If None, the dtype is inferred from that of the values provided to the `start` + and `end` arguments. + axis: int + Axis along which to generate samples. Ignored if both `start` and `end` have dimension 0. By default, axis=0 + will insert the samples on a new left-most dimension. To insert samples on a right-most dimension, use axis=-1. + + end: int, float or TensorVariable + .. warning:: + The "end" parameter is deprecated and will be removed in a future version. Use "stop" instead. + The end value of the sequence, unless `endpoint` is set to False. + In that case, the sequence consists of all but the last of `num + 1` evenly spaced samples, such that `end` is + excluded. + + steps: float, int, or TensorVariable + .. warning:: + The "steps" parameter is deprecated and will be removed in a future version. Use "num" instead. + + Number of samples to generate. Must be non-negative + + Returns + ------- + samples: TensorVariable + Tensor containing `num` evenly-spaced values between [start, stop]. The range is inclusive if `endpoint` is True. + + step: TensorVariable + Tensor containing the spacing between samples. Only returned if `retstep` is True. + """ + if dtype is None: + dtype = pytensor.config.floatX + end, num = _check_deprecated_inputs(stop, end, num, steps) + start, stop = broadcast_arrays(start, stop) + + ls = _linspace_core( + start=start, + stop=stop, + num=num, + endpoint=endpoint, + retstep=retstep, + axis=axis, + ) + + return ls.astype(dtype) + + +def geomspace( + start: TensorLike, + stop: TensorLike, + num: int = 50, + base: float = 10.0, + endpoint: bool = True, + dtype: str | None = None, + axis: int = 0, + end: TensorLike | None = None, + steps: TensorLike | None = None, +) -> TensorVariable: + """ + Return numbers spaced evenly on a log scale (a geometric progression). + + This is similar to logspace, but with endpoints specified directly. Each output sample is a constant multiple of + the previous. + + Parameters + ---------- + Returns `num` evenly spaced samples, calculated over the interval [`start`, `stop`]. + + The endpoint of the interval can optionally be excluded. + + Parameters + ---------- + start: int, float, or TensorVariable + The starting value of the sequence. + + stop: int, float or TensorVariable + The end value of the sequence, unless `endpoint` is set to False. + In that case, the sequence consists of all but the last of `num + 1` evenly spaced samples, such that `stop` is excluded. + + num: int + Number of samples to generate. Must be non-negative. 
+
+    base: float
+        The base of the log space.
+
+    endpoint: bool
+        Whether to include the endpoint in the range.
+
+    dtype: str, optional
+        dtype of the output tensor(s). If None, the dtype is inferred from that of the values provided to the `start`
+        and `end` arguments.
+
+    axis: int
+        Axis along which to generate samples. Ignored if both `start` and `end` have dimension 0. By default, axis=0
+        will insert the samples on a new left-most dimension. To insert samples on a right-most dimension, use axis=-1.
+
+    end: int, float or TensorVariable
+        .. warning::
+            The "end" parameter is deprecated and will be removed in a future version. Use "stop" instead.
+            The end value of the sequence, unless `endpoint` is set to False.
+            In that case, the sequence consists of all but the last of `num + 1` evenly spaced samples, such that `end` is
+            excluded.
+
+    steps: float, int, or TensorVariable
+        .. warning::
+            The "steps" parameter is deprecated and will be removed in a future version. Use "num" instead.
+
+        Number of samples to generate. Must be non-negative
+
+    Returns
+    -------
+    samples: TensorVariable
+        Tensor containing `num` evenly-spaced (in log space) values between [start, stop]. The range is inclusive if
+        `endpoint` is True.
+    """
+    if dtype is None:
+        dtype = pytensor.config.floatX
+    stop, num = _check_deprecated_inputs(stop, end, num, steps)
+    start, stop = broadcast_arrays(start, stop)
+    start, stop, base = _broadcast_base_with_inputs(start, stop, base, axis)
+
+    out_sign = sign(start)
+    log_start, log_stop = (
+        log(start * out_sign) / log(base),
+        log(stop * out_sign) / log(base),
+    )
+    result = _linspace_core(
+        start=log_start,
+        stop=log_stop,
+        num=num,
+        endpoint=endpoint,
+        axis=0,
+        retstep=False,
+    )
+    result = base**result
+
+    result = switch(gt(num, 0), set_subtensor(result[0, ...], start), result)
+    if endpoint:
+        result = switch(gt(num, 1), set_subtensor(result[-1, ...], stop), result)
+
+    result = result * out_sign
+
+    if axis != 0:
+        result = ptb.moveaxis(result, 0, axis)
+
+    return result.astype(dtype)
+
+
+def logspace(
+    start: TensorLike,
+    stop: TensorLike,
+    num: int = 50,
+    base: float = 10.0,
+    endpoint: bool = True,
+    dtype: str | None = None,
+    axis: int = 0,
+    end: TensorLike | None = None,
+    steps: TensorLike | None = None,
+) -> TensorVariable:
+    """
+    Return numbers spaced evenly on a log scale.
+
+    In linear space, the sequence starts at ``base ** start`` (base to the power of start) and ends with ``base ** stop``
+    (see ``endpoint`` below).
+
+    Parameters
+    ----------
+    start: int, float, or TensorVariable
+        ``base ** start`` is the starting value of the sequence
+
+    stop: int, float or TensorVariable
+        ``base ** stop`` is the endpoint of the sequence, unless ``endpoint`` is set to False.
+        In that case, ``num + 1`` values are spaced over the interval in log-space, and the first ``num`` are returned.
+
+    num: int, default = 50
+        Number of samples to generate.
+
+    base: float, default = 10.0
+        The base of the log space. The step size between the elements in ``log(samples) / log(base)``
+        (or ``log_base(samples)``) is uniform.
+
+    endpoint: bool
+        Whether to include the endpoint in the range.
+
+    dtype: str, optional
+        dtype of the output tensor(s). If None, the dtype is inferred from that of the values provided to the `start`
+        and `stop` arguments.
+
+    axis: int
+        Axis along which to generate samples. Ignored if both `start` and `end` have dimension 0. By default, axis=0
+        will insert the samples on a new left-most dimension.
To insert samples on a right-most dimension, use axis=-1.
+
+    end: int, float or TensorVariable
+        .. warning::
+            The "end" parameter is deprecated and will be removed in a future version. Use "stop" instead.
+            The end value of the sequence, unless `endpoint` is set to False.
+            In that case, the sequence consists of all but the last of `num + 1` evenly spaced samples, such that `end` is
+            excluded.
+
+    steps: int or TensorVariable
+        .. warning::
+            The "steps" parameter is deprecated and will be removed in a future version. Use "num" instead.
+        Number of samples to generate. Must be non-negative
+
+    Returns
+    -------
+    samples: TensorVariable
+        Tensor containing `num` evenly-spaced (in log-space) values between [start, stop]. The range is inclusive if
+        `endpoint` is True.
+    """
+    if dtype is None:
+        dtype = pytensor.config.floatX
+    stop, num = _check_deprecated_inputs(stop, end, num, steps)
+    start, stop = broadcast_arrays(start, stop)
+    start, stop, base = _broadcast_base_with_inputs(start, stop, base, axis)
+
+    ls = _linspace_core(
+        start=start,
+        stop=stop,
+        num=num,
+        endpoint=endpoint,
+        axis=axis,
+        retstep=False,
+    )
-def linspace(start, end, steps):
-    start = ptb.as_tensor_variable(start)
-    end = ptb.as_tensor_variable(end)
-    arr = ptb.arange(steps)
-    arr = ptb.shape_padright(arr, max(start.ndim, end.ndim))
-    multiplier = (end - start) / (steps - 1)
-    return start + arr * multiplier
+    return (base**ls).astype(dtype)
 def broadcast_to(
diff --git a/pytensor/tensor/pad.py b/pytensor/tensor/pad.py
new file mode 100644
index 0000000000..91aef44004
--- /dev/null
+++ b/pytensor/tensor/pad.py
@@ -0,0 +1,690 @@
+from collections.abc import Callable
+from functools import partial
+from typing import Literal, cast
+
+from pytensor.compile.builders import OpFromGraph
+from pytensor.ifelse import ifelse
+from pytensor.scan import scan
+from pytensor.tensor import TensorLike
+from pytensor.tensor.basic import (
+    TensorVariable,
+    as_tensor,
+    concatenate,
+    expand_dims,
+    moveaxis,
+    switch,
+    zeros,
+)
+from pytensor.tensor.extra_ops import broadcast_to, linspace
+from pytensor.tensor.math import divmod as pt_divmod
+from pytensor.tensor.math import eq, gt, mean, minimum
+from pytensor.tensor.math import max as pt_max
+from pytensor.tensor.math import min as pt_min
+from pytensor.tensor.shape import specify_broadcastable
+from pytensor.tensor.subtensor import flip, set_subtensor, slice_at_axis
+
+
+PadMode = Literal[
+    "constant",
+    "edge",
+    "linear_ramp",
+    "maximum",
+    "minimum",
+    "mean",
+    "median",
+    "wrap",
+    "symmetric",
+    "reflect",
+]
+stat_funcs = {"maximum": pt_max, "minimum": pt_min, "mean": mean}
+
+allowed_kwargs = {
+    "edge": [],
+    "wrap": [],
+    "constant": ["constant_values"],
+    "linear_ramp": ["end_values"],
+    "maximum": ["stat_length"],
+    "mean": ["stat_length"],
+    "median": ["stat_length"],
+    "minimum": ["stat_length"],
+    "reflect": ["reflect_type"],
+    "symmetric": ["reflect_type"],
+}
+
+
+def _get_edges(
+    padded: TensorVariable, axis: int, width_pair: tuple[TensorVariable, TensorVariable]
+) -> tuple[TensorVariable, TensorVariable]:
+    """
+    Retrieve edge values from empty-padded array in given dimension.
+
+    Copied from numpy.lib.arraypad._get_edges
+    https://github.com/numpy/numpy/blob/300096d384046eee479b0c7a70f79e308da52bff/numpy/lib/_arraypad_impl.py#L154
+
+    Parameters
+    ----------
+    padded : TensorVariable
+        Empty-padded array.
+    axis : int
+        Dimension in which the edges are considered.
+ width_pair : (TensorVariable, TensorVariable) + Pair of widths that mark the pad area on both sides in the given + dimension. + + Returns + ------- + left_edge, right_edge : TensorVariable + Edge values of the valid area in `padded` in the given dimension. Its + shape will always match `padded` except for the dimension given by + `axis` which will have a length of 1. + """ + left_index = width_pair[0] + left_slice = slice_at_axis(slice(left_index, left_index + 1), axis) + left_edge = padded[left_slice] + + right_index = padded.shape[axis] - width_pair[1] + right_slice = slice_at_axis(slice(right_index - 1, right_index), axis) + right_edge = padded[right_slice] + + return left_edge, right_edge + + +def _symbolic_pad( + x: TensorVariable, pad_width: TensorVariable +) -> tuple[TensorVariable, tuple[slice, ...], TensorVariable]: + pad_width = broadcast_to(pad_width, as_tensor((x.ndim, 2))) + new_shape = as_tensor( + [pad_width[i][0] + size + pad_width[i][1] for i, size in enumerate(x.shape)] + ) + original_area_slice = tuple( + slice(pad_width[i][0], pad_width[i][0] + size) for i, size in enumerate(x.shape) + ) + padded: TensorVariable = set_subtensor(zeros(new_shape)[original_area_slice], x) + return padded, original_area_slice, pad_width + + +def _get_padding_slices( + dim_shape: TensorVariable, + width_pair: tuple[TensorVariable, TensorVariable], + axis: int, +) -> tuple[tuple[slice, ...], tuple[slice, ...]]: + left_slice = slice_at_axis(slice(None, width_pair[0]), axis) + right_slice = slice_at_axis(slice(dim_shape - width_pair[1], None), axis) + + return left_slice, right_slice + + +def _constant_pad( + x: TensorVariable, pad_width: TensorVariable, constant_values: TensorVariable +) -> TensorVariable: + padded, area_slice, pad_width = _symbolic_pad(x, pad_width) + values = broadcast_to(constant_values, as_tensor((padded.ndim, 2))) + + for axis in range(padded.ndim): + width_pair = pad_width[axis] + value_pair = values[axis] + dim_shape = padded.shape[axis] + + left_slice, right_slice = _get_padding_slices(dim_shape, width_pair, axis) + padded = set_subtensor(padded[left_slice], value_pair[0]) + padded = set_subtensor(padded[right_slice], value_pair[1]) + + return padded + + +def _edge_pad(x: TensorVariable, pad_width: TensorVariable) -> TensorVariable: + padded, area_slice, pad_width = _symbolic_pad(x, pad_width) + for axis in range(padded.ndim): + width_pair = pad_width[axis] + dim_shape = padded.shape[axis] + + left_edge, right_edge = _get_edges(padded, axis, width_pair) + left_slice, right_slice = _get_padding_slices(dim_shape, width_pair, axis) + + padded = set_subtensor(padded[left_slice], left_edge) + padded = set_subtensor(padded[right_slice], right_edge) + + return padded + + +def _get_stats( + padded: TensorVariable, + axis: int, + width_pair: TensorVariable, + length_pair: tuple[TensorVariable, TensorVariable] | tuple[None, None], + stat_func: Callable, +): + """ + Calculate statistic for the empty-padded array in given dimension. + + Copied from numpy.lib.arraypad._get_stats + https://github.com/numpy/numpy/blob/300096d384046eee479b0c7a70f79e308da52bff/numpy/lib/_arraypad_impl.py#L230 + + Parameters + ---------- + padded : TensorVariable + Empty-padded array. + axis : int + Dimension in which the statistic is calculated. + width_pair : (TensorVariable, TensorVariable) + Pair of widths that mark the pad area on both sides in the given dimension. 
+ length_pair : 2-element sequence of None or TensorVariable + Gives the number of values in valid area from each side that is taken into account when calculating the + statistic. If None the entire valid area in `padded` is considered. + stat_func : function + Function to compute statistic. The expected signature is + ``stat_func(x: TensorVariable, axis: int, keepdims: bool) -> TensorVariable``. + + Returns + ------- + left_stat, right_stat : TensorVariable + Calculated statistic for both sides of `padded`. + """ + # Calculate indices of the edges of the area with original values + left_index = width_pair[0] + right_index = padded.shape[axis] - width_pair[1] + # as well as its length + max_length = right_index - left_index + + # Limit stat_lengths to max_length + left_length, right_length = length_pair + + # Calculate statistic for the left side + left_length = ( + minimum(left_length, max_length) if left_length is not None else max_length + ) + left_slice = slice_at_axis(slice(left_index, left_index + left_length), axis) + left_chunk = padded[left_slice] + left_stat = stat_func(left_chunk, axis=axis, keepdims=True) + if left_length is None and right_length is None: + # We could also return early in the more general case of left_length == right_length, but we don't necessarily + # know these shapes. + # TODO: Add rewrite to simplify in this case + return left_stat, left_stat + + # Calculate statistic for the right side + right_length = ( + minimum(right_length, max_length) if right_length is not None else max_length + ) + right_slice = slice_at_axis(slice(right_index - right_length, right_index), axis) + right_chunk = padded[right_slice] + right_stat = stat_func(right_chunk, axis=axis, keepdims=True) + + return left_stat, right_stat + + +def _stat_pad( + x: TensorVariable, + pad_width: TensorVariable, + stat_func: Callable, + stat_length: TensorVariable | None, +): + padded, area_slice, pad_width = _symbolic_pad(x, pad_width) + if stat_length is None: + stat_length = [[None, None]] * padded.ndim # type: ignore + else: + stat_length = broadcast_to(stat_length, as_tensor((padded.ndim, 2))) + + for axis in range(padded.ndim): + width_pair = pad_width[axis] + length_pair = stat_length[axis] # type: ignore + dim_shape = padded.shape[axis] + + left_stat, right_stat = _get_stats( + padded, axis, width_pair, length_pair, stat_func + ) + left_slice, right_slice = _get_padding_slices(dim_shape, width_pair, axis) + padded = set_subtensor(padded[left_slice], left_stat) + padded = set_subtensor(padded[right_slice], right_stat) + + return padded + + +def _linear_ramp_pad( + x: TensorVariable, pad_width: TensorVariable, end_values: TensorVariable | int = 0 +) -> TensorVariable: + padded, area_slice, pad_width = _symbolic_pad(x, pad_width) + end_values = as_tensor(end_values) + end_values = broadcast_to(end_values, as_tensor((padded.ndim, 2))) + + for axis in range(padded.ndim): + width_pair = pad_width[axis] + end_value_pair = end_values[axis] + edge_pair = _get_edges(padded, axis, width_pair) + dim_shape = padded.shape[axis] + left_slice, right_slice = _get_padding_slices(dim_shape, width_pair, axis) + + left_ramp, right_ramp = ( + linspace( + start=end_value, + stop=specify_broadcastable(edge, axis).squeeze(axis), + num=width, + endpoint=False, + dtype=padded.dtype, + axis=axis, + ) + for end_value, edge, width in zip(end_value_pair, edge_pair, width_pair) + ) + + # Reverse the direction of the ramp for the "right" side + right_ramp = right_ramp[slice_at_axis(slice(None, None, -1), axis)] # type: 
ignore + + padded = set_subtensor(padded[left_slice], left_ramp) + padded = set_subtensor(padded[right_slice], right_ramp) + + return padded + + +def _wrap_pad(x: TensorVariable, pad_width: TensorVariable) -> TensorVariable: + pad_width = broadcast_to(pad_width, as_tensor((x.ndim, 2))) + + for axis in range(x.ndim): + size = x.shape[axis] + + # Compute how many complete copies of the input will be padded on this dimension, along with the amount of + # overflow on the final copy + repeats, (left_remainder, right_remainder) = pt_divmod(pad_width[axis], size) + + # In the next step we will generate extra copies of the input, and then trim them down to the correct size. + left_trim = size - left_remainder + right_trim = size - right_remainder + + # The total number of copies needed is always the sum of the number of complete copies to add, plus the original + # input itself, plus the two edge copies that will be trimmed down. + total_repeats = repeats.sum() + 3 + + # Create a batch dimension and clone the input the required number of times + parts = expand_dims(x, (0,)).repeat(total_repeats, axis=0) + + # Move the batch dimension to the active dimension + parts = moveaxis(parts, 0, axis) + + # Ravel the active dimension while preserving the shapes of the inactive dimensions. This will expand the + # active dimension to have the correctly padded shape, plus excess to be trimmed + new_shape = [-1 if i == axis else x.shape[i] for i in range(x.ndim)] + x = parts.reshape(new_shape) + + # Trim the excess on the active dimension + trim_slice = slice_at_axis(slice(left_trim, -right_trim), axis) + x = x[trim_slice] + + return x + + +def _build_padding_one_direction(array, array_flipped, repeats, *, inner_func, axis): + [_, parts], _ = scan( + inner_func, + non_sequences=[array, array_flipped], + outputs_info=[0, None], + n_steps=repeats, + ) + + parts = moveaxis(parts, 0, axis) + new_shape = [-1 if i == axis else array.shape[i] for i in range(array.ndim)] + padding = parts.reshape(new_shape) + + return padding + + +def _symmetric_pad(x, pad_width): + def _symmetric_inner(i, x, x_flipped, padding_left): + return i + 1, ifelse(eq(i % 2, int(padding_left)), x_flipped, x) + + pad_width = broadcast_to(pad_width, as_tensor((x.ndim, 2))) + + for axis in range(x.ndim): + x_flipped = flip(x, axis=axis) + original_size = x.shape[axis] + + repeats, remainders = pt_divmod(pad_width[axis], original_size) + has_remainder = gt(remainders, 0) + repeats = repeats + has_remainder + + left_padding = _build_padding_one_direction( + x, + x_flipped, + repeats[0], + axis=axis, + inner_func=partial(_symmetric_inner, padding_left=True), + ) + right_padding = _build_padding_one_direction( + x, + x_flipped, + repeats[1], + axis=axis, + inner_func=partial(_symmetric_inner, padding_left=False), + ) + + x = concatenate([flip(left_padding, axis), x, right_padding], axis=axis) + + (left_trim, right_trim) = switch( + has_remainder, original_size - remainders, remainders + ) + right_trim = x.shape[axis] - right_trim + + trim_slice = slice_at_axis(slice(left_trim, right_trim), axis) + x = x[trim_slice] + + return x + + +def _reflect_pad(x, pad_width): + def _reflect_inner(i, x, x_flipped, padding_left): + return i + 1, ifelse(eq(i % 2, int(padding_left)), x_flipped, x) + + pad_width = broadcast_to(pad_width, as_tensor((x.ndim, 2))) + for axis in range(x.ndim): + trimmed_size = x.shape[axis] - 1 + + trim_slice = slice_at_axis(slice(None, -1), axis) + x_trimmed = x[trim_slice] + x_flipped = flip(x, axis=axis)[trim_slice] + + repeats, 
remainders = pt_divmod(pad_width[axis], trimmed_size) + repeats = repeats + 1 + + left_padding = _build_padding_one_direction( + x_trimmed, + x_flipped, + repeats[0], + axis=axis, + inner_func=partial(_reflect_inner, padding_left=True), + ) + right_padding = _build_padding_one_direction( + x_trimmed, + x_flipped, + repeats[1], + axis=axis, + inner_func=partial(_reflect_inner, padding_left=False), + ) + + left_trim = slice_at_axis(slice(trimmed_size - remainders[0] - 1, -1), axis) + right_trim = slice_at_axis( + slice(1, right_padding.shape[axis] - trimmed_size + remainders[1] + 1), axis + ) + + x = concatenate( + [flip(left_padding, axis)[left_trim], x, right_padding[right_trim]], + axis=axis, + ) + return x + + +class Pad(OpFromGraph): + """ + Wrapper Op for Pad graphs + """ + + def __init__( + self, inputs, outputs, pad_mode, reflect_type=None, has_stat_length=False + ): + self.pad_mode = pad_mode + self.reflect_type = reflect_type + self.has_stat_length = has_stat_length + + super().__init__(inputs=inputs, outputs=outputs) + + +def pad( + x: TensorLike, pad_width: TensorLike, mode: PadMode = "constant", **kwargs +) -> TensorVariable: + """ + Pad an array. + + Parameters + ---------- + array : array_like of rank N + The array to pad. + + pad_width : sequence, array_like, or int + Number of values padded to the edges of each axis. + ``((before_1, after_1), ... (before_N, after_N))`` unique pad widths + for each axis. + ``(before, after)`` or ``((before, after),)`` yields same before + and after pad for each axis. + ``(pad,)`` or ``int`` is a shortcut for before = after = pad width + for all axes. + + mode : str or function, optional + One of the following string values or a user supplied function. + + 'constant' (default) + Pads with a constant value. + 'edge' + Pads with the edge values of array. + 'linear_ramp' + Pads with the linear ramp between end_value and the + array edge value. + 'maximum' + Pads with the maximum value of all or part of the + vector along each axis. + 'mean' + Pads with the mean value of all or part of the + vector along each axis. + 'minimum' + Pads with the minimum value of all or part of the + vector along each axis. + 'reflect' + Pads with the reflection of the vector mirrored on + the first and last values of the vector along each + axis. + 'symmetric' + Pads with the reflection of the vector mirrored + along the edge of the array. + 'wrap' + Pads with the wrap of the vector along the axis. + The first values are used to pad the end and the + end values are used to pad the beginning. + + stat_length : sequence or int, optional + Used in 'maximum', 'mean', and 'minimum'. Number of + values at edge of each axis used to calculate the statistic value. + + ``((before_1, after_1), ... (before_N, after_N))`` unique statistic + lengths for each axis. + + ``(before, after)`` or ``((before, after),)`` yields same before + and after statistic lengths for each axis. + + ``(stat_length,)`` or ``int`` is a shortcut for + ``before = after = statistic`` length for all axes. + + Default is ``None``, to use the entire axis. + + constant_values : sequence or scalar, optional + Used in 'constant'. The values to set the padded values for each + axis. + + ``((before_1, after_1), ... (before_N, after_N))`` unique pad constants + for each axis. + + ``(before, after)`` or ``((before, after),)`` yields same before + and after constants for each axis. + + ``(constant,)`` or ``constant`` is a shortcut for + ``before = after = constant`` for all axes. + + Default is 0. 
+ + end_values : sequence or scalar, optional + Used in 'linear_ramp'. The values used for the ending value of the + linear_ramp and that will form the edge of the padded array. + + ``((before_1, after_1), ... (before_N, after_N))`` unique end values + for each axis. + + ``(before, after)`` or ``((before, after),)`` yields same before + and after end values for each axis. + + ``(constant,)`` or ``constant`` is a shortcut for + ``before = after = constant`` for all axes. + + Default is 0. + + reflect_type : str, optional + Only 'even' is currently accepted. Used in 'reflect', and 'symmetric'. The 'even' style is the + default with an unaltered reflection around the edge value. + + Returns + ------- + pad : ndarray + Padded array of rank equal to `array` with shape increased + according to `pad_width`. + + Examples + -------- + + .. testcode:: + + import pytensor.tensor as pt + a = [1, 2, 3, 4, 5] + print(pt.pad(a, (2, 3), 'constant', constant_values=(4, 6)).eval()) + + .. testoutput:: + + [4. 4. 1. 2. 3. 4. 5. 6. 6. 6.] + + .. testcode:: + + print(pt.pad(a, (2, 3), 'edge').eval()) + + .. testoutput:: + + [1. 1. 1. 2. 3. 4. 5. 5. 5. 5.] + + .. testcode:: + + print(pt.pad(a, (2, 3), 'linear_ramp', end_values=(5, -4)).eval()) + + .. testoutput:: + + [ 5. 3. 1. 2. 3. 4. 5. 2. -1. -4.] + + .. testcode:: + + print(pt.pad(a, (2,), 'maximum').eval()) + + .. testoutput:: + + [5. 5. 1. 2. 3. 4. 5. 5. 5.] + + .. testcode:: + + print(pt.pad(a, (2,), 'mean').eval()) + + .. testoutput:: + + [3. 3. 1. 2. 3. 4. 5. 3. 3.] + + .. testcode:: + + a = [[1, 2], [3, 4]] + print(pt.pad(a, ((3, 2), (2, 3)), 'minimum').eval()) + + .. testoutput:: + + [[1. 1. 1. 2. 1. 1. 1.] + [1. 1. 1. 2. 1. 1. 1.] + [1. 1. 1. 2. 1. 1. 1.] + [1. 1. 1. 2. 1. 1. 1.] + [3. 3. 3. 4. 3. 3. 3.] + [1. 1. 1. 2. 1. 1. 1.] + [1. 1. 1. 2. 1. 1. 1.]] + + .. testcode:: + + a = [1, 2, 3, 4, 5] + print(pt.pad(a, (2, 3), 'reflect').eval()) + + .. testoutput:: + + [3 2 1 2 3 4 5 4 3 2] + + .. testcode:: + + print(pt.pad(a, (2, 3), 'symmetric').eval()) + + .. testoutput:: + + [2 1 1 2 3 4 5 5 4 3] + + .. testcode:: + + print(pt.pad(a, (2, 3), 'wrap').eval()) + + .. testoutput:: + + [4 5 1 2 3 4 5 1 2 3] + + """ + if any(value not in allowed_kwargs[mode] for value in kwargs.keys()): + raise ValueError( + f"Invalid keyword arguments for mode '{mode}': {kwargs.keys()}" + ) + x = as_tensor(x, name="x") + pad_width = as_tensor(pad_width, name="pad_width") + inputs = [x, pad_width] + attrs = {} + + if mode == "constant": + constant_values = as_tensor( + kwargs.pop("constant_values", 0), name="constant_values" + ) + inputs += [constant_values] + outputs = _constant_pad(x, pad_width, constant_values) + + elif mode == "edge": + outputs = _edge_pad(x, pad_width) + + elif mode in ["maximum", "minimum", "mean", "median"]: + if mode == "median": + # TODO: Revisit this after we implement a quantile function. 
+ # See https://github.com/pymc-devs/pytensor/issues/53 + raise NotImplementedError("Median padding not implemented") + stat_func = cast(Callable, stat_funcs[mode]) + stat_length = kwargs.get("stat_length") + if stat_length is not None: + attrs.update({"has_stat_length": True}) + stat_length = as_tensor(stat_length, name="stat_length") + inputs += [stat_length] + + outputs = _stat_pad(x, pad_width, stat_func, stat_length) + + elif mode == "linear_ramp": + end_values = kwargs.pop("end_values", 0) + end_values = as_tensor(end_values) + + inputs += [end_values] + outputs = _linear_ramp_pad(x, pad_width, end_values) + + elif mode == "wrap": + outputs = _wrap_pad(x, pad_width) + + elif mode == "symmetric": + reflect_type = kwargs.pop("reflect_type", "even") + if reflect_type == "odd": + raise NotImplementedError( + "Odd reflection not implemented. If you need this feature, please open an " + "issue at https://github.com/pymc-devs/pytensor/issues" + ) + attrs.update({"reflect_type": reflect_type}) + outputs = _symmetric_pad(x, pad_width) + + elif mode == "reflect": + reflect_type = kwargs.pop("reflect_type", "even") + if reflect_type == "odd": + raise NotImplementedError( + "Odd reflection not implemented. If you need this feature, please open an " + "issue at https://github.com/pymc-devs/pytensor/issues" + ) + attrs.update({"reflect_type": reflect_type}) + outputs = _reflect_pad(x, pad_width) + + else: + raise ValueError(f"Invalid mode: {mode}") + + op = Pad(inputs=inputs, outputs=[outputs], pad_mode=mode, **attrs)(*inputs) + return cast(TensorVariable, op) + + +__all__ = ["pad", "flip"] diff --git a/pytensor/tensor/subtensor.py b/pytensor/tensor/subtensor.py index a21f2d7dcc..41b4c6bd5a 100644 --- a/pytensor/tensor/subtensor.py +++ b/pytensor/tensor/subtensor.py @@ -3013,8 +3013,123 @@ def _get_vector_length_Subtensor(op, var): raise ValueError(f"Length of {var} cannot be determined") +def slice_at_axis(sl: slice, axis: int) -> tuple[slice, ...]: + """ + Construct tuple of slices to slice an array in the given dimension. + + Copied from numpy.lib.arraypad._slice_at_axis + https://github.com/numpy/numpy/blob/300096d384046eee479b0c7a70f79e308da52bff/numpy/lib/_arraypad_impl.py#L33 + + Parameters + ---------- + sl : slice + The slice for the given dimension. + axis : int + The axis to which `sl` is applied. All other dimensions are left + "unsliced". + + Returns + ------- + sl : tuple of slices + A tuple with slices matching `shape` in length. + + Examples + -------- + + .. testcode:: + + import pytensor.tensor as pt + + s = pt.slice_at_axis(slice(None, 1), 1) + print(s) + + .. testoutput:: + + (slice(None, None, None), slice(None, 1, None), Ellipsis) + + .. testcode:: + + x = pt.tensor('x', shape=(None, None, None)) + x_sliced = x[s] + + f = pytensor.function([x], x_sliced) + x = np.arange(27).reshape(3, 3, 3) + print(f(x)) + + .. testoutput:: + [[[ 0. 1. 2.]] + + [[ 9. 10. 11.]] + + [[18. 19. 20.]]] + + """ + if axis >= 0: + return (slice(None),) * axis + (sl,) + (...,) # type: ignore + else: + # If axis = -1 we want zero right padding (and so on), so subtract one + axis = abs(axis) - 1 + return (...,) + (sl,) + (slice(None),) * axis # type: ignore + + +def flip( + arr: TensorVariable, axis: int | tuple[int] | TensorVariable | None = None +) -> TensorVariable: + """ + Reverse the order of elements in an tensor along the given axis. + + Parameters + ---------- + arr: TensorVariable + Input tensor. + + axis: int | tuple[int] | TensorVariable, optional + Axis or axes along which to flip over. 
The default is to flip over all of the axes of the input tensor. + + Returns + ------- + arr: TensorVariable + A view of `arr` with the entries of axis reversed. + + Examples + -------- + + .. testcode:: + + import pytensor + import pytensor.tensor as pt + + x = pt.tensor('x', shape=(None, None)) + x_flipped = pt.flip(x, axis=0) + + f = pytensor.function([x], x_flipped) + x = [[1, 2], [3, 4]] + print(f(x)) + + .. testoutput:: + [[3. 4.] + [1. 2.]] + + """ + if axis is None: + index = ((slice(None, None, -1)),) * arr.ndim + else: + if isinstance(axis, int): + axis = (axis,) + index = tuple( + [ + slice(None, None, -1) if i in axis else slice(None, None, None) + for i in range(arr.ndim) + ] + ) + + return cast(TensorVariable, arr[index]) + + __all__ = [ "take", + "flip", + "slice_at_axis", "inc_subtensor", "set_subtensor", ] diff --git a/tests/link/jax/test_pad.py b/tests/link/jax/test_pad.py new file mode 100644 index 0000000000..2321645741 --- /dev/null +++ b/tests/link/jax/test_pad.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest + +import pytensor.tensor as pt +from pytensor import config +from pytensor.graph import FunctionGraph +from pytensor.tensor.pad import PadMode +from tests.link.jax.test_basic import compare_jax_and_py + + +jax = pytest.importorskip("jax") +floatX = config.floatX +RTOL = ATOL = 1e-6 if floatX.endswith("64") else 1e-3 + + +@pytest.mark.parametrize( + "mode, kwargs", + [ + ("constant", {"constant_values": 0}), + ("constant", {"constant_values": (1, 2)}), + ("edge", {}), + ("linear_ramp", {"end_values": 0}), + ("linear_ramp", {"end_values": (1, 2)}), + ("reflect", {"reflect_type": "even"}), + ("wrap", {}), + ("symmetric", {"reflect_type": "even"}), + ("mean", {"stat_length": None}), + ("mean", {"stat_length": (10, 2)}), + ("maximum", {"stat_length": None}), + ("maximum", {"stat_length": (10, 2)}), + ("minimum", {"stat_length": None}), + ("minimum", {"stat_length": (10, 2)}), + ], + ids=[ + "constant_default", + "constant_tuple", + "edge", + "linear_ramp_default", + "linear_ramp_tuple", + "reflect", + "wrap", + "symmetric", + "mean_default", + "mean_tuple", + "maximum_default", + "maximum_tuple", + "minimum_default", + "minimum_tuple", + ], +) +def test_jax_pad(mode: PadMode, kwargs): + x_pt = pt.tensor("x", shape=(3, 3)) + x = np.random.normal(size=(3, 3)) + + res = pt.pad(x_pt, mode=mode, pad_width=3, **kwargs) + res_fg = FunctionGraph([x_pt], [res]) + + compare_jax_and_py( + res_fg, + [x], + assert_fn=lambda x, y: np.testing.assert_allclose(x, y, rtol=RTOL, atol=ATOL), + py_mode="FAST_RUN", + ) diff --git a/tests/link/numba/test_pad.py b/tests/link/numba/test_pad.py new file mode 100644 index 0000000000..11877594d7 --- /dev/null +++ b/tests/link/numba/test_pad.py @@ -0,0 +1,68 @@ +import numpy as np +import pytest + +import pytensor.tensor as pt +from pytensor import config +from pytensor.graph import FunctionGraph +from pytensor.tensor.pad import PadMode +from tests.link.numba.test_basic import compare_numba_and_py + + +floatX = config.floatX +RTOL = ATOL = 1e-6 if floatX.endswith("64") else 1e-3 + + +@pytest.mark.parametrize( + "mode, kwargs", + [ + ("constant", {"constant_values": 0}), + ("constant", {"constant_values": (1, 2)}), + pytest.param( + "edge", + {}, + marks=pytest.mark.skip( + "This is causing a segfault in NUMBA mode, but I have no idea why" + ), + ), + ("linear_ramp", {"end_values": 0}), + ("linear_ramp", {"end_values": (1, 2)}), + ("reflect", {"reflect_type": "even"}), + ("wrap", {}), + ("symmetric", {"reflect_type": "even"}), + ("mean", 
{"stat_length": None}), + ("mean", {"stat_length": (10, 2)}), + ("maximum", {"stat_length": None}), + ("maximum", {"stat_length": (10, 2)}), + ("minimum", {"stat_length": None}), + ("minimum", {"stat_length": (10, 2)}), + ], + ids=[ + "constant_default", + "constant_tuple", + "edge", + "linear_ramp_default", + "linear_ramp_tuple", + "reflect", + "wrap", + "symmetric", + "mean_default", + "mean_tuple", + "maximum_default", + "maximum_tuple", + "minimum_default", + "minimum_tuple", + ], +) +def test_numba_pad(mode: PadMode, kwargs): + x_pt = pt.tensor("x", shape=(3, 3)) + x = np.random.normal(size=(3, 3)) + + res = pt.pad(x_pt, mode=mode, pad_width=3, **kwargs) + res_fg = FunctionGraph([x_pt], [res]) + + compare_numba_and_py( + res_fg, + [x], + assert_fn=lambda x, y: np.testing.assert_allclose(x, y, rtol=RTOL, atol=ATOL), + py_mode="FAST_RUN", + ) diff --git a/tests/tensor/test_extra_ops.py b/tests/tensor/test_extra_ops.py index 4376ab1d32..3b3cc5ec7f 100644 --- a/tests/tensor/test_extra_ops.py +++ b/tests/tensor/test_extra_ops.py @@ -35,9 +35,6 @@ diff, fill_diagonal, fill_diagonal_offset, - geomspace, - linspace, - logspace, ravel_multi_index, repeat, searchsorted, @@ -1281,25 +1278,37 @@ def test_broadcast_arrays(): @pytest.mark.parametrize( - "start, stop, num_samples", + "op", + ["linspace", "logspace", "geomspace"], + ids=["linspace", "logspace", "geomspace"], +) +@pytest.mark.parametrize("dtype", [None, "int", "float"], ids=[None, "int", "float"]) +@pytest.mark.parametrize( + "start, stop, num_samples, endpoint, axis", [ - (1, 10, 50), - (np.array([5, 6]), np.array([[10, 10], [10, 10]]), 25), - (1, np.array([5, 6]), 30), + (1, 10, 50, True, 0), + (1, 10, 1, True, 0), + (np.array([5, 6]), np.array([[10, 10], [10, 10]]), 25, True, 0), + (np.array([5, 6]), np.array([[10, 10], [10, 10]]), 25, True, 1), + (np.array([5, 6]), np.array([[10, 10], [10, 10]]), 25, False, -1), + (1, np.array([5, 6]), 30, True, 0), + (1, np.array([5, 6]), 30, False, -1), ], ) -def test_space_ops(start, stop, num_samples): - z = linspace(start, stop, num_samples) - pytensor_res = function(inputs=[], outputs=z)() - numpy_res = np.linspace(start, stop, num=num_samples) - assert np.allclose(pytensor_res, numpy_res) - - z = logspace(start, stop, num_samples) - pytensor_res = function(inputs=[], outputs=z)() - numpy_res = np.logspace(start, stop, num=num_samples) - assert np.allclose(pytensor_res, numpy_res) - - z = geomspace(start, stop, num_samples) - pytensor_res = function(inputs=[], outputs=z)() - numpy_res = np.geomspace(start, stop, num=num_samples) - assert np.allclose(pytensor_res, numpy_res) +def test_space_ops(op, dtype, start, stop, num_samples, endpoint, axis): + pt_func = getattr(pt, op) + np_func = getattr(np, op) + dtype = dtype + config.floatX[-2:] if dtype is not None else dtype + z = pt_func(start, stop, num_samples, endpoint=endpoint, axis=axis, dtype=dtype) + + numpy_res = np_func( + start, stop, num=num_samples, endpoint=endpoint, dtype=dtype, axis=axis + ) + pytensor_res = function(inputs=[], outputs=z, mode="FAST_COMPILE")() + + np.testing.assert_allclose( + pytensor_res, + numpy_res, + atol=1e-6 if config.floatX.endswith("64") else 1e-4, + rtol=1e-6 if config.floatX.endswith("64") else 1e-4, + ) diff --git a/tests/tensor/test_pad.py b/tests/tensor/test_pad.py new file mode 100644 index 0000000000..54df4a12e1 --- /dev/null +++ b/tests/tensor/test_pad.py @@ -0,0 +1,224 @@ +from typing import Literal + +import numpy as np +import pytest + +import pytensor +from pytensor.tensor.pad import PadMode, 
pad + + +floatX = pytensor.config.floatX +RTOL = ATOL = 1e-8 if floatX.endswith("64") else 1e-4 + + +def test_unknown_mode_raises(): + x = np.random.normal(size=(3, 3)).astype(floatX) + with pytest.raises(ValueError, match="Invalid mode: unknown"): + pad(x, 1, mode="unknown") + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 3, 3)], ids=["1d", "2d square", "3d square"] +) +@pytest.mark.parametrize("constant", [0, 0.0], ids=["int", "float"]) +@pytest.mark.parametrize( + "pad_width", + [10, (10, 0), (0, 10)], + ids=["symmetrical", "asymmetrical_left", "asymmetric_right"], +) +def test_constant_pad( + size: tuple, constant: int | float, pad_width: int | tuple[int, ...] +): + x = np.random.normal(size=size).astype(floatX) + expected = np.pad(x, pad_width, mode="constant", constant_values=constant) + z = pad(x, pad_width, mode="constant", constant_values=constant) + assert z.owner.op.pad_mode == "constant" + + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 5, 5)], ids=["1d", "2d square", "3d square"] +) +@pytest.mark.parametrize( + "pad_width", + [10, (10, 0), (0, 10)], + ids=["symmetrical", "asymmetrical_left", "asymmetric_right"], +) +def test_edge_pad(size: tuple, pad_width: int | tuple[int, ...]): + x = np.random.normal(size=size).astype(floatX) + expected = np.pad(x, pad_width, mode="edge") + z = pad(x, pad_width, mode="edge") + assert z.owner.op.pad_mode == "edge" + + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 5, 5)], ids=["1d", "2d square", "3d square"] +) +@pytest.mark.parametrize( + "pad_width", + [10, (10, 0), (0, 10)], + ids=["symmetrical", "asymmetrical_left", "asymmetric_right"], +) +@pytest.mark.parametrize("end_values", [0, -1], ids=["0", "-1"]) +def test_linear_ramp_pad( + size: tuple, + pad_width: int | tuple[int, ...], + end_values: int | float | tuple[int | float, ...], +): + x = np.random.normal(size=size).astype(floatX) + expected = np.pad(x, pad_width, mode="linear_ramp", end_values=end_values) + z = pad(x, pad_width, mode="linear_ramp", end_values=end_values) + assert z.owner.op.pad_mode == "linear_ramp" + + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 5, 5)], ids=["1d", "2d square", "3d square"] +) +@pytest.mark.parametrize( + "pad_width", + [10, (10, 0), (0, 10)], + ids=["symmetrical", "asymmetrical_left", "asymmetric_right"], +) +@pytest.mark.parametrize("stat", ["mean", "minimum", "maximum"]) +@pytest.mark.parametrize("stat_length", [None, 2]) +def test_stat_pad( + size: tuple, + pad_width: int | tuple[int, ...], + stat: PadMode, + stat_length: int | None, +): + x = np.random.normal(size=size).astype(floatX) + expected = np.pad(x, pad_width, mode=stat, stat_length=stat_length) + z = pad(x, pad_width, mode=stat, stat_length=stat_length) + assert z.owner.op.pad_mode == stat + + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 5, 5)], ids=["1d", "2d square", "3d square"] +) +@pytest.mark.parametrize( + "pad_width", + [10, (10, 0), (0, 10)], + ids=["symmetrical", "asymmetrical_left", "asymmetric_right"], +) +def test_wrap_pad(size: tuple, 
pad_width: int | tuple[int, ...]): + x = np.random.normal(size=size).astype(floatX) + expected = np.pad(x, pad_width, mode="wrap") + z = pad(x, pad_width, mode="wrap") + assert z.owner.op.pad_mode == "wrap" + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 5, 5)], ids=["1d", "2d square", "3d square"] +) +@pytest.mark.parametrize( + "pad_width", + [10, (10, 0), (0, 10)], + ids=["symmetrical", "asymmetrical_left", "asymmetric_right"], +) +@pytest.mark.parametrize( + "reflect_type", + ["even", pytest.param("odd", marks=pytest.mark.xfail(raises=NotImplementedError))], + ids=["even", "odd"], +) +def test_symmetric_pad( + size, + pad_width, + reflect_type: Literal["even", "odd"], +): + x = np.random.normal(size=size).astype(floatX) + expected = np.pad(x, pad_width, mode="symmetric", reflect_type=reflect_type) + z = pad(x, pad_width, mode="symmetric", reflect_type=reflect_type) + assert z.owner.op.pad_mode == "symmetric" + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 5, 5)], ids=["1d", "2d square", "3d square"] +) +@pytest.mark.parametrize( + "pad_width", + [10, (10, 0), (0, 10)], + ids=["symmetrical", "asymmetrical_left", "asymmetric_right"], +) +@pytest.mark.parametrize( + "reflect_type", + ["even", pytest.param("odd", marks=pytest.mark.xfail(raises=NotImplementedError))], + ids=["even", "odd"], +) +def test_reflect_pad( + size, + pad_width, + reflect_type: Literal["even", "odd"], +): + x = np.random.normal(size=size).astype(floatX) + expected = np.pad(x, pad_width, mode="reflect", reflect_type=reflect_type) + z = pad(x, pad_width, mode="reflect", reflect_type=reflect_type) + assert z.owner.op.pad_mode == "reflect" + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) + + +@pytest.mark.parametrize( + "mode", + [ + "constant", + "edge", + "linear_ramp", + "wrap", + "symmetric", + "reflect", + "mean", + "maximum", + "minimum", + ], +) +@pytest.mark.parametrize("padding", ["symmetric", "asymmetric"]) +def test_nd_padding(mode, padding): + rng = np.random.default_rng() + n = rng.integers(3, 5) + if padding == "symmetric": + pad_width = [(i, i) for i in rng.integers(1, 5, size=n)] + stat_length = [(i, i) for i in rng.integers(1, 5, size=n)] + else: + pad_width = rng.integers(1, 5, size=(n, 2)).tolist() + stat_length = rng.integers(1, 5, size=(n, 2)).tolist() + + test_kwargs = { + "constant": {"constant_values": 0}, + "linear_ramp": {"end_values": 0}, + "maximum": {"stat_length": stat_length}, + "mean": {"stat_length": stat_length}, + "minimum": {"stat_length": stat_length}, + "reflect": {"reflect_type": "even"}, + "symmetric": {"reflect_type": "even"}, + } + + x = np.random.normal(size=(2,) * n).astype(floatX) + kwargs = test_kwargs.get(mode, {}) + expected = np.pad(x, pad_width, mode=mode, **kwargs) + z = pad(x, pad_width, mode=mode, **kwargs) + f = pytensor.function([], z, mode="FAST_COMPILE") + + np.testing.assert_allclose(expected, f(), atol=ATOL, rtol=RTOL) diff --git a/tests/tensor/test_subtensor.py b/tests/tensor/test_subtensor.py index 427287dcfd..d02880f543 100644 --- a/tests/tensor/test_subtensor.py +++ b/tests/tensor/test_subtensor.py @@ -37,11 +37,13 @@ advanced_subtensor1, as_index_literal, basic_shape, + flip, get_canonical_form_slice, inc_subtensor, 
index_vars_to_types, indexed_result_shape, set_subtensor, + slice_at_axis, take, ) from pytensor.tensor.type import ( @@ -2902,3 +2904,39 @@ def test_vectorize_adv_subtensor( vectorize_pt(x_test, idx_test), vectorize_np(x_test, idx_test), ) + + +def test_slice_at_axis(): + x = ptb.tensor("x", shape=(3, 4, 5)) + x_sliced = x[slice_at_axis(slice(None, 1), axis=0)] + assert x_sliced.type.shape == (1, 4, 5) + + # Negative axis + x_sliced = x[slice_at_axis(slice(None, 1), axis=-2)] + assert x_sliced.type.shape == (3, 1, 5) + + +@pytest.mark.parametrize( + "size", [(3,), (3, 3), (3, 5, 5)], ids=["1d", "2d square", "3d square"] +) +def test_flip(size: tuple[int]): + from itertools import combinations + + ATOL = RTOL = 1e-8 if config.floatX == "float64" else 1e-4 + + x = np.random.normal(size=size).astype(config.floatX) + x_pt = pytensor.tensor.tensor(shape=size, name="x") + expected = np.flip(x, axis=None) + z = flip(x_pt, axis=None) + f = pytensor.function([x_pt], z, mode="FAST_COMPILE") + np.testing.assert_allclose(expected, f(x), atol=ATOL, rtol=RTOL) + + # Test all combinations of axes + flip_options = [ + axes for i in range(1, x.ndim + 1) for axes in combinations(range(x.ndim), r=i) + ] + for axes in flip_options: + expected = np.flip(x, axis=list(axes)) + z = flip(x_pt, axis=list(axes)) + f = pytensor.function([x_pt], z, mode="FAST_COMPILE") + np.testing.assert_allclose(expected, f(x), atol=ATOL, rtol=RTOL) From 8d25c1464d2cc2186ea78ffe4a50b52bc905677c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 17:29:36 +0000 Subject: [PATCH 42/72] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.2 → v0.5.4](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.2...v0.5.4) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8aee60b767..4b34d53b80 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: )$ - id: check-merge-conflict - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.2 + rev: v0.5.4 hooks: - id: ruff args: ["--fix", "--output-format=full"] From 6fcc37c83181a5a70307f3644f0141fbf35ba45f Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 16:52:20 -0400 Subject: [PATCH 43/72] Removed unused config options --- doc/library/config.rst | 43 ------------------- pytensor/configdefaults.py | 87 -------------------------------------- pytensor/configparser.py | 6 ++- 3 files changed, 4 insertions(+), 132 deletions(-) diff --git a/doc/library/config.rst b/doc/library/config.rst index 60f0f7e307..dac7e2c810 100644 --- a/doc/library/config.rst +++ b/doc/library/config.rst @@ -103,14 +103,6 @@ import ``pytensor`` and print the config variable, as in: String value: either ``'cpu'`` -.. attribute:: force_device - - Bool value: either ``True`` or ``False`` - - Default: ``False`` - - This flag's value cannot be modified during the program execution. - .. attribute:: print_active_device Bool value: either ``True`` or ``False`` @@ -139,16 +131,6 @@ import ``pytensor`` and print the config variable, as in: equal to ``float64`` is created. This can be used to help find upcasts to ``float64`` in user code. -.. 
attribute:: deterministic - - String value: either ``'default'``, ``'more'`` - - Default: ``'default'`` - - If ``more``, sometimes PyTensor will select :class:`Op` implementations that - are more "deterministic", but slower. See the ``dnn.conv.algo*`` - flags for more cases. - .. attribute:: allow_gc Bool value: either ``True`` or ``False`` @@ -412,16 +394,6 @@ import ``pytensor`` and print the config variable, as in: ignore it (i.e. ``'ignore'``). We suggest never using ``'ignore'`` except during testing. -.. attribute:: assert_no_cpu_op - - String value: ``'ignore'`` or ``'warn'`` or ``'raise'`` or ``'pdb'`` - - Default: ``'ignore'`` - - If there is a CPU :class:`Op` in the computational graph, depending on its value, - this flag can either raise a warning, an exception or drop into the frame - with ``pdb``. - .. attribute:: on_shape_error String value: ``'warn'`` or ``'raise'`` @@ -797,18 +769,3 @@ import ``pytensor`` and print the config variable, as in: The verbosity level of the meta-rewriter: ``0`` for silent, ``1`` to only warn when PyTensor cannot meta-rewrite an :class:`Op`, ``2`` for full output (e.g. timings and the rewrites selected). - - -.. attribute:: config.metaopt__optimizer_excluding - - Default: ``""`` - - A list of rewrite tags that we don't want included in the meta-rewriter. - Multiple tags are separate by ``':'``. - -.. attribute:: config.metaopt__optimizer_including - - Default: ``""`` - - A list of rewriter tags to be included during meta-rewriting. - Multiple tags are separate by ``':'``. diff --git a/pytensor/configdefaults.py b/pytensor/configdefaults.py index f3a8b4a146..44b3f8ad99 100644 --- a/pytensor/configdefaults.py +++ b/pytensor/configdefaults.py @@ -260,15 +260,6 @@ def add_basic_configvars(): ), ) - config.add( - "deterministic", - "If `more`, sometimes we will select some implementation that " - "are more deterministic, but slower. Also see " - "the dnn.conv.algo* flags to cover more cases.", - EnumStr("default", ["more"]), - in_c_key=False, - ) - config.add( "device", ("Default device for computations. only cpu is supported for now"), @@ -276,13 +267,6 @@ def add_basic_configvars(): in_c_key=False, ) - config.add( - "force_device", - "Raise an error if we can't use the specified device", - BoolParam(False, mutable=False), - in_c_key=False, - ) - config.add( "conv__assert_shape", "If True, AbstractConv* ops will verify that user-provided" @@ -299,14 +283,6 @@ def add_basic_configvars(): in_c_key=False, ) - # This flag determines whether or not to raise error/warning message if - # there is a CPU Op in the computational graph. - config.add( - "assert_no_cpu_op", - "Raise an error/warning if there is a CPU op in the computational graph.", - EnumStr("ignore", ["warn", "raise", "pdb"], mutable=True), - in_c_key=False, - ) config.add( "unpickle_function", ( @@ -1043,20 +1019,6 @@ def add_metaopt_configvars(): in_c_key=False, ) - config.add( - "metaopt__optimizer_excluding", - ("exclude optimizers with these tags. Separate tags with ':'."), - StrParam(""), - in_c_key=False, - ) - - config.add( - "metaopt__optimizer_including", - ("include optimizers with these tags. Separate tags with ':'."), - StrParam(""), - in_c_key=False, - ) - def add_vm_configvars(): config.add( @@ -1295,55 +1257,6 @@ def add_caching_dir_configvars(): ) -# Those are the options provided by PyTensor to choose algorithms at runtime. 
-SUPPORTED_DNN_CONV_ALGO_RUNTIME = ( - "guess_once", - "guess_on_shape_change", - "time_once", - "time_on_shape_change", -) - -# Those are the supported algorithm by PyTensor, -# The tests will reference those lists. -SUPPORTED_DNN_CONV_ALGO_FWD = ( - "small", - "none", - "large", - "fft", - "fft_tiling", - "winograd", - "winograd_non_fused", - *SUPPORTED_DNN_CONV_ALGO_RUNTIME, -) - -SUPPORTED_DNN_CONV_ALGO_BWD_DATA = ( - "none", - "deterministic", - "fft", - "fft_tiling", - "winograd", - "winograd_non_fused", - *SUPPORTED_DNN_CONV_ALGO_RUNTIME, -) - -SUPPORTED_DNN_CONV_ALGO_BWD_FILTER = ( - "none", - "deterministic", - "fft", - "small", - "winograd_non_fused", - "fft_tiling", - *SUPPORTED_DNN_CONV_ALGO_RUNTIME, -) - -SUPPORTED_DNN_CONV_PRECISION = ( - "as_input_f32", - "as_input", - "float16", - "float32", - "float64", -) - # Eventually, the instance of `PyTensorConfigParser` should be created right here, # where it is also populated with settings. config = _create_default_config() diff --git a/pytensor/configparser.py b/pytensor/configparser.py index 1656558668..1199485d74 100644 --- a/pytensor/configparser.py +++ b/pytensor/configparser.py @@ -75,6 +75,7 @@ class PyTensorConfigParser: pickle_test_value: bool cast_policy: str device: str + conv__assert_shape: bool print_global_stats: bool unpickle_function: bool # add_compile_configvars @@ -86,6 +87,7 @@ class PyTensorConfigParser: optimizer_verbose: bool on_opt_error: str nocleanup: bool + on_unused_input: str gcc__cxxflags: str cmodule__warn_no_version: bool cmodule__remove_gxx_opt: bool @@ -93,6 +95,7 @@ class PyTensorConfigParser: cmodule__preload_cache: bool cmodule__age_thresh_use: int cmodule__debug: bool + compile__wait: int compile__timeout: int # add_tensor_configvars tensor__cmp_sloppy: int @@ -143,6 +146,7 @@ class PyTensorConfigParser: optdb__max_use_ratio: float cycle_detection: str check_stack_trace: str + # add_metaopt_configvars metaopt__verbose: int # add_vm_configvars profile: bool @@ -177,7 +181,6 @@ def __init__( self._pytensor_cfg = pytensor_cfg self._pytensor_raw_cfg = pytensor_raw_cfg self._config_var_dict: dict = {} - super().__init__() def __str__(self, print_doc=True): sio = StringIO() @@ -375,7 +378,6 @@ def __init__( # more appropriate user-provided default value. # Calling `filter` here may actually be harmful if the default value is # invalid and causes a crash or has unwanted side effects. - super().__init__() @property def default(self): From 39612d1d6ebc5f4cc8dde6e9f89c601f7430ded6 Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 16:56:01 -0400 Subject: [PATCH 44/72] Remove add_experimental_configvars --- pytensor/configdefaults.py | 5 ----- pytensor/configparser.py | 1 - 2 files changed, 6 deletions(-) diff --git a/pytensor/configdefaults.py b/pytensor/configdefaults.py index 44b3f8ad99..42d3912ccf 100644 --- a/pytensor/configdefaults.py +++ b/pytensor/configdefaults.py @@ -585,10 +585,6 @@ def add_traceback_configvars(): ) -def add_experimental_configvars(): - return - - def add_error_and_warning_configvars(): ### # To disable some warning about old bug that are fixed now. 
@@ -1266,7 +1262,6 @@ def add_caching_dir_configvars(): add_compile_configvars() add_tensor_configvars() add_traceback_configvars() -add_experimental_configvars() add_error_and_warning_configvars() add_testvalue_and_checking_configvars() add_multiprocessing_configvars() diff --git a/pytensor/configparser.py b/pytensor/configparser.py index 1199485d74..815053b6e9 100644 --- a/pytensor/configparser.py +++ b/pytensor/configparser.py @@ -104,7 +104,6 @@ class PyTensorConfigParser: # add_traceback_configvars traceback__limit: int traceback__compile_limit: int - # add_experimental_configvars # add_error_and_warning_configvars warn__ignore_bug_before: int exception_verbosity: str From 9571d4fbca91aaff72a522686a0c772b89db9860 Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 17:06:12 -0400 Subject: [PATCH 45/72] Remove default in_c_key and change for cast_policy --- pytensor/configdefaults.py | 1 + pytensor/configparser.py | 4 +--- tests/test_config.py | 5 +++++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pytensor/configdefaults.py b/pytensor/configdefaults.py index 42d3912ccf..7fd6f951c7 100644 --- a/pytensor/configdefaults.py +++ b/pytensor/configdefaults.py @@ -258,6 +258,7 @@ def add_basic_configvars(): # was expected, so it is currently not available. # numpy, ), + in_c_key=False, ) config.add( diff --git a/pytensor/configparser.py b/pytensor/configparser.py index 815053b6e9..40e84f518a 100644 --- a/pytensor/configparser.py +++ b/pytensor/configparser.py @@ -214,9 +214,7 @@ def get_config_hash(self): ) ) - def add( - self, name: str, doc: str, configparam: "ConfigParam", in_c_key: bool = True - ): + def add(self, name: str, doc: str, configparam: "ConfigParam", in_c_key: bool): """Add a new variable to PyTensorConfigParser. This method performs some of the work of initializing `ConfigParam` instances. 
diff --git a/tests/test_config.py b/tests/test_config.py index 65705c6988..4a512085f4 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -98,6 +98,7 @@ def test_config_hash(): "test__config_hash", "A config var from a test case.", configparser.StrParam("test_default"), + in_c_key=True, ) h0 = root.get_config_hash() @@ -160,6 +161,7 @@ def test_config_context(): "test__config_context", "A config var from a test case.", configparser.StrParam("test_default"), + in_c_key=False, ) assert hasattr(root, "test__config_context") assert root.test__config_context == "test_default" @@ -181,6 +183,7 @@ def test_invalid_configvar_access(): "test__on_test_instance", "This config setting was added to the test instance.", configparser.IntParam(5), + in_c_key=False, ) assert hasattr(root_test, "test__on_test_instance") # While the property _actually_ exists on all instances, @@ -197,6 +200,7 @@ def test_invalid_configvar_access(): "test__on_test_instance", "This config setting was already added to another instance.", configparser.IntParam(5), + in_c_key=False, ) @@ -248,6 +252,7 @@ def test_config_pickling(): "test__lambda_kills_pickling", "Lambda functions cause pickling problems.", configparser.IntParam(5, lambda i: i > 0), + in_c_key=False, ) with pytest.raises(AttributeError, match="Can't pickle local object"): pickle.dump(root, io.BytesIO()) From ab4f150f2e27afb5bd0c6aba41f7067d0470f656 Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 17:24:53 -0400 Subject: [PATCH 46/72] Fix typo in docstring --- pytensor/configparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytensor/configparser.py b/pytensor/configparser.py index 40e84f518a..6ecbe051d4 100644 --- a/pytensor/configparser.py +++ b/pytensor/configparser.py @@ -281,7 +281,7 @@ def fetch_val_for_key(self, key, delete_key: bool = False): The (decreasing) priority order is: - PYTENSOR_FLAGS - - ~./pytensorrc + - ~/.pytensorrc """ From 153d209e77037c7ca726e4e065ab38c5382122e5 Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 17:47:42 -0400 Subject: [PATCH 47/72] Simplify _ChangeFlagDecorator --- pytensor/configparser.py | 10 +++------- tests/link/c/test_type.py | 2 +- tests/tensor/test_blas.py | 2 +- tests/test_config.py | 2 +- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/pytensor/configparser.py b/pytensor/configparser.py index 6ecbe051d4..c38f131c61 100644 --- a/pytensor/configparser.py +++ b/pytensor/configparser.py @@ -32,11 +32,7 @@ class ConfigAccessViolation(AttributeError): class _ChangeFlagsDecorator: - def __init__(self, *args, _root=None, **kwargs): - # the old API supported passing a dict as the first argument: - if args: - assert len(args) == 1 and isinstance(args[0], dict) - kwargs = dict(**args[0], **kwargs) + def __init__(self, _root=None, **kwargs): self.confs = {k: _root._config_var_dict[k] for k in kwargs} self.new_vals = kwargs self._root = _root @@ -310,14 +306,14 @@ def fetch_val_for_key(self, key, delete_key: bool = False): except (NoOptionError, NoSectionError): raise KeyError(key) - def change_flags(self, *args, **kwargs) -> _ChangeFlagsDecorator: + def change_flags(self, **kwargs) -> _ChangeFlagsDecorator: """ Use this as a decorator or context manager to change the value of PyTensor config variables. Useful during tests. 
""" - return _ChangeFlagsDecorator(*args, _root=self, **kwargs) + return _ChangeFlagsDecorator(_root=self, **kwargs) def warn_unused_flags(self): for key in self._flags_dict: diff --git a/tests/link/c/test_type.py b/tests/link/c/test_type.py index 84287e1607..0ebd249bf4 100644 --- a/tests/link/c/test_type.py +++ b/tests/link/c/test_type.py @@ -287,6 +287,6 @@ def test_op_with_cenumtype(self): assert val_billion == val_million * 1000 assert val_two_billions == val_billion * 2 - @pytensor.config.change_flags(**{"cmodule__debug": True}) + @pytensor.config.change_flags(cmodule__debug=True) def test_op_with_cenumtype_debug(self): self.test_op_with_cenumtype() diff --git a/tests/tensor/test_blas.py b/tests/tensor/test_blas.py index c2479edba9..34a1d1bcf9 100644 --- a/tests/tensor/test_blas.py +++ b/tests/tensor/test_blas.py @@ -514,7 +514,7 @@ def compute_ref( C = self.get_value(C, transpose_C, slice_C) return alpha * np.dot(A, B) + beta * C - @config.change_flags({"blas__ldflags": ""}) + @config.change_flags(blas__ldflags="") def run_gemm( self, dtype, diff --git a/tests/test_config.py b/tests/test_config.py index 4a512085f4..47a4e24035 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -168,7 +168,7 @@ def test_config_context(): with root.change_flags(test__config_context="new_value"): assert root.test__config_context == "new_value" - with root.change_flags({"test__config_context": "new_value2"}): + with root.change_flags(test__config_context="new_value2"): assert root.test__config_context == "new_value2" assert root.test__config_context == "new_value" assert root.test__config_context == "test_default" From 3aaf7569d9f99177a925d5bf928c4c3776a66538 Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 18:32:55 -0400 Subject: [PATCH 48/72] Fix typo amblibm -> amdlibm --- doc/library/config.rst | 2 +- pytensor/compile/profiling.py | 8 ++++---- pytensor/configdefaults.py | 2 +- pytensor/configparser.py | 2 +- pytensor/scalar/basic.py | 12 ++++++------ 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/doc/library/config.rst b/doc/library/config.rst index dac7e2c810..80fe090118 100644 --- a/doc/library/config.rst +++ b/doc/library/config.rst @@ -355,7 +355,7 @@ import ``pytensor`` and print the config variable, as in: When ``True``, ignore the first call to an PyTensor function while profiling. -.. attribute:: config.lib__amblibm +.. attribute:: config.lib__amdlibm Bool value: either ``True`` or ``False`` diff --git a/pytensor/compile/profiling.py b/pytensor/compile/profiling.py index a361ac5087..9d93431753 100644 --- a/pytensor/compile/profiling.py +++ b/pytensor/compile/profiling.py @@ -1566,26 +1566,26 @@ def exp_float32_op(op): printed_tip = True # tip 2 - if not config.lib__amblibm and any( + if not config.lib__amdlibm and any( amdlibm_speed_up(a.op) for (fgraph, a) in self.apply_time ): print( " - Try installing amdlibm and set the PyTensor flag " - "lib__amblibm=True. This speeds up only some Elemwise " + "lib__amdlibm=True. This speeds up only some Elemwise " "operation.", file=file, ) printed_tip = True # tip 3 - if not config.lib__amblibm and any( + if not config.lib__amdlibm and any( exp_float32_op(a.op) and a.inputs[0].dtype == "float32" for (fgraph, a) in self.apply_time ): print( " - With the default gcc libm, exp in float32 is slower " "than in float64! 
Try PyTensor flag floatX=float64, or " - "install amdlibm and set the pytensor flags lib__amblibm=True", + "install amdlibm and set the pytensor flags lib__amdlibm=True", file=file, ) printed_tip = True diff --git a/pytensor/configdefaults.py b/pytensor/configdefaults.py index 7fd6f951c7..f0cd279fa2 100644 --- a/pytensor/configdefaults.py +++ b/pytensor/configdefaults.py @@ -547,7 +547,7 @@ def add_tensor_configvars(): # http://developer.amd.com/CPU/LIBRARIES/LIBM/Pages/default.aspx config.add( - "lib__amblibm", + "lib__amdlibm", "Use amd's amdlibm numerical library", BoolParam(False), # Added elsewhere in the c key only when needed. diff --git a/pytensor/configparser.py b/pytensor/configparser.py index c38f131c61..5042c15d76 100644 --- a/pytensor/configparser.py +++ b/pytensor/configparser.py @@ -95,7 +95,7 @@ class PyTensorConfigParser: compile__timeout: int # add_tensor_configvars tensor__cmp_sloppy: int - lib__amblibm: bool + lib__amdlibm: bool tensor__insert_inplace_optimizer_validate_nb: int # add_traceback_configvars traceback__limit: int diff --git a/pytensor/scalar/basic.py b/pytensor/scalar/basic.py index 763323cdb2..d4c41d5cb5 100644 --- a/pytensor/scalar/basic.py +++ b/pytensor/scalar/basic.py @@ -356,18 +356,18 @@ def c_headers(self, c_compiler=None, **kwargs): # we declare them here and they will be re-used by TensorType l.append("") l.append("") - if config.lib__amblibm and c_compiler.supports_amdlibm: + if config.lib__amdlibm and c_compiler.supports_amdlibm: l += [""] return l def c_libraries(self, c_compiler=None, **kwargs): l = [] - if config.lib__amblibm and c_compiler and c_compiler.supports_amdlibm: + if config.lib__amdlibm and c_compiler and c_compiler.supports_amdlibm: l += ["amdlibm"] return l def c_compile_args(self, c_compiler=None, **kwargs): - if config.lib__amblibm and c_compiler and c_compiler.supports_amdlibm: + if config.lib__amdlibm and c_compiler and c_compiler.supports_amdlibm: return ["-DREPLACE_WITH_AMDLIBM"] else: return [] @@ -1245,7 +1245,7 @@ class UnaryScalarOp(ScalarOp): def c_code_contiguous(self, node, name, inputs, outputs, sub): (x,) = inputs (z,) = outputs - if not config.lib__amblibm or node.inputs[0].type != node.outputs[0].type: + if not config.lib__amdlibm or node.inputs[0].type != node.outputs[0].type: raise MethodNotDefined() dtype = node.inputs[0].type.dtype_specs()[1] @@ -1260,7 +1260,7 @@ def c_code_contiguous(self, node, name, inputs, outputs, sub): """ def c_code_contiguous_raw(self, dtype, n, i, o): - if not config.lib__amblibm: + if not config.lib__amdlibm: raise MethodNotDefined() if dtype.startswith("npy_"): dtype = dtype[4:] @@ -2296,7 +2296,7 @@ def L_op(self, inputs, outputs, gout): def c_code_contiguous(self, node, name, inputs, outputs, sub): (x, y) = inputs (z,) = outputs - if not config.lib__amblibm: + if not config.lib__amdlibm: raise MethodNotDefined() # We compare the dtype AND the broadcast flag From 1b2802e321a1cbe899aa4e83d0375d4ce1bd48a9 Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 18:39:23 -0400 Subject: [PATCH 49/72] Remove unused ContextsParam --- pytensor/configparser.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/pytensor/configparser.py b/pytensor/configparser.py index 5042c15d76..e587782e40 100644 --- a/pytensor/configparser.py +++ b/pytensor/configparser.py @@ -538,22 +538,6 @@ def __str__(self): return f"{self.name} ({self.default})" -class ContextsParam(ConfigParam): - def __init__(self): - super().__init__("", apply=self._apply, mutable=False) - - def 
_apply(self, val): - if val == "": - return val - for v in val.split(";"): - s = v.split("->") - if len(s) != 2: - raise ValueError(f"Malformed context map: {v}") - if s[0] == "cpu" or s[0].startswith("cuda") or s[0].startswith("opencl"): - raise ValueError(f"Cannot use {s[0]} as context name") - return val - - def parse_config_string( config_string: str, issue_warnings: bool = True ) -> dict[str, str]: From 9c6748f4e76bd97be59a1c713c1eb54c8c9a2390 Mon Sep 17 00:00:00 2001 From: Virgile Andreani Date: Fri, 19 Jul 2024 18:46:51 -0400 Subject: [PATCH 50/72] Simplify config.add(linker) --- pytensor/configdefaults.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/pytensor/configdefaults.py b/pytensor/configdefaults.py index f0cd279fa2..0353c58fcd 100644 --- a/pytensor/configdefaults.py +++ b/pytensor/configdefaults.py @@ -371,23 +371,11 @@ def add_compile_configvars(): if rc == 0 and config.cxx != "": # Keep the default linker the same as the one for the mode FAST_RUN - config.add( - "linker", - "Default linker used if the pytensor flags mode is Mode", - EnumStr( - "cvm", ["c|py", "py", "c", "c|py_nogc", "vm", "vm_nogc", "cvm_nogc"] - ), - in_c_key=False, - ) + linker_options = ["c|py", "py", "c", "c|py_nogc", "vm", "vm_nogc", "cvm_nogc"] else: # g++ is not present or the user disabled it, # linker should default to python only. - config.add( - "linker", - "Default linker used if the pytensor flags mode is Mode", - EnumStr("vm", ["py", "vm_nogc"]), - in_c_key=False, - ) + linker_options = ["py", "vm_nogc"] if type(config).cxx.is_default: # If the user provided an empty value for cxx, do not warn. _logger.warning( @@ -397,6 +385,13 @@ def add_compile_configvars(): "To remove this warning, set PyTensor flags cxx to an empty string." ) + config.add( + "linker", + "Default linker used if the pytensor flags mode is Mode", + EnumStr("cvm", linker_options), + in_c_key=False, + ) + # Keep the default value the same as the one for the mode FAST_RUN config.add( "allow_gc", From 9973e031315c0c85d5d7e11c0ae1505a2837c8f4 Mon Sep 17 00:00:00 2001 From: Pham Nguyen Hung <97870091+HangenYuu@users.noreply.github.com> Date: Thu, 25 Jul 2024 12:43:23 +0700 Subject: [PATCH 51/72] Fixed dead wiki links (#950) * Fixed dead wiki links * Fixed dead wiki links * Deleted old documentation at doc/sandbox. 
--- doc/introduction.rst | 4 +- doc/links.rst | 14 +- doc/sandbox/ccodegen.rst | 255 ----------------- doc/sandbox/compilation.rst | 18 -- doc/sandbox/debugging_with_stepmode.rst | 75 ----- doc/sandbox/elemwise_compiler.rst | 86 ------ doc/sandbox/function.rst | 9 - doc/sandbox/functional.rst | 7 - doc/sandbox/how_to_make_ops.rst | 295 -------------------- doc/sandbox/index.rst | 11 - doc/sandbox/index2.rst | 15 - doc/sandbox/interactive_debugger.rst | 56 ---- doc/sandbox/logistic_regression_example.rst | 77 ----- doc/sandbox/performance.rst | 23 -- doc/sandbox/randomnumbers.rst | 245 ---------------- doc/sandbox/rethinkccodegen.rst | 124 -------- doc/sandbox/sandbox.rst | 161 ----------- doc/sandbox/software.rst | 19 -- doc/sandbox/sparse.rst | 147 ---------- doc/sandbox/tensoroptools.rst | 9 - 20 files changed, 9 insertions(+), 1641 deletions(-) delete mode 100644 doc/sandbox/ccodegen.rst delete mode 100644 doc/sandbox/compilation.rst delete mode 100644 doc/sandbox/debugging_with_stepmode.rst delete mode 100644 doc/sandbox/elemwise_compiler.rst delete mode 100644 doc/sandbox/function.rst delete mode 100644 doc/sandbox/functional.rst delete mode 100644 doc/sandbox/how_to_make_ops.rst delete mode 100644 doc/sandbox/index.rst delete mode 100644 doc/sandbox/index2.rst delete mode 100644 doc/sandbox/interactive_debugger.rst delete mode 100644 doc/sandbox/logistic_regression_example.rst delete mode 100644 doc/sandbox/performance.rst delete mode 100644 doc/sandbox/randomnumbers.rst delete mode 100644 doc/sandbox/rethinkccodegen.rst delete mode 100644 doc/sandbox/sandbox.rst delete mode 100644 doc/sandbox/software.rst delete mode 100644 doc/sandbox/sparse.rst delete mode 100644 doc/sandbox/tensoroptools.rst diff --git a/doc/introduction.rst b/doc/introduction.rst index cfbfeaf90f..5c7a837fa9 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -157,9 +157,9 @@ to extend PyTensor, please feel free to ask. install tutorial/index -.. _LISA: https://mila.umontreal.ca/ +.. _LISA: https://mila.quebec/en .. _Greek mathematician: http://en.wikipedia.org/wiki/Theano_(mathematician) -.. _numpy: http://numpy.scipy.org/ +.. _numpy: https://numpy.org/ .. _BLAS: http://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms .. _sympy: http://www.sympy.org/ diff --git a/doc/links.rst b/doc/links.rst index 8d2689fed1..ec22e14f12 100644 --- a/doc/links.rst +++ b/doc/links.rst @@ -39,18 +39,18 @@ This is a sort of memo for developers and would-be developers. .. _git: http://git-scm.com/ .. _pytest: http://docs.pytest.org/en/latest/ -.. _numpy: http://numpy.scipy.org/ +.. _numpy: https://numpy.org/ .. _python: http://www.python.org .. _scipy: http://scipy.org/ .. _autodiff: http://www.autodiff.org -.. _boost.python: http://www.boost.org/doc/libs/1_38_0/libs/python/doc/index.html +.. _boost.python: https://www.boost.org/doc/libs/1_85_0/libs/python/doc/html/index.html .. _cython: http://www.cython.org/ .. _liboil: http://liboil.freedesktop.org/wiki/ .. _llvm: http://llvm.org/ -.. _networkx: http://networkx.lanl.gov/ -.. _pypy: http://codespeak.net/pypy/dist/pypy/doc/ +.. _networkx: https://networkx.org/ +.. _pypy: https://doc.pypy.org/en/latest/ .. _swig: http://www.swig.org/ -.. _unpython: http://code.google.com/p/unpython/ -.. _pycppad: http://www.seanet.com/~bradbell/pycppad/index.xml -.. _shedskin: http://shed-skin.blogspot.com/ +.. _unpython: https://code.google.com/archive/p/unpython/ +.. _pycppad: https://github.com/Simple-Robotics/pycppad +.. 
_shedskin: https://shedskin.github.io/shedskin/ diff --git a/doc/sandbox/ccodegen.rst b/doc/sandbox/ccodegen.rst deleted file mode 100644 index 1d9730b97d..0000000000 --- a/doc/sandbox/ccodegen.rst +++ /dev/null @@ -1,255 +0,0 @@ -'''C code is actually generated this way. Could be refreshed as developer documentation. Olivier to review. 20080904.''' - -Here is a proposal on the interface to generate C code: - -What will be passed to C -======================== - -For each ResultBase, the C code gets a variable called storage_ which contains a PyObject* pointing to a 1-element list (a sort of cell). That is the "channel" via which C and Python can communicate data. Of course, the C code will not manipulate that directly. At every execution of the C function, the PyObject* inside the storage is extracted and given the name py_ (its reference count is handled automatically). - - -Extracting the data for use with C -================================== - -In ResultBase, we have several methods to generate C code for particular purposes. They should return templated strings of C code (see below) but should not actually fill the template. The caller will fill it. - -List of template variables you can use: - * '''%(name)s:''' Will be filled in by a mangled name representing this ResultBase. - * '''%(fail)s:''' This can be inserted in the code to make the current function fail. It will proceed to cleanup everything that needs to be cleaned up. This cannot be used in any cleanup routine (and hence it is forbidden for a cleanup routine to fail!) If a code block uses %(fail)s, its corresponding cleanup block will be called first, so make sure that the cleanup can be done properly at any point where you use %(fail)s, even if you didn't allocate or INCREF everything yet. - -List of methods in ResultBase: - -'''c_declare:''' This method returns code that declares one or more variables ''without'' initializing them. These are the variables that all C code using this ResultBase will use to manipulate the data. The code should ''only'' declare variables and typedefs (no #defines, but a future extension might address this). Example: if we have a ResultBase representing a double, c_declare may simply return "double %(name)s;". ''All'' variables declared should contain the %(name)s template, but they may prefix or suffix it. - -'''c_init:''' This method returns code that initializes (zeros/sets to NULL, typically) the variables declared in c_declare. - -'''c_extract:''' This method should manipulate py_ to set the values of the variables declared by c_declare. For example, if we have a ResultBase representing a double, c_extract might return "%(name)s = PyFloat_AsDouble(py_%(name)s);" (plus error checking!). If something is wrong with the data provided from Python, c_extract should set an informative error message and insert %(fail)s. - -'''c_sync:''' This method should adjust the py_ variable using the values of the variables declared by c_declare. For example, if we have a ResultBase representing a double, c_sync might return "Py_XDECREF(py_%(name)s); py_%(name)s = PyFloat_FromDouble(%(name)s);". The result will then be made accessible from Python. c_sync is not allowed to fail, though it is not really cleanup code. - -'''c_cleanup:''' This method should clean up all the variables declared by c_declare. - -.. warning:: - - This page describes usage of c_init and c_extract as of version 0.4.0 (and - previous versions). 
This will change in the future, to allow c_code to - use preallocated memory buffers of the outputs. - -Important notes: - * ''Either'' c_init or c_extract will be called. The former for temporary variables and outputs, the latter for inputs. If the former is used, py_ will be set to Py_None regardless of what is in storage_. - * c_sync will only be called on the outputs, not on inputs or temporaries. - * c_cleanup will ''always'' be called. If c_sync decides to relay some data to Python (thus ousting it from the op's scope), it should NULL any pointers that c_cleanup is not allowed to free. - - -Manipulating the data from C -============================ - -The Op class has in turn several methods that generate C code. As for ResultBase, they should return templated strings of C code (see below) but should not actually fill the template. The caller will fill it. - -List of template variables you can use: - * '''%()s:''' See c_var_names. These will be substituted for mangled names. - * '''%(fail)s:''' This can be inserted in the code to make the current function fail. It will proceed to cleanup everything that needs to be cleaned up. This cannot be used in any cleanup routine (and hence it is forbidden for a cleanup routine to fail!). If a code block uses %(fail)s, its corresponding cleanup block will be called first, so make sure that the cleanup can be done properly at any point where you use %(fail)s, even if you didn't allocate or INCREF everything yet. - -'''c_var_names''': This method should return two lists, one list of strings representing the input names and one list of strings representing the output names. The actual names might be mangled by the compiler. In the template strings returned by the next few methods, you can use the names defined here. For example, if op.c_var_names() returns [['x', 'y'], ['z']], then "%(x)s" in op's templates will be the same as "%(name)s" in op.inputs[0]'s templates. This means that all the variables declared by the inputs and outputs can easily be used in the op's templates. - -'''c_validate_update''': This method should return code that ensures that the inputs are valid for processing by this Op (checking shapes, bounds, etc.). If anything is invalid, it should set an informative error message and use %(fail)s. Then, it should prepare the outputs: for example, if the output is a tensor, allocate a tensor, resize it appropriately and place it in the appropriate variable (see c_var_names). - -'''c_validate_update_cleanup''': This method should clean up any temporary storage used by c_validate_update. It is not forbidden to do it in c_validate_update itself, but this can come in handy. - -'''c_code''': This is the meat of the Op that actually calculates the function. If an error occurs in the process, it may use %(fail)s. It should work in place on the variables declared by its inputs and outputs and rely on their c_sync routines to relay the results to Python. - -'''c_code_cleanup''': This cleans up any temporary structures allocated by c_code. - -'''c_is_simple (field)''': Class field. Defaults to False. It is basically a compiler hint that this class represents a builtin C type or a small struct, so we can optimize its access. - - -Important notes: - * There might be provisions in the future to skip the validate_update step if the Op can guarantee that the inputs are valid and the outputs are set up properly. - * It is not forbidden to just put the validate_update code in c_code. 
Some situations might require it, but it helps organization to segregate them. - - -Failure -======= - -Besides cleanup code, all code has access to the %(fail)s template. For three code blocks, the generated C code will pretty much look like this: - -.. code-block:: cpp - - int failure = 0; - { - - { - - { - - label3: - - } - label2: - - } - label1: - - } - return failure; - -And %(fail)s in the nth code block will take the value "{failure = n; goto label;}". This means only the blocks executed up to the failure point are cleaned up and the return value indicates which block failed, which is handy for debugging. - -When compiling an Op, we want to sync the outputs so we can get the results from Python. In case of failure, we will not necessarily want to sync. Because of that, typical code will look like this: - -.. code-block:: cpp - - int failure = 0; - - - { - - { - - { - - label3: - - } - label2: - if (!failure) - - - } - label1: - - } - return failure; - -Furthermore, is not necessary to extract the output because we mean to overwrite it anyway. In that case, will be a no-op, but of course we may still need to clean up or sync what will put in the declared outputs. - - -Example ResultBase -================== - -The following ResultBase represents a double (we only care about the C part). - -.. code-block:: python - - class Double(ResultBase): - # - def c_declare(self): - return "double %(name)s;" - def c_init(self): - return "%(name)s = 0.0;" - def c_extract(self): - return "%(name)s = PyFloat_AsDouble(py_%(name)s);" - def c_cleanup(self): - return "" # nothing to do - def c_sync(self): - return "Py_XDECREF(py_%(name)s); py_%(name)s = PyFloat_FromDouble(%(name)s);" - - -Example Op -========== - -The following ResultBase represents addition of two nonnegative doubles (we only care about the C part). - -.. code-block:: python - - class Add(COp): - # - def c_var_names(self): - return "[['x', 'y'], ['z']]" - def c_validate_update(self): - return "if (%(x)s < 0 || %(y)s < 0) %(fail)s" # fail if x or y is negative - def c_validate_update_cleanup(self): - return "" # nothing to do - def c_code(self): - return "%(z)s = %(x)s + %(y)s;" - def c_code_cleanup(self): - return "" # nothing to do - -Generating a C function -======================= - -For the example Op, the generated C function will typically look like this: - -.. code-block:: cpp - - void add(PyObject* storage_x, PyObject* storage_y, PyObject* storage_z) { - PyObject* py_x = PyList_GET_ITEM(storage_x, 0); Py_XINCREF(py_x); // automatic - PyObject* py_y = PyList_GET_ITEM(storage_y, 0); Py_XINCREF(py_y); // automatic - PyObject* py_z = Py_None; // we don't care what's currently in storage_z - - failure = 0 - double x; // x.c_declare - double y; // y.c_declare - double z; // z.c_declare - { - x = PyFloat_AsDouble(py_x); // x.c_extract - { - y = PyFloat_AsDouble(py_y); // y.c_extract - { - # we don't need to use z.c_extract - { - if (x < 0 || y < 0) { // add.validate_update - // This is automatically inserted in place of %(fail)s - failure = 4; - goto label_add_validate_update_cleanup; - } - { - z = x + y; // add.c_code - label_add_code_cleanup: - } - label_add_validate_update_cleanup: - } - label_z_sync_or_cleanup: - if (!failure) { - Py_XDECREF(py_z); // z.c_sync - py_z = PyFloat_FromDouble(z); // z.c_sync, the result is now available from Python! 
- PyList_SET_ITEM(storage_z, 0, py_z); // always done after _.c_sync - } - Py_XDECREF(py_z); // always done after _.c_cleanup - } - label_y_cleanup: - Py_XDECREF(py_y); // always done after _.c_cleanup - } - label_x_cleanup: - Py_XDECREF(py_x); // always done after _.c_cleanup - } - return failure; - } - -Generating a C struct -===================== - -To accelerate processing a tad, a struct can be generated instead of a function. The struct will keep pointers to the storage where to fetch inputs and store outputs, but it will also store fields declared by outputs and temporaries' c_declare methods. - -Here is a sketch of the struct equivalent of the previous function: - -.. code-block:: cpp - - struct add { - PyObject* storage_x; - PyObject* storage_y; - PyObject* storage_z; - double z; // z.c_declare - - void init(PyObject* storage_x, PyObject* storage_y, PyObject* storage_z) { - // - // - } - - void cleanup(void) { - // - } - - void run(void) { - // - } - - add() { this->init(); } - ~add() { this->cleanup(); } - }; - -Advantages of using a struct: - * Can be run several times even if we provide the storage only once. - * Output variables or temporary variables can reuse what they allocated the last time. This is not particularly useful with doubles (in fact it might be detrimental), but if z was a large tensor it might be interesting to recycle the memory over thousands of runs of the Op. - -No struct members will be made if a result's c_is_simple field is True. They will be allocated on the stack instead. diff --git a/doc/sandbox/compilation.rst b/doc/sandbox/compilation.rst deleted file mode 100644 index fad7d71ef9..0000000000 --- a/doc/sandbox/compilation.rst +++ /dev/null @@ -1,18 +0,0 @@ - -.. _compilation: - -======================= -Compilation and Linking -======================= - -.. index:: - single: Linker - -.. _linker: - -Linker -====== - -WRITEME - - diff --git a/doc/sandbox/debugging_with_stepmode.rst b/doc/sandbox/debugging_with_stepmode.rst deleted file mode 100644 index fba3a63e71..0000000000 --- a/doc/sandbox/debugging_with_stepmode.rst +++ /dev/null @@ -1,75 +0,0 @@ - -.. _sandbox_debugging_step_mode: - -Debugging with a customized so-called StepMode -============================================== - -One convenient trick I've found for debugging my programs that are running with pytensor is to -use what I call a 'StepMode'. There is no such StepMode in the standard library because the -purpose of it is to hack it to investigate what your own particular program is doing. - - -.. code-block:: python - - from pytensor.link import WrapLinkerMany - from pytensor.configdefaults import config - from pytensor.compile.mode import (Mode, register_mode, predefined_modes, predefined_linkers, - predefined_optimizers) - - class StepMode(Mode): - def __init__(self, linker=None, optimizer='default'): - if linker is None: - linker = config.linker - if optimizer is 'default': - optimizer = config.optimizer - def blah(i, node, th): - # This function will be run for each node in your compiled program. - # here you can inspect all the values as they are computed, - # ... you can even change them ! - - # 'i' is the execution position in the serialized graph - # node is the symbolic Apply instance - # th is a callable thing that will compute the node. 
- - print i, node, len(th.inputs) - - # the symbolic inputs of the node are in node.inputs - # the j'th non-symbolic input of the node is in th.inputs[j][0] - - th() # call the function to actually 'run' the graph - - # the symbolic outputs of the node are in node.outputs - # the j'th non-symbolic output of the node is in th.outputs[j][0] - - print type(th.outputs[0][0]) - - if i == 39: - print 'this node is weird...', th.outputs[0][0] - - - self.provided_linker = linker - self.provided_optimizer = optimizer - if isinstance(linker, basestring) or linker is None: - linker = predefined_linkers[linker] - - self.linker = WrapLinkerMany([linker], [blah]) - - if isinstance(optimizer, basestring) or optimizer is None: - optimizer = predefined_optimizers[optimizer] - self._optimizer = optimizer - - - -The way to use it is like this: - -.. code-block:: python - - fn = function(inputs, outputs, mode=StepMode()) - -When you call fn, your function in the stepmode will be called for each node in the compiled -program. You can print out some or all of the values, you can change them in mid-execution. -You can see where bizarre values are first occurring in your computations. It's a very -powerful way to understand your program's execution. - -Remember, if you give names your variables then printing nodes will give you a better idea of -where in the calculations you are. diff --git a/doc/sandbox/elemwise_compiler.rst b/doc/sandbox/elemwise_compiler.rst deleted file mode 100644 index 8c7825b7c4..0000000000 --- a/doc/sandbox/elemwise_compiler.rst +++ /dev/null @@ -1,86 +0,0 @@ -.. _sandbox_elemwise: - -========================== -:class:`Elemwise` compiler -========================== - -.. todo:: Stale specification page. Upgrade this to provide useful developer doc. 2008.09.04 - -Definitions -=========== - -The element-wise compiler takes inputs {{{(in0, in1, in2, ...)}}}, outputs {{{(out0, out1, out2, ...)}}}, broadcast modes {{{(mod0, mod1, mod2, ...)}}} where each mode corresponds to an output as well as {{{order}}} which determines if we broadcast/accumulate over the first or last dimensions (the looping order, basically, but some operations are only valid for one particular order!). - -The broadcast mode serves to calculate the rank of the corresponding output and how to map each input element to an output element: - - * {{{broadcast}}} - * output.rank = max(input.rank) - * the inputs of lesser rank are broadcasted over missing dimensions - * if {{{order == f}}} ([3, 5], [5]) => [3, 5] or ([7, 8, 9], [8, 9]) => [7, 8, 9] - * if {{{order == c}}} ([3, 5], [3]) => [3, 5] or ([7, 8, 9], [7, 8]) => [7, 8, 9] - * {{{(accumulate, Accumulator)}}} - * output.rank = min(input.rank) - * for the inputs of greater rank, we use Accumulator (sum, product, etc.) to accumulate over the first dimensions - - * e.g. {{{if Accumulator == sum, order == c, x.rank == 2, y.rank == 1 and z = f(x, y) then z[i] = f(sum_j(x[i, j]), y[i])}}} - - * if {{{order == f}}} ([3, 5], [5]) => [5] or ([7, 8, 9], [8, 9]) => [8, 9] - * if {{{order == c}}} ([3, 5], [3]) => [3] or ([7, 8, 9], [7, 8]) => [7, 8] - -{{{order == c}}} is equivalent to transposing the outputs of an {{{order == f}}} operation on transposed inputs. - -This does not cover all cases of broadcasting, but I believe they cover enough. Other cases of broadcasting can be emulated with proper transposition and/or slicing. - * Could you give some examples of what kinds of broadcasting are and are not covered by your proposed implementation? 
- - * For rank <= 2, I think only operations of the form {{{add(ones(3,1), ones(1,3)))}}} are missing. I actually didn't think of that one before now. - * In general, it only handles f(shape(head, ...), shape(head, ...), ...) and f(shape(..., tail), shape(..., tail), ...) - * Maybe I could add a general case later... the thing is that I think the ones I am considering here are easier to streamline. - -Point of clarification: the order discussed here corresponds to a set of broadcasting rules, and is independent from the storage order. The 'f' order corresponds to numpy's broadcasting rules, while the 'c' order is something new and different (TODO VERIFY!) - -Question: does it make sense to apply the order to the loop, or is this broadcast order something which will be local to each input argument. What happens when the elemwise compiler deals with more complex subgraphs with multiple inputs and outputs? - -The loop -======== - -Here is the loop for {{{order == c}}}. Check for errors! - -.. code-block:: cpp - - - - i1 = -1 - while (++i1 < dim1) { - i2 = -1 - rank_N-1_accumulator = init - while (++i2 < dim2) { - ... - iN = -1 - while (++iN < dimN) { - - - - } - ... - } - - - } - -When {{{order == f}}}, the iterators ''ideally'' (but not necessarily) iterate in FORTRAN order, i.e. the while loops are on {{{dimN..dim1}}} instead of {{{dim1..dimN}}}. - -{{{order}}} does __not__ represent the {{{C/F_CONTIGUOUS}}} flags of the inputs or outputs. Depending on combinations of those parameters, different loops will be used. If {{{order == f and C_CONTIGUOUS(array)}}}, for example, the loop will be on {{{dim1..dimN}}} and the matrices of lesser rank will need to be looped over several times. - -An rewrite should look at the operations in the graph and figure out whether to allocate C_CONTIGUOUS (ideal for {{{order == c}}}) or F_CONTIGUOUS (ideal for {{{order == f}}}) arrays. - -Gradient -======== - -The input ranks become the output ranks and gradients of the same rank as the outputs are added to the input list. If an output was given mode {{{broadcast}}}, then all inputs used to calculate it had to be broadcasted to that shape, so we must sum over the broadcasted dimensions on the gradient. The mode that we give to those inputs is therefore {{{(accumulate, sum)}}}. Inversely, if an output was given mode {{{(accumulate, sum)}}}, then all inputs used to calculate it had to be summed over those dimensions. Therefore, we give them mode {{{broadcast}}} in grad. Other accumulators than sum might prove more difficult. For example, the ith gradient for product is grad*product/x_i. Not sure how to handle that automatically. - * I don't exactly follow this paragraph, but I think I catch the general idea and it seems to me like it will work very well. - - * In a nutshell for {{{broadcast}}} I calculate the gradient as normal assuming the shape is broadcasted and then I sum over what I had to broadcast. - - * Could you explain why the accumulator gradient (e.g. product) can be trickier? - - * I thought about it and I figured that the general case is {{{g_accum[N-i+1], g_m[i] = grad_fn(accum[i-1], m[i], g_accum[N-i])}}} where {{{g_accum}}} is the accumulated gradient wrt the accumulator {{{accum}}}. It can be short-circuited in sum and product's case: for sum, grad_fn is the identity on its last argument so {{{g_m[i] == g_accum[i] == g_accum[0] == g_z for all i}}}. 
In product's case, {{{accum[i-1] == product(m[1:i-1]) and g_accum[N-i] == g_z * product(m[i+1:N])}}}, multiply them together and you obtain {{{g_z * product(m)/m[i]}}} where obviously we only need to compute {{{product(m)}}} once. It's worth handling those two special cases, for the general case I don't know. diff --git a/doc/sandbox/function.rst b/doc/sandbox/function.rst deleted file mode 100644 index f5a0a29f0d..0000000000 --- a/doc/sandbox/function.rst +++ /dev/null @@ -1,9 +0,0 @@ - -.. _function: - -================== -function interface -================== - -WRITEME - diff --git a/doc/sandbox/functional.rst b/doc/sandbox/functional.rst deleted file mode 100644 index 97d4d65b52..0000000000 --- a/doc/sandbox/functional.rst +++ /dev/null @@ -1,7 +0,0 @@ - -========== -Functional -========== - -Want to know about PyTensor's `function design -`? diff --git a/doc/sandbox/how_to_make_ops.rst b/doc/sandbox/how_to_make_ops.rst deleted file mode 100644 index 9fd92e0d04..0000000000 --- a/doc/sandbox/how_to_make_ops.rst +++ /dev/null @@ -1,295 +0,0 @@ -.. _how_to_make_ops: - -################# -How to Make Ops -################# - - -Parametrization -=============== - -An Op class can represent one or a wide variety of functions depending on how you choose to parametrize it. The parameters of an Op do not show up in the structure of the computation graph - they are local to the Op. [*What does the last sentence mean? What is its effect?*] When an Op's ``make_node`` function is called on an Op instance with a list of inputs, the computation that is performed depends on the type and value of those inputs and on the internal parameters of the Op. - -It is not always obvious what should be a parameter and what should be an input. For example, a generic indexing Op could take a list and an index as graph inputs, whereas a specific indexing Op could have an index parameter, so you could have a specialized Op instance to fetch the nth element of a list, where n is known statically. [*Could you give some advice about the relative tradeoffs of having something as a parameter and something as an input?*] - -Examples of parameterized Ops in pytensor: - ``Broadcast(, )`` - upgrades an op that works on scalars so it works on tensors. Can work inplace or not. - ``Reduce(, )`` - reduces the specified axes using the provided scalar op. - ``Add()`` - adds scalars and puts the variable in a scalar whose type is inferred from the input types using ``output_type_inferrer(*inputs)`` - ``Composite()`` - makes a single Op out of a graph of scalar operations. - -[*These examples are a little abstract. I'm not sure what are the inputs and what are the parameters. Maybe also give like something that has a random seed.*] - -Ideas: - ``MyOp()`` - prints debugging information in perform or the C implementation if debug is True. - ``MyOp()`` - always use the python implementation if allow C is False (raise an exception in c_code) - -``__eq__``, ``__ne__`` and ``__hash__`` ---------------------------------------------- - -In order for certain rewrites to apply (such as the merging of duplicate -calculations by `MergeOptimizer`), it is necessary for `Op`\s that do the same -thing to compare equal. If `Op` instances are generated by a function call -(for example) then it can happen that several different `Op` instances do the -same thing; in that case you will have to override `Op.__eq__`, `Op.__ne__`, and -`Op.__hash__` for the `MergeOptimizer` to recognize them as equal. 
- -Recall: the contract for any ``__hash__`` is that ``a == b`` implies ``hash(a) == hash(b)``. - -:meth:`Op.make_node` -==================== - -The :meth:`Op.make_node` method is expected to have the following signature: - -.. code-block:: python - - make_node(self, *inputs) - -``inputs`` may be a list of anything that the user wants to provide as symbolic -input (symbolic: standing for the actual values that will be passed when the -graph is compiled into an executable function). [*The PyTensor intro should -describe symbolic in greater depth, and we should link to that from here.*] This -may or may not include Variable instances (but if you want the inputs of this Op -to sometimes be outputs of another Op, then the inputs should be Variable -instances). [*What else could they be? Constant, Values, ...*] The return value -should be an instance of [GraphStructures Apply] (see the example below). Here -are the tasks typically handled in ``make_node``. - - * Check that the inputs are valid (type checking, etc.). [*Since we don't actually have values, what can we do besides type checking?*] - * If needed, wrap the inputs in Variable instances with the proper type. - * Make the Variable instances that will serve as the outputs of the node. - * ``return Apply(self, , )`` - -The ``inputs`` and ``outputs`` arguments to ``Apply`` must be lists of -`Variable` instances (or instances of subclasses of ``Variable``). The inputs -given to `Apply` do not have to be the same as the inputs passed to -`make_node`, but it is recommended that the order corresponds. [*why?*] The -behavior of `make_node` should not depend on the structure of the graph of -[*or?*] its inputs: it may look at the type and type fields of its inputs, but -not at their owner field, because modifications to the graph structure do not -use `make_node`. - -Example: - -.. code-block:: python - - from pytensor.scalar import * - - class Add(Op): - #... - def make_node(self, x, y): - # note 1: constant, int64 and ScalarType are defined in pytensor.scalar - # note 2: constant(x) is equivalent to Constant(type=int64, data=x) - # note 3: the call int64() is equivalent to Variable(type=int64, None) or Variable(type=ScalarType(dtype = 'int64'), None) - if isinstance(x, int): - x = constant(x) - elif not isinstance(x, Variable) or not x.type == int64: - raise TypeError("expected an int64 ScalarType") - if isinstance(y, int): - y = constant(y) - elif not isinstance(y, Variable) or not x.type == int64: - raise TypeError("expected an int64 ScalarType") - inputs = [x, y] - outputs = [int64()] - node = Apply(op = self, inputs = inputs, outputs = outputs) - return node - #... - - add = Add() # I make an instance of Add - node1 = add.make_node(int64(), int64()) # I make a node with two Variable inputs - node2 = add.make_node(1, 2) # this works too - node3 = add.make_node(int64(), 79) # this works three - node4 = add.make_node(float64(), int64()) # this raises a TypeError - -[*What type is an instance of Add? It's an Apply? But that's not a Variable, and cannot be used as input for another Op.*] - -Two Apply nodes ``node1`` and ``node2`` are *assumed* by the compiler to represent the same behavior if: - 1. ``node1.op == node2.op`` - 1. ``all(input1.type == input2.type for input1, input2 in zip(node1.inputs, node2.inputs))`` - 1. ``all(output1.type == output2.type for output1, output2 in zip(node1.outputs, node2.outputs))`` - -It is considered an *error* to have conditions 1 and 2 but not condition 3. 
A corollary to those conditions is that repeated calls to ``make_node`` with the same inputs should produce equivalent nodes. - -``__call__`` ----------------- - -In ``Op``, ``__call__`` is defined in terms of ``make_node``. Instead of returning a node, it returns the output Variables directly, which is practical from a UI standpoint. Here is pseudocode: - -.. code-block:: python - - if len(outputs) is 1: - __call__(*inputs) <=> make_node(*inputs).outputs[0] - else: - __call__(*inputs) <=> make_node(*inputs).outputs - -It is not necessary or recommended to override ``__call__`` unless you want to hide some outputs from view (see hidden outputs section). - -perform -======= - -The ``perform`` method is expected to have the following signature: - -`` -perform(self, node, inputs, output_storage) -`` - -Where: - * *node*: a pointer to an Apply instance - ``node`` is assumed to be produced by a previous call to ``self.make_node``. - * *inputs*: *not* the same as ``node.inputs`` - it is a list of values. [*i.e. actually data, not just symbolic stuff?*] - * *output_storage*: *not* the same as ``node.outputs`` - it is a list of lists of length 1 where the variables of the computation must be put. - -[*Can you explain better how inputs is not node.inputs and output_storage is not node.outputs?*] - -[*Would it be better to call inputs as 'inputs_storage'?*] - -Here is an example of a properly defined ``perform``: - -.. code-block:: python - - class Add(Op): - ... - def perform(self, node, inputs, output_storage): - # this does z = x + y - x, y = inputs # extract the two inputs - z, = output_storage # extract the one storage (the comma after z is not optional) - z[0] = x + y # we must put the variable in z[0] - ... - - add = Add() # I make an instance of Add - node = add.make_node(int64(), int64()) # I make a node with two integer inputs - storage = [None] # I make my storage as a 1-element list with None - add.perform(node, (3, 7), (storage, )) # I provide the node, two inputs and storage for one output - print storage[0] # prints 10 - -[*Why is node never used in the perform function? Why is self never used?*] - -[*What does the comma after z do? Why is it not optional?*] - -The ``node`` parameter is not always needed, but might come in handy sometimes [*when?*]. There are as many entries in ``output_storage`` as there are in ``node.outputs`` and each entry is a list of length 1. The outputs must be computed from the inputs and put in those lists. The lists in ``output_storage`` must not be resized - the only allowed operation is to set or read their first element. [*Since these instructions correspond to more general principles, could you state the principles of the contract more generally and put it __above__ the example?*] - -reusing outputs ---------------- - -The output storage in ``output_storage`` might not be empty. In fact, whatever the op allocates to store the computation and puts in the storage *might* still be there the second time around. [*huh?*] This is an intended feature and it is acceptable for ``perform`` to *reuse* what is in the output storage if it is worth it. For example, if ``perform`` must add two ``1000x1000`` matrices into a new matrix of the same size and that there is already a ``1000x1000`` matrix in the corresponding output storage, it may reuse it and thus save a lot in memory and allocation time. It may also freely discard what is already there. - -Note that it is not *guaranteed* that the outputs will stick around. 
Indeed, the linker may, at its discretion, clean them up. It is not guaranteed either (though it will usually be the case) that the contents of the output storage was allocated by a previous call to ``perform``. It *is* however guaranteed that the contents are either ``None`` or a structure of the proper type which it can use. - -If the contents of the storage are ``None``, *new* storage is expected for that output (typical case is that we "gave" the output to the user so we don't own it anymore). Therefore, it is not acceptable to have a private cache of previously allocated storage unless you know what you are doing. - -Advanced note: for an Op with multiple outputs, it is possible that some of them can be reused and some others not. If an Op with multiple outputs shares storage between them, e.g. the first output is a view of the second, if the first output is reset to ``None``, the second should *not* be reused, even if it's available, because a fresh output is expected for the first. It is not recommended in general to share storage between outputs unless one of them is hidden (see hidden outputs section), because the engine does not know how to handle that situation safely. - -grad -==== - -``grad`` is an PyTensor-specific [*as opposed to?*] function - it does not interface with core rewrite and compilation facilities, but it provides a useful interface to differentiation. Its expected signature is: - -.. code-block:: python - - grad(self, inputs, output_gradients) - - -where: - * ``inputs`` is a list of Variable instances. It is assumed to be the ``inputs`` field of a node produced by ``make_node``. - * ``output_gradients`` is a list of Variable instances. They have the same properties as the outputs of the node, but are filled with gradient values. - -Essentially, the semantics are: - -.. code-block:: python - - # Not completely sure about this, James should doublecheck -jpt and ob - def grad(self, (x, ), (gz, )): - return [gz * (dz/dx)] - def grad(self, (x, y), (gz, )): - return gz*(dz/dx), gz*(dz/dy) - def grad(self, (x, y), (gz, gw)): - # In this situation you want two return values that have the shape of x and y respectively - return gz*dz/dx + gw*dw/dx, gz*dz/dy + gw*dw/dy - -More specifically, -``grad`` must return a list or tuple of input gradients, as many as there are inputs. Let C be a Variable (currently assumed to be a scalar) that depends through an PyTensor symbolic expression on the node outputs. Then each output_gradients[i] represents symbolically dC/doutputs[i]. The returned input gradients should represent symbolically dC/dinputs[i]. - -Example: - -.. code-block:: python - - class Mul(Op): - ... - def grad(self, inputs, output_gradients): - x, y = inputs - gz, = output_gradients # here again, the comma is not optional - return mul(gz, y), mul(gz, x) - ... - mul = Mul() - -If the op is not differentiable wrt one of its inputs, the gradient for that input should be ``None``; if the op is not differentiable with respect to any of its inputs, it should return something equivalent to -``[None] * len(inputs)``. If ``grad`` is not implemented for any op in a graph, then the symbolic gradient engine will complain (with an attribute exception). - - - -If the op only has one input, be careful to still return a list or tuple: - * fine: ``return gx,`` - * fine: ``return [gx]`` - * not fine: ``return gx`` - -The [http://www.iro.umontreal.ca/~pift6266/A06/cours/gradient.pdf principle] behide this is explaned in section 2. 
- -Destroyers and viewers -====================== - -Destroyers ----------- - -An Op may change the contents of its inputs. For example, ``z = add_inplace(x, y)`` will increment ``x`` with ``y``, erasing the previous contents of ``x``. ``z`` represents ``x`` after it was incremented. However, the engine needs to be told about all this so it can guarantee that ``add_inplace`` will only be executed as soon as we don't need ``x`` anywhere else. - -This is done by setting the ``destroy_map`` field of the op. ``destroy_map`` must be a dictionary which associates an output index or ``None`` to a list of input indices that are destroyed by that output. For example, ``add_inplace.destroy_map == {0: [0]``} because the first input is overwritten by the first output. If it was ``y`` that was overwritten, then ``destroy_map`` would be ``{0: [1]``}, because the second input is overwritten by the first output. In a nutshell, to each output must correspond the list of inputs that were changed and share storage with that output. Use ``None`` if the inputs were only destroyed to do temporary calculations, etc. and are not reused as the output storage. - -Viewers -------- - -Similarly, an Op might not modify the inputs, but return an output which shares state with one or several of its inputs. For example, ``transpose`` can be done efficiently by viewing the same data as the original with modified dimensions and strides. That is fine, but the compiler needs to be told. - -This is done by setting the ``view_map`` field of the op. It works like the ``destroy_map`` field: to an output index is associated the list of inputs that it shares state with. For example, ``transpose.view_map == {0: [0]``} because its first output uses the same data as its first input. ``view_map`` is conservative: if there is any probability that an output will be the view of an input, that input must be in the view list of that output. - -Important note: currently, an output can only be the view of one input. This is limiting, as an 'if' or 'switch' op would need to declare its output as a view of both its then and else branches, but for the time being the framework is not powerful enough to handle it. A future version should address this issue. - -Hidden outputs (as a form of op state) -====================================== - -For performance purposes, an ``op`` might want to have a hidden internal state. - -Example: if we expect to call the op repeatedly on incrementally bigger inputs, we might want private output storage that's a lot bigger than needed and take incrementally bigger views on it, to save allocation overhead. In order to do this, we can have two outputs: one that we will return normally and will contain the answer and the other that will be the (larger) container. In this case, the advanced note in the 'reusing outputs' section applies. Furthermore, ``__call__`` should be overridden to only return the first output instead of both of them. Here is what the example's ``perform`` and ``__call__`` would look like: - -.. code-block:: python - - class Add(Op): - """ - Use a hidden buffer to prevent unnecessary reallocation of memory. - """ - default_output = 0 - def make_node(self, x, y): - return Apply(self, [x,y], [x.type.make_variable(), x.type.make_variable()]) - - def perform(self, node, (x, y), (z, stor)): - if z[0] is None or stor[0] is None: - stor[0] = numpy.ndarray(x.size * 2) - else: - if x.size > stor[0].size: - stor[0].resize(x.size * 2, refcheck = 0) - z[0] = stor[0][:x.size] - numpy.add(x, y, z[0]) - ... 
- -Another example: for a FFTW Op, we would like to cache FFTW's plan along -with the inputs it was computed on, so we can reuse it if the inputs -are similar to the previous ones. - -It is also possible but potentially more complicated to use "private -inputs" to do the same thing: inputs cannot be set, though their contents -can be modified, so a wrapper would be needed and the input must be -marked as 'destroyed' by the Op using the 'destroy_map' field. diff --git a/doc/sandbox/index.rst b/doc/sandbox/index.rst deleted file mode 100644 index afbff2cb5e..0000000000 --- a/doc/sandbox/index.rst +++ /dev/null @@ -1,11 +0,0 @@ -:orphan: - -========================================================= -Sandbox, this documentation may or may not be out-of-date -========================================================= - -.. toctree:: - :glob: - - * - diff --git a/doc/sandbox/index2.rst b/doc/sandbox/index2.rst deleted file mode 100644 index 8b1c02b948..0000000000 --- a/doc/sandbox/index2.rst +++ /dev/null @@ -1,15 +0,0 @@ - -.. _advanced: - -==================================== -Advanced Topics (under construction) -==================================== - -.. toctree:: - :maxdepth: 2 - - compilation - ccodegen - function - debugging_with_stepmode - diff --git a/doc/sandbox/interactive_debugger.rst b/doc/sandbox/interactive_debugger.rst deleted file mode 100644 index c72fd3f206..0000000000 --- a/doc/sandbox/interactive_debugger.rst +++ /dev/null @@ -1,56 +0,0 @@ -==================== -Interactive Debugger -==================== - -'''Seed of discussion for what an interactive debugging tool might look like. 2009.03.27.''' - -== Interactive debugger ( #352 ) == - -The interactive debugger should allow the user to go step by step in a graph to debug it. It should allow setting breakpoints on arbitrary Ops or subgraphs. If we can group ops by the user's function that defined them, we could have a logical grouping of the graph into subgraphs. - -The debugger should save the inputs at each step so the user loses no info through inplace operations. Ideally, the debugger should be a normal python shell enriched with commands to control the flow and all the inputs should be made available so the user can use numpy interactively on them. - -Command wishlist - * py_perform (perform the current operation using the python implementation) - * c_perform (perform the current operation using the C implementation) - * perform (use the Linker's preference) - * get_inputs (get the inputs of the current op) - * set_inputs (set the inputs of the current op) - * get_outputs (get the outputs of the current op) - * set_outputs (set the outputs of the current op (bypasses its perform)) - * next (perform and go to the next breakpoint) - * breakpoint (set a breakpoint on the current Op or subgraph) - * step (perform and go to the next Op or subgraph) - * step_in (go to the first Op inside the current subgraph) - * step_out (exit the subgraph containing this Op) - * Of course, normal python shell functionality! - * The global context where the debugger was called (so the user can define his own helper functions, etc.) - -A good, simple way to do it would be to have those commands as methods of a structure that would be returned by a DebugLinker. 
This would allow an interactive session like the following: - -{{{ ->>> a, b, c = Tensor(), Tensor(), Tensor() ->>> d = b * c ->>> e = a + d ->>> debug = make_function(DebugLinker(FunctionGraph([a, b, c], [e]))) ->>> debug.set_breakpoint(d) ->>> debug.debug(10, 20, 30) # a, b, c = 10, 20, 30 -Now at: Mul(b, c) -Context: d = b * c ->>> debug.get_inputs() # we are at the node d = b * c -[20, 30] ->>> debug.get_outputs() -[None] ->>> debug.py_perform() ->>> debug.get_outputs() -[600] ->>> debug.step() -Now at: Add(a, Mul) -Context: e = a + d ->>> debug.get_inputs() -[30, 600] ->>> debug.step() -Finished. -[630] ->>> -}}} diff --git a/doc/sandbox/logistic_regression_example.rst b/doc/sandbox/logistic_regression_example.rst deleted file mode 100644 index 1631dcce1e..0000000000 --- a/doc/sandbox/logistic_regression_example.rst +++ /dev/null @@ -1,77 +0,0 @@ -.. _logistic_regression_example: - -State example -============= - -In this example, we'll look at a complete logistic regression model, with -training by gradient descent. - -BUT, YOU GOTTA RUN THIS CODE AND MAKE SURE IT STILL WORKS NICELY, HEY? - -.. code-block:: python - - def build_logistic_regression_model(n_in, n_out, l2_coef=30.0) - # DECLARE SOME VARIABLES - - import pytensor.tensor as pt - - x = pt.matrix() #our points, one point per row - y = pt.matrix() #store our labels as place codes (label 3 of 5 is vector [00100]) - - w = pt.matrix() #the linear transform to apply to our input points - b = pt.vector() #a vector of biases, which make our transform affine instead of linear - - stepsize = pt.scalar('stepsize') # a stepsize for gradient descent - - # REGRESSION MODEL AND COSTS TO MINIMIZE - - prediction = pt.softmax(pt.dot(x, w) + b) - cross_entropy = pt.sum(y * pt.log(prediction), axis=1) - cost = pt.sum(cross_entropy) + l2_coef * pt.sum(pt.sum(w*w)) - - # GET THE GRADIENTS NECESSARY TO FIT OUR PARAMETERS - - grad_w, grad_b = pt.grad(cost, [w, b]) - - # - # GET THE GRADIENTS NECESSARY TO FIT OUR PARAMETERS - - update_fn = pytensor.function( - inputs = [x, y, stepsize, - In(w, - name='w', - value=numpy.zeros((n_in, n_out)), - update=w - stepsize * grad_w, - mutable=True, - strict=True) - In(b, - name='b', - value=numpy.zeros(n_out), - update=b - lr * grad_b, - mutable=True, - strict=True) - ], - outputs = cost, - mode = 'EXPENSIVE_OPTIMIZATIONS') - - apply_fn = pytensor.function( - inputs = [x, In(w, value=update_fn.storage[w]), In(b, value=update_fn.storage[b])], - outputs = [prediction]) - - return update_fn, apply_fn - - #USUALLY THIS WOULD BE IN A DIFFERENT FUNCTION/CLASS - #FIT SOME DUMMY DATA: 100 points with 10 attributes and 3 potential labels - - up_fn, app_fn = build_logistic_regression_model(n_in=10, n_out=3, l2_coef=30.0) - - x_data = numpy.random.standard_normal((100, 10)) - y_data = numpy.random.standard_normal((100, 3)) - y_data = _asarray(y_data == numpy.max(y_data, axis=1), dtype='int64') - - print "Model Training ..." 
- for iteration in range(1000): - print " iter", iteration, "cost", update_fn(x_data, y_data, stepsize=0.0001) - - print "Model Predictions" - print apply_fn(x_data) diff --git a/doc/sandbox/performance.rst b/doc/sandbox/performance.rst deleted file mode 100644 index a62b00b345..0000000000 --- a/doc/sandbox/performance.rst +++ /dev/null @@ -1,23 +0,0 @@ - -=========== -Performance -=========== - -PyTensor uses several tricks to obtain good performance: - * common sub-expression elimination - * [custom generated] C code for many operations - * pre-allocation of temporary storage - * loop fusion (which gcc normally can't do) - -On my neural net experiments for my course projects, I was getting around 10x -speed improvements over basic numpy by using pytensor. -[More specific speed tests would be nice.] - - -With a little work, PyTensor could also implement more sophisticated -rewrites: - - * automatic ordering of matrix multiplications - * profile-based memory layout decisions (e.g. row-major vs. col-major) - * gcc intrinsics to use MMX, SSE2 parallelism for faster element-wise arithmetic - * conditional expressions diff --git a/doc/sandbox/randomnumbers.rst b/doc/sandbox/randomnumbers.rst deleted file mode 100644 index fcdded1c2b..0000000000 --- a/doc/sandbox/randomnumbers.rst +++ /dev/null @@ -1,245 +0,0 @@ -.. _sandbox_randnb: - -============== -Random Numbers -============== - -''' This has been implemented (#182). 20090327.''' - -= Random Numbers = - -== Requirements == - - -PyTensor functions sometimes need random numbers. -Random operations are not as simple as other operations such as ones_like, or pow(), because the output must be different when we call the same function repeatedly. CompileFunction's new default-valued, updatable input variables make this possible. At the same time we need random streams to be repeatable, and easy to work with. So the basic requirements of our random number mechanism are: - - 1. Internal random number generators must be used in a clear manner, and be accessible to the caller after a function has been compiled. - 1. A random-number-producing Op (from now on: {{{RandomOp}}}) should generally produce exactly the same stream of random numbers regardless of any other {{{RandomOp}}} instances in its own graph, and any other times the graph was compiled. - 1. A {{{RandomOp}}}'s stream should be isolated from other {{{RandomOp}}} instances in a compiled graph, so that it is possible to adjust any one {{{RandomOp}}} independently from the others. - 1. It should be easy to put the {{{RandomOp}}}s in a graph into a state from which their outputs are all independent. - 1. It should be easy to save the current state of the {{{RandomOp}}}s in a graph. - 1. It should be easy to re-instate a previous state of the {{{RandomOp}}}s in a graph. - -== Basic Technical Spec == - -One option would be to skirt the issue by requiring users to pass all the random numbers we might need as input. -However, it is not always simple to know how many random numbers will be required because the shape of a random matrix might be computed within the graph. -The solution proposed here is to pass one or more random number generators as input to {{{pytensor.function}}}. - -Sharing a random number generator between different {{{RandomOp}}} instances makes it difficult to producing the same stream regardless of other ops in graph, and to keep {{{RandomOps}}} isolated. -Therefore, each {{{RandomOp}}} instance in a graph will have its very own random number generator. 
-That random number generator is an input to the function. -In typical usage, we will use the new features of function inputs ({{{value}}}, {{{update}}}) to pass and update the rng for each {{{RandomOp}}}. -By passing RNGs as inputs, it is possible to use the normal methods of accessing function inputs to access each {{{RandomOp}}}'s rng. -In this approach it there is no pre-existing mechanism to work with the combined random number state of an entire graph. -So the proposal is to provide the missing functionality (the last three requirements) via auxiliary functions: {{{seed, getstate, setstate}}}. - -== Syntax == - -.. code-block:: python - - #!python - # create a random generator, providing a default seed to condition how RandomOp instances are produced. - from pytensor.compile.function import function - - - r = MetaRandom(metaseed=872364) - - # create a different random generator - rr = MetaRandom(metaseed=99) - - # create an Op to produce a stream of random numbers. - # This generates random numbers uniformly between 0.0 and 1.0 excluded - # u will remember that it was made from r. - u = r.uniform(shape=(3,4,5), low=0.0, high=1.0) - - # create a second Op for more random numbers - # v will remember that it was made from r. - v = r.uniform(shape=(8,), low=-1.0, high=0.0) - - # create a third Op with a different underlying random state - # w will remember that it was made from rr. - w = rr.uniform(shape=(), low=-10., high=10.) - - # compile a function to draw random numbers - # note: un-named state inputs will be added automatically. - # note: it is not necessary to draw samples for u, even though - # u was created by r before v. - fn_v = function([], [v]) - - # this prints some representation of v's rng in fn_v. - # The .rng property works for Result instances produced by MetaRandom. - print fn_v.state[v.rng] - - # compile a function to draw each of u, v, w - # note: un-named state inputs will be added automatically - # note: This function (especially its internal state) is independent from fn_v. - fn_uvw = function([], [u,v,w]) - - # N.B. The random number streams of fn_v and fn_uvw are independent. - assert fn_v.state[v.rng] != fn_uvw.state[v.rng] - - fn_v() # returns random numbers A (according to metaseed 872364) - fn_v() # returns different random numbers B - - # note that v's stream here is identical to the one in fn_v() - fn_uvw() # returns random numbers C, A, E - - #explicitly re-seed v's random stream in fn_v - r.seed(fn_v, 872364) - fn_v() # returns random numbers A (as above) - fn_v() # returns random numbers B (as above) - - #re-seed w's random stream in fn_uvw, but not u's or v's - rr.seed(fn_uvw, 99) - fn_uvw() # returns random numbers D, B, E - - -== {{{MetaRandom}}} == - -The {{{MetaRandom}}} class is the proposed interface for getting {{{RandomOp}}} instances. -There are some syntactic similarities in the way {{{MetaRandom}}} is used to construct graphs, and the way {{{numpy.RandomState}}} appears in a corresponding procedural implementation. But since pytensor is symbolic the meaning of {{{MetaRandom}}} is quite different. - -As with {{{numpy.RandomState}}} though, a global instance of {{{MetaRandom}}} will be instantiated at import time for the scripter's convenience. - -A {{{MetaRandom}}} instance will remember every {{{Result}}} that it returns during its lifetime. -When calling functions like {{{seed, setstate}}}, this list is consulted so that only the streams associated with Results returned by {{{self}}} are modified. 
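
As a footnote for today's readers: the requirements sketched above are now covered by ``RandomStream``. A minimal sketch, assuming the present-day ``pytensor.tensor.random.utils.RandomStream`` API rather than the ``MetaRandom`` interface proposed here:

.. code-block:: python

    import pytensor
    from pytensor.tensor.random.utils import RandomStream

    srng = RandomStream(seed=872364)        # plays the role of MetaRandom(metaseed=872364)
    v = srng.uniform(-1.0, 0.0, size=(8,))

    fn_v = pytensor.function([], v)         # the rng state rides along as implicit shared inputs
    a = fn_v()                              # successive calls advance the stream
    b = fn_v()

    srng.seed(872364)                       # re-seed every stream created by this RandomStream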
-The use of multiple {{{MetaRandom}}} objects in a single function is mostly for debugging (e.g., when you want to synchronize two sets of random number streams). - -The typical case is that only one (global) {{{MetaRandom}}} object is used to produce all the random streams in a function, so seeding (once) will reset the entire function. - -.. code-block:: python - - class MetaRandom(obj): - def __init__(self, metaseed=): ... # new functions will be initialized so that seed(fn, ) has no effect on output. - - def __contains__(self, Result): ... # True if Result was returned by a call to self. - def results(self): ... # Iterate over returned Result instances in creation order. - - def seed(self, fn, bits): ... # See below. - def getstate(self, fn): ... # See below. - def setstate(self, fn, state): ... # See below. - - def uniform(...): ... # return a Result of an Apply of a RandomOp. - # The return value is also stored internally for __contains__ and results(). - def normal(...): ... - def bernoulli(...): ... - ... - - -=== {{{MetaRandom.getstate}}} === - -.. code-block:: python - - def getstate(self, fn): ... - - ''return'':: - list, set, dict, instance... something to store the random number generators associated with every one of {{{self}}}'s members in {{{fn}}} - -=== {{{MetaRandom.setstate}}} === - -Re-install the random number generators in {{{rstates}}} to the {{{randomobj}}} members in {{{fn}} - -.. code-block:: python - - def setstate(self, fn, rstates): .... - - ''fn:: - a CompileFunction instance, generally with some Apply instances inside that are members of {{{self}}}. - ''rstates'':: - a structure returned by a previous call to {{{getstate}}} - ''return'':: - nothing - - -=== {{{MetaRandom.seed}}} === - -.. code-block:: python - - def seed(self, fn, bits): .... - - ''fn:: - a CompileFunction instance, generally with some Apply instances inside that are members of {{{self}}}. - ''bits'':: - Something to use as a seed. Typically an integer or list of integers. - ''return'':: - None - -Set the states of self's members in fn in a deterministic way based on bits. -Each member of self should generate independent samples after this call. - -Seed is like a dynamically-computed setstate. If the user runs -.. code-block:: python - - r.seed(fn, 99) - state_99 = r.getstate(fn) - -then any time afterward both {{{r.setstate(fn, state_99)}}} and {{{r.seed(fn, 99)}}} will put {{{fn}}} into the same state. - - - -= Potential Other syntax = - - -.. code-block:: python - - #!python - # create a random state - from pytensor.compile.function import function - - - r = RandomState(name = 'r') - - # create a different random state - rr = RandomState(name = 'rr') - - # create an Op to produce a stream of random numbers. - # That stream is a function of r's seed. - # This generates random numbers uniformly between 0.0 and 1.0 excluded - u = r.uniform(shape=(3,4,5), 0.0, 1.0) - - # create a second Op for more random numbers - # This stream is seeded using a different function of r's seed. - # u and v should be independent - v = r.uniform(shape=(8,), -1.0, 0.0) - - # create a third Op with a different underlying random state - w = rr.uniform(shape=(), -10., 10.) - - # compile a function to draw random numbers - # note: it is not necessary to draw samples for u. 
- # we provide the seed for the RandomState r in the inputs list as a "Type 4" input - fn_v = function([(r, 872364)], [v]) - - # compile a function to draw each of u, v, w - # we provide the seeds for the RandomStates r and rr in the inputs list as "Type 4" inputs - # note: the random state for r here is seeded independently from the one in fn_v, which means - # random number generation of fn_v and fn_uvw will not interfere. Since the seed is the - # same, it means they will produce the same sequence of tensors for the output v. - fn_uvw = function([(r, 872364), (rr, 99)], [u,v,w]) - - - fn_v() # returns random numbers A - fn_v() # returns different random numbers B - - # note that v's stream here is identical to the one in fn_v() - fn_uvw() # returns random numbers C, A, E - - #re-seed v's random stream in fn - fn_v.r = 872364 - - ### Is this state readable? What should we do here: - print fn_v.r - - fn() # returns random numbers A - - ### Is this state well-defined? - ### Does there even exist a number such that fn_v.r = N would have no effect on the rng states? - print fn_v.r - - fn() # returns random numbers B - - #re-seed w's random stream, but not u's or v's - fn_uvw.rr = 99 - fn_uvw() # returns random numbers D, B, E diff --git a/doc/sandbox/rethinkccodegen.rst b/doc/sandbox/rethinkccodegen.rst deleted file mode 100644 index 462f424452..0000000000 --- a/doc/sandbox/rethinkccodegen.rst +++ /dev/null @@ -1,124 +0,0 @@ -'''An open proposal. This is still relevant. 20080904''' - -====================== -New C code generation? -====================== - -Issues -====== - -There are several issues with the current way C code is generated: - * Ops cannot declare their own persistent variables. - * Reliance on weave, but most of weave's features go unused. - * There could easily be conflicts between support code from different Ops/Results. - * It is currently impossible to specialize support code based on the self. - * Caching of the generated code for graphs is greatly suboptimal. - -Structure -========= - -Currently, the general structure of the generated C code is approximately as follows: - -.. code-block:: c - - - - - - struct my_computation { - - - init() { } - cleanup { } - run { } - }; - - - PyObject* instantiate(PyObject* args) { - - - - } - - -The module produced via that method then has to be used as such:: - - obj = module.instantiate(error_storage, input_storage, output_storage, orphan_storage) - cutils.run_cthunk(obj) - - -We would like to get rid of weave dependencies, avoid name conflicts with the support code and have a nicer user interface for the produced module. The proposed new structure is as follows: - -.. code-block:: c - - - - struct op1 { - - - init() { } - cleanup { } - run() { } - }; - - struct op2 { }; - ... - struct opN { }; - - struct driver { - op1 o1; op2 o2; ... opN oN; - - - init() { } - cleanup() { } - run() { - - o1.run(input1, input2); - o2.run(o1.output1); - ... 
- oN.run(...); - - } - } - - PyObject* (PyObject* inputs) { - - - driver.run() - - - } - - PyObject* _driver(PyObject* storage) { - - - } - - and _driver> - -Gains: - * support code can be put inside a struct and become private to the Op - * we can export several functions that can be used directly, eg ``z = module.add(1, 2)`` - * this won't do filtering like ``Result.filter`` so the usefulness is limited by that - * the sequence of operations might be clearer to read - * we can use more descriptive names in each Op struct representing its input names (if we can find them using the inspect module) without worrying about name conflicts - -Losses: - * maybe gcc can't optimize it as well? - * make functions static and inline as much as possible - - -Caching -======= - -The current way of caching is from a hash of the generated code. That is inefficient because code has to be generated each time, which might be a costly process. Furthermore, usage of hashing in sets make it difficult to ensure a consistent ordering of Ops in graphs where several orderings are valid, so the generated C code is potentially different each time. Here is a proposal for a better way to compute the hash: - * Result_hash = Result version + Result desc - * Op_hash = Op version + Op desc + input/output hashes - * FunctionGraph_hash = FunctionGraph version + combination of the Op hashes and their traversal order wrt a consistent traversal method - -The version could be set explicitly via a ``__version__`` field or it could simply be equal to the file's last modification date. We could also have a ``__nocache__`` field indicating that code produced by the Op or Result cannot be cached. - -It should also be easier to bypass the cache (eg an option to CLinker to regenerate the code). - - - diff --git a/doc/sandbox/sandbox.rst b/doc/sandbox/sandbox.rst deleted file mode 100644 index 4ab3e78182..0000000000 --- a/doc/sandbox/sandbox.rst +++ /dev/null @@ -1,161 +0,0 @@ -Basically, this file contains stuff that should be documented, but is not. - -Feel free to contribute things that you want documented, as well as to add -or correct documentation. - - -====================================== -How do you define the grad function? -====================================== - -Let's talk about defining the :meth:`Op.grad` function in an :class:`Op`, using an -illustrative example. - -In Poisson regression (Ranzato and Szummer, 2008), the target *t* is -integer data, which we predict using a continuous output *o*. -In the negative log likelihood of the Poisson regressor, there is a term: - -.. math:: - - \log(t!) - -Let's say we write a logfactorial :class:`Op`. We then compute the gradient - -You should define gradient, even if it is undefined. -[give log factorial example] - -If an :class:`Op` does not define ``grad``, but this :class:`Op` does not appear in the path when -you compute the gradient, then there is no problem. - -If an :class:`Op` does not define ``grad``, and this :class:`Op` *does* appear in the path when -you compute the gradient, **WRITEME**. 
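
Picking up the bracketed request above, here is a rough sketch of what a ``logfactorial`` :class:`Op` with a defined gradient could look like (an illustration only, not library code; it uses ``scipy.special.gammaln`` for the forward pass, and relies on the fact that the derivative of :math:`\log(t!) = \log\Gamma(t+1)` is the digamma function :math:`\psi(t+1)`):

.. code-block:: python

    import scipy.special

    from pytensor.graph.basic import Apply
    from pytensor.graph.op import Op
    from pytensor.tensor import as_tensor_variable, psi


    class LogFactorial(Op):
        """Elementwise log(t!) = log(Gamma(t + 1)) with a defined gradient."""

        def make_node(self, t):
            t = as_tensor_variable(t)
            # Assume a floating-point input for simplicity
            return Apply(self, [t], [t.type.make_variable()])

        def perform(self, node, inputs, output_storage):
            (t,) = inputs
            output_storage[0][0] = scipy.special.gammaln(t + 1).astype(t.dtype)

        def grad(self, inputs, output_gradients):
            (t,) = inputs
            (gz,) = output_gradients
            # d/dt log Gamma(t + 1) is the digamma function psi(t + 1)
            return [gz * psi(t + 1)]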
- -Gradients for a particular variable can be one of four kinds: -1) forgot to implement it - -You will get an exception of the following form:: - - pytensor.graph.utils.MethodNotDefined: ('grad', , 'LogFactorial') - -2) a symbolic variable -3) None / zero -4) undefined mathematically - -currently, there is no way for a ``grad()`` method to distinguish between cases 3 -and 4 -but the distinction is important because graphs with type-3 gradients are ok -to run, whereas graphs with type-4 gradients are not. -so I suggested that Joseph return a type-4 gradient by defining an :class:`Op` with no -perform method. -the idea would be that this would suit the graph-construction phase, but would -prevent linking. -how does that sound to you? - -**This documentation is useful when we show users how to write :class:`Op`\s.** - -====================================== -What is staticmethod, st_impl? -====================================== - -``st_impl`` is an optional method in an :class:`Op`. -``@staticmethod`` is a Python decorator for a class method that does not -implicitly take the class instance as a first argument. Hence, st_impl -can be used for :class:`Op` implementations when no information from the :class:`Op` -instance is needed. This can be useful for testing an implementation. -See the ``XlogX`` class below for an example. - -**This documentation is useful when we show users how to write :class:`Op`\s. -Olivier says this behavior should be discouraged but I feel that st_impl -should be encouraged where possible.** - -============================================================ -how do we write scalar ops and upgrade them to tensor ops? -============================================================ - -**Olivier says that** :class:`~pytensor.tensor.xlogx.XlogX` **gives a good example. In fact, I would -like to beef xlogx up into our running example for demonstrating how to -write an :class:`Op`:** - -.. code-block:: python - - class XlogX(scalar.UnaryScalarOp): - """ - Compute X * log(X), with special case 0 log(0) = 0. - """ - @staticmethod - def st_impl(x): - if x == 0.0: - return 0.0 - return x * numpy.log(x) - def impl(self, x): - return XlogX.st_impl(x) - def grad(self, inp, grads): - x, = inp - gz, = grads - return [gz * (1 + scalar.log(x))] - def c_code(self, node, name, inp, out, sub): - x, = inp - z, = out - if node.inputs[0].type in [scalar.float32, scalar.float64]: - return """%(z)s = - %(x)s == 0.0 - ? 0.0 - : %(x)s * log(%(x)s);""" % locals() - raise NotImplementedError('only floatingpoint is implemented') - scalar_xlogx = XlogX(scalar.upgrade_to_float, name='scalar_xlogx') - xlogx = pytensor.tensor.elemwise.Elemwise(scalar_xlogx, name='xlogx') - -**It is also necessary to talk about UnaryScalarOp vs. BinaryOp.** - -UnaryScalarOp is the same as scalar.ScalarOp with member variable nin=1. -**give an example of this** - -======================================================= -How to use the `PrintOp` -======================================================= - -** This is also useful in the How to write an :class:`Op` tutorial. ** - -======================================================= -Mammouth -======================================================= - -**This is internal documentation. 
Guillaume can you make sure to hit these points:** - -export PYTENSOR_BLAS_LDFLAGS='-lmkl -liomp5 -fopenmp' - -**Do we want the following:** - -export OMP_NUM_THREADS=2 - -======================================================= -Type checking -======================================================= - - * Are there functions for doing type checking? - like dtype of this matrix is an int-type (not just int32 - or int64) - "if isinstance(item, int):" is the preferred way to do it in - python now, so mimic this - If the type is wrong, what exception should be raised? - -====================================== -More simple numpy stuff -====================================== - - * If we have a matrix with only one row, how do we convert it to a vector? - ``x.reshape(x.size)`` - You can also use ``resize`` but there is not reason to ''resize'' - * How do you convert the type of a numpy array? - ``pytensor._asarray(x, dtype = 'int32')`` - Note that using ``numpy.asarray`` is potentially dangerous, due to - a problem in numpy where the type may not be properly set (see - numpy's Track ticket #870). - - -========================================= -How to reuse (overwrite) a storage tensor -========================================= - -``pytensor.compile.io.Out(gw1, borrow = True)`` for that value in -``pytensor.compile.function.function`` diff --git a/doc/sandbox/software.rst b/doc/sandbox/software.rst deleted file mode 100644 index 12ccc68108..0000000000 --- a/doc/sandbox/software.rst +++ /dev/null @@ -1,19 +0,0 @@ -=============== -Others software -=============== - -Other software to look at and maybe recommend to users: - -* [http://www.pytables.org/moin PyTables] - This is looking really - promising for dataset storage and experiment logging... This might - actually be useful for large data sets. -* [http://matplotlib.sourceforge.net/ MatPlotLib] - visualization tools - (plot curves interactively, like matlab's figure window) -* [http://www.pythonware.com/products/pil/ PIL] - Python Image Library: - write your matrices out in png! (Kinda a weird recommendation, I think) -* [http://www.logilab.org/857 pylint] - Syntax checker for python to - help beautify your code. (We'd be hypocrites to recommend this :) -* [http://www.winpdb.org/ Winpdb] - A Platform Independent Python - Debugger. (Except it doesn't really help you debug PyTensor graphs) -* [http://wiki.python.org/moin/IntegratedDevelopmentEnvironments Python - Integrated Development Environments] - for all your coding needs diff --git a/doc/sandbox/sparse.rst b/doc/sandbox/sparse.rst deleted file mode 100644 index 27ccb8c449..0000000000 --- a/doc/sandbox/sparse.rst +++ /dev/null @@ -1,147 +0,0 @@ -.. _sparse: - -=============== -Sparse matrices -=============== - -scipy.sparse ------------- - -Note that you want SciPy >= 0.7.2 - -.. warning:: - - In SciPy 0.6, `scipy.csc_matrix.dot` has a bug with singleton - dimensions. There may be more bugs. It also has inconsistent - implementation of sparse matrices. - - We do not test against SciPy versions below 0.7.2. - -We describe the details of the compressed sparse matrix types. - `scipy.sparse.csc_matrix` - should be used if there are more rows than column (``shape[0] > shape[1]``). - `scipy.sparse.csr_matrix` - should be used if there are more columns than rows (``shape[0] < shape[1]``). - `scipy.sparse.lil_matrix` - is faster if we are modifying the array. After initial inserts, - we can then convert to the appropriate sparse matrix format. 
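
For example, the build-in-LIL-then-convert workflow just described looks roughly like this (a small SciPy-only sketch; the shape is arbitrary):

.. code-block:: python

    import scipy.sparse

    m = scipy.sparse.lil_matrix((5, 10))
    m[0, 0] = 10.0          # inserts are cheap in LIL format
    m[4, 0] = 20.0
    m[2, 3] = 30.0

    # Convert once the inserts are done; per the rule above,
    # shape[0] < shape[1] suggests CSR here.
    m_csr = m.tocsr()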
- -The following types also exist: - `dok_matrix` - Dictionary of Keys format. From their doc: This is an efficient structure for constructing sparse matrices incrementally. - `coo_matrix` - Coordinate format. From their lil doc: consider using the COO format when constructing large matrices. - -There seems to be a new format planned for SciPy 0.7.x: - `bsr_matrix` - Block Compressed Row (BSR). From their doc: The Block Compressed Row - (BSR) format is very similar to the Compressed Sparse Row (CSR) - format. BSR is appropriate for sparse matrices with dense sub matrices - like the last example below. Block matrices often arise in vector-valued - finite element discretizations. In such cases, BSR is considerably more - efficient than CSR and CSC for many sparse arithmetic operations. - `dia_matrix` - Sparse matrix with DIAgonal storage - -There are four member variables that comprise a compressed matrix ``sp`` (for at least csc, csr and bsr): - - ``sp.shape`` - gives the shape of the matrix. - ``sp.data`` - gives the values of the non-zero entries. For CSC, these should - be in order from (I think, not sure) reading down in columns, - starting at the leftmost column until we reach the rightmost - column. - ``sp.indices`` - gives the location of the non-zero entry. For CSC, this is the - row location. - ``sp.indptr`` - gives the other location of the non-zero entry. For CSC, there are - as many values of indptr as there are ``columns + 1`` in the matrix. - ``sp.indptr[k] = x`` and ``indptr[k+1] = y`` means that column - ``k`` contains ``sp.data[x:y]``, i.e. the ``x``-th through the y-1th non-zero values. - -See the example below for details. - -.. code-block:: python - - >>> import scipy.sparse - >>> sp = scipy.sparse.csc_matrix((5, 10)) - >>> sp[4, 0] = 20 - SparseEfficiencyWarning: changing the sparsity structure of a csc_matrix is expensive. lil_matrix is more efficient. - SparseEfficiencyWarning) - >>> sp[0, 0] = 10 - >>> sp[2, 3] = 30 - >>> sp.todense() - matrix([[ 10., 0., 0., 0., 0., 0., 0., 0., 0., 0.], - [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], - [ 0., 0., 0., 30., 0., 0., 0., 0., 0., 0.], - [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], - [ 20., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]) - >>> print sp - (0, 0) 10.0 - (4, 0) 20.0 - (2, 3) 30.0 - >>> sp.shape - (5, 10) - >>> sp.data - array([ 10., 20., 30.]) - >>> sp.indices - array([0, 4, 2], dtype=int32) - >>> sp.indptr - array([0, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3], dtype=int32) - -Several things should be learned from the above example: - -* We actually use the wrong sparse matrix type. In fact, it is the - *rows* that are sparse, not the columns. So, it would have been - better to use ``sp = scipy.sparse.csr_matrix((5, 10))``. -* We should have actually created the matrix as a `lil_matrix`, - which is more efficient for inserts. Afterwards, we should convert - to the appropriate compressed format. -* ``sp.indptr[0] = 0`` and ``sp.indptr[1] = 2``, which means that - column 0 contains ``sp.data[0:2]``, i.e. the first two non-zero values. -* ``sp.indptr[3] = 2`` and ``sp.indptr[4] = 3``, which means that column - three contains ``sp.data[2:3]``, i.e. the third non-zero value. - -TODO: Rewrite this documentation to do things in a smarter way. - -Speed ------ - -For faster sparse code: - * Construction: lil_format is fast for many inserts. - * Operators: "Since conversions to and from the COO format are - quite fast, you can use this approach to efficiently implement lots - computations on sparse matrices." 
(Nathan Bell on scipy mailing list) - -Misc ----- -The sparse equivalent of `dmatrix` is `csc_matrix` and `csr_matrix`. - -:class:`~pytensor.sparse.basic.Dot` vs. :class:`~pytensor.sparse.basic.StructuredDot` -------------------------------------------------------------------------------------- - -Often when you use a sparse matrix it is because there is a meaning to the -structure of non-zeros. The gradient on terms outside that structure -has no meaning, so it is computationally efficient not to compute them. - -`StructuredDot` is when you want the gradient to have zeroes corresponding to -the sparse entries in the matrix. - -`TrueDot` and `Structured` dot have different gradients -but their perform functions should be the same. - -The gradient of `TrueDot` can have non-zeros where the sparse matrix had zeros. -The gradient of `StructuredDot` can't. - -Suppose you have ``dot(x,w)`` where ``x`` and ``w`` are square matrices. -If ``w`` is dense, like ``standard_normal((5,5))`` and ``x`` is of full rank (though -potentially sparse, like a diagonal matrix of ones) then the output will -be dense too. -What's important is the density of the gradient on the output. -If the gradient on the output is dense, and ``w`` is dense (as we said it was) -then the ``True`` gradient on ``x`` will be dense. -If our dot is a `TrueDot`, then it will say that the gradient on ``x`` is dense. -If our dot is a `StructuredDot`, then it will say the gradient on ``x`` is only -defined on the diagonal and ignore the gradients on the off-diagonal. diff --git a/doc/sandbox/tensoroptools.rst b/doc/sandbox/tensoroptools.rst deleted file mode 100644 index 132924142f..0000000000 --- a/doc/sandbox/tensoroptools.rst +++ /dev/null @@ -1,9 +0,0 @@ - -.. _tensoroptools: - -================ -Tensor Op Tools -================ - -WRITEME - describe how to use Elemwise here - From 286c8fcb32c1b087cff40a314d8c5e4331eca12b Mon Sep 17 00:00:00 2001 From: Diego Sandoval <46681084+twaclaw@users.noreply.github.com> Date: Fri, 26 Jul 2024 16:06:27 +0200 Subject: [PATCH 52/72] Implement nlinalg Ops in PyTorch (#920) --- pytensor/link/pytorch/dispatch/__init__.py | 2 +- pytensor/link/pytorch/dispatch/nlinalg.py | 103 +++++++++++++++++++ tests/link/pytorch/test_nlinalg.py | 111 +++++++++++++++++++++ 3 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 pytensor/link/pytorch/dispatch/nlinalg.py create mode 100644 tests/link/pytorch/test_nlinalg.py diff --git a/pytensor/link/pytorch/dispatch/__init__.py b/pytensor/link/pytorch/dispatch/__init__.py index fa47908d74..0295a12e8e 100644 --- a/pytensor/link/pytorch/dispatch/__init__.py +++ b/pytensor/link/pytorch/dispatch/__init__.py @@ -9,5 +9,5 @@ import pytensor.link.pytorch.dispatch.extra_ops import pytensor.link.pytorch.dispatch.shape import pytensor.link.pytorch.dispatch.sort - +import pytensor.link.pytorch.dispatch.nlinalg # isort: on diff --git a/pytensor/link/pytorch/dispatch/nlinalg.py b/pytensor/link/pytorch/dispatch/nlinalg.py new file mode 100644 index 0000000000..91690489e9 --- /dev/null +++ b/pytensor/link/pytorch/dispatch/nlinalg.py @@ -0,0 +1,103 @@ +import torch + +from pytensor.link.pytorch.dispatch import pytorch_funcify +from pytensor.tensor.nlinalg import ( + SVD, + Det, + Eig, + Eigh, + KroneckerProduct, + MatrixInverse, + MatrixPinv, + QRFull, + SLogDet, +) + + +@pytorch_funcify.register(SVD) +def pytorch_funcify_SVD(op, **kwargs): + full_matrices = op.full_matrices + compute_uv = op.compute_uv + + def svd(x): + U, S, V = torch.linalg.svd(x, 
full_matrices=full_matrices) + if compute_uv: + return U, S, V + return S + + return svd + + +@pytorch_funcify.register(Det) +def pytorch_funcify_Det(op, **kwargs): + def det(x): + return torch.linalg.det(x) + + return det + + +@pytorch_funcify.register(SLogDet) +def pytorch_funcify_SLogDet(op, **kwargs): + def slogdet(x): + return torch.linalg.slogdet(x) + + return slogdet + + +@pytorch_funcify.register(Eig) +def pytorch_funcify_Eig(op, **kwargs): + def eig(x): + return torch.linalg.eig(x) + + return eig + + +@pytorch_funcify.register(Eigh) +def pytorch_funcify_Eigh(op, **kwargs): + uplo = op.UPLO + + def eigh(x, uplo=uplo): + return torch.linalg.eigh(x, UPLO=uplo) + + return eigh + + +@pytorch_funcify.register(MatrixInverse) +def pytorch_funcify_MatrixInverse(op, **kwargs): + def matrix_inverse(x): + return torch.linalg.inv(x) + + return matrix_inverse + + +@pytorch_funcify.register(QRFull) +def pytorch_funcify_QRFull(op, **kwargs): + mode = op.mode + if mode == "raw": + raise NotImplementedError("raw mode not implemented in PyTorch") + + def qr_full(x): + Q, R = torch.linalg.qr(x, mode=mode) + if mode == "r": + return R + return Q, R + + return qr_full + + +@pytorch_funcify.register(MatrixPinv) +def pytorch_funcify_Pinv(op, **kwargs): + hermitian = op.hermitian + + def pinv(x): + return torch.linalg.pinv(x, hermitian=hermitian) + + return pinv + + +@pytorch_funcify.register(KroneckerProduct) +def pytorch_funcify_KroneckerProduct(op, **kwargs): + def _kron(x, y): + return torch.kron(x, y) + + return _kron diff --git a/tests/link/pytorch/test_nlinalg.py b/tests/link/pytorch/test_nlinalg.py new file mode 100644 index 0000000000..7d69ac0500 --- /dev/null +++ b/tests/link/pytorch/test_nlinalg.py @@ -0,0 +1,111 @@ +import numpy as np +import pytest + +from pytensor.compile.function import function +from pytensor.configdefaults import config +from pytensor.graph.fg import FunctionGraph +from pytensor.tensor import nlinalg as pt_nla +from pytensor.tensor.type import matrix +from tests.link.pytorch.test_basic import compare_pytorch_and_py + + +@pytest.fixture +def matrix_test(): + rng = np.random.default_rng(213234) + + M = rng.normal(size=(3, 3)) + test_value = M.dot(M.T).astype(config.floatX) + + x = matrix("x") + return (x, test_value) + + +@pytest.mark.parametrize( + "func", + (pt_nla.eig, pt_nla.eigh, pt_nla.slogdet, pt_nla.inv, pt_nla.det), +) +def test_lin_alg_no_params(func, matrix_test): + x, test_value = matrix_test + + out = func(x) + out_fg = FunctionGraph([x], out if isinstance(out, list) else [out]) + + def assert_fn(x, y): + np.testing.assert_allclose(x, y, rtol=1e-3) + + compare_pytorch_and_py(out_fg, [test_value], assert_fn=assert_fn) + + +@pytest.mark.parametrize( + "mode", + ( + "complete", + "reduced", + "r", + pytest.param("raw", marks=pytest.mark.xfail(raises=NotImplementedError)), + ), +) +def test_qr(mode, matrix_test): + x, test_value = matrix_test + outs = pt_nla.qr(x, mode=mode) + out_fg = FunctionGraph([x], outs if isinstance(outs, list) else [outs]) + compare_pytorch_and_py(out_fg, [test_value]) + + +@pytest.mark.parametrize("compute_uv", [True, False]) +@pytest.mark.parametrize("full_matrices", [True, False]) +def test_svd(compute_uv, full_matrices, matrix_test): + x, test_value = matrix_test + + out = pt_nla.svd(x, full_matrices=full_matrices, compute_uv=compute_uv) + out_fg = FunctionGraph([x], out if isinstance(out, list) else [out]) + + compare_pytorch_and_py(out_fg, [test_value]) + + +def test_pinv(): + x = matrix("x") + x_inv = pt_nla.pinv(x) + + fgraph = 
FunctionGraph([x], [x_inv]) + x_np = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=config.floatX) + compare_pytorch_and_py(fgraph, [x_np]) + + +@pytest.mark.parametrize("hermitian", [False, True]) +def test_pinv_hermitian(hermitian): + A = matrix("A", dtype="complex128") + A_h_test = np.c_[[3, 3 + 2j], [3 - 2j, 2]] + A_not_h_test = A_h_test + 0 + 1j + + A_inv = pt_nla.pinv(A, hermitian=hermitian) + torch_fn = function([A], A_inv, mode="PYTORCH") + + assert np.allclose(torch_fn(A_h_test), np.linalg.pinv(A_h_test, hermitian=False)) + assert np.allclose(torch_fn(A_h_test), np.linalg.pinv(A_h_test, hermitian=True)) + + assert ( + np.allclose( + torch_fn(A_not_h_test), np.linalg.pinv(A_not_h_test, hermitian=False) + ) + is not hermitian + ) + + assert ( + np.allclose( + torch_fn(A_not_h_test), np.linalg.pinv(A_not_h_test, hermitian=True) + ) + is hermitian + ) + + +def test_kron(): + x = matrix("x") + y = matrix("y") + z = pt_nla.kron(x, y) + + fgraph = FunctionGraph([x, y], [z]) + x_np = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=config.floatX) + y_np = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=config.floatX) + + compare_pytorch_and_py(fgraph, [x_np, y_np]) From 70c902b78f6017d9225154aba4043abe7ee29f0a Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Wed, 17 Jul 2024 13:08:28 -0700 Subject: [PATCH 53/72] Update for m1 --- environment.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/environment.yml b/environment.yml index 95bb58c06c..033765302f 100644 --- a/environment.yml +++ b/environment.yml @@ -7,7 +7,7 @@ name: pytensor-dev channels: - conda-forge dependencies: - - python>=3.10 + - python=3.10 - compilers - numpy>=1.17.0,<2 - scipy>=0.14,<1.14.0 @@ -18,9 +18,7 @@ dependencies: - cons - pydeprecate # Intel BLAS - - mkl - - mkl-service - - libblas=*=*mkl + - libblas=*=*accelerate # numba backend - numba>=0.57 # For testing From bd607f3ab6bdba0aaac997bb9561dfac9edb4355 Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Sun, 21 Jul 2024 16:06:37 -0700 Subject: [PATCH 54/72] Add new env file --- environment-osx-arm64.yml | 53 +++++++++++++++++++++++++++++++++++++++ environment.yml | 6 +++-- 2 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 environment-osx-arm64.yml diff --git a/environment-osx-arm64.yml b/environment-osx-arm64.yml new file mode 100644 index 0000000000..033765302f --- /dev/null +++ b/environment-osx-arm64.yml @@ -0,0 +1,53 @@ +# To use: +# +# $ conda env create -f environment.yml # `mamba` works too for this command +# $ conda activate pytensor-dev +# +name: pytensor-dev +channels: + - conda-forge +dependencies: + - python=3.10 + - compilers + - numpy>=1.17.0,<2 + - scipy>=0.14,<1.14.0 + - filelock>=3.15 + - etuples + - logical-unification + - miniKanren + - cons + - pydeprecate + # Intel BLAS + - libblas=*=*accelerate + # numba backend + - numba>=0.57 + # For testing + - coveralls + - diff-cover + - mypy + - types-setuptools + - pytest + - pytest-cov + - pytest-xdist + - pytest-benchmark + - pytest-mock + - pip: + - pytest-sphinx + # For building docs + - sphinx>=5.1.0,<6 + - sphinx_rtd_theme + - pygments + - pydot + - ipython + - pymc-sphinx-theme + - sphinx-design + # code style + - ruff + # developer tools + - pandas # required to run mypy script + - pre-commit + - packaging + # optional + - cython + - graphviz + - pydot diff --git a/environment.yml b/environment.yml index 033765302f..95bb58c06c 100644 --- a/environment.yml +++ b/environment.yml @@ -7,7 +7,7 @@ name: pytensor-dev channels: - conda-forge dependencies: - - python=3.10 + - 
python>=3.10 - compilers - numpy>=1.17.0,<2 - scipy>=0.14,<1.14.0 @@ -18,7 +18,9 @@ dependencies: - cons - pydeprecate # Intel BLAS - - libblas=*=*accelerate + - mkl + - mkl-service + - libblas=*=*mkl # numba backend - numba>=0.57 # For testing From 3249ae2b58dafdf2df1e58a2f0ab742c6603a786 Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Sun, 21 Jul 2024 16:07:57 -0700 Subject: [PATCH 55/72] Update comment --- environment-osx-arm64.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment-osx-arm64.yml b/environment-osx-arm64.yml index 033765302f..1ab4c81a36 100644 --- a/environment-osx-arm64.yml +++ b/environment-osx-arm64.yml @@ -17,7 +17,7 @@ dependencies: - miniKanren - cons - pydeprecate - # Intel BLAS + # Apple BLAS - libblas=*=*accelerate # numba backend - numba>=0.57 From d2ad1ed25bf618592704e9e874a901b34b4d5720 Mon Sep 17 00:00:00 2001 From: Thomas Wiecki Date: Mon, 22 Jul 2024 11:55:55 +0200 Subject: [PATCH 56/72] Update environment-osx-arm64.yml Co-authored-by: Ben Mares --- environment-osx-arm64.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment-osx-arm64.yml b/environment-osx-arm64.yml index 1ab4c81a36..0d624aa55c 100644 --- a/environment-osx-arm64.yml +++ b/environment-osx-arm64.yml @@ -7,7 +7,7 @@ name: pytensor-dev channels: - conda-forge dependencies: - - python=3.10 + - python=>3.10 - compilers - numpy>=1.17.0,<2 - scipy>=0.14,<1.14.0 From f11df4afc5ce83c6cd791567e22093c1c5cce71a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Jul 2024 17:32:59 +0000 Subject: [PATCH 57/72] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.4 → v0.5.5](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.4...v0.5.5) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b34d53b80..118a371e78 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: )$ - id: check-merge-conflict - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.4 + rev: v0.5.5 hooks: - id: ruff args: ["--fix", "--output-format=full"] From a7c099c30f693bc7c5333cd254f85c5d4421c467 Mon Sep 17 00:00:00 2001 From: Jesse Grabowski Date: Fri, 19 Apr 2024 17:57:34 +0200 Subject: [PATCH 58/72] Implement Einsum Co-authored-by: Adrian Seyboldt Co-authored-by: Jesse Grabowski <48652735+jessegrabowski@users.noreply.github.com> Co-authored-by: Ricardo Vieira <28983449+ricardov94@users.noreply.github.com> Co-authored-by: Rob Zinkov --- pytensor/link/jax/dispatch/__init__.py | 1 + pytensor/link/jax/dispatch/einsum.py | 20 + pytensor/tensor/__init__.py | 1 + pytensor/tensor/basic.py | 34 +- pytensor/tensor/einsum.py | 760 +++++++++++++++++++++++ pytensor/tensor/functional.py | 2 +- pytensor/tensor/rewriting/__init__.py | 3 +- pytensor/tensor/rewriting/basic.py | 121 ++++ pytensor/tensor/rewriting/blockwise.py | 133 ++-- pytensor/tensor/rewriting/einsum.py | 53 ++ pytensor/tensor/rewriting/ofg.py | 27 +- pytensor/tensor/rewriting/shape.py | 120 ++-- pytensor/tensor/shape.py | 6 +- tests/link/jax/test_einsum.py | 38 ++ tests/tensor/rewriting/test_blockwise.py | 37 +- tests/tensor/rewriting/test_einsum.py | 39 ++ tests/tensor/rewriting/test_shape.py | 46 ++ tests/tensor/test_basic.py | 4 +- tests/tensor/test_einsum.py | 263 ++++++++ 
tests/tensor/test_shape.py | 12 +- 20 files changed, 1569 insertions(+), 151 deletions(-) create mode 100644 pytensor/link/jax/dispatch/einsum.py create mode 100644 pytensor/tensor/einsum.py create mode 100644 pytensor/tensor/rewriting/einsum.py create mode 100644 tests/link/jax/test_einsum.py create mode 100644 tests/tensor/rewriting/test_einsum.py create mode 100644 tests/tensor/test_einsum.py diff --git a/pytensor/link/jax/dispatch/__init__.py b/pytensor/link/jax/dispatch/__init__.py index f4098416b8..00976f221c 100644 --- a/pytensor/link/jax/dispatch/__init__.py +++ b/pytensor/link/jax/dispatch/__init__.py @@ -4,6 +4,7 @@ # Load dispatch specializations import pytensor.link.jax.dispatch.blas import pytensor.link.jax.dispatch.blockwise +import pytensor.link.jax.dispatch.einsum import pytensor.link.jax.dispatch.elemwise import pytensor.link.jax.dispatch.extra_ops import pytensor.link.jax.dispatch.pad diff --git a/pytensor/link/jax/dispatch/einsum.py b/pytensor/link/jax/dispatch/einsum.py new file mode 100644 index 0000000000..3080f6964f --- /dev/null +++ b/pytensor/link/jax/dispatch/einsum.py @@ -0,0 +1,20 @@ +import jax.numpy as jnp + +from pytensor.link.jax.dispatch import jax_funcify +from pytensor.tensor.einsum import Einsum + + +@jax_funcify.register(Einsum) +def jax_funcify_Einsum(op, **kwargs): + """Dispatch einsum to JAX. + + This dispatch is triggered only when we couldn't optimize einsum at the PyTensor level. + This happens when some of the dimension lengths are unknown. This is never a problem in JAX, + as it always compiles a function per runtime input shape. + """ + subscripts = op.subscripts + + def einsum(*operands): + return jnp.einsum(subscripts, *operands, optimize="optimal") + + return einsum diff --git a/pytensor/tensor/__init__.py b/pytensor/tensor/__init__.py index 81cabfa6bd..7385f02478 100644 --- a/pytensor/tensor/__init__.py +++ b/pytensor/tensor/__init__.py @@ -151,6 +151,7 @@ def _get_vector_length_Constant(op: Op | Variable, var: Constant) -> int: # isort: off +from pytensor.tensor.einsum import einsum from pytensor.tensor.functional import vectorize # isort: on diff --git a/pytensor/tensor/basic.py b/pytensor/tensor/basic.py index 119c44c647..9eaa04c522 100644 --- a/pytensor/tensor/basic.py +++ b/pytensor/tensor/basic.py @@ -1700,21 +1700,22 @@ def do_constant_folding(self, fgraph, node): return False for client, idx in clients: - if isinstance(client.op, Output): + client_op = client.op + if isinstance(client_op, Output): # If the output is a constant, it will have to be deepcopied # each time the function is called. So we do not fold. return False - # Allow alloc to be lifted out of Elemwise before constant folding it - elif isinstance(client.op, Elemwise): - return None + # Op's through which Alloc can be lifted + elif isinstance(client_op, Elemwise | DimShuffle | Alloc | Join): + return False # Same for Blockwise, unless it has no batch_dims - elif isinstance(client.op, Blockwise) and client.op.batch_ndim(client): - return None + elif isinstance(client_op, Blockwise) and client.op.batch_ndim(client): + return False elif ( # The following ops work inplace of their input id 0. 
idx == 0 and isinstance( - client.op, + client_op, pytensor.tensor.subtensor.IncSubtensor | pytensor.tensor.subtensor.AdvancedIncSubtensor1 | pytensor.tensor.subtensor.AdvancedIncSubtensor @@ -2035,10 +2036,15 @@ def transpose(x, axes=None): _x = as_tensor_variable(x) if axes is None: - axes = list(range((_x.type.ndim - 1), -1, -1)) + axes = tuple(range((_x.type.ndim - 1), -1, -1)) + + if tuple(axes) == tuple(range(len(axes))): + # No-op + return _x + ret = DimShuffle(tuple(s == 1 for s in _x.type.shape), axes)(_x) - if _x.name and axes == list(range((_x.type.ndim - 1), -1, -1)): + if _x.name and axes == tuple(range((_x.type.ndim - 1), -1, -1)): ret.name = _x.name + ".T" return ret @@ -3950,6 +3956,10 @@ def moveaxis( source = normalize_axis_tuple(source, a.ndim, "source") destination = normalize_axis_tuple(destination, a.ndim, "destination") + if source == destination: + # It's a no-op + return a + if len(source) != len(destination): raise ValueError( "`source` and `destination` arguments must have the same number of elements" @@ -4260,9 +4270,7 @@ def atleast_Nd( atleast_3d = partial(atleast_Nd, n=3) -def expand_dims( - a: np.ndarray | TensorVariable, axis: tuple[int, ...] -) -> TensorVariable: +def expand_dims(a: np.ndarray | TensorVariable, axis: Sequence[int]) -> TensorVariable: """Expand the shape of an array. Insert a new axis that will appear at the `axis` position in the expanded @@ -4281,7 +4289,7 @@ def expand_dims( """ a = as_tensor(a) - if not isinstance(axis, tuple | list): + if not isinstance(axis, Sequence): axis = (axis,) out_ndim = len(axis) + a.ndim diff --git a/pytensor/tensor/einsum.py b/pytensor/tensor/einsum.py new file mode 100644 index 0000000000..79151a91a2 --- /dev/null +++ b/pytensor/tensor/einsum.py @@ -0,0 +1,760 @@ +import collections +import warnings +from collections.abc import Sequence +from functools import partial, reduce +from itertools import pairwise +from typing import cast + +import numpy as np +from numpy.core.einsumfunc import _find_contraction, _parse_einsum_input # type: ignore +from numpy.core.numeric import ( # type: ignore + normalize_axis_index, + normalize_axis_tuple, +) + +from pytensor.compile.builders import OpFromGraph +from pytensor.tensor import TensorLike +from pytensor.tensor.basic import ( + arange, + as_tensor, + expand_dims, + get_vector_length, + moveaxis, + stack, + transpose, + where, +) +from pytensor.tensor.extra_ops import broadcast_to +from pytensor.tensor.functional import vectorize +from pytensor.tensor.math import and_, eq, tensordot +from pytensor.tensor.shape import shape_padright +from pytensor.tensor.variable import TensorVariable + + +PATH = tuple[tuple[int] | tuple[int, int], ...] + + +class Einsum(OpFromGraph): + """ + Wrapper Op for Einsum graphs + + Notes + ----- + The `optimized` prop indicates whether the inner graph was optimized, which can only be done when all shapes are + statically known. This is now determined at graph creation time only. We could introduce a rewrite that tries to + optimize the graph if static shapes become known later (e.g., after use of `clone_replace` or shape inference during + rewrites). + + Also, once the graph is optimized, it could be inlined for potential further optimization that consider the rest of + the graph. + + This prop is different from the `optimize` kwarg in numpy that determines what kind (if any) of optimization is + desired. We haven't decided whether we want to provide this functionality. 
+ """ + + __props__ = ("subscripts", "path", "optimized") + + def __init__(self, *args, subscripts: str, path: PATH, optimized: bool, **kwargs): + self.subscripts = subscripts + self.path = path + self.optimized = optimized + super().__init__(*args, **kwargs, strict=True) + + +def _iota(shape: TensorVariable, axis: int) -> TensorVariable: + """ + Create an array with values increasing along the specified axis. + + Iota is a multidimensional generalization of the `arange` function. The returned array is filled with whole numbers + increasing along the specified axis. + + Parameters + ---------- + shape: TensorVariable + The shape of the array to be created. + axis: int + The axis along which to fill the array with increasing values. + + Returns + ------- + TensorVariable + An array with values increasing along the specified axis. + + Examples + -------- + In the simplest case where ``shape`` is 1d, the output will be equivalent to ``pt.arange``: + + .. testcode:: + + import pytensor.tensor as pt + from pytensor.tensor.einsum import _iota + + shape = pt.as_tensor((5,)) + print(_iota(shape, 0).eval()) + + .. testoutput:: + + [0 1 2 3 4] + + In higher dimensions, it will look like many concatenated `arange`: + + .. testcode:: + + shape = pt.as_tensor((5, 5)) + print(_iota(shape, 1).eval()) + + .. testoutput:: + + [[0 1 2 3 4] + [0 1 2 3 4] + [0 1 2 3 4] + [0 1 2 3 4] + [0 1 2 3 4]] + + Setting ``axis=0`` above would result in the transpose of the output. + """ + len_shape = get_vector_length(shape) + axis = normalize_axis_index(axis, len_shape) + values = arange(shape[axis]) + return broadcast_to(shape_padright(values, len_shape - axis - 1), shape) + + +def _delta(shape: TensorVariable, axes: Sequence[int]) -> TensorVariable: + """ + Create a Kroncker delta tensor. + + The Kroncker delta function is defined: + + .. math:: + + \\delta(i, j) = \begin{cases} 1 & \text{if} \\quad i = j \\ 0 & \text{otherwise} \\end{cases} + + To create a Kronecker tensor, the delta function is applied elementwise to the axes specified. The result is a + tensor of booleans, with ``True`` where the axis indices coincide, and ``False`` otherwise. See below for examples. + + Parameters + ---------- + shape: TensorVariable + The shape of the tensor to be created. Note that `_delta` is not defined for 1d tensors, because there is no + second axis against which to compare. + axes: sequence of int + Axes whose indices should be compared. Note that `_delta` is not defined for a single axis, because there is no + second axis against which to compare. + + Examples + -------- + An easy case to understand is when the shape is square and the number of axes is equal to the number of dimensions. + This will result in a generalized identity tensor, with ``True`` along the main diagonal: + + .. testcode:: + + from pytensor.tensor.einsum import _delta + print(_delta((5, 5), (0, 1)).eval()) + + .. testoutput:: + + [[ True False False False False] + [False True False False False] + [False False True False False] + [False False False True False] + [False False False False True]] + + In the case where the shape is not square, the result will be a tensor with ``True`` along the main diagonal and + ``False`` elsewhere: + + .. testcode:: + + from pytensor.tensor.einsum import _delta + print(_delta((3, 2), (0, 1)).eval()) + + .. 
testoutput:: + + [[ True False] + [False True] + [False False]] + + When there are more than two dimensions in the shape, axes can be only a subset of them, leading to different + arragements of True and False values. For example for a 3d batch of matrices, choosing axes (0, 2) will lead to + True values on the column corresponding to the batch index of each matrix: + + .. testcode:: + + from pytensor.tensor.einsum import _delta + print(_delta((3, 3, 3), (0, 2)).eval()) + + .. testoutput:: + + [[[ True False False] + [ True False False] + [ True False False]] + + [[False True False] + [False True False] + [False True False]] + + [[False False True] + [False False True] + [False False True]]] + """ + if len(axes) == 1: + raise ValueError("Need at least two axes to create a delta tensor") + base_shape = stack([shape[axis] for axis in axes]) + iotas = [_iota(base_shape, i) for i in range(len(axes))] + eyes = [eq(i1, i2) for i1, i2 in pairwise(iotas)] + result = reduce(and_, eyes) + non_axes = [i for i in range(len(tuple(shape))) if i not in axes] + return broadcast_to(expand_dims(result, non_axes), shape) + + +def _general_dot( + vars: tuple[TensorVariable, TensorVariable], + axes: Sequence[Sequence[int]], # Should be length 2, + batch_axes: Sequence[Sequence[int]], # Should be length 2, +) -> TensorVariable: + """ + Generalized dot product between two tensors. + + Ultimately ``_general_dot`` is a call to `tensor_dot`, performing a multiply-and-sum ("dot") operation between two + tensors, along a requested dimension. This function further generalizes this operation by allowing arbitrary + batch dimensions to be specified for each tensor. + + + Parameters + ---------- + vars: tuple[TensorVariable, TensorVariable] + The tensors to be ``tensor_dot``ed + axes: Sequence[Sequence[int]] + The axes along which to perform the dot product. Should be a sequence of two sequences, one for each tensor. + batch_axes: Sequence[Sequence[int]] + The batch axes for each tensor. Should be a sequence of two sequences, one for each tensor. + + Returns + ------- + TensorVariable + The result of the ``tensor_dot`` product. + + Examples + -------- + Perform a batched dot product between two 3d tensors: + + .. testcode:: + + import pytensor.tensor as pt + from pytensor.tensor.einsum import _general_dot + import numpy as np + + A = pt.tensor(shape=(3, 4, 5)) + B = pt.tensor(shape=(3, 5, 2)) + + result = _general_dot((A, B), axes=[[2], [1]], batch_axes=[[0], [0]]) + + A_val = np.empty((3, 4, 5)) + B_val = np.empty((3, 5, 2)) + print(tuple(result.shape.eval({A:A_val, B:B_val}))) + + .. testoutput:: + + (3, 4, 2) + """ + # Shortcut for non batched case + if not batch_axes[0] and not batch_axes[1]: + return tensordot(*vars, axes=axes) + + # Normalize axes, thankfully numpy helper does not sort axis! 
+ axes = [ + normalize_axis_tuple(var_axes, var.ndim) + for var, var_axes in zip(vars, axes, strict=True) + ] + batch_axes = [ + normalize_axis_tuple(var_axes, var.ndim) + for var, var_axes in zip(vars, batch_axes, strict=True) + ] + n_batch_axes = [len(var_batch_axes) for var_batch_axes in batch_axes] + + # Move batch axes to the left and recode reduction axes + new_vars = list(vars) + new_axes = list(axes) + for i, (var, var_axes, var_batch_axes, var_n_batch_axes) in enumerate( + zip(vars, axes, batch_axes, n_batch_axes, strict=True) + ): + if var_batch_axes == tuple(range(var_n_batch_axes)): + # Already on left to right order + continue + + new_var_batch_axes = tuple(range(var_n_batch_axes)) + new_var = moveaxis(var, var_batch_axes, new_var_batch_axes) + + new_var_axes = [] + for var_axis in var_axes: + batch_axes_to_the_right = len( + [batch_axis for batch_axis in var_batch_axes if batch_axis > var_axis] + ) + new_var_axes.append(var_axis + batch_axes_to_the_right) + + new_vars[i] = new_var + new_axes[i] = new_var_axes + + lhs, rhs = new_vars + lhs_axes, rhs_axes = new_axes + lhs_n_batch_axes, rhs_n_batch_axes = n_batch_axes + + # Create signature of tensordot + lhs_signature = [f"l{i}" for i in range(lhs.type.ndim)] + rhs_signature = [f"r{i}" for i in range(rhs.type.ndim)] + # Aligned axes get the same dimension name + for i, (lhs_axis, rhs_axis) in enumerate(zip(lhs_axes, rhs_axes)): + lhs_signature[lhs_axis] = rhs_signature[rhs_axis] = f"a{i}" + # Trim away the batch ndims + lhs_signature = lhs_signature[lhs_n_batch_axes:] + rhs_signature = rhs_signature[rhs_n_batch_axes:] + out_signature = [ + lhs_dim for lhs_dim in lhs_signature if not lhs_dim.startswith("a") + ] + [rhs_dim for rhs_dim in rhs_signature if not rhs_dim.startswith("a")] + signature = f"({','.join(lhs_signature)}),({','.join(rhs_signature)})->({','.join(out_signature)})" + # Adjust axes for core case + core_lhs_axes = tuple(np.array(lhs_axes) - lhs_n_batch_axes) + core_rhs_axes = tuple(np.array(rhs_axes) - rhs_n_batch_axes) + + if signature == "(),()->()": + # Just a multiplication + out = lhs * rhs + else: + out = vectorize( + partial(tensordot, axes=[core_lhs_axes, core_rhs_axes]), signature=signature + )(lhs, rhs) + + return cast(TensorVariable, out) + + +def _contraction_list_from_path( + subscripts: str, operands: Sequence[TensorVariable], path: PATH +): + """ + Generate a list of contraction steps based on the provided einsum path. + + Code adapted from einsum_opt: https://github.com/dgasmith/opt_einsum/blob/94c62a05d5ebcedd30f59c90b9926de967ed10b5/opt_einsum/contract.py#L369 + + When all shapes are known, the linked einsum_opt implementation is preferred. This implementation is used when + some or all shapes are not known. As a result, contraction will (always?) be done left-to-right, pushing intermediate + results to the end of the stack. + + Parameters + ---------- + subscripts: str + Einsum signature string describing the computation to be performed. + + operands: Sequence[TensorLike] + Tensors described by the subscripts. + + path: tuple[tuple[int] | tuple[int, int]] + A list of tuples, where each tuple describes the indices of the operands to be contracted, sorted in the order + they should be contracted. + + Returns + ------- + contraction_list: list + A list of tuples, where each tuple describes a contraction step. 
Each tuple contains the following elements: + - contraction_inds: tuple[int] + The indices of the operands to be contracted + - idx_removed: str + The indices of the contracted indices (those removed from the einsum string at this step) + - einsum_str: str + The einsum string for the contraction step + - remaining: None + The remaining indices. Included to match the output of opt_einsum.contract_path, but not used. + - do_blas: None + Whether to use blas to perform this step. Included to match the output of opt_einsum.contract_path, + but not used. + """ + fake_operands = [ + np.zeros([1 if dim == 1 else 0 for dim in x.type.shape]) for x in operands + ] + input_subscripts, output_subscript, operands = _parse_einsum_input( + (subscripts, *fake_operands) + ) + + # Build a few useful list and sets + input_list = input_subscripts.split(",") + input_sets = [set(x) for x in input_list] + output_set = set(output_subscript) + + # Build contraction tuple (positions, gemm, einsum_str, remaining) + contraction_list = [] + for cnum, contract_inds in enumerate(path): + # Make sure we remove inds from right to left + contract_inds = cast( + tuple[int] | tuple[int, int], tuple(sorted(contract_inds, reverse=True)) + ) + + contract_tuple = _find_contraction(contract_inds, input_sets, output_set) + out_inds, input_sets, idx_removed, idx_contract = contract_tuple + + tmp_inputs = [input_list.pop(x) for x in contract_inds] + + # Last contraction + if (cnum - len(path)) == -1: + idx_result = output_subscript + else: + # use tensordot order to minimize transpositions + all_input_inds = "".join(tmp_inputs) + idx_result = "".join(sorted(out_inds, key=all_input_inds.find)) + + input_list.append(idx_result) + einsum_str = ",".join(tmp_inputs) + "->" + idx_result + + # We only need the first three inputs to build the forward graph + contraction = (contract_inds, idx_removed, einsum_str, None, None) + contraction_list.append(contraction) + + return contraction_list + + +def einsum(subscripts: str, *operands: "TensorLike", optimize=None) -> TensorVariable: + """ + Multiplication and summation of tensors using the Einstein summation convention. + + Code adapted from JAX: https://github.com/google/jax/blob/534d32a24d7e1efdef206188bb11ae48e9097092/jax/_src/numpy/lax_numpy.py#L5283 + + Einsum allows the user to specify a wide range of operations on tensors using the Einstein summation convention. Using + this notation, many common linear algebraic operations can be succinctly described on higher order tensors. + + Parameters + ---------- + subscripts: str + Einsum signature string describing the computation to be performed. + + operands: sequence of TensorVariable + Tensors to be multiplied and summed. + + Returns + ------- + TensorVariable + The result of the einsum operation. + + See Also + -------- + pytensor.tensor.tensordot: Generalized dot product between two tensors + pytensor.tensor.dot: Matrix multiplication between two tensors + numpy.einsum: The numpy implementation of einsum + + Examples + -------- + Inputs to `pt.einsum` are a string describing the operation to be performed (the "subscripts"), and a sequence of + tensors to be operated on. The string must follow the following rules: + + 1. The string gives inputs and (optionally) outputs. Inputs and outputs are separated by "->". + 2. The input side of the string is a comma-separated list of indices. For each comma-separated index string, there + must be a corresponding tensor in the input sequence. + 3. 
For each index string, the number of dimensions in the corresponding tensor must match the number of characters
+       in the index string.
+    4. Indices are arbitrary strings of characters. If an index appears multiple times in the input side, it must have
+       the same shape in each input.
+    5. The indices on the output side must be a subset of the indices on the input side -- you cannot introduce new
+       indices in the output.
+    6. Ellipses ("...") can be used to elide multiple indices. This is useful when you have a large number of "batch"
+       dimensions that are not implicated in the operation.
+
+    Finally, two rules about these indices govern how computation is carried out:
+
+    1. Repeated indices on the input side indicate how the tensor should be "aligned" for multiplication.
+    2. Indices that appear on the input side but not the output side are summed over.
+
+    The operation of these rules is best understood via examples:
+
+    Example 1: Matrix multiplication
+
+    .. code-block:: python
+
+        import pytensor.tensor as pt
+        A = pt.matrix("A")
+        B = pt.matrix("B")
+        C = pt.einsum("ij, jk -> ik", A, B)
+
+    This computation is equivalent to :code:`C = A @ B`. Notice that the ``j`` index is repeated on the input side of the
+    signature, and does not appear on the output side. This indicates that the ``j`` dimension of the first tensor should be
+    multiplied with the ``j`` dimension of the second tensor, and the resulting tensor's ``j`` dimension should be summed
+    away.
+
+    Example 2: Batched matrix multiplication
+
+    .. code-block:: python
+
+        import pytensor.tensor as pt
+        A = pt.tensor("A", shape=(None, 4, 5))
+        B = pt.tensor("B", shape=(None, 5, 6))
+        C = pt.einsum("bij, bjk -> bik", A, B)
+
+    This computation is also equivalent to :code:`C = A @ B` because of PyTensor's built-in broadcasting rules, but
+    the einsum signature is more explicit about the batch dimensions. The ``b`` and ``j`` indices are repeated on the
+    input side. Unlike ``j``, the ``b`` index is also present on the output side, indicating that the batch dimension
+    should **not** be summed away. As a result, multiplication will be performed over the ``b, j`` dimensions, and then
+    the ``j`` dimension will be summed over. The resulting tensor will have shape ``(None, 4, 6)``.
+
+    Example 3: Batched matrix multiplication with ellipses
+
+    .. code-block:: python
+
+        import pytensor.tensor as pt
+        A = pt.tensor("A", shape=(4, None, None, None, 5))
+        B = pt.tensor("B", shape=(5, None, None, None, 6))
+        C = pt.einsum("i...j, j...k -> ...ik", A, B)
+
+    This case is the same as above, but inputs ``A`` and ``B`` have multiple batch dimensions. To avoid writing out all
+    of the batch dimensions (which we do not care about), we can use ellipses to elide over these dimensions. Notice
+    also that we are not required to "sort" the input dimensions in any way. In this example, we are doing a dot
+    between the last dimension of A and the first dimension of B, which is perfectly valid.
+
+    Example 4: Outer product
+
+    .. code-block:: python
+
+        import pytensor.tensor as pt
+        x = pt.tensor("x", shape=(3,))
+        y = pt.tensor("y", shape=(4,))
+        z = pt.einsum("i, j -> ij", x, y)
+
+    This computation is equivalent to :code:`pt.outer(x, y)`. Notice that no indices are repeated on the input side,
+    and the output side has two indices. Since there are no indices to align on, the einsum operation will simply
+    multiply the two tensors elementwise, broadcasting dimensions ``i`` and ``j``.
+
+    Example 5: Convolution
+
+    .. code-block:: python
+
+        import pytensor.tensor as pt
+        x = pt.tensor("x", shape=(None, None, None, None, None, None))
+        w = pt.tensor("w", shape=(None, None, None, None))
+        y = pt.einsum("bchwkt,fckt->bfhw", x, w)
+
+    Given a batch of images ``x`` with dimensions ``(batch, channel, height, width, kernel_size, kernel_size)``
+    and a filter ``w``, with dimensions ``(num_filters, channels, kernel_size, kernel_size)``, this einsum operation
+    computes the convolution of ``x`` with ``w``. Multiplication is aligned on the channel and kernel_size
+    dimensions, which are then summed away. The resulting tensor has shape ``(batch, num_filters, height, width)``,
+    reflecting the fact that information from each channel has been mixed together.
+    """
+
+    if optimize is not None:
+        raise NotImplementedError(
+            "Optimize kwarg is not implemented in PyTensor. "
+            "By default, PyTensor will always optimize the graph if the inputs have static shapes.\n"
+            "If you need this functionality open an issue in https://github.com/pymc-devs/pytensor/issues to let us know. "
+        )
+
+    # TODO: Is this doing something clever about unknown shapes?
+    # contract_path = _poly_einsum_handlers.get(ty, _default_poly_einsum_handler)
+    tensor_operands = [as_tensor(operand) for operand in operands]
+    shapes = [operand.type.shape for operand in tensor_operands]
+
+    path: PATH
+    if any(None in shape for shape in shapes):
+        # Case 1: At least one of the operands has an unknown shape. In this case, we can't use opt_einsum to optimize
+        # the contraction order, so we just use a default path of (1,0) contractions. This will work left-to-right,
+        # pushing intermediate results to the end of the stack.
+        # We use (1,0) and not (0,1) because that's what opt_einsum tends to prefer, and so the Op signatures will
+        # match more often
+
+        # If shapes become known later we will likely want to rebuild the Op (unless we inline it)
+        if len(tensor_operands) == 1:
+            path = ((0,),)
+        else:
+            # By default, we try right to left because we assume that most graphs
+            # have a lower dimensional rightmost operand
+            path = tuple(pairwise(reversed(range(len(tensor_operands)))))
+        contraction_list = _contraction_list_from_path(
+            subscripts, tensor_operands, path
+        )
+
+        # If there are only 1 or 2 operands, there is no optimization to be done?
+        optimized = len(tensor_operands) <= 2
+    else:
+        # Case 2: All operands have known shapes. In this case, we can use opt_einsum to compute the optimal
+        # contraction order.
+ _, contraction_list = np.einsum_path( + subscripts, + # Numpy einsum_path requires arrays even though only the shapes matter + # It's not trivial to duck-type our way around because of internal call to `asanyarray` + *[np.empty(shape) for shape in shapes], + einsum_call=True, # Not part of public API + optimize="optimal", + ) # type: ignore + path = tuple(contraction[0] for contraction in contraction_list) + optimized = True + + def removechars(s, chars): + return s.translate(str.maketrans(dict.fromkeys(chars))) + + def sum_uniques( + operand: TensorVariable, names: str, uniques: list[str] + ) -> tuple[TensorVariable, str]: + """Reduce unique indices (those that appear only once) in a given contraction step via summing.""" + if uniques: + axes = [names.index(name) for name in uniques] + operand = operand.sum(axes) + names = removechars(names, uniques) + return operand, names + + def sum_repeats( + operand: TensorVariable, + names: str, + counts: collections.Counter, + keep_names: str, + ) -> tuple[TensorVariable, str]: + """Reduce repeated indices in a given contraction step via summation against an identity matrix.""" + + for name, count in counts.items(): + if count > 1: + axes = [i for i, n in enumerate(names) if n == name] + eye = _delta(operand.shape, axes) + operand = where(eye, operand, operand.zeros_like()) + if name not in keep_names: + operand = operand.sum(axes) + names = names.replace(name, "") + else: + operand = operand.sum(axes[:-1]) + names = names.replace(name, "", count - 1) + return operand, names + + def filter_singleton_dims(operand, names, other_operand, other_names): + op_bcast = operand.type.broadcastable + other_bcast = other_operand.type.broadcastable + keep = [ + (not op_bcast[i]) or (j == -1) or other_bcast[j] + for i, j in enumerate(map(other_names.find, names)) + ] + keep_axes = [i for i, keep_axis in enumerate(keep) if keep_axis] + squeeze_axes = [i for i, keep_axis in enumerate(keep) if not keep_axis] + if squeeze_axes: + # TODO: We could modify the subscripts to avoid the problem? + warnings.warn( + "The same einsum subscript is used for a broadcastable and non-broadcastable dimension. " + "This can result in a suboptimal contraction path." + ) + return operand.squeeze(squeeze_axes), "".join(names[i] for i in keep_axes) + + einsum_operands = list(tensor_operands) # So we can pop + for operand_indices, contracted_names, einstr, _, _ in contraction_list: + contracted_names = sorted(contracted_names) + assert len(contracted_names) == len( + set(contracted_names) + ), "The set was needed!" + + input_str, result_names = einstr.split("->") + input_names = input_str.split(",") + + # switch on the number of operands to be processed in this loop iteration. + # every case here sets 'operand' and 'names'. + if len(operand_indices) == 1: + operand = einsum_operands.pop(operand_indices[0]) + (names,) = input_names + counts = collections.Counter(names) + + # sum out unique contracted indices with a single reduce-sum + uniques = [name for name in contracted_names if counts[name] == 1] + operand, names = sum_uniques(operand, names, uniques) + + # for every repeated index, do a contraction against an identity matrix + operand, names = sum_repeats(operand, names, counts, result_names) + + elif len(operand_indices) == 2: + lhs, rhs = map(einsum_operands.pop, operand_indices) + lhs_names, rhs_names = input_names + + # handle cases where one side of a contracting or batch dimension is 1 + # but its counterpart is not. 
+ lhs, lhs_names = filter_singleton_dims(lhs, lhs_names, rhs, rhs_names) + rhs, rhs_names = filter_singleton_dims(rhs, rhs_names, lhs, lhs_names) + + lhs_counts = collections.Counter(lhs_names) + rhs_counts = collections.Counter(rhs_names) + + # sum out unique contracted indices in lhs and rhs + lhs_uniques = [ + name + for name in contracted_names + if lhs_counts[name] == 1 and rhs_counts[name] == 0 + ] + lhs, lhs_names = sum_uniques(lhs, lhs_names, lhs_uniques) + + rhs_uniques = [ + name + for name in contracted_names + if rhs_counts[name] == 1 and lhs_counts[name] == 0 + ] + rhs, rhs_names = sum_uniques(rhs, rhs_names, rhs_uniques) + + # for every repeated index, contract against an identity matrix + lhs, lhs_names = sum_repeats( + lhs, lhs_names, lhs_counts, result_names + rhs_names + ) + rhs, rhs_names = sum_repeats( + rhs, rhs_names, rhs_counts, result_names + lhs_names + ) + + lhs_or_rhs_names = set(lhs_names) | set(rhs_names) + contracted_names = [x for x in contracted_names if x in lhs_or_rhs_names] + lhs_and_rhs_names = set(lhs_names) & set(rhs_names) + batch_names = [x for x in result_names if x in lhs_and_rhs_names] + + if batch_names: + lhs_batch, rhs_batch = tuple( + zip(*[(lhs_names.find(n), rhs_names.find(n)) for n in batch_names]) + ) + else: + lhs_batch = rhs_batch = () + + # contract using dot_general + batch_names_str = "".join(batch_names) + if contracted_names: + lhs_cont, rhs_cont = tuple( + zip( + *[ + (lhs_names.index(n), rhs_names.index(n)) + for n in contracted_names + ] + ) + ) + else: + lhs_cont = rhs_cont = () + deleted_names = batch_names_str + "".join(contracted_names) + remaining_lhs_names = removechars(lhs_names, deleted_names) + remaining_rhs_names = removechars(rhs_names, deleted_names) + # Try both orders of lhs and rhs, in the hope that one of them means we + # don't need an explicit transpose. opt_einsum likes to contract from + # right to left, so we expect (rhs,lhs) to have the best chance of not + # needing a transpose. 
+ names = batch_names_str + remaining_rhs_names + remaining_lhs_names + if names == result_names: + operand = _general_dot( + (rhs, lhs), (rhs_cont, lhs_cont), (rhs_batch, lhs_batch) + ) + else: + names = batch_names_str + remaining_lhs_names + remaining_rhs_names + operand = _general_dot( + (lhs, rhs), + axes=(lhs_cont, rhs_cont), + batch_axes=(lhs_batch, rhs_batch), + ) + else: + raise ValueError( + f"Each step of einsum must have 1 or 2 operands, got {len(operand_indices)}" + ) + + # the resulting 'operand' with axis labels 'names' should be a permutation of the desired result + assert len(names) == len(result_names) == len(set(names)) + assert set(names) == set(result_names) + if names != result_names: + perm = tuple(names.index(name) for name in result_names) + operand = transpose(operand, perm) + einsum_operands.append(operand) # used in next iteration + + [einsum_result] = einsum_operands + + out = Einsum( + subscripts=subscripts, + inputs=list(tensor_operands), + outputs=[einsum_result], + path=tuple(path), + optimized=optimized, + )(*tensor_operands) + return cast(TensorVariable, out) diff --git a/pytensor/tensor/functional.py b/pytensor/tensor/functional.py index e7a5371b02..05e11f2643 100644 --- a/pytensor/tensor/functional.py +++ b/pytensor/tensor/functional.py @@ -1,8 +1,8 @@ from collections.abc import Callable from pytensor.graph import vectorize_graph -from pytensor.tensor import TensorVariable from pytensor.tensor.utils import _parse_gufunc_signature +from pytensor.tensor.variable import TensorVariable def vectorize(func: Callable, signature: str | None = None) -> Callable: diff --git a/pytensor/tensor/rewriting/__init__.py b/pytensor/tensor/rewriting/__init__.py index 168b636041..fc5c528f2d 100644 --- a/pytensor/tensor/rewriting/__init__.py +++ b/pytensor/tensor/rewriting/__init__.py @@ -3,10 +3,9 @@ import pytensor.tensor.rewriting.blas_c import pytensor.tensor.rewriting.blas_scipy import pytensor.tensor.rewriting.blockwise +import pytensor.tensor.rewriting.einsum import pytensor.tensor.rewriting.elemwise import pytensor.tensor.rewriting.extra_ops - -# Register JAX specializations import pytensor.tensor.rewriting.jax import pytensor.tensor.rewriting.linalg import pytensor.tensor.rewriting.math diff --git a/pytensor/tensor/rewriting/basic.py b/pytensor/tensor/rewriting/basic.py index 4a7570dad3..6a038cab15 100644 --- a/pytensor/tensor/rewriting/basic.py +++ b/pytensor/tensor/rewriting/basic.py @@ -52,6 +52,7 @@ TensorFromScalar, alloc, as_tensor_variable, + atleast_Nd, cast, extract_constant, fill, @@ -1219,3 +1220,123 @@ def local_merge_alloc(fgraph, node): register_canonicalize(RemovalNodeRewriter(tensor_copy), name="remove_tensor_copy") + + +@register_specialize +@node_rewriter([DimShuffle]) +def local_dimshuffle_alloc(fgraph, node): + """ + Lift DimShuffle through Alloc + + dimshuffle{x, 0, 1}(alloc([3 4], 3, 2) => alloc([3 4], 1, 3, 2) + """ + alloc_out = node.inputs[0] + alloc_node = alloc_out.owner + if not (alloc_node and isinstance(alloc_node.op, Alloc)): + return + + ds_op = node.op + value, *alloc_shape = alloc_node.inputs + + # Add implicit dimensions of value + value = atleast_Nd(value, n=len(alloc_shape)) + + # Dimshuffle value and alloc_shape + ds_value = value.dimshuffle(ds_op.new_order) + ds_alloc_shape = [alloc_shape[i] for i in ds_op.shuffle] + for dim in ds_op.augment: + ds_alloc_shape.insert(dim, 1) + + return [alloc(ds_value, *ds_alloc_shape)] + + +@register_specialize("shape_unsafe") +@node_rewriter([Join]) +def local_join_of_alloc(fgraph, node): + 
"""Rewrite a Join of Alloc nodes to an Alloc of the Join nodes.""" + axis, *tensors = node.inputs + + if len(tensors) < 2: + # Let other rewrite handle the useless Join + return + + if not isinstance(axis, Constant): + return + + core_tensors = [] + alloc_shapes = [] + for tensor in tensors: + if tensor.owner is None: + return + + # tensor = expand_dims_to_alloc(tensor) + if not isinstance(tensor.owner.op, Alloc): + return + + value, *shape = tensor.owner.inputs + # Introduce explicit batch dims + value = atleast_Nd(value, n=len(shape)) + core_tensors.append(value) + alloc_shapes.append(shape) + + # Find which allocated dimensions can be lifted + # Axis can never be lifted + # Non-axis allocated dimensions can be lifted if they are all broadcastable + [out] = node.outputs + axis = axis.data + + broadcasted_dims = list( + zip( + *( + [ + bef and not aft + for bef, aft in zip( + core_tensor.type.broadcastable, + tensor.type.broadcastable, + strict=True, + ) + ] + for core_tensor, tensor in zip(core_tensors, tensors, strict=True) + ) + ) + ) + + lifteable_alloc_dims = { + dim + for dim in range(out.type.ndim) + if dim != axis and all(broadcasted_dims[dim]) + } + + if not lifteable_alloc_dims: + return + + # Lift the allocated dimensions + new_tensors = [] + for core_tensor, alloc_shape in zip(core_tensors, alloc_shapes): + pre_join_shape = [ + 1 if i in lifteable_alloc_dims else alloc_dim + for i, alloc_dim in enumerate(alloc_shape) + ] + new_tensor = alloc(core_tensor, *pre_join_shape) + copy_stack_trace(tensor, new_tensor) + new_tensors.append(new_tensor) + + new_join = node.op(axis, *new_tensors) + copy_stack_trace(node.outputs[0], new_join) + + # Reintroduce the lifted dims + post_join_shape = [] + for i, alloc_dims in enumerate(zip(*alloc_shapes)): + if i == axis: + # The alloc dim along the axis is the sum of all the pre-join alloc dims + post_join_shape.append(add(*alloc_dims)) + else: + # Otherwise the shapes should all match. We prioritize constants if any + for best_alloc_dim in alloc_dims: + if isinstance(best_alloc_dim, Constant): + break + post_join_shape.append(best_alloc_dim) + + new_out = alloc(new_join, *post_join_shape) + copy_stack_trace(node.outputs[0], new_out) + return [new_out] diff --git a/pytensor/tensor/rewriting/blockwise.py b/pytensor/tensor/rewriting/blockwise.py index 0bed304c29..7220824c58 100644 --- a/pytensor/tensor/rewriting/blockwise.py +++ b/pytensor/tensor/rewriting/blockwise.py @@ -10,6 +10,7 @@ register_specialize, register_stabilize, ) +from pytensor.tensor.shape import Reshape from pytensor.tensor.subtensor import AdvancedIncSubtensor, AdvancedSubtensor, Subtensor @@ -67,10 +68,16 @@ def local_useless_unbatched_blockwise(fgraph, node): def local_eager_useless_unbatched_blockwise(fgraph, node): if isinstance( node.op.core_op, - Dot | Alloc | ARange | Subtensor | AdvancedSubtensor | AdvancedIncSubtensor, + Dot + | Alloc + | ARange + | Subtensor + | AdvancedSubtensor + | AdvancedIncSubtensor + | Reshape, ): # Many Dot-related rewrites (eg, all of BlasOpt) happen before specialize - # These other Ops can't always be trivially vectored at runtime, + # These other Ops can't always be trivially vectorized at runtime, # since their inputs may imply non-rectangular shapes. 
return local_useless_unbatched_blockwise.fn(fgraph, node) @@ -97,62 +104,67 @@ def local_blockwise_alloc(fgraph, node): BOp(matrix, alloc(vector, 10, 5)) -> BOp(matrix, vector) """ - if not any(isinstance(inp.owner.op, Alloc) for inp in node.inputs if inp.owner): - return None - op: Blockwise = node.op # type: ignore batch_ndim = op.batch_ndim(node) if not batch_ndim: return None + if not any(var.owner and isinstance(var.owner.op, Alloc) for var in node.inputs): + return None + new_inputs = [] batch_shapes = [] can_push_any_alloc = False for inp, inp_sig in zip(node.inputs, op.inputs_sig): - if inp.owner and isinstance(inp.owner.op, Alloc): - # Push batch dims from Alloc - value, *shape = inp.owner.inputs - - # Check what to do with the value of the Alloc - squeezed_value = _squeeze_left(value, batch_ndim) - missing_ndim = len(shape) - value.type.ndim - if ( - (((1,) * missing_ndim + value.type.broadcastable)[batch_ndim:]) - != inp.type.broadcastable[batch_ndim:] - ): - # We still need an Alloc for the core dims - core_shape = shape[batch_ndim:] - # And the batch dims of the squeezed value - squeezed_value_batch_ndim = squeezed_value.type.ndim - len(core_shape) - batch_shape = [ - 1 if broadcastable else dim - for broadcastable, dim in zip( - squeezed_value.type.broadcastable[:squeezed_value_batch_ndim], - tuple(squeezed_value.shape)[:squeezed_value_batch_ndim], + if not all(inp.type.broadcastable[:batch_ndim]): + if inp.owner and isinstance(inp.owner.op, Alloc): + # Push batch dims from Alloc + value, *shape = inp.owner.inputs + + # Check what to do with the value of the Alloc + squeezed_value = _squeeze_left(value, batch_ndim) + missing_ndim = len(shape) - value.type.ndim + if ( + (((1,) * missing_ndim + value.type.broadcastable)[batch_ndim:]) + != inp.type.broadcastable[batch_ndim:] + ): + # We still need an Alloc for the core dims + core_shape = shape[batch_ndim:] + # And the batch dims of the squeezed value + squeezed_value_batch_ndim = squeezed_value.type.ndim - len( + core_shape ) - ] - squeezed_value = alloc(squeezed_value, *batch_shape, *core_shape) - if squeezed_value.type.broadcastable == inp.type.broadcastable: - # We can't change anything about this Alloc input - new_inputs.append(inp) - continue - - # We can push batch dims of this Alloc input - batch_shapes.append( - tuple( - 1 if broadcastable else dim - for broadcastable, dim in zip( - inp.type.broadcastable, shape[:batch_ndim] + batch_shape = [ + 1 if broadcastable else dim + for broadcastable, dim in zip( + squeezed_value.type.broadcastable[ + :squeezed_value_batch_ndim + ], + tuple(squeezed_value.shape)[:squeezed_value_batch_ndim], + ) + ] + squeezed_value = alloc(squeezed_value, *batch_shape, *core_shape) + if squeezed_value.type.broadcastable == inp.type.broadcastable: + # We can't change anything about this Alloc input + new_inputs.append(inp) + continue + + # We can push batch dims of this Alloc input + batch_shapes.append( + tuple( + 1 if broadcastable else dim + for broadcastable, dim in zip( + inp.type.broadcastable, shape[:batch_ndim] + ) ) ) - ) - new_inputs.append(squeezed_value) - can_push_any_alloc = True + new_inputs.append(squeezed_value) + can_push_any_alloc = True + continue - else: - # Nothing to do with this input other than removing dummy batch dims - new_inputs.append(_squeeze_left(inp, batch_ndim)) + # Nothing to do with this input other than removing dummy batch dims + new_inputs.append(_squeeze_left(inp, batch_ndim)) if not can_push_any_alloc: return None @@ -167,17 +179,15 @@ def 
local_blockwise_alloc(fgraph, node):
     missing_ndim = old_out_type.ndim - new_out_type.ndim
     batch_shape = ([1] * missing_ndim + list(new_outs[0].shape))[:batch_ndim]
     for i, batch_dims in enumerate(zip(*batch_shapes)):  # Transpose shape tuples
+        if old_out_type.broadcastable[i]:
+            continue
         for batch_dim in batch_dims:
             if batch_dim == 1:
                 continue
+            batch_shape[i] = batch_dim
             if isinstance(batch_dim, Constant):  # Give preference to Constants
-                batch_shape[i] = batch_dim
                 break
-            elif old_out_type.broadcastable[i]:
-                # Only use non Constant shapes if absolutely necessary
-                # Otherwise, we use the shape of the non-alloc output
-                batch_shape[i] = batch_dim
 
     copy_stack_trace(node.outputs, new_outs)
     new_outs = [
@@ -190,3 +200,28 @@ def local_blockwise_alloc(fgraph, node):
     ]
     copy_stack_trace(node.outputs, new_outs)
     return new_outs
+
+
+@register_specialize
+@node_rewriter([Blockwise])
+def local_blockwise_reshape(fgraph, node):
+    """Rewrite away square Blockwise reshapes.
+
+    Reshape is tricky to vectorize eagerly, because a graph like
+    `x.reshape([x.shape[0] * x.shape[1], -1])` has many operations
+    that must be vectorized before we arrive at the reshape operation.
+
+    For the square Reshape case, we must wait for all the intermediate
+    operations to be lifted as Allocs
+    """
+    if not isinstance(node.op.core_op, Reshape):
+        return None
+
+    x, output_shape = node.inputs
+    batch_ndim = node.op.batch_ndim(node)
+    if all(output_shape.type.broadcastable[:batch_ndim]):
+        batched_shape = x.shape[:batch_ndim]
+        core_reshape = _squeeze_left(output_shape, batch_ndim)
+        new_out = x.reshape([*tuple(batched_shape), *tuple(core_reshape)])
+        copy_stack_trace(node.outputs[0], new_out)
+        return [new_out]
diff --git a/pytensor/tensor/rewriting/einsum.py b/pytensor/tensor/rewriting/einsum.py
new file mode 100644
index 0000000000..5e9fe2d026
--- /dev/null
+++ b/pytensor/tensor/rewriting/einsum.py
@@ -0,0 +1,53 @@
+from typing import cast
+
+from pytensor.graph import Apply, FunctionGraph, node_rewriter
+from pytensor.graph.rewriting.basic import copy_stack_trace
+from pytensor.tensor.einsum import Einsum, einsum
+from pytensor.tensor.rewriting.basic import register_specialize
+from pytensor.tensor.rewriting.ofg import inline_ofg_node
+from pytensor.tensor.variable import TensorVariable
+
+
+@register_specialize
+@node_rewriter([Einsum])
+def optimize_einsum_inner_graph(
+    fgraph: FunctionGraph, node: Apply
+) -> list[TensorVariable] | None:
+    """Try to optimize an einsum that was not optimizable at definition time.
+
+    This can happen when users replace a graph without rebuilding
+
+    Or when during the course of rewrites more specialized static shapes are found
+    """
+    op: Einsum = node.op
+
+    if op.optimized:
+        # Already optimized
+        return None
+
+    operands = node.inputs
+    if any(None in operand.type.shape for operand in operands):
+        return None
+
+    new_out = einsum(op.subscripts, *operands)
+    assert new_out.owner.op.optimized
+
+    copy_stack_trace(node.outputs[0], new_out)
+    return [new_out]
+
+
+@register_specialize
+@node_rewriter([Einsum])
+def inline_optimized_einsum(
+    fgraph: FunctionGraph, node: Apply
+) -> list[TensorVariable] | None:
+    """Inline einsums that are already optimized.
+
+    This allows the inner graph to be optimized with the rest of the graph, now that we got ordering right.
+ """ + op: Einsum = node.op + + if not op.optimized: + return None + + return cast(list[TensorVariable], inline_ofg_node(node)) diff --git a/pytensor/tensor/rewriting/ofg.py b/pytensor/tensor/rewriting/ofg.py index 265f3ff2e8..2c4dfc4f70 100644 --- a/pytensor/tensor/rewriting/ofg.py +++ b/pytensor/tensor/rewriting/ofg.py @@ -1,12 +1,24 @@ -from pytensor import clone_replace +from typing import cast + +from pytensor import Variable, clone_replace from pytensor.compile import optdb from pytensor.compile.builders import OpFromGraph -from pytensor.graph import node_rewriter +from pytensor.graph import Apply, node_rewriter from pytensor.graph.rewriting.basic import copy_stack_trace, in2out from pytensor.tensor.basic import AllocDiag from pytensor.tensor.rewriting.basic import register_specialize +def inline_ofg_node(node: Apply) -> list[Variable]: + op = node.op + assert isinstance(op, OpFromGraph) + inlined_outs = clone_replace( + op.inner_outputs, dict(zip(op.inner_inputs, node.inputs)) + ) + copy_stack_trace(op.inner_outputs, inlined_outs) + return cast(list[Variable], inlined_outs) + + @node_rewriter([OpFromGraph]) def inline_ofg_expansion(fgraph, node): """ @@ -18,10 +30,7 @@ def inline_ofg_expansion(fgraph, node): if not op.is_inline: return False - new_out = clone_replace(op.inner_outputs, dict(zip(op.inner_inputs, node.inputs))) - copy_stack_trace(op.inner_outputs, new_out) - - return new_out + return inline_ofg_node(node) # We want to run this before the first merge optimizer @@ -61,8 +70,4 @@ def late_inline_OpFromGraph(fgraph, node): ------- """ - op = node.op - new_out = clone_replace(op.inner_outputs, dict(zip(op.inner_inputs, node.inputs))) - copy_stack_trace(op.inner_outputs, new_out) - - return new_out + return inline_ofg_node(node) diff --git a/pytensor/tensor/rewriting/shape.py b/pytensor/tensor/rewriting/shape.py index 1426a7d993..afa94d4e1f 100644 --- a/pytensor/tensor/rewriting/shape.py +++ b/pytensor/tensor/rewriting/shape.py @@ -749,51 +749,43 @@ def apply(self, fgraph): pytensor.compile.mode.optdb.register("UnShapeOpt", UnShapeOptimizer(), position=10) -def local_reshape_chain(op): - @node_rewriter([op]) - def f(fgraph, node): - """ - Reshape(Reshape(shape1),shape2) -> Reshape(shape2) - - """ - if not check_chain(node, op, op): - return False - - # TODO: this can permit a failing program to run by eliminating - # the lower reshape - rval = node.op(node.inputs[0].owner.inputs[0], node.inputs[1]) - - # Copy over stacktrace from previous output node, as any error - # in new computational graph would have been caused by last op - # in the old computational graph. - copy_stack_trace(node.outputs, rval) - - # It might happen that the desired output of this node has a - # broadcastable pattern that does not match that of 'rval'. This is - # when originally, we were able to figure out that one of the - # dimensions of the reshape is one, but some other transformation - # replaced the shape by one for which this cannot be guessed. - # We should try to figure out why we lost the information about this - # constant value... but in the meantime, better not apply this - # rewrite. 
- if rval.type.ndim == node.outputs[0].type.ndim and all( - s1 == s2 - for s1, s2 in zip(rval.type.shape, node.outputs[0].type.shape) - if s1 == 1 or s2 == 1 - ): - return [rval] - else: - return False - - return f +@register_canonicalize("shape_unsafe") +@register_specialize("shape_unsafe") +@node_rewriter([Reshape]) +def local_reshape_chain(fgraph, node): + """ + Reshape(Reshape(x, shape1),shape2) -> Reshape(x, shape2) + """ + if not check_chain(node, Reshape, Reshape): + return False -register_canonicalize(local_reshape_chain(Reshape), name="local_reshape_chain") + rval = node.op(node.inputs[0].owner.inputs[0], node.inputs[1]) + + # Copy over stacktrace from previous output node, as any error + # in new computational graph would have been caused by last op + # in the old computational graph. + copy_stack_trace(node.outputs, rval) + + # It might happen that the desired output of this node has a + # broadcastable pattern that does not match that of 'rval'. This is + # when originally, we were able to figure out that one of the + # dimensions of the reshape is one, but some other transformation + # replaced the shape by one for which this cannot be guessed. + # We should try to figure out why we lost the information about this + # constant value... but in the meantime, better not apply this + # rewrite. + if rval.type.ndim == node.outputs[0].type.ndim and all( + s1 == s2 + for s1, s2 in zip(rval.type.shape, node.outputs[0].type.shape) + if s1 == 1 or s2 == 1 + ): + return [rval] -@register_useless -@register_canonicalize -@register_stabilize +@register_useless("shape_unsafe") +@register_canonicalize("shape_unsafe") +@register_specialize("shape_unsafe") @node_rewriter([Reshape]) def local_useless_reshape(fgraph, node): """Remove two kinds of useless `Reshape`. @@ -802,24 +794,17 @@ def local_useless_reshape(fgraph, node): - Remove `Reshape` when reshaping to the shape of the input. """ - inp = node.inputs[0] - output = node.outputs[0] - output_shape = node.inputs[1] + inp, output_shape = node.inputs + [output] = node.outputs if inp.type.ndim != output.type.ndim: return False # Simple case: both input and output have a single dimension. - # TODO FIXME XXX: This could hide errors if the user provides inconsistent - # shapes. 
if ( inp.type.ndim == 1 and output.type.ndim == 1 - and all( - s1 == s2 - for s1, s2 in zip(inp.type.shape, output.type.shape) - if s1 == 1 or s2 == 1 - ) + and inp.type.broadcastable == output.type.broadcastable ): return [inp] @@ -832,8 +817,15 @@ def local_useless_reshape(fgraph, node): # Match Reshape(x, [x.shape[0], ..., x.shape[-1]]), accounting for # broadcastable and constant dimensions - if output_shape.owner and isinstance(output_shape.owner.op, MakeVector): - output_shape_is = output_shape.owner.inputs + if isinstance(output_shape, Constant) or ( + output_shape.owner and isinstance(output_shape.owner.op, MakeVector) + ): + if isinstance(output_shape, Constant): + output_shape_is = [ + as_tensor_variable(dim, ndim=0) for dim in output_shape.data + ] + else: + output_shape_is = output_shape.owner.inputs shape_feature = getattr(fgraph, "shape_feature", None) @@ -865,9 +857,9 @@ def local_useless_reshape(fgraph, node): shape_match[dim] = True continue - # Match 1 if input.type.shape[dim] == 1 + # Match constant if input.type.shape[dim] == constant cst_outshp_i = extract_constant(outshp_i, only_process_constants=1) - if inp.type.shape[dim] == 1 and cst_outshp_i == 1: + if inp.type.shape[dim] == cst_outshp_i: shape_match[dim] = True continue @@ -881,17 +873,18 @@ def local_useless_reshape(fgraph, node): if shape_feature: inpshp_i = shape_feature.get_shape(inp, dim) if inpshp_i == outshp_i or ( - extract_constant(inpshp_i, only_process_constants=1) - == extract_constant(outshp_i, only_process_constants=1) + extract_constant(inpshp_i, only_process_constants=True) + == extract_constant(outshp_i, only_process_constants=True) ): shape_match[dim] = True continue - if all(shape_match) and nb_m1 <= 1: + if nb_m1 <= 1 and all(shape_match): + return [inp] + + if (nb_m1 == 0) and (shape_match.count(False) == output.type.ndim - 1): return [inp] - # TODO later: if all the shapes except one match, we may want to - # consider it useless as well, like we do in the 1-dim case. 
return False @@ -910,9 +903,8 @@ def local_reshape_to_dimshuffle(fgraph, node): -> DimShuffle{x,0,x,1,x,x}(Reshape(x, (m, n))) """ op = node.op - inp = node.inputs[0] - output = node.outputs[0] - output_shape = node.inputs[1] + inp, output_shape = node.inputs + [output] = node.outputs dimshuffle_new_order = [] new_output_shape = [] @@ -944,7 +936,7 @@ def local_reshape_to_dimshuffle(fgraph, node): @register_canonicalize -@register_stabilize +@register_specialize @node_rewriter([Reshape]) def local_reshape_lift(fgraph, node): """ diff --git a/pytensor/tensor/shape.py b/pytensor/tensor/shape.py index 236c34b442..614258dcae 100644 --- a/pytensor/tensor/shape.py +++ b/pytensor/tensor/shape.py @@ -842,13 +842,13 @@ def c_code(self, node, name, inputs, outputs, sub): @_vectorize_node.register(Reshape) def _vectorize_reshape(op, node, x, shape): + from pytensor.tensor.blockwise import vectorize_node_fallback + old_x, old_shape = node.inputs batched_ndims = x.type.ndim - old_x.type.ndim if as_tensor_variable(shape).type.ndim != 1: - raise NotImplementedError( - "It is not possible to vectorize the shape argument of Reshape" - ) + return vectorize_node_fallback(op, node, x, shape) if len(tuple(old_shape)) == len(tuple(shape)): new_shape = [*x.shape[:batched_ndims], *shape] diff --git a/tests/link/jax/test_einsum.py b/tests/link/jax/test_einsum.py new file mode 100644 index 0000000000..9a55670c64 --- /dev/null +++ b/tests/link/jax/test_einsum.py @@ -0,0 +1,38 @@ +import numpy as np +import pytest + +import pytensor +import pytensor.tensor as pt + + +jax = pytest.importorskip("jax") + + +def test_jax_einsum(): + subscripts = "ij, jk, kl -> il" + x = np.random.rand(3, 5) + y = np.random.rand(5, 2) + z = np.random.rand(2, 4) + + shapes = ((3, 5), (5, 2), (2, 4)) + x_pt, y_pt, z_pt = ( + pt.tensor(name, shape=shape) for name, shape in zip("xyz", shapes) + ) + out = pt.einsum(subscripts, x_pt, y_pt, z_pt) + f = pytensor.function([x_pt, y_pt, z_pt], out, mode="JAX") + + np.testing.assert_allclose(f(x, y, z), np.einsum(subscripts, x, y, z)) + + +@pytest.mark.xfail(raises=NotImplementedError) +def test_ellipsis_einsum(): + subscripts = "...i,...i->..." 
+ x = np.random.rand(2, 5) + y = np.random.rand(2, 5) + + x_pt = pt.tensor("x", shape=x.shape) + y_pt = pt.tensor("y", shape=y.shape) + out = pt.einsum(subscripts, x_pt, y_pt) + f = pytensor.function([x_pt, y_pt], out, mode="JAX") + + np.testing.assert_allclose(f(x, y), np.einsum(subscripts, x, y)) diff --git a/tests/tensor/rewriting/test_blockwise.py b/tests/tensor/rewriting/test_blockwise.py index d5ea6e2b4e..a17ad18a1f 100644 --- a/tests/tensor/rewriting/test_blockwise.py +++ b/tests/tensor/rewriting/test_blockwise.py @@ -1,7 +1,9 @@ from functools import partial -from pytensor import function -from pytensor.graph import FunctionGraph, rewrite_graph +import numpy as np + +from pytensor import Mode, config, function +from pytensor.graph import FunctionGraph, rewrite_graph, vectorize_graph from pytensor.graph.basic import equal_computations from pytensor.scalar import log as scalar_log from pytensor.tensor import add, alloc, matrix, tensor, tensor3 @@ -9,6 +11,7 @@ from pytensor.tensor.elemwise import Elemwise from pytensor.tensor.nlinalg import MatrixPinv from pytensor.tensor.rewriting.blockwise import local_useless_blockwise +from pytensor.tensor.shape import Reshape def test_useless_blockwise_of_elemwise(): @@ -45,7 +48,7 @@ def test_blockwise_alloc(): rewrite = partial( rewrite_graph, include=("ShapeOpt", "specialize"), - exclude=("local_useless_unbatched_blockwise",), + exclude=("local_useless_unbatched_blockwise", "local_dimshuffle_alloc"), ) vector_add = Blockwise(core_op=add, signature="(x),(x)->(x)") @@ -104,7 +107,9 @@ def test_blockwise_alloc(): y = tensor("y", shape=()) out = vector_add(alloc(x, 3, 1, 5), alloc(y, 7, 5)) expected_out = alloc(vector_add(alloc(x, 5), alloc(y, 5)), 3, 7, 5) - assert equal([rewrite(out)], [expected_out]) + assert equal( + [rewrite(out)], [expected_out] + ), None # pytensor.dprint([expected_out, rewrite(out)], print_type=True) x = tensor("x", shape=(5,)) y = tensor("y", shape=()) @@ -118,3 +123,27 @@ def test_blockwise_alloc(): out = vector_add(x, alloc(y, 5)) expected_out = out assert equal([rewrite(out)], [expected_out]) + + +def test_blockwise_reshape(): + x = tensor("x", shape=(None, None, None)) + y = x.reshape([x.shape[0] * x.shape[1], -1]) + + new_x = tensor("x", shape=(None, None, None, None)) + new_y = vectorize_graph(y, {x: new_x}) + assert not isinstance(new_y.owner.op, Reshape) + assert isinstance(new_y.owner.op, Blockwise) and isinstance( + new_y.owner.op.core_op, Reshape + ) + + rewritten_y = rewrite_graph( + new_y, include=("canonicalize", "specialize"), clone=True + ) + assert isinstance(rewritten_y.owner.op, Reshape) + + no_rewrites = Mode(linker="py", optimizer=None) + test_x = np.arange(5 * 4 * 3 * 2).reshape(5, 4, 3, 2).astype(config.floatX) + np.testing.assert_allclose( + new_y.eval({"x": test_x}, mode=no_rewrites), + rewritten_y.eval({"x": test_x}, mode=no_rewrites), + ) diff --git a/tests/tensor/rewriting/test_einsum.py b/tests/tensor/rewriting/test_einsum.py new file mode 100644 index 0000000000..73e4372aaa --- /dev/null +++ b/tests/tensor/rewriting/test_einsum.py @@ -0,0 +1,39 @@ +from functools import partial + +from pytensor.graph import ancestors, rewrite_graph +from pytensor.tensor import einsum, specify_shape, tensor +from pytensor.tensor.einsum import Einsum + + +specialize_rewrite = partial(rewrite_graph, include=("specialize",), clone=True) + + +def test_einsum_optimization(): + a = tensor("a", shape=(None, None)) + b = tensor("b", shape=(None, None)) + c = tensor("c", shape=(None, None)) + + dynamic_shape_einsum = 
einsum("ij,ij,jk->ik", a, b, c) + assert not dynamic_shape_einsum.owner.op.optimized + + rewritten_out = specialize_rewrite(dynamic_shape_einsum) + assert isinstance(rewritten_out.owner.op, Einsum) + + a = specify_shape(a, (2, 3)) + b = specify_shape(b, (2, 3)) + c = specify_shape(c, (3, 5)) + + static_shape_einsum = dynamic_shape_einsum.owner.clone_with_new_inputs( + [a, b, c] + ).default_output() + assert not static_shape_einsum.owner.op.optimized + + rewritten_out = specialize_rewrite(static_shape_einsum) + # Einsum was inlined because it was optimized + assert not isinstance(rewritten_out.owner.op, Einsum) + # Sanity check that it's not buried in the graph + assert not any( + isinstance(var.owner.op, Einsum) + for var in ancestors([rewritten_out]) + if var.owner + ) diff --git a/tests/tensor/rewriting/test_shape.py b/tests/tensor/rewriting/test_shape.py index f4c529a0d2..bbfd829070 100644 --- a/tests/tensor/rewriting/test_shape.py +++ b/tests/tensor/rewriting/test_shape.py @@ -337,6 +337,52 @@ def test_m1(self): topo = f2.maker.fgraph.toposort() assert not any(isinstance(n.op, Reshape) for n in topo) + def test_constant_shape(self): + # Where reshape is a constant that matches the shape + x = matrix(shape=(2, 3)) + shape = pt.as_tensor(np.array([2, 3])) + out = reshape(x, shape) + new_out = rewrite_graph(out) + assert new_out is x + + x = matrix(shape=(2, 3)) + shape = pt.as_tensor(np.array([-1, 3])) + out = reshape(x, shape) + new_out = rewrite_graph(out) + assert new_out is x + + x = matrix(shape=(None, 3)) + shape = pt.as_tensor(np.array([-1, 3])) + out = reshape(x, shape) + new_out = rewrite_graph(out) + assert new_out is x + + x = matrix(shape=(None, 3)) + shape = pt.as_tensor(np.array([2, 3])) + out = reshape(x, shape) + new_out = rewrite_graph(out) + # This could be rewritten as a specify_shape(x, (2, 3)) + assert new_out is not x + + x = matrix(shape=(2, 3)) + shape = pt.as_tensor(np.array([3, 2])) + out = reshape(x, shape) + new_out = rewrite_graph(out) + assert new_out is not x + + def test_all_but_one_match(self): + x = matrix(shape=(None, None)) + shape = [x.shape[0], 3] + out = reshape(x, shape) + new_out = rewrite_graph(out) + assert equal_computations([new_out], [specify_shape(x, (None, 3))]) + + # Rewrite does not apply if there's also a -1 + shape = [-1, 3] + out = reshape(x, shape) + new_out = rewrite_graph(out) + assert new_out is out + class TestLocalReshapeToDimshuffle: def setup_method(self): diff --git a/tests/tensor/test_basic.py b/tests/tensor/test_basic.py index 49c8e9c38c..58d4de2481 100644 --- a/tests/tensor/test_basic.py +++ b/tests/tensor/test_basic.py @@ -3847,8 +3847,10 @@ def test_transpose(): assert np.all(t2d == np.transpose(x2v, [0, 1])) assert np.all(t3d == np.transpose(x3v, [0, 2, 1])) + # Check we don't introduce useless transpose + assert ptb.transpose(x1) is x1 + # Check that we create a name. 
- assert ptb.transpose(x1).name == "x1.T" assert ptb.transpose(x2).name == "x2.T" assert ptb.transpose(x3).name == "x3.T" assert ptb.transpose(dmatrix()).name is None diff --git a/tests/tensor/test_einsum.py b/tests/tensor/test_einsum.py new file mode 100644 index 0000000000..9131cda056 --- /dev/null +++ b/tests/tensor/test_einsum.py @@ -0,0 +1,263 @@ +from functools import partial +from string import ascii_lowercase + +import numpy as np +import pytest + +import pytensor +import pytensor.tensor as pt +from pytensor import Mode, config, function +from pytensor.graph import FunctionGraph +from pytensor.graph.op import HasInnerGraph +from pytensor.tensor.blockwise import Blockwise +from pytensor.tensor.einsum import _delta, _general_dot, _iota, einsum +from pytensor.tensor.shape import Reshape + + +# Fail for unexpected warnings in this file +pytestmark = pytest.mark.filterwarnings("error") + +floatX = pytensor.config.floatX +ATOL = RTOL = 1e-8 if floatX == "float64" else 1e-4 + + +def assert_no_blockwise_in_graph(fgraph: FunctionGraph, core_op=None) -> None: + for node in fgraph.apply_nodes: + if isinstance(node.op, Blockwise): + if core_op is None: + raise AssertionError + assert not isinstance(node.op.core_op, core_op) + + if isinstance(node.op, HasInnerGraph): + # InnerGraph Ops can be rewritten without modifying the original fgraph + if hasattr(node.op, "_fn"): + inner_fgraph = node.op._fn.maker.fgraph + else: + inner_fgraph = node.op.fgraph + assert_no_blockwise_in_graph(inner_fgraph, core_op=core_op) + + +def test_iota(): + mode = Mode(linker="py", optimizer=None) + np.testing.assert_allclose( + _iota((4, 8), 0).eval(mode=mode), + [ + [0, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1], + [2, 2, 2, 2, 2, 2, 2, 2], + [3, 3, 3, 3, 3, 3, 3, 3], + ], + ) + + np.testing.assert_allclose( + _iota((4, 8), 1).eval(mode=mode), + [ + [0, 1, 2, 3, 4, 5, 6, 7], + [0, 1, 2, 3, 4, 5, 6, 7], + [0, 1, 2, 3, 4, 5, 6, 7], + [0, 1, 2, 3, 4, 5, 6, 7], + ], + ) + + +def test_delta(): + mode = Mode(linker="py", optimizer=None) + np.testing.assert_allclose( + _delta((2, 2), (0, 1)).eval(mode=mode), + [[1.0, 0.0], [0.0, 1.0]], + ) + + np.testing.assert_allclose( + _delta((2, 2, 2), (0, 1)).eval(mode=mode), + [[[1, 1], [0, 0]], [[0, 0], [1, 1]]], + ) + + +def test_general_dot(): + rng = np.random.default_rng(45) + signature = "(l0,a0,a1,l1),(a1,r0,r1,a0)->(l0,l1,r0,r1)" + tensordot_axes = [(-3, -2), (-1, -4)] + + # X has two batch dims + # Y has one batch dim + x = pt.tensor("x", shape=(5, 4, 2, 11, 13, 3)) + y = pt.tensor("y", shape=(4, 13, 5, 7, 11)) + out = _general_dot((x, y), tensordot_axes, [(0, 1), (0,)]) + + fn = pytensor.function([x, y], out) + # fn.dprint(print_type=True) + if config.mode != "FAST_COMPILE": + assert_no_blockwise_in_graph(fn.maker.fgraph, Reshape) + + np_batched_tensordot = np.vectorize( + partial(np.tensordot, axes=tensordot_axes), signature=signature + ) + x_test = rng.normal(size=x.type.shape).astype(floatX) + y_test = rng.normal(size=y.type.shape).astype(floatX) + np.testing.assert_allclose( + fn(x_test, y_test), np_batched_tensordot(x_test, y_test), atol=ATOL, rtol=RTOL + ) + + +@pytest.mark.parametrize("static_shape_known", [True, False]) +@pytest.mark.parametrize( + "signature", + [ + "ij", + "ji", + "ii->i", + "ii", + "ij->", + "ij->j", + "ij->i", + "ij,ij->ij", + "ij,ji->ij", + "ij,ji->ji", + "ij,jk", + "kj,ji", + "ij,kj->ik", + "ik,kj->ikj", + "ij,kl->ijkl", + "ij,jk,kl->il", + "kl,ij,jk->il", + "oij,imj,mjkn,lnk,plk->op", + ], +) +def 
test_einsum_signatures(static_shape_known, signature): + letters_to_dims = dict(zip("ijklmnop", [2, 3, 5, 7, 11, 13, 17, 19], strict=True)) + + inputs = signature.split("->")[0].split(",") + + shapes = [tuple(letters_to_dims[letter] for letter in inp) for inp in inputs] + if static_shape_known: + static_shapes = shapes + else: + static_shapes = [[None] * len(shape) for shape in shapes] + + operands = [ + pt.tensor(name, shape=static_shape) + for name, static_shape in zip(ascii_lowercase, static_shapes) + ] + out = pt.einsum(signature, *operands) + assert out.owner.op.optimized == static_shape_known or len(operands) <= 2 + + rng = np.random.default_rng(37) + test_values = [rng.normal(size=shape).astype(floatX) for shape in shapes] + np_out = np.einsum(signature, *test_values) + + fn = function(operands, out) + pt_out = fn(*test_values) + + # print(); fn.dprint(print_type=True) + + if config.mode != "FAST_COMPILE": + assert_no_blockwise_in_graph(fn.maker.fgraph) + np.testing.assert_allclose(pt_out, np_out, atol=ATOL, rtol=RTOL) + + +def test_batch_dim(): + shapes = ( + (7, 3, 5), + (5, 2), + ) + x, y = (pt.tensor(name, shape=shape) for name, shape in zip("xy", shapes)) + out = pt.einsum("mij,jk->mik", x, y) + + assert out.type.shape == (7, 3, 2) + + +def test_einsum_conv(): + # Adapted example from https://medium.com/latinxinai/vectorized-convolution-operation-using-numpy-b122fd52fba3 + rng = np.random.default_rng(125) + batch_size = 32 + channels = 3 + height = 8 + width = 8 + kernel_size = 2 + num_filters = 15 + conv_signature = "bchwkt,fckt->bfhw" + windowed_input = rng.random( + size=(batch_size, channels, height, width, kernel_size, kernel_size) + ).astype(floatX) + weights = rng.random(size=(num_filters, channels, kernel_size, kernel_size)).astype( + floatX + ) + result = einsum(conv_signature, windowed_input, weights).eval() + + assert result.shape == (32, 15, 8, 8) + np.testing.assert_allclose( + result, + np.einsum("bchwkt,fckt->bfhw", windowed_input, weights), + atol=ATOL, + rtol=RTOL, + ) + + +def test_ellipsis(): + rng = np.random.default_rng(159) + x = pt.tensor("x", shape=(3, 5, 7, 11)) + y = pt.tensor("y", shape=(3, 5, 11, 13)) + x_test = rng.normal(size=x.type.shape).astype(floatX) + y_test = rng.normal(size=y.type.shape).astype(floatX) + expected_out = np.matmul(x_test, y_test) + + with pytest.raises(ValueError): + pt.einsum("mp,pn->mn", x, y) + + out = pt.einsum("...mp,...pn->...mn", x, y) + np.testing.assert_allclose( + out.eval({x: x_test, y: y_test}), expected_out, atol=ATOL, rtol=RTOL + ) + + # Put batch axes in the middle + new_x = pt.moveaxis(x, -2, 0) + new_y = pt.moveaxis(y, -2, 0) + out = pt.einsum("m...p,p...n->m...n", new_x, new_y) + np.testing.assert_allclose( + out.eval({x: x_test, y: y_test}), + expected_out.transpose(-2, 0, 1, -1), + atol=ATOL, + rtol=RTOL, + ) + + out = pt.einsum("m...p,p...n->mn", new_x, new_y) + np.testing.assert_allclose( + out.eval({x: x_test, y: y_test}), expected_out.sum((0, 1)), atol=ATOL, rtol=RTOL + ) + + +def test_broadcastable_dims(): + # Test that einsum handles broadcasting dims correctly. There are two points: + # 1. Numpy einsum allows the same subscript for degenerate and full dimensions + # There is some stale discussion on whether this should be a bug or not, but for now it is not: + # https://github.com/numpy/numpy/issues/11548 + + # 2. Using the same letter for dimensions that are and aren't broadcastable + # can lead to suboptimal paths. 
We check we issue a warning for the following example: + # https://github.com/dgasmith/opt_einsum/issues/220 + rng = np.random.default_rng(222) + a = pt.tensor("a", shape=(32, 32, 32)) + b = pt.tensor("b", shape=(1000, 32)) + c = pt.tensor("c", shape=(1, 32)) + + a_test = rng.normal(size=a.type.shape).astype(floatX) + b_test = rng.normal(size=b.type.shape).astype(floatX) + c_test = rng.normal(size=c.type.shape).astype(floatX) + + # Note b is used for both 1 and 32 + with pytest.warns( + UserWarning, match="This can result in a suboptimal contraction path" + ): + suboptimal_out = pt.einsum("ijk,bj,bk->i", a, b, c) + assert not [set(p) for p in suboptimal_out.owner.op.path] == [{0, 2}, {0, 1}] + + # If we use a distinct letter we get the optimal path + optimal_out = pt.einsum("ijk,bj,ck->i", a, b, c) + assert [set(p) for p in optimal_out.owner.op.path] == [{0, 2}, {0, 1}] + + suboptimal_eval = suboptimal_out.eval({a: a_test, b: b_test, c: c_test}) + optimal_eval = optimal_out.eval({a: a_test, b: b_test, c: c_test}) + np_eval = np.einsum("ijk,bj,bk->i", a_test, b_test, c_test) + atol = 1e-12 if config.floatX == "float64" else 1e-2 + np.testing.assert_allclose(suboptimal_eval, np_eval, atol=atol) + np.testing.assert_allclose(optimal_eval, np_eval, atol=atol) diff --git a/tests/tensor/test_shape.py b/tests/tensor/test_shape.py index 7fa8133c4e..f9434c9f60 100644 --- a/tests/tensor/test_shape.py +++ b/tests/tensor/test_shape.py @@ -14,7 +14,7 @@ from pytensor.misc.safe_asarray import _asarray from pytensor.scalar.basic import ScalarConstant from pytensor.tensor import as_tensor_variable, broadcast_to, get_vector_length, row -from pytensor.tensor.basic import MakeVector, as_tensor, constant +from pytensor.tensor.basic import MakeVector, constant, stack from pytensor.tensor.elemwise import DimShuffle, Elemwise from pytensor.tensor.rewriting.shape import ShapeFeature from pytensor.tensor.shape import ( @@ -801,8 +801,14 @@ def test_reshape(self): [vect_out] = vectorize_node(node, mat, new_shape).outputs assert equal_computations([vect_out], [reshape(mat, new_shape)]) - with pytest.raises(NotImplementedError): - vectorize_node(node, vec, broadcast_to(as_tensor([5, 2, x]), (2, 3))) + new_shape = stack([[-1, x], [x - 1, -1]], axis=0) + print(new_shape.type) + [vect_out] = vectorize_node(node, vec, new_shape).outputs + vec_test_value = np.arange(6) + np.testing.assert_allclose( + vect_out.eval({x: 3, vec: vec_test_value}), + np.broadcast_to(vec_test_value.reshape(2, 3), (2, 2, 3)), + ) with pytest.raises( ValueError, From 6112f82626edda15fb5420c7cd6376bee7bffb20 Mon Sep 17 00:00:00 2001 From: Ricardo Vieira Date: Sun, 4 Aug 2024 16:38:43 +0200 Subject: [PATCH 59/72] Skip tri test in latest version of JAX Related to https://github.com/google/jax/issues/22751 --- tests/link/jax/test_tensor_basic.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/link/jax/test_tensor_basic.py b/tests/link/jax/test_tensor_basic.py index 1a7f787a3a..afa4191b9d 100644 --- a/tests/link/jax/test_tensor_basic.py +++ b/tests/link/jax/test_tensor_basic.py @@ -218,6 +218,10 @@ def test_tri(): compare_jax_and_py(fgraph, []) +@pytest.mark.skipif( + jax.__version__ == "0.4.31", + reason="https://github.com/google/jax/issues/22751", +) def test_tri_nonconcrete(): """JAX cannot JIT-compile `jax.numpy.tri` when arguments are not concrete values.""" From cd8585ddc20de86e9af70ca7f9011bce806449c9 Mon Sep 17 00:00:00 2001 From: abhishekshah5486 Date: Tue, 6 Aug 2024 01:00:34 +0530 Subject: [PATCH 60/72] Corrected the reference 
from 'an PyTensor' to 'a PyTensor' in the contributing guidelines. --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1d3c8c875f..c3b8b1fff2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,7 +21,7 @@ For issues a minimal working example (MWE) is strongly recommended when relevant (fixing a typo in the documentation does not require a MWE). For discussions, MWEs are generally required. All MWEs must be implemented using PyTensor. Please do not submit MWEs if they are not implemented in PyTensor. In certain cases, -pseudocode may be acceptable, but an PyTensor implementation is always preferable. +pseudocode may be acceptable, but a PyTensor implementation is always preferable. ## Quick links From bd38216fbf45ac540b5dbc73779ac1a5c7f03c05 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 17:39:49 +0000 Subject: [PATCH 61/72] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.5 → v0.5.6](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.5...v0.5.6) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 118a371e78..c0e45f6e15 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,7 +22,7 @@ repos: )$ - id: check-merge-conflict - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.5 + rev: v0.5.6 hooks: - id: ruff args: ["--fix", "--output-format=full"] From 521b8cab84acd25a0555599d6bc6eae201f27762 Mon Sep 17 00:00:00 2001 From: Thomas Wiecki Date: Sat, 10 Aug 2024 12:53:53 +0200 Subject: [PATCH 62/72] Pickle error message changed (#966) --- tests/test_config.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_config.py b/tests/test_config.py index 47a4e24035..73c1408e03 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -254,7 +254,10 @@ def test_config_pickling(): configparser.IntParam(5, lambda i: i > 0), in_c_key=False, ) - with pytest.raises(AttributeError, match="Can't pickle local object"): + with pytest.raises( + AttributeError, + match="Can't (pickle|get) local object 'test_config_pickling..'", + ): pickle.dump(root, io.BytesIO()) From 917cc55bd588cc6bee3f71a4fc0ae9d8b207eecf Mon Sep 17 00:00:00 2001 From: Thomas Wiecki Date: Sat, 10 Aug 2024 23:53:39 +0200 Subject: [PATCH 63/72] Add building of pyodide universal wheels (#918) * Add building of pyodide universal wheels * precommit * Fix precommit. Readd comment. 
* Fix precommit2 * Minor improvement to ext_modules conditional definition * Bump Python version so that tomllib is included This way versioneer can read pyproject.toml * Add wheel package to build dependencies * Update .github/workflows/pypi.yml * Revert unnecessary * ruff --------- Co-authored-by: Ben Mares --- .github/workflows/pypi.yml | 30 ++++++++++++++++++++++++++++++ setup.py | 25 ++++++++++++++++++------- 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index ca37e422d0..af3ea8b93c 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -57,6 +57,31 @@ jobs: name: wheels-${{ matrix.platform }} path: ./wheelhouse/*.whl + build_universal_wheel: + name: Build universal wheel for Pyodide + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: pip install numpy versioneer wheel + + - name: Build universal wheel + run: | + PYODIDE=1 python setup.py bdist_wheel --universal + + - uses: actions/upload-artifact@v4 + with: + name: universal_wheel + path: dist/*.whl + check_dist: name: Check dist needs: [make_sdist,build_wheels] @@ -103,6 +128,11 @@ jobs: path: dist merge-multiple: true + - uses: actions/download-artifact@v4 + with: + name: universal_wheel + path: dist + - uses: pypa/gh-action-pypi-publish@v1.9.0 with: user: __token__ diff --git a/setup.py b/setup.py index 3f8eb225d8..09202a658c 100755 --- a/setup.py +++ b/setup.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +import os + import numpy import versioneer from setuptools import Extension, setup @@ -11,17 +13,26 @@ NAME: str = dist.get_name() # type: ignore +# Check if building for Pyodide +is_pyodide = os.getenv("PYODIDE", "0") == "1" + +if is_pyodide: + # For pyodide we build a universal wheel that must be pure-python + # so we must omit the cython-version of scan. + ext_modules = [] +else: + ext_modules = [ + Extension( + name="pytensor.scan.scan_perform", + sources=["pytensor/scan/scan_perform.pyx"], + include_dirs=[numpy.get_include()], + ), + ] if __name__ == "__main__": setup( name=NAME, version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), - ext_modules=[ - Extension( - name="pytensor.scan.scan_perform", - sources=["pytensor/scan/scan_perform.pyx"], - include_dirs=[numpy.get_include()], - ), - ], + ext_modules=ext_modules, ) From e879b0c62a659a4daf9cff1bfe84293cb843f870 Mon Sep 17 00:00:00 2001 From: Krupakar Reddy <137398727+Krupakar-Reddy-S@users.noreply.github.com> Date: Mon, 12 Aug 2024 17:04:43 +0530 Subject: [PATCH 64/72] Removed types examples and introduced tensor (#968) --- doc/tutorial/adding.rst | 44 +++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/doc/tutorial/adding.rst b/doc/tutorial/adding.rst index d558217dc7..0262b60edf 100644 --- a/doc/tutorial/adding.rst +++ b/doc/tutorial/adding.rst @@ -4,6 +4,31 @@ Baby Steps - Algebra ==================== +Understanding Tensors +=========================== + +Before diving into PyTensor, it's essential to understand the fundamental +data structure it operates on: the *tensor*. A *tensor* is a multi-dimensional +array that serves as the foundation for symbolic computations. + +tensors can represent anything from a single number (scalar) to +complex multi-dimensional arrays. 
Each tensor has a type that dictates its +dimensionality and the kind of data it holds. + +For example, the following code creates a symbolic scalar and a symbolic matrix: + +>>> x = pt.scalar('x') +>>> y = pt.matrix('y') + +Here, `scalar` refers to a tensor with zero dimensions, while `matrix` refers +to a tensor with two dimensions. The same principles apply to tensors of other +dimensions. + +For more information about tensors and their associated operations can be +found here: :ref:`tensor `. + + + Adding two Scalars ================== @@ -173,25 +198,6 @@ It is possible to add scalars to matrices, vectors to matrices, scalars to vectors, etc. The behavior of these operations is defined by :ref:`broadcasting `. -The following types are available: - -* **byte**: ``bscalar, bvector, bmatrix, brow, bcol, btensor3, btensor4, btensor5, btensor6, btensor7`` -* **16-bit integers**: ``wscalar, wvector, wmatrix, wrow, wcol, wtensor3, wtensor4, wtensor5, wtensor6, wtensor7`` -* **32-bit integers**: ``iscalar, ivector, imatrix, irow, icol, itensor3, itensor4, itensor5, itensor6, itensor7`` -* **64-bit integers**: ``lscalar, lvector, lmatrix, lrow, lcol, ltensor3, ltensor4, ltensor5, ltensor6, ltensor7`` -* **float**: ``fscalar, fvector, fmatrix, frow, fcol, ftensor3, ftensor4, ftensor5, ftensor6, ftensor7`` -* **double**: ``dscalar, dvector, dmatrix, drow, dcol, dtensor3, dtensor4, dtensor5, dtensor6, dtensor7`` -* **complex**: ``cscalar, cvector, cmatrix, crow, ccol, ctensor3, ctensor4, ctensor5, ctensor6, ctensor7`` - -The previous list is not exhaustive and a guide to all types compatible -with NumPy arrays may be found here: :ref:`tensor creation`. - -.. note:: - - You, the user---not the system architecture---have to choose whether your - program will use 32- or 64-bit integers (``i`` prefix vs. the ``l`` prefix) - and floats (``f`` prefix vs. the ``d`` prefix). 
- Exercise From 3523d79bb79561a55e15035bc384831a2389f9ae Mon Sep 17 00:00:00 2001 From: ferres Date: Tue, 13 Aug 2024 11:54:32 +0300 Subject: [PATCH 65/72] maintanance: unpin scipy fix: cast to elemwise outputs to their respective dtypes fix: Relax scipy dependency, should work in both cases style: black wrap with asarray fix: make elemwise test check against dtype in the graph fix scalar issues Update pytensor/scalar/basic.py Co-authored-by: Ricardo Vieira <28983449+ricardoV94@users.noreply.github.com> fix test add a clarifying comment to checking nan fix: bool is deprecated in numpy deps: bound scipy version improve test --- environment-osx-arm64.yml | 2 +- environment.yml | 2 +- pyproject.toml | 2 +- pytensor/scalar/basic.py | 17 ++++++++++++++--- pytensor/tensor/elemwise.py | 22 ++-------------------- tests/scalar/test_loop.py | 17 ++++++++++++----- tests/tensor/utils.py | 8 +++++--- 7 files changed, 36 insertions(+), 34 deletions(-) diff --git a/environment-osx-arm64.yml b/environment-osx-arm64.yml index 0d624aa55c..13a68faaaa 100644 --- a/environment-osx-arm64.yml +++ b/environment-osx-arm64.yml @@ -10,7 +10,7 @@ dependencies: - python=>3.10 - compilers - numpy>=1.17.0,<2 - - scipy>=0.14,<1.14.0 + - scipy>=1,<2 - filelock>=3.15 - etuples - logical-unification diff --git a/environment.yml b/environment.yml index 95bb58c06c..4b213fd851 100644 --- a/environment.yml +++ b/environment.yml @@ -10,7 +10,7 @@ dependencies: - python>=3.10 - compilers - numpy>=1.17.0,<2 - - scipy>=0.14,<1.14.0 + - scipy>=1,<2 - filelock>=3.15 - etuples - logical-unification diff --git a/pyproject.toml b/pyproject.toml index 81a1285da8..bebba8a7de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ keywords = [ ] dependencies = [ "setuptools>=59.0.0", - "scipy>=0.14,<1.14", + "scipy>=1,<2", "numpy>=1.17.0,<2", "filelock>=3.15", "etuples", diff --git a/pytensor/scalar/basic.py b/pytensor/scalar/basic.py index d4c41d5cb5..d6fcfc0723 100644 --- a/pytensor/scalar/basic.py +++ b/pytensor/scalar/basic.py @@ -1140,14 +1140,25 @@ def output_types(self, types): else: raise NotImplementedError(f"Cannot calculate the output types for {self}") + @staticmethod + def _cast_scalar(x, dtype): + if hasattr(x, "astype"): + return x.astype(dtype) + elif dtype == "bool": + return np.bool_(x) + else: + return getattr(np, dtype)(x) + def perform(self, node, inputs, output_storage): if self.nout == 1: - output_storage[0][0] = self.impl(*inputs) + dtype = node.outputs[0].dtype + output_storage[0][0] = self._cast_scalar(self.impl(*inputs), dtype) else: variables = from_return_values(self.impl(*inputs)) assert len(variables) == len(output_storage) - for storage, variable in zip(output_storage, variables): - storage[0] = variable + for out, storage, variable in zip(node.outputs, output_storage, variables): + dtype = out.dtype + storage[0] = self._cast_scalar(variable, dtype) def impl(self, *inputs): raise MethodNotDefined("impl", type(self), self.__class__.__name__) diff --git a/pytensor/tensor/elemwise.py b/pytensor/tensor/elemwise.py index de966f1a78..1b0d433dda 100644 --- a/pytensor/tensor/elemwise.py +++ b/pytensor/tensor/elemwise.py @@ -767,34 +767,16 @@ def perform(self, node, inputs, output_storage): for i, (variable, storage, nout) in enumerate( zip(variables, output_storage, node.outputs) ): - if getattr(variable, "dtype", "") == "object": - # Since numpy 1.6, function created with numpy.frompyfunc - # always return an ndarray with dtype object - variable = np.asarray(variable, dtype=nout.dtype) + storage[0] = 
variable = np.asarray(variable, dtype=nout.dtype) if i in self.inplace_pattern: odat = inputs[self.inplace_pattern[i]] odat[...] = variable storage[0] = odat - # Sometimes NumPy return a Python type. - # Some PyTensor op return a different dtype like floor, ceil, - # trunc, eq, ... - elif not isinstance(variable, np.ndarray) or variable.dtype != nout.dtype: - variable = np.asarray(variable, nout.dtype) - # The next line is needed for numpy 1.9. Otherwise - # there are tests that fail in DebugMode. - # Normally we would call pytensor.misc._asarray, but it - # is faster to inline the code. We know that the dtype - # are the same string, just different typenum. - if np.dtype(nout.dtype).num != variable.dtype.num: - variable = variable.view(dtype=nout.dtype) - storage[0] = variable # numpy.real return a view! - elif not variable.flags.owndata: + if not variable.flags.owndata: storage[0] = variable.copy() - else: - storage[0] = variable @staticmethod def _check_runtime_broadcast(node, inputs): diff --git a/tests/scalar/test_loop.py b/tests/scalar/test_loop.py index 88f1a588fd..88d14c6e43 100644 --- a/tests/scalar/test_loop.py +++ b/tests/scalar/test_loop.py @@ -212,12 +212,17 @@ def test_inner_composite(mode): y16 = op(n_steps, x16) assert y16.type.dtype == "float16" - fn32 = function([n_steps, x16], y16, mode=mode) + fn16 = function([n_steps, x16], y16, mode=mode) + out16 = fn16(n_steps=3, x16=np.array(4.73, dtype="float16")) np.testing.assert_allclose( - fn32(n_steps=9, x16=np.array(4.73, dtype="float16")), - 4.73 + 9, + out16, + 4.73 + 3, rtol=1e-3, ) + out16overflow = fn16(n_steps=9, x16=np.array(4.73, dtype="float16")) + assert out16overflow.dtype == "float16" + # with this dtype overflow happens + assert np.isnan(out16overflow) @mode @@ -243,8 +248,10 @@ def test_inner_loop(mode): y16 = outer_loop_op(n_steps, x16, n_steps) assert y16.type.dtype == "float16" - fn32 = function([n_steps, x16], y16, mode=mode) + fn16 = function([n_steps, x16], y16, mode=mode) + out16 = fn16(n_steps=3, x16=np.array(2.5, dtype="float16")) + assert out16.dtype == "float16" np.testing.assert_allclose( - fn32(n_steps=3, x16=np.array(2.5, dtype="float16")), + out16, 3**2 + 2.5, ) diff --git a/tests/tensor/utils.py b/tests/tensor/utils.py index 2f97d0e18f..85c48a42dd 100644 --- a/tests/tensor/utils.py +++ b/tests/tensor/utils.py @@ -508,15 +508,17 @@ def test_good(self): if not isinstance(expecteds, list | tuple): expecteds = (expecteds,) - for i, (variable, expected) in enumerate(zip(variables, expecteds)): + for i, (variable, expected, out_symbol) in enumerate( + zip(variables, expecteds, node.outputs) + ): condition = ( - variable.dtype != expected.dtype + variable.dtype != out_symbol.type.dtype or variable.shape != expected.shape or not np.allclose(variable, expected, atol=eps, rtol=eps) ) assert not condition, ( f"Test {self.op}::{testname}: Output {i} gave the wrong" - f" value. With inputs {inputs}, expected {expected} (dtype {expected.dtype})," + f" value. With inputs {inputs}, expected {expected} (dtype {out_symbol.type.dtype})," f" got {variable} (dtype {variable.dtype}). 
eps={eps:f}" f" np.allclose returns {np.allclose(variable, expected, atol=eps, rtol=eps)} {np.allclose(variable, expected)}" ) From 400323fecef8576733f5a6e0e2ae6e71323ea0e8 Mon Sep 17 00:00:00 2001 From: ferres Date: Wed, 14 Aug 2024 19:38:23 +0300 Subject: [PATCH 66/72] mypy: fix graph.py --- pytensor/gradient.py | 30 ++++++++++++++++++++++++++++-- pytensor/graph/basic.py | 5 +++-- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/pytensor/gradient.py b/pytensor/gradient.py index abf80bff43..6b3a1a4b62 100644 --- a/pytensor/gradient.py +++ b/pytensor/gradient.py @@ -4,7 +4,7 @@ import warnings from collections.abc import Callable, Mapping, MutableSequence, Sequence from functools import partial, reduce -from typing import TYPE_CHECKING, Literal, TypeVar, Union +from typing import TYPE_CHECKING, Literal, TypeVar, Union, overload import numpy as np @@ -414,6 +414,32 @@ def Lop( return as_list_or_tuple(using_list, using_tuple, ret) +@overload +def grad( + cost: Variable | None, + wrt: Variable | Sequence[Variable], + consider_constant: Sequence[Variable] | None = ..., + disconnected_inputs: Literal["ignore", "warn", "raise"] = ..., + add_names: bool = ..., + known_grads: Mapping[Variable, Variable] | None = ..., + return_disconnected: Literal["zero", "disconnected"] = ..., + null_gradients: Literal["raise", "return"] = ..., +) -> Variable | None | Sequence[Variable]: ... + + +@overload +def grad( + cost: Variable | None, + wrt: Variable | Sequence[Variable], + consider_constant: Sequence[Variable] | None = ..., + disconnected_inputs: Literal["ignore", "warn", "raise"] = ..., + add_names: bool = ..., + known_grads: Mapping[Variable, Variable] | None = ..., + return_disconnected: Literal["none"] = ..., + null_gradients: Literal["raise", "return"] = ..., +) -> Variable | None | Sequence[Variable | None]: ... + + def grad( cost: Variable | None, wrt: Variable | Sequence[Variable], @@ -423,7 +449,7 @@ def grad( known_grads: Mapping[Variable, Variable] | None = None, return_disconnected: Literal["none", "zero", "disconnected"] = "zero", null_gradients: Literal["raise", "return"] = "raise", -) -> Variable | None | Sequence[Variable | None]: +) -> Variable | None | Sequence[Variable | None] | Sequence[Variable]: """ Return symbolic gradients of one cost with respect to one or more variables. 
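[Editorial note, not part of the patch: the `@overload` stubs added to `grad` above exist only for static type checkers; at runtime the single `def grad(...)` implementation runs and the `...` stub bodies are never executed. A minimal, self-contained sketch of the same pattern follows — `collect`, `"single"`, and `"many"` are hypothetical names, not PyTensor APIs:]

from typing import Literal, overload


@overload
def collect(mode: Literal["single"]) -> int: ...
@overload
def collect(mode: Literal["many"]) -> list[int]: ...


def collect(mode: Literal["single", "many"]) -> int | list[int]:
    # One runtime implementation; the overloads above only narrow the
    # return type that a checker such as mypy infers at each call site.
    if mode == "single":
        return 1
    return [1, 2, 3]


one: int = collect("single")        # checker narrows the result to int
many: list[int] = collect("many")   # checker narrows the result to list[int]

[The `return_disconnected` overloads on `grad` serve the same purpose: callers that pass a literal value get a more precise return type without any change in runtime behaviour.]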
diff --git a/pytensor/graph/basic.py b/pytensor/graph/basic.py index 2ffd101c23..057341909c 100644 --- a/pytensor/graph/basic.py +++ b/pytensor/graph/basic.py @@ -1313,8 +1313,9 @@ def clone_get_equiv( outputs: Reversible[Variable], copy_inputs: bool = True, copy_orphans: bool = True, - memo: dict[Union[Apply, Variable, "Op"], Union[Apply, Variable, "Op"]] - | None = None, + memo: ( + dict[Union[Apply, Variable, "Op"], Union[Apply, Variable, "Op"]] | None + ) = None, clone_inner_graphs: bool = False, **kwargs, ) -> dict[Union[Apply, Variable, "Op"], Union[Apply, Variable, "Op"]]: From f0214a132530d4da3a7cea113a2ba4aa80ed97a8 Mon Sep 17 00:00:00 2001 From: ferres Date: Wed, 14 Aug 2024 19:44:12 +0300 Subject: [PATCH 67/72] mypy: fix graph/basic.py --- pytensor/graph/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytensor/graph/basic.py b/pytensor/graph/basic.py index 057341909c..ed1ad6b6c2 100644 --- a/pytensor/graph/basic.py +++ b/pytensor/graph/basic.py @@ -710,7 +710,7 @@ def clone(self, **kwargs): return cp -class NominalVariable(AtomicVariable[_TypeType]): +class NominalVariable(Generic[_TypeType, _IdType], AtomicVariable[_TypeType]): """A variable that enables alpha-equivalent comparisons.""" __instances__: dict[tuple["Type", Hashable], "NominalVariable"] = {} From 9f3a938a6066ca9e216c71adcc0999a8f8c1e3f7 Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Wed, 17 Jul 2024 12:53:04 -0700 Subject: [PATCH 68/72] Add IfElse --- pytensor/link/pytorch/dispatch/basic.py | 13 +++++++++++++ tests/link/pytorch/test_basic.py | 20 +++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/pytensor/link/pytorch/dispatch/basic.py b/pytensor/link/pytorch/dispatch/basic.py index c71e1606bf..03dc8b2362 100644 --- a/pytensor/link/pytorch/dispatch/basic.py +++ b/pytensor/link/pytorch/dispatch/basic.py @@ -5,6 +5,7 @@ from pytensor.compile.ops import DeepCopyOp from pytensor.graph.fg import FunctionGraph +from pytensor.ifelse import IfElse from pytensor.link.utils import fgraph_to_python from pytensor.raise_op import CheckAndRaise from pytensor.tensor.basic import Alloc, AllocEmpty, ARange, Eye, Join, MakeVector @@ -124,6 +125,7 @@ def eye(N, M, k): return eye + @pytorch_funcify.register(MakeVector) def pytorch_funcify_MakeVector(op, **kwargs): torch_dtype = getattr(torch, op.dtype) @@ -132,3 +134,14 @@ def makevector(*x): return torch.tensor(x, dtype=torch_dtype) return makevector + + +@pytorch_funcify.register(IfElse) +def pytorch_funcify_IfElse(op, **kwargs): + n_outs = op.n_outs + assert n_outs == 1 + + def ifelse(cond, *args, n_outs=n_outs): + return torch.where(cond, *args) + + return ifelse diff --git a/tests/link/pytorch/test_basic.py b/tests/link/pytorch/test_basic.py index 27c1b1bd6a..d49ea1ab1e 100644 --- a/tests/link/pytorch/test_basic.py +++ b/tests/link/pytorch/test_basic.py @@ -11,7 +11,8 @@ from pytensor.configdefaults import config from pytensor.graph.basic import Apply from pytensor.graph.fg import FunctionGraph -from pytensor.graph.op import Op +from pytensor.graph.op import Op, get_test_value +from pytensor.ifelse import ifelse from pytensor.raise_op import CheckAndRaise from pytensor.tensor import alloc, arange, as_tensor, empty, eye from pytensor.tensor.type import matrix, scalar, vector @@ -301,3 +302,20 @@ def test_pytorch_MakeVector(): x_fg = FunctionGraph([], [x]) compare_pytorch_and_py(x_fg, []) + + +def test_pytorch_ifelse(): + true_vals = np.r_[1, 2, 3] + false_vals = np.r_[-1, -2, -3] + + x = ifelse(np.array(True), true_vals, 
false_vals) + x_fg = FunctionGraph([], [x]) + + compare_pytorch_and_py(x_fg, []) + + a = scalar("a") + a.tag.test_value = np.array(0.2, dtype=config.floatX) + x = ifelse(a < 0.5, true_vals, false_vals) + x_fg = FunctionGraph([a], [x]) # I.e. False + + compare_pytorch_and_py(x_fg, [get_test_value(i) for i in x_fg.inputs]) From d36d4ce07a4d77a8c42d200c4beb27a17eb5ce7e Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Wed, 17 Jul 2024 13:14:35 -0700 Subject: [PATCH 69/72] Remove space --- pytensor/link/pytorch/dispatch/basic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytensor/link/pytorch/dispatch/basic.py b/pytensor/link/pytorch/dispatch/basic.py index 03dc8b2362..0039406907 100644 --- a/pytensor/link/pytorch/dispatch/basic.py +++ b/pytensor/link/pytorch/dispatch/basic.py @@ -125,7 +125,6 @@ def eye(N, M, k): return eye - @pytorch_funcify.register(MakeVector) def pytorch_funcify_MakeVector(op, **kwargs): torch_dtype = getattr(torch, op.dtype) From 9adbbe249e4051743fa78793a3b83d6ca469334f Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Sun, 21 Jul 2024 16:01:12 -0700 Subject: [PATCH 70/72] Update away from torch.where --- pytensor/link/pytorch/dispatch/basic.py | 8 +++++--- tests/link/pytorch/test_basic.py | 16 ++++++---------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/pytensor/link/pytorch/dispatch/basic.py b/pytensor/link/pytorch/dispatch/basic.py index 0039406907..5e5bc4a41b 100644 --- a/pytensor/link/pytorch/dispatch/basic.py +++ b/pytensor/link/pytorch/dispatch/basic.py @@ -138,9 +138,11 @@ def makevector(*x): @pytorch_funcify.register(IfElse) def pytorch_funcify_IfElse(op, **kwargs): n_outs = op.n_outs - assert n_outs == 1 - def ifelse(cond, *args, n_outs=n_outs): - return torch.where(cond, *args) + def ifelse(cond, ifpath, elsepath, n_outs=n_outs): + if cond: + return ifpath + else: + return elsepath return ifelse diff --git a/tests/link/pytorch/test_basic.py b/tests/link/pytorch/test_basic.py index d49ea1ab1e..3905055935 100644 --- a/tests/link/pytorch/test_basic.py +++ b/tests/link/pytorch/test_basic.py @@ -308,14 +308,10 @@ def test_pytorch_ifelse(): true_vals = np.r_[1, 2, 3] false_vals = np.r_[-1, -2, -3] - x = ifelse(np.array(True), true_vals, false_vals) - x_fg = FunctionGraph([], [x]) - - compare_pytorch_and_py(x_fg, []) - - a = scalar("a") - a.tag.test_value = np.array(0.2, dtype=config.floatX) - x = ifelse(a < 0.5, true_vals, false_vals) - x_fg = FunctionGraph([a], [x]) # I.e. False + for test_value, cond in [(0.2, 0.5), (0.5, 0.4)]: + a = scalar("a") + a.tag.test_value = np.array(test_value, dtype=config.floatX) + x = ifelse(a < cond, true_vals, false_vals) + x_fg = FunctionGraph([a], [x]) # I.e. 
False - compare_pytorch_and_py(x_fg, [get_test_value(i) for i in x_fg.inputs]) + compare_pytorch_and_py(x_fg, [get_test_value(i) for i in x_fg.inputs]) From 27664570814ef953c4b36a15e2548436491cdd75 Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Fri, 9 Aug 2024 16:45:36 -0700 Subject: [PATCH 71/72] Fix test to allow for n_outs>1 --- pytensor/link/pytorch/dispatch/basic.py | 6 +++--- tests/link/pytorch/test_basic.py | 10 ++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/pytensor/link/pytorch/dispatch/basic.py b/pytensor/link/pytorch/dispatch/basic.py index 5e5bc4a41b..291ad40a65 100644 --- a/pytensor/link/pytorch/dispatch/basic.py +++ b/pytensor/link/pytorch/dispatch/basic.py @@ -139,10 +139,10 @@ def makevector(*x): def pytorch_funcify_IfElse(op, **kwargs): n_outs = op.n_outs - def ifelse(cond, ifpath, elsepath, n_outs=n_outs): + def ifelse(cond, *true_and_false, n_outs=n_outs): if cond: - return ifpath + return torch.stack(true_and_false[:n_outs]) else: - return elsepath + return torch.stack(true_and_false[n_outs:]) return ifelse diff --git a/tests/link/pytorch/test_basic.py b/tests/link/pytorch/test_basic.py index 3905055935..8393f695c3 100644 --- a/tests/link/pytorch/test_basic.py +++ b/tests/link/pytorch/test_basic.py @@ -305,13 +305,15 @@ def test_pytorch_MakeVector(): def test_pytorch_ifelse(): - true_vals = np.r_[1, 2, 3] - false_vals = np.r_[-1, -2, -3] + p1_vals = np.r_[1, 2, 3] + p2_vals = np.r_[-1, -2, -3] for test_value, cond in [(0.2, 0.5), (0.5, 0.4)]: a = scalar("a") a.tag.test_value = np.array(test_value, dtype=config.floatX) - x = ifelse(a < cond, true_vals, false_vals) - x_fg = FunctionGraph([a], [x]) # I.e. False + x = ifelse( + a < cond, tuple(np.r_[p1_vals, p2_vals]), tuple(np.r_[p2_vals, p1_vals]) + ) + x_fg = FunctionGraph([a], x) compare_pytorch_and_py(x_fg, [get_test_value(i) for i in x_fg.inputs]) From ef9277b66a5834cacf14ae774d8274ce8c11eb64 Mon Sep 17 00:00:00 2001 From: Ian Schweer Date: Fri, 9 Aug 2024 16:52:13 -0700 Subject: [PATCH 72/72] Remove test value --- tests/link/pytorch/test_basic.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/link/pytorch/test_basic.py b/tests/link/pytorch/test_basic.py index 8393f695c3..73b098182b 100644 --- a/tests/link/pytorch/test_basic.py +++ b/tests/link/pytorch/test_basic.py @@ -11,7 +11,7 @@ from pytensor.configdefaults import config from pytensor.graph.basic import Apply from pytensor.graph.fg import FunctionGraph -from pytensor.graph.op import Op, get_test_value +from pytensor.graph.op import Op from pytensor.ifelse import ifelse from pytensor.raise_op import CheckAndRaise from pytensor.tensor import alloc, arange, as_tensor, empty, eye @@ -310,10 +310,9 @@ def test_pytorch_ifelse(): for test_value, cond in [(0.2, 0.5), (0.5, 0.4)]: a = scalar("a") - a.tag.test_value = np.array(test_value, dtype=config.floatX) x = ifelse( a < cond, tuple(np.r_[p1_vals, p2_vals]), tuple(np.r_[p2_vals, p1_vals]) ) x_fg = FunctionGraph([a], x) - compare_pytorch_and_py(x_fg, [get_test_value(i) for i in x_fg.inputs]) + compare_pytorch_and_py(x_fg, np.array(test_value, dtype=config.floatX))
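[Editorial note, not part of the patch series: the final `pytorch_funcify_IfElse` above receives the condition followed by all "then" and "else" values flattened into one argument list; the first `n_outs` entries belong to the true branch and the rest to the false branch. The stand-alone sketch below mirrors that branch splitting and the `torch.stack` call from PATCH 71/72 — `ifelse_sketch` and the example tensors are hypothetical, not part of the dispatch module:]

import torch


def ifelse_sketch(cond, *true_and_false, n_outs):
    # The first n_outs arguments are the "then" outputs, the rest the
    # "else" outputs, matching how PyTensor's IfElse packs its inputs.
    # cond may be a Python bool or a 0-d torch tensor; both work with `if`.
    if cond:
        return torch.stack(true_and_false[:n_outs])
    return torch.stack(true_and_false[n_outs:])


t1, t2 = torch.tensor([1.0, 2.0]), torch.tensor([3.0, 4.0])
f1, f2 = -t1, -t2

print(ifelse_sketch(torch.tensor(True), t1, t2, f1, f2, n_outs=2))   # stacks (t1, t2)
print(ifelse_sketch(torch.tensor(False), t1, t2, f1, f2, n_outs=2))  # stacks (f1, f2)

[Because `torch.stack` requires tensors of equal shape, this sketch — and, it appears, the dispatcher itself — assumes all outputs of a given branch can be stacked; the accompanying test only exercises same-shaped branch outputs.]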