From 74c2ad7fa209e39994d9757d1451563992d538db Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Wed, 29 Nov 2023 18:08:33 -0500 Subject: [PATCH 01/25] Reduce unnecessary allocations and reuse code --- Project.toml | 7 +-- src/NonlinearSolve.jl | 113 +++++++++++++++++++++--------------------- src/jacobian.jl | 96 +++++++++++++++++++---------------- src/klement.jl | 2 +- src/levenberg.jl | 2 +- src/raphson.jl | 57 ++++++--------------- src/trace.jl | 6 ++- src/trustRegion.jl | 2 +- src/utils.jl | 92 +++++++++++++++++++++++++--------- 9 files changed, 204 insertions(+), 173 deletions(-) diff --git a/Project.toml b/Project.toml index 60764651b..8a42f9d21 100644 --- a/Project.toml +++ b/Project.toml @@ -16,6 +16,7 @@ LazyArrays = "5078a376-72f3-5289-bfd5-ec5146d43c02" LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" +MaybeInplace = "bb5d69b7-63fc-4a16-80bd-7e42200c7bdb" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd" @@ -42,8 +43,8 @@ NonlinearSolveZygoteExt = "Zygote" [compat] ADTypes = "0.2" -ArrayInterface = "6.0.24, 7" Aqua = "0.8" +ArrayInterface = "6.0.24, 7" BandedMatrices = "1" BenchmarkTools = "1" ConcreteStructs = "0.2" @@ -70,9 +71,9 @@ Reexport = "0.2, 1" SafeTestsets = "0.1" SciMLBase = "2.9" SciMLOperators = "0.3" -SimpleNonlinearSolve = "0.1.23" +SimpleNonlinearSolve = "1" # FIXME: Don't update the version in this PR. Using it to test SparseArrays = "<0.0.1, 1" -SparseDiffTools = "2.12" +SparseDiffTools = "2.14" StaticArrays = "1" StaticArraysCore = "1.4" Symbolics = "5" diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl index c591eb4ee..f050bf007 100644 --- a/src/NonlinearSolve.jl +++ b/src/NonlinearSolve.jl @@ -8,25 +8,24 @@ import Reexport: @reexport import PrecompileTools: @recompile_invalidations, @compile_workload, @setup_workload @recompile_invalidations begin - using DiffEqBase, - LazyArrays, LinearAlgebra, LinearSolve, Printf, SparseArrays, + using DiffEqBase, LazyArrays, LinearAlgebra, LinearSolve, Printf, SparseArrays, SparseDiffTools - using FastBroadcast: @.. - import ArrayInterface: restructure import ADTypes: AbstractFiniteDifferencesMode - import ArrayInterface: undefmatrix, + import ArrayInterface: undefmatrix, restructure, can_setindex, matrix_colors, parameterless_type, ismutable, issingular, fast_scalar_indexing import ConcreteStructs: @concrete import EnumX: @enumx + import FastBroadcast: @.. import ForwardDiff import ForwardDiff: Dual import LinearSolve: ComposePreconditioner, InvPreconditioner, needs_concrete_A + import MaybeInplace: @bb import RecursiveArrayTools: ArrayPartition, AbstractVectorOfArray, recursivecopy!, recursivefill! 
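# NOTE (illustrative sketch, not part of this patch): this commit adds MaybeInplace.jl
# and its `@bb` ("bang-bang") macro, which the rest of the PR uses to collapse the
# separate in-place / out-of-place solver kernels into a single code path. Judging by
# how it is used below, `@bb expr` mutates the destination when it supports
# `setindex!` and falls back to an out-of-place rebind (e.g. for StaticArrays)
# otherwise. A minimal sketch; `damped_update` is an illustrative name, not package API:

using MaybeInplace, StaticArrays

function damped_update(u, du, α)
    @bb @. u = u - α * du   # in-place broadcast for `Vector`, rebinds `u` for `SVector`
    return u
end

damped_update([1.0, 2.0], [0.1, 0.2], 0.5)      # mutates the first argument
damped_update(SA[1.0, 2.0], SA[0.1, 0.2], 0.5)  # returns a new SVector instead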
import SciMLBase: AbstractNonlinearAlgorithm, NLStats, _unwrap_val, has_jac, isinplace import SciMLOperators: FunctionOperator - import StaticArraysCore: StaticArray, SVector, SArray, MArray + import StaticArraysCore: StaticArray, SVector, SArray, MArray, Size, SMatrix import UnPack: @unpack using ADTypes, LineSearches, SciMLBase, SimpleNonlinearSolve @@ -55,13 +54,13 @@ isinplace(::AbstractNonlinearSolveCache{iip}) where {iip} = iip function Base.show(io::IO, alg::AbstractNonlinearSolveAlgorithm) str = "$(nameof(typeof(alg)))(" modifiers = String[] - if _getproperty(alg, Val(:ad)) !== nothing + if __getproperty(alg, Val(:ad)) !== nothing push!(modifiers, "ad = $(nameof(typeof(alg.ad)))()") end - if _getproperty(alg, Val(:linsolve)) !== nothing + if __getproperty(alg, Val(:linsolve)) !== nothing push!(modifiers, "linsolve = $(nameof(typeof(alg.linsolve)))()") end - if _getproperty(alg, Val(:linesearch)) !== nothing + if __getproperty(alg, Val(:linesearch)) !== nothing ls = alg.linesearch if ls isa LineSearch ls.method !== nothing && @@ -70,7 +69,7 @@ function Base.show(io::IO, alg::AbstractNonlinearSolveAlgorithm) push!(modifiers, "linesearch = $(nameof(typeof(alg.linesearch)))()") end end - if _getproperty(alg, Val(:radius_update_scheme)) !== nothing + if __getproperty(alg, Val(:radius_update_scheme)) !== nothing push!(modifiers, "radius_update_scheme = $(alg.radius_update_scheme)") end str = str * join(modifiers, ", ") @@ -107,7 +106,7 @@ function SciMLBase.solve!(cache::AbstractNonlinearSolveCache) end end - trace = _getproperty(cache, Val{:trace}()) + trace = __getproperty(cache, Val{:trace}()) if trace !== nothing update_trace!(trace, cache.stats.nsteps, get_u(cache), get_fu(cache), nothing, nothing, nothing; last = Val(true)) @@ -134,52 +133,52 @@ include("jacobian.jl") include("ad.jl") include("default.jl") -@setup_workload begin - nlfuncs = ((NonlinearFunction{false}((u, p) -> u .* u .- p), 0.1), - (NonlinearFunction{false}((u, p) -> u .* u .- p), [0.1]), - (NonlinearFunction{true}((du, u, p) -> du .= u .* u .- p), [0.1])) - probs_nls = NonlinearProblem[] - for T in (Float32, Float64), (fn, u0) in nlfuncs - push!(probs_nls, NonlinearProblem(fn, T.(u0), T(2))) - end - - nls_algs = (NewtonRaphson(), TrustRegion(), LevenbergMarquardt(), PseudoTransient(), - GeneralBroyden(), GeneralKlement(), DFSane(), nothing) - - probs_nlls = NonlinearLeastSquaresProblem[] - nlfuncs = ((NonlinearFunction{false}((u, p) -> (u .^ 2 .- p)[1:1]), [0.1, 0.0]), - (NonlinearFunction{false}((u, p) -> vcat(u .* u .- p, u .* u .- p)), [0.1, 0.1]), - (NonlinearFunction{true}((du, u, p) -> du[1] = u[1] * u[1] - p, - resid_prototype = zeros(1)), [0.1, 0.0]), - (NonlinearFunction{true}((du, u, p) -> du .= vcat(u .* u .- p, u .* u .- p), - resid_prototype = zeros(4)), [0.1, 0.1])) - for (fn, u0) in nlfuncs - push!(probs_nlls, NonlinearLeastSquaresProblem(fn, u0, 2.0)) - end - nlfuncs = ((NonlinearFunction{false}((u, p) -> (u .^ 2 .- p)[1:1]), Float32[0.1, 0.0]), - (NonlinearFunction{false}((u, p) -> vcat(u .* u .- p, u .* u .- p)), - Float32[0.1, 0.1]), - (NonlinearFunction{true}((du, u, p) -> du[1] = u[1] * u[1] - p, - resid_prototype = zeros(Float32, 1)), Float32[0.1, 0.0]), - (NonlinearFunction{true}((du, u, p) -> du .= vcat(u .* u .- p, u .* u .- p), - resid_prototype = zeros(Float32, 4)), Float32[0.1, 0.1])) - for (fn, u0) in nlfuncs - push!(probs_nlls, NonlinearLeastSquaresProblem(fn, u0, 2.0f0)) - end - - nlls_algs = (LevenbergMarquardt(), GaussNewton(), - LevenbergMarquardt(; linsolve = LUFactorization()), 
- GaussNewton(; linsolve = LUFactorization())) - - @compile_workload begin - for prob in probs_nls, alg in nls_algs - solve(prob, alg, abstol = 1e-2) - end - for prob in probs_nlls, alg in nlls_algs - solve(prob, alg, abstol = 1e-2) - end - end -end +# @setup_workload begin +# nlfuncs = ((NonlinearFunction{false}((u, p) -> u .* u .- p), 0.1), +# (NonlinearFunction{false}((u, p) -> u .* u .- p), [0.1]), +# (NonlinearFunction{true}((du, u, p) -> du .= u .* u .- p), [0.1])) +# probs_nls = NonlinearProblem[] +# for T in (Float32, Float64), (fn, u0) in nlfuncs +# push!(probs_nls, NonlinearProblem(fn, T.(u0), T(2))) +# end + +# nls_algs = (NewtonRaphson(), TrustRegion(), LevenbergMarquardt(), PseudoTransient(), +# GeneralBroyden(), GeneralKlement(), DFSane(), nothing) + +# probs_nlls = NonlinearLeastSquaresProblem[] +# nlfuncs = ((NonlinearFunction{false}((u, p) -> (u .^ 2 .- p)[1:1]), [0.1, 0.0]), +# (NonlinearFunction{false}((u, p) -> vcat(u .* u .- p, u .* u .- p)), [0.1, 0.1]), +# (NonlinearFunction{true}((du, u, p) -> du[1] = u[1] * u[1] - p, +# resid_prototype = zeros(1)), [0.1, 0.0]), +# (NonlinearFunction{true}((du, u, p) -> du .= vcat(u .* u .- p, u .* u .- p), +# resid_prototype = zeros(4)), [0.1, 0.1])) +# for (fn, u0) in nlfuncs +# push!(probs_nlls, NonlinearLeastSquaresProblem(fn, u0, 2.0)) +# end +# nlfuncs = ((NonlinearFunction{false}((u, p) -> (u .^ 2 .- p)[1:1]), Float32[0.1, 0.0]), +# (NonlinearFunction{false}((u, p) -> vcat(u .* u .- p, u .* u .- p)), +# Float32[0.1, 0.1]), +# (NonlinearFunction{true}((du, u, p) -> du[1] = u[1] * u[1] - p, +# resid_prototype = zeros(Float32, 1)), Float32[0.1, 0.0]), +# (NonlinearFunction{true}((du, u, p) -> du .= vcat(u .* u .- p, u .* u .- p), +# resid_prototype = zeros(Float32, 4)), Float32[0.1, 0.1])) +# for (fn, u0) in nlfuncs +# push!(probs_nlls, NonlinearLeastSquaresProblem(fn, u0, 2.0f0)) +# end + +# nlls_algs = (LevenbergMarquardt(), GaussNewton(), +# LevenbergMarquardt(; linsolve = LUFactorization()), +# GaussNewton(; linsolve = LUFactorization())) + +# @compile_workload begin +# for prob in probs_nls, alg in nls_algs +# solve(prob, alg, abstol = 1e-2) +# end +# for prob in probs_nlls, alg in nlls_algs +# solve(prob, alg, abstol = 1e-2) +# end +# end +# end export RadiusUpdateSchemes diff --git a/src/jacobian.jl b/src/jacobian.jl index 41c7319a1..54f1c0f0e 100644 --- a/src/jacobian.jl +++ b/src/jacobian.jl @@ -3,8 +3,11 @@ Jᵀ end -SciMLBase.isinplace(JᵀJ::KrylovJᵀJ) = isinplace(JᵀJ.Jᵀ) +__maybe_symmetric(x::KrylovJᵀJ) = x.JᵀJ + +isinplace(JᵀJ::KrylovJᵀJ) = isinplace(JᵀJ.Jᵀ) +# Select if we are going to use sparse differentiation or not sparsity_detection_alg(_, _) = NoSparsityDetection() function sparsity_detection_alg(f, ad::AbstractSparseADType) if f.sparsity === nothing @@ -33,13 +36,21 @@ function jacobian!!(J::Union{AbstractMatrix{<:Number}, Nothing}, cache) @unpack f, uf, u, p, jac_cache, alg, fu2 = cache iip = isinplace(cache) if iip - has_jac(f) ? f.jac(J, u, p) : - sparse_jacobian!(J, alg.ad, jac_cache, uf, fu2, _maybe_mutable(u, alg.ad)) + if has_jac(f) + f.jac(J, u, p) + else + sparse_jacobian!(J, alg.ad, jac_cache, uf, fu2, u) + end + return J else - return has_jac(f) ? 
f.jac(u, p) : - sparse_jacobian!(J, alg.ad, jac_cache, uf, _maybe_mutable(u, alg.ad)) + if has_jac(f) + return f.jac(u, p) + elseif can_setindex(typeof(J)) + return sparse_jacobian!(J, alg.ad, jac_cache, uf, u) + else + return sparse_jacobian(alg.ad, jac_cache, uf, u) + end end - return J end # Scalar case jacobian!!(::Number, cache) = last(value_derivative(cache.uf, cache.u)) @@ -59,13 +70,13 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u, p, ::Val alg_wants_jac = (concrete_jac(alg) !== nothing && concrete_jac(alg)) # NOTE: The deepcopy is needed here since we are using the resid_prototype elsewhere - fu = f.resid_prototype === nothing ? (iip ? _mutable_zero(u) : _mutable(f(u, p))) : + fu = f.resid_prototype === nothing ? (iip ? zero(u) : f(u, p)) : (iip ? deepcopy(f.resid_prototype) : f.resid_prototype) if !has_analytic_jac && (linsolve_needs_jac || alg_wants_jac) sd = sparsity_detection_alg(f, alg.ad) ad = alg.ad - jac_cache = iip ? sparse_jacobian_cache(ad, sd, uf, fu, _maybe_mutable(u, ad)) : - sparse_jacobian_cache(ad, sd, uf, _maybe_mutable(u, ad); fx = fu) + jac_cache = iip ? sparse_jacobian_cache(ad, sd, uf, fu, u) : + sparse_jacobian_cache(ad, sd, uf, __maybe_mutable(u, ad); fx = fu) else jac_cache = nothing end @@ -76,11 +87,11 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u, p, ::Val JacVec(uf, u; fu, autodiff = __get_nonsparse_ad(alg.ad)) else if iip - jvp = (_, u, v) -> (du = similar(fu); f.jvp(du, v, u, p); du) - jvp! = (du, _, u, v) -> f.jvp(du, v, u, p) + jvp = (_, u, v) -> (du_ = similar(fu); f.jvp(du_, v, u, p); du_) + jvp! = (du_, _, u, v) -> f.jvp(du_, v, u, p) else jvp = (_, u, v) -> f.jvp(v, u, p) - jvp! = (du, _, u, v) -> (du .= f.jvp(v, u, p)) + jvp! = (du_, _, u, v) -> (du_ .= f.jvp(v, u, p)) end op = SparseDiffTools.FwdModeAutoDiffVecProd(f, u, (), jvp, jvp!) FunctionOperator(op, u, fu; isinplace = Val(true), outofplace = Val(false), @@ -89,16 +100,18 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u, p, ::Val else if has_analytic_jac f.jac_prototype === nothing ? undefmatrix(u) : f.jac_prototype + elseif f.jac_prototype === nothing + init_jacobian(jac_cache; preserve_immutable = Val(true)) else - f.jac_prototype === nothing ? init_jacobian(jac_cache) : f.jac_prototype + f.jac_prototype end end - du = _mutable_zero(u) + du = copy(u) if needsJᵀJ JᵀJ, Jᵀfu = __init_JᵀJ(J, _vec(fu), uf, u; f, - vjp_autodiff = __get_nonsparse_ad(_getproperty(alg, Val(:vjp_autodiff))), + vjp_autodiff = __get_nonsparse_ad(__getproperty(alg, Val(:vjp_autodiff))), jvp_autodiff = __get_nonsparse_ad(alg.ad)) end @@ -106,7 +119,8 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u, p, ::Val linprob_A = alg isa PseudoTransient ? (J - (1 / (convert(eltype(u), alg.alpha_initial))) * I) : (needsJᵀJ ? __maybe_symmetric(JᵀJ) : J) - linsolve = __setup_linsolve(linprob_A, needsJᵀJ ? Jᵀfu : fu, du, p, alg) + linsolve = linsolve_caches(linprob_A, needsJᵀJ ? Jᵀfu : fu, du, p, alg; + linsolve_kwargs) else linsolve = nothing end @@ -115,22 +129,33 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u, p, ::Val return uf, linsolve, J, fu, jac_cache, du end -function __setup_linsolve(A, b, u, p, alg) - linprob = LinearProblem(A, _vec(b); u0 = _vec(u)) +## Special Handling for Scalars +function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u::Number, p, + ::Val{false}; linsolve_with_JᵀJ::Val{needsJᵀJ} = Val(false), + kwargs...) 
where {needsJᵀJ, F} + # NOTE: Scalar `u` assumes scalar output from `f` + uf = SciMLBase.JacobianWrapper{false}(f, p) + needsJᵀJ && return uf, nothing, u, nothing, nothing, u, u, u + return uf, FakeLinearSolveJLCache(u, u), u, nothing, nothing, u +end - weight = similar(u) - recursivefill!(weight, true) +# Linear Solve Cache +function linsolve_caches(A, b, u, p, alg; linsolve_kwargs = (;)) + if alg.linsolve === nothing && A isa SMatrix && linsolve_kwargs === (;) + # Default handling for SArrays in LinearSolve is not great. Some parts are patched + # but there are quite a few unnecessary allocations + return FakeLinearSolveJLCache(A, b) + end + + linprob = LinearProblem(A, _vec(b); u0 = _vec(u), linsolve_kwargs...) + + weight = __init_ones(u) Pl, Pr = wrapprecs(alg.precs(A, nothing, u, p, nothing, nothing, nothing, nothing, nothing)..., weight) return init(linprob, alg.linsolve; alias_A = true, alias_b = true, Pl, Pr) end -__setup_linsolve(A::KrylovJᵀJ, b, u, p, alg) = __setup_linsolve(A.JᵀJ, b, u, p, alg) - -__get_nonsparse_ad(::AutoSparseForwardDiff) = AutoForwardDiff() -__get_nonsparse_ad(::AutoSparseFiniteDiff) = AutoFiniteDiff() -__get_nonsparse_ad(::AutoSparseZygote) = AutoZygote() -__get_nonsparse_ad(ad) = ad +linsolve_caches(A::KrylovJᵀJ, b, u, p, alg) = linsolve_caches(A.JᵀJ, b, u, p, alg) __init_JᵀJ(J::Number, args...; kwargs...) = zero(J), zero(J) function __init_JᵀJ(J::AbstractArray, fu, args...; kwargs...) @@ -180,24 +205,7 @@ function __concrete_vjp_autodiff(vjp_autodiff, jvp_autodiff, uf) end end -__maybe_symmetric(x) = Symmetric(x) -__maybe_symmetric(x::Number) = x -# LinearSolve with `nothing` doesn't dispatch correctly here -__maybe_symmetric(x::StaticArray) = x -__maybe_symmetric(x::SparseArrays.AbstractSparseMatrix) = x -__maybe_symmetric(x::SciMLOperators.AbstractSciMLOperator) = x -__maybe_symmetric(x::KrylovJᵀJ) = x.JᵀJ - -## Special Handling for Scalars -function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u::Number, p, - ::Val{false}; linsolve_with_JᵀJ::Val{needsJᵀJ} = Val(false), - kwargs...) where {needsJᵀJ, F} - # NOTE: Scalar `u` assumes scalar output from `f` - uf = SciMLBase.JacobianWrapper{false}(f, p) - needsJᵀJ && return uf, nothing, u, nothing, nothing, u, u, u - return uf, nothing, u, nothing, nothing, u -end - +# Generic Handling of Krylov Methods for Normal Form Linear Solves function __update_JᵀJ!(iip::Val, cache, sym::Symbol, J) return __update_JᵀJ!(iip, cache, sym, getproperty(cache, sym), J) end diff --git a/src/klement.jl b/src/klement.jl index ec32dc6b8..8a9640fd4 100644 --- a/src/klement.jl +++ b/src/klement.jl @@ -87,7 +87,7 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::GeneralKleme linsolve_alg = alg_.linsolve === nothing && u isa Array ? 
LUFactorization() : nothing alg = set_linsolve(alg_, linsolve_alg) - linsolve = __setup_linsolve(J, _vec(fu), _vec(du), p, alg) + linsolve = linsolve_caches(J, _vec(fu), _vec(du), p, alg) end abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u, diff --git a/src/levenberg.jl b/src/levenberg.jl index dcc07d85e..94e882223 100644 --- a/src/levenberg.jl +++ b/src/levenberg.jl @@ -232,7 +232,7 @@ function SciMLBase.__init(prob::Union{NonlinearProblem{uType, iip}, fill!(mat_tmp, zero(eltype(u))) rhs_tmp = vcat(_vec(fu1), _vec(u)) fill!(rhs_tmp, zero(eltype(u))) - linsolve = __setup_linsolve(mat_tmp, rhs_tmp, u, p, alg) + linsolve = linsolve_caches(mat_tmp, rhs_tmp, u, p, alg) end return LevenbergMarquardtCache{iip, !_unwrap_val(linsolve_with_JᵀJ)}(f, alg, u, copy(u), diff --git a/src/raphson.jl b/src/raphson.jl index 594b893e5..4c4125579 100644 --- a/src/raphson.jl +++ b/src/raphson.jl @@ -80,7 +80,7 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::NewtonRaphso kwargs...) where {uType, iip} alg = get_concrete_algorithm(alg_, prob) @unpack f, u0, p = prob - u = alias_u0 ? u0 : deepcopy(u0) + u = __maybe_unaliased(u0, alias_u0) fu1 = evaluate_f(prob, u) uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip); linsolve_kwargs) @@ -91,62 +91,37 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::NewtonRaphso ls_cache = init_linesearch_cache(alg.linesearch, f, u, p, fu1, Val(iip)) trace = init_nonlinearsolve_trace(alg, u, fu1, ApplyArray(__zero, J), du; kwargs...) - return NewtonRaphsonCache{iip}(f, alg, u, copy(u), fu1, fu2, du, p, uf, linsolve, J, + @bb u_prev = copy(u) + + return NewtonRaphsonCache{iip}(f, alg, u, u_prev, fu1, fu2, du, p, uf, linsolve, J, jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, reltol, prob, NLStats(1, 0, 0, 0, 0), ls_cache, tc_cache, trace) end -function perform_step!(cache::NewtonRaphsonCache{true}) - @unpack u, u_prev, fu1, f, p, alg, J, linsolve, du = cache - jacobian!!(J, cache) +function perform_step!(cache::NewtonRaphsonCache{iip}) where {iip} + @unpack alg = cache + + cache.J = jacobian!!(cache.J, cache) # u = u - J \ fu - linres = dolinsolve(alg.precs, linsolve; A = J, b = _vec(fu1), linu = _vec(du), - p, reltol = cache.abstol) + linres = dolinsolve(alg.precs, cache.linsolve; A = cache.J, b = _vec(cache.fu1), + linu = _vec(cache.du), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache - # Line Search - α = perform_linesearch!(cache.ls_cache, u, du) - _axpy!(-α, du, u) - f(cache.fu1, u, p) - - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), J, - cache.du, α) - - check_and_update!(cache, cache.fu1, cache.u, cache.u_prev) - - @. u_prev = u - cache.stats.nf += 1 - cache.stats.njacs += 1 - cache.stats.nsolve += 1 - cache.stats.nfactors += 1 - return nothing -end - -function perform_step!(cache::NewtonRaphsonCache{false}) - @unpack u, u_prev, fu1, f, p, alg, linsolve = cache - - cache.J = jacobian!!(cache.J, cache) - # u = u - J \ fu - if linsolve === nothing - cache.du = fu1 / cache.J - else - linres = dolinsolve(alg.precs, linsolve; A = cache.J, b = _vec(fu1), - linu = _vec(cache.du), p, reltol = cache.abstol) - cache.linsolve = linres.cache - end + !iip && (cache.du = linres.u) # Line Search - α = perform_linesearch!(cache.ls_cache, u, cache.du) - cache.u = @. 
u - α * cache.du # `u` might not support mutation - cache.fu1 = f(cache.u, p) + α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) + @bb axpy!(-α, cache.du, cache.u) + + evaluate_f(cache, cache.u) update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), cache.J, cache.du, α) check_and_update!(cache, cache.fu1, cache.u, cache.u_prev) - cache.u_prev = cache.u + @bb copyto!(cache.u_prev, cache.u) cache.stats.nf += 1 cache.stats.njacs += 1 cache.stats.nsolve += 1 diff --git a/src/trace.jl b/src/trace.jl index c458c7d07..e89efe956 100644 --- a/src/trace.jl +++ b/src/trace.jl @@ -151,8 +151,10 @@ function reset!(trace::NonlinearSolveTrace) end function Base.show(io::IO, trace::NonlinearSolveTrace) - for entry in trace.history - show(io, entry) + if trace.history !== nothing + foreach(entry -> show(io, entry), trace.history) + else + print(io, "Tracing Disabled") end return nothing end diff --git a/src/trustRegion.jl b/src/trustRegion.jl index 8b4041b75..5493aa4d7 100644 --- a/src/trustRegion.jl +++ b/src/trustRegion.jl @@ -248,7 +248,7 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::TrustRegion, uf, _, J, fu2, jac_cache, du, H, g = jacobian_caches(alg, f, u, p, Val(iip); linsolve_kwargs, linsolve_with_JᵀJ = Val(true), lininit = Val(false)) g = _restructure(fu1, g) - linsolve = u isa Number ? nothing : __setup_linsolve(J, fu2, du, p, alg) + linsolve = u isa Number ? nothing : linsolve_caches(J, fu2, du, p, alg) u_tmp = zero(u) u_cauchy = zero(u) diff --git a/src/utils.jl b/src/utils.jl index bf6d1152f..d3017d42f 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,5 +1,15 @@ const DEFAULT_NORM = DiffEqBase.NONLINEARSOLVE_DEFAULT_NORM +@concrete mutable struct FakeLinearSolveJLCache + A + b +end + +@concrete struct FakeLinearSolveJLResult + cache + u +end + # Ignores NaN function __findmin(f, x) return findmin(x) do xᵢ @@ -55,7 +65,7 @@ function default_adargs_to_adtype(; chunk_size = missing, autodiff = nothing, end """ -value_derivative(f, x) + value_derivative(f, x) Compute `f(x), d/dx f(x)` in the most efficient way. 
""" @@ -65,10 +75,6 @@ function value_derivative(f::F, x::R) where {F, R} ForwardDiff.value(out), ForwardDiff.extract_derivative(T, out) end -function value_derivative(f::F, x::SVector) where {F} - f(x), ForwardDiff.jacobian(f, x) -end - @inline value(x) = x @inline value(x::Dual) = ForwardDiff.value(x) @inline value(x::AbstractArray{<:Dual}) = map(ForwardDiff.value, x) @@ -82,6 +88,15 @@ end DEFAULT_PRECS(W, du, u, p, t, newW, Plprev, Prprev, cachedata) = nothing, nothing +function dolinsolve(precs::P, linsolve::FakeLinearSolveJLCache; A = nothing, + linu = nothing, b = nothing, du = nothing, p = nothing, weight = nothing, + cachedata = nothing, reltol = nothing, reuse_A_if_factorization = false) where {P} + A !== nothing && (linsolve.A = A) + b !== nothing && (linsolve.b = b) + linres = linsolve.A \ linsolve.b + return FakeLinearSolveJLResult(linsolve, linres) +end + function dolinsolve(precs::P, linsolve; A = nothing, linu = nothing, b = nothing, du = nothing, p = nothing, weight = nothing, cachedata = nothing, reltol = nothing, reuse_A_if_factorization = false) where {P} @@ -155,33 +170,32 @@ _mutable_zero(x::SArray) = MArray(x) _mutable(x) = x _mutable(x::SArray) = MArray(x) -_maybe_mutable(x, ::AbstractFiniteDifferencesMode) = _mutable(x) +# __maybe_mutable(x, ::AbstractFiniteDifferencesMode) = _mutable(x) # The shadow allocated for Enzyme needs to be mutable -_maybe_mutable(x, ::AutoSparseEnzyme) = _mutable(x) -_maybe_mutable(x, _) = x +__maybe_mutable(x, ::AutoSparseEnzyme) = _mutable(x) +__maybe_mutable(x, _) = x # Helper function to get value of `f(u, p)` function evaluate_f(prob::Union{NonlinearProblem{uType, iip}, NonlinearLeastSquaresProblem{uType, iip}}, u) where {uType, iip} @unpack f, u0, p = prob if iip - fu = f.resid_prototype === nothing ? zero(u) : f.resid_prototype + fu = f.resid_prototype === nothing ? similar(u) : f.resid_prototype f(fu, u, p) else - fu = _mutable(f(u, p)) + fu = f(u, p) end return fu end -evaluate_f(cache, u; fu = nothing) = evaluate_f(cache.f, u, cache.p, Val(cache.iip); fu) - -function evaluate_f(f, u, p, ::Val{iip}; fu = nothing) where {iip} - if iip - f(fu, u, p) - return fu +function evaluate_f(cache, u) + @unpack f, p = cache.prob + if isinplace(cache) + f(get_fu(cache), u, p) else - return f(u, p) + set_fu!(cache, f(u, p)) end + return nothing end """ @@ -206,7 +220,7 @@ end function __get_concrete_algorithm(alg, prob) @unpack sparsity, jac_prototype = prob.f use_sparse_ad = sparsity !== nothing || jac_prototype !== nothing - ad = if eltype(prob.u0) <: Complex + ad = if !ForwardDiff.can_dual(eltype(prob.u0)) # Use Finite Differencing use_sparse_ad ? 
AutoSparseFiniteDiff() : AutoFiniteDiff() else @@ -310,16 +324,16 @@ function __init_low_rank_jacobian(u, fu, threshold::Int) end # Check Singular Matrix -_issingular(x::Number) = iszero(x) -@generated function _issingular(x::T) where {T} +@inline _issingular(x::Number) = iszero(x) +@inline @generated function _issingular(x::T) where {T} hasmethod(issingular, Tuple{T}) && return :(issingular(x)) return :(__issingular(x)) end -__issingular(x::AbstractMatrix{T}) where {T} = cond(x) > inv(sqrt(eps(real(T)))) -__issingular(x) = false ## If SciMLOperator and such +@inline __issingular(x::AbstractMatrix{T}) where {T} = cond(x) > inv(sqrt(eps(real(T)))) +@inline __issingular(x) = false ## If SciMLOperator and such # Safe getproperty -@generated function _getproperty(s::S, ::Val{X}) where {S, X} +@generated function __getproperty(s::S, ::Val{X}) where {S, X} hasfield(S, X) && return :(s.$X) return :(nothing) end @@ -348,6 +362,7 @@ _try_factorize_and_check_singular!(::Nothing, x) = _issingular(x), false return :(@. y += α * x) end +# Non-square matrix @inline _needs_square_A(_, ::Number) = true @inline _needs_square_A(_, ::StaticArray) = true @inline _needs_square_A(alg, _) = LinearSolve.needs_square_A(alg.linsolve) @@ -355,9 +370,40 @@ end # Define special concatenation for certain Array combinations @inline _vcat(x, y) = vcat(x, y) +# LazyArrays for tracing __zero(x::AbstractArray) = zero(x) __zero(x) = x LazyArrays.applied_eltype(::typeof(__zero), x) = eltype(x) LazyArrays.applied_ndims(::typeof(__zero), x) = ndims(x) LazyArrays.applied_size(::typeof(__zero), x) = size(x) LazyArrays.applied_axes(::typeof(__zero), x) = axes(x) + +# SparseAD --> NonSparseAD +@inline __get_nonsparse_ad(::AutoSparseForwardDiff) = AutoForwardDiff() +@inline __get_nonsparse_ad(::AutoSparseFiniteDiff) = AutoFiniteDiff() +@inline __get_nonsparse_ad(::AutoSparseZygote) = AutoZygote() +@inline __get_nonsparse_ad(ad) = ad + +# Use Symmetric Matrices if known to be efficient +@inline __maybe_symmetric(x) = Symmetric(x) +@inline __maybe_symmetric(x::Number) = x +## LinearSolve with `nothing` doesn't dispatch correctly here +@inline __maybe_symmetric(x::StaticArray) = x +@inline __maybe_symmetric(x::SparseArrays.AbstractSparseMatrix) = x +@inline __maybe_symmetric(x::SciMLOperators.AbstractSciMLOperator) = x + +# Unalias +@inline __maybe_unaliased(x::Union{Number, SArray}, ::Bool) = x +@inline function __maybe_unaliased(x::AbstractArray, alias::Bool) + # Spend time coping iff we will mutate the array + (alias || !can_setindex(typeof(x))) && return x + return deepcopy(x) +end + +# Init ones +@inline function __init_ones(x) + w = similar(x) + recursivefill!(w, true) + return w +end +@inline __init_ones(x::StaticArray) = ones(typeof(x)) From 9bc8f5bacb706ee0c3ef9382e2270b1ec5a791db Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Wed, 29 Nov 2023 21:45:54 -0500 Subject: [PATCH 02/25] Reuse more code in Broyden --- src/NonlinearSolve.jl | 2 +- src/broyden.jl | 95 ++++++++++++++----------------------------- src/raphson.jl | 2 +- src/utils.jl | 38 ++++++++++++----- 4 files changed, 60 insertions(+), 77 deletions(-) diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl index f050bf007..f1782b8c1 100644 --- a/src/NonlinearSolve.jl +++ b/src/NonlinearSolve.jl @@ -25,7 +25,7 @@ import PrecompileTools: @recompile_invalidations, @compile_workload, @setup_work AbstractVectorOfArray, recursivecopy!, recursivefill! 
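# NOTE (illustrative sketch, not part of this patch): the `FakeLinearSolveJLCache` /
# `dolinsolve` fast path added to src/utils.jl above bypasses LinearSolve.jl for
# scalar and static problems and simply computes `A \ b`, because, as the comment in
# `linsolve_caches` notes, the default LinearSolve.jl handling of SArrays allocates
# unnecessarily. The primitive it leans on is just the static backslash solve:

using StaticArrays

A = @SMatrix [4.0 1.0; 1.0 3.0]
b = @SVector [1.0, 2.0]
x = A \ b   # fully static solve, no factorization cache required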
import SciMLBase: AbstractNonlinearAlgorithm, NLStats, _unwrap_val, has_jac, isinplace import SciMLOperators: FunctionOperator - import StaticArraysCore: StaticArray, SVector, SArray, MArray, Size, SMatrix + import StaticArraysCore: StaticArray, SVector, SArray, MArray, Size, SMatrix, MMatrix import UnPack: @unpack using ADTypes, LineSearches, SciMLBase, SimpleNonlinearSolve diff --git a/src/broyden.jl b/src/broyden.jl index 008ff589d..e0b69f19c 100644 --- a/src/broyden.jl +++ b/src/broyden.jl @@ -65,81 +65,46 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::GeneralBroyde termination_condition = nothing, internalnorm::F = DEFAULT_NORM, kwargs...) where {uType, iip, F} @unpack f, u0, p = prob - u = alias_u0 ? u0 : deepcopy(u0) + u = __maybe_unaliased(u0, alias_u0) fu = evaluate_f(prob, u) - du = _mutable_zero(u) + @bb du = copy(u) J⁻¹ = __init_identity_jacobian(u, fu) reset_tolerance = alg.reset_tolerance === nothing ? sqrt(eps(real(eltype(u)))) : alg.reset_tolerance reset_check = x -> abs(x) ≤ reset_tolerance + @bb u_prev = copy(u) + @bb fu2 = copy(fu) + @bb dfu = similar(fu) + @bb J⁻¹₂ = similar(u) + @bb J⁻¹df = similar(u) + abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u, termination_condition) trace = init_nonlinearsolve_trace(alg, u, fu, J⁻¹, du; uses_jac_inverse = Val(true), kwargs...) - return GeneralBroydenCache{iip}(f, alg, u, zero(u), du, fu, zero(fu), - zero(fu), p, J⁻¹, zero(_reshape(fu, 1, :)), _mutable_zero(u), false, 0, - alg.max_resets, maxiters, internalnorm, ReturnCode.Default, abstol, reltol, - reset_tolerance, reset_check, prob, NLStats(1, 0, 0, 0, 0), + return GeneralBroydenCache{iip}(f, alg, u, u_prev, du, fu, fu2, dfu, p, J⁻¹, + J⁻¹₂, J⁻¹df, false, 0, alg.max_resets, maxiters, internalnorm, ReturnCode.Default, + abstol, reltol, reset_tolerance, reset_check, prob, NLStats(1, 0, 0, 0, 0), init_linesearch_cache(alg.linesearch, f, u, p, fu, Val(iip)), tc_cache, trace) end -function perform_step!(cache::GeneralBroydenCache{true}) - @unpack f, p, du, fu, fu2, dfu, u, u_prev, J⁻¹, J⁻¹df, J⁻¹₂ = cache - T = eltype(u) - - mul!(_vec(du), J⁻¹, _vec(fu)) - α = perform_linesearch!(cache.ls_cache, u, du) - _axpy!(-α, du, u) - f(fu2, u, p) - - update_trace_with_invJ!(cache.trace, cache.stats.nsteps + 1, get_u(cache), - get_fu(cache), J⁻¹, du, α) - - check_and_update!(cache, fu2, u, u_prev) - cache.stats.nf += 1 - - cache.force_stop && return nothing +function perform_step!(cache::GeneralBroydenCache{iip}) where {iip} + T = eltype(cache.u) - # Update the inverse jacobian - dfu .= fu2 .- fu + @bb cache.du = cache.J⁻¹ × vec(cache.fu) + α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) + @bb axpy!(-α, cache.du, cache.u) - if all(cache.reset_check, du) || all(cache.reset_check, dfu) - if cache.resets ≥ cache.max_resets - cache.retcode = ReturnCode.ConvergenceFailure - cache.force_stop = true - return nothing - end - fill!(J⁻¹, 0) - J⁻¹[diagind(J⁻¹)] .= T(1) - cache.resets += 1 + if iip + cache.f(cache.fu2, cache.u, cache.p) else - du .*= -1 - mul!(_vec(J⁻¹df), J⁻¹, _vec(dfu)) - mul!(J⁻¹₂, _vec(du)', J⁻¹) - denom = dot(du, J⁻¹df) - du .= (du .- J⁻¹df) ./ ifelse(iszero(denom), T(1e-5), denom) - mul!(J⁻¹, _vec(du), J⁻¹₂, 1, 1) + cache.fu2 = cache.f(cache.u, cache.p) end - fu .= fu2 - @. 
u_prev = u - - return nothing -end - -function perform_step!(cache::GeneralBroydenCache{false}) - @unpack f, p = cache - - T = eltype(cache.u) - - cache.du = _restructure(cache.du, cache.J⁻¹ * _vec(cache.fu)) - α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) - cache.u = cache.u .- α * cache.du - cache.fu2 = f(cache.u, p) update_trace_with_invJ!(cache.trace, cache.stats.nsteps + 1, get_u(cache), - get_fu(cache), cache.J⁻¹, cache.du, α) + cache.fu2, cache.J⁻¹, cache.du, α) check_and_update!(cache, cache.fu2, cache.u, cache.u_prev) cache.stats.nf += 1 @@ -147,25 +112,27 @@ function perform_step!(cache::GeneralBroydenCache{false}) cache.force_stop && return nothing # Update the inverse jacobian - cache.dfu = cache.fu2 .- cache.fu + @bb @. cache.dfu = cache.fu2 - cache.fu + if all(cache.reset_check, cache.du) || all(cache.reset_check, cache.dfu) if cache.resets ≥ cache.max_resets cache.retcode = ReturnCode.ConvergenceFailure cache.force_stop = true return nothing end - cache.J⁻¹ = __init_identity_jacobian(cache.u, cache.fu) + cache.J⁻¹ = __reinit_identity_jacobian!!(cache.J⁻¹) cache.resets += 1 else - cache.du = -cache.du - cache.J⁻¹df = _restructure(cache.J⁻¹df, cache.J⁻¹ * _vec(cache.dfu)) - cache.J⁻¹₂ = _vec(cache.du)' * cache.J⁻¹ + @bb cache.du .*= -1 + @bb cache.J⁻¹df = cache.J⁻¹ × vec(cache.dfu) + @bb cache.J⁻¹₂ = cache.J⁻¹ × vec(cache.du) denom = dot(cache.du, cache.J⁻¹df) - cache.du = (cache.du .- cache.J⁻¹df) ./ ifelse(iszero(denom), T(1e-5), denom) - cache.J⁻¹ = cache.J⁻¹ .+ _vec(cache.du) * cache.J⁻¹₂ + @bb @. cache.du = (cache.du - cache.J⁻¹df) / ifelse(iszero(denom), T(1e-5), denom) + @bb cache.J⁻¹ += vec(cache.du) × transpose(cache.J⁻¹₂) end - cache.fu = cache.fu2 - cache.u_prev = @. cache.u + + @bb copyto!(cache.fu, cache.fu2) + @bb copyto!(cache.u_prev, cache.u) return nothing end diff --git a/src/raphson.jl b/src/raphson.jl index 4c4125579..52e47ac01 100644 --- a/src/raphson.jl +++ b/src/raphson.jl @@ -114,7 +114,7 @@ function perform_step!(cache::NewtonRaphsonCache{iip}) where {iip} α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) @bb axpy!(-α, cache.du, cache.u) - evaluate_f(cache, cache.u) + evaluate_f(cache, cache.u, cache.p) update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), cache.J, cache.du, α) diff --git a/src/utils.jl b/src/utils.jl index d3017d42f..ab2db093f 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -188,12 +188,11 @@ function evaluate_f(prob::Union{NonlinearProblem{uType, iip}, return fu end -function evaluate_f(cache, u) - @unpack f, p = cache.prob +function evaluate_f(cache, u, p) if isinplace(cache) - f(get_fu(cache), u, p) + cache.prob.f(get_fu(cache), u, p) else - set_fu!(cache, f(u, p)) + set_fu!(cache, cache.prob.f(u, p)) end return nothing end @@ -301,14 +300,31 @@ function check_and_update!(tc_cache, cache, fu, u, uprev, end end -__init_identity_jacobian(u::Number, _) = u -function __init_identity_jacobian(u, fu) - return convert(parameterless_type(_mutable(u)), - Matrix{eltype(u)}(I, length(fu), length(u))) +@inline __init_identity_jacobian(u::Number, _) = one(u) +@inline function __init_identity_jacobian(u, fu) + J = similar(fu, promote_type(eltype(fu), eltype(u)), length(fu), length(u)) + fill!(J, zero(eltype(J))) + J[diagind(J)] .= one(eltype(J)) + return J end -function __init_identity_jacobian(u::StaticArray, fu) - return convert(MArray{Tuple{length(fu), length(u)}}, - Matrix{eltype(u)}(I, length(fu), length(u))) +@inline function __init_identity_jacobian(u::StaticArray, fu::StaticArray) + T = 
promote_type(eltype(fu), eltype(u)) + return MArray{Tuple{prod(Size(fu)), prod(Size(u))}, T}(I) +end +@inline function __init_identity_jacobian(u::SArray, fu::SArray) + T = promote_type(eltype(fu), eltype(u)) + return SArray{Tuple{prod(Size(fu)), prod(Size(u))}, T}(I) +end + +@inline __reinit_identity_jacobian!!(J::Number) = one(J) +@inline function __reinit_identity_jacobian!!(J::AbstractMatrix) + fill!(J, zero(eltype(J))) + J[diagind(J)] .= one(eltype(J)) + return J +end +@inline function __reinit_identity_jacobian!!(J::SMatrix) + S = Size(J) + return SArray{Tuple{S[1], S[2]}, eltype(J)}(I) end function __init_low_rank_jacobian(u::StaticArray, fu, threshold::Int) From a5c6195c4b8952078542082ecc22d3328a81b18f Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Wed, 29 Nov 2023 22:01:55 -0500 Subject: [PATCH 03/25] Share reinit code --- src/NonlinearSolve.jl | 36 +++++++++++++++++++++++++++++++++++- src/broyden.jl | 29 +++-------------------------- src/raphson.jl | 28 ---------------------------- 3 files changed, 38 insertions(+), 55 deletions(-) diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl index f1782b8c1..9096525ee 100644 --- a/src/NonlinearSolve.jl +++ b/src/NonlinearSolve.jl @@ -8,7 +8,8 @@ import Reexport: @reexport import PrecompileTools: @recompile_invalidations, @compile_workload, @setup_workload @recompile_invalidations begin - using DiffEqBase, LazyArrays, LinearAlgebra, LinearSolve, Printf, SparseArrays, + using DiffEqBase, + LazyArrays, LinearAlgebra, LinearSolve, Printf, SparseArrays, SparseDiffTools import ADTypes: AbstractFiniteDifferencesMode @@ -51,6 +52,39 @@ abstract type AbstractNonlinearSolveCache{iip} end isinplace(::AbstractNonlinearSolveCache{iip}) where {iip} = iip +function SciMLBase.reinit!(cache::AbstractNonlinearSolveCache{iip}, u0 = get_u(cache); + p = cache.p, abstol = cache.abstol, reltol = cache.reltol, + maxiters = cache.maxiters, alias_u0 = false, + termination_condition = get_termination_mode(cache.tc_cache)) where {iip} + cache.p = p + if iip + recursivecopy!(get_u(cache), u0) + cache.f(cache.fu1, get_u(cache), p) + else + cache.u = __maybe_unaliased(u0, alias_u0) + set_fu!(cache, cache.f(cache.u, p)) + end + + reset!(cache.trace) + abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, get_fu(cache), + get_u(cache), termination_condition) + + cache.abstol = abstol + cache.reltol = reltol + cache.tc_cache = tc_cache + cache.maxiters = maxiters + cache.stats.nf = 1 + cache.stats.nsteps = 1 + cache.force_stop = false + cache.retcode = ReturnCode.Default + + __reinit_internal!(cache) + + return cache +end + +__reinit_internal!(cache::AbstractNonlinearSolveCache) = nothing + function Base.show(io::IO, alg::AbstractNonlinearSolveAlgorithm) str = "$(nameof(typeof(alg)))(" modifiers = String[] diff --git a/src/broyden.jl b/src/broyden.jl index e0b69f19c..dbc4f5131 100644 --- a/src/broyden.jl +++ b/src/broyden.jl @@ -137,31 +137,8 @@ function perform_step!(cache::GeneralBroydenCache{iip}) where {iip} return nothing end -function SciMLBase.reinit!(cache::GeneralBroydenCache{iip}, u0 = cache.u; p = cache.p, - abstol = cache.abstol, reltol = cache.reltol, maxiters = cache.maxiters, - termination_condition = get_termination_mode(cache.tc_cache)) where {iip} - cache.p = p - if iip - recursivecopy!(cache.u, u0) - cache.f(cache.fu, cache.u, p) - else - # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter - cache.u = u0 - cache.fu = cache.f(cache.u, p) - end - - reset!(cache.trace) - abstol, reltol, tc_cache = 
init_termination_cache(abstol, reltol, cache.fu, cache.u, - termination_condition) - - cache.abstol = abstol - cache.reltol = reltol - cache.tc_cache = tc_cache - cache.maxiters = maxiters - cache.stats.nf = 1 - cache.stats.nsteps = 1 +function __reinit_internal!(cache::GeneralBroydenCache) + cache.J⁻¹ = __reinit_identity_jacobian!!(cache.J⁻¹) cache.resets = 0 - cache.force_stop = false - cache.retcode = ReturnCode.Default - return cache + return nothing end diff --git a/src/raphson.jl b/src/raphson.jl index 52e47ac01..07b155f1c 100644 --- a/src/raphson.jl +++ b/src/raphson.jl @@ -128,31 +128,3 @@ function perform_step!(cache::NewtonRaphsonCache{iip}) where {iip} cache.stats.nfactors += 1 return nothing end - -function SciMLBase.reinit!(cache::NewtonRaphsonCache{iip}, u0 = cache.u; p = cache.p, - abstol = cache.abstol, reltol = cache.reltol, maxiters = cache.maxiters, - termination_condition = get_termination_mode(cache.tc_cache)) where {iip} - cache.p = p - if iip - recursivecopy!(cache.u, u0) - cache.f(cache.fu1, cache.u, p) - else - # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter - cache.u = u0 - cache.fu1 = cache.f(cache.u, p) - end - - reset!(cache.trace) - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, cache.fu1, cache.u, - termination_condition) - - cache.abstol = abstol - cache.reltol = reltol - cache.tc_cache = tc_cache - cache.maxiters = maxiters - cache.stats.nf = 1 - cache.stats.nsteps = 1 - cache.force_stop = false - cache.retcode = ReturnCode.Default - return cache -end From f147663015cef79f1ed75171e6d1014b2941e755 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Thu, 30 Nov 2023 01:14:46 -0500 Subject: [PATCH 04/25] Reuse Klement Code --- Project.toml | 4 +- src/NonlinearSolve.jl | 9 +-- src/klement.jl | 173 +++++++++++++----------------------------- src/utils.jl | 10 +-- 4 files changed, 60 insertions(+), 136 deletions(-) diff --git a/Project.toml b/Project.toml index 8a42f9d21..9385b14a2 100644 --- a/Project.toml +++ b/Project.toml @@ -26,7 +26,7 @@ SciMLOperators = "c0aeaf25-5076-4817-a8d5-81caf7dfa961" SimpleNonlinearSolve = "727e6d20-b764-4bd8-a329-72de5adea6c7" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SparseDiffTools = "47a9eef4-7e08-11e9-0b38-333d64bd3804" -StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" [weakdeps] @@ -75,7 +75,6 @@ SimpleNonlinearSolve = "1" # FIXME: Don't update the version in this PR. 
Using SparseArrays = "<0.0.1, 1" SparseDiffTools = "2.14" StaticArrays = "1" -StaticArraysCore = "1.4" Symbolics = "5" Test = "1" UnPack = "1.0" @@ -99,7 +98,6 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" SparseDiffTools = "47a9eef4-7e08-11e9-0b38-333d64bd3804" -StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl index 9096525ee..63987898b 100644 --- a/src/NonlinearSolve.jl +++ b/src/NonlinearSolve.jl @@ -8,9 +8,8 @@ import Reexport: @reexport import PrecompileTools: @recompile_invalidations, @compile_workload, @setup_workload @recompile_invalidations begin - using DiffEqBase, - LazyArrays, LinearAlgebra, LinearSolve, Printf, SparseArrays, - SparseDiffTools + using ADTypes, DiffEqBase, LazyArrays, LineSearches, LinearAlgebra, LinearSolve, Printf, + SciMLBase, SimpleNonlinearSolve, SparseArrays, SparseDiffTools, StaticArrays import ADTypes: AbstractFiniteDifferencesMode import ArrayInterface: undefmatrix, restructure, can_setindex, @@ -26,10 +25,8 @@ import PrecompileTools: @recompile_invalidations, @compile_workload, @setup_work AbstractVectorOfArray, recursivecopy!, recursivefill! import SciMLBase: AbstractNonlinearAlgorithm, NLStats, _unwrap_val, has_jac, isinplace import SciMLOperators: FunctionOperator - import StaticArraysCore: StaticArray, SVector, SArray, MArray, Size, SMatrix, MMatrix + import StaticArrays: StaticArray, SVector, SArray, MArray, Size, SMatrix, MMatrix import UnPack: @unpack - - using ADTypes, LineSearches, SciMLBase, SimpleNonlinearSolve end @reexport using ADTypes, LineSearches, SciMLBase, SimpleNonlinearSolve diff --git a/src/klement.jl b/src/klement.jl index 8a9640fd4..4296defcf 100644 --- a/src/klement.jl +++ b/src/klement.jl @@ -74,38 +74,43 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::GeneralKleme termination_condition = nothing, internalnorm::F = DEFAULT_NORM, linsolve_kwargs = (;), kwargs...) where {uType, iip, F} @unpack f, u0, p = prob - u = alias_u0 ? u0 : deepcopy(u0) + u = __maybe_unaliased(u0, alias_u0) fu = evaluate_f(prob, u) J = __init_identity_jacobian(u, fu) - du = _mutable_zero(u) + @bb du = similar(u) if u isa Number - linsolve = nothing + linsolve = FakeLinearSolveJLCache(J, fu) alg = alg_ else # For General Julia Arrays default to LU Factorization - linsolve_alg = alg_.linsolve === nothing && u isa Array ? LUFactorization() : - nothing + linsolve_alg = (alg_.linsolve === nothing && (u isa Array || u isa StaticArray)) ? + LUFactorization() : nothing alg = set_linsolve(alg_, linsolve_alg) - linsolve = linsolve_caches(J, _vec(fu), _vec(du), p, alg) + linsolve = linsolve_caches(J, _vec(fu), _vec(du), p, alg; linsolve_kwargs) end abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u, termination_condition) trace = init_nonlinearsolve_trace(alg, u, fu, J, du; kwargs...) 
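# NOTE (illustrative sketch, not part of this patch): for orientation, a minimal
# end-user call that exercises this `__init`/`perform_step!` pair. The problem mirrors
# the (temporarily commented-out) precompile workload from the first commit, so only
# the local names are new; convergence to sqrt(2) is the expected outcome, not a
# guarantee of this intermediate commit:

using NonlinearSolve

f(u, p) = u .* u .- p
prob = NonlinearProblem{false}(f, [0.1], 2.0)
sol = solve(prob, GeneralKlement(); abstol = 1e-9)
# sol.u[1] ≈ sqrt(2)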
- return GeneralKlementCache{iip}(f, alg, u, zero(u), fu, zero(fu), du, p, linsolve, - J, zero(J), zero(J), _vec(zero(fu)), _vec(zero(fu)), 0, false, - maxiters, internalnorm, ReturnCode.Default, abstol, reltol, prob, - NLStats(1, 0, 0, 0, 0), + @bb u_prev = copy(u) + @bb fu2 = similar(fu) + @bb J_cache = similar(J) + @bb J_cache2 = similar(J) + @bb Jᵀ²du = similar(fu) + @bb Jdu = similar(fu) + + return GeneralKlementCache{iip}(f, alg, u, u_prev, fu, fu2, du, p, linsolve, J, J_cache, + J_cache2, Jᵀ²du, Jdu, 0, false, maxiters, internalnorm, ReturnCode.Default, abstol, + reltol, prob, NLStats(1, 0, 0, 0, 0), init_linesearch_cache(alg.linesearch, f, u, p, fu, Val(iip)), tc_cache, trace) end -function perform_step!(cache::GeneralKlementCache{true}) - @unpack u, u_prev, fu, f, p, alg, J, linsolve, du = cache - T = eltype(J) - - singular, fact_done = _try_factorize_and_check_singular!(linsolve, J) +function perform_step!(cache::GeneralKlementCache{iip}) where {iip} + @unpack linsolve, alg = cache + T = eltype(cache.J) + singular, fact_done = __try_factorize_and_check_singular!(linsolve, cache.J) if singular if cache.resets == alg.max_resets @@ -114,88 +119,33 @@ function perform_step!(cache::GeneralKlementCache{true}) return nothing end fact_done = false - fill!(J, zero(T)) - J[diagind(J)] .= T(1) + cache.J = __reinit_identity_jacobian!!(cache.J) cache.resets += 1 end # u = u - J \ fu - linres = dolinsolve(alg.precs, linsolve; A = ifelse(fact_done, nothing, J), - b = _vec(fu), linu = _vec(du), p, reltol = cache.abstol) + linres = dolinsolve(alg.precs, cache.linsolve; A = cache.J, b = _vec(cache.fu), + linu = _vec(cache.du), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache - # Line Search - α = perform_linesearch!(cache.ls_cache, u, du) - _axpy!(-α, du, u) - f(cache.fu2, u, p) - - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), cache.fu2, J, - cache.du, α) - - check_and_update!(cache, cache.fu2, cache.u, cache.u_prev) - cache.stats.nf += 1 - cache.stats.nsolve += 1 - cache.stats.nfactors += 1 - - cache.force_stop && return nothing - - # Update the Jacobian - cache.du .*= -1 - cache.J_cache .= cache.J' .^ 2 - cache.Jdu .= _vec(du) .^ 2 - mul!(cache.Jᵀ²du, cache.J_cache, cache.Jdu) - mul!(cache.Jdu, J, _vec(du)) - cache.fu .= cache.fu2 .- cache.fu - cache.fu .= _restructure(cache.fu, - (_vec(cache.fu) .- cache.Jdu) ./ max.(cache.Jᵀ²du, eps(real(T)))) - mul!(cache.J_cache, _vec(cache.fu), _vec(du)') - cache.J_cache .*= J - mul!(cache.J_cache2, cache.J_cache, J) - J .+= cache.J_cache2 - - @. 
u_prev = u - cache.fu .= cache.fu2 - - return nothing -end - -function perform_step!(cache::GeneralKlementCache{false}) - @unpack fu, f, p, alg, J, linsolve = cache + !iip && (cache.du = linres.u) - T = eltype(J) - - singular, fact_done = _try_factorize_and_check_singular!(linsolve, J) - - if singular - if cache.resets == alg.max_resets - cache.force_stop = true - cache.retcode = ReturnCode.ConvergenceFailure - return nothing - end - fact_done = false - cache.J = __init_identity_jacobian(cache.u, fu) - cache.resets += 1 - end + # Line Search + α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) + @bb axpy!(-α, cache.du, cache.u) - # u = u - J \ fu - if linsolve === nothing - cache.du = fu / cache.J + if iip + cache.f(cache.fu2, cache.u, cache.p) else - linres = dolinsolve(alg.precs, linsolve; A = ifelse(fact_done, nothing, J), - b = _vec(fu), linu = _vec(cache.du), p, reltol = cache.abstol) - cache.linsolve = linres.cache + cache.fu2 = cache.f(cache.u, cache.p) end - # Line Search - α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) - cache.u = @. cache.u - α * cache.du # `u` might not support mutation - cache.fu2 = f(cache.u, p) - - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), cache.fu2, J, + update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), cache.fu2, cache.J, cache.du, α) check_and_update!(cache, cache.fu2, cache.u, cache.u_prev) - cache.u_prev = cache.u + @bb copyto!(cache.u_prev, cache.u) + cache.stats.nf += 1 cache.stats.nsolve += 1 cache.stats.nfactors += 1 @@ -203,46 +153,27 @@ function perform_step!(cache::GeneralKlementCache{false}) cache.force_stop && return nothing # Update the Jacobian - cache.du = -cache.du - cache.J_cache = cache.J' .^ 2 - cache.Jdu = _vec(cache.du) .^ 2 - cache.Jᵀ²du = cache.J_cache * cache.Jdu - cache.Jdu = J * _vec(cache.du) - cache.fu = cache.fu2 .- cache.fu - cache.fu = _restructure(cache.fu, - (_vec(cache.fu) .- cache.Jdu) ./ max.(cache.Jᵀ²du, eps(real(T)))) - cache.J_cache = ((_vec(cache.fu) * _vec(cache.du)') .* J) * J - cache.J = J .+ cache.J_cache - - cache.fu = cache.fu2 + @bb cache.du .*= -1 + @bb cache.J_cache .= cache.J' .^ 2 + @bb @. cache.Jdu = cache.du ^ 2 + @bb cache.Jᵀ²du = cache.J_cache × vec(cache.Jdu) + @bb cache.Jdu = cache.J × vec(cache.du) + @bb @. cache.fu = cache.fu2 - cache.fu - return nothing -end + @bb @. cache.fu = (cache.fu - cache.Jdu) / max(cache.Jᵀ²du, eps(real(T))) -function SciMLBase.reinit!(cache::GeneralKlementCache{iip}, u0 = cache.u; p = cache.p, - abstol = cache.abstol, reltol = cache.reltol, maxiters = cache.maxiters, - termination_condition = get_termination_mode(cache.tc_cache)) where {iip} - cache.p = p - if iip - recursivecopy!(cache.u, u0) - cache.f(cache.fu, cache.u, p) - else - # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter - cache.u = u0 - cache.fu = cache.f(cache.u, p) - end + @bb cache.J_cache = vec(cache.fu) × transpose(_vec(cache.du)) + @bb @. 
cache.J_cache *= cache.J + @bb cache.J_cache2 = cache.J_cache × cache.J + @bb cache.J .+= cache.J_cache2 - reset!(cache.trace) - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, cache.fu, cache.u, - termination_condition) + @bb copyto!(cache.fu, cache.fu2) - cache.abstol = abstol - cache.reltol = reltol - cache.tc_cache = tc_cache - cache.maxiters = maxiters - cache.stats.nf = 1 - cache.stats.nsteps = 1 - cache.force_stop = false - cache.retcode = ReturnCode.Default - return cache + return nothing +end + +function __reinit_internal!(cache::GeneralKlementCache) + cache.J = __reinit_identity_jacobian!!(cache.J) + cache.resets = 0 + return nothing end diff --git a/src/utils.jl b/src/utils.jl index ab2db093f..bc38d9257 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -356,8 +356,8 @@ end # If factorization is LU then perform that and update the linsolve cache # else check if the matrix is singular -function _try_factorize_and_check_singular!(linsolve, X) - if linsolve.cacheval isa LU +function __try_factorize_and_check_singular!(linsolve, X) + if linsolve.cacheval isa LU || linsolve.cacheval isa StaticArrays.LU # LU Factorization was used linsolve.A = X linsolve.cacheval = LinearSolve.do_factorization(linsolve.alg, X, linsolve.b, @@ -368,11 +368,9 @@ function _try_factorize_and_check_singular!(linsolve, X) end return _issingular(X), false end -_try_factorize_and_check_singular!(::Nothing, x) = _issingular(x), false - -@inline _reshape(x, args...) = reshape(x, args...) -@inline _reshape(x::Number, args...) = x +__try_factorize_and_check_singular!(::FakeLinearSolveJLCache, x) = _issingular(x), false +# TODO: Remove. handled in MaybeInplace.jl @generated function _axpy!(α, x, y) hasmethod(axpy!, Tuple{α, x, y}) && return :(axpy!(α, x, y)) return :(@. 
y += α * x) From 4f2dec04fa30432b19062f4726cb8cc487ef02ab Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Thu, 30 Nov 2023 15:35:07 -0500 Subject: [PATCH 05/25] Make the internal field names more consistent --- src/NonlinearSolve.jl | 21 +++++++++-------- src/broyden.jl | 49 +++++++++++++++----------------------- src/jacobian.jl | 11 +++++---- src/klement.jl | 55 +++++++++++++++++-------------------------- src/raphson.jl | 31 +++++++++++------------- src/trace.jl | 20 ++++++++++++++++ src/utils.jl | 16 +++++++------ 7 files changed, 101 insertions(+), 102 deletions(-) diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl index 63987898b..d55527c90 100644 --- a/src/NonlinearSolve.jl +++ b/src/NonlinearSolve.jl @@ -117,8 +117,9 @@ end function not_terminated(cache::AbstractNonlinearSolveCache) return !cache.force_stop && cache.stats.nsteps < cache.maxiters end -get_fu(cache::AbstractNonlinearSolveCache) = cache.fu1 -set_fu!(cache::AbstractNonlinearSolveCache, fu) = (cache.fu1 = fu) + +get_fu(cache::AbstractNonlinearSolveCache) = cache.fu +set_fu!(cache::AbstractNonlinearSolveCache, fu) = (cache.fu = fu) get_u(cache::AbstractNonlinearSolveCache) = cache.u SciMLBase.set_u!(cache::AbstractNonlinearSolveCache, u) = (cache.u = u) @@ -152,17 +153,17 @@ include("trace.jl") include("extension_algs.jl") include("linesearch.jl") include("raphson.jl") -include("trustRegion.jl") -include("levenberg.jl") -include("gaussnewton.jl") -include("dfsane.jl") -include("pseudotransient.jl") +# include("trustRegion.jl") +# include("levenberg.jl") +# include("gaussnewton.jl") +# include("dfsane.jl") +# include("pseudotransient.jl") include("broyden.jl") include("klement.jl") -include("lbroyden.jl") +# include("lbroyden.jl") include("jacobian.jl") -include("ad.jl") -include("default.jl") +# include("ad.jl") +# include("default.jl") # @setup_workload begin # nlfuncs = ((NonlinearFunction{false}((u, p) -> u .* u .- p), 0.1), diff --git a/src/broyden.jl b/src/broyden.jl index dbc4f5131..d1c8ac433 100644 --- a/src/broyden.jl +++ b/src/broyden.jl @@ -31,15 +31,14 @@ end f alg u - u_prev + u_cache du fu - fu2 + fu_cache dfu p J⁻¹ - J⁻¹₂ - J⁻¹df + J⁻¹dfu force_stop::Bool resets::Int max_resets::Int @@ -57,9 +56,6 @@ end trace end -get_fu(cache::GeneralBroydenCache) = cache.fu -set_fu!(cache::GeneralBroydenCache, fu) = (cache.fu = fu) - function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::GeneralBroyden, args...; alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, termination_condition = nothing, internalnorm::F = DEFAULT_NORM, @@ -73,19 +69,18 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::GeneralBroyde alg.reset_tolerance reset_check = x -> abs(x) ≤ reset_tolerance - @bb u_prev = copy(u) - @bb fu2 = copy(fu) + @bb u_cache = copy(u) + @bb fu_cache = similar(fu) @bb dfu = similar(fu) - @bb J⁻¹₂ = similar(u) - @bb J⁻¹df = similar(u) + @bb J⁻¹dfu = similar(u) abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u, termination_condition) trace = init_nonlinearsolve_trace(alg, u, fu, J⁻¹, du; uses_jac_inverse = Val(true), kwargs...) 
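# NOTE (illustrative sketch, not part of this patch): for reference, the classical
# "good Broyden" rank-one update of the inverse Jacobian that the `@bb` kernel in
# `perform_step!` below is an allocation-aware variant of. Dense and out-of-place,
# with illustrative names (Δu = change in iterate, Δf = change in residual); the
# `1e-5` guard against a vanishing denominator mirrors the one used in the patch:

using LinearAlgebra

function broyden_inverse_update(J⁻¹, Δu, Δf; tol = 1e-5)
    v = J⁻¹ * Δf                                    # J⁻¹ Δf
    denom = dot(Δu, v)                              # Δuᵀ J⁻¹ Δf
    denom = ifelse(iszero(denom), oftype(denom, tol), denom)
    return J⁻¹ + ((Δu - v) / denom) * (Δu' * J⁻¹)   # rank-one correction
end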
- return GeneralBroydenCache{iip}(f, alg, u, u_prev, du, fu, fu2, dfu, p, J⁻¹, - J⁻¹₂, J⁻¹df, false, 0, alg.max_resets, maxiters, internalnorm, ReturnCode.Default, + return GeneralBroydenCache{iip}(f, alg, u, u_cache, du, fu, fu_cache, dfu, p, + J⁻¹, J⁻¹dfu, false, 0, alg.max_resets, maxiters, internalnorm, ReturnCode.Default, abstol, reltol, reset_tolerance, reset_check, prob, NLStats(1, 0, 0, 0, 0), init_linesearch_cache(alg.linesearch, f, u, p, fu, Val(iip)), tc_cache, trace) end @@ -97,22 +92,16 @@ function perform_step!(cache::GeneralBroydenCache{iip}) where {iip} α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) @bb axpy!(-α, cache.du, cache.u) - if iip - cache.f(cache.fu2, cache.u, cache.p) - else - cache.fu2 = cache.f(cache.u, cache.p) - end - - update_trace_with_invJ!(cache.trace, cache.stats.nsteps + 1, get_u(cache), - cache.fu2, cache.J⁻¹, cache.du, α) + evaluate_f(cache, cache.u, cache.p) - check_and_update!(cache, cache.fu2, cache.u, cache.u_prev) + update_trace!(cache, α) + check_and_update!(cache, cache.fu, cache.u, cache.u_cache) cache.stats.nf += 1 cache.force_stop && return nothing # Update the inverse jacobian - @bb @. cache.dfu = cache.fu2 - cache.fu + @bb @. cache.dfu = cache.fu - cache.fu_cache if all(cache.reset_check, cache.du) || all(cache.reset_check, cache.dfu) if cache.resets ≥ cache.max_resets @@ -124,15 +113,15 @@ function perform_step!(cache::GeneralBroydenCache{iip}) where {iip} cache.resets += 1 else @bb cache.du .*= -1 - @bb cache.J⁻¹df = cache.J⁻¹ × vec(cache.dfu) - @bb cache.J⁻¹₂ = cache.J⁻¹ × vec(cache.du) - denom = dot(cache.du, cache.J⁻¹df) - @bb @. cache.du = (cache.du - cache.J⁻¹df) / ifelse(iszero(denom), T(1e-5), denom) - @bb cache.J⁻¹ += vec(cache.du) × transpose(cache.J⁻¹₂) + @bb cache.J⁻¹dfu = cache.J⁻¹ × vec(cache.dfu) + @bb cache.u_cache = cache.J⁻¹ × vec(cache.du) + denom = dot(cache.du, cache.J⁻¹dfu) + @bb @. cache.du = (cache.du - cache.J⁻¹dfu) / ifelse(iszero(denom), T(1e-5), denom) + @bb cache.J⁻¹ += vec(cache.du) × transpose(cache.u_cache) end - @bb copyto!(cache.fu, cache.fu2) - @bb copyto!(cache.u_prev, cache.u) + @bb copyto!(cache.fu_cache, cache.fu) + @bb copyto!(cache.u_cache, cache.u) return nothing end diff --git a/src/jacobian.jl b/src/jacobian.jl index 54f1c0f0e..a63a57ffc 100644 --- a/src/jacobian.jl +++ b/src/jacobian.jl @@ -33,13 +33,13 @@ jacobian!!(J, _) = J # `!!` notation is from BangBang.jl since J might be jacobian in case of oop `f.jac` # and we don't want wasteful `copyto!` function jacobian!!(J::Union{AbstractMatrix{<:Number}, Nothing}, cache) - @unpack f, uf, u, p, jac_cache, alg, fu2 = cache + @unpack f, uf, u, p, jac_cache, alg, fu_cache = cache iip = isinplace(cache) if iip if has_jac(f) f.jac(J, u, p) else - sparse_jacobian!(J, alg.ad, jac_cache, uf, fu2, u) + sparse_jacobian!(J, alg.ad, jac_cache, uf, fu_cache, u) end return J else @@ -116,9 +116,10 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u, p, ::Val end if linsolve_init - linprob_A = alg isa PseudoTransient ? - (J - (1 / (convert(eltype(u), alg.alpha_initial))) * I) : - (needsJᵀJ ? __maybe_symmetric(JᵀJ) : J) + linprob_A = needsJᵀJ ? __maybe_symmetric(JᵀJ) : J + # linprob_A = alg isa PseudoTransient ? + # (J - (1 / (convert(eltype(u), alg.alpha_initial))) * I) : + # (needsJᵀJ ? __maybe_symmetric(JᵀJ) : J) linsolve = linsolve_caches(linprob_A, needsJᵀJ ? 
Jᵀfu : fu, du, p, alg; linsolve_kwargs) else diff --git a/src/klement.jl b/src/klement.jl index 4296defcf..4ec612273 100644 --- a/src/klement.jl +++ b/src/klement.jl @@ -41,17 +41,17 @@ end f alg u - u_prev + u_cache fu - fu2 + fu_cache du p linsolve J J_cache - J_cache2 - Jᵀ²du + J_cache_2 Jdu + Jdu_cache resets force_stop maxiters::Int @@ -66,9 +66,6 @@ end trace end -get_fu(cache::GeneralKlementCache) = cache.fu -set_fu!(cache::GeneralKlementCache, fu) = (cache.fu = fu) - function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::GeneralKlement, args...; alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, termination_condition = nothing, internalnorm::F = DEFAULT_NORM, @@ -94,16 +91,16 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::GeneralKleme termination_condition) trace = init_nonlinearsolve_trace(alg, u, fu, J, du; kwargs...) - @bb u_prev = copy(u) - @bb fu2 = similar(fu) + @bb u_cache = similar(u) + @bb fu_cache = similar(fu) @bb J_cache = similar(J) - @bb J_cache2 = similar(J) - @bb Jᵀ²du = similar(fu) + @bb J_cache_2 = similar(J) @bb Jdu = similar(fu) + @bb Jdu_cache = similar(fu) - return GeneralKlementCache{iip}(f, alg, u, u_prev, fu, fu2, du, p, linsolve, J, J_cache, - J_cache2, Jᵀ²du, Jdu, 0, false, maxiters, internalnorm, ReturnCode.Default, abstol, - reltol, prob, NLStats(1, 0, 0, 0, 0), + return GeneralKlementCache{iip}(f, alg, u, u_cache, fu, fu_cache, du, p, linsolve, + J, J_cache, J_cache_2, Jdu, Jdu_cache, 0, false, maxiters, internalnorm, + ReturnCode.Default, abstol, reltol, prob, NLStats(1, 0, 0, 0, 0), init_linesearch_cache(alg.linesearch, f, u, p, fu, Val(iip)), tc_cache, trace) end @@ -127,24 +124,18 @@ function perform_step!(cache::GeneralKlementCache{iip}) where {iip} linres = dolinsolve(alg.precs, cache.linsolve; A = cache.J, b = _vec(cache.fu), linu = _vec(cache.du), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache - !iip && (cache.du = linres.u) # Line Search α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) @bb axpy!(-α, cache.du, cache.u) - if iip - cache.f(cache.fu2, cache.u, cache.p) - else - cache.fu2 = cache.f(cache.u, cache.p) - end + evaluate_f(cache, cache.u, cache.p) - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), cache.fu2, cache.J, - cache.du, α) + update_trace!(cache, α) + check_and_update!(cache, cache.fu, cache.u, cache.u_cache) - check_and_update!(cache, cache.fu2, cache.u, cache.u_prev) - @bb copyto!(cache.u_prev, cache.u) + @bb copyto!(cache.u_cache, cache.u) cache.stats.nf += 1 cache.stats.nsolve += 1 @@ -155,19 +146,17 @@ function perform_step!(cache::GeneralKlementCache{iip}) where {iip} # Update the Jacobian @bb cache.du .*= -1 @bb cache.J_cache .= cache.J' .^ 2 - @bb @. cache.Jdu = cache.du ^ 2 - @bb cache.Jᵀ²du = cache.J_cache × vec(cache.Jdu) + @bb @. cache.Jdu = cache.du^2 + @bb cache.Jdu_cache = cache.J_cache × vec(cache.Jdu) @bb cache.Jdu = cache.J × vec(cache.du) - @bb @. cache.fu = cache.fu2 - cache.fu - - @bb @. cache.fu = (cache.fu - cache.Jdu) / max(cache.Jᵀ²du, eps(real(T))) - + @bb @. cache.fu_cache = (cache.fu - cache.fu_cache - cache.Jdu) / + max(cache.Jdu_cache, eps(real(T))) @bb cache.J_cache = vec(cache.fu) × transpose(_vec(cache.du)) @bb @. 
cache.J_cache *= cache.J - @bb cache.J_cache2 = cache.J_cache × cache.J - @bb cache.J .+= cache.J_cache2 + @bb cache.J_cache_2 = cache.J_cache × cache.J + @bb cache.J .+= cache.J_cache_2 - @bb copyto!(cache.fu, cache.fu2) + @bb copyto!(cache.fu_cache, cache.fu) return nothing end diff --git a/src/raphson.jl b/src/raphson.jl index 07b155f1c..835fadd48 100644 --- a/src/raphson.jl +++ b/src/raphson.jl @@ -52,9 +52,9 @@ end f alg u - u_prev - fu1 - fu2 + fu + u_cache + fu_cache du p uf @@ -81,19 +81,19 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::NewtonRaphso alg = get_concrete_algorithm(alg_, prob) @unpack f, u0, p = prob u = __maybe_unaliased(u0, alias_u0) - fu1 = evaluate_f(prob, u) - uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip); + fu = evaluate_f(prob, u) + uf, linsolve, J, fu_cache, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip); linsolve_kwargs) - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu1, u, + abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u, termination_condition) - ls_cache = init_linesearch_cache(alg.linesearch, f, u, p, fu1, Val(iip)) - trace = init_nonlinearsolve_trace(alg, u, fu1, ApplyArray(__zero, J), du; kwargs...) + ls_cache = init_linesearch_cache(alg.linesearch, f, u, p, fu, Val(iip)) + trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(__zero, J), du; kwargs...) - @bb u_prev = copy(u) + @bb u_cache = copy(u) - return NewtonRaphsonCache{iip}(f, alg, u, u_prev, fu1, fu2, du, p, uf, linsolve, J, + return NewtonRaphsonCache{iip}(f, alg, u, fu, u_cache, fu_cache, du, p, uf, linsolve, J, jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, reltol, prob, NLStats(1, 0, 0, 0, 0), ls_cache, tc_cache, trace) end @@ -104,10 +104,9 @@ function perform_step!(cache::NewtonRaphsonCache{iip}) where {iip} cache.J = jacobian!!(cache.J, cache) # u = u - J \ fu - linres = dolinsolve(alg.precs, cache.linsolve; A = cache.J, b = _vec(cache.fu1), + linres = dolinsolve(alg.precs, cache.linsolve; A = cache.J, b = _vec(cache.fu), linu = _vec(cache.du), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache - !iip && (cache.du = linres.u) # Line Search @@ -116,12 +115,10 @@ function perform_step!(cache::NewtonRaphsonCache{iip}) where {iip} evaluate_f(cache, cache.u, cache.p) - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), cache.J, - cache.du, α) - - check_and_update!(cache, cache.fu1, cache.u, cache.u_prev) + update_trace!(cache, α) + check_and_update!(cache, cache.fu, cache.u, cache.u_cache) - @bb copyto!(cache.u_prev, cache.u) + @bb copyto!(cache.u_cache, cache.u) cache.stats.nf += 1 cache.stats.njacs += 1 cache.stats.nsolve += 1 diff --git a/src/trace.jl b/src/trace.jl index e89efe956..39c01d2c7 100644 --- a/src/trace.jl +++ b/src/trace.jl @@ -240,3 +240,23 @@ function update_trace_with_invJ!(trace::NonlinearSolveTrace{ShT, StT}, iter, u, show_now && show(entry) return trace end + +function update_trace!(cache::AbstractNonlinearSolveCache, α = true) + trace = __getproperty(cache, Val(:trace)) + trace === nothing && return nothing + + J = __getproperty(cache, Val(:J)) + if J === nothing + J_inv = __getproperty(cache, Val(:J⁻¹)) + if J_inv === nothing + update_trace!(trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), + nothing, cache.du, α) + else + update_trace_with_invJ!(trace, cache.stats.nsteps + 1, get_u(cache), + get_fu(cache), J_inv, cache.du, α) + end + else + update_trace!(trace, cache.stats.nsteps + 1, 
get_u(cache), get_fu(cache), J, + cache.du, α) + end +end diff --git a/src/utils.jl b/src/utils.jl index bc38d9257..c6b670f8b 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -327,15 +327,17 @@ end return SArray{Tuple{S[1], S[2]}, eltype(J)}(I) end -function __init_low_rank_jacobian(u::StaticArray, fu, threshold::Int) - Vᵀ = convert(MArray{Tuple{length(u), threshold}}, - zeros(eltype(u), length(u), threshold)) - U = convert(MArray{Tuple{threshold, length(u)}}, zeros(eltype(u), threshold, length(u))) +function __init_low_rank_jacobian(u::StaticArray{S1, T1}, fu::StaticArray{S2, T2}, + ::Val{threshold}) where {S1, S2, T1, T2, threshold} + T = promote_type(T1, T2) + fuSize, uSize = Size(fu), Size(u) + Vᵀ = MArray{Tuple{threshold, prod(uSize)}, T}(undef) + U = MArray{Tuple{prod(fuSize), threshold}, T}(undef) return U, Vᵀ end -function __init_low_rank_jacobian(u, fu, threshold::Int) - Vᵀ = convert(parameterless_type(_mutable(u)), zeros(eltype(u), length(u), threshold)) - U = convert(parameterless_type(_mutable(u)), zeros(eltype(u), threshold, length(u))) +function __init_low_rank_jacobian(u, fu, ::Val{threshold}) where {threshold} + Vᵀ = similar(u, threshold, length(u)) + U = similar(u, length(fu), threshold) return U, Vᵀ end From 4c61c4a7ee5b80eed15982aae872bdfca96dad66 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Thu, 30 Nov 2023 16:27:18 -0500 Subject: [PATCH 06/25] Fix PT --- src/NonlinearSolve.jl | 6 +- src/jacobian.jl | 9 +-- src/klement.jl | 6 +- src/pseudotransient.jl | 140 +++++++++++++---------------------------- 4 files changed, 56 insertions(+), 105 deletions(-) diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl index d55527c90..936cd840e 100644 --- a/src/NonlinearSolve.jl +++ b/src/NonlinearSolve.jl @@ -20,7 +20,7 @@ import PrecompileTools: @recompile_invalidations, @compile_workload, @setup_work import ForwardDiff import ForwardDiff: Dual import LinearSolve: ComposePreconditioner, InvPreconditioner, needs_concrete_A - import MaybeInplace: @bb + import MaybeInplace: setindex_trait, @bb, CanSetindex, CannotSetindex import RecursiveArrayTools: ArrayPartition, AbstractVectorOfArray, recursivecopy!, recursivefill! import SciMLBase: AbstractNonlinearAlgorithm, NLStats, _unwrap_val, has_jac, isinplace @@ -80,7 +80,7 @@ function SciMLBase.reinit!(cache::AbstractNonlinearSolveCache{iip}, u0 = get_u(c return cache end -__reinit_internal!(cache::AbstractNonlinearSolveCache) = nothing +__reinit_internal!(::AbstractNonlinearSolveCache) = nothing function Base.show(io::IO, alg::AbstractNonlinearSolveAlgorithm) str = "$(nameof(typeof(alg)))(" @@ -157,7 +157,7 @@ include("raphson.jl") # include("levenberg.jl") # include("gaussnewton.jl") # include("dfsane.jl") -# include("pseudotransient.jl") +include("pseudotransient.jl") include("broyden.jl") include("klement.jl") # include("lbroyden.jl") diff --git a/src/jacobian.jl b/src/jacobian.jl index a63a57ffc..6747da1a8 100644 --- a/src/jacobian.jl +++ b/src/jacobian.jl @@ -116,10 +116,11 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u, p, ::Val end if linsolve_init - linprob_A = needsJᵀJ ? __maybe_symmetric(JᵀJ) : J - # linprob_A = alg isa PseudoTransient ? - # (J - (1 / (convert(eltype(u), alg.alpha_initial))) * I) : - # (needsJᵀJ ? __maybe_symmetric(JᵀJ) : J) + if alg isa PseudoTransient && J isa SciMLOperators.AbstractSciMLOperator + linprob_A = J - inv(convert(eltype(u), alg.alpha_initial)) * I + else + linprob_A = needsJᵀJ ? __maybe_symmetric(JᵀJ) : J + end linsolve = linsolve_caches(linprob_A, needsJᵀJ ? 
Jᵀfu : fu, du, p, alg; linsolve_kwargs) else diff --git a/src/klement.jl b/src/klement.jl index 4ec612273..37d6a6c07 100644 --- a/src/klement.jl +++ b/src/klement.jl @@ -120,9 +120,11 @@ function perform_step!(cache::GeneralKlementCache{iip}) where {iip} cache.resets += 1 end + A = ifelse(cache.J isa SMatrix || cache.J isa Number || !fact_done, cache.J, nothing) + # u = u - J \ fu - linres = dolinsolve(alg.precs, cache.linsolve; A = cache.J, b = _vec(cache.fu), - linu = _vec(cache.du), cache.p, reltol = cache.abstol) + linres = dolinsolve(alg.precs, cache.linsolve; A, + b = _vec(cache.fu), linu = _vec(cache.du), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache !iip && (cache.du = linres.u) diff --git a/src/pseudotransient.jl b/src/pseudotransient.jl index 3873202c4..c6f1926f1 100644 --- a/src/pseudotransient.jl +++ b/src/pseudotransient.jl @@ -41,7 +41,6 @@ SIAM Journal on Scientific Computing,25, 553-569.](https://doi.org/10.1137/S1064 alpha_initial end -#concrete_jac(::PseudoTransient{CJ}) where {CJ} = CJ function set_ad(alg::PseudoTransient{CJ}, ad) where {CJ} return PseudoTransient{CJ}(ad, alg.linsolve, alg.precs, alg.alpha_initial) end @@ -56,9 +55,9 @@ end f alg u - u_prev - fu1 - fu2 + u_cache + fu + fu_cache du p alpha @@ -86,92 +85,66 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::PseudoTransi alg = get_concrete_algorithm(alg_, prob) @unpack f, u0, p = prob - u = alias_u0 ? u0 : deepcopy(u0) - fu1 = evaluate_f(prob, u) - uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip); + u = __maybe_unaliased(u0, alias_u0) + fu = evaluate_f(prob, u) + uf, linsolve, J, fu_cache, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip); linsolve_kwargs) alpha = convert(eltype(u), alg.alpha_initial) - res_norm = internalnorm(fu1) + res_norm = internalnorm(fu) - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu1, u, + @bb u_cache = copy(u) + + abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u, termination_condition) - trace = init_nonlinearsolve_trace(alg, u, fu1, ApplyArray(__zero, J), du; kwargs...) + trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(__zero, J), du; kwargs...) - return PseudoTransientCache{iip}(f, alg, u, copy(u), fu1, fu2, du, p, alpha, res_norm, - uf, linsolve, J, jac_cache, false, maxiters, internalnorm, ReturnCode.Default, - abstol, reltol, prob, NLStats(1, 0, 0, 0, 0), tc_cache, trace) + return PseudoTransientCache{iip}(f, alg, u, u_cache, fu, fu_cache, du, p, alpha, + res_norm, uf, linsolve, J, jac_cache, false, maxiters, internalnorm, + ReturnCode.Default, abstol, reltol, prob, NLStats(1, 0, 0, 0, 0), tc_cache, trace) end -function perform_step!(cache::PseudoTransientCache{true}) - @unpack u, u_prev, fu1, f, p, alg, J, linsolve, du, alpha = cache - jacobian!!(J, cache) +function perform_step!(cache::PseudoTransientCache{iip}) where {iip} + @unpack alg = cache - inv_alpha = inv(alpha) - if J isa SciMLBase.AbstractSciMLOperator - J = J - inv_alpha * I - else - idxs = diagind(J) - if fast_scalar_indexing(J) - @inbounds for i in axes(J, 1) - J[i, i] = J[i, i] - inv_alpha + cache.J = jacobian!!(cache.J, cache) + + inv_α = inv(cache.alpha) + if cache.J isa SciMLOperators.AbstractSciMLOperator + A = cache.J - inv_α * I + elseif setindex_trait(cache.J) === CanSetindex() + idxs = diagind(cache.J) + if fast_scalar_indexing(cache.J) + @inbounds for i in axes(cache.J, 1) + cache.J[i, i] = cache.J[i, i] - inv_α end else - @.. 
broadcast=false @view(J[idxs])=@view(J[idxs]) - inv_alpha + @.. broadcast=false @view(cache.J[idxs])=@view(cache.J[idxs]) - inv_α end + A = cache.J + else + cache.J = cache.J - inv_α * I + A = cache.J end # u = u - J \ fu - linres = dolinsolve(alg.precs, linsolve; A = J, b = _vec(fu1), linu = _vec(du), - p, reltol = cache.abstol) + linres = dolinsolve(alg.precs, cache.linsolve; A, b = _vec(cache.fu), + linu = _vec(cache.du), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache - @. u = u - du - f(fu1, u, p) - - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), J, - cache.du) - - new_norm = cache.internalnorm(fu1) - cache.alpha *= cache.res_norm / new_norm - cache.res_norm = new_norm - - check_and_update!(cache, cache.fu1, cache.u, cache.u_prev) - - @. u_prev = u - cache.stats.nf += 1 - cache.stats.njacs += 1 - cache.stats.nsolve += 1 - cache.stats.nfactors += 1 - return nothing -end + !iip && (cache.du = linres.u) -function perform_step!(cache::PseudoTransientCache{false}) - @unpack u, u_prev, fu1, f, p, alg, linsolve, alpha = cache + @bb axpy!(-true, cache.du, cache.u) - cache.J = jacobian!!(cache.J, cache) - - inv_alpha = inv(alpha) - cache.J = cache.J - inv_alpha * I - # u = u - J \ fu - if linsolve === nothing - cache.du = fu1 / cache.J - else - linres = dolinsolve(alg.precs, linsolve; A = cache.J, b = _vec(fu1), - linu = _vec(cache.du), p, reltol = cache.abstol) - cache.linsolve = linres.cache - end - cache.u = @. u - cache.du # `u` might not support mutation - cache.fu1 = f(cache.u, p) + evaluate_f(cache, cache.u, cache.p) - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), cache.J, - cache.du) + update_trace!(cache, true) - new_norm = cache.internalnorm(fu1) + new_norm = cache.internalnorm(cache.fu) cache.alpha *= cache.res_norm / new_norm cache.res_norm = new_norm - check_and_update!(cache, cache.fu1, cache.u, cache.u_prev) + check_and_update!(cache, cache.fu, cache.u, cache.u_cache) - cache.u_prev = cache.u + @bb copyto!(cache.u_cache, cache.u) cache.stats.nf += 1 cache.stats.njacs += 1 cache.stats.nsolve += 1 @@ -179,33 +152,8 @@ function perform_step!(cache::PseudoTransientCache{false}) return nothing end -function SciMLBase.reinit!(cache::PseudoTransientCache{iip}, u0 = cache.u; p = cache.p, - alpha = cache.alpha, abstol = cache.abstol, reltol = cache.reltol, - termination_condition = get_termination_mode(cache.tc_cache), - maxiters = cache.maxiters) where {iip} - cache.p = p - if iip - recursivecopy!(cache.u, u0) - cache.f(cache.fu1, cache.u, p) - else - # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter - cache.u = u0 - cache.fu1 = cache.f(cache.u, p) - end - - reset!(cache.trace) - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, cache.fu1, cache.u, - termination_condition) - - cache.alpha = convert(eltype(cache.u), alpha) - cache.res_norm = cache.internalnorm(cache.fu1) - cache.abstol = abstol - cache.reltol = reltol - cache.tc_cache = tc_cache - cache.maxiters = maxiters - cache.stats.nf = 1 - cache.stats.nsteps = 1 - cache.force_stop = false - cache.retcode = ReturnCode.Default - return cache +function __reinit_internal!(cache::PseudoTransientCache) + cache.alpha = convert(eltype(cache.u), cache.alg.alpha_initial) + cache.res_norm = cache.internalnorm(cache.fu) + return nothing end From c44ba3736e9e783fb80d0bba15e03975a95f900f Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Thu, 30 Nov 2023 17:39:28 -0500 Subject: [PATCH 07/25] Cleanup LBroyden --- 
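The limited-memory Broyden hunks below keep the approximate inverse Jacobian in low-rank
form, roughly (-I + U * Vᵀ) with U of size length(fu) × η and Vᵀ of size η × length(u),
so applying it never materializes a dense n × n matrix. A rough out-of-place sketch of
what the in-place `_matvec!!` helper computes (the `lowrank_apply` name is only for this
note):

    lowrank_apply(U, Vᵀ, x) = size(U, 2) == 0 ? -x : U * (Vᵀ * x) .- x
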
src/NonlinearSolve.jl | 2 +- src/broyden.jl | 2 +- src/lbroyden.jl | 267 +++++++++++++++--------------------------- 3 files changed, 96 insertions(+), 175 deletions(-) diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl index 936cd840e..ae1814901 100644 --- a/src/NonlinearSolve.jl +++ b/src/NonlinearSolve.jl @@ -160,7 +160,7 @@ include("raphson.jl") include("pseudotransient.jl") include("broyden.jl") include("klement.jl") -# include("lbroyden.jl") +include("lbroyden.jl") include("jacobian.jl") # include("ad.jl") # include("default.jl") diff --git a/src/broyden.jl b/src/broyden.jl index d1c8ac433..504e16912 100644 --- a/src/broyden.jl +++ b/src/broyden.jl @@ -70,7 +70,7 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::GeneralBroyde reset_check = x -> abs(x) ≤ reset_tolerance @bb u_cache = copy(u) - @bb fu_cache = similar(fu) + @bb fu_cache = copy(fu) @bb dfu = similar(fu) @bb J⁻¹dfu = similar(u) diff --git a/src/lbroyden.jl b/src/lbroyden.jl index b000325dc..8882a2645 100644 --- a/src/lbroyden.jl +++ b/src/lbroyden.jl @@ -17,34 +17,36 @@ An implementation of `LimitedMemoryBroyden` with resetting and line search. recommended to use [LiFukushimaLineSearchCache](@ref) -- a derivative free linesearch specifically designed for Broyden's method. """ -@concrete struct LimitedMemoryBroyden <: AbstractNewtonAlgorithm{false, Nothing} +@concrete struct LimitedMemoryBroyden{threshold} <: AbstractNewtonAlgorithm{false, Nothing} max_resets::Int - threshold::Int linesearch reset_tolerance end function LimitedMemoryBroyden(; max_resets::Int = 3, linesearch = nothing, - threshold::Int = 27, reset_tolerance = nothing) + threshold::Union{Val, Int} = Val(27), reset_tolerance = nothing) linesearch = linesearch isa LineSearch ? linesearch : LineSearch(; method = linesearch) - return LimitedMemoryBroyden(max_resets, threshold, linesearch, reset_tolerance) + return LimitedMemoryBroyden{SciMLBase._unwrap_val(threshold)}(max_resets, linesearch, + reset_tolerance) end +__get_threshold(::LimitedMemoryBroyden{threshold}) where {threshold} = Val(threshold) +__get_unwrapped_threshold(::LimitedMemoryBroyden{threshold}) where {threshold} = threshold + @concrete mutable struct LimitedMemoryBroydenCache{iip} <: AbstractNonlinearSolveCache{iip} f alg u - u_prev + u_cache du fu - fu2 + fu_cache dfu p U Vᵀ - Ux - xᵀVᵀ - u_cache + threshold_cache + mat_cache vᵀ_cache force_stop::Bool resets::Int @@ -64,128 +66,74 @@ end trace end -get_fu(cache::LimitedMemoryBroydenCache) = cache.fu -set_fu!(cache::LimitedMemoryBroydenCache, fu) = (cache.fu = fu) - function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LimitedMemoryBroyden, args...; alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, termination_condition = nothing, internalnorm::F = DEFAULT_NORM, kwargs...) where {uType, iip, F} @unpack f, u0, p = prob - u = alias_u0 ? u0 : deepcopy(u0) - if u isa Number - # If u is a number then we simply use Broyden + threshold = __get_threshold(alg) + η = min(__get_unwrapped_threshold(alg), maxiters) + + if u0 isa Number || length(u0) ≤ η + # If u is a number or very small problem then we simply use Broyden return SciMLBase.__init(prob, - GeneralBroyden(; alg.max_resets, alg.reset_tolerance, - alg.linesearch), args...; alias_u0, maxiters, abstol, internalnorm, kwargs...) + GeneralBroyden(; alg.max_resets, alg.reset_tolerance, alg.linesearch), args...; + alias_u0, maxiters, abstol, internalnorm, kwargs...) 
end + u = __maybe_unaliased(u0, alias_u0) + fu = evaluate_f(prob, u) - threshold = min(alg.threshold, maxiters) + U, Vᵀ = __init_low_rank_jacobian(u, fu, threshold) - du = copy(fu) + + @bb du = copy(fu) + @bb u_cache = similar(u) + @bb fu_cache = copy(fu) + @bb dfu = similar(fu) + @bb vᵀ_cache = similar(u) + @bb mat_cache = similar(u) + reset_tolerance = alg.reset_tolerance === nothing ? sqrt(eps(real(eltype(u)))) : alg.reset_tolerance reset_check = x -> abs(x) ≤ reset_tolerance abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u, termination_condition) + U_part = selectdim(U, 1, 1:0) Vᵀ_part = selectdim(Vᵀ, 2, 1:0) trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(*, Vᵀ_part, U_part), du; kwargs...) - return LimitedMemoryBroydenCache{iip}(f, alg, u, zero(u), du, fu, zero(fu), - zero(fu), p, U, Vᵀ, similar(u, threshold), similar(u, 1, threshold), - zero(u), zero(u), false, 0, 0, alg.max_resets, maxiters, internalnorm, - ReturnCode.Default, abstol, reltol, reset_tolerance, reset_check, prob, - NLStats(1, 0, 0, 0, 0), - init_linesearch_cache(alg.linesearch, f, u, p, fu, Val(iip)), tc_cache, trace) -end - -function perform_step!(cache::LimitedMemoryBroydenCache{true}) - @unpack f, p, du, u = cache - T = eltype(u) - - α = perform_linesearch!(cache.ls_cache, u, du) - _axpy!(-α, du, u) - f(cache.fu2, u, p) + threshold_cache = __lbroyden_threshold_cache(u, threshold) - idx = min(cache.iterations_since_reset, size(cache.U, 1)) - U_part = selectdim(cache.U, 1, 1:idx) - Vᵀ_part = selectdim(cache.Vᵀ, 2, 1:idx) - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), cache.fu2, - ApplyArray(*, Vᵀ_part, U_part), du, α) - - check_and_update!(cache, cache.fu2, cache.u, cache.u_prev) - cache.stats.nf += 1 - - cache.force_stop && return nothing - - # Update the Inverse Jacobian Approximation - cache.dfu .= cache.fu2 .- cache.fu - - # Only try to reset if we have enough iterations since last reset - if cache.iterations_since_reset > size(cache.U, 1) && - (all(cache.reset_check, du) || all(cache.reset_check, cache.dfu)) - if cache.resets ≥ cache.max_resets - cache.retcode = ReturnCode.ConvergenceFailure - cache.force_stop = true - return nothing - end - cache.iterations_since_reset = 0 - cache.resets += 1 - cache.du .= cache.fu - else - cache.du .*= -1 - idx = min(cache.iterations_since_reset, size(cache.U, 1)) - U_part = selectdim(cache.U, 1, 1:idx) - Vᵀ_part = selectdim(cache.Vᵀ, 2, 1:idx) - - __lbroyden_matvec!(_vec(cache.vᵀ_cache), cache.Ux, U_part, Vᵀ_part, _vec(cache.du)) - __lbroyden_rmatvec!(_vec(cache.u_cache), cache.xᵀVᵀ, U_part, Vᵀ_part, - _vec(cache.dfu)) - denom = dot(cache.vᵀ_cache, cache.dfu) - cache.u_cache .= (du .- cache.u_cache) ./ ifelse(iszero(denom), T(1e-5), denom) - - idx = mod1(cache.iterations_since_reset + 1, size(cache.U, 1)) - selectdim(cache.U, 1, idx) .= _vec(cache.u_cache) - selectdim(cache.Vᵀ, 2, idx) .= _vec(cache.vᵀ_cache) - - idx = min(cache.iterations_since_reset + 1, size(cache.U, 1)) - U_part = selectdim(cache.U, 1, 1:idx) - Vᵀ_part = selectdim(cache.Vᵀ, 2, 1:idx) - __lbroyden_matvec!(_vec(cache.du), cache.Ux, U_part, Vᵀ_part, _vec(cache.fu2)) - cache.iterations_since_reset += 1 - end - - cache.u_prev .= cache.u - cache.fu .= cache.fu2 - - return nothing + return LimitedMemoryBroydenCache{iip}(f, alg, u, u_cache, du, fu, fu_cache, dfu, p, + U, Vᵀ, threshold_cache, mat_cache, vᵀ_cache, false, 0, 0, alg.max_resets, maxiters, + internalnorm, ReturnCode.Default, abstol, reltol, reset_tolerance, reset_check, + prob, NLStats(1, 0, 0, 0, 
0), + init_linesearch_cache(alg.linesearch, f, u, p, fu, Val(iip)), tc_cache, trace) end -function perform_step!(cache::LimitedMemoryBroydenCache{false}) - @unpack f, p = cache - +function perform_step!(cache::LimitedMemoryBroydenCache{iip}) where {iip} T = eltype(cache.u) α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) - cache.u = cache.u .- α * cache.du - cache.fu2 = f(cache.u, p) + @bb axpy!(-α, cache.du, cache.u) + evaluate_f(cache, cache.u, cache.p) - idx = min(cache.iterations_since_reset, size(cache.U, 1)) - U_part = selectdim(cache.U, 1, 1:idx) - Vᵀ_part = selectdim(cache.Vᵀ, 2, 1:idx) - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), cache.fu2, + idx = min(cache.iterations_since_reset, size(cache.U, 2)) + U_part = selectdim(cache.U, 2, 1:idx) + Vᵀ_part = selectdim(cache.Vᵀ, 1, 1:idx) + update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), cache.fu, ApplyArray(*, Vᵀ_part, U_part), cache.du, α) - check_and_update!(cache, cache.fu2, cache.u, cache.u_prev) + check_and_update!(cache, cache.fu, cache.u, cache.u_cache) cache.stats.nf += 1 cache.force_stop && return nothing # Update the Inverse Jacobian Approximation - cache.dfu .= cache.fu2 .- cache.fu + @bb @. cache.dfu = cache.fu - cache.fu_cache # Only try to reset if we have enough iterations since last reset if cache.iterations_since_reset > size(cache.U, 1) && @@ -197,102 +145,75 @@ function perform_step!(cache::LimitedMemoryBroydenCache{false}) end cache.iterations_since_reset = 0 cache.resets += 1 - cache.du = cache.fu + @bb copyto!(cache.du, cache.fu) else - cache.du = -cache.du - idx = min(cache.iterations_since_reset, size(cache.U, 1)) - U_part = selectdim(cache.U, 1, 1:idx) - Vᵀ_part = selectdim(cache.Vᵀ, 2, 1:idx) - - cache.vᵀ_cache = _restructure(cache.vᵀ_cache, - __lbroyden_matvec(U_part, Vᵀ_part, _vec(cache.du))) - cache.u_cache = _restructure(cache.u_cache, - __lbroyden_rmatvec(U_part, Vᵀ_part, _vec(cache.dfu))) + @bb cache.du .*= -1 + + cache.vᵀ_cache = _rmatvec!!(cache.vᵀ_cache, cache.threshold_cache, U_part, Vᵀ_part, + cache.du) + cache.mat_cache = _matvec!!(cache.mat_cache, cache.threshold_cache, U_part, Vᵀ_part, + cache.dfu) + denom = dot(cache.vᵀ_cache, cache.dfu) - cache.u_cache = (cache.du .- cache.u_cache) ./ ifelse(iszero(denom), T(1e-5), denom) + @bb @. cache.u_cache = (cache.du - cache.mat_cache) / + ifelse(iszero(denom), T(1e-5), denom) + + idx = mod1(cache.iterations_since_reset + 1, size(cache.U, 2)) + selectdim(cache.U, 2, idx) .= _vec(cache.u_cache) + selectdim(cache.Vᵀ, 1, idx) .= _vec(cache.vᵀ_cache) - idx = mod1(cache.iterations_since_reset + 1, size(cache.U, 1)) - selectdim(cache.U, 1, idx) .= _vec(cache.u_cache) - selectdim(cache.Vᵀ, 2, idx) .= _vec(cache.vᵀ_cache) + idx = min(cache.iterations_since_reset + 1, size(cache.U, 2)) + U_part = selectdim(cache.U, 2, 1:idx) + Vᵀ_part = selectdim(cache.Vᵀ, 1, 1:idx) + cache.du = _matvec!!(cache.du, cache.threshold_cache, U_part, Vᵀ_part, cache.fu) - idx = min(cache.iterations_since_reset + 1, size(cache.U, 1)) - U_part = selectdim(cache.U, 1, 1:idx) - Vᵀ_part = selectdim(cache.Vᵀ, 2, 1:idx) - cache.du = _restructure(cache.du, - __lbroyden_matvec(U_part, Vᵀ_part, _vec(cache.fu2))) cache.iterations_since_reset += 1 end - cache.u_prev = @. 
cache.u - cache.fu = cache.fu2 + @bb copyto!(cache.u_cache, cache.u) + @bb copyto!(cache.fu_cache, cache.fu) return nothing end -function SciMLBase.reinit!(cache::LimitedMemoryBroydenCache{iip}, u0 = cache.u; p = cache.p, - termination_condition = get_termination_mode(cache.tc_cache), - abstol = cache.abstol, reltol = cache.reltol, maxiters = cache.maxiters) where {iip} - cache.p = p - if iip - recursivecopy!(cache.u, u0) - cache.f(cache.fu, cache.u, p) - else - # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter - cache.u = u0 - cache.fu = cache.f(cache.u, p) - end - - reset!(cache.trace) - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, cache.fu, cache.u, - termination_condition) - - cache.abstol = abstol - cache.reltol = reltol - cache.tc_cache = tc_cache - cache.maxiters = maxiters - cache.stats.nf = 1 - cache.stats.nsteps = 1 - cache.resets = 0 +function __reinit_internal!(cache::LimitedMemoryBroydenCache) cache.iterations_since_reset = 0 - cache.force_stop = false - cache.retcode = ReturnCode.Default - return cache + return nothing end -@views function __lbroyden_matvec!(y::AbstractVector, Ux::AbstractVector, - U::AbstractMatrix, Vᵀ::AbstractMatrix, x::AbstractVector) - # Computes Vᵀ × U × x - η = size(U, 1) +function _rmatvec!!(y, xᵀU, U, Vᵀ, x) + # xᵀ × (-I + UVᵀ) + η = size(U, 2) if η == 0 - y .= x - return nothing + @bb @. y = -x + return y end - mul!(Ux[1:η], U, x) - mul!(y, Vᵀ[:, 1:η], Ux[1:η]) - return nothing -end - -@views function __lbroyden_matvec(U::AbstractMatrix, Vᵀ::AbstractMatrix, x::AbstractVector) - # Computes Vᵀ × U × x - size(U, 1) == 0 && return x - return Vᵀ * (U * x) + x_ = vec(x) + xᵀU_ = view(xᵀU, 1:η) + @bb xᵀU_ = transpose(U) × x_ + @bb y = transpose(Vᵀ) × xᵀU_ + @bb @. y -= x + return y end -@views function __lbroyden_rmatvec!(y::AbstractVector, xᵀVᵀ::AbstractMatrix, - U::AbstractMatrix, Vᵀ::AbstractMatrix, x::AbstractVector) - # Computes xᵀ × Vᵀ × U - η = size(U, 1) +function _matvec!!(y, Vᵀx, U, Vᵀ, x) + # (-I + UVᵀ) × x + η = size(U, 2) if η == 0 - y .= x - return nothing + @bb @. y = -x + return y end - mul!(xᵀVᵀ[:, 1:η], x', Vᵀ) - mul!(reshape(y, 1, :), xᵀVᵀ[:, 1:η], U) - return nothing + x_ = vec(x) + Vᵀx_ = view(Vᵀx, 1:η) + @bb Vᵀx_ = Vᵀ × x_ + @bb y = U × Vᵀx_ + @bb @. 
y -= x + return y end -@views function __lbroyden_rmatvec(U::AbstractMatrix, Vᵀ::AbstractMatrix, x::AbstractVector) - # Computes xᵀ × Vᵀ × U - size(U, 1) == 0 && return x - return (reshape(x, 1, :) * Vᵀ) * U +@inline function __lbroyden_threshold_cache(x, ::Val{threshold}) where {threshold} + return similar(x, threshold) +end +@inline function __lbroyden_threshold_cache(x::SArray, ::Val{threshold}) where {threshold} + return zeros(SVector{threshold, eltype(x)}) end From 21e9ed4ce36598691f73250afeff73ee16484f1c Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Thu, 30 Nov 2023 18:48:45 -0500 Subject: [PATCH 08/25] Nearly finished GN --- src/NonlinearSolve.jl | 34 ++++++--- src/broyden.jl | 2 +- src/gaussnewton.jl | 164 ++++++++++++----------------------------- src/jacobian.jl | 5 +- src/klement.jl | 4 +- src/lbroyden.jl | 7 +- src/levenberg.jl | 2 +- src/pseudotransient.jl | 7 +- src/raphson.jl | 2 +- src/utils.jl | 6 +- 10 files changed, 89 insertions(+), 144 deletions(-) diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl index ae1814901..63d462f18 100644 --- a/src/NonlinearSolve.jl +++ b/src/NonlinearSolve.jl @@ -51,8 +51,8 @@ isinplace(::AbstractNonlinearSolveCache{iip}) where {iip} = iip function SciMLBase.reinit!(cache::AbstractNonlinearSolveCache{iip}, u0 = get_u(cache); p = cache.p, abstol = cache.abstol, reltol = cache.reltol, - maxiters = cache.maxiters, alias_u0 = false, - termination_condition = get_termination_mode(cache.tc_cache)) where {iip} + maxiters = cache.maxiters, alias_u0 = false, termination_condition = missing, + kwargs...) where {iip} cache.p = p if iip recursivecopy!(get_u(cache), u0) @@ -63,24 +63,40 @@ function SciMLBase.reinit!(cache::AbstractNonlinearSolveCache{iip}, u0 = get_u(c end reset!(cache.trace) - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, get_fu(cache), - get_u(cache), termination_condition) + + # Some algorithms store multiple termination caches + if hasfield(typeof(cache), :tc_cache) + # TODO: We need an efficient way to reset this upstream + tc = termination_condition === missing ? get_termination_mode(cache.tc_cache) : + termination_condition + abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, get_fu(cache), + get_u(cache), tc) + cache.tc_cache = tc_cache + end + + if hasfield(typeof(cache), :ls_cache) + # TODO: A more efficient way to do this + cache.ls_cache = init_linesearch_cache(cache.prob, cache.alg.linesearch, cache.f, + get_u(cache), p, get_fu(cache), Val(iip)) + end + + hasfield(typeof(cache), :uf) && (cache.uf.p = p) cache.abstol = abstol cache.reltol = reltol - cache.tc_cache = tc_cache cache.maxiters = maxiters cache.stats.nf = 1 cache.stats.nsteps = 1 cache.force_stop = false cache.retcode = ReturnCode.Default - __reinit_internal!(cache) + __reinit_internal!(cache; u0, p, abstol, reltol, maxiters, alias_u0, + termination_condition, kwargs...) return cache end -__reinit_internal!(::AbstractNonlinearSolveCache) = nothing +__reinit_internal!(::AbstractNonlinearSolveCache; kwargs...) 
= nothing function Base.show(io::IO, alg::AbstractNonlinearSolveAlgorithm) str = "$(nameof(typeof(alg)))(" @@ -155,14 +171,14 @@ include("linesearch.jl") include("raphson.jl") # include("trustRegion.jl") # include("levenberg.jl") -# include("gaussnewton.jl") +include("gaussnewton.jl") # include("dfsane.jl") include("pseudotransient.jl") include("broyden.jl") include("klement.jl") include("lbroyden.jl") include("jacobian.jl") -# include("ad.jl") +include("ad.jl") # include("default.jl") # @setup_workload begin diff --git a/src/broyden.jl b/src/broyden.jl index 504e16912..9b165e513 100644 --- a/src/broyden.jl +++ b/src/broyden.jl @@ -126,7 +126,7 @@ function perform_step!(cache::GeneralBroydenCache{iip}) where {iip} return nothing end -function __reinit_internal!(cache::GeneralBroydenCache) +function __reinit_internal!(cache::GeneralBroydenCache; kwargs...) cache.J⁻¹ = __reinit_identity_jacobian!!(cache.J⁻¹) cache.resets = 0 return nothing diff --git a/src/gaussnewton.jl b/src/gaussnewton.jl index ea1855e68..1b4fc9432 100644 --- a/src/gaussnewton.jl +++ b/src/gaussnewton.jl @@ -46,9 +46,8 @@ function set_ad(alg::GaussNewton{CJ}, ad) where {CJ} return GaussNewton{CJ}(ad, alg.linsolve, alg.precs, alg.linesearch, alg.vjp_autodiff) end -function GaussNewton(; concrete_jac = nothing, linsolve = nothing, - linesearch = nothing, precs = DEFAULT_PRECS, vjp_autodiff = nothing, - adkwargs...) +function GaussNewton(; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS, + linesearch = nothing, vjp_autodiff = nothing, adkwargs...) ad = default_adargs_to_adtype(; adkwargs...) linesearch = linesearch isa LineSearch ? linesearch : LineSearch(; method = linesearch) return GaussNewton{_unwrap_val(concrete_jac)}(ad, linsolve, precs, linesearch, @@ -59,11 +58,11 @@ end f alg u - u_prev - fu1 - fu2 - fu_new + u_cache + fu + fu_cache du + dfu p uf linsolve @@ -92,109 +91,57 @@ function SciMLBase.__init(prob::NonlinearLeastSquaresProblem{uType, iip}, alg_:: alg = get_concrete_algorithm(alg_, prob) @unpack f, u0, p = prob - linsolve_with_JᵀJ = Val(_needs_square_A(alg, u0)) + u = __maybe_unaliased(u0, alias_u0) + fu = evaluate_f(prob, u) - u = alias_u0 ? u0 : deepcopy(u0) - fu1 = evaluate_f(prob, u) + uf, linsolve, J, fu_cache, jac_cache, du, JᵀJ, Jᵀf = jacobian_caches(alg, f, u, p, + Val(iip); linsolve_with_JᵀJ = Val(__needs_square_A(alg, u))) - if SciMLBase._unwrap_val(linsolve_with_JᵀJ) - uf, linsolve, J, fu2, jac_cache, du, JᵀJ, Jᵀf = jacobian_caches(alg, f, u, p, - Val(iip); linsolve_with_JᵀJ) - else - uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, - Val(iip); linsolve_with_JᵀJ) - JᵀJ, Jᵀf = nothing, nothing - end - - abstol, reltol, tc_cache_1 = init_termination_cache(abstol, reltol, fu1, u, + abstol, reltol, tc_cache_1 = init_termination_cache(abstol, reltol, fu, u, termination_condition) - _, _, tc_cache_2 = init_termination_cache(abstol, reltol, fu1, u, termination_condition) - trace = init_nonlinearsolve_trace(alg, u, fu1, ApplyArray(__zero, J), du; kwargs...) + _, _, tc_cache_2 = init_termination_cache(abstol, reltol, fu, u, termination_condition) + trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(__zero, J), du; kwargs...) 
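# Sketch of the step the rewritten `perform_step!` below takes when the least-squares
# problem is solved in normal form: with residual fu and Jacobian J it solves
# (JᵀJ) δu = Jᵀ fu and then updates u ← u - α δu, with α from the line search.
# Illustrative dense version only (function name made up for this sketch); the cache
# instead reuses preallocated JᵀJ / Jᵀf buffers and a LinearSolve.jl cache.
using LinearAlgebra

function gauss_newton_direction(J, fu)
    JᵀJ = Symmetric(J' * J)
    Jᵀf = J' * fu
    return JᵀJ \ Jᵀf   # δu; the caller applies u .-= α .* δu
end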
- return GaussNewtonCache{iip}(f, alg, u, copy(u), fu1, fu2, zero(fu1), du, p, uf, + @bb u_cache = copy(u) + @bb dfu = copy(fu) + + return GaussNewtonCache{iip}(f, alg, u, u_cache, fu, fu_cache, du, dfu, p, uf, linsolve, J, JᵀJ, Jᵀf, jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, reltol, prob, NLStats(1, 0, 0, 0, 0), tc_cache_1, tc_cache_2, init_linesearch_cache(alg.linesearch, f, u, p, fu1, Val(iip)), trace) end -function perform_step!(cache::GaussNewtonCache{true}) - @unpack u, u_prev, fu1, f, p, alg, J, JᵀJ, Jᵀf, linsolve, du = cache - jacobian!!(J, cache) - - if JᵀJ !== nothing - __update_JᵀJ!(Val{true}(), cache, :JᵀJ, J) - __update_Jᵀf!(Val{true}(), cache, :Jᵀf, :JᵀJ, J, fu1) - end - - # u = u - JᵀJ \ Jᵀfu - if cache.JᵀJ === nothing - linres = dolinsolve(alg.precs, linsolve; A = J, b = _vec(fu1), linu = _vec(du), - p, reltol = cache.abstol) - else - linres = dolinsolve(alg.precs, linsolve; A = __maybe_symmetric(JᵀJ), b = _vec(Jᵀf), - linu = _vec(du), p, reltol = cache.abstol) - end - cache.linsolve = linres.cache - α = perform_linesearch!(cache.ls_cache, u, du) - _axpy!(-α, du, u) - f(cache.fu_new, u, p) - - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), J, - cache.du, α) - - check_and_update!(cache.tc_cache_1, cache, cache.fu_new, cache.u, cache.u_prev) - if !cache.force_stop - cache.fu1 .= cache.fu_new .- cache.fu1 - check_and_update!(cache.tc_cache_2, cache, cache.fu1, cache.u, cache.u_prev) - end - - @. u_prev = u - cache.fu1 .= cache.fu_new - cache.stats.nf += 1 - cache.stats.njacs += 1 - cache.stats.nsolve += 1 - cache.stats.nfactors += 1 - return nothing -end - -function perform_step!(cache::GaussNewtonCache{false}) - @unpack u, u_prev, fu1, f, p, alg, linsolve = cache - +function perform_step!(cache::GaussNewtonCache{iip}) where {iip} cache.J = jacobian!!(cache.J, cache) + # Use normal form to solve the Linear Problem if cache.JᵀJ !== nothing - __update_JᵀJ!(Val{false}(), cache, :JᵀJ, cache.J) - __update_Jᵀf!(Val{false}(), cache, :Jᵀf, :JᵀJ, cache.J, fu1) - end - - # u = u - J \ fu - if linsolve === nothing - cache.du = fu1 / cache.J + __update_JᵀJ!(Val{iip}(), cache, :JᵀJ, cache.J) + __update_Jᵀf!(Val{iip}(), cache, :Jᵀf, :JᵀJ, cache.J, cache.fu1) + A, b = __maybe_symmetric(cache.JᵀJ), _vec(cache.Jᵀf) else - if cache.JᵀJ === nothing - linres = dolinsolve(alg.precs, linsolve; A = cache.J, b = _vec(fu1), - linu = _vec(cache.du), p, reltol = cache.abstol) - else - linres = dolinsolve(alg.precs, linsolve; A = __maybe_symmetric(cache.JᵀJ), - b = _vec(cache.Jᵀf), linu = _vec(cache.du), p, reltol = cache.abstol) - end - cache.linsolve = linres.cache + A, b = cache.J, _vec(cache.fu) end - α = perform_linesearch!(cache.ls_cache, u, cache.du) - cache.u = @. 
u - α * cache.du # `u` might not support mutation - cache.fu_new = f(cache.u, p) - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), cache.J, - cache.du, α) + linres = dolinsolve(alg.precs, linsolve; A, b, linu = _vec(du), cache.p, + reltol = cache.abstol) + cache.linsolve = linres.cache + cache.du = _restructure(cache.du, linres.u) + + α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) + @bb axpy!(-α, cache.du, cache.u) + evaluate_f(cache, cache.u, cache.p) + update_trace!(cache, α) - check_and_update!(cache.tc_cache_1, cache, cache.fu_new, cache.u, cache.u_prev) + check_and_update!(cache.tc_cache_1, cache, cache.fu, cache.u, cache.u_cache) if !cache.force_stop - cache.fu1 = cache.fu_new .- cache.fu1 - check_and_update!(cache.tc_cache_2, cache, cache.fu1, cache.u, cache.u_prev) + @bb @. cache.dfu = cache.fu .- cache.dfu + check_and_update!(cache.tc_cache_2, cache, cache.dfu, cache.u, cache.u_prev) end - cache.u_prev = cache.u - cache.fu1 = cache.fu_new + @bb copyto!(cache.u_cache, cache.u) + @bb copyto!(cache.dfu, cache.fu) + cache.stats.nf += 1 cache.stats.njacs += 1 cache.stats.nsolve += 1 @@ -202,33 +149,16 @@ function perform_step!(cache::GaussNewtonCache{false}) return nothing end -function SciMLBase.reinit!(cache::GaussNewtonCache{iip}, u0 = cache.u; p = cache.p, - abstol = cache.abstol, reltol = cache.reltol, maxiters = cache.maxiters, - termination_condition = get_termination_mode(cache.tc_cache)) where {iip} - cache.p = p - if iip - recursivecopy!(cache.u, u0) - cache.f(cache.fu1, cache.u, p) - else - # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter - cache.u = u0 - cache.fu1 = cache.f(cache.u, p) - end +function __reinit_internal!(cache::GaussNewtonCache; + termination_condition = get_termination_mode(cache.tc_cache_1), kwargs...) 
+ abstol, reltol, tc_cache_1 = init_termination_cache(cache.abstol, cache.reltol, + cache.fu1, cache.u, termination_condition) + _, _, tc_cache_2 = init_termination_cache(cache.abstol, cache.reltol, cache.fu1, + cache.u, termination_condition) - reset!(cache.trace) - abstol, reltol, tc_cache_1 = init_termination_cache(abstol, reltol, cache.fu1, cache.u, - termination_condition) - _, _, tc_cache_2 = init_termination_cache(abstol, reltol, cache.fu1, cache.u, - termination_condition) - - cache.abstol = abstol - cache.reltol = reltol cache.tc_cache_1 = tc_cache_1 cache.tc_cache_2 = tc_cache_2 - cache.maxiters = maxiters - cache.stats.nf = 1 - cache.stats.nsteps = 1 - cache.force_stop = false - cache.retcode = ReturnCode.Default - return cache + cache.abstol = abstol + cache.reltol = reltol + return nothing end diff --git a/src/jacobian.jl b/src/jacobian.jl index 6747da1a8..2ffcdc9aa 100644 --- a/src/jacobian.jl +++ b/src/jacobian.jl @@ -113,6 +113,8 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u, p, ::Val JᵀJ, Jᵀfu = __init_JᵀJ(J, _vec(fu), uf, u; f, vjp_autodiff = __get_nonsparse_ad(__getproperty(alg, Val(:vjp_autodiff))), jvp_autodiff = __get_nonsparse_ad(alg.ad)) + else + JᵀJ, Jᵀfu = nothing, nothing end if linsolve_init @@ -127,8 +129,7 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u, p, ::Val linsolve = nothing end - needsJᵀJ && return uf, linsolve, J, fu, jac_cache, du, JᵀJ, Jᵀfu - return uf, linsolve, J, fu, jac_cache, du + return uf, linsolve, J, fu, jac_cache, du, JᵀJ, Jᵀfu end ## Special Handling for Scalars diff --git a/src/klement.jl b/src/klement.jl index 37d6a6c07..cceb51c3c 100644 --- a/src/klement.jl +++ b/src/klement.jl @@ -126,7 +126,7 @@ function perform_step!(cache::GeneralKlementCache{iip}) where {iip} linres = dolinsolve(alg.precs, cache.linsolve; A, b = _vec(cache.fu), linu = _vec(cache.du), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache - !iip && (cache.du = linres.u) + cache.du = _restructure(cache.du, linres.u) # Line Search α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) @@ -163,7 +163,7 @@ function perform_step!(cache::GeneralKlementCache{iip}) where {iip} return nothing end -function __reinit_internal!(cache::GeneralKlementCache) +function __reinit_internal!(cache::GeneralKlementCache; kwargs...) cache.J = __reinit_identity_jacobian!!(cache.J) cache.resets = 0 return nothing diff --git a/src/lbroyden.jl b/src/lbroyden.jl index 8882a2645..611e5511b 100644 --- a/src/lbroyden.jl +++ b/src/lbroyden.jl @@ -73,7 +73,6 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LimitedMemory @unpack f, u0, p = prob threshold = __get_threshold(alg) η = min(__get_unwrapped_threshold(alg), maxiters) - if u0 isa Number || length(u0) ≤ η # If u is a number or very small problem then we simply use Broyden return SciMLBase.__init(prob, @@ -81,13 +80,11 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LimitedMemory alias_u0, maxiters, abstol, internalnorm, kwargs...) end u = __maybe_unaliased(u0, alias_u0) - fu = evaluate_f(prob, u) - U, Vᵀ = __init_low_rank_jacobian(u, fu, threshold) @bb du = copy(fu) - @bb u_cache = similar(u) + @bb u_cache = copy(u) @bb fu_cache = copy(fu) @bb dfu = similar(fu) @bb vᵀ_cache = similar(u) @@ -176,7 +173,7 @@ function perform_step!(cache::LimitedMemoryBroydenCache{iip}) where {iip} return nothing end -function __reinit_internal!(cache::LimitedMemoryBroydenCache) +function __reinit_internal!(cache::LimitedMemoryBroydenCache; kwargs...) 
cache.iterations_since_reset = 0 return nothing end diff --git a/src/levenberg.jl b/src/levenberg.jl index 94e882223..3b523807c 100644 --- a/src/levenberg.jl +++ b/src/levenberg.jl @@ -173,7 +173,7 @@ function SciMLBase.__init(prob::Union{NonlinearProblem{uType, iip}, u = alias_u0 ? u0 : deepcopy(u0) fu1 = evaluate_f(prob, u) - linsolve_with_JᵀJ = Val(_needs_square_A(alg, u0)) + linsolve_with_JᵀJ = Val(__needs_square_A(alg, u0)) if _unwrap_val(linsolve_with_JᵀJ) uf, linsolve, J, fu2, jac_cache, du, JᵀJ, v = jacobian_caches(alg, f, u, p, diff --git a/src/pseudotransient.jl b/src/pseudotransient.jl index c6f1926f1..b01762493 100644 --- a/src/pseudotransient.jl +++ b/src/pseudotransient.jl @@ -130,7 +130,7 @@ function perform_step!(cache::PseudoTransientCache{iip}) where {iip} linres = dolinsolve(alg.precs, cache.linsolve; A, b = _vec(cache.fu), linu = _vec(cache.du), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache - !iip && (cache.du = linres.u) + cache.du = _restructure(cache.du, linres.u) @bb axpy!(-true, cache.du, cache.u) @@ -152,8 +152,9 @@ function perform_step!(cache::PseudoTransientCache{iip}) where {iip} return nothing end -function __reinit_internal!(cache::PseudoTransientCache) - cache.alpha = convert(eltype(cache.u), cache.alg.alpha_initial) +function __reinit_internal!(cache::PseudoTransientCache; alpha = cache.alg.alpha_initial, + kwargs...) + cache.alpha = convert(eltype(cache.u), alpha) cache.res_norm = cache.internalnorm(cache.fu) return nothing end diff --git a/src/raphson.jl b/src/raphson.jl index 835fadd48..ac40b7c64 100644 --- a/src/raphson.jl +++ b/src/raphson.jl @@ -107,7 +107,7 @@ function perform_step!(cache::NewtonRaphsonCache{iip}) where {iip} linres = dolinsolve(alg.precs, cache.linsolve; A = cache.J, b = _vec(cache.fu), linu = _vec(cache.du), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache - !iip && (cache.du = linres.u) + cache.du = _restructure(cache.du, linres.u) # Line Search α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) diff --git a/src/utils.jl b/src/utils.jl index c6b670f8b..46c5b9295 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -379,9 +379,9 @@ __try_factorize_and_check_singular!(::FakeLinearSolveJLCache, x) = _issingular(x end # Non-square matrix -@inline _needs_square_A(_, ::Number) = true -@inline _needs_square_A(_, ::StaticArray) = true -@inline _needs_square_A(alg, _) = LinearSolve.needs_square_A(alg.linsolve) +@inline __needs_square_A(_, ::Number) = true +# @inline __needs_square_A(_, ::StaticArray) = true +@inline __needs_square_A(alg, _) = LinearSolve.needs_square_A(alg.linsolve) # Define special concatenation for certain Array combinations @inline _vcat(x, y) = vcat(x, y) From 0e3efd72170b801c6f50e226435c0816b9c56aff Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Thu, 30 Nov 2023 19:32:32 -0500 Subject: [PATCH 09/25] Fix GN --- src/gaussnewton.jl | 10 +++++----- src/utils.jl | 9 +++++++++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/gaussnewton.jl b/src/gaussnewton.jl index 1b4fc9432..5ff01d79a 100644 --- a/src/gaussnewton.jl +++ b/src/gaussnewton.jl @@ -108,7 +108,7 @@ function SciMLBase.__init(prob::NonlinearLeastSquaresProblem{uType, iip}, alg_:: return GaussNewtonCache{iip}(f, alg, u, u_cache, fu, fu_cache, du, dfu, p, uf, linsolve, J, JᵀJ, Jᵀf, jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, reltol, prob, NLStats(1, 0, 0, 0, 0), tc_cache_1, tc_cache_2, - init_linesearch_cache(alg.linesearch, f, u, p, fu1, Val(iip)), trace) + 
init_linesearch_cache(alg.linesearch, f, u, p, fu, Val(iip)), trace) end function perform_step!(cache::GaussNewtonCache{iip}) where {iip} @@ -117,14 +117,14 @@ function perform_step!(cache::GaussNewtonCache{iip}) where {iip} # Use normal form to solve the Linear Problem if cache.JᵀJ !== nothing __update_JᵀJ!(Val{iip}(), cache, :JᵀJ, cache.J) - __update_Jᵀf!(Val{iip}(), cache, :Jᵀf, :JᵀJ, cache.J, cache.fu1) + __update_Jᵀf!(Val{iip}(), cache, :Jᵀf, :JᵀJ, cache.J, cache.fu) A, b = __maybe_symmetric(cache.JᵀJ), _vec(cache.Jᵀf) else A, b = cache.J, _vec(cache.fu) end - linres = dolinsolve(alg.precs, linsolve; A, b, linu = _vec(du), cache.p, - reltol = cache.abstol) + linres = dolinsolve(cache.alg.precs, cache.linsolve; A, b, linu = _vec(cache.du), + cache.p, reltol = cache.abstol) cache.linsolve = linres.cache cache.du = _restructure(cache.du, linres.u) @@ -136,7 +136,7 @@ function perform_step!(cache::GaussNewtonCache{iip}) where {iip} check_and_update!(cache.tc_cache_1, cache, cache.fu, cache.u, cache.u_cache) if !cache.force_stop @bb @. cache.dfu = cache.fu .- cache.dfu - check_and_update!(cache.tc_cache_2, cache, cache.dfu, cache.u, cache.u_prev) + check_and_update!(cache.tc_cache_2, cache, cache.dfu, cache.u, cache.u_cache) end @bb copyto!(cache.u_cache, cache.u) diff --git a/src/utils.jl b/src/utils.jl index 46c5b9295..00b7d3726 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -188,6 +188,15 @@ function evaluate_f(prob::Union{NonlinearProblem{uType, iip}, return fu end +function evaluate_f(f, u, p, ::Val{iip}; fu = nothing) where {iip} + if iip + f(fu, u, p) + return fu + else + return f(u, p) + end +end + function evaluate_f(cache, u, p) if isinplace(cache) cache.prob.f(get_fu(cache), u, p) From 031639f1a663bdca9b417dc376bea80f09277656 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Thu, 30 Nov 2023 20:11:28 -0500 Subject: [PATCH 10/25] Fix DFSane --- src/NonlinearSolve.jl | 2 +- src/dfsane.jl | 229 ++++++++++++------------------------------ src/utils.jl | 15 --- 3 files changed, 63 insertions(+), 183 deletions(-) diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl index 63d462f18..dacc98910 100644 --- a/src/NonlinearSolve.jl +++ b/src/NonlinearSolve.jl @@ -172,7 +172,7 @@ include("raphson.jl") # include("trustRegion.jl") # include("levenberg.jl") include("gaussnewton.jl") -# include("dfsane.jl") +include("dfsane.jl") include("pseudotransient.jl") include("broyden.jl") include("klement.jl") diff --git a/src/dfsane.jl b/src/dfsane.jl index 4b31ff9f3..8dcb1e9ff 100644 --- a/src/dfsane.jl +++ b/src/dfsane.jl @@ -1,8 +1,8 @@ """ - DFSane(; σ_min::Real = 1e-10, σ_max::Real = 1e10, σ_1::Real = 1.0, - M::Int = 10, γ::Real = 1e-4, τ_min::Real = 0.1, τ_max::Real = 0.5, - n_exp::Int = 2, η_strategy::Function = (fn_1, n, x_n, f_n) -> fn_1 / n^2, - max_inner_iterations::Int = 1000) + DFSane(; σ_min::Real = 1e-10, σ_max::Real = 1e10, σ_1::Real = 1.0, M::Int = 10, + γ::Real = 1e-4, τ_min::Real = 0.1, τ_max::Real = 0.5, n_exp::Int = 2, + η_strategy::Function = (fn_1, n, x_n, f_n) -> fn_1 / n^2, + max_inner_iterations::Int = 100) A low-overhead and allocation-free implementation of the df-sane method for solving large-scale nonlinear systems of equations. For in depth information about all the parameters and the algorithm, @@ -39,34 +39,27 @@ Computation, 75, 1429-1448.](https://www.researchgate.net/publication/220576479_ `f_n` the current residual. Should satisfy ``η > 0`` and ``∑ₖ ηₖ < ∞``. Defaults to ``fn_1 / n^2``. 
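  For instance, a minimal call that keeps the default spectral parameters but passes an
  explicit `η_strategy` looks like the following sketch (any standard `NonlinearProblem`
  works here):

      using NonlinearSolve
      prob = NonlinearProblem((u, p) -> u .* u .- p, [0.1, 0.3], 2.0)
      alg = DFSane(; η_strategy = (fn_1, n, x_n, f_n) -> fn_1 / n^2)
      sol = solve(prob, alg)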
- `max_inner_iterations`: the maximum number of iterations allowed for the inner loop of the - algorithm. Defaults to `1000`. + algorithm. Defaults to `100`. """ -@concrete struct DFSane <: AbstractNonlinearSolveAlgorithm - σ_min - σ_max - σ_1 - M::Int - γ - τ_min - τ_max - n_exp::Int - η_strategy - max_inner_iterations::Int -end - -function DFSane(; σ_min = 1e-10, σ_max = 1e+10, σ_1 = 1.0, M = 10, γ = 1e-4, τ_min = 0.1, - τ_max = 0.5, n_exp = 2, η_strategy::F = (fn_1, n, x_n, f_n) -> fn_1 / n^2, - max_inner_iterations = 1000) where {F} - return DFSane(σ_min, σ_max, σ_1, M, γ, τ_min, τ_max, n_exp, η_strategy, - max_inner_iterations) +@kwdef @concrete struct DFSane <: AbstractNonlinearSolveAlgorithm + σ_min = 1e-10 + σ_max = 1e10 + σ_1 = 1.0 + M::Int = 10 + γ = 1e-4 + τ_min = 0.1 + τ_max = 0.5 + n_exp::Int = 2 + η_strategy = (fn_1, n, x_n, f_n) -> fn_1 / n^2 + max_inner_iterations::Int = 100 end @concrete mutable struct DFSaneCache{iip} <: AbstractNonlinearSolveCache{iip} alg u - uprev + u_cache fu - fuprev + fu_cache du history f_norm @@ -93,36 +86,35 @@ end trace end -get_fu(cache::DFSaneCache) = cache.fu -set_fu!(cache::DFSaneCache, fu) = (cache.fu = fu) - function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::DFSane, args...; alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, termination_condition = nothing, internalnorm::F = DEFAULT_NORM, kwargs...) where {uType, iip, F} - u = alias_u0 ? prob.u0 : deepcopy(prob.u0) + u = __maybe_unaliased(prob.u0, alias_u0) T = eltype(u) - du, uprev = copy(u), copy(u) + @bb du = similar(u) + @bb u_cache = copy(u) + fu = evaluate_f(prob, u) - fuprev = copy(fu) + @bb fu_cache = copy(fu) f_norm = internalnorm(fu)^alg.n_exp f_norm_0 = f_norm history = fill(f_norm, alg.M) - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, uprev, + abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u_cache, termination_condition) trace = init_nonlinearsolve_trace(alg, u, fu, nothing, du; kwargs...) - return DFSaneCache{iip}(alg, u, uprev, fu, fuprev, du, history, f_norm, f_norm_0, alg.M, - T(alg.σ_1), T(alg.σ_min), T(alg.σ_max), one(T), T(alg.γ), T(alg.τ_min), + return DFSaneCache{iip}(alg, u, u_cache, fu, fu_cache, du, history, f_norm, f_norm_0, + alg.M, T(alg.σ_1), T(alg.σ_min), T(alg.σ_max), one(T), T(alg.γ), T(alg.τ_min), T(alg.τ_max), alg.n_exp, prob.p, false, maxiters, internalnorm, ReturnCode.Default, abstol, reltol, prob, NLStats(1, 0, 0, 0, 0), tc_cache, trace) end -function perform_step!(cache::DFSaneCache{true}) +function perform_step!(cache::DFSaneCache{iip}) where {iip} @unpack alg, f_norm, σ_n, σ_min, σ_max, α_1, γ, τ_min, τ_max, n_exp, M, prob = cache T = eltype(cache.u) f_norm_old = f_norm @@ -131,128 +123,64 @@ function perform_step!(cache::DFSaneCache{true}) σ_n = sign(σ_n) * clamp(abs(σ_n), σ_min, σ_max) # Line search direction - @. cache.du = -σ_n * cache.fuprev + @bb @. cache.du = -σ_n * cache.fu η = alg.η_strategy(cache.f_norm_0, cache.stats.nsteps, cache.u, cache.fu) f_bar = maximum(cache.history) α₊ = α_1 α₋ = α_1 - _axpy!(α₊, cache.du, cache.u) - - prob.f(cache.fu, cache.u, cache.p) - f_norm = cache.internalnorm(cache.fu)^n_exp - - # TODO: Failure mode with inner line search failed? - for _ in 1:(cache.alg.max_inner_iterations) - c = f_bar + η - γ * α₊^2 * f_norm_old - - f_norm ≤ c && break - - α₊ = α₊ * clamp(α₊ * f_norm_old / (f_norm + (T(2) * α₊ - T(1)) * f_norm_old), - τ_min, τ_max) - @. 
cache.u = cache.uprev - α₋ * cache.du - - prob.f(cache.fu, cache.u, cache.p) - f_norm = cache.internalnorm(cache.fu)^n_exp - - f_norm ≤ c && break - - α₋ = α₋ * clamp(α₋ * f_norm_old / (f_norm + (T(2) * α₋ - T(1)) * f_norm_old), - τ_min, τ_max) - @. cache.u = cache.uprev + α₊ * cache.du - - prob.f(cache.fu, cache.u, cache.p) - f_norm = cache.internalnorm(cache.fu)^n_exp - end - - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), nothing, - cache.du, α₊) - check_and_update!(cache, cache.fu, cache.u, cache.uprev) + @bb axpy!(α₊, cache.du, cache.u) - # Update spectral parameter - @. cache.uprev = cache.u - cache.uprev - @. cache.fuprev = cache.fu - cache.fuprev - - α₊ = sum(abs2, cache.uprev) - @. cache.uprev *= cache.fuprev - α₋ = sum(cache.uprev) - cache.σ_n = α₊ / α₋ - - # Spectral parameter bounds check - if !(σ_min ≤ abs(cache.σ_n) ≤ σ_max) - test_norm = sqrt(sum(abs2, cache.fuprev)) - cache.σ_n = clamp(inv(test_norm), T(1), T(1e5)) - end - - # Take step - @. cache.uprev = cache.u - @. cache.fuprev = cache.fu - cache.f_norm = f_norm - - # Update history - cache.history[cache.stats.nsteps % M + 1] = f_norm - cache.stats.nf += 1 - return nothing -end - -function perform_step!(cache::DFSaneCache{false}) - @unpack alg, f_norm, σ_n, σ_min, σ_max, α_1, γ, τ_min, τ_max, n_exp, M, prob = cache - T = eltype(cache.u) - f_norm_old = f_norm - - # Spectral parameter range check - σ_n = sign(σ_n) * clamp(abs(σ_n), σ_min, σ_max) - - # Line search direction - cache.du = @. -σ_n * cache.fuprev - - η = alg.η_strategy(cache.f_norm_0, cache.stats.nsteps, cache.u, cache.fu) - - f_bar = maximum(cache.history) - α₊ = α_1 - α₋ = α_1 - cache.u = @. cache.uprev + α₊ * cache.du - - cache.fu = prob.f(cache.u, cache.p) + evaluate_f(cache, cache.u, cache.p) f_norm = cache.internalnorm(cache.fu)^n_exp + α = α₊ - # TODO: Failure mode with inner line search failed? - for _ in 1:(cache.alg.max_inner_iterations) - c = f_bar + η - γ * α₊^2 * f_norm_old - - f_norm ≤ c && break + inner_converged = false + for k in 1:(cache.alg.max_inner_iterations) + if f_norm ≤ f_bar + η - γ * α₊^2 * f_norm_old + α = α₊ + inner_converged = true + break + end α₊ = α₊ * clamp(α₊ * f_norm_old / (f_norm + (T(2) * α₊ - T(1)) * f_norm_old), τ_min, τ_max) - cache.u = @. cache.uprev - α₋ * cache.du + @bb axpy!(-α₋, cache.du, cache.u) - cache.fu = prob.f(cache.u, cache.p) + evaluate_f(cache, cache.u, cache.p) f_norm = cache.internalnorm(cache.fu)^n_exp - f_norm ≤ c && break + if f_norm ≤ f_bar + η - γ * α₋^2 * f_norm_old + α = α₋ + inner_converged = true + break + end α₋ = α₋ * clamp(α₋ * f_norm_old / (f_norm + (T(2) * α₋ - T(1)) * f_norm_old), τ_min, τ_max) - cache.u = @. cache.uprev + α₊ * cache.du + @bb axpy!(α₊, cache.du, cache.u) - cache.fu = prob.f(cache.u, cache.p) + evaluate_f(cache, cache.u, cache.p) f_norm = cache.internalnorm(cache.fu)^n_exp end - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), nothing, - cache.du, α₊) + if !inner_converged + cache.retcode = ReturnCode.ConvergenceFailure + cache.force_stop = true + end - check_and_update!(cache, cache.fu, cache.u, cache.uprev) + update_trace!(cache, α) + check_and_update!(cache, cache.fu, cache.u, cache.u_cache) # Update spectral parameter - cache.uprev = @. cache.u - cache.uprev - cache.fuprev = @. cache.fu - cache.fuprev + @bb @. cache.u_cache = cache.u - cache.u_cache + @bb @. cache.fu_cache = cache.fu - cache.fu_cache - α₊ = sum(abs2, cache.uprev) - cache.uprev = @. 
cache.uprev * cache.fuprev - α₋ = sum(cache.uprev) + α₊ = sum(abs2, cache.u_cache) + @bb @. cache.u_cache *= cache.fu_cache + α₋ = sum(cache.u_cache) cache.σ_n = α₊ / α₋ # Spectral parameter bounds check @@ -262,8 +190,8 @@ function perform_step!(cache::DFSaneCache{false}) end # Take step - cache.uprev = cache.u - cache.fuprev = cache.fu + @bb copyto!(cache.u_cache, cache.u) + @bb copyto!(cache.fu_cache, cache.fu) cache.f_norm = f_norm # Update history @@ -272,41 +200,8 @@ function perform_step!(cache::DFSaneCache{false}) return nothing end -function SciMLBase.reinit!(cache::DFSaneCache{iip}, u0 = cache.u; p = cache.p, - abstol = cache.abstol, reltol = cache.reltol, maxiters = cache.maxiters, - termination_condition = get_termination_mode(cache.tc_cache)) where {iip} - cache.p = p - if iip - recursivecopy!(cache.u, u0) - recursivecopy!(cache.uprev, u0) - cache.prob.f(cache.fu, cache.u, p) - cache.prob.f(cache.fuprev, cache.uprev, p) - else - cache.u = u0 - cache.uprev = u0 - cache.fu = cache.prob.f(cache.u, p) - cache.fuprev = cache.prob.f(cache.uprev, p) - end - +function __reinit_internal!(cache::DFSaneCache; kwargs...) cache.f_norm = cache.internalnorm(cache.fu)^cache.n_exp cache.f_norm_0 = cache.f_norm - - fill!(cache.history, cache.f_norm) - - T = eltype(cache.u) - cache.σ_n = T(cache.alg.σ_1) - - reset!(cache.trace) - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, cache.fu, cache.u, - termination_condition) - - cache.abstol = abstol - cache.reltol = reltol - cache.tc_cache = tc_cache - cache.maxiters = maxiters - cache.stats.nf = 1 - cache.stats.nsteps = 1 - cache.force_stop = false - cache.retcode = ReturnCode.Default - return cache + return end diff --git a/src/utils.jl b/src/utils.jl index 00b7d3726..90b882af3 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -206,14 +206,6 @@ function evaluate_f(cache, u, p) return nothing end -""" - __matmul!(C, A, B) - -Defaults to `mul!(C, A, B)`. However, for sparse matrices uses `C .= A * B`. -""" -__matmul!(C, A, B) = mul!(C, A, B) -__matmul!(C::AbstractSparseMatrix, A, B) = C .= A * B - # Concretize Algorithms function get_concrete_algorithm(alg, prob) !hasfield(typeof(alg), :ad) && return alg @@ -381,15 +373,8 @@ function __try_factorize_and_check_singular!(linsolve, X) end __try_factorize_and_check_singular!(::FakeLinearSolveJLCache, x) = _issingular(x), false -# TODO: Remove. handled in MaybeInplace.jl -@generated function _axpy!(α, x, y) - hasmethod(axpy!, Tuple{α, x, y}) && return :(axpy!(α, x, y)) - return :(@. 
y += α * x) -end - # Non-square matrix @inline __needs_square_A(_, ::Number) = true -# @inline __needs_square_A(_, ::StaticArray) = true @inline __needs_square_A(alg, _) = LinearSolve.needs_square_A(alg.linsolve) # Define special concatenation for certain Array combinations From f18fe152ad348c8f961c8da4183dbb5ed4cc627c Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Thu, 30 Nov 2023 20:44:06 -0500 Subject: [PATCH 11/25] Start cleaning up TrustRegion --- src/jacobian.jl | 1 + src/trustRegion.jl | 171 ++++++++++++++------------------------------- src/utils.jl | 21 ++++-- 3 files changed, 68 insertions(+), 125 deletions(-) diff --git a/src/jacobian.jl b/src/jacobian.jl index 2ffcdc9aa..83b17f834 100644 --- a/src/jacobian.jl +++ b/src/jacobian.jl @@ -209,6 +209,7 @@ function __concrete_vjp_autodiff(vjp_autodiff, jvp_autodiff, uf) end # Generic Handling of Krylov Methods for Normal Form Linear Solves +# FIXME: Use MaybeInplace here for efficient matmuls function __update_JᵀJ!(iip::Val, cache, sym::Symbol, J) return __update_JᵀJ!(iip, cache, sym, getproperty(cache, sym), J) end diff --git a/src/trustRegion.jl b/src/trustRegion.jl index 5493aa4d7..3cd40c907 100644 --- a/src/trustRegion.jl +++ b/src/trustRegion.jl @@ -1,5 +1,5 @@ """ -`RadiusUpdateSchemes` + RadiusUpdateSchemes `RadiusUpdateSchemes` is the standard enum interface for different types of radius update schemes implemented in the Trust Region method. These schemes specify how the radius of the so-called trust region @@ -16,7 +16,7 @@ states as `RadiusUpdateSchemes.T`. Simply put the desired scheme as follows: """ @enumx RadiusUpdateSchemes begin """ - `RadiusUpdateSchemes.Simple` + RadiusUpdateSchemes.Simple The simple or conventional radius update scheme. This scheme is chosen by default and follows the conventional approach to update the trust region radius, i.e. if the @@ -26,21 +26,21 @@ states as `RadiusUpdateSchemes.T`. Simply put the desired scheme as follows: Simple """ - `RadiusUpdateSchemes.NLsolve` + RadiusUpdateSchemes.NLsolve The same updating scheme as in NLsolve's (https://github.com/JuliaNLSolvers/NLsolve.jl) trust region dogleg implementation. """ NLsolve """ - `RadiusUpdateSchemes.NocedalWright` + RadiusUpdateSchemes.NocedalWright Trust region updating scheme as in Nocedal and Wright [see Alg 11.5, page 291]. """ NocedalWright """ - `RadiusUpdateSchemes.Hei` + RadiusUpdateSchemes.Hei This scheme is proposed by [Hei, L.] (https://www.jstor.org/stable/43693061). The trust region radius depends on the size (norm) of the current step size. The hypothesis is to let the radius converge to zero @@ -50,7 +50,7 @@ states as `RadiusUpdateSchemes.T`. Simply put the desired scheme as follows: Hei """ - `RadiusUpdateSchemes.Yuan` + RadiusUpdateSchemes.Yuan This scheme is proposed by [Yuan, Y.] (https://www.researchgate.net/publication/249011466_A_new_trust_region_algorithm_with_trust_region_radius_converging_to_zero). Similar to Hei's scheme, the trust region is updated in a way so that it converges to zero, however here, @@ -60,7 +60,7 @@ states as `RadiusUpdateSchemes.T`. Simply put the desired scheme as follows: Yuan """ - `RadiusUpdateSchemes.Bastin` + RadiusUpdateSchemes.Bastin This scheme is proposed by [Bastin, et al.] (https://www.researchgate.net/publication/225100660_A_retrospective_trust-region_method_for_unconstrained_optimization). The scheme is called a retrospective update scheme as it uses the model function at the current @@ -71,7 +71,7 @@ states as `RadiusUpdateSchemes.T`. 
Simply put the desired scheme as follows: Bastin """ - `RadiusUpdateSchemes.Fan` + RadiusUpdateSchemes.Fan This scheme is proposed by [Fan, J.] (https://link.springer.com/article/10.1007/s10589-005-3078-8). It is very much similar to Hei's and Yuan's schemes as it lets the trust region radius depend on the current size (norm) of the objective (merit) @@ -170,7 +170,7 @@ function set_ad(alg::TrustRegion{CJ}, ad) where {CJ} end function TrustRegion(; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS, - radius_update_scheme::RadiusUpdateSchemes.T = RadiusUpdateSchemes.Simple, #defaults to conventional radius update + radius_update_scheme::RadiusUpdateSchemes.T = RadiusUpdateSchemes.Simple, max_trust_radius::Real = 0 // 1, initial_trust_radius::Real = 0 // 1, step_threshold::Real = 1 // 10000, shrink_threshold::Real = 1 // 4, expand_threshold::Real = 3 // 4, shrink_factor::Real = 1 // 4, @@ -233,6 +233,7 @@ end trace end +# TODO: add J_cache function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::TrustRegion, args...; alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, termination_condition = nothing, internalnorm = DEFAULT_NORM, linsolve_kwargs = (;), @@ -244,7 +245,7 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::TrustRegion, fu1 = evaluate_f(prob, u) fu_prev = zero(fu1) - loss = get_loss(fu1) + loss = __get_trust_region_loss(fu1) uf, _, J, fu2, jac_cache, du, H, g = jacobian_caches(alg, f, u, p, Val(iip); linsolve_kwargs, linsolve_with_JᵀJ = Val(true), lininit = Val(false)) g = _restructure(fu1, g) @@ -350,64 +351,30 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::TrustRegion, p1, p2, p3, p4, ϵ, NLStats(1, 0, 0, 0, 0), tc_cache, trace) end -function perform_step!(cache::TrustRegionCache{true}) - @unpack make_new_J, J, fu, f, u, p, u_gauss_newton, alg, linsolve = cache +function perform_step!(cache::TrustRegionCache{iip}) where {iip} if cache.make_new_J - jacobian!!(J, cache) - __update_JᵀJ!(Val{true}(), cache, :H, J) - __update_Jᵀf!(Val{true}(), cache, :g, :H, J, _vec(fu)) + cache.J = jacobian!!(cache.J, cache) + + __update_JᵀJ!(Val{iip}(), cache, :H, cache.J) + __update_Jᵀf!(Val{iip}(), cache, :g, :H, cache.J, _vec(cache.fu)) cache.stats.njacs += 1 # do not use A = cache.H, b = _vec(cache.g) since it is equivalent # to A = cache.J, b = _vec(fu) as long as the Jacobian is non-singular - linres = dolinsolve(alg.precs, linsolve, A = J, b = _vec(fu), - linu = _vec(u_gauss_newton), p = p, reltol = cache.abstol) + linres = dolinsolve(cache.alg.precs, cache.linsolve, A = cache.J, + b = _vec(cache.fu), linu = _vec(cache.u_gauss_newton), p = cache.p, + reltol = cache.abstol) cache.linsolve = linres.cache - @. cache.u_gauss_newton = -1 * u_gauss_newton - end - - # Compute dogleg step - dogleg!(cache) - - # Compute the potentially new u - @. 
cache.u_tmp = u + cache.du - f(cache.fu_new, cache.u_tmp, p) - trust_region_step!(cache) - cache.stats.nf += 1 - cache.stats.nsolve += 1 - cache.stats.nfactors += 1 - return nothing -end - -function perform_step!(cache::TrustRegionCache{false}) - @unpack make_new_J, fu, f, u, p = cache - - if make_new_J - J = jacobian!!(cache.J, cache) - __update_JᵀJ!(Val{false}(), cache, :H, J) - __update_Jᵀf!(Val{false}(), cache, :g, :H, J, _vec(fu)) - cache.stats.njacs += 1 - - if cache.linsolve === nothing - # Scalar - cache.u_gauss_newton = -cache.H \ cache.g - else - # do not use A = cache.H, b = _vec(cache.g) since it is equivalent - # to A = cache.J, b = _vec(fu) as long as the Jacobian is non-singular - linres = dolinsolve(cache.alg.precs, cache.linsolve, A = cache.J, b = _vec(fu), - linu = _vec(cache.u_gauss_newton), p = p, reltol = cache.abstol) - cache.linsolve = linres.cache - @. cache.u_gauss_newton *= -1 - end + cache.u_gauss_newton = _restructure(cache.u_gauss_newton, linres.u) + @bb @. cache.u_gauss_newton *= -1 end - # Compute the Newton step. + # compute dogleg step dogleg!(cache) - # Compute the potentially new u - cache.u_tmp = u + cache.du - - cache.fu_new = f(cache.u_tmp, p) + # compute the potentially new u + @bb @. cache.u_cache_2 = cache.u + cache.du + evaluate_f(cache, cache.u_tmp, cache.p, Val{:fu_cache_2}()) trust_region_step!(cache) cache.stats.nf += 1 cache.stats.nsolve += 1 @@ -415,27 +382,23 @@ function perform_step!(cache::TrustRegionCache{false}) return nothing end -function retrospective_step!(cache::TrustRegionCache) - @unpack J, fu_prev, fu, u_prev, u = cache - J = jacobian!!(deepcopy(J), cache) - if J isa Number - cache.H = J' * J - cache.g = J' * fu - else - __update_JᵀJ!(Val{isinplace(cache)}(), cache, :H, J) - __update_Jᵀf!(Val{isinplace(cache)}(), cache, :g, :H, J, fu) - end +function retrospective_step!(cache::TrustRegionCache{iip}) where {iip} + J = jacobian!!(cache.J_cache, cache) + __update_JᵀJ!(Val{iip}(), cache, :H, J) + __update_Jᵀf!(Val{iip}(), cache, :g, :H, J, cache.fu) cache.stats.njacs += 1 - @unpack H, g, du = cache - return -(get_loss(fu_prev) - get_loss(fu)) / - (dot(_vec(du), _vec(g)) + __lr_mul(Val(isinplace(cache)), H, _vec(du)) / 2) + # FIXME: Caching in __lr_mul + num = __get_trust_region_loss(cache.fu) - __get_trust_region_loss(cache.fu_cache) + denom = dot(_vec(du), _vec(g)) + __lr_mul(Val{iip}(), H, _vec(du)) / 2 + return num / denom end +# TODO function trust_region_step!(cache::TrustRegionCache) @unpack fu_new, du, g, H, loss, max_trust_r, radius_update_scheme = cache - cache.loss_new = get_loss(fu_new) + cache.loss_new = __get_trust_region_loss(fu_new) # Compute the ratio of the actual reduction to the predicted reduction. cache.r = -(loss - cache.loss_new) / @@ -556,6 +519,7 @@ function trust_region_step!(cache::TrustRegionCache) end @unpack p1 = cache + # TODO: Use the `vjp_autodiff` to for the jvp cache.trust_r = p1 * cache.internalnorm(jvp!(cache)) update_trace!(cache.trace, cache.stats.nsteps + 1, cache.u, cache.fu, cache.J, @@ -608,6 +572,7 @@ function trust_region_step!(cache::TrustRegionCache) end end +# TODO function dogleg!(cache::TrustRegionCache{true}) @unpack u_tmp, u_gauss_newton, u_cauchy, trust_r = cache @@ -638,6 +603,7 @@ function dogleg!(cache::TrustRegionCache{true}) @. 
cache.du = u_cauchy + τ * u_tmp end +# TODO function dogleg!(cache::TrustRegionCache{false}) @unpack u_tmp, u_gauss_newton, u_cauchy, trust_r = cache @@ -667,20 +633,14 @@ function dogleg!(cache::TrustRegionCache{false}) cache.du = u_cauchy + τ * u_tmp end -function take_step!(cache::TrustRegionCache{true}) - cache.u_prev .= cache.u - cache.u .= cache.u_tmp - cache.fu_prev .= cache.fu - cache.fu .= cache.fu_new -end - -function take_step!(cache::TrustRegionCache{false}) - cache.u_prev = cache.u - cache.u = cache.u_tmp - cache.fu_prev = cache.fu - cache.fu = cache.fu_new +function __take_step!(cache::TrustRegionCache) + @bb copyto!(cache.u_cache, cache.u) + @bb copyto!(cache.u, cache.u_cache_2) # u_tmp --> u_cache_2 + @bb copyto!(cache.fu_cache, cache.fu) + @bb copyto!(cache.fu, cache.fu_cache_2) # fu_new --> fu_cache_2 end +# TODO function jvp!(cache::TrustRegionCache{false}) @unpack f, u, fu, uf = cache if isa(u, Number) @@ -710,40 +670,15 @@ function not_terminated(cache::TrustRegionCache) end return true end -get_fu(cache::TrustRegionCache) = cache.fu -set_fu!(cache::TrustRegionCache, fu) = (cache.fu = fu) - -function SciMLBase.reinit!(cache::TrustRegionCache{iip}, u0 = cache.u; p = cache.p, - abstol = cache.abstol, reltol = cache.reltol, maxiters = cache.maxiters, - termination_condition = get_termination_mode(cache.tc_cache)) where {iip} - cache.p = p - if iip - recursivecopy!(cache.u, u0) - cache.f(cache.fu, cache.u, p) - else - # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter - cache.u = u0 - cache.fu = cache.f(cache.u, p) - end - - reset!(cache.trace) - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, cache.fu, cache.u, - termination_condition) - cache.abstol = abstol - cache.reltol = reltol - cache.tc_cache = tc_cache - cache.maxiters = maxiters - cache.stats.nf = 1 - cache.stats.nsteps = 1 - cache.force_stop = false - cache.retcode = ReturnCode.Default - cache.make_new_J = true - cache.loss = get_loss(cache.fu) +function __reinit_internal!(cache::TrustRegionCache; kwargs...) 
+ cache.loss = __get_trust_region_loss(cache.fu) cache.shrink_counter = 0 - cache.trust_r = convert(eltype(cache.u), cache.alg.initial_trust_radius) - if iszero(cache.trust_r) - cache.trust_r = convert(eltype(cache.u), cache.max_trust_r / 11) - end - return cache + cache.trust_r = convert(eltype(cache.u), + ifelse(cache.alg.initial_trust_radius == 0, cache.alg.initial_trust_radius, + cache.max_trust_r / 11)) + cache.make_new_J = true + return nothing end + +__get_trust_region_loss(fu) = norm(fu)^2 / 2 diff --git a/src/utils.jl b/src/utils.jl index 90b882af3..0267de434 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -151,8 +151,6 @@ function wrapprecs(_Pl, _Pr, weight) return Pl, Pr end -get_loss(fu) = norm(fu)^2 / 2 - function rfunc(r::R, c2::R, M::R, γ1::R, γ2::R, β::R) where {R <: Real} # R-function for adaptive trust region method if (r ≥ c2) return (2 * (M - 1 - γ2) * atan(r - c2) + (1 + γ2)) / π @@ -188,7 +186,7 @@ function evaluate_f(prob::Union{NonlinearProblem{uType, iip}, return fu end -function evaluate_f(f, u, p, ::Val{iip}; fu = nothing) where {iip} +function evaluate_f(f::F, u, p, ::Val{iip}; fu = nothing) where {F, iip <: Bool} if iip f(fu, u, p) return fu @@ -197,11 +195,20 @@ function evaluate_f(f, u, p, ::Val{iip}; fu = nothing) where {iip} end end -function evaluate_f(cache, u, p) - if isinplace(cache) - cache.prob.f(get_fu(cache), u, p) +function evaluate_f(cache::AbstractNonlinearSolveCache, u, p, + fu_sym::Val{FUSYM} = Val(nothing)) where {FUSYM} + if FUSYM === nothing + if isinplace(cache) + cache.prob.f(get_fu(cache), u, p) + else + set_fu!(cache, cache.prob.f(u, p)) + end else - set_fu!(cache, cache.prob.f(u, p)) + if isinplace(cache) + cache.prob.f(__getproperty(cache, fu_sym), u, p) + else + setproperty!(cache, FUSYM, cache.prob.f(u, p)) + end end return nothing end From eadf16ff89c5ed2df1ecc9249cf43d81bedf6fb5 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Thu, 30 Nov 2023 22:38:51 -0500 Subject: [PATCH 12/25] Fix some correctness issues --- src/broyden.jl | 3 +-- src/dfsane.jl | 44 +++++++++++++++++----------------------- src/gaussnewton.jl | 1 - src/klement.jl | 13 ++++++------ src/lbroyden.jl | 1 - src/levenberg.jl | 2 -- src/linesearch.jl | 2 +- src/pseudotransient.jl | 1 - src/raphson.jl | 1 - src/trustRegion.jl | 1 - src/utils.jl | 1 + test/23_test_problems.jl | 7 ++++--- 12 files changed, 32 insertions(+), 45 deletions(-) diff --git a/src/broyden.jl b/src/broyden.jl index 9b165e513..8b271d16c 100644 --- a/src/broyden.jl +++ b/src/broyden.jl @@ -96,7 +96,6 @@ function perform_step!(cache::GeneralBroydenCache{iip}) where {iip} update_trace!(cache, α) check_and_update!(cache, cache.fu, cache.u, cache.u_cache) - cache.stats.nf += 1 cache.force_stop && return nothing @@ -114,7 +113,7 @@ function perform_step!(cache::GeneralBroydenCache{iip}) where {iip} else @bb cache.du .*= -1 @bb cache.J⁻¹dfu = cache.J⁻¹ × vec(cache.dfu) - @bb cache.u_cache = cache.J⁻¹ × vec(cache.du) + @bb cache.u_cache = transpose(cache.J⁻¹) × vec(cache.du) denom = dot(cache.du, cache.J⁻¹dfu) @bb @. cache.du = (cache.du - cache.J⁻¹dfu) / ifelse(iszero(denom), T(1e-5), denom) @bb cache.J⁻¹ += vec(cache.du) × transpose(cache.u_cache) diff --git a/src/dfsane.jl b/src/dfsane.jl index 8dcb1e9ff..570dd7ccd 100644 --- a/src/dfsane.jl +++ b/src/dfsane.jl @@ -58,6 +58,7 @@ end alg u u_cache + u_cache_2 fu fu_cache du @@ -95,6 +96,7 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::DFSane, args. 
@bb du = similar(u) @bb u_cache = copy(u) + @bb u_cache_2 = similar(u) fu = evaluate_f(prob, u) @bb fu_cache = copy(fu) @@ -108,10 +110,10 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::DFSane, args. termination_condition) trace = init_nonlinearsolve_trace(alg, u, fu, nothing, du; kwargs...) - return DFSaneCache{iip}(alg, u, u_cache, fu, fu_cache, du, history, f_norm, f_norm_0, - alg.M, T(alg.σ_1), T(alg.σ_min), T(alg.σ_max), one(T), T(alg.γ), T(alg.τ_min), - T(alg.τ_max), alg.n_exp, prob.p, false, maxiters, internalnorm, ReturnCode.Default, - abstol, reltol, prob, NLStats(1, 0, 0, 0, 0), tc_cache, trace) + return DFSaneCache{iip}(alg, u, u_cache, u_cache_2, fu, fu_cache, du, history, f_norm, + f_norm_0, alg.M, T(alg.σ_1), T(alg.σ_min), T(alg.σ_max), one(T), T(alg.γ), + T(alg.τ_min), T(alg.τ_max), alg.n_exp, prob.p, false, maxiters, internalnorm, + ReturnCode.Default, abstol, reltol, prob, NLStats(1, 0, 0, 0, 0), tc_cache, trace) end function perform_step!(cache::DFSaneCache{iip}) where {iip} @@ -119,37 +121,32 @@ function perform_step!(cache::DFSaneCache{iip}) where {iip} T = eltype(cache.u) f_norm_old = f_norm - # Spectral parameter range check - σ_n = sign(σ_n) * clamp(abs(σ_n), σ_min, σ_max) - # Line search direction @bb @. cache.du = -σ_n * cache.fu - η = alg.η_strategy(cache.f_norm_0, cache.stats.nsteps, cache.u, cache.fu) + η = alg.η_strategy(cache.f_norm_0, cache.stats.nsteps + 1, cache.u, cache.fu) f_bar = maximum(cache.history) α₊ = α_1 α₋ = α_1 - @bb axpy!(α₊, cache.du, cache.u) - - evaluate_f(cache, cache.u, cache.p) + @bb @. cache.u_cache_2 = cache.u + α₊ * cache.du + evaluate_f(cache, cache.u_cache_2, cache.p) f_norm = cache.internalnorm(cache.fu)^n_exp - α = α₊ + α = -α₊ inner_converged = false for k in 1:(cache.alg.max_inner_iterations) if f_norm ≤ f_bar + η - γ * α₊^2 * f_norm_old - α = α₊ + α = -α₊ inner_converged = true break end α₊ = α₊ * clamp(α₊ * f_norm_old / (f_norm + (T(2) * α₊ - T(1)) * f_norm_old), τ_min, τ_max) - @bb axpy!(-α₋, cache.du, cache.u) - - evaluate_f(cache, cache.u, cache.p) + @bb @. cache.u_cache_2 = cache.u - α₋ * cache.du + evaluate_f(cache, cache.u_cache_2, cache.p) f_norm = cache.internalnorm(cache.fu)^n_exp if f_norm ≤ f_bar + η - γ * α₋^2 * f_norm_old @@ -160,9 +157,8 @@ function perform_step!(cache::DFSaneCache{iip}) where {iip} α₋ = α₋ * clamp(α₋ * f_norm_old / (f_norm + (T(2) * α₋ - T(1)) * f_norm_old), τ_min, τ_max) - @bb axpy!(α₊, cache.du, cache.u) - - evaluate_f(cache, cache.u, cache.p) + @bb @. cache.u_cache_2 = cache.u + α₊ * cache.du + evaluate_f(cache, cache.u_cache_2, cache.p) f_norm = cache.internalnorm(cache.fu)^n_exp end @@ -171,6 +167,8 @@ function perform_step!(cache::DFSaneCache{iip}) where {iip} cache.force_stop = true end + @bb copyto!(cache.u, cache.u_cache_2) + update_trace!(cache, α) check_and_update!(cache, cache.fu, cache.u, cache.u_cache) @@ -178,14 +176,11 @@ function perform_step!(cache::DFSaneCache{iip}) where {iip} @bb @. cache.u_cache = cache.u - cache.u_cache @bb @. cache.fu_cache = cache.fu - cache.fu_cache - α₊ = sum(abs2, cache.u_cache) - @bb @. 
cache.u_cache *= cache.fu_cache - α₋ = sum(cache.u_cache) - cache.σ_n = α₊ / α₋ + cache.σ_n = dot(cache.u_cache, cache.u_cache) / dot(cache.fu_cache, cache.u_cache) # Spectral parameter bounds check if !(σ_min ≤ abs(cache.σ_n) ≤ σ_max) - test_norm = sqrt(sum(abs2, cache.fuprev)) + test_norm = dot(cache.fu, cache.fu) cache.σ_n = clamp(inv(test_norm), T(1), T(1e5)) end @@ -196,7 +191,6 @@ function perform_step!(cache::DFSaneCache{iip}) where {iip} # Update history cache.history[cache.stats.nsteps % M + 1] = f_norm - cache.stats.nf += 1 return nothing end diff --git a/src/gaussnewton.jl b/src/gaussnewton.jl index 5ff01d79a..f199b5f29 100644 --- a/src/gaussnewton.jl +++ b/src/gaussnewton.jl @@ -142,7 +142,6 @@ function perform_step!(cache::GaussNewtonCache{iip}) where {iip} @bb copyto!(cache.u_cache, cache.u) @bb copyto!(cache.dfu, cache.fu) - cache.stats.nf += 1 cache.stats.njacs += 1 cache.stats.nsolve += 1 cache.stats.nfactors += 1 diff --git a/src/klement.jl b/src/klement.jl index cceb51c3c..62aa8f681 100644 --- a/src/klement.jl +++ b/src/klement.jl @@ -1,6 +1,6 @@ """ - GeneralKlement(; max_resets = 5, linsolve = nothing, - linesearch = nothing, precs = DEFAULT_PRECS) + GeneralKlement(; max_resets = 5, linsolve = nothing, linesearch = nothing, + precs = DEFAULT_PRECS) An implementation of `Klement` with line search, preconditioning and customizable linear solves. @@ -91,8 +91,8 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::GeneralKleme termination_condition) trace = init_nonlinearsolve_trace(alg, u, fu, J, du; kwargs...) - @bb u_cache = similar(u) - @bb fu_cache = similar(fu) + @bb u_cache = copy(u) + @bb fu_cache = copy(fu) @bb J_cache = similar(J) @bb J_cache_2 = similar(J) @bb Jdu = similar(fu) @@ -139,7 +139,6 @@ function perform_step!(cache::GeneralKlementCache{iip}) where {iip} @bb copyto!(cache.u_cache, cache.u) - cache.stats.nf += 1 cache.stats.nsolve += 1 cache.stats.nfactors += 1 @@ -152,8 +151,8 @@ function perform_step!(cache::GeneralKlementCache{iip}) where {iip} @bb cache.Jdu_cache = cache.J_cache × vec(cache.Jdu) @bb cache.Jdu = cache.J × vec(cache.du) @bb @. cache.fu_cache = (cache.fu - cache.fu_cache - cache.Jdu) / - max(cache.Jdu_cache, eps(real(T))) - @bb cache.J_cache = vec(cache.fu) × transpose(_vec(cache.du)) + ifelse(iszero(cache.Jdu_cache), T(1e-5), cache.Jdu_cache) + @bb cache.J_cache = vec(cache.fu_cache) × transpose(_vec(cache.du)) @bb @. cache.J_cache *= cache.J @bb cache.J_cache_2 = cache.J_cache × cache.J @bb cache.J .+= cache.J_cache_2 diff --git a/src/lbroyden.jl b/src/lbroyden.jl index 611e5511b..34668e5c8 100644 --- a/src/lbroyden.jl +++ b/src/lbroyden.jl @@ -125,7 +125,6 @@ function perform_step!(cache::LimitedMemoryBroydenCache{iip}) where {iip} ApplyArray(*, Vᵀ_part, U_part), cache.du, α) check_and_update!(cache, cache.fu, cache.u, cache.u_cache) - cache.stats.nf += 1 cache.force_stop && return nothing diff --git a/src/levenberg.jl b/src/levenberg.jl index 3b523807c..5806734ae 100644 --- a/src/levenberg.jl +++ b/src/levenberg.jl @@ -312,7 +312,6 @@ function perform_step!(cache::LevenbergMarquardtCache{true, fastls}) where {fast _vec(cache.δ) .= _vec(v) .+ _vec(cache.a) ./ 2 @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache f(cache.fu_tmp, u .+ δ, p) - cache.stats.nf += 1 loss = cache.internalnorm(cache.fu_tmp) # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1). 
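# The uphill-step test referenced in the comment above is compact enough to state on its
# own. The sketch below is illustrative only and is not part of this patch; it assumes
# plain `Vector` inputs and borrows the cache field names (`v_old`, `b_uphill`, ...)
# purely for readability.
using LinearAlgebra: dot

function accept_uphill_step(loss, loss_old, v, v_old, norm_v, norm_v_old, b_uphill)
    # For the default 2-norm, β is the cosine of the angle between the current and the
    # previous velocity; β close to 1 means the step keeps moving in the same direction,
    # so a bounded increase in the loss is tolerated. With b_uphill = 0 this reduces to
    # the plain `loss ≤ loss_old` test.
    β = dot(v, v_old) / (norm_v * norm_v_old)
    return (1 - β)^b_uphill * loss ≤ loss_old
end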
@@ -411,7 +410,6 @@ function perform_step!(cache::LevenbergMarquardtCache{false, fastls}) where {fas cache.δ = _restructure(cache.δ, _vec(v) .+ _vec(cache.a) ./ 2) @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache fu_new = f(u .+ δ, p) - cache.stats.nf += 1 loss = cache.internalnorm(fu_new) # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1). diff --git a/src/linesearch.jl b/src/linesearch.jl index a2514e3a7..33de25ae7 100644 --- a/src/linesearch.jl +++ b/src/linesearch.jl @@ -1,5 +1,5 @@ """ - LineSearch(method = nothing, autodiff = nothing, alpha = true) + LineSearch(; method = nothing, autodiff = nothing, alpha = true) Wrapper over algorithms from [LineSearches.jl](https://github.com/JuliaNLSolvers/LineSearches.jl/). Allows automatic diff --git a/src/pseudotransient.jl b/src/pseudotransient.jl index b01762493..d4a41015a 100644 --- a/src/pseudotransient.jl +++ b/src/pseudotransient.jl @@ -145,7 +145,6 @@ function perform_step!(cache::PseudoTransientCache{iip}) where {iip} check_and_update!(cache, cache.fu, cache.u, cache.u_cache) @bb copyto!(cache.u_cache, cache.u) - cache.stats.nf += 1 cache.stats.njacs += 1 cache.stats.nsolve += 1 cache.stats.nfactors += 1 diff --git a/src/raphson.jl b/src/raphson.jl index ac40b7c64..baf2ec10c 100644 --- a/src/raphson.jl +++ b/src/raphson.jl @@ -119,7 +119,6 @@ function perform_step!(cache::NewtonRaphsonCache{iip}) where {iip} check_and_update!(cache, cache.fu, cache.u, cache.u_cache) @bb copyto!(cache.u_cache, cache.u) - cache.stats.nf += 1 cache.stats.njacs += 1 cache.stats.nsolve += 1 cache.stats.nfactors += 1 diff --git a/src/trustRegion.jl b/src/trustRegion.jl index 3cd40c907..7e5497ffd 100644 --- a/src/trustRegion.jl +++ b/src/trustRegion.jl @@ -376,7 +376,6 @@ function perform_step!(cache::TrustRegionCache{iip}) where {iip} @bb @. cache.u_cache_2 = cache.u + cache.du evaluate_f(cache, cache.u_tmp, cache.p, Val{:fu_cache_2}()) trust_region_step!(cache) - cache.stats.nf += 1 cache.stats.nsolve += 1 cache.stats.nfactors += 1 return nothing diff --git a/src/utils.jl b/src/utils.jl index 0267de434..5bb4e8dbb 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -197,6 +197,7 @@ end function evaluate_f(cache::AbstractNonlinearSolveCache, u, p, fu_sym::Val{FUSYM} = Val(nothing)) where {FUSYM} + cache.stats.nf += 1 if FUSYM === nothing if isinplace(cache) cache.prob.f(get_fu(cache), u, p) diff --git a/test/23_test_problems.jl b/test/23_test_problems.jl index 53b7b0f7b..8f6519e73 100644 --- a/test/23_test_problems.jl +++ b/test/23_test_problems.jl @@ -12,7 +12,7 @@ function test_on_library(problems, dicts, alg_ops, broken_tests, ϵ = 1e-4; @testset "$idx: $(dict["title"])" begin for alg in alg_ops try - sol = solve(nlprob, alg; + sol = solve(nlprob, alg; maxiters = 10000, termination_condition = AbsNormTerminationMode()) problem(res, sol.u, nothing) @@ -23,7 +23,8 @@ function test_on_library(problems, dicts, alg_ops, broken_tests, ϵ = 1e-4; end broken = idx in broken_tests[alg] ? true : false @test norm(res)≤ϵ broken=broken - catch + catch err + @error err broken = idx in broken_tests[alg] ? 
true : false if broken @test false broken=true @@ -83,7 +84,7 @@ end alg_ops = (DFSane(),) broken_tests = Dict(alg => Int[] for alg in alg_ops) - broken_tests[alg_ops[1]] = [1, 2, 3, 4, 5, 6, 11, 22] + broken_tests[alg_ops[1]] = [1, 2, 3, 5, 6, 21] test_on_library(problems, dicts, alg_ops, broken_tests) end From 954a79965f0ed08da0b4da34d9e5eb7180d1519c Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Thu, 30 Nov 2023 23:04:36 -0500 Subject: [PATCH 13/25] Cleanup Normal Form Equation Construction --- src/gaussnewton.jl | 5 +++-- src/jacobian.jl | 31 ++++++++++--------------------- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/src/gaussnewton.jl b/src/gaussnewton.jl index f199b5f29..c885b02f5 100644 --- a/src/gaussnewton.jl +++ b/src/gaussnewton.jl @@ -116,8 +116,8 @@ function perform_step!(cache::GaussNewtonCache{iip}) where {iip} # Use normal form to solve the Linear Problem if cache.JᵀJ !== nothing - __update_JᵀJ!(Val{iip}(), cache, :JᵀJ, cache.J) - __update_Jᵀf!(Val{iip}(), cache, :Jᵀf, :JᵀJ, cache.J, cache.fu) + __update_JᵀJ!(cache, Val(:JᵀJ)) + __update_Jᵀf!(cache, Val(:JᵀJ)) A, b = __maybe_symmetric(cache.JᵀJ), _vec(cache.Jᵀf) else A, b = cache.J, _vec(cache.fu) @@ -148,6 +148,7 @@ function perform_step!(cache::GaussNewtonCache{iip}) where {iip} return nothing end +# FIXME: Reinit `JᵀJ` operator if `p` is changed function __reinit_internal!(cache::GaussNewtonCache; termination_condition = get_termination_mode(cache.tc_cache_1), kwargs...) abstol, reltol, tc_cache_1 = init_termination_cache(cache.abstol, cache.reltol, diff --git a/src/jacobian.jl b/src/jacobian.jl index 83b17f834..03c2492fe 100644 --- a/src/jacobian.jl +++ b/src/jacobian.jl @@ -209,29 +209,18 @@ function __concrete_vjp_autodiff(vjp_autodiff, jvp_autodiff, uf) end # Generic Handling of Krylov Methods for Normal Form Linear Solves -# FIXME: Use MaybeInplace here for efficient matmuls -function __update_JᵀJ!(iip::Val, cache, sym::Symbol, J) - return __update_JᵀJ!(iip, cache, sym, getproperty(cache, sym), J) +function __update_JᵀJ!(cache::AbstractNonlinearSolveCache) + if !(cache.JᵀJ isa KrylovJᵀJ) + @bb cache.JᵀJ = transpose(cache.J) × cache.J + end end -__update_JᵀJ!(::Val{false}, cache, sym::Symbol, _, J) = setproperty!(cache, sym, J' * J) -__update_JᵀJ!(::Val{true}, cache, sym::Symbol, _, J) = mul!(getproperty(cache, sym), J', J) -__update_JᵀJ!(::Val{false}, cache, sym::Symbol, H::KrylovJᵀJ, J) = H -__update_JᵀJ!(::Val{true}, cache, sym::Symbol, H::KrylovJᵀJ, J) = H -function __update_Jᵀf!(iip::Val, cache, sym1::Symbol, sym2::Symbol, J, fu) - return __update_Jᵀf!(iip, cache, sym1, sym2, getproperty(cache, sym2), J, fu) -end -function __update_Jᵀf!(::Val{false}, cache, sym1::Symbol, sym2::Symbol, _, J, fu) - return setproperty!(cache, sym1, _restructure(getproperty(cache, sym1), J' * fu)) -end -function __update_Jᵀf!(::Val{true}, cache, sym1::Symbol, sym2::Symbol, _, J, fu) - return mul!(_vec(getproperty(cache, sym1)), J', fu) -end -function __update_Jᵀf!(::Val{false}, cache, sym1::Symbol, sym2::Symbol, H::KrylovJᵀJ, J, fu) - return setproperty!(cache, sym1, _restructure(getproperty(cache, sym1), H.Jᵀ * fu)) -end -function __update_Jᵀf!(::Val{true}, cache, sym1::Symbol, sym2::Symbol, H::KrylovJᵀJ, J, fu) - return mul!(_vec(getproperty(cache, sym1)), H.Jᵀ, fu) +function __update_Jᵀf!(cache::AbstractNonlinearSolveCache) + if cache.JᵀJ isa KrylovJᵀJ + @bb cache.Jᵀf = cache.JᵀJ.Jᵀ × cache.fu + else + @bb cache.Jᵀf = transpose(cache.J) × vec(cache.fu) + end end # Left-Right Multiplication From 
51f4a3e30ab5f8d375d24705d4f42cca51d097e7 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Sun, 3 Dec 2023 21:38:32 -0500 Subject: [PATCH 14/25] make progress on LM --- src/NonlinearSolve.jl | 2 +- src/gaussnewton.jl | 4 +- src/levenberg.jl | 411 +++++++++++++++++------------------------- src/utils.jl | 26 ++- 4 files changed, 189 insertions(+), 254 deletions(-) diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl index dacc98910..578343345 100644 --- a/src/NonlinearSolve.jl +++ b/src/NonlinearSolve.jl @@ -170,7 +170,7 @@ include("extension_algs.jl") include("linesearch.jl") include("raphson.jl") # include("trustRegion.jl") -# include("levenberg.jl") +include("levenberg.jl") include("gaussnewton.jl") include("dfsane.jl") include("pseudotransient.jl") diff --git a/src/gaussnewton.jl b/src/gaussnewton.jl index c885b02f5..94f2e975a 100644 --- a/src/gaussnewton.jl +++ b/src/gaussnewton.jl @@ -152,8 +152,8 @@ end function __reinit_internal!(cache::GaussNewtonCache; termination_condition = get_termination_mode(cache.tc_cache_1), kwargs...) abstol, reltol, tc_cache_1 = init_termination_cache(cache.abstol, cache.reltol, - cache.fu1, cache.u, termination_condition) - _, _, tc_cache_2 = init_termination_cache(cache.abstol, cache.reltol, cache.fu1, + cache.fu, cache.u, termination_condition) + _, _, tc_cache_2 = init_termination_cache(cache.abstol, cache.reltol, cache.fu, cache.u, termination_condition) cache.tc_cache_1 = tc_cache_1 diff --git a/src/levenberg.jl b/src/levenberg.jl index 5806734ae..1ef403895 100644 --- a/src/levenberg.jl +++ b/src/levenberg.jl @@ -79,18 +79,17 @@ routine for the factorization without constructing `JᵀJ` and `Jᵀf`. For more [this paper](https://arxiv.org/abs/1201.5885) to use a minimum value of the elements in `DᵀD` to prevent the damping from being too small. Defaults to `1e-8`. """ -@concrete struct LevenbergMarquardt{CJ, AD, T} <: - AbstractNewtonAlgorithm{CJ, AD} +@concrete struct LevenbergMarquardt{CJ, AD} <: AbstractNewtonAlgorithm{CJ, AD} ad::AD linsolve precs - damping_initial::T - damping_increase_factor::T - damping_decrease_factor::T - finite_diff_step_geodesic::T - α_geodesic::T - b_uphill::T - min_damping_D::T + damping_initial + damping_increase_factor + damping_decrease_factor + finite_diff_step_geodesic + α_geodesic + b_uphill + min_damping_D end function set_ad(alg::LevenbergMarquardt{CJ}, ad) where {CJ} @@ -100,11 +99,10 @@ function set_ad(alg::LevenbergMarquardt{CJ}, ad) where {CJ} end function LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing, - precs = DEFAULT_PRECS, damping_initial::Real = 1.0, - damping_increase_factor::Real = 2.0, - damping_decrease_factor::Real = 3.0, finite_diff_step_geodesic::Real = 0.1, - α_geodesic::Real = 0.75, b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8, - adkwargs...) + precs = DEFAULT_PRECS, damping_initial::Real = 1.0, α_geodesic::Real = 0.75, + damping_increase_factor::Real = 2.0, damping_decrease_factor::Real = 3.0, + finite_diff_step_geodesic::Real = 0.1, b_uphill::Real = 1.0, + min_damping_D::Real = 1e-8, adkwargs...) ad = default_adargs_to_adtype(; adkwargs...) 
_concrete_jac = ifelse(concrete_jac === nothing, true, concrete_jac) return LevenbergMarquardt{_unwrap_val(_concrete_jac)}(ad, linsolve, precs, @@ -117,14 +115,25 @@ end f alg u - u_prev - fu1 - fu2 + u_cache + u_cache_2 + fu + fu_cache + fu_cache_2 du + du_cache + J + JᵀJ + Jv + DᵀD + v + v_cache + a + mat_tmp + rhs_tmp p uf linsolve - J jac_cache force_stop::Bool maxiters::Int @@ -133,8 +142,6 @@ end abstol reltol prob - DᵀD - JᵀJ λ λ_factor damping_increase_factor @@ -143,20 +150,9 @@ end α_geodesic b_uphill min_damping_D - v - a - tmp_vec - v_old norm_v_old - δ loss_old make_new_J::Bool - fu_tmp - u_tmp - Jv - mat_tmp - rhs_tmp - J² stats::NLStats tc_cache_1 tc_cache_2 @@ -170,269 +166,186 @@ function SciMLBase.__init(prob::Union{NonlinearProblem{uType, iip}, linsolve_kwargs = (;), kwargs...) where {uType, iip, F} alg = get_concrete_algorithm(alg_, prob) @unpack f, u0, p = prob - u = alias_u0 ? u0 : deepcopy(u0) - fu1 = evaluate_f(prob, u) - linsolve_with_JᵀJ = Val(__needs_square_A(alg, u0)) + u = __maybe_unaliased(u0, alias_u0) + T = eltype(u) + fu = evaluate_f(prob, u) - if _unwrap_val(linsolve_with_JᵀJ) - uf, linsolve, J, fu2, jac_cache, du, JᵀJ, v = jacobian_caches(alg, f, u, p, - Val(iip); linsolve_kwargs, linsolve_with_JᵀJ) - J² = nothing - else - uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip); - linsolve_kwargs, linsolve_with_JᵀJ) - JᵀJ = similar(_vec(u)) - J² = similar(J) - v = similar(du) - end + fastls = !__needs_square_A(alg, u0) - λ = convert(eltype(u), alg.damping_initial) - λ_factor = convert(eltype(u), alg.damping_increase_factor) - damping_increase_factor = convert(eltype(u), alg.damping_increase_factor) - damping_decrease_factor = convert(eltype(u), alg.damping_decrease_factor) - h = convert(eltype(u), alg.finite_diff_step_geodesic) - α_geodesic = convert(eltype(u), alg.α_geodesic) - b_uphill = convert(eltype(u), alg.b_uphill) - min_damping_D = convert(eltype(u), alg.min_damping_D) - - if u isa Number - DᵀD = min_damping_D + if !fastls + uf, linsolve, J, fu_cache, jac_cache, du, JᵀJ, v = jacobian_caches(alg, f, u, p, + Val(iip); linsolve_kwargs, linsolve_with_JᵀJ = Val(true)) else - d = similar(u) - d .= min_damping_D - DᵀD = Diagonal(_vec(d)) + uf, linsolve, J, fu_cache, jac_cache, du = jacobian_caches(alg, f, u, p, + Val(iip); linsolve_kwargs, linsolve_with_JᵀJ = Val(false)) + @bb JᵀJ = similar(u) + @bb v = similar(du) end - loss = internalnorm(fu1) - a = _mutable_zero(u) - tmp_vec = _mutable_zero(u) - v_old = _mutable_zero(u) - δ = _mutable_zero(u) + λ = T(alg.damping_initial) + λ_factor = T(alg.damping_increase_factor) + damping_increase_factor = T(alg.damping_increase_factor) + damping_decrease_factor = T(alg.damping_decrease_factor) + h = T(alg.finite_diff_step_geodesic) + α_geodesic = T(alg.α_geodesic) + b_uphill = T(alg.b_uphill) + min_damping_D = T(alg.min_damping_D) + + DᵀD = __init_diagonal(u, min_damping_D) + + loss = internalnorm(fu) + + @bb a = similar(du) + @bb v_old = copy(v) + @bb δ = similar(du) + make_new_J = true - fu_tmp = zero(fu1) - abstol, reltol, tc_cache_1 = init_termination_cache(abstol, reltol, fu1, u, + abstol, reltol, tc_cache_1 = init_termination_cache(abstol, reltol, fu, u, termination_condition) if prob isa NonlinearLeastSquaresProblem - _, _, tc_cache_2 = init_termination_cache(abstol, reltol, fu1, u, + _, _, tc_cache_2 = init_termination_cache(abstol, reltol, fu, u, termination_condition) else tc_cache_2 = nothing end - trace = init_nonlinearsolve_trace(alg, u, fu1, ApplyArray(__zero, J), du; kwargs...) 
+ trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(__zero, J), du; kwargs...) - if _unwrap_val(linsolve_with_JᵀJ) - mat_tmp = zero(JᵀJ) + if !fastls + @bb mat_tmp = similar(JᵀJ) + @bb mat_tmp .*= T(0) rhs_tmp = nothing else - # Preserve Types mat_tmp = _vcat(J, DᵀD) - fill!(mat_tmp, zero(eltype(u))) - rhs_tmp = vcat(_vec(fu1), _vec(u)) - fill!(rhs_tmp, zero(eltype(u))) - linsolve = linsolve_caches(mat_tmp, rhs_tmp, u, p, alg) + @bb mat_tmp .*= T(0) + rhs_tmp = vcat(_vec(fu), _vec(u)) + @bb rhs_tmp .*= T(0) + linsolve = linsolve_caches(mat_tmp, rhs_tmp, u, p, alg; linsolve_kwargs) end - return LevenbergMarquardtCache{iip, !_unwrap_val(linsolve_with_JᵀJ)}(f, alg, u, copy(u), - fu1, fu2, du, p, uf, linsolve, J, jac_cache, false, maxiters, internalnorm, - ReturnCode.Default, abstol, reltol, prob, DᵀD, JᵀJ, λ, λ_factor, - damping_increase_factor, damping_decrease_factor, h, α_geodesic, b_uphill, - min_damping_D, v, a, tmp_vec, v_old, loss, δ, loss, make_new_J, fu_tmp, zero(u), - zero(fu1), mat_tmp, rhs_tmp, J², NLStats(1, 0, 0, 0, 0), tc_cache_1, tc_cache_2, - trace) + @bb u_cache = copy(u) + @bb u_cache_2 = similar(u) + @bb fu_cache_2 = similar(fu) + @bb du_cache = similar(du) + Jv = J * v + @bb v_cache = similar(v) + + return LevenbergMarquardtCache{iip, fastls}(f, alg, u, u_cache, u_cache_2, fu, fu_cache, + fu_cache_2, du, du_cache, J, JᵀJ, Jv, DᵀD, v, v_cache, a, mat_tmp, rhs_tmp, p, uf, + linsolve, jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, + reltol, prob, λ, λ_factor, damping_increase_factor, damping_decrease_factor, h, + α_geodesic, b_uphill, min_damping_D, internalnorm(v_cache), loss, make_new_J, + NLStats(1, 0, 0, 0, 0), tc_cache_1, tc_cache_2, trace) end -function perform_step!(cache::LevenbergMarquardtCache{true, fastls}) where {fastls} - @unpack fu1, f, make_new_J = cache - - if make_new_J - jacobian!!(cache.J, cache) +function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, fastls} + if cache.make_new_J + cache.J = jacobian!!(cache.J, cache) if fastls - cache.J² .= abs2.(cache.J) - sum!(cache.JᵀJ', cache.J²) - cache.DᵀD.diag .= max.(cache.DᵀD.diag, cache.JᵀJ) + cache.JᵀJ = __sum_JᵀJ!!(cache.JᵀJ, cache.J) + # cache.DᵀD.diag .= max.(cache.DᵀD.diag, cache.JᵀJ) else - __matmul!(cache.JᵀJ, cache.J', cache.J) - cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ)) + @bb cache.JᵀJ = transpose(cache.J) × cache.J + # cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ)) end cache.make_new_J = false - cache.stats.njacs += 1 end - @unpack u, u_prev, p, λ, JᵀJ, DᵀD, J, alg, linsolve = cache + + # @unpack u, u_prev, p, λ, JᵀJ, DᵀD, J, alg, linsolve = cache # Usual Levenberg-Marquardt step ("velocity"). # The following lines do: cache.v = -cache.mat_tmp \ cache.u_tmp - if fastls - copyto!(@view(cache.mat_tmp[1:length(fu1), :]), cache.J) - cache.mat_tmp[(length(fu1) + 1):end, :] .= λ .* cache.DᵀD - cache.rhs_tmp[1:length(fu1)] .= _vec(fu1) - linres = dolinsolve(alg.precs, linsolve; A = cache.mat_tmp, - b = cache.rhs_tmp, linu = _vec(cache.du), p = p, reltol = cache.abstol) - _vec(cache.v) .= -_vec(cache.du) - else - mul!(_vec(cache.u_tmp), J', _vec(fu1)) - @. 
cache.mat_tmp = JᵀJ + λ * DᵀD - linres = dolinsolve(alg.precs, linsolve; A = __maybe_symmetric(cache.mat_tmp), - b = _vec(cache.u_tmp), linu = _vec(cache.du), p = p, reltol = cache.abstol) - cache.linsolve = linres.cache - _vec(cache.v) .= -_vec(cache.du) - end + # if fastls + # copyto!(@view(cache.mat_tmp[1:length(fu1), :]), cache.J) + # cache.mat_tmp[(length(fu1) + 1):end, :] .= λ .* cache.DᵀD + # cache.rhs_tmp[1:length(fu1)] .= _vec(fu1) + # linres = dolinsolve(alg.precs, linsolve; A = cache.mat_tmp, + # b = cache.rhs_tmp, linu = _vec(cache.du), p = p, reltol = cache.abstol) + # _vec(cache.v) .= -_vec(cache.du) + # else + # mul!(_vec(cache.u_tmp), J', _vec(fu1)) + # @. cache.mat_tmp = JᵀJ + λ * DᵀD + # linres = dolinsolve(alg.precs, linsolve; A = __maybe_symmetric(cache.mat_tmp), + # b = _vec(cache.u_tmp), linu = _vec(cache.du), p = p, reltol = cache.abstol) + # cache.linsolve = linres.cache + # _vec(cache.v) .= -_vec(cache.du) + # end + + # update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), cache.J, + # cache.v) + + # # Geodesic acceleration (step_size = v + a / 2). + # @unpack v, α_geodesic, h = cache + # cache.u_tmp .= _restructure(cache.u_tmp, _vec(u) .+ h .* _vec(v)) + # f(cache.fu_tmp, cache.u_tmp, p) + + # # The following lines do: cache.a = -J \ cache.fu_tmp + # # NOTE: Don't pass `A` in again, since we want to reuse the previous solve + # mul!(_vec(cache.Jv), J, _vec(v)) + # @. cache.fu_tmp = (2 / h) * ((cache.fu_tmp - fu1) / h - cache.Jv) + # if fastls + # cache.rhs_tmp[1:length(fu1)] .= _vec(cache.fu_tmp) + # linres = dolinsolve(alg.precs, linsolve; b = cache.rhs_tmp, linu = _vec(cache.du), + # p = p, reltol = cache.abstol) + # else + # mul!(_vec(cache.u_tmp), J', _vec(cache.fu_tmp)) + # linres = dolinsolve(alg.precs, linsolve; b = _vec(cache.u_tmp), + # linu = _vec(cache.du), p = p, reltol = cache.abstol) + # cache.linsolve = linres.cache + # @. cache.a = -cache.du + # end - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), cache.J, - cache.v) - - # Geodesic acceleration (step_size = v + a / 2). - @unpack v, α_geodesic, h = cache - cache.u_tmp .= _restructure(cache.u_tmp, _vec(u) .+ h .* _vec(v)) - f(cache.fu_tmp, cache.u_tmp, p) - - # The following lines do: cache.a = -J \ cache.fu_tmp - # NOTE: Don't pass `A` in again, since we want to reuse the previous solve - mul!(_vec(cache.Jv), J, _vec(v)) - @. cache.fu_tmp = (2 / h) * ((cache.fu_tmp - fu1) / h - cache.Jv) - if fastls - cache.rhs_tmp[1:length(fu1)] .= _vec(cache.fu_tmp) - linres = dolinsolve(alg.precs, linsolve; b = cache.rhs_tmp, linu = _vec(cache.du), - p = p, reltol = cache.abstol) - else - mul!(_vec(cache.u_tmp), J', _vec(cache.fu_tmp)) - linres = dolinsolve(alg.precs, linsolve; b = _vec(cache.u_tmp), - linu = _vec(cache.du), p = p, reltol = cache.abstol) - cache.linsolve = linres.cache - @. cache.a = -cache.du - end cache.stats.nsolve += 2 cache.stats.nfactors += 2 # Require acceptable steps to satisfy the following condition. norm_v = cache.internalnorm(v) if 2 * cache.internalnorm(cache.a) ≤ α_geodesic * norm_v - _vec(cache.δ) .= _vec(v) .+ _vec(cache.a) ./ 2 - @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache - f(cache.fu_tmp, u .+ δ, p) - loss = cache.internalnorm(cache.fu_tmp) - - # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1). - β = dot(v, v_old) / (norm_v * norm_v_old) - if (1 - β)^b_uphill * loss ≤ loss_old - # Accept step. 
- cache.u .+= δ - check_and_update!(cache.tc_cache_1, cache, cache.fu_tmp, cache.u, cache.u_prev) - if !cache.force_stop && cache.tc_cache_2 !== nothing - # For NLLS Problems - cache.fu1 .= cache.fu_tmp .- cache.fu1 - check_and_update!(cache.tc_cache_2, cache, cache.fu1, cache.u, cache.u_prev) - end - cache.fu1 .= cache.fu_tmp - _vec(cache.v_old) .= _vec(v) - cache.norm_v_old = norm_v - cache.loss_old = loss - cache.λ_factor = 1 / cache.damping_decrease_factor - cache.make_new_J = true - end + # _vec(cache.δ) .= _vec(v) .+ _vec(cache.a) ./ 2 + # @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache + # f(cache.fu_tmp, u .+ δ, p) + # loss = cache.internalnorm(cache.fu_tmp) + + # # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1). + # β = dot(v, v_old) / (norm_v * norm_v_old) + # if (1 - β)^b_uphill * loss ≤ loss_old + # # Accept step. + # cache.u .+= δ + # check_and_update!(cache.tc_cache_1, cache, cache.fu_tmp, cache.u, cache.u_prev) + # if !cache.force_stop && cache.tc_cache_2 !== nothing + # # For NLLS Problems + # cache.fu1 .= cache.fu_tmp .- cache.fu1 + # check_and_update!(cache.tc_cache_2, cache, cache.fu1, cache.u, cache.u_prev) + # end + # cache.fu1 .= cache.fu_tmp + # _vec(cache.v_old) .= _vec(v) + # cache.norm_v_old = norm_v + # cache.loss_old = loss + # cache.λ_factor = 1 / cache.damping_decrease_factor + # cache.make_new_J = true + # end end - @. u_prev = u + + @bb copyto!(cache.u_cache, cache.u) cache.λ *= cache.λ_factor cache.λ_factor = cache.damping_increase_factor return nothing end -function perform_step!(cache::LevenbergMarquardtCache{false, fastls}) where {fastls} - @unpack fu1, f, make_new_J = cache - - if make_new_J - cache.J = jacobian!!(cache.J, cache) - if fastls - cache.JᵀJ = _vec(sum(abs2, cache.J; dims = 1)) - cache.DᵀD.diag .= max.(cache.DᵀD.diag, cache.JᵀJ) - else - cache.JᵀJ = cache.J' * cache.J - if cache.JᵀJ isa Number - cache.DᵀD = max(cache.DᵀD, cache.JᵀJ) - else - cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ)) - end - end - cache.make_new_J = false - cache.stats.njacs += 1 +function __reinit_internal!(cache::LevenbergMarquardtCache; + termination_condition = get_termination_mode(cache.tc_cache_1), kwargs...) + abstol, reltol, tc_cache_1 = init_termination_cache(cache.abstol, cache.reltol, + cache.fu, cache.u, termination_condition) + if cache.tc_cache_2 !== nothing + _, _, tc_cache_2 = init_termination_cache(cache.abstol, cache.reltol, cache.fu, + cache.u, termination_condition) + cache.tc_cache_2 = tc_cache_2 end - @unpack u, u_prev, p, λ, JᵀJ, DᵀD, J, linsolve, alg = cache - - # Usual Levenberg-Marquardt step ("velocity"). - if fastls - cache.mat_tmp = _vcat(J, λ * cache.DᵀD) - cache.rhs_tmp[1:length(fu1)] .= -_vec(fu1) - linres = dolinsolve(alg.precs, linsolve; A = cache.mat_tmp, - b = cache.rhs_tmp, linu = _vec(cache.v), p = p, reltol = cache.abstol) - else - cache.mat_tmp = JᵀJ + λ * DᵀD - if linsolve === nothing - cache.v = -cache.mat_tmp \ (J' * fu1) - else - linres = dolinsolve(alg.precs, linsolve; A = __maybe_symmetric(cache.mat_tmp), - b = _vec(J' * _vec(fu1)), linu = _vec(cache.v), p, reltol = cache.abstol) - cache.linsolve = linres.cache - cache.v .*= -1 - end - end - - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), cache.J, - cache.v) - - @unpack v, h, α_geodesic = cache - # Geodesic acceleration (step_size = v + a / 2). 
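# The geodesic-acceleration computation handled in this hunk admits a short out-of-place
# sketch. This is illustrative only, not part of this patch; it assumes a dense Jacobian
# `J`, an out-of-place residual `f(u, p)`, and a diagonal damping matrix `DᵀD`, and
# `geodesic_acceleration` is a hypothetical helper name rather than an API of this package.
using LinearAlgebra

function geodesic_acceleration(f, u, p, fu, J, v, λ, DᵀD, h)
    # Finite-difference estimate of the directional second derivative of f along v.
    d2 = (2 / h) .* ((f(u .+ h .* v, p) .- fu) ./ h .- J * v)
    # Solve the same damped normal equations that produced the velocity v.
    return -(J' * J + λ * DᵀD) \ (J' * d2)
end

# The combined step v + a / 2 is only accepted when 2 * norm(a) ≤ α_geodesic * norm(v).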
- rhs_term = _vec(((2 / h) .* ((_vec(f(u .+ h .* _restructure(u, v), p)) .- - _vec(fu1)) ./ h .- J * _vec(v)))) - if fastls - cache.rhs_tmp[1:length(fu1)] .= -_vec(rhs_term) - linres = dolinsolve(alg.precs, linsolve; - b = cache.rhs_tmp, linu = _vec(cache.a), p = p, reltol = cache.abstol) - else - if linsolve === nothing - cache.a = -cache.mat_tmp \ _vec(J' * rhs_term) - else - linres = dolinsolve(alg.precs, linsolve; A = __maybe_symmetric(cache.mat_tmp), - b = _mutable(_vec(J' * rhs_term)), linu = _vec(cache.a), p, - reltol = cache.abstol, reuse_A_if_factorization = true) - cache.linsolve = linres.cache - cache.a .*= -1 - end - end - cache.stats.nsolve += 1 - cache.stats.nfactors += 1 - - # Require acceptable steps to satisfy the following condition. - norm_v = cache.internalnorm(v) - if 2 * cache.internalnorm(cache.a) ≤ α_geodesic * norm_v - cache.δ = _restructure(cache.δ, _vec(v) .+ _vec(cache.a) ./ 2) - @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache - fu_new = f(u .+ δ, p) - loss = cache.internalnorm(fu_new) - - # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1). - β = dot(v, v_old) / (norm_v * norm_v_old) - if (1 - β)^b_uphill * loss ≤ loss_old - # Accept step. - cache.u += δ - check_and_update!(cache.tc_cache_1, cache, fu_new, cache.u, cache.u_prev) - if !cache.force_stop && cache.tc_cache_2 !== nothing - # For NLLS Problems - cache.fu1 = fu_new .- cache.fu1 - check_and_update!(cache.tc_cache_2, cache, cache.fu1, cache.u, cache.u_prev) - end - cache.fu1 = fu_new - cache.v_old = _restructure(cache.v_old, v) - cache.norm_v_old = norm_v - cache.loss_old = loss - cache.λ_factor = 1 / cache.damping_decrease_factor - cache.make_new_J = true - end - end - cache.u_prev = @. cache.u - cache.λ *= cache.λ_factor - cache.λ_factor = cache.damping_increase_factor + cache.tc_cache_1 = tc_cache_1 + cache.abstol = abstol + cache.reltol = reltol return nothing end diff --git a/src/utils.jl b/src/utils.jl index 5bb4e8dbb..a6b95c9ef 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -89,8 +89,8 @@ end DEFAULT_PRECS(W, du, u, p, t, newW, Plprev, Prprev, cachedata) = nothing, nothing function dolinsolve(precs::P, linsolve::FakeLinearSolveJLCache; A = nothing, - linu = nothing, b = nothing, du = nothing, p = nothing, weight = nothing, - cachedata = nothing, reltol = nothing, reuse_A_if_factorization = false) where {P} + linu = nothing, b = nothing, du = nothing, p = nothing, weight = nothing, + cachedata = nothing, reltol = nothing, reuse_A_if_factorization = false) where {P} A !== nothing && (linsolve.A = A) b !== nothing && (linsolve.b = b) linres = linsolve.A \ linsolve.b @@ -425,3 +425,25 @@ end return w end @inline __init_ones(x::StaticArray) = ones(typeof(x)) + +# Diagonal of type `u` +__init_diagonal(u::Number, v) = oftype(u, v) +function __init_diagonal(u::SArray, v) + u_ = vec(u) + return Diagonal(ones(typeof(u_)) * v) +end +function __init_diagonal(u, v) + d = similar(vec(u)) + d .= v + return Diagonal(d) +end + +# Reduce sum +function __sum_JᵀJ!!(y, J) + if setindex_trait(y) === CanSetindex() + sum!(abs2, y, J') + return y + else + return sum(abs2, J'; dims = 1) + end +end From 28e39dd025bfd60fca2f571f2a32248ba1f68563 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Mon, 4 Dec 2023 00:37:50 -0500 Subject: [PATCH 15/25] Kind of finish LM --- src/levenberg.jl | 151 +++++++++++++++++++++++++---------------------- src/utils.jl | 18 ++++++ 2 files changed, 100 insertions(+), 69 deletions(-) diff --git a/src/levenberg.jl b/src/levenberg.jl index 1ef403895..1836bceaa 
100644 --- a/src/levenberg.jl +++ b/src/levenberg.jl @@ -179,7 +179,8 @@ function SciMLBase.__init(prob::Union{NonlinearProblem{uType, iip}, else uf, linsolve, J, fu_cache, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip); linsolve_kwargs, linsolve_with_JᵀJ = Val(false)) - @bb JᵀJ = similar(u) + u_ = _vec(u) + @bb JᵀJ = similar(u_) @bb v = similar(du) end @@ -241,91 +242,103 @@ function SciMLBase.__init(prob::Union{NonlinearProblem{uType, iip}, end function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, fastls} + @unpack alg, linsolve = cache + if cache.make_new_J cache.J = jacobian!!(cache.J, cache) if fastls cache.JᵀJ = __sum_JᵀJ!!(cache.JᵀJ, cache.J) - # cache.DᵀD.diag .= max.(cache.DᵀD.diag, cache.JᵀJ) else @bb cache.JᵀJ = transpose(cache.J) × cache.J - # cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ)) end + cache.DᵀD = __update_LM_diagonal!!(cache.DᵀD, cache.JᵀJ) cache.make_new_J = false end - # @unpack u, u_prev, p, λ, JᵀJ, DᵀD, J, alg, linsolve = cache - # Usual Levenberg-Marquardt step ("velocity"). # The following lines do: cache.v = -cache.mat_tmp \ cache.u_tmp - # if fastls - # copyto!(@view(cache.mat_tmp[1:length(fu1), :]), cache.J) - # cache.mat_tmp[(length(fu1) + 1):end, :] .= λ .* cache.DᵀD - # cache.rhs_tmp[1:length(fu1)] .= _vec(fu1) - # linres = dolinsolve(alg.precs, linsolve; A = cache.mat_tmp, - # b = cache.rhs_tmp, linu = _vec(cache.du), p = p, reltol = cache.abstol) - # _vec(cache.v) .= -_vec(cache.du) - # else - # mul!(_vec(cache.u_tmp), J', _vec(fu1)) - # @. cache.mat_tmp = JᵀJ + λ * DᵀD - # linres = dolinsolve(alg.precs, linsolve; A = __maybe_symmetric(cache.mat_tmp), - # b = _vec(cache.u_tmp), linu = _vec(cache.du), p = p, reltol = cache.abstol) - # cache.linsolve = linres.cache - # _vec(cache.v) .= -_vec(cache.du) - # end - - # update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), cache.J, - # cache.v) - - # # Geodesic acceleration (step_size = v + a / 2). - # @unpack v, α_geodesic, h = cache - # cache.u_tmp .= _restructure(cache.u_tmp, _vec(u) .+ h .* _vec(v)) - # f(cache.fu_tmp, cache.u_tmp, p) - - # # The following lines do: cache.a = -J \ cache.fu_tmp - # # NOTE: Don't pass `A` in again, since we want to reuse the previous solve - # mul!(_vec(cache.Jv), J, _vec(v)) - # @. cache.fu_tmp = (2 / h) * ((cache.fu_tmp - fu1) / h - cache.Jv) - # if fastls - # cache.rhs_tmp[1:length(fu1)] .= _vec(cache.fu_tmp) - # linres = dolinsolve(alg.precs, linsolve; b = cache.rhs_tmp, linu = _vec(cache.du), - # p = p, reltol = cache.abstol) - # else - # mul!(_vec(cache.u_tmp), J', _vec(cache.fu_tmp)) - # linres = dolinsolve(alg.precs, linsolve; b = _vec(cache.u_tmp), - # linu = _vec(cache.du), p = p, reltol = cache.abstol) - # cache.linsolve = linres.cache - # @. cache.a = -cache.du - # end + if fastls + if setindex_trait(cache.mat_tmp) === CanSetindex() + copyto!(@view(cache.mat_tmp[1:length(cache.fu), :]), cache.J) + cache.mat_tmp[(length(cache.fu) + 1):end, :] .= cache.λ .* cache.DᵀD + else + cache.mat_tmp = _vcat(cache.J, cache.λ .* cache.DᵀD) + end + if setindex_trait(cache.rhs_tmp) === CanSetindex() + cache.rhs_tmp[1:length(cache.fu)] .= _vec(cache.fu) + else + cache.rhs_tmp = _vcat(_vec(cache.fu), zero(_vec(cache.u))) + end + linres = dolinsolve(alg.precs, linsolve; A = cache.mat_tmp, + b = cache.rhs_tmp, linu = _vec(cache.v), cache.p, reltol = cache.abstol) + @bb @. cache.v = -linres.u + else + @bb cache.u_cache_2 = transpose(J) × cache.fu + @bb @. 
cache.mat_tmp = cache.JᵀJ + cache.λ * cache.DᵀD + linres = dolinsolve(alg.precs, linsolve; A = cache.mat_tmp, + b = _vec(cache.u_cache_2), linu = _vec(cache.v), cache.p, reltol = cache.abstol) + cache.linsolve = linres.cache + @bb @. cache.v = -linres.u + end + + update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), cache.J, + cache.v) + + # Geodesic acceleration (step_size = v + a / 2). + @bb @. cache.u_cache_2 = cache.u + cache.h * cache.v + evaluate_f(cache, cache.u_cache_2, cache.p, Val(:fu_cache_2)) + + # The following lines do: cache.a = -J \ cache.fu_tmp + # NOTE: Don't pass `A` in again, since we want to reuse the previous solve + @bb cache.Jv = cache.J × cache.v + @bb @. cache.fu_cache_2 = (2 / cache.h) * + ((cache.fu_cache_2 - cache.fu) / cache.h - cache.Jv) + if fastls + if setindex_trait(cache.rhs_tmp) === CanSetindex() + cache.rhs_tmp[1:length(cache.fu)] .= _vec(cache.fu_cache_2) + else + cache.rhs_tmp = _vcat(_vec(cache.fu_cache_2), zero(_vec(cache.u))) + end + linres = dolinsolve(alg.precs, linsolve; b = cache.rhs_tmp, linu = _vec(cache.a), + cache.p, reltol = cache.abstol) + @bb @. cache.a = -linres.u + else + @bb cache.u_cache_2 = transpose(J) × cache.fu_cache_2 + linres = dolinsolve(alg.precs, linsolve; b = _vec(cache.u_cache_2), + linu = _vec(cache.a), cache.p, reltol = cache.abstol) + cache.linsolve = linres.cache + @bb @. cache.a = -linres.du + end cache.stats.nsolve += 2 cache.stats.nfactors += 2 # Require acceptable steps to satisfy the following condition. - norm_v = cache.internalnorm(v) - if 2 * cache.internalnorm(cache.a) ≤ α_geodesic * norm_v - # _vec(cache.δ) .= _vec(v) .+ _vec(cache.a) ./ 2 - # @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache - # f(cache.fu_tmp, u .+ δ, p) - # loss = cache.internalnorm(cache.fu_tmp) - - # # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1). - # β = dot(v, v_old) / (norm_v * norm_v_old) - # if (1 - β)^b_uphill * loss ≤ loss_old - # # Accept step. - # cache.u .+= δ - # check_and_update!(cache.tc_cache_1, cache, cache.fu_tmp, cache.u, cache.u_prev) - # if !cache.force_stop && cache.tc_cache_2 !== nothing - # # For NLLS Problems - # cache.fu1 .= cache.fu_tmp .- cache.fu1 - # check_and_update!(cache.tc_cache_2, cache, cache.fu1, cache.u, cache.u_prev) - # end - # cache.fu1 .= cache.fu_tmp - # _vec(cache.v_old) .= _vec(v) - # cache.norm_v_old = norm_v - # cache.loss_old = loss - # cache.λ_factor = 1 / cache.damping_decrease_factor - # cache.make_new_J = true - # end + norm_v = cache.internalnorm(cache.v) + if 2 * cache.internalnorm(cache.a) ≤ cache.α_geodesic * norm_v + @bb @. cache.du_cache = cache.v + cache.a / 2 + @bb @. cache.u_cache_2 = cache.u + cache.du_cache + evaluate_f(cache, cache.u_cache_2, cache.p, Val(:fu_cache_2)) + loss = cache.internalnorm(cache.fu_cache_2) + + # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1). + β = dot(cache.v, cache.v_cache) / (norm_v * cache.norm_v_old) + if (1 - β)^cache.b_uphill * loss ≤ cache.loss_old + # Accept step. + @bb copyto!(cache.u, cache.u_cache_2) + check_and_update!(cache.tc_cache_1, cache, cache.fu_cache, cache.u, + cache.u_cache) + if !cache.force_stop && cache.tc_cache_2 !== nothing # For NLLS Problems + @bb @. 
cache.fu = cache.fu_cache_2 - cache.fu + check_and_update!(cache.tc_cache_2, cache, cache.fu, cache.u, cache.u_cache) + end + @bb copyto!(cache.fu_cache, cache.fu_cache_2) + @bb copyto!(cache.v_cache, cache.v) + cache.norm_v_old = norm_v + cache.loss_old = loss + cache.λ_factor = 1 / cache.damping_decrease_factor + cache.make_new_J = true + end end @bb copyto!(cache.u_cache, cache.u) diff --git a/src/utils.jl b/src/utils.jl index a6b95c9ef..787d697d6 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -447,3 +447,21 @@ function __sum_JᵀJ!!(y, J) return sum(abs2, J'; dims = 1) end end + +function __update_LM_diagonal!!(y::Diagonal, x::AbstractVector) + if setindex_trait(y.diag) === CanSetindex() + @. y.diag = max(y.diag, x) + return y + else + return Diagonal(max.(y.diag, x)) + end +end +@views function __update_LM_diagonal!!(y::Diagonal, x::AbstractMatrix) + x_diag = x[diagind(x)] + if setindex_trait(y.diag) === CanSetindex() + @. y.diag = max(y.diag, x_diag) + return y + else + return Diagonal(max.(y.diag, x_diag)) + end +end From c8f728326c2444838c5a6702914ae8a82724fd3d Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Mon, 4 Dec 2023 12:55:25 -0500 Subject: [PATCH 16/25] Patch tracing and LM --- src/levenberg.jl | 6 +++--- src/trace.jl | 8 ++++---- src/utils.jl | 5 +++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/levenberg.jl b/src/levenberg.jl index 1836bceaa..f47c36347 100644 --- a/src/levenberg.jl +++ b/src/levenberg.jl @@ -273,7 +273,7 @@ function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, b = cache.rhs_tmp, linu = _vec(cache.v), cache.p, reltol = cache.abstol) @bb @. cache.v = -linres.u else - @bb cache.u_cache_2 = transpose(J) × cache.fu + @bb cache.u_cache_2 = transpose(cache.J) × cache.fu @bb @. cache.mat_tmp = cache.JᵀJ + cache.λ * cache.DᵀD linres = dolinsolve(alg.precs, linsolve; A = cache.mat_tmp, b = _vec(cache.u_cache_2), linu = _vec(cache.v), cache.p, reltol = cache.abstol) @@ -288,7 +288,7 @@ function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, @bb @. cache.u_cache_2 = cache.u + cache.h * cache.v evaluate_f(cache, cache.u_cache_2, cache.p, Val(:fu_cache_2)) - # The following lines do: cache.a = -J \ cache.fu_tmp + # The following lines do: cache.a = -cache.mat_tmp \ cache.fu_tmp # NOTE: Don't pass `A` in again, since we want to reuse the previous solve @bb cache.Jv = cache.J × cache.v @bb @. cache.fu_cache_2 = (2 / cache.h) * @@ -332,7 +332,7 @@ function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, @bb @. 
cache.fu = cache.fu_cache_2 - cache.fu check_and_update!(cache.tc_cache_2, cache, cache.fu, cache.u, cache.u_cache) end - @bb copyto!(cache.fu_cache, cache.fu_cache_2) + @bb copyto!(cache.fu, cache.fu_cache_2) @bb copyto!(cache.v_cache, cache.v) cache.norm_v_old = norm_v cache.loss_old = loss diff --git a/src/trace.jl b/src/trace.jl index 39c01d2c7..9e042f0bc 100644 --- a/src/trace.jl +++ b/src/trace.jl @@ -209,8 +209,8 @@ function update_trace!(trace::NonlinearSolveTrace{ShT, StT}, iter, u, fu, J, δu return trace end - show_now = ShT && (iter % trace.trace_level.print_frequency == 1) - store_now = StT && (iter % trace.trace_level.store_frequency == 1) + show_now = ShT && (mod1(iter, trace.trace_level.print_frequency) == 1) + store_now = StT && (mod1(iter, trace.trace_level.store_frequency) == 1) (show_now || store_now) && (entry = __trace_entry(trace.trace_level, iter, u, fu, J, δu, α)) store_now && push!(trace.history, entry) @@ -230,8 +230,8 @@ function update_trace_with_invJ!(trace::NonlinearSolveTrace{ShT, StT}, iter, u, return trace end - show_now = ShT && (iter % trace.trace_level.print_frequency == 1) - store_now = StT && (iter % trace.trace_level.store_frequency == 1) + show_now = ShT && (mod1(iter, trace.trace_level.print_frequency) == 1) + store_now = StT && (mod1(iter, trace.trace_level.store_frequency) == 1) if show_now || store_now J_ = trace.trace_level isa TraceMinimal ? J : inv(J) entry = __trace_entry(trace.trace_level, iter, u, fu, J_, δu, α) diff --git a/src/utils.jl b/src/utils.jl index 787d697d6..0b64ea839 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -448,7 +448,8 @@ function __sum_JᵀJ!!(y, J) end end -function __update_LM_diagonal!!(y::Diagonal, x::AbstractVector) +@inline __update_LM_diagonal!!(y::Number, x::Number) = max(y, x) +@inline function __update_LM_diagonal!!(y::Diagonal, x::AbstractVector) if setindex_trait(y.diag) === CanSetindex() @. y.diag = max(y.diag, x) return y @@ -456,7 +457,7 @@ function __update_LM_diagonal!!(y::Diagonal, x::AbstractVector) return Diagonal(max.(y.diag, x)) end end -@views function __update_LM_diagonal!!(y::Diagonal, x::AbstractMatrix) +@inline @views function __update_LM_diagonal!!(y::Diagonal, x::AbstractMatrix) x_diag = x[diagind(x)] if setindex_trait(y.diag) === CanSetindex() @. y.diag = max(y.diag, x_diag) From 13e590e45991b96cd12a945d3488e3890493c4fc Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Mon, 4 Dec 2023 14:00:56 -0500 Subject: [PATCH 17/25] LM Fixed --- src/jacobian.jl | 6 +++--- src/levenberg.jl | 31 +++++++++++++------------------ src/pseudotransient.jl | 2 +- src/utils.jl | 18 +++++++++++++----- test/23_test_problems.jl | 4 ++-- 5 files changed, 32 insertions(+), 29 deletions(-) diff --git a/src/jacobian.jl b/src/jacobian.jl index 03c2492fe..2e539fcd8 100644 --- a/src/jacobian.jl +++ b/src/jacobian.jl @@ -138,13 +138,13 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u::Number, kwargs...) 
where {needsJᵀJ, F} # NOTE: Scalar `u` assumes scalar output from `f` uf = SciMLBase.JacobianWrapper{false}(f, p) - needsJᵀJ && return uf, nothing, u, nothing, nothing, u, u, u - return uf, FakeLinearSolveJLCache(u, u), u, nothing, nothing, u + return uf, FakeLinearSolveJLCache(u, u), u, nothing, nothing, u, u, u end # Linear Solve Cache function linsolve_caches(A, b, u, p, alg; linsolve_kwargs = (;)) - if alg.linsolve === nothing && A isa SMatrix && linsolve_kwargs === (;) + if A isa Number || + (alg.linsolve === nothing && A isa SMatrix && linsolve_kwargs === (;)) # Default handling for SArrays in LinearSolve is not great. Some parts are patched # but there are quite a few unnecessary allocations return FakeLinearSolveJLCache(A, b) diff --git a/src/levenberg.jl b/src/levenberg.jl index f47c36347..9463a7c34 100644 --- a/src/levenberg.jl +++ b/src/levenberg.jl @@ -120,8 +120,6 @@ end fu fu_cache fu_cache_2 - du - du_cache J JᵀJ Jv @@ -197,9 +195,7 @@ function SciMLBase.__init(prob::Union{NonlinearProblem{uType, iip}, loss = internalnorm(fu) - @bb a = similar(du) - @bb v_old = copy(v) - @bb δ = similar(du) + a = du # `du` is not used anywhere, use it to store `a` make_new_J = true @@ -215,8 +211,7 @@ function SciMLBase.__init(prob::Union{NonlinearProblem{uType, iip}, trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(__zero, J), du; kwargs...) if !fastls - @bb mat_tmp = similar(JᵀJ) - @bb mat_tmp .*= T(0) + @bb mat_tmp = zero(JᵀJ) rhs_tmp = nothing else mat_tmp = _vcat(J, DᵀD) @@ -229,15 +224,14 @@ function SciMLBase.__init(prob::Union{NonlinearProblem{uType, iip}, @bb u_cache = copy(u) @bb u_cache_2 = similar(u) @bb fu_cache_2 = similar(fu) - @bb du_cache = similar(du) Jv = J * v - @bb v_cache = similar(v) + @bb v_cache = zero(v) return LevenbergMarquardtCache{iip, fastls}(f, alg, u, u_cache, u_cache_2, fu, fu_cache, - fu_cache_2, du, du_cache, J, JᵀJ, Jv, DᵀD, v, v_cache, a, mat_tmp, rhs_tmp, p, uf, + fu_cache_2, J, JᵀJ, Jv, DᵀD, v, v_cache, a, mat_tmp, rhs_tmp, p, uf, linsolve, jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, reltol, prob, λ, λ_factor, damping_increase_factor, damping_decrease_factor, h, - α_geodesic, b_uphill, min_damping_D, internalnorm(v_cache), loss, make_new_J, + α_geodesic, b_uphill, min_damping_D, loss, loss, make_new_J, NLStats(1, 0, 0, 0, 0), tc_cache_1, tc_cache_2, trace) end @@ -271,11 +265,12 @@ function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, end linres = dolinsolve(alg.precs, linsolve; A = cache.mat_tmp, b = cache.rhs_tmp, linu = _vec(cache.v), cache.p, reltol = cache.abstol) + cache.linsolve = linres.cache @bb @. cache.v = -linres.u else @bb cache.u_cache_2 = transpose(cache.J) × cache.fu @bb @. cache.mat_tmp = cache.JᵀJ + cache.λ * cache.DᵀD - linres = dolinsolve(alg.precs, linsolve; A = cache.mat_tmp, + linres = dolinsolve(alg.precs, linsolve; A = __maybe_symmetric(cache.mat_tmp), b = _vec(cache.u_cache_2), linu = _vec(cache.v), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache @bb @. cache.v = -linres.u @@ -289,7 +284,7 @@ function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, evaluate_f(cache, cache.u_cache_2, cache.p, Val(:fu_cache_2)) # The following lines do: cache.a = -cache.mat_tmp \ cache.fu_tmp - # NOTE: Don't pass `A` in again, since we want to reuse the previous solve + # NOTE: Don't pass `A`` in again, since we want to reuse the previous solve @bb cache.Jv = cache.J × cache.v @bb @. 
cache.fu_cache_2 = (2 / cache.h) * ((cache.fu_cache_2 - cache.fu) / cache.h - cache.Jv) @@ -301,13 +296,14 @@ function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, end linres = dolinsolve(alg.precs, linsolve; b = cache.rhs_tmp, linu = _vec(cache.a), cache.p, reltol = cache.abstol) + cache.linsolve = linres.cache @bb @. cache.a = -linres.u else - @bb cache.u_cache_2 = transpose(J) × cache.fu_cache_2 + @bb cache.u_cache_2 = transpose(cache.J) × cache.fu_cache_2 linres = dolinsolve(alg.precs, linsolve; b = _vec(cache.u_cache_2), linu = _vec(cache.a), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache - @bb @. cache.a = -linres.du + @bb @. cache.a = -linres.u end cache.stats.nsolve += 2 @@ -316,8 +312,7 @@ function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, # Require acceptable steps to satisfy the following condition. norm_v = cache.internalnorm(cache.v) if 2 * cache.internalnorm(cache.a) ≤ cache.α_geodesic * norm_v - @bb @. cache.du_cache = cache.v + cache.a / 2 - @bb @. cache.u_cache_2 = cache.u + cache.du_cache + @bb @. cache.u_cache_2 = cache.u + cache.v + cache.a / 2 evaluate_f(cache, cache.u_cache_2, cache.p, Val(:fu_cache_2)) loss = cache.internalnorm(cache.fu_cache_2) @@ -326,7 +321,7 @@ function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, if (1 - β)^cache.b_uphill * loss ≤ cache.loss_old # Accept step. @bb copyto!(cache.u, cache.u_cache_2) - check_and_update!(cache.tc_cache_1, cache, cache.fu_cache, cache.u, + check_and_update!(cache.tc_cache_1, cache, cache.fu_cache_2, cache.u, cache.u_cache) if !cache.force_stop && cache.tc_cache_2 !== nothing # For NLLS Problems @bb @. cache.fu = cache.fu_cache_2 - cache.fu diff --git a/src/pseudotransient.jl b/src/pseudotransient.jl index d4a41015a..dfaf80180 100644 --- a/src/pseudotransient.jl +++ b/src/pseudotransient.jl @@ -112,12 +112,12 @@ function perform_step!(cache::PseudoTransientCache{iip}) where {iip} if cache.J isa SciMLOperators.AbstractSciMLOperator A = cache.J - inv_α * I elseif setindex_trait(cache.J) === CanSetindex() - idxs = diagind(cache.J) if fast_scalar_indexing(cache.J) @inbounds for i in axes(cache.J, 1) cache.J[i, i] = cache.J[i, i] - inv_α end else + idxs = diagind(cache.J) @.. broadcast=false @view(cache.J[idxs])=@view(cache.J[idxs]) - inv_α end A = cache.J diff --git a/src/utils.jl b/src/utils.jl index 0b64ea839..e19771ef7 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -457,12 +457,20 @@ end return Diagonal(max.(y.diag, x)) end end -@inline @views function __update_LM_diagonal!!(y::Diagonal, x::AbstractMatrix) - x_diag = x[diagind(x)] +@inline function __update_LM_diagonal!!(y::Diagonal, x::AbstractMatrix) if setindex_trait(y.diag) === CanSetindex() - @. y.diag = max(y.diag, x_diag) - return y + if fast_scalar_indexing(y.diag) + @inbounds for i in axes(x, 1) + y.diag[i] = max(y.diag[i], x[i, i]) + end + return y + else + idxs = diagind(x) + @.. broadcast=false y.diag=max(y.diag, @view(x[idxs])) + return y + end else - return Diagonal(max.(y.diag, x_diag)) + idxs = diagind(x) + return Diagonal(@.. 
broadcast=false max(y.diag, @view(x[idxs]))) end end diff --git a/test/23_test_problems.jl b/test/23_test_problems.jl index 8f6519e73..741402057 100644 --- a/test/23_test_problems.jl +++ b/test/23_test_problems.jl @@ -73,8 +73,8 @@ end # dictionary with indices of test problems where method does not converge to small residual broken_tests = Dict(alg => Int[] for alg in alg_ops) - broken_tests[alg_ops[1]] = [3, 6, 17, 21] - broken_tests[alg_ops[2]] = [3, 6, 17, 21] + broken_tests[alg_ops[1]] = [3, 6, 11, 17, 21] + broken_tests[alg_ops[2]] = [3, 6, 11, 17, 21] broken_tests[alg_ops[3]] = [6, 11, 17, 21] test_on_library(problems, dicts, alg_ops, broken_tests) From 445e97b92bfc37d5b3d3443fec63b3239dee2269 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Mon, 4 Dec 2023 20:30:13 -0500 Subject: [PATCH 18/25] Trust Region mostly works --- src/NonlinearSolve.jl | 98 +++++----- src/gaussnewton.jl | 4 +- src/jacobian.jl | 46 +++-- src/trustRegion.jl | 433 ++++++++++++++++-------------------------- src/utils.jl | 8 - 5 files changed, 245 insertions(+), 344 deletions(-) diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl index 578343345..278667790 100644 --- a/src/NonlinearSolve.jl +++ b/src/NonlinearSolve.jl @@ -169,7 +169,7 @@ include("trace.jl") include("extension_algs.jl") include("linesearch.jl") include("raphson.jl") -# include("trustRegion.jl") +include("trustRegion.jl") include("levenberg.jl") include("gaussnewton.jl") include("dfsane.jl") @@ -179,54 +179,54 @@ include("klement.jl") include("lbroyden.jl") include("jacobian.jl") include("ad.jl") -# include("default.jl") - -# @setup_workload begin -# nlfuncs = ((NonlinearFunction{false}((u, p) -> u .* u .- p), 0.1), -# (NonlinearFunction{false}((u, p) -> u .* u .- p), [0.1]), -# (NonlinearFunction{true}((du, u, p) -> du .= u .* u .- p), [0.1])) -# probs_nls = NonlinearProblem[] -# for T in (Float32, Float64), (fn, u0) in nlfuncs -# push!(probs_nls, NonlinearProblem(fn, T.(u0), T(2))) -# end - -# nls_algs = (NewtonRaphson(), TrustRegion(), LevenbergMarquardt(), PseudoTransient(), -# GeneralBroyden(), GeneralKlement(), DFSane(), nothing) - -# probs_nlls = NonlinearLeastSquaresProblem[] -# nlfuncs = ((NonlinearFunction{false}((u, p) -> (u .^ 2 .- p)[1:1]), [0.1, 0.0]), -# (NonlinearFunction{false}((u, p) -> vcat(u .* u .- p, u .* u .- p)), [0.1, 0.1]), -# (NonlinearFunction{true}((du, u, p) -> du[1] = u[1] * u[1] - p, -# resid_prototype = zeros(1)), [0.1, 0.0]), -# (NonlinearFunction{true}((du, u, p) -> du .= vcat(u .* u .- p, u .* u .- p), -# resid_prototype = zeros(4)), [0.1, 0.1])) -# for (fn, u0) in nlfuncs -# push!(probs_nlls, NonlinearLeastSquaresProblem(fn, u0, 2.0)) -# end -# nlfuncs = ((NonlinearFunction{false}((u, p) -> (u .^ 2 .- p)[1:1]), Float32[0.1, 0.0]), -# (NonlinearFunction{false}((u, p) -> vcat(u .* u .- p, u .* u .- p)), -# Float32[0.1, 0.1]), -# (NonlinearFunction{true}((du, u, p) -> du[1] = u[1] * u[1] - p, -# resid_prototype = zeros(Float32, 1)), Float32[0.1, 0.0]), -# (NonlinearFunction{true}((du, u, p) -> du .= vcat(u .* u .- p, u .* u .- p), -# resid_prototype = zeros(Float32, 4)), Float32[0.1, 0.1])) -# for (fn, u0) in nlfuncs -# push!(probs_nlls, NonlinearLeastSquaresProblem(fn, u0, 2.0f0)) -# end - -# nlls_algs = (LevenbergMarquardt(), GaussNewton(), -# LevenbergMarquardt(; linsolve = LUFactorization()), -# GaussNewton(; linsolve = LUFactorization())) - -# @compile_workload begin -# for prob in probs_nls, alg in nls_algs -# solve(prob, alg, abstol = 1e-2) -# end -# for prob in probs_nlls, alg in nlls_algs -# 
solve(prob, alg, abstol = 1e-2) -# end -# end -# end +include("default.jl") + +@setup_workload begin + nlfuncs = ((NonlinearFunction{false}((u, p) -> u .* u .- p), 0.1), + (NonlinearFunction{false}((u, p) -> u .* u .- p), [0.1]), + (NonlinearFunction{true}((du, u, p) -> du .= u .* u .- p), [0.1])) + probs_nls = NonlinearProblem[] + for T in (Float32, Float64), (fn, u0) in nlfuncs + push!(probs_nls, NonlinearProblem(fn, T.(u0), T(2))) + end + + nls_algs = (NewtonRaphson(), TrustRegion(), LevenbergMarquardt(), PseudoTransient(), + GeneralBroyden(), GeneralKlement(), DFSane(), nothing) + + probs_nlls = NonlinearLeastSquaresProblem[] + nlfuncs = ((NonlinearFunction{false}((u, p) -> (u .^ 2 .- p)[1:1]), [0.1, 0.0]), + (NonlinearFunction{false}((u, p) -> vcat(u .* u .- p, u .* u .- p)), [0.1, 0.1]), + (NonlinearFunction{true}((du, u, p) -> du[1] = u[1] * u[1] - p, + resid_prototype = zeros(1)), [0.1, 0.0]), + (NonlinearFunction{true}((du, u, p) -> du .= vcat(u .* u .- p, u .* u .- p), + resid_prototype = zeros(4)), [0.1, 0.1])) + for (fn, u0) in nlfuncs + push!(probs_nlls, NonlinearLeastSquaresProblem(fn, u0, 2.0)) + end + nlfuncs = ((NonlinearFunction{false}((u, p) -> (u .^ 2 .- p)[1:1]), Float32[0.1, 0.0]), + (NonlinearFunction{false}((u, p) -> vcat(u .* u .- p, u .* u .- p)), + Float32[0.1, 0.1]), + (NonlinearFunction{true}((du, u, p) -> du[1] = u[1] * u[1] - p, + resid_prototype = zeros(Float32, 1)), Float32[0.1, 0.0]), + (NonlinearFunction{true}((du, u, p) -> du .= vcat(u .* u .- p, u .* u .- p), + resid_prototype = zeros(Float32, 4)), Float32[0.1, 0.1])) + for (fn, u0) in nlfuncs + push!(probs_nlls, NonlinearLeastSquaresProblem(fn, u0, 2.0f0)) + end + + nlls_algs = (LevenbergMarquardt(), GaussNewton(), + LevenbergMarquardt(; linsolve = LUFactorization()), + GaussNewton(; linsolve = LUFactorization())) + + @compile_workload begin + for prob in probs_nls, alg in nls_algs + solve(prob, alg, abstol = 1e-2) + end + for prob in probs_nlls, alg in nlls_algs + solve(prob, alg, abstol = 1e-2) + end + end +end export RadiusUpdateSchemes diff --git a/src/gaussnewton.jl b/src/gaussnewton.jl index 94f2e975a..9a227a7fa 100644 --- a/src/gaussnewton.jl +++ b/src/gaussnewton.jl @@ -116,8 +116,8 @@ function perform_step!(cache::GaussNewtonCache{iip}) where {iip} # Use normal form to solve the Linear Problem if cache.JᵀJ !== nothing - __update_JᵀJ!(cache, Val(:JᵀJ)) - __update_Jᵀf!(cache, Val(:JᵀJ)) + __update_JᵀJ!(cache) + __update_Jᵀf!(cache) A, b = __maybe_symmetric(cache.JᵀJ), _vec(cache.Jᵀf) else A, b = cache.J, _vec(cache.fu) diff --git a/src/jacobian.jl b/src/jacobian.jl index 2e539fcd8..cd84b5d1d 100644 --- a/src/jacobian.jl +++ b/src/jacobian.jl @@ -138,7 +138,7 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u::Number, kwargs...) where {needsJᵀJ, F} # NOTE: Scalar `u` assumes scalar output from `f` uf = SciMLBase.JacobianWrapper{false}(f, p) - return uf, FakeLinearSolveJLCache(u, u), u, nothing, nothing, u, u, u + return uf, FakeLinearSolveJLCache(u, u), u, zero(u), nothing, u, u, u end # Linear Solve Cache @@ -208,27 +208,49 @@ function __concrete_vjp_autodiff(vjp_autodiff, jvp_autodiff, uf) end end +# jvp fallback scalar +__jacvec(args...; kwargs...) = JacVec(args...; kwargs...) +function __jacvec(uf, u::Number; autodiff, kwargs...) + @assert autodiff isa AutoForwardDiff "Only ForwardDiff is currently supported." 
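# A standalone sketch of the dual-number JVP that `JVPScalar` (defined below) performs for
# scalar `u`: evaluate `f` once at a Dual carrying `v` as the partial and read off f'(u)*v.
# The helper name `scalar_jvp_sketch` is illustrative only, not part of the package.
import ForwardDiff
function scalar_jvp_sketch(f::F, u::Number, v::Number) where {F}
    T = typeof(ForwardDiff.Tag(typeof(f), typeof(u)))
    y = f(ForwardDiff.Dual{T}(u, v))
    return ForwardDiff.extract_derivative(T, y)  # == f'(u) * v
end
# scalar_jvp_sketch(sin, 1.0, 2.0) ≈ 2 * cos(1.0)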
+ return JVPScalar(uf, u, autodiff) +end + +@concrete mutable struct JVPScalar + uf + u + autodiff +end + +function Base.:*(jvp::JVPScalar, v) + T = typeof(ForwardDiff.Tag(typeof(jvp.uf), typeof(jvp.u))) + out = jvp.uf(ForwardDiff.Dual{T}(jvp.u, v)) + return ForwardDiff.extract_derivative(T, out) +end + # Generic Handling of Krylov Methods for Normal Form Linear Solves -function __update_JᵀJ!(cache::AbstractNonlinearSolveCache) +function __update_JᵀJ!(cache::AbstractNonlinearSolveCache, J = nothing) if !(cache.JᵀJ isa KrylovJᵀJ) - @bb cache.JᵀJ = transpose(cache.J) × cache.J + J_ = ifelse(J === nothing, cache.J, J) + @bb cache.JᵀJ = transpose(J_) × J_ end end -function __update_Jᵀf!(cache::AbstractNonlinearSolveCache) +function __update_Jᵀf!(cache::AbstractNonlinearSolveCache, J = nothing) if cache.JᵀJ isa KrylovJᵀJ @bb cache.Jᵀf = cache.JᵀJ.Jᵀ × cache.fu else - @bb cache.Jᵀf = transpose(cache.J) × vec(cache.fu) + J_ = ifelse(J === nothing, cache.J, J) + @bb cache.Jᵀf = transpose(J_) × vec(cache.fu) end end # Left-Right Multiplication -__lr_mul(::Val, H, g) = dot(g, H, g) -## TODO: Use a cache here to avoid allocations -__lr_mul(::Val{false}, H::KrylovJᵀJ, g) = dot(g, H.JᵀJ, g) -function __lr_mul(::Val{true}, H::KrylovJᵀJ, g) - c = similar(g) - mul!(c, H.JᵀJ, g) - return dot(g, c) +__lr_mul(cache::AbstractNonlinearSolveCache) = __lr_mul(cache, cache.JᵀJ, cache.Jᵀf) +function __lr_mul(cache::AbstractNonlinearSolveCache, JᵀJ::KrylovJᵀJ, Jᵀf) + @bb cache.lr_mul_cache = JᵀJ.JᵀJ × vec(Jᵀf) + return dot(_vec(Jᵀf), _vec(cache.lr_mul_cache)) +end +function __lr_mul(cache::AbstractNonlinearSolveCache, JᵀJ, Jᵀf) + @bb cache.lr_mul_cache = JᵀJ × Jᵀf + return dot(_vec(Jᵀf), _vec(cache.lr_mul_cache)) end diff --git a/src/trustRegion.jl b/src/trustRegion.jl index 7e5497ffd..f27259d3f 100644 --- a/src/trustRegion.jl +++ b/src/trustRegion.jl @@ -182,19 +182,26 @@ function TrustRegion(; concrete_jac = nothing, linsolve = nothing, precs = DEFAU expand_threshold, shrink_factor, expand_factor, max_shrink_times, vjp_autodiff) end -@concrete mutable struct TrustRegionCache{iip, trustType, floatType} <: - AbstractNonlinearSolveCache{iip} +@concrete mutable struct TrustRegionCache{iip} <: AbstractNonlinearSolveCache{iip} f alg - u_prev u - fu_prev + u_cache + u_cache_2 + u_gauss_newton + u_cauchy fu - fu2 + fu_cache + fu_cache_2 + J + J_cache + JᵀJ + Jᵀf p uf + du + lr_mul_cache linsolve - J jac_cache force_stop::Bool maxiters::Int @@ -204,60 +211,55 @@ end reltol prob radius_update_scheme::RadiusUpdateSchemes.T - trust_r::trustType - max_trust_r::trustType + trust_r + max_trust_r step_threshold - shrink_threshold::trustType - expand_threshold::trustType - shrink_factor::trustType - expand_factor::trustType - loss::floatType - loss_new::floatType - H - g + shrink_threshold + expand_threshold + shrink_factor + expand_factor + loss + loss_new shrink_counter::Int - du - u_tmp - u_gauss_newton - u_cauchy - fu_new make_new_J::Bool - r::floatType - p1::floatType - p2::floatType - p3::floatType - p4::floatType - ϵ::floatType + r + p1 + p2 + p3 + p4 + ϵ + jvp_operator # For Yuan stats::NLStats tc_cache trace end -# TODO: add J_cache function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::TrustRegion, args...; alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, - termination_condition = nothing, internalnorm = DEFAULT_NORM, linsolve_kwargs = (;), - kwargs...) where {uType, iip} + termination_condition = nothing, internalnorm = Base.Fix2(norm, 2), + linsolve_kwargs = (;), kwargs...) 
where {uType, iip} alg = get_concrete_algorithm(alg_, prob) @unpack f, u0, p = prob - u = alias_u0 ? u0 : deepcopy(u0) - u_prev = zero(u) - fu1 = evaluate_f(prob, u) - fu_prev = zero(fu1) + u = __maybe_unaliased(u0, alias_u0) + @bb u_cache = copy(u) + @bb u_cache_2 = similar(u) + fu = evaluate_f(prob, u) + @bb fu_cache_2 = zero(fu) - loss = __get_trust_region_loss(fu1) - uf, _, J, fu2, jac_cache, du, H, g = jacobian_caches(alg, f, u, p, Val(iip); + loss = __trust_region_loss(internalnorm, fu) + + uf, _, J, fu_cache, jac_cache, du, JᵀJ, Jᵀf = jacobian_caches(alg, f, u, p, Val(iip); linsolve_kwargs, linsolve_with_JᵀJ = Val(true), lininit = Val(false)) - g = _restructure(fu1, g) - linsolve = u isa Number ? nothing : linsolve_caches(J, fu2, du, p, alg) + linsolve = linsolve_caches(J, fu_cache, du, p, alg) - u_tmp = zero(u) - u_cauchy = zero(u) - u_gauss_newton = _mutable_zero(u) + @bb u_cache_2 = similar(u) + @bb u_cauchy = similar(u) + @bb u_gauss_newton = similar(u) + @bb J_cache = similar(J) + @bb lr_mul_cache = similar(du) loss_new = loss shrink_counter = 0 - fu_new = zero(fu1) make_new_J = true r = loss @@ -270,11 +272,13 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::TrustRegion, trustType = floatType if radius_update_scheme == RadiusUpdateSchemes.NLsolve max_trust_radius = convert(trustType, Inf) - initial_trust_radius = norm(u0) > 0 ? convert(trustType, norm(u0)) : one(trustType) + initial_trust_radius = internalnorm(u0) > 0 ? convert(trustType, internalnorm(u0)) : + one(trustType) else max_trust_radius = convert(trustType, alg.max_trust_radius) if iszero(max_trust_radius) - max_trust_radius = convert(trustType, max(norm(fu1), maximum(u) - minimum(u))) + max_trust_radius = convert(trustType, + max(internalnorm(fu), maximum(u) - minimum(u))) end initial_trust_radius = convert(trustType, alg.initial_trust_radius) if iszero(initial_trust_radius) @@ -293,6 +297,7 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::TrustRegion, p3 = convert(floatType, 0.0) p4 = convert(floatType, 0.0) ϵ = convert(floatType, 1.0e-8) + jvp_operator = nothing if radius_update_scheme === RadiusUpdateSchemes.NLsolve p1 = convert(floatType, 0.5) elseif radius_update_scheme === RadiusUpdateSchemes.Hei @@ -311,16 +316,9 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::TrustRegion, p1 = convert(floatType, 2.0) # μ p2 = convert(floatType, 1 / 6) # c5 p3 = convert(floatType, 6.0) # c6 - if iip - auto_jacvec!(g, (fu, x) -> f(fu, x, p), u, fu1) - else - if isa(u, Number) - g = ForwardDiff.derivative(x -> f(x, p), u) - else - g = auto_jacvec(x -> f(x, p), u, fu1) - end - end - initial_trust_radius = convert(trustType, p1 * norm(g)) + jvp_operator = __jacvec(uf, u; fu, autodiff = __get_nonsparse_ad(alg.ad)) + @bb Jᵀf = jvp_operator × fu + initial_trust_radius = convert(trustType, p1 * internalnorm(Jᵀf)) elseif radius_update_scheme === RadiusUpdateSchemes.Fan step_threshold = convert(trustType, 0.0001) shrink_threshold = convert(trustType, 0.25) @@ -329,7 +327,7 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::TrustRegion, p2 = convert(floatType, 0.25) # c5 p3 = convert(floatType, 12.0) # c6 p4 = convert(floatType, 1.0e18) # M - initial_trust_radius = convert(trustType, p1 * (norm(fu1)^0.99)) + initial_trust_radius = convert(trustType, p1 * (internalnorm(fu)^0.99)) elseif radius_update_scheme === RadiusUpdateSchemes.Bastin step_threshold = convert(trustType, 0.05) shrink_threshold = convert(trustType, 0.05) @@ -339,25 +337,25 @@ function 
SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::TrustRegion, initial_trust_radius = convert(trustType, 1.0) end - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu1, u, + abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u, termination_condition) - trace = init_nonlinearsolve_trace(alg, u, fu1, ApplyArray(__zero, J), du; kwargs...) + trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(__zero, J), du; kwargs...) - return TrustRegionCache{iip}(f, alg, u_prev, u, fu_prev, fu1, fu2, p, uf, linsolve, J, + return TrustRegionCache{iip}(f, alg, u, u_cache, u_cache_2, u_gauss_newton, u_cauchy, + fu, fu_cache, fu_cache_2, J, J_cache, JᵀJ, Jᵀf, p, uf, du, lr_mul_cache, linsolve, jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, reltol, prob, radius_update_scheme, initial_trust_radius, max_trust_radius, step_threshold, shrink_threshold, expand_threshold, shrink_factor, expand_factor, loss, loss_new, - H, g, shrink_counter, du, u_tmp, u_gauss_newton, u_cauchy, fu_new, make_new_J, r, - p1, p2, p3, p4, ϵ, NLStats(1, 0, 0, 0, 0), tc_cache, trace) + shrink_counter, make_new_J, r, p1, p2, p3, p4, ϵ, jvp_operator, + NLStats(1, 0, 0, 0, 0), tc_cache, trace) end function perform_step!(cache::TrustRegionCache{iip}) where {iip} if cache.make_new_J cache.J = jacobian!!(cache.J, cache) - __update_JᵀJ!(Val{iip}(), cache, :H, cache.J) - __update_Jᵀf!(Val{iip}(), cache, :g, :H, cache.J, _vec(cache.fu)) - cache.stats.njacs += 1 + __update_JᵀJ!(cache) + __update_Jᵀf!(cache) # do not use A = cache.H, b = _vec(cache.g) since it is equivalent # to A = cache.J, b = _vec(fu) as long as the Jacobian is non-singular @@ -374,7 +372,7 @@ function perform_step!(cache::TrustRegionCache{iip}) where {iip} # compute the potentially new u @bb @. cache.u_cache_2 = cache.u + cache.du - evaluate_f(cache, cache.u_tmp, cache.p, Val{:fu_cache_2}()) + evaluate_f(cache, cache.u_cache_2, cache.p, Val{:fu_cache_2}()) trust_region_step!(cache) cache.stats.nsolve += 1 cache.stats.nfactors += 1 @@ -383,278 +381,157 @@ end function retrospective_step!(cache::TrustRegionCache{iip}) where {iip} J = jacobian!!(cache.J_cache, cache) - __update_JᵀJ!(Val{iip}(), cache, :H, J) - __update_Jᵀf!(Val{iip}(), cache, :g, :H, J, cache.fu) - cache.stats.njacs += 1 + __update_JᵀJ!(cache, J) + __update_Jᵀf!(cache, J) - # FIXME: Caching in __lr_mul - num = __get_trust_region_loss(cache.fu) - __get_trust_region_loss(cache.fu_cache) - denom = dot(_vec(du), _vec(g)) + __lr_mul(Val{iip}(), H, _vec(du)) / 2 + num = __trust_region_loss(cache, cache.fu) - + __get_trust_region_loss(cache, cache.fu_cache) + denom = dot(_vec(cache.du), _vec(cache.Jᵀf)) + __lr_mul(cache, cache.JᵀJ, cache.du) / 2 return num / denom end -# TODO function trust_region_step!(cache::TrustRegionCache) - @unpack fu_new, du, g, H, loss, max_trust_r, radius_update_scheme = cache - - cache.loss_new = __get_trust_region_loss(fu_new) + cache.loss_new = __trust_region_loss(cache, cache.fu_cache_2) # Compute the ratio of the actual reduction to the predicted reduction. 
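# With loss(u) = ‖f(u)‖^2 / 2 and the Gauss-Newton model m(δ) = ‖f(u) + Jδ‖^2 / 2, the
# ratio computed below is r = (loss(u) - loss(u + δ)) / (m(0) - m(δ)). A standalone sketch
# of that quantity for an out-of-place `f` and a dense `J`; the names are illustrative
# only, not part of the cache API.
using LinearAlgebra
function trust_region_ratio_sketch(f, u, δ, J)
    loss(x) = norm(f(x))^2 / 2
    g = J' * f(u)                                        # gradient of the loss at u
    predicted = -(dot(δ, g) + dot(δ, J' * (J * δ)) / 2)  # m(0) - m(δ)
    return (loss(u) - loss(u + δ)) / predicted
end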
- cache.r = -(loss - cache.loss_new) / - (dot(_vec(du), _vec(g)) + __lr_mul(Val(isinplace(cache)), H, _vec(du)) / 2) - @unpack r = cache + cache.r = -(cache.loss - cache.loss_new) / + (dot(_vec(cache.du), _vec(cache.Jᵀf)) + + __lr_mul(cache, cache.JᵀJ, _vec(cache.du)) / 2) + + @unpack r, radius_update_scheme = cache + make_new_J = false + if r > cache.step_threshold + take_step!(cache) + cache.loss = cache.loss_new + make_new_J = true + end if radius_update_scheme === RadiusUpdateSchemes.Simple - # Update the trust region radius. if r < cache.shrink_threshold cache.trust_r *= cache.shrink_factor cache.shrink_counter += 1 else cache.shrink_counter = 0 - end - if r > cache.step_threshold - take_step!(cache) - cache.loss = cache.loss_new - - # Update the trust region radius. - if r > cache.expand_threshold - cache.trust_r = min(cache.expand_factor * cache.trust_r, max_trust_r) + if r > cache.step_threshold && r > cache.expand_threshold + cache.trust_r = min(cache.expand_factor * cache.trust_r, cache.max_trust_r) end - - cache.make_new_J = true - else - # No need to make a new J, no step was taken, so we try again with a smaller trust_r - cache.make_new_J = false end - update_trace!(cache.trace, cache.stats.nsteps + 1, cache.u, cache.fu, cache.J, - @~(cache.u.-cache.u_prev)) - check_and_update!(cache, cache.fu, cache.u, cache.u_prev) - elseif radius_update_scheme === RadiusUpdateSchemes.NLsolve - # accept/reject decision - if r > cache.step_threshold # accept - take_step!(cache) - cache.loss = cache.loss_new - cache.make_new_J = true - else # reject - cache.make_new_J = false - end - - # trust region update - if r < 1 // 10 # cache.shrink_threshold - cache.trust_r *= 1 // 2 # cache.shrink_factor - elseif r >= 9 // 10 # cache.expand_threshold - cache.trust_r = 2 * norm(cache.du) # cache.expand_factor * norm(cache.du) - elseif r >= 1 // 2 # cache.p1 - cache.trust_r = max(cache.trust_r, 2 * norm(cache.du)) # cache.expand_factor * norm(cache.du)) + if r < 1 // 10 + cache.shrink_counter += 1 + cache.trust_r *= 1 // 2 + else + cache.shrink_counter = 0 + if r ≥ 9 // 10 + cache.trust_r = 2 * cache.internalnorm(cache.du) + elseif r ≥ 1 // 2 + cache.trust_r = max(cache.trust_r, 2 * cache.internalnorm(cache.du)) + end end - - update_trace!(cache.trace, cache.stats.nsteps + 1, cache.u, cache.fu, cache.J, - @~(cache.u.-cache.u_prev)) - # convergence test - check_and_update!(cache, cache.fu, cache.u, cache.u_prev) - elseif radius_update_scheme === RadiusUpdateSchemes.NocedalWright - # accept/reject decision - if r > cache.step_threshold # accept - take_step!(cache) - cache.loss = cache.loss_new - cache.make_new_J = true - else # reject - cache.make_new_J = false - end - if r < 1 // 4 - cache.trust_r = (1 // 4) * norm(cache.du) - elseif (r > (3 // 4)) && abs(norm(cache.du) - cache.trust_r) / cache.trust_r < 1e-6 - cache.trust_r = min(2 * cache.trust_r, cache.max_trust_r) - end - - update_trace!(cache.trace, cache.stats.nsteps + 1, cache.u, cache.fu, cache.J, - @~(cache.u.-cache.u_prev)) - # convergence test - check_and_update!(cache, cache.fu, cache.u, cache.u_prev) - - elseif radius_update_scheme === RadiusUpdateSchemes.Hei - if r > cache.step_threshold - take_step!(cache) - cache.loss = cache.loss_new - cache.make_new_J = true + cache.shrink_counter += 1 + cache.trust_r = (1 // 4) * cache.internalnorm(cache.du) else - cache.make_new_J = false + cache.shrink_counter = 0 + if r > 3 // 4 && + abs(cache.internalnorm(cache.du) - cache.trust_r) < 1e-6 * cache.trust_r + cache.trust_r = min(2 * cache.trust_r, 
cache.max_trust_r) + end end - # Hei's radius update scheme + elseif radius_update_scheme === RadiusUpdateSchemes.Hei @unpack shrink_threshold, p1, p2, p3, p4 = cache - if rfunc(r, shrink_threshold, p1, p3, p4, p2) * cache.internalnorm(du) < - cache.trust_r + tr_new = __rfunc(r, shrink_threshold, p1, p3, p4, p2) * cache.internalnorm(du) + if tr_new < cache.trust_r cache.shrink_counter += 1 else cache.shrink_counter = 0 end - cache.trust_r = rfunc(r, shrink_threshold, p1, p3, p4, p2) * - cache.internalnorm(du) - - update_trace!(cache.trace, cache.stats.nsteps + 1, cache.u, cache.fu, cache.J, - @~(cache.u.-cache.u_prev)) - check_and_update!(cache, cache.fu, cache.u, cache.u_prev) - cache.internalnorm(g) < cache.ϵ && (cache.force_stop = true) + cache.trust_r = tr_new + cache.internalnorm(cache.Jᵀf) < cache.ϵ && (cache.force_stop = true) elseif radius_update_scheme === RadiusUpdateSchemes.Yuan if r < cache.shrink_threshold cache.p1 = cache.p2 * cache.p1 cache.shrink_counter += 1 - elseif r >= cache.expand_threshold && - cache.internalnorm(du) > cache.trust_r / 2 - cache.p1 = cache.p3 * cache.p1 - cache.shrink_counter = 0 - end - - if r > cache.step_threshold - take_step!(cache) - cache.loss = cache.loss_new - cache.make_new_J = true else - cache.make_new_J = false + if r ≥ cache.expand_threshold && + cache.internalnorm(cache.du) > cache.trust_r / 2 + cache.p1 = cache.p3 * cache.p1 + end + cache.shrink_counter = 0 end - @unpack p1 = cache - # TODO: Use the `vjp_autodiff` to for the jvp - cache.trust_r = p1 * cache.internalnorm(jvp!(cache)) + @bb cache.Jᵀf = cache.jvp_operator × vec(cache.fu) + cache.trust_r = cache.p1 * cache.internalnorm(cache.Jᵀf) - update_trace!(cache.trace, cache.stats.nsteps + 1, cache.u, cache.fu, cache.J, - @~(cache.u.-cache.u_prev)) - check_and_update!(cache, cache.fu, cache.u, cache.u_prev) - cache.internalnorm(g) < cache.ϵ && (cache.force_stop = true) - #Fan's update scheme + cache.internalnorm(cache.Jᵀf) < cache.ϵ && (cache.force_stop = true) elseif radius_update_scheme === RadiusUpdateSchemes.Fan if r < cache.shrink_threshold cache.p1 *= cache.p2 cache.shrink_counter += 1 - elseif r > cache.expand_threshold - cache.p1 = min(cache.p1 * cache.p3, cache.p4) - cache.shrink_counter = 0 - end - - if r > cache.step_threshold - take_step!(cache) - cache.loss = cache.loss_new - cache.make_new_J = true else - cache.make_new_J = false + cache.shrink_counter = 0 + r > cache.expand_threshold && (cache.p1 = min(cache.p1 * cache.p3, cache.p4)) end - - @unpack p1 = cache - cache.trust_r = p1 * (cache.internalnorm(cache.fu)^0.99) - - update_trace!(cache.trace, cache.stats.nsteps + 1, cache.u, cache.fu, cache.J, - @~(cache.u.-cache.u_prev)) - check_and_update!(cache, cache.fu, cache.u, cache.u_prev) - cache.internalnorm(g) < cache.ϵ && (cache.force_stop = true) + cache.trust_r = cache.p1 * (cache.internalnorm(cache.fu)^0.99) + cache.internalnorm(cache.Jᵀf) < cache.ϵ && (cache.force_stop = true) elseif radius_update_scheme === RadiusUpdateSchemes.Bastin if r > cache.step_threshold - take_step!(cache) - cache.loss = cache.loss_new - cache.make_new_J = true - if retrospective_step!(cache) >= cache.expand_threshold + if retrospective_step!(cache) ≥ cache.expand_threshold cache.trust_r = max(cache.p1 * cache.internalnorm(du), cache.trust_r) end - + cache.shrink_counter = 0 else - cache.make_new_J = false cache.trust_r *= cache.p2 cache.shrink_counter += 1 end - - update_trace!(cache.trace, cache.stats.nsteps + 1, cache.u, cache.fu, cache.J, - @~(cache.u.-cache.u_prev)) - 
check_and_update!(cache, cache.fu, cache.u, cache.u_prev) - end -end - -# TODO -function dogleg!(cache::TrustRegionCache{true}) - @unpack u_tmp, u_gauss_newton, u_cauchy, trust_r = cache - - # Take the full Gauss-Newton step if lies within the trust region. - if norm(u_gauss_newton) ≤ trust_r - cache.du .= u_gauss_newton - return end - # Take intersection of steepest descent direction and trust region if Cauchy point lies outside of trust region - l_grad = norm(cache.g) # length of the gradient - d_cauchy = l_grad^3 / __lr_mul(Val{true}(), cache.H, _vec(cache.g)) # distance of the cauchy point from the current iterate - if d_cauchy >= trust_r - @. cache.du = -(trust_r / l_grad) * cache.g # step to the end of the trust region - return - end - - # Take the intersection of dogleg with trust region if Cauchy point lies inside the trust region - @. u_cauchy = -(d_cauchy / l_grad) * cache.g # compute Cauchy point - @. u_tmp = u_gauss_newton - u_cauchy # calf of the dogleg -- use u_tmp to avoid allocation - - a = dot(u_tmp, u_tmp) - b = 2 * dot(u_cauchy, u_tmp) - c = d_cauchy^2 - trust_r^2 - aux = max(b^2 - 4 * a * c, 0.0) # technically guaranteed to be non-negative but hedging against floating point issues - τ = (-b + sqrt(aux)) / (2 * a) # stepsize along dogleg to trust region boundary - - @. cache.du = u_cauchy + τ * u_tmp + update_trace!(cache.trace, cache.stats.nsteps + 1, cache.u, cache.fu, cache.J, + @~(cache.u.-cache.u_cache)) + check_and_update!(cache, cache.fu, cache.u, cache.u_cache) end -# TODO -function dogleg!(cache::TrustRegionCache{false}) - @unpack u_tmp, u_gauss_newton, u_cauchy, trust_r = cache - +function dogleg!(cache::TrustRegionCache{iip}) where {iip} # Take the full Gauss-Newton step if lies within the trust region. - if norm(u_gauss_newton) ≤ trust_r - cache.du = deepcopy(u_gauss_newton) + if cache.internalnorm(cache.u_gauss_newton) ≤ cache.trust_r + @bb copyto!(cache.du, cache.u_gauss_newton) return end - ## Take intersection of steepest descent direction and trust region if Cauchy point lies outside of trust region - l_grad = norm(cache.g) - d_cauchy = l_grad^3 / __lr_mul(Val{false}(), cache.H, _vec(cache.g)) # distance of the cauchy point from the current iterate - if d_cauchy > trust_r # cauchy point lies outside of trust region - cache.du = -(trust_r / l_grad) * cache.g # step to the end of the trust region + # Take intersection of steepest descent direction and trust region if Cauchy point lies + # outside of trust region + l_grad = cache.internalnorm(cache.Jᵀf) # length of the gradient + d_cauchy = l_grad^3 / __lr_mul(cache) + if d_cauchy ≥ cache.trust_r + # step to the end of the trust region + @bb @. cache.du = -(cache.trust_r / l_grad) * cache.Jᵀf return end - # Take the intersection of dogleg with trust region if Cauchy point lies inside the trust region - u_cauchy = -(d_cauchy / l_grad) * cache.g # compute Cauchy point - u_tmp = u_gauss_newton - u_cauchy # calf of the dogleg - a = dot(u_tmp, u_tmp) - b = 2 * dot(u_cauchy, u_tmp) - c = d_cauchy^2 - trust_r^2 - aux = max(b^2 - 4 * a * c, 0.0) # technically guaranteed to be non-negative but hedging against floating point issues - τ = (-b + sqrt(aux)) / (2 * a) # stepsize along dogleg to trust region boundary - - cache.du = u_cauchy + τ * u_tmp + # Take the intersection of dogleg with trust region if Cauchy point lies inside the + # trust region + @bb @. cache.u_cauchy = -(d_cauchy / l_grad) * cache.Jᵀf # compute Cauchy point + @bb @. 
cache.u_cache_2 = cache.u_gauss_newton - cache.u_cauchy # calf of the dogleg + + a = dot(cache.u_cache_2, cache.u_cache_2) + b = 2 * dot(cache.u_cauchy, cache.u_cache_2) + c = d_cauchy^2 - cache.trust_r^2 + # technically guaranteed to be non-negative but hedging against floating point issues + aux = max(b^2 - 4 * a * c, 0) + # stepsize along dogleg to trust region boundary + τ = (-b + sqrt(aux)) / (2 * a) + + @bb @. cache.du = cache.u_cauchy + τ * cache.u_cache_2 + return end -function __take_step!(cache::TrustRegionCache) +function take_step!(cache::TrustRegionCache) @bb copyto!(cache.u_cache, cache.u) - @bb copyto!(cache.u, cache.u_cache_2) # u_tmp --> u_cache_2 + @bb copyto!(cache.u, cache.u_cache_2) @bb copyto!(cache.fu_cache, cache.fu) - @bb copyto!(cache.fu, cache.fu_cache_2) # fu_new --> fu_cache_2 -end - -# TODO -function jvp!(cache::TrustRegionCache{false}) - @unpack f, u, fu, uf = cache - if isa(u, Number) - return value_derivative(uf, u) - end - return auto_jacvec(uf, u, fu) -end - -function jvp!(cache::TrustRegionCache{true}) - @unpack g, f, u, fu, uf = cache - if isa(u, Number) - return value_derivative(uf, u) - end - auto_jacvec!(g, uf, u, fu) - return g + @bb copyto!(cache.fu, cache.fu_cache_2) end function not_terminated(cache::TrustRegionCache) @@ -670,8 +547,9 @@ function not_terminated(cache::TrustRegionCache) return true end +# FIXME: Update the JacVec Operator for Yuan function __reinit_internal!(cache::TrustRegionCache; kwargs...) - cache.loss = __get_trust_region_loss(cache.fu) + cache.loss = __trust_region_loss(cache, cache.fu) cache.shrink_counter = 0 cache.trust_r = convert(eltype(cache.u), ifelse(cache.alg.initial_trust_radius == 0, cache.alg.initial_trust_radius, @@ -680,4 +558,13 @@ function __reinit_internal!(cache::TrustRegionCache; kwargs...) return nothing end -__get_trust_region_loss(fu) = norm(fu)^2 / 2 +# This only holds for 2-norm? 
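# With the default `internalnorm = Base.Fix2(norm, 2)`, the loss defined below is the
# classical Gauss-Newton objective, whose gradient is Jᵀf and whose model Hessian is JᵀJ,
# exactly the quantities this cache carries; for other norms these identities need not
# hold, hence the question above. A standalone check of the gradient identity, assuming a
# dense out-of-place `f` (`loss_sketch`, `f_demo`, `u_demo` are illustrative only):
using LinearAlgebra, ForwardDiff
loss_sketch(f, u) = norm(f(u))^2 / 2
f_demo(u) = u .^ 2 .- 2.0
u_demo = [1.0, 2.0]
ForwardDiff.gradient(u -> loss_sketch(f_demo, u), u_demo) ≈
    ForwardDiff.jacobian(f_demo, u_demo)' * f_demo(u_demo)  # true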
+__trust_region_loss(cache::TrustRegionCache, x) = __trust_region_loss(cache.internalnorm, x) +__trust_region_loss(nf::F, x) where {F} = nf(x)^2 / 2 + +# R-function for adaptive trust region method +function __rfunc(r::R, c2::R, M::R, γ1::R, γ2::R, β::R) where {R <: Real} + return ifelse(r ≥ c2, + (2 * (M - 1 - γ2) * atan(r - c2) + (1 + γ2)) / R(π), + (1 - γ1 - β) * (exp(r - c2) + β / (1 - γ1 - β))) +end \ No newline at end of file diff --git a/src/utils.jl b/src/utils.jl index e19771ef7..4d8496015 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -151,14 +151,6 @@ function wrapprecs(_Pl, _Pr, weight) return Pl, Pr end -function rfunc(r::R, c2::R, M::R, γ1::R, γ2::R, β::R) where {R <: Real} # R-function for adaptive trust region method - if (r ≥ c2) - return (2 * (M - 1 - γ2) * atan(r - c2) + (1 + γ2)) / π - else - return (1 - γ1 - β) * (exp(r - c2) + β / (1 - γ1 - β)) - end -end - concrete_jac(_) = nothing concrete_jac(::AbstractNewtonAlgorithm{CJ}) where {CJ} = CJ From cefe5b02a9f71e43d5aef447d137a99668dc22e0 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Mon, 4 Dec 2023 22:47:39 -0500 Subject: [PATCH 19/25] Most 23 test problems now pass --- src/trustRegion.jl | 7 +++---- test/23_test_problems.jl | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/trustRegion.jl b/src/trustRegion.jl index f27259d3f..9ed243d26 100644 --- a/src/trustRegion.jl +++ b/src/trustRegion.jl @@ -384,8 +384,7 @@ function retrospective_step!(cache::TrustRegionCache{iip}) where {iip} __update_JᵀJ!(cache, J) __update_Jᵀf!(cache, J) - num = __trust_region_loss(cache, cache.fu) - - __get_trust_region_loss(cache, cache.fu_cache) + num = __trust_region_loss(cache, cache.fu) - __trust_region_loss(cache, cache.fu_cache) denom = dot(_vec(cache.du), _vec(cache.Jᵀf)) + __lr_mul(cache, cache.JᵀJ, cache.du) / 2 return num / denom end @@ -441,7 +440,7 @@ function trust_region_step!(cache::TrustRegionCache) end elseif radius_update_scheme === RadiusUpdateSchemes.Hei @unpack shrink_threshold, p1, p2, p3, p4 = cache - tr_new = __rfunc(r, shrink_threshold, p1, p3, p4, p2) * cache.internalnorm(du) + tr_new = __rfunc(r, shrink_threshold, p1, p3, p4, p2) * cache.internalnorm(cache.du) if tr_new < cache.trust_r cache.shrink_counter += 1 else @@ -479,7 +478,7 @@ function trust_region_step!(cache::TrustRegionCache) elseif radius_update_scheme === RadiusUpdateSchemes.Bastin if r > cache.step_threshold if retrospective_step!(cache) ≥ cache.expand_threshold - cache.trust_r = max(cache.p1 * cache.internalnorm(du), cache.trust_r) + cache.trust_r = max(cache.p1 * cache.internalnorm(cache.du), cache.trust_r) end cache.shrink_counter = 0 else diff --git a/test/23_test_problems.jl b/test/23_test_problems.jl index 741402057..7642f1ed6 100644 --- a/test/23_test_problems.jl +++ b/test/23_test_problems.jl @@ -59,8 +59,8 @@ end broken_tests = Dict(alg => Int[] for alg in alg_ops) broken_tests[alg_ops[1]] = [6, 11, 21] broken_tests[alg_ops[2]] = [6, 11, 21] - broken_tests[alg_ops[3]] = [1, 6, 11, 12, 15, 16, 21] - broken_tests[alg_ops[4]] = [1, 6, 8, 11, 15, 16, 21, 22] + broken_tests[alg_ops[3]] = [6, 11, 21] + broken_tests[alg_ops[4]] = [6, 11, 21] broken_tests[alg_ops[5]] = [6, 21] broken_tests[alg_ops[6]] = [6, 21] From ee15d8076ba497c152b608b3adb6ca8d54fa9953 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Mon, 4 Dec 2023 23:48:31 -0500 Subject: [PATCH 20/25] Fix most tests --- Project.toml | 4 +++- src/NonlinearSolve.jl | 5 +++-- src/dfsane.jl | 5 +++-- src/jacobian.jl | 27 +++++++++++++++++++-------- src/trustRegion.jl | 3 ++- 
src/utils.jl | 2 +- 6 files changed, 31 insertions(+), 15 deletions(-) diff --git a/Project.toml b/Project.toml index 9385b14a2..b4977a080 100644 --- a/Project.toml +++ b/Project.toml @@ -60,6 +60,7 @@ LeastSquaresOptim = "0.8" LineSearches = "7" LinearAlgebra = "<0.0.1, 1" LinearSolve = "2.12" +MaybeInplace = "0.1" NaNMath = "1" NonlinearProblemLibrary = "0.1" Pkg = "1" @@ -71,7 +72,7 @@ Reexport = "0.2, 1" SafeTestsets = "0.1" SciMLBase = "2.9" SciMLOperators = "0.3" -SimpleNonlinearSolve = "1" # FIXME: Don't update the version in this PR. Using it to test +SimpleNonlinearSolve = "1" SparseArrays = "<0.0.1, 1" SparseDiffTools = "2.14" StaticArrays = "1" @@ -98,6 +99,7 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" SparseDiffTools = "47a9eef4-7e08-11e9-0b38-333d64bd3804" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl index 278667790..c6b4fca66 100644 --- a/src/NonlinearSolve.jl +++ b/src/NonlinearSolve.jl @@ -17,6 +17,7 @@ import PrecompileTools: @recompile_invalidations, @compile_workload, @setup_work import ConcreteStructs: @concrete import EnumX: @enumx import FastBroadcast: @.. + import FiniteDiff import ForwardDiff import ForwardDiff: Dual import LinearSolve: ComposePreconditioner, InvPreconditioner, needs_concrete_A @@ -56,7 +57,7 @@ function SciMLBase.reinit!(cache::AbstractNonlinearSolveCache{iip}, u0 = get_u(c cache.p = p if iip recursivecopy!(get_u(cache), u0) - cache.f(cache.fu1, get_u(cache), p) + cache.f(get_fu(cache), get_u(cache), p) else cache.u = __maybe_unaliased(u0, alias_u0) set_fu!(cache, cache.f(cache.u, p)) @@ -76,7 +77,7 @@ function SciMLBase.reinit!(cache::AbstractNonlinearSolveCache{iip}, u0 = get_u(c if hasfield(typeof(cache), :ls_cache) # TODO: A more efficient way to do this - cache.ls_cache = init_linesearch_cache(cache.prob, cache.alg.linesearch, cache.f, + cache.ls_cache = init_linesearch_cache(cache.alg.linesearch, cache.f, get_u(cache), p, get_fu(cache), Val(iip)) end diff --git a/src/dfsane.jl b/src/dfsane.jl index 570dd7ccd..689c24485 100644 --- a/src/dfsane.jl +++ b/src/dfsane.jl @@ -55,6 +55,7 @@ Computation, 75, 1429-1448.](https://www.researchgate.net/publication/220576479_ end @concrete mutable struct DFSaneCache{iip} <: AbstractNonlinearSolveCache{iip} + f alg u u_cache @@ -110,8 +111,8 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::DFSane, args. termination_condition) trace = init_nonlinearsolve_trace(alg, u, fu, nothing, du; kwargs...) - return DFSaneCache{iip}(alg, u, u_cache, u_cache_2, fu, fu_cache, du, history, f_norm, - f_norm_0, alg.M, T(alg.σ_1), T(alg.σ_min), T(alg.σ_max), one(T), T(alg.γ), + return DFSaneCache{iip}(prob.f, alg, u, u_cache, u_cache_2, fu, fu_cache, du, history, + f_norm, f_norm_0, alg.M, T(alg.σ_1), T(alg.σ_min), T(alg.σ_max), one(T), T(alg.γ), T(alg.τ_min), T(alg.τ_max), alg.n_exp, prob.p, false, maxiters, internalnorm, ReturnCode.Default, abstol, reltol, prob, NLStats(1, 0, 0, 0, 0), tc_cache, trace) end diff --git a/src/jacobian.jl b/src/jacobian.jl index cd84b5d1d..2174fbc8e 100644 --- a/src/jacobian.jl +++ b/src/jacobian.jl @@ -209,10 +209,14 @@ function __concrete_vjp_autodiff(vjp_autodiff, jvp_autodiff, uf) end # jvp fallback scalar -__jacvec(args...; kwargs...) 
= JacVec(args...; kwargs...) -function __jacvec(uf, u::Number; autodiff, kwargs...) - @assert autodiff isa AutoForwardDiff "Only ForwardDiff is currently supported." - return JVPScalar(uf, u, autodiff) +function __jacvec(uf, u; autodiff, kwargs...) + if !(autodiff isa AutoForwardDiff || autodiff isa AutoFiniteDiff) + _ad = autodiff + autodiff = ifelse(ForwardDiff.can_dual(eltype(u)), AutoForwardDiff(), + AutoFiniteDiff()) + @warn "$(_ad) not supported for JacVec. Using $(autodiff) instead." + end + return u isa Number ? JVPScalar(uf, u, autodiff) : JacVec(uf, u; autodiff, kwargs...) end @concrete mutable struct JVPScalar @@ -221,10 +225,17 @@ end autodiff end -function Base.:*(jvp::JVPScalar, v) - T = typeof(ForwardDiff.Tag(typeof(jvp.uf), typeof(jvp.u))) - out = jvp.uf(ForwardDiff.Dual{T}(jvp.u, v)) - return ForwardDiff.extract_derivative(T, out) +function Base.:*(jvp::JVPScalar, v::Number) + if jvp.autodiff isa AutoForwardDiff + T = typeof(ForwardDiff.Tag(typeof(jvp.uf), typeof(jvp.u))) + out = jvp.uf(ForwardDiff.Dual{T}(jvp.u, v)) + return ForwardDiff.extract_derivative(T, out) + elseif jvp.autodiff isa AutoFiniteDiff + J = FiniteDiff.finite_difference_derivative(jvp.uf, jvp.u, jvp.autodiff.fdtype) + return J * v + else + error("Only ForwardDiff & FiniteDiff is currently supported.") + end end # Generic Handling of Krylov Methods for Normal Form Linear Solves diff --git a/src/trustRegion.jl b/src/trustRegion.jl index 9ed243d26..9087b0d53 100644 --- a/src/trustRegion.jl +++ b/src/trustRegion.jl @@ -255,7 +255,8 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::TrustRegion, @bb u_cache_2 = similar(u) @bb u_cauchy = similar(u) @bb u_gauss_newton = similar(u) - @bb J_cache = similar(J) + J_cache = J isa SciMLOperators.AbstractSciMLOperator || + setindex_trait(J) === CannotSetindex() ? 
J : similar(J) @bb lr_mul_cache = similar(du) loss_new = loss diff --git a/src/utils.jl b/src/utils.jl index 4d8496015..56a976aa8 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -178,7 +178,7 @@ function evaluate_f(prob::Union{NonlinearProblem{uType, iip}, return fu end -function evaluate_f(f::F, u, p, ::Val{iip}; fu = nothing) where {F, iip <: Bool} +function evaluate_f(f::F, u, p, ::Val{iip}; fu = nothing) where {F, iip} if iip f(fu, u, p) return fu From ba26318289cda6f198b070029bd8f5a93c6a6fd1 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Tue, 5 Dec 2023 10:44:22 -0500 Subject: [PATCH 21/25] Run formatter --- src/pseudotransient.jl | 2 +- src/trustRegion.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pseudotransient.jl b/src/pseudotransient.jl index dfaf80180..2849e0a28 100644 --- a/src/pseudotransient.jl +++ b/src/pseudotransient.jl @@ -110,7 +110,7 @@ function perform_step!(cache::PseudoTransientCache{iip}) where {iip} inv_α = inv(cache.alpha) if cache.J isa SciMLOperators.AbstractSciMLOperator - A = cache.J - inv_α * I + A = cache.J - inv_α * I elseif setindex_trait(cache.J) === CanSetindex() if fast_scalar_indexing(cache.J) @inbounds for i in axes(cache.J, 1) diff --git a/src/trustRegion.jl b/src/trustRegion.jl index 9087b0d53..738066bd2 100644 --- a/src/trustRegion.jl +++ b/src/trustRegion.jl @@ -567,4 +567,4 @@ function __rfunc(r::R, c2::R, M::R, γ1::R, γ2::R, β::R) where {R <: Real} return ifelse(r ≥ c2, (2 * (M - 1 - γ2) * atan(r - c2) + (1 + γ2)) / R(π), (1 - γ1 - β) * (exp(r - c2) + β / (1 - γ1 - β))) -end \ No newline at end of file +end From 5def9122d90231c9230ebbc722acbdb8465aae7a Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Tue, 5 Dec 2023 12:23:05 -0500 Subject: [PATCH 22/25] Fix all tests --- Project.toml | 4 ++-- src/broyden.jl | 2 +- src/jacobian.jl | 2 +- src/lbroyden.jl | 4 ++-- src/levenberg.jl | 22 ++++++++++------------ src/trustRegion.jl | 17 ++++++++++++----- test/23_test_problems.jl | 4 ++-- test/gpu.jl | 18 ++++++------------ test/infeasible.jl | 15 +++++---------- test/matrix_resizing.jl | 5 +++-- test/polyalgs.jl | 6 +++--- 11 files changed, 47 insertions(+), 52 deletions(-) diff --git a/Project.toml b/Project.toml index b4977a080..cec099ef5 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "NonlinearSolve" uuid = "8913a72c-1f9b-4ce2-8d82-65094dcecaec" authors = ["SciML"] -version = "2.9.0" +version = "2.10.0" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" @@ -72,7 +72,7 @@ Reexport = "0.2, 1" SafeTestsets = "0.1" SciMLBase = "2.9" SciMLOperators = "0.3" -SimpleNonlinearSolve = "1" +SimpleNonlinearSolve = "0.1.23" SparseArrays = "<0.0.1, 1" SparseDiffTools = "2.14" StaticArrays = "1" diff --git a/src/broyden.jl b/src/broyden.jl index 8b271d16c..c545ae0c2 100644 --- a/src/broyden.jl +++ b/src/broyden.jl @@ -116,7 +116,7 @@ function perform_step!(cache::GeneralBroydenCache{iip}) where {iip} @bb cache.u_cache = transpose(cache.J⁻¹) × vec(cache.du) denom = dot(cache.du, cache.J⁻¹dfu) @bb @. 
cache.du = (cache.du - cache.J⁻¹dfu) / ifelse(iszero(denom), T(1e-5), denom) - @bb cache.J⁻¹ += vec(cache.du) × transpose(cache.u_cache) + @bb cache.J⁻¹ += vec(cache.du) × transpose(_vec(cache.u_cache)) end @bb copyto!(cache.fu_cache, cache.fu) diff --git a/src/jacobian.jl b/src/jacobian.jl index 2174fbc8e..60be3f3cd 100644 --- a/src/jacobian.jl +++ b/src/jacobian.jl @@ -262,6 +262,6 @@ function __lr_mul(cache::AbstractNonlinearSolveCache, JᵀJ::KrylovJᵀJ, Jᵀf) return dot(_vec(Jᵀf), _vec(cache.lr_mul_cache)) end function __lr_mul(cache::AbstractNonlinearSolveCache, JᵀJ, Jᵀf) - @bb cache.lr_mul_cache = JᵀJ × Jᵀf + @bb cache.lr_mul_cache = JᵀJ × vec(Jᵀf) return dot(_vec(Jᵀf), _vec(cache.lr_mul_cache)) end diff --git a/src/lbroyden.jl b/src/lbroyden.jl index 34668e5c8..c4c73e11e 100644 --- a/src/lbroyden.jl +++ b/src/lbroyden.jl @@ -187,7 +187,7 @@ function _rmatvec!!(y, xᵀU, U, Vᵀ, x) x_ = vec(x) xᵀU_ = view(xᵀU, 1:η) @bb xᵀU_ = transpose(U) × x_ - @bb y = transpose(Vᵀ) × xᵀU_ + @bb y = transpose(Vᵀ) × vec(xᵀU_) @bb @. y -= x return y end @@ -202,7 +202,7 @@ function _matvec!!(y, Vᵀx, U, Vᵀ, x) x_ = vec(x) Vᵀx_ = view(Vᵀx, 1:η) @bb Vᵀx_ = Vᵀ × x_ - @bb y = U × Vᵀx_ + @bb y = U × vec(Vᵀx_) @bb @. y -= x return y end diff --git a/src/levenberg.jl b/src/levenberg.jl index 9463a7c34..0013be126 100644 --- a/src/levenberg.jl +++ b/src/levenberg.jl @@ -224,7 +224,7 @@ function SciMLBase.__init(prob::Union{NonlinearProblem{uType, iip}, @bb u_cache = copy(u) @bb u_cache_2 = similar(u) @bb fu_cache_2 = similar(fu) - Jv = J * v + Jv = J * _vec(v) @bb v_cache = zero(v) return LevenbergMarquardtCache{iip, fastls}(f, alg, u, u_cache, u_cache_2, fu, fu_cache, @@ -265,16 +265,15 @@ function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, end linres = dolinsolve(alg.precs, linsolve; A = cache.mat_tmp, b = cache.rhs_tmp, linu = _vec(cache.v), cache.p, reltol = cache.abstol) - cache.linsolve = linres.cache - @bb @. cache.v = -linres.u else @bb cache.u_cache_2 = transpose(cache.J) × cache.fu @bb @. cache.mat_tmp = cache.JᵀJ + cache.λ * cache.DᵀD linres = dolinsolve(alg.precs, linsolve; A = __maybe_symmetric(cache.mat_tmp), b = _vec(cache.u_cache_2), linu = _vec(cache.v), cache.p, reltol = cache.abstol) - cache.linsolve = linres.cache - @bb @. cache.v = -linres.u end + cache.linsolve = linres.cache + linu = _restructure(cache.v, linres.u) + @bb @. cache.v = -linu update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), cache.J, cache.v) @@ -285,9 +284,9 @@ function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, # The following lines do: cache.a = -cache.mat_tmp \ cache.fu_tmp # NOTE: Don't pass `A`` in again, since we want to reuse the previous solve - @bb cache.Jv = cache.J × cache.v - @bb @. cache.fu_cache_2 = (2 / cache.h) * - ((cache.fu_cache_2 - cache.fu) / cache.h - cache.Jv) + @bb cache.Jv = cache.J × vec(cache.v) + Jv = _restructure(cache.fu_cache_2, cache.Jv) + @bb @. cache.fu_cache_2 = (2 / cache.h) * ((cache.fu_cache_2 - cache.fu) / cache.h - Jv) if fastls if setindex_trait(cache.rhs_tmp) === CanSetindex() cache.rhs_tmp[1:length(cache.fu)] .= _vec(cache.fu_cache_2) @@ -296,15 +295,14 @@ function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, end linres = dolinsolve(alg.precs, linsolve; b = cache.rhs_tmp, linu = _vec(cache.a), cache.p, reltol = cache.abstol) - cache.linsolve = linres.cache - @bb @. 
cache.a = -linres.u else @bb cache.u_cache_2 = transpose(cache.J) × cache.fu_cache_2 linres = dolinsolve(alg.precs, linsolve; b = _vec(cache.u_cache_2), linu = _vec(cache.a), cache.p, reltol = cache.abstol) - cache.linsolve = linres.cache - @bb @. cache.a = -linres.u end + cache.linsolve = linres.cache + linu = _restructure(cache.a, linres.u) + @bb @. cache.a = -linu cache.stats.nsolve += 2 cache.stats.nfactors += 2 diff --git a/src/trustRegion.jl b/src/trustRegion.jl index 738066bd2..abc93fd9a 100644 --- a/src/trustRegion.jl +++ b/src/trustRegion.jl @@ -504,15 +504,16 @@ function dogleg!(cache::TrustRegionCache{iip}) where {iip} # outside of trust region l_grad = cache.internalnorm(cache.Jᵀf) # length of the gradient d_cauchy = l_grad^3 / __lr_mul(cache) + g = _restructure(cache.du, cache.Jᵀf) if d_cauchy ≥ cache.trust_r # step to the end of the trust region - @bb @. cache.du = -(cache.trust_r / l_grad) * cache.Jᵀf + @bb @. cache.du = -(cache.trust_r / l_grad) * g return end # Take the intersection of dogleg with trust region if Cauchy point lies inside the # trust region - @bb @. cache.u_cauchy = -(d_cauchy / l_grad) * cache.Jᵀf # compute Cauchy point + @bb @. cache.u_cauchy = -(d_cauchy / l_grad) * g # compute Cauchy point @bb @. cache.u_cache_2 = cache.u_gauss_newton - cache.u_cauchy # calf of the dogleg a = dot(cache.u_cache_2, cache.u_cache_2) @@ -547,13 +548,19 @@ function not_terminated(cache::TrustRegionCache) return true end -# FIXME: Update the JacVec Operator for Yuan +# FIXME: Reinit `JᵀJ` operator if `p` is changed function __reinit_internal!(cache::TrustRegionCache; kwargs...) + if cache.jvp_operator !== nothing + cache.jvp_operator = __jacvec(cache.uf, cache.u; cache.fu, + autodiff = __get_nonsparse_ad(cache.alg.ad)) + @bb cache.Jᵀf = cache.jvp_operator × cache.fu + end cache.loss = __trust_region_loss(cache, cache.fu) + cache.loss_new = cache.loss cache.shrink_counter = 0 cache.trust_r = convert(eltype(cache.u), - ifelse(cache.alg.initial_trust_radius == 0, cache.alg.initial_trust_radius, - cache.max_trust_r / 11)) + ifelse(cache.alg.initial_trust_radius == 0, cache.max_trust_r / 11, + cache.alg.initial_trust_radius)) cache.make_new_J = true return nothing end diff --git a/test/23_test_problems.jl b/test/23_test_problems.jl index 7642f1ed6..58c08bb90 100644 --- a/test/23_test_problems.jl +++ b/test/23_test_problems.jl @@ -95,10 +95,10 @@ end alg_ops = (GeneralBroyden(; max_resets = 10),) broken_tests = Dict(alg => Int[] for alg in alg_ops) - broken_tests[alg_ops[1]] = [1, 2, 4, 5, 6, 11, 12, 13, 14] + broken_tests[alg_ops[1]] = [1, 4, 5, 6, 11, 12, 13, 14] skip_tests = Dict(alg => Int[] for alg in alg_ops) - skip_tests[alg_ops[1]] = [22] + skip_tests[alg_ops[1]] = [2, 22] test_on_library(problems, dicts, alg_ops, broken_tests; skip_tests) end diff --git a/test/gpu.jl b/test/gpu.jl index daeee0c58..c314f4d76 100644 --- a/test/gpu.jl +++ b/test/gpu.jl @@ -6,28 +6,22 @@ A = cu(rand(4, 4)) u0 = cu(rand(4)) b = cu(rand(4)) -function f(du, u, p) - du .= A * u .+ b -end +linear_f(du, u, p) = (du .= A * u .+ b) -prob = NonlinearProblem(f, u0) +prob = NonlinearProblem(linear_f, u0) -# TrustRegion is broken -# LimitedMemoryBroyden will diverge! 
for alg in (NewtonRaphson(), LevenbergMarquardt(; linsolve = QRFactorization()), PseudoTransient(; alpha_initial = 1.0f0), GeneralKlement(), GeneralBroyden(), - LimitedMemoryBroyden()) + LimitedMemoryBroyden(), TrustRegion()) @test_nowarn sol = solve(prob, alg; abstol = 1.0f-8, reltol = 1.0f-8) end -f(u, p) = A * u .+ b +linear_f(u, p) = A * u .+ b -prob = NonlinearProblem{false}(f, u0) +prob = NonlinearProblem{false}(linear_f, u0) -# TrustRegion is broken -# LimitedMemoryBroyden will diverge! for alg in (NewtonRaphson(), LevenbergMarquardt(; linsolve = QRFactorization()), PseudoTransient(; alpha_initial = 1.0f0), GeneralKlement(), GeneralBroyden(), - LimitedMemoryBroyden()) + LimitedMemoryBroyden(), TrustRegion()) @test_nowarn sol = solve(prob, alg; abstol = 1.0f-8, reltol = 1.0f-8) end diff --git a/test/infeasible.jl b/test/infeasible.jl index db5d31f1b..74ec4128e 100644 --- a/test/infeasible.jl +++ b/test/infeasible.jl @@ -56,15 +56,10 @@ end @test all(!isnan, sol.u) @test !SciMLBase.successful_retcode(sol.retcode) - try - u0 = @SVector [0.0, 0.0, 0.0] - prob = NonlinearProblem(f1, u0) - sol = solve(prob) + u0 = @SVector [0.0, 0.0, 0.0] + prob = NonlinearProblem(f1, u0) + sol = solve(prob) - @test all(!isnan, sol.u) - @test !SciMLBase.successful_retcode(sol.retcode) - catch err - # Static Arrays has different default linearsolve which throws an error - @test err isa SingularException - end + @test all(!isnan, sol.u) + @test !SciMLBase.successful_retcode(sol.retcode) end diff --git a/test/matrix_resizing.jl b/test/matrix_resizing.jl index 1d9462fa1..59a537ace 100644 --- a/test/matrix_resizing.jl +++ b/test/matrix_resizing.jl @@ -8,7 +8,7 @@ prob = NonlinearProblem(ff, u0, p) for alg in (NewtonRaphson(), TrustRegion(), LevenbergMarquardt(), PseudoTransient(), RobustMultiNewton(), FastShortcutNonlinearPolyalg(), GeneralBroyden(), GeneralKlement(), - LimitedMemoryBroyden()) + LimitedMemoryBroyden(; threshold = 2)) @test vec(solve(prob, alg).u) == solve(vecprob, alg).u end @@ -19,6 +19,7 @@ vecprob = NonlinearProblem(fiip, vec(u0), p) prob = NonlinearProblem(fiip, u0, p) for alg in (NewtonRaphson(), TrustRegion(), LevenbergMarquardt(), PseudoTransient(), - RobustMultiNewton(), FastShortcutNonlinearPolyalg(), GeneralBroyden(), GeneralKlement()) + RobustMultiNewton(), FastShortcutNonlinearPolyalg(), GeneralBroyden(), GeneralKlement(), + LimitedMemoryBroyden(; threshold = 2)) @test vec(solve(prob, alg).u) == solve(vecprob, alg).u end diff --git a/test/polyalgs.jl b/test/polyalgs.jl index 0a4e599b3..e56bb5353 100644 --- a/test/polyalgs.jl +++ b/test/polyalgs.jl @@ -46,15 +46,15 @@ sol = solve(prob; abstol = 1e-9) # https://github.com/SciML/NonlinearSolve.jl/issues/187 # If we use a General Nonlinear Solver the solution might go out of the domain! 
-ff(u, p) = 0.5 / 1.5 * NaNMath.log.(u ./ (1.0 .- u)) .- 2.0 * u .+ 1.0 +ff_interval(u, p) = 0.5 / 1.5 * NaNMath.log.(u ./ (1.0 .- u)) .- 2.0 * u .+ 1.0 uspan = (0.02, 0.1) -prob = IntervalNonlinearProblem(ff, uspan) +prob = IntervalNonlinearProblem(ff_interval, uspan) sol = solve(prob; abstol = 1e-9) @test SciMLBase.successful_retcode(sol) u0 = 0.06 p = 2.0 -prob = NonlinearProblem(ff, u0, p) +prob = NonlinearProblem(ff_interval, u0, p) sol = solve(prob; abstol = 1e-9) @test SciMLBase.successful_retcode(sol) From ee042973bf6fd56fdb4a569a4c1aa4dd95665173 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Tue, 5 Dec 2023 13:49:30 -0500 Subject: [PATCH 23/25] Count statistics inside calls and not in individual algorithms --- src/gaussnewton.jl | 5 +---- src/klement.jl | 5 +---- src/levenberg.jl | 16 +++++++--------- src/pseudotransient.jl | 5 +---- src/raphson.jl | 5 +---- src/trustRegion.jl | 4 +--- src/utils.jl | 20 +++++++++++++++++--- 7 files changed, 29 insertions(+), 31 deletions(-) diff --git a/src/gaussnewton.jl b/src/gaussnewton.jl index 9a227a7fa..822b0ffc3 100644 --- a/src/gaussnewton.jl +++ b/src/gaussnewton.jl @@ -123,7 +123,7 @@ function perform_step!(cache::GaussNewtonCache{iip}) where {iip} A, b = cache.J, _vec(cache.fu) end - linres = dolinsolve(cache.alg.precs, cache.linsolve; A, b, linu = _vec(cache.du), + linres = dolinsolve(cache, cache.alg.precs, cache.linsolve; A, b, linu = _vec(cache.du), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache cache.du = _restructure(cache.du, linres.u) @@ -142,9 +142,6 @@ function perform_step!(cache::GaussNewtonCache{iip}) where {iip} @bb copyto!(cache.u_cache, cache.u) @bb copyto!(cache.dfu, cache.fu) - cache.stats.njacs += 1 - cache.stats.nsolve += 1 - cache.stats.nfactors += 1 return nothing end diff --git a/src/klement.jl b/src/klement.jl index 62aa8f681..da34958fe 100644 --- a/src/klement.jl +++ b/src/klement.jl @@ -123,7 +123,7 @@ function perform_step!(cache::GeneralKlementCache{iip}) where {iip} A = ifelse(cache.J isa SMatrix || cache.J isa Number || !fact_done, cache.J, nothing) # u = u - J \ fu - linres = dolinsolve(alg.precs, cache.linsolve; A, + linres = dolinsolve(cache, alg.precs, cache.linsolve; A, b = _vec(cache.fu), linu = _vec(cache.du), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache cache.du = _restructure(cache.du, linres.u) @@ -139,9 +139,6 @@ function perform_step!(cache::GeneralKlementCache{iip}) where {iip} @bb copyto!(cache.u_cache, cache.u) - cache.stats.nsolve += 1 - cache.stats.nfactors += 1 - cache.force_stop && return nothing # Update the Jacobian diff --git a/src/levenberg.jl b/src/levenberg.jl index 0013be126..160406f66 100644 --- a/src/levenberg.jl +++ b/src/levenberg.jl @@ -263,13 +263,14 @@ function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, else cache.rhs_tmp = _vcat(_vec(cache.fu), zero(_vec(cache.u))) end - linres = dolinsolve(alg.precs, linsolve; A = cache.mat_tmp, + linres = dolinsolve(cache, alg.precs, linsolve; A = cache.mat_tmp, b = cache.rhs_tmp, linu = _vec(cache.v), cache.p, reltol = cache.abstol) else @bb cache.u_cache_2 = transpose(cache.J) × cache.fu @bb @. 
cache.mat_tmp = cache.JᵀJ + cache.λ * cache.DᵀD - linres = dolinsolve(alg.precs, linsolve; A = __maybe_symmetric(cache.mat_tmp), - b = _vec(cache.u_cache_2), linu = _vec(cache.v), cache.p, reltol = cache.abstol) + linres = dolinsolve(cache, alg.precs, linsolve; + A = __maybe_symmetric(cache.mat_tmp), b = _vec(cache.u_cache_2), + linu = _vec(cache.v), cache.p, reltol = cache.abstol) end cache.linsolve = linres.cache linu = _restructure(cache.v, linres.u) @@ -293,20 +294,17 @@ function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, else cache.rhs_tmp = _vcat(_vec(cache.fu_cache_2), zero(_vec(cache.u))) end - linres = dolinsolve(alg.precs, linsolve; b = cache.rhs_tmp, linu = _vec(cache.a), - cache.p, reltol = cache.abstol) + linres = dolinsolve(cache, alg.precs, linsolve; b = cache.rhs_tmp, + linu = _vec(cache.a), cache.p, reltol = cache.abstol) else @bb cache.u_cache_2 = transpose(cache.J) × cache.fu_cache_2 - linres = dolinsolve(alg.precs, linsolve; b = _vec(cache.u_cache_2), + linres = dolinsolve(cache, alg.precs, linsolve; b = _vec(cache.u_cache_2), linu = _vec(cache.a), cache.p, reltol = cache.abstol) end cache.linsolve = linres.cache linu = _restructure(cache.a, linres.u) @bb @. cache.a = -linu - cache.stats.nsolve += 2 - cache.stats.nfactors += 2 - # Require acceptable steps to satisfy the following condition. norm_v = cache.internalnorm(cache.v) if 2 * cache.internalnorm(cache.a) ≤ cache.α_geodesic * norm_v diff --git a/src/pseudotransient.jl b/src/pseudotransient.jl index 2849e0a28..1416cc4b8 100644 --- a/src/pseudotransient.jl +++ b/src/pseudotransient.jl @@ -127,7 +127,7 @@ function perform_step!(cache::PseudoTransientCache{iip}) where {iip} end # u = u - J \ fu - linres = dolinsolve(alg.precs, cache.linsolve; A, b = _vec(cache.fu), + linres = dolinsolve(cache, alg.precs, cache.linsolve; A, b = _vec(cache.fu), linu = _vec(cache.du), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache cache.du = _restructure(cache.du, linres.u) @@ -145,9 +145,6 @@ function perform_step!(cache::PseudoTransientCache{iip}) where {iip} check_and_update!(cache, cache.fu, cache.u, cache.u_cache) @bb copyto!(cache.u_cache, cache.u) - cache.stats.njacs += 1 - cache.stats.nsolve += 1 - cache.stats.nfactors += 1 return nothing end diff --git a/src/raphson.jl b/src/raphson.jl index baf2ec10c..9ba6319aa 100644 --- a/src/raphson.jl +++ b/src/raphson.jl @@ -104,7 +104,7 @@ function perform_step!(cache::NewtonRaphsonCache{iip}) where {iip} cache.J = jacobian!!(cache.J, cache) # u = u - J \ fu - linres = dolinsolve(alg.precs, cache.linsolve; A = cache.J, b = _vec(cache.fu), + linres = dolinsolve(cache, alg.precs, cache.linsolve; A = cache.J, b = _vec(cache.fu), linu = _vec(cache.du), cache.p, reltol = cache.abstol) cache.linsolve = linres.cache cache.du = _restructure(cache.du, linres.u) @@ -119,8 +119,5 @@ function perform_step!(cache::NewtonRaphsonCache{iip}) where {iip} check_and_update!(cache, cache.fu, cache.u, cache.u_cache) @bb copyto!(cache.u_cache, cache.u) - cache.stats.njacs += 1 - cache.stats.nsolve += 1 - cache.stats.nfactors += 1 return nothing end diff --git a/src/trustRegion.jl b/src/trustRegion.jl index abc93fd9a..524aa8694 100644 --- a/src/trustRegion.jl +++ b/src/trustRegion.jl @@ -360,7 +360,7 @@ function perform_step!(cache::TrustRegionCache{iip}) where {iip} # do not use A = cache.H, b = _vec(cache.g) since it is equivalent # to A = cache.J, b = _vec(fu) as long as the Jacobian is non-singular - linres = dolinsolve(cache.alg.precs, cache.linsolve, A = cache.J, 
+ linres = dolinsolve(cache, cache.alg.precs, cache.linsolve, A = cache.J, b = _vec(cache.fu), linu = _vec(cache.u_gauss_newton), p = cache.p, reltol = cache.abstol) cache.linsolve = linres.cache @@ -375,8 +375,6 @@ function perform_step!(cache::TrustRegionCache{iip}) where {iip} @bb @. cache.u_cache_2 = cache.u + cache.du evaluate_f(cache, cache.u_cache_2, cache.p, Val{:fu_cache_2}()) trust_region_step!(cache) - cache.stats.nsolve += 1 - cache.stats.nfactors += 1 return nothing end diff --git a/src/utils.jl b/src/utils.jl index 56a976aa8..99eda8807 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -88,18 +88,26 @@ end DEFAULT_PRECS(W, du, u, p, t, newW, Plprev, Prprev, cachedata) = nothing, nothing -function dolinsolve(precs::P, linsolve::FakeLinearSolveJLCache; A = nothing, +function dolinsolve(cache, precs::P, linsolve::FakeLinearSolveJLCache; A = nothing, linu = nothing, b = nothing, du = nothing, p = nothing, weight = nothing, cachedata = nothing, reltol = nothing, reuse_A_if_factorization = false) where {P} + # Update Statistics + cache.stats.nsolve += 1 + cache.stats.nfactors += !(A isa Number) + A !== nothing && (linsolve.A = A) b !== nothing && (linsolve.b = b) linres = linsolve.A \ linsolve.b return FakeLinearSolveJLResult(linsolve, linres) end -function dolinsolve(precs::P, linsolve; A = nothing, linu = nothing, b = nothing, +function dolinsolve(cache, precs::P, linsolve; A = nothing, linu = nothing, b = nothing, du = nothing, p = nothing, weight = nothing, cachedata = nothing, reltol = nothing, reuse_A_if_factorization = false) where {P} + # Update Statistics + cache.stats.nsolve += 1 + cache.stats.nfactors += 1 + # Some Algorithms would reuse factorization but it causes the cache to not reset in # certain cases if A !== nothing @@ -108,10 +116,16 @@ function dolinsolve(precs::P, linsolve; A = nothing, linu = nothing, b = nothing (alg isa LinearSolve.DefaultLinearSolver && !(alg == LinearSolve.DefaultLinearSolver(LinearSolve.DefaultAlgorithmChoice.KrylovJL_GMRES))) # Factorization Algorithm - !reuse_A_if_factorization && (linsolve.A = A) + if reuse_A_if_factorization + cache.stats.nfactors -= 1 + else + linsolve.A = A + end else linsolve.A = A end + else + cache.stats.nfactors -= 1 end b !== nothing && (linsolve.b = b) linu !== nothing && (linsolve.u = linu) From 8666e05a922699a20e085815ba926fbb1ebe223a Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Tue, 5 Dec 2023 17:59:51 -0500 Subject: [PATCH 24/25] Counter for jacobians --- src/jacobian.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/jacobian.jl b/src/jacobian.jl index 60be3f3cd..1ab1ff2b7 100644 --- a/src/jacobian.jl +++ b/src/jacobian.jl @@ -34,6 +34,7 @@ jacobian!!(J, _) = J # and we don't want wasteful `copyto!` function jacobian!!(J::Union{AbstractMatrix{<:Number}, Nothing}, cache) @unpack f, uf, u, p, jac_cache, alg, fu_cache = cache + cache.stats.njacs += 1 iip = isinplace(cache) if iip if has_jac(f) @@ -53,8 +54,10 @@ function jacobian!!(J::Union{AbstractMatrix{<:Number}, Nothing}, cache) end end # Scalar case -jacobian!!(::Number, cache) = last(value_derivative(cache.uf, cache.u)) - +function jacobian!!(::Number, cache) + cache.stats.njacs += 1 + return last(value_derivative(cache.uf, cache.u)) +end # Build Jacobian Caches function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u, p, ::Val{iip}; linsolve_kwargs = (;), lininit::Val{linsolve_init} = Val(true), From 3b52a5accf37028a1f32f1ce2664ddca7cb255bb Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Wed, 6 Dec 2023 11:07:26 
-0500 Subject: [PATCH 25/25] More tests pass --- src/trustRegion.jl | 2 +- test/23_test_problems.jl | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/trustRegion.jl b/src/trustRegion.jl index 524aa8694..5a6360f73 100644 --- a/src/trustRegion.jl +++ b/src/trustRegion.jl @@ -236,7 +236,7 @@ end function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::TrustRegion, args...; alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, - termination_condition = nothing, internalnorm = Base.Fix2(norm, 2), + termination_condition = nothing, internalnorm = DEFAULT_NORM, linsolve_kwargs = (;), kwargs...) where {uType, iip} alg = get_concrete_algorithm(alg_, prob) @unpack f, u0, p = prob diff --git a/test/23_test_problems.jl b/test/23_test_problems.jl index 58c08bb90..035bb130c 100644 --- a/test/23_test_problems.jl +++ b/test/23_test_problems.jl @@ -75,7 +75,7 @@ end broken_tests = Dict(alg => Int[] for alg in alg_ops) broken_tests[alg_ops[1]] = [3, 6, 11, 17, 21] broken_tests[alg_ops[2]] = [3, 6, 11, 17, 21] - broken_tests[alg_ops[3]] = [6, 11, 17, 21] + broken_tests[alg_ops[3]] = [6, 11, 21] test_on_library(problems, dicts, alg_ops, broken_tests) end @@ -89,8 +89,6 @@ end test_on_library(problems, dicts, alg_ops, broken_tests) end -# Broyden and Klement Tests are quite flaky and failure seems to be platform dependent -# needs additional investigation before we can enable them @testset "GeneralBroyden 23 Test Problems" begin alg_ops = (GeneralBroyden(; max_resets = 10),)