JuliaStats · palday · Aug 29, 2023 · Aug 4, 2023 · Aug 4, 2023 · Aug 4, 2023
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,7 @@
+MixedModels v4.20.0 Release Notes
+==============================
+* The `.tbl` property of a `MixedModelBootstrap` now includes the correlation parameters for lower triangular elements of the `λ` field. [#702]
+
 MixedModels v4.19.0 Release Notes
 ==============================
 * New method `StatsAPI.coefnames(::ReMat)` returns the coefficient names associated with each grouping factor. [#709]
@@ -463,6 +467,7 @@ Package dependencies
 [#682]: https://github.com/JuliaStats/MixedModels.jl/issues/682
 [#694]: https://github.com/JuliaStats/MixedModels.jl/issues/694
 [#698]: https://github.com/JuliaStats/MixedModels.jl/issues/698
+[#702]: https://github.com/JuliaStats/MixedModels.jl/issues/702
 [#703]: https://github.com/JuliaStats/MixedModels.jl/issues/703
 [#707]: https://github.com/JuliaStats/MixedModels.jl/issues/707
 [#709]: https://github.com/JuliaStats/MixedModels.jl/issues/709
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "MixedModels"
 uuid = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316"
 author = ["Phillip Alday <[email protected]>", "Douglas Bates <[email protected]>", "Jose Bayoan Santiago Calderon <[email protected]>"]
-version = "4.19.0"
+version = "4.20.0"
 
 [deps]
 Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"

diff --git a/docs/src/bootstrap.md b/docs/src/bootstrap.md
@@ -24,10 +24,6 @@ parameter, `θ`, that defines the variance-covariance matrices of the random eff
 For example, a simple linear mixed-effects model for the `Dyestuff` data in the [`lme4`](http://github.com/lme4/lme4)
 package for [`R`](https://www.r-project.org) is fit by
 
-```@setup Main
-using DisplayAs
-```
-
 ```@example Main
 using DataFrames
 using Gadfly          # plotting package
@@ -38,41 +34,29 @@ using Random
 ```@example Main
 dyestuff = MixedModels.dataset(:dyestuff)
 m1 = fit(MixedModel, @formula(yield ~ 1 + (1 | batch)), dyestuff)
-DisplayAs.Text(ans) # hide
 ```
 
-To bootstrap the model parameters, first initialize a random number generator then create a bootstrap sample
+To bootstrap the model parameters, first initialize a random number generator then create a bootstrap sample and extract the `tbl` property, which is a `Table` - a lightweight dataframe-like object.
 
 ```@example Main
 const rng = MersenneTwister(1234321);
 samp = parametricbootstrap(rng, 10_000, m1);
-df = DataFrame(samp.allpars);
-first(df, 10)
+tbl = samp.tbl
 ```
 
-Especially for those with a background in [`R`](https://www.R-project.org/) or [`pandas`](https://pandas.pydata.org),
-the simplest way of accessing the parameter estimates in the parametric bootstrap object is to create a `DataFrame` from the `allpars` property as shown above.
-
-We can use `filter` to filter out relevant rows of a dataframe.
 A density plot of the estimates of `σ`, the residual standard deviation, can be created as
 ```@example Main
-σres = filter(df) do row # create a thunk that operates on rows
-    row.type == "σ" && row.group == "residual" # our filtering rule
-end
-
-plot(x = σres.value, Geom.density, Guide.xlabel("Parametric bootstrap estimates of σ"))
+plot(x = tbl.σ, Geom.density, Guide.xlabel("Parametric bootstrap estimates of σ"))
 ```
-For the estimates of the intercept parameter, the `getproperty` extractor must be used
+
+or, for the intercept parameter
 ```@example Main
-plot(filter(:type => ==("β"),  df), x = :value, Geom.density, Guide.xlabel("Parametric bootstrap estimates of β₁"))
+plot(x = tbl.β1, Geom.density, Guide.xlabel("Parametric bootstrap estimates of β₁"))
 ```
 
 A density plot of the estimates of the standard deviation of the random effects is obtained as
 ```@example Main
-σbatch = filter(df) do row # create a thunk that operates on rows
-    row.type == "σ" && row.group == "batch" # our filtering rule
-end
-plot(x = σbatch.value, Geom.density,
+plot(x = tbl.σ1, Geom.density,
     Guide.xlabel("Parametric bootstrap estimates of σ₁"))
 ```
 
@@ -81,7 +65,7 @@ Although this mode appears to be diffuse, this is an artifact of the way that de
 In fact, it is a pulse, as can be seen from a histogram.
 
 ```@example Main
-plot(x = σbatch.value, Geom.histogram,
+plot(x = tbl.σ1, Geom.histogram,
     Guide.xlabel("Parametric bootstrap estimates of σ₁"))
 ```
 
@@ -93,14 +77,9 @@ The shortest such intervals, obtained with the `shortestcovint` extractor, corre
 shortestcovint
 ```
 
-We generate these for all random and fixed effects:
+We generate these directly from the original bootstrap object:
 ```@example Main
-combine(groupby(df, [:type, :group, :names]), :value => shortestcovint => :interval)
-```
-
-We can also generate this directly from the original bootstrap object:
-```@example Main
-DataFrame(shortestcovint(samp))
+Table(shortestcovint(samp))
 ```
 
 A value of zero for the standard deviation of the random effects is an example of a *singular* covariance.
@@ -110,48 +89,39 @@ However, it is not as straightforward to detect singularity in vector-valued ran
 For example, if we bootstrap a model fit to the `sleepstudy` data
 ```@example Main
 sleepstudy = MixedModels.dataset(:sleepstudy)
-m2 = fit(
-    MixedModel,
-    @formula(reaction ~ 1+days+(1+days|subj)),
-    sleepstudy,
-)
-DisplayAs.Text(ans) # hide
+contrasts = Dict(:subj => Grouping())
+m2 = let f = @formula reaction ~ 1+days+(1+days|subj)
+    fit(MixedModel, f, sleepstudy; contrasts)
+end
 ```
 ```@example Main
 samp2 = parametricbootstrap(rng, 10_000, m2);
-df2 = DataFrame(samp2.allpars);
-first(df2, 10)
+tbl2 = samp2.tbl
 ```
 the singularity can be exhibited as a standard deviation of zero or as a correlation of $\pm1$.
 
 ```@example Main
-DataFrame(shortestcovint(samp2))
+shortestcovint(samp2)
 ```
 
 A histogram of the estimated correlations from the bootstrap sample has a spike at `+1`.
 ```@example Main
-ρs = filter(df2) do row
-    row.type == "ρ" && row.group == "subj"
-end
-plot(x = ρs.value, Geom.histogram,
+plot(x = tbl2.ρ1, Geom.histogram,
     Guide.xlabel("Parametric bootstrap samples of correlation of random effects"))
 ```
 or, as a count,
 ```@example Main
-count(ρs.value .≈ 1)
+count(tbl2.ρ1 .≈ 1)
 ```
 
 Close examination of the histogram shows a few values of `-1`.
 ```@example Main
-count(ρs.value .≈ -1)
+count(tbl2.ρ1 .≈ -1)
 ```
 
 Furthermore there are even a few cases where the estimate of the standard deviation of the random effect for the intercept is zero.
 ```@example Main
-σs = filter(df2) do row
-    row.type == "σ" && row.group == "subj" && row.names == "(Intercept)"
-end
-count(σs.value .≈ 0)
+count(tbl2.σ1 .≈ 0)
 ```
 
 There is a general condition to check for singularity of an estimated covariance matrix or matrices in a bootstrap sample.

diff --git a/src/bootstrap.jl b/src/bootstrap.jl
@@ -253,7 +253,7 @@ of `iter`, `type`, `group`, `name` and `value`.
     a breaking change.
 """
 function allpars(bsamp::MixedModelFitCollection{T}) where {T}
-    fits, λ, fcnames = bsamp.fits, bsamp.λ, bsamp.fcnames
+    (; fits, λ, fcnames) = bsamp
     npars = 2 + length(first(fits).β) + sum(map(k -> (k * (k + 1)) >> 1, size.(bsamp.λ, 2)))
     nresrow = length(fits) * npars
     cols = (
@@ -392,12 +392,12 @@ function Base.propertynames(bsamp::MixedModelFitCollection)
 end
 
 """
+    setθ!(bsamp::MixedModelFitCollection, θ::AbstractVector)
     setθ!(bsamp::MixedModelFitCollection, i::Integer)
 
 Install the values of the i'th θ value of `bsamp.fits` in `bsamp.λ`
 """
-function setθ!(bsamp::MixedModelFitCollection, i::Integer)
-    θ = bsamp.fits[i].θ
+function setθ!(bsamp::MixedModelFitCollection{T}, θ::AbstractVector{T}) where {T}
     offset = 0
     for (λ, inds) in zip(bsamp.λ, bsamp.inds)
         λdat = _getdata(λ)
@@ -410,6 +410,10 @@ function setθ!(bsamp::MixedModelFitCollection, i::Integer)
     return bsamp
 end
 
+function setθ!(bsamp::MixedModelFitCollection, i::Integer)
+    return setθ!(bsamp, bsamp.θ[i])
+end
+
 _getdata(x::Diagonal) = x
 _getdata(x::LowerTriangular) = x.data
 
@@ -444,7 +448,7 @@ Return the shortest interval containing `level` proportion for each parameter fr
     a breaking change.
 """
 function shortestcovint(bsamp::MixedModelFitCollection{T}, level=0.95) where {T}
-    allpars = bsamp.allpars
+    allpars = bsamp.allpars  # TODO probably simpler to use .tbl instead of .allpars
     pars = unique(zip(allpars.type, allpars.group, allpars.names))
 
     colnms = (:type, :group, :names, :lower, :upper)
@@ -521,6 +525,7 @@ end
 
 """
     tidyσs(bsamp::MixedModelFitCollection)
+
 Return a tidy (row)table with the estimates of the variance components (on the standard deviation scale) spread into columns
 of `iter`, `group`, `column` and `σ`.
 """
@@ -544,40 +549,90 @@ function tidyσs(bsamp::MixedModelFitCollection{T}) where {T}
     return result
 end
 
+_nρ(d::Diagonal) = 0
+_nρ(t::LowerTriangular) = kchoose2(size(t.data, 1))
+
+function σρnms(λ)
+    σsyms = _generatesyms('σ', sum(first ∘ size, λ))
+    ρsyms = _generatesyms('ρ', sum(_nρ, λ))
+    val = sizehint!(Symbol[], length(σsyms) + length(ρsyms))
+    for l in λ
+        for _ in axes(l, 1)
+            push!(val, popfirst!(σsyms))
+        end
+        for _ in 1:_nρ(l)
+            push!(val, popfirst!(ρsyms))
+        end
+    end
+    return val
+end
+
+function _syms(bsamp::MixedModelBootstrap)
+    (; fits, λ) = bsamp
+    (; β, θ) = first(fits)
+    syms = [:obj]
+    append!(syms, _generatesyms('β', length(β)))
+    push!(syms, :σ)
+    append!(syms, σρnms(λ))
+    return append!(syms, _generatesyms('θ', length(θ)))
+end
+
+function σρ!(v::AbstractVector, d::Diagonal, σ)
+    return append!(v, σ .* d.diag)
+end
+
+function σρ!(v::AbstractVector{T}, t::LowerTriangular{T}, σ::T) where {T}
+    """
+        σρ!(v, t, σ)
+    push! `σ` times the row lengths (σs) and the inner products of normalized rows (ρs) of `t` onto `v` 
+    """
+    dat = t.data
+    for i in axes(dat, 1)
+        ssqr = zero(T)
+        for j in 1:i
+            ssqr += abs2(dat[i, j])
+        end
+        len = sqrt(ssqr)
+        push!(v, σ * len)
+        if len > 0
+            for j in 1:i
+                dat[i, j] /= len
+            end
+        end
+    end
+    for i in axes(dat, 1)
+        for j in 1:(i - 1)
+            s = zero(T)
+            for k in 1:i
+                s += dat[i, k] * dat[j, k]
+            end
+            push!(v, s)
+        end
+    end
+    return v
+end
+
 function pbstrtbl(bsamp::MixedModelFitCollection{T}) where {T}
-    (; fits, λ, inds) = bsamp
-    row1 = first(fits)
-    cnms = [:obj, :σ]
-    pos = Dict{Symbol,UnitRange{Int}}(:obj => 1:1, :σ => 2:2)
-    βsz = length(row1.β)
-    append!(cnms, _generatesyms('β', βsz))
-    lastpos = 2 + βsz
-    pos[:β] = 3:lastpos
-    σsz = sum(m -> size(m, 1), bsamp.λ)
-    append!(cnms, _generatesyms('σ', σsz))
-    pos[:σs] = (lastpos + 1):(lastpos + σsz)
-    lastpos += σsz
-    θsz = length(row1.θ)
-    append!(cnms, _generatesyms('θ', θsz))
-    pos[:θ] = (lastpos + 1):(lastpos + θsz)
-    tblrowtyp = NamedTuple{(cnms...,),NTuple{length(cnms),T}}
-    val = sizehint!(tblrowtyp[], length(bsamp.fits))
-    v = Vector{T}(undef, length(cnms))
-    for (i, r) in enumerate(bsamp.fits)
-        v[1] = r.objective
-        v[2] = coalesce(r.σ, one(T))
-        copyto!(view(v, pos[:β]), r.β)
-        fill!(view(v, pos[:σs]), zero(T))
-        copyto!(view(v, pos[:θ]), r.θ)
-        setθ!(bsamp, i)
-        vpos = first(pos[:σs])
+    (; fits, λ) = bsamp
+    λcp = copy.(λ)
+    syms = _syms(bsamp)
+    m = length(syms)
+    n = length(fits)
+    v = sizehint!(T[], m * n)
+    for f in fits
+        (; β, θ, σ) = f
+        push!(v, f.objective)
+        append!(v, β)
+        push!(v, σ)
+        setθ!(bsamp, θ)
         for l in λ
-            for λr in eachrow(l)
-                v[vpos] = r.σ * norm(λr)
-                vpos += 1
-            end
+            σρ!(v, l, σ)
         end
-        push!(val, tblrowtyp(v))
+        append!(v, θ)
     end
-    return val
+    m = permutedims(reshape(v, (m, n)), (2, 1)) # equivalent to collect(transpose(...))
+    for k in eachindex(λ, λcp)   # restore original contents of λ
+        copyto!(λ[k], λcp[k])
+    end
+    return Table(Tables.table(m; header=syms))
 end
diff --git a/src/profile/utilities.jl b/src/profile/utilities.jl
@@ -24,10 +24,12 @@ Utility to generate a vector of Symbols of the form :<tag><index> from a tag and
 
 The indices are left-padded with zeros to allow lexicographic sorting.
 """
-function _generatesyms(tag::Char, len::Integer)
+function _generatesyms(tag::AbstractString, len::Integer)
     return Symbol.(string.(tag, lpad.(Base.OneTo(len), ndigits(len), '0')))
 end
 
+_generatesyms(tag::Char, len::Integer) = _generatesyms(string(tag), len)
+
 function TableColumns(m::LinearMixedModel{T}) where {T}
     nmvec = [:ζ]
     positions = Dict(:ζ => 1:1)

diff --git a/test/bootstrap.jl b/test/bootstrap.jl
@@ -156,7 +156,11 @@ end
         pb0 = quickboot(m0)
         pb1 = quickboot(m1)
         savereplicates(io, pb1)
-        # wrong model``
+        @test isa(pb0.tbl, Table)
+        @test isa(pb1.tbl, Table)  # create tbl here to check it doesn't modify pb1
+        @test ncol(DataFrame(pb1.β)) == 3
+
+        # wrong model
         @test_throws ArgumentError restorereplicates(seekstart(io), m0)
         # need to specify an eltype!
         @test_throws MethodError restorereplicates(seekstart(io), m1, MixedModelBootstrap)
@@ -184,7 +188,7 @@ end
         @test pb1 == restorereplicates(seekstart(io), MixedModels.unfit!(deepcopy(m1)))
 
         @test pb1 ≈ restorereplicates(seekstart(io), m1, Float16) rtol=1
-    end
+end
 
     @testset "Bernoulli simulate! and GLMM bootstrap" begin
         contra = dataset(:contra)