JuliaStats · palday · Apr 15, 2021 · Apr 12, 2021 · Apr 12, 2021 · Apr 12, 2021
diff --git a/NEWS.md b/NEWS.md
@@ -36,6 +36,14 @@ Run-time formula syntax
 * Methods for `Base./(::AbstractTerm, ::AbstractTerm)` are added, allowing
   nesting syntax to be used with `Term`s at run-time as well [#470]
 
+MixedModels v3.6.0 Release Notes
+========================
+* Add `likelihoodratiotest` method for comparing non-mixed (generalized) linear models to (generalized) linear mixed models [#508].
+
+MixedModels v3.5.2 Release Notes
+========================
+* Explicitly deprecate vestigial `named` kwarg in `ranef` in favor of `raneftables` [#507].
+
 MixedModels v3.5.1 Release Notes
 ========================
 * Fix MIME show methods for models with random-effects not corresponding to a fixed effect [#501].
@@ -223,3 +231,5 @@ Package dependencies
 [#495]: https://github.com/JuliaStats/MixedModels.jl/issues/495
 [#501]: https://github.com/JuliaStats/MixedModels.jl/issues/501
 [#506]: https://github.com/JuliaStats/MixedModels.jl/issues/506
+[#507]: https://github.com/JuliaStats/MixedModels.jl/issues/507
+[#508]: https://github.com/JuliaStats/MixedModels.jl/issues/508
diff --git a/src/MixedModels.jl b/src/MixedModels.jl
@@ -23,6 +23,7 @@ using Tables
 using LinearAlgebra: BlasFloat, BlasReal, HermOrSym, PosDefException, copytri!
 using Base: Ryu
 using GLM: Link, canonicallink
+using StatsModels: TableRegressionModel
 
 using StatsFuns: log2π, normccdf
 

diff --git a/src/likelihoodratiotest.jl b/src/likelihoodratiotest.jl
@@ -6,17 +6,20 @@ Results of MixedModels.likelihoodratiotest
 ## Fields
 * `formulas`: Vector of model formulae
 * `models`: NamedTuple of the `dof` and `deviance` of the models
-* `tests`: NamedTuple of the sequential `dofdiff`, `deviancediff`, and resulting `pvalues`
+* `tests`: NamedTuple of the sequential `dofdiff`, `deviancediff`,
+           and resulting `pvalues`
 
 ## Properties
-* `deviance`
+* `deviance` : note that this is actually -2 log likelihood for linear models
+               (i.e. without subtracting the constant for a saturated model)
 * `pvalues`
 
 """
 struct LikelihoodRatioTest
     formulas::AbstractVector{String}
     models::NamedTuple{(:dof,:deviance)}
     tests::NamedTuple{(:dofdiff,:deviancediff,:pvalues)}
+    linear::Bool
 end
 
 Base.propertynames(lrt::LikelihoodRatioTest, private::Bool = false) = (
@@ -46,12 +49,24 @@ Base.getindex(lrt::LikelihoodRatioTest, s::Symbol) = getfield(lrt,s)
 
 """
     likelihoodratiotest(m::MixedModel...)
+    likelihoodratiotest(m0::LinearModel, m::MixedModel...)
+    likelihoodratiotest(m0::GeneralizedLinearModel, m::MixedModel...)
+    likelihoodratiotest(m0::TableRegressionModel{LinearModel}, m::MixedModel...)
+    likelihoodratiotest(m0::TableRegressionModel{GeneralizedLinearModel}, m::MixedModel...)
+
 
 Likeihood ratio test applied to a set of nested models.
 
-Note that nesting of the models is not checked.  It is incumbent on the user
-to check this. This differs from [`StatsModels.lrtest`](@ref) as nesting in
-mixed models, especially in the random effects specification, may be non obvious.
+!!! note
+    The nesting of the models is not checked.  It is incumbent on the user
+    to check this. This differs from [`StatsModels.lrtest`](@ref) as nesting in
+    mixed models, especially in the random effects specification, may be non obvious.
+
+!!! note
+    For comparisons between mixed and non-mixed models, the deviance for the non-mixed
+    model is taken to be -2 log likelihood, i.e. omitting the additive constant for the
+    fully saturated model. This is in line with the computation of the deviance for mixed
+    models.
 
 This functionality may be deprecated in the future in favor of [`StatsModels.lrtest`](@ref).
 """
@@ -81,7 +96,44 @@ function likelihoodratiotest(m::MixedModel...)
     LikelihoodRatioTest(
         formulas,
         (dof = dofs, deviance = devs),
-        (dofdiff = dofdiffs, deviancediff = devdiffs, pvalues = pvals)
+        (dofdiff = dofdiffs, deviancediff = devdiffs, pvalues = pvals),
+        first(m) isa LinearMixedModel
+    )
+end
+
+_formula(::Union{LinearModel, GeneralizedLinearModel}) = "NA"
+_formula(x::TableRegressionModel{<:Union{LinearModel, GeneralizedLinearModel}}) = String(Symbol(x.mf.f))
+
+# for GLMMs we're actually looking at the deviance and additive constants are comparable
+# (because GLM deviance is actually part of the GLMM deviance computation)
+# for LMMs, we're always looking at the "deviance scale" but without the additive constant
+# for the fully saturated model
+_criterion(x::Union{GeneralizedLinearModel, TableRegressionModel{<:GeneralizedLinearModel}}) = deviance(x)
+_criterion(x::Union{LinearModel, TableRegressionModel{<:LinearModel}}) = -2 * loglikelihood(x)
+
+function likelihoodratiotest(m0::Union{TableRegressionModel{<:Union{LinearModel, GeneralizedLinearModel}},
+                                       LinearModel, GeneralizedLinearModel},
+                             m::MixedModel...)
+    _iscomparable(m0, first(m)) ||
+        throw(ArgumentError("""Models are not comparable: are the objectives, data
+                               and, where appropriate, the link and family the same?
+                            """))
+    lrt = likelihoodratiotest(m...)
+    devs = pushfirst!(lrt.deviance, _criterion(m0))
+    formulas = pushfirst!(lrt.formulas, _formula(m0))
+    dofs = pushfirst!(lrt.models.dof, dof(m0))
+    devdiffs = pushfirst!(lrt.tests.deviancediff, devs[1] - devs[2])
+    dofdiffs = pushfirst!(lrt.tests.dofdiff, dofs[2] - dofs[1])
+
+    df, dev = first(dofdiffs), first(devdiffs)
+    p = dev > 0 ? ccdf(Chisq(df), dev) : NaN
+    pvals = pushfirst!(lrt.tests.pvalues, p)
+
+    LikelihoodRatioTest(
+        formulas,
+        (dof = dofs, deviance = devs),
+        (dofdiff = dofdiffs, deviancediff = devdiffs, pvalues = pvals),
+        lrt.linear
     )
 end
 
@@ -103,7 +155,7 @@ function Base.show(io::IO, ::MIME"text/plain", lrt::LikelihoodRatioTest)
 
     outrows[1, :] = ["",
                     "model-dof",
-                    "deviance",
+                    lrt.linear ? "-2 logLik" : "deviance",
                     "χ²",
                     "χ²-dof",
                     "P(>χ²)"] # colnms
@@ -218,3 +270,90 @@ function StatsModels.isnested(m1::MixedModel, m2::MixedModel; atol::Real=0.0)
 
     true
 end
+
+
+function _iscomparable(m1::TableRegressionModel{<:Union{LinearModel, GeneralizedLinearModel}},
+                       m2::MixedModel)
+     _iscomparable(m1.model, m2) || return false
+
+    # check that the nested fixef are a subset of the outer
+    all(in.(coefnames(m1),  Ref(coefnames(m2)))) || return false
+
+    return true
+end
+
+# GLM isn't nested with in LMM and LM isn't nested within GLMM
+_iscomparable(m1::Union{LinearModel, GeneralizedLinearModel}, m2::MixedModel) = false
+
+function _iscomparable(m1::LinearModel, m2::LinearMixedModel)
+    nobs(m1) == nobs(m2) || return false
+
+    # XXX This reaches into the internal structure of GLM
+    size(m1.pp.X, 2) <= size(m2.X, 2) || return false
+
+    _isnested(m1.pp.X, m2.X) || return false
+
+    !m2.optsum.REML ||
+        throw(ArgumentError("REML-fitted models cannot be compared to linear models"))
+
+    return true
+end
+
+function _iscomparable(m1::GeneralizedLinearModel, m2::GeneralizedLinearMixedModel)
+    nobs(m1) == nobs(m2) || return false
+
+    size(modelmatrix(m1), 2) <= size(modelmatrix(m2), 2) || return false
+
+    _isnested(modelmatrix(m1), modelmatrix(m2)) || return false
+
+    Distribution(m1) == Distribution(m2) ||
+        throw(ArgumentError("Models must be fit to the same distribution"))
+
+    Link(m1) == Link(m2) ||
+        throw(ArgumentError("Models must have the same link function"))
+
+    return true
+end
+
+"""
+    _isnested(x::AbstractMatrix, y::AbstractMatrix; atol::Real=0.0)
+
+Test whether the column span of `x` is a subspace of (nested within)
+the column span of y.
+
+The nesting of the column span of the fixed-effects model matrices is a necessary,
+but not sufficient condition for a linear model (whether mixed-effects or not)
+to be nested within a linear mixed-effects model.
+
+!!! note
+    The `rtol` argument is an internal threshold and not currently
+    compatible with the `atol` argument of `StatsModels.isnested`.
+"""
+function _isnested(x::AbstractMatrix, y::AbstractMatrix; rtol=1e-8, ranktol=1e-8)
+
+    # technically this can return false positives if x or y
+    # are rank deficient, but either they're rank deficient
+    # in the same way (b/c same data) and we don't care OR
+    # it's not the same data/fixef specification and we're
+    # extra conservative
+    size(x, 2) <= size(y, 2) || return false
+
+    qy = qr(y).Q
+
+    qrx = qr(x, Val(true))
+    dvec = abs.(diag(qrx.R))
+    fdv = first(dvec)
+    cmp = fdv * ranktol
+    r = searchsortedlast(dvec, cmp, rev=true)
+
+    p = qy' * x
+
+    nested = map(eachcol(p)) do col
+        # if set Julia 1.6 as the minimum, we can use last(col, r)
+        top = @view col[firstindex(col):(end-r-1)]
+        tail = @view col[(end-r):end]
+        return norm(tail) / norm(top) < rtol
+    end
+
+    return all(nested)
+end
diff --git a/test/likelihoodratiotest.jl b/test/likelihoodratiotest.jl
@@ -1,3 +1,5 @@
+using DataFrames
+using GLM
 using MixedModels
 using Test
 
@@ -45,8 +47,13 @@ end
 
     fm0 = fit(MixedModel,@formula(reaction ~ 1 + (1+days|subj)),slp);
     fm1 = fit(MixedModel,@formula(reaction ~ 1 + days + (1+days|subj)),slp);
+    lm0 = lm(@formula(reaction ~ 1), slp)
+    lm1 = lm(@formula(reaction ~ 1 + days), slp)
 
-    lrt = likelihoodratiotest(fm0,fm1);
+    @test MixedModels._iscomparable(lm0, fm1)
+    @test !MixedModels._iscomparable(lm1, fm0)
+
+    lrt = likelihoodratiotest(fm0,fm1)
 
     @test [deviance(fm0), deviance(fm1)] == lrt.deviance
     @test deviance(fm0) - deviance(fm1) == only(lrt.tests.deviancediff)
@@ -58,19 +65,44 @@ end
     show(IOBuffer(),lrt);
     @test :pvalues in propertynames(lrt)
 
+    lrt = likelihoodratiotest(lm1,fm1)
+    @test lrt.deviance ≈ likelihoodratiotest(lm1.model,fm1).deviance
+    @test lrt.dof == [3, 6]
+    @test lrt.deviance ≈ -2 * loglikelihood.([lm1, fm1])
+    shown = sprint(show, lrt)
+    @test occursin("-2 logLik", shown)
+    @test !occursin("deviance", shown)
+
+    # non nested FE between non-mixed and mixed
+    @test_throws ArgumentError likelihoodratiotest(lm1, fm0)
 
     # mix of REML and ML
     fm0 = fit(MixedModel,@formula(reaction ~ 1 + (1+days|subj)),slp, REML=true);
     @test_throws ArgumentError likelihoodratiotest(fm0,fm1)
+    @test_throws ArgumentError likelihoodratiotest(lm0,fm0)
 
     # differing FE with REML
     fm1 = fit(MixedModel,@formula(reaction ~ 1 + days + (1+days|subj)),slp, REML=true);
 
     @test_throws ArgumentError likelihoodratiotest(fm0,fm1)
 
     contra = MixedModels.dataset(:contra);
+    # glm doesn't like categorical responses, so we convert it to numeric ourselves
+    # TODO: upstream fix
+    cc = DataFrame(contra);
+    cc.usenum = ifelse.(cc.use .== "Y", 1 , 0)
+    gmf = glm(@formula(usenum ~ 1+age+urban+livch), cc, Bernoulli());
+    gmf2 = glm(@formula(usenum ~ 1+age+abs2(age)+urban+livch), cc, Bernoulli());
     gm0 = fit(MixedModel, @formula(use ~ 1+age+urban+livch+(1|urban&dist)), contra, Bernoulli(), fast=true);
     gm1 = fit(MixedModel, @formula(use ~ 1+age+abs2(age)+urban+livch+(1|urban&dist)), contra, Bernoulli(), fast=true);
+
+    lrt = likelihoodratiotest(gmf, gm1)
+    @test [-2 * loglikelihood(gmf), deviance(gm1)] ≈ lrt.deviance
+    @test -2 * loglikelihood(gmf) - deviance(gm1) ≈ only(lrt.tests.deviancediff)
+    shown = sprint(show, lrt)
+    @test !occursin("-2 logLik", shown)
+    @test occursin("deviance", shown)
+
     lrt = likelihoodratiotest(gm0,gm1);
     @test [deviance(gm0), deviance(gm1)] == lrt.deviance
     @test deviance(gm0) - deviance(gm1) == only(lrt.tests.deviancediff)
@@ -82,9 +114,23 @@ end
 
     # mismatched links
     gm_probit = fit(MixedModel, @formula(use ~ 1+age+urban+livch+(1|urban&dist)), contra, Bernoulli(), ProbitLink(), fast=true);
-    @test_throws ArgumentError likelihoodratiotest(gm0,gm_probit)
+    @test_throws ArgumentError likelihoodratiotest(gmf, gm_probit)
+    @test_throws ArgumentError likelihoodratiotest(gm0, gm_probit)
 
     # mismatched families
     gm_poisson = fit(MixedModel, @formula(use ~ 1+age+urban+livch+(1|urban&dist)), contra, Poisson(), fast=true);
-    @test_throws ArgumentError MixedModels.likelihoodratiotest(gm0,gm_poisson)
+    @test_throws ArgumentError likelihoodratiotest(gmf, gm_poisson)
+    @test_throws ArgumentError likelihoodratiotest(gm0, gm_poisson)
+
+    @test !MixedModels._iscomparable(lm0, gm0)
+    @test !MixedModels._iscomparable(gmf, fm1)
+
+    @test MixedModels._iscomparable(gmf, gm0)
+    @test !MixedModels._iscomparable(gmf2, gm0)
+
+    @test MixedModels._isnested(gmf.mm.m, gm0.X)
+    @test !MixedModels._isnested(gmf2.mm.m, gm0.X)
+    # this skips the linear term so that the model matrices
+    # have the same column rank
+    @test !MixedModels._isnested(gmf2.mm.m[:,Not(2)], gm0.X)
 end