diff --git a/NEWS.md b/NEWS.md index af59f5710..011b1da77 100644 --- a/NEWS.md +++ b/NEWS.md @@ -36,6 +36,14 @@ Run-time formula syntax * Methods for `Base./(::AbstractTerm, ::AbstractTerm)` are added, allowing nesting syntax to be used with `Term`s at run-time as well [#470] +MixedModels v3.6.0 Release Notes +======================== +* Add `likelihoodratiotest` method for comparing non-mixed (generalized) linear models to (generalized) linear mixed models [#508]. + +MixedModels v3.5.2 Release Notes +======================== +* Explicitly deprecate vestigial `named` kwarg in `ranef` in favor of `raneftables` [#507]. + MixedModels v3.5.1 Release Notes ======================== * Fix MIME show methods for models with random-effects not corresponding to a fixed effect [#501]. @@ -223,3 +231,5 @@ Package dependencies [#495]: https://github.com/JuliaStats/MixedModels.jl/issues/495 [#501]: https://github.com/JuliaStats/MixedModels.jl/issues/501 [#506]: https://github.com/JuliaStats/MixedModels.jl/issues/506 +[#507]: https://github.com/JuliaStats/MixedModels.jl/issues/507 +[#508]: https://github.com/JuliaStats/MixedModels.jl/issues/508 diff --git a/src/MixedModels.jl b/src/MixedModels.jl index 47c890287..c3f779d94 100644 --- a/src/MixedModels.jl +++ b/src/MixedModels.jl @@ -23,6 +23,7 @@ using Tables using LinearAlgebra: BlasFloat, BlasReal, HermOrSym, PosDefException, copytri! using Base: Ryu using GLM: Link, canonicallink +using StatsModels: TableRegressionModel using StatsFuns: log2π, normccdf diff --git a/src/likelihoodratiotest.jl b/src/likelihoodratiotest.jl index 47a0be5fa..81ea626ba 100644 --- a/src/likelihoodratiotest.jl +++ b/src/likelihoodratiotest.jl @@ -6,10 +6,12 @@ Results of MixedModels.likelihoodratiotest ## Fields * `formulas`: Vector of model formulae * `models`: NamedTuple of the `dof` and `deviance` of the models -* `tests`: NamedTuple of the sequential `dofdiff`, `deviancediff`, and resulting `pvalues` +* `tests`: NamedTuple of the sequential `dofdiff`, `deviancediff`, + and resulting `pvalues` ## Properties -* `deviance` +* `deviance` : note that this is actually -2 log likelihood for linear models + (i.e. without subtracting the constant for a saturated model) * `pvalues` """ @@ -17,6 +19,7 @@ struct LikelihoodRatioTest formulas::AbstractVector{String} models::NamedTuple{(:dof,:deviance)} tests::NamedTuple{(:dofdiff,:deviancediff,:pvalues)} + linear::Bool end Base.propertynames(lrt::LikelihoodRatioTest, private::Bool = false) = ( @@ -46,12 +49,24 @@ Base.getindex(lrt::LikelihoodRatioTest, s::Symbol) = getfield(lrt,s) """ likelihoodratiotest(m::MixedModel...) + likelihoodratiotest(m0::LinearModel, m::MixedModel...) + likelihoodratiotest(m0::GeneralizedLinearModel, m::MixedModel...) + likelihoodratiotest(m0::TableRegressionModel{LinearModel}, m::MixedModel...) + likelihoodratiotest(m0::TableRegressionModel{GeneralizedLinearModel}, m::MixedModel...) + Likeihood ratio test applied to a set of nested models. -Note that nesting of the models is not checked. It is incumbent on the user -to check this. This differs from [`StatsModels.lrtest`](@ref) as nesting in -mixed models, especially in the random effects specification, may be non obvious. +!!! note + The nesting of the models is not checked. It is incumbent on the user + to check this. This differs from [`StatsModels.lrtest`](@ref) as nesting in + mixed models, especially in the random effects specification, may be non obvious. + +!!! note + For comparisons between mixed and non-mixed models, the deviance for the non-mixed + model is taken to be -2 log likelihood, i.e. omitting the additive constant for the + fully saturated model. This is in line with the computation of the deviance for mixed + models. This functionality may be deprecated in the future in favor of [`StatsModels.lrtest`](@ref). """ @@ -81,7 +96,44 @@ function likelihoodratiotest(m::MixedModel...) LikelihoodRatioTest( formulas, (dof = dofs, deviance = devs), - (dofdiff = dofdiffs, deviancediff = devdiffs, pvalues = pvals) + (dofdiff = dofdiffs, deviancediff = devdiffs, pvalues = pvals), + first(m) isa LinearMixedModel + ) +end + +_formula(::Union{LinearModel, GeneralizedLinearModel}) = "NA" +_formula(x::TableRegressionModel{<:Union{LinearModel, GeneralizedLinearModel}}) = String(Symbol(x.mf.f)) + +# for GLMMs we're actually looking at the deviance and additive constants are comparable +# (because GLM deviance is actually part of the GLMM deviance computation) +# for LMMs, we're always looking at the "deviance scale" but without the additive constant +# for the fully saturated model +_criterion(x::Union{GeneralizedLinearModel, TableRegressionModel{<:GeneralizedLinearModel}}) = deviance(x) +_criterion(x::Union{LinearModel, TableRegressionModel{<:LinearModel}}) = -2 * loglikelihood(x) + +function likelihoodratiotest(m0::Union{TableRegressionModel{<:Union{LinearModel, GeneralizedLinearModel}}, + LinearModel, GeneralizedLinearModel}, + m::MixedModel...) + _iscomparable(m0, first(m)) || + throw(ArgumentError("""Models are not comparable: are the objectives, data + and, where appropriate, the link and family the same? + """)) + lrt = likelihoodratiotest(m...) + devs = pushfirst!(lrt.deviance, _criterion(m0)) + formulas = pushfirst!(lrt.formulas, _formula(m0)) + dofs = pushfirst!(lrt.models.dof, dof(m0)) + devdiffs = pushfirst!(lrt.tests.deviancediff, devs[1] - devs[2]) + dofdiffs = pushfirst!(lrt.tests.dofdiff, dofs[2] - dofs[1]) + + df, dev = first(dofdiffs), first(devdiffs) + p = dev > 0 ? ccdf(Chisq(df), dev) : NaN + pvals = pushfirst!(lrt.tests.pvalues, p) + + LikelihoodRatioTest( + formulas, + (dof = dofs, deviance = devs), + (dofdiff = dofdiffs, deviancediff = devdiffs, pvalues = pvals), + lrt.linear ) end @@ -103,7 +155,7 @@ function Base.show(io::IO, ::MIME"text/plain", lrt::LikelihoodRatioTest) outrows[1, :] = ["", "model-dof", - "deviance", + lrt.linear ? "-2 logLik" : "deviance", "χ²", "χ²-dof", "P(>χ²)"] # colnms @@ -218,3 +270,90 @@ function StatsModels.isnested(m1::MixedModel, m2::MixedModel; atol::Real=0.0) true end + + +function _iscomparable(m1::TableRegressionModel{<:Union{LinearModel, GeneralizedLinearModel}}, + m2::MixedModel) + _iscomparable(m1.model, m2) || return false + + # check that the nested fixef are a subset of the outer + all(in.(coefnames(m1), Ref(coefnames(m2)))) || return false + + return true +end + +# GLM isn't nested with in LMM and LM isn't nested within GLMM +_iscomparable(m1::Union{LinearModel, GeneralizedLinearModel}, m2::MixedModel) = false + +function _iscomparable(m1::LinearModel, m2::LinearMixedModel) + nobs(m1) == nobs(m2) || return false + + # XXX This reaches into the internal structure of GLM + size(m1.pp.X, 2) <= size(m2.X, 2) || return false + + _isnested(m1.pp.X, m2.X) || return false + + !m2.optsum.REML || + throw(ArgumentError("REML-fitted models cannot be compared to linear models")) + + return true +end + +function _iscomparable(m1::GeneralizedLinearModel, m2::GeneralizedLinearMixedModel) + nobs(m1) == nobs(m2) || return false + + size(modelmatrix(m1), 2) <= size(modelmatrix(m2), 2) || return false + + _isnested(modelmatrix(m1), modelmatrix(m2)) || return false + + Distribution(m1) == Distribution(m2) || + throw(ArgumentError("Models must be fit to the same distribution")) + + Link(m1) == Link(m2) || + throw(ArgumentError("Models must have the same link function")) + + return true +end + +""" + _isnested(x::AbstractMatrix, y::AbstractMatrix; atol::Real=0.0) + +Test whether the column span of `x` is a subspace of (nested within) +the column span of y. + +The nesting of the column span of the fixed-effects model matrices is a necessary, +but not sufficient condition for a linear model (whether mixed-effects or not) +to be nested within a linear mixed-effects model. + +!!! note + The `rtol` argument is an internal threshold and not currently + compatible with the `atol` argument of `StatsModels.isnested`. +""" +function _isnested(x::AbstractMatrix, y::AbstractMatrix; rtol=1e-8, ranktol=1e-8) + + # technically this can return false positives if x or y + # are rank deficient, but either they're rank deficient + # in the same way (b/c same data) and we don't care OR + # it's not the same data/fixef specification and we're + # extra conservative + size(x, 2) <= size(y, 2) || return false + + qy = qr(y).Q + + qrx = qr(x, Val(true)) + dvec = abs.(diag(qrx.R)) + fdv = first(dvec) + cmp = fdv * ranktol + r = searchsortedlast(dvec, cmp, rev=true) + + p = qy' * x + + nested = map(eachcol(p)) do col + # if set Julia 1.6 as the minimum, we can use last(col, r) + top = @view col[firstindex(col):(end-r-1)] + tail = @view col[(end-r):end] + return norm(tail) / norm(top) < rtol + end + + return all(nested) +end diff --git a/test/likelihoodratiotest.jl b/test/likelihoodratiotest.jl index 80ec31096..8a4c58a78 100644 --- a/test/likelihoodratiotest.jl +++ b/test/likelihoodratiotest.jl @@ -1,3 +1,5 @@ +using DataFrames +using GLM using MixedModels using Test @@ -45,8 +47,13 @@ end fm0 = fit(MixedModel,@formula(reaction ~ 1 + (1+days|subj)),slp); fm1 = fit(MixedModel,@formula(reaction ~ 1 + days + (1+days|subj)),slp); + lm0 = lm(@formula(reaction ~ 1), slp) + lm1 = lm(@formula(reaction ~ 1 + days), slp) - lrt = likelihoodratiotest(fm0,fm1); + @test MixedModels._iscomparable(lm0, fm1) + @test !MixedModels._iscomparable(lm1, fm0) + + lrt = likelihoodratiotest(fm0,fm1) @test [deviance(fm0), deviance(fm1)] == lrt.deviance @test deviance(fm0) - deviance(fm1) == only(lrt.tests.deviancediff) @@ -58,10 +65,21 @@ end show(IOBuffer(),lrt); @test :pvalues in propertynames(lrt) + lrt = likelihoodratiotest(lm1,fm1) + @test lrt.deviance ≈ likelihoodratiotest(lm1.model,fm1).deviance + @test lrt.dof == [3, 6] + @test lrt.deviance ≈ -2 * loglikelihood.([lm1, fm1]) + shown = sprint(show, lrt) + @test occursin("-2 logLik", shown) + @test !occursin("deviance", shown) + + # non nested FE between non-mixed and mixed + @test_throws ArgumentError likelihoodratiotest(lm1, fm0) # mix of REML and ML fm0 = fit(MixedModel,@formula(reaction ~ 1 + (1+days|subj)),slp, REML=true); @test_throws ArgumentError likelihoodratiotest(fm0,fm1) + @test_throws ArgumentError likelihoodratiotest(lm0,fm0) # differing FE with REML fm1 = fit(MixedModel,@formula(reaction ~ 1 + days + (1+days|subj)),slp, REML=true); @@ -69,8 +87,22 @@ end @test_throws ArgumentError likelihoodratiotest(fm0,fm1) contra = MixedModels.dataset(:contra); + # glm doesn't like categorical responses, so we convert it to numeric ourselves + # TODO: upstream fix + cc = DataFrame(contra); + cc.usenum = ifelse.(cc.use .== "Y", 1 , 0) + gmf = glm(@formula(usenum ~ 1+age+urban+livch), cc, Bernoulli()); + gmf2 = glm(@formula(usenum ~ 1+age+abs2(age)+urban+livch), cc, Bernoulli()); gm0 = fit(MixedModel, @formula(use ~ 1+age+urban+livch+(1|urban&dist)), contra, Bernoulli(), fast=true); gm1 = fit(MixedModel, @formula(use ~ 1+age+abs2(age)+urban+livch+(1|urban&dist)), contra, Bernoulli(), fast=true); + + lrt = likelihoodratiotest(gmf, gm1) + @test [-2 * loglikelihood(gmf), deviance(gm1)] ≈ lrt.deviance + @test -2 * loglikelihood(gmf) - deviance(gm1) ≈ only(lrt.tests.deviancediff) + shown = sprint(show, lrt) + @test !occursin("-2 logLik", shown) + @test occursin("deviance", shown) + lrt = likelihoodratiotest(gm0,gm1); @test [deviance(gm0), deviance(gm1)] == lrt.deviance @test deviance(gm0) - deviance(gm1) == only(lrt.tests.deviancediff) @@ -82,9 +114,23 @@ end # mismatched links gm_probit = fit(MixedModel, @formula(use ~ 1+age+urban+livch+(1|urban&dist)), contra, Bernoulli(), ProbitLink(), fast=true); - @test_throws ArgumentError likelihoodratiotest(gm0,gm_probit) + @test_throws ArgumentError likelihoodratiotest(gmf, gm_probit) + @test_throws ArgumentError likelihoodratiotest(gm0, gm_probit) # mismatched families gm_poisson = fit(MixedModel, @formula(use ~ 1+age+urban+livch+(1|urban&dist)), contra, Poisson(), fast=true); - @test_throws ArgumentError MixedModels.likelihoodratiotest(gm0,gm_poisson) + @test_throws ArgumentError likelihoodratiotest(gmf, gm_poisson) + @test_throws ArgumentError likelihoodratiotest(gm0, gm_poisson) + + @test !MixedModels._iscomparable(lm0, gm0) + @test !MixedModels._iscomparable(gmf, fm1) + + @test MixedModels._iscomparable(gmf, gm0) + @test !MixedModels._iscomparable(gmf2, gm0) + + @test MixedModels._isnested(gmf.mm.m, gm0.X) + @test !MixedModels._isnested(gmf2.mm.m, gm0.X) + # this skips the linear term so that the model matrices + # have the same column rank + @test !MixedModels._isnested(gmf2.mm.m[:,Not(2)], gm0.X) end