Fix coeftable for saturated linear models (#458)

* Fix `coeftable` for saturated linear models

  `coeftable` failed for saturated `LinearModel`s because it tried to construct F and T distributions with zero degrees of freedom.
* Use Inf/1.0/0.0 rather than NaN
* Fix handling of rank-deficient models
* Bump minor version
JuliaStats · Apr 11, 2022 · 42a0d04 · 42a0d04 · nalimilan · Apr 11, 2022
1 parent b52461f
commit 42a0d04
Show file tree

Hide file tree

Showing 3 changed files with 80 additions and 4 deletions.
diff --git a/src/glmfit.jl b/src/glmfit.jl
@@ -528,7 +528,9 @@ function dispersion(m::AbstractGLM, sqr::Bool=false)
     r = m.rr
     if dispersion_parameter(r.d)
         wrkwt, wrkresid = r.wrkwt, r.wrkresid
-        s = sum(i -> wrkwt[i] * abs2(wrkresid[i]), eachindex(wrkwt, wrkresid)) / dof_residual(m)
+        dofr = dof_residual(m)
+        s = sum(i -> wrkwt[i] * abs2(wrkresid[i]), eachindex(wrkwt, wrkresid)) / dofr
+        dofr > 0 || return oftype(s, Inf)
         sqr ? s : sqrt(s)
     else
         one(eltype(r.mu))

diff --git a/src/lm.jl b/src/lm.jl
@@ -214,16 +214,24 @@ function adjr2(obj::LinearModel)
 end
 
 function dispersion(x::LinearModel, sqr::Bool=false)
-    ssqr = deviance(x.rr)/dof_residual(x)
+    dofr = dof_residual(x)
+    ssqr = deviance(x.rr)/dofr
+    dofr > 0 || return oftype(ssqr, Inf)
     return sqr ? ssqr : sqrt(ssqr)
 end
 
 function coeftable(mm::LinearModel; level::Real=0.95)
     cc = coef(mm)
+    dofr = dof_residual(mm)
     se = stderror(mm)
     tt = cc ./ se
-    p = ccdf.(Ref(FDist(1, dof_residual(mm))), abs2.(tt))
-    ci = se*quantile(TDist(dof_residual(mm)), (1-level)/2)
+    if dofr > 0
+        p = ccdf.(Ref(FDist(1, dofr)), abs2.(tt))
+        ci = se*quantile(TDist(dofr), (1-level)/2)
+    else
+        p = [isnan(t) ? NaN : 1.0 for t in tt]
+        ci = [isnan(t) ? NaN : -Inf for t in tt]
+    end
     levstr = isinteger(level*100) ? string(Integer(level*100)) : string(level*100)
     CoefTable(hcat(cc,se,tt,p,cc+ci,cc-ci),
               ["Coef.","Std. Error","t","Pr(>|t|)","Lower $levstr%","Upper $levstr%"],

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -125,6 +125,7 @@ end
     @test isapprox(coef(m2p), [0.9772643585228885, 8.903341608496437, 3.027347397503281,
         3.9661379199401257, 5.079410103608552, 6.1944618141188625, 0.0, 7.930328728005131,
         8.879994918604757, 2.986388408421915, 10.84972230524356, 11.844809275711485])
+    @test all(isnan, hcat(coeftable(m2p).cols[2:end]...)[7,:])
 
     m2p_dep_pos = fit(LinearModel, Xmissingcell, ymissingcell, true)
     @test_logs (:warn, "Positional argument `allowrankdeficient` is deprecated, use keyword " *
@@ -141,6 +142,71 @@ end
     @test isapprox(coef(m2p_dep_pos_kw), coef(m2p))
 end
 
+@testset "saturated linear model" begin
+    df = DataFrame(x=["a", "b", "c"], y=[1, 2, 3])
+    model = lm(@formula(y ~ x), df)
+    ct = coeftable(model)
+    @test dof_residual(model) == 0
+    @test dof(model) == 4
+    @test isinf(GLM.dispersion(model.model))
+    @test coef(model) ≈ [1, 1, 2]
+    @test isequal(hcat(ct.cols[2:end]...),
+                  [Inf 0.0 1.0 -Inf Inf
+                   Inf 0.0 1.0 -Inf Inf
+                   Inf 0.0 1.0 -Inf Inf])
+
+    model = lm(@formula(y ~ 0 + x), df)
+    ct = coeftable(model)
+    @test dof_residual(model) == 0
+    @test dof(model) == 4
+    @test isinf(GLM.dispersion(model.model))
+    @test coef(model) ≈ [1, 2, 3]
+    @test isequal(hcat(ct.cols[2:end]...),
+                  [Inf 0.0 1.0 -Inf Inf
+                   Inf 0.0 1.0 -Inf Inf
+                   Inf 0.0 1.0 -Inf Inf])
+
+    model = glm(@formula(y ~ x), df, Normal(), IdentityLink())
+    ct = coeftable(model)
+    @test dof_residual(model) == 0
+    @test dof(model) == 4
+    @test isinf(GLM.dispersion(model.model))
+    @test coef(model) ≈ [1, 1, 2]
+    @test isequal(hcat(ct.cols[2:end]...),
+                  [Inf 0.0 1.0 -Inf Inf
+                   Inf 0.0 1.0 -Inf Inf
+                   Inf 0.0 1.0 -Inf Inf])
+
+    model = glm(@formula(y ~ 0 + x), df, Normal(), IdentityLink())
+    ct = coeftable(model)
+    @test dof_residual(model) == 0
+    @test dof(model) == 4
+    @test isinf(GLM.dispersion(model.model))
+    @test coef(model) ≈ [1, 2, 3]
+    @test isequal(hcat(ct.cols[2:end]...),
+                  [Inf 0.0 1.0 -Inf Inf
+                   Inf 0.0 1.0 -Inf Inf
+                   Inf 0.0 1.0 -Inf Inf])
+
+    # Saturated and rank-deficient model
+    df = DataFrame(x1=["a", "b", "c"], x2=["a", "b", "c"], y=[1, 2, 3])
+    model = lm(@formula(y ~ x1 + x2), df)
+    ct = coeftable(model)
+    @test dof_residual(model) == 0
+    @test dof(model) == 4
+    @test isinf(GLM.dispersion(model.model))
+    @test coef(model) ≈ [1, 1, 2, 0, 0]
+    @test isequal(hcat(ct.cols[2:end]...),
+                  [Inf 0.0 1.0 -Inf Inf
+                   Inf 0.0 1.0 -Inf Inf
+                   Inf 0.0 1.0 -Inf Inf
+                   NaN NaN NaN  NaN NaN
+                   NaN NaN NaN  NaN NaN])
+
+    # TODO: add tests similar to the one above once this model can be fitted
+    @test_broken glm(@formula(y ~ x1 + x2), df, Normal(), IdentityLink())
+end
+
 dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12],
     Outcome = categorical(repeat(string.('A':'C'), outer = 3)),
     Treatment = categorical(repeat(string.('a':'c'), inner = 3)))