backport VIF to 1.x release (#549)

* [G]VIF (#548) * [G]VIF * add reference value source * more tests * glm tests (cherry picked from commit b1ba4c5) * fix formula implementation * version bump
JuliaStats · Sep 14, 2023 · afbb513 · afbb513 · palday · Sep 14, 2023
1 parent d32f361
commit afbb513
Show file tree

Hide file tree

Showing 4 changed files with 51 additions and 6 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "GLM"
 uuid = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
-version = "1.8.3"
+version = "1.9.0"
 
 [deps]
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
@@ -26,7 +26,7 @@ SpecialFunctions = "0.6, 0.7, 0.8, 0.9, 0.10, 1, 2.0"
 StatsAPI = "1.4"
 StatsBase = "0.33.5, 0.34"
 StatsFuns = "0.6, 0.7, 0.8, 0.9, 1.0"
-StatsModels = "0.6.23, 0.7"
+StatsModels = "0.7.3"
 julia = "1.6"
 
 [extras]

diff --git a/src/GLM.jl b/src/GLM.jl
@@ -10,16 +10,18 @@ module GLM
     import Base: (\), convert, show, size
     import LinearAlgebra: cholesky, cholesky!
     import Statistics: cor
-    import StatsBase: coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual,
-                      loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict,
+    using StatsAPI
+    import StatsBase: coef, coeftable, coefnames, confint, deviance, nulldeviance, dof, dof_residual,
+                      loglikelihood, nullloglikelihood, nobs, stderror, vcov,
+                      residuals, predict, predict!,
                       fitted, fit, model_response, response, modelmatrix, r2, r², adjr2, adjr², PValue
     import StatsFuns: xlogy
     import SpecialFunctions: erfc, erfcinv, digamma, trigamma
     import StatsModels: hasintercept
     export coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual,
            loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict,
            fitted, fit, fit!, model_response, response, modelmatrix, r2, r², adjr2, adjr²,
-           cooksdistance, hasintercept, dispersion
+           cooksdistance, hasintercept, dispersion, vif, gvif, termnames
 
     export
         # types

diff --git a/src/linpred.jl b/src/linpred.jl
@@ -263,7 +263,7 @@ response(obj::LinPredModel) = obj.rr.y
 
 fitted(m::LinPredModel) = m.rr.mu
 predict(mm::LinPredModel) = fitted(mm)
-StatsModels.formula(obj::LinPredModel) = modelframe(obj).formula
+StatsModels.formula(::LinPredModel) = throw(ArgumentError("model was fitted without a formula"))
 residuals(obj::LinPredModel) = residuals(obj.rr)
 
 """

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1602,3 +1602,46 @@ end
     # 3. 44 / wt == y
     @test GLM.loglik_obs(Binomial(), y, μ, wt, ϕ) ≈ GLM.logpdf(Binomial(Int(wt), μ), 44)
 end
+
+@testset "[G]VIF" begin
+    # Reference values from car::vif in R:
+    # > library(car)
+    # > data(Duncan)
+    # > lm1 = lm(prestige ~ 1 + income + education, Duncan)
+    # > vif(lm1)
+    #    income education 
+    #    2.1049    2.1049 
+    # > lm2 = lm(prestige ~ 1 + income + education + type, Duncan)
+    # > vif(lm2)
+    #               GVIF Df GVIF^(1/(2*Df))
+    # income    2.209178  1        1.486330
+    # education 5.297584  1        2.301648
+    # type      5.098592  2        1.502666
+    duncan = RDatasets.dataset("car", "Duncan")
+    lm1 = lm(@formula(Prestige ~ 1 + Income + Education), duncan)
+    @test termnames(lm1)[2] == coefnames(lm1)
+    @test vif(lm1) ≈ gvif(lm1)
+
+    lm1_noform = lm(modelmatrix(lm1), response(lm1))
+    @test vif(lm1) ≈ vif(lm1_noform)
+    @test_throws ArgumentError("model was fitted without a formula") gvif(lm1_noform)
+
+    lm1log = lm(@formula(Prestige ~ 1 + exp(log(Income)) + exp(log(Education))), duncan)
+    @test termnames(lm1log)[2] == coefnames(lm1log) == ["(Intercept)", "exp(log(Income))", "exp(log(Education))"]
+    @test vif(lm1) ≈ vif(lm1log)
+
+    gm1 = glm(modelmatrix(lm1), response(lm1), Normal())
+    @test vif(lm1) ≈ vif(gm1)
+
+    lm2 = lm(@formula(Prestige ~ 1 + Income + Education + Type), duncan)
+    @test termnames(lm2)[2] != coefnames(lm2)
+    @test gvif(lm2; scale=true) ≈ [1.486330, 2.301648, 1.502666] atol=1e-4
+
+    gm2 = glm(@formula(Prestige ~ 1 + Income + Education + Type), duncan, Normal())
+    @test termnames(gm2)[2] != coefnames(gm2)
+    @test gvif(gm2; scale=true) ≈ [1.486330, 2.301648, 1.502666] atol=1e-4   
+
+    # the VIF definition depends on modelmatrix, vcov and stderror returning valid
+    # values. It doesn't care about links, offsets, etc. as long as the model matrix,
+    # vcov matrix and stderrors are well defined.
+end