From afbb5130ab2773c4b72a3efb4737cf6c6f0c1b09 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Thu, 14 Sep 2023 09:41:05 -0500 Subject: [PATCH] backport VIF to 1.x release (#549) * [G]VIF (#548) * [G]VIF * add reference value source * more tests * glm tests (cherry picked from commit b1ba4c5fdd5030b98a8cf9fe9c46319e5f5eb20e) * fix formula implementation * version bump --- Project.toml | 4 ++-- src/GLM.jl | 8 +++++--- src/linpred.jl | 2 +- test/runtests.jl | 43 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 6 deletions(-) diff --git a/Project.toml b/Project.toml index b7ee4ce2..c3d4318e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "GLM" uuid = "38e38edf-8417-5370-95a0-9cbb8c7f171a" -version = "1.8.3" +version = "1.9.0" [deps] Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" @@ -26,7 +26,7 @@ SpecialFunctions = "0.6, 0.7, 0.8, 0.9, 0.10, 1, 2.0" StatsAPI = "1.4" StatsBase = "0.33.5, 0.34" StatsFuns = "0.6, 0.7, 0.8, 0.9, 1.0" -StatsModels = "0.6.23, 0.7" +StatsModels = "0.7.3" julia = "1.6" [extras] diff --git a/src/GLM.jl b/src/GLM.jl index 4b83d5dc..2231f62d 100644 --- a/src/GLM.jl +++ b/src/GLM.jl @@ -10,8 +10,10 @@ module GLM import Base: (\), convert, show, size import LinearAlgebra: cholesky, cholesky! import Statistics: cor - import StatsBase: coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual, - loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict, + using StatsAPI + import StatsBase: coef, coeftable, coefnames, confint, deviance, nulldeviance, dof, dof_residual, + loglikelihood, nullloglikelihood, nobs, stderror, vcov, + residuals, predict, predict!, fitted, fit, model_response, response, modelmatrix, r2, r², adjr2, adjr², PValue import StatsFuns: xlogy import SpecialFunctions: erfc, erfcinv, digamma, trigamma @@ -19,7 +21,7 @@ module GLM export coef, coeftable, confint, deviance, nulldeviance, dof, dof_residual, loglikelihood, nullloglikelihood, nobs, stderror, vcov, residuals, predict, fitted, fit, fit!, model_response, response, modelmatrix, r2, r², adjr2, adjr², - cooksdistance, hasintercept, dispersion + cooksdistance, hasintercept, dispersion, vif, gvif, termnames export # types diff --git a/src/linpred.jl b/src/linpred.jl index 770e4062..cb0ecf7d 100644 --- a/src/linpred.jl +++ b/src/linpred.jl @@ -263,7 +263,7 @@ response(obj::LinPredModel) = obj.rr.y fitted(m::LinPredModel) = m.rr.mu predict(mm::LinPredModel) = fitted(mm) -StatsModels.formula(obj::LinPredModel) = modelframe(obj).formula +StatsModels.formula(::LinPredModel) = throw(ArgumentError("model was fitted without a formula")) residuals(obj::LinPredModel) = residuals(obj.rr) """ diff --git a/test/runtests.jl b/test/runtests.jl index f55bb0c0..a8e0b9e2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1602,3 +1602,46 @@ end # 3. 44 / wt == y @test GLM.loglik_obs(Binomial(), y, μ, wt, ϕ) ≈ GLM.logpdf(Binomial(Int(wt), μ), 44) end + +@testset "[G]VIF" begin + # Reference values from car::vif in R: + # > library(car) + # > data(Duncan) + # > lm1 = lm(prestige ~ 1 + income + education, Duncan) + # > vif(lm1) + # income education + # 2.1049 2.1049 + # > lm2 = lm(prestige ~ 1 + income + education + type, Duncan) + # > vif(lm2) + # GVIF Df GVIF^(1/(2*Df)) + # income 2.209178 1 1.486330 + # education 5.297584 1 2.301648 + # type 5.098592 2 1.502666 + duncan = RDatasets.dataset("car", "Duncan") + lm1 = lm(@formula(Prestige ~ 1 + Income + Education), duncan) + @test termnames(lm1)[2] == coefnames(lm1) + @test vif(lm1) ≈ gvif(lm1) + + lm1_noform = lm(modelmatrix(lm1), response(lm1)) + @test vif(lm1) ≈ vif(lm1_noform) + @test_throws ArgumentError("model was fitted without a formula") gvif(lm1_noform) + + lm1log = lm(@formula(Prestige ~ 1 + exp(log(Income)) + exp(log(Education))), duncan) + @test termnames(lm1log)[2] == coefnames(lm1log) == ["(Intercept)", "exp(log(Income))", "exp(log(Education))"] + @test vif(lm1) ≈ vif(lm1log) + + gm1 = glm(modelmatrix(lm1), response(lm1), Normal()) + @test vif(lm1) ≈ vif(gm1) + + lm2 = lm(@formula(Prestige ~ 1 + Income + Education + Type), duncan) + @test termnames(lm2)[2] != coefnames(lm2) + @test gvif(lm2; scale=true) ≈ [1.486330, 2.301648, 1.502666] atol=1e-4 + + gm2 = glm(@formula(Prestige ~ 1 + Income + Education + Type), duncan, Normal()) + @test termnames(gm2)[2] != coefnames(gm2) + @test gvif(gm2; scale=true) ≈ [1.486330, 2.301648, 1.502666] atol=1e-4 + + # the VIF definition depends on modelmatrix, vcov and stderror returning valid + # values. It doesn't care about links, offsets, etc. as long as the model matrix, + # vcov matrix and stderrors are well defined. +end