Skip to content

Commit

Permalink
Merge branch 'master' of github.com:JuliaStats/GLM.jl into pa/resid
Browse files Browse the repository at this point in the history
  • Loading branch information
palday committed Sep 2, 2023
2 parents e9865be + c13577e commit c3fbc4a
Show file tree
Hide file tree
Showing 16 changed files with 775 additions and 282 deletions.
7 changes: 7 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/" # Location of package manifests
schedule:
interval: "daily"
4 changes: 2 additions & 2 deletions .github/workflows/CI-future.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@ jobs:
arch: ['x64']
steps:
- run: git config --global core.autocrlf false
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: julia-actions/setup-julia@v1
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- uses: actions/cache@v1
- uses: actions/cache@v3
env:
cache-name: cache-artifacts
with:
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/CI-stable.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@ jobs:
arch: ['x64']
steps:
- run: git config --global core.autocrlf false
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: julia-actions/setup-julia@v1
with:
version: ${{ matrix.version }}
arch: ${{ matrix.arch }}
- uses: actions/cache@v1
- uses: actions/cache@v3
env:
cache-name: cache-artifacts
with:
Expand All @@ -43,7 +43,7 @@ jobs:
- uses: julia-actions/julia-runtest@v1
- uses: julia-actions/julia-processcoverage@v1
if: ${{ matrix.os == 'ubuntu-latest' && matrix.version == '1' && matrix.arch == 'x64' }}
- uses: codecov/codecov-action@v1
- uses: codecov/codecov-action@v3
if: ${{ matrix.os == 'ubuntu-latest' && matrix.version == '1' && matrix.arch == 'x64' }}
with:
file: lcov.info
51 changes: 36 additions & 15 deletions .github/workflows/CompatHelper.yml
Original file line number Diff line number Diff line change
@@ -1,24 +1,45 @@
name: CompatHelper

on:
schedule:
- cron: '00 * * * *'

- cron: 0 0 * * *
workflow_dispatch:
permissions:
contents: write
pull-requests: write
jobs:
CompatHelper:
runs-on: ${{ matrix.os }}
strategy:
matrix:
julia-version: [1.2.0]
julia-arch: [x86]
os: [ubuntu-latest]
runs-on: ubuntu-latest
steps:
- uses: julia-actions/setup-julia@latest
- name: Check if Julia is already available in the PATH
id: julia_in_path
run: which julia
continue-on-error: true
- name: Install Julia, but only if it is not already available in the PATH
uses: julia-actions/setup-julia@v1
with:
version: ${{ matrix.julia-version }}
- name: Pkg.add("CompatHelper")
run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- name: CompatHelper.main()
version: '1'
arch: ${{ runner.arch }}
if: steps.julia_in_path.outcome != 'success'
- name: "Add the General registry via Git"
run: |
import Pkg
ENV["JULIA_PKG_SERVER"] = ""
Pkg.Registry.add("General")
shell: julia --color=yes {0}
- name: "Install CompatHelper"
run: |
import Pkg
name = "CompatHelper"
uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
version = "3"
Pkg.add(; name, uuid, version)
shell: julia --color=yes {0}
- name: "Run CompatHelper"
run: |
import CompatHelper
CompatHelper.main()
shell: julia --color=yes {0}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: julia -e 'using CompatHelper; CompatHelper.main()'
COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
# COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }}
2 changes: 1 addition & 1 deletion .github/workflows/Documenter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
name: Documentation
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: julia-actions/julia-buildpkg@latest
- uses: julia-actions/julia-docdeploy@latest
env:
Expand Down
26 changes: 24 additions & 2 deletions .github/workflows/TagBot.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,33 @@
name: TagBot
on:
schedule:
- cron: 0 * * * *
issue_comment:
types:
- created
workflow_dispatch:
inputs:
lookback:
default: 3
permissions:
actions: read
checks: read
contents: write
deployments: read
issues: read
discussions: read
packages: read
pages: read
pull-requests: read
repository-projects: read
security-events: read
statuses: read
jobs:
TagBot:
if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
runs-on: ubuntu-latest
steps:
- uses: JuliaRegistries/TagBot@v1
with:
token: ${{ secrets.GITHUB_TOKEN }}
# Edit the following line to reflect the actual name of the GitHub Secret containing your private key
ssh: ${{ secrets.DOCUMENTER_KEY }}
# ssh: ${{ secrets.NAME_OF_MY_SSH_PRIVATE_KEY_SECRET }}
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "GLM"
uuid = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
version = "1.8.1"
version = "2.0.0-DEV"

[deps]
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Expand All @@ -25,9 +25,9 @@ RDatasets = "0.5, 0.6, 0.7"
Reexport = "0.1, 0.2, 1.0"
SpecialFunctions = "0.6, 0.7, 0.8, 0.9, 0.10, 1, 2.0"
StatsAPI = "1.4"
StatsBase = "0.33.5"
StatsBase = "0.33.5, 0.34"
StatsFuns = "0.6, 0.7, 0.8, 0.9, 1.0"
StatsModels = "0.6.23"
StatsModels = "0.6.23, 0.7"
Tables = "1"
julia = "1.6"

Expand Down
1 change: 0 additions & 1 deletion docs/src/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ fit
StatsBase.deviance
GLM.dispersion
GLM.ftest
GLM.installbeta!
StatsBase.nobs
StatsBase.nulldeviance
StatsBase.predict
Expand Down
123 changes: 123 additions & 0 deletions docs/src/examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,129 @@ julia> round.(vcov(ols); digits=5)
0.38889 -0.16667
-0.16667 0.08333
```
By default, the `lm` method uses the Cholesky factorization, which is fast but numerically unstable, especially for ill-conditioned design matrices. The Cholesky method also aggressively detects multicollinearity. You can use the `method` keyword argument to apply the more stable (but slower) QR factorization method.

```jldoctest
julia> data = DataFrame(X=[1,2,3], Y=[2,4,7]);
julia> ols = lm(@formula(Y ~ X), data; method=:qr)
LinearModel
Y ~ 1 + X
Coefficients:
─────────────────────────────────────────────────────────────────────────
Coef. Std. Error t Pr(>|t|) Lower 95% Upper 95%
─────────────────────────────────────────────────────────────────────────
(Intercept) -0.666667 0.62361 -1.07 0.4788 -8.59038 7.25704
X 2.5 0.288675 8.66 0.0732 -1.16797 6.16797
─────────────────────────────────────────────────────────────────────────
```
The following example shows that QR decomposition works better for an ill-conditioned design matrix. The linear model fitted with the QR method is a better fit than the one fitted with the Cholesky method, since its estimated loglikelihood is higher.
Note that the condition number of the design matrix is quite high (≈ 3.52e7).

```
julia> X = [-0.4011512997627107 0.6368622664511552;
-0.0808472925693535 0.12835204623364604;
-0.16931095045225217 0.2687956795496601;
-0.4110745650568839 0.6526163576003452;
-0.4035951747670475 0.6407421349445884;
-0.4649907741370211 0.7382129928076485;
-0.15772708898883683 0.25040532268222715;
-0.38144358562952446 0.6055745630707645;
-0.1012787681395544 0.16078875117643368;
-0.2741403589052255 0.4352214984054432];
julia> y = [4.362866166172215,
0.8792840060172619,
1.8414020451091684,
4.470790758717395,
4.3894454833815395,
5.0571760643993455,
1.7154177874916376,
4.148527704012107,
1.1014936742570425,
2.9815131910316097];
julia> modelqr = lm(X, y; method=:qr)
LinearModel
Coefficients:
────────────────────────────────────────────────────────────────
Coef. Std. Error t Pr(>|t|) Lower 95% Upper 95%
────────────────────────────────────────────────────────────────
x1 5.00389 0.0560164 89.33 <1e-12 4.87472 5.13307
x2 10.0025 0.035284 283.48 <1e-16 9.92109 10.0838
────────────────────────────────────────────────────────────────
julia> modelchol = lm(X, y; method=:cholesky)
LinearModel
Coefficients:
────────────────────────────────────────────────────────────────
Coef. Std. Error t Pr(>|t|) Lower 95% Upper 95%
────────────────────────────────────────────────────────────────
x1 5.17647 0.0849184 60.96 <1e-11 4.98065 5.37229
x2 10.1112 0.053489 189.03 <1e-15 9.98781 10.2345
────────────────────────────────────────────────────────────────
julia> loglikelihood(modelqr) > loglikelihood(modelchol)
true
```
Since the Cholesky method with `dropcollinear = true` aggressively detects multicollinearity,
if you encounter multicollinearity in a GLM model fitted with the Cholesky method,
it is worth trying the same model with QR decomposition.
The following example is taken from *Introductory Econometrics: A Modern Approach, 7e* by Jeffrey M. Wooldridge.
The dataset is used to study the relationship between firm size—often measured by annual sales—and spending on
research and development (R&D).
The following shows that, for the given model,
the Cholesky method with `dropcollinear=true` detects multicollinearity in the design matrix
and hence, unlike the QR method, does not estimate all parameters.

```jldoctest
julia> y = [9.42190647125244, 2.084805727005, 3.9376676082611, 2.61976027488708, 4.04761934280395, 2.15384602546691,
2.66240668296813, 4.39475727081298, 5.74520826339721, 3.59616208076477, 1.54265284538269, 2.59368276596069,
1.80476510524749, 1.69270837306976, 3.04201245307922, 2.18389105796813, 2.73844122886657, 2.88134002685546,
2.46666669845581, 3.80616021156311, 5.12149810791015, 6.80378007888793, 3.73669862747192, 1.21332454681396,
2.54629635810852, 5.1612901687622, 1.86798071861267, 1.21465551853179, 6.31019830703735, 1.02669405937194,
2.50623273849487, 1.5936255455017];
julia> x = [4570.2001953125, 2830, 596.799987792968, 133.600006103515, 42, 390, 93.9000015258789, 907.900024414062,
19773, 39709, 2936.5, 2513.80004882812, 1124.80004882812, 921.599975585937, 2432.60009765625, 6754,
1066.30004882812, 3199.89990234375, 150, 509.700012207031, 1452.69995117187, 8995, 1212.30004882812,
906.599975585937, 2592, 201.5, 2617.80004882812, 502.200012207031, 2824, 292.200012207031, 7621, 1631.5];
julia> rdchem = DataFrame(rdintens=y, sales=x);
julia> mdl = lm(@formula(rdintens ~ sales + sales^2), rdchem; method=:cholesky)
LinearModel
rdintens ~ 1 + sales + :(sales ^ 2)
Coefficients:
───────────────────────────────────────────────────────────────────────────────────────
Coef. Std. Error t Pr(>|t|) Lower 95% Upper 95%
───────────────────────────────────────────────────────────────────────────────────────
(Intercept) 0.0 NaN NaN NaN NaN NaN
sales 0.000852509 0.000156784 5.44 <1e-05 0.000532313 0.00117271
sales ^ 2 -1.97385e-8 4.56287e-9 -4.33 0.0002 -2.90571e-8 -1.04199e-8
───────────────────────────────────────────────────────────────────────────────────────
julia> mdl = lm(@formula(rdintens ~ sales + sales^2), rdchem; method=:qr)
LinearModel
rdintens ~ 1 + sales + :(sales ^ 2)
Coefficients:
─────────────────────────────────────────────────────────────────────────────────
Coef. Std. Error t Pr(>|t|) Lower 95% Upper 95%
─────────────────────────────────────────────────────────────────────────────────
(Intercept) 2.61251 0.429442 6.08 <1e-05 1.73421 3.49082
sales 0.000300571 0.000139295 2.16 0.0394 1.56805e-5 0.000585462
sales ^ 2 -6.94594e-9 3.72614e-9 -1.86 0.0725 -1.45667e-8 6.7487e-10
─────────────────────────────────────────────────────────────────────────────────
```


## Probit regression
```jldoctest
Expand Down
10 changes: 10 additions & 0 deletions src/GLM.jl
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,23 @@ module GLM
pivoted_cholesky!(A; kwargs...) = cholesky!(A, RowMaximum(); kwargs...)
end

@static if VERSION < v"1.7.0"
pivoted_qr!(A; kwargs...) = qr!(A, Val(true); kwargs...)
else
pivoted_qr!(A; kwargs...) = qr!(A, ColumnNorm(); kwargs...)
end

const COMMON_FIT_KWARGS_DOCS = """
- `dropcollinear::Bool=true`: Controls whether or not a model matrix
less-than-full rank is accepted.
If `true` (the default) the coefficient for redundant linearly dependent columns is
`0.0` and all associated statistics are set to `NaN`.
Typically from a set of linearly-dependent columns the last ones are identified as redundant
(however, the exact selection of columns identified as redundant is not guaranteed).
- `method::Symbol=:cholesky`: Controls which decomposition method to use.
If `method=:cholesky` (the default), then the `Cholesky` decomposition method will be used.
If `method=:qr`, then the `QR` decomposition method (which is more stable
but slower) will be used.
- `wts::Vector=similar(y,0)`: Prior frequency (a.k.a. case) weights of observations.
Such weights are equivalent to repeating each observation a number of times equal
to its weight. Do note that this interpretation gives equal point estimates but
Expand Down
3 changes: 3 additions & 0 deletions src/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,6 @@ function Base.getproperty(mm::LinPredModel, f::Symbol)
return getfield(mm, f)
end
end

@deprecate installbeta!(p) (p.beta0 .= p.delbeta) false
@deprecate installbeta!(p, f) (p.beta0 .+= p.delbeta .* f) false
Loading

0 comments on commit c3fbc4a

Please sign in to comment.