Some work on bayesian binomial model

ncn-foreigners · Apr 19, 2024 · d3a1223 · d3a1223
1 parent 7f197bf
commit d3a1223
Show file tree

Hide file tree

Showing 8 changed files with 422 additions and 103 deletions.
diff --git a/Manifest.toml b/Manifest.toml
diff --git a/Project.toml b/Project.toml
@@ -8,6 +8,7 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
+Integrals = "de52edbc-65ea-441a-8357-d3a637375a31"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Optim = "429524aa-4258-5aef-a3af-852621145aeb"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"

diff --git a/src/UnobservedCountEstimation.jl b/src/UnobservedCountEstimation.jl
@@ -4,11 +4,12 @@ module UnobservedCountEstimation
 
 # Imports
 using Optim, Statistics, GLM, Distributions, DataFrames, SpecialFunctions, LinearAlgebra
+using Integrals
 
 include("zhang_likelihood.jl")
 include("original_model.jl")
 
-include("binomial_likelihood.jl")
+include("binomial_model_sampling.jl")
 include("binomial_model.jl")
 
 include("interface.jl")

diff --git a/src/binomial_likelihood.jl b/src/binomial_likelihood.jl
diff --git a/src/binomial_model.jl b/src/binomial_model.jl
@@ -1,4 +1,6 @@
-function binomial_model(m, N, n; start = "glm")
+function binomial_model(m, N, n; start = "glm", iter = 2000, 
+                        warm_up = floor(Int, iter / 2), grid,
+                        save_simulation = true)
     # TODO:: add X, Z arguments and then methods for type X/Z nothing or formula
     df = DataFrame(
         y = m,
@@ -12,29 +14,15 @@ function binomial_model(m, N, n; start = "glm")
     Z = ones(length(n))
     Z = Z[:, :]
 
-    log_l_f  = x -> log_lik_binomial_model(x[1:(end - 2)], x[end - 1], x[end], m, N, n, X, Z) * (-1.0)
-    grad_l_f = x -> grad_log_lik_binomial_model(x[1:(end - 2)], x[end - 1], x[end], m, N, n, X, Z) * (-1.0)
-    hes_l_f  = x -> hess_log_lik_binomial_model(x[1:(end - 2)], x[end - 1], x[end], m, N, n, X, Z) * (-1.0)
-
-    #= result   = optimize(log_l_f, [start[1], start[2], 1], Newton(); inplace = false)
-    result_1 = optimize(log_l_f, grad_l_f, [start[1], start[2], 1], Newton(); inplace = false) =#
-    # TODO :: dependent α, β
-
-    start = Float64[]
-    append!(start, zeros(length(N)))
+    start = [M]
 
     if start == "glm"
         append!(start, coef(glm(@formula(y ~ x1 + x2 + 0), df, Poisson(), LogLink())))
     else
         append!(start, coef(lm(@formula(log(y) ~ x1 + x2 + 0), df)))
-    end # end if
+    end # end
+
+    res = gibbs_sampler_binomial_model(start, grid, iter, n, N, m)
 
-    optim_problem = optimize(log_l_f, grad_l_f, hes_l_f, start, NewtonTrustRegion(); inplace = false)
-    α̂ = optim_problem.minimizer[end - 1]
-    β̂ = optim_problem.minimizer[end]
-    M̂ = optim_problem.minimizer[1:length(N)]
-    ξ̂ = N .^ α̂
-    #[coef(ols), coef(mm)]
-    #[start, log_l, grad_l, hes_l]
-    [[α̂, β̂, ξ̂, M̂, sum(ξ̂), sum(M̂)], optim_problem]
+    # return object with summary statistics and 
 end # end function
diff --git a/src/binomial_model_sampling.jl b/src/binomial_model_sampling.jl
@@ -0,0 +1,117 @@
+function sample_gamma_1_cond(grid, n, N, γ₂, M, m, μ_γ₁, μ_γ₂, ρ, σ_γ₁, σ_γ₂, ε = 1e-6)
+    # get posteriori normal parameters
+    μ_γ₁_post  = μ_γ₁ + ρ * σ_γ₂ / σ_γ₁ * (γ₂ - μ_γ₂)
+    σ_γ₁_post = sqrt(1 - ρ ^ 2) * σ_γ₁
+    distr = Normal(μ_γ₁, σ_γ₁)
+    # compute R_i's
+    #log_N_sq = log.(N) .^ 2
+    #= R = zeros(BigFloat, length(M))
+    next_iter = true
+    t = 0
+    while next_iter
+        println(t)
+        # Rprev = copy(R)
+        # iterate sum
+        R_add = cgf.(distr, (M .- t) .* log.(N)) .- logfactorial(t)
+        R .+= exp.(R_add) .* (-1) ^ t
+        println(R_add)
+        # check convergence
+        next_iter = any(exp.(R_add) .< ε) | t > 1
+        t += 1
+    end # end while
+    println(R) =#
+    f(x, p) = exp.(x .* M .* log.(N) - N .^ x) .* pdf(distr, x)
+    prob = IntegralProblem(f, [-Inf, Inf])
+    R = solve(prob, HCubatureJL(), reltol = ε, abstol = ε)
+    #println(R)
+    R .*= exp.(-logfactorial.(M)) # <--- if this fails this is probably why
+    #println(R)
+    #error("bcd")
+
+    # shift grid towards the mean since it is most probable
+    grid1 = grid .+ μ_γ₁_post
+    # get unscaled density function and evaluate it on a grid
+    function density_function(x)
+        μ = (N .^ x) .* ((n ./ N) .^ γ₂)
+        μ = 1 ./ (1 .+ exp.(-μ))
+        lξ = x .* log.(N)
+
+        #res = (μ .^ m) .* ((1 .- μ) .^ (M .- m))
+        res = zeros(BigFloat, length(M))
+        res .+= m .* log.(μ) .+ (M .- m) .* log.(1 .- μ) .- log.(R) .- logfactorial.(M .- m) .- exp.(lξ) .+ M .* lξ
+        #println(exp(sum(res)))
+        exp(sum(res)) .* pdf(Normal(μ_γ₁_post, σ_γ₁_post), x)
+    end # end funciton
+    evaluated_denisty = density_function.(grid1)
+    #println(evaluated_denisty)
+    evaluated_denisty ./= sum(evaluated_denisty)
+    # sample acording to evaluation
+    grid1[rand(Categorical(evaluated_denisty))]
+end # end funciton
+
+function sample_gamma_2_cond(grid, n, N, γ₁, M, m, μ_γ₁, μ_γ₂, ρ, σ_γ₁, σ_γ₂)
+    # get posteriori normal parameters
+    μ_γ₂_post  = μ_γ₂ + ρ * σ_γ₁ / σ_γ₂ * (γ₁ - μ_γ₁)
+    σ_γ₂_post = sqrt(1 - ρ ^ 2) * σ_γ₂
+    # shift grid towards the mean since it is most probable
+    grid1 = grid .- μ_γ₂_post
+    # get unscaled density function and evaluate it on a grid
+    function density_function(x)
+        μ = (N .^ γ₁) .* ((n ./ N) .^ x)
+        μ = 1 ./ (1 .+ exp.(-μ))
+        #ξ = N .^ γ₁
+
+        res = logfactorial.(M) .- logfactorial.(M .- m) .- logfactorial.(m) .+ m .* log.(μ).+ (M .- m) .* log.(1 .- μ)
+        exp(sum(res)) * pdf(Normal(μ_γ₂_post, σ_γ₂_post), x)
+    end # end funciton
+    evaluated_denisty = density_function.(grid1)
+    evaluated_denisty ./= sum(evaluated_denisty)
+    # sample acording to evaluation
+    grid1[rand(Categorical(evaluated_denisty))]
+end # end funciton
+
+function sample_M_cond(n, N, m, γ₁, γ₂)
+    # compute ξ, μ
+    μ = (N .^ γ₁) .* ((n ./ N) .^ γ₂)
+    μ = 1 ./ (1 .+ exp.(-μ))
+    ξ = N .^ γ₁
+    # draw M-m vector from poisson intependently
+    M_minus_m = reduce(vcat, rand.(Poisson.(ξ .* (1 .- μ)), 1))
+    # return M = (M-m) + increment
+    m + M_minus_m
+end # end funciton
+
+function gibbs_sampler_binomial_model(start, grid, iter, n, N, m, μ_γ₁, μ_γ₂, σ_γ₁, σ_γ₂, ρ, ε = 1e-6)
+    # create storage vectors
+    M  = start[1]
+    γ₁ = start[2]
+    γ₂ = start[3]
+
+    storage = [[M], [γ₁], [γ₂]]
+
+    for k in iter
+        # sample M  conditional on γ₁ and γ₂
+        M = sample_M_cond(n, N, m, γ₁, γ₂)
+        #println(M)
+        # sample γ₁ conditional on M  and γ₂
+        γ₁ = sample_gamma_1_cond(
+            grid, n, N, γ₂, 
+            M, m, μ_γ₁, μ_γ₂, 
+            ρ, σ_γ₁, σ_γ₂, ε
+        )
+        #println(γ₁)
+        # sample γ₂ conditional on γ₁ and M
+        γ₂ = sample_gamma_2_cond(
+            grid, n, N, γ₁, 
+            M, m, μ_γ₁, μ_γ₂, 
+            ρ, σ_γ₁, σ_γ₂
+        )
+        #println(γ₂)
+        # store them
+        append!(storage[1], M)
+        append!(storage[2], γ₁)
+        append!(storage[3], γ₂)
+    end # end for
+    # return stored values
+    storage
+end # end funciton
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -3,7 +3,7 @@ using Test
 # TODO:: CSV is only used for testing, maybe specify that in Project.toml
 using CSV, DataFrames
 
-@testset "UnobservedCountEstimation.jl" begin
+@testset "zhang_model.jl" begin
     df = CSV.read(pwd() * "/test_csv_zhang.csv", DataFrame)
 
     a = zhang_model(df[:, :m], df[:, :N], df[:, :n]; start = "glm")
@@ -15,3 +15,18 @@ using CSV, DataFrames
     @test b[1][4] ≈ sum(df[:, :ξ]) rtol=.075
     @test b[1][4] ≈ sum(df[:, :M]) rtol=.075
 end
+
+# for now
+#= @testset "binomial_model.jl" begin
+    df = CSV.read(pwd() * "/test_csv_binomial.csv", DataFrame)
+
+    a = binomial_model(df[:, :m], df[:, :N], df[:, :n]; start = "glm")
+    b = binomial_model(df[:, :m], df[:, :N], df[:, :n]; start =  "lm")
+
+    @test a[1][4] ≈ sum(df[:, :ξ]) rtol=.075
+    @test a[1][4] ≈ sum(df[:, :M]) rtol=.075
+
+    @test b[1][4] ≈ sum(df[:, :ξ]) rtol=.075
+    @test b[1][4] ≈ sum(df[:, :M]) rtol=.075
+end =#
+
diff --git a/test/test_csv_binomial.csv b/test/test_csv_binomial.csv
@@ -0,0 +1,21 @@
+N,n,m,M,ξ
+4946,501,616,907,902.4087332179555
+4886,480,661,965,893.6403502565424
+4935,506,608,887,900.8027961854027
+4943,488,598,895,901.9708212977752
+5079,535,647,943,921.7700017557884
+5054,516,587,897,918.1384813234845
+5027,521,658,922,914.2124012586877
+4890,507,604,862,894.2255764308815
+5022,490,593,868,913.4848871407867
+4928,477,589,888,899.7804635417899
+4983,513,636,930,907.8052912210046
+4968,485,595,917,905.6184666779144
+5144,543,716,970,931.195284188665
+4993,507,643,942,909.2624427471206
+4854,472,605,879,888.9550830898875
+4988,499,588,884,908.5339400169311
+4977,489,617,917,906.9307196114336
+5053,524,617,902,917.9931458823555
+4950,531,629,880,902.9925331538545
+4845,492,551,837,887.6362400289374