Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sdmdata and models as namedtuple #9

Merged
merged 16 commits into from
Jul 5, 2024
7 changes: 4 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,17 @@ SpeciesDistributionModelsMakieExt = "Makie"
[compat]
CategoricalDistributions = "0.1.14"
MLJGLMInterface = "0.3.7"
Rasters = "0.10.1"
Rasters = "0.10.1, 0.11"
StatisticalMeasures = "0.1.5"
StatsModels = "0.7.3"
julia = "1.9"

[extras]
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"

[targets]
test = ["Distributions", "StableRNGs", "Test", "Makie"]
test = ["Distributions", "Makie", "MLJModels", "StableRNGs", "Test"]
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
module SpeciesDistributionModelsMakieExt
using Makie, SpeciesDistributionModels
import SpeciesDistributionModels as SDM
import SpeciesDistributionModels: model_names, machine_evaluations, sdm_machines, data,
import SpeciesDistributionModels: model_keys, machine_evaluations, sdm_machines, data,
_conf_mats_from_thresholds
import SpeciesDistributionModels: interactive_evaluation
import Statistics, Loess
Expand Down
8 changes: 4 additions & 4 deletions ext/SpeciesDistributionModelsMakieExt/plotrecipes.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
function _model_controls!(fig, ensemble)
toggles = [Makie.Toggle(fig, active = true) for i in 1:Base.length(ensemble)]
labels = [Makie.Label(fig, String(key)) for key in SDM.model_names(ensemble)]
labels = [Makie.Label(fig, String(key)) for key in SDM.model_keys(ensemble)]
g = Makie.grid!(hcat(toggles, labels))
return g, toggles
end


function Makie.boxplot(ev::SDMensembleEvaluation, measure::Symbol)
modelnames = Base.string.(model_names(ev.ensemble))
modelnames = collect(Base.string.(model_keys(ev.ensemble)))
f = Makie.Figure()

for (i, t) in enumerate((:train, :test))
Expand Down Expand Up @@ -55,13 +55,13 @@ function SDM.interactive_evaluation(ensemble; thresholds = 0:0.01:1)

idx_by_model = map(enumerate(ensemble)) do (i, e)
fill(i, Base.length(e))
end |> NamedTuple{Tuple(model_names(ensemble))}
end |> NamedTuple{Tuple(model_keys(ensemble))}

n_models = length(idx_by_model)

conf_mats = mapreduce(hcat, ensemble) do gr
map(gr) do sdm_machine
rows = sdm_machine.test_rows
rows = SDM.test_rows(sdm_machine)
y_hat = SDM.MLJBase.predict(sdm_machine.machine; rows)
y = data(sdm_machine).response[rows]
_conf_mats_from_thresholds(SDM.MLJBase.pdf.(y_hat, true), y, thresholds)
Expand Down
10 changes: 5 additions & 5 deletions src/SpeciesDistributionModels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ import GLM, PrettyTables, Rasters, EvoTrees, DecisionTree, Shapley, Loess
using MLJBase: pdf
using Rasters: Raster, RasterStack, Band
using ScientificTypesBase: Continuous, OrderedFactor, Multiclass, Count
using ComputationalResources: CPU1, CPUThreads, AbstractCPU
using ComputationalResources: CPU1, CPUThreads, AbstractCPU, CPUProcesses

using ScientificTypesBase: Continuous, OrderedFactor, Multiclass, Count
using StatisticalMeasures: auc, kappa, sensitivity, selectivity, accuracy
import MLJBase: StratifiedCV, CV, Holdout, ResamplingStrategy
import MLJBase: StratifiedCV, CV, Holdout, ResamplingStrategy, Machine, Probabilistic

export SDMensemble, predict, sdm, select, machines, machine_keys,
export SDMensemble, predict, sdm, sdmdata, select, machines, machine_keys,
remove_collinear,
explain, variable_importance, ShapleyValues,
SDMmachineExplanation, SDMgroupExplanation, SDMensembleExplanation,
Expand All @@ -23,13 +23,13 @@ export SDMensemble, predict, sdm, select, machines, machine_keys,
export auc, kappa, sensitivity, selectivity, accuracy,
Continuous, OrderedFactor, Multiclass, Count,
StratifiedCV, CV, Holdout, ResamplingStrategy

#include("learningnetwork.jl")
include("models.jl")
include("data_utils.jl")
include("resample.jl")
# export stubs for extensions
export interactive_response_curves, interactive_evaluation


include("collinearity.jl")
include("models.jl")
include("ensemble.jl")
Expand Down
133 changes: 115 additions & 18 deletions src/data_utils.jl
Original file line number Diff line number Diff line change
@@ -1,32 +1,129 @@
### Miscelanious utilities to deal with data issues such as names, missing values

# Convert a BitArray to a CategoricalArray. Faster and type-stable version of `categorical`
BooleanCategorical{N} = CategoricalArrays.CategoricalArray{Bool, N, UInt8} where N
function boolean_categorical(A::BitArray{N}) where N
CategoricalArrays.CategoricalArray{Bool, N, UInt8}(A, levels=[false, true], ordered=false)
BooleanCategorical{N}(A, levels=[false, true], ordered=false)
end
boolean_categorical(A::AbstractVector{Bool}) = boolean_categorical(BitArray(A))

function _get_predictor_names(p, a)
predictors = Base.intersect(Tables.schema(a).names, Tables.schema(p).names)
predictors = filter!(!=(:geometry), predictors) # geometry is never a variable
length(predictors) > 0 || error("Presence and absence data have no common variable names - can't fit the ensemble.")
return predictors

struct SDMdata{K}
predictor::NamedTuple
response::CategoricalArrays.CategoricalArray
geometry::Union{Nothing, Vector}
traintestpairs::MLJBase.TrainTestPairs
resampler::Union{Nothing, MLJBase.ResamplingStrategy}

function SDMdata(predictor::P, response, geometry, traintestpairs, resampler) where P<:NamedTuple{K} where K
new{K}(predictor, response, geometry, traintestpairs, resampler)
end
end

function Base.show(io::IO, mime::MIME"text/plain", data::SDMdata{K}) where K
y = response(data)
print("SDMdata object with ")
printstyled(sum(y), bold = true)
print(" presence points and ")
printstyled(length(y) - sum(y), bold = true)
print(" absence points. \n \n")

function _predictor_response_from_presence_absence(presences, absences, predictors)
p_columns = Tables.columns(presences)
a_columns = Tables.columns(absences)
n_presence = Tables.rowcount(p_columns)
n_absence = Tables.rowcount(a_columns)
printstyled("Resampling: \n", bold = true)
println("Data is divided into $(nfolds(data)) folds using resampling strategy $(resampler(data)).")

# merge presence and absence data into one namedtuple of vectors
predictor_values = NamedTuple{Tuple(predictors)}([[a_columns[var]; p_columns[var]] for var in predictors])
response_values = boolean_categorical([falses(n_absence); trues(n_presence)])
return predictor_values, response_values
n_presences = length.(getindex.(traintestpairs(data), 1))
n_absences = length.(getindex.(traintestpairs(data), 2))
table_cols = hcat(1:nfolds(data), n_presences, n_absences)
header = (["fold", "presences", "absences"])
PrettyTables.pretty_table(io, table_cols; header = header)

printstyled("Predictor variables: \n", bold = true)
Base.show(io, mime, MLJBase.schema(predictor(data)))

if isnothing(geometry(data))
print("Does not contain geometry data")
else
print("Also contains geometry data")
end
end


_gettrainrows(d::SDMdata, i) = d.traintestpairs[i][1]
_gettestrows(d::SDMdata, i) = d.traintestpairs[i][2]
predictor(d::SDMdata) = d.predictor
predictorkeys(d::SDMdata{K}) where K = K
response(d::SDMdata) = convert(AbstractArray{Bool}, d.response)
geometry(d::SDMdata) = d.geometry
traintestpairs(d::SDMdata) = d.traintestpairs
resampler(d::SDMdata) = d.resampler
nfolds(d::SDMdata) = length(d.traintestpairs)

function _sdmdata(presences, absences, resampler, ::Nothing)
predictorkeys = Tuple(Base.intersect(Tables.schema(presences).names, Tables.schema(absences).names))
length(predictorkeys) > 0 || error("Presence and absence data have no common variable names - can't fit the ensemble.")
_sdmdata(presences, absences, resampler, predictorkeys)
end

function _sdmdata(presences, absences, resampler, predictorkeys::NTuple{<:Any, <:Symbol})
X, y = _predictor_response_from_presence_absence(presences, absences, predictorkeys)
_sdmdata(X, y, resampler, predictorkeys)
end

# in case input is a table
function _sdmdata(X, response::BitVector, resampler, ::Nothing)
columns = Tables.columntable(X)
Tables.rowcount(columns) == length(response) || error("Number of rows in predictors and response do not match")
predictorkeys = Tables.columnnames(columns)
_sdmdata(columns, response, resampler, predictorkeys)
end

_sdmdata(X::Tables.ColumnTable{K}, y::BitVector, resampler, predictorkeys::NTuple{<:Any, <:Symbol}) where K =
if K == predictorkeys
_sdmdata(X, boolean_categorical(y), resampler)
else
_sdmdata(X[predictorkeys], boolean_categorical(y), resampler)
end

function _sdmdata(
X::Tables.ColumnTable,
y::BooleanCategorical,
resampler::CV,
)
shuffled_resampler = CV(; nfolds = resampler.nfolds, rng = resampler.rng, shuffle = true)
traintestpairs = MLJBase.train_test_pairs(shuffled_resampler, eachindex(y), X, y)
_sdmdata(X, y, traintestpairs, shuffled_resampler)
end
function _sdmdata(
X::Tables.ColumnTable,
y::BooleanCategorical,
resampler::MLJBase.ResamplingStrategy,
)
traintestpairs = MLJBase.train_test_pairs(resampler, eachindex(y), X, y)
_sdmdata(X, y, traintestpairs, resampler)
end

function _sdmdata(
X::Tables.ColumnTable,
y::BooleanCategorical,
traintestpairs::MLJBase.TrainTestPairs,
resampler = CustomRows()
)
geometries = :geometry ∈ keys(X) ? Tables.getcolumn(X, :geometry) : nothing
X = Base.structdiff(X, NamedTuple{(:geometry,)})
SDMdata(X, y, geometries, traintestpairs, resampler)
end

cpu_backend(threaded) = threaded ? CPUThreads() : CPU1()
_map(::CPU1) = Base.map
_map(::CPUThreads) = ThreadsX.map
_map(::CPUThreads) = ThreadsX.map


function _predictor_response_from_presence_absence(presences, absences, predictorkeys::NTuple{<:Any, <:Symbol})
p_columns = Tables.columns(presences)
a_columns = Tables.columns(absences)
n_presence = Tables.rowcount(p_columns)
n_absence = Tables.rowcount(a_columns)

# merge presence and absence data into one namedtuple of vectors
X = NamedTuple{predictorkeys}([[a_columns[var]; p_columns[var]] for var in predictorkeys])
y = [falses(n_absence); trues(n_presence)]
return (X, y)
end
Loading
Loading