diff --git a/src/common.jl b/src/common.jl index 36c128daa..6520278b7 100644 --- a/src/common.jl +++ b/src/common.jl @@ -24,11 +24,11 @@ const RealFP = Union{Float32, Float64} const DepBool = Union{Bool, Nothing} function depcheck(fname::Symbol, b::DepBool) - if b == nothing + if b === nothing msg = "$fname will default to corrected=true in the future. Use corrected=false for previous behaviour." Base.depwarn(msg, fname) - false + return false else - b + return b end end diff --git a/src/deprecates.jl b/src/deprecates.jl index 9e409b33a..21b6c5f14 100644 --- a/src/deprecates.jl +++ b/src/deprecates.jl @@ -39,3 +39,9 @@ end ### Deprecated September 2019 @deprecate sum(A::AbstractArray, w::AbstractWeights, dims::Int) sum(A, w, dims=dims) @deprecate values(wv::AbstractWeights) convert(Vector, wv) + +### Deprecate August 2020 (v0.33) +function (ecdf::ECDF)(v::RealArray) + depwarn("(ecdf::ECDF)(v::RealArray) is deprecated, use `ecdf.(v)` broadcasting instead", "(ECDF::ecdf)(v::RealArray)") + return ecdf.(v) +end diff --git a/src/empirical.jl b/src/empirical.jl index 98ef7d91b..36fa82eea 100644 --- a/src/empirical.jl +++ b/src/empirical.jl @@ -1,68 +1,142 @@ # Empirical estimation of CDF and PDF -## Empirical CDF +""" +Empirical Cumulative Distribution Function (ECDF). + +Represents ECDF constructed from random variable samples by [`ecdf`](@ref). + +# Type parameters +* `T`: the type of random variable values +* `W`: the type of sample weights and CDF values +* `I`: boolean indicating whether to interpolate + the CDF between neighboring samples (continuous distribution) + or not (discrete distribution) +""" +struct ECDF{T <: Real, W <: Real, I} + # sorted random variable values and associated probabilities. + # The tuple provides: + # - `x[i]` (value of random variable) + # - `ECDF(x[i])` + # - `1/(x[i+1] - x[i])` + # - `ECDF(x[i+1]) - ECDF(x[i])` (the weight of `x[i+1]`) + sorted_values::Vector{Tuple{T, W, W, W}} +end + +# type of value weights and CDF values +weighttype(::Type{<:ECDF{<:Any, W}}) where W = W +weighttype(ecdf::ECDF) = weighttype(typeof(ecdf)) -struct ECDF{T <: AbstractVector{<:Real}, W <: AbstractWeights{<:Real}} - sorted_values::T - weights::W +# whether to interpolate between the neighboring ECDF values +isinterpolating(::Type{<:ECDF{<:Any, <:Any, I}}) where I = I +isinterpolating(ecdf::ECDF) = isinterpolating(typeof(ecdf)) + +function Base.show(io::IO, e::ECDF) + print(io, typeof(e)) + print(io, "(") + print(io, length(e.sorted_values)) + println(io, " values)") end +# calculate ECDF at given point function (ecdf::ECDF)(x::Real) isnan(x) && return NaN - n = searchsortedlast(ecdf.sorted_values, x) - evenweights = isempty(ecdf.weights) - weightsum = evenweights ? length(ecdf.sorted_values) : sum(ecdf.weights) - partialsum = evenweights ? n : sum(view(ecdf.weights, 1:n)) - partialsum / weightsum + pos = searchsortedlast(ecdf.sorted_values, x, by=first) + (pos == 0) && return zero(weighttype(ecdf)) + @inbounds val, cdf_val, val_invdelta, cdf_delta = ecdf.sorted_values[pos] + if !isinterpolating(ecdf) || (val == x) || (pos == length(ecdf.sorted_values)) + return cdf_val + else + return muladd(cdf_delta, min((x - val) * val_invdelta, one(weighttype(ecdf))), cdf_val) + end end -function (ecdf::ECDF)(v::RealVector) - evenweights = isempty(ecdf.weights) - weightsum = evenweights ? length(ecdf.sorted_values) : sum(ecdf.weights) - ord = sortperm(v) - m = length(v) - r = similar(ecdf.sorted_values, m) - r0 = zero(weightsum) - i = 1 - for (j, x) in enumerate(ecdf.sorted_values) - while i <= m && x > v[ord[i]] - r[ord[i]] = r0 - i += 1 - end - r0 += evenweights ? 1 : ecdf.weights[j] - if i > m - break - end - end - while i <= m - r[ord[i]] = weightsum - i += 1 +# broadcasts ecdf() over an array +# caches the last calculated value +function Base.Broadcast.broadcasted(ecdf::ECDF, v::AbstractArray) + res = similar(v, weighttype(ecdf)) + @inbounds for i in eachindex(v) + res[i] = i == 1 || v[i] != v[i-1] ? ecdf(v[i]) : res[i-1] end - return r / weightsum + return res end """ - ecdf(X; weights::AbstractWeights) + ecdf(X::RealVector; weights::AbstractWeights=nothing, interpolate::Bool=false) -> ECDF + +Construct an *empirical cumulative distribution function* (ECDF) based on a vector of samples (`X`). +Returns a callable [`ECDF`](@ref) object. -Return an empirical cumulative distribution function (ECDF) based on a vector of samples -given in `X`. Optionally providing `weights` returns a weighted ECDF. +# Example -Note: this function that returns a callable composite type, which can then be applied to -evaluate CDF values on other samples. +```jldoctest +julia> using StatsBase -`extrema`, `minimum`, and `maximum` are supported to for obtaining the range over which -function is inside the interval ``(0,1)``; the function is defined for the whole real line. +julia> X = 1:15 +1:15 + +julia> xcdf = ecdf(X) +ECDF{Int64,Float64,false}(15 values) + +julia> xcdf(9) +0.6 +``` + +## Keyword arguments +* `weights`: optional weights for each `X` sample +* `interpolate`: when enabled, `ECDF(y)` will linearly interpolate the cumulative density between + the neighboring `X` samples (``x_i \\leq y < x_{i+1}``), + otherwise (by default) `ECDF(x[i])` is returned + +## Notes +The returned function is defined for the whole real line. +Use [`extrema`](@ref), [`minimum`](@ref), and [`maximum`](@ref) to obtain the range +over which the ECDF is inside the interval ``(0, 1)``. """ -function ecdf(X::RealVector; weights::AbstractVector{<:Real}=Weights(Float64[])) +function ecdf(X::RealVector; + weights::Union{Nothing, RealVector}=nothing, + interpolate::Bool=false) any(isnan, X) && throw(ArgumentError("ecdf can not include NaN values")) - isempty(weights) || length(X) == length(weights) || throw(ArgumentError("data and weight vectors must be the same size," * - "got $(length(X)) and $(length(weights))")) + evenweights = isnothing(weights) || isempty(weights) + evenweights || (length(X) == length(weights)) || + throw(ArgumentError("data and weight vectors must be the same size, " * + "got $(length(X)) and $(length(weights))")) + T = eltype(X) + W0 = evenweights ? Int : eltype(weights) + W = isnothing(weights) ? Float64 : eltype(one(W0)/sum(weights)) + + wsum = evenweights ? length(X) : sum(weights) ord = sortperm(X) - ECDF(X[ord], isempty(weights) ? weights : Weights(weights[ord])) + + sorted_vals = sizehint!(Vector{Tuple{T, W, W, W}}(), length(X)) + isempty(X) && return ECDF{T, W, interpolate}(sorted_vals) + + valprev = val = X[first(ord)] + wsumprev = zero(W0) + valw = zero(W0) + + push_valprev!() = push!(sorted_vals, (valprev, min(wsumprev/wsum, one(W)), + inv(val - valprev), valw/wsum)) + + @inbounds for i in ord + valnew = X[i] + if (val != valnew) || (i == last(ord)) + (wsumprev > 0) && push_valprev!() + valprev = val + val = valnew + wsumprev += valw + valw = zero(W0) + end + valw += evenweights ? one(W0) : weights[i] + end + #@assert valw + wsumprev == wsum # may fail due to fp-arithmetic + (wsumprev > 0) && push_valprev!() + # last value + push!(sorted_vals, (val, one(W), zero(W), zero(W))) + return ECDF{T,W,interpolate}(sorted_vals) end -minimum(ecdf::ECDF) = first(ecdf.sorted_values) +minimum(ecdf::ECDF) = first(ecdf.sorted_values)[1] -maximum(ecdf::ECDF) = last(ecdf.sorted_values) +maximum(ecdf::ECDF) = last(ecdf.sorted_values)[1] extrema(ecdf::ECDF) = (minimum(ecdf), maximum(ecdf)) diff --git a/src/weights.jl b/src/weights.jl index e5df6b738..dfea3c07e 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -121,8 +121,8 @@ aweights(vs::RealArray) = AnalyticWeights(vec(vs)) s = w.sum if corrected - sum_sn = sum(x -> (x / s) ^ 2, w) - 1 / (s * (1 - sum_sn)) + sum_w2 = sum(abs2, w) + 1 / (s - sum_w2 / s) else 1 / s end diff --git a/test/empirical.jl b/test/empirical.jl index cb0317467..ce959889e 100644 --- a/test/empirical.jl +++ b/test/empirical.jl @@ -1,6 +1,12 @@ using StatsBase using Test +function showstring(obj) + iobuf = IOBuffer() + show(iobuf, obj) + return String(take!(iobuf)) +end + @testset "ECDF" begin x = randn(10000000) fnecdf = ecdf(x) @@ -10,10 +16,15 @@ using Test @test fnecdf(y) ≈ map(fnecdf, y) @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == extrema(x) fnecdf = ecdf([0.5]) + @test showstring(fnecdf) == "ECDF{Float64,Float64,false}(1 values)\n" + @test fnecdf(0.5) == 1.0 @test fnecdf([zeros(5000); ones(5000)]) == [zeros(5000); ones(5000)] @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == (0.5, 0.5) @test isnan(ecdf([1,2,3])(NaN)) @test_throws ArgumentError ecdf([1, NaN]) + fnecdf = ecdf(Int[]) + @test showstring(fnecdf) == "ECDF{$Int,Float64,false}(0 values)\n" + @test fnecdf.([0, 1, 2, 3]) == [0.0, 0.0, 0.0, 0.0] end @testset "Weighted ECDF" begin @@ -23,17 +34,25 @@ end fnecdf = ecdf(x, weights=w1) fnecdfalt = ecdf(x, weights=w2) @test fnecdf.sorted_values == fnecdfalt.sorted_values - @test fnecdf.weights == fnecdfalt.weights - @test fnecdf.weights != w1 # check that w wasn't accidently modified in place - @test fnecdfalt.weights != w2 + @test_skip fnecdf.weights == fnecdfalt.weights + @test_skip fnecdf.weights != w1 # check that w wasn't accidently modified in place + @test_skip fnecdfalt.weights != w2 y = [-1.96, -1.644854, -1.281552, -0.6744898, 0, 0.6744898, 1.281552, 1.644854, 1.96] @test isapprox(fnecdf(y), [0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975], atol=1e-3) @test isapprox(fnecdf(1.96), 0.975, atol=1e-3) @test fnecdf(y) ≈ map(fnecdf, y) @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == extrema(x) fnecdf = ecdf([1.0, 0.5], weights=weights([3, 1])) + @test fnecdf.([0.0, 0.5, 0.75, 1.0, 2.0]) == [0.0, 0.25, 0.25, 1.0, 1.0] @test fnecdf(0.75) == 0.25 @test extrema(fnecdf) == (minimum(fnecdf), maximum(fnecdf)) == (0.5, 1.0) + + fnecdfi = ecdf([1.0, 0.5], weights=weights([3, 1]), interpolate=true) + @test showstring(fnecdfi) == "ECDF{Float64,Float64,true}(2 values)\n" + @test fnecdfi.([0.0, 0.5, 0.75, 1.0, 2.0]) == [0.0, 0.25, 0.625, 1.0, 1.0] + @test fnecdfi(0.75) == 0.625 + @test extrema(fnecdfi) == (minimum(fnecdfi), maximum(fnecdfi)) == (0.5, 1.0) + @test_throws ArgumentError ecdf(rand(8), weights=weights(rand(10))) # Check frequency weights v = randn(100)