Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

16 hull clustering #75

Merged
merged 4 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ DuckDB_jll = "2cbbab25-fc8b-58cf-88d4-687a02676033"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[compat]
CSV = "0.10"
Expand Down
2 changes: 2 additions & 0 deletions src/TulipaClustering.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@ using Clustering
using DataFrames
using Distances
using DuckDB
using LinearAlgebra
using ProgressBars
using SparseArrays
using Statistics

include("structures.jl")
include("io.jl")
Expand Down
158 changes: 158 additions & 0 deletions src/cluster.jl
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,92 @@ function append_period_from_source_df_as_rp!(
append!(df, period_df)
end

"""
greedy_convex_hull(matrix; n_points, distance, initial_indices, mean_vector)

Greedy method for finding `n_points` points in a hull of the dataset. The points
are added iteratively, at each step the point that is the furthest away from the
hull of the current set of points is found and added to the hull.

- `matrix`: the clustering matrix
- `n_points`: number of hull points to find
- `distance`: distance semimetric
- `initial_indices`: initial points which must be added to the hull, can be nothing
- `mean_vector`: when adding the first point (if `initial_indices` is not given),
it will be chosen as the point furthest away from the `mean_vector`; this can be
nothing, in which case the first step will add a point furtherst away from
the centroid (the mean) of the dataset
"""
function greedy_convex_hull(
matrix::AbstractMatrix{Float64};
n_points::Int,
distance::SemiMetric,
initial_indices::Union{Vector{Int}, Nothing} = nothing,
mean_vector::Union{Vector{Float64}, Nothing} = nothing,
)
# First resolve the points that are already in the hull given via `initial_indices`
if initial_indices ≡ nothing
if mean_vector ≡ nothing
mean_vector = vec(mean(matrix; dims = 2))
end
distances_from_mean = [distance(mean_vector, matrix[:, j]) for j in axes(matrix, 2)]
initial_indices = [argmax(distances_from_mean)]
end

# If there are more initial points than `n_points`, return the first `n_points`
if length(initial_indices) ≥ n_points
return initial_indices[1:n_points]
end

# Start filling in the remaining points
hull_indices = initial_indices
distances_cache = Dict{Int, Float64}() # store previously computed distances
starting_index = length(initial_indices) + 1

for _ in starting_index:n_points
# Find the point that is the furthest away from the current hull
max_distance = -Inf
furthest_vector_index = nothing
hull_matrix = matrix[:, hull_indices]
projection_matrix = pinv(hull_matrix)
for column_index in axes(matrix, 2)
if column_index in hull_indices
continue
end
last_added_vector = matrix[:, last(hull_indices)]
target_vector = matrix[:, column_index]

# Check whether the distance was previosly computed
cached_distance = get(distances_cache, column_index, Inf)
if distance(target_vector, last_added_vector) ≥ cached_distance
d = cached_distance
else
subgradient = x -> hull_matrix' * (hull_matrix * x - target_vector)
x = projection_matrix * target_vector
x =
projected_subgradient_descent!(x; subgradient, projection = project_onto_simplex)
projected_target = hull_matrix * x
d = distance(projected_target, target_vector)
distances_cache[column_index] = d
end

if d > max_distance
max_distance = d
furthest_vector_index = column_index
end
end

# If no point is found for some reason, throw an error
if furthest_vector_index ≡ nothing
throw(ArgumentError("Point not found"))
end

# Add the found point to the hull
push!(hull_indices, furthest_vector_index)
end
return hull_indices
end

"""
find_representative_periods(
clustering_data;
Expand Down Expand Up @@ -486,6 +572,78 @@ function find_representative_periods(
rp_matrix = clustering_matrix[:, kmedoids_result.medoids]
assignments = kmedoids_result.assignments
aux.medoids = kmedoids_result.medoids
elseif method ≡ :convex_hull
# Do the clustering
hull_indices = greedy_convex_hull(clustering_matrix; n_points = n_rp, distance)

# Reinterpret the results
rp_matrix = clustering_matrix[:, hull_indices]
assignments = [
argmin([
distance(clustering_matrix[:, h], clustering_matrix[:, p]) for h in hull_indices
]) for p in 1:n_complete_periods
]
aux.medoids = hull_indices
elseif method ≡ :convex_hull_with_null
# Check if we can add null to the clustering matrix. The distance to null can
# be undefined, e.g., for the cosine distance.
is_distance_to_zero_undefined =
isnan(distance(zeros(size(clustering_matrix, 1), 1), clustering_matrix[:, 1]))

if is_distance_to_zero_undefined
throw(
ArgumentError(
"cannot add null to the clustering data because distance to it is undefined",
),
)
end

# Add null to the clustering matrix
matrix = [clustering_matrix zeros(size(clustering_matrix, 1), 1)]

# Do the clustering
hull_indices =
greedy_convex_hull(matrix; n_points = n_rp + 1, distance, initial_indices = [1])

# Remove null from the beginning and shift all indices by one
popfirst!(hull_indices)
hull_indices .-= 1

# Reinterpret the results
rp_matrix = clustering_matrix[:, hull_indices]
assignments = [
argmin([
distance(clustering_matrix[:, h], clustering_matrix[:, p]) for h in hull_indices
]) for p in 1:n_complete_periods
]
aux.medoids = hull_indices
elseif method ≡ :conical_hull
# Do a gnomonic projection (normalization) of the data
normal_vector = vec(mean(clustering_matrix; dims = 2))
normalize!(normal_vector)
projection_coefficients = [
1.0 / dot(normal_vector, clustering_matrix[:, j]) for j in axes(clustering_matrix, 2)
]
projected_matrix = [
clustering_matrix[i, j] * projection_coefficients[j] for
i in axes(clustering_matrix, 1), j in axes(clustering_matrix, 2)
]

# Do the clustering
hull_indices = greedy_convex_hull(
projected_matrix;
n_points = n_rp,
distance,
mean_vector = normal_vector,
)

# Reinterpret the results
rp_matrix = clustering_matrix[:, hull_indices]
assignments = [
argmin([
distance(clustering_matrix[:, h], clustering_matrix[:, p]) for h in hull_indices
]) for p in 1:n_complete_periods
]
else
throw(ArgumentError("Clustering method is not supported"))
end
Expand Down
1 change: 0 additions & 1 deletion src/weight_fitting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,6 @@ function fit_rep_period_weights!(
# TODO: this can be parallelized; investigate
target_vector = clustering_matrix[:, period]
subgradient = (x) -> rp_matrix' * (rp_matrix * x - target_vector)
x = Vector(weight_matrix[period, 1:n_rp])
if weight_type == :conical_bounded
x = vcat(Vector(weight_matrix[period, 1:(n_rp - 1)]), [0.0])
else
Expand Down
57 changes: 57 additions & 0 deletions test/test-cluster.jl
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,22 @@ end
end
end

@testset "Convex hull clustering" begin
@testset "Make sure that convex hull clustering finds the hull" begin
@test begin
clustering_data = DataFrame([
:period => repeat(1:2; inner = 4),
:timestep => repeat(1:2; inner = 2, outer = 2),
:technology => repeat(["Solar", "Nuclear"], 4),
:value => 5:12,
])
clustering_result =
find_representative_periods(clustering_data, 2; method = :convex_hull)
clustering_result.weight_matrix == [1.0 0.0; 0.0 1.0]
end
end
end

@testset "Bad clustering method" begin
@testset "Make sure that clustering fails when incorrect method is given" begin
@test_throws ArgumentError begin
Expand All @@ -210,6 +226,23 @@ end
find_representative_periods(clustering_data, 2; method = :bad_method, init = :kmcen)
end
end

@testset "Make sure that clustering fails with cosine distance" begin
@test_throws ArgumentError begin
clustering_data = DataFrame([
:period => repeat(1:2; inner = 4),
:timestep => repeat(1:2; inner = 2, outer = 2),
:technology => repeat(["Solar", "Nuclear"], 4),
:value => 5:12,
])
clustering_result = find_representative_periods(
clustering_data,
2;
method = :convex_hull_with_null,
distance = CosineDist(),
)
end
end
end

@testset "Bad number of representative periods" begin
Expand All @@ -225,3 +258,27 @@ end
@test_throws ArgumentError find_representative_periods(clustering_data, 3)
end
end

@testset "Greedy convex hull" begin
@testset "Test the case where points cannot be found" begin
@test_throws ArgumentError TulipaClustering.greedy_convex_hull(
[1.0 0.0; 0.0 1.0];
n_points = 10,
distance = Euclidean(),
initial_indices = [1, 2],
mean_vector = nothing,
)
end

@testset "Test the case when there are more initial indices than points" begin
@test size(
TulipaClustering.greedy_convex_hull(
[1.0 0.0; 0.0 1.0];
n_points = 1,
distance = Euclidean(),
initial_indices = [1, 2],
mean_vector = nothing,
),
) == (1,)
end
end
18 changes: 9 additions & 9 deletions test/test-weight-fitting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,8 @@ end
clustering_data,
10;
drop_incomplete_last_period = false,
method = :k_means,
method = :convex_hull,
distance = SqEuclidean(),
init = :kmcen,
)
TulipaClustering.fit_rep_period_weights!(
clustering_result;
Expand All @@ -110,16 +109,17 @@ end
clustering_data,
10;
drop_incomplete_last_period = false,
method = :k_means,
method = :convex_hull_with_null,
distance = SqEuclidean(),
init = :kmcen,
)
TulipaClustering.fit_rep_period_weights!(
clustering_result;
weight_type = :conical_bounded,
niters = 5,
show_progress = true,
)
all(sum(clustering_result.weight_matrix[1:(end - 1), :]; dims = 2) .≤ 1.0)
total_weights = sum(clustering_result.weight_matrix[1:(end - 1), :]; dims = 2)
all((total_weights .< 1.0) .|| (total_weights .≈ 1.0))
end

@test begin
Expand All @@ -137,9 +137,9 @@ end
clustering_result;
weight_type = :conical_bounded,
niters = 5,
show_progress = true,
)
all(sum(clustering_result.weight_matrix[1:(end - 1), :]; dims = 2) .≤ 1.0)
total_weights = sum(clustering_result.weight_matrix[1:(end - 1), :]; dims = 2)
all((total_weights .< 1.0) .|| (total_weights .≈ 1.0))
end
end

Expand All @@ -151,8 +151,8 @@ end
clustering_data,
10;
drop_incomplete_last_period = false,
method = :k_means,
distance = SqEuclidean(),
method = :conical_hull,
distance = CosineDist(),
init = :kmcen,
)
TulipaClustering.fit_rep_period_weights!(
Expand Down