From 6670f3456bd9c2f2a09a51d010bb635040887a6b Mon Sep 17 00:00:00 2001 From: Greg Neustroev Date: Tue, 22 Oct 2024 11:31:07 +0200 Subject: [PATCH 1/4] Add hull clustering methods --- src/cluster.jl | 156 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) diff --git a/src/cluster.jl b/src/cluster.jl index ffe812d..0a00641 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -382,6 +382,92 @@ function append_period_from_source_df_as_rp!( append!(df, period_df) end +""" + greedy_convex_hull(matrix; n_points, distance, initial_indices, mean_vector) + + Greedy method for finding `n_points` points in a hull of the dataset. The points + are added iteratively, at each step the point that is the furthest away from the + hull of the current set of points is found and added to the hull. + + - `matrix`: the clustering matrix + - `n_points`: number of hull points to find + - `distance`: distance semimetric + - `initial_indices`: initial points which must be added to the hull, can be nothing + - `mean_vector`: when adding the first point (if `initial_indices` is not given), + it will be chosen as the point furthest away from the `mean_vector`; this can be + nothing, in which case the first step will add a point furtherst away from + the centroid (the mean) of the dataset +""" +function greedy_convex_hull( + matrix::AbstractMatrix{Float64}; + n_points::Int, + distance::SemiMetric, + initial_indices::Union{Vector{Int}, Nothing} = nothing, + mean_vector::Union{Vector{Float64}, Nothing} = nothing, +) + # First resolve the points that are already in the hull given via `initial_indices` + if initial_indices ≡ nothing + if mean_vector ≡ nothing + mean_vector = vec(mean(matrix; dims = 2)) + end + distances_from_mean = [distance(mean_vector, matrix[:, j]) for j in axes(matrix, 2)] + initial_indices = [argmax(distances_from_mean)] + end + + # If there are more initial points than `n_points`, return the first `n_points` + if length(initial_indices) ≥ n_points + return initial_indices[1:n_points] + end + + # Start filling in the remaining points + hull_indices = initial_indices + distances_cache = Dict{Int, Float64}() # store previously computed distances + starting_index = length(initial_indices) + 1 + + for _ in starting_index:n_points + # Find the point that is the furthest away from the current hull + max_distance = -Inf + furthest_vector_index = nothing + hull_matrix = matrix[:, hull_indices] + projection_matrix = pinv(hull_matrix) + for column_index in axes(matrix, 2) + if column_index ∈ hull_indices + continue + end + last_added_vector = matrix[:, last(hull_indices)] + target_vector = matrix[:, column_index] + + # Check whether the distance was previosly computed + cached_distance = get(distances_cache, column_index, Inf) + if distance(target_vector, last_added_vector) ≥ cached_distance + d = cached_distance + else + subgradient = x -> hull_matrix' * (hull_matrix * x - target_vector) + x = projection_matrix * target_vector + x = + projected_subgradient_descent!(x; subgradient, projection = project_onto_simplex) + projected_target = hull_matrix * x + d = distance(projected_target, target_vector) + distances_cache[column_index] = d + end + + if d > max_distance + max_distance = d + furthest_vector_index = column_index + end + end + + # If no point is found for some reason, throw an error + if furthest_vector_index ≡ nothing + throw(ArgumentError("Point not found")) + end + + # Add the found point to the hull + push!(hull_indices, furthest_vector_index) + end + return hull_indices +end + """ find_representative_periods( clustering_data; @@ -486,6 +572,76 @@ function find_representative_periods( rp_matrix = clustering_matrix[:, kmedoids_result.medoids] assignments = kmedoids_result.assignments aux.medoids = kmedoids_result.medoids + elseif method ≡ :convex_hull + # Do the clustering + hull_indices = greedy_convex_hull(clustering_matrix; n_points = n_rp, distance) + + # Reinterpret the results + rp_matrix = clustering_matrix[:, hull_indices] + assignments = [ + argmin([ + distance(clustering_matrix[:, h], clustering_matrix[:, p]) for h in hull_indices + ]) for p in 1:n_periods + ] + aux.medoids = hull_indices + elseif method ≡ :convex_hull_with_null + # Check if we can add null to the clustering matrix. The distance to null can + # be undefined, e.g., for the cosine distance. + is_distance_to_zero_undefined = + isnan(distance(zeros(size(clustering_matrix, 1), 1), clustering_matrix[:, 1])) + + if is_distance_to_zero_undefined + throw( + DomainError( + "cannot add null to the clustering data because distance to it is undefined", + ), + ) + end + + # Add null to the clustering matrix + matrix = [zeros(size(clustering_matrix, 1), 1) clustering_matrix] + + # Do the clustering + hull_indices = + greedy_convex_hull(matrix; n_points = n_rp + 1, distance, initial_indices = [1]) + popfirst!(hull_indices) + + # Reinterpret the results + rp_matrix = clustering_matrix[:, hull_indices] + assignments = [ + argmin([ + distance(clustering_matrix[:, h], clustering_matrix[:, p]) for h in hull_indices + ]) for p in 1:n_periods + ] + aux.medoids = hull_indices + elseif method ≡ :conical_hull + # Do a gnomonic projection (normalization) of the data + normal_vector = vec(mean(clustering_matrix; dims = 2)) + normalize!(normal_vector) + projection_coefficients = [ + 1.0 / dot(normal_vector, clustering_matrix[:, j]) for j in axes(clustering_matrix, 2) + ] + projected_matrix = [ + clustering_matrix[i, j] * projection_coefficients[j] for + i in axes(clustering_matrix, 1), j in axes(clustering_matrix, 2) + ] + + # Do the clustering + hull_indices = greedy_convex_hull( + projected_matrix; + n_points = n_rp, + distance, + mean_vector = normal_vector, + ) + + # Reinterpret the results + rp_matrix = clustering_matrix[:, hull_indices] + assignments = [ + argmin([ + distance(clustering_matrix[:, h], clustering_matrix[:, p]) for h in hull_indices + ]) for p in 1:n_periods + ] + aux.medoids = hull_indices else throw(ArgumentError("Clustering method is not supported")) end From 851c715a4b56262ba1af3d4bff3d81419cb781b9 Mon Sep 17 00:00:00 2001 From: Greg Neustroev Date: Tue, 22 Oct 2024 12:18:01 +0200 Subject: [PATCH 2/4] Add dependencies --- Project.toml | 1 + src/TulipaClustering.jl | 2 ++ 2 files changed, 3 insertions(+) diff --git a/Project.toml b/Project.toml index c9e40ec..10ef3fb 100644 --- a/Project.toml +++ b/Project.toml @@ -13,6 +13,7 @@ DuckDB_jll = "2cbbab25-fc8b-58cf-88d4-687a02676033" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] CSV = "0.10" diff --git a/src/TulipaClustering.jl b/src/TulipaClustering.jl index e220ef0..f5b16ed 100644 --- a/src/TulipaClustering.jl +++ b/src/TulipaClustering.jl @@ -6,8 +6,10 @@ using Clustering using DataFrames using Distances using DuckDB +using LinearAlgebra using ProgressBars using SparseArrays +using Statistics include("structures.jl") include("io.jl") From a5f0bfd8f7379a37fbb1eaab53999283e62c63ff Mon Sep 17 00:00:00 2001 From: Greg Neustroev Date: Tue, 22 Oct 2024 12:18:19 +0200 Subject: [PATCH 3/4] Add tests for new clustering methods --- test/test-cluster.jl | 30 ++++++++++++++++++++++++++++++ test/test-weight-fitting.jl | 6 +++--- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/test/test-cluster.jl b/test/test-cluster.jl index b257cc6..b80b355 100644 --- a/test/test-cluster.jl +++ b/test/test-cluster.jl @@ -197,6 +197,22 @@ end end end +@testset "Convex hull clustering" begin + @testset "Make sure that convex hull clustering finds the hull" begin + @test begin + clustering_data = DataFrame([ + :period => repeat(1:2; inner = 4), + :timestep => repeat(1:2; inner = 2, outer = 2), + :technology => repeat(["Solar", "Nuclear"], 4), + :value => 5:12, + ]) + clustering_result = + find_representative_periods(clustering_data, 2; method = :convex_hull) + clustering_result.weight_matrix == [1.0 0.0; 0.0 1.0] + end + end +end + @testset "Bad clustering method" begin @testset "Make sure that clustering fails when incorrect method is given" begin @test_throws ArgumentError begin @@ -225,3 +241,17 @@ end @test_throws ArgumentError find_representative_periods(clustering_data, 3) end end + +@testset "Greedy convex hull" begin + @testset "Test the case when there are more initial indices than points" begin + @test size( + TulipaClustering.greedy_convex_hull( + [1.0 0.0; 0.0 1.0]; + n_points = 1, + distance = Euclidean(), + initial_indices = [1, 2], + mean_vector = nothing, + ), + ) == (1,) + end +end diff --git a/test/test-weight-fitting.jl b/test/test-weight-fitting.jl index 58a4c37..9b597e9 100644 --- a/test/test-weight-fitting.jl +++ b/test/test-weight-fitting.jl @@ -88,7 +88,7 @@ end clustering_data, 10; drop_incomplete_last_period = false, - method = :k_means, + method = :convex_hull, distance = SqEuclidean(), init = :kmcen, ) @@ -110,7 +110,7 @@ end clustering_data, 10; drop_incomplete_last_period = false, - method = :k_means, + method = :convex_hull_with_null, distance = SqEuclidean(), init = :kmcen, ) @@ -151,7 +151,7 @@ end clustering_data, 10; drop_incomplete_last_period = false, - method = :k_means, + method = :conical_hull, distance = SqEuclidean(), init = :kmcen, ) From 28f74be024544984aacd49eed836efab125f6ca7 Mon Sep 17 00:00:00 2001 From: Greg Neustroev Date: Tue, 22 Oct 2024 13:48:45 +0200 Subject: [PATCH 4/4] Add tests --- src/cluster.jl | 16 +++++++++------- src/weight_fitting.jl | 1 - test/test-cluster.jl | 27 +++++++++++++++++++++++++++ test/test-weight-fitting.jl | 12 ++++++------ 4 files changed, 42 insertions(+), 14 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index 0a00641..47304b0 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -431,7 +431,7 @@ function greedy_convex_hull( hull_matrix = matrix[:, hull_indices] projection_matrix = pinv(hull_matrix) for column_index in axes(matrix, 2) - if column_index ∈ hull_indices + if column_index in hull_indices continue end last_added_vector = matrix[:, last(hull_indices)] @@ -581,7 +581,7 @@ function find_representative_periods( assignments = [ argmin([ distance(clustering_matrix[:, h], clustering_matrix[:, p]) for h in hull_indices - ]) for p in 1:n_periods + ]) for p in 1:n_complete_periods ] aux.medoids = hull_indices elseif method ≡ :convex_hull_with_null @@ -592,26 +592,29 @@ function find_representative_periods( if is_distance_to_zero_undefined throw( - DomainError( + ArgumentError( "cannot add null to the clustering data because distance to it is undefined", ), ) end # Add null to the clustering matrix - matrix = [zeros(size(clustering_matrix, 1), 1) clustering_matrix] + matrix = [clustering_matrix zeros(size(clustering_matrix, 1), 1)] # Do the clustering hull_indices = greedy_convex_hull(matrix; n_points = n_rp + 1, distance, initial_indices = [1]) + + # Remove null from the beginning and shift all indices by one popfirst!(hull_indices) + hull_indices .-= 1 # Reinterpret the results rp_matrix = clustering_matrix[:, hull_indices] assignments = [ argmin([ distance(clustering_matrix[:, h], clustering_matrix[:, p]) for h in hull_indices - ]) for p in 1:n_periods + ]) for p in 1:n_complete_periods ] aux.medoids = hull_indices elseif method ≡ :conical_hull @@ -639,9 +642,8 @@ function find_representative_periods( assignments = [ argmin([ distance(clustering_matrix[:, h], clustering_matrix[:, p]) for h in hull_indices - ]) for p in 1:n_periods + ]) for p in 1:n_complete_periods ] - aux.medoids = hull_indices else throw(ArgumentError("Clustering method is not supported")) end diff --git a/src/weight_fitting.jl b/src/weight_fitting.jl index e726d45..59a45d7 100644 --- a/src/weight_fitting.jl +++ b/src/weight_fitting.jl @@ -214,7 +214,6 @@ function fit_rep_period_weights!( # TODO: this can be parallelized; investigate target_vector = clustering_matrix[:, period] subgradient = (x) -> rp_matrix' * (rp_matrix * x - target_vector) - x = Vector(weight_matrix[period, 1:n_rp]) if weight_type == :conical_bounded x = vcat(Vector(weight_matrix[period, 1:(n_rp - 1)]), [0.0]) else diff --git a/test/test-cluster.jl b/test/test-cluster.jl index b80b355..11c80da 100644 --- a/test/test-cluster.jl +++ b/test/test-cluster.jl @@ -226,6 +226,23 @@ end find_representative_periods(clustering_data, 2; method = :bad_method, init = :kmcen) end end + + @testset "Make sure that clustering fails with cosine distance" begin + @test_throws ArgumentError begin + clustering_data = DataFrame([ + :period => repeat(1:2; inner = 4), + :timestep => repeat(1:2; inner = 2, outer = 2), + :technology => repeat(["Solar", "Nuclear"], 4), + :value => 5:12, + ]) + clustering_result = find_representative_periods( + clustering_data, + 2; + method = :convex_hull_with_null, + distance = CosineDist(), + ) + end + end end @testset "Bad number of representative periods" begin @@ -243,6 +260,16 @@ end end @testset "Greedy convex hull" begin + @testset "Test the case where points cannot be found" begin + @test_throws ArgumentError TulipaClustering.greedy_convex_hull( + [1.0 0.0; 0.0 1.0]; + n_points = 10, + distance = Euclidean(), + initial_indices = [1, 2], + mean_vector = nothing, + ) + end + @testset "Test the case when there are more initial indices than points" begin @test size( TulipaClustering.greedy_convex_hull( diff --git a/test/test-weight-fitting.jl b/test/test-weight-fitting.jl index 9b597e9..22c63f0 100644 --- a/test/test-weight-fitting.jl +++ b/test/test-weight-fitting.jl @@ -90,7 +90,6 @@ end drop_incomplete_last_period = false, method = :convex_hull, distance = SqEuclidean(), - init = :kmcen, ) TulipaClustering.fit_rep_period_weights!( clustering_result; @@ -112,14 +111,15 @@ end drop_incomplete_last_period = false, method = :convex_hull_with_null, distance = SqEuclidean(), - init = :kmcen, ) TulipaClustering.fit_rep_period_weights!( clustering_result; weight_type = :conical_bounded, niters = 5, + show_progress = true, ) - all(sum(clustering_result.weight_matrix[1:(end - 1), :]; dims = 2) .≤ 1.0) + total_weights = sum(clustering_result.weight_matrix[1:(end - 1), :]; dims = 2) + all((total_weights .< 1.0) .|| (total_weights .≈ 1.0)) end @test begin @@ -137,9 +137,9 @@ end clustering_result; weight_type = :conical_bounded, niters = 5, - show_progress = true, ) - all(sum(clustering_result.weight_matrix[1:(end - 1), :]; dims = 2) .≤ 1.0) + total_weights = sum(clustering_result.weight_matrix[1:(end - 1), :]; dims = 2) + all((total_weights .< 1.0) .|| (total_weights .≈ 1.0)) end end @@ -152,7 +152,7 @@ end 10; drop_incomplete_last_period = false, method = :conical_hull, - distance = SqEuclidean(), + distance = CosineDist(), init = :kmcen, ) TulipaClustering.fit_rep_period_weights!(