TulipaEnergy · datejada · Oct 22, 2024 · Oct 22, 2024 · Oct 22, 2024 · Oct 22, 2024
diff --git a/Project.toml b/Project.toml
@@ -13,6 +13,7 @@ DuckDB_jll = "2cbbab25-fc8b-58cf-88d4-687a02676033"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 ProgressBars = "49802e3a-d2f1-5c88-81d8-b72133a6f568"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [compat]
 CSV = "0.10"

diff --git a/src/TulipaClustering.jl b/src/TulipaClustering.jl
@@ -6,8 +6,10 @@ using Clustering
 using DataFrames
 using Distances
 using DuckDB
+using LinearAlgebra
 using ProgressBars
 using SparseArrays
+using Statistics
 
 include("structures.jl")
 include("io.jl")

diff --git a/src/cluster.jl b/src/cluster.jl
@@ -382,6 +382,92 @@ function append_period_from_source_df_as_rp!(
   append!(df, period_df)
 end
 
+"""
+    greedy_convex_hull(matrix; n_points, distance, initial_indices, mean_vector)
+
+  Greedy method for finding `n_points` points in a hull of the dataset. The points
+  are added iteratively, at each step the point that is the furthest away from the
+  hull of the current set of points is found and added to the hull.
+
+  - `matrix`: the clustering matrix
+  - `n_points`: number of hull points to find
+  - `distance`: distance semimetric
+  - `initial_indices`: initial points which must be added to the hull, can be nothing
+  - `mean_vector`: when adding the first point (if `initial_indices` is not given),
+      it will be chosen as the point furthest away from the `mean_vector`; this can be
+      nothing, in which case the first step will add a point furtherst away from
+      the centroid (the mean) of the dataset
+"""
+function greedy_convex_hull(
+  matrix::AbstractMatrix{Float64};
+  n_points::Int,
+  distance::SemiMetric,
+  initial_indices::Union{Vector{Int}, Nothing} = nothing,
+  mean_vector::Union{Vector{Float64}, Nothing} = nothing,
+)
+  # First resolve the points that are already in the hull given via `initial_indices`
+  if initial_indices ≡ nothing
+    if mean_vector ≡ nothing
+      mean_vector = vec(mean(matrix; dims = 2))
+    end
+    distances_from_mean = [distance(mean_vector, matrix[:, j]) for j in axes(matrix, 2)]
+    initial_indices = [argmax(distances_from_mean)]
+  end
+
+  # If there are more initial points than `n_points`, return the first `n_points`
+  if length(initial_indices) ≥ n_points
+    return initial_indices[1:n_points]
+  end
+
+  # Start filling in the remaining points
+  hull_indices = initial_indices
+  distances_cache = Dict{Int, Float64}()  # store previously computed distances
+  starting_index = length(initial_indices) + 1
+
+  for _ in starting_index:n_points
+    # Find the point that is the furthest away from the current hull
+    max_distance = -Inf
+    furthest_vector_index = nothing
+    hull_matrix = matrix[:, hull_indices]
+    projection_matrix = pinv(hull_matrix)
+    for column_index in axes(matrix, 2)
+      if column_index in hull_indices
+        continue
+      end
+      last_added_vector = matrix[:, last(hull_indices)]
+      target_vector = matrix[:, column_index]
+
+      # Check whether the distance was previosly computed
+      cached_distance = get(distances_cache, column_index, Inf)
+      if distance(target_vector, last_added_vector) ≥ cached_distance
+        d = cached_distance
+      else
+        subgradient = x -> hull_matrix' * (hull_matrix * x - target_vector)
+        x = projection_matrix * target_vector
+        x =
+          projected_subgradient_descent!(x; subgradient, projection = project_onto_simplex)
+        projected_target = hull_matrix * x
+        d = distance(projected_target, target_vector)
+        distances_cache[column_index] = d
+      end
+
+      if d > max_distance
+        max_distance = d
+        furthest_vector_index = column_index
+      end
+    end
+
+    # If no point is found for some reason, throw an error
+    if furthest_vector_index ≡ nothing
+      throw(ArgumentError("Point not found"))
+    end
+
+    # Add the found point to the hull
+    push!(hull_indices, furthest_vector_index)
+  end
+  return hull_indices
+end
+
 """
   find_representative_periods(
     clustering_data;
@@ -486,6 +572,78 @@ function find_representative_periods(
     rp_matrix = clustering_matrix[:, kmedoids_result.medoids]
     assignments = kmedoids_result.assignments
     aux.medoids = kmedoids_result.medoids
+  elseif method ≡ :convex_hull
+    # Do the clustering
+    hull_indices = greedy_convex_hull(clustering_matrix; n_points = n_rp, distance)
+
+    # Reinterpret the results
+    rp_matrix = clustering_matrix[:, hull_indices]
+    assignments = [
+      argmin([
+        distance(clustering_matrix[:, h], clustering_matrix[:, p]) for h in hull_indices
+      ]) for p in 1:n_complete_periods
+    ]
+    aux.medoids = hull_indices
+  elseif method ≡ :convex_hull_with_null
+    # Check if we can add null to the clustering matrix. The distance to null can
+    # be undefined, e.g., for the cosine distance.
+    is_distance_to_zero_undefined =
+      isnan(distance(zeros(size(clustering_matrix, 1), 1), clustering_matrix[:, 1]))
+
+    if is_distance_to_zero_undefined
+      throw(
+        ArgumentError(
+          "cannot add null to the clustering data because distance to it is undefined",
+        ),
+      )
+    end
+
+    # Add null to the clustering matrix
+    matrix = [clustering_matrix zeros(size(clustering_matrix, 1), 1)]
+
+    # Do the clustering
+    hull_indices =
+      greedy_convex_hull(matrix; n_points = n_rp + 1, distance, initial_indices = [1])
+
+    # Remove null from the beginning and shift all indices by one
+    popfirst!(hull_indices)
+    hull_indices .-= 1
+
+    # Reinterpret the results
+    rp_matrix = clustering_matrix[:, hull_indices]
+    assignments = [
+      argmin([
+        distance(clustering_matrix[:, h], clustering_matrix[:, p]) for h in hull_indices
+      ]) for p in 1:n_complete_periods
+    ]
+    aux.medoids = hull_indices
+  elseif method ≡ :conical_hull
+    # Do a gnomonic projection (normalization) of the data
+    normal_vector = vec(mean(clustering_matrix; dims = 2))
+    normalize!(normal_vector)
+    projection_coefficients = [
+      1.0 / dot(normal_vector, clustering_matrix[:, j]) for j in axes(clustering_matrix, 2)
+    ]
+    projected_matrix = [
+      clustering_matrix[i, j] * projection_coefficients[j] for
+      i in axes(clustering_matrix, 1), j in axes(clustering_matrix, 2)
+    ]
+
+    # Do the clustering
+    hull_indices = greedy_convex_hull(
+      projected_matrix;
+      n_points = n_rp,
+      distance,
+      mean_vector = normal_vector,
+    )
+
+    # Reinterpret the results
+    rp_matrix = clustering_matrix[:, hull_indices]
+    assignments = [
+      argmin([
+        distance(clustering_matrix[:, h], clustering_matrix[:, p]) for h in hull_indices
+      ]) for p in 1:n_complete_periods
+    ]
   else
     throw(ArgumentError("Clustering method is not supported"))
   end

diff --git a/src/weight_fitting.jl b/src/weight_fitting.jl
@@ -214,7 +214,6 @@ function fit_rep_period_weights!(
     # TODO: this can be parallelized; investigate
     target_vector = clustering_matrix[:, period]
     subgradient = (x) -> rp_matrix' * (rp_matrix * x - target_vector)
-    x = Vector(weight_matrix[period, 1:n_rp])
     if weight_type == :conical_bounded
       x = vcat(Vector(weight_matrix[period, 1:(n_rp - 1)]), [0.0])
     else

diff --git a/test/test-cluster.jl b/test/test-cluster.jl
@@ -197,6 +197,22 @@ end
   end
 end
 
+@testset "Convex hull clustering" begin
+  @testset "Make sure that convex hull clustering finds the hull" begin
+    @test begin
+      clustering_data = DataFrame([
+        :period => repeat(1:2; inner = 4),
+        :timestep => repeat(1:2; inner = 2, outer = 2),
+        :technology => repeat(["Solar", "Nuclear"], 4),
+        :value => 5:12,
+      ])
+      clustering_result =
+        find_representative_periods(clustering_data, 2; method = :convex_hull)
+      clustering_result.weight_matrix == [1.0 0.0; 0.0 1.0]
+    end
+  end
+end
+
 @testset "Bad clustering method" begin
   @testset "Make sure that clustering fails when incorrect method is given" begin
     @test_throws ArgumentError begin
@@ -210,6 +226,23 @@ end
         find_representative_periods(clustering_data, 2; method = :bad_method, init = :kmcen)
     end
   end
+
+  @testset "Make sure that clustering fails with cosine distance" begin
+    @test_throws ArgumentError begin
+      clustering_data = DataFrame([
+        :period => repeat(1:2; inner = 4),
+        :timestep => repeat(1:2; inner = 2, outer = 2),
+        :technology => repeat(["Solar", "Nuclear"], 4),
+        :value => 5:12,
+      ])
+      clustering_result = find_representative_periods(
+        clustering_data,
+        2;
+        method = :convex_hull_with_null,
+        distance = CosineDist(),
+      )
+    end
+  end
 end
 
 @testset "Bad number of representative periods" begin
@@ -225,3 +258,27 @@ end
     @test_throws ArgumentError find_representative_periods(clustering_data, 3)
   end
 end
+
+@testset "Greedy convex hull" begin
+  @testset "Test the case where points cannot be found" begin
+    @test_throws ArgumentError TulipaClustering.greedy_convex_hull(
+      [1.0 0.0; 0.0 1.0];
+      n_points = 10,
+      distance = Euclidean(),
+      initial_indices = [1, 2],
+      mean_vector = nothing,
+    )
+  end
+
+  @testset "Test the case when there are more initial indices than points" begin
+    @test size(
+      TulipaClustering.greedy_convex_hull(
+        [1.0 0.0; 0.0 1.0];
+        n_points = 1,
+        distance = Euclidean(),
+        initial_indices = [1, 2],
+        mean_vector = nothing,
+      ),
+    ) == (1,)
+  end
+end
diff --git a/test/test-weight-fitting.jl b/test/test-weight-fitting.jl
@@ -88,9 +88,8 @@ end
         clustering_data,
         10;
         drop_incomplete_last_period = false,
-        method = :k_means,
+        method = :convex_hull,
         distance = SqEuclidean(),
-        init = :kmcen,
       )
       TulipaClustering.fit_rep_period_weights!(
         clustering_result;
@@ -110,16 +109,17 @@ end
         clustering_data,
         10;
         drop_incomplete_last_period = false,
-        method = :k_means,
+        method = :convex_hull_with_null,
         distance = SqEuclidean(),
-        init = :kmcen,
       )
       TulipaClustering.fit_rep_period_weights!(
         clustering_result;
         weight_type = :conical_bounded,
         niters = 5,
+        show_progress = true,
       )
-      all(sum(clustering_result.weight_matrix[1:(end - 1), :]; dims = 2) .≤ 1.0)
+      total_weights = sum(clustering_result.weight_matrix[1:(end - 1), :]; dims = 2)
+      all((total_weights .< 1.0) .|| (total_weights .≈ 1.0))
     end
 
     @test begin
@@ -137,9 +137,9 @@ end
         clustering_result;
         weight_type = :conical_bounded,
         niters = 5,
-        show_progress = true,
       )
-      all(sum(clustering_result.weight_matrix[1:(end - 1), :]; dims = 2) .≤ 1.0)
+      total_weights = sum(clustering_result.weight_matrix[1:(end - 1), :]; dims = 2)
+      all((total_weights .< 1.0) .|| (total_weights .≈ 1.0))
     end
   end
 
@@ -151,8 +151,8 @@ end
         clustering_data,
         10;
         drop_incomplete_last_period = false,
-        method = :k_means,
-        distance = SqEuclidean(),
+        method = :conical_hull,
+        distance = CosineDist(),
         init = :kmcen,
       )
       TulipaClustering.fit_rep_period_weights!(