Commit d6cd6d0
[WIP] Updates relevant to the workflow/data pipeline
1 parent 7456ea6 commit d6cd6d0

9 files changed: +376 -5 lines

src/TulipaClustering.jl (+2)

@@ -12,8 +12,10 @@ using SparseArrays
 using Statistics
 
 include("structures.jl")
+include("data-validation.jl")
 include("io.jl")
 include("weight_fitting.jl")
 include("cluster.jl")
+include("convenience.jl")
 
 end

src/convenience.jl (new file, +145)

@@ -0,0 +1,145 @@
+export cluster!, dummy_cluster!, transform_wide_to_long!
+
+"""
+    cluster!(
+        connection,
+        period_duration,
+        num_rps;
+        input_profile_table_name = "input_profiles",
+        kwargs...,
+    )
+
+Convenience function to cluster the table named in `input_profile_table_name`
+using `period_duration` and `num_rps`. The resulting tables
+`cluster_profiles_rep_periods`, `cluster_rep_periods_mapping`, and
+`cluster_rep_periods_data` are loaded into `connection`, enriched with `year` information.
+
+This function extracts the table, then calls [`split_into_periods!`](@ref),
+[`find_representative_periods`](@ref), [`fit_rep_period_weights!`](@ref), and
+finally `write_clustering_result_to_tables`.
+"""
+function cluster!(
+    connection,
+    period_duration,
+    num_rps;
+    input_profile_table_name = "input_profiles",
+    drop_incomplete_last_period::Bool = false,
+    method::Symbol = :k_means,
+    distance::SemiMetric = SqEuclidean(),
+    weight_type::Symbol = :convex,
+    tol::Float64 = 1e-2,
+)
+    df = DuckDB.query(connection, "SELECT * FROM $input_profile_table_name") |> DataFrame
+    split_into_periods!(df; period_duration)
+    clusters =
+        find_representative_periods(df, num_rps; drop_incomplete_last_period, method, distance)
+    fit_rep_period_weights!(clusters; weight_type, tol)
+
+    write_clustering_result_to_tables(connection, clusters)
+
+    # Enrich the cluster_ data with year information because TulipaClustering
+    # currently does not support multi-year
+    years = [
+        row.year for
+        row in DuckDB.query(connection, "SELECT DISTINCT year FROM $input_profile_table_name")
+    ]
+    years_str = join(years, ", ")
+    for table_name in ("cluster_rep_periods_data", "cluster_rep_periods_mapping")
+        DuckDB.query(
+            connection,
+            "CREATE OR REPLACE TEMP TABLE t_new_$table_name AS
+            SELECT unnest([$years_str]) AS year, $table_name.*
+            FROM $table_name",
+        )
+        # DROP TABLE or DROP VIEW, depending on which one the name refers to
+        is_table =
+            only([
+                row.count for row in DuckDB.query(
+                    connection,
+                    "SELECT COUNT(*) AS count FROM duckdb_tables WHERE table_name='$table_name'",
+                )
+            ]) > 0
+        if is_table
+            DuckDB.query(connection, "DROP TABLE $table_name")
+        else
+            DuckDB.query(connection, "DROP VIEW $table_name")
+        end
+        DuckDB.query(connection, "ALTER TABLE t_new_$table_name RENAME TO $table_name")
+    end
+
+    return clusters
+end
+
+function dummy_cluster!(connection) end
+
+"""
+    transform_wide_to_long!(
+        connection,
+        wide_table_name,
+        long_table_name;
+    )
+
+Convenience function to convert a table in wide format to long format using DuckDB.
+Originally aimed at converting a profile table like the following:
+
+| year | timestep | name1 | name2 | ⋯ | name3 |
+| ---- | -------- | ----- | ----- | - | ----- |
+| 2030 | 1        | 1.0   | 2.5   | ⋯ | 0.0   |
+| 2030 | 2        | 1.5   | 2.6   | ⋯ | 0.0   |
+| 2030 | 3        | 2.0   | 2.6   | ⋯ | 0.0   |
+
+to a table like the following:
+
+| year | timestep | profile_name | value |
+| ---- | -------- | ------------ | ----- |
+| 2030 | 1        | name1        | 1.0   |
+| 2030 | 2        | name1        | 1.5   |
+| 2030 | 3        | name1        | 2.0   |
+| 2030 | 1        | name2        | 2.5   |
+| 2030 | 2        | name2        | 2.6   |
+| 2030 | 3        | name2        | 2.6   |
+| ⋮    | ⋮        | ⋮            | ⋮     |
+| 2030 | 1        | name3        | 0.0   |
+| 2030 | 2        | name3        | 0.0   |
+| 2030 | 3        | name3        | 0.0   |
+
+This conversion is done using the `UNPIVOT` SQL command from DuckDB.
+
+## Keyword arguments
+
+- `exclude_columns = ["year", "timestep"]`: Which columns to exclude from the conversion
+- `name_column = "profile_name"`: Name of the new column that contains the names of the old columns
+- `value_column = "value"`: Name of the new column that holds the values from the old columns
+"""
+function transform_wide_to_long!(
+    connection,
+    wide_table_name,
+    long_table_name;
+    exclude_columns = ["year", "timestep"],
+    name_column = "profile_name",
+    value_column = "value",
+)
+    @assert length(exclude_columns) > 0
+    exclude_str = join(exclude_columns, ", ")
+    DuckDB.query(
+        connection,
+        "CREATE TABLE $long_table_name AS
+        UNPIVOT $wide_table_name
+        ON COLUMNS(* EXCLUDE ($exclude_str))
+        INTO
+            NAME $name_column
+            VALUE $value_column
+        ORDER BY $name_column, $exclude_str
+        ",
+    )
+
+    return
+end

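For orientation, here is a minimal end-to-end sketch of the two new entry points. This is a hypothetical session, not part of the commit; the table name `t_wide` and the profile names are illustrative:

using DataFrames, DBInterface, DuckDB, TulipaClustering

connection = DBInterface.connect(DuckDB.DB)

# A small wide profile table: two profiles over 48 timesteps of one year
DuckDB.query(
    connection,
    "CREATE TABLE t_wide AS
    SELECT 2030 AS year, i AS timestep, sin(i) AS demand, cos(i) AS availability
    FROM generate_series(1, 48) AS s(i)",
)

# Wide -> long; the default name/value column names match what cluster! expects
transform_wide_to_long!(connection, "t_wide", "input_profiles")

# Split the 48 timesteps into 24-step periods and keep 2 representative periods
clusters = cluster!(connection, 24, 2)

# The year-enriched outputs land in the cluster_-prefixed tables
DuckDB.query(connection, "FROM cluster_rep_periods_data") |> DataFrame
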
src/data-validation.jl (new file, +74)

@@ -0,0 +1,74 @@
+"""
+    DataValidationException
+
+Exception related to data validation of the Tulipa Energy Model input data.
+"""
+mutable struct DataValidationException <: Exception
+    error_messages::Vector{String}
+end
+
+function Base.showerror(io::IO, ex::DataValidationException)
+    println(io, "DataValidationException: The following issues were found in the data:")
+    for error_message in ex.error_messages
+        println(io, "- " * error_message)
+    end
+end
+
+"""
+    validate_data!(connection)
+
+Validate that the required data in `connection` exists and is correct.
+Throws a `DataValidationException` if any error is found.
+"""
+function validate_data!(connection)
+    error_messages = String[]
+
+    for (log_msg, validation_function, fail_fast) in
+        (("has required tables and columns", _validate_required_tables_and_columns!, true),)
+        @debug log_msg
+        append!(error_messages, validation_function(connection))
+        if fail_fast && length(error_messages) > 0
+            break
+        end
+    end
+
+    if length(error_messages) > 0
+        throw(DataValidationException(error_messages))
+    end
+
+    return
+end
+
+function _validate_required_tables_and_columns!(connection)
+    error_messages = String[]
+    table_name = "input_profiles" # Only one table is required so far
+
+    columns_from_connection = [
+        row.column_name for row in DuckDB.query(
+            connection,
+            "SELECT column_name FROM duckdb_columns() WHERE table_name = '$table_name'",
+        )
+    ]
+    if length(columns_from_connection) == 0
+        # Make sure this is not a table that exists but happens to have no columns
+        has_table =
+            only([
+                row.count for row in DuckDB.query(
+                    connection,
+                    "SELECT COUNT(table_name) AS count FROM duckdb_tables() WHERE table_name = '$table_name'",
+                )
+            ]) == 1
+        if !has_table
+            push!(error_messages, "Table '$table_name' expected but not found")
+            return error_messages
+        end
+    end
+
+    for column in ["profile_name", "year", "timestep", "value"]
+        if !(column in columns_from_connection)
+            push!(error_messages, "Column '$column' is missing from table '$table_name'")
+        end
+    end
+
+    return error_messages
+end

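The intended failure mode, sketched as a hypothetical session (the missing-column scenario is illustrative; `validate_data!` is not exported in this commit, hence the qualified call):

using DBInterface, DuckDB, TulipaClustering

connection = DBInterface.connect(DuckDB.DB)
# Create the required table but leave out the required 'value' column
DuckDB.query(
    connection,
    "CREATE TABLE input_profiles AS SELECT 'name1' AS profile_name, 2030 AS year, 1 AS timestep",
)

try
    TulipaClustering.validate_data!(connection)
catch e
    # Expected output:
    # DataValidationException: The following issues were found in the data:
    # - Column 'value' is missing from table 'input_profiles'
    showerror(stdout, e)
end
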

src/io.jl (+7 -3)

@@ -27,12 +27,16 @@ function write_clustering_result_to_tables(
     connection,
     clustering_result::TulipaClustering.ClusteringResult,
 )
-    DuckDB.register_data_frame(connection, clustering_result.profiles, "profiles_rep_periods")
+    DuckDB.register_data_frame(
+        connection,
+        clustering_result.profiles,
+        "cluster_profiles_rep_periods",
+    )
 
     DuckDB.register_data_frame(
         connection,
         weight_matrix_to_df(clustering_result.weight_matrix),
-        "rep_periods_mapping",
+        "cluster_rep_periods_mapping",
     )
 
     aux = clustering_result.auxiliary_data
@@ -46,7 +50,7 @@ function write_clustering_result_to_tables(
             num_timesteps = period_duration,
             resolution = 1.0,
         ),
-        "rep_periods_data",
+        "cluster_rep_periods_data",
     )
     return nothing
 end

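(Side note: `DuckDB.register_data_frame` appears to expose these results as views rather than physical tables, which would explain why `cluster!` in src/convenience.jl probes `duckdb_tables` before choosing between `DROP TABLE` and `DROP VIEW` when replacing them.)
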
src/weight_fitting.jl (+2 -2)

@@ -175,7 +175,7 @@ function fit_rep_period_weights!(
     clustering_matrix::Matrix{Float64},
     rp_matrix::Matrix{Float64};
     weight_type::Symbol = :dirac,
-    tol::Float64 = 10e-3,
+    tol::Float64 = 1e-2,
     show_progress = false,
     args...,
 )
@@ -270,7 +270,7 @@ The arguments:
 function fit_rep_period_weights!(
     clustering_result::ClusteringResult;
     weight_type::Symbol = :dirac,
-    tol::Float64 = 10e-3,
+    tol::Float64 = 1e-2,
     args...,
 )
     fit_rep_period_weights!(

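Both literals denote the same Float64, so the default behavior is unchanged; `1e-2` simply states the magnitude directly:

julia> 10e-3 == 1e-2 == 0.01
true
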

test/runtests.jl (+2)

@@ -12,6 +12,8 @@ using TulipaIO
 const INPUT_FOLDER = joinpath(@__DIR__, "inputs")
 const OUTPUT_FOLDER = joinpath(@__DIR__, "outputs")
 
+include("utils.jl")
+
 #=
 Don't add your tests to runtests.jl. Instead, create files named

test/test-convenience.jl (new file, +85)

@@ -0,0 +1,85 @@
+@testset "Transform wide to long" begin
+    connection = DBInterface.connect(DuckDB.DB)
+    DuckDB.query(
+        connection,
+        "CREATE TABLE t_wide AS
+        SELECT
+            2030 AS year,
+            i AS timestep,
+            2.0 * i AS name1,
+            i * i AS name2,
+            0.0 AS name3,
+        FROM
+            generate_series(1, 24) AS s(i)
+        ",
+    )
+
+    transform_wide_to_long!(connection, "t_wide", "t_long")
+
+    df = DuckDB.query(
+        connection,
+        "FROM t_long
+        ORDER BY profile_name, year, timestep
+        ",
+    ) |> DataFrame
+    @test size(df) == (72, 4)
+    @test sort(names(df)) == ["profile_name", "timestep", "value", "year"]
+    @test df.value == [2.0 * (1:24); (1:24) .* (1:24); fill(0.0, 24)]
+end
+
+@testset "cluster!" begin
+    period_duration = 24
+    num_periods = 7
+    num_timesteps = period_duration * num_periods
+    num_rps = 4
+    profile_names = ["name1", "name2", "name3"]
+    years = [2030, 2050]
+
+    connection = _new_connection(; profile_names, years, num_timesteps)
+
+    clusters = cluster!(connection, period_duration, num_rps)
+
+    df_rep_periods_data =
+        DuckDB.query(
+            connection,
+            "FROM cluster_rep_periods_data
+            ORDER BY year, rep_period",
+        ) |> DataFrame
+    df_rep_periods_mapping =
+        DuckDB.query(
+            connection,
+            "FROM cluster_rep_periods_mapping
+            ORDER BY year, period, rep_period",
+        ) |> DataFrame
+    df_profiles_rep_periods =
+        DuckDB.query(
+            connection,
+            "FROM cluster_profiles_rep_periods
+            ORDER BY profile_name, year, rep_period, timestep",
+        ) |> DataFrame
+
+    @test sort(names(df_rep_periods_data)) ==
+          ["num_timesteps", "rep_period", "resolution", "year"]
+    @test sort(names(df_rep_periods_mapping)) == ["period", "rep_period", "weight", "year"]
+    @test sort(names(df_profiles_rep_periods)) ==
+          ["profile_name", "rep_period", "timestep", "value", "year"]
+
+    @test df_rep_periods_data.year == repeat(years; inner = num_rps)
+    @test df_rep_periods_data.rep_period == repeat(1:num_rps; outer = length(years))
+    @test all(df_rep_periods_data.resolution .== 1.0)
+    @test all(df_rep_periods_data.num_timesteps .== period_duration)
+
+    @test size(df_rep_periods_mapping, 1) ≥ length(years) * num_periods
+
+    @test df_profiles_rep_periods.profile_name ==
+          repeat(profile_names; inner = period_duration * num_rps * length(years))
+    @test df_profiles_rep_periods.year ==
+          repeat(years; inner = period_duration * num_rps, outer = length(profile_names))
+    @test df_profiles_rep_periods.rep_period == repeat(
+        1:num_rps;
+        inner = period_duration,
+        outer = length(profile_names) * length(years),
+    )
+    @test df_profiles_rep_periods.timestep ==
+          repeat(1:period_duration; outer = length(profile_names) * length(years) * num_rps)
+end

