Commit d6cd6d0
[WIP] Updates relevant to the workflow/data pipeline
1 parent 7456ea6 commit d6cd6d0

9 files changed: +376 -5 lines

src/TulipaClustering.jl (+2)

@@ -12,8 +12,10 @@ using SparseArrays
 using Statistics
 
 include("structures.jl")
+include("data-validation.jl")
 include("io.jl")
 include("weight_fitting.jl")
 include("cluster.jl")
+include("convenience.jl")
 
 end

src/convenience.jl (new file, +145)

@@ -0,0 +1,145 @@
+export cluster!, dummy_cluster!, transform_wide_to_long!
+
+"""
+    cluster!(
+        connection,
+        period_duration,
+        num_rps;
+        input_profile_table_name = "input_profiles",
+        kwargs...,
+    )
+
+Convenience function to cluster the table named in `input_profile_table_name`
+using `period_duration` and `num_rps`. The resulting tables
+`cluster_profiles_rep_periods`, `cluster_rep_periods_mapping`, and
+`cluster_rep_periods_data` are loaded into `connection`, enriched with `year` information.
+
+This function extracts the table, then calls [`split_into_periods!`](@ref),
+[`find_representative_periods`](@ref), [`fit_rep_period_weights!`](@ref), and
+finally `write_clustering_result_to_tables`.
+"""
+function cluster!(
+    connection,
+    period_duration,
+    num_rps;
+    input_profile_table_name = "input_profiles",
+    drop_incomplete_last_period::Bool = false,
+    method::Symbol = :k_means,
+    distance::SemiMetric = SqEuclidean(),
+    weight_type::Symbol = :convex,
+    tol::Float64 = 1e-2,
+)
+    df = DuckDB.query(connection, "SELECT * FROM $input_profile_table_name") |> DataFrame
+    split_into_periods!(df; period_duration)
+    clusters =
+        find_representative_periods(df, num_rps; drop_incomplete_last_period, method, distance)
+    fit_rep_period_weights!(clusters; weight_type, tol)
+
+    write_clustering_result_to_tables(connection, clusters)
+
+    # Enrich the cluster_ data with year information because TulipaClustering
+    # currently does not support multi-year
+    years = [
+        row.year for
+        row in DuckDB.query(connection, "SELECT DISTINCT year FROM $input_profile_table_name")
+    ]
+    years_str = join(years, ", ")
+    for table_name in ("cluster_rep_periods_data", "cluster_rep_periods_mapping")
+        DuckDB.query(
+            connection,
+            "CREATE OR REPLACE TEMP TABLE t_new_$table_name AS
+            SELECT unnest([$years_str]) AS year, $table_name.*
+            FROM $table_name",
+        )
+        # DROP TABLE or DROP VIEW, depending on which one the name refers to
+        is_table =
+            only([
+                row.count for row in DuckDB.query(
+                    connection,
+                    "SELECT COUNT(*) AS count FROM duckdb_tables WHERE table_name='$table_name'",
+                )
+            ]) > 0
+        if is_table
+            DuckDB.query(connection, "DROP TABLE $table_name")
+        else
+            DuckDB.query(connection, "DROP VIEW $table_name")
+        end
+        DuckDB.query(connection, "ALTER TABLE t_new_$table_name RENAME TO $table_name")
+    end
+
+    return clusters
+end
+
+function dummy_cluster!(connection) end
+
+"""
+    transform_wide_to_long!(
+        connection,
+        wide_table_name,
+        long_table_name;
+    )
+
+Convenience function to convert a table in wide format to long format using DuckDB.
+Originally aimed at converting a profile table like the following:
+
+| year | timestep | name1 | name2 | ⋯ | name3 |
+| ---- | -------- | ----- | ----- | - | ----- |
+| 2030 | 1        | 1.0   | 2.5   | ⋯ | 0.0   |
+| 2030 | 2        | 1.5   | 2.6   | ⋯ | 0.0   |
+| 2030 | 3        | 2.0   | 2.6   | ⋯ | 0.0   |
+
+to a table like the following:
+
+| year | timestep | profile_name | value |
+| ---- | -------- | ------------ | ----- |
+| 2030 | 1        | name1        | 1.0   |
+| 2030 | 2        | name1        | 1.5   |
+| 2030 | 3        | name1        | 2.0   |
+| 2030 | 1        | name2        | 2.5   |
+| 2030 | 2        | name2        | 2.6   |
+| 2030 | 3        | name2        | 2.6   |
+| ⋮    | ⋮        | ⋮            | ⋮     |
+| 2030 | 1        | name3        | 0.0   |
+| 2030 | 2        | name3        | 0.0   |
+| 2030 | 3        | name3        | 0.0   |
+
+This conversion is done using the `UNPIVOT` SQL command from DuckDB.
+
+## Keyword arguments
+
+- `exclude_columns = ["year", "timestep"]`: Which columns to exclude from the conversion
+- `name_column = "profile_name"`: Name of the new column that contains the names of the old columns
+- `value_column = "value"`: Name of the new column that holds the values from the old columns
+"""
+function transform_wide_to_long!(
+    connection,
+    wide_table_name,
+    long_table_name;
+    exclude_columns = ["year", "timestep"],
+    name_column = "profile_name",
+    value_column = "value",
+)
+    @assert length(exclude_columns) > 0
+    exclude_str = join(exclude_columns, ", ")
+    DuckDB.query(
+        connection,
+        "CREATE TABLE $long_table_name AS
+        UNPIVOT $wide_table_name
+        ON COLUMNS(* EXCLUDE ($exclude_str))
+        INTO
+            NAME $name_column
+            VALUE $value_column
+        ORDER BY $name_column, $exclude_str
+        ",
+    )
+
+    return
+end

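For orientation, here is a minimal end-to-end sketch of the two new entry points. This is a hypothetical session, not part of the commit; the table name `t_wide` and the profile names are illustrative:

using DataFrames, DBInterface, DuckDB, TulipaClustering

connection = DBInterface.connect(DuckDB.DB)

# A small wide profile table: two profiles over 48 timesteps of one year
DuckDB.query(
    connection,
    "CREATE TABLE t_wide AS
    SELECT 2030 AS year, i AS timestep, sin(i) AS demand, cos(i) AS availability
    FROM generate_series(1, 48) AS s(i)",
)

# Wide -> long; the default name/value column names match what cluster! expects
transform_wide_to_long!(connection, "t_wide", "input_profiles")

# Split the 48 timesteps into 24-step periods and keep 2 representative periods
clusters = cluster!(connection, 24, 2)

# The year-enriched outputs land in the cluster_-prefixed tables
DuckDB.query(connection, "FROM cluster_rep_periods_data") |> DataFrame
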
src/data-validation.jl (new file, +74)

@@ -0,0 +1,74 @@
+"""
+    DataValidationException
+
+Exception related to data validation of the Tulipa Energy Model input data.
+"""
+mutable struct DataValidationException <: Exception
+    error_messages::Vector{String}
+end
+
+function Base.showerror(io::IO, ex::DataValidationException)
+    println(io, "DataValidationException: The following issues were found in the data:")
+    for error_message in ex.error_messages
+        println(io, "- " * error_message)
+    end
+end
+
+"""
+    validate_data!(connection)
+
+Validate that the required data in `connection` exists and is correct.
+Throws a `DataValidationException` if any error is found.
+"""
+function validate_data!(connection)
+    error_messages = String[]
+
+    for (log_msg, validation_function, fail_fast) in
+        (("has required tables and columns", _validate_required_tables_and_columns!, true),)
+        @debug log_msg
+        append!(error_messages, validation_function(connection))
+        if fail_fast && length(error_messages) > 0
+            break
+        end
+    end
+
+    if length(error_messages) > 0
+        throw(DataValidationException(error_messages))
+    end
+
+    return
+end
+
+function _validate_required_tables_and_columns!(connection)
+    error_messages = String[]
+    table_name = "input_profiles" # Only one table is required so far
+
+    columns_from_connection = [
+        row.column_name for row in DuckDB.query(
+            connection,
+            "SELECT column_name FROM duckdb_columns() WHERE table_name = '$table_name'",
+        )
+    ]
+    if length(columns_from_connection) == 0
+        # Make sure this is not a table that exists but happens to have no columns
+        has_table =
+            only([
+                row.count for row in DuckDB.query(
+                    connection,
+                    "SELECT COUNT(table_name) AS count FROM duckdb_tables() WHERE table_name = '$table_name'",
+                )
+            ]) == 1
+        if !has_table
+            push!(error_messages, "Table '$table_name' expected but not found")
+            return error_messages
+        end
+    end
+
+    for column in ["profile_name", "year", "timestep", "value"]
+        if !(column in columns_from_connection)
+            push!(error_messages, "Column '$column' is missing from table '$table_name'")
+        end
+    end
+
+    return error_messages
+end

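The intended failure mode, sketched as a hypothetical session (the missing-column scenario is illustrative; `validate_data!` is not exported in this commit, hence the qualified call):

using DBInterface, DuckDB, TulipaClustering

connection = DBInterface.connect(DuckDB.DB)
# Create the required table but leave out the required 'value' column
DuckDB.query(
    connection,
    "CREATE TABLE input_profiles AS SELECT 'name1' AS profile_name, 2030 AS year, 1 AS timestep",
)

try
    TulipaClustering.validate_data!(connection)
catch e
    # Expected output:
    # DataValidationException: The following issues were found in the data:
    # - Column 'value' is missing from table 'input_profiles'
    showerror(stdout, e)
end
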

src/io.jl (+7 -3)

@@ -27,12 +27,16 @@ function write_clustering_result_to_tables(
     connection,
     clustering_result::TulipaClustering.ClusteringResult,
 )
-    DuckDB.register_data_frame(connection, clustering_result.profiles, "profiles_rep_periods")
+    DuckDB.register_data_frame(
+        connection,
+        clustering_result.profiles,
+        "cluster_profiles_rep_periods",
+    )
 
     DuckDB.register_data_frame(
         connection,
         weight_matrix_to_df(clustering_result.weight_matrix),
-        "rep_periods_mapping",
+        "cluster_rep_periods_mapping",
     )
 
     aux = clustering_result.auxiliary_data
@@ -46,7 +50,7 @@ function write_clustering_result_to_tables(
             num_timesteps = period_duration,
             resolution = 1.0,
         ),
-        "rep_periods_data",
+        "cluster_rep_periods_data",
     )
     return nothing
 end

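(Side note: `DuckDB.register_data_frame` appears to expose these results as views rather than physical tables, which would explain why `cluster!` in src/convenience.jl probes `duckdb_tables` before choosing between `DROP TABLE` and `DROP VIEW` when replacing them.)
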
src/weight_fitting.jl (+2 -2)

@@ -175,7 +175,7 @@ function fit_rep_period_weights!(
     clustering_matrix::Matrix{Float64},
     rp_matrix::Matrix{Float64};
     weight_type::Symbol = :dirac,
-    tol::Float64 = 10e-3,
+    tol::Float64 = 1e-2,
     show_progress = false,
     args...,
 )
@@ -270,7 +270,7 @@ The arguments:
 function fit_rep_period_weights!(
     clustering_result::ClusteringResult;
     weight_type::Symbol = :dirac,
-    tol::Float64 = 10e-3,
+    tol::Float64 = 1e-2,
     args...,
 )
     fit_rep_period_weights!(

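Both literals denote the same Float64, so the default behavior is unchanged; `1e-2` simply states the magnitude directly:

julia> 10e-3 == 1e-2 == 0.01
true
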

test/runtests.jl (+2)

@@ -12,6 +12,8 @@ using TulipaIO
 const INPUT_FOLDER = joinpath(@__DIR__, "inputs")
 const OUTPUT_FOLDER = joinpath(@__DIR__, "outputs")
 
+include("utils.jl")
+
 #=
 Don't add your tests to runtests.jl. Instead, create files named

test/test-convenience.jl (new file, +85)

@@ -0,0 +1,85 @@
+@testset "Transform wide to long" begin
+    connection = DBInterface.connect(DuckDB.DB)
+    DuckDB.query(
+        connection,
+        "CREATE TABLE t_wide AS
+        SELECT
+            2030 AS year,
+            i AS timestep,
+            2.0 * i AS name1,
+            i * i AS name2,
+            0.0 AS name3,
+        FROM
+            generate_series(1, 24) AS s(i)
+        ",
+    )
+
+    transform_wide_to_long!(connection, "t_wide", "t_long")
+
+    df = DuckDB.query(
+        connection,
+        "FROM t_long
+        ORDER BY profile_name, year, timestep
+        ",
+    ) |> DataFrame
+    @test size(df) == (72, 4)
+    @test sort(names(df)) == ["profile_name", "timestep", "value", "year"]
+    @test df.value == [2.0 * (1:24); (1:24) .* (1:24); fill(0.0, 24)]
+end
+
+@testset "cluster!" begin
+    period_duration = 24
+    num_periods = 7
+    num_timesteps = period_duration * num_periods
+    num_rps = 4
+    profile_names = ["name1", "name2", "name3"]
+    years = [2030, 2050]
+
+    connection = _new_connection(; profile_names, years, num_timesteps)
+
+    clusters = cluster!(connection, period_duration, num_rps)
+
+    df_rep_periods_data =
+        DuckDB.query(
+            connection,
+            "FROM cluster_rep_periods_data
+            ORDER BY year, rep_period",
+        ) |> DataFrame
+    df_rep_periods_mapping =
+        DuckDB.query(
+            connection,
+            "FROM cluster_rep_periods_mapping
+            ORDER BY year, period, rep_period",
+        ) |> DataFrame
+    df_profiles_rep_periods =
+        DuckDB.query(
+            connection,
+            "FROM cluster_profiles_rep_periods
+            ORDER BY profile_name, year, rep_period, timestep",
+        ) |> DataFrame
+
+    @test sort(names(df_rep_periods_data)) ==
+          ["num_timesteps", "rep_period", "resolution", "year"]
+    @test sort(names(df_rep_periods_mapping)) == ["period", "rep_period", "weight", "year"]
+    @test sort(names(df_profiles_rep_periods)) ==
+          ["profile_name", "rep_period", "timestep", "value", "year"]
+
+    @test df_rep_periods_data.year == repeat(years; inner = num_rps)
+    @test df_rep_periods_data.rep_period == repeat(1:num_rps; outer = length(years))
+    @test all(df_rep_periods_data.resolution .== 1.0)
+    @test all(df_rep_periods_data.num_timesteps .== period_duration)
+
+    @test size(df_rep_periods_mapping, 1) ≥ length(years) * num_periods
+
+    @test df_profiles_rep_periods.profile_name ==
+          repeat(profile_names; inner = period_duration * num_rps * length(years))
+    @test df_profiles_rep_periods.year ==
+          repeat(years; inner = period_duration * num_rps, outer = length(profile_names))
+    @test df_profiles_rep_periods.rep_period == repeat(
+        1:num_rps;
+        inner = period_duration,
+        outer = length(profile_names) * length(years),
+    )
+    @test df_profiles_rep_periods.timestep ==
+          repeat(1:period_duration; outer = length(profile_names) * length(years) * num_rps)
+end

