
Commit 971f652

reinecfi and kim-mskw authored
Automatically calculate obs_dim based on foresight, unique_obs_dim, ... (#708)
<!-- SPDX-FileCopyrightText: ASSUME Developers SPDX-License-Identifier: AGPL-3.0-or-later -->

## Description

Currently obs_dim, foresight, and unique_obs_dim are fixed values for a given strategy. As they are directly interdependent, obs_dim is now calculated from the others. Adjustments should therefore require fewer changes per strategy. This also corrects some wrong values in the docstrings and removes the reward scaling in RenewableEnergyLearningSingleBidStrategy (as already done in EnergyLearningStrategy).

## Checklist

- [x] Documentation updated (docstrings, READMEs, user guides, inline comments, `doc` folder updates etc.)
- [x] New unit/integration tests added (if applicable)
- [x] Changes noted in release notes (if any)
- [x] Consent to release this PR's code under the GNU Affero General Public License v3.0

---------

Co-authored-by: kim-mskw <[email protected]>
1 parent 8f7bd22 commit 971f652

File tree

11 files changed · +102 −93 lines changed


assume/common/base.py

Lines changed: 5 additions & 3 deletions
@@ -893,7 +893,7 @@ class LearningStrategy(BaseStrategy):
     convention when designing your create_observation method and the observation space.

     Attributes:
-        obs_dim (int): The observation dimension.
+        foresight (int): Number of steps of for- and backwards looking in observations.
         act_dim (int): The action dimension.
         unique_obs_dim (int): The unique observation dimension.
         num_timeseries_obs_dim (int): The number of observation timeseries dimension.
@@ -907,7 +907,7 @@ class LearningStrategy(BaseStrategy):
     def __init__(
         self,
         learning_role,
-        obs_dim: int,
+        foresight: int,
         act_dim: int,
         unique_obs_dim: int,
         num_timeseries_obs_dim: int = 3,
@@ -923,7 +923,7 @@ def __init__(
         self.learning_role = learning_role
         self.learning_config = learning_role.learning_config

-        self.obs_dim = obs_dim
+        self.foresight = foresight
         self.act_dim = act_dim

         # this defines the number of unique observations, which are not the same for all units
@@ -934,6 +934,8 @@ def __init__(
         # them into suitable format for recurrent neural networks
         self.num_timeseries_obs_dim = num_timeseries_obs_dim

+        self.obs_dim = num_timeseries_obs_dim * foresight + unique_obs_dim
+

 class MinMaxStrategy(BaseStrategy):
     pass
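With this change, obs_dim is no longer passed in; it is derived from the remaining dimensions. A minimal sketch of the arithmetic, assuming the default values that appear later in this commit (the three timeseries are presumably the residual load forecast, the price forecast, and the price history):

    num_timeseries_obs_dim = 3   # e.g. residual load forecast, price forecast, price history
    foresight = 12               # default for EnergyLearningStrategy
    unique_obs_dim = 2           # e.g. marginal cost and current capacity
    obs_dim = num_timeseries_obs_dim * foresight + unique_obs_dim
    print(obs_dim)               # 3 * 12 + 2 = 38, matching the corrected docstring below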

assume/reinforcement_learning/algorithms/matd3.py

Lines changed: 13 additions & 3 deletions
@@ -270,23 +270,25 @@ def check_strategy_dimensions(self) -> None:
         Also check if the unique observation dimensions are the same. If not, raise a ValueError.
         This is important for the TD3 algorithm, as it uses a centralized critic that requires consistent dimensions across all agents.
         """
+        foresight_list = []
         obs_dim_list = []
         act_dim_list = []
         unique_obs_dim_list = []
         num_timeseries_obs_dim_list = []

         for strategy in self.learning_role.rl_strats.values():
+            foresight_list.append(strategy.foresight)
             obs_dim_list.append(strategy.obs_dim)
             act_dim_list.append(strategy.act_dim)
             unique_obs_dim_list.append(strategy.unique_obs_dim)
             num_timeseries_obs_dim_list.append(strategy.num_timeseries_obs_dim)

-        if len(set(obs_dim_list)) > 1:
+        if len(set(foresight_list)) > 1:
             raise ValueError(
-                f"All observation dimensions must be the same for all RL agents. The defined learning strategies have the following observation dimensions: {obs_dim_list}"
+                f"All foresight values must be the same for all RL agents. The defined learning strategies have the following foresight values: {foresight_list}"
             )
         else:
-            self.obs_dim = obs_dim_list[0]
+            self.foresight = foresight_list[0]

         if len(set(act_dim_list)) > 1:
             raise ValueError(
@@ -309,6 +311,14 @@ def check_strategy_dimensions(self) -> None:
         else:
             self.num_timeseries_obs_dim = num_timeseries_obs_dim_list[0]

+        # Check last, as other cases should fail before!
+        if len(set(obs_dim_list)) > 1:
+            raise ValueError(
+                f"All observation dimensions must be the same for all RL agents. The defined learning strategies have the following observation dimensions: {obs_dim_list}"
+            )
+        else:
+            self.obs_dim = obs_dim_list[0]
+
     def create_actors(self) -> None:
         """
         Create actor networks for reinforcement learning for each unit strategy.
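The obs_dim check is deliberately moved to the end: once foresight, act_dim, unique_obs_dim, and num_timeseries_obs_dim have all been confirmed to match, the derived obs_dim can only differ if a strategy overrides the calculation, so a mismatch there is reported last. A hypothetical two-agent sanity check (not part of this diff) illustrating that relationship:

    # (foresight, unique_obs_dim, num_timeseries_obs_dim) per agent
    dims = [(12, 2, 3), (12, 2, 3)]
    obs_dims = [n * f + u for f, u, n in dims]
    assert len(set(obs_dims)) == 1  # identical inputs always yield identical obs_dim (here 38)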

assume/strategies/learning_strategies.py

Lines changed: 30 additions & 39 deletions
@@ -117,6 +117,8 @@ def load_actor_params(self, load_path):

     def prepare_observations(self, unit, market_id):
         # scaling factors for the observations
+        # Note: These scaling factors could be interpreted as information leakage. However, as we are in a simulation environment and not a purely forecasting setting,
+        # we assume that the agent has access to this information already
         upper_scaling_factor_price = max(unit.forecaster.price[market_id])
         lower_scaling_factor_price = min(unit.forecaster.price[market_id])
         residual_load = unit.forecaster.residual_load.get(
@@ -185,6 +187,8 @@ def create_observation(
         )

         # --- 2. Historical actual prices (backward-looking) ---
+        # Note: We scale with the max_bid_price here, in comparison to the scaling of the forecast where we use the max price of the forecast period.
+        # This is not consistent but has worked well so far. Future work could look into this in more detail.
         scaled_price_history = (
             unit.outputs["energy_accepted_price"].window(
                 start, self.foresight, direction="backward"
@@ -308,11 +312,11 @@ class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy):
     on an Energy-Only Market.

     The agent submits two price bids: one for the inflexible component (P_min) and another for
-    the flexible component (P_max - P_min) of its capacity. This strategy utilizes a set of 50
+    the flexible component (P_max - P_min) of its capacity. This strategy utilizes a set of 38
     observations to generate actions, which are then transformed into market bids. The observation
     space comprises two unique values: the marginal cost and the current capacity of the unit.

-    The observation space for this strategy consists of 50 elements, drawn from both the forecaster
+    The observation space for this strategy consists of 38 elements, drawn from both the forecaster
     and the unit's internal state. Observations include the following components:

     - **Forecasted Residual Load**: Forecasted load over the foresight period, scaled by the maximum
@@ -344,7 +348,7 @@ class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy):
     Attributes
     ----------
     foresight : int
-        Number of time steps for which the agent forecasts market conditions. Defaults to 24.
+        Number of time steps for which the agent forecasts market conditions. Defaults to 12.
     max_bid_price : float
         Maximum allowable bid price. Defaults to 100.
     max_demand : float
@@ -375,24 +379,19 @@ class EnergyLearningStrategy(TorchLearningStrategy, MinMaxStrategy):
     """

     def __init__(self, *args, **kwargs):
-        obs_dim = kwargs.pop("obs_dim", 38)
+        # 'foresight' represents the number of time steps into the future that we will consider
+        # when constructing the observations.
+        foresight = kwargs.pop("foresight", 12)
         act_dim = kwargs.pop("act_dim", 2)
         unique_obs_dim = kwargs.pop("unique_obs_dim", 2)
         super().__init__(
-            obs_dim=obs_dim,
+            foresight=foresight,
             act_dim=act_dim,
             unique_obs_dim=unique_obs_dim,
             *args,
             **kwargs,
         )

-        # 'foresight' represents the number of time steps into the future that we will consider
-        # when constructing the observations. This value is fixed for each strategy, as the
-        # neural network architecture is predefined, and the size of the observations must remain consistent.
-        # If you wish to modify the foresight length, remember to also update the 'obs_dim' parameter above,
-        # as the observation dimension depends on the foresight value.
-        self.foresight = 12
-
         # define allowed order types
         self.order_types = kwargs.get("order_types", ["SB"])
@@ -682,8 +681,8 @@ def calculate_reward(

         # scaling factor to normalize the reward to the range [-1,1]
         scaling = 1 / (self.max_bid_price * unit.max_power)
-        reward = scaling * (profit - regret_scale * opportunity_cost)
         regret = regret_scale * opportunity_cost
+        reward = scaling * (profit - regret)

         # Store results in unit outputs
         # Note: these are not learning-specific results but stored for all units for analysis
@@ -722,20 +721,18 @@ class EnergyLearningSingleBidStrategy(EnergyLearningStrategy, MinMaxStrategy):
     """

     def __init__(self, *args, **kwargs):
-        obs_dim = kwargs.pop("obs_dim", 74)
+        # we select 24 to be in line with the storage strategies
+        foresight = kwargs.pop("foresight", 24)
         act_dim = kwargs.pop("act_dim", 1)
         unique_obs_dim = kwargs.pop("unique_obs_dim", 2)
         super().__init__(
-            obs_dim=obs_dim,
+            foresight=foresight,
             act_dim=act_dim,
             unique_obs_dim=unique_obs_dim,
             *args,
             **kwargs,
         )

-        # we select 24 to be in line with the storage strategies
-        self.foresight = 24
-
     def calculate_bids(
         self,
         unit: SupportsMinMax,
@@ -807,7 +804,7 @@ class StorageEnergyLearningStrategy(TorchLearningStrategy, MinMaxChargeStrategy)
     Reinforcement Learning Strategy for a storage unit that enables the agent to learn
     optimal bidding strategies on an Energy-Only Market.

-    The observation space for this strategy consists of 50 elements. Key components include:
+    The observation space for this strategy consists of 74 elements. Key components include:

     - **State of Charge**: Represents the current level of energy in the storage unit,
       influencing the bid direction and capacity.
@@ -868,24 +865,19 @@ class StorageEnergyLearningStrategy(TorchLearningStrategy, MinMaxChargeStrategy)
     """

     def __init__(self, *args, **kwargs):
-        obs_dim = kwargs.pop("obs_dim", 74)
+        # 'foresight' represents the number of time steps into the future that we will consider
+        # when constructing the observations.
+        foresight = kwargs.pop("foresight", 24)
         act_dim = kwargs.pop("act_dim", 1)
         unique_obs_dim = kwargs.pop("unique_obs_dim", 2)
         super().__init__(
-            obs_dim=obs_dim,
+            foresight=foresight,
             act_dim=act_dim,
             unique_obs_dim=unique_obs_dim,
             *args,
             **kwargs,
         )

-        # 'foresight' represents the number of time steps into the future that we will consider
-        # when constructing the observations. This value is fixed for each strategy, as the
-        # neural network architecture is predefined, and the size of the observations must remain consistent.
-        # If you wish to modify the foresight length, remember to also update the 'obs_dim' parameter above,
-        # as the observation dimension depends on the foresight value.
-        self.foresight = 24
-
         # define allowed order types
         self.order_types = kwargs.get("order_types", ["SB"])
@@ -1168,24 +1160,19 @@ class RenewableEnergyLearningSingleBidStrategy(EnergyLearningSingleBidStrategy):
     """

     def __init__(self, *args, **kwargs):
-        obs_dim = kwargs.pop("obs_dim", 75)
+        # 'foresight' represents the number of time steps into the future that we will consider
+        # when constructing the observations.
+        foresight = kwargs.pop("foresight", 24)
         act_dim = kwargs.pop("act_dim", 1)
         unique_obs_dim = kwargs.pop("unique_obs_dim", 3)
         super().__init__(
-            obs_dim=obs_dim,
+            foresight=foresight,
             act_dim=act_dim,
             unique_obs_dim=unique_obs_dim,
             *args,
             **kwargs,
         )

-        # 'foresight' represents the number of time steps into the future that we will consider
-        # when constructing the observations. This value is fixed for each strategy, as the
-        # neural network architecture is predefined, and the size of the observations must remain consistent.
-        # If you wish to modify the foresight length, remember to also update the 'obs_dim' parameter above,
-        # as the observation dimension depends on the foresight value.
-        self.foresight = 24
-
         # define allowed order types
         self.order_types = kwargs.get("order_types", ["SB"])
@@ -1308,12 +1295,16 @@ def calculate_reward(

         profit = income - operational_cost

-        # Stabilizing learning: Limit positive profit to 10% of its absolute value.
+        # Stabilizing learning: Limit positive profit to 50% of its absolute value.
         # This reduces variance in rewards and prevents overfitting to extreme profit-seeking behavior.
         # However, this does NOT prevent the agent from exploiting market inefficiencies if they exist.
         # RL by nature identifies and exploits system weaknesses if they lead to higher profit.
         # This is not a price cap but rather a stabilizing factor to avoid reward spikes affecting learning stability.
-        profit = min(profit, 0.5 * abs(profit))
+        # IMPORTANT: This is a clear case of reward_tuning to stabilize learning - Use with caution!
+        # profit_scale = 0.5
+
+        profit_scale = 1
+        profit = min(profit, profit_scale * abs(profit))

         # get potential maximum infeed according to availability from order volume
         # Note: this will only work as the correct reference point when the volume is not defined by an action
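Taken together, the defaults changed in this file reproduce the observation sizes quoted in the corrected docstrings. A quick cross-check, assuming the three timeseries observations used throughout these strategies:

    def obs_dim(foresight, unique_obs_dim, num_timeseries_obs_dim=3):
        return num_timeseries_obs_dim * foresight + unique_obs_dim

    print(obs_dim(12, 2))  # EnergyLearningStrategy:                                    38
    print(obs_dim(24, 2))  # EnergyLearningSingleBidStrategy / StorageEnergyLearning:   74
    print(obs_dim(24, 3))  # RenewableEnergyLearningSingleBidStrategy:                  75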

docs/source/learning.rst

Lines changed: 2 additions & 2 deletions
@@ -140,8 +140,8 @@ The Actor
 We will explain the way learning works in ASSUME starting from the interface to the simulation, namely the bidding strategy of the power plants.
 The bidding strategy, per definition in ASSUME, defines the way we formulate bids based on the technical restrictions of the unit.
 In a learning setting, this is done by the actor network which maps the observation to an action. The observation thereby is managed and collected by the units operator as
-summarized in the following picture. As you can see in the current working version, the observation space contains a residual load forecast for the next 24 hours and a price
-forecast for 24 hours, as well as the current capacity of the power plant and its marginal costs.
+summarized in the following picture. As you can see in the current working version, the observation space contains a residual load forecast and a price
+forecast, for example for the next 24 hours, as well as the current capacity of the power plant and its marginal costs.

 .. image:: img/ActorTask.jpg
    :align: center
