Skip to content

Commit 69f1c0d

Browse files
committed
Added PvUS dataset
1 parent 4a5d6c4 commit 69f1c0d

File tree

2 files changed

+139
-0
lines changed

2 files changed

+139
-0
lines changed

tsl/datasets/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
SolarBenchmark, TrafficBenchmark)
1313
from .pems_bay import PemsBay
1414
from .pems_benchmarks import PeMS03, PeMS04, PeMS07, PeMS08
15+
from .pv_us import PvUS
1516
from .synthetic import GaussianNoiseSyntheticDataset
1617

1718
dataset_classes = [
@@ -23,6 +24,7 @@
2324
'PeMS04',
2425
'PeMS07',
2526
'PeMS08',
27+
'PvUS',
2628
'ElectricityBenchmark',
2729
'TrafficBenchmark',
2830
'SolarBenchmark',

tsl/datasets/pv_us.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
import os
2+
from typing import List, Union
3+
4+
import pandas as pd
5+
6+
from tsl.utils.python_utils import ensure_list
7+
8+
from ..utils import download_url
9+
from .prototypes import DatetimeDataset
10+
11+
12+
class PvUS(DatetimeDataset):
    r"""Simulated solar power production from more than 5,000 photovoltaic
    plants in the US.

    Data are provided by `National Renewable Energy Laboratory (NREL)
    <https://www.nrel.gov/>`_'s `Solar Power Data for Integration Studies
    <https://www.nrel.gov/grid/solar-power-data.html>`_. Original raw data
    consist of 1 year (2006) of 5-minute solar power (in MW) for approximately
    5,000 synthetic PV plants in the United States.

    Preprocessed data are resampled in 10-minutes intervals taking the average.
    The entire dataset contains 5016 plants, divided into two macro zones (east
    and west). The "east" zone contains 4084 plants, the "west" zone has 1082
    plants. Some states appear in both zones, with plants at the same
    geographical position. When loading the entire dataset, duplicated plants
    in the "east" zone are dropped.

    Dataset size:
        + Time steps: 52560
        + Nodes:

          + Full graph: 5016
          + East only: 4084
          + West only: 1082

        + Channels: 1
        + Sampling rate: 10 minutes
        + Missing values: 0.00%

    Args:
        zones (Union[str, List], optional): The US zones to include in the
            dataset. Can be ``"east"``, ``"west"``, or a list of both.
            If :obj:`None`, then the full dataset is loaded.
            (default: :obj:`None`)
        mask_zeros (bool, optional): If :obj:`True`, then zero values
            (corresponding to night hours) are masked out.
            (default: :obj:`False`)
        root (str, optional): The root directory for the data.
            (default: :obj:`None`)
        freq (str, optional): The data sampling rate for resampling.
            (default: :obj:`None`)
    """
    available_zones = ['east', 'west']
    urls = {
        'east': "https://drive.switch.ch/index.php/s/ZUORMr4uzBSr04b/download",
        'west': "https://drive.switch.ch/index.php/s/HRPNJdeAzeQLA1f/download"
    }

    similarity_options = {'distance', 'correntropy'}

    def __init__(self,
                 zones: Union[str, List] = None,
                 mask_zeros: bool = False,
                 root: str = None,
                 freq: str = None):
        # Validate the requested zones; allow downloading a single zone.
        if zones is None:
            zones = self.available_zones
        else:
            zones = ensure_list(zones)
            if not set(zones).issubset(self.available_zones):
                invalid_zones = set(zones).difference(self.available_zones)
                raise ValueError(f"Invalid zones {invalid_zones}. "
                                 f"Allowed zones are {self.available_zones}.")
        self.zones = zones
        self.mask_zeros = mask_zeros
        # NOTE: `root` must be set before `load()` is called below, since
        # loading may trigger a download into `self.root_dir`.
        self.root = root
        # Name the dataset after the single zone, or "PvUS" for the full set.
        name = "PvUS" if len(zones) == 2 else f"PvUS-{zones[0]}"
        # Load target data, optional mask, and per-plant metadata.
        actual, mask, metadata = self.load(mask_zeros)
        super().__init__(target=actual,
                         mask=mask,
                         freq=freq,
                         similarity_score="distance",
                         spatial_aggregation="sum",
                         temporal_aggregation="mean",
                         name=name)
        # Per-node static features (one row per plant).
        self.add_covariate('metadata', metadata, pattern='n f')

    @property
    def raw_file_names(self):
        """Names of the files to be found in (or downloaded to) the data
        directory, one HDF5 file per zone."""
        return [f'{zone}.h5' for zone in self.zones]

    @property
    def required_file_names(self):
        """Files required to build the dataset; no further preprocessing is
        applied to the raw files, so they coincide with the raw ones."""
        return self.raw_file_names

    def download(self) -> None:
        """Download the HDF5 file of each requested zone into the data
        directory."""
        for zone in self.zones:
            download_url(self.urls[zone], self.root_dir, filename=f'{zone}.h5')

    def load_raw(self):
        """Read the per-zone HDF5 files and return the concatenated power
        readings and plant metadata, with duplicated plants removed when
        both zones are loaded."""
        self.maybe_download()
        actual, metadata = [], []
        for zone in self.zones:
            # Each zone file stores the readings under key 'actual' and the
            # plant information under key 'metadata'.
            zone_path = os.path.join(self.root_dir, f'{zone}.h5')
            actual.append(pd.read_hdf(zone_path, key='actual'))
            metadata.append(pd.read_hdf(zone_path, key='metadata'))
        # Concatenate zones and sort by plant id so that the column order of
        # `actual` matches the row order of `metadata`.
        actual = pd.concat(actual, axis=1).sort_index(axis=1, level=0)
        metadata = pd.concat(metadata, axis=0).sort_index()
        # Drop duplicated farms when loading the whole dataset: plants whose
        # state_id ends with '-east' also appear in the west zone.
        if len(self.zones) == 2:
            duplicated_farms = metadata.index[[
                s_id.endswith('-east') for s_id in metadata.state_id
            ]]
            metadata = metadata.drop(duplicated_farms, axis=0)
            actual = actual.drop(duplicated_farms, axis=1, level=0)
        return actual, metadata

    def load(self, mask_zeros):
        """Return ``(target, mask, metadata)``; the mask flags strictly
        positive readings and is :obj:`None` unless ``mask_zeros``."""
        actual, metadata = self.load_raw()
        mask = (actual > 0) if mask_zeros else None
        return actual, mask, metadata

    def compute_similarity(self, method: str, theta: float = 150, **kwargs):
        """Compute a pairwise node similarity matrix.

        Args:
            method (str): Similarity method; only ``"distance"`` is
                implemented.
            theta (float): Bandwidth of the Gaussian kernel applied to the
                geographical distances (in the same unit as the distances).

        Raises:
            NotImplementedError: If ``method`` is a declared option that is
                not implemented yet (``"correntropy"``).
            ValueError: If ``method`` is not a valid similarity option.
        """
        if method == "distance":
            from tsl.ops.similarities import (gaussian_kernel,
                                              geographical_distance)

            # Compute pairwise distances from latitude and longitude degrees,
            # then map them to (0, 1] similarities with a Gaussian kernel.
            loc_coord = self.metadata.loc[:, ['lat', 'lon']]
            dist = geographical_distance(loc_coord, to_rad=True).values
            return gaussian_kernel(dist, theta=theta)
        # Fail loudly instead of silently returning None: 'correntropy' is
        # advertised in `similarity_options` but has no implementation here.
        if method in self.similarity_options:
            raise NotImplementedError(
                f"Similarity method '{method}' is not implemented for "
                f"{type(self).__name__}.")
        raise ValueError(f"Invalid similarity method '{method}'. "
                         f"Allowed methods are {self.similarity_options}.")

0 commit comments

Comments
 (0)