|
| 1 | +import os |
| 2 | +from typing import List, Union |
| 3 | + |
| 4 | +import pandas as pd |
| 5 | + |
| 6 | +from tsl.utils.python_utils import ensure_list |
| 7 | + |
| 8 | +from ..utils import download_url |
| 9 | +from .prototypes import DatetimeDataset |
| 10 | + |
| 11 | + |
class PvUS(DatetimeDataset):
    r"""Simulated solar power production from more than 5,000 photovoltaic
    plants in the US.

    Data are provided by `National Renewable Energy Laboratory (NREL)
    <https://www.nrel.gov/>`_'s `Solar Power Data for Integration Studies
    <https://www.nrel.gov/grid/solar-power-data.html>`_. Original raw data
    consist of 1 year (2006) of 5-minute solar power (in MW) for approximately
    5,000 synthetic PV plants in the United States.

    Preprocessed data are resampled in 10-minutes intervals taking the average.
    The entire dataset contains 5016 plants, divided in two macro zones (east
    and west). The "east" zone contains 4084 plants, the "west" zone has 1082
    plants. Some states appear in both zones, with plants at same geographical
    position. When loading the entire datasets, duplicated plants in "east" zone
    are dropped.

    Dataset size:
        + Time steps: 52560
        + Nodes:

            + Full graph: 5016
            + East only: 4084
            + West only: 1082

        + Channels: 1
        + Sampling rate: 10 minutes
        + Missing values: 0.00%

    Args:
        zones (Union[str, List], optional): The US zones to include in the
            dataset. Can be ``"east"``, ``"west"``, or a list of both.
            If :obj:`None`, then the full dataset is loaded.
            (default: :obj:`None`)
        mask_zeros (bool, optional): If :obj:`True`, then zero values
            (corresponding to night hours) are masked out.
            (default: :obj:`False`)
        root (str, optional): The root directory for the data.
            (default: :obj:`None`)
        freq (str, optional): The data sampling rate for resampling.
            (default: :obj:`None`)
    """
    available_zones = ['east', 'west']
    urls = {
        'east': "https://drive.switch.ch/index.php/s/ZUORMr4uzBSr04b/download",
        'west': "https://drive.switch.ch/index.php/s/HRPNJdeAzeQLA1f/download"
    }

    # NOTE(review): 'correntropy' was previously advertised here, but
    # :meth:`compute_similarity` has no implementation for it and silently
    # returned None. Only 'distance' is actually supported.
    similarity_options = {'distance'}

    def __init__(self,
                 zones: Union[str, List] = None,
                 mask_zeros: bool = False,
                 root: str = None,
                 freq: str = None):
        # Normalize `zones` to a validated list; this also allows downloading
        # a single zone instead of the full dataset.
        if zones is None:
            zones = self.available_zones
        else:
            zones = ensure_list(zones)
            if not set(zones).issubset(self.available_zones):
                invalid_zones = set(zones).difference(self.available_zones)
                raise ValueError(f"Invalid zones {invalid_zones}. "
                                 f"Allowed zones are {self.available_zones}.")
        self.zones = zones
        self.mask_zeros = mask_zeros
        self.root = root
        # The dataset name reflects the selection, e.g. "PvUS-east" when a
        # single zone is loaded.
        name = "PvUS" if len(zones) == 2 else f"PvUS-{zones[0]}"
        # Load (downloading first, if needed) target, mask, and plant metadata.
        actual, mask, metadata = self.load(mask_zeros)
        super().__init__(target=actual,
                         mask=mask,
                         freq=freq,
                         similarity_score="distance",
                         spatial_aggregation="sum",
                         temporal_aggregation="mean",
                         name=name)
        # Static per-plant information (one row per node, e.g. lat/lon).
        self.add_covariate('metadata', metadata, pattern='n f')

    @property
    def raw_file_names(self):
        # One HDF5 file per selected zone.
        return [f'{zone}.h5' for zone in self.zones]

    @property
    def required_file_names(self):
        # Raw files are used as-is, with no further preprocessing on disk.
        return self.raw_file_names

    def download(self) -> None:
        # Fetch only the files for the selected zones.
        for zone in self.zones:
            download_url(self.urls[zone], self.root_dir, filename=f'{zone}.h5')

    def load_raw(self):
        """Load power readings and plant metadata for the selected zones.

        Returns:
            tuple: ``(actual, metadata)`` where ``actual`` is a
            :class:`~pandas.DataFrame` of plant power readings (plants on the
            columns) and ``metadata`` a :class:`~pandas.DataFrame` of static
            plant information (plants on the rows).
        """
        self.maybe_download()
        actual, metadata = [], []
        for zone in self.zones:
            # Each zone file stores readings ('actual') and plant info
            # ('metadata') under separate HDF5 keys.
            zone_path = os.path.join(self.root_dir, f'{zone}.h5')
            actual.append(pd.read_hdf(zone_path, key='actual'))
            metadata.append(pd.read_hdf(zone_path, key='metadata'))
        # Concatenate zones and sort by plant id so that column order matches
        # the metadata row order.
        actual = pd.concat(actual, axis=1).sort_index(axis=1, level=0)
        metadata = pd.concat(metadata, axis=0).sort_index()
        # Some plants appear in both zones at the same location; when loading
        # the whole dataset, drop the duplicates on the "east" side.
        if len(self.zones) == 2:
            duplicated_farms = metadata.index[[
                s_id.endswith('-east') for s_id in metadata.state_id
            ]]
            metadata = metadata.drop(duplicated_farms, axis=0)
            actual = actual.drop(duplicated_farms, axis=1, level=0)
        return actual, metadata

    def load(self, mask_zeros):
        """Load raw data and optionally build a validity mask.

        Args:
            mask_zeros (bool): If :obj:`True`, mark zero readings
                (night hours) as invalid in the returned mask.

        Returns:
            tuple: ``(actual, mask, metadata)``; ``mask`` is :obj:`None`
            when ``mask_zeros`` is :obj:`False`.
        """
        actual, metadata = self.load_raw()
        mask = (actual > 0) if mask_zeros else None
        return actual, mask, metadata

    def compute_similarity(self, method: str, theta: float = 150, **kwargs):
        """Compute the similarity matrix between plants.

        Args:
            method (str): Similarity method; only ``"distance"`` is supported.
            theta (float): Bandwidth of the Gaussian kernel applied to the
                pairwise geographical distances (presumably in kilometers,
                the unit of :func:`geographical_distance` — TODO confirm).

        Raises:
            ValueError: If ``method`` is not supported.
        """
        if method == "distance":
            from tsl.ops.similarities import (gaussian_kernel,
                                              geographical_distance)

            # compute distances from latitude and longitude degrees
            loc_coord = self.metadata.loc[:, ['lat', 'lon']]
            dist = geographical_distance(loc_coord, to_rad=True).values
            return gaussian_kernel(dist, theta=theta)
        # Fail loudly instead of silently returning None (previous behavior)
        # when an unimplemented method is requested.
        raise ValueError(f"Similarity method '{method}' not supported.")
0 commit comments