|
| 1 | +import glob |
| 2 | +import os |
| 3 | +import zipfile |
| 4 | +from datetime import timedelta as td |
| 5 | + |
| 6 | +import numpy as np |
| 7 | +import pandas as pd |
| 8 | +import xarray as xr |
| 9 | +from floris.utilities import wrap_360 |
| 10 | +from zenodo_get import download as zn_download |
| 11 | + |
| 12 | +from flasc.data_processing.time_operations import df_resample_by_interpolation |
| 13 | + |
| 14 | + |
| 15 | +class AspireTimeseriesReader: |
| 16 | + """This class is used to read the output .tar.gz files from Whiffle and export |
| 17 | + them in various formats. |
| 18 | + """ |
| 19 | + |
| 20 | + def __init__(self, aspire_metmast_filelist=[], aspire_turbine_filelist=[], verbose=False): |
| 21 | + """Initialize the class. |
| 22 | +
|
| 23 | + Args: |
| 24 | + aspire_filelist (list): List of strings, where each entry of the list defines the |
| 25 | + path to one of the ASPIRE nc files. |
| 26 | + """ |
| 27 | + self.metmast_files = aspire_metmast_filelist |
| 28 | + self.turbine_files = aspire_turbine_filelist |
| 29 | + self.turbine_datasets = None |
| 30 | + self.metmast_datasets = None |
| 31 | + self.verbose = verbose |
| 32 | + |
| 33 | + # Private functions |
| 34 | + def _read_member_as_xarray(self, fn): |
| 35 | + """Read the contents of one of the files within the .tar.gz file as an xarray DataSet. |
| 36 | +
|
| 37 | + Args: |
| 38 | + fn (str): Filename refering to a turbine ASPIRE datafile. |
| 39 | +
|
| 40 | + Returns: |
| 41 | + dataset (xarray.Dataset): Contents of the HDF5 file imported as an xarray Dataset. |
| 42 | + """ |
| 43 | + # Now read the turbine .nc (HDF5) data file as an xarray and convert to a Pandas DataFrame |
| 44 | + dataset = xr.open_dataset(fn).load() # Load it into xarray format |
| 45 | + dataset.close() # Close dataset after reading to avoid conflicts |
| 46 | + |
| 47 | + if self.verbose: |
| 48 | + print(f"Successfully imported the contents of {os.path.basename(fn)}.") |
| 49 | + |
| 50 | + return dataset |
| 51 | + |
| 52 | + def _concatenate_dataframes_and_remove_startup_time(self, df_list): |
| 53 | + """Typically, ASPIRE simulations are performed one day at a time, including several hours of |
| 54 | + simulation start-up. This means that each day runs for about 26 hours, and that means there |
| 55 | + is an overlap window of about 2 hours between simulations. The start-up period, usually the |
| 56 | + first two hours of the simulation, must be removed so that each datafile contains exactly |
| 57 | + 24 hours of data. This script removes the startup periods and concatenates the Pandas |
| 58 | + DataFrames into a single file. |
| 59 | +
|
| 60 | + Args: |
| 61 | + df_list (list): List where each entry is a Pandas DataFrame containing the simulation |
| 62 | + data for one day from ASPIRE. These datasets typically start about 2 hours before the |
| 63 | + actual day of simulation. These 2 hours will be removed so that it perfectly connects |
| 64 | + with the simulation data of the previous day. |
| 65 | +
|
| 66 | + Returns: |
| 67 | + df_out (pd.DataFrame): Pandas DataFrame containing multiple days of simulation data, |
| 68 | + with the start-up periods and overlapping measurement times removed. |
| 69 | + """ |
| 70 | + # Here we stitch dataframes together while removing start-up periods |
| 71 | + |
| 72 | + # The dataset usually starts a couple hours before midnight in the day before the |
| 73 | + # actual day of the simulation. That is the start-up period. We must remove that |
| 74 | + # period from the dataset. Let's do this for the first dataframe separately. |
| 75 | + df = df_list[0].copy() |
| 76 | + first_day_change = np.where(np.diff([t.day for t in df["time"]]) != 0)[0][0] + 2 |
| 77 | + df = df.loc[first_day_change::] # Remove start-up period from first file |
| 78 | + df_list[0] = df # Update the dataframe with the start-up measurements removed |
| 79 | + |
| 80 | + # Establish the end time of the first simulation, so we can use that to remove start-up |
| 81 | + # periods from the next files |
| 82 | + t_end_prev_simulation = df.iloc[-1]["time"] |
| 83 | + |
| 84 | + for ii in range(1, len(df_list)): |
| 85 | + # For every file after the first, we can see where the previous simulation ended |
| 86 | + # and make sure we remove measurement data of this simulation that happens *before* |
| 87 | + # the latest simulation time of the previous file. Namely, that period is considered |
| 88 | + # the start-up period for this file and should be removed. |
| 89 | + df = df_list[ii].copy() |
| 90 | + dt = df["time"].diff().median() # Average duration between timesteps |
| 91 | + df = df.loc[ |
| 92 | + df["time"] > t_end_prev_simulation + 0.5 * dt |
| 93 | + ] # Only keep timesteps at least 5 minutes past the last measurement from previous |
| 94 | + |
| 95 | + # Update the dataframe with the start-up measurements removed |
| 96 | + df_list[ii] = df |
| 97 | + t_end_prev_simulation = df.iloc[-1]["time"] |
| 98 | + |
| 99 | + # Collect all the outputs together into a single DataFrame and sort it chronologically |
| 100 | + df_out = pd.concat(df_list, axis=0).sort_values(by="time").reset_index(drop=True) |
| 101 | + return df_out |
| 102 | + |
| 103 | + def get_turbine_hub_heights(self): |
| 104 | + """Extract the turbine hub heights from the imported turbine data files. |
| 105 | +
|
| 106 | + Returns: |
| 107 | + hub_heights (np.array): Array with length equal to the number of turbines, containing |
| 108 | + the hub height value for each wind turbine in the ASPIRE simulation. |
| 109 | + """ |
| 110 | + if self.turbine_datasets is None: |
| 111 | + raise UserWarning( |
| 112 | + "Cannot extract turbine hub heights. Please read the files first using " |
| 113 | + "get_turbine_data_as_xarrays()." |
| 114 | + ) |
| 115 | + |
| 116 | + # Extract hub heights from the first file |
| 117 | + hub_heights = np.array(self.turbine_datasets[0]["ztur"], dtype=float) |
| 118 | + return hub_heights |
| 119 | + |
| 120 | + def get_turbine_data_as_xarrays(self): |
| 121 | + """Import the turbine HDF5 file from each .tar.gz file and format them as xarray Datasets. |
| 122 | +
|
| 123 | + Returns: |
| 124 | + turbine_datasets: List of xarray Datasets, one for each tarball file, containing |
| 125 | + the turbine simulation output data. |
| 126 | + """ |
| 127 | + turbine_datasets = [] |
| 128 | + for fn in self.turbine_files: |
| 129 | + # Now read the turbine .nc (HDF5) data file as an xarray |
| 130 | + turbine_dataset = self._read_member_as_xarray(fn) |
| 131 | + turbine_datasets.append(turbine_dataset) |
| 132 | + |
| 133 | + self.turbine_datasets = turbine_datasets |
| 134 | + return turbine_datasets |
| 135 | + |
| 136 | + def get_turbine_data_as_dataframe(self, variables=None): |
| 137 | + """Convert the turbine data that is formatted as xarray Datasets to a single Pandas |
| 138 | + DataFrame that is easy to investigate and do analysis with. |
| 139 | +
|
| 140 | + Args: |
| 141 | + variables (list, optional): List of turbine measurement variables that should be |
| 142 | + exported from the simulation file. Defaults to ["ptur", "ufsf", "vfsf"]. |
| 143 | +
|
| 144 | + Returns: |
| 145 | + df_out (pd.DataFrame): Pandas DataFrame containing the turbine measurement data in a |
| 146 | + wide table format, where there is one column for each turbine and for each variable. |
| 147 | + Each rows depicts the timestamp of one set of measurements. The overlapping time windows |
| 148 | + of about 2 hours between each day of ASPIRE simulation has been removed so that the |
| 149 | + start-up periods are removed and the dataset is monotonically increasing with 10-minute |
| 150 | + timesteps. |
| 151 | + """ |
| 152 | + # Load the turbine data files, if we haven't done that yet |
| 153 | + if self.turbine_datasets is None: |
| 154 | + self.get_turbine_data_as_xarrays() # Get all turbine data as xarrays |
| 155 | + |
| 156 | + # Default options: |
| 157 | + if variables is None: |
| 158 | + if "ufsf" in list(self.turbine_datasets[0].data_vars): |
| 159 | + variables = ["ptur", "ufsf", "vfsf"] |
| 160 | + print(f"WARNING: Using legacy variable naming convention from GRASP, '{variables}'") |
| 161 | + elif "Mdfs" in list(self.turbine_datasets[0].data_vars): |
| 162 | + variables = ["ptur", "Mdfs", "cosangle", "sinangle"] |
| 163 | + else: |
| 164 | + raise UserWarning("Unfamiliar variable naming convention in GRASP datafiles.") |
| 165 | + |
| 166 | + # Convert files one by one to Pandas DataFrames |
| 167 | + df_list = [] |
| 168 | + num_turbines = len(self.turbine_datasets[0].turbine) |
| 169 | + for turbine_dataset in self.turbine_datasets: |
| 170 | + # Convert the data from each file into a 'wide' Pandas DataFrame |
| 171 | + df_turbine = turbine_dataset[variables].to_dataframe().unstack() |
| 172 | + |
| 173 | + # Update column names in wide format following FLASC format |
| 174 | + columns = [] |
| 175 | + for var in variables: |
| 176 | + columns = columns + [f"{var:s}_{ti:03d}" for ti in range(num_turbines)] |
| 177 | + df_turbine.columns = columns |
| 178 | + df_turbine = df_turbine.reset_index() |
| 179 | + |
| 180 | + # Finally, append it to a list that we will stitch together later |
| 181 | + df_list.append(df_turbine) |
| 182 | + |
| 183 | + # Concatenate all the individual dataframes and deal with overlap in timeseries entries |
| 184 | + df_out = self._concatenate_dataframes_and_remove_startup_time(df_list) |
| 185 | + return df_out |
| 186 | + |
| 187 | + def construct_flasc_timeseries(self): |
| 188 | + """ """ |
| 189 | + # First, get the turbine information, if we dont have those in memory yet |
| 190 | + self.get_turbine_data_as_dataframe() |
| 191 | + df_tot = self.get_turbine_data_as_dataframe() |
| 192 | + |
| 193 | + # Calculate turbine wind speed and wind direction, if variables available |
| 194 | + n_turbines = len([c for c in df_tot.columns if c.startswith("ptur_")]) |
| 195 | + df_tot.columns = [c.replace("ptur_", "pow_") for c in df_tot.columns] |
| 196 | + |
| 197 | + dict_out = {} |
| 198 | + for ti in range(n_turbines): |
| 199 | + u = df_tot[f"cosangle_{ti:03d}"] * df_tot[f"Mdfs_{ti:03d}"] |
| 200 | + v = df_tot[f"sinangle_{ti:03d}"] * df_tot[f"Mdfs_{ti:03d}"] |
| 201 | + dict_out[f"ws_{ti:03d}"] = np.sqrt(u**2.0 + v**2.0) |
| 202 | + dict_out[f"wd_{ti:03d}"] = wrap_360(180.0 + np.rad2deg(np.arctan2(u, v))) |
| 203 | + |
| 204 | + # Append local wind speed and direction measurements to the dataframe |
| 205 | + df_tot = pd.concat([df_tot, pd.DataFrame(dict_out)], axis=1) |
| 206 | + df_tot = df_tot.drop(columns=[c for c in df_tot.columns if c.startswith("cosangle_")]) |
| 207 | + df_tot = df_tot.drop(columns=[c for c in df_tot.columns if c.startswith("sinangle_")]) |
| 208 | + df_tot = df_tot.drop(columns=[c for c in df_tot.columns if c.startswith("Mdfs_")]) |
| 209 | + |
| 210 | + return df_tot |
| 211 | + |
| 212 | + |
| 213 | +if __name__ == "__main__": |
| 214 | + # Download files from Zenodo. Note that this may fail in certain VPN |
| 215 | + # environments or with certain SSL certificates. |
| 216 | + root_path = os.path.dirname(os.path.abspath(__file__)) |
| 217 | + data_path = os.path.join(root_path, "data") |
| 218 | + zn_download("10.5281/zenodo.18888663", output_dir=data_path) |
| 219 | + |
| 220 | + # Unzip the LES timeseries data |
| 221 | + path_to_zip_file = os.path.join(data_path, "les_output.zip") |
| 222 | + with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref: |
| 223 | + zip_ref.extractall(data_path) |
| 224 | + |
| 225 | + aspire_turbine_files = glob.glob( |
| 226 | + os.path.join(data_path, "les_output", "2020", "*", "*", "00", "turbinesOut.les.nc") |
| 227 | + ) |
| 228 | + aspire_turbine_files = np.sort(aspire_turbine_files) |
| 229 | + |
| 230 | + # Load them one at a time |
| 231 | + atr = AspireTimeseriesReader(aspire_turbine_filelist=aspire_turbine_files, verbose=True) |
| 232 | + df_turbine = atr.get_turbine_data_as_dataframe() |
| 233 | + df_les_timeseries = atr.construct_flasc_timeseries() |
| 234 | + |
| 235 | + # Resample to 10 min steps |
| 236 | + t0 = ( |
| 237 | + str(df_les_timeseries["time"].iloc[0])[0:17] + "00" |
| 238 | + ) # Create time array rounded to nearest 10-min averages |
| 239 | + t1 = str(df_les_timeseries["time"].iloc[-1])[0:17] + "00" |
| 240 | + time_array = pd.date_range(start=t0, end=t1, freq="10min").tolist() |
| 241 | + df_les_timeseries = df_resample_by_interpolation( |
| 242 | + df=df_les_timeseries, |
| 243 | + time_array=time_array, # Interpolate onto the same timeseries that df_metmast uses |
| 244 | + circular_cols=[ |
| 245 | + c for c in df_les_timeseries.columns if c.startswith("wd_") |
| 246 | + ], # No variables in turbine measurement dataset that require circular averaging |
| 247 | + interp_method="linear", # Use linear interpolation |
| 248 | + max_gap=td( |
| 249 | + minutes=20 |
| 250 | + ), # Do not interpolate over gaps larger than 20 minutes between measurements |
| 251 | + verbose=False, |
| 252 | + ) |
| 253 | + |
| 254 | + # Save as .csv |
| 255 | + fout = os.path.join(root_path, "les_timeseries.csv") |
| 256 | + df_les_timeseries.to_csv(fout, index=False, float_format="%.3f") |
| 257 | + print("Converted LES timeseries data has been saved to: " + fout) |
0 commit comments