diff --git a/.gitignore b/.gitignore index 1b8af0f9..721620e6 100644 --- a/.gitignore +++ b/.gitignore @@ -32,4 +32,5 @@ SMARTEOLE_WakeSteering_ReadMe.xlsx SMARTEOLE_WakeSteering_Map.pdf SMARTEOLE-WFC-open-dataset.zip examples_artificial_data/03_energy_ratio/heterogeneity_layouts.pdf +examples_smarteole/data/SMARTEOLE-LES-simulation-data/data *.pkl diff --git a/examples_smarteole/12_model_les_wake_loss_validation.ipynb b/examples_smarteole/12_model_les_wake_loss_validation.ipynb new file mode 100644 index 00000000..404207a6 --- /dev/null +++ b/examples_smarteole/12_model_les_wake_loss_validation.ipynb @@ -0,0 +1,402 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Benchmark ASPIRE LES and FLORIS CC on cumulative energy & cumulative energy wake loss\n", + "\n", + "In this notebook, we will demonstrate a comparison of Whiffles ASPIRE LES and FLORIS CC models against the measured baseline SCADA data from the Smarteole dataset. We benchmark the models using two metrics: wind-farm-wide cumulative energy production (MWh) and the wind-farm-wide cumulative wake loss (%). The latter metric is equivalent to the wake loss and wake-loss error metrics, 'L' and 'epsilon', from Orsteds large-scale benchmarking paper, described in the article \"Large-scale benchmarking of wake models for offshore wind farms\" by Nygaard et al. While this method benchmarks ASPIRE LES and FLORIS against SCADA, it may also be used to benchmark FLORIS against LES, purely FLORIS against SCADA, or purely LES against SCADA.\n", + "\n", + "\n", + "**The general approach followed in this notebook is**:\n", + "1. Load the cleaned-up SCADA data timeseries.\n", + "2. Generate an equivalent FLORIS data timeseries based on SCADA's freestream wind speeds and wind directions. 
We do this using a precalculated table of solutions and linear interpolation, but for simple wind farms without neighbors users may also directly use the FLORIS timeseries calculation mode instead.* \n", + "3. Generate an equivalent LES data timeseries. There are various decisions one can make here. \n", + "\n", + " a. In this example, we simulated the ASPIRE LES code for the exact same time window as the SCADA data. Since ASPIRE LES uses ERA5, it is highly likely that measured (SCADA) and simulated (LES) freestream conditions are very comparable. Thus, one may not need to apply any modifications and can just directly use the LES timeseries.\n", + "\n", + " b. Alternatively, one may want to force a perfect agreement between the LES-simulated and SCADA-measured timeseries. One can achieve this by binning the LES simulation output into a grid of wind directions and wind speeds, similar to a `df_fm_approx` table for FLORIS. Then, using linear interpolation one can turn this into a LES timeseries where the freestream wind conditions and wind speeds are those measured by SCADA.\n", + "\n", + " We explore both options in the notebook below.\n", + "4. Once all dataframe timeseries are available (SCADA, LES, FLORIS), we must match the missing outliers. Thus, we copy over any NaNs in the SCADA to the LES/FLORIS predictions, and vice versa. This ensures we compare apples to apples.\n", + "5. Finally, we can calculate the relevant metrics: cumulative energy production, and relative wake loss. We do this both for each individual turbine and the wind farm as a whole.\n", + "\n", + "Note that the dataset in this example only contains 3 months of data. 
Hence, the final numbers cannot be interpreted as \"annual energy production\" metrics, but rather only represent the cumulative production for the time period of the data.\n", + "\n", + "*(*if you have neighboring wind farms, then the measured freestream conditions from the SCADA are the wind speeds measured by the turbines of your own wind farm, not necessarily the most upstream turbine of all the machines simulated in case of neighbors. The recommended practice here would then be: 1. Generate a high-resolution grid of FLORIS solutions while modeling all neighbors, 2. Overwrite the 'ws' column in this `df_fm_approx` file with the wind speeds measured by the upstream turbines of your internal wind farm using the `dfm.set_ws_by_upstream_turbs()` function, and 3. interpolate these solutions back to a new regular, monotonic grid.)*\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Suppress warnings\n", + "import warnings\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from flasc.analysis import energy_ratio as er\n", + "from flasc.analysis.analysis_input import AnalysisInput\n", + "from flasc.analysis.cumulative_production_analysis import (\n", + " _mirror_timeseries_nans_in_df_list,\n", + " compare_cumulative_production,\n", + " compare_relative_wake_loss,\n", + ")\n", + "from flasc.data_processing import dataframe_manipulations as dfm, time_operations as tops\n", + "from flasc.data_processing.timeseries_to_grid_solutions import bin_timeseries_to_grid\n", + "from flasc.utilities import floris_tools as ftools\n", + "from flasc.utilities.utilities_examples import load_floris_smarteole as load_floris\n", + "\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 0: Specify user settings and load FLORIS\n", + "Specify turbines we want to use (for reference conditions and 
for benchmarking). We also load a FLORIS model for the wind farm at hand and determine which turbines are upstream for every wind direction." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Specify which turbines to include/exclude. Since it's all baseline operation, we use all turbines.\n", + "include_turbs = [0, 1, 2, 3, 4, 5, 6]\n", + "exclude_turbs = []\n", + "\n", + "# Load FLORIS model of site\n", + "fm, turbine_weights = load_floris()\n", + "\n", + "# Use FLORIS to identify upstream / unwaked turbines for each direction\n", + "df_upstream = ftools.get_upstream_turbs_floris(fm)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 1: Load processed SCADA data\n", + "\n", + "Load the processed SCADA data and assign reference wind directions and wind speeds by the upstream turbines" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def load_data():\n", + " root_path = Path.cwd()\n", + " f = root_path / \"postprocessed\" / \"df_scada_data_60s_filtered_and_northing_calibrated.pkl\"\n", + " df_scada = pd.read_pickle(f)\n", + "\n", + " return df_scada\n", + "\n", + "# Load the data\n", + "df_scada = load_data()\n", + "\n", + "# Calculate some helper variables\n", + "n_turbs = dfm.get_num_turbines(df_scada)\n", + "pow_cols = [f\"pow_{ti:03d}\" for ti in range(n_turbs)]\n", + "print(f\"Number of turbines in dataset: {n_turbs}.\")\n", + "\n", + "# The SCADA data set contains alternating 1-hour periods with baseline or wake steering control.\n", + "# For these examples, we'll limit the data to baseline operation. 
\n", + "df_scada = df_scada[df_scada.control_mode == \"baseline\"].reset_index(drop=True)\n", + "\n", + "# Set the wind direction as the average of all turbine averages\n", + "df_scada = dfm.set_wd_by_turbines(df_scada, include_turbs)\n", + "\n", + "# Set the wind speed to be the average of all upstream turbines\n", + "# (turbines not in a wake in a given direction)\n", + "# Except for SMV5\n", + "df_scada = dfm.set_ws_by_upstream_turbines(df_scada, df_upstream, exclude_turbs=exclude_turbs)\n", + "\n", + "# Scale power columns in SCADA from kW to W to match the FLORIS and LES units\n", + "df_scada[pow_cols] = df_scada[pow_cols] * 1.0e3\n", + "\n", + "# Print information to console\n", + "print(df_scada)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 2: Calculate FLORIS model predictions for the SCADA dataset\n", + "We first calculate the FLORIS model predictions for the SCADA dataset using our precalculated grid of FLORIS solutions. We only look at the Cumulative Curl (CC) model here, but can readily also use other wake models." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Grab a FLORIS solution and turn it into a timeseries\n", + "wake_model = \"cc\"\n", + "fn = Path.cwd() / \"precalculated_floris_solutions\" / \"df_fi_approx_{:s}.ftr\".format(wake_model)\n", + "if fn.is_file():\n", + " df_fm_approx = pd.read_feather(fn)\n", + "else:\n", + " raise UserWarning(\n", + " \"Please run '01_precalculate_floris_solutions.ipynb' \"\n", + " \"for the appropriate wake models first.\"\n", + " )\n", + "\n", + "df_floris_timeseries = ftools.interpolate_floris_from_df_approx(\n", + " df=df_scada, df_approx=df_fm_approx, method=\"linear\", verbose=True\n", + ")\n", + "print(df_floris_timeseries)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step 3: Calculate ASPIRE LES model predictions for the SCADA dataset\n", + "We calculate the ASPIRE LES model predictions for the SCADA dataset. Since the ASPIRE LES timeseries matches the exact same time window as SCADA, we can readily use this dataset for a one-to-one comparison." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load LES timeseries in preparation for validation against SCADA.\n", + "# We will try various validation metrics and approaches...\n", + "fn = Path.cwd() / \"data\" / \"SMARTEOLE-LES-simulation-data\" / \"les_timeseries.csv\"\n", + "if not fn.exists():\n", + " raise UserWarning(\"LES timeseries data not found. 
Please run 'examples_smarteole/SMARTEOLE-\"\n", + " +\"LES-simulation-data/import_aspire_les_data.py' first to download and process the LES data.\")\n", + "df_les_timeseries_raw = pd.read_csv(fn)\n", + "df_les_timeseries_raw[\"time\"] = pd.to_datetime(df_les_timeseries_raw[\"time\"])\n", + "\n", + "# Assign freestream ambient conditions in the same fashion as we did for the SCADA\n", + "df_les_timeseries_raw = dfm.set_wd_by_turbines(df_les_timeseries_raw, turbine_numbers=include_turbs)\n", + "df_les_timeseries_raw = dfm.set_ws_by_upstream_turbines(\n", + " df_les_timeseries_raw,\n", + " df_upstream,\n", + " exclude_turbs=exclude_turbs\n", + ")\n", + "df_les_timeseries_raw[\"ti\"] = 0.10 # Placeholder assumption for TI\n", + "\n", + "# Map raw LES timeseries directly to the SCADA timeseries using linear interpolation\n", + "df_les_raw_resampled = tops.df_resample_by_interpolation(\n", + " df=df_les_timeseries_raw,\n", + " time_array=pd.to_datetime(df_scada[\"time\"]),\n", + " circular_cols=[c for c in df_les_timeseries_raw.columns if c.startswith(\"wd\")],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Steps 3a-5: compare SCADA and LES timeseries directly (without rebinning)\n", + "\n", + "In this first method, we make the assumption that the underlying wind resource in LES accurately represents the true wind conditions experienced in the SCADA. This is generally a fair assumption to make since LES relies on the ERA5 dataset. We find a fairly high error in the cumulative production (LES: -5.03% error, CC: -2.29% error). This is strange: we find that our simulation models predict a *lower* cumulative energy production than what the SCADA reports. 
Namely, we typically expect the other way around, since FLORIS and LES assume idealized conditions and ignore any technical operational losses (e.g., outer-range losses) which commonly cause losses of over 2%.\n", + "\n", + "One likely explanation for this large offset in cumulative production may be that the true wind speeds are higher than those reported by the wind turbine sensors (for CC) and by ERA5 (for LES). Generally, the high sensitivity of the cumulative production to the assumed freestream wind speeds makes it a relatively poor metric to benchmark wake models specifically. Instead, we prefer looking at the relative wake loss as also used by Nygaard et al., shown in the second table. This table shows excellent performance of the LES model, with a very small error of 0.08 p.p. The FLORIS CC model shows a larger error but still reasonable performance with 0.76 p.p. wake loss error." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Now being on the same timeseries, map NaNs from SCADA to LES\n", + "df_les = df_les_raw_resampled.copy()\n", + "\n", + "# Benchmark model performance: calculate and show cumulative production score\n", + "table_absolute_aep_dict = compare_cumulative_production(\n", + " df_list=[df_scada, df_les, df_floris_timeseries],\n", + " exclude_turbs=exclude_turbs,\n", + " model_tags=[\"SCADA\", \"LES\", \"FLORIS CC\"],\n", + ")\n", + "\n", + "# Benchmark model performance: calculate and show relative wake loss score\n", + "table_wakeloss_aep_dict = compare_relative_wake_loss(\n", + " df_list=[df_scada, df_les, df_floris_timeseries],\n", + " df_upstream=df_upstream,\n", + " exclude_turbs=exclude_turbs,\n", + " model_tags=[\"SCADA\", \"LES\", \"FLORIS CC\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Steps 3b-5: Compare SCADA with remapped LES timeseries. 
\n", + "\n", + "In this second method, we make the assumption that the underlying wind resource in LES does not necessarily accurately represents the true wind conditions experienced in the SCADA. Hence, we resolve this by first binning the LES timeseries into a grid of solutions across wind directions and wind speeds, after which that is used to interpolate to a new timeseries in which the freestream conditions are those of SCADA. This allows for a 'perfect' mapping between the LES and SCADA freestream conditions. We put 'perfect' in quotation marks, because it essentially means we 100% trust the wind conditions reported by SCADA to accurately represent the true background flow. This is not always an entirely fair assumption to make, since a 1%+ bias in the turbine's wind speed sensors is not uncommon. Yet, it also corrects for differences in the wind directions.\n", + "\n", + "We again show the cumulative production errors and wake loss errors. Generally, the cumulative production may again not be very reliable. Actually, we see a *bigger* error than in step 3a, now having a +11% error in cumulative production from a -5% error before. This likely indicates that the LES turbine-reported wind speeds are too low (perhaps a modelling issue), causing a significant overcompensation when scaled to the SCADA wind speeds. We also see a slight change in the wake loss error, from -0.08% to -0.40%. While the change is not zero, it remains quite small and further puts confidence in the wake loss error metric, which significantly reduces sensitivity to the wind resource in comparison to the cumulative production metric." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # We start with the original timeseries, mapped onto the same time vector as the SCADA\n", + "\n", + "# Copy NaNs over from the original SCADA to LES. 
This is beneficial because, for example,\n", + "# SCADA may have a lot of NaNs in particular seasons, weighing SCADA towards e.g. the\n", + "# summer. If we take the full LES timeseries without removing NaNs, we may be unfairly\n", + "# penalizing the LES while the underlying reason is a mismatch is seasonality.\n", + "df_list = _mirror_timeseries_nans_in_df_list(df_list=[df_scada.copy(), df_les_raw_resampled.copy()])\n", + "df_les = df_list[1]\n", + "\n", + "# We then turn the LES timeseries into a steady-state table by binning the LES data in\n", + "# wind direction and wind speed bins, and taking the average power in each bin as the\n", + "# steady-state power for that bin. This is the LES-equivalent of a `df_fm_approx` table,\n", + "# which we can then use to remap the LES timeseries to have the same inflow conditions as\n", + "# SCADA, and then compare the remapped LES timeseries directly to SCADA.\n", + "df_approx_les_steadystate_table = bin_timeseries_to_grid(\n", + " df_timeseries=df_les,\n", + " wd_step=5.0, # Ideally as low as pos, but increase to get enough samples in each bin\n", + " wd_min=0.0, # Cover entire wind rose\n", + " wd_max=360.0, # Cover entire wind rose\n", + " ws_step=1.0, # Ideally as low as pos, but increase to get enough samples in each bin\n", + " ws_min=0.0, # Take a very large range of wind speeds to cover all possible conditions\n", + " ws_max=50.0, # Take a very large range of wind speeds to cover all possible conditions\n", + " N_min=3, # Min. n.o. 
of samples within a bin for it to be valid for df_fm_approx table\n", + " plot=True, # Generate coverage plot: show how many samples we have in each wd/ws bin\n", + ")\n", + "\n", + "# Use this steady-state table to produce a new LES timeseries with identical inflow as SCADA\n", + "df_les_timeseries_inflowremapped = ftools.interpolate_floris_from_df_approx(\n", + " df=df_scada, df_approx=df_approx_les_steadystate_table, method=\"linear\", verbose=False\n", + ")\n", + "\n", + "# And finally, we can compare this inflow-remapped timeseries directly to SCADA.\n", + "# This may change the cumulative production errors substantially, but likely will leave\n", + "# the relative wake loss percentages mostly intact. Lets look at both.\n", + "\n", + "# Benchmark model performance: show performance for cumulative production\n", + "print(\"\\n\")\n", + "table_absolute_aep_dict = compare_cumulative_production(\n", + " df_list=[df_scada, df_les_timeseries_inflowremapped, df_floris_timeseries],\n", + " exclude_turbs=exclude_turbs,\n", + " model_tags=[\"SCADA\", \"LES (inflow remapped to SCADA)\", \"FLORIS CC\"],\n", + " print_to_console=True,\n", + " )\n", + "\n", + "# Benchmark model performance: calculate and show relative wake loss score\n", + "table_wakeloss_aep_dict = compare_relative_wake_loss(\n", + " df_list=[df_scada, df_les_timeseries_inflowremapped, df_floris_timeseries],\n", + " df_upstream=df_upstream,\n", + " exclude_turbs=exclude_turbs,\n", + " model_tags=[\"SCADA\", \"LES (inflow remapped to SCADA)\", \"FLORIS CC\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Extra: Plot wind-farm energy ratio\n", + "It generally is a good idea to check the energy ratios of the wind farm to ensure we are not making a mistake anywhere. We should roughly see agreement in where the largest wake dips occur. This is the case. We do not draw any further quantitative conclusions from these plots. 
And indeed, we see good qualitative agreement between the SCADA and the models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Specify upstream turbines in same fashion as for the SCADA data\n", + "df_er_list = [\n", + " df_scada,\n", + " df_floris_timeseries,\n", + " df_les_timeseries_raw,\n", + " df_les_timeseries_inflowremapped\n", + "]\n", + "df_er_list = [df.copy() for df in df_er_list] # Independent copy\n", + "for df in df_er_list:\n", + " df = dfm.set_pow_ref_by_upstream_turbines(df, df_upstream, exclude_turbs=exclude_turbs)\n", + "\n", + "# Initialize energy ratio class and perform common operations. E.g., let's look at turbine 4.\n", + "model_tags = [\n", + " \"SCADA data\",\n", + " f\"FLORIS: {wake_model}\",\n", + " \"LES (original LES inflow)\",\n", + " \"LES (inflow remapped to SCADA)\"\n", + " ]\n", + "a_in = AnalysisInput(df_er_list, model_tags)\n", + "\n", + "N = 20\n", + "print(\"Calculating energy ratios with bootstrapping (N={}).\".format(N))\n", + "print(\"This may take a couple seconds...\")\n", + "np.random.seed(0)\n", + "er_out = er.compute_energy_ratio(\n", + " a_in,\n", + " test_turbines=include_turbs,\n", + " use_predefined_ref=True,\n", + " use_predefined_wd=True,\n", + " use_predefined_ws=True,\n", + " wd_step=5.0,\n", + " wd_bin_overlap_radius=0.0,\n", + " ws_min=6.0,\n", + " ws_max=12.0,\n", + " N=N,\n", + " percentiles=[5.0, 95.0],\n", + ")\n", + "ax = er_out.plot_energy_ratios(overlay_frequency=True)\n", + "ax[0].set_title(f\"Energy Ratios for turbines {include_turbs}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "confide", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + }, + "toc": { + 
"base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples_smarteole/data/SMARTEOLE-LES-simulation-data/import_aspire_les_data.py b/examples_smarteole/data/SMARTEOLE-LES-simulation-data/import_aspire_les_data.py new file mode 100644 index 00000000..ceae8930 --- /dev/null +++ b/examples_smarteole/data/SMARTEOLE-LES-simulation-data/import_aspire_les_data.py @@ -0,0 +1,257 @@ +import glob +import os +import zipfile +from datetime import timedelta as td + +import numpy as np +import pandas as pd +import xarray as xr +from floris.utilities import wrap_360 +from zenodo_get import download as zn_download + +from flasc.data_processing.time_operations import df_resample_by_interpolation + + +class AspireTimeseriesReader: + """This class is used to read the output .tar.gz files from Whiffle and export + them in various formats. + """ + + def __init__(self, aspire_metmast_filelist=[], aspire_turbine_filelist=[], verbose=False): + """Initialize the class. + + Args: + aspire_filelist (list): List of strings, where each entry of the list defines the + path to one of the ASPIRE nc files. + """ + self.metmast_files = aspire_metmast_filelist + self.turbine_files = aspire_turbine_filelist + self.turbine_datasets = None + self.metmast_datasets = None + self.verbose = verbose + + # Private functions + def _read_member_as_xarray(self, fn): + """Read the contents of one of the files within the .tar.gz file as an xarray DataSet. + + Args: + fn (str): Filename refering to a turbine ASPIRE datafile. + + Returns: + dataset (xarray.Dataset): Contents of the HDF5 file imported as an xarray Dataset. 
+ """ + # Now read the turbine .nc (HDF5) data file as an xarray and convert to a Pandas DataFrame + dataset = xr.open_dataset(fn).load() # Load it into xarray format + dataset.close() # Close dataset after reading to avoid conflicts + + if self.verbose: + print(f"Successfully imported the contents of {os.path.basename(fn)}.") + + return dataset + + def _concatenate_dataframes_and_remove_startup_time(self, df_list): + """Typically, ASPIRE simulations are performed one day at a time, including several hours of + simulation start-up. This means that each day runs for about 26 hours, and that means there + is an overlap window of about 2 hours between simulations. The start-up period, usually the + first two hours of the simulation, must be removed so that each datafile contains exactly + 24 hours of data. This script removes the startup periods and concatenates the Pandas + DataFrames into a single file. + + Args: + df_list (list): List where each entry is a Pandas DataFrame containing the simulation + data for one day from ASPIRE. These datasets typically start about 2 hours before the + actual day of simulation. These 2 hours will be removed so that it perfectly connects + with the simulation data of the previous day. + + Returns: + df_out (pd.DataFrame): Pandas DataFrame containing multiple days of simulation data, + with the start-up periods and overlapping measurement times removed. + """ + # Here we stitch dataframes together while removing start-up periods + + # The dataset usually starts a couple hours before midnight in the day before the + # actual day of the simulation. That is the start-up period. We must remove that + # period from the dataset. Let's do this for the first dataframe separately. 
+ df = df_list[0].copy() + first_day_change = np.where(np.diff([t.day for t in df["time"]]) != 0)[0][0] + 2 + df = df.loc[first_day_change::] # Remove start-up period from first file + df_list[0] = df # Update the dataframe with the start-up measurements removed + + # Establish the end time of the first simulation, so we can use that to remove start-up + # periods from the next files + t_end_prev_simulation = df.iloc[-1]["time"] + + for ii in range(1, len(df_list)): + # For every file after the first, we can see where the previous simulation ended + # and make sure we remove measurement data of this simulation that happens *before* + # the latest simulation time of the previous file. Namely, that period is considered + # the start-up period for this file and should be removed. + df = df_list[ii].copy() + dt = df["time"].diff().median() # Average duration between timesteps + df = df.loc[ + df["time"] > t_end_prev_simulation + 0.5 * dt + ] # Only keep timesteps at least 5 minutes past the last measurement from previous + + # Update the dataframe with the start-up measurements removed + df_list[ii] = df + t_end_prev_simulation = df.iloc[-1]["time"] + + # Collect all the outputs together into a single DataFrame and sort it chronologically + df_out = pd.concat(df_list, axis=0).sort_values(by="time").reset_index(drop=True) + return df_out + + def get_turbine_hub_heights(self): + """Extract the turbine hub heights from the imported turbine data files. + + Returns: + hub_heights (np.array): Array with length equal to the number of turbines, containing + the hub height value for each wind turbine in the ASPIRE simulation. + """ + if self.turbine_datasets is None: + raise UserWarning( + "Cannot extract turbine hub heights. Please read the files first using " + "get_turbine_data_as_xarrays()." 
+ ) + + # Extract hub heights from the first file + hub_heights = np.array(self.turbine_datasets[0]["ztur"], dtype=float) + return hub_heights + + def get_turbine_data_as_xarrays(self): + """Import the turbine HDF5 file from each .tar.gz file and format them as xarray Datasets. + + Returns: + turbine_datasets: List of xarray Datasets, one for each tarball file, containing + the turbine simulation output data. + """ + turbine_datasets = [] + for fn in self.turbine_files: + # Now read the turbine .nc (HDF5) data file as an xarray + turbine_dataset = self._read_member_as_xarray(fn) + turbine_datasets.append(turbine_dataset) + + self.turbine_datasets = turbine_datasets + return turbine_datasets + + def get_turbine_data_as_dataframe(self, variables=None): + """Convert the turbine data that is formatted as xarray Datasets to a single Pandas + DataFrame that is easy to investigate and do analysis with. + + Args: + variables (list, optional): List of turbine measurement variables that should be + exported from the simulation file. Defaults to ["ptur", "ufsf", "vfsf"]. + + Returns: + df_out (pd.DataFrame): Pandas DataFrame containing the turbine measurement data in a + wide table format, where there is one column for each turbine and for each variable. + Each rows depicts the timestamp of one set of measurements. The overlapping time windows + of about 2 hours between each day of ASPIRE simulation has been removed so that the + start-up periods are removed and the dataset is monotonically increasing with 10-minute + timesteps. 
+ """ + # Load the turbine data files, if we haven't done that yet + if self.turbine_datasets is None: + self.get_turbine_data_as_xarrays() # Get all turbine data as xarrays + + # Default options: + if variables is None: + if "ufsf" in list(self.turbine_datasets[0].data_vars): + variables = ["ptur", "ufsf", "vfsf"] + print(f"WARNING: Using legacy variable naming convention from GRASP, '{variables}'") + elif "Mdfs" in list(self.turbine_datasets[0].data_vars): + variables = ["ptur", "Mdfs", "cosangle", "sinangle"] + else: + raise UserWarning("Unfamiliar variable naming convention in GRASP datafiles.") + + # Convert files one by one to Pandas DataFrames + df_list = [] + num_turbines = len(self.turbine_datasets[0].turbine) + for turbine_dataset in self.turbine_datasets: + # Convert the data from each file into a 'wide' Pandas DataFrame + df_turbine = turbine_dataset[variables].to_dataframe().unstack() + + # Update column names in wide format following FLASC format + columns = [] + for var in variables: + columns = columns + [f"{var:s}_{ti:03d}" for ti in range(num_turbines)] + df_turbine.columns = columns + df_turbine = df_turbine.reset_index() + + # Finally, append it to a list that we will stitch together later + df_list.append(df_turbine) + + # Concatenate all the individual dataframes and deal with overlap in timeseries entries + df_out = self._concatenate_dataframes_and_remove_startup_time(df_list) + return df_out + + def construct_flasc_timeseries(self): + """ """ + # First, get the turbine information, if we dont have those in memory yet + self.get_turbine_data_as_dataframe() + df_tot = self.get_turbine_data_as_dataframe() + + # Calculate turbine wind speed and wind direction, if variables available + n_turbines = len([c for c in df_tot.columns if c.startswith("ptur_")]) + df_tot.columns = [c.replace("ptur_", "pow_") for c in df_tot.columns] + + dict_out = {} + for ti in range(n_turbines): + u = df_tot[f"cosangle_{ti:03d}"] * df_tot[f"Mdfs_{ti:03d}"] + v = 
df_tot[f"sinangle_{ti:03d}"] * df_tot[f"Mdfs_{ti:03d}"] + dict_out[f"ws_{ti:03d}"] = np.sqrt(u**2.0 + v**2.0) + dict_out[f"wd_{ti:03d}"] = wrap_360(180.0 + np.rad2deg(np.arctan2(u, v))) + + # Append local wind speed and direction measurements to the dataframe + df_tot = pd.concat([df_tot, pd.DataFrame(dict_out)], axis=1) + df_tot = df_tot.drop(columns=[c for c in df_tot.columns if c.startswith("cosangle_")]) + df_tot = df_tot.drop(columns=[c for c in df_tot.columns if c.startswith("sinangle_")]) + df_tot = df_tot.drop(columns=[c for c in df_tot.columns if c.startswith("Mdfs_")]) + + return df_tot + + +if __name__ == "__main__": + # Download files from Zenodo. Note that this may fail in certain VPN + # environments or with certain SSL certificates. + root_path = os.path.dirname(os.path.abspath(__file__)) + data_path = os.path.join(root_path, "data") + zn_download("10.5281/zenodo.18888663", output_dir=data_path) + + # Unzip the LES timeseries data + path_to_zip_file = os.path.join(data_path, "les_output.zip") + with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref: + zip_ref.extractall(data_path) + + aspire_turbine_files = glob.glob( + os.path.join(data_path, "les_output", "2020", "*", "*", "00", "turbinesOut.les.nc") + ) + aspire_turbine_files = np.sort(aspire_turbine_files) + + # Load them one at a time + atr = AspireTimeseriesReader(aspire_turbine_filelist=aspire_turbine_files, verbose=True) + df_turbine = atr.get_turbine_data_as_dataframe() + df_les_timeseries = atr.construct_flasc_timeseries() + + # Resample to 10 min steps + t0 = ( + str(df_les_timeseries["time"].iloc[0])[0:17] + "00" + ) # Create time array rounded to nearest 10-min averages + t1 = str(df_les_timeseries["time"].iloc[-1])[0:17] + "00" + time_array = pd.date_range(start=t0, end=t1, freq="10min").tolist() + df_les_timeseries = df_resample_by_interpolation( + df=df_les_timeseries, + time_array=time_array, # Interpolate onto the same timeseries that df_metmast uses + circular_cols=[ + c for c in 
df_les_timeseries.columns if c.startswith("wd_") + ], # No variables in turbine measurement dataset that require circular averaging + interp_method="linear", # Use linear interpolation + max_gap=td( + minutes=20 + ), # Do not interpolate over gaps larger than 20 minutes between measurements + verbose=False, + ) + + # Save as .csv + fout = os.path.join(root_path, "les_timeseries.csv") + df_les_timeseries.to_csv(fout, index=False, float_format="%.3f") + print("Converted LES timeseries data has been saved to: " + fout) diff --git a/flasc/analysis/cumulative_production_analysis.py b/flasc/analysis/cumulative_production_analysis.py new file mode 100644 index 00000000..d0d813a4 --- /dev/null +++ b/flasc/analysis/cumulative_production_analysis.py @@ -0,0 +1,400 @@ +"""Compare energy production and wake losses across dataframes.""" + +import numpy as np +import pandas as pd + +from flasc.data_processing import dataframe_manipulations as dfm + + +def _err(old_val, new_val): + """Calculate percentage error between old and new value. + + Note that this is not + a symmetric error metric, and should be interpreted as the percentage change from + old_val to new_val. + """ + return 100.0 * (new_val - old_val) / old_val + + +def _wake_loss(cumprod_waked, cumprod_unwaked): + """Calculate wake loss percentage between waked and unwaked cumulative production. + + Note that this is not a symmetric metric, and should be interpreted as the percentage loss from + unwaked to waked production. + """ + return 100.0 * (cumprod_unwaked - cumprod_waked) / cumprod_unwaked + + +def _print_pretty_table(table_dict, title): + """Print the given table dictionary in a pretty format using markdown. + + The title is centered above the table. + + Args: + table_dict (dict): Dictionary containing the table data. + title (str): Title to be displayed above the table. 
def _print_pretty_table(table_dict, title):
    """Print the given table dictionary in a pretty format using markdown.

    The title is centered above the table.

    Args:
        table_dict (dict): Dictionary containing the table data.
        title (str): Title to be displayed above the table.

    Returns:
        None
    """
    df_table = pd.DataFrame(table_dict)
    mrkdwn = df_table.to_markdown(headers="keys", tablefmt="psql", index=False, floatfmt=".2f")
    # Center the title over the first row of the rendered table
    spc = int(np.floor(len(mrkdwn.split("\n")[0]) / 2 - len(title) / 2))
    mrkdwn = (" " * spc + title + "\n") + mrkdwn + "\n"
    print(mrkdwn)


def _mirror_timeseries_nans_in_df_list(df_list: list, verbose: bool = False) -> list:
    """Mirror NaNs in 'pow_{id:03d}' columns across a list of dataframes.

    Args:
        df_list (list): A list of DataFrames containing SCADA, LES or FLORIS model timeseries with
            power production columns, pow_000, pow_001, etc.
        verbose (bool, optional): If True, print the number of NaNs in each dataframe.
            Defaults to False.

    Returns:
        list: The list of DataFrames with copied-over NaNs. Each dataframe has an equal number of
            NaNs in the power columns, and these NaNs are located at the same timestamps across all
            dataframes.
    """
    # Helper variables
    n_turbs = dfm.get_num_turbines(df_list[0])
    pow_cols = [f"pow_{ti:03d}" for ti in range(n_turbs)]

    # First ensure consistent NaN mapping between modelled data and SCADA timeseries
    for ti in range(n_turbs):
        col = f"pow_{ti:03d}"

        # For each individual turbine, identify all timestamps where any of the
        # timeseries have NaN values, and create a combined boolean mask.
        # Using the boolean mask directly (instead of positional indices via
        # np.where + label-based .loc) keeps this correct even when the input
        # dataframes do not carry a default RangeIndex.
        mask_nan = np.zeros(len(df_list[0]), dtype=bool)
        for df in df_list:
            mask_nan |= df[col].isna().to_numpy()

        # Mirror NaNs across all timeseries
        for df in df_list:
            df.loc[mask_nan, col] = np.nan

    if verbose:
        # Assert NaNs are identical between dataframes
        n_nans_per_timeseries = [df[pow_cols].isna().sum().sum() for df in df_list]
        print(f"NaNs in df_list power columns: {n_nans_per_timeseries}")

    return df_list
def _prepare_df_list(df_list: list, model_tags: list, exclude_turbs: list, ws_range: list):
    """Prepare list with timeseries dataframes for cumulative production and rel. wake loss.

    Validates the inputs, applies the wind speed filter, blanks out excluded turbines and
    mirrors NaNs across all dataframes so every timeseries covers the same valid measurements.

    Args:
        df_list (list): List of Pandas DataFrame timeseries. The first entry is typically SCADA or
            LES, and the remaining entries are models to compare.
        model_tags (list): List of string tags for the models, used in error messages.
        exclude_turbs (list): List of turbines to exclude from the analysis, e.g.,
            because of poor performance or odd behavior.
        ws_range (list): Wind speed range for filtering the data. When inspecting wake
            losses, one may want to zoom into the relevant wind speed range, typically between
            6 and 14 m/s. This also allows you to inspect the model performance for different
            wind speed regions.

    Raises:
        ValueError: If input timeseries dataframes in df_list have different timestamps.
        ValueError: If input timeseries dataframes in df_list have different number of turbines.
        ValueError: If input timeseries dataframes in df_list already contain a 'pow_ref' column.

    Returns:
        tuple: ``(df_list, n_measurements_per_hour)``. ``df_list`` is a list of filtered,
            NaN-mirrored copies of the inputs; ``n_measurements_per_hour`` is the sampling
            rate inferred from the first two timestamps of the first dataframe.
    """
    # Check input dataframes are consistent in terms of number of turbines and timestamps
    if not all([all(df_list[0]["time"] == df["time"]) for df in df_list]):
        raise ValueError(
            "Input dataframes have different timestamps. Please ensure all dataframes have the "
            "same timestamps."
        )

    n_turbs_list = [dfm.get_num_turbines(df) for df in df_list]
    if len(set(n_turbs_list)) > 1:
        raise ValueError(
            f"Input dataframes have different number of turbines: {n_turbs_list}. Please ensure "
            "all dataframes have the same number of turbines."
        )

    # 'pow_ref' must not exist yet: it is assigned downstream, after NaN mirroring
    for dfii, df in enumerate(df_list):
        if "pow_ref" in df.columns:
            raise ValueError(
                f"Input dataframe[{dfii}] for {model_tags[dfii]} may not contain 'pow_ref' column. "
                "This may only happen AFTER mirroring NaNs between dataframes and will be done "
                "automatically."
            )

    # Make local copies of dataframes that we can manipulate
    df_list = [df.copy() for df in df_list]

    # Apply wind speed filter; the mask is derived from the FIRST dataframe's 'ws' column
    N_b = (
        ~df_list[0]["ws"].isna()
    ).sum()  # Number of valid entries in first df before wind speed masking
    ids_within_ws_range = (df_list[0]["ws"] >= ws_range[0]) & (df_list[0]["ws"] <= ws_range[1])
    for dfii in range(len(df_list)):
        df_list[dfii] = df_list[dfii].loc[ids_within_ws_range].reset_index(drop=True)

    N_a = (
        ~df_list[0]["ws"].isna()
    ).sum()  # Number of valid entries in first df after wind speed masking
    if N_a < N_b:
        print(
            f"Masking down to wind speed range {ws_range[0]:.1f} m/s to {ws_range[1]:.1f} m/s. "
            + f"Number of measurements before: {N_b}, after: {N_a}. Percentage: "
            + f"{100.0 * N_a / N_b:.1f}%."
        )

    # Apply simple NaN rule for every excluded turbine to avoid annoying edge cases in the analysis
    for ti in exclude_turbs:
        for dfii in range(len(df_list)):
            df_list[dfii][f"pow_{ti:03d}"] = None

    # Mirror NaNs across dataframes to ensure consistent NaN mapping between modelled data and SCADA
    # timeseries. This is important to ensure that we are comparing production and wake losses over
    # the same timestamps, and that we are not unfairly penalizing models for having values where
    # SCADA has NaNs (e.g. due to turbine downtime).
    df_list = _mirror_timeseries_nans_in_df_list(df_list, verbose=False)

    # Check if our dataset is a whole multiple of 8760 hours (1 year), if not, raise a warning that
    # the AEP calculation is not exact and that we are comparing cumulative energy rather than AEP.
    # This is just a sanity check, not a strict requirement.
    # NOTE(review): assumes the filtered dataframes contain at least two rows and that the
    # sampling interval is constant (it is inferred from the first two timestamps only) -- confirm.
    t0 = df_list[0].iloc[0]["time"]
    t1 = df_list[0].iloc[-1]["time"]
    timespan = t1 - t0
    n_measurements_per_hour = np.timedelta64(1, "h") / (
        df_list[0].iloc[1]["time"] - df_list[0].iloc[0]["time"]
    )
    n_hours_total = timespan / np.timedelta64(1, "h")
    n_weeks = n_hours_total / 168
    n_hours_in_year = 8760
    avg_no_years_in_data = round(n_hours_total / n_hours_in_year)

    # Minimum of 1 year needed to have cumulative production be comparable to AEP
    avg_no_years_in_data = np.max([avg_no_years_in_data, 1])

    offset_prct = (
        100.0 * (n_hours_total - avg_no_years_in_data * n_hours_in_year) / n_hours_in_year
    )  # Offset percentage from whole number of years
    if np.abs(offset_prct) > 1.0:
        print(
            f"WARNING: Dataset spans {n_weeks:.1f} weeks and is not a whole multiple of 8760 hours "
            + "(1 year).\nNote the reported numbers therefore do not represent AEP. Instead, they "
            + "represent \na form of cumulative production. Offset from whole number of years: "
            + f"{offset_prct:+.2f} %."
        )

    return df_list, n_measurements_per_hour
def compare_cumulative_production(
    df_list,
    exclude_turbs=None,
    ws_range=(0.0, 99.0),
    model_tags=None,
    print_to_console=True,
):
    """Compare cumulative energy production between dataframes.

    Calculate the cumulative energy production for a list of Pandas DataFrame timeseries.
    Then, calculate the error between the first timeseries in the list (typically SCADA
    or LES) and the remaining timeseries (typically LES and/or FLORIS models).

    Args:
        df_list (list): List of Pandas DataFrame timeseries. The first entry is typically SCADA or
            LES, and the remaining entries are models to compare.
        exclude_turbs (list, optional): List of turbines to exclude from the analysis, e.g.,
            because of poor performance or odd behavior. Defaults to None (no exclusions).
        ws_range (tuple, optional): Wind speed range for filtering the data. When inspecting wake
            losses, one may want to zoom into the relevant wind speed range, typically between
            6 and 14 m/s. This also allows you to inspect the model performance for different
            wind speed regions. Defaults to (0.0, 99.0).
        model_tags (list, optional): List of string tags for the models. Defaults to None, which
            will generate tags as "Model 0", "Model 1", etc.
        print_to_console (bool, optional): Whether to print the results to the console.
            Defaults to True.

    Raises:
        ValueError: If input timeseries dataframes in df_list have different timestamps.
        ValueError: If input timeseries dataframes in df_list have different number of turbines.
        ValueError: If input timeseries dataframes in df_list already contain a 'pow_ref' column.

    Returns:
        table_absolute_cumprod_dict: Dictionary containing the absolute cumulative production
            numbers, including errors w.r.t. the first dataframe.
    """
    # Avoid mutable default arguments: normalize None to concrete values here
    if exclude_turbs is None:
        exclude_turbs = []
    if model_tags is None:
        model_tags = [f"Model {ti}" for ti in range(len(df_list))]

    # Check inputs and format df_list as needed (ws filtering, NaN mirroring)
    df_list, n_measurements_per_hour = _prepare_df_list(
        df_list, model_tags, exclude_turbs, ws_range
    )

    # Helper variables
    n_turbs = dfm.get_num_turbines(df_list[0])
    pow_cols = [f"pow_{ti:03d}" for ti in range(n_turbs)]

    # Absolute cumulative energy production for the entire farm and per turbine.
    # NOTE(review): the 1.0e-6 factor implies the power columns are in W so the
    # result is in MWh -- confirm units against the data source.
    cumprod_turbine_list = [
        [(df[pc].sum() * 1.0e-6 / n_measurements_per_hour) for pc in pow_cols] for df in df_list
    ]
    cumprod_farm_list = [np.sum(cumprod_tm) for cumprod_tm in cumprod_turbine_list]

    # First table column: farm total followed by one row per turbine
    table_absolute_cumprod_dict = {
        "Selection": ["Entire farm"] + [f"Turbine {ti:02d}" for ti in range(n_turbs)],
    }
    for mii, model_tag in enumerate(model_tags):
        table_absolute_cumprod_dict[f"{model_tag} (MWh)"] = [
            cumprod_farm_list[mii]
        ] + cumprod_turbine_list[mii]
        if mii > 0:  # Compare against first entry (usually SCADA or LES) and calculate errors
            table_absolute_cumprod_dict[f"{model_tag} error (%)"] = [
                _err(cumprod_farm_list[0], cumprod_farm_list[mii])
            ] + [_err(x, y) for x, y in zip(cumprod_turbine_list[0], cumprod_turbine_list[mii])]

    # Finally print
    if print_to_console:
        t0 = df_list[0].iloc[0]["time"]
        t1 = df_list[0].iloc[-1]["time"]
        c = (
            f"Data from {t0.strftime('%Y-%m-%d')} to {t1.strftime('%Y-%m-%d')}; Wind speeds "
            + f"{ws_range[0]:.1f} m/s to {ws_range[1]:.1f} m/s"
        )
        print("")
        _print_pretty_table(
            table_absolute_cumprod_dict, title=f"Absolute cumulative energy (MWh); {c}"
        )

    return table_absolute_cumprod_dict
def compare_relative_wake_loss(
    df_list,
    df_upstream,
    exclude_turbs=None,
    ws_range=(0.0, 99.0),
    model_tags=None,
    print_to_console=True,
):
    """Compare relative wake losses between dataframes.

    Calculate the relative wake loss for a list of Pandas DataFrame timeseries. Then, calculate
    the error between the first timeseries in the list (typically SCADA or LES) and the remaining
    timeseries (typically LES and/or FLORIS models).

    Args:
        df_list (list): List of Pandas DataFrame timeseries. The first entry is typically SCADA or
            LES, and the remaining entries are models to compare.
        df_upstream (Pandas DataFrame): Upstream data for reference, generated using
            'ftools.get_upstream_turbs_floris()'
        exclude_turbs (list, optional): List of turbines to exclude from the analysis, e.g.,
            because of poor performance or odd behavior. Defaults to None (no exclusions).
        ws_range (tuple, optional): Wind speed range for filtering the data. When inspecting wake
            losses, one may want to zoom into the relevant wind speed range, typically between
            6 and 14 m/s. This also allows you to inspect the model performance for different
            wind speed regions. Defaults to (0.0, 99.0).
        model_tags (list, optional): List of string tags for the models. Defaults to None, which
            will generate tags as "Model 0", "Model 1", etc.
        print_to_console (bool, optional): Whether to print the results to the console.
            Defaults to True.

    Raises:
        ValueError: If input timeseries dataframes in df_list have different timestamps.
        ValueError: If input timeseries dataframes in df_list have different number of turbines.
        ValueError: If input timeseries dataframes in df_list already contain a 'pow_ref' column.

    Returns:
        table_wakeloss_cumprod_dict: Dictionary containing the relative wake loss numbers,
            including errors w.r.t. the first dataframe (in percent points).
    """
    # Avoid mutable default arguments: normalize None to concrete values here
    if exclude_turbs is None:
        exclude_turbs = []
    if model_tags is None:
        model_tags = [f"Model {ti}" for ti in range(len(df_list))]

    # Check inputs and format df_list as needed
    df_list, n_measurements_per_hour = _prepare_df_list(
        df_list, model_tags, exclude_turbs, ws_range
    )

    # Helper variables
    n_turbs = dfm.get_num_turbines(df_list[0])

    # Specify reference power production as the power production of the most upstream turbine(s).
    # BUGFIX: store the returned dataframe back into the list. Rebinding the
    # loop variable only ('df = ...') silently discards the 'pow_ref' column
    # whenever the helper returns a (modified) copy instead of mutating in place.
    for dfii in range(len(df_list)):
        df_list[dfii] = dfm.set_pow_ref_by_upstream_turbines(
            df_list[dfii], df_upstream, exclude_turbs=exclude_turbs
        )

    # Compare cumulative energy wake loss relative to most upstream turbines.
    # Pre-allocate NaN-filled per-turbine accumulators for each timeseries.
    cumprod_turbine_waked_list = [np.full(n_turbs, np.nan, dtype=float) for _ in df_list]
    cumprod_turbine_unwaked_list = [np.full(n_turbs, np.nan, dtype=float) for _ in df_list]

    # For each timeseries and for every turbine, determine test and reference power productions.
    # Only timestamps where BOTH the test and reference power are valid contribute, so waked and
    # unwaked sums cover the exact same set of measurements.
    for ti in range(n_turbs):
        for dii, df in enumerate(df_list):
            p_test = np.array(df[f"pow_{ti:03d}"], dtype=float, copy=True)
            p_ref = np.array(df["pow_ref"], dtype=float, copy=True)
            ids_non_nan = (~np.isnan(p_test)) & (~np.isnan(p_ref))
            # NOTE(review): the 1.0e-6 factor implies power columns in W -> MWh; confirm units
            cumprod_turbine_unwaked_list[dii][ti] = (
                np.sum(p_ref[ids_non_nan]) * 1.0e-6 / n_measurements_per_hour
            )
            cumprod_turbine_waked_list[dii][ti] = (
                np.sum(p_test[ids_non_nan]) * 1.0e-6 / n_measurements_per_hour
            )

    cumprod_farm_waked_list = [
        np.sum(cumprod_tm_waked) for cumprod_tm_waked in cumprod_turbine_waked_list
    ]
    cumprod_farm_unwaked_list = [
        np.sum(cumprod_tm_unwaked) for cumprod_tm_unwaked in cumprod_turbine_unwaked_list
    ]

    # Create placeholder dictionary to collect wake loss results
    table_wakeloss_cumprod_dict = {
        "Selection": ["Entire farm"] + [f"Turbine {ti:02d}" for ti in range(n_turbs)],
    }

    # Calculate wake losses for each model and store in list (farm total first, then per turbine)
    wake_losses_list = [
        np.hstack(
            [
                _wake_loss(cumprod_farm_waked_list[dii], cumprod_farm_unwaked_list[dii]),
                _wake_loss(cumprod_turbine_waked_list[dii], cumprod_turbine_unwaked_list[dii]),
            ]
        )
        for dii in range(len(df_list))
    ]

    # Calculate wake loss errors between first and all remaining models, and store in list
    for mii, model_tag in enumerate(model_tags):
        table_wakeloss_cumprod_dict[f"{model_tag} (%)"] = wake_losses_list[mii]
        if mii > 0:  # Compare against first entry (usually SCADA or LES) and calculate error
            table_wakeloss_cumprod_dict[f"{model_tag} error (p.p.)"] = (
                wake_losses_list[mii] - wake_losses_list[0]
            )

    # Finally print
    if print_to_console:
        t0 = df_list[0].iloc[0]["time"]
        t1 = df_list[0].iloc[-1]["time"]
        c = (
            f"Data from {t0.strftime('%Y-%m-%d')} to {t1.strftime('%Y-%m-%d')}; Wind speeds "
            + f"{ws_range[0]:.1f} m/s to {ws_range[1]:.1f} m/s"
        )
        print("")
        _print_pretty_table(
            table_wakeloss_cumprod_dict, title=f"Cumulative energy wake loss (%); {c}"
        )

    return table_wakeloss_cumprod_dict
def _plot_binned_data_counts(df_grid: pd.DataFrame):
    """Plot a heatmap of the number of entries per wind direction/wind speed bin.

    Supporting visualization that shows how completely a steady-state table (grid)
    can be derived from a timeseries of solutions, e.g., from an LES timeseries.

    Args:
        df_grid (pd.DataFrame): DataFrame containing the binned data; must hold the
            columns 'wd_bin', 'ws_bin' and 'count'.

    Returns:
        matplotlib.axes.Axes: Axes object of the generated plot.
    """
    # Pivot the per-bin sample counts into a (ws_bin x wd_bin) matrix
    counts = (
        df_grid[["wd_bin", "ws_bin", "count"]]
        .set_index(["wd_bin", "ws_bin"])
        .unstack()
        .transpose()
    )
    # Flatten the MultiIndex rows, keeping only the ws_bin value
    counts.index = [row_key[1] for row_key in counts.index]

    _, ax = plt.subplots(figsize=(24, 12))
    sns.heatmap(
        counts,
        annot=True,
        annot_kws={"fontsize": 8},
        linewidth=0.5,
        cmap="rocket_r",
        ax=ax,
        fmt=".0f",
        cbar=False,
    )
    ax.set_ylabel("Wind speed bin (m/s)")
    ax.set_xlabel("Wind direction bin (deg)")
    ax.set_title("Number of entries per wd/ws bin in binning of timeseries data")
    plt.tight_layout()

    return ax
def bin_timeseries_to_grid(
    df_timeseries: pd.DataFrame,
    wd_step: float = 5.0,
    wd_min: float = 0.0,
    wd_max: float = 360.0,
    ws_step: float = 1.0,
    ws_min: float = 0.0,
    ws_max: float = 50.0,
    N_min: int = 3,
    plot: bool = False,
):
    """Convert a timeseries DataFrame into a gridded solution table.

    Table is based on wind direction and wind speed bins.

    Args:
        df_timeseries (pd.DataFrame): Dataframe with timeseries that you want to turn into a
            gridded solution table. Requires the columns 'wd', 'ws', and one power column
            (pow_000, pow_001, etc.) for every turbine in your wind farm.
        wd_step (float, optional): Step size for wind direction bins. Defaults to 5.0.
        wd_min (float, optional): Minimum wind direction for binning. Defaults to 0.0.
        wd_max (float, optional): Maximum wind direction for binning. Defaults to 360.0.
        ws_step (float, optional): Step size for wind speed bins. Defaults to 1.0.
        ws_min (float, optional): Minimum wind speed for binning. Defaults to 0.0.
        ws_max (float, optional): Maximum wind speed for binning. Defaults to 50.0.
        N_min (int, optional): Minimum number of samples within a bin for it to be considered
            valid. Defaults to 3.
        plot (bool, optional): Whether to generate a plot of the binned data counts.
            Defaults to False.

    Returns:
        pd.DataFrame: DataFrame containing the gridded solution table, sorted by 'ws' then 'wd',
            with one row per (wd, ws) bin center (empty bins are present with NaN values).
    """
    # Local copy of the timeseries that we can manipulate; the 'df_name' tag column is
    # required by the expected-power binning utilities
    df_ = pl.from_pandas(df_timeseries.copy())
    df_ = df_.with_columns(pl.lit("name").alias("df_name"))

    # Add the columns ws_bin and wd_bin, based on the columns wd and ws in df_
    df_ = _add_wd_ws_bins(
        df_,
        wd_cols=["wd"],
        ws_cols=["ws"],
        wd_step=wd_step,
        wd_min=wd_min,
        wd_max=wd_max,
        ws_step=ws_step,
        ws_min=ws_min,
        ws_max=ws_max,
    )

    # Bin df_ into df_bin based on the columns bin_cols_without_df_name.
    # The output contains the mean and count of all test_cols for each bin, and also a total count
    # of samples in each bin (count).
    df_bin = _bin_and_group_dataframe_expected_power(
        df_=df_,
        test_cols=df_timeseries.columns,
        bin_cols_without_df_name=["wd_bin", "ws_bin"],
    )

    # Filter out bins with less than N_min samples.
    df_bin = df_bin.filter(pl.col("count") >= N_min)

    # Convert the dataframe back to Pandas for formatting and plotting
    df_grid = df_bin.to_pandas()

    # Generate a plot, if requested
    if plot:
        _plot_binned_data_counts(df_grid)

    # Convert into a minimum dataframe with only necessary columns and rename
    df_grid = df_grid[
        [col for col in df_grid.columns if "pow_" in col] + ["wd_bin", "ws_bin", "ti_mean", "count"]
    ]
    df_grid = df_grid.rename(
        columns={
            "wd_bin": "wd",
            "ws_bin": "ws",
            **{col: col.replace("_mean", "") for col in df_grid.columns if "_mean" in col},
        }
    )

    # Make sure that all bins exist in df_grid, even if they have no data. Collect the missing
    # bin centers first and concatenate once: calling pd.concat inside the double loop is
    # quadratic in the number of missing bins, and a set-based membership test avoids scanning
    # the full 'wd'/'ws' columns for every candidate bin.
    existing_bins = set(zip(df_grid["wd"], df_grid["ws"]))
    missing_rows = [
        {"wd": wd_value, "ws": ws_value}
        for wd_value in np.arange(wd_min + 0.5 * wd_step, wd_max, wd_step)
        for ws_value in np.arange(ws_min + 0.5 * ws_step, ws_max + 0.5 * ws_step, ws_step)
        if (wd_value, ws_value) not in existing_bins
    ]
    if missing_rows:
        df_grid = pd.concat([df_grid, pd.DataFrame(missing_rows)], ignore_index=True)

    # Sort table by wind directions and wind speeds
    df_grid = df_grid.sort_values(by=["ws", "wd"]).reset_index(drop=True)

    return df_grid
% method ) - # Make a copy from wd=0.0 deg to wd=360.0 deg for wrapping + # Make a copy from wd=0.0 deg to wd=360.0 deg for wrapping, and vice versa if wrap_0deg_to_360deg and (not (df_approx["wd"] > 359.999999).any()): - if not np.any(df_approx["wd"] < 0.01): + unique_steps = np.unique(np.diff(np.unique(df_approx["wd"]))) + step_size = unique_steps[0] + if len(unique_steps) > 1: raise UserWarning( - "wrap_0deg_to_360deg is set to True but no solutions at wd=0 deg in the precalculated solution table." + "wrap_0deg_to_360deg is set to True but precalculated solution table has irregular steps in wind direction." ) - df_subset = df_approx[df_approx["wd"] == 0.0].copy() - df_subset["wd"] = 360.0 - df_approx = pd.concat([df_approx, df_subset], axis=0) + + if (0.0 in df_approx["wd"].values) & ((360.0 - step_size) in df_approx["wd"].values): + wd_min = 0.0 + wd_max = 360.0 - step_size + elif ((step_size / 2) in df_approx["wd"].values) & ( + (360.0 - step_size / 2) in df_approx["wd"].values + ): + wd_min = step_size / 2 + wd_max = 360.0 - step_size / 2 + else: + raise UserWarning( + "wrap_0deg_to_360deg is set to True but it seems that your table does not cover the wind rose.\n" + + f"Your current table covers the following wind directions: {df_approx['wd'].unique()}.\n" + "Please assign `wrap_0deg_to_360deg=False` or alternatively ensure that:\n" + + " 1. The step size is consistent across the table, and either:\n" + + " 2a. 0.0 deg and (360.0 - step_size) are included in the table, or\n" + + " 2b. (step_size/2) and (360.0 - step_size/2) are included in the table." + ) + + # Copy lower/upper bounds (e.g. 
copy 0.0/2.5 deg to 360.0/362.5 deg, and copy 355.0/357.5 deg to -5.0/-2.5 deg) to allow interpolation over the entire wind rose + df_subset_lb = df_approx[df_approx["wd"] == wd_min].copy() + df_subset_lb["wd"] += 360.0 + df_subset_ub = df_approx[df_approx["wd"] == wd_max].copy() + df_subset_ub["wd"] += -360.0 + df_approx = pd.concat([df_approx, df_subset_lb, df_subset_ub], axis=0) # Copy TI to lower and upper bound if (grid_type == "3d") and extrapolate_ti: diff --git a/pyproject.toml b/pyproject.toml index 61b94646..9d8b2eb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,9 @@ dependencies = [ "res-wind-up~=0.1", "optuna~=4.0", "scikit-learn~=1.5", + "zenodo-get~=3.0.0", + "xarray", + "netcdf4~=1.6", ] [project.optional-dependencies] diff --git a/tests/floris_tools_test.py b/tests/floris_tools_test.py index 007c15f5..3c5065e1 100644 --- a/tests/floris_tools_test.py +++ b/tests/floris_tools_test.py @@ -92,7 +92,7 @@ def test_floris_approx_table(self): } ) df["time"] = 0.0 # Empty array - df = interpolate_floris_from_df_approx(df, df_fm_approx) + df = interpolate_floris_from_df_approx(df, df_fm_approx, wrap_0deg_to_360deg=False) # Ensure that NaNs are mimicked appropriately self.assertTrue(~df[["pow_003", "pow_004"]].isna().any().any())