diff --git a/bluemath_tk/core/decorators.py b/bluemath_tk/core/decorators.py
index db5d271..f597c40 100644
--- a/bluemath_tk/core/decorators.py
+++ b/bluemath_tk/core/decorators.py
@@ -216,7 +216,14 @@ def wrapper(
             raise ValueError("Number of iterations must be integer and > 0")
         if not isinstance(normalize_data, bool):
             raise TypeError("Normalize data must be a boolean")
-        return func(self, data, directional_variables, num_iteration)
+        return func(
+            self,
+            data,
+            directional_variables,
+            custom_scale_factor,
+            num_iteration,
+            normalize_data,
+        )
 
     return wrapper
diff --git a/bluemath_tk/core/operations.py b/bluemath_tk/core/operations.py
index 494c5de..2ce3b4d 100644
--- a/bluemath_tk/core/operations.py
+++ b/bluemath_tk/core/operations.py
@@ -65,9 +65,10 @@ def normalize(
     ...     }
     ... )
     >>> normalized_data, scale_factor = normalize(data=df)
+    >>> import numpy as np
     >>> import xarray as xr
-    >>> from bluemath_tk.core.data import normalize
+    >>> from bluemath_tk.core.operations import normalize
     >>> ds = xr.Dataset(
     ...     {
     ...         "Hs": (("time",), np.random.rand(1000) * 7),
@@ -85,6 +86,7 @@ def normalize(
         vars_to_normalize = list(data.data_vars)
     else:
         raise TypeError("Data must be a pandas DataFrame or an xarray Dataset")
+
     normalized_data = data.copy()  # Copy data to avoid bad memory replacements
     scale_factor = (
         custom_scale_factor.copy()
@@ -122,6 +124,7 @@ def normalize(
         normalized_data[data_var] = (normalized_data[data_var] - data_var_min) / (
             data_var_max - data_var_min
         )
+
    return normalized_data, scale_factor
@@ -173,6 +176,7 @@ def denormalize(
     ...     "Dir": [0, 360],
     ... }
     >>> denormalized_data = denormalize(normalized_data=df, scale_factor=scale_factor)
+    >>> import numpy as np
     >>> import xarray as xr
     >>> from bluemath_tk.core.operations import denormalize
@@ -204,6 +208,7 @@ def denormalize(
             data[data_var] * (scale_factor[data_var][1] - scale_factor[data_var][0])
             + scale_factor[data_var][0]
         )
+
    return data
@@ -263,6 +268,7 @@ def standarize(
         },
         coords=data.coords,
     )
+
    return standarized_data, scaler
@@ -308,6 +314,7 @@ def destandarize(
         },
         coords=standarized_data.coords,
     )
+
    return data
diff --git a/bluemath_tk/core/plotting/scatter.py b/bluemath_tk/core/plotting/scatter.py
new file mode 100644
index 0000000..8bd10ef
--- /dev/null
+++ b/bluemath_tk/core/plotting/scatter.py
@@ -0,0 +1,83 @@
+from typing import List, Tuple
+
+import numpy as np
+import pandas as pd
+from matplotlib.axes import Axes
+from matplotlib.figure import Figure
+
+from .base_plotting import DefaultStaticPlotting
+from .colors import default_colors
+
+
+def plot_scatters_in_triangle(
+    dataframes: List[pd.DataFrame],
+    data_colors: List[str] = default_colors,
+    **kwargs,
+) -> Tuple[Figure, Axes]:
+    """
+    Plot scatter plots of the dataframes in a triangular matrix of axes.
+
+    Parameters
+    ----------
+    dataframes : List[pd.DataFrame]
+        List of dataframes to plot.
+    data_colors : List[str], optional
+        List of colors for the dataframes.
+    **kwargs : dict, optional
+        Keyword arguments for the scatter plot. Will be passed to the
+        DefaultStaticPlotting.plot_scatter method, which is the same
+        as the one in matplotlib.pyplot.scatter.
+        For example, to change the marker size, you can use:
+        ``plot_scatters_in_triangle(dataframes, s=10)``
+
+    Returns
+    -------
+    fig : Figure
+        Figure object.
+    axes : Axes
+        Array of axes objects for the subplots.
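+
+    Examples
+    --------
+    A minimal illustrative example (the data values here are arbitrary):
+
+    >>> import numpy as np
+    >>> import pandas as pd
+    >>> from bluemath_tk.core.plotting.scatter import plot_scatters_in_triangle
+    >>> df = pd.DataFrame(
+    ...     {
+    ...         "Hs": np.random.rand(50) * 7,
+    ...         "Tp": np.random.rand(50) * 20,
+    ...         "Dir": np.random.rand(50) * 360,
+    ...     }
+    ... )
+    >>> fig, axes = plot_scatters_in_triangle(dataframes=[df])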
+ """ + + # Get the number and names of variables from the first dataframe + variables_names = list(dataframes[0].columns) + num_variables = len(variables_names) + + # Check variables names are in all dataframes + for df in dataframes: + if not all(v in df.columns for v in variables_names): + raise ValueError( + f"Variables {variables_names} are not in dataframe {df.columns}." + ) + + # Create figure and axes + default_static_plot = DefaultStaticPlotting() + fig, axes = default_static_plot.get_subplots( + nrows=num_variables - 1, + ncols=num_variables - 1, + sharex=False, + sharey=False, + ) + if isinstance(axes, Axes): + axes = np.array([[axes]]) + + for c1, v1 in enumerate(variables_names[1:]): + for c2, v2 in enumerate(variables_names[:-1]): + for idf, df in enumerate(dataframes): + default_static_plot.plot_scatter( + ax=axes[c2, c1], + x=df[v1], + y=df[v2], + c=data_colors[idf], + alpha=0.6, + **kwargs, + ) + if c1 == c2: + axes[c2, c1].set_xlabel(variables_names[c1 + 1]) + axes[c2, c1].set_ylabel(variables_names[c2]) + elif c1 > c2: + axes[c2, c1].xaxis.set_ticklabels([]) + axes[c2, c1].yaxis.set_ticklabels([]) + else: + fig.delaxes(axes[c2, c1]) + + return fig, axes diff --git a/bluemath_tk/datamining/_base_datamining.py b/bluemath_tk/datamining/_base_datamining.py index 7dcae17..0a7f14d 100644 --- a/bluemath_tk/datamining/_base_datamining.py +++ b/bluemath_tk/datamining/_base_datamining.py @@ -4,24 +4,18 @@ import numpy as np import pandas as pd import xarray as xr -from matplotlib import pyplot as plt from matplotlib.axes import Axes +from matplotlib.figure import Figure from ..core.models import BlueMathModel from ..core.plotting.base_plotting import DefaultStaticPlotting +from ..core.plotting.scatter import plot_scatters_in_triangle class BaseSampling(BlueMathModel): """ Base class for all sampling BlueMath models. This class provides the basic structure for all sampling models. - - Methods - ------- - generate : pd.DataFrame - Generates samples. - plot_generated_data : Tuple[plt.figure, plt.axes] - Plots the generated data on a scatter plot matrix. """ @abstractmethod @@ -52,7 +46,7 @@ def plot_generated_data( self, data_color: str = "blue", **kwargs, - ) -> Tuple[plt.figure, plt.axes]: + ) -> Tuple[Figure, Axes]: """ Plots the generated data on a scatter plot matrix. @@ -65,9 +59,9 @@ def plot_generated_data( Returns ------- - plt.figure + Figure The figure object containing the plot. - plt.axes + Axes Array of axes objects for the subplots. Raises @@ -76,40 +70,14 @@ def plot_generated_data( If the data is empty. 
""" - if not self.data.empty: - variables_names = list(self.data.columns) - num_variables = len(variables_names) - else: + if self.data.empty: raise ValueError("Data must be a non-empty DataFrame with columns to plot.") - # Create figure and axes - default_static_plot = DefaultStaticPlotting() - fig, axes = default_static_plot.get_subplots( - nrows=num_variables - 1, - ncols=num_variables - 1, - sharex=False, - sharey=False, + fig, axes = plot_scatters_in_triangle( + dataframes=[self.data], + data_colors=[data_color], + **kwargs, ) - if isinstance(axes, Axes): - axes = np.array([[axes]]) - - for c1, v1 in enumerate(variables_names[1:]): - for c2, v2 in enumerate(variables_names[:-1]): - default_static_plot.plot_scatter( - ax=axes[c2, c1], - x=self.data[v1], - y=self.data[v2], - c=data_color, - **kwargs, - ) - if c1 == c2: - axes[c2, c1].set_xlabel(variables_names[c1 + 1]) - axes[c2, c1].set_ylabel(variables_names[c2]) - elif c1 > c2: - axes[c2, c1].xaxis.set_ticklabels([]) - axes[c2, c1].yaxis.set_ticklabels([]) - else: - fig.delaxes(axes[c2, c1]) return fig, axes @@ -162,7 +130,7 @@ def fit( self.custom_scale_factor = custom_scale_factor.copy() else: self.logger.info( - "Normalization is disabled. Using default scale factor (0, 1) for all fitting variables." + "Normalization is disabled. Set normalize_data to True to enable normalization." ) self.custom_scale_factor = { fitting_variable: (0, 1) for fitting_variable in self.fitting_variables @@ -198,7 +166,7 @@ def plot_selected_centroids( centroids_color: str = "red", plot_text: bool = False, **kwargs, - ) -> Tuple[plt.figure, plt.axes]: + ) -> Tuple[Figure, Axes]: """ Plots data and selected centroids on a scatter plot matrix. @@ -215,9 +183,9 @@ def plot_selected_centroids( Returns ------- - plt.figure + Figure The figure object containing the plot. - plt.axes + Axes Array of axes objects for the subplots. Raises @@ -231,59 +199,27 @@ def plot_selected_centroids( and list(self.data.columns) != [] ): variables_names = list(self.data.columns) - num_variables = len(variables_names) else: raise ValueError( "Data and centroids must have the same number of columns > 0." 
             )
 
-        # Create figure and axes
-        default_static_plot = DefaultStaticPlotting()
-        fig, axes = default_static_plot.get_subplots(
-            nrows=num_variables - 1,
-            ncols=num_variables - 1,
-            sharex=False,
-            sharey=False,
+        fig, axes = plot_scatters_in_triangle(
+            dataframes=[self.data, self.centroids],
+            data_colors=[data_color, centroids_color],
+            **kwargs,
         )
-        if isinstance(axes, Axes):
-            axes = np.array([[axes]])
-
-        for c1, v1 in enumerate(variables_names[1:]):
-            for c2, v2 in enumerate(variables_names[:-1]):
-                default_static_plot.plot_scatter(
-                    ax=axes[c2, c1],
-                    x=self.data[v1],
-                    y=self.data[v2],
-                    c=data_color,
-                    alpha=0.6,
-                    **kwargs,
-                )
-                if self.centroids is not None:
-                    default_static_plot.plot_scatter(
-                        ax=axes[c2, c1],
-                        x=self.centroids[v1],
-                        y=self.centroids[v2],
-                        c=centroids_color,
-                        alpha=0.9,
-                        **kwargs,
-                    )
-                if plot_text:
-                    for i in range(self.centroids.shape[0]):
-                        axes[c2, c1].text(
-                            self.centroids[v1][i],
-                            self.centroids[v2][i],
-                            str(i + 1),
-                            fontsize=12,
-                            fontweight="bold",
-                        )
-                if c1 == c2:
-                    axes[c2, c1].set_xlabel(variables_names[c1 + 1])
-                    axes[c2, c1].set_ylabel(variables_names[c2])
-                elif c1 > c2:
-                    axes[c2, c1].xaxis.set_ticklabels([])
-                    axes[c2, c1].yaxis.set_ticklabels([])
-                else:
-                    fig.delaxes(axes[c2, c1])
+        if plot_text:
+            for c1, v1 in enumerate(variables_names[1:]):
+                for c2, v2 in enumerate(variables_names[:-1]):
+                    for i in range(self.centroids.shape[0]):
+                        axes[c2, c1].text(
+                            self.centroids[v1][i],
+                            self.centroids[v2][i],
+                            str(i + 1),
+                            fontsize=12,
+                            fontweight="bold",
+                        )
 
         return fig, axes
@@ -292,7 +228,7 @@ def plot_data_as_clusters(
         data: pd.DataFrame,
         nearest_centroids: np.ndarray,
         **kwargs,
-    ) -> Tuple[plt.figure, plt.axes]:
+    ) -> Tuple[Figure, Axes]:
         """
         Plots data as nearest clusters.
 
@@ -307,9 +243,9 @@ def plot_data_as_clusters(
 
         Returns
         -------
-        plt.figure
+        Figure
             The figure object containing the plot.
-        plt.axes
+        Axes
             The axes object for the plot.
         """
diff --git a/bluemath_tk/datamining/kma.py b/bluemath_tk/datamining/kma.py
index 5cc4868..a1ce203 100644
--- a/bluemath_tk/datamining/kma.py
+++ b/bluemath_tk/datamining/kma.py
@@ -243,10 +243,6 @@ def fit(
         """
         Fit the K-Means algorithm to the provided data.
 
-        This method initializes centroids for the K-Means algorithm using the
-        provided dataframe and custom scale factor.
-        It normalizes the data, and returns the calculated centroids.
-
         TODO: Add option to force KMA initialization with MDA centroids.
 
         Parameters
@@ -254,10 +250,14 @@ def fit(
         data : pd.DataFrame
             The input data to be used for the KMA algorithm.
         directional_variables : List[str], optional
-            A list of directional variables (will be transformed to u and v).
+            A list of directional variables that will be transformed to u and v components.
+            Then, to use custom_scale_factor, you must specify the variable names with the _u and _v suffixes.
+            Example: directional_variables=["Dir"], custom_scale_factor={"Dir_u": [0, 1], "Dir_v": [0, 1]}.
             Default is [].
         custom_scale_factor : dict, optional
             A dictionary specifying custom scale factors for normalization.
+            If normalize_data is True, this will be used to normalize the data.
+            Example: {"Hs": [0, 10], "Tp": [0, 10]}.
             Default is {}.
         min_number_of_points : int, optional
             The minimum number of points to consider a cluster.
@@ -267,10 +267,13 @@ def fit(
             This is used when min_number_of_points is not None.
             Default is 10.
         normalize_data : bool, optional
-            A flag to normalize the data. Default is False.
+            A flag to normalize the data.
+            If True, the data will be normalized using the custom_scale_factor.
+            Default is False.
         regression_guided: dict, optional
             A dictionary specifying regression-guided clustering variables and relative weights.
-            Example: {"vars":["Fe"],"alpha":[0.6]}. Default is {}.
+            Example: {"vars": ["Fe"], "alpha": [0.6]}.
+            Default is {}.
         """
 
         if regression_guided:
@@ -374,10 +377,14 @@ def fit_predict(
         data : pd.DataFrame
             The input data to be used for the KMA algorithm.
         directional_variables : List[str], optional
-            A list of directional variables (will be transformed to u and v).
+            A list of directional variables that will be transformed to u and v components.
+            Then, to use custom_scale_factor, you must specify the variable names with the _u and _v suffixes.
+            Example: directional_variables=["Dir"], custom_scale_factor={"Dir_u": [0, 1], "Dir_v": [0, 1]}.
             Default is [].
-        custom_scale_factor : dict
+        custom_scale_factor : dict, optional
             A dictionary specifying custom scale factors for normalization.
+            If normalize_data is True, this will be used to normalize the data.
+            Example: {"Hs": [0, 10], "Tp": [0, 10]}.
             Default is {}.
         min_number_of_points : int, optional
             The minimum number of points to consider a cluster.
@@ -387,10 +394,13 @@ def fit_predict(
             This is used when min_number_of_points is not None.
             Default is 10.
         normalize_data : bool, optional
-            A flag to normalize the data. Default is False.
+            A flag to normalize the data.
+            If True, the data will be normalized using the custom_scale_factor.
+            Default is False.
         regression_guided: dict, optional
             A dictionary specifying regression-guided clustering variables and relative weights.
-            Example: {"vars":["Fe"],"alpha":[0.6]}. Default is {}.
+            Example: {"vars": ["Fe"], "alpha": [0.6]}.
+            Default is {}.
 
         Returns
         -------
diff --git a/bluemath_tk/datamining/mda.py b/bluemath_tk/datamining/mda.py
index ced2757..fef0538 100644
--- a/bluemath_tk/datamining/mda.py
+++ b/bluemath_tk/datamining/mda.py
@@ -297,16 +297,22 @@ def fit(
         data : pd.DataFrame
             The input data to be used for the MDA algorithm.
         directional_variables : List[str], optional
-            A list of names of the directional variables within the data.
+            A list of directional variables that will be transformed to u and v components.
+            Then, to use custom_scale_factor, you must specify the variable names with the _u and _v suffixes.
+            Example: directional_variables=["Dir"], custom_scale_factor={"Dir_u": [0, 1], "Dir_v": [0, 1]}.
             Default is [].
         custom_scale_factor : dict, optional
             A dictionary specifying custom scale factors for normalization.
+            If normalize_data is True, this will be used to normalize the data.
+            Example: {"Hs": [0, 10], "Tp": [0, 10]}.
             Default is {}.
         first_centroid_seed : int, optional
             The index of the first centroid to use in the MDA algorithm.
             Default is None.
         normalize_data : bool, optional
-            A flag to normalize the data. Default is False.
+            A flag to normalize the data.
+            If True, the data will be normalized using the custom_scale_factor.
+            Default is False.
 
         Notes
         -----
@@ -432,16 +438,22 @@ def fit_predict(
         data : pd.DataFrame
             The input data to be used for the MDA algorithm.
         directional_variables : List[str], optional
-            A list of names of the directional variables within the data.
+            A list of directional variables that will be transformed to u and v components.
+            Then, to use custom_scale_factor, you must specify the variable names with the _u and _v suffixes.
+            Example: directional_variables=["Dir"], custom_scale_factor={"Dir_u": [0, 1], "Dir_v": [0, 1]}.
             Default is [].
         custom_scale_factor : dict, optional
             A dictionary specifying custom scale factors for normalization.
+            If normalize_data is True, this will be used to normalize the data.
+            Example: {"Hs": [0, 10], "Tp": [0, 10]}.
             Default is {}.
         first_centroid_seed : int, optional
             The index of the first centroid to use in the MDA algorithm.
             Default is None.
         normalize_data : bool, optional
-            A flag to normalize the data. Default is False.
+            A flag to normalize the data.
+            If True, the data will be normalized using the custom_scale_factor.
+            Default is False.
 
         Returns
         -------
diff --git a/bluemath_tk/datamining/som.py b/bluemath_tk/datamining/som.py
index 972abd9..c356750 100644
--- a/bluemath_tk/datamining/som.py
+++ b/bluemath_tk/datamining/som.py
@@ -294,18 +294,24 @@ def fit(
         Parameters
         ----------
         data : pd.DataFrame
-            The input data to be used for the fitting.
+            The input data to be used for the SOM algorithm.
         directional_variables : List[str], optional
-            A list with the directional variables (will be transformed to u and v).
+            A list of directional variables that will be transformed to u and v components.
+            Then, to use custom_scale_factor, you must specify the variable names with the _u and _v suffixes.
+            Example: directional_variables=["Dir"], custom_scale_factor={"Dir_u": [0, 1], "Dir_v": [0, 1]}.
             Default is [].
         custom_scale_factor : dict, optional
             A dictionary specifying custom scale factors for normalization.
+            If normalize_data is True, this will be used to normalize the data.
+            Example: {"Hs": [0, 10], "Tp": [0, 10]}.
             Default is {}.
         num_iteration : int, optional
             The number of iterations for the SOM fitting.
             Default is 1000.
         normalize_data : bool, optional
-            A flag to normalize the data. Default is False.
+            A flag to normalize the data.
+            If True, the data will be normalized using the custom_scale_factor.
+            Default is False.
         """
 
         super().fit(
@@ -382,16 +388,22 @@ def fit_predict(
         data : pd.DataFrame
             The input data to be used for the SOM algorithm.
         directional_variables : List[str], optional
-            A list of directional variables (will be transformed to u and v).
+            A list of directional variables that will be transformed to u and v components.
+            Then, to use custom_scale_factor, you must specify the variable names with the _u and _v suffixes.
+            Example: directional_variables=["Dir"], custom_scale_factor={"Dir_u": [0, 1], "Dir_v": [0, 1]}.
             Default is [].
         custom_scale_factor : dict, optional
             A dictionary specifying custom scale factors for normalization.
+            If normalize_data is True, this will be used to normalize the data.
+            Example: {"Hs": [0, 10], "Tp": [0, 10]}.
             Default is {}.
         num_iteration : int, optional
             The number of iterations for the SOM fitting.
             Default is 1000.
         normalize_data : bool, optional
-            A flag to normalize the data. Default is False.
+            A flag to normalize the data.
+            If True, the data will be normalized using the custom_scale_factor.
+            Default is False.
 
         Returns
         -------
diff --git a/bluemath_tk/wrappers/_base_wrappers.py b/bluemath_tk/wrappers/_base_wrappers.py
index 15a6f9e..9e3b387 100644
--- a/bluemath_tk/wrappers/_base_wrappers.py
+++ b/bluemath_tk/wrappers/_base_wrappers.py
@@ -78,8 +78,10 @@ def __init__(
         ----------
         templates_dir : str
             The directory where the templates are searched.
+            Both binary and text files are supported, for cases where the user
+            needs a fixed binary file in every case directory.
         metamodel_parameters : dict
-            The parameters to be used for all cases.
+            The parameters to be used for the different cases.
         fixed_parameters : dict
             The fixed parameters for the cases.
         output_dir : str
@@ -428,11 +430,17 @@ def build_case_and_render_files(
             case_dir=case_dir,
         )
         for template_name in self.templates_name:
-            self.render_file_from_template(
-                template_name=template_name,
-                context=case_context,
-                output_filename=op.join(case_dir, template_name),
-            )
+            try:
+                self.render_file_from_template(
+                    template_name=template_name,
+                    context=case_context,
+                    output_filename=op.join(case_dir, template_name),
+                )
+            except UnicodeDecodeError as _ude:
+                # Template is not a text file (e.g. a binary), so copy it as-is
+                self.copy_files(
+                    src=op.join(self.templates_dir, template_name),
+                    dst=op.join(case_dir, template_name),
+                )
 
     def build_cases(
         self,
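
Usage sketch for the refactored API (illustrative only, not part of the patch). The `MDA` constructor argument `num_centers` is an assumption based on the library's documentation and is not shown in this diff; the `fit` and `plot_selected_centroids` signatures follow the docstrings above.

```python
import numpy as np
import pandas as pd

from bluemath_tk.datamining.mda import MDA

# Illustrative data: the directional variable "Dir" is split into Dir_u / Dir_v
# components, so the custom scale factors use the _u / _v suffixes.
df = pd.DataFrame(
    {
        "Hs": np.random.rand(1000) * 7,
        "Tp": np.random.rand(1000) * 20,
        "Dir": np.random.rand(1000) * 360,
    }
)

mda = MDA(num_centers=10)  # constructor signature assumed, see the MDA docs
mda.fit(
    data=df,
    directional_variables=["Dir"],
    custom_scale_factor={"Dir_u": [0, 1], "Dir_v": [0, 1]},
    normalize_data=True,
)

# plot_selected_centroids now delegates to plot_scatters_in_triangle, and extra
# keyword arguments (e.g. marker size) are forwarded to matplotlib's scatter.
fig, axes = mda.plot_selected_centroids(plot_text=True, s=15)
```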