|
2 | 2 | from typing import Callable, List, Tuple |
3 | 3 |
|
4 | 4 | import numpy as np |
| 5 | +import dask.array as da |
5 | 6 | import pandas as pd |
6 | 7 | from scipy.optimize import fmin, fminbound |
7 | 8 |
|
@@ -278,6 +279,9 @@ def __init__( |
278 | 279 | # Exclude attributes to .save_model() method |
279 | 280 | self._exclude_attributes = [] |
280 | 281 |
|
| 282 | + # Row chunks for parallel computation |
| 283 | + self.row_chunks: int = None |
| 284 | + |
@property
def sigma_min(self) -> float:
    """Lower bound allowed for the RBF kernel width parameter sigma.

    Returns
    -------
    float
        The value currently stored in the private ``_sigma_min`` attribute.
    """
    value = self._sigma_min
    return value
@@ -714,24 +718,50 @@ def _rbf_variable_interpolation( |
714 | 718 | The interpolated variable. |
715 | 719 | """ |
716 | 720 |
|
717 | | - r = np.linalg.norm( |
718 | | - normalized_dataset.values[:, np.newaxis, :] |
719 | | - - self.normalized_subset_data.values[np.newaxis, :, :], |
720 | | - axis=2, |
721 | | - ) |
722 | | - kernel_values = self.kernel_func(r, opt_sigma) |
723 | | - linear_part = np.dot( |
724 | | - normalized_dataset.values, |
725 | | - rbf_coeff[ |
726 | | - num_points_subset + 1 : num_points_subset + 1 + num_vars_subset |
727 | | - ].T, |
728 | | - ) |
| 721 | + # Calculate optimal chunk size based on memory |
| 722 | + norm_dataset = normalized_dataset.values |
| 723 | + norm_subset = self.normalized_subset_data.values |
729 | 724 |
|
730 | | - return ( |
731 | | - rbf_coeff[num_points_subset] |
732 | | - + np.dot(kernel_values, rbf_coeff[:num_points_subset]) |
733 | | - + linear_part |
734 | | - ) |
| 725 | + if self.row_chunks is not None: |
| 726 | + chunks = (min(self.row_chunks, norm_dataset.shape[0]), -1) |
| 727 | + self.logger.info(f"Using row chunks of size {chunks[0]}") |
| 728 | + # elif self.num_workers > 1: |
| 729 | + # chunks = (norm_dataset.shape[0] // self.num_workers, -1) |
| 730 | + else: |
| 731 | + chunks = (norm_dataset.shape[0], -1) |
| 732 | + |
| 733 | + # Convert to dask arrays for large operations |
| 734 | + d_dataset = da.from_array(norm_dataset, chunks=chunks) |
| 735 | + d_subset = da.from_array(norm_subset) |
| 736 | + |
| 737 | + # Split computation into chunks |
| 738 | + result = [] |
| 739 | + for i in range(0, len(d_dataset), chunks[0]): |
| 740 | + chunk = d_dataset[i : i + chunks[0]] |
| 741 | + |
| 742 | + # Calculate r for this chunk |
| 743 | + r_chunk = da.linalg.norm(chunk[:, None, :] - d_subset[None, :, :], axis=2) |
| 744 | + |
| 745 | + # Apply kernel and dot product |
| 746 | + kernel_values = self.kernel_func(r_chunk, opt_sigma) |
| 747 | + |
| 748 | + # Compute this chunk's result |
| 749 | + chunk_result = ( |
| 750 | + rbf_coeff[num_points_subset] |
| 751 | + + da.dot(kernel_values, rbf_coeff[:num_points_subset]) |
| 752 | + + da.dot( |
| 753 | + chunk, |
| 754 | + rbf_coeff[ |
| 755 | + num_points_subset + 1 : num_points_subset + 1 + num_vars_subset |
| 756 | + ].T, |
| 757 | + ) |
| 758 | + ) |
| 759 | + |
| 760 | + # Compute and append |
| 761 | + result.append(chunk_result.compute()) |
| 762 | + |
| 763 | + # Combine results |
| 764 | + return np.concatenate(result) |
735 | 765 |
|
736 | 766 | def _rbf_interpolate( |
737 | 767 | self, dataset: pd.DataFrame, num_workers: int = None |
|
0 commit comments