Switch to functools for caching (#700)

PGijsbers · web-flow · commit c4f132b8f539 · 2025-03-30T22:59:20.000+02:00
* Switch to functools implementation, which introduces a 'bug'

The bug is that objects of the same class can no longer have their
caches cleared independently. Judging from the usage, I do not believe
there would be two different instances of the same class anyway,
though I may be wrong (thinking of datasplit). I will need to
investigate more tomorrow.

In any case, the plan is to move to a more friendly implementation
where the classes that do need cache eviction simply add a two-liner
for that.

* Use cached_property instead of caching a property manually

* Properly clear properties and also functions

Still has the issue that clearing the cache of a function clears it
for all instances of the class (this is not the case for properties).

* Use functools for caching
diff --git a/amlb/benchmark.py b/amlb/benchmark.py
@@ -12,6 +12,7 @@
 
 from copy import copy
 from enum import Enum
+from functools import cached_property
 from importlib import import_module, invalidate_caches
 import logging
 import math
@@ -37,7 +38,6 @@
     file_lock,
     flatten,
     json_dump,
-    lazy_property,
     profile,
     repr_def,
     run_cmd,
@@ -503,7 +503,7 @@ def _results_summary(self, scoreboard=None):
         )
         return board.as_data_frame()
 
-    @lazy_property
+    @cached_property
     def output_dirs(self):
         return routput_dirs(
             rconfig().output_dir,
diff --git a/amlb/data.py b/amlb/data.py
@@ -15,6 +15,7 @@
 
 from abc import ABC, abstractmethod
 from enum import Enum
+from functools import cached_property
 import logging
 from typing import List, Union, Iterable
 
@@ -24,7 +25,7 @@
 from typing_extensions import TypeAlias
 
 from .datautils import Encoder
-from .utils import clear_cache, lazy_property, profile, repr_def
+from .utils import clear_cache, profile, repr_def
 
 log = logging.getLogger(__name__)
 
@@ -66,7 +67,7 @@ def is_categorical(self, strict: bool = True) -> bool:
     def is_numerical(self) -> bool:
         return self.data_type in ["int", "float", "number"]
 
-    @lazy_property
+    @cached_property
     def label_encoder(self) -> Encoder:
         return Encoder(
             "label" if self.values is not None else "no-op",
@@ -77,7 +78,7 @@ def label_encoder(self) -> Encoder:
             normalize_fn=Feature.normalize,
         ).fit(self.values)
 
-    @lazy_property
+    @cached_property
     def one_hot_encoder(self) -> Encoder:
         return Encoder(
             "one-hot" if self.values is not None else "no-op",
@@ -127,15 +128,15 @@ def data_path(self, format: str) -> str:
         """
         pass
 
-    @property
+    @cached_property
     @abstractmethod
     def data(self) -> DF:
         """
         :return: all the columns (predictors + target) as a pandas DataFrame.
         """
         pass
 
-    @lazy_property
+    @cached_property
     @profile(logger=log)
     def X(self) -> DF:
         """
@@ -144,15 +145,15 @@ def X(self) -> DF:
         predictors_ind = [p.index for p in self.dataset.predictors]
         return self.data.iloc[:, predictors_ind]
 
-    @lazy_property
+    @cached_property
     @profile(logger=log)
     def y(self) -> DF:
         """
         :return:the target column as a pandas DataFrame: if you need a Series, just call `y.squeeze()`.
         """
         return self.data.iloc[:, [self.dataset.target.index]]  # type: ignore
 
-    @lazy_property
+    @cached_property
     @profile(logger=log)
     def data_enc(self) -> AM:
         encoded_cols = [
@@ -162,15 +163,15 @@ def data_enc(self) -> AM:
         # optimize mem usage : frameworks use either raw data or encoded ones,
         # so we can clear the cached raw data once they've been encoded
         self.release(["data", "X", "y"])
-        return np.hstack(tuple(col.reshape(-1, 1) for col in encoded_cols))
+        return np.hstack(tuple(col.reshape(-1, 1) for col in encoded_cols))  # type: ignore[union-attr]
 
-    @lazy_property
+    @cached_property
     @profile(logger=log)
     def X_enc(self) -> AM:
         predictors_ind = [p.index for p in self.dataset.predictors]
         return self.data_enc[:, predictors_ind]
 
-    @lazy_property
+    @cached_property
     @profile(logger=log)
     def y_enc(self) -> AM:
         # return self.dataset.target.label_encoder.transform(self.y)
diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py
@@ -5,6 +5,7 @@
 import os
 import re
 import tempfile
+from functools import cache, cached_property
 from typing import List
 
 import arff
@@ -18,9 +19,7 @@
 from ..utils import (
     Namespace as ns,
     as_list,
-    lazy_property,
     list_all_files,
-    memoize,
     path_from_split,
     profile,
     repr_def,
@@ -257,7 +256,7 @@ def features(self) -> List[Feature]:
     def target(self) -> Feature:
         return self._get_metadata("target")
 
-    @memoize
+    @cache
     def _get_metadata(self, prop):
         meta = self._train.load_metadata()
         return meta[prop]
@@ -281,7 +280,7 @@ def data_path(self, format):
             )
         return self._get_data(format)
 
-    @lazy_property
+    @cached_property
     def data(self):
         # use codecs for unicode support: path = codecs.load(self._path, 'rb', 'utf-8')
         log.debug("Loading datasplit %s.", self.path)
diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py
@@ -9,6 +9,7 @@
 from abc import abstractmethod
 import copy
 import functools
+from functools import cached_property
 import logging
 import os
 import re
@@ -27,7 +28,6 @@
 from ..resources import config as rconfig, get as rget
 from ..utils import (
     as_list,
-    lazy_property,
     path_from_split,
     profile,
     split_path,
@@ -107,7 +107,7 @@ def nrows(self) -> int:
             self._nrows = len(self._load_full_data(fmt="dataframe"))
         return self._nrows
 
-    @lazy_property
+    @cached_property
     def type(self):
         def get_type(card):
             if card > 2:
@@ -262,7 +262,7 @@ def get_non_empty_columns(data: DF) -> list[Hashable]:
 
         return subsample_path
 
-    @lazy_property
+    @cached_property
     @profile(logger=log)
     def features(self):
         def has_missing_values(f) -> bool:
@@ -298,7 +298,7 @@ def to_feature_type(dt):
             )
         ]
 
-    @lazy_property
+    @cached_property
     def target(self):
         return next(f for f in self.features if f.is_target)
 
@@ -347,12 +347,12 @@ def data_path(self, format):
             )
         return self._get_data(format)
 
-    @lazy_property
+    @cached_property
     @profile(logger=log)
     def data(self) -> DF:
         return self._get_data("dataframe")
 
-    @lazy_property
+    @cached_property
     @profile(logger=log)
     def data_enc(self) -> AM:
         return self._get_data("array")
diff --git a/amlb/resources.py b/amlb/resources.py
@@ -12,14 +12,13 @@
 import random
 import re
 import sys
+from functools import cache, cached_property
 
 from amlb.benchmarks.parser import benchmark_load
 from amlb.frameworks import default_tag, load_framework_definitions
 from .frameworks.definitions import TaskConstraint
 from .utils import (
     Namespace,
-    lazy_property,
-    memoize,
     normalize_path,
     run_cmd,
     str_sanitize,
@@ -66,15 +65,15 @@ def __init__(self, config: Namespace):
         sys.path.append(common_dirs["user"])
         log.debug("Extended Python sys.path to user directory: %s.", sys.path)
 
-    @lazy_property
+    @cached_property
     def project_info(self):
         split_url = self.config.project_repository.split("#", 1)
         repo = split_url[0]
         tag = None if len(split_url) == 1 else split_url[1]
         branch = tag or "master"
         return Namespace(repo=repo, tag=tag, branch=branch)
 
-    @lazy_property
+    @cached_property
     def git_info(self):
         def git(cmd, defval=None):
             try:
@@ -99,7 +98,7 @@ def git(cmd, defval=None):
             repo=repo, branch=branch, commit=commit, tags=tags, status=status
         )
 
-    @lazy_property
+    @cached_property
     def app_version(self):
         v = __version__
         if v != dev:
@@ -118,7 +117,7 @@ def seed(self, fold=None):
         else:
             return self._seed
 
-    @lazy_property
+    @cached_property
     def _seed(self):
         if str(self.config.seed).lower() in ["none", ""]:
             return None
@@ -167,12 +166,12 @@ def framework_definition(self, name, tag=None):
             )
         return framework, framework.name
 
-    @lazy_property
+    @cached_property
     def _frameworks(self):
         frameworks_file = self.config.frameworks.definition_file
         return load_framework_definitions(frameworks_file, self.config)
 
-    @memoize
+    @cache
     def constraint_definition(self, name: str) -> TaskConstraint:
         """
         :param name: name of the benchmark constraint definition as defined in the constraints file
@@ -187,7 +186,7 @@ def constraint_definition(self, name: str) -> TaskConstraint:
             )
         return TaskConstraint(**Namespace.dict(constraint))
 
-    @lazy_property
+    @cached_property
     def _constraints(self):
         constraints_file = self.config.benchmarks.constraints_file
         log.info("Loading benchmark constraint definitions from %s.", constraints_file)
diff --git a/amlb/results.py b/amlb/results.py
@@ -8,6 +8,8 @@
 import collections
 import io
 import logging
+from functools import cache
+
 import math
 import os
 import re
@@ -43,11 +45,9 @@
 from .utils import (
     Namespace,
     backup_file,
-    cached,
     datetime_iso,
     get_metadata,
     json_load,
-    memoize,
     profile,
     set_metadata,
 )
@@ -185,7 +185,7 @@ def __init__(
             else None
         )
 
-    @cached
+    @cache
     def as_data_frame(self):
         # index = ['task', 'framework', 'fold']
         index = []
@@ -236,7 +236,7 @@ def as_data_frame(self):
         log.debug("Scores columns: %s.", df.columns)
         return df
 
-    @memoize
+    @cache
     def as_printable_data_frame(self, verbosity=3):
         def none_like_as_empty(val: Any) -> str:
             return (
@@ -450,7 +450,7 @@ def save_predictions(
                 ]  # reorder columns alphabetically: necessary to match label encoding
                 if any(prob_cols != df.columns.values):
                     encoding_map = {
-                        prob_cols.index(col): i
+                        prob_cols.index(col): i  # type: ignore[union-attr]
                         for i, col in enumerate(df.columns.values)
                     }
                     remap = np.vectorize(lambda v: encoding_map[v])
@@ -606,11 +606,11 @@ def __init__(
         )
         self._metadata = metadata
 
-    @cached
+    @cache
     def get_result(self):
         return self.load_predictions(self._predictions_file)
 
-    @cached
+    @cache
     def get_result_metadata(self):
         return self._metadata or self.load_metadata(self._metadata_file)
 
diff --git a/amlb/utils/cache.py b/amlb/utils/cache.py