Skip to content

Commit 267784c

Browse files
committed
Set up new balancer test
1 parent 6d308e9 commit 267784c

3 files changed

Lines changed: 74 additions & 11 deletions

File tree

src/balancers.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import numpy as np
2+
3+
from sklearn.base import BaseEstimator
4+
from sklearn.utils.class_weight import compute_sample_weight as _compute_sample_weight
5+
6+
class DynamicLossBalancer(BaseEstimator):
    """Dynamic sample-weight balancer for binary (0/1) classification.

    Up-weights class 0 by a factor that depends on the fraction of positive
    labels seen in the most recent ``window_size`` samples, shaped by an
    activation function.

    Parameters
    ----------
    ratio : float, default=1.0
        Maximum weighting factor applied to the minority class.
    window_size : int, default=10
        Number of recent samples used for the pseudo-derivative (the
        fraction of positive labels in that window).
    activation : str, default='linear'
        Activation function shaping the derivative response.
        One of: 'linear', 'sigmoid', 'tanh'.
    a : float, default=1.0
        Additive offset inside the class-0 weight formula
        ``ratio * (a + slope * activation(base_weight))``.
    """

    name = "dynamic balancer"
    label = "Dynamic Balanced Sample Weight"

    def __init__(self, ratio=1.0, window_size=10, activation='linear', a=1.0):
        self.ratio = ratio
        self.window_size = window_size
        self.activation = activation
        self.a = a

    def _activation_fn(self, x):
        """Apply the configured activation to ``x``.

        Raises
        ------
        ValueError
            If ``self.activation`` is not one of the supported names.
        """
        if self.activation == 'linear':
            return x
        elif self.activation == 'sigmoid':
            return 1 / (1 + np.exp(-x))
        elif self.activation == 'tanh':
            return np.tanh(x)
        else:
            raise ValueError(f"Unsupported activation: {self.activation}")

    def compute_sample_weight(self, y):
        """Return per-sample weights for binary labels ``y``.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Binary labels; exactly two distinct values (0 and 1) required.

        Returns
        -------
        numpy.ndarray
            Weights normalized so they sum to ``len(y)``.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two classes.
        """
        # Coerce to ndarray: the element-wise comparisons below would
        # silently evaluate to False on a plain list (and then divide by 0).
        y = np.asarray(y)
        if np.unique(y).size != 2:
            raise ValueError("Only binary classification is supported.")

        # Fraction of positives in the most recent window ("pseudo-derivative").
        window = min(self.window_size, len(y))
        slope = np.sum(y[-window:]) / window

        # Class-imbalance base weight for class 0, shaped by the activation.
        base_class_0_weight = np.sum(y == 1) / (self.ratio * np.sum(y == 0))
        act_base_class_0_weight = self._activation_fn(base_class_0_weight)

        # Dynamic class-0 weight; class 1 is the reference class.
        # NOTE(review): the comment in the original claimed this interpolates
        # from 1.0 to `ratio`, but the formula is unbounded in `slope*act` —
        # confirm intended range with the author.
        class_1_weight = 1.0
        class_0_weight = self.ratio * (self.a + (slope * act_base_class_0_weight))

        weights = _compute_sample_weight(
            class_weight={0: class_0_weight, 1: class_1_weight}, y=y
        )
        # Normalize so the total weight equals the sample count.
        return weights * (len(y) / np.sum(weights))

src/feature_extractors.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@
44

55
def tfidf_params(trial: optuna.trial.FrozenTrial):
    """Sample TF-IDF vectorizer hyperparameters from an Optuna trial.

    Returns a kwargs dict for a TF-IDF feature extractor with the sampled
    ``max_df``, ``min_df`` and n-gram upper bound; ``sublinear_tf`` is fixed.
    """
    # `ngram_max` is the upper bound of the n-gram range (lower bound fixed at 1).
    ngram_max = trial.suggest_int("tfidf__ngram_range", 1, 3)
    return {
        "max_df": trial.suggest_float("tfidf__max_df", 0.5, 1.0),
        "min_df": trial.suggest_int("tfidf__min_df", 1, 10),
        "ngram_range": (1, ngram_max),
        "sublinear_tf": True,
    }
1416

src/main.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,20 @@
1616
from asreview.models.balancers import Balanced
1717
from asreview.models.queriers import Max
1818

19+
from balancers import DynamicLossBalancer
1920
from classifiers import classifier_params, classifiers
2021
from feature_extractors import feature_extractor_params, feature_extractors
2122

2223
# Study variables
2324
VERSION = 1
2425
METRIC = "loss" # Options: "loss", "ndcg"
25-
STUDY_SET = "full"
26+
STUDY_SET = "demo"
2627
CLASSIFIER_TYPE = "svm" # Options: "nb", "log", "svm", "rf"
27-
FEATURE_EXTRACTOR_TYPE = "e5" # Options: "tfidf", "onehot", "labse", "bge-m3", "stella", "mxbai", "gist", "e5", "gte", "kalm", "lajavaness", "snowflake"
28+
FEATURE_EXTRACTOR_TYPE = "tfidf" # Options: "tfidf", "onehot", "labse", "bge-m3", "stella", "mxbai", "gist", "e5", "gte", "kalm", "lajavaness", "snowflake"
2829
PICKLE_FOLDER_PATH = Path("synergy-dataset", f"pickles_{FEATURE_EXTRACTOR_TYPE}")
29-
PRE_PROCESSED_FMS = True # False = on the fly
30+
PRE_PROCESSED_FMS = False # False = on the fly
3031
PARALLELIZE_OBJECTIVE = True
31-
AUTO_SHUTDOWN = False
32+
AUTO_SHUTDOWN = True
3233

3334
# Optuna variables
3435
OPTUNA_N_TRIALS = 500
@@ -111,11 +112,12 @@ def run_sequential(studies, *args, **kwargs):
111112

112113

113114
# Function to process each row
114-
def process_row(row, clf_params, fe_params, ratio):
115+
def process_row(row, clf_params, fe_params, ratio, a, activation, window_size):
115116
priors = row["prior_inclusions"] + row["prior_exclusions"]
116117

117118
# Create balancer with optuna value
118-
blc = Balanced(ratio=ratio)
119+
#blc = DynamicLossBalancer(ratio=ratio)
120+
blc = DynamicLossBalancer(ratio=ratio, a=a, activation=activation, window_size=window_size)
119121

120122
# Create classifier and feature extractor with params
121123
clf = classifiers[CLASSIFIER_TYPE](**clf_params)
@@ -171,6 +173,9 @@ def objective_report(report_order):
171173
def objective(trial):
172174
# Use normal distribution for ratio (ratio effect is linear)
173175
ratio = trial.suggest_float("ratio", 1.0, 10.0)
176+
a = trial.suggest_float("a", 1.0, 10.0)
177+
activation = trial.suggest_categorical("activation", ["linear", "sigmoid", "tanh"])
178+
window_size = trial.suggest_int("window_size", 10, 100)
174179

175180
clf_params = classifier_params[CLASSIFIER_TYPE](trial)
176181
fe_params = (
@@ -181,11 +186,11 @@ def objective(trial):
181186

182187
if PARALLELIZE_OBJECTIVE:
183188
metric_values = run_parallel(
184-
studies, clf_params=clf_params, fe_params=fe_params, ratio=ratio
189+
studies, clf_params=clf_params, fe_params=fe_params, ratio=ratio, a=a, activation=activation, window_size=window_size
185190
)
186191
else:
187192
metric_values = run_sequential(
188-
studies, clf_params=clf_params, fe_params=fe_params, ratio=ratio
193+
studies, clf_params=clf_params, fe_params=fe_params, ratio=ratio, a=a, activation=activation, window_size=window_size
189194
)
190195

191196
all_metric_values = []
@@ -268,7 +273,7 @@ def download_pickles(report_order):
268273
storage=os.getenv(
269274
"DB_URI", "sqlite:///db.sqlite3"
270275
), # Specify the storage URL here.
271-
study_name=f"ASReview2_0b4-{CLASSIFIER_TYPE}-{FEATURE_EXTRACTOR_TYPE}-{STUDY_SET}-{VERSION}",
276+
study_name=f"ASReview2_1_1_1-{CLASSIFIER_TYPE}-{FEATURE_EXTRACTOR_TYPE}-{STUDY_SET}-{VERSION}",
272277
direction="minimize" if METRIC == "loss" else "maximize",
273278
sampler=sampler,
274279
load_if_exists=True,

0 commit comments

Comments
 (0)