feat: make parameter ranges configurable

spikymoth · spikymoth · commit 3dde55819c55 · 2026-05-05T20:15:44.000+02:00
diff --git a/config.default.toml b/config.default.toml
@@ -199,3 +199,45 @@ column = "text"
 dataset = "mlabonne/harmful_behaviors"
 split = "test[:100]"
 column = "text"
+
+# The parameter ranges used to suggest settings for abliteration. With the
+# exception of direction_scope and max_weight, all values are between 0 and 1.
+# min_weight is set relative to max_weight, and the other values are relative
+# to the number of layers in the model.
+# The default ranges are the same for all components, but parameters that are
+# sampled per component can be given a different range for each component.
+[parameters]
+
+# The different refusal direction scopes that can be applied to each trial.
+direction_scope = [
+    # Choose a refusal direction by interpolating between 2 layers and apply it globally.
+    "global",
+    # For each layer within range, apply the layer's own refusal direction to itself.
+    "per layer",
+]
+
+# For the global direction scope, the layer (as a fraction of the layer count) from which to choose the refusal direction.
+direction_fraction = { low = 0.4, high = 0.9 }
+
+# The maximum weight with which to apply the abliteration. Set log = true to
+# sample from the log space, which will select lower values more frequently.
+# Note that low must be greater than 0 when log = true.
+[parameters.max_weight]
+"attn.o_proj"   = { low = 0.8, high = 1.5, log = false }
+"mlp.down_proj" = { low = 0.8, high = 1.5, log = false }
+
+# The position (layer) at which the maximum weight should be applied.
+[parameters.max_weight_position_fraction]
+"attn.o_proj"   = { low = 0.6, high = 1.0 }
+"mlp.down_proj" = { low = 0.6, high = 1.0 }
+
+# The minimum weight as a fraction of the maximum weight.
+[parameters.min_weight_relative]
+"attn.o_proj"   = { low = 0.0, high = 1.0 }
+"mlp.down_proj" = { low = 0.0, high = 1.0 }
+
+# The distance from max_weight_position across which the weight drops from
+# max_weight to min_weight. Beyond this distance, the weight is set to 0.
+[parameters.min_weight_distance_fraction]
+"attn.o_proj"   = { low = 0.0, high = 0.6 }
+"mlp.down_proj" = { low = 0.0, high = 0.6 }
diff --git a/config.noslop.toml b/config.noslop.toml
@@ -161,3 +161,45 @@ dataset = "llm-aes/writing-prompts"
 split = "train[1000:1100]"
 column = "prompt"
 prefix = "Write a short story based on the writing prompt below.\n\nWriting prompt:"
+
+# The parameter ranges used to suggest settings for abliteration. With the
+# exception of direction_scope and max_weight, all values are between 0 and 1.
+# min_weight is set relative to max_weight, and the other values are relative
+# to the number of layers in the model.
+# The default ranges are the same for all components, but parameters that are
+# sampled per component can be given a different range for each component.
+[parameters]
+
+# The different refusal direction scopes that can be applied to each trial.
+direction_scope = [
+    # Choose a refusal direction by interpolating between 2 layers and apply it globally.
+    "global",
+    # For each layer within range, apply the layer's own refusal direction to itself.
+    "per layer",
+]
+
+# For the global direction scope, the layer (as a fraction of the layer count) from which to choose the refusal direction.
+direction_fraction = { low = 0.4, high = 0.9 }
+
+# The maximum weight with which to apply the abliteration. Set log = true to
+# sample from the log space, which will select lower values more frequently.
+# Note that low must be greater than 0 when log = true.
+[parameters.max_weight]
+"attn.o_proj"   = { low = 0.8, high = 1.5, log = false }
+"mlp.down_proj" = { low = 0.8, high = 1.5, log = false }
+
+# The position (layer) at which the maximum weight should be applied.
+[parameters.max_weight_position_fraction]
+"attn.o_proj"   = { low = 0.6, high = 1.0 }
+"mlp.down_proj" = { low = 0.6, high = 1.0 }
+
+# The minimum weight as a fraction of the maximum weight.
+[parameters.min_weight_relative]
+"attn.o_proj"   = { low = 0.0, high = 1.0 }
+"mlp.down_proj" = { low = 0.0, high = 1.0 }
+
+# The distance from max_weight_position across which the weight drops from
+# max_weight to min_weight. Beyond this distance, the weight is set to 0.
+[parameters.min_weight_distance_fraction]
+"attn.o_proj"   = { low = 0.0, high = 0.6 }
+"mlp.down_proj" = { low = 0.0, high = 0.6 }
diff --git a/src/heretic/config.py b/src/heretic/config.py
@@ -13,6 +13,14 @@
     TomlConfigSettingsSource,
 )
 
+from .parameters import (
+    CategoricalParamSpec,
+    DirectionScope,
+    FloatParamSpec,
+    Parameters,
+    UnitParamSpec,
+)
+
 # !!!IMPORTANT!!!
 #
 # Any settings added to the classes defined in this module
@@ -483,6 +491,28 @@ class Settings(BaseSettings):
         description="Dataset of prompts that tend to result in refusals (used for evaluating model performance).",
     )
 
+    parameters: Parameters = Field(
+        default=Parameters(
+            direction_scope=CategoricalParamSpec(
+                [DirectionScope.GLOBAL, DirectionScope.PER_LAYER]
+            ),
+            # Discrimination between "harmful" and "harmless" inputs is usually strongest
+            # in layers slightly past the midpoint of the layer stack. See the original
+            # abliteration paper (https://arxiv.org/abs/2406.11717) for a deeper analysis.
+            direction_fraction=UnitParamSpec(low=0.4, high=0.9),
+            # The parameter ranges are based on experiments with various models
+            # and much wider ranges. They are not set in stone and might have to be
+            # adjusted for future models.
+            max_weight=FloatParamSpec(low=0.8, high=1.5, log=False),
+            max_weight_position_fraction=UnitParamSpec(low=0.6, high=1.0),
+            # For sampling purposes, min_weight is expressed as a fraction of max_weight,
+            # because multivariate TPE doesn't support variable-range parameters.
+            min_weight_relative=UnitParamSpec(low=0.0, high=1.0),
+            min_weight_distance_fraction=UnitParamSpec(low=0.0, high=0.6),
+        ),
+        description="The parameter specifications, per parameter or per component within each parameter.",
+    )
+
     @classmethod
     def settings_customise_sources(
         cls,
diff --git a/src/heretic/main.py b/src/heretic/main.py
@@ -193,7 +193,10 @@ def run():
         print(f"[red]Configuration contains [bold]{error.error_count()}[/] errors:[/]")
 
         for error in error.errors():
-            print(f"[bold]{error['loc'][0]}[/]: [yellow]{error['msg']}[/]")
+            full_loc = str(error["loc"][0])
+            for loc in error["loc"][1:]:
+                full_loc += f".{loc}" if isinstance(loc, str) else f"[{loc}]"
+            print(f"[bold]{full_loc}[/]: [yellow]{error['msg']}[/]")
 
         print()
         print(
@@ -483,71 +486,53 @@ def objective(trial: Trial) -> tuple[float, float]:
         trial_index += 1
         trial.set_user_attr("index", trial_index)
 
-        direction_scope = trial.suggest_categorical(
-            "direction_scope",
-            [
-                "global",
-                "per layer",
-            ],
-        )
+        params = settings.parameters
 
-        last_layer_index = len(model.get_layers()) - 1
+        direction_scope_param = params.direction_scope.get()
+        direction_scope = direction_scope_param.suggest(trial)
 
-        # Discrimination between "harmful" and "harmless" inputs is usually strongest
-        # in layers slightly past the midpoint of the layer stack. See the original
-        # abliteration paper (https://arxiv.org/abs/2406.11717) for a deeper analysis.
-        #
-        # Note that we always sample this parameter even though we only need it for
-        # the "global" direction scope. The reason is that multivariate TPE doesn't
-        # work with conditional or variable-range parameters.
-        direction_index = trial.suggest_float(
-            "direction_index",
-            0.4 * last_layer_index,
-            0.9 * last_layer_index,
-        )
+        last_layer_index = len(model.get_layers()) - 1
 
-        if direction_scope == "per layer":
-            direction_index = None
+        # Note that we always sample this parameter when the "global" direction
+        # scope is included in the choices, even though we only need it for the
+        # "global" direction scope itself. The reason is that multivariate TPE
+        # doesn't work with conditional or variable-range parameters.
+        if "global" in direction_scope_param.root:
+            direction_fraction = params.direction_fraction.suggest(trial)
+        else:
+            direction_fraction = None
 
-        parameters = {}
+        suggested_params: dict[str, AbliterationParameters] = {}
 
         for component in model.get_abliterable_components():
-            # The parameter ranges are based on experiments with various models
-            # and much wider ranges. They are not set in stone and might have to be
-            # adjusted for future models.
-            max_weight = trial.suggest_float(
-                f"{component}.max_weight",
-                0.8,
-                1.5,
-            )
-            max_weight_position = trial.suggest_float(
-                f"{component}.max_weight_position",
-                0.6 * last_layer_index,
-                1.0 * last_layer_index,
-            )
-            # For sampling purposes, min_weight is expressed as a fraction of max_weight,
-            # again because multivariate TPE doesn't support variable-range parameters.
-            # The value is transformed into the actual min_weight value below.
-            min_weight = trial.suggest_float(
-                f"{component}.min_weight",
-                0.0,
-                1.0,
+            max_weight = params.max_weight.suggest(trial, component)
+
+            max_weight_position_fraction = params.max_weight_position_fraction.suggest(
+                trial, component
             )
-            min_weight_distance = trial.suggest_float(
-                f"{component}.min_weight_distance",
-                1.0,
-                0.6 * last_layer_index,
+
+            min_weight_relative = params.min_weight_relative.suggest(trial, component)
+
+            min_weight_distance_fraction = params.min_weight_distance_fraction.suggest(
+                trial, component
             )
 
-            parameters[component] = AbliterationParameters(
+            suggested_params[component] = AbliterationParameters(
                 max_weight=max_weight,
-                max_weight_position=max_weight_position,
-                min_weight=(min_weight * max_weight),
-                min_weight_distance=min_weight_distance,
+                max_weight_position=max_weight_position_fraction * last_layer_index,
+                min_weight=min_weight_relative * max_weight,
+                min_weight_distance=min_weight_distance_fraction * last_layer_index,
             )
 
+        if direction_fraction is None or direction_scope != "global":
+            direction_index = None
+        else:
+            direction_index = direction_fraction * last_layer_index
+
         trial.set_user_attr("direction_index", direction_index)
-        trial.set_user_attr("parameters", {k: asdict(v) for k, v in parameters.items()})
+        trial.set_user_attr(
+            "parameters", {k: asdict(v) for k, v in suggested_params.items()}
+        )
 
         print()
         print(
@@ -559,7 +544,7 @@ def objective(trial: Trial) -> tuple[float, float]:
         print("* Resetting model...")
         model.reset_model()
         print("* Abliterating...")
-        model.abliterate(refusal_directions, direction_index, parameters)
+        model.abliterate(refusal_directions, direction_index, suggested_params)
         print("* Evaluating...")
         score, kl_divergence, refusals = evaluator.get_score()
 
diff --git a/src/heretic/parameters.py b/src/heretic/parameters.py