Skip to content

Commit 3dde558

Browse files
committed
feat: make parameter ranges configurable
1 parent 9b7624d commit 3dde558

5 files changed

Lines changed: 471 additions & 54 deletions

File tree

config.default.toml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,3 +199,45 @@ column = "text"
199199
dataset = "mlabonne/harmful_behaviors"
200200
split = "test[:100]"
201201
column = "text"
202+
203+
# The parameter ranges used to suggest settings for abliteration. With the
204+
# exception of direction_scope and max_weight, all values are between 0 and 1.
205+
# min_weight is set relative to max_weight, and the other values are relative
206+
# to the number of layers in the model.
207+
# By default, the same ranges are used across all components. For parameters
208+
# that are sampled per component, the ranges are specified for each component.
209+
[parameters]
210+
211+
# The different refusal direction scopes that can be applied to each trial.
212+
direction_scope = [
213+
# Choose a refusal direction by interpolating between 2 layers and apply it globally.
214+
"global",
215+
# For each layer within range, apply the layer's own refusal direction to itself.
216+
"per layer",
217+
]
218+
219+
# For the global direction scope, the range of relative layer positions (0 = first layer, 1 = last layer) from which the refusal direction is chosen.
220+
direction_fraction = { low = 0.4, high = 0.9 }
221+
222+
# The maximum weight with which to apply the abliteration. Set log = true to
223+
# sample from the log space, which will select lower values more frequently.
224+
# Note that low must be greater than 0 when log = true.
225+
[parameters.max_weight]
226+
"attn.o_proj" = { low = 0.8, high = 1.5, log = false }
227+
"mlp.down_proj" = { low = 0.8, high = 1.5, log = false }
228+
229+
# The relative layer position (0 = first layer, 1 = last layer) at which the maximum weight should be applied.
230+
[parameters.max_weight_position_fraction]
231+
"attn.o_proj" = { low = 0.6, high = 1.0 }
232+
"mlp.down_proj" = { low = 0.6, high = 1.0 }
233+
234+
# The minimum weight as a fraction of the maximum weight.
235+
[parameters.min_weight_relative]
236+
"attn.o_proj" = { low = 0.0, high = 1.0 }
237+
"mlp.down_proj" = { low = 0.0, high = 1.0 }
238+
239+
# The distance from max_weight_position across which the weight drops from
240+
# max_weight to min_weight. Beyond this distance, the weight is set to 0.
241+
[parameters.min_weight_distance_fraction]
242+
"attn.o_proj" = { low = 0.0, high = 0.6 }
243+
"mlp.down_proj" = { low = 0.0, high = 0.6 }

config.noslop.toml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,3 +161,45 @@ dataset = "llm-aes/writing-prompts"
161161
split = "train[1000:1100]"
162162
column = "prompt"
163163
prefix = "Write a short story based on the writing prompt below.\n\nWriting prompt:"
164+
165+
# The parameter ranges used to suggest settings for abliteration. With the
166+
# exception of direction_scope and max_weight, all values are between 0 and 1.
167+
# min_weight is set relative to max_weight, and the other values are relative
168+
# to the number of layers in the model.
169+
# By default, the same ranges are used across all components. For parameters
170+
# that are sampled per component, the ranges are specified for each component.
171+
[parameters]
172+
173+
# The different refusal direction scopes that can be applied to each trial.
174+
direction_scope = [
175+
# Choose a refusal direction by interpolating between 2 layers and apply it globally.
176+
"global",
177+
# For each layer within range, apply the layer's own refusal direction to itself.
178+
"per layer",
179+
]
180+
181+
# For the global direction scope, the range of relative layer positions (0 = first layer, 1 = last layer) from which the refusal direction is chosen.
182+
direction_fraction = { low = 0.4, high = 0.9 }
183+
184+
# The maximum weight with which to apply the abliteration. Set log = true to
185+
# sample from the log space, which will select lower values more frequently.
186+
# Note that low must be greater than 0 when log = true.
187+
[parameters.max_weight]
188+
"attn.o_proj" = { low = 0.8, high = 1.5, log = false }
189+
"mlp.down_proj" = { low = 0.8, high = 1.5, log = false }
190+
191+
# The relative layer position (0 = first layer, 1 = last layer) at which the maximum weight should be applied.
192+
[parameters.max_weight_position_fraction]
193+
"attn.o_proj" = { low = 0.6, high = 1.0 }
194+
"mlp.down_proj" = { low = 0.6, high = 1.0 }
195+
196+
# The minimum weight as a fraction of the maximum weight.
197+
[parameters.min_weight_relative]
198+
"attn.o_proj" = { low = 0.0, high = 1.0 }
199+
"mlp.down_proj" = { low = 0.0, high = 1.0 }
200+
201+
# The distance from max_weight_position across which the weight drops from
202+
# max_weight to min_weight. Beyond this distance, the weight is set to 0.
203+
[parameters.min_weight_distance_fraction]
204+
"attn.o_proj" = { low = 0.0, high = 0.6 }
205+
"mlp.down_proj" = { low = 0.0, high = 0.6 }

src/heretic/config.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,14 @@
1313
TomlConfigSettingsSource,
1414
)
1515

16+
from .parameters import (
17+
CategoricalParamSpec,
18+
DirectionScope,
19+
FloatParamSpec,
20+
Parameters,
21+
UnitParamSpec,
22+
)
23+
1624
# !!!IMPORTANT!!!
1725
#
1826
# Any settings added to the classes defined in this module
@@ -483,6 +491,28 @@ class Settings(BaseSettings):
483491
description="Dataset of prompts that tend to result in refusals (used for evaluating model performance).",
484492
)
485493

494+
parameters: Parameters = Field(
495+
default=Parameters(
496+
direction_scope=CategoricalParamSpec(
497+
[DirectionScope.GLOBAL, DirectionScope.PER_LAYER]
498+
),
499+
# Discrimination between "harmful" and "harmless" inputs is usually strongest
500+
# in layers slightly past the midpoint of the layer stack. See the original
501+
# abliteration paper (https://arxiv.org/abs/2406.11717) for a deeper analysis.
502+
direction_fraction=UnitParamSpec(low=0.4, high=0.9),
503+
# The parameter ranges are based on experiments with various models
504+
# and much wider ranges. They are not set in stone and might have to be
505+
# adjusted for future models.
506+
max_weight=FloatParamSpec(low=0.8, high=1.5, log=False),
507+
max_weight_position_fraction=UnitParamSpec(low=0.6, high=1.0),
508+
# For sampling purposes, min_weight is expressed as a fraction of max_weight,
509+
# because multivariate TPE doesn't support variable-range parameters.
510+
min_weight_relative=UnitParamSpec(low=0.0, high=1.0),
511+
min_weight_distance_fraction=UnitParamSpec(low=0.0, high=0.6),
512+
),
513+
description="The parameter specifications, per parameter or per component within each parameter.",
514+
)
515+
486516
@classmethod
487517
def settings_customise_sources(
488518
cls,

src/heretic/main.py

Lines changed: 39 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,10 @@ def run():
193193
print(f"[red]Configuration contains [bold]{error.error_count()}[/] errors:[/]")
194194

195195
for error in error.errors():
196-
print(f"[bold]{error['loc'][0]}[/]: [yellow]{error['msg']}[/]")
196+
full_loc = str(error["loc"][0])
197+
for loc in error["loc"][1:]:
198+
full_loc += f".{loc}" if isinstance(loc, str) else f"[{loc}]"
199+
print(f"[bold]{full_loc}[/]: [yellow]{error['msg']}[/]")
197200

198201
print()
199202
print(
@@ -483,71 +486,53 @@ def objective(trial: Trial) -> tuple[float, float]:
483486
trial_index += 1
484487
trial.set_user_attr("index", trial_index)
485488

486-
direction_scope = trial.suggest_categorical(
487-
"direction_scope",
488-
[
489-
"global",
490-
"per layer",
491-
],
492-
)
489+
params = settings.parameters
493490

494-
last_layer_index = len(model.get_layers()) - 1
491+
direction_scope_param = params.direction_scope.get()
492+
direction_scope = direction_scope_param.suggest(trial)
495493

496-
# Discrimination between "harmful" and "harmless" inputs is usually strongest
497-
# in layers slightly past the midpoint of the layer stack. See the original
498-
# abliteration paper (https://arxiv.org/abs/2406.11717) for a deeper analysis.
499-
#
500-
# Note that we always sample this parameter even though we only need it for
501-
# the "global" direction scope. The reason is that multivariate TPE doesn't
502-
# work with conditional or variable-range parameters.
503-
direction_index = trial.suggest_float(
504-
"direction_index",
505-
0.4 * last_layer_index,
506-
0.9 * last_layer_index,
507-
)
494+
last_layer_index = len(model.get_layers()) - 1
508495

509-
if direction_scope == "per layer":
510-
direction_index = None
496+
# Note that we always sample this parameter when the "global" direction
497+
# scope is included in the choices, even though we only need it for the
498+
# "global" direction scope itself. The reason is that multivariate TPE
499+
# doesn't work with conditional or variable-range parameters.
500+
if "global" in direction_scope_param.root:
501+
direction_fraction = params.direction_fraction.suggest(trial)
502+
else:
503+
direction_fraction = None
511504

512-
parameters = {}
505+
suggested_params: dict[str, AbliterationParameters] = {}
513506

514507
for component in model.get_abliterable_components():
515-
# The parameter ranges are based on experiments with various models
516-
# and much wider ranges. They are not set in stone and might have to be
517-
# adjusted for future models.
518-
max_weight = trial.suggest_float(
519-
f"{component}.max_weight",
520-
0.8,
521-
1.5,
522-
)
523-
max_weight_position = trial.suggest_float(
524-
f"{component}.max_weight_position",
525-
0.6 * last_layer_index,
526-
1.0 * last_layer_index,
527-
)
528-
# For sampling purposes, min_weight is expressed as a fraction of max_weight,
529-
# again because multivariate TPE doesn't support variable-range parameters.
530-
# The value is transformed into the actual min_weight value below.
531-
min_weight = trial.suggest_float(
532-
f"{component}.min_weight",
533-
0.0,
534-
1.0,
508+
max_weight = params.max_weight.suggest(trial, component)
509+
510+
max_weight_position_fraction = params.max_weight_position_fraction.suggest(
511+
trial, component
535512
)
536-
min_weight_distance = trial.suggest_float(
537-
f"{component}.min_weight_distance",
538-
1.0,
539-
0.6 * last_layer_index,
513+
514+
min_weight_relative = params.min_weight_relative.suggest(trial, component)
515+
516+
min_weight_distance_fraction = params.min_weight_distance_fraction.suggest(
517+
trial, component
540518
)
541519

542-
parameters[component] = AbliterationParameters(
520+
suggested_params[component] = AbliterationParameters(
543521
max_weight=max_weight,
544-
max_weight_position=max_weight_position,
545-
min_weight=(min_weight * max_weight),
546-
min_weight_distance=min_weight_distance,
522+
max_weight_position=max_weight_position_fraction * last_layer_index,
523+
min_weight=min_weight_relative * max_weight,
524+
min_weight_distance=min_weight_distance_fraction * last_layer_index,
547525
)
548526

527+
if direction_fraction is None or direction_scope != "global":
528+
direction_index = None
529+
else:
530+
direction_index = direction_fraction * last_layer_index
531+
549532
trial.set_user_attr("direction_index", direction_index)
550-
trial.set_user_attr("parameters", {k: asdict(v) for k, v in parameters.items()})
533+
trial.set_user_attr(
534+
"parameters", {k: asdict(v) for k, v in suggested_params.items()}
535+
)
551536

552537
print()
553538
print(
@@ -559,7 +544,7 @@ def objective(trial: Trial) -> tuple[float, float]:
559544
print("* Resetting model...")
560545
model.reset_model()
561546
print("* Abliterating...")
562-
model.abliterate(refusal_directions, direction_index, parameters)
547+
model.abliterate(refusal_directions, direction_index, suggested_params)
563548
print("* Evaluating...")
564549
score, kl_divergence, refusals = evaluator.get_score()
565550

0 commit comments

Comments
 (0)