@@ -193,7 +193,10 @@ def run():
193193 print (f"[red]Configuration contains [bold]{ error .error_count ()} [/] errors:[/]" )
194194
195195 for error in error .errors ():
196- print (f"[bold]{ error ['loc' ][0 ]} [/]: [yellow]{ error ['msg' ]} [/]" )
196+ full_loc = str (error ["loc" ][0 ])
197+ for loc in error ["loc" ][1 :]:
198+ full_loc += f".{ loc } " if isinstance (loc , str ) else f"[{ loc } ]"
199+ print (f"[bold]{ full_loc } [/]: [yellow]{ error ['msg' ]} [/]" )
197200
198201 print ()
199202 print (
@@ -483,71 +486,53 @@ def objective(trial: Trial) -> tuple[float, float]:
483486 trial_index += 1
484487 trial .set_user_attr ("index" , trial_index )
485488
486- direction_scope = trial .suggest_categorical (
487- "direction_scope" ,
488- [
489- "global" ,
490- "per layer" ,
491- ],
492- )
489+ params = settings .parameters
493490
494- last_layer_index = len (model .get_layers ()) - 1
491+ direction_scope_param = params .direction_scope .get ()
492+ direction_scope = direction_scope_param .suggest (trial )
495493
496- # Discrimination between "harmful" and "harmless" inputs is usually strongest
497- # in layers slightly past the midpoint of the layer stack. See the original
498- # abliteration paper (https://arxiv.org/abs/2406.11717) for a deeper analysis.
499- #
500- # Note that we always sample this parameter even though we only need it for
501- # the "global" direction scope. The reason is that multivariate TPE doesn't
502- # work with conditional or variable-range parameters.
503- direction_index = trial .suggest_float (
504- "direction_index" ,
505- 0.4 * last_layer_index ,
506- 0.9 * last_layer_index ,
507- )
494+ last_layer_index = len (model .get_layers ()) - 1
508495
509- if direction_scope == "per layer" :
510- direction_index = None
496+ # Note that we always sample this parameter when the "global" direction
497+ # scope is included in the choices, even though we only need it for the
498+ # "global" direction scope itself. The reason is that multivariate TPE
499+ # doesn't work with conditional or variable-range parameters.
500+ if "global" in direction_scope_param .root :
501+ direction_fraction = params .direction_fraction .suggest (trial )
502+ else :
503+ direction_fraction = None
511504
512- parameters = {}
505+ suggested_params : dict [ str , AbliterationParameters ] = {}
513506
514507 for component in model .get_abliterable_components ():
515- # The parameter ranges are based on experiments with various models
516- # and much wider ranges. They are not set in stone and might have to be
517- # adjusted for future models.
518- max_weight = trial .suggest_float (
519- f"{ component } .max_weight" ,
520- 0.8 ,
521- 1.5 ,
522- )
523- max_weight_position = trial .suggest_float (
524- f"{ component } .max_weight_position" ,
525- 0.6 * last_layer_index ,
526- 1.0 * last_layer_index ,
527- )
528- # For sampling purposes, min_weight is expressed as a fraction of max_weight,
529- # again because multivariate TPE doesn't support variable-range parameters.
530- # The value is transformed into the actual min_weight value below.
531- min_weight = trial .suggest_float (
532- f"{ component } .min_weight" ,
533- 0.0 ,
534- 1.0 ,
508+ max_weight = params .max_weight .suggest (trial , component )
509+
510+ max_weight_position_fraction = params .max_weight_position_fraction .suggest (
511+ trial , component
535512 )
536- min_weight_distance = trial .suggest_float (
537- f"{ component } .min_weight_distance" ,
538- 1.0 ,
539- 0.6 * last_layer_index ,
513+
514+ min_weight_relative = params .min_weight_relative .suggest (trial , component )
515+
516+ min_weight_distance_fraction = params .min_weight_distance_fraction .suggest (
517+ trial , component
540518 )
541519
542- parameters [component ] = AbliterationParameters (
520+ suggested_params [component ] = AbliterationParameters (
543521 max_weight = max_weight ,
544- max_weight_position = max_weight_position ,
545- min_weight = ( min_weight * max_weight ) ,
546- min_weight_distance = min_weight_distance ,
522+ max_weight_position = max_weight_position_fraction * last_layer_index ,
523+ min_weight = min_weight_relative * max_weight ,
524+ min_weight_distance = min_weight_distance_fraction * last_layer_index ,
547525 )
548526
527+ if direction_fraction is None or direction_scope != "global" :
528+ direction_index = None
529+ else :
530+ direction_index = direction_fraction * last_layer_index
531+
549532 trial .set_user_attr ("direction_index" , direction_index )
550- trial .set_user_attr ("parameters" , {k : asdict (v ) for k , v in parameters .items ()})
533+ trial .set_user_attr (
534+ "parameters" , {k : asdict (v ) for k , v in suggested_params .items ()}
535+ )
551536
552537 print ()
553538 print (
@@ -559,7 +544,7 @@ def objective(trial: Trial) -> tuple[float, float]:
559544 print ("* Resetting model..." )
560545 model .reset_model ()
561546 print ("* Abliterating..." )
562- model .abliterate (refusal_directions , direction_index , parameters )
547+ model .abliterate (refusal_directions , direction_index , suggested_params )
563548 print ("* Evaluating..." )
564549 score , kl_divergence , refusals = evaluator .get_score ()
565550
0 commit comments