Azure · martinpollack · Apr 22, 2025 · Apr 22, 2025 · Apr 22, 2025 · Apr 30, 2025
diff --git a/doc/code/orchestrators/5_crescendo_ensemble_orchestrator.ipynb b/doc/code/orchestrators/5_crescendo_ensemble_orchestrator.ipynb
diff --git a/pyrit/orchestrator/__init__.py b/pyrit/orchestrator/__init__.py
@@ -29,6 +29,9 @@
     "ContextComplianceOrchestrator",
     "ContextDescriptionPaths",
     "CrescendoOrchestrator",
+
+    "CrescendoEnsembleOrchestrator",
+
     "FlipAttackOrchestrator",
     "FuzzerOrchestrator",
     "MultiTurnAttackResult",

diff --git a/pyrit/orchestrator/multi_turn/crescendo_orchestrator.py b/pyrit/orchestrator/multi_turn/crescendo_orchestrator.py
@@ -4,7 +4,7 @@
 import json
 import logging
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Dict, List
 from uuid import uuid4
 
 from pyrit.common.path import DATASETS_PATH
@@ -23,9 +23,9 @@
 )
 from pyrit.prompt_target import PromptChatTarget
 from pyrit.score import (
+    Scorer,
     FloatScaleThresholdScorer,
     SelfAskRefusalScorer,
-    SelfAskScaleScorer,
 )
 
 logger = logging.getLogger(__name__)
@@ -62,7 +62,8 @@ def __init__(
         self,
         objective_target: PromptChatTarget,
         adversarial_chat: PromptChatTarget,
-        scoring_target: PromptChatTarget,
+        refusal_target: PromptChatTarget,
+        objective_float_scale_scorer: Scorer,
         adversarial_chat_system_prompt_path: Optional[Path] = None,
         objective_achieved_score_threshhold: float = 0.7,
         max_turns: int = 10,
@@ -77,12 +78,8 @@ def __init__(
         )
 
         objective_scorer = FloatScaleThresholdScorer(
-            scorer=SelfAskScaleScorer(
-                chat_target=scoring_target,
-                scale_arguments_path=SelfAskScaleScorer.ScalePaths.TASK_ACHIEVED_SCALE.value,
-                system_prompt_path=SelfAskScaleScorer.SystemPaths.RED_TEAMER_SYSTEM_PROMPT.value,
-            ),
-            threshold=objective_achieved_score_threshhold,
+            scorer=objective_float_scale_scorer,
+            threshold=objective_achieved_score_threshhold
         )
 
         super().__init__(
@@ -96,7 +93,7 @@ def __init__(
         )
 
         self._refusal_scorer = SelfAskRefusalScorer(
-            chat_target=scoring_target,
+            chat_target=refusal_target,
         )
 
         self._prompt_normalizer = PromptNormalizer()

diff --git a/pyrit/score/__init__.py b/pyrit/score/__init__.py
@@ -1,15 +1,19 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
+from pyrit.score.scorer import Scorer
+
 from pyrit.score.azure_content_filter_scorer import AzureContentFilterScorer
+from pyrit.score.composite_scorer import CompositeScorer
+from pyrit.score.ensemble_scorer import EnsembleScorer, WeakScorerSpec
 from pyrit.score.float_scale_threshold_scorer import FloatScaleThresholdScorer
 from pyrit.score.gandalf_scorer import GandalfScorer
 from pyrit.score.human_in_the_loop_scorer import HumanInTheLoopScorer
 from pyrit.score.human_in_the_loop_gradio import HumanInTheLoopScorerGradio
 from pyrit.score.insecure_code_scorer import InsecureCodeScorer
 from pyrit.score.markdown_injection import MarkdownInjectionScorer
 from pyrit.score.prompt_shield_scorer import PromptShieldScorer
-from pyrit.score.scorer import Scorer
+from pyrit.score.score_aggregator import AND_, MAJORITY_, OR_, ScoreAggregator
 from pyrit.score.self_ask_category_scorer import ContentClassifierPaths, SelfAskCategoryScorer
 from pyrit.score.self_ask_likert_scorer import LikertScalePaths, SelfAskLikertScorer
 from pyrit.score.self_ask_refusal_scorer import SelfAskRefusalScorer
@@ -19,17 +23,24 @@
 from pyrit.score.true_false_inverter_scorer import TrueFalseInverterScorer
 
 __all__ = [
+    "AND_",
     "AzureContentFilterScorer",
     "ContentClassifierPaths",
+    "CompositeScorer",
+    "EnsembleScorer",
+    "ContentClassifierPaths",
     "FloatScaleThresholdScorer",
     "GandalfScorer",
     "HumanInTheLoopScorer",
     "HumanInTheLoopScorerGradio",
     "InsecureCodeScorer",
     "LikertScalePaths",
+    "MAJORITY_",
     "MarkdownInjectionScorer",
+    "OR_",
     "PromptShieldScorer",
     "Scorer",
+    "ScoreAggregator",
     "SelfAskCategoryScorer",
     "SelfAskLikertScorer",
     "SelfAskRefusalScorer",
@@ -39,4 +50,5 @@
     "TrueFalseInverterScorer",
     "TrueFalseQuestion",
     "TrueFalseQuestionPaths",
+    "WeakScorerSpec",
 ]
diff --git a/pyrit/score/composite_scorer.py b/pyrit/score/composite_scorer.py
@@ -0,0 +1,100 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from typing import List, Optional
+
+from pyrit.models import PromptRequestPiece, Score
+from pyrit.score.score_aggregator import ScoreAggregator
+from pyrit.score.scorer import Scorer
+
+
+class CompositeScorer(Scorer):
+    """A scorer that aggregates other true_false scorers using a specified aggregation function.
+
+    It returns a single score of True or False based on the aggregation of the scores of the constituent
+    scorers.
+
+    Args:
+        aggregator: The aggregation function to use (e.g. `AND_`, `OR_`, `MAJORITY_`)
+        scorers: List of true_false scorers to combine
+        score_category: Optional category for the score
+    """
+
+    def __init__(
+        self, *, aggregator: ScoreAggregator, scorers: List[Scorer], score_category: Optional[str] = None
+    ) -> None:
+        self.scorer_type = "true_false"
+        self._aggregator = aggregator
+        self._score_category = score_category
+
+        if not scorers:
+            raise ValueError("At least one scorer must be provided.")
+
+        for scorer in scorers:
+            if scorer.scorer_type != "true_false":
+                raise ValueError("All scorers must be true_false scorers.")
+
+        self._scorers = scorers
+
+    async def score_async(self, request_response: PromptRequestPiece, *, task: Optional[str] = None) -> List[Score]:
+        """Scores the request response by combining results from all constituent scorers.
+
+        Args:
+            request_response: The request response to be scored
+            task: Optional task description for scoring context
+
+        Returns:
+            List containing a single Score object representing the combined result
+        """
+        self.validate(request_response, task=task)
+        scores = await self._score_all_async(request_response, task=task)
+
+        identifier_dict = self.get_identifier()
+        identifier_dict["sub_identifier"] = [scorer.get_identifier() for scorer in self._scorers]
+
+        result = self._aggregator(scores)
+
+        return_score = Score(
+            score_value=str(result.value),
+            score_value_description=None,
+            score_type=self.scorer_type,
+            score_category=self._score_category,
+            score_metadata=None,
+            score_rationale=result.rationale,
+            scorer_class_identifier=identifier_dict,
+            prompt_request_response_id=request_response.id,
+            task=task,
+        )
+
+        return [return_score]
+
+    async def _score_all_async(
+        self, request_response: PromptRequestPiece, *, task: Optional[str] = None
+    ) -> List[Score]:
+        """Scores the request_response using all constituent scorers sequentially.
+
+        Args:
+            request_response: The request response to be scored
+            task: Optional task description for scoring context
+
+        Returns:
+            List of scores from all constituent scorers
+        """
+        if not self._scorers:
+            return []
+
+        all_scores = []
+        for scorer in self._scorers:
+            scores = await scorer.score_async(request_response=request_response, task=task)
+            all_scores.extend(scores)
+
+        return all_scores
+
+    def validate(self, request_response: PromptRequestPiece, *, task: Optional[str] = None) -> None:
+        """Validates the request response for scoring.
+
+        Args:
+            request_response: The request response to validate
+            task: Optional task description for validation context
+        """
+        pass
diff --git a/pyrit/score/ensemble_scorer.py b/pyrit/score/ensemble_scorer.py
@@ -0,0 +1,133 @@
+from typing import Optional, Dict, Literal, get_args
+from dataclasses import dataclass
+
+from pyrit.models import PromptRequestPiece, Score
+from pyrit.score import Scorer
+
+@dataclass
+class WeakScorerSpec:
+    scorer: Scorer
+    weight: Optional[float] = None
+    class_weights: Optional[Dict[str, float]] = None
+
+LossMetric = Literal["MSE", "MAE"]
+
+class EnsembleScorer(Scorer):
+
+    def __init__(self, 
+                 *,
+                 weak_scorer_dict: Dict[str, WeakScorerSpec],
+                 ground_truth_scorer: Scorer,
+                 fit_weights: bool = False,
+                 lr: float = 1e-2,
+                 category: str = "jailbreak"):
+        self.scorer_type = "float_scale"
+        self._score_category = category
+
+        if not isinstance(weak_scorer_dict, dict) or (len(weak_scorer_dict) == 0):
+            raise ValueError("Please pass a nonempty dictionary of weights")
+
+        for scorer_name, weak_scorer_spec in weak_scorer_dict.items():
+            if scorer_name == "AzureContentFilterScorer":
+                if not isinstance(weak_scorer_spec.class_weights, dict) or len(weak_scorer_spec.class_weights) == 0:
+                    raise ValueError("Weights for AzureContentFilterScorer must be a dictionary of category (str) to weight (float)")
+                for acfs_k, acfs_v in weak_scorer_spec.class_weights.items():
+                    if not isinstance(acfs_k, str) or not isinstance(acfs_v, float):
+                        raise ValueError("Weights for AzureContentFilterScorer must be a dictionary of category (str) to weight (float)")
+            elif not isinstance(weak_scorer_spec.weight, float):
+                raise ValueError("Weight for this scorer must be a float")
+
+        if not isinstance(lr, float) or lr <= 0:
+            raise ValueError("Learning rate must be a floating point number greater than 0")
+
+        self._weak_scorer_dict = weak_scorer_dict
+
+        self._fit_weights = fit_weights
+        self._lr = lr
+
+        self._ground_truth_scorer = ground_truth_scorer
+
+    async def score_async(self, request_response: PromptRequestPiece, *, task: Optional[str] = None) -> list[Score]:
+        self.validate(request_response, task=task)
+
+        ensemble_score_value = 0
+        score_values = {}
+        metadata = {}
+        for scorer_name, weak_scorer_spec in self._weak_scorer_dict.items():
+            scorer = weak_scorer_spec.scorer
+            current_scores = await scorer.score_async(request_response=request_response, task=task)
+            for curr_score in current_scores:
+                if scorer_name == "AzureContentFilterScorer":
+                    score_category = curr_score.score_category
+                    curr_weight = weak_scorer_spec.class_weights[score_category]
+                    metadata_label = "_".join([scorer_name, score_category, "weight"])
+
+                    curr_score_value = float(curr_score.get_value())
+                    if scorer_name not in score_values:
+                        score_values[scorer_name] = {}
+                    score_values[scorer_name][score_category] = curr_score_value
+                else:
+                    curr_weight = weak_scorer_spec.weight
+                    metadata_label = "_".join([scorer_name, "weight"])
+                    curr_score_value = float(curr_score.get_value())
+                    score_values[scorer_name] = curr_score_value
+
+
+                ensemble_score_value += curr_weight * curr_score_value
+
+                metadata[metadata_label] = str(curr_weight)
+
+        ensemble_score_rationale = f"Total Ensemble Score is {ensemble_score_value}"
+
+        ensemble_score = Score(
+            score_type="float_scale",
+            score_value=str(ensemble_score_value),
+            score_value_description=None,
+            score_category=self._score_category,
+            score_metadata=metadata,
+            score_rationale=ensemble_score_rationale,
+            scorer_class_identifier=self.get_identifier(),
+            prompt_request_response_id=request_response.id,
+            task=task,
+        )
+        self._memory.add_scores_to_memory(scores=[ensemble_score])
+
+        if self._fit_weights:
+            await self.step_weights(score_values=score_values, ensemble_score=ensemble_score, request_response=request_response, task=task)
+
+        return [ensemble_score]
+
+    async def step_weights(self, 
+                           *,
+                           score_values: Dict[str, float], 
+                           ensemble_score: Scorer,
+                           request_response: PromptRequestPiece, 
+                           task: Optional[str] = None,
+                           loss_metric: LossMetric = "MSE"):
+        if loss_metric not in get_args(LossMetric):
+            raise ValueError(f"Loss metric {loss_metric} is not a valid loss metric.")
+
+        ground_truth_scores = await self._ground_truth_scorer.score_async(request_response=request_response, task=task)
+        for ground_truth_score in ground_truth_scores:
+            if loss_metric == "MSE":
+                diff = ensemble_score.get_value() - float(ground_truth_score.get_value())
+                d_loss_d_ensemble_score = 2 * diff
+            elif loss_metric == "MAE":
+                diff = ensemble_score.get_value() - float(ground_truth_score.get_value())
+                d_loss_d_ensemble_score = -1 if diff < 0 else 1
+
+            for scorer_name in score_values:
+                if scorer_name == "AzureContentFilterScorer":
+                    self._weak_scorer_dict[scorer_name].class_weights = {score_category: 
+                                                                            self._weak_scorer_dict[scorer_name][1][score_category] -
+                                                                            self._lr * score_values[scorer_name][score_category] * d_loss_d_ensemble_score
+                                                                         for score_category in self._weak_scorer_dict[scorer_name][1]}
+                else:
+                    self._weak_scorer_dict[scorer_name].weight = self._weak_scorer_dict[scorer_name].weight - self._lr * score_values[scorer_name] * d_loss_d_ensemble_score
+
+
+    def validate(self, request_response: PromptRequestPiece, *, task: Optional[str] = None):
+        if request_response.original_value_data_type != "text":
+            raise ValueError("The original value data type must be text.")
+        if not task:
+            raise ValueError("Task must be provided.")