Skip to content
Open
6 changes: 3 additions & 3 deletions migrations/360-eval/default-config/judge_profiles.jsonl
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{"model_id": "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0", "region": "us-west-2", "input_cost_per_1k": 0.003, "output_cost_per_1k": 0.015}
{"model_id": "bedrock/us.meta.llama4-maverick-17b-instruct-v1:0", "region": "us-east-2", "input_cost_per_1k": 0.00024, "output_cost_per_1k": 0.00097}
{"model_id": "bedrock/us.amazon.nova-premier-v1:0", "region": "us-east-2", "input_cost_per_1k": 0.0025, "output_cost_per_1k": 0.0125}
{"model_id": "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0", "region": "us-west-2", "input_cost_per_1k": 0.003, "output_cost_per_1k": 0.015}
{"model_id": "bedrock/us.amazon.nova-premier-v1:0", "region": "us-east-2", "input_cost_per_1k": 0.0025, "output_cost_per_1k": 0.0125}
{"model_id": "bedrock/qwen.qwen3-coder-480b-a35b-v1:0", "region": "us-west-2", "input_cost_per_1k": 0.00022, "output_cost_per_1k": 0.0018}
8 changes: 6 additions & 2 deletions migrations/360-eval/default-config/models_profiles.jsonl
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
{"model_id":"bedrock/us.amazon.nova-pro-v1:0", "region": "us-west-2","input_token_cost": 0.0008, "output_token_cost": 0.0032}
{"model_id":"bedrock/us.amazon.nova-micro-v1:0", "region": "us-west-2","input_token_cost": 0.000035, "output_token_cost": 0.00014}
{"model_id":"bedrock/us.amazon.nova-lite-v1:0", "region": "us-west-2","input_token_cost": 0.00006, "output_token_cost": 0.00024}
{"model_id":"bedrock/us.anthropic.claude-3-5-haiku-20241022-v1:0", "region": "us-east-1","input_token_cost": 0.0008, "output_token_cost": 0.004}
{"model_id":"bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0","region": "us-east-1", "input_token_cost": 0.003, "output_token_cost": 0.015}
{"model_id":"bedrock/us.meta.llama3-3-70b-instruct-v1:0", "region": "us-east-2", "input_token_cost": 0.00072, "output_token_cost": 0.00072}
{"model_id":"bedrock/us.mistral.mixtral-8x7b-instruct-v0:1", "region": "us-east-2", "input_token_cost": 0.00045, "output_token_cost": 0.0007}
{"model_id":"bedrock/us.mistral.pixtral-large-2502-v1:0", "region": "us-east-1", "input_token_cost": 0.002, "output_token_cost": 0.006}
{"model_id":"bedrock/openai.gpt-oss-120b-1:0", "region": "us-west-2", "input_token_cost": 0.00015, "output_token_cost": 0.0006}
{"model_id":"bedrock/openai.gpt-oss-20b-1:0", "region": "us-west-2", "input_token_cost": 0.00007, "output_token_cost": 0.0003}
{"model_id":"bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0", "region": "us-west-2", "input_token_cost": 0.003, "output_token_cost": 0.015}
{"model_id":"bedrock/deepseek.v3-v1:0", "region": "us-west-2", "input_token_cost": 0.00058, "output_token_cost": 0.00168}
{"model_id":"bedrock/qwen.qwen3-coder-30b-a3b-v1:0", "region": "us-west-2", "input_token_cost": 0.00015, "output_token_cost": 0.0006}

17 changes: 11 additions & 6 deletions migrations/360-eval/src/benchmarks_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def evaluate_with_llm_judge(judge_model_id,
model_response,
golden_answer)

cfg = {"maxTokens": 1500, "temperature": 0.3, "topP": 0.9, "aws_region_name": judge_region}
cfg = {"maxTokens": 1500, "topP": 0.9, "aws_region_name": judge_region}
try:
resp = run_inference(model_name=judge_model_id,
prompt_text=eval_template,
Expand Down Expand Up @@ -105,8 +105,11 @@ def evaluate_with_judges(judges,
for j in judges:
try:
logging.debug(f"Evaluating with judge model {j['model_id']}")
model_identification = j["model_id"]
if "bedrock" in j["model_id"]:
model_identification = model_identification.replace("bedrock", "bedrock/converse")
r = evaluate_with_llm_judge(
judge_model_id=j["model_id"],
judge_model_id=model_identification,
judge_region=j["region"],
prompt=prompt,
model_response=model_response,
Expand Down Expand Up @@ -193,7 +196,8 @@ def benchmark(
params['api_key'] = os.getenv('AZURE_API_KEY')
elif "bedrock" in model_id:
params['aws_region_name'] = region
model_id = model_id.replace("bedrock", "bedrock/converse")
if 'converse' not in model_id:
model_id = model_id.replace("bedrock", "bedrock/converse")
elif 'openai/' in model_id:
params['api_key'] = os.getenv('OPENAI_API')
else:
Expand Down Expand Up @@ -415,7 +419,8 @@ def check_single_model(model):
params['api_key'] = os.getenv('GOOGLE_API')
elif 'azure' in model_id:
params['api_key'] = os.getenv('AZURE_API_KEY')
model_id = model_id.replace("bedrock", "bedrock/converse")
elif 'bedrock' in model_id and 'converse' not in model_id:
model_id = model_id.replace("bedrock", "bedrock/converse")
elif 'openai/' in model_id:
params['api_key'] = os.getenv('OPENAI_API')
else:
Expand All @@ -440,7 +445,7 @@ def check_single_model(model):
executor.submit(check_single_model, model): model
for model in models
}

completed = 0
total = len(models)

Expand Down Expand Up @@ -589,7 +594,7 @@ def main(
"task_types": js["task"]["task_type"],
"task_criteria": js["task"]["task_criteria"],
"golden_answer": js.get("golden_answer", ""),
"configured_output_tokens_for_request": js.get("expected_output_tokens", 5000),
"configured_output_tokens_for_request": js.get("expected_output_tokens", 4500),
"region": js.get("region", "us-east-1"),
"temperature": js.get("temperature", 0.7),
"user_defined_metrics": js.get("user_defined_metrics", ""),
Expand Down
3 changes: 2 additions & 1 deletion migrations/360-eval/src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import random
import logging
import base64
import litellm
import requests
import requests.exceptions
from tenacity import retry, stop_after_delay, wait_exponential, retry_if_exception_type
Expand All @@ -19,7 +20,7 @@


logger = logging.getLogger(__name__)

litellm.drop_params = True

# ----------------------------------------
# Request Builders
Expand Down
263 changes: 120 additions & 143 deletions migrations/360-eval/src/visualize_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
# Performance thresholds
PERFORMANCE_THRESHOLDS = {
'success_rate': {'good': 0.95, 'medium': 0.85},
'avg_latency': {'good': 0.6, 'medium': 1.5},
# 'avg_latency': {'good': 1.5, 'medium': 2},
'avg_cost': {'good': 0.5, 'medium': 1.0},
'avg_otps': {'good': 100, 'medium': 35},
}
Expand Down Expand Up @@ -1108,6 +1108,37 @@ def extract_judge_scores(json_str):
return {}


from collections import defaultdict
import numpy as np
def build_task_latency_thresholds(records, method="percentile", value=0.75, round_ndigits=3):
    """Build per-task latency thresholds across models.

    Latencies are grouped by task type; each task gets a 'good' cutoff
    (the median latency across models) and a 'medium' cutoff computed by
    the chosen method.

    Parameters
    ----------
    records : iterable of dict
        Each record should carry "task_types" (task name) and
        "avg_latency" (numeric, seconds). Records with a falsy task,
        non-numeric latency, or latency <= 0 are skipped.
    method : {"percentile", "tolerance"}
        "percentile": medium cutoff is the `value` quantile of the
        task's latencies. "tolerance": medium cutoff is
        median * (1 + value).
    value : float
        Quantile (for "percentile") or relative tolerance (for
        "tolerance").
    round_ndigits : int
        Decimal places to round the cutoffs to.

    Returns
    -------
    dict
        Mapping ``{task_type: {"good": float, "medium": float}}``.

    Raises
    ------
    ValueError
        If `method` is not a supported option.
    """
    # Validate eagerly: previously an invalid method with no usable
    # records silently returned {} instead of raising.
    if method not in ("percentile", "tolerance"):
        raise ValueError("method must be 'percentile' or 'tolerance'")

    by_task = defaultdict(list)
    # Group latencies by task, discarding unusable records.
    for r in records:
        task = r.get("task_types")
        lat = r.get("avg_latency")
        if task and isinstance(lat, (int, float)) and lat > 0:
            by_task[task].append(float(lat))

    out = {}
    for task, lats in by_task.items():
        arr = np.asarray(lats, dtype=float)
        med = float(np.median(arr))
        if method == "percentile":
            medium_cutoff = float(np.quantile(arr, value))
        else:  # "tolerance"
            medium_cutoff = med * (1 + value)
        out[task] = {
            "good": round(med, round_ndigits),
            "medium": round(medium_cutoff, round_ndigits),
        }
    return out


##############################
##############################
Expand All @@ -1125,149 +1156,95 @@ def create_integrated_analysis_table(model_task_metrics):
'below': '#ffd4a3', # Orange - within 30% of best
'poor': '#ffcccc' # Light red - more than 30% behind
}

# Group by task and create separate tables
task_tables = {}

for task in model_task_metrics['task_types'].unique():
task_data = model_task_metrics[model_task_metrics['task_types'] == task].copy()

if task_data.empty:
continue

# Calculate average token size for this task
avg_tokens = task_data['avg_input_tokens'].mean() #task_data['avg_output_tokens'].mean() +

# Get best values for each metric in this task
# best_success_rate = task_data['success_rate'].max()
# best_latency = task_data['avg_latency'].min() # Lower is better
# best_cost = task_data['avg_cost'].min() # Lower is better
# best_otps = task_data['avg_otps'].max() # Higher is better

# Format metrics for display
task_data['success_rate_fmt'] = task_data['success_rate'].apply(lambda x: f"{x:.1%}")
task_data['avg_latency_fmt'] = task_data['avg_latency'].apply(lambda x: f"{x:.2f}s")
task_data['avg_cost_fmt'] = task_data['avg_cost'].apply(lambda x: f"${x:.4f}")
task_data['avg_otps_fmt'] = task_data['avg_otps'].apply(lambda x: f"{x:.1f}")
# task_data['total_tokens_fmt'] = (task_data['avg_input_tokens'] + task_data['avg_output_tokens']).apply(lambda x: f"{x:.0f}")
task_data['total_tokens_fmt'] = task_data['avg_output_tokens'].apply(lambda x: f"{x:.0f}")

# Calculate composite score
max_latency = task_data['avg_latency'].max() or 1
max_cost = task_data['avg_cost'].max() or 1

task_data['composite_score'] = (
task_data['success_rate'] +
(1 - (task_data['avg_latency'] / max_latency)) * COMPOSITE_SCORE_WEIGHTS['latency'] +
(1 - (task_data['avg_cost'] / max_cost)) * COMPOSITE_SCORE_WEIGHTS['cost']
)

# Sort by composite score descending
task_data = task_data.sort_values('composite_score', ascending=False)

# Get best composite score for coloring
# best_composite = task_data['composite_score'].max()

# Helper function to get color based on distance from best
def get_distance_based_color(value, metric, best_value):
"""
Colors based on percentage distance from the best performer.
For 'higher is better' metrics: distance = (best - value) / best
For 'lower is better' metrics: distance = (value - best) / best
"""

# Calculate percentage difference from best
if metric in ['success_rate', 'avg_otps']: # Higher is better
if best_value == 0: # Avoid division by zero
distance_pct = 100
else:
distance_pct = ((best_value - value) / best_value) * 100

# Special case: if value equals best, it's the best
if value == best_value:
return colors['best']

else: # Lower is better (latency, cost)
if best_value == 0: # Avoid division by zero
distance_pct = 100 if value > 0 else 0
else:
distance_pct = ((value - best_value) / best_value) * 100

# Special case: if value equals best, it's the best
if value == best_value:
return colors['best']

# Assign color based on distance from best
if distance_pct <= 5:
return colors['excellent'] # Within 5% of best
elif distance_pct <= 10:
return colors['good'] # Within 10% of best
elif distance_pct <= 20:
return colors['medium'] # Within 20% of best
elif distance_pct <= 30:
return colors['below'] # Within 30% of best

# Prepare the data for the table
table_data = model_task_metrics.copy()

thresholds['avg_latency'] = build_task_latency_thresholds(table_data[['model_name', 'task_types', 'avg_latency']].to_dict(orient='records'))
# ['avg_output_tokens'].median()
# Format Model Name
table_data['model_name'] = table_data['model_name'].apply(lambda x: x.split('/')[-1])

# Format metrics for display
table_data['success_rate_fmt'] = table_data['success_rate'].apply(lambda x: f"{x:.1%}")
table_data['avg_latency_fmt'] = table_data['avg_latency'].apply(lambda x: f"{x:.2f}s")
table_data['avg_cost_fmt'] = table_data['avg_cost'].apply(lambda x: f"${x:.4f}")
table_data['avg_otps_fmt'] = table_data['avg_otps'].apply(lambda x: f"{x:.1f}")

# Calculate composite score (higher is better)
# Normalize metrics to 0-1 range and combine them
max_latency = table_data['avg_latency'].max() or 1
max_cost = table_data['avg_cost'].max() or 1

table_data['composite_score'] = (
table_data['success_rate'] +
(1 - (table_data['avg_latency'] / max_latency)) * COMPOSITE_SCORE_WEIGHTS['latency'] +
(1 - (table_data['avg_cost'] / max_cost)) * COMPOSITE_SCORE_WEIGHTS['cost']
)

# Create figure
fig = go.Figure()

# Helper function to determine color based on value and thresholds
def get_color(value, metric):
if metric == 'success_rate' or metric == 'avg_otps':
if value >= thresholds[metric]['good']:
return colors['good']
elif value >= thresholds[metric]['medium']:
return colors['medium']
else:
return colors['poor'] # More than 30% behind

# Create figure for this task
fig = go.Figure()

# Create table cells with conditional formatting
fig.add_trace(go.Table(
header=dict(
values=['Model', 'Success Rate', 'Latency', 'Cost', 'Tokens/sec', 'Avg Output Tokens', 'Score'],
font=dict(size=12, color='white'),
fill_color='#2E5A88',
align='left'
),
cells=dict(
values=[
task_data['model_name'].tolist(),
task_data['success_rate_fmt'].tolist(),
task_data['avg_latency_fmt'].tolist(),
task_data['avg_cost_fmt'].tolist(),
task_data['avg_otps_fmt'].tolist(),
task_data['total_tokens_fmt'].tolist(),
task_data['composite_score'].apply(lambda x: f"{x:.2f}").tolist()
],
align='left',
font=dict(size=11, color='#333333', family='Arial, sans-serif'), # Explicit dark font
# Conditional formatting based on distance from best performer
fill_color=[
['white'] * len(task_data), # Model column (no coloring)
# Success rate coloring (higher is better)
[get_distance_based_color(sr, 'success_rate', task_data['success_rate'].max()) for sr in task_data['success_rate']],
# Latency coloring (lower is better)
[get_distance_based_color(lt, 'avg_latency', task_data['avg_latency'].min()) for lt in task_data['avg_latency']],
# Cost coloring (lower is better)
[get_distance_based_color(cost, 'avg_cost', task_data['avg_cost'].min()) for cost in task_data['avg_cost']],
# OTPS coloring (higher is better)
[get_distance_based_color(tps, 'avg_otps', task_data['avg_otps'].max()) for tps in task_data['avg_otps']],
['#f0f0f0'] * len(task_data), # Avg tokens column (light gray)
# Composite score coloring based on distance from best
[get_distance_based_color(score, 'composite_score', task_data['composite_score'].max()) for score in task_data['composite_score']]
]
)
))

# Calculate precise height based on content
header_height = 45 # Height for table header
row_height = 30 # Height per data row
title_height = 10 # Space for title and subtitle
margin_height = 30 # Top and bottom margins

total_height = header_height + (len(task_data) * row_height) + title_height + margin_height

# Update layout with title showing token context
fig.update_layout(
title=f'Performance Analysis: {task}<br><sub>Average Input tokens: {avg_tokens:.0f}</sub>',
title_font=dict(size=16, color='#333333'),
width=1100,
height=total_height, # Precise height calculation
margin=dict(l=20, r=20, b=20, t=60),
template="plotly_white",
paper_bgcolor="#ffffff",
font=dict(color='#333333') # Ensure text is dark on white background
return colors['poor']
elif metric == 'avg_latency':
if value['avg_latency'] <= thresholds[metric][value['task_types']]['good']:
return colors['good']
else:
return colors['medium']
else: # For latency and cost, lower is better
if value <= thresholds[metric]['good']:
return colors['good']
elif value <= thresholds[metric]['medium']:
return colors['medium']
else:
return colors['poor']

# Create table cells with conditional formatting
fig.add_trace(go.Table(
header=dict(
values=['Model', 'Task Type', 'Success Rate', 'Latency', 'Cost', 'Tokens/sec', 'Score'],
font=dict(size=12, color='white'),
fill_color='#2E5A88',
align='left'
),
cells=dict(
values=[
table_data['model_name'],
table_data['task_types'],
table_data['success_rate_fmt'],
table_data['avg_latency_fmt'],
table_data['avg_cost_fmt'],
table_data['avg_otps_fmt'],
table_data['composite_score'].apply(lambda x: f"{x:.2f}")
],
align='left',
font=dict(size=11),
# Conditional formatting based on thresholds
fill_color=[
['white'] * len(table_data), # Model column (no coloring)
['white'] * len(table_data), # Task column (no coloring)
# Success rate coloring (three-color)
[get_color(sr, 'success_rate') for sr in table_data['success_rate']],
# Latency coloring (three-color)
[get_color(lt, 'avg_latency') for lt in table_data[['avg_latency','task_types']].to_dict(orient='records')],
# Cost coloring (three-color)
[get_color(cost, 'avg_cost') for cost in table_data['avg_cost']],
# OTPS coloring (just use white)
# ['white'] * len(table_data),
[get_color(tps, 'avg_otps') for tps in table_data['avg_otps']],
# Composite score coloring based on quantiles
[colors['good'] if score >= table_data['composite_score'].quantile(0.67) else
colors['medium'] if score >= table_data['composite_score'].quantile(0.33) else
colors['poor'] for score in table_data['composite_score']]
]
)

# Store the table for this task
Expand Down
Loading