# analyze_pokemon_accuracy.py — from the freddiev4/pokeshadowbench repository (main branch).
import json
import argparse
from collections import defaultdict

def load_results(path):
    """Read the JSON results file at *path* and return the parsed object.

    The file is opened explicitly as UTF-8 (the standard JSON encoding) so
    that non-ASCII names are decoded the same way on every platform instead
    of depending on the locale's default codec.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def extract_providers(data):
    """Return the provider mapping from either supported results layout.

    New format: ``data['default']['providers']``.
    Old format: ``data`` itself is the provider mapping.
    """
    if 'default' in data and 'providers' in data['default']:
        return data['default']['providers']
    return data


def tally_responses(providers):
    """Count correct and total responses per Pokémon across all models.

    *providers* maps provider -> model name -> model data, where each model
    data has a ``'responses'`` list whose items carry ``'pokemon'`` and
    ``'correct'`` keys. Returns a mapping of Pokémon name to
    ``{'correct': int, 'total': int}``.
    """
    pokemon_results = defaultdict(lambda: {'correct': 0, 'total': 0})
    for provider_models in providers.values():
        for model_data in provider_models.values():
            for response in model_data['responses']:
                tally = pokemon_results[response['pokemon']]
                tally['total'] += 1
                if response['correct']:
                    tally['correct'] += 1
    return pokemon_results


def rank_by_accuracy(pokemon_results):
    """Build per-Pokémon accuracy records sorted from best to worst.

    Each record has ``'pokemon'``, ``'correct'``, ``'total'`` and
    ``'accuracy'`` (a percentage). Records are ordered by accuracy
    (descending), with ties broken by total attempts (descending).
    """
    pokemon_accuracy = []
    for pokemon, results in pokemon_results.items():
        # Guard against a zero total so a hand-built tally cannot divide by zero.
        if results['total'] > 0:
            accuracy = (results['correct'] / results['total']) * 100
        else:
            accuracy = 0
        pokemon_accuracy.append({
            'pokemon': pokemon,
            'correct': results['correct'],
            'total': results['total'],
            'accuracy': accuracy,
        })
    pokemon_accuracy.sort(key=lambda x: (x['accuracy'], x['total']), reverse=True)
    return pokemon_accuracy


def bottom_ranked(pokemon_accuracy, count=10):
    """Return ``(rank, record)`` pairs for the last *count* records.

    Ranks are the records' true 1-based positions in *pokemon_accuracy*.
    The starting rank is derived from the actual slice length, so a list
    shorter than *count* is numbered from 1 rather than from a zero or
    negative offset.
    """
    if count <= 0:
        # A slice of [-0:] would return the whole list, so handle it up front.
        return []
    tail = pokemon_accuracy[-count:]
    first_rank = len(pokemon_accuracy) - len(tail) + 1
    return list(enumerate(tail, first_rank))


def format_report(pokemon_accuracy):
    """Return the report as a list of output lines.

    The report contains the full ranking table followed by the top 10 and
    bottom 10 sections. *pokemon_accuracy* must already be sorted from best
    to worst.
    """
    lines = [
        "Pokémon Recognition Accuracy (sorted by success rate)",
        "=" * 60,
        f"{'Rank':<4} {'Pokémon':<15} {'Correct':<7} {'Total':<5} {'Accuracy':<8}",
        "-" * 60,
    ]
    for rank, entry in enumerate(pokemon_accuracy, 1):
        # Format the number and '%' together so the sign is not pushed away
        # from the value by column padding.
        accuracy_text = f"{entry['accuracy']:.1f}%"
        lines.append(
            f"{rank:<4} {entry['pokemon']:<15} {entry['correct']:<7} "
            f"{entry['total']:<5} {accuracy_text}"
        )

    lines += ["", "=" * 60, "TOP 10 MOST RECOGNIZABLE POKÉMON:", "=" * 60]
    for rank, entry in enumerate(pokemon_accuracy[:10], 1):
        lines.append(
            f"{rank}. {entry['pokemon']} - {entry['correct']}/{entry['total']} "
            f"({entry['accuracy']:.1f}%)"
        )

    lines += ["", "=" * 60, "BOTTOM 10 LEAST RECOGNIZABLE POKÉMON:", "=" * 60]
    for rank, entry in bottom_ranked(pokemon_accuracy, 10):
        lines.append(
            f"{rank}. {entry['pokemon']} - {entry['correct']}/{entry['total']} "
            f"({entry['accuracy']:.1f}%)"
        )
    return lines


def print_report(pokemon_accuracy):
    """Print the formatted accuracy report to standard output."""
    print("\n".join(format_report(pokemon_accuracy)))


def main(argv=None):
    """Parse the command line, load the results file and print the report.

    *argv* is the argument list to parse; ``None`` makes argparse read
    ``sys.argv`` as usual.
    """
    parser = argparse.ArgumentParser(description='Analyze Pokémon recognition accuracy from LLM results')
    parser.add_argument('json_file', help='Path to the JSON results file')
    args = parser.parse_args(argv)

    data = load_results(args.json_file)
    providers = extract_providers(data)
    pokemon_results = tally_responses(providers)
    pokemon_accuracy = rank_by_accuracy(pokemon_results)
    print_report(pokemon_accuracy)


# Run only when executed as a script, so importing the module has no side effects.
if __name__ == "__main__":
    main()