Arabic_AES_LREC2026/calculate_CI.py at master · UCREL/Arabic_AES_LREC2026 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score

# Columns that are read from both CSV files and evaluated one by one
traits = [
    'organization',
    'vocabulary',
    'style',
    'development',
    'mechanics',
    'structure',
    'relevance',
    'final_score',
]

# Read the reference scores and the model predictions from disk
refs = pd.read_csv("dataset.csv")
preds = pd.read_csv("confidance_intervals/prompt_level_3.csv")

# Function to compute bootstrapped confidence intervals
def compute_qwk_ci(y_true, y_pred, n_iterations=1000, ci=95, random_state=None):
    """Bootstrap a confidence interval for quadratic weighted kappa (QWK).

    The paired samples are resampled with replacement ``n_iterations`` times,
    QWK is computed on every resample, and the interval bounds are taken from
    the percentiles of the resulting scores.

    Args:
        y_true: 1-D array-like of reference labels.
        y_pred: 1-D array-like of predicted labels, paired with ``y_true``
            by position.
        n_iterations: Number of bootstrap resamples; must be at least 1.
        ci: Confidence level in percent, between 0 and 100.
        random_state: ``None`` draws from NumPy's global random state (the
            historical behaviour, so ``np.random.seed`` still applies); an
            int seeds a private ``RandomState``; an existing ``RandomState``
            or ``Generator`` instance is used as is.

    Returns:
        A tuple ``(mean_qwk, (lower, upper))``: the mean of the bootstrap
        scores and the lower/upper percentile bounds of the interval.

    Raises:
        ValueError: If the inputs are not 1-D, differ in length, are empty,
            or if ``n_iterations`` or ``ci`` is out of range.
    """
    # Plain arrays guarantee positional indexing (lists and Series included).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Validate up front instead of failing (or silently truncating) mid-loop.
    if y_true.ndim != 1 or y_pred.ndim != 1:
        raise ValueError("y_true and y_pred must be one-dimensional")
    n = y_true.shape[0]
    if y_pred.shape[0] != n:
        raise ValueError(
            "y_true and y_pred must have the same length "
            "(got {} and {})".format(n, y_pred.shape[0])
        )
    if n == 0:
        raise ValueError("cannot bootstrap an empty sample")
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")
    if not 0 <= ci <= 100:
        raise ValueError("ci must be between 0 and 100")

    if random_state is None:
        rng = np.random  # module-level functions use the global state
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # cohen_kappa_score derives the quadratic weights from label positions.
    # Fixing the label set to everything observed in the full sample keeps the
    # weighting identical across resamples, even when one misses a score level.
    labels = np.unique(np.concatenate((y_true, y_pred)))

    scores = np.empty(n_iterations, dtype=float)
    for i in range(n_iterations):
        indices = rng.choice(n, n, replace=True)
        scores[i] = cohen_kappa_score(
            y_true[indices], y_pred[indices], labels=labels, weights='quadratic'
        )

    # A degenerate resample (e.g. one single class on both sides) can score
    # NaN; NaN-aware aggregation keeps such draws from poisoning the result.
    tail = (100 - ci) / 2
    lower, upper = np.nanpercentile(scores, [tail, 100 - tail])
    return np.nanmean(scores), (lower, upper)

# Calculate QWK + confidence intervals per trait
# Predictions and references are paired purely by row position, so the two
# tables must have the same number of rows; otherwise the extra prediction
# rows would be silently ignored or the resampling would index out of range.
if len(preds) != len(refs):
    raise ValueError(
        "Row count mismatch: {} prediction rows vs {} reference rows".format(
            len(preds), len(refs)
        )
    )

results = []
for trait in traits:
    y_true = refs[trait].values
    y_pred = preds[trait].values
    # The reported "QWK" is the mean over the bootstrap resamples returned by
    # compute_qwk_ci, not the score computed once on the full sample.
    mean_qwk, (ci_low, ci_high) = compute_qwk_ci(y_true, y_pred)
    results.append({
        "Trait": trait,
        "QWK": round(mean_qwk, 3),
        "95% CI Lower": round(ci_low, 3),
        "95% CI Upper": round(ci_high, 3)
    })

# Save results to CSV (one row per trait) and echo the table to stdout
qwk_df = pd.DataFrame(results)
qwk_df.to_csv("qwk_confidence_intervals.csv", index=False)
print(qwk_df)