Pacific-Coding-Challenge/app.py at main · ThonyAnt/Pacific-Coding-Challenge · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
"""Streamlit UI for context-eval."""

import json
import time
from pathlib import Path

import anthropic
import pandas as pd
import streamlit as st
import yaml
from dotenv import load_dotenv

from src.judge import score_response
from src.strategies import Strategy, build_strategies

# Load variables from a local .env file into the process environment
# (python-dotenv). NOTE(review): presumably this is how the Anthropic API key
# is supplied to the client created further down — confirm.
load_dotenv()

# ── Page config ───────────────────────────────────────────────────────────────
# Browser-tab title "context-eval", full-width layout, sidebar open on first load.
st.set_page_config(
    page_title="context-eval",
    layout="wide",
    initial_sidebar_state="expanded",
)

# ── Defaults ──────────────────────────────────────────────────────────────────
_example_context = Path("fixtures/example_context.md")


def _load_example_context() -> str:
    """Return the text of the bundled example context, or "" if unavailable.

    The path is tried relative to the current working directory first (the
    historical behavior) and then relative to this file, so the example is
    still found when the app is launched from a different directory.
    Unreadable or missing files are skipped instead of raising at import time.
    """
    candidates = [_example_context]
    script_path = globals().get("__file__")
    if script_path:
        candidates.append(Path(script_path).resolve().parent / _example_context)

    for candidate in candidates:
        try:
            return candidate.read_text(encoding="utf-8")
        except OSError:
            # Missing, a directory, or not readable: try the next location.
            continue
    return ""


# Example context shown by default in the sidebar; empty when no fixture exists.
DEFAULT_CONTEXT = _load_example_context()

# Benchmark rows pre-filled in the query editor (one dict per row).
DEFAULT_QUERIES = [
    {"id": "q1", "category": "career", "query": "What projects should I focus on for my internship applications?"},
    {"id": "q2", "category": "technical", "query": "What should I learn next to level up as an engineer?"},
    {"id": "q3", "category": "planning", "query": "What should I work on this weekend to make progress toward my goals?"},
    {"id": "q4", "category": "personal", "query": "Can you suggest some books I'd enjoy?"},
]

# Model identifiers offered in the sidebar select box.
MODELS = [
    "claude-haiku-4-5-20251001",
    "claude-sonnet-4-6",
]

# Icon shown next to a score, keyed by the score rounded to an integer.
SCORE_COLORS = {5: "🟢", 4: "🟡", 3: "🟠", 2: "🔴", 1: "🔴"}


# ── Helpers ───────────────────────────────────────────────────────────────────
def _run_query(
    query: str,
    system_prompt: str,
    client: anthropic.Anthropic,
    model: str,
    max_tokens: int = 400,
) -> tuple[str, float]:
    """Stream one completion and measure the time to the first text chunk.

    Args:
        query: User message, sent as the only turn of the conversation.
        system_prompt: System prompt passed alongside the message.
        client: Anthropic client used to open the stream.
        model: Model identifier to query.
        max_tokens: Cap on generated tokens. Defaults to 400, the value that
            used to be hard-coded here.

    Returns:
        ``(response_text, ttft_ms)``: the concatenated streamed text and the
        milliseconds elapsed between issuing the request and receiving the
        first text chunk. If the stream yields no text at all, the total
        elapsed time is reported instead.
    """
    start = time.perf_counter()
    ttft = None
    chunks: list[str] = []
    with client.messages.stream(
        model=model,
        max_tokens=max_tokens,
        system=system_prompt,
        messages=[{"role": "user", "content": query}],
    ) as stream:
        for text in stream.text_stream:
            if ttft is None:
                ttft = time.perf_counter() - start
            chunks.append(text)

    # Explicit None check: a measured 0.0 must not be mistaken for "no chunk seen".
    if ttft is None:
        ttft = time.perf_counter() - start
    return "".join(chunks), ttft * 1000


def _score_badge(val: float) -> str:
    """Render *val* with two decimals, prefixed by the icon for its rounded value.

    Scores whose rounded value has no entry in ``SCORE_COLORS`` get "⚪".
    """
    nearest = round(val)
    if nearest in SCORE_COLORS:
        marker = SCORE_COLORS[nearest]
    else:
        marker = "⚪"
    return "{} {:.2f}".format(marker, val)


# ── Session state ─────────────────────────────────────────────────────────────
# Seed the two keys read by the run/error/results sections below; keys that are
# already present keep their current value.
for _state_key in ("results", "run_error"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None


# ── Sidebar ───────────────────────────────────────────────────────────────────
with st.sidebar:
    st.title("context-eval")
    st.caption("Measure how much personal context improves LLM response quality")
    st.divider()

    # Context input: `context` ends up as a (possibly empty) string in every branch.
    st.subheader("Personal Context")
    context_source = st.radio("Source", ["Use example", "Paste text", "Upload file"], horizontal=True)

    if context_source == "Use example":
        context = DEFAULT_CONTEXT
        if context:
            st.info("Using `example_context.md`")
        else:
            # Don't claim the example is in use when there is nothing to use.
            st.warning("Example context `example_context.md` was not found or is empty.")
    elif context_source == "Upload file":
        uploaded = st.file_uploader("Upload .md or .txt", type=["md", "txt"])
        context = ""
        if not uploaded:
            st.warning("No file uploaded yet.")
        else:
            try:
                # getvalue() returns the whole buffer regardless of the read position.
                context = uploaded.getvalue().decode("utf-8")
            except UnicodeDecodeError:
                # A binary or differently-encoded file must not crash the page.
                st.error("Uploaded file is not valid UTF-8 text.")
            else:
                if not context:
                    st.warning("Uploaded file is empty.")
    else:
        context = st.text_area("Paste context here", height=220, value=DEFAULT_CONTEXT)

    st.divider()

    # Strategies: the order here is the order in which results are displayed.
    st.subheader("Strategies")
    use_none = st.checkbox("none (baseline)", value=True)
    use_full = st.checkbox("full context", value=True)
    use_compressed = st.checkbox("compressed context", value=True)

    strategy_names = [
        name
        for name, enabled in (
            ("none", use_none),
            ("full", use_full),
            ("compressed", use_compressed),
        )
        if enabled
    ]

    st.divider()

    # Model + CI threshold
    st.subheader("Settings")
    model = st.selectbox("Model", MODELS)
    threshold = st.slider("CI pass threshold", 1.0, 5.0, 3.5, 0.1,
                          help="Shown after run — does not block the UI, just reports pass/fail")

    st.divider()
    # The run is only possible with some context and at least one strategy.
    run_button = st.button("Run Eval", type="primary", use_container_width=True,
                           disabled=not context or not strategy_names)
    if not context:
        st.caption("Add context to enable run.")
    if not strategy_names:
        st.caption("Select at least one strategy.")


# ── Main ──────────────────────────────────────────────────────────────────────
st.header("context-eval")
st.caption("Compare LLM response quality across context injection strategies.")

# Benchmark query editor
st.subheader("Benchmark Queries")

benchmark_file = st.file_uploader("Load benchmark from YAML", type=["yaml", "yml"])
initial_queries = DEFAULT_QUERIES
if benchmark_file:
    # A malformed upload must not take the page down: report it and keep the defaults.
    try:
        loaded = yaml.safe_load(benchmark_file.getvalue().decode("utf-8"))
    except (UnicodeDecodeError, yaml.YAMLError) as exc:
        st.error(f"Could not read benchmark file: {exc}")
        loaded = None
    else:
        if not isinstance(loaded, dict):
            # Empty documents parse to None and bare lists/scalars have no `.get`.
            st.warning("Benchmark file has no top-level `queries` mapping; using default queries.")
    if isinstance(loaded, dict):
        initial_queries = loaded.get("queries", DEFAULT_QUERIES)

# Whatever was loaded must produce a table with a `query` column, because the
# rows are filtered on that column right after editing.
try:
    queries_frame = pd.DataFrame(initial_queries)
except (ValueError, TypeError):
    queries_frame = pd.DataFrame()
if "query" not in queries_frame.columns:
    st.warning("Benchmark queries need a `query` field; using default queries.")
    queries_frame = pd.DataFrame(DEFAULT_QUERIES)

queries_df = st.data_editor(
    queries_frame,
    num_rows="dynamic",
    use_container_width=True,
    column_config={
        "id":       st.column_config.TextColumn("ID",       width="small"),
        "category": st.column_config.TextColumn("Category", width="small"),
        "query":    st.column_config.TextColumn("Query",    width="large"),
    },
    hide_index=True,
)
# Rows without a query are dropped; each remaining row becomes a plain dict.
queries = queries_df.dropna(subset=["query"]).to_dict("records")


# ── Run ───────────────────────────────────────────────────────────────────────
if run_button:
    # Start from a clean slate so stale results or errors never outlive a new run.
    st.session_state.results = None
    st.session_state.run_error = None

    def _is_missing(value) -> bool:
        """Return True when *value* is None or a float NaN."""
        return value is None or (isinstance(value, float) and value != value)

    all_results: list[dict] = []

    # Select the rows worth running up front, keeping each row's original
    # position so progress labels and fallback ids still match the editor.
    runnable: list[tuple[int, dict, str]] = []
    for i, q in enumerate(queries):
        query_text = str(q.get("query", ""))
        if query_text.strip():
            runnable.append((i, q, query_text))

    if not runnable:
        # Nothing to evaluate: say so instead of preparing strategies for no queries.
        st.session_state.run_error = "No non-empty queries to run."
    else:
        try:
            # Constructed inside the try so that a failure here is reported
            # through run_error like every other failure of the run.
            client = anthropic.Anthropic()

            with st.status("Running eval...", expanded=True) as status:
                st.write("Preparing context strategies...")
                prepared = build_strategies(strategy_names, context, client)

                token_info = {
                    name: ctx.prompt_tokens for name, ctx in prepared.items()
                }
                st.write(f"Ready: {', '.join(f'{k} (~{v} tokens)' for k, v in token_info.items())}")

                progress = st.progress(0.0)

                for done, (i, q, query_text) in enumerate(runnable, start=1):
                    st.write(f"[{i+1}/{len(queries)}] {query_text[:70]}...")
                    strategy_results: dict[str, dict] = {}

                    for strategy_name, ctx in prepared.items():
                        response_text, ttft_ms = _run_query(query_text, ctx.system_prompt, client, model)
                        # The baseline is judged without the context it never saw.
                        context_for_judge = context if strategy_name != "none" else ""
                        score = score_response(query_text, response_text, context_for_judge, client)

                        strategy_results[strategy_name] = {
                            "response": response_text,
                            "ttft_ms": ttft_ms,
                            "prompt_tokens": ctx.prompt_tokens,
                            "score": {
                                "personalization": score.personalization,
                                "specificity": score.specificity,
                                "helpfulness": score.helpfulness,
                                "total": score.total,
                                "reasoning": score.reasoning,
                            },
                        }

                    # The key can exist with an empty value (None/NaN), so a
                    # plain dict default would never apply.
                    raw_id = q.get("id")
                    raw_category = q.get("category")
                    all_results.append({
                        "id": f"q{i+1}" if _is_missing(raw_id) else raw_id,
                        "query": query_text,
                        "category": "" if _is_missing(raw_category) else raw_category,
                        "strategies": strategy_results,
                    })
                    # Measured against runnable rows so the bar reaches 100%
                    # even when blank rows were skipped.
                    progress.progress(done / len(runnable))

                status.update(label="Eval complete!", state="complete")

            st.session_state.results = all_results

        except Exception as e:
            # Top-level UI boundary: surface any failure in the page. Fall back
            # to repr() so exceptions with an empty message are still shown.
            st.session_state.run_error = str(e) or repr(e)


# ── Error ─────────────────────────────────────────────────────────────────────
# Show the message recorded in session state under `run_error`, if any.
if (run_error_message := st.session_state.run_error):
    st.error(f"Error: {run_error_message}")


# ── Results ───────────────────────────────────────────────────────────────────
if st.session_state.results:
    results = st.session_state.results
    # Every result row carries the same strategies, so the first one names them all.
    all_strategy_names = list(results[0]["strategies"].keys())

    st.divider()
    st.subheader("Results")

    # ── Summary metrics ───────────────────────────────────────────────────────
    def _mean(lst):
        """Return the arithmetic mean of *lst*, or 0.0 for an empty list."""
        return sum(lst) / len(lst) if lst else 0.0

    score_dims = ("personalization", "specificity", "helpfulness")
    metric_keys = ("total", "ttft_ms", "prompt_tokens", *score_dims)

    # A single pass collects every per-strategy series the sections below need.
    agg: dict[str, dict] = {s: {key: [] for key in metric_keys} for s in all_strategy_names}
    for result in results:
        for name, data in result["strategies"].items():
            agg[name]["total"].append(data["score"]["total"])
            agg[name]["ttft_ms"].append(data["ttft_ms"])
            agg[name]["prompt_tokens"].append(data["prompt_tokens"])
            for dim in score_dims:
                agg[name][dim].append(data["score"][dim])

    summary_cols = st.columns(len(all_strategy_names))
    for col, strategy in zip(summary_cols, all_strategy_names):
        avg_score = _mean(agg[strategy]["total"])
        avg_ttft  = _mean(agg[strategy]["ttft_ms"])
        col.metric(
            label=strategy,
            value=f"{avg_score:.2f} / 5",
            delta=f"{avg_ttft:.0f} ms avg TTFT",
            delta_color="off",
        )

    # CI pass/fail: judged on "compressed" when it was run, otherwise on "full".
    best_strategy = next((s for s in ("compressed", "full") if s in agg), None)
    if best_strategy:
        best_score = _mean(agg[best_strategy]["total"])
        passed = best_score >= threshold
        if passed:
            st.success(f"CI PASSED — {best_strategy} score {best_score:.2f} >= threshold {threshold:.2f}")
        else:
            st.error(f"CI FAILED — {best_strategy} score {best_score:.2f} < threshold {threshold:.2f}")

    # ── Delta analysis ────────────────────────────────────────────────────────
    delta_data = []
    if "none" in agg:
        # Each context strategy compared against the no-context baseline.
        for strategy in all_strategy_names:
            if strategy == "none":
                continue
            score_delta  = _mean(agg[strategy]["total"]) - _mean(agg["none"]["total"])
            ttft_delta   = _mean(agg[strategy]["ttft_ms"]) - _mean(agg["none"]["ttft_ms"])
            token_cost   = _mean(agg[strategy]["prompt_tokens"])
            delta_data.append({
                "strategy":     strategy,
                "score delta":  f"{score_delta:+.2f}",
                "TTFT delta ms": f"{ttft_delta:+.0f}",
                "avg tokens":   f"~{token_cost:.0f}",
            })
    if "full" in agg and "compressed" in agg:
        # Uses no baseline data, so it is shown whether or not "none" was run.
        comp_vs_full = _mean(agg["compressed"]["total"]) - _mean(agg["full"]["total"])
        token_saving = _mean(agg["full"]["prompt_tokens"]) - _mean(agg["compressed"]["prompt_tokens"])
        delta_data.append({
            "strategy":     "compressed vs full",
            "score delta":  f"{comp_vs_full:+.2f}",
            "TTFT delta ms": "—",
            "avg tokens":   f"~{token_saving:.0f} saved",
        })
    if delta_data:
        st.subheader("Delta Analysis")
        st.dataframe(pd.DataFrame(delta_data), use_container_width=True, hide_index=True)

    # ── Summary table ─────────────────────────────────────────────────────────
    st.subheader("Summary Table")
    summary_rows = []
    for strategy in all_strategy_names:
        summary_rows.append({
            "strategy":    strategy,
            "avg score":   round(_mean(agg[strategy]["total"]), 2),
            **{dim: round(_mean(agg[strategy][dim]), 2) for dim in score_dims},
            "avg TTFT ms": round(_mean(agg[strategy]["ttft_ms"]), 0),
            "avg tokens":  round(_mean(agg[strategy]["prompt_tokens"]), 0),
        })
    st.dataframe(pd.DataFrame(summary_rows), use_container_width=True, hide_index=True)

    # ── Per-query breakdown ───────────────────────────────────────────────────
    st.subheader("Per-Query Breakdown")
    for result in results:
        with st.expander(f"[{result['category']}] {result['query'][:80]}"):
            # Score table
            rows = []
            for strategy, data in result["strategies"].items():
                scores = data["score"]
                rows.append({
                    "strategy":        strategy,
                    "personalization": scores["personalization"],
                    "specificity":     scores["specificity"],
                    "helpfulness":     scores["helpfulness"],
                    "score":           round(scores["total"], 2),
                    "TTFT ms":         round(data["ttft_ms"]),
                    "tokens":          data["prompt_tokens"],
                    "reasoning":       scores["reasoning"],
                })
            st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True)

            # Side-by-side responses
            resp_cols = st.columns(len(result["strategies"]))
            for col, (strategy, data) in zip(resp_cols, result["strategies"].items()):
                with col:
                    score = data["score"]["total"]
                    col.markdown(f"**{strategy}** — {_score_badge(score)}")
                    col.markdown(data["response"])

    # ── Download ──────────────────────────────────────────────────────────────
    st.divider()
    st.download_button(
        label="Download results (JSON)",
        data=json.dumps(results, indent=2),
        file_name="eval_results.json",
        mime="application/json",
    )