-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
120 lines (95 loc) · 4.02 KB
/
Copy pathmain.py
File metadata and controls
120 lines (95 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
Two-pass evaluate() pattern: full local results, partial tracing to LangSmith.
All dataset examples are evaluated locally. Only a configurable subset of runs
(and their correlated evaluator traces + feedback) are sent to LangSmith.
"""
import os
import random
from dotenv import load_dotenv
from langsmith import Client, evaluate
load_dotenv()
# Runtime configuration. Each value can be overridden through an environment
# variable of the same name; the literal shown is the fallback.

# Default for the ``sample_rate`` argument of evaluate_with_partial_tracing().
TRACING_SAMPLE_RATE = float(os.environ.get("TRACING_SAMPLE_RATE", "0.1"))
# Default name of the dataset whose examples are evaluated.
DATASET_NAME = os.environ.get("DATASET_NAME", "my-dataset")
# Default prefix from which the per-pass experiment names are built.
EXPERIMENT_PREFIX = os.environ.get("EXPERIMENT_PREFIX", "subset-tracing")
# ---------------------------------------------------------------------------
# Target function – replace with your actual LLM pipeline
# ---------------------------------------------------------------------------
def target(inputs: dict) -> dict:
    """Placeholder pipeline that echoes its inputs back as a string.

    Swap this out for the real chain/agent you want to evaluate.

    Returns:
        A dict with a single "output" key holding ``"echo: "`` followed by
        the string form of ``inputs``.
    """
    echoed = f"echo: {inputs}"
    return {"output": echoed}
# ---------------------------------------------------------------------------
# Evaluators – replace / extend with your own
# ---------------------------------------------------------------------------
def exact_match(run, example) -> dict:
    """Score whether the run's "output" equals the example's reference "output".

    Example evaluator. Replace with your real evaluation logic.

    Args:
        run: Object exposing an ``outputs`` attribute that is a dict or None.
        example: Object exposing an ``outputs`` attribute that is a dict or
            None.

    Returns:
        A dict with ``"key": "exact_match"`` and an integer ``"score"`` of 1
        when both outputs are equal, 0 otherwise. A missing "output" entry on
        either side is treated as the empty string.
    """
    # Guard ``run.outputs`` the same way as ``example.outputs``: when it is
    # None, score the run instead of raising AttributeError.
    prediction = (run.outputs or {}).get("output", "")
    reference = (example.outputs or {}).get("output", "")
    return {"key": "exact_match", "score": int(prediction == reference)}


# Evaluators applied by default in evaluate_with_partial_tracing().
EVALUATORS = [exact_match]
# ---------------------------------------------------------------------------
# Two-pass evaluation
# ---------------------------------------------------------------------------
def evaluate_with_partial_tracing(
    target_fn,
    *,
    dataset_name: str = DATASET_NAME,
    evaluators: list = EVALUATORS,
    sample_rate: float = TRACING_SAMPLE_RATE,
    experiment_prefix: str = EXPERIMENT_PREFIX,
    seed: int | None = None,
) -> dict:
    """Run evaluate() on a full dataset, tracing only a subset to LangSmith.

    The dataset's examples are shuffled and split in two. The first part is
    evaluated with ``upload_results=True``, the remainder with
    ``upload_results=False``; both passes use the same target and evaluators.

    Args:
        target_fn: Callable under evaluation, forwarded to ``evaluate()``.
        dataset_name: Name of the dataset whose examples are listed.
        evaluators: Evaluators forwarded to both ``evaluate()`` calls.
        sample_rate: Fraction of the examples placed in the traced subset.
            At least one example is always traced, even for a rate of 0.
        experiment_prefix: Base experiment prefix; "-traced" / "-local" is
            appended for the respective pass.
        seed: Seed for the shuffle; pass a value for a reproducible split.

    Returns:
        A dict with:
            traced_results   – results of the traced pass
            untraced_results – results of the local-only pass, or an empty
                               list when no examples were left for it
            all_results      – combined list of every individual result

    Raises:
        ValueError: If listing the dataset's examples yields nothing.
    """
    client = Client()

    # -- Partition examples ------------------------------------------------
    all_examples = list(client.list_examples(dataset_name=dataset_name))
    if not all_examples:
        raise ValueError(f"Dataset '{dataset_name}' is empty or does not exist.")

    rng = random.Random(seed)
    rng.shuffle(all_examples)

    # max(1, ...) guarantees a non-empty traced subset; slicing tolerates a
    # k larger than the list (sample_rate >= 1 simply traces everything).
    k = max(1, int(len(all_examples) * sample_rate))
    traced_examples = all_examples[:k]
    untraced_examples = all_examples[k:]
    print(
        f"Dataset has {len(all_examples)} examples. "
        f"Tracing {len(traced_examples)}, running {len(untraced_examples)} locally-only."
    )

    # -- Pass 1: traced subset (uploaded to LangSmith) ---------------------
    traced_results = evaluate(
        target_fn,
        data=traced_examples,
        evaluators=evaluators,
        experiment_prefix=f"{experiment_prefix}-traced",
        upload_results=True,
        client=client,
    )

    # -- Pass 2: untraced remainder (local only) ---------------------------
    # Skip the pass when every example landed in the traced subset (tiny
    # dataset or sample_rate >= 1): there is nothing to evaluate, and
    # evaluate() is not meant to be called with no examples.
    if untraced_examples:
        untraced_results = evaluate(
            target_fn,
            data=untraced_examples,
            evaluators=evaluators,
            experiment_prefix=f"{experiment_prefix}-local",
            upload_results=False,
            client=client,
        )
    else:
        untraced_results = []

    # -- Merge results -----------------------------------------------------
    # Materialize each pass once and reuse the lists for both the merge and
    # the summary counts.
    traced_list = list(traced_results)
    untraced_list = list(untraced_results)
    all_results = traced_list + untraced_list
    print(
        f"\nDone. {len(traced_list)} traced results in LangSmith, "
        f"{len(untraced_list)} local-only results. "
        f"{len(all_results)} total."
    )

    return {
        "traced_results": traced_results,
        "untraced_results": untraced_results,
        "all_results": all_results,
    }
if __name__ == "__main__":
    # Script entry point: evaluate the placeholder target with the default
    # configuration, then print every individual result (from both passes).
    summary = evaluate_with_partial_tracing(target)
    combined = summary["all_results"]
    for row in combined:
        print(row)