Skip to content
6 changes: 5 additions & 1 deletion docs/evals/core-concepts.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import IsInstance

dataset = Dataset(
name='my_eval_suite', # Optional name
name='my_eval_suite',
cases=[
Case(inputs='test input', expected_output='test output'),
],
Expand Down Expand Up @@ -76,6 +76,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EqualsExpected, IsInstance

dataset = Dataset(
name='case_level_evaluators',
cases=[
Case(
name='special_case',
Expand Down Expand Up @@ -107,6 +108,7 @@ from pydantic_evals import Case, Dataset

# Define your dataset (static definition)
dataset = Dataset(
name='uppercase_experiment',
cases=[
Case(inputs='hello', expected_output='HELLO'),
Case(inputs='world', expected_output='WORLD'),
Expand Down Expand Up @@ -146,6 +148,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EqualsExpected

dataset = Dataset(
name='comparison_test',
cases=[
Case(inputs='hello', expected_output='HELLO'),
],
Expand Down Expand Up @@ -376,6 +379,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EqualsExpected

dataset = Dataset(
name='report_example',
cases=[Case(inputs='hello', expected_output='HELLO')],
evaluators=[EqualsExpected()],
)
Expand Down
5 changes: 5 additions & 0 deletions docs/evals/evaluators/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge

dataset = Dataset(
name='llm_judge_example',
cases=[Case(inputs='What is 2+2?', expected_output='4')],
evaluators=[
LLMJudge(
Expand Down Expand Up @@ -213,6 +214,7 @@ from pydantic_evals.evaluators import (
)

dataset = Dataset(
name='layered_evaluation',
cases=[Case(inputs='test', expected_output='result')],
evaluators=[
# Fast deterministic checks first
Expand All @@ -237,6 +239,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import IsInstance, LLMJudge

dataset = Dataset(
name='case_specific_evaluators',
cases=[
Case(
name='greeting_response',
Expand Down Expand Up @@ -287,6 +290,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge

dataset = Dataset(
name='golden_dataset',
cases=[
Case(
name='handle_refund_request',
Expand Down Expand Up @@ -447,6 +451,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import ConfusionMatrixEvaluator

dataset = Dataset(
name='report_evaluator_example',
cases=[
Case(inputs='meow', expected_output='cat'),
Case(inputs='woof', expected_output='dog'),
Expand Down
5 changes: 5 additions & 0 deletions docs/evals/evaluators/report-evaluators.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def my_classifier(text: str) -> str:


dataset = Dataset(
name='animal_classifier',
cases=[
Case(name='cat', inputs='The cat goes meow', expected_output='cat'),
Case(name='dog', inputs='The dog barks', expected_output='dog'),
Expand Down Expand Up @@ -95,6 +96,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import ConfusionMatrixEvaluator

dataset = Dataset(
name='animal_sounds',
cases=[
Case(inputs='meow', expected_output='cat'),
Case(inputs='woof', expected_output='dog'),
Expand Down Expand Up @@ -136,6 +138,7 @@ def categorize(output: str) -> str:


dataset = Dataset(
name='labels_example',
cases=[Case(inputs='test', expected_output='positive')],
evaluators=[ClassifyOutput()],
report_evaluators=[
Expand Down Expand Up @@ -223,6 +226,7 @@ def calculate_confidence(output: str) -> float:


dataset = Dataset(
name='precision_recall_example',
cases=[
Case(inputs='test 1', expected_output='cat'),
Case(inputs='test 2', expected_output='dog'),
Expand Down Expand Up @@ -694,6 +698,7 @@ class AccuracyEvaluator(ReportEvaluator):


dataset = Dataset(
name='full_example',
cases=[
Case(inputs='The cat meows', expected_output='cat'),
Case(inputs='The dog barks', expected_output='dog'),
Expand Down
4 changes: 4 additions & 0 deletions docs/evals/quick-start.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ from pydantic_evals.evaluators import Contains, EqualsExpected

# Create a dataset with test cases
dataset = Dataset(
name='uppercase_tests',
cases=[
Case(
name='uppercase_basic',
Expand Down Expand Up @@ -117,6 +118,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Contains, IsInstance

dataset = Dataset(
name='dict_validation',
cases=[
Case(inputs={'data': 'required_key present'}, expected_output={'result': 'success'}),
],
Expand All @@ -136,6 +138,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge

dataset = Dataset(
name='llm_judge_test',
cases=[
Case(inputs='What is the capital of France?', expected_output='Paris'),
],
Expand All @@ -158,6 +161,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import MaxDuration

dataset = Dataset(
name='performance_test',
cases=[
Case(inputs='test input', expected_output='test output'),
],
Expand Down
19 changes: 14 additions & 5 deletions pydantic_evals/pydantic_evals/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ def evaluate(self, ctx: EvaluatorContext) -> bool:
return ctx.output == ctx.expected_output

dataset = Dataset(
name='uppercase_tests',
cases=[
Case(name='test1', inputs={'text': 'Hello'}, expected_output='HELLO'),
Case(name='test2', inputs={'text': 'World'}, expected_output='WORLD'),
Expand Down Expand Up @@ -226,7 +227,7 @@ async def main():
"""

name: str | None = None
"""Optional name of the dataset."""
"""Name of the dataset. Required in future versions."""
cases: list[Case[InputsT, OutputT, MetadataT]]
"""List of test cases in the dataset."""
evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = []
Expand All @@ -245,11 +246,19 @@ def __init__(
"""Initialize a new dataset with test cases and optional evaluators.

Args:
name: Optional name for the dataset.
name: Name for the dataset. Omitting this is deprecated and will raise an error in a future version.
cases: Sequence of test cases to include in the dataset.
evaluators: Optional sequence of evaluators to apply to all cases in the dataset.
report_evaluators: Optional sequence of report evaluators that run on the full evaluation report.
"""
if name is None:
warnings.warn(
'Omitting the `name` parameter is deprecated. '
'Please provide a name for your Dataset.',
DeprecationWarning,
stacklevel=2,
)
Comment on lines +255 to +260
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Scope deprecation warning to avoid hard failures under -W error

Dataset.__init__ now unconditionally emits a DeprecationWarning whenever name is None, and this warning becomes an exception in environments that treat warnings as errors (including this repo’s pytest config with filterwarnings = ["error", ...]). That means existing unnamed-dataset code paths (for example current tests like tests/evals/test_report_evaluators.py) now fail at construction time instead of exercising behavior, so this change is effectively breaking rather than deprecating for strict-warning users. Consider limiting the warning to explicit end-user constructor calls or updating all remaining internal unnamed call sites in the same change.

Useful? React with 👍 / 👎.

Comment on lines +255 to +260
Copy link
Copy Markdown
Contributor

@devin-ai-integration devin-ai-integration bot Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚩 Incomplete deprecation migration across docs

The PR updates `name` in a subset of doc files (quick-start, core-concepts, overview, report-evaluators), but dozens of other doc files still have `Dataset(cases=...)` without `name` — e.g., `docs/evals/evaluators/llm-judge.md:384`, `docs/evals/how-to/concurrency.md:19`, `docs/evals/how-to/retry-strategies.md:33`, `docs/evals/how-to/metrics-attributes.md:121`, `docs/evals.md:150`, `pydantic_evals/README.md:47`, and `pydantic_evals/pydantic_evals/lifecycle.py:59` (docstring). The `test_examples.py:132` filter `'ignore:Omitting the `name` parameter is deprecated'` ensures tests pass, but per the `docs/AGENTS.md` rule at line 714 ("Omit deprecated features from user-facing docs"), these remaining usages should ideally be updated in a follow-up to avoid teaching users the deprecated pattern.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.


case_names = set[str]()
for case in cases:
if case.name is None:
Expand Down Expand Up @@ -727,9 +736,9 @@ def _from_dataset_model(
cases.append(row)
if errors:
raise ExceptionGroup(f'{len(errors)} error(s) loading evaluators from registry', errors[:3])
result = cls(name=dataset_model.name, cases=cases, report_evaluators=report_evaluators)
if result.name is None:
result.name = default_name
# Use default_name if no name was provided in the serialized data
name = dataset_model.name if dataset_model.name is not None else default_name
result = cls(name=name, cases=cases, report_evaluators=report_evaluators)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change is correct — it avoids the deprecation warning when loading from file (where default_name=path.stem). However, there's no test covering this behavior: that from_file with a YAML/JSON file that has no name field uses the filename stem and does not emit the deprecation warning. Adding such a test would guard against regressions if this code is refactored.

Comment on lines +739 to +741
Copy link
Copy Markdown
Contributor

@devin-ai-integration devin-ai-integration bot Mar 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Deprecation warning stacklevel=2 points to internal code when triggered via from_text/from_dict

When from_text() or from_dict() is called without a name in the serialized data and without providing default_name, _from_dataset_model computes name=None (pydantic_evals/pydantic_evals/dataset.py:740) and calls cls(name=None, ...) (dataset.py:741). This triggers the deprecation warning at dataset.py:256 with stacklevel=2, which attributes the warning to _from_dataset_model (an internal method) rather than the user's from_text()/from_dict() call. The stacklevel=2 is correct for the direct Dataset(cases=[...]) construction path, but not for the deserialization path where the call stack is deeper. Note: from_file is unaffected because it always provides default_name=path.stem.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Claude:

it's fixable, but every approach adds some complexity. Here's the tradeoff:

  The problem: __init__'s stacklevel=2 is correct for direct Dataset(cases=[...]) calls but wrong for the from_text/from_dict → _from_dataset_model → cls()
   chain, where the user's frame is 3-5 levels up instead of 1.

  The practical impact: After our changes, this path almost never fires because:
  - from_file always provides default_name=path.stem
  - generate_dataset now provides default_name='generated'
  - All doc examples include name in serialized data

  A user would need to call from_text/from_dict with data that has no name AND no default_name. Even then, the warning message itself ("Please provide a
  name for your Dataset") is clear about what to fix.

  Cleanest fix if we wanted to do it: suppress the __init__ warning inside _from_dataset_model with warnings.catch_warnings(), then re-emit with a
  _stacklevel_offset parameter passed through from each public caller. But that's plumbing through 3 layers for a rare edge case.

  My take: not worth the complexity for this PR. Worth noting in review but fine to leave as-is.

result.evaluators = dataset_evaluators
return result

Expand Down
Loading
Loading