Skip to content
6 changes: 5 additions & 1 deletion docs/evals/core-concepts.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import IsInstance

dataset = Dataset(
name='my_eval_suite', # Optional name
name='my_eval_suite',
cases=[
Case(inputs='test input', expected_output='test output'),
],
Expand Down Expand Up @@ -76,6 +76,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EqualsExpected, IsInstance

dataset = Dataset(
name='case_level_evaluators',
cases=[
Case(
name='special_case',
Expand Down Expand Up @@ -107,6 +108,7 @@ from pydantic_evals import Case, Dataset

# Define your dataset (static definition)
dataset = Dataset(
name='uppercase_experiment',
cases=[
Case(inputs='hello', expected_output='HELLO'),
Case(inputs='world', expected_output='WORLD'),
Expand Down Expand Up @@ -146,6 +148,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EqualsExpected

dataset = Dataset(
name='comparison_test',
cases=[
Case(inputs='hello', expected_output='HELLO'),
],
Expand Down Expand Up @@ -376,6 +379,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EqualsExpected

dataset = Dataset(
name='report_example',
cases=[Case(inputs='hello', expected_output='HELLO')],
evaluators=[EqualsExpected()],
)
Expand Down
5 changes: 5 additions & 0 deletions docs/evals/evaluators/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge

dataset = Dataset(
name='llm_judge_example',
cases=[Case(inputs='What is 2+2?', expected_output='4')],
evaluators=[
LLMJudge(
Expand Down Expand Up @@ -213,6 +214,7 @@ from pydantic_evals.evaluators import (
)

dataset = Dataset(
name='layered_evaluation',
cases=[Case(inputs='test', expected_output='result')],
evaluators=[
# Fast deterministic checks first
Expand All @@ -237,6 +239,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import IsInstance, LLMJudge

dataset = Dataset(
name='case_specific_evaluators',
cases=[
Case(
name='greeting_response',
Expand Down Expand Up @@ -287,6 +290,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge

dataset = Dataset(
name='golden_dataset',
cases=[
Case(
name='handle_refund_request',
Expand Down Expand Up @@ -447,6 +451,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import ConfusionMatrixEvaluator

dataset = Dataset(
name='report_evaluator_example',
cases=[
Case(inputs='meow', expected_output='cat'),
Case(inputs='woof', expected_output='dog'),
Expand Down
5 changes: 5 additions & 0 deletions docs/evals/evaluators/report-evaluators.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def my_classifier(text: str) -> str:


dataset = Dataset(
name='animal_classifier',
cases=[
Case(name='cat', inputs='The cat goes meow', expected_output='cat'),
Case(name='dog', inputs='The dog barks', expected_output='dog'),
Expand Down Expand Up @@ -95,6 +96,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import ConfusionMatrixEvaluator

dataset = Dataset(
name='animal_sounds',
cases=[
Case(inputs='meow', expected_output='cat'),
Case(inputs='woof', expected_output='dog'),
Expand Down Expand Up @@ -136,6 +138,7 @@ def categorize(output: str) -> str:


dataset = Dataset(
name='labels_example',
cases=[Case(inputs='test', expected_output='positive')],
evaluators=[ClassifyOutput()],
report_evaluators=[
Expand Down Expand Up @@ -223,6 +226,7 @@ def calculate_confidence(output: str) -> float:


dataset = Dataset(
name='precision_recall_example',
cases=[
Case(inputs='test 1', expected_output='cat'),
Case(inputs='test 2', expected_output='dog'),
Expand Down Expand Up @@ -694,6 +698,7 @@ class AccuracyEvaluator(ReportEvaluator):


dataset = Dataset(
name='full_example',
cases=[
Case(inputs='The cat meows', expected_output='cat'),
Case(inputs='The dog barks', expected_output='dog'),
Expand Down
4 changes: 4 additions & 0 deletions docs/evals/quick-start.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ from pydantic_evals.evaluators import Contains, EqualsExpected

# Create a dataset with test cases
dataset = Dataset(
name='uppercase_tests',
cases=[
Case(
name='uppercase_basic',
Expand Down Expand Up @@ -117,6 +118,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Contains, IsInstance

dataset = Dataset(
name='dict_validation',
cases=[
Case(inputs={'data': 'required_key present'}, expected_output={'result': 'success'}),
],
Expand All @@ -136,6 +138,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge

dataset = Dataset(
name='llm_judge_test',
cases=[
Case(inputs='What is the capital of France?', expected_output='Paris'),
],
Expand All @@ -158,6 +161,7 @@ from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import MaxDuration

dataset = Dataset(
name='performance_test',
cases=[
Case(inputs='test input', expected_output='test output'),
],
Expand Down
2 changes: 2 additions & 0 deletions pydantic_evals/pydantic_evals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@
- Generating reports for evaluation results
"""

from ._warnings import PydanticEvalsDeprecationWarning
from .dataset import Case, Dataset, increment_eval_metric, set_eval_attribute
from .lifecycle import CaseLifecycle

__all__ = (
'Case',
'CaseLifecycle',
'Dataset',
'PydanticEvalsDeprecationWarning',
'increment_eval_metric',
'set_eval_attribute',
)
10 changes: 10 additions & 0 deletions pydantic_evals/pydantic_evals/_warnings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from __future__ import annotations


class PydanticEvalsDeprecationWarning(UserWarning):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice choice following the UserWarning pattern for library deprecations. Pydantic core uses PydanticDeprecatedSince20 (inheriting DeprecationWarning) — was there a deliberate decision to diverge from that pattern for pydantic_evals, or would it be worth aligning? Either way is reasonable, but if the intent is to eventually align with Pydantic core's approach, it's worth noting. @DouweM any preference here?

"""Warning emitted when a deprecated Pydantic Evals API is used.
Inherits from `UserWarning` instead of `DeprecationWarning` so that
deprecations are visible by default at runtime, following the approach
described in https://sethmlarson.dev/deprecations-via-warnings-dont-work-for-python-libraries.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, sounds like we should be doing something like this for Pydantic AI deprecation warnings as well.

"""
19 changes: 14 additions & 5 deletions pydantic_evals/pydantic_evals/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from pydantic_evals._utils import get_event_loop

from ._utils import get_unwrapped_function_name, logfire_span, task_group_gather
from ._warnings import PydanticEvalsDeprecationWarning
from .evaluators import EvaluationResult, Evaluator
from .evaluators._base import BaseEvaluator
from .evaluators._run_evaluator import run_evaluator
Expand Down Expand Up @@ -196,6 +197,7 @@ def evaluate(self, ctx: EvaluatorContext) -> bool:
return ctx.output == ctx.expected_output

dataset = Dataset(
name='uppercase_tests',
cases=[
Case(name='test1', inputs={'text': 'Hello'}, expected_output='HELLO'),
Case(name='test2', inputs={'text': 'World'}, expected_output='WORLD'),
Expand Down Expand Up @@ -226,7 +228,7 @@ async def main():
"""

name: str | None = None
"""Optional name of the dataset."""
"""Name of the dataset. Required in future versions."""
cases: list[Case[InputsT, OutputT, MetadataT]]
"""List of test cases in the dataset."""
evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = []
Expand All @@ -245,11 +247,18 @@ def __init__(
"""Initialize a new dataset with test cases and optional evaluators.

Args:
name: Optional name for the dataset.
name: Name for the dataset. Omitting this is deprecated and will raise an error in a future version.
cases: Sequence of test cases to include in the dataset.
evaluators: Optional sequence of evaluators to apply to all cases in the dataset.
report_evaluators: Optional sequence of report evaluators that run on the full evaluation report.
"""
if name is None:
warnings.warn(
'Omitting the `name` parameter is deprecated. Please provide a name for your `Dataset`.',
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The deprecation message says "will raise an error in a future version" — since the version policy states deprecated features are removed in V2, it would be more helpful to users to say something like '...will be required in a future version. Please provide a name for your `Dataset`.' or even mention V2 explicitly, so users can prioritize the migration.

Also, stacklevel=2 is correct for direct Dataset() construction, but when this fires through `_from_dataset_model`'s `cls(name=None, ...)` call (e.g. from from_text or from_dict without default_name), the warning will point at _from_dataset_model, which is an internal method. Consider emitting the warning separately (or adjusting stacklevel) in the _from_dataset_model path so the warning points to the user's from_text()/from_dict() call instead.

PydanticEvalsDeprecationWarning,
stacklevel=2,
)
Comment on lines +255 to +260
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Scope deprecation warning to avoid hard failures under -W error

Dataset.__init__ now unconditionally emits a DeprecationWarning whenever name is None, and this warning becomes an exception in environments that treat warnings as errors (including this repo’s pytest config with filterwarnings = ["error", ...]). That means existing unnamed-dataset code paths (for example current tests like tests/evals/test_report_evaluators.py) now fail at construction time instead of exercising behavior, so this change is effectively breaking rather than deprecating for strict-warning users. Consider limiting the warning to explicit end-user constructor calls or updating all remaining internal unnamed call sites in the same change.

Useful? React with 👍 / 👎.

Comment on lines +255 to +260
Copy link
Copy Markdown
Contributor

@devin-ai-integration devin-ai-integration bot Mar 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚩 Incomplete deprecation migration across docs

The PR updates name in a subset of doc files (quick-start, core-concepts, overview, report-evaluators), but dozens of other doc files still have Dataset(cases=...) without name — e.g., docs/evals/evaluators/llm-judge.md:384, docs/evals/how-to/concurrency.md:19, docs/evals/how-to/retry-strategies.md:33, docs/evals/how-to/metrics-attributes.md:121, docs/evals.md:150, pydantic_evals/README.md:47, and pydantic_evals/pydantic_evals/lifecycle.py:59 (docstring). The test_examples.py:132 filter `'ignore:Omitting the `name` parameter is deprecated'` ensures tests pass, but per docs/AGENTS.md rule:714 ("Omit deprecated features from user-facing docs"), these remaining usages should ideally be updated in a follow-up to avoid teaching users the deprecated pattern.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.


case_names = set[str]()
for case in cases:
if case.name is None:
Expand Down Expand Up @@ -727,9 +736,9 @@ def _from_dataset_model(
cases.append(row)
if errors:
raise ExceptionGroup(f'{len(errors)} error(s) loading evaluators from registry', errors[:3])
result = cls(name=dataset_model.name, cases=cases, report_evaluators=report_evaluators)
if result.name is None:
result.name = default_name
# Use default_name if no name was provided in the serialized data
name = dataset_model.name if dataset_model.name is not None else default_name
result = cls(name=name, cases=cases, report_evaluators=report_evaluators)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change is correct — it avoids the deprecation warning when loading from file (where default_name=path.stem). However, there's no test covering this behavior: that from_file with a YAML/JSON file that has no name field uses the filename stem and does not emit the deprecation warning. Adding such a test would guard against regressions if this code is refactored.

Comment on lines +739 to +741
Copy link
Copy Markdown
Contributor

@devin-ai-integration devin-ai-integration bot Mar 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Deprecation warning stacklevel=2 points to internal code when triggered via from_text/from_dict

When from_text() or from_dict() is called without a name in the serialized data and without providing default_name, _from_dataset_model computes name=None (pydantic_evals/pydantic_evals/dataset.py:740) and calls cls(name=None, ...) (dataset.py:741). This triggers the deprecation warning at dataset.py:256 with stacklevel=2, which attributes the warning to _from_dataset_model (an internal method) rather than the user's from_text()/from_dict() call. The stacklevel=2 is correct for the direct Dataset(cases=[...]) construction path, but not for the deserialization path where the call stack is deeper. Note: from_file is unaffected because it always provides default_name=path.stem.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Claude:

it's fixable, but every approach adds some complexity. Here's the tradeoff:

  The problem: __init__'s stacklevel=2 is correct for direct Dataset(cases=[...]) calls but wrong for the from_text/from_dict → _from_dataset_model → cls()
   chain, where the user's frame is 3-5 levels up instead of 1.

  The practical impact: After our changes, this path almost never fires because:
  - from_file always provides default_name=path.stem
  - generate_dataset now provides default_name='generated'
  - All doc examples include name in serialized data

  A user would need to call from_text/from_dict with data that has no name AND no default_name. Even then, the warning message itself ("Please provide a
  name for your Dataset") is clear about what to fix.

  Cleanest fix if we wanted to do it: suppress the __init__ warning inside _from_dataset_model with warnings.catch_warnings(), then re-emit with a
  _stacklevel_offset parameter passed through from each public caller. But that's plumbing through 3 layers for a rare edge case.

  My take: not worth the complexity for this PR. Worth noting in review but fine to leave as-is.

result.evaluators = dataset_evaluators
return result

Expand Down
Loading
Loading