Merged
Changes from 17 commits
3 changes: 3 additions & 0 deletions src/autointent/generation/utterances/__init__.py
@@ -1,10 +1,13 @@
"""Generative methods for enriching dataset with synthetic samples."""

from ._adversarial import CriticHumanLike, HumanUtteranceGenerator
from ._basic import DatasetBalancer, UtteranceGenerator
from ._evolution import IncrementalUtteranceEvolver, UtteranceEvolver

__all__ = [
"CriticHumanLike",
"DatasetBalancer",
"HumanUtteranceGenerator",
"IncrementalUtteranceEvolver",
"UtteranceEvolver",
"UtteranceGenerator",
4 changes: 4 additions & 0 deletions src/autointent/generation/utterances/_adversarial/__init__.py
@@ -0,0 +1,4 @@
from .critic_human_like import CriticHumanLike
from .human_utterance_generator import HumanUtteranceGenerator

__all__ = ["CriticHumanLike", "HumanUtteranceGenerator"]
83 changes: 83 additions & 0 deletions src/autointent/generation/utterances/_adversarial/critic_human_like.py
@@ -0,0 +1,83 @@
"""CriticHumanLike class for distinguishing human vs generated utterances."""

from typing import Literal

from pydantic import BaseModel

from autointent.generation import Generator
from autointent.generation.chat_templates import Message, Role


class CriticResponse(BaseModel):
"""Structured answer."""

reasoning: str
label: Literal["human", "generated"]


class CriticHumanLike:
"""A simple critic class that classifies user utterances as either 'human' or 'generated'.

using an LLM-based binary classifier prompt.
"""

def __init__(self, generator: Generator, max_retries: int = 3) -> None:
"""Initialize the CriticFirst.

Args:
generator: Wrapper for the LLM API to generate classification responses.
max_retries: Maximum number of attempts to retry classification if the response is invalid.
"""
self.generator = generator
self.max_retries = max_retries

def build_classification_prompt(self, example: str, intent_name: str) -> Message:
"""Args.

example: The user utterance to classify.
intent_name: The name of the intent associated with the utterance.

Returns:
Message: A formatted message prompt for classification.
"""
content = (
"You are a critic that determines whether a user utterance was written by a human or "
"generated by a language model.\n\n"
f"Intent: {intent_name}\n"
f'Utterance: "{example}"\n\n'
"Here is an example of a human-written utterance for this intent:\n"
'"Could you please help me find the nearest coffee shop?"\n\n'
"Respond in **JSON format** with three keys:\n"
"- `reasoning`: a short chain-of-thought where you explain your logic\n"
"- `label`: must be either `human` or `generated`\n"
"Example:\n"
"{\n"
' "reasoning": "The phrasing includes casual contractions and natural hesitation. The utterance '
'flows similarly to how a human would speak spontaneously.",\n'
' "label": "human",\n'
"}"
)
return Message(role=Role.USER, content=content)

def is_human(self, utterance: str, intent_name: str) -> bool:
"""Args.

utterance: The utterance to evaluate.
intent_name: The associated intent.

Returns:
bool: True if classified as human, False otherwise.
"""
message = self.build_classification_prompt(utterance, intent_name)
response = self.generator.get_structured_output_sync(
messages=[message], output_model=CriticResponse, max_retries=self.max_retries
)
return response.label == "human"

async def is_human_async(self, utterance: str, intent_name: str) -> bool:
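"""Asynchronously classify whether an utterance was written by a human.

Args:
utterance: The utterance to evaluate.
intent_name: The associated intent.

Returns:
bool: True if classified as human, False otherwise.
"""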
message = self.build_classification_prompt(utterance, intent_name)

response = await self.generator.get_structured_output_async(
messages=[message], output_model=CriticResponse, max_retries=self.max_retries
)
return response.label == "human"
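
For reviewers, a minimal usage sketch of the critic (not part of the diff). The default `Generator()` construction and the intent name are assumptions for illustration:

from autointent.generation import Generator
from autointent.generation.utterances import CriticHumanLike

generator = Generator()  # assumption: default construction; configure the LLM backend as needed
critic = CriticHumanLike(generator, max_retries=3)

# True if the LLM judges the utterance to be human-written
print(critic.is_human("could you maybe help me find a coffee place nearby?", "FindCoffeeShop"))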
191 changes: 191 additions & 0 deletions src/autointent/generation/utterances/_adversarial/human_utterance_generator.py
@@ -0,0 +1,191 @@
import asyncio
import logging
import random
from collections import defaultdict
from functools import partial
from typing import Any

import aiometer
from datasets import Dataset as HFDataset
from datasets import concatenate_datasets

from autointent import Dataset
from autointent.custom_types import Split
from autointent.generation import Generator
from autointent.generation.chat_templates._evolution_templates_schemas import Message, Role
from autointent.schemas import Sample

from .critic_human_like import CriticHumanLike

logger = logging.getLogger(__name__)


class HumanUtteranceGenerator:
"""Generator of human-like utterances.

This class generates new user utterances that sound natural and human-like, grounded in
seed examples for each intent. Generation is iterative and attempts to bypass a critic
that flags machine-generated text.
"""

def __init__(
self,
generator: Generator,
critic: CriticHumanLike,
async_mode: bool = False,
max_at_once: int = 5,
max_per_second: int = 10,
) -> None:
"""Initialize the HumanUtteranceGeneratoror.

Args:
generator: Wrapper for the LLM API used to generate utterances.
critic: Critic to determine whether the generated utterance sounds human-like.
async_mode: Whether to use asynchronous mode for generation.
max_at_once: Maximum number of concurrent async tasks.
max_per_second: Maximum number of tasks per second.
"""
self.generator = generator
self.critic = critic
self.async_mode = async_mode
self.max_at_once = max_at_once
self.max_per_second = max_per_second

def augment(
self, dataset: Dataset, split_name: str = Split.TRAIN, update_split: bool = True, n_final_per_class: int = 5
) -> list[Sample]:
"""Generate human-like utterances for each intent by iteratively refining machine-generated candidates.

Args:
dataset: The dataset to augment.
split_name: The name of the split to augment (e.g., 'train').
update_split: Whether to update the dataset split with the new utterances.
n_final_per_class: Number of successful utterances to generate per intent.

Returns:
list[Sample]: List of newly generated samples.
"""
if self.async_mode:
return asyncio.run(
self.augment_async(
dataset=dataset,
split_name=split_name,
update_split=update_split,
n_final_per_class=n_final_per_class,
)
)
original_split = dataset[split_name]
id_to_name = {intent.id: intent.name for intent in dataset.intents}
new_samples = []

class_to_samples = defaultdict(list)
for sample in original_split:
class_to_samples[sample["label"]].append(sample["utterance"])

for intent_id, intent_name in id_to_name.items():
if intent_name is None:
logger.warning("Intent with id %s has no name! Skipping it...", intent_id)
continue
generated_count = 0
attempt = 0

seed_utterances = class_to_samples.get(intent_id, [])
if not seed_utterances:
continue

while generated_count < n_final_per_class and attempt < n_final_per_class * 3:
attempt += 1
n_seeds = min(3, len(seed_utterances))
seed_examples = random.sample(seed_utterances, k=n_seeds)
rejected: list[str] = []

for _ in range(3):
prompt = self._build_adversarial_prompt(intent_name, seed_examples, rejected)
generated = self.generator.get_chat_completion([prompt]).strip()
if self.critic.is_human(generated, intent_name):
new_samples.append({Dataset.label_feature: intent_id, Dataset.utterance_feature: generated})
generated_count += 1
break
rejected.append(generated)
if update_split:
generated_split = HFDataset.from_list(new_samples)
dataset[split_name] = concatenate_datasets([original_split, generated_split])

return [Sample(**sample) for sample in new_samples]

async def augment_async(
self, dataset: Dataset, split_name: str = Split.TRAIN, update_split: bool = True, n_final_per_class: int = 5
) -> list[Sample]:
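"""Asynchronously generate human-like utterances for each intent.

Args:
dataset: The dataset to augment.
split_name: The name of the split to augment (e.g., 'train').
update_split: Whether to update the dataset split with the new utterances.
n_final_per_class: Number of successful utterances to generate per intent.

Returns:
list[Sample]: List of newly generated samples.
"""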
original_split = dataset[split_name]
id_to_name = {intent.id: intent.name for intent in dataset.intents}
new_samples = []

class_to_samples = defaultdict(list)
for sample in original_split:
class_to_samples[sample["label"]].append(sample["utterance"])

async def generate_one(intent_id: int, intent_name: str) -> list[dict[str, Any]]:
generated: list[dict[str, Any]] = []
attempts = 0
seed_utterances = class_to_samples[intent_id]
while len(generated) < n_final_per_class and attempts < n_final_per_class * 3:
attempts += 1
seed_examples = random.sample(seed_utterances, k=min(3, len(seed_utterances)))
rejected: list[str] = []

for _ in range(3):
prompt = self._build_adversarial_prompt(intent_name, seed_examples, rejected)
utterance = (await self.generator.get_chat_completion_async([prompt])).strip()
if await self.critic.is_human_async(utterance, intent_name):
generated.append({Dataset.label_feature: intent_id, Dataset.utterance_feature: utterance})
break
rejected.append(utterance)
return generated

tasks = [
partial(generate_one, intent_id, intent_name)
for intent_id, intent_name in id_to_name.items()
if class_to_samples.get(intent_id) and intent_name is not None
]

results = await aiometer.run_all(
tasks,
max_at_once=self.max_at_once,
max_per_second=self.max_per_second,
)

for result in results:
new_samples.extend(result)
if update_split:
generated_split = HFDataset.from_list(new_samples)
dataset[split_name] = concatenate_datasets([original_split, generated_split])

return [Sample(**sample) for sample in new_samples]

def _build_adversarial_prompt(self, intent_name: str, seed_examples: list[str], rejected: list[str]) -> Message:
"""Build a few-shot prompt.

Build a few-shot prompt to guide the generator to create a new human-like utterance
from scratch based on the intent name and example utterances.

Args:
intent_name: The intent of the utterance.
seed_examples: List of 1-3 example utterances for the intent.
rejected: List of previously rejected generations.

Returns:
Message: A formatted prompt instructing the generator to produce a new natural-sounding utterance.
"""
rejected_block = "\n".join(f"- {r}" for r in rejected) if rejected else "None"
examples_block = "\n".join(f'- "{ex}"' for ex in seed_examples)
content = (
f"Your task is to generate a new user utterance that fits the intent '{intent_name}'.\n\n"
"Here are some examples of utterances for this intent:\n"
f"{examples_block}\n\n"
"Preserving its original intent: "
f"'{intent_name}'.\n\n"
f"The following previous attempts were classified as machine-generated and rejected:\n{rejected_block}\n\n"
"Try to write something that would pass as written by a real human. Output a single version only.\n"
"IMPORTANT: You must modify the original utterance."
)
return Message(role=Role.USER, content=content)
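
For reviewers, an end-to-end sketch of the augmentation flow (not part of the diff). The default `Generator()` construction is an assumption; the dataset mirrors the test fixture below:

from autointent import Dataset
from autointent.generation import Generator
from autointent.generation.utterances import CriticHumanLike, HumanUtteranceGenerator

generator = Generator()  # assumption: default construction
critic = CriticHumanLike(generator)
augmenter = HumanUtteranceGenerator(generator, critic, async_mode=True, max_at_once=5, max_per_second=10)

dataset = Dataset.from_dict(
    {
        "intents": [{"id": 0, "name": "Greeting"}],
        "train": [{"utterance": "hello", "label": 0}, {"utterance": "hi there", "label": 0}],
    }
)

# Generates up to 5 critic-approved utterances per intent and appends them to the train split.
new_samples = augmenter.augment(dataset, split_name="train", n_final_per_class=5)
print(f"added {len(new_samples)} human-like utterances")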
75 changes: 75 additions & 0 deletions tests/generation/utterances/test_adversarial.py
@@ -0,0 +1,75 @@
from unittest.mock import AsyncMock, Mock

import pytest

from autointent import Dataset
from autointent.generation.utterances import CriticHumanLike, HumanUtteranceGenerator
from autointent.schemas import Sample


@pytest.fixture
def dataset():
return Dataset.from_dict(
{
"intents": [
{"id": 0, "name": "Greeting"},
{"id": 1, "name": "OrderFood"},
],
"train": [
{"utterance": "hello", "label": 0},
{"utterance": "hi there", "label": 0},
{"utterance": "i want pizza", "label": 1},
],
}
)


def test_human_utterance_generator_sync(dataset):
mock_llm = Mock()
mock_llm.get_chat_completion.return_value = "Human-like utterance"

mock_critic = Mock(spec=CriticHumanLike)
mock_critic.is_human.return_value = True

generator = HumanUtteranceGenerator(mock_llm, mock_critic, async_mode=False)

n_before = len(dataset["train"])
new_samples = generator.augment(dataset, split_name="train", update_split=False, n_final_per_class=2)
n_after = len(dataset["train"])

assert n_before == n_after
assert len(new_samples) > 0
assert all(isinstance(sample, Sample) for sample in new_samples)
assert all("utterance" in sample.dict() for sample in new_samples)
assert all("label" in sample.dict() for sample in new_samples)


def test_human_utterance_generator_async(dataset):
mock_llm = AsyncMock()
mock_llm.get_chat_completion_async.return_value = "Human-like utterance"

mock_critic = AsyncMock(spec=CriticHumanLike)
mock_critic.is_human_async.return_value = True

generator = HumanUtteranceGenerator(mock_llm, mock_critic, async_mode=True)

n_before = len(dataset["train"])
new_samples = generator.augment(dataset, split_name="train", update_split=False, n_final_per_class=2)
n_after = len(dataset["train"])
assert n_before == n_after
assert len(new_samples) > 0
assert all(isinstance(sample, Sample) for sample in new_samples)
assert all("utterance" in sample.dict() for sample in new_samples)
assert all("label" in sample.dict() for sample in new_samples)


def test_human_utterance_generator_respects_critic(dataset):
mock_llm = Mock()
mock_llm.get_chat_completion.return_value = "Generated utterance"

mock_critic = Mock(spec=CriticHumanLike)
mock_critic.is_human.return_value = True
generator = HumanUtteranceGenerator(mock_llm, mock_critic, async_mode=False)
new_samples = generator.augment(dataset, split_name="train", update_split=False, n_final_per_class=1)
assert len(new_samples) > 0
assert mock_critic.is_human.call_count >= 1