From 320711f2ad706cd4ce6ff3016099919681471204 Mon Sep 17 00:00:00 2001 From: zyzshishui Date: Sun, 26 Oct 2025 01:23:12 +0000 Subject: [PATCH 01/16] ifbench --- examples/eval_multi_task/gpqa-dev.yaml | 12 - .../{gpqa-dev.sh => multi_task.sh} | 4 +- examples/eval_multi_task/multi_task.yaml | 15 + slime/rollout/rm_hub/__init__.py | 3 + slime/rollout/rm_hub/ifbench.py | 161 ++ .../rm_hub/ifbench_utils/instructions.py | 2347 +++++++++++++++++ .../ifbench_utils/instructions_registry.py | 79 + .../rm_hub/ifbench_utils/instructions_util.py | 1651 ++++++++++++ 8 files changed, 4258 insertions(+), 14 deletions(-) delete mode 100644 examples/eval_multi_task/gpqa-dev.yaml rename examples/eval_multi_task/{gpqa-dev.sh => multi_task.sh} (97%) create mode 100644 examples/eval_multi_task/multi_task.yaml create mode 100644 slime/rollout/rm_hub/ifbench.py create mode 100644 slime/rollout/rm_hub/ifbench_utils/instructions.py create mode 100644 slime/rollout/rm_hub/ifbench_utils/instructions_registry.py create mode 100644 slime/rollout/rm_hub/ifbench_utils/instructions_util.py diff --git a/examples/eval_multi_task/gpqa-dev.yaml b/examples/eval_multi_task/gpqa-dev.yaml deleted file mode 100644 index 2592b93b6..000000000 --- a/examples/eval_multi_task/gpqa-dev.yaml +++ /dev/null @@ -1,12 +0,0 @@ -eval: - defaults: - n_samples_per_eval_prompt: 2 - max_response_len: 16384 - top_p: 0.7 - datasets: - - name: aime - path: /root/aime-2024/aime-2024.jsonl - rm_type: deepscaler - - name: gpqa # huggingface-cli download --repo-type dataset zyzshishui0627/gpqa_diamond --local-dir /root/gpqa - path: /root/gpqa/gpqa_eval.jsonl - rm_type: gpqa diff --git a/examples/eval_multi_task/gpqa-dev.sh b/examples/eval_multi_task/multi_task.sh similarity index 97% rename from examples/eval_multi_task/gpqa-dev.sh rename to examples/eval_multi_task/multi_task.sh index 8591fe394..9a69dcbc1 100644 --- a/examples/eval_multi_task/gpqa-dev.sh +++ b/examples/eval_multi_task/multi_task.sh @@ -26,7 +26,7 @@ echo 
"HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)" SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" REPO_ROOT="$(cd -- "${SCRIPT_DIR}/../.." &>/dev/null && pwd)" source "${REPO_ROOT}/scripts/models/qwen3-4B.sh" -EVAL_CONFIG_PATH="${REPO_ROOT}/examples/eval_multi_task/gpqa-dev.yaml" +EVAL_CONFIG_PATH="${REPO_ROOT}/examples/eval_multi_task/multi_task.yaml" CKPT_ARGS=( --hf-checkpoint /root/Qwen3-4B @@ -98,7 +98,7 @@ OPTIMIZER_ARGS=( WANDB_ARGS=( --use-wandb --wandb-project eval - --wandb-group gpqa + --wandb-group multi_task --wandb-key ${WANDB_KEY} ) diff --git a/examples/eval_multi_task/multi_task.yaml b/examples/eval_multi_task/multi_task.yaml new file mode 100644 index 000000000..47ad91c4d --- /dev/null +++ b/examples/eval_multi_task/multi_task.yaml @@ -0,0 +1,15 @@ +eval: + defaults: + n_samples_per_eval_prompt: 16 + max_response_len: 16384 + top_p: 0.7 + datasets: + - name: aime + path: /root/aime-2024/aime-2024.jsonl + rm_type: deepscaler + - name: gpqa # huggingface-cli download --repo-type dataset zyzshishui0627/gpqa_diamond --local-dir /root/gpqa + path: /root/gpqa/gpqa_eval.jsonl + rm_type: gpqa + - name: ifbench # huggingface-cli download --repo-type dataset zyzshishui0627/IFBench --local-dir /root/IFBench + path: /root/IFBench/IFBench_eval.jsonl + rm_type: ifbench diff --git a/slime/rollout/rm_hub/__init__.py b/slime/rollout/rm_hub/__init__.py index 5395da095..cb5245b0a 100644 --- a/slime/rollout/rm_hub/__init__.py +++ b/slime/rollout/rm_hub/__init__.py @@ -9,6 +9,7 @@ from .deepscaler import get_deepscaler_rule_based_reward from .f1 import f1_score from .gpqa import compute_gpqa_reward +from .ifbench import compute_ifbench_reward from .math_dapo_utils import compute_score as compute_score_dapo from .math_utils import extract_answer as extract_boxed_answer from .math_utils import grade_answer_verl @@ -54,6 +55,8 @@ async def async_rm(args, sample: Sample, **kwargs): return f1_score(response, label)[0] elif 
rm_type == "gpqa": return compute_gpqa_reward(response, label, metadata=metadata) + elif rm_type == "ifbench": + return compute_ifbench_reward(response, label, metadata=metadata) elif rm_type: raise NotImplementedError(f"Rule-based RM for {rm_type} is not implemented.") else: diff --git a/slime/rollout/rm_hub/ifbench.py b/slime/rollout/rm_hub/ifbench.py new file mode 100644 index 000000000..a5f65340b --- /dev/null +++ b/slime/rollout/rm_hub/ifbench.py @@ -0,0 +1,161 @@ +from __future__ import annotations + +import dataclasses +import logging +from typing import Any, Dict, List, Optional, Sequence, Union + +from .ifbench_utils import instructions_registry + +logger = logging.getLogger(__name__) + + +JsonDict = Dict[str, Any] +KwargsDict = Dict[str, Optional[Union[str, int, float]]] + + +@dataclasses.dataclass +class InputExample: + """Subset of the official InputExample schema needed for evaluation.""" + + key: int + instruction_id_list: List[str] + prompt: str + kwargs: List[KwargsDict] + + +@dataclasses.dataclass +class OutputExample: + """Official output structure for readability and parity.""" + + instruction_id_list: List[str] + prompt: str + response: str + follow_all_instructions: bool + follow_instruction_list: List[bool] + + +def _normalize_instruction_ids(raw_ids: Sequence[Any]) -> List[str]: + """Ensure instruction identifiers are clean strings.""" + + normalized: List[str] = [] + for entry in raw_ids or []: + if entry is None: + continue + text = str(entry).strip() + if not text: + continue + normalized.append(text) + return normalized + + +def _coerce_kwargs_list( + raw_kwargs: Any, + num_instructions: int, +) -> List[KwargsDict]: + """Convert stored kwargs into the list structure expected by IFBench.""" + + if isinstance(raw_kwargs, list): + processed: List[KwargsDict] = [] + for entry in raw_kwargs: + if isinstance(entry, dict): + processed.append(dict(entry)) + else: + processed.append({}) + elif isinstance(raw_kwargs, dict): + processed = 
def _build_input_example(metadata: JsonDict) -> Optional[InputExample]:
    """Assemble an ``InputExample`` from a sample's metadata dictionary.

    Reads the ``instruction_id_list``, ``prompt_text``, ``kwargs`` and
    ``record_id`` entries of ``metadata``.

    Args:
        metadata: Per-sample metadata dictionary.

    Returns:
        The populated ``InputExample``, or ``None`` when the metadata carries
        no usable instruction identifiers.
    """
    instruction_ids = _normalize_instruction_ids(metadata.get("instruction_id_list") or [])
    if not instruction_ids:
        logger.debug("Missing instruction identifiers in metadata: %s", metadata)
        return None

    raw_prompt = metadata.get("prompt_text")
    prompt_text = "" if raw_prompt is None else str(raw_prompt)

    kwargs_list = _coerce_kwargs_list(metadata.get("kwargs"), len(instruction_ids))

    # `key` is bookkeeping only: the scoring code in this module never reads
    # it, so a non-numeric record id (e.g. "abc-12") must not abort scoring.
    raw_key = metadata.get("record_id") or 0
    try:
        key = int(raw_key)
    except (TypeError, ValueError, OverflowError):
        logger.debug("Non-integer record_id %r; using 0 as the example key.", raw_key)
        key = 0

    return InputExample(
        key=key,
        instruction_id_list=instruction_ids,
        prompt=prompt_text,
        kwargs=kwargs_list,
    )
def compute_ifbench_reward(response: str, label: Any, metadata: Optional[JsonDict] = None) -> float:
    """Return the strict IFBench reward for one model response.

    Args:
        response: The model output to grade; ``None`` scores 0.0.
        label: Unused by this function; accepted for signature compatibility
            with the other reward functions.
        metadata: Sample metadata from which the instruction ids, kwargs and
            prompt are read. ``None`` scores 0.0.

    Returns:
        1.0 when every listed instruction is followed, otherwise 0.0. Also
        0.0 when metadata, response, or instruction ids are missing.
    """
    if metadata is None:
        logger.debug("No metadata provided for IFBench scoring.")
        return 0.0

    # A missing response short-circuits before any metadata parsing happens.
    example = None if response is None else _build_input_example(metadata)
    if example is None:
        return 0.0

    verdict = test_instruction_following_strict(example, str(response))
    if verdict.follow_all_instructions:
        return 1.0
    return 0.0
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Library of instructions.""" + +import csv +import io +import logging +import random +import re +import string +import unicodedata +from collections import Counter +from typing import Dict, Optional, Sequence, Union + +import emoji +import nltk +import spacy +import syllapy +from spacy.cli import download + +from . import instructions_util + +try: + spacy.load("en_core_web_sm") +except OSError: + download("en_core_web_sm") + +logger = logging.getLogger(__name__) + +_InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]] + +# The number of keywords. +_NUM_KEYWORDS = 2 + +# The number of words in the response. +_NUM_WORDS_LOWER_LIMIT = 100 +_NUM_WORDS_UPPER_LIMIT = 500 + +# The number of numbers. +_NUM_NUMBERS = 6 + +# Period length for periodic words. +_NUM_WORD_CYCLE = 30 + +# Maximum number of times a word can be repeated. +_MAX_REPEATS = 5 + +# Which sentence must contain a keyword. +_NUM_KEYWORD_SENTENCE = 20 + +# Minimum number of pronouns. +_NUM_PRONOUNS = 25 + +# The size of increment for lengths. +_NUM_INCREMENT = 5 + +# The number of coordinating conjunctions. 
class Instruction:
    """An instruction template.

    Base class for the checker classes: it stores the instruction id and
    declares the four hooks a concrete checker overrides. Every hook raises
    ``NotImplementedError`` here.
    """

    def __init__(self, instruction_id):
        # Identifier supplied by the caller when instantiating the checker.
        self.id = instruction_id

    def build_description(self, **kwargs):
        """Configure the instruction from keyword args; must be overridden."""
        message = "`build_description` not implemented."
        raise NotImplementedError(message)

    def get_instruction_args(self):
        """Return the keyword args of `build_description`; must be overridden."""
        message = "`get_instruction_args` not implemented."
        raise NotImplementedError(message)

    def get_instruction_args_keys(self):
        """Return the arg names of `build_description`; must be overridden."""
        message = "`get_instruction_args_keys` not implemented."
        raise NotImplementedError(message)

    def check_following(self, value):
        """Decide whether `value` follows the instruction; must be overridden."""
        message = "`check_following` not implemented."
        raise NotImplementedError(message)
+ + return self._description_pattern.format(min_words=self._min_words, max_words=self._max_words) + + def get_instruction_args(self): + """Returns the keyword args of `build_description`.""" + return {"min_words": self._min_words, "max_words": self._max_words} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["min_words", "max_words"] + + def check_following(self, value): + """Checks if the response contains the expected number of words.""" + num_words = instructions_util.count_words(value) + return self._min_words <= num_words <= self._max_words + + +class UniqueWordCountChecker(Instruction): + """Unique Word Count: The response must contain X unique words.""" + + def build_description(self, *, N=None): + """Build the instruction description. + + Args: + n: An integer specifying the number of unique words contained in the response. + + Returns: + A string representing the instruction description. + """ + self._num_unique_words = N + + if self._num_unique_words is None or self._num_unique_words < 0: + self._num_unique_words = random.randint(_NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT) + + self._description_pattern = "Use at least {N} unique words in the response." 
class StopWordPercentageChecker(Instruction):
    """Ensure that stop words constitute no more than {percentage}% of the total words in your response."""

    def build_description(self, *, percentage=None):
        """Build the instruction description.

        Args:
          percentage: An integer specifying the percentage of stop words that are allowed in the response.
            When omitted or negative, a random value in [1, 100] is drawn.

        Returns:
          A string representing the instruction description.
        """
        self._percentage = percentage

        if self._percentage is None or self._percentage < 0:
            self._percentage = random.randint(1, 100)

        self._description_pattern = (
            "Ensure that stop words constitute no more than {percentage}% of the total words in your response."
        )

        return self._description_pattern.format(percentage=self._percentage)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"percentage": self._percentage}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["percentage"]

    def check_following(self, value):
        """Checks if the response contains the expected percentage of stop words.

        Returns False when the response contains no countable words: the
        percentage is undefined there, and dividing would raise
        ZeroDivisionError.
        """
        num_words = instructions_util.count_words(value)
        if not num_words:
            return False
        num_stopwords = instructions_util.count_stopwords(value)
        stopword_percentage = (num_stopwords / num_words) * 100
        return stopword_percentage <= self._percentage
class ConjunctionCountChecker(Instruction):
    """Use at least {small_n} different coordinating conjunctions in the response."""

    def build_description(self, *, small_n=None):
        """Build the instruction description.

        Args:
          small_n: An integer specifying the number of different coordinating conjunctions contained in the response.
            When omitted or negative, a random value in [2, _NUM_CONJUNCTIONS] is drawn.

        Returns:
          A string representing the instruction description.
        """
        self._num_conjunctions = small_n

        if self._num_conjunctions is None or self._num_conjunctions < 0:
            self._num_conjunctions = random.randint(2, _NUM_CONJUNCTIONS)

        self._description_pattern = "Use at least {small_n} different coordinating conjunctions in the response."

        return self._description_pattern.format(small_n=self._num_conjunctions)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"small_n": self._num_conjunctions}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["small_n"]

    def check_following(self, value):
        """Checks if the response contains the expected number of different coordinating conjunctions.

        Tokens are lower-cased and stripped of surrounding ASCII punctuation
        BEFORE de-duplication, so "and", "And" and "and," count as one
        conjunction rather than three.
        """
        coordinating = {"and", "but", "for", "nor", "or", "so", "yet"}
        strip_chars = "".join(string.punctuation) + " "
        normalized_tokens = {token.strip(strip_chars).lower() for token in value.split()}
        unique_conjunctions = normalized_tokens & coordinating
        return len(unique_conjunctions) >= self._num_conjunctions
class NGramOverlapChecker(Instruction):
    """Maintain a trigram overlap of {percentage}% (±2%) with the provided reference text."""

    def build_description(self, *, reference_text=None, percentage=None):
        """Build the instruction description.

        Args:
          reference_text: A string representing the reference text.
          percentage: An integer specifying the percent trigram overlap
            to maintain in the response. When omitted or negative, a random
            value in [1, 100] is drawn.

        Returns:
          A string representing the instruction description.
        """
        self._reference_text = reference_text
        self._percentage = percentage
        if self._percentage is None or self._percentage < 0:
            self._percentage = random.randint(1, 100)

        self._description_pattern = (
            "Maintain a trigram overlap of {percentage}% (±2%) with the provided reference text."
        )
        return self._description_pattern.format(percentage=self._percentage)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"reference_text": self._reference_text, "percentage": self._percentage}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["reference_text", "percentage"]

    def check_following(self, value):
        """Checks if the response maintains a trigram overlap with the reference text within 2% of {percentage}.

        The strings are handed to `nltk.ngrams` directly, so the trigrams are
        built over characters, not words. Returns False instead of raising
        when there is no reference text to compare against, or when the
        response is too short to contain a trigram.
        """
        # No reference means there is nothing to measure overlap against;
        # passing None to nltk.ngrams would raise TypeError.
        if self._reference_text is None:
            return False

        n = 3
        ngrams = set(nltk.ngrams(value, n))
        # Fewer than n characters yields no trigrams; avoid dividing by zero.
        if not ngrams:
            return False

        ref_ngrams = set(nltk.ngrams(self._reference_text, n))
        overlap = len(ngrams.intersection(ref_ngrams)) / len(ngrams)
        return self._percentage - 2 <= overlap * 100 <= self._percentage + 2
class AlphabetLoopChecker(Instruction):
    """Each word must start with the next letter of the alphabet, looping back to 'A' after 'Z'."""

    def build_description(self):
        """Build the instruction description."""
        self._description_pattern = (
            "Each word must start with the next letter of the alphabet, looping back to 'A' after 'Z'."
        )
        return self._description_pattern

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return None

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return []

    def check_following(self, value):
        """Checks if each word of the response starts with the next letter of the alphabet.

        The sequence may begin at any ASCII letter; the first word fixes the
        starting point. Returns False when the response contains no words
        once ASCII punctuation is removed (previously an IndexError).
        """
        # Punctuation is deleted up front, so split() alone yields clean,
        # non-empty tokens.
        cleaned = value.translate(str.maketrans("", "", string.punctuation))
        words = cleaned.split()
        if not words:
            return False

        alphabet = string.ascii_lowercase
        correct_letter = words[0][0].lower()
        if correct_letter not in alphabet:  # numbers are fails
            return False

        for word in words[1:]:
            correct_letter = alphabet[(alphabet.index(correct_letter) + 1) % 26]
            if word[0].lower() != correct_letter:
                return False
        return True
class ConsonantClusterChecker(Instruction):
    """Ensure each word in your response has at least one consonant cluster (two or more consonants together)."""

    def build_description(self):
        """Build the instruction description."""
        self._description_pattern = (
            "Ensure each word in your response has at least one consonant cluster (two or more consonants together)."
        )
        return self._description_pattern

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return None

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return []

    def check_following(self, value):
        """Checks if each word in the response includes at least one consonant cluster.

        A cluster is two adjacent characters that are both in the consonant
        set below (which includes "y"). Words are whitespace-separated tokens
        of the lower-cased response; a response with no tokens passes.
        """
        consonants = frozenset("bcdfghjklmnpqrstvwxyz")

        def has_cluster(token):
            # Compare every character with its right-hand neighbour.
            return any(left in consonants and right in consonants for left, right in zip(token, token[1:]))

        return all(has_cluster(token) for token in value.lower().strip().split())
class PalindromeChecker(Instruction):
    """Include at least 10 single-word palindromes, each at least 5 characters long."""

    def build_description(self):
        """Build the instruction description."""
        self._description_pattern = "Include at least 10 single-word palindromes, each at least 5 characters long."
        return self._description_pattern

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return None

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return []

    def check_following(self, value):
        """Checks if the response includes at least 10 single-word palindromes of length at least 5.

        ASCII punctuation is removed and the text lower-cased before the
        check. Every qualifying token counts, including repeats of the same
        word.
        """
        without_punctuation = value.translate(str.maketrans("", "", string.punctuation))
        palindrome_count = 0
        for token in without_punctuation.lower().split():
            if len(token) >= 5 and token == token[::-1]:
                palindrome_count += 1
        return palindrome_count >= 10
class NestedParenthesesChecker(Instruction):
    """Nest parentheses (and [brackets {and braces}]) at least 5 levels deep."""

    def build_description(self):
        """Build the instruction description."""
        self._description_pattern = "Nest parentheses (and [brackets {and braces}]) at least 5 levels deep."
        return self._description_pattern

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return None

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return []

    def check_following(self, value):
        """Checks if the response nests brackets at least 5 levels deep.

        Succeeds as soon as a closing bracket matches the innermost open
        bracket after the nesting depth has reached 5 since the last
        mismatch. Outer brackets are not required to be closed. A closing
        bracket that does not match the innermost opener resets the state.
        """
        closer_for = {"(": ")", "[": "]", "{": "}"}
        min_levels = 5
        open_stack = []
        max_depth = 0  # deepest nesting seen since the last mismatch

        for char in value:
            if char in closer_for:
                open_stack.append(char)
                max_depth = max(max_depth, len(open_stack))
            elif char in ")]}":
                if open_stack and closer_for[open_stack[-1]] == char:
                    open_stack.pop()
                    # After a pop the stack is always shallower than
                    # max_depth, so only the depth threshold needs checking.
                    if max_depth >= min_levels:
                        return True
                else:
                    # Mismatch — reset
                    open_stack = []
                    max_depth = 0

        return False
class OptionsResponseChecker(Instruction):
    """Answer with one of the following options: {options}. Do not give any explanation."""

    def build_description(self, *, options=None):
        """Build the instruction description.

        The options string is split into individual options on the first
        separator that applies, in this order: "/", the standalone word "or",
        then ",". The word "or" is matched on word boundaries so that option
        text which merely contains those letters (for example "north") is not
        cut apart.

        Args:
          options: A string specifying the permitted options for
            the response.

        Returns:
          A string representing the instruction description.
        """
        # Options string may be: yes/no/maybe, I know or I don't know, a), b), c), d)
        # Can be separated by "/", "or", ","
        options_bank = ["yes/no/maybe", "I know or I don't know", "a), b), c), d)"]
        if options is None:
            options = random.choice(options_bank)

        # Be more strict about format for multiple choice letters than for text options
        self._strict = re.match(r"\W*[aA]\W*[bB]\W*[cC]\W*", options) is not None

        if "/" in options:
            pieces = options.split("/")
        elif re.search(r"\bor\b", options):
            pieces = re.split(r"\bor\b", options)
        else:
            pieces = options.split(",")
        self._options = [piece.strip() for piece in pieces]
        self._options_text = options  # in text, shouldn't be formatted as a list
        self._description_pattern = "Answer with one of the following options: {options}. Do not give any explanation."
        return self._description_pattern.format(options=self._options_text)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"options": self._options_text}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["options"]

    def check_following(self, value):
        """Checks if the response is exactly one of {options}.

        In strict mode (multiple choice letters) the response must equal one
        of the options character for character. Otherwise the comparison is
        case-insensitive and ignores surrounding punctuation and whitespace,
        including newlines and tabs, on both the response and the options.

        Args:
          value: A string representing the response.

        Returns:
          True if the response matches one of the options, False otherwise.
        """
        if self._strict:
            return value in self._options
        trim_chars = string.punctuation + string.whitespace
        answer = value.strip(trim_chars).lower()
        return any(option.strip(trim_chars).lower() == answer for option in self._options)
class EmojiSentenceChecker(Instruction):
    """Please use an emoji at the end of every sentence."""

    def build_description(self):
        """Build the instruction description."""
        nltk.download("punkt_tab")
        self._description_pattern = "Please use an emoji at the end of every sentence."
        return self._description_pattern

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return None

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return []

    def check_following(self, value):
        """Checks if the response includes an emoji at the end of every sentence.

        Each sentence is cleaned by deleting ASCII punctuation and stripping
        surrounding whitespace. A sentence passes when one of its last two
        remaining characters is an emoji, or, failing that, when the next
        sentence (cleaned the same way) begins with an emoji. The second case
        covers an emoji that the sentence splitter placed at the start of the
        following sentence. A sentence that is empty after cleaning fails.

        Args:
          value: A string representing the response.

        Returns:
          True if every sentence passes, False otherwise.
        """
        drop_punctuation = str.maketrans("", "", string.punctuation)
        sentences = instructions_util.split_into_sentences(value)
        final_index = len(sentences) - 1

        for index, sentence in enumerate(sentences):
            cleaned = sentence.translate(drop_punctuation).strip()
            if not cleaned:
                return False
            # Look at the last two characters (or the only one).
            if any(emoji.is_emoji(char) for char in cleaned[-2:]):
                continue
            # No emoji at the end: it may have been split onto the next sentence.
            if index >= final_index:
                return False
            following = sentences[index + 1].translate(drop_punctuation).strip()
            if not following:
                return False
            if not emoji.is_emoji(following[0]):
                return False
        return True
class NthWordJapaneseChecker(Instruction):
    """Every {N}th word of your response must be in Japanese."""

    def build_description(self, *, N=None):
        """Build the instruction description.

        Args:
          N: An integer specifying the cycle length for
            Japanese words to appear in the response. When it is missing or
            smaller than 1, a random cycle length is drawn instead (a cycle
            of 0 would make the check divide by zero).

        Returns:
          A string representing the instruction description.
        """
        self._japanese_position = N
        if self._japanese_position is None or self._japanese_position < 1:
            self._japanese_position = random.randint(1, _NUM_WORD_CYCLE)

        # Pick the ordinal suffix from the resolved position, not from the raw
        # argument, which may be None. 11, 12 and 13 take "th".
        position = self._japanese_position
        if position % 100 in (11, 12, 13):
            suffix = "th"
        else:
            suffix = {1: "st", 2: "nd", 3: "rd"}.get(position % 10, "th")
        self._description_pattern = "Every {N}" + suffix + " word of your response must be in Japanese."
        return self._description_pattern.format(N=self._japanese_position)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"N": self._japanese_position}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["N"]

    def check_following(self, value):
        """Checks if every {N}th word of the response is in Japanese.

        Words are the whitespace-separated tokens of the response. Each
        N-th token is trimmed of surrounding ASCII punctuation and spaces;
        tokens that end up empty or consist only of digits are exempt.
        A token counts as Japanese when it contains at least one Hiragana,
        Katakana or Kanji character.

        Args:
          value: A string representing the response.

        Returns:
          True if every checked token is Japanese, False otherwise.
        """
        japanese_pattern = re.compile(r"[\u3040-\u30ff\u4e00-\u9fff]")
        trim_chars = string.punctuation + " "

        for position, token in enumerate(value.split(), start=1):
            if position % self._japanese_position != 0:
                continue
            word = token.strip(trim_chars)
            if word and not word.isdigit() and not japanese_pattern.search(word):
                return False
        return True
class IncludeKeywordChecker(Instruction):
    """The response must include keyword {word} in the {N}-th sentence."""

    def build_description(self, *, word=None, N=None):
        """Build the instruction description.

        Args:
          word: A string specifying the keyword that is
            required to appear in the response.
          N: An integer specifying which sentence of the
            response is required to have the keyword (1-based). When it is
            missing or smaller than 1, a random position is drawn instead.

        Returns:
          A string representing the instruction description.
        """
        nltk.download("punkt_tab")

        if not word:
            self._keyword = instructions_util.generate_keywords(num_keywords=1)[0]
        else:
            self._keyword = word
        self._keyword_position = N
        # Sentence positions are 1-based, so 0 is as invalid as a negative value.
        if self._keyword_position is None or self._keyword_position < 1:
            self._keyword_position = random.randint(1, _NUM_KEYWORD_SENTENCE)

        self._description_pattern = 'The response must include keyword "{word}" in the {N}-th sentence.'
        return self._description_pattern.format(word=self._keyword, N=self._keyword_position)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"word": self._keyword, "N": self._keyword_position}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["word", "N"]

    def check_following(self, value):
        """Checks if the {N}th sentence of the response includes keyword {word}.

        The keyword match is a case-insensitive substring test.

        Args:
          value: A string representing the response.

        Returns:
          True if the response has at least N sentences and the N-th one
          contains the keyword, False otherwise.
        """
        sentences = instructions_util.split_into_sentences(value)
        if len(sentences) < self._keyword_position:
            return False
        index = int(self._keyword_position - 1)
        # A negative index would silently wrap around to the end of the list.
        if index < 0:
            return False
        return self._keyword.lower() in sentences[index].lower()
+ return self._description_pattern.format(N=self._num_pronouns) + + def get_instruction_args(self): + """Returns the keyword args of `build_description`.""" + return {"N": self._num_pronouns} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["N"] + + def check_following(self, value): + """Checks if the response includes at least {N} pronouns.""" + pronouns = set( + [ + "i", + "me", + "my", + "mine", + "myself", + "we", + "us", + "our", + "ours", + "ourselves", + "you", + "your", + "yours", + "yourself", + "yourselves", + "he", + "him", + "his", + "himself", + "she", + "her", + "hers", + "herself", + "it", + "its", + "itself", + "they", + "them", + "their", + "theirs", + "themselves", + ] + ) + value = value.replace( + "/", " " + ) # to correctly count pronoun sets like she/her/hers, a common use case of pronouns + value = value.lower().translate(str.maketrans("", "", string.punctuation)) + words = value.split() + pronoun_count = sum(1 for word in words if word in pronouns) + return pronoun_count >= self._num_pronouns + + +class AlternateParitySyllablesChecker(Instruction): + """Alternate between words with odd and even numbers of syllables.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = "Alternate between words with odd and even numbers of syllables." 
class LastWordFirstNextChecker(Instruction):
    """The last word of each sentence must become the first word of the next sentence."""

    def build_description(self):
        """Build the instruction description."""
        nltk.download("punkt_tab")
        self._description_pattern = "The last word of each sentence must become the first word of the next sentence."
        return self._description_pattern

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return None

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return []

    def check_following(self, value):
        """Checks if the last word of each sentence in the response is the first word of the next sentence.

        For every pair of adjacent sentences, trailing punctuation and spaces
        are trimmed from the first and leading punctuation and spaces from the
        second. The words at the joint are then compared case-insensitively.
        A sentence that has no words left after trimming cannot take part in
        the chain, so the response is rejected rather than raising.

        Args:
          value: A string representing the response.

        Returns:
          True if every adjacent pair of sentences is chained, False otherwise.
        """
        trim_chars = string.punctuation + " "
        sentences = instructions_util.split_into_sentences(value)
        for current, following in zip(sentences, sentences[1:]):
            tail_words = current.rstrip(trim_chars).split()
            head_words = following.lstrip(trim_chars).split()
            if not tail_words or not head_words:
                return False
            if tail_words[-1].lower() != head_words[0].lower():
                return False
        return True
class IncrementingWordCountChecker(Instruction):
    """Each sentence must contain exactly {small_n} more words than the previous one."""

    def build_description(self, *, small_n=None):
        """Build the instruction description.

        Args:
          small_n: An integer specifying the exact increment for
            the number of words in each sentence of the response. When it is
            missing or negative, a random increment is drawn instead.

        Returns:
          A string representing the instruction description.
        """
        self._num_increment = small_n
        if self._num_increment is None or self._num_increment < 0:
            self._num_increment = random.randint(1, _NUM_INCREMENT)

        nltk.download("punkt_tab")

        self._description_pattern = "Each sentence must contain exactly {small_n} more words than the previous one."
        return self._description_pattern.format(small_n=self._num_increment)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"small_n": self._num_increment}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["small_n"]

    def check_following(self, value):
        """Checks if each sentence of the response uses exactly {small_n} more words than the previous sentence.

        Words are counted per sentence after deleting ASCII punctuation and
        splitting on whitespace. A response in which no sentence is found is
        rejected instead of raising; a single sentence passes because there is
        nothing to compare it with.

        Args:
          value: A string representing the response.

        Returns:
          True if every sentence has exactly the configured number of words
          more than the one before it, False otherwise.
        """
        sentences = instructions_util.split_into_sentences(value)
        if not sentences:
            return False
        drop_punctuation = str.maketrans("", "", string.punctuation)
        word_counts = [len(sentence.translate(drop_punctuation).split()) for sentence in sentences]
        return all(
            later == earlier + self._num_increment for earlier, later in zip(word_counts, word_counts[1:])
        )
class IndentStairsChecker(Instruction):
    """Create stairs by incrementally indenting each new line."""

    def build_description(self):
        """Build the instruction description."""
        self._description_pattern = "Create stairs by incrementally indenting each new line."
        return self._description_pattern

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return None

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return []

    def check_following(self, value):
        """Checks if the response incrementally indents each new line.

        Blank and whitespace-only lines are ignored. Indentation is measured
        as the number of leading space characters, and every remaining line
        must be indented strictly deeper than the one before it.

        Args:
          value: A string representing the response.

        Returns:
          True if the indentation strictly increases from line to line,
          False otherwise.
        """
        # Build a filtered list instead of removing items from the list that
        # is being iterated, which skips elements and leaves blank lines behind.
        lines = [line for line in value.split("\n") if line.strip()]
        indents = [len(line) - len(line.lstrip(" ")) for line in lines]
        return all(deeper > shallower for shallower, deeper in zip(indents, indents[1:]))
+ return self._description_pattern + + def get_instruction_args(self): + """Returns the keyword args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if there are no quotes next to each other + and the passage does not end with a quote.""" + value = value.replace("“", '"').replace("”", '"') + value = value.replace("'\"'", "") # remove references to the character '"' + value = "".join(value.split()) # remove all whitespace + if '""' in value: + return False + if value.strip(string.digits + string.punctuation.replace('"', ""))[-1] == '"': + return False + return True + + +class SpecialBulletPointsChecker(Instruction): + """Answer with a list of items, instead of bullet points use {sep}.""" + + def build_description(self, *, sep=None): + """Build the instruction description. + + Args: + sep: A string specifying the bullet point marker for + the list in the response. + + Returns: + A string representing the instruction description. + """ + self._bullet_marker = sep + if sep is None: + self._bullet_marker = random.choice(["...", "SEPARATOR", "!?!?", "-"]) + self._description_pattern = "Answer with a list of items, instead of bullet points use {sep}." 
class ItalicsThesisChecker(Instruction):
    """Each section must begin with a thesis statement in italics, use HTML to indicate the italics."""

    def build_description(self):
        """Build the instruction description."""
        self._description_pattern = (
            "Each section must begin with a thesis statement in italics, use HTML to indicate the italics."
        )
        return self._description_pattern

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return None

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return []

    def check_following(self, value):
        """Checks if there is at least one line in italics as indicated
        by HTML that is followed by unitalicized text.

        The first HTML italics element in the response is located; both the
        "i" and the "em" element are recognised. Its content must be
        non-blank and some non-blank text must follow its closing tag. Tag
        lengths are taken from the tag that was actually found, so the longer
        "em" tags are sliced off correctly.

        Args:
          value: A string representing the response.

        Returns:
          True if a non-empty italic thesis followed by further text is found,
          False otherwise.
        """
        # Tags are assembled from the element name so that both spellings
        # share one code path and carry their own lengths.
        found = []
        for element in ("i", "em"):
            open_tag = f"<{element}>"
            close_tag = f"</{element}>"
            position = value.find(open_tag)
            if position != -1:
                found.append((position, open_tag, close_tag))
        if not found:
            return False

        # Use whichever opening tag appears first in the response.
        start, open_tag, close_tag = min(found)
        remainder = value[start + len(open_tag) :]
        end_thesis = remainder.find(close_tag)
        if end_thesis == -1:
            return False
        thesis = remainder[:end_thesis]
        if thesis.strip() == "":
            return False
        text = remainder[end_thesis + len(close_tag) :]
        return text.strip() != ""
class SomeBulletPointsChecker(Instruction):
    """Your answer must contain at least two sentences ending in a period followed by at least two bullet points denoted by *."""

    def build_description(self):
        """Build the instruction description."""
        self._description_pattern = "Your answer must contain at least two sentences ending in a period followed by at least two bullet points denoted by *."
        return self._description_pattern

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return None

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return []

    def check_following(self, value):
        """Checks if the response includes at least two sentences
        followed by at least two lines that start with *.

        The response is read line by line in two phases. In the first phase
        every non-bullet line contributes its sentences to the sentence count.
        The first line starting with * switches to the bullet phase, which
        requires that at least two sentences were seen; from then on every
        non-blank line must be a bullet. Blank lines are ignored in both
        phases.

        Args:
          value: A string representing the response.

        Returns:
          True if at least two sentences are followed by at least two bullet
          lines and nothing else, False otherwise.
        """
        # The phase flag is kept separate from the per-line sentence list;
        # sharing one variable ended the sentence phase on any blank line.
        in_sentence_phase = True
        count_sentences = 0
        count_bullets = 0
        for line in value.split("\n"):
            stripped = line.strip()
            if not stripped:
                continue
            if stripped.startswith("*"):
                in_sentence_phase = False
                if count_sentences < 2:
                    return False
                count_bullets += 1
            elif in_sentence_phase:
                count_sentences += len(instructions_util.split_into_sentences(stripped))
            else:
                # Plain text after the bullet list has started.
                return False
        return count_bullets >= 2
class ReverseNewlineChecker(Instruction):
    """List the countries of Africa in reverse alphabetical order, each on a new line."""

    def build_description(self, **kwargs):
        """Build the instruction description."""
        self._description_pattern = "List the countries of Africa in reverse alphabetical order, each on a new line."
        return self._description_pattern

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return None

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return []

    def check_following(self, value):
        """Checks if the response is a reverse-alphabetical list starting at Zimbabwe.

        The response is split into lines; each line is trimmed of surrounding
        ASCII punctuation and spaces, and lines that end up empty are dropped.
        The check then requires that:
          1. some line contains 'Zimbabwe';
          2. at least 52 lines remain from that line onwards;
          3. those lines, reduced to ASCII, are already in descending order.

        Args:
          value: A string representing the response.

        Returns:
          True if all three conditions hold, False otherwise.
        """
        trim_chars = string.punctuation + " "
        entries = []
        for raw_line in value.split("\n"):
            entry = raw_line.strip(trim_chars)
            if entry:
                entries.append(entry)

        # Anything before the first line mentioning Zimbabwe is ignored.
        first_match = None
        for position, entry in enumerate(entries):
            if "Zimbabwe" in entry:
                first_match = position
                break
        if first_match is None:
            return False

        listed = entries[first_match:]
        if len(listed) < 52:
            return False

        def to_ascii(text):
            """Decompose to NFKD and drop every non-ASCII code point, e.g. 'São Tomé' -> 'Sao Tome'."""
            decomposed = unicodedata.normalize("NFKD", text)
            return decomposed.encode("ASCII", "ignore").decode("ASCII")

        comparable = [to_ascii(entry) for entry in listed]
        return comparable == sorted(comparable, reverse=True)
Respond to this query, but make your sentence in reverse order of what it should be, per word.""" + + def build_description(self, **kwargs): + nltk.download("punkt_tab") + self._description_pattern = "What animal is the national symbol of the US? Respond to this query, but make your sentence in reverse order of what it should be, per word." + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyword args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the reverse of the sentence is a valid English sentence.""" + value = value.lower().strip().translate(str.maketrans("", "", string.punctuation)) + value = " ".join(value.split()[::-1]) + if "bald eagle" not in value: + return False + return value in instructions_util.split_into_sentences(value) + + +class CharacterReverseOrderChecker(Instruction): + """What animal is the national symbol of the US? Respond to this query, but make your sentence in reverse order of what it should be, per letter.""" + + def build_description(self, **kwargs): + self._description_pattern = "What animal is the national symbol of the US? Respond to this query, but make your sentence in reverse order of what it should be, per letter." 
class EuropeanCapitalsSortChecker(Instruction):
    """Give me the names of all capital cities of european countries whose latitude is higher than than 45 degrees? List the capital cities without country names, separated by commas, sorted by latitude, from highest to lowest."""

    def build_description(self, **kwargs):
        """Store and return the fixed prompt text; keyword arguments are ignored."""
        description = "Give me the names of all capital cities of european countries whose latitude is higher than than 45 degrees? List the capital cities without country names, separated by commas, sorted by latitude, from highest to lowest."
        self._description_pattern = description
        return description

    def get_instruction_args(self):
        """This instruction takes no arguments, so there is nothing to report."""
        return None

    def get_instruction_args_keys(self):
        """This instruction takes no arguments, so the key list is empty."""
        return []

    def check_following(self, value):
        """Check that the response is exactly the expected comma-separated list.

        The response is reduced to ASCII (NFKD decomposition, then
        non-ASCII code points dropped, e.g. 'São Tomé' -> 'Sao Tome'),
        split on commas, and blank entries are discarded. The remaining
        entries, stripped of surrounding whitespace, must equal the
        reference list below in both content and order. The comparison is
        case-sensitive.
        """
        expected = [
            "Reykjavik",
            "Helsinki",
            "Oslo",
            "Tallinn",
            "Stockholm",
            "Riga",
            "Moscow",
            "Copenhagen",
            "Vilnius",
            "Minsk",
            "Dublin",
            "Berlin",
            "Amsterdam",
            "Warsaw",
            "London",
            "Brussels",
            "Prague",
            "Luxembourg",
            "Paris",
            "Vienna",
            "Bratislava",
            "Budapest",
            "Vaduz",
            "Chisinau",
            "Bern",
            "Ljubljana",
            "Zagreb",
        ]

        # Strip diacritics so accented spellings compare equal to the ASCII reference.
        decomposed = unicodedata.normalize("NFKD", value)
        ascii_value = decomposed.encode("ASCII", "ignore").decode("ASCII")

        answered = [piece.strip() for piece in ascii_value.split(",") if piece.strip()]
        # List equality covers both the length check and the element-wise order check.
        return answered == expected
class SpecialCharacterCSVChecker(Instruction):
    """Generate CSV data: The column names are ["ProductID", "Category", "Brand", "Price", "Stock"], the data should be comma delimited. Please generate 14 rows. Add one field which contains a special character and enclose it in double quotes."""

    def build_description(self, **kwargs):
        """Build the instruction description; keyword arguments are ignored."""
        self._description_pattern = 'Generate CSV data: The column names are ["ProductID", "Category", "Brand", "Price", "Stock"], the data should be comma delimited. Please generate 14 rows. Add one field which contains a special character and enclose it in double quotes.'
        return self._description_pattern

    def get_instruction_args(self):
        """Returns the keyword args of `build_description` (none for this instruction)."""
        return None

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description` (none for this instruction)."""
        return []

    def check_following(self, value):
        """Check that the response is the requested 14-row CSV table.

        The response passes when all of the following hold:
          * its first line is the header ProductID, Category, Brand, Price,
            Stock (each name optionally double-quoted, optional spaces/tabs
            after the commas);
          * it parses to exactly 15 CSV records (header + 14 data rows);
          * every data row has exactly 5 fields;
          * at least one data field is enclosed in double quotes and
            contains a character that is not a digit, word character or
            whitespace.

        Args:
          value: A string representing the response.

        Returns:
          True if the response satisfies every condition above; otherwise False.
        """
        header = value.split("\n")[0].strip()
        if not re.match(
            r'^(ProductID|"ProductID"),[ \t]*(Category|"Category"),[ \t]*(Brand|"Brand"),[ \t]*(Price|"Price"),[ \t]*(Stock|"Stock")$',
            header,
        ):
            return False

        # Tripling each quote makes csv.reader keep one literal quote at both
        # ends of a quoted field, so quoting can still be detected after parsing.
        rows = list(csv.reader(io.StringIO(value.replace('"', '"""'))))
        if len(rows) != 15:
            return False

        data_rows = rows[1:]
        # Validate the shape of ALL rows first; previously the method returned
        # True at the first row with a special field and never looked at the
        # rows after it.
        if any(len(row) != 5 for row in data_rows):
            return False

        quoted_special = re.compile(r'".*[^\d\w\s].*"')
        return any(quoted_special.match(field) for row in data_rows for field in row)
class DateFormatListChecker(Instruction):
    """List the start dates of all the battles Napoleon fought separated by commas, use the following date format: YYYY-MM-DD. Do not provide an explanation."""

    def build_description(self, **kwargs):
        """Build the instruction description; keyword arguments are ignored."""
        self._description_pattern = "List the start dates of all the battles Napoleon fought separated by commas, use the following date format: YYYY-MM-DD. Do not provide an explanation."
        return self._description_pattern

    def get_instruction_args(self):
        """Returns the keyword args of `build_description` (none for this instruction)."""
        return None

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description` (none for this instruction)."""
        return []

    def check_following(self, value):
        """Check that the response is a comma-separated list of YYYY-MM-DD dates.

        Every comma-separated entry (surrounding whitespace ignored) must
        match YYYY-MM-DD, have a year between 1769 and 1821 inclusive, a
        month between 1 and 12, and a day between 1 and the maximum length of
        that month (February is allowed up to 29 regardless of the year).

        Args:
          value: A string representing the response.

        Returns:
          True if every entry is a plausible date in the accepted range;
          otherwise False (including for an empty response).
        """
        # Longest possible length of each month; leap years are not distinguished.
        max_day = {1: 31, 2: 29, 3: 31, 4: 30, 5: 31, 6: 30, 7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31}

        for entry in value.strip().split(","):
            entry = entry.strip()
            if not re.match(r"^\d{4}-\d{2}-\d{2}$", entry):
                return False
            year, month, day = (int(part) for part in entry.split("-"))
            # Year window is hard-coded by the benchmark (presumably Napoleon's lifetime).
            if year < 1769 or year > 1821:
                return False
            # Lower bounds matter: "00" passes the regex but is not a month or a day.
            if month < 1 or month > 12:
                return False
            if day < 1 or day > max_day[month]:
                return False
        return True
class KeywordSpecificPositionChecker(Instruction):
    "Include keyword {keyword1} in the {n}-th sentence, as the {m}-th word of that sentence."

    def build_description(self, keyword=None, n=None, m=None):
        """Record the keyword and its required position, then render the prompt.

        Args:
          keyword: The word that must appear; surrounding whitespace is
            stripped. When falsy, one keyword is drawn from
            `instructions_util.generate_keywords`.
          n: 1-based index of the sentence that must hold the keyword. When
            falsy, a random integer in [20, 30] is used.
          m: 1-based index of the word inside that sentence. When falsy, a
            random integer in [30, 40] is used.

        Returns:
          The instruction text with keyword, n and m filled in.
        """
        self._keyword = keyword.strip() if keyword else instructions_util.generate_keywords(num_keywords=1)[0]
        self._n = n or random.randint(20, 30)
        self._m = m or random.randint(30, 40)

        self._description_pattern = (
            "Include keyword {keyword} in the {n}-th sentence, as the {m}-th word of that sentence."
        )

        return self._description_pattern.format(keyword=self._keyword, n=self._n, m=self._m)

    def get_instruction_args(self):
        """Return the arguments chosen in `build_description` as a dict."""
        return {"keyword": self._keyword, "n": self._n, "m": self._m}

    def get_instruction_args_keys(self):
        """Return the argument names accepted by `build_description`."""
        return ["keyword", "n", "m"]

    def check_following(self, value):
        """Check that the keyword is the m-th token of the n-th sentence.

        The response is split with `instructions_util.split_into_sentences`
        and the selected sentence is tokenized with NLTK's `word_tokenize`.
        The comparison with the keyword is exact (case-sensitive).

        Args:
          value: A string representing the response.

        Returns:
          True if the response has at least n sentences, the n-th sentence
          has at least m tokens, and the m-th token equals the keyword;
          otherwise False.
        """
        sentences = instructions_util.split_into_sentences(value)
        if self._n > len(sentences):
            return False
        tokens = instructions_util.nltk.word_tokenize(sentences[self._n - 1])
        if self._m > len(tokens):
            return False
        return tokens[self._m - 1] == self._keyword
class RepeatChangeChecker(Instruction):
    "Repeat the request, but change the first word of the repeated request, (do not say anything before repeating the request; the request you need to repeat does not include this sentence) and do not answer the actual request!"

    def build_description(self, *, prompt_to_repeat=None):
        """Build the instruction description.

        Args:
          prompt_to_repeat: The request text the response has to repeat
            (with its first word changed).

        Returns:
          A string representing the instruction description.

        Raises:
          ValueError: If `prompt_to_repeat` is missing or empty.
        """
        if not prompt_to_repeat:
            raise ValueError("prompt_to_repeat must be set.")
        self._prompt_to_repeat = prompt_to_repeat

        self._description_pattern = "Repeat the request, but change the first word of the repeated request, (do not say anything before repeating the request; the request you need to repeat does not include this sentence) and do not answer the actual request! Request: {prompt_to_repeat}"
        return self._description_pattern.format(prompt_to_repeat=self._prompt_to_repeat)

    def get_instruction_args(self):
        """Returns the keyword args of `build_description`."""
        return {"prompt_to_repeat": self._prompt_to_repeat}

    def get_instruction_args_keys(self):
        """Returns the args keys of `build_description`."""
        return ["prompt_to_repeat"]

    def check_following(self, value):
        """Check that the response repeats the request with a different first word.

        Both texts are compared as whitespace-separated word lists, so
        differences in spacing or line breaks do not matter.

        Args:
          value: A string representing the response.

        Returns:
          True if the response is non-empty, its first word differs from the
          request's first word, and all remaining words match the request
          exactly; otherwise False.
        """
        request_words = self._prompt_to_repeat.split()
        response_words = value.split()
        # An empty response cannot be a repetition. Previously it passed for a
        # one-word request because both "remaining words" lists were empty.
        if not request_words or not response_words:
            return False
        # Compare the first WORDS rather than the whole strings: an unchanged
        # repeat that only differs in whitespace must not be accepted.
        if request_words[0] == response_words[0]:
            return False
        return request_words[1:] == response_words[1:]
class TitleCaseChecker(Instruction):
    "Write the entire response in title case (capitalize the first letter of every major word)."

    def build_description(self):
        """Store and return the fixed prompt text of this instruction."""
        self._description_pattern = (
            "Write the entire response in title case (capitalize the first letter of every major word)."
        )
        return self._description_pattern

    def get_instruction_args(self):
        """This instruction takes no arguments, so there is nothing to report."""
        return None

    def get_instruction_args_keys(self):
        """This instruction takes no arguments, so the key list is empty."""
        return []

    def check_following(self, value):
        """Check the response token by token for title-case violations.

        The response is tokenized with NLTK's `word_tokenize`. A token is a
        violation only when its first character is lowercase and the rest of
        the token is entirely lowercase or entirely uppercase. Everything
        else passes: capitalized words, tokens without cased characters
        (numbers, punctuation), single-character tokens, and tokens whose
        remainder is mixed case.

        Args:
          value: A string representing the response.

        Returns:
          True if no token is a violation; otherwise False.
        """
        for token in instructions_util.nltk.word_tokenize(value):
            head, tail = token[0], token[1:]
            if head.islower() and (tail.isupper() or tail.islower()):
                return False
        return True
+ + def build_description(self): + """Build the instruction description.""" + self._description_pattern = "The output should not contain any whitespace." + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyword args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response contains any whitespace. + + Args: + value: A string representing the response. + + Returns: + True if the response contains no whitespace; + otherwise, False. + """ + return not any(char.isspace() for char in value) diff --git a/slime/rollout/rm_hub/ifbench_utils/instructions_registry.py b/slime/rollout/rm_hub/ifbench_utils/instructions_registry.py new file mode 100644 index 000000000..05edd0450 --- /dev/null +++ b/slime/rollout/rm_hub/ifbench_utils/instructions_registry.py @@ -0,0 +1,79 @@ +# Copyright 2025 Allen Institute for AI. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Registry of all instructions.""" + +from . 
# Maps each instruction identifier (written as "<group>:<name>") to the
# checker class in `instructions` that builds and verifies that instruction.
INSTRUCTION_DICT = {
    # count / ratio constraints
    "count:word_count_range": instructions.WordCountRangeChecker,
    "count:unique_word_count": instructions.UniqueWordCountChecker,
    "ratio:stop_words": instructions.StopWordPercentageChecker,
    "ratio:sentence_type": instructions.SentTypeRatioChecker,
    "ratio:sentence_balance": instructions.SentBalanceChecker,
    "count:conjunctions": instructions.ConjunctionCountChecker,
    "count:person_names": instructions.PersonNameCountChecker,
    "ratio:overlap": instructions.NGramOverlapChecker,
    "count:numbers": instructions.NumbersCountChecker,
    # word-, sentence- and format-level constraints
    "words:alphabet": instructions.AlphabetLoopChecker,
    "words:vowel": instructions.SingleVowelParagraphChecker,
    "words:consonants": instructions.ConsonantClusterChecker,
    "sentence:alliteration_increment": instructions.IncrementingAlliterationChecker,
    "words:palindrome": instructions.PalindromeChecker,
    "count:punctuation": instructions.PunctuationCoverChecker,
    "format:parentheses": instructions.NestedParenthesesChecker,
    "format:quotes": instructions.NestedQuotesChecker,
    "words:prime_lengths": instructions.PrimeLengthsChecker,
    "format:options": instructions.OptionsResponseChecker,
    "format:newline": instructions.NewLineWordsChecker,
    "format:emoji": instructions.EmojiSentenceChecker,
    "ratio:sentence_words": instructions.CharacterCountUniqueWordsChecker,
    "count:words_japanese": instructions.NthWordJapaneseChecker,
    "words:start_verb": instructions.StartWithVerbChecker,
    "words:repeats": instructions.LimitedWordRepeatChecker,
    "sentence:keyword": instructions.IncludeKeywordChecker,
    "count:pronouns": instructions.PronounCountChecker,
    "words:odd_even_syllables": instructions.AlternateParitySyllablesChecker,
    "words:last_first": instructions.LastWordFirstNextChecker,
    "words:paragraph_last_first": instructions.ParagraphLastFirstWordMatchChecker,
    "sentence:increment": instructions.IncrementingWordCountChecker,
    "words:no_consecutive": instructions.NoConsecutiveFirstLetterChecker,
    "format:line_indent": instructions.IndentStairsChecker,
    "format:quote_unquote": instructions.QuoteExplanationChecker,
    "format:list": instructions.SpecialBulletPointsChecker,
    "format:thesis": instructions.ItalicsThesisChecker,
    "format:sub-bullets": instructions.SubBulletPointsChecker,
    "format:no_bullets_bullets": instructions.SomeBulletPointsChecker,
    # "custom" entries: checkers tied to one fixed prompt
    "custom:multiples": instructions.PrintMultiplesChecker,
    "custom:mcq_count_length": instructions.MultipleChoiceQuestionsChecker,
    "custom:reverse_newline": instructions.ReverseNewlineChecker,
    "custom:word_reverse": instructions.WordReverseOrderChecker,
    "custom:character_reverse": instructions.CharacterReverseOrderChecker,
    "custom:sentence_alphabet": instructions.SentenceAlphabetChecker,
    "custom:european_capitals_sort": instructions.EuropeanCapitalsSortChecker,
    "custom:csv_city": instructions.CityCSVChecker,
    "custom:csv_special_character": instructions.SpecialCharacterCSVChecker,
    "custom:csv_quotes": instructions.QuotesCSVChecker,
    "custom:date_format_list": instructions.DateFormatListChecker,
    # keyword placement, repetition and whole-response format constraints
    "count:keywords_multiple": instructions.KeywordsMultipleChecker,
    "words:keywords_specific_position": instructions.KeywordSpecificPositionChecker,
    "words:words_position": instructions.WordsPositionChecker,
    "repeat:repeat_change": instructions.RepeatChangeChecker,
    "repeat:repeat_simple": instructions.RepeatSimpleChecker,
    "repeat:repeat_span": instructions.RepeatSpanChecker,
    "format:title_case": instructions.TitleCaseChecker,
    "format:output_template": instructions.OutputTemplateChecker,
    "format:no_whitespace": instructions.NoWhitespaceChecker,
}
Institute for AI. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility library of instructions.""" + +import functools +import random +import re + +import nltk + +WORD_LIST = [ + "western", + "sentence", + "signal", + "dump", + "spot", + "opposite", + "bottom", + "potato", + "administration", + "working", + "welcome", + "morning", + "good", + "agency", + "primary", + "wish", + "responsibility", + "press", + "problem", + "president", + "steal", + "brush", + "read", + "type", + "beat", + "trainer", + "growth", + "lock", + "bone", + "case", + "equal", + "comfortable", + "region", + "replacement", + "performance", + "mate", + "walk", + "medicine", + "film", + "thing", + "rock", + "tap", + "total", + "competition", + "ease", + "south", + "establishment", + "gather", + "parking", + "world", + "plenty", + "breath", + "claim", + "alcohol", + "trade", + "dear", + "highlight", + "street", + "matter", + "decision", + "mess", + "agreement", + "studio", + "coach", + "assist", + "brain", + "wing", + "style", + "private", + "top", + "brown", + "leg", + "buy", + "procedure", + "method", + "speed", + "high", + "company", + "valuable", + "pie", + "analyst", + "session", + "pattern", + "district", + "pleasure", + "dinner", + "swimming", + "joke", + "order", + "plate", + "department", + "motor", + "cell", + "spend", + "cabinet", + "difference", + "power", + "examination", + "engine", + "horse", + "dimension", + "pay", + "toe", + "curve", + "literature", + "bother", + "fire", + 
"possibility", + "debate", + "activity", + "passage", + "hello", + "cycle", + "background", + "quiet", + "author", + "effect", + "actor", + "page", + "bicycle", + "error", + "throat", + "attack", + "character", + "phone", + "tea", + "increase", + "outcome", + "file", + "specific", + "inspector", + "internal", + "potential", + "staff", + "building", + "employer", + "shoe", + "hand", + "direction", + "garden", + "purchase", + "interview", + "study", + "recognition", + "member", + "spiritual", + "oven", + "sandwich", + "weird", + "passenger", + "particular", + "response", + "reaction", + "size", + "variation", + "a", + "cancel", + "candy", + "exit", + "guest", + "condition", + "fly", + "price", + "weakness", + "convert", + "hotel", + "great", + "mouth", + "mind", + "song", + "sugar", + "suspect", + "telephone", + "ear", + "roof", + "paint", + "refrigerator", + "organization", + "jury", + "reward", + "engineering", + "day", + "possession", + "crew", + "bar", + "road", + "description", + "celebration", + "score", + "mark", + "letter", + "shower", + "suggestion", + "sir", + "luck", + "national", + "progress", + "hall", + "stroke", + "theory", + "offer", + "story", + "tax", + "definition", + "history", + "ride", + "medium", + "opening", + "glass", + "elevator", + "stomach", + "question", + "ability", + "leading", + "village", + "computer", + "city", + "grand", + "confidence", + "candle", + "priest", + "recommendation", + "point", + "necessary", + "body", + "desk", + "secret", + "horror", + "noise", + "culture", + "warning", + "water", + "round", + "diet", + "flower", + "bus", + "tough", + "permission", + "week", + "prompt", + "connection", + "abuse", + "height", + "save", + "corner", + "border", + "stress", + "drive", + "stop", + "rip", + "meal", + "listen", + "confusion", + "girlfriend", + "living", + "relation", + "significance", + "plan", + "creative", + "atmosphere", + "blame", + "invite", + "housing", + "paper", + "drink", + "roll", + "silver", + "drunk", + "age", + 
"damage", + "smoke", + "environment", + "pack", + "savings", + "influence", + "tourist", + "rain", + "post", + "sign", + "grandmother", + "run", + "profit", + "push", + "clerk", + "final", + "wine", + "swim", + "pause", + "stuff", + "singer", + "funeral", + "average", + "source", + "scene", + "tradition", + "personal", + "snow", + "nobody", + "distance", + "sort", + "sensitive", + "animal", + "major", + "negotiation", + "click", + "mood", + "period", + "arrival", + "expression", + "holiday", + "repeat", + "dust", + "closet", + "gold", + "bad", + "sail", + "combination", + "clothes", + "emphasis", + "duty", + "black", + "step", + "school", + "jump", + "document", + "professional", + "lip", + "chemical", + "front", + "wake", + "while", + "inside", + "watch", + "row", + "subject", + "penalty", + "balance", + "possible", + "adult", + "aside", + "sample", + "appeal", + "wedding", + "depth", + "king", + "award", + "wife", + "blow", + "site", + "camp", + "music", + "safe", + "gift", + "fault", + "guess", + "act", + "shame", + "drama", + "capital", + "exam", + "stupid", + "record", + "sound", + "swing", + "novel", + "minimum", + "ratio", + "machine", + "shape", + "lead", + "operation", + "salary", + "cloud", + "affair", + "hit", + "chapter", + "stage", + "quantity", + "access", + "army", + "chain", + "traffic", + "kick", + "analysis", + "airport", + "time", + "vacation", + "philosophy", + "ball", + "chest", + "thanks", + "place", + "mountain", + "advertising", + "red", + "past", + "rent", + "return", + "tour", + "house", + "construction", + "net", + "native", + "war", + "figure", + "fee", + "spray", + "user", + "dirt", + "shot", + "task", + "stick", + "friend", + "software", + "promotion", + "interaction", + "surround", + "block", + "purpose", + "practice", + "conflict", + "routine", + "requirement", + "bonus", + "hole", + "state", + "junior", + "sweet", + "catch", + "tear", + "fold", + "wall", + "editor", + "life", + "position", + "pound", + "respect", + "bathroom", + 
"coat", + "script", + "job", + "teach", + "birth", + "view", + "resolve", + "theme", + "employee", + "doubt", + "market", + "education", + "serve", + "recover", + "tone", + "harm", + "miss", + "union", + "understanding", + "cow", + "river", + "association", + "concept", + "training", + "recipe", + "relationship", + "reserve", + "depression", + "proof", + "hair", + "revenue", + "independent", + "lift", + "assignment", + "temporary", + "amount", + "loss", + "edge", + "track", + "check", + "rope", + "estimate", + "pollution", + "stable", + "message", + "delivery", + "perspective", + "mirror", + "assistant", + "representative", + "witness", + "nature", + "judge", + "fruit", + "tip", + "devil", + "town", + "emergency", + "upper", + "drop", + "stay", + "human", + "neck", + "speaker", + "network", + "sing", + "resist", + "league", + "trip", + "signature", + "lawyer", + "importance", + "gas", + "choice", + "engineer", + "success", + "part", + "external", + "worker", + "simple", + "quarter", + "student", + "heart", + "pass", + "spite", + "shift", + "rough", + "lady", + "grass", + "community", + "garage", + "youth", + "standard", + "skirt", + "promise", + "blind", + "television", + "disease", + "commission", + "positive", + "energy", + "calm", + "presence", + "tune", + "basis", + "preference", + "head", + "common", + "cut", + "somewhere", + "presentation", + "current", + "thought", + "revolution", + "effort", + "master", + "implement", + "republic", + "floor", + "principle", + "stranger", + "shoulder", + "grade", + "button", + "tennis", + "police", + "collection", + "account", + "register", + "glove", + "divide", + "professor", + "chair", + "priority", + "combine", + "peace", + "extension", + "maybe", + "evening", + "frame", + "sister", + "wave", + "code", + "application", + "mouse", + "match", + "counter", + "bottle", + "half", + "cheek", + "resolution", + "back", + "knowledge", + "make", + "discussion", + "screw", + "length", + "accident", + "battle", + "dress", + "knee", 
+ "log", + "package", + "it", + "turn", + "hearing", + "newspaper", + "layer", + "wealth", + "profile", + "imagination", + "answer", + "weekend", + "teacher", + "appearance", + "meet", + "bike", + "rise", + "belt", + "crash", + "bowl", + "equivalent", + "support", + "image", + "poem", + "risk", + "excitement", + "remote", + "secretary", + "public", + "produce", + "plane", + "display", + "money", + "sand", + "situation", + "punch", + "customer", + "title", + "shake", + "mortgage", + "option", + "number", + "pop", + "window", + "extent", + "nothing", + "experience", + "opinion", + "departure", + "dance", + "indication", + "boy", + "material", + "band", + "leader", + "sun", + "beautiful", + "muscle", + "farmer", + "variety", + "fat", + "handle", + "director", + "opportunity", + "calendar", + "outside", + "pace", + "bath", + "fish", + "consequence", + "put", + "owner", + "go", + "doctor", + "information", + "share", + "hurt", + "protection", + "career", + "finance", + "force", + "golf", + "garbage", + "aspect", + "kid", + "food", + "boot", + "milk", + "respond", + "objective", + "reality", + "raw", + "ring", + "mall", + "one", + "impact", + "area", + "news", + "international", + "series", + "impress", + "mother", + "shelter", + "strike", + "loan", + "month", + "seat", + "anything", + "entertainment", + "familiar", + "clue", + "year", + "glad", + "supermarket", + "natural", + "god", + "cost", + "conversation", + "tie", + "ruin", + "comfort", + "earth", + "storm", + "percentage", + "assistance", + "budget", + "strength", + "beginning", + "sleep", + "other", + "young", + "unit", + "fill", + "store", + "desire", + "hide", + "value", + "cup", + "maintenance", + "nurse", + "function", + "tower", + "role", + "class", + "camera", + "database", + "panic", + "nation", + "basket", + "ice", + "art", + "spirit", + "chart", + "exchange", + "feedback", + "statement", + "reputation", + "search", + "hunt", + "exercise", + "nasty", + "notice", + "male", + "yard", + "annual", + "collar", 
+ "date", + "platform", + "plant", + "fortune", + "passion", + "friendship", + "spread", + "cancer", + "ticket", + "attitude", + "island", + "active", + "object", + "service", + "buyer", + "bite", + "card", + "face", + "steak", + "proposal", + "patient", + "heat", + "rule", + "resident", + "broad", + "politics", + "west", + "knife", + "expert", + "girl", + "design", + "salt", + "baseball", + "grab", + "inspection", + "cousin", + "couple", + "magazine", + "cook", + "dependent", + "security", + "chicken", + "version", + "currency", + "ladder", + "scheme", + "kitchen", + "employment", + "local", + "attention", + "manager", + "fact", + "cover", + "sad", + "guard", + "relative", + "county", + "rate", + "lunch", + "program", + "initiative", + "gear", + "bridge", + "breast", + "talk", + "dish", + "guarantee", + "beer", + "vehicle", + "reception", + "woman", + "substance", + "copy", + "lecture", + "advantage", + "park", + "cold", + "death", + "mix", + "hold", + "scale", + "tomorrow", + "blood", + "request", + "green", + "cookie", + "church", + "strip", + "forever", + "beyond", + "debt", + "tackle", + "wash", + "following", + "feel", + "maximum", + "sector", + "sea", + "property", + "economics", + "menu", + "bench", + "try", + "language", + "start", + "call", + "solid", + "address", + "income", + "foot", + "senior", + "honey", + "few", + "mixture", + "cash", + "grocery", + "link", + "map", + "form", + "factor", + "pot", + "model", + "writer", + "farm", + "winter", + "skill", + "anywhere", + "birthday", + "policy", + "release", + "husband", + "lab", + "hurry", + "mail", + "equipment", + "sink", + "pair", + "driver", + "consideration", + "leather", + "skin", + "blue", + "boat", + "sale", + "brick", + "two", + "feed", + "square", + "dot", + "rush", + "dream", + "location", + "afternoon", + "manufacturer", + "control", + "occasion", + "trouble", + "introduction", + "advice", + "bet", + "eat", + "kill", + "category", + "manner", + "office", + "estate", + "pride", + "awareness", 
+ "slip", + "crack", + "client", + "nail", + "shoot", + "membership", + "soft", + "anybody", + "web", + "official", + "individual", + "pizza", + "interest", + "bag", + "spell", + "profession", + "queen", + "deal", + "resource", + "ship", + "guy", + "chocolate", + "joint", + "formal", + "upstairs", + "car", + "resort", + "abroad", + "dealer", + "associate", + "finger", + "surgery", + "comment", + "team", + "detail", + "crazy", + "path", + "tale", + "initial", + "arm", + "radio", + "demand", + "single", + "draw", + "yellow", + "contest", + "piece", + "quote", + "pull", + "commercial", + "shirt", + "contribution", + "cream", + "channel", + "suit", + "discipline", + "instruction", + "concert", + "speech", + "low", + "effective", + "hang", + "scratch", + "industry", + "breakfast", + "lay", + "join", + "metal", + "bedroom", + "minute", + "product", + "rest", + "temperature", + "many", + "give", + "argument", + "print", + "purple", + "laugh", + "health", + "credit", + "investment", + "sell", + "setting", + "lesson", + "egg", + "middle", + "marriage", + "level", + "evidence", + "phrase", + "love", + "self", + "benefit", + "guidance", + "affect", + "you", + "dad", + "anxiety", + "special", + "boyfriend", + "test", + "blank", + "payment", + "soup", + "obligation", + "reply", + "smile", + "deep", + "complaint", + "addition", + "review", + "box", + "towel", + "minor", + "fun", + "soil", + "issue", + "cigarette", + "internet", + "gain", + "tell", + "entry", + "spare", + "incident", + "family", + "refuse", + "branch", + "can", + "pen", + "grandfather", + "constant", + "tank", + "uncle", + "climate", + "ground", + "volume", + "communication", + "kind", + "poet", + "child", + "screen", + "mine", + "quit", + "gene", + "lack", + "charity", + "memory", + "tooth", + "fear", + "mention", + "marketing", + "reveal", + "reason", + "court", + "season", + "freedom", + "land", + "sport", + "audience", + "classroom", + "law", + "hook", + "win", + "carry", + "eye", + "smell", + "distribution", 
+ "research", + "country", + "dare", + "hope", + "whereas", + "stretch", + "library", + "if", + "delay", + "college", + "plastic", + "book", + "present", + "use", + "worry", + "champion", + "goal", + "economy", + "march", + "election", + "reflection", + "midnight", + "slide", + "inflation", + "action", + "challenge", + "guitar", + "coast", + "apple", + "campaign", + "field", + "jacket", + "sense", + "way", + "visual", + "remove", + "weather", + "trash", + "cable", + "regret", + "buddy", + "beach", + "historian", + "courage", + "sympathy", + "truck", + "tension", + "permit", + "nose", + "bed", + "son", + "person", + "base", + "meat", + "usual", + "air", + "meeting", + "worth", + "game", + "independence", + "physical", + "brief", + "play", + "raise", + "board", + "she", + "key", + "writing", + "pick", + "command", + "party", + "yesterday", + "spring", + "candidate", + "physics", + "university", + "concern", + "development", + "change", + "string", + "target", + "instance", + "room", + "bitter", + "bird", + "football", + "normal", + "split", + "impression", + "wood", + "long", + "meaning", + "stock", + "cap", + "leadership", + "media", + "ambition", + "fishing", + "essay", + "salad", + "repair", + "today", + "designer", + "night", + "bank", + "drawing", + "inevitable", + "phase", + "vast", + "chip", + "anger", + "switch", + "cry", + "twist", + "personality", + "attempt", + "storage", + "being", + "preparation", + "bat", + "selection", + "white", + "technology", + "contract", + "side", + "section", + "station", + "till", + "structure", + "tongue", + "taste", + "truth", + "difficulty", + "group", + "limit", + "main", + "move", + "feeling", + "light", + "example", + "mission", + "might", + "wait", + "wheel", + "shop", + "host", + "classic", + "alternative", + "cause", + "agent", + "consist", + "table", + "airline", + "text", + "pool", + "craft", + "range", + "fuel", + "tool", + "partner", + "load", + "entrance", + "deposit", + "hate", + "article", + "video", + "summer", 
+ "feature", + "extreme", + "mobile", + "hospital", + "flight", + "fall", + "pension", + "piano", + "fail", + "result", + "rub", + "gap", + "system", + "report", + "suck", + "ordinary", + "wind", + "nerve", + "ask", + "shine", + "note", + "line", + "mom", + "perception", + "brother", + "reference", + "bend", + "charge", + "treat", + "trick", + "term", + "homework", + "bake", + "bid", + "status", + "project", + "strategy", + "orange", + "let", + "enthusiasm", + "parent", + "concentrate", + "device", + "travel", + "poetry", + "business", + "society", + "kiss", + "end", + "vegetable", + "employ", + "schedule", + "hour", + "brave", + "focus", + "process", + "movie", + "illegal", + "general", + "coffee", + "ad", + "highway", + "chemistry", + "psychology", + "hire", + "bell", + "conference", + "relief", + "show", + "neat", + "funny", + "weight", + "quality", + "club", + "daughter", + "zone", + "touch", + "tonight", + "shock", + "burn", + "excuse", + "name", + "survey", + "landscape", + "advance", + "satisfaction", + "bread", + "disaster", + "item", + "hat", + "prior", + "shopping", + "visit", + "east", + "photo", + "home", + "idea", + "father", + "comparison", + "cat", + "pipe", + "winner", + "count", + "lake", + "fight", + "prize", + "foundation", + "dog", + "keep", + "ideal", + "fan", + "struggle", + "peak", + "safety", + "solution", + "hell", + "conclusion", + "population", + "strain", + "alarm", + "measurement", + "second", + "train", + "race", + "due", + "insurance", + "boss", + "tree", + "monitor", + "sick", + "course", + "drag", + "appointment", + "slice", + "still", + "care", + "patience", + "rich", + "escape", + "emotion", + "royal", + "female", + "childhood", + "government", + "picture", + "will", + "sock", + "big", + "gate", + "oil", + "cross", + "pin", + "improvement", + "championship", + "silly", + "help", + "sky", + "pitch", + "man", + "diamond", + "most", + "transition", + "work", + "science", + "committee", + "moment", + "fix", + "teaching", + "dig", + 
"specialist", + "complex", + "guide", + "people", + "dead", + "voice", + "original", + "break", + "topic", + "data", + "degree", + "reading", + "recording", + "bunch", + "reach", + "judgment", + "lie", + "regular", + "set", + "painting", + "mode", + "list", + "player", + "bear", + "north", + "wonder", + "carpet", + "heavy", + "officer", + "negative", + "clock", + "unique", + "baby", + "pain", + "assumption", + "disk", + "iron", + "bill", + "drawer", + "look", + "double", + "mistake", + "finish", + "future", + "brilliant", + "contact", + "math", + "rice", + "leave", + "restaurant", + "discount", + "sex", + "virus", + "bit", + "trust", + "event", + "wear", + "juice", + "failure", + "bug", + "context", + "mud", + "whole", + "wrap", + "intention", + "draft", + "pressure", + "cake", + "dark", + "explanation", + "space", + "angle", + "word", + "efficiency", + "management", + "habit", + "star", + "chance", + "finding", + "transportation", + "stand", + "criticism", + "flow", + "door", + "injury", + "insect", + "surprise", + "apartment", +] # pylint: disable=line-too-long + + +def download_nltk_resources(): + """Download 'punkt' if not already installed""" + try: + nltk.data.find("tokenizers/punkt") + except LookupError: + nltk.download("punkt") + + +download_nltk_resources() + + +_ALPHABETS = "([A-Za-z])" +_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]" +_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)" +_STARTERS = ( + r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)" +) +_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)" +_WEBSITES = "[.](com|net|org|io|gov|edu|me)" +_DIGITS = "([0-9])" +_MULTIPLE_DOTS = r"\.{2,}" + + +def split_into_sentences(text): + """Split the text into sentences. + + Args: + text: A string that consists of more than or equal to one sentences. + + Returns: + A list of strings where each string is a sentence. 
+ """ + text = " " + text + " " + text = text.replace("\n", " ") + text = re.sub(_PREFIXES, "\\1", text) + text = re.sub(_WEBSITES, "\\1", text) + text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1\\2", text) + text = re.sub( + _MULTIPLE_DOTS, + lambda match: "" * len(match.group(0)) + "", + text, + ) + if "Ph.D" in text: + text = text.replace("Ph.D.", "PhD") + text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1 ", text) + text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1 \\2", text) + text = re.sub( + _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]", + "\\1\\2\\3", + text, + ) + text = re.sub(_ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1\\2", text) + text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1 \\2", text) + text = re.sub(" " + _SUFFIXES + "[.]", " \\1", text) + text = re.sub(" " + _ALPHABETS + "[.]", " \\1", text) + if "”" in text: + text = text.replace(".”", "”.") + if '"' in text: + text = text.replace('."', '".') + if "!" in text: + text = text.replace('!"', '"!') + if "?" 
in text: + text = text.replace('?"', '"?') + text = text.replace(".", ".") + text = text.replace("?", "?") + text = text.replace("!", "!") + text = text.replace("", ".") + sentences = text.split("") + sentences = [s.strip() for s in sentences] + if sentences and not sentences[-1]: + sentences = sentences[:-1] + return sentences + + +def count_words(text): + """Counts the number of words.""" + tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+") + tokens = tokenizer.tokenize(text) + num_words = len(tokens) + return num_words + + +@functools.lru_cache(maxsize=None) +def _get_sentence_tokenizer(): + return nltk.data.load("nltk:tokenizers/punkt/english.pickle") + + +def count_stopwords(text): + """Counts the number of stopwords.""" + nltk.download("stopwords") + stopwords = nltk.corpus.stopwords.words("english") + tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+") + tokens = tokenizer.tokenize(text) + num_stopwords = len([t for t in tokens if t.lower() in stopwords]) + return num_stopwords + + +def generate_keywords(num_keywords): + """Randomly generates a few keywords.""" + return random.sample(WORD_LIST, k=num_keywords) From e24e776abaebd75368da2f3f1a3ffecb9fd9a8e2 Mon Sep 17 00:00:00 2001 From: zyzshishui Date: Sun, 26 Oct 2025 02:32:32 +0000 Subject: [PATCH 02/16] cfg --- examples/eval_multi_task/multi_task.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/eval_multi_task/multi_task.yaml b/examples/eval_multi_task/multi_task.yaml index 47ad91c4d..643f759c3 100644 --- a/examples/eval_multi_task/multi_task.yaml +++ b/examples/eval_multi_task/multi_task.yaml @@ -1,15 +1,17 @@ eval: defaults: - n_samples_per_eval_prompt: 16 max_response_len: 16384 top_p: 0.7 datasets: - name: aime path: /root/aime-2024/aime-2024.jsonl rm_type: deepscaler + n_samples_per_eval_prompt: 16 - name: gpqa # huggingface-cli download --repo-type dataset zyzshishui0627/gpqa_diamond --local-dir /root/gpqa path: /root/gpqa/gpqa_eval.jsonl rm_type: gpqa + 
n_samples_per_eval_prompt: 2 - name: ifbench # huggingface-cli download --repo-type dataset zyzshishui0627/IFBench --local-dir /root/IFBench path: /root/IFBench/IFBench_eval.jsonl rm_type: ifbench + n_samples_per_eval_prompt: 1 From 10eb6b4ac676d2ebc9afaa2736453ac1f5374751 Mon Sep 17 00:00:00 2001 From: Jiajun Li Date: Sun, 26 Oct 2025 04:43:24 +0000 Subject: [PATCH 03/16] add comment --- slime/rollout/rm_hub/ifbench.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/slime/rollout/rm_hub/ifbench.py b/slime/rollout/rm_hub/ifbench.py index a5f65340b..63bfceb01 100644 --- a/slime/rollout/rm_hub/ifbench.py +++ b/slime/rollout/rm_hub/ifbench.py @@ -1,3 +1,20 @@ +# Copyright 2025 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# The following code is adapted from the official IFBench code: +# https://github.com/allenai/IFBench/blob/main/evaluation_lib.py + from __future__ import annotations import dataclasses From 55ecdb09350681b31bb6a45fd84775d46e6f2039 Mon Sep 17 00:00:00 2001 From: Jiajun Li Date: Sun, 26 Oct 2025 05:26:23 +0000 Subject: [PATCH 04/16] fix requirements.txt and wandb log params --- examples/eval_multi_task/requirements.txt | 9 +++++++++ slime/utils/wandb_utils.py | 16 ++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 examples/eval_multi_task/requirements.txt diff --git a/examples/eval_multi_task/requirements.txt b/examples/eval_multi_task/requirements.txt new file mode 100644 index 000000000..c5501a4ab --- /dev/null +++ b/examples/eval_multi_task/requirements.txt @@ -0,0 +1,9 @@ +absl-py +emoji +immutabledict +langdetect +nltk +numpy==1.26.4 +spacy +syllapy +unicodedata2 diff --git a/slime/utils/wandb_utils.py b/slime/utils/wandb_utils.py index b8edab7b6..eb16544ea 100644 --- a/slime/utils/wandb_utils.py +++ b/slime/utils/wandb_utils.py @@ -1,7 +1,19 @@ import os +from typing import Any + import wandb +def _build_wandb_config(args) -> dict[str, Any]: + config = args.__dict__ + if args.eval_datasets: + for dataset_config in args.eval_datasets: + name = dataset_config.name + for key, value in dataset_config.__dict__.items(): + config[f"eval_{name}_{key}"] = value + return config + + def _is_offline_mode(args) -> bool: """Detect whether W&B should run in offline mode.
@@ -49,7 +61,7 @@ def init_wandb_primary(args): "project": args.wandb_project, "group": group, "name": run_name, - "config": args.__dict__, + "config": _build_wandb_config(args), } # Configure settings based on offline/online mode @@ -111,7 +123,7 @@ def init_wandb_secondary(args, wandb_run_id, router_addr=None): "id": wandb_run_id, "entity": args.wandb_team, "project": args.wandb_project, - "config": args.__dict__, + "config": _build_wandb_config(args), "resume": "allow", "reinit": True, "settings": wandb.Settings(**settings_kwargs), From f8821381134024cbb73b6385d664c95f1267180e Mon Sep 17 00:00:00 2001 From: Jiajun Li Date: Sun, 26 Oct 2025 16:53:43 +0000 Subject: [PATCH 05/16] revert log params --- slime/utils/wandb_utils.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/slime/utils/wandb_utils.py b/slime/utils/wandb_utils.py index eb16544ea..b8edab7b6 100644 --- a/slime/utils/wandb_utils.py +++ b/slime/utils/wandb_utils.py @@ -1,19 +1,7 @@ import os -from typing import Any - import wandb -def _build_wandb_config(args) -> dict[str, Any]: - config = args.__dict__ - if args.eval_datasets: - for dataset_config in args.eval_datasets: - name = dataset_config.name - for key, value in dataset_config.__dict__.items(): - config[f"eval_{name}_{key}"] = value - return config - - def _is_offline_mode(args) -> bool: """Detect whether W&B should run in offline mode. 
@@ -61,7 +49,7 @@ def init_wandb_primary(args): "project": args.wandb_project, "group": group, "name": run_name, - "config": _build_wandb_config(args), + "config": args.__dict__, } # Configure settings based on offline/online mode @@ -123,7 +111,7 @@ def init_wandb_secondary(args, wandb_run_id, router_addr=None): "id": wandb_run_id, "entity": args.wandb_team, "project": args.wandb_project, - "config": _build_wandb_config(args), + "config": args.__dict__, "resume": "allow", "reinit": True, "settings": wandb.Settings(**settings_kwargs), From b87fbeebdba5025fc77fb11add1b48111e1b68a2 Mon Sep 17 00:00:00 2001 From: zyzshishui Date: Sun, 26 Oct 2025 20:42:36 +0000 Subject: [PATCH 06/16] change data path to avoid conflicts --- examples/eval_multi_task/multi_task.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/eval_multi_task/multi_task.yaml b/examples/eval_multi_task/multi_task.yaml index 643f759c3..83ae67f8a 100644 --- a/examples/eval_multi_task/multi_task.yaml +++ b/examples/eval_multi_task/multi_task.yaml @@ -11,7 +11,7 @@ eval: path: /root/gpqa/gpqa_eval.jsonl rm_type: gpqa n_samples_per_eval_prompt: 2 - - name: ifbench # huggingface-cli download --repo-type dataset zyzshishui0627/IFBench --local-dir /root/IFBench - path: /root/IFBench/IFBench_eval.jsonl + - name: ifbench # huggingface-cli download --repo-type dataset zyzshishui0627/IFBench --local-dir /root/ifbench + path: /root/ifbench/IFBench_eval.jsonl rm_type: ifbench n_samples_per_eval_prompt: 1 From ba6e933f0d8ae65d894fac6834faf71263e2fcbf Mon Sep 17 00:00:00 2001 From: zyzshishui Date: Sun, 26 Oct 2025 20:44:10 +0000 Subject: [PATCH 07/16] replace local ifbench utils with outside import --- slime/rollout/rm_hub/ifbench.py | 17 +- .../rm_hub/ifbench_utils/instructions.py | 2347 ----------------- .../ifbench_utils/instructions_registry.py | 79 - .../rm_hub/ifbench_utils/instructions_util.py | 1651 ------------ 4 files changed, 16 insertions(+), 4078 deletions(-) delete mode 
100644 slime/rollout/rm_hub/ifbench_utils/instructions.py delete mode 100644 slime/rollout/rm_hub/ifbench_utils/instructions_registry.py delete mode 100644 slime/rollout/rm_hub/ifbench_utils/instructions_util.py diff --git a/slime/rollout/rm_hub/ifbench.py b/slime/rollout/rm_hub/ifbench.py index 63bfceb01..e51f053c5 100644 --- a/slime/rollout/rm_hub/ifbench.py +++ b/slime/rollout/rm_hub/ifbench.py @@ -18,10 +18,25 @@ from __future__ import annotations import dataclasses +import importlib import logging +import sys +from pathlib import Path from typing import Any, Dict, List, Optional, Sequence, Union -from .ifbench_utils import instructions_registry +try: + from ifbench import instructions_registry # type: ignore[attr-defined] +except ImportError: + _IFBENCH_REPO_ROOT = Path(__file__).resolve().parents[3] / "ifbench" + if not _IFBENCH_REPO_ROOT.exists(): + raise ImportError( + "IFBench repository not found. Clone https://github.com/allenai/IFBench.git " + "into the repo root or export PYTHONPATH accordingly." + ) from None + repo_path = str(_IFBENCH_REPO_ROOT) + if repo_path not in sys.path: + sys.path.insert(0, repo_path) + instructions_registry = importlib.import_module("instructions_registry") logger = logging.getLogger(__name__) diff --git a/slime/rollout/rm_hub/ifbench_utils/instructions.py b/slime/rollout/rm_hub/ifbench_utils/instructions.py deleted file mode 100644 index 1ed036522..000000000 --- a/slime/rollout/rm_hub/ifbench_utils/instructions.py +++ /dev/null @@ -1,2347 +0,0 @@ -# Copyright 2025 Allen Institute for AI. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Library of instructions.""" - -import csv -import io -import logging -import random -import re -import string -import unicodedata -from collections import Counter -from typing import Dict, Optional, Sequence, Union - -import emoji -import nltk -import spacy -import syllapy -from spacy.cli import download - -from . import instructions_util - -try: - spacy.load("en_core_web_sm") -except OSError: - download("en_core_web_sm") - -logger = logging.getLogger(__name__) - -_InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]] - -# The number of keywords. -_NUM_KEYWORDS = 2 - -# The number of words in the response. -_NUM_WORDS_LOWER_LIMIT = 100 -_NUM_WORDS_UPPER_LIMIT = 500 - -# The number of numbers. -_NUM_NUMBERS = 6 - -# Period length for periodic words. -_NUM_WORD_CYCLE = 30 - -# Maximum number of times a word can be repeated. -_MAX_REPEATS = 5 - -# Which sentence must contain a keyword. -_NUM_KEYWORD_SENTENCE = 20 - -# Minimum number of pronouns. -_NUM_PRONOUNS = 25 - -# The size of increment for lengths. -_NUM_INCREMENT = 5 - -# The number of coordinating conjunctions. 
-_NUM_CONJUNCTIONS = 6 - - -class Instruction: - """An instruction template.""" - - def __init__(self, instruction_id): - self.id = instruction_id - - def build_description(self, **kwargs): - raise NotImplementedError("`build_description` not implemented.") - - def get_instruction_args(self): - raise NotImplementedError("`get_instruction_args` not implemented.") - - def get_instruction_args_keys(self): - raise NotImplementedError("`get_instruction_args_keys` not implemented.") - - def check_following(self, value): - raise NotImplementedError("`check_following` not implemented.") - - -# Everything as follows is part of OOD IFEval - - -class WordCountRangeChecker(Instruction): - """Word Count Range: The response must contain between X and Y words.""" - - def build_description(self, *, min_words=None, max_words=None): - """Build the instruction description. - - Args: - min_words: An integer specifying the minimum number of words contained in the response. - max_words: An integer specifying the maximum number of words contained in the response. - - Returns: - A string representing the instruction description. - """ - self._min_words = min_words - self._max_words = max_words - - if self._min_words is None or self._min_words < 0: - self._min_words = random.randint(_NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT) - - # Make the range small - if self._max_words is None or self._max_words < 0: - self._max_words = self._min_words + random.randint(int(self._min_words * 0.05), int(self._min_words * 0.1)) - - self._description_pattern = "The response must contain between {min_words} and {max_words} words." 
- - return self._description_pattern.format(min_words=self._min_words, max_words=self._max_words) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"min_words": self._min_words, "max_words": self._max_words} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["min_words", "max_words"] - - def check_following(self, value): - """Checks if the response contains the expected number of words.""" - num_words = instructions_util.count_words(value) - return self._min_words <= num_words <= self._max_words - - -class UniqueWordCountChecker(Instruction): - """Unique Word Count: The response must contain X unique words.""" - - def build_description(self, *, N=None): - """Build the instruction description. - - Args: - n: An integer specifying the number of unique words contained in the response. - - Returns: - A string representing the instruction description. - """ - self._num_unique_words = N - - if self._num_unique_words is None or self._num_unique_words < 0: - self._num_unique_words = random.randint(_NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT) - - self._description_pattern = "Use at least {N} unique words in the response." 
- - return self._description_pattern.format(N=self._num_unique_words) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"N": self._num_unique_words} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["N"] - - def check_following(self, value): - """Checks if the response contains the expected number of unique words.""" - words = value.lower().split() - unique_words = set() - for word in words: - unique_words.add(word.strip("".join(string.punctuation) + " ")) - # Convert to set to get unique words - return len(unique_words) >= self._num_unique_words - - -class StopWordPercentageChecker(Instruction): - """Ensure that stop words constitute no more than {percentage}% of the total words in your response.""" - - def build_description(self, *, percentage=None): - """Build the instruction description. - - Args: - percentage: An integer specifying the percentage of stop words that are allowed in the response. - - Returns: - A string representing the instruction description. - """ - self._percentage = percentage - - if self._percentage is None or self._percentage < 0: - self._percentage = random.randint(1, 100) - - self._description_pattern = ( - "Ensure that stop words constitute no more than {percentage}% of the total words in your response." 
- ) - - return self._description_pattern.format(percentage=self._percentage) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"percentage": self._percentage} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["percentage"] - - def check_following(self, value): - """Checks if the response contains the expected percentage of stop words.""" - num_words = instructions_util.count_words(value) - num_stopwords = instructions_util.count_stopwords(value) - stopword_percentage = (num_stopwords / num_words) * 100 - return stopword_percentage <= self._percentage - - -class SentTypeRatioChecker(Instruction): - """Maintain a 2:1 ratio of declarative to interrogative sentences.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Maintain a 2:1 ratio of declarative to interrogative sentences." - nltk.download("punkt_tab") - return self._description_pattern - - def get_instruction_args(self): - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response contains the expected ratio of declarative to interrogative sentences.""" - # Split the text into sentences - sentences = instructions_util.split_into_sentences(value) - # Count the number of declarative and interrogative sentences - declarative_count = sum(1 for sentence in sentences if sentence.endswith(".")) - interrogative_count = sum(1 for sentence in sentences if sentence.endswith("?")) - # Check if the ratio is 2:1 - return declarative_count == 2 * interrogative_count - - -class SentBalanceChecker(Instruction): - """Ensure that the ratio of sentence types (declarative, interrogative, exclamatory) is balanced.""" - - def build_description(self): - """Build the instruction description.""" - nltk.download("punkt_tab") - 
self._description_pattern = ( - "Ensure that the ratio of sentence types (declarative, interrogative, exclamatory) is balanced." - ) - return self._description_pattern - - def get_instruction_args(self): - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response contains a balanced ratio of sentence types.""" - # Split the text into sentences - sentences = instructions_util.split_into_sentences(value) - # Count the number of each sentence type - declarative_count = sum(1 for sentence in sentences if sentence.endswith(".")) - interrogative_count = sum(1 for sentence in sentences if sentence.endswith("?")) - exclamatory_count = sum(1 for sentence in sentences if sentence.endswith("!")) - # Check if the ratio of sentence types is balanced - return declarative_count == interrogative_count == exclamatory_count - - -class ConjunctionCountChecker(Instruction): - """Use at least {small_n} different coordinating conjunctions in the response.""" - - def build_description(self, *, small_n=None): - """Build the instruction description. - - Args: - small_n: An integer specifying the number of different coordinating conjunctions contained in the response. - - Returns: - A string representing the instruction description. - """ - self._num_conjunctions = small_n - - if self._num_conjunctions is None or self._num_conjunctions < 0: - self._num_conjunctions = random.randint(2, _NUM_CONJUNCTIONS) - - self._description_pattern = "Use at least {small_n} different coordinating conjunctions in the response." 
- - return self._description_pattern.format(small_n=self._num_conjunctions) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"small_n": self._num_conjunctions} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["small_n"] - - def check_following(self, value): - """Checks if the response contains the expected number of different coordinating conjunctions.""" - # Split the text into words - words = value.split() - # Count the number of coordinating conjunctions - conjunctions = [ - word - for word in words - if word.strip("".join(string.punctuation) + " ").lower() in ["and", "but", "for", "nor", "or", "so", "yet"] - ] - unique_conjunctions = set(conjunctions) - return len(unique_conjunctions) >= self._num_conjunctions - - -class PersonNameCountChecker(Instruction): - """Mention at least {N} different person names in the response, from this list of person names: Emma, Liam, Sophia...""" - - def build_description(self, *, N=None): - """Build the instruction description. - - Args: - N: An integer specifying the minimum number of unique person names contained in the response. - - Returns: - A string representing the instruction description. - """ - self._num_person_names = N - - if self._num_person_names is None or self._num_person_names < 0: - self._num_person_names = random.randint(1, 50) - - self._description_pattern = "Mention at least {N} different person names in the response, from this list of person names: Emma, Liam, Sophia, Jackson, Olivia, Noah, Ava, Lucas, Isabella, Mason, Mia, Ethan, Charlotte, Alexander, Amelia, Benjamin, Harper, Leo, Zoe, Daniel, Chloe, Samuel, Lily, Matthew, Grace, Owen, Abigail, Gabriel, Ella, Jacob, Scarlett, Nathan, Victoria, Elijah, Layla, Nicholas, Audrey, David, Hannah, Christopher, Penelope, Thomas, Nora, Andrew, Aria, Joseph, Claire, Ryan, Stella, Jonathan ." 
- return self._description_pattern.format(N=self._num_person_names) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"N": self._num_person_names} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["N"] - - def check_following(self, value): - """Checks if the response contains at least the expected number of unique person names.""" - person_name_list = [ - "Emma", - "Liam", - "Sophia", - "Jackson", - "Olivia", - "Noah", - "Ava", - "Lucas", - "Isabella", - "Mason", - "Mia", - "Ethan", - "Charlotte", - "Alexander", - "Amelia", - "Benjamin", - "Harper", - "Leo", - "Zoe", - "Daniel", - "Chloe", - "Samuel", - "Lily", - "Matthew", - "Grace", - "Owen", - "Abigail", - "Gabriel", - "Ella", - "Jacob", - "Scarlett", - "Nathan", - "Victoria", - "Elijah", - "Layla", - "Nicholas", - "Audrey", - "David", - "Hannah", - "Christopher", - "Penelope", - "Thomas", - "Nora", - "Andrew", - "Aria", - "Joseph", - "Claire", - "Ryan", - "Stella", - "Jonathan", - ] - # Extract the named entities - person_names = [] - for name in person_name_list: - if name in value: - person_names.append(name) - unique_person_names = set(person_names) - - return len(unique_person_names) >= self._num_person_names - - -class NGramOverlapChecker(Instruction): - """Maintain a trigram overlap of {percentage}% (±2%) with the provided reference text.""" - - def build_description(self, *, reference_text=None, percentage=None): - """Build the instruction description. - - Args: - reference_text: A string representing the reference text. - percentage: An integer specifying the percent trigram overlap - to maintain in the response. - - Returns: - A string representing the instruction description. 
- """ - self._reference_text = reference_text - self._percentage = percentage - if self._percentage is None or self._percentage < 0: - self._percentage = random.randint(1, 100) - - self._description_pattern = ( - "Maintain a trigram overlap of {percentage}% (±2%) with the provided reference text." - ) - return self._description_pattern.format(percentage=self._percentage) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"reference_text": self._reference_text, "percentage": self._percentage} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["reference_text", "percentage"] - - def check_following(self, value): - """Checks if the response maintains a trigram overlap with the reference text within 2% of {percent}.""" - n = 3 - ngrams = set(nltk.ngrams(value, n)) - ref_ngrams = set(nltk.ngrams(self._reference_text, n)) - overlap = len(ngrams.intersection(ref_ngrams)) / len(ngrams) - return self._percentage - 2 <= overlap * 100 <= self._percentage + 2 - - -class NumbersCountChecker(Instruction): - """Include exactly {N} numbers in the response.""" - - def build_description(self, *, N=None): - """Build the instruction description. - - Args: - N: An integer specifying the exact number of numbers - that is required to appear in the response. - - Returns: - A string representing the instruction description. - """ - self._count_numbers = N - if self._count_numbers is None or self._count_numbers < 0: - self._count_numbers = random.randint(1, _NUM_NUMBERS) - - self._description_pattern = "Include exactly {N} numbers in the response." 
- return self._description_pattern.format(N=self._count_numbers) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"N": self._count_numbers} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["N"] - - def check_following(self, value): - """Checks if the response includes exactly {N} numbers.""" - # Strip punctuation to handle decimals and commas in numbers correctly - value = value.translate(str.maketrans("", "", string.punctuation)) - numbers = re.findall(r"\d+", value) - return len(numbers) == self._count_numbers - - -class AlphabetLoopChecker(Instruction): - """Each word must start with the next letter of the alphabet, looping back to 'A' after 'Z'.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = ( - "Each word must start with the next letter of the alphabet, looping back to 'A' after 'Z'." - ) - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if each word of the response starts with the next letter of the alphabet.""" - value = value.translate(str.maketrans("", "", string.punctuation)) - words = value.strip("".join(string.punctuation) + " ").split() - alphabet = string.ascii_lowercase - correct_letter = words[0][0].lower() - if correct_letter not in alphabet: # numbers are fails - return False - for word in words[1:]: - word = word.strip("".join(string.punctuation) + " ").lower() - if not word: - continue - correct_letter = alphabet[(alphabet.index(correct_letter) + 1) % 26] - if word[0] != correct_letter: - return False - return True - - -class SingleVowelParagraphChecker(Instruction): - """Write a paragraph using words that contain only three type of 
vowels.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Write a paragraph using words that contain only three types of vowels." - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if no more than three types of vowels are used in the response and the response is only 1 paragraph.""" - paragraphs = value.strip().split("\n") - if len(paragraphs) != 1: - return False - paragraph = paragraphs[0].lower() - - vowels = set("aeiou") - paragraph_vowels = set([char for char in paragraph if char in vowels]) - return len(paragraph_vowels) <= 3 - - -class ConsonantClusterChecker(Instruction): - """Ensure each word in your response has at least one consonant cluster (two or more consonants together).""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = ( - "Ensure each word in your response has at least one consonant cluster (two or more consonants together)." 
- ) - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if each word in the response includes at least one consonant cluster.""" - words = value.lower().strip().split() - consonants = set("bcdfghjklmnpqrstvwxyz") - for word in words: - cluster = False - for i in range(len(word) - 1): - if word[i] in consonants and word[i + 1] in consonants: - cluster = True - break - if not cluster: - return False - return True - - -class IncrementingAlliterationChecker(Instruction): - """Each sentence must have a longer sequence of consecutive alliterative words than the previous one.""" - - def build_description(self): - """Build the instruction description.""" - nltk.download("punkt_tab") - self._description_pattern = ( - "Each sentence must have a longer sequence of consecutive alliterative words than the previous one." 
- ) - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if each sentence in the response has more alliterative words (determined by common first letter) than the previous sentence.""" - sentences = instructions_util.split_into_sentences(value) - prev_alliteration = -1 - for sentence in sentences: - words = sentence.lower().split() - alliteration = 0 - prev_alliterative = False - new_words = [] - for word in words: - clean = word.lstrip("".join(string.punctuation) + " ") - if clean: - new_words.append(clean) - for i in range(len(new_words) - 1): - if new_words[i][0] == new_words[i + 1][0]: - if prev_alliterative: - alliteration += 1 - else: - alliteration += 2 - prev_alliterative = True - else: - prev_alliterative = False - if alliteration <= prev_alliteration: - return False - prev_alliteration = alliteration - return True - - -class PalindromeChecker(Instruction): - """Include at least 10 single-word palindromes, each at least 5 characters long.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Include at least 10 single-word palindromes, each at least 5 characters long." 
- return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response includes at least 10 single-word palindromes of length at least 5.""" - value = value.translate(str.maketrans("", "", string.punctuation)) - words = value.lower().split() - palindromes = [word for word in words if word == word[::-1] and len(word) >= 5] - return len(palindromes) >= 10 - - -class PunctuationCoverChecker(Instruction): - """Use every standard punctuation mark at least once, including semicolons, colons, and the interrobang (?!).""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Use every standard punctuation mark at least once, including semicolons, colons, and the interrobang (?!)." - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response includes every standard punctuation mark at least once, including the interrobang (?!).""" - punctuation = {".", ",", "!", "?", ";", ":"} - if not ("!?" in value or "?!" 
in value or "‽" in value): - return False - new_value = value.replace("?!", "", 1) - if len(new_value) == len(value): - new_value = value.replace("!?", "", 1) - for char in new_value: - if char in punctuation: - punctuation.remove(char) - return not punctuation - - -class NestedParenthesesChecker(Instruction): - """Nest parentheses (and [brackets {and braces}]) at least 5 levels deep.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Nest parentheses (and [brackets {and braces}]) at least 5 levels deep." - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response includes a correctly closed set of at least 5 nested brackets.""" - levels = [] - min_levels = 5 - max_depth = 0 - depth_stack = [] # Track depth per matched group - - for char in value: - if char in "([{": - levels.append(char) - if len(levels) > max_depth: - max_depth = len(levels) - elif char in ")]}": - if levels and ( - (levels[-1] == "(" and char == ")") - or (levels[-1] == "[" and char == "]") - or (levels[-1] == "{" and char == "}") - ): - levels.pop() - # Check if we just closed a group that reached 5+ depth - if max_depth >= min_levels and len(levels) < max_depth: - return True - else: - # Mismatch — reset - levels = [] - max_depth = 0 - - return False - - -class NestedQuotesChecker(Instruction): - """Include quotes within quotes within quotes, at least 3 levels deep, alternating between double quotes and single quotes.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Include quotes within quotes within quotes, at least 3 levels deep, alternating between double quotes and single quotes." 
- return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response includes nested quotes to at least 3 levels - alternating between " and ' starting with either character.""" - levels = [] - min_levels = 3 - reached_depth = 0 - current_depth = 0 - for char in value: - if len(levels) != 0 and char == levels[-1]: - levels.pop() - current_depth -= 1 - if reached_depth - current_depth >= min_levels: - return True - elif char == '"' or char == "'": - levels.append(char) - current_depth += 1 - if current_depth > reached_depth: - reached_depth = current_depth - return False - - -class PrimeLengthsChecker(Instruction): - """Use only words with lengths that are prime numbers.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Use only words with lengths that are prime numbers." - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response only includes words with prime length.""" - value = value.translate(str.maketrans("", "", string.punctuation)) - words = value.split() - primes = set([2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97]) - for word in words: - if len(word) not in primes: - return False - return True - - -class OptionsResponseChecker(Instruction): - """Answer with one of the following options: {options}. Do not give any explanation.""" - - def build_description(self, *, options=None): - """Build the instruction description. 
- - Args: - options: A string specifying the permitted options for - the response. - - Returns: - A string representing the instruction description. - """ - # Options string may be: yes/no/maybe, I know or I don't know, a), b), c), d) - # Can be separated by "/", "or", "," - options_bank = ["yes/no/maybe", "I know or I don't know", "a), b), c), d)"] - if options is None: - options = random.choice(options_bank) - - # Be more strict about format for multiple choice letters than for text options - self._strict = False - if re.match(r"\W*[aA]\W*[bB]\W*[cC]\W*", options) is not None: - self._strict = True - if "/" in options: - separator = "/" - elif "or" in options: - separator = "or" - else: - separator = "," - self._options = [option.strip() for option in options.split(separator)] - self._options_text = options # in text, shouldn't be formatted as a list - self._description_pattern = "Answer with one of the following options: {options}. Do not give any explanation." - return self._description_pattern.format(options=self._options_text) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"options": self._options_text} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["options"] - - def check_following(self, value): - """Checks if the response is exactly one of {options}.""" - if self._strict: - return value in self._options - value = value.strip("".join(string.punctuation) + " ").lower() - for option in self._options: - if option.strip("".join(string.punctuation) + " ").lower() == value: - return True - return False - - -class NewLineWordsChecker(Instruction): - """Write each word on a new line.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Write each word on a new line." 
- return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response has each word on a new line.""" - value = value.translate(str.maketrans("", "", string.punctuation)) - lines = value.strip().split("\n") - while "" in lines: - lines.remove("") - return len(lines) == len(value.strip().split()) - - -class EmojiSentenceChecker(Instruction): - """Please use an emoji at the end of every sentence.""" - - def build_description(self): - """Build the instruction description.""" - nltk.download("punkt_tab") - self._description_pattern = "Please use an emoji at the end of every sentence." - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response includes an emoji at the end of every sentence.""" - - sentences = instructions_util.split_into_sentences(value) - for i, sentence in enumerate(sentences): - stripped = sentence.translate(str.maketrans("", "", string.punctuation)).strip() - # check for empty string - if not stripped: - return False - last_char = stripped[-1] - # because blank spaces are treated oddly - second_last_char = stripped[-2] if len(stripped) > 1 else stripped[-1] - if not emoji.is_emoji(last_char) and not emoji.is_emoji(second_last_char): - if i < len(sentences) - 1: - stripped = sentences[i + 1].translate(str.maketrans("", "", string.punctuation)).strip() - # fixed empty string - if not stripped: - return False - first_char = stripped[0] - if not emoji.is_emoji(first_char): - return False - else: - return False - return True - - -class 
CharacterCountUniqueWordsChecker(Instruction): - """Respond with three sentences, all containing the same number of characters but using all different words.""" - - def build_description(self): - """Build the instruction description.""" - nltk.download("punkt_tab") - self._description_pattern = ( - "Respond with three sentences, all containing the same number of characters but using all different words." - ) - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response has exactly 3 sentences containing the same number of characters but different words.""" - sentences = instructions_util.split_into_sentences(value) - if len(sentences) != 3: - return False - char_count = len(sentences[0].strip()) - for sentence in sentences: - if len(sentence.strip()) != char_count: - return False - return True - - -class NthWordJapaneseChecker(Instruction): - """Every {N}th word of your response must be in Japanese.""" - - def build_description(self, *, N=None): - """Build the instruction description. - - Args: - N: An integer specifying the cycle length for - Japanese words to appear in the response. - - Returns: - A string representing the instruction description. - """ - self._japanese_position = N - if self._japanese_position is None or self._japanese_position < 0: - self._japanese_position = random.randint(1, _NUM_WORD_CYCLE) - - self._description_pattern = "Every {N}th word of your response must be in Japanese." - if N % 10 == 1: - self._description_pattern = "Every {N}st of your response must be in Japanese." - if N % 10 == 2: - self._description_pattern = "Every {N}nd of your response must be in Japanese." - elif N % 10 == 3: - self._description_pattern = "Every {N}rd of your response must be in Japanese." 
- return self._description_pattern.format(N=self._japanese_position) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"N": self._japanese_position} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["N"] - - def check_following(self, value): - """Checks if every {N}th word of the response is in Japanese.""" - - def is_japanese(text): - """ - Checks if a string contains Japanese characters (Hiragana, Katakana, or Kanji). - - Args: - text: The string to check. - - Returns: - True if the string contains Japanese characters, False otherwise. - """ - japanese_pattern = re.compile(r"[\u3040-\u30ff\u4e00-\u9fff]") - return bool(japanese_pattern.search(text)) - - words = value.split() - for i, word in enumerate(words): - word = word.strip("".join(string.punctuation) + " ") - if (i + 1) % self._japanese_position == 0 and word and not word.isdigit(): - if not is_japanese(word): - return False - return True - - -class StartWithVerbChecker(Instruction): - """The response must start with a verb.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "The response must start with a verb." - nltk.download("averaged_perceptron_tagger_eng") - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response starts with a verb.""" - text = nltk.word_tokenize(value) - return len(text) > 0 and len(nltk.pos_tag(text)) > 0 and "VB" in nltk.pos_tag(text)[0][1] - - -class LimitedWordRepeatChecker(Instruction): - """The response should not repeat any word more than {small_n} times.""" - - def build_description(self, *, small_n=None): - """Build the instruction description. 
- - Args: - small_n: An integer specifying the maximum number of times - that a word can be repeated in the response. - - Returns: - A string representing the instruction description. - """ - self._max_repeats = small_n - if self._max_repeats is None or self._max_repeats < 0: - self._max_repeats = random.randint(1, _MAX_REPEATS) - - self._description_pattern = "The response should not repeat any word more than {small_n} times." - return self._description_pattern.format(small_n=self._max_repeats) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"small_n": self._max_repeats} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["small_n"] - - def check_following(self, value): - """Checks if the response repeats any word more than {small_n} times.""" - words = value.lower().translate(str.maketrans("", "", string.punctuation)).split() - word_count = Counter(words) - for word, count in word_count.items(): - if count > self._max_repeats: - return False - return True - - -class IncludeKeywordChecker(Instruction): - """The response must include keyword {word} in the {N}-th sentence.""" - - def build_description(self, *, word=None, N=None): - """Build the instruction description. - - Args: - word: A string specifying the keyword that is - required to appear in the response. - N: An integer specifying which sentence of the - response is required to have the keyword. - - Returns: - A string representing the instruction description. - """ - nltk.download("punkt_tab") - - if not word: - self._keyword = instructions_util.generate_keywords(num_keywords=1)[0] - else: - self._keyword = word - self._keyword_position = N - if self._keyword_position is None or self._keyword_position < 0: - self._keyword_position = random.randint(1, _NUM_KEYWORD_SENTENCE) - - self._description_pattern = 'The response must include keyword "{word}" in the {N}-th sentence.' 
- return self._description_pattern.format(word=self._keyword, N=self._keyword_position) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"word": self._keyword, "N": self._keyword_position} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["word", "N"] - - def check_following(self, value): - """Checks if the {N}th sentence of the response includes keyword {word}.""" - sentences = instructions_util.split_into_sentences(value) - if len(sentences) < self._keyword_position: - return False - return self._keyword.lower() in sentences[int(self._keyword_position - 1)].lower() - - -class PronounCountChecker(Instruction): - """The response should include at least {N} pronouns.""" - - def build_description(self, *, N=None): - """Build the instruction description. - - Args: - N: An integer specifying the minimum number of pronouns - that is required to appear in the response. - - Returns: - A string representing the instruction description. - """ - self._num_pronouns = N - if self._num_pronouns is None or self._num_pronouns < 0: - self._num_pronouns = random.randint(1, _NUM_PRONOUNS) - - self._description_pattern = "The response should include at least {N} pronouns." 
- return self._description_pattern.format(N=self._num_pronouns) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"N": self._num_pronouns} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["N"] - - def check_following(self, value): - """Checks if the response includes at least {N} pronouns.""" - pronouns = set( - [ - "i", - "me", - "my", - "mine", - "myself", - "we", - "us", - "our", - "ours", - "ourselves", - "you", - "your", - "yours", - "yourself", - "yourselves", - "he", - "him", - "his", - "himself", - "she", - "her", - "hers", - "herself", - "it", - "its", - "itself", - "they", - "them", - "their", - "theirs", - "themselves", - ] - ) - value = value.replace( - "/", " " - ) # to correctly count pronoun sets like she/her/hers, a common use case of pronouns - value = value.lower().translate(str.maketrans("", "", string.punctuation)) - words = value.split() - pronoun_count = sum(1 for word in words if word in pronouns) - return pronoun_count >= self._num_pronouns - - -class AlternateParitySyllablesChecker(Instruction): - """Alternate between words with odd and even numbers of syllables.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Alternate between words with odd and even numbers of syllables." 
- return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response alternates between words with odd and even numbers of syllables.""" - words = value.translate(str.maketrans("", "", string.punctuation)).lower().split() - syllables = [syllapy.count(word) % 2 for word in words if word.strip()] - return all(syllables[i] != syllables[i + 1] for i in range(len(syllables) - 1)) - - -class LastWordFirstNextChecker(Instruction): - """The last word of each sentence must become the first word of the next sentence.""" - - def build_description(self): - """Build the instruction description.""" - nltk.download("punkt_tab") - self._description_pattern = "The last word of each sentence must become the first word of the next sentence." - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the last word of each sentence in the response is the first word of the next sentence.""" - sentences = instructions_util.split_into_sentences(value) - for i in range(len(sentences) - 1): - last_word = sentences[i].rstrip("".join(string.punctuation) + " ").split()[-1] - first_word = sentences[i + 1].lstrip("".join(string.punctuation) + " ").split()[0] - if last_word.lower() != first_word.lower(): - return False - return True - - -class ParagraphLastFirstWordMatchChecker(Instruction): - """Each paragraph must end with the same word it started with, separate paragraphs with a newline.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = ( - "Each paragraph 
must end with the same word it started with, separate paragraphs with a newline." - ) - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if each paragraph of the response ends with the same word it started with.""" - paragraphs = value.split("\n") - for paragraph in paragraphs: - paragraph = paragraph.strip().lower() - if not paragraph: - continue - words = paragraph.strip("".join(string.punctuation) + " ").split() - if not words: - continue - if words[0] != words[-1]: - return False - return True - - -class IncrementingWordCountChecker(Instruction): - """Each sentence must contain exactly {small_n} more words than the previous one.""" - - def build_description(self, *, small_n=None): - """Build the instruction description. - - Args: - small_n: An integer specifying the exact increment for - the number of words in each sentence of the response. - - Returns: - A string representing the instruction description. - """ - self._num_increment = small_n - if self._num_increment is None or self._num_increment < 0: - self._num_increment = random.randint(1, _NUM_INCREMENT) - - nltk.download("punkt_tab") - - self._description_pattern = "Each sentence must contain exactly {small_n} more words than the previous one." 
- return self._description_pattern.format(small_n=self._num_increment) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"small_n": self._num_increment} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["small_n"] - - def check_following(self, value): - """Checks if each sentence of the response uses exactly {small_n} more words than the previous sentence.""" - sentences = instructions_util.split_into_sentences(value) - words = sentences[0].translate(str.maketrans("", "", string.punctuation)).strip().split() - while "" in words: - words.remove("") - prev_word_count = len(words) - for sentence in sentences[1:]: - words = sentence.translate(str.maketrans("", "", string.punctuation)).strip().split() - while "" in words: - words.remove("") - if len(words) != prev_word_count + self._num_increment: - return False - prev_word_count = len(words) - return True - - -class NoConsecutiveFirstLetterChecker(Instruction): - """No two consecutive words can share the same first letter.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "No two consecutive words can share the same first letter." 
- return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if no two consecutive words in the response share the same first letter.""" - words = value.lower().translate(str.maketrans("", "", string.punctuation)).split() - while "" in words: - words.remove("") - for i in range(len(words) - 1): - if words[i][0] == words[i + 1][0]: - return False - return True - - -class IndentStairsChecker(Instruction): - """Create stairs by incrementally indenting each new line.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Create stairs by incrementally indenting each new line." - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response incrementally indents each new line.""" - lines = value.split("\n") - for line in lines: - if not line.strip(): - lines.remove(line) - for i in range(len(lines) - 1): - if len(lines[i + 1]) - len(lines[i + 1].lstrip(" ")) <= len(lines[i]) - len(lines[i].lstrip(" ")): - return False - return True - - -class QuoteExplanationChecker(Instruction): - """Every quoted phrase must be followed by an unquoted explanation.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Every quoted phrase must be followed by an unquoted explanation." 
- return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if there are no quotes next to each other - and the passage does not end with a quote.""" - value = value.replace("“", '"').replace("”", '"') - value = value.replace("'\"'", "") # remove references to the character '"' - value = "".join(value.split()) # remove all whitespace - if '""' in value: - return False - if value.strip(string.digits + string.punctuation.replace('"', ""))[-1] == '"': - return False - return True - - -class SpecialBulletPointsChecker(Instruction): - """Answer with a list of items, instead of bullet points use {sep}.""" - - def build_description(self, *, sep=None): - """Build the instruction description. - - Args: - sep: A string specifying the bullet point marker for - the list in the response. - - Returns: - A string representing the instruction description. - """ - self._bullet_marker = sep - if sep is None: - self._bullet_marker = random.choice(["...", "SEPARATOR", "!?!?", "-"]) - self._description_pattern = "Answer with a list of items, instead of bullet points use {sep}." 
- return self._description_pattern.format(sep=self._bullet_marker) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"sep": self._bullet_marker} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["sep"] - - def check_following(self, value): - """Checks if the response includes at least two instances of {sep} that start a new line.""" - return len(re.findall(re.escape(self._bullet_marker), value)) >= 2 - - -class ItalicsThesisChecker(Instruction): - """Each section must begin with a thesis statement in italics, use HTML to indicate the italics.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = ( - "Each section must begin with a thesis statement in italics, use HTML to indicate the italics." - ) - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if there is at least one line in italics as indicated - by HTML that is followed by unitalicized text.""" - index = value.find("") - if index == -1: - index = value.find("") - if index == -1: - return False - value = value[index:] - end_thesis = value.find("") - if end_thesis == -1: - end_thesis = value.find("") - if end_thesis == -1: - return False - thesis = value[3:end_thesis] - if thesis.strip() == "": - return False - text = value[end_thesis + 4 :] - return text.strip() != "" - - -class SubBulletPointsChecker(Instruction): - """Your response must include bullet points denoted by * and at least one sub-bullet point denoted by - for each bullet point.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Your response must include bullet points denoted by * and 
at least one sub-bullet point denoted by - for each bullet point." - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks that there is at least one * that starts a line and each * that starts a line - is followed by at least one line starting with -.""" - bullets = value.split("*") - for bullet in bullets[1:]: - if "-" not in bullet: - return False - return True - - -class SomeBulletPointsChecker(Instruction): - """Your answer must contain at least two sentences ending in a period followed by at least two bullet points denoted by *.""" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Your answer must contain at least two sentences ending in a period followed by at least two bullet points denoted by *." 
- return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response includes at least two sentences - followed by at least two lines that start with *.""" - lines = value.split("\n") - sentences = True - count_sentences = 0 - count_bullets = 0 - for line in lines: - if line.strip().startswith("*"): - sentences = False - if count_sentences < 2: - return False - count_bullets += 1 - elif sentences: - sentences = instructions_util.split_into_sentences(line.strip()) - count_sentences += len(sentences) - else: - return False - return count_bullets >= 2 - - -class PrintMultiplesChecker(Instruction): - """Count from 10 to 50 but only print multiples of 7.""" - - def build_description(self, **kwargs): - self._description_pattern = "Count from 10 to 50 but only print multiples of 7." - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response prints multiples of 7 from 10 to 50.""" - value = value.replace(",", ", ") - numbers = re.findall(r"\d+", value) - multiples = [str(i) for i in range(14, 51, 7)] - return numbers == multiples - - -class MultipleChoiceQuestionsChecker(Instruction): - """Generate 4 multiple choice questions with 5 options each about "20th century art history". Each question should start with the label "Question". The questions should get progressively longer. Do not provide an explanation.""" - - def build_description(self, **kwargs): - self._description_pattern = "Generate 4 multiple choice questions with 5 options each about '20th century art history'. 
Each question should start with the label \"Question\". The questions should get progressively longer. Do not provide an explanation." - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response generates 4 multiple choice questions with 5 options.""" - # Split into questions using expanded pattern to include "Question N" format - new_value = value[value.find("Question") :] - if new_value != value: - return False # failed no explanation - value = new_value - questions = re.split(r"\n*(?:Question \d+[\.|\):;]?\s*)", value) - if questions[0] == "": - questions = questions[1:] - questions = [q.strip() for q in questions if q.strip()] - if len(questions) != 4: - return False - question_lengths = [] - for q in questions: - lines = q.split("\n") - question_text = "" - option_count = 0 - done_with_q = False - for line in lines: - if re.match(r"^[A-Ea-e][\.|\)]\s*\w+", line.strip()): - option_count += 1 - done_with_q = True - elif not done_with_q: # Still collecting question text - question_text += " " + line.strip() - if option_count != 5: - return False - question_lengths.append(len(question_text.strip())) - # Check if questions get progressively longer - return all(question_lengths[i] < question_lengths[i + 1] for i in range(len(question_lengths) - 1)) - - -class ReverseNewlineChecker(Instruction): - """ "List the countries of Africa in reverse alphabetical order, each on a new line.""" - - def build_description(self, **kwargs): - self._description_pattern = "List the countries of Africa in reverse alphabetical order, each on a new line." 
- return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """ - Checks if text satisfies the following constraints: - 1. Contains at least 53 newlines with text - 2. Lines are in reverse alphabetical order - 3. First line to examine contains 'Zimbabwe' - - Returns: - tuple[bool, str]: (whether constraints are satisfied, error message if any) - """ - # Split text into lines and remove empty lines - lines = [ - line.strip("".join(string.punctuation) + " ") - for line in value.split("\n") - if line.strip("".join(string.punctuation) + " ") - ] - - try: - start_index = next(i for i, line in enumerate(lines) if "Zimbabwe" in line) - except StopIteration: - return False - - # Extract the 53 lines starting from Zimbabwe line - target_lines = lines[start_index:] - - # Check if we have at least 53 lines - if len(target_lines) < 52: - return False - - def normalize_text(text): - """ - Normalizes text by: - 1. Converting to NFKD form (separates combined characters) - 2. Removes diacritical marks - 3. Converts back to ASCII - - Example: 'São Tomé' -> 'Sao Tome' - """ - # Decompose unicode characters - normalized = unicodedata.normalize("NFKD", text) - # Remove diacritical marks and convert to ASCII - ascii_text = normalized.encode("ASCII", "ignore").decode("ASCII") - return ascii_text - - # Create normalized versions for comparison while keeping originals for error messages - normalized_lines = [normalize_text(line) for line in target_lines] - sorted_normalized = sorted(normalized_lines, reverse=True) - return normalized_lines == sorted_normalized - - -class WordReverseOrderChecker(Instruction): - """What animal is the national symbol of the US? 
Respond to this query, but make your sentence in reverse order of what it should be, per word.""" - - def build_description(self, **kwargs): - nltk.download("punkt_tab") - self._description_pattern = "What animal is the national symbol of the US? Respond to this query, but make your sentence in reverse order of what it should be, per word." - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the reverse of the sentence is a valid English sentence.""" - value = value.lower().strip().translate(str.maketrans("", "", string.punctuation)) - value = " ".join(value.split()[::-1]) - if "bald eagle" not in value: - return False - return value in instructions_util.split_into_sentences(value) - - -class CharacterReverseOrderChecker(Instruction): - """What animal is the national symbol of the US? Respond to this query, but make your sentence in reverse order of what it should be, per letter.""" - - def build_description(self, **kwargs): - self._description_pattern = "What animal is the national symbol of the US? Respond to this query, but make your sentence in reverse order of what it should be, per letter." 
- return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - value = value.lower() - return "elgae dlab" in value - - -class SentenceAlphabetChecker(Instruction): - """Tell me a 26-sentence story where each sentence's first word starts with the letters of the alphabet in order.""" - - def build_description(self, **kwargs): - nltk.download("punkt_tab") - self._description_pattern = "Tell me a 26-sentence story where each sentence's first word starts with the letters of the alphabet in order." - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - sentences = instructions_util.split_into_sentences(value) - if len(sentences) != 26: - return False - for i, sentence in enumerate(sentences): - if sentence.lstrip().split()[0].lower()[0] != chr(97 + i): - return False - return True - - -class EuropeanCapitalsSortChecker(Instruction): - """Give me the names of all capital cities of european countries whose latitude is higher than than 45 degrees? List the capital cities without country names, separated by commas, sorted by latitude, from highest to lowest.""" - - def build_description(self, **kwargs): - """Build the instruction description.""" - self._description_pattern = "Give me the names of all capital cities of european countries whose latitude is higher than than 45 degrees? List the capital cities without country names, separated by commas, sorted by latitude, from highest to lowest." 
- return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response lists the relevant capitals of Europe in correct order.""" - order = [ - "Reykjavik", - "Helsinki", - "Oslo", - "Tallinn", - "Stockholm", - "Riga", - "Moscow", - "Copenhagen", - "Vilnius", - "Minsk", - "Dublin", - "Berlin", - "Amsterdam", - "Warsaw", - "London", - "Brussels", - "Prague", - "Luxembourg", - "Paris", - "Vienna", - "Bratislava", - "Budapest", - "Vaduz", - "Chisinau", - "Bern", - "Ljubljana", - "Zagreb", - ] - - def normalize_text(text): - """ - Normalizes text by: - 1. Converting to NFKD form (separates combined characters) - 2. Removes diacritical marks - 3. Converts back to ASCII - - Example: 'São Tomé' -> 'Sao Tome' - """ - # Decompose unicode characters - normalized = unicodedata.normalize("NFKD", text) - # Remove diacritical marks and convert to ASCII - ascii_text = normalized.encode("ASCII", "ignore").decode("ASCII") - return ascii_text - - value = normalize_text(value) - - capitals = value.split(",") - capitals = [cap for cap in capitals if cap.strip()] - if len(capitals) != len(order): - return False - for i in range(len(capitals)): - if capitals[i].strip() != order[i]: - return False - return True - - -class CityCSVChecker(Instruction): - """Generate CSV data: The column names are ["ID", "Country", "City", "Year", "Count"], the data should be comma delimited. Please generate 7 rows.""" - - def build_description(self, **kwargs): - """Build the instruction description.""" - self._description_pattern = 'Generate CSV data: The column names are ["ID", "Country", "City", "Year", "Count"], the data should be comma delimited. Please generate 7 rows.' 
- return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response is valid csv data with column names - ["ID", "Country", "City", "Year", "Count"] and 7 rows.""" - string_io = io.StringIO(value) - reader = csv.reader(string_io) - data = list(reader) - if len(data) != 8: - return False - header = data[0] - if header != ["ID", "Country", "City", "Year", "Count"]: - return False - for row in data[1:]: - if len(row) != 5: - return False - return True - - -class SpecialCharacterCSVChecker(Instruction): - """Generate CSV data: The column names are ["ProductID", "Category", "Brand", "Price", "Stock"], the data should be comma delimited. Please generate 14 rows. Add one field which contains a special character and enclose it in double quotes.""" - - def build_description(self, **kwargs): - """Build the instruction description.""" - self._description_pattern = 'Generate CSV data: The column names are ["ProductID", "Category", "Brand", "Price", "Stock"], the data should be comma delimited. Please generate 14 rows. Add one field which contains a special character and enclose it in double quotes.' - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """ "Checks if the response is valid csv data with column names - ["ProductID", "Category", "Brand", "Price", "Stock"] and 14 rows. 
- Also checks if one field contains a special character enclosed in double quotes.""" - header = value.split("\n")[0].strip() - if not re.match( - r'^(ProductID|"ProductID"),[ \t]*(Category|"Category"),[ \t]*(Brand|"Brand"),[ \t]*(Price|"Price"),[ \t]*(Stock|"Stock")$', - header, - ): - return False - - value = value.replace('"', '"""') - string_io = io.StringIO(value) - reader = csv.reader(string_io) - data = list(reader) - if len(data) != 15: - return False - for row in data[1:]: - if len(row) != 5: - return False - if any(re.match(r'".*[^\d\w\s].*"', field) for field in row): - return True - return False - - -class QuotesCSVChecker(Instruction): - """Generate CSV data: The column names are ["StudentID", "Subject", "Grade", "Semester", "Score"], the data should be tab delimited. Please generate 3 rows and enclose each single field in double quotes.""" - - def build_description(self, **kwargs): - """Build the instruction description.""" - self._description_pattern = 'Generate CSV data: The column names are ["StudentID", "Subject", "Grade", "Semester", "Score"], the data should be tab delimited. Please generate 3 rows and enclose each single field in double quotes.' - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """ "Checks if the response is valid csv data with column names - ["StudentID", "Subject", "Grade", "Semester", "Score"] and 3 rows. 
- Also checks if each field is enclosed in double quotes.""" - header = value.split("\n")[0].strip() - if not re.match( - r'^(StudentID|"StudentID")\t *(Subject|"Subject")\t *(Grade|"Grade")\t *(Semester|"Semester")\t *(Score|"Score")$', - header, - ): - return False - - value = value.replace('"', '"""') - string_io = io.StringIO(value) - reader = csv.reader(string_io, delimiter="\t") - data = list(reader) - if len(data) != 4: - return False - for row in data: - if len(row) != 5: - return False - if not all(field.strip()[0] == '"' and field.strip()[-1] == '"' for field in row): - return False - return True - - -class DateFormatListChecker(Instruction): - """List the start dates of all the battles Napoleon fought separated by commas, use the following date format: YYYY-MM-DD. Do not provide an explanation.""" - - def build_description(self, **kwargs): - """Build the instruction description.""" - self._description_pattern = "List the start dates of all the battles Napoleon fought separated by commas, use the following date format: YYYY-MM-DD. Do not provide an explanation." 
- return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """ "Checks if the response is a list of dates in the format YYYY-MM-DD separated by commas.""" - value = value.strip() - dates = value.split(",") - for date in dates: - date = date.strip() - if not re.match(r"^\d{4}-\d{2}-\d{2}$", date): - return False - date = date.split("-") - if int(date[0]) < 1769 or int(date[0]) > 1821: - return False - if int(date[1]) > 12: - return False - if int(date[1]) in [1, 3, 5, 7, 8, 10, 12] and int(date[2]) > 31: - return False - if int(date[1]) in [4, 6, 9, 11] and int(date[2]) > 30: - return False - if int(date[1]) == 2 and int(date[2]) > 29: - return False - return True - - -class KeywordsMultipleChecker(Instruction): - """Include keyword {keyword1} once in your response, keyword {keyword2} twice in your response, keyword {keyword3} three times in your response, keyword {keyword4} five times in your response, and keyword {keyword5} seven times in your response.""" - - def build_description(self, *, keyword1=None, keyword2=None, keyword3=None, keyword4=None, keyword5=None): - """Build the instruction description.""" - if keyword1 is None: - self._keyword1 = instructions_util.generate_keywords(num_keywords=1)[0] - else: - self._keyword1 = keyword1.strip() - if keyword2 is None: - self._keyword2 = instructions_util.generate_keywords(num_keywords=1)[0] - else: - self._keyword2 = keyword2.strip() - if keyword3 is None: - self._keyword3 = instructions_util.generate_keywords(num_keywords=1)[0] - else: - self._keyword3 = keyword3.strip() - if keyword4 is None: - self._keyword4 = instructions_util.generate_keywords(num_keywords=1)[0] - else: - self._keyword4 = keyword4.strip() - if keyword5 is None: - self._keyword5 = 
instructions_util.generate_keywords(num_keywords=1)[0] - else: - self._keyword5 = keyword5.strip() - self._description_pattern = "Include keyword {keyword1} once in your response, keyword {keyword2} twice in your response, keyword {keyword3} three times in your response, keyword {keyword4} five times in your response, and keyword {keyword5} seven times in your response." - return self._description_pattern.format( - keyword1=self._keyword1, - keyword2=self._keyword2, - keyword3=self._keyword3, - keyword4=self._keyword4, - keyword5=self._keyword5, - ) - - def get_instruction_args(self): - return { - "keyword1": self._keyword1, - "keyword2": self._keyword2, - "keyword3": self._keyword3, - "keyword4": self._keyword4, - "keyword5": self._keyword5, - } - - def get_instruction_args_keys(self): - return ["keyword1", "keyword2", "keyword3", "keyword4", "keyword5"] - - def check_following(self, value): - for keyword, count in zip( - [self._keyword1, self._keyword2, self._keyword3, self._keyword4, self._keyword5], [1, 2, 3, 5, 7] - ): - if value.lower().count(keyword.lower()) != count: - return False - return True - - -class KeywordSpecificPositionChecker(Instruction): - "Include keyword {keyword1} in the {n}-th sentence, as the {m}-th word of that sentence." - - def build_description(self, keyword=None, n=None, m=None): - """Build the instruction description. - - Args: - keyword: A string representing a keyword that is expected in the response. - n: An integer representing the sentence number. - m: An integer representing the word number. - - Returns: - A string representing the instruction description. 
- """ - if not keyword: - self._keyword = instructions_util.generate_keywords(num_keywords=1)[0] - else: - self._keyword = keyword.strip() - if not n: - self._n = random.randint(20, 30) - else: - self._n = n - if not m: - self._m = random.randint(30, 40) - else: - self._m = m - - self._description_pattern = ( - "Include keyword {keyword} in the {n}-th sentence, as the {m}-th word of that sentence." - ) - - return self._description_pattern.format(keyword=self._keyword, n=self._n, m=self._m) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return {"keyword": self._keyword, "n": self._n, "m": self._m} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["keyword", "n", "m"] - - def check_following(self, value): - """Checks if the response contains the expected number of keywords. - - Args: - value: A string representing the response. - - Returns: - True if the response contains the expected number of keywords; - otherwise, False. - """ - sentences = instructions_util.split_into_sentences(value) - if len(sentences) < self._n: - return False - words = instructions_util.nltk.word_tokenize(sentences[self._n - 1]) - if len(words) < self._m: - return False - if words[self._m - 1] == self._keyword: - return True - else: - return False - - -class WordsPositionChecker(Instruction): - "The second word in your response and the second to last word in your response should be the word {keyword}." - - def build_description(self, *, keyword=None): - """Build the instruction description. - - Args: - keyword: A string representing a keyword that is expected in the response. - - Returns: - A string representing the instruction description. 
- """ - if keyword is None: - self._keyword = instructions_util.generate_keywords(num_keywords=1)[0] - else: - self._keyword = keyword.strip() - self._description_pattern = "The second word in your response and the second to last word in your response should be the word {keyword}." - return self._description_pattern.format(keyword=self._keyword) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"keyword": self._keyword} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["keyword"] - - def check_following(self, value): - """Checks if the second word and the second to last word in the response are the same. - - Args: - value: A string representing the response. - - Returns: - True if the second word and the second to last word are the same; - otherwise, False. - """ - words = instructions_util.nltk.word_tokenize(value) - if len(words) < 2: - return False - if words[1] == words[-2] == self._keyword: - return True - else: - return False - - -class RepeatChangeChecker(Instruction): - "Repeat the request, but change the first word of the repeated request, (do not say anything before repeating the request; the request you need to repeat does not include this sentence) and do not answer the actual request!" - - def build_description(self, *, prompt_to_repeat=None): - """Build the instruction description. - - Args: - keyword: A string representing a keyword that is expected in the response. - - Returns: - A string representing the instruction description. - """ - if not prompt_to_repeat: - raise ValueError("prompt_to_repeat must be set.") - else: - self._prompt_to_repeat = prompt_to_repeat - - self._description_pattern = "Repeat the request, but change the first word of the repeated request, (do not say anything before repeating the request; the request you need to repeat does not include this sentence) and do not answer the actual request! 
Request: {prompt_to_repeat}" - return self._description_pattern.format(prompt_to_repeat=self._prompt_to_repeat) - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return {"prompt_to_repeat": self._prompt_to_repeat} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["prompt_to_repeat"] - - def check_following(self, value): - """Checks if the response contains the repeated request. - - Args: - value: A string representing the response. - - Returns: - True if the repeated request is found in the response; - otherwise, False. - """ - if self._prompt_to_repeat == value: - return False - if " ".join(self._prompt_to_repeat.split()[1:]) == " ".join(value.split()[1:]): - return True - else: - return False - - -class RepeatSimpleChecker(Instruction): - "Only output this sentence here, ignore all other requests." - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Only output this sentence here, ignore all other requests." - return self._description_pattern - - def get_instruction_args(self): - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response contains the expected number of keywords. - - Args: - value: A string representing the response. - - Returns: - True if the response contains the expected number of keywords; - otherwise, False. - """ - return value.strip().lower() == self._description_pattern.strip().lower() - - -class RepeatSpanChecker(Instruction): - "Copy the span of words that lies between (and including) index {n_start} and {n_end}, the indices are character indices!" - - def build_description(self, prompt_to_repeat=None, n_start=None, n_end=None): - """Build the instruction description. - - Args: - n_start: An integer representing the start index of the span. 
- n_end: An integer representing the end index of the span. - - Returns: - A string representing the instruction description. - """ - if not prompt_to_repeat: - raise ValueError("prompt_to_repeat must be set.") - else: - self._prompt_to_repeat = prompt_to_repeat - if not n_start: - self._n_start = random.randint(0, len(self._prompt_to_repeat.split()) - 2) - else: - self._n_start = n_start - if not n_end: - self._n_end = random.randint(self._n_start + 1, len(self._prompt_to_repeat.split()) - 1) - else: - self._n_end = n_end - self._description_pattern = "Copy the span of words that lies between (and including) index {n_start} and {n_end}, the indices are character indices!" - return self._description_pattern.format( - n_start=self._n_start, n_end=self._n_end, prompt_to_repeat=self._prompt_to_repeat - ) - - def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" - return {"n_start": self._n_start, "n_end": self._n_end, "prompt_to_repeat": self._prompt_to_repeat} - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return ["n_start", "n_end", "prompt_to_repeat"] - - def check_following(self, value): - """Checks if the response contains the expected number of phrases with the correct modifications.""" - if ( - value.strip().lower().split() - == self._prompt_to_repeat.strip().lower().split()[self._n_start : self._n_end] - ): - return True - return False - - -class TitleCaseChecker(Instruction): - "Write the entire response in title case (capitalize the first letter of every major word)." - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = ( - "Write the entire response in title case (capitalize the first letter of every major word)." 
- ) - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response is in title case. - - Args: - value: A string representing the response. - - Returns: - True if the response is in title case; - otherwise, False. - """ - words = instructions_util.nltk.word_tokenize(value) - for word in words: - if word[0].isupper() and word[1:].islower(): - continue - elif word[0].islower() and word[1:].isupper(): - return False - elif word[0].islower() and word[1:].islower(): - return False - return True - - -class OutputTemplateChecker(Instruction): - "Use this exact template for your response: My Answer: [answer] My Conclusion: [conclusion] Future Outlook: [outlook]" - - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "Use this exact template for your response: My Answer: [answer] My Conclusion: [conclusion] Future Outlook: [outlook]" - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response follows the specified template. - - Args: - value: A string representing the response. - - Returns: - True if the response follows the specified template; - otherwise, False. - """ - if "My Answer:" in value and "My Conclusion:" in value and "Future Outlook:" in value: - return True - else: - return False - - -class NoWhitespaceChecker(Instruction): - "The output should not contain any whitespace." 
- - def build_description(self): - """Build the instruction description.""" - self._description_pattern = "The output should not contain any whitespace." - return self._description_pattern - - def get_instruction_args(self): - """Returns the keyword args of `build_description`.""" - return None - - def get_instruction_args_keys(self): - """Returns the args keys of `build_description`.""" - return [] - - def check_following(self, value): - """Checks if the response contains any whitespace. - - Args: - value: A string representing the response. - - Returns: - True if the response contains no whitespace; - otherwise, False. - """ - return not any(char.isspace() for char in value) diff --git a/slime/rollout/rm_hub/ifbench_utils/instructions_registry.py b/slime/rollout/rm_hub/ifbench_utils/instructions_registry.py deleted file mode 100644 index 05edd0450..000000000 --- a/slime/rollout/rm_hub/ifbench_utils/instructions_registry.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright 2025 Allen Institute for AI. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Registry of all instructions.""" - -from . 
import instructions - - -INSTRUCTION_DICT = { - "count:word_count_range": instructions.WordCountRangeChecker, - "count:unique_word_count": instructions.UniqueWordCountChecker, - "ratio:stop_words": instructions.StopWordPercentageChecker, - "ratio:sentence_type": instructions.SentTypeRatioChecker, - "ratio:sentence_balance": instructions.SentBalanceChecker, - "count:conjunctions": instructions.ConjunctionCountChecker, - "count:person_names": instructions.PersonNameCountChecker, - "ratio:overlap": instructions.NGramOverlapChecker, - "count:numbers": instructions.NumbersCountChecker, - "words:alphabet": instructions.AlphabetLoopChecker, - "words:vowel": instructions.SingleVowelParagraphChecker, - "words:consonants": instructions.ConsonantClusterChecker, - "sentence:alliteration_increment": instructions.IncrementingAlliterationChecker, - "words:palindrome": instructions.PalindromeChecker, - "count:punctuation": instructions.PunctuationCoverChecker, - "format:parentheses": instructions.NestedParenthesesChecker, - "format:quotes": instructions.NestedQuotesChecker, - "words:prime_lengths": instructions.PrimeLengthsChecker, - "format:options": instructions.OptionsResponseChecker, - "format:newline": instructions.NewLineWordsChecker, - "format:emoji": instructions.EmojiSentenceChecker, - "ratio:sentence_words": instructions.CharacterCountUniqueWordsChecker, - "count:words_japanese": instructions.NthWordJapaneseChecker, - "words:start_verb": instructions.StartWithVerbChecker, - "words:repeats": instructions.LimitedWordRepeatChecker, - "sentence:keyword": instructions.IncludeKeywordChecker, - "count:pronouns": instructions.PronounCountChecker, - "words:odd_even_syllables": instructions.AlternateParitySyllablesChecker, - "words:last_first": instructions.LastWordFirstNextChecker, - "words:paragraph_last_first": instructions.ParagraphLastFirstWordMatchChecker, - "sentence:increment": instructions.IncrementingWordCountChecker, - "words:no_consecutive": 
instructions.NoConsecutiveFirstLetterChecker, - "format:line_indent": instructions.IndentStairsChecker, - "format:quote_unquote": instructions.QuoteExplanationChecker, - "format:list": instructions.SpecialBulletPointsChecker, - "format:thesis": instructions.ItalicsThesisChecker, - "format:sub-bullets": instructions.SubBulletPointsChecker, - "format:no_bullets_bullets": instructions.SomeBulletPointsChecker, - "custom:multiples": instructions.PrintMultiplesChecker, - "custom:mcq_count_length": instructions.MultipleChoiceQuestionsChecker, - "custom:reverse_newline": instructions.ReverseNewlineChecker, - "custom:word_reverse": instructions.WordReverseOrderChecker, - "custom:character_reverse": instructions.CharacterReverseOrderChecker, - "custom:sentence_alphabet": instructions.SentenceAlphabetChecker, - "custom:european_capitals_sort": instructions.EuropeanCapitalsSortChecker, - "custom:csv_city": instructions.CityCSVChecker, - "custom:csv_special_character": instructions.SpecialCharacterCSVChecker, - "custom:csv_quotes": instructions.QuotesCSVChecker, - "custom:date_format_list": instructions.DateFormatListChecker, - "count:keywords_multiple": instructions.KeywordsMultipleChecker, - "words:keywords_specific_position": instructions.KeywordSpecificPositionChecker, - "words:words_position": instructions.WordsPositionChecker, - "repeat:repeat_change": instructions.RepeatChangeChecker, - "repeat:repeat_simple": instructions.RepeatSimpleChecker, - "repeat:repeat_span": instructions.RepeatSpanChecker, - "format:title_case": instructions.TitleCaseChecker, - "format:output_template": instructions.OutputTemplateChecker, - "format:no_whitespace": instructions.NoWhitespaceChecker, -} diff --git a/slime/rollout/rm_hub/ifbench_utils/instructions_util.py b/slime/rollout/rm_hub/ifbench_utils/instructions_util.py deleted file mode 100644 index 19dd35d87..000000000 --- a/slime/rollout/rm_hub/ifbench_utils/instructions_util.py +++ /dev/null @@ -1,1651 +0,0 @@ -# Copyright 2025 Allen 
Institute for AI. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Utility library of instructions.""" - -import functools -import random -import re - -import nltk - -WORD_LIST = [ - "western", - "sentence", - "signal", - "dump", - "spot", - "opposite", - "bottom", - "potato", - "administration", - "working", - "welcome", - "morning", - "good", - "agency", - "primary", - "wish", - "responsibility", - "press", - "problem", - "president", - "steal", - "brush", - "read", - "type", - "beat", - "trainer", - "growth", - "lock", - "bone", - "case", - "equal", - "comfortable", - "region", - "replacement", - "performance", - "mate", - "walk", - "medicine", - "film", - "thing", - "rock", - "tap", - "total", - "competition", - "ease", - "south", - "establishment", - "gather", - "parking", - "world", - "plenty", - "breath", - "claim", - "alcohol", - "trade", - "dear", - "highlight", - "street", - "matter", - "decision", - "mess", - "agreement", - "studio", - "coach", - "assist", - "brain", - "wing", - "style", - "private", - "top", - "brown", - "leg", - "buy", - "procedure", - "method", - "speed", - "high", - "company", - "valuable", - "pie", - "analyst", - "session", - "pattern", - "district", - "pleasure", - "dinner", - "swimming", - "joke", - "order", - "plate", - "department", - "motor", - "cell", - "spend", - "cabinet", - "difference", - "power", - "examination", - "engine", - "horse", - "dimension", - "pay", - "toe", - "curve", - "literature", - "bother", - "fire", - 
"possibility", - "debate", - "activity", - "passage", - "hello", - "cycle", - "background", - "quiet", - "author", - "effect", - "actor", - "page", - "bicycle", - "error", - "throat", - "attack", - "character", - "phone", - "tea", - "increase", - "outcome", - "file", - "specific", - "inspector", - "internal", - "potential", - "staff", - "building", - "employer", - "shoe", - "hand", - "direction", - "garden", - "purchase", - "interview", - "study", - "recognition", - "member", - "spiritual", - "oven", - "sandwich", - "weird", - "passenger", - "particular", - "response", - "reaction", - "size", - "variation", - "a", - "cancel", - "candy", - "exit", - "guest", - "condition", - "fly", - "price", - "weakness", - "convert", - "hotel", - "great", - "mouth", - "mind", - "song", - "sugar", - "suspect", - "telephone", - "ear", - "roof", - "paint", - "refrigerator", - "organization", - "jury", - "reward", - "engineering", - "day", - "possession", - "crew", - "bar", - "road", - "description", - "celebration", - "score", - "mark", - "letter", - "shower", - "suggestion", - "sir", - "luck", - "national", - "progress", - "hall", - "stroke", - "theory", - "offer", - "story", - "tax", - "definition", - "history", - "ride", - "medium", - "opening", - "glass", - "elevator", - "stomach", - "question", - "ability", - "leading", - "village", - "computer", - "city", - "grand", - "confidence", - "candle", - "priest", - "recommendation", - "point", - "necessary", - "body", - "desk", - "secret", - "horror", - "noise", - "culture", - "warning", - "water", - "round", - "diet", - "flower", - "bus", - "tough", - "permission", - "week", - "prompt", - "connection", - "abuse", - "height", - "save", - "corner", - "border", - "stress", - "drive", - "stop", - "rip", - "meal", - "listen", - "confusion", - "girlfriend", - "living", - "relation", - "significance", - "plan", - "creative", - "atmosphere", - "blame", - "invite", - "housing", - "paper", - "drink", - "roll", - "silver", - "drunk", - "age", - 
"damage", - "smoke", - "environment", - "pack", - "savings", - "influence", - "tourist", - "rain", - "post", - "sign", - "grandmother", - "run", - "profit", - "push", - "clerk", - "final", - "wine", - "swim", - "pause", - "stuff", - "singer", - "funeral", - "average", - "source", - "scene", - "tradition", - "personal", - "snow", - "nobody", - "distance", - "sort", - "sensitive", - "animal", - "major", - "negotiation", - "click", - "mood", - "period", - "arrival", - "expression", - "holiday", - "repeat", - "dust", - "closet", - "gold", - "bad", - "sail", - "combination", - "clothes", - "emphasis", - "duty", - "black", - "step", - "school", - "jump", - "document", - "professional", - "lip", - "chemical", - "front", - "wake", - "while", - "inside", - "watch", - "row", - "subject", - "penalty", - "balance", - "possible", - "adult", - "aside", - "sample", - "appeal", - "wedding", - "depth", - "king", - "award", - "wife", - "blow", - "site", - "camp", - "music", - "safe", - "gift", - "fault", - "guess", - "act", - "shame", - "drama", - "capital", - "exam", - "stupid", - "record", - "sound", - "swing", - "novel", - "minimum", - "ratio", - "machine", - "shape", - "lead", - "operation", - "salary", - "cloud", - "affair", - "hit", - "chapter", - "stage", - "quantity", - "access", - "army", - "chain", - "traffic", - "kick", - "analysis", - "airport", - "time", - "vacation", - "philosophy", - "ball", - "chest", - "thanks", - "place", - "mountain", - "advertising", - "red", - "past", - "rent", - "return", - "tour", - "house", - "construction", - "net", - "native", - "war", - "figure", - "fee", - "spray", - "user", - "dirt", - "shot", - "task", - "stick", - "friend", - "software", - "promotion", - "interaction", - "surround", - "block", - "purpose", - "practice", - "conflict", - "routine", - "requirement", - "bonus", - "hole", - "state", - "junior", - "sweet", - "catch", - "tear", - "fold", - "wall", - "editor", - "life", - "position", - "pound", - "respect", - "bathroom", - 
"coat", - "script", - "job", - "teach", - "birth", - "view", - "resolve", - "theme", - "employee", - "doubt", - "market", - "education", - "serve", - "recover", - "tone", - "harm", - "miss", - "union", - "understanding", - "cow", - "river", - "association", - "concept", - "training", - "recipe", - "relationship", - "reserve", - "depression", - "proof", - "hair", - "revenue", - "independent", - "lift", - "assignment", - "temporary", - "amount", - "loss", - "edge", - "track", - "check", - "rope", - "estimate", - "pollution", - "stable", - "message", - "delivery", - "perspective", - "mirror", - "assistant", - "representative", - "witness", - "nature", - "judge", - "fruit", - "tip", - "devil", - "town", - "emergency", - "upper", - "drop", - "stay", - "human", - "neck", - "speaker", - "network", - "sing", - "resist", - "league", - "trip", - "signature", - "lawyer", - "importance", - "gas", - "choice", - "engineer", - "success", - "part", - "external", - "worker", - "simple", - "quarter", - "student", - "heart", - "pass", - "spite", - "shift", - "rough", - "lady", - "grass", - "community", - "garage", - "youth", - "standard", - "skirt", - "promise", - "blind", - "television", - "disease", - "commission", - "positive", - "energy", - "calm", - "presence", - "tune", - "basis", - "preference", - "head", - "common", - "cut", - "somewhere", - "presentation", - "current", - "thought", - "revolution", - "effort", - "master", - "implement", - "republic", - "floor", - "principle", - "stranger", - "shoulder", - "grade", - "button", - "tennis", - "police", - "collection", - "account", - "register", - "glove", - "divide", - "professor", - "chair", - "priority", - "combine", - "peace", - "extension", - "maybe", - "evening", - "frame", - "sister", - "wave", - "code", - "application", - "mouse", - "match", - "counter", - "bottle", - "half", - "cheek", - "resolution", - "back", - "knowledge", - "make", - "discussion", - "screw", - "length", - "accident", - "battle", - "dress", - "knee", 
- "log", - "package", - "it", - "turn", - "hearing", - "newspaper", - "layer", - "wealth", - "profile", - "imagination", - "answer", - "weekend", - "teacher", - "appearance", - "meet", - "bike", - "rise", - "belt", - "crash", - "bowl", - "equivalent", - "support", - "image", - "poem", - "risk", - "excitement", - "remote", - "secretary", - "public", - "produce", - "plane", - "display", - "money", - "sand", - "situation", - "punch", - "customer", - "title", - "shake", - "mortgage", - "option", - "number", - "pop", - "window", - "extent", - "nothing", - "experience", - "opinion", - "departure", - "dance", - "indication", - "boy", - "material", - "band", - "leader", - "sun", - "beautiful", - "muscle", - "farmer", - "variety", - "fat", - "handle", - "director", - "opportunity", - "calendar", - "outside", - "pace", - "bath", - "fish", - "consequence", - "put", - "owner", - "go", - "doctor", - "information", - "share", - "hurt", - "protection", - "career", - "finance", - "force", - "golf", - "garbage", - "aspect", - "kid", - "food", - "boot", - "milk", - "respond", - "objective", - "reality", - "raw", - "ring", - "mall", - "one", - "impact", - "area", - "news", - "international", - "series", - "impress", - "mother", - "shelter", - "strike", - "loan", - "month", - "seat", - "anything", - "entertainment", - "familiar", - "clue", - "year", - "glad", - "supermarket", - "natural", - "god", - "cost", - "conversation", - "tie", - "ruin", - "comfort", - "earth", - "storm", - "percentage", - "assistance", - "budget", - "strength", - "beginning", - "sleep", - "other", - "young", - "unit", - "fill", - "store", - "desire", - "hide", - "value", - "cup", - "maintenance", - "nurse", - "function", - "tower", - "role", - "class", - "camera", - "database", - "panic", - "nation", - "basket", - "ice", - "art", - "spirit", - "chart", - "exchange", - "feedback", - "statement", - "reputation", - "search", - "hunt", - "exercise", - "nasty", - "notice", - "male", - "yard", - "annual", - "collar", 
- "date", - "platform", - "plant", - "fortune", - "passion", - "friendship", - "spread", - "cancer", - "ticket", - "attitude", - "island", - "active", - "object", - "service", - "buyer", - "bite", - "card", - "face", - "steak", - "proposal", - "patient", - "heat", - "rule", - "resident", - "broad", - "politics", - "west", - "knife", - "expert", - "girl", - "design", - "salt", - "baseball", - "grab", - "inspection", - "cousin", - "couple", - "magazine", - "cook", - "dependent", - "security", - "chicken", - "version", - "currency", - "ladder", - "scheme", - "kitchen", - "employment", - "local", - "attention", - "manager", - "fact", - "cover", - "sad", - "guard", - "relative", - "county", - "rate", - "lunch", - "program", - "initiative", - "gear", - "bridge", - "breast", - "talk", - "dish", - "guarantee", - "beer", - "vehicle", - "reception", - "woman", - "substance", - "copy", - "lecture", - "advantage", - "park", - "cold", - "death", - "mix", - "hold", - "scale", - "tomorrow", - "blood", - "request", - "green", - "cookie", - "church", - "strip", - "forever", - "beyond", - "debt", - "tackle", - "wash", - "following", - "feel", - "maximum", - "sector", - "sea", - "property", - "economics", - "menu", - "bench", - "try", - "language", - "start", - "call", - "solid", - "address", - "income", - "foot", - "senior", - "honey", - "few", - "mixture", - "cash", - "grocery", - "link", - "map", - "form", - "factor", - "pot", - "model", - "writer", - "farm", - "winter", - "skill", - "anywhere", - "birthday", - "policy", - "release", - "husband", - "lab", - "hurry", - "mail", - "equipment", - "sink", - "pair", - "driver", - "consideration", - "leather", - "skin", - "blue", - "boat", - "sale", - "brick", - "two", - "feed", - "square", - "dot", - "rush", - "dream", - "location", - "afternoon", - "manufacturer", - "control", - "occasion", - "trouble", - "introduction", - "advice", - "bet", - "eat", - "kill", - "category", - "manner", - "office", - "estate", - "pride", - "awareness", 
- "slip", - "crack", - "client", - "nail", - "shoot", - "membership", - "soft", - "anybody", - "web", - "official", - "individual", - "pizza", - "interest", - "bag", - "spell", - "profession", - "queen", - "deal", - "resource", - "ship", - "guy", - "chocolate", - "joint", - "formal", - "upstairs", - "car", - "resort", - "abroad", - "dealer", - "associate", - "finger", - "surgery", - "comment", - "team", - "detail", - "crazy", - "path", - "tale", - "initial", - "arm", - "radio", - "demand", - "single", - "draw", - "yellow", - "contest", - "piece", - "quote", - "pull", - "commercial", - "shirt", - "contribution", - "cream", - "channel", - "suit", - "discipline", - "instruction", - "concert", - "speech", - "low", - "effective", - "hang", - "scratch", - "industry", - "breakfast", - "lay", - "join", - "metal", - "bedroom", - "minute", - "product", - "rest", - "temperature", - "many", - "give", - "argument", - "print", - "purple", - "laugh", - "health", - "credit", - "investment", - "sell", - "setting", - "lesson", - "egg", - "middle", - "marriage", - "level", - "evidence", - "phrase", - "love", - "self", - "benefit", - "guidance", - "affect", - "you", - "dad", - "anxiety", - "special", - "boyfriend", - "test", - "blank", - "payment", - "soup", - "obligation", - "reply", - "smile", - "deep", - "complaint", - "addition", - "review", - "box", - "towel", - "minor", - "fun", - "soil", - "issue", - "cigarette", - "internet", - "gain", - "tell", - "entry", - "spare", - "incident", - "family", - "refuse", - "branch", - "can", - "pen", - "grandfather", - "constant", - "tank", - "uncle", - "climate", - "ground", - "volume", - "communication", - "kind", - "poet", - "child", - "screen", - "mine", - "quit", - "gene", - "lack", - "charity", - "memory", - "tooth", - "fear", - "mention", - "marketing", - "reveal", - "reason", - "court", - "season", - "freedom", - "land", - "sport", - "audience", - "classroom", - "law", - "hook", - "win", - "carry", - "eye", - "smell", - "distribution", 
- "research", - "country", - "dare", - "hope", - "whereas", - "stretch", - "library", - "if", - "delay", - "college", - "plastic", - "book", - "present", - "use", - "worry", - "champion", - "goal", - "economy", - "march", - "election", - "reflection", - "midnight", - "slide", - "inflation", - "action", - "challenge", - "guitar", - "coast", - "apple", - "campaign", - "field", - "jacket", - "sense", - "way", - "visual", - "remove", - "weather", - "trash", - "cable", - "regret", - "buddy", - "beach", - "historian", - "courage", - "sympathy", - "truck", - "tension", - "permit", - "nose", - "bed", - "son", - "person", - "base", - "meat", - "usual", - "air", - "meeting", - "worth", - "game", - "independence", - "physical", - "brief", - "play", - "raise", - "board", - "she", - "key", - "writing", - "pick", - "command", - "party", - "yesterday", - "spring", - "candidate", - "physics", - "university", - "concern", - "development", - "change", - "string", - "target", - "instance", - "room", - "bitter", - "bird", - "football", - "normal", - "split", - "impression", - "wood", - "long", - "meaning", - "stock", - "cap", - "leadership", - "media", - "ambition", - "fishing", - "essay", - "salad", - "repair", - "today", - "designer", - "night", - "bank", - "drawing", - "inevitable", - "phase", - "vast", - "chip", - "anger", - "switch", - "cry", - "twist", - "personality", - "attempt", - "storage", - "being", - "preparation", - "bat", - "selection", - "white", - "technology", - "contract", - "side", - "section", - "station", - "till", - "structure", - "tongue", - "taste", - "truth", - "difficulty", - "group", - "limit", - "main", - "move", - "feeling", - "light", - "example", - "mission", - "might", - "wait", - "wheel", - "shop", - "host", - "classic", - "alternative", - "cause", - "agent", - "consist", - "table", - "airline", - "text", - "pool", - "craft", - "range", - "fuel", - "tool", - "partner", - "load", - "entrance", - "deposit", - "hate", - "article", - "video", - "summer", 
- "feature", - "extreme", - "mobile", - "hospital", - "flight", - "fall", - "pension", - "piano", - "fail", - "result", - "rub", - "gap", - "system", - "report", - "suck", - "ordinary", - "wind", - "nerve", - "ask", - "shine", - "note", - "line", - "mom", - "perception", - "brother", - "reference", - "bend", - "charge", - "treat", - "trick", - "term", - "homework", - "bake", - "bid", - "status", - "project", - "strategy", - "orange", - "let", - "enthusiasm", - "parent", - "concentrate", - "device", - "travel", - "poetry", - "business", - "society", - "kiss", - "end", - "vegetable", - "employ", - "schedule", - "hour", - "brave", - "focus", - "process", - "movie", - "illegal", - "general", - "coffee", - "ad", - "highway", - "chemistry", - "psychology", - "hire", - "bell", - "conference", - "relief", - "show", - "neat", - "funny", - "weight", - "quality", - "club", - "daughter", - "zone", - "touch", - "tonight", - "shock", - "burn", - "excuse", - "name", - "survey", - "landscape", - "advance", - "satisfaction", - "bread", - "disaster", - "item", - "hat", - "prior", - "shopping", - "visit", - "east", - "photo", - "home", - "idea", - "father", - "comparison", - "cat", - "pipe", - "winner", - "count", - "lake", - "fight", - "prize", - "foundation", - "dog", - "keep", - "ideal", - "fan", - "struggle", - "peak", - "safety", - "solution", - "hell", - "conclusion", - "population", - "strain", - "alarm", - "measurement", - "second", - "train", - "race", - "due", - "insurance", - "boss", - "tree", - "monitor", - "sick", - "course", - "drag", - "appointment", - "slice", - "still", - "care", - "patience", - "rich", - "escape", - "emotion", - "royal", - "female", - "childhood", - "government", - "picture", - "will", - "sock", - "big", - "gate", - "oil", - "cross", - "pin", - "improvement", - "championship", - "silly", - "help", - "sky", - "pitch", - "man", - "diamond", - "most", - "transition", - "work", - "science", - "committee", - "moment", - "fix", - "teaching", - "dig", - 
"specialist", - "complex", - "guide", - "people", - "dead", - "voice", - "original", - "break", - "topic", - "data", - "degree", - "reading", - "recording", - "bunch", - "reach", - "judgment", - "lie", - "regular", - "set", - "painting", - "mode", - "list", - "player", - "bear", - "north", - "wonder", - "carpet", - "heavy", - "officer", - "negative", - "clock", - "unique", - "baby", - "pain", - "assumption", - "disk", - "iron", - "bill", - "drawer", - "look", - "double", - "mistake", - "finish", - "future", - "brilliant", - "contact", - "math", - "rice", - "leave", - "restaurant", - "discount", - "sex", - "virus", - "bit", - "trust", - "event", - "wear", - "juice", - "failure", - "bug", - "context", - "mud", - "whole", - "wrap", - "intention", - "draft", - "pressure", - "cake", - "dark", - "explanation", - "space", - "angle", - "word", - "efficiency", - "management", - "habit", - "star", - "chance", - "finding", - "transportation", - "stand", - "criticism", - "flow", - "door", - "injury", - "insect", - "surprise", - "apartment", -] # pylint: disable=line-too-long - - -def download_nltk_resources(): - """Download 'punkt' if not already installed""" - try: - nltk.data.find("tokenizers/punkt") - except LookupError: - nltk.download("punkt") - - -download_nltk_resources() - - -_ALPHABETS = "([A-Za-z])" -_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]" -_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)" -_STARTERS = ( - r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)" -) -_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)" -_WEBSITES = "[.](com|net|org|io|gov|edu|me)" -_DIGITS = "([0-9])" -_MULTIPLE_DOTS = r"\.{2,}" - - -def split_into_sentences(text): - """Split the text into sentences. - - Args: - text: A string that consists of more than or equal to one sentences. - - Returns: - A list of strings where each string is a sentence. 
- """ - text = " " + text + " " - text = text.replace("\n", " ") - text = re.sub(_PREFIXES, "\\1", text) - text = re.sub(_WEBSITES, "\\1", text) - text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1\\2", text) - text = re.sub( - _MULTIPLE_DOTS, - lambda match: "" * len(match.group(0)) + "", - text, - ) - if "Ph.D" in text: - text = text.replace("Ph.D.", "PhD") - text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1 ", text) - text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1 \\2", text) - text = re.sub( - _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]", - "\\1\\2\\3", - text, - ) - text = re.sub(_ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1\\2", text) - text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1 \\2", text) - text = re.sub(" " + _SUFFIXES + "[.]", " \\1", text) - text = re.sub(" " + _ALPHABETS + "[.]", " \\1", text) - if "”" in text: - text = text.replace(".”", "”.") - if '"' in text: - text = text.replace('."', '".') - if "!" in text: - text = text.replace('!"', '"!') - if "?" 
in text: - text = text.replace('?"', '"?') - text = text.replace(".", ".") - text = text.replace("?", "?") - text = text.replace("!", "!") - text = text.replace("", ".") - sentences = text.split("") - sentences = [s.strip() for s in sentences] - if sentences and not sentences[-1]: - sentences = sentences[:-1] - return sentences - - -def count_words(text): - """Counts the number of words.""" - tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+") - tokens = tokenizer.tokenize(text) - num_words = len(tokens) - return num_words - - -@functools.lru_cache(maxsize=None) -def _get_sentence_tokenizer(): - return nltk.data.load("nltk:tokenizers/punkt/english.pickle") - - -def count_stopwords(text): - """Counts the number of stopwords.""" - nltk.download("stopwords") - stopwords = nltk.corpus.stopwords.words("english") - tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+") - tokens = tokenizer.tokenize(text) - num_stopwords = len([t for t in tokens if t.lower() in stopwords]) - return num_stopwords - - -def generate_keywords(num_keywords): - """Randomly generates a few keywords.""" - return random.sample(WORD_LIST, k=num_keywords) From 3bb9423780515b7221369b1608dc7d068e5e768a Mon Sep 17 00:00:00 2001 From: zyzshishui Date: Sun, 26 Oct 2025 20:52:22 +0000 Subject: [PATCH 08/16] lazy import --- slime/rollout/rm_hub/ifbench.py | 77 +++++++++++++++++++++++++++------ 1 file changed, 63 insertions(+), 14 deletions(-) diff --git a/slime/rollout/rm_hub/ifbench.py b/slime/rollout/rm_hub/ifbench.py index e51f053c5..5804778fe 100644 --- a/slime/rollout/rm_hub/ifbench.py +++ b/slime/rollout/rm_hub/ifbench.py @@ -20,27 +20,76 @@ import dataclasses import importlib import logging +import os +import subprocess import sys from pathlib import Path from typing import Any, Dict, List, Optional, Sequence, Union -try: - from ifbench import instructions_registry # type: ignore[attr-defined] -except ImportError: - _IFBENCH_REPO_ROOT = Path(__file__).resolve().parents[3] / "ifbench" - if not 
_IFBENCH_REPO_ROOT.exists(): - raise ImportError( - "IFBench repository not found. Clone https://github.com/allenai/IFBench.git " - "into the repo root or export PYTHONPATH accordingly." - ) from None - repo_path = str(_IFBENCH_REPO_ROOT) - if repo_path not in sys.path: - sys.path.insert(0, repo_path) - instructions_registry = importlib.import_module("instructions_registry") - logger = logging.getLogger(__name__) +def _ensure_ifbench_repo() -> Path: + """Clone IFBench repo if needed and ensure it is available on sys.path.""" + + repo_root = Path(__file__).resolve().parents[3] + repo_path = repo_root / "ifbench" + + if not repo_path.exists(): + clone_cmd = ["git", "clone", "https://github.com/allenai/IFBench.git", str(repo_path)] + try: + subprocess.run(clone_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except Exception as exc: + raise ImportError( + "Unable to automatically clone IFBench. Please clone " + "https://github.com/allenai/IFBench.git into the repo root." + ) from exc + + repo_str = str(repo_path) + if repo_str not in sys.path: + sys.path.insert(0, repo_str) + + current_pythonpath = os.environ.get("PYTHONPATH") + if current_pythonpath is None: + os.environ["PYTHONPATH"] = repo_str + elif repo_str not in current_pythonpath.split(os.pathsep): + os.environ["PYTHONPATH"] = os.pathsep.join([repo_str, current_pythonpath]) + + return repo_path + + +def _ensure_ifbench_dependencies(repo_path: Path) -> None: + """Install IFBench requirements the first time the module is imported.""" + + requirements_file = repo_path / "requirements.txt" + if not requirements_file.exists(): + return + + sentinel = repo_path / ".deps_installed" + if sentinel.exists(): + return + + install_cmd = [sys.executable, "-m", "pip", "install", "-r", str(requirements_file)] + try: + subprocess.run(install_cmd, check=True) + except Exception as exc: + logger.warning("Failed to install IFBench dependencies automatically: %s", exc) + else: + 
sentinel.write_text("installed\n") + + +def _load_instructions_registry(): + repo_path = _ensure_ifbench_repo() + try: + return importlib.import_module("instructions_registry") + except ImportError: + _ensure_ifbench_dependencies(repo_path) + return importlib.import_module("instructions_registry") + + +instructions_registry = _load_instructions_registry() + + JsonDict = Dict[str, Any] KwargsDict = Dict[str, Optional[Union[str, int, float]]] From a353929f8bcee8622f462484e0c1770f26bd752d Mon Sep 17 00:00:00 2001 From: root Date: Sun, 26 Oct 2025 23:24:08 +0000 Subject: [PATCH 09/16] update requirements --- examples/eval_multi_task/requirements.txt | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/examples/eval_multi_task/requirements.txt b/examples/eval_multi_task/requirements.txt index c5501a4ab..039234af6 100644 --- a/examples/eval_multi_task/requirements.txt +++ b/examples/eval_multi_task/requirements.txt @@ -1,9 +1,5 @@ -absl-py emoji -immutabledict -langdetect nltk -numpy==1.26.4 -spacy +spacy==3.7.4 syllapy -unicodedata2 +numpy==1.26.4 From 77ed9db2f4bd20efa5034fa2f8de019e58f628b3 Mon Sep 17 00:00:00 2001 From: zyzshishui Date: Sun, 26 Oct 2025 23:25:29 +0000 Subject: [PATCH 10/16] path --- slime/rollout/rm_hub/ifbench.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/slime/rollout/rm_hub/ifbench.py b/slime/rollout/rm_hub/ifbench.py index 5804778fe..cdbc183ab 100644 --- a/slime/rollout/rm_hub/ifbench.py +++ b/slime/rollout/rm_hub/ifbench.py @@ -28,12 +28,15 @@ logger = logging.getLogger(__name__) +_WORKSPACE_ROOT = Path(__file__).resolve().parents[3] +_WORKSPACE_PARENT = _WORKSPACE_ROOT.parent +_LOCAL_IFBENCH_REQUIREMENTS = _WORKSPACE_ROOT / "examples" / "eval_multi_task" / "requirements.txt" + def _ensure_ifbench_repo() -> Path: """Clone IFBench repo if needed and ensure it is available on sys.path.""" - repo_root = Path(__file__).resolve().parents[3] - repo_path = repo_root / "ifbench" + repo_path = 
_WORKSPACE_PARENT / "IFBench" if not repo_path.exists(): clone_cmd = ["git", "clone", "https://github.com/allenai/IFBench.git", str(repo_path)] @@ -61,8 +64,10 @@ def _ensure_ifbench_repo() -> Path: def _ensure_ifbench_dependencies(repo_path: Path) -> None: """Install IFBench requirements the first time the module is imported.""" - requirements_file = repo_path / "requirements.txt" + requirements_file = _LOCAL_IFBENCH_REQUIREMENTS + if not requirements_file.exists(): + logger.debug("Local IFBench requirements file not found at %s; skipping install.", requirements_file) return sentinel = repo_path / ".deps_installed" From 0497884bc714eb31e3f39f1a68f11d296f58893e Mon Sep 17 00:00:00 2001 From: zyzshishui Date: Tue, 28 Oct 2025 16:02:52 +0000 Subject: [PATCH 11/16] simplify reward func --- slime/rollout/rm_hub/ifbench.py | 75 +++------------------------------ 1 file changed, 7 insertions(+), 68 deletions(-) diff --git a/slime/rollout/rm_hub/ifbench.py b/slime/rollout/rm_hub/ifbench.py index cdbc183ab..cea38c862 100644 --- a/slime/rollout/rm_hub/ifbench.py +++ b/slime/rollout/rm_hub/ifbench.py @@ -17,7 +17,6 @@ from __future__ import annotations -import dataclasses import importlib import logging import os @@ -83,43 +82,23 @@ def _ensure_ifbench_dependencies(repo_path: Path) -> None: sentinel.write_text("installed\n") -def _load_instructions_registry(): +def _load_evaluation_lib(): repo_path = _ensure_ifbench_repo() try: - return importlib.import_module("instructions_registry") + return importlib.import_module("evaluation_lib") except ImportError: _ensure_ifbench_dependencies(repo_path) - return importlib.import_module("instructions_registry") + return importlib.import_module("evaluation_lib") -instructions_registry = _load_instructions_registry() +evaluation_lib = _load_evaluation_lib() +InputExample = evaluation_lib.InputExample JsonDict = Dict[str, Any] KwargsDict = Dict[str, Optional[Union[str, int, float]]] -@dataclasses.dataclass -class InputExample: - """Subset 
of the official InputExample schema needed for evaluation.""" - - key: int - instruction_id_list: List[str] - prompt: str - kwargs: List[KwargsDict] - - -@dataclasses.dataclass -class OutputExample: - """Official output structure for readability and parity.""" - - instruction_id_list: List[str] - prompt: str - response: str - follow_all_instructions: bool - follow_instruction_list: List[bool] - - def _normalize_instruction_ids(raw_ids: Sequence[Any]) -> List[str]: """Ensure instruction identifiers are clean strings.""" @@ -188,47 +167,6 @@ def _build_input_example(metadata: JsonDict) -> Optional[InputExample]: ) -def test_instruction_following_strict(inp: InputExample, response: str) -> OutputExample: - """Official strict evaluation copied from evaluation_lib.py.""" - - response = response or "" - instruction_list = inp.instruction_id_list - is_following_list: List[bool] = [] - - for index, instruction_id in enumerate(instruction_list): - instruction_cls = instructions_registry.INSTRUCTION_DICT.get(instruction_id) - if instruction_cls is None: - logger.warning("Unknown instruction id '%s'; marking as failed.", instruction_id) - is_following_list.append(False) - continue - - instruction = instruction_cls(instruction_id) - kwargs = inp.kwargs[index] if index < len(inp.kwargs) else {} - - try: - instruction.build_description(**kwargs) - except Exception as exc: # pragma: no cover - parity with official logic - logger.debug("build_description failed for %s with kwargs %s: %s", instruction_id, kwargs, exc) - instruction.build_description() - - args = instruction.get_instruction_args() - if args and "prompt" in args: - instruction.build_description(prompt=inp.prompt) - - if response.strip() and instruction.check_following(response): - is_following_list.append(True) - else: - is_following_list.append(False) - - return OutputExample( - instruction_id_list=inp.instruction_id_list, - prompt=inp.prompt, - response=response, - follow_all_instructions=all(is_following_list), - 
follow_instruction_list=is_following_list, - ) - - def compute_ifbench_reward(response: str, label: Any, metadata: Optional[JsonDict] = None) -> float: """Score a model response using the official IFBench rules.""" @@ -243,5 +181,6 @@ def compute_ifbench_reward(response: str, label: Any, metadata: Optional[JsonDic if inp is None: return 0.0 - output = test_instruction_following_strict(inp, str(response)) + prompt_to_response = {inp.prompt: str(response or "")} + output = evaluation_lib.test_instruction_following_strict(inp, prompt_to_response) return 1.0 if output.follow_all_instructions else 0.0 From 60b1ac26d99d07b55ed03811461dcfa143f47f1a Mon Sep 17 00:00:00 2001 From: zyzshishui Date: Tue, 28 Oct 2025 18:02:51 +0000 Subject: [PATCH 12/16] remove header --- slime/rollout/rm_hub/ifbench.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/slime/rollout/rm_hub/ifbench.py b/slime/rollout/rm_hub/ifbench.py index cea38c862..c9cb197b5 100644 --- a/slime/rollout/rm_hub/ifbench.py +++ b/slime/rollout/rm_hub/ifbench.py @@ -1,20 +1,3 @@ -# Copyright 2025 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# The follwoing code is adapted from the official IFBench code: -# https://github.com/allenai/IFBench/blob/main/evaluation_lib.py - from __future__ import annotations import importlib From 99b7e9c70b1ea806bbd5af13977d83f76dd71d36 Mon Sep 17 00:00:00 2001 From: zyzshishui Date: Tue, 28 Oct 2025 18:10:04 +0000 Subject: [PATCH 13/16] rename req --- .../{requirements.txt => requirements_ifbench.txt} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/eval_multi_task/{requirements.txt => requirements_ifbench.txt} (100%) diff --git a/examples/eval_multi_task/requirements.txt b/examples/eval_multi_task/requirements_ifbench.txt similarity index 100% rename from examples/eval_multi_task/requirements.txt rename to examples/eval_multi_task/requirements_ifbench.txt From b0c340adaab4c133fe72da2636d41fce82eb7df6 Mon Sep 17 00:00:00 2001 From: zyzshishui Date: Tue, 28 Oct 2025 18:19:06 +0000 Subject: [PATCH 14/16] add README --- examples/eval_multi_task/README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 examples/eval_multi_task/README.md diff --git a/examples/eval_multi_task/README.md b/examples/eval_multi_task/README.md new file mode 100644 index 000000000..0bf61ec72 --- /dev/null +++ b/examples/eval_multi_task/README.md @@ -0,0 +1,12 @@ +# Multi-Task Evaluation Example + +## Configuring `multi_task.yaml` +- `eval.defaults` defines inference parameters shared by every dataset entry. Override them inside an individual dataset block if needed. +- `eval.datasets` enumerates the datasets to evaluate. Each entry should specify: + - `name`: a short identifier that appears in logs and dashboards. + - `path`: the path to the dataset JSONL file. + - `rm_type`: which reward function to use for scoring. + - `n_samples_per_eval_prompt`: how many candidate completions to generate per prompt. 
+ +## IFBench Notes +- When `ifbench` is used, `slime/rollout/rm_hub/ifbench.py` automatically prepares the scoring environment, so no additional manual setup is required beyond providing the dataset path. From a544352c418e00dacb9b5abea4ada2ab825bc233 Mon Sep 17 00:00:00 2001 From: zyzshishui Date: Tue, 28 Oct 2025 18:42:19 +0000 Subject: [PATCH 15/16] path --- slime/rollout/rm_hub/ifbench.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slime/rollout/rm_hub/ifbench.py b/slime/rollout/rm_hub/ifbench.py index c9cb197b5..39d083e70 100644 --- a/slime/rollout/rm_hub/ifbench.py +++ b/slime/rollout/rm_hub/ifbench.py @@ -12,7 +12,7 @@ _WORKSPACE_ROOT = Path(__file__).resolve().parents[3] _WORKSPACE_PARENT = _WORKSPACE_ROOT.parent -_LOCAL_IFBENCH_REQUIREMENTS = _WORKSPACE_ROOT / "examples" / "eval_multi_task" / "requirements.txt" +_LOCAL_IFBENCH_REQUIREMENTS = _WORKSPACE_ROOT / "examples" / "eval_multi_task" / "requirements_ifbench.txt" def _ensure_ifbench_repo() -> Path: From e9bc8529c4134fe5ad83178c29080d1e285291f8 Mon Sep 17 00:00:00 2001 From: zyzshishui Date: Tue, 28 Oct 2025 19:12:30 +0000 Subject: [PATCH 16/16] add --- examples/eval_multi_task/requirements_ifbench.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/eval_multi_task/requirements_ifbench.txt b/examples/eval_multi_task/requirements_ifbench.txt index 039234af6..4e9b607fb 100644 --- a/examples/eval_multi_task/requirements_ifbench.txt +++ b/examples/eval_multi_task/requirements_ifbench.txt @@ -3,3 +3,4 @@ nltk spacy==3.7.4 syllapy numpy==1.26.4 +immutabledict \ No newline at end of file