Jean/update curator #115

Open · wants to merge 10 commits into main
16 changes: 16 additions & 0 deletions README.md
@@ -26,6 +26,22 @@ Evalchemy is developed by the [DataComp community](https://datacomp.ai) and [Bes
--model_args 'tokenized_requests=False' \
--output_path logs
```

Here are other examples of `model_name`:
- `"claude-3-7-sonnet-latest-thinking"`
- `"deepseek-reasoner"`
- `"gemini/gemini-1.5-flash"`
- `"claude-3-7-sonnet-latest"`
- `"gpt-4o-mini-2024-07-18"`
- `"o1-preview-2024-09-12"`
- `"gpt-4o-2024-08-06"`

You can also adjust `model_args` to fit your needs. For example, `"claude-3-7-sonnet-latest-thinking"` may need a larger token budget and a longer timeout for its thinking process, and it can be run in batch mode to speed up evaluation and reduce costs. Set `model_args` like this:

```
--model_args 'tokenized_requests=False,timeout=2000,max_length=64000,batch=True'
```
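
Putting these together, a full invocation could look like the sketch below. This assumes the `python -m eval.eval --model curator ... --model_name ...` command structure used earlier in this README (the head of that command is truncated in this hunk); adjust `--tasks` to whichever benchmarks you want to run:

```
python -m eval.eval \
    --model curator \
    --tasks AIME24 \
    --model_name "claude-3-7-sonnet-latest-thinking" \
    --model_args 'tokenized_requests=False,timeout=2000,max_length=64000,batch=True' \
    --output_path logs
```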

#### [2025.01.29] New Reasoning Benchmarks

- AIME24, AMC23, MATH500, LiveCodeBench, GPQADiamond, HumanEvalPlus, MBPPPlus, BigCodeBench, MultiPL-E, and CRUXEval have been added to our growing list of [available benchmarks](https://github.com/mlfoundations/evalchemy?tab=readme-ov-file#built-in-benchmarks). This is part of the [Open Thoughts](https://github.com/open-thoughts/open-thoughts) project. See [our blog post](https://www.open-thoughts.ai/blog/measure) on using Evalchemy to measure reasoning models.
6 changes: 0 additions & 6 deletions eval/chat_benchmarks/AMC23/eval_instruct.py
@@ -64,12 +64,6 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:

# Prepare instances for model
all_instances = []
if isinstance(model, lm_eval.models.huggingface.HFLM):
model_name = model.pretrained
elif isinstance(model, lm_eval.models.openai_completions.OpenAIChatCompletion):
model_name = str(f"openai/{model.model}")
else:
model_name = model.model_args["model"]

all_outputs = []
for i in range(self.n_repeat):
7 changes: 0 additions & 7 deletions eval/chat_benchmarks/GPQADiamond/eval_instruct.py
@@ -69,13 +69,6 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
example["multiple_choice_string"] = multiple_choice_string
example["answer"] = correct_answer

if isinstance(model, lm_eval.models.huggingface.HFLM):
model_name = model.pretrained
elif isinstance(model, lm_eval.models.openai_completions.OpenAIChatCompletion):
model_name = str(f"openai/{model.model}")
else:
model_name = model.model_args["model"]

all_outputs = []

for i in range(self.n_repeat):
6 changes: 0 additions & 6 deletions eval/chat_benchmarks/MATH500/eval_instruct.py
@@ -61,12 +61,6 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:

# Prepare instances for model
all_instances = []
if isinstance(model, lm_eval.models.huggingface.HFLM):
model_name = model.pretrained
elif isinstance(model, lm_eval.models.openai_completions.OpenAIChatCompletion):
model_name = str(f"openai/{model.model}")
else:
model_name = model.model_args["model"]
for idx, example in enumerate(examples):
messages = [
{"role": "user", "content": PROMPT.format(problem=example["problem"])},
262 changes: 169 additions & 93 deletions eval/chat_benchmarks/curator_lm.py
@@ -19,111 +19,164 @@ def __init__(
model: str = None,
pretrained: str = None,
max_length: Optional[int] = 2048,
max_retries: int = 10,
max_retries: int = 20,
timeout: int = 300,
tokenized_requests: bool = False,
max_requests_per_minute: int = None,
max_tokens_per_minute: int = None,
seconds_to_pause_on_rate_limit: int = None,
batch: bool = False,
temperature: float = 0.0,
top_p: float = 0.95,
**kwargs,
):
super().__init__()

self.model_name = model or pretrained

self.model_args = kwargs
self.model_args.update(
{
"model": self.model_name,
"pretrained": pretrained,
"max_length": max_length,
"max_retries": max_retries,
"timeout": timeout,
"tokenized_requests": tokenized_requests,
}
)

if "gemini" in self.model_name and "thinking" in self.model_name:
max_requests_per_minute = max_requests_per_minute or 200
max_tokens_per_minute = max_tokens_per_minute or 400_000
elif "gemini" in self.model_name:
max_requests_per_minute = max_requests_per_minute or 2000
max_tokens_per_minute = max_tokens_per_minute or 4_000_000
elif "claude" in self.model_name:
max_requests_per_minute = max_requests_per_minute or 2000
max_tokens_per_minute = max_tokens_per_minute or 80_000

if tokenized_requests:
raise NotImplementedError("Tokenized requests not implemented for curator.")
self.tokenized_requests = False

self.model_name = model or pretrained
self.max_length = max_length
self.llm = None
self.gen_kwargs = {}
self.eos = None
if "temperature" in kwargs:
self.gen_kwargs["temperature"] = kwargs["temperature"]
if "top_p" in kwargs:
self.gen_kwargs["top_p"] = kwargs["top_p"]
self.is_batch_request = batch
self._configure_params(
max_length=max_length,
max_retries=max_retries,
timeout=timeout,
max_requests_per_minute=max_requests_per_minute,
max_tokens_per_minute=max_tokens_per_minute,
seconds_to_pause_on_rate_limit=seconds_to_pause_on_rate_limit,
temperature=temperature,
top_p=top_p,
**kwargs,
)

self.llm = None # Initialize lazily
self.eos = None # Will be set during LLM initialization if needed

# Disable cache since it is not necessary
os.environ["CURATOR_DISABLE_CACHE"] = "true"

def _configure_params(
self,
max_length: int,
max_retries: int,
timeout: int,
max_requests_per_minute: Optional[int],
max_tokens_per_minute: Optional[int],
seconds_to_pause_on_rate_limit: Optional[int],
temperature: float,
top_p: float,
**kwargs,
):
"""Sets up gen_kwargs and backend_params based on model name and init args."""
self.gen_kwargs = {
"max_completion_tokens": max_length,
"temperature": temperature,
"top_p": top_p,
"stop": None, # Will be set later if needed based on request
}
self.backend_params = {
"invalid_finish_reasons": [
"content_filter"
], # So it doesn't retry on `length` finish reason, but retries on "content_filter"}
"invalid_finish_reasons": ["content_filter"],
"require_all_responses": False,
"request_timeout": timeout,
"max_retries": max_retries,
}
self.additional_llm_args = {} # For args passed directly to curator.LLM constructor

# Model-specific adjustments
is_thinking_model = "thinking" in self.model_name or "gemini-2.5-pro" in self.model_name

if "gemini" in self.model_name:
if self.is_batch_request:
self.additional_llm_args["backend"] = "gemini"
self.gen_kwargs.pop("max_completion_tokens", None)
self.gen_kwargs.pop("stop", None)

if is_thinking_model:
max_requests_per_minute = max_requests_per_minute or 200
max_tokens_per_minute = max_tokens_per_minute or 400_000
else:
max_requests_per_minute = max_requests_per_minute or 2000
max_tokens_per_minute = max_tokens_per_minute or 4_000_000
elif "claude" in self.model_name:
max_requests_per_minute = max_requests_per_minute or 2000
max_tokens_per_minute = max_tokens_per_minute or 80_000
# Claude uses 'max_tokens' instead of 'max_completion_tokens'
self.gen_kwargs["max_tokens"] = self.gen_kwargs.pop("max_completion_tokens")
self.gen_kwargs.pop("stop", None) # Claude doesn't support stop sequences via API arg

if is_thinking_model:
# Adjust name and set thinking params
self.model_name = (
self.model_name.replace("-thinking-", "")
.replace("-thinking", "")
.replace("thinking-", "")
.replace("thinking", "")
)
# Thinking budget calculation depends on final max_tokens
thinking_budget = self.gen_kwargs["max_tokens"] - 4096
self.gen_kwargs["thinking"] = {"type": "enabled", "budget_tokens": thinking_budget}
# API requirements for thinking mode
self.gen_kwargs["temperature"] = 1.0
self.gen_kwargs.pop("top_p", None)
elif "deepseek" in self.model_name:
self.additional_llm_args["backend"] = "openai"
self.backend_params["base_url"] = "https://api.deepseek.com/"
self.backend_params["api_key"] = os.environ["DEEPSEEK_API_KEY"]
max_requests_per_minute = 2_500 # Override rate limits
max_tokens_per_minute = 1_000_000_000
self.gen_kwargs["temperature"] = 0 # Override temperature
elif "o1" in self.model_name or "o3" in self.model_name or "o4" in self.model_name:
# o1/o3/o4 models don't support these parameters
print(f"Warning: Model {self.model_name} does not support top_p, stop, or temperature. Ignoring them.")
self.gen_kwargs.pop("top_p", None)
self.gen_kwargs.pop("stop", None)
self.gen_kwargs.pop("temperature", None)
elif "xai" in self.model_name:
self.gen_kwargs["max_tokens"] = self.gen_kwargs.pop("max_completion_tokens", max_length)


# Apply rate limits if provided and not overridden by model specifics
if max_requests_per_minute is not None:
self.backend_params["max_requests_per_minute"] = max_requests_per_minute
if max_tokens_per_minute is not None:
self.backend_params["max_tokens_per_minute"] = max_tokens_per_minute
if seconds_to_pause_on_rate_limit is not None:
self.backend_params["seconds_to_pause_on_rate_limit"] = seconds_to_pause_on_rate_limit

# Disable cache since it is not necessary
os.environ["CURATOR_DISABLE_CACHE"] = "true"
# Handle batch mode specifics
if self.is_batch_request:
# Rate limiting params are incompatible with batch requests in curator
self.backend_params = {"require_all_responses": True}
self.additional_llm_args["batch"] = True
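
For concreteness, a worked sketch (illustrative only, not part of this diff) of what `_configure_params` settles on for the README example `claude-3-7-sonnet-latest-thinking` with `timeout=2000` and `max_length=64000`, following the branches above:

```python
# Illustrative values only; these mirror the Claude "thinking" branch and the
# default rate limits above rather than being produced by running the PR's code.
max_length = 64_000

# The diff chains several .replace calls; only "-thinking" matches for this name,
# so the API is called without the "-thinking" suffix.
model_name = "claude-3-7-sonnet-latest-thinking".replace("-thinking", "")
# -> "claude-3-7-sonnet-latest"

gen_kwargs = {
    "max_tokens": max_length,  # Claude takes max_tokens rather than max_completion_tokens
    "thinking": {"type": "enabled", "budget_tokens": max_length - 4096},  # 59_904 thinking tokens
    "temperature": 1.0,  # thinking mode requires temperature=1.0; top_p and stop are dropped
}

backend_params = {
    "invalid_finish_reasons": ["content_filter"],
    "require_all_responses": False,
    "request_timeout": 2000,          # timeout=2000 from the README example
    "max_retries": 20,                # new default in this PR
    "max_requests_per_minute": 2000,  # Claude defaults, applied only when not overridden
    "max_tokens_per_minute": 80_000,
}

# With batch=True (as in the README example), backend_params is instead replaced by
# {"require_all_responses": True} and the rate-limit settings above do not apply.
```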

def _create_payload(
self,
messages: Union[List[List[int]], List[dict], List[str], str],
*,
generate: bool = False,
gen_kwargs: Optional[dict] = None,
eos=None,
**kwargs,
) -> dict:
assert generate, "Curator only supports generation."
# Create the payload for the API request
max_tokens = self.max_length or gen_kwargs.get("max_gen_toks", self.max_length)
temperature = self.gen_kwargs.get("temperature", gen_kwargs.get("temperature", 0))
top_p = self.gen_kwargs.get("top_p", gen_kwargs.get("top_p", 0.95))
stop = handle_stop_sequences(gen_kwargs.get("until", None), eos)
gen_kwargs = {
"max_completion_tokens": max_tokens,
"temperature": temperature,
"top_p": top_p,
"stop": stop,
}
if "o1" in self.model_name:
print("Warning: O1 model does not support top_p, stop, or temperature. Ignoring them.")
gen_kwargs.pop("top_p")
gen_kwargs.pop("stop")
gen_kwargs.pop("temperature")

def _ensure_llm_initialized(self, eos=None):
"""Initializes the curator.LLM object if it hasn't been already."""
if self.llm is None:
self.eos = eos
self.gen_kwargs = gen_kwargs.copy()
# Update stop sequences based on the current request if needed
# This assumes EOS is consistent for the lifetime of the model instance
if eos and self.gen_kwargs.get("stop") is None:
self.eos = eos # Store for potential future reference if needed
# Handle potential list of stop sequences
stop_sequences = handle_stop_sequences(None, eos) # Pass current eos
# Only update if stop sequences are actually needed and supported
if stop_sequences and "stop" in self.gen_kwargs:
self.gen_kwargs["stop"] = stop_sequences
elif stop_sequences and "max_tokens" in self.gen_kwargs and "claude" not in self.model_name:
# Only warn if stop sequences were provided but the param doesn't exist
# (like for Claude, which was handled in _configure_params)
print(f"Warning: Stop sequences provided but 'stop' generation parameter is not available for {self.model_name}.")


print(f"Initializing curator.LLM with: model_name='{self.model_name}', generation_params={self.gen_kwargs}, backend_params={self.backend_params}, additional_args={self.additional_llm_args}")
self.llm = curator.LLM(
model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params.copy()
model_name=self.model_name,
generation_params=self.gen_kwargs,
backend_params=self.backend_params,
**self.additional_llm_args,
)
else:
if self.gen_kwargs != gen_kwargs:
print(
"Recreating curator LLM with new generation parameters, make sure this doesn't happen at every request"
)
self.gen_kwargs = gen_kwargs.copy()
self.llm = curator.LLM(
model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params.copy()
)
return messages

def create_message(
self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], generate=False
@@ -158,12 +211,27 @@ def tokenizer_name(self) -> str:
return self.model_name

def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> Union[str, JsonChatStr]:
# Convert chat history to the required format
# Convert chat history to the required JsonChatStr format
return JsonChatStr(json.dumps(chat_history))

def model_call(self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], **kwargs) -> Optional[dict]:
payload = self._create_payload(self.create_message(messages), **kwargs)
response = self.llm(payload)["response"]
# Deprecated or needs rework? generate_until is the primary method used by lm-eval harness.
# This method seems designed for single requests, while generate_until handles batches.
# If needed, it should also use _ensure_llm_initialized and create_message.
print("Warning: model_call is likely deprecated for lm-eval tasks. Use generate_until.")
self._ensure_llm_initialized() # Make sure LLM is ready
# Ensure messages is a list, as curator expects a list of prompts
if not isinstance(messages, list):
messages = [messages]

formatted_messages = self.create_message(messages)
# Assuming model_call handles a single prompt, curator expects a list
if not formatted_messages:
return None # Or raise error

# Curator returns a dictionary with a 'response' key containing a list of outputs
response = self.llm(formatted_messages)["response"]

return response

def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
@@ -179,26 +247,34 @@ def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:

@property
def eot_token_id(self) -> Optional[int]:
# Assuming the model has a specific end-of-text token ID
return self.llm.eot_token_id # Replace with actual method to get EOT token ID
# Curator doesn't directly expose tokenizer or token IDs.
# Need to rely on underlying model specifics if absolutely necessary,
# but lm-eval generally handles this via stop sequences.
print("Warning: eot_token_id is not directly available via Curator API.")
return None # Cannot reliably get this from curator

def generate_until(self, requests: List[Instance], disable_tqdm: bool = False) -> List[str]:
# Tokenize contexts if required
if self.tokenized_requests:
raise NotImplementedError("Tokenized requests not implemented for curator.")
if not requests:
return []

# Ensure LLM is initialized, passing eos from the first request's gen_kwargs
# Assumes eos is consistent across the batch, which is reasonable for lm-eval.
first_req_kwargs = requests[0].args[1] if len(requests[0].args) > 1 else {}
self._ensure_llm_initialized(eos=first_req_kwargs.get("until"))

# Extract contexts and generation kwargs from the Instance objects
# Extract contexts (already in JsonChatStr format expected by create_message)
contexts = [req.args[0] for req in requests]
gen_kwargs = [req.args[1] for req in requests]

# Assert all gen_kwargs are the same
assert all(
gen_kwargs[0] == gkw for gkw in gen_kwargs
), "Generation parameters must be the same for all requests in curator"
# Format messages for curator
formatted_messages = self.create_message(contexts)

response = self.llm(formatted_messages)
# Make the call to curator
try:
response = response["response"]
except Exception as e:
response = response.dataset["response"]

contexts_dataset = self.create_message(contexts)
payload = self._create_payload(contexts_dataset, generate=True, gen_kwargs=gen_kwargs[0])
response = self.llm(payload)["response"]
return response
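
For reference, a rough usage sketch of how this `generate_until` is driven. The class and import names below are hypothetical stand-ins (the class name is not visible in this diff), and the `Instance` construction is simplified; in practice lm-eval tasks build these objects themselves:

```python
from lm_eval.api.instance import Instance

# Hypothetical import path and class name for the model defined in curator_lm.py.
from eval.chat_benchmarks.curator_lm import CuratorAPIModel

lm = CuratorAPIModel(model="gpt-4o-mini-2024-07-18", max_length=2048)

# Each benchmark builds a JsonChatStr context via apply_chat_template ...
context = lm.apply_chat_template([{"role": "user", "content": "What is 2 + 2?"}])

# ... and wraps it in an Instance whose second argument carries the generation kwargs,
# matching the req.args[0] / req.args[1] accesses in generate_until above.
request = Instance(
    request_type="generate_until",
    doc={},
    arguments=(context, {"until": None, "max_gen_toks": 256}),
    idx=0,
)

# The curator.LLM backend is built lazily on the first call (_ensure_llm_initialized),
# and one completion string is returned per request.
outputs = lm.generate_until([request])
print(outputs[0])
```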

def loglikelihood_rolling(self, requests, disable_tqdm: bool = False) -> List[float]: