diff --git a/balrog/client.py b/balrog/client.py index e4ea50ba..be03166f 100644 --- a/balrog/client.py +++ b/balrog/client.py @@ -184,6 +184,7 @@ def api_call(): return self.client.chat.completions.create( messages=converted_messages, model=self.model_id, + temperature=self.client_kwargs.get("temperature", 0.5), max_tokens=self.client_kwargs.get("max_tokens", 1024), ) diff --git a/balrog/config/config.yaml b/balrog/config/config.yaml index 2b53786f..7124af07 100644 --- a/balrog/config/config.yaml +++ b/balrog/config/config.yaml @@ -30,7 +30,7 @@ client: base_url: http://localhost:8080/v1 # Base URL for the API (if using a local server) generate_kwargs: temperature: 0.0 # Sampling temperature; 0.0 makes the output deterministic - max_tokens: 1024 # Max tokens to generate in the response + max_tokens: 4096 # Max tokens to generate in the response timeout: 60 # Timeout for API requests in seconds max_retries: 5 # Max number of retries for failed API calls delay: 2 # Exponential backoff factor between retries in seconds