Skip to content

Commit a5efcd7

Browse files
committed
Add support for vLLM sampler kwargs.
1 parent efb4913 commit a5efcd7

File tree

5 files changed

+186
-8
lines changed

5 files changed

+186
-8
lines changed

tests/generate/vllm_sampler_test.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from tunix.generate import mappings
3232
from tunix.generate import sampler as vanilla_sampler
3333
from tunix.generate import vllm_sampler
34+
from tunix.models.dummy_model_creator import create_dummy_model
3435
from tunix.models.llama3 import model as llama_lib
3536
from tunix.models.llama3 import params as llama_params
3637
from tunix.sft import utils as base_utils
@@ -357,6 +358,169 @@ async def dispatch_requests():
357358
),
358359
)
359360

361+
def test_vllm_sampler_sampling_kwargs(self):
    """Verify config-level and call-level sampling kwargs reach SamplingParams."""
    dummy_model = create_dummy_model(
        model_class=llama_lib.Llama3,
        config=llama_lib.ModelConfig.llama3p2_1b(),
        mesh=self.mesh,
        random_seed=3,
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_path)
    templated_inputs = tc.batch_templatize(
        ["Hello, my name is Tom."], tokenizer
    )

    sampler_config = vllm_sampler.VllmConfig(
        mesh=self.mesh,
        hbm_utilization=0.2,
        init_with_random_weights=True,
        tpu_backend_type="jax",
        mapping_config=mappings.MappingConfig.build(dummy_model),
        server_mode=False,
        # Config-level sampling kwargs, expected to land on SamplingParams.
        sampling_kwargs={
            "frequency_penalty": 0.5,
            "presence_penalty": 0.3,
        },
        engine_kwargs={
            "model": self.model_path,
            "max_model_len": 512,
            "enable_prefix_caching": True,
        },
    )

    sampler = vllm_sampler.VllmSampler(
        tokenizer=tokenizer,
        config=sampler_config,
    )
    sampler.load_checkpoint(nnx.state(dummy_model))

    # Wrap llm.generate so we can inspect the SamplingParams it receives
    # while still delegating to the real implementation.
    real_generate = sampler.llm.generate
    seen_params = []

    def spying_generate(prompts, sampling_params, **kwargs):
        seen_params.append(sampling_params)
        return real_generate(prompts, sampling_params, **kwargs)

    sampler.llm.generate = spying_generate

    # Pass one extra sampling kwarg at call time on top of the config ones.
    sampler(
        input_strings=templated_inputs,
        max_generation_steps=128,
        max_prompt_length=None,
        temperature=0.0,
        top_k=1,
        seed=0,
        echo=False,
        pad_output=True,
        min_tokens=10,
    )

    self.assertLen(seen_params, 1)
    params = seen_params[0]
    # Config-level kwargs made it through...
    self.assertEqual(params.frequency_penalty, 0.5)
    self.assertEqual(params.presence_penalty, 0.3)
    # ...and so did the call-level kwarg.
    self.assertEqual(params.min_tokens, 10)
442+
443+
def test_vllm_sampler_sampling_kwargs_override(self):
    """Verify call-level sampling kwargs take precedence over config ones."""
    dummy_model = create_dummy_model(
        model_class=llama_lib.Llama3,
        config=llama_lib.ModelConfig.llama3p2_1b(),
        mesh=self.mesh,
        random_seed=3,
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_path)
    templated_inputs = tc.batch_templatize(
        ["Hello, my name is Tom."], tokenizer
    )

    sampler_config = vllm_sampler.VllmConfig(
        mesh=self.mesh,
        hbm_utilization=0.2,
        init_with_random_weights=True,
        tpu_backend_type="jax",
        mapping_config=mappings.MappingConfig.build(dummy_model),
        server_mode=False,
        # Config sets frequency_penalty=0.5; the call below overrides it.
        sampling_kwargs={
            "frequency_penalty": 0.5,
            "presence_penalty": 0.3,
        },
        engine_kwargs={
            "model": self.model_path,
            "max_model_len": 512,
            "enable_prefix_caching": True,
        },
    )

    sampler = vllm_sampler.VllmSampler(
        tokenizer=tokenizer,
        config=sampler_config,
    )
    sampler.load_checkpoint(nnx.state(dummy_model))

    # Wrap llm.generate so we can inspect the SamplingParams it receives
    # while still delegating to the real implementation.
    real_generate = sampler.llm.generate
    seen_params = []

    def spying_generate(prompts, sampling_params, **kwargs):
        seen_params.append(sampling_params)
        return real_generate(prompts, sampling_params, **kwargs)

    sampler.llm.generate = spying_generate

    # Call-level kwarg overrides the config value (0.5 -> 0.8).
    sampler(
        input_strings=templated_inputs,
        max_generation_steps=128,
        max_prompt_length=None,
        temperature=0.0,
        top_k=1,
        seed=0,
        echo=False,
        pad_output=True,
        frequency_penalty=0.8,
    )

    self.assertLen(seen_params, 1)
    params = seen_params[0]
    # The call-level kwarg won over the config value...
    self.assertEqual(params.frequency_penalty, 0.8)
    # ...while untouched config kwargs still apply.
    self.assertEqual(params.presence_penalty, 0.3)
360524

361525
if __name__ == "__main__":
362526
absltest.main()

tunix/generate/vllm_sampler.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ class VllmConfig:
7171
init=False, default_factory=dict
7272
)
7373

74+
# vLLM sampling kwargs forwarded verbatim to SamplingParams without
# additional processing (e.g., temperature, stop, penalties).
75+
sampling_kwargs: Dict[str, Any] = dataclasses.field(default_factory=dict)
76+
7477
def __post_init__(self, engine_kwargs: Optional[Dict[str, Any]]):
7578
engine_kwargs = engine_kwargs or {}
7679
self._processed_engine_kwargs = engine_kwargs
@@ -418,20 +421,25 @@ def __call__(
418421
if seed is not None:
419422
sampling_params.seed = seed
420423

421-
if kwargs:
424+
self.config.sampling_kwargs.update(kwargs)
425+
if self.config.sampling_kwargs:
422426
try:
423-
sampling_params.update(**kwargs)
424427
logging.log_first_n(
425428
logging.INFO,
426429
"Received additional kwargs that are not explicitly defined in"
427-
f" the method signature: {kwargs}. These will be forwarded to the"
430+
f" the method signature: {self.config.sampling_kwargs}. These will be forwarded to the"
428431
" underlying sampler, but please ensure that they are valid.",
429432
1,
430-
)
431-
except Exception as e:
433+
)
434+
for key, value in self.config.sampling_kwargs.items():
435+
logging.debug(
436+
"Sampler kwargs setting key '%s' with value '%s'.", key, value
437+
)
438+
setattr(sampling_params, key, value)
439+
except (AttributeError, TypeError) as e:
432440
logging.log_first_n(
433441
logging.INFO,
434-
f"Failed to update sampling_params with kwargs: {kwargs}."
442+
f"Failed to update sampling_params with kwargs: {self.config.sampling_kwargs}."
435443
f" Error: {e}",
436444
1,
437445
)

tunix/models/dummy_model_creator.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,9 @@ def create_dummy_model(
6666

6767
@partial(nnx.jit, static_argnums=(2, 3,))
def make_param(rngs, scale, shape, dt):
    """Sample a scaled standard-normal tensor of the given shape and dtype.

    Calling the `params` stream yields a fresh JAX PRNG key per invocation,
    which is then passed straight to `jax.random.normal`.
    """
    prng_key = rngs.params()
    return jax.random.normal(prng_key, shape, dtype=dt) * scale
7072

7173
def make_random_tensor(path, param, shard=None):
7274
shape = param.shape

tunix/rl/rollout/base_rollout.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,9 +157,12 @@ class RolloutConfig:
157157
# Maximum number of concurrent sequences allowed to be processed in vLLM.
158158
rollout_vllm_max_num_seqs: Optional[int] = None
159159

160-
# Additional keyword arguments forwarded directly to the vLLM sampler/engine.
160+
# Additional keyword arguments forwarded directly to the vLLM engine constructor.
161161
rollout_vllm_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict)
162162

163+
# Additional keyword arguments forwarded directly to the vLLM sampling params.
164+
rollout_vllm_sampling_kwargs: dict[str, Any] = dataclasses.field(default_factory=dict)
165+
163166
# SG-Lang JAX specific rollout configs.
164167

165168
# Model version for SG-Lang JAX rollout engine.

tunix/rl/rollout/vllm_rollout.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def __init__(
6868
"hf_config_path": rollout_config.rollout_vllm_hf_config_path,
6969
**rollout_config.rollout_vllm_kwargs,
7070
},
71+
sampling_kwargs=rollout_config.rollout_vllm_sampling_kwargs,
7172
),
7273
)
7374
state = nnx.state(model)

0 commit comments

Comments
 (0)