feat: support logit bias in chat request #3186

Open · wants to merge 14 commits into base: main
26 changes: 0 additions & 26 deletions .github/workflows/client-tests.yaml

This file was deleted.

2 changes: 2 additions & 0 deletions backends/client/src/v3/client.rs
@@ -7,6 +7,7 @@ use grpc_metadata::InjectTelemetryContext;
use pb::generate::v3::text_generation_service_client::TextGenerationServiceClient;
use pb::generate::v3::*;
use std::cmp::min;
use std::collections::HashMap;
use std::time::Duration;
use tonic::transport::{Channel, Uri};
use tracing::instrument;
@@ -181,6 +182,7 @@ impl Client {
watermark: true,
grammar: String::new(),
grammar_type: GrammarType::None as i32,
logit_bias: HashMap::new(),
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens,
2 changes: 2 additions & 0 deletions backends/client/src/v3/sharded_client.rs
@@ -5,6 +5,7 @@ use crate::{ClientError, Result};
use crate::v3::{Chunk, InfoResponse, Input};
use async_trait::async_trait;
use futures::future::join_all;
use std::collections::HashMap;
use tonic::transport::Uri;
use tracing::instrument;
use v3::client::{DecodeTimings, PrefillTimings};
@@ -244,6 +245,7 @@ impl Health for ShardedClient {
watermark: false,
grammar: String::new(),
grammar_type: GrammarType::None as i32,
logit_bias: HashMap::new(),
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: 1,
1 change: 1 addition & 0 deletions backends/v2/src/queue.rs
@@ -429,6 +429,7 @@ mod tests {
frequency_penalty: 0.0,
watermark: false,
grammar: None,
logit_bias: None,
},
stopping_parameters: ValidStoppingParameters {
ignore_eos_token: false,
2 changes: 2 additions & 0 deletions backends/v3/src/client/grpc_client.rs
@@ -7,6 +7,7 @@ use grpc_metadata::InjectTelemetryContext;
use pb::generate::v3::text_generation_service_client::TextGenerationServiceClient;
use pb::generate::v3::*;
use std::cmp::min;
use std::collections::HashMap;
use std::time::Duration;
use tonic::transport::{Channel, Uri};
use tracing::instrument;
@@ -181,6 +182,7 @@ impl Client {
watermark: true,
grammar: String::new(),
grammar_type: GrammarType::None as i32,
logit_bias: HashMap::new(),
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens,
2 changes: 2 additions & 0 deletions backends/v3/src/client/sharded_client.rs
@@ -10,6 +10,7 @@ use crate::client::{
use crate::client::{Chunk, InfoResponse, Input};
use async_trait::async_trait;
use futures::future::join_all;
use std::collections::HashMap;
use tonic::transport::Uri;
use tracing::instrument;

@@ -232,6 +233,7 @@ impl Health for ShardedClient {
watermark: false,
grammar: String::new(),
grammar_type: GrammarType::None as i32,
logit_bias: HashMap::new(),
}),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: 1,
10 changes: 10 additions & 0 deletions backends/v3/src/queue.rs
@@ -5,6 +5,7 @@ use crate::client::{
};
use nohash_hasher::{BuildNoHashHasher, IntMap};
use std::cmp::max;
use std::collections::HashMap;
use std::collections::VecDeque;
use text_generation_router::infer::InferError;
use text_generation_router::infer::InferStreamResponse;
@@ -522,6 +523,14 @@ impl From<ValidParameters> for NextTokenChooserParameters {
watermark: value.watermark,
grammar,
grammar_type: grammar_type.into(),
logit_bias: value
.logit_bias
.map(|bias| {
bias.into_iter()
.map(|(token, bias)| (token.to_string(), bias as i32))
.collect::<HashMap<String, i32>>()
})
.unwrap_or_default(),
}
}
}
@@ -568,6 +577,7 @@ mod tests {
frequency_penalty: 0.0,
watermark: false,
grammar: None,
logit_bias: None,
},
stopping_parameters: ValidStoppingParameters {
ignore_eos_token: false,
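The conversion above stringifies each token ID and truncates the float bias to an `i32` (Rust's `as i32` truncates toward zero), falling back to an empty map when no bias was supplied. A rough Python mirror of that transformation, purely illustrative and assuming the validated map carries integer token IDs and float biases as the casts suggest:

```python
# Illustrative Python mirror of the Rust conversion above; not code from this PR.
from typing import Dict, Optional


def to_proto_logit_bias(logit_bias: Optional[Dict[int, float]]) -> Dict[str, int]:
    """Stringify token IDs and truncate biases toward zero, like `as i32`."""
    if logit_bias is None:
        return {}
    return {str(token): int(bias) for token, bias in logit_bias.items()}
```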
1 change: 1 addition & 0 deletions benchmark/src/lib.rs
@@ -47,6 +47,7 @@ pub async fn run(
watermark,
grammar: String::new(),
grammar_type: GrammarType::None as i32,
logit_bias: std::collections::HashMap::new(),
};

// Initialize terminal properties
4 changes: 2 additions & 2 deletions clients/python/text_generation/types.py
@@ -1,6 +1,6 @@
from enum import Enum
from pydantic import BaseModel, field_validator, ConfigDict
-from typing import Optional, List, Union, Any
+from typing import Optional, List, Union, Any, Dict

from text_generation.errors import ValidationError

@@ -137,7 +137,7 @@ class ChatRequest(BaseModel):
# decreasing the model's likelihood to repeat the same line verbatim.
frequency_penalty: Optional[float] = None
# Bias values for token selection
-logit_bias: Optional[List[float]] = None
+logit_bias: Optional[Dict[str, int]] = None
# Whether to return log probabilities
logprobs: Optional[bool] = None
# Number of most likely tokens to return at each position
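With the client type changed from a float list to a token-ID-to-bias map, a request can bias specific tokens directly. A minimal sketch against TGI's OpenAI-compatible chat route; the endpoint URL and token IDs below are placeholders, not values taken from this PR:

```python
# Hypothetical usage sketch; the URL and token IDs are placeholders.
import requests

response = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "tgi",  # placeholder; a single-model server ignores this (assumption)
        "messages": [{"role": "user", "content": "Say hello"}],
        "max_tokens": 10,
        # Token IDs (as strings) mapped to integer biases in [-100, 100]:
        # +100 effectively forces a token, -100 effectively bans it.
        "logit_bias": {"1923": 100, "1924": -100},
    },
)
print(response.json()["choices"][0]["message"]["content"])
```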
21 changes: 16 additions & 5 deletions docs/openapi.json
@@ -995,12 +995,12 @@
"nullable": true
},
"logit_bias": {
"type": "array",
"items": {
"type": "number",
"format": "float"
"type": "object",
"description": "Modify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens\n(specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,\nthe bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,\nbut values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should\nresult in a ban or exclusive selection of the relevant token.",
"additionalProperties": {
"type": "integer",
"format": "int32"
},
"description": "UNUSED\nModify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens\n(specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,\nthe bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,\nbut values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should\nresult in a ban or exclusive selection of the relevant token.",
"nullable": true
},
"logprobs": {
@@ -1589,6 +1589,17 @@
"default": "null",
"nullable": true
},
"logit_bias": {
"type": "object",
"description": "Modify the likelihood of specified tokens appearing in the completion.\nAccepts a hash map that maps token strings to an associated bias value.",
"default": "null",
"additionalProperties": {
"type": "integer",
"format": "int32"
},
"example": "{\"1923\": 100, \"1924\": -100}",
"nullable": true
},
"max_new_tokens": {
"type": "integer",
"format": "int32",
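The schema description above pins down the semantics: the bias is added to the model's logits before sampling. A minimal sketch of that arithmetic, illustrative only and not the server-side implementation from this PR:

```python
# Illustrative only: apply a logit_bias map to raw logits before sampling.
from typing import Dict, List


def apply_logit_bias(logits: List[float], logit_bias: Dict[str, int]) -> List[float]:
    """Add each bias to the logit of its token ID.

    Keys are token IDs serialized as strings, matching the schema above;
    values lie in [-100, 100].
    """
    biased = list(logits)
    for token_id, bias in logit_bias.items():
        biased[int(token_id)] += bias
    return biased


# Example: +100 effectively forces token 2; -100 effectively bans token 0.
print(apply_logit_bias([0.1, 0.2, 0.3], {"2": 100, "0": -100}))
```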
@@ -0,0 +1,26 @@
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "Hello! How can I help you today?",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1745337495,
"id": "",
"model": "Qwen/Qwen2-VL-2B-Instruct",
"object": "chat.completion",
"system_fingerprint": "3.2.3-dev0-native",
"usage": {
"completion_tokens": 10,
"prompt_tokens": 21,
"total_tokens": 31
}
}
@@ -0,0 +1,26 @@
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"message": {
"content": "¡Hola! ¿Cómo puedo ayudarte?",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1746486174,
"id": "",
"model": "Qwen/Qwen2-VL-2B-Instruct",
"object": "chat.completion",
"system_fingerprint": "3.2.3-dev0-native",
"usage": {
"completion_tokens": 10,
"prompt_tokens": 21,
"total_tokens": 31
}
}
@@ -0,0 +1,26 @@
{
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "Chat!",
"name": null,
"role": "assistant",
"tool_calls": null
},
"usage": null
}
],
"created": 1746486174,
"id": "",
"model": "Qwen/Qwen2-VL-2B-Instruct",
"object": "chat.completion",
"system_fingerprint": "3.2.3-dev0-native",
"usage": {
"completion_tokens": 3,
"prompt_tokens": 25,
"total_tokens": 28
}
}
@@ -0,0 +1,20 @@
{
"choices": [
{
"delta": {
"content": "",
"role": "assistant",
"tool_calls": null
},
"finish_reason": "length",
"index": 0,
"logprobs": null
}
],
"created": 1746486174,
"id": "",
"model": "Qwen/Qwen2-VL-2B-Instruct",
"object": "chat.completion.chunk",
"system_fingerprint": "3.2.3-dev0-native",
"usage": null
}