@@ -173,10 +173,11 @@ def generate_text_batch(
173173 The maximum context size for input tokens. If None, no truncation
174174 is applied to inputs.
175175 enable_thinking : bool, optional
176- Whether to enable thinking mode for models that support it (e.g., Qwen3).
177- When True, uses `tokenizer.apply_chat_template` with `enable_thinking=True`.
178- When False, explicitly disables thinking mode. When None (default),
179- does not apply chat template formatting.
176+ Controls chat template application and thinking mode:
177+ - None: Do not apply chat template (use raw prompts, for base models)
178+ - False: Apply chat template WITHOUT thinking mode (for instruction-tuned models)
179+ - True: Apply chat template WITH thinking mode, and extract response
180+ content after </think> marker (for thinking models like Qwen3)
180181
181182 Returns
182183 -------
@@ -192,7 +193,10 @@ def generate_text_batch(
192193 tokenizer .padding_side = "left"
193194
194195 try :
195- # Apply chat template if enable_thinking is specified
196+ # Apply chat template when enable_thinking is not None
197+ # - enable_thinking=True: apply with thinking enabled
198+ # - enable_thinking=False: apply without thinking (standard chat format)
199+ # - enable_thinking=None: skip chat template (raw prompts for base models)
196200 if enable_thinking is not None :
197201 processed_inputs = []
198202 for text in text_inputs :
@@ -208,17 +212,20 @@ def generate_text_batch(
208212 processed_inputs .append (formatted_text )
209213 except TypeError :
210214 # Tokenizer doesn't support enable_thinking parameter
211- logging .warning (
212- "Tokenizer does not support 'enable_thinking' parameter. "
213- "Falling back to standard chat template."
214- )
215+ # This is expected for non-Qwen models
216+ if enable_thinking :
217+ logging .warning (
218+ "Tokenizer does not support 'enable_thinking' parameter. "
219+ "Falling back to standard chat template."
220+ )
215221 formatted_text = tokenizer .apply_chat_template (
216222 messages ,
217223 tokenize = False ,
218224 add_generation_prompt = True ,
219225 )
220226 processed_inputs .append (formatted_text )
221227 text_inputs = processed_inputs
228+ logging .debug (f"Applied chat template (enable_thinking={ enable_thinking } )" )
222229
223230 # Tokenize inputs with left-padding for generation
224231 tokenized = tokenizer (
@@ -249,40 +256,55 @@ def generate_text_batch(
249256 generated_texts = []
250257 for i , output in enumerate (outputs ):
251258 # Extract only the newly generated tokens (after the padded input)
252- generated_tokens = output [input_seq_length :]. tolist ()
259+ generated_tokens = output [input_seq_length :]
253260
254- # If thinking mode was enabled, separate thinking content from response content
255- # The </think> token (ID 151668) marks the end of thinking content
256- if enable_thinking :
257- thinking_end_token_id = 151668 # </think> token ID for Qwen models
258- try :
259- # Find the </think> token from the end (in case there are multiple)
260- index = len (generated_tokens ) - generated_tokens [::- 1 ].index (thinking_end_token_id )
261- # Only decode content after </think>
262- content_tokens = generated_tokens [index :]
263- thinking_tokens = generated_tokens [:index ]
261+ # Decode the full generated text
262+ full_generated_text = tokenizer .decode (generated_tokens , skip_special_tokens = True )
264263
265- thinking_content = tokenizer .decode (thinking_tokens , skip_special_tokens = True ).strip ("\n " )
266- content = tokenizer .decode (content_tokens , skip_special_tokens = True ).strip ("\n " )
264+ # If thinking mode was enabled, separate thinking content from response
265+ if enable_thinking is True :
266+ # Use string-based detection for </think> separator
267+ # This is more robust than relying on hardcoded token IDs
268+ think_end_marker = "</think>"
267269
268- # Log all decoded tokens at debug level
270+ if think_end_marker in full_generated_text :
271+ # Split on </think> and take only the response content
272+ # Thinking content is logged but IGNORED for probability extraction
273+ parts = full_generated_text .split (think_end_marker , 1 )
274+ thinking_content = parts [0 ].strip ()
275+ response_content = parts [1 ].strip () if len (parts ) > 1 else ""
276+
277+ # Log thinking content for debugging (but don't use it for extraction)
269278 logging .debug (f"=== Generated output { i + 1 } /{ len (outputs )} ===" )
270- logging .debug (f"Thinking content ({ len (thinking_content )} chars):\n { thinking_content } " )
271- logging .debug (f"Response content ({ len (content )} chars):\n { content } " )
272-
273- generated_texts .append (content )
274- except ValueError :
275- # </think> token not found - decode entire output
276- logging .warning ("</think> token not found in output. Using full generated text." )
277- generated_text = tokenizer .decode (generated_tokens , skip_special_tokens = True )
278- logging .debug (f"=== Generated output { i + 1 } /{ len (outputs )} (no thinking separation) ===" )
279- logging .debug (f"Full content ({ len (generated_text )} chars):\n { generated_text } " )
280- generated_texts .append (generated_text )
279+ logging .debug (f"Thinking content ({ len (thinking_content )} chars) [IGNORED for extraction]:" )
280+ logging .debug (f"{ thinking_content [:500 ]} ..." if len (thinking_content ) > 500 else thinking_content )
281+ logging .debug (f"Response content ({ len (response_content )} chars) [USED for extraction]:" )
282+ logging .debug (response_content )
283+
284+ # Always use response content only - thinking content is ignored
285+ if response_content :
286+ generated_texts .append (response_content )
287+ else :
288+ # Response content is empty - this is a problem
289+ logging .warning (
290+ "Response content after </think> is empty. "
291+ "Model may not have generated a proper response. "
292+ "Probability extraction will likely fail."
293+ )
294+ generated_texts .append ("" )
295+ else :
296+ # </think> marker not found - use full text
297+ # This can happen if the model doesn't actually use thinking format
298+ logging .warning (
299+ f"</think> marker not found in output (thinking mode was enabled). "
300+ f"Using full generated text ({ len (full_generated_text )} chars)."
301+ )
302+ generated_texts .append (full_generated_text .strip ())
281303 else :
282- generated_text = tokenizer . decode ( generated_tokens , skip_special_tokens = True )
304+ # Non-thinking mode: use full generated text
283305 logging .debug (f"=== Generated output { i + 1 } /{ len (outputs )} ===" )
284- logging .debug (f"Content ({ len (generated_text )} chars):\n { generated_text } " )
285- generated_texts .append (generated_text )
306+ logging .debug (f"Content ({ len (full_generated_text )} chars):\n { full_generated_text [: 500 ] } ... " )
307+ generated_texts .append (full_generated_text . strip () )
286308
287309 return generated_texts
288310
0 commit comments