Skip to content

Commit caa3981

Browse files
committed
Bug hunting for risk-score extraction from reasoning models
1 parent 6de2344 commit caa3981

File tree

4 files changed

+489
-115
lines changed

4 files changed

+489
-115
lines changed

folktexts/classifier/transformers_classifier.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from __future__ import annotations
55

6+
import logging
67
from pathlib import Path
78
from typing import Callable
89

@@ -138,11 +139,24 @@ def _query_prompt_risk_estimates_batch(
138139
enable_thinking=question.enable_thinking,
139140
)
140141

141-
# Extract probability from generated text
142-
risk_estimates_batch = [
143-
question.get_answer_from_model_output(generated_text)
144-
for generated_text in generated_texts
145-
]
142+
# Extract probability from generated text and log each sample
143+
risk_estimates_batch = []
144+
for idx, (prompt, generated_text) in enumerate(zip(prompts_batch, generated_texts)):
145+
risk_estimate = question.get_answer_from_model_output(generated_text)
146+
risk_estimates_batch.append(risk_estimate)
147+
148+
# Log prompt, generated answer, and extracted risk score at INFO level
149+
logging.info(
150+
f"\n{'='*60}\n"
151+
f"[ReasoningQA Sample {idx + 1}/{len(prompts_batch)}]\n"
152+
f"{'='*60}\n"
153+
f"PROMPT:\n{prompt}\n"
154+
f"{'-'*60}\n"
155+
f"GENERATED ANSWER:\n{generated_text}\n"
156+
f"{'-'*60}\n"
157+
f"EXTRACTED RISK SCORE: {risk_estimate:.4f}\n"
158+
f"{'='*60}"
159+
)
146160

147161
return risk_estimates_batch
148162

folktexts/llm_utils.py

Lines changed: 59 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -173,10 +173,11 @@ def generate_text_batch(
173173
The maximum context size for input tokens. If None, no truncation
174174
is applied to inputs.
175175
enable_thinking : bool, optional
176-
Whether to enable thinking mode for models that support it (e.g., Qwen3).
177-
When True, uses `tokenizer.apply_chat_template` with `enable_thinking=True`.
178-
When False, explicitly disables thinking mode. When None (default),
179-
does not apply chat template formatting.
176+
Controls chat template application and thinking mode:
177+
- None: Do not apply chat template (use raw prompts, for base models)
178+
- False: Apply chat template WITHOUT thinking mode (for instruction-tuned models)
179+
- True: Apply chat template WITH thinking mode, and extract response
180+
content after </think> marker (for thinking models like Qwen3)
180181
181182
Returns
182183
-------
@@ -192,7 +193,10 @@ def generate_text_batch(
192193
tokenizer.padding_side = "left"
193194

194195
try:
195-
# Apply chat template if enable_thinking is specified
196+
# Apply chat template when enable_thinking is not None
197+
# - enable_thinking=True: apply with thinking enabled
198+
# - enable_thinking=False: apply without thinking (standard chat format)
199+
# - enable_thinking=None: skip chat template (raw prompts for base models)
196200
if enable_thinking is not None:
197201
processed_inputs = []
198202
for text in text_inputs:
@@ -208,17 +212,20 @@ def generate_text_batch(
208212
processed_inputs.append(formatted_text)
209213
except TypeError:
210214
# Tokenizer doesn't support enable_thinking parameter
211-
logging.warning(
212-
"Tokenizer does not support 'enable_thinking' parameter. "
213-
"Falling back to standard chat template."
214-
)
215+
# This is expected for non-Qwen models
216+
if enable_thinking:
217+
logging.warning(
218+
"Tokenizer does not support 'enable_thinking' parameter. "
219+
"Falling back to standard chat template."
220+
)
215221
formatted_text = tokenizer.apply_chat_template(
216222
messages,
217223
tokenize=False,
218224
add_generation_prompt=True,
219225
)
220226
processed_inputs.append(formatted_text)
221227
text_inputs = processed_inputs
228+
logging.debug(f"Applied chat template (enable_thinking={enable_thinking})")
222229

223230
# Tokenize inputs with left-padding for generation
224231
tokenized = tokenizer(
@@ -249,40 +256,55 @@ def generate_text_batch(
249256
generated_texts = []
250257
for i, output in enumerate(outputs):
251258
# Extract only the newly generated tokens (after the padded input)
252-
generated_tokens = output[input_seq_length:].tolist()
259+
generated_tokens = output[input_seq_length:]
253260

254-
# If thinking mode was enabled, separate thinking content from response content
255-
# The </think> token (ID 151668) marks the end of thinking content
256-
if enable_thinking:
257-
thinking_end_token_id = 151668 # </think> token ID for Qwen models
258-
try:
259-
# Find the </think> token from the end (in case there are multiple)
260-
index = len(generated_tokens) - generated_tokens[::-1].index(thinking_end_token_id)
261-
# Only decode content after </think>
262-
content_tokens = generated_tokens[index:]
263-
thinking_tokens = generated_tokens[:index]
261+
# Decode the full generated text
262+
full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
264263

265-
thinking_content = tokenizer.decode(thinking_tokens, skip_special_tokens=True).strip("\n")
266-
content = tokenizer.decode(content_tokens, skip_special_tokens=True).strip("\n")
264+
# If thinking mode was enabled, separate thinking content from response
265+
if enable_thinking is True:
266+
# Use string-based detection for </think> separator
267+
# This is more robust than relying on hardcoded token IDs
268+
think_end_marker = "</think>"
267269

268-
# Log all decoded tokens at debug level
270+
if think_end_marker in full_generated_text:
271+
# Split on </think> and take only the response content
272+
# Thinking content is logged but IGNORED for probability extraction
273+
parts = full_generated_text.split(think_end_marker, 1)
274+
thinking_content = parts[0].strip()
275+
response_content = parts[1].strip() if len(parts) > 1 else ""
276+
277+
# Log thinking content for debugging (but don't use it for extraction)
269278
logging.debug(f"=== Generated output {i+1}/{len(outputs)} ===")
270-
logging.debug(f"Thinking content ({len(thinking_content)} chars):\n{thinking_content}")
271-
logging.debug(f"Response content ({len(content)} chars):\n{content}")
272-
273-
generated_texts.append(content)
274-
except ValueError:
275-
# </think> token not found - decode entire output
276-
logging.warning("</think> token not found in output. Using full generated text.")
277-
generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
278-
logging.debug(f"=== Generated output {i+1}/{len(outputs)} (no thinking separation) ===")
279-
logging.debug(f"Full content ({len(generated_text)} chars):\n{generated_text}")
280-
generated_texts.append(generated_text)
279+
logging.debug(f"Thinking content ({len(thinking_content)} chars) [IGNORED for extraction]:")
280+
logging.debug(f"{thinking_content[:500]}..." if len(thinking_content) > 500 else thinking_content)
281+
logging.debug(f"Response content ({len(response_content)} chars) [USED for extraction]:")
282+
logging.debug(response_content)
283+
284+
# Always use response content only - thinking content is ignored
285+
if response_content:
286+
generated_texts.append(response_content)
287+
else:
288+
# Response content is empty - this is a problem
289+
logging.warning(
290+
"Response content after </think> is empty. "
291+
"Model may not have generated a proper response. "
292+
"Probability extraction will likely fail."
293+
)
294+
generated_texts.append("")
295+
else:
296+
# </think> marker not found - use full text
297+
# This can happen if the model doesn't actually use thinking format
298+
logging.warning(
299+
f"</think> marker not found in output (thinking mode was enabled). "
300+
f"Using full generated text ({len(full_generated_text)} chars)."
301+
)
302+
generated_texts.append(full_generated_text.strip())
281303
else:
282-
generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
304+
# Non-thinking mode: use full generated text
283305
logging.debug(f"=== Generated output {i+1}/{len(outputs)} ===")
284-
logging.debug(f"Content ({len(generated_text)} chars):\n{generated_text}")
285-
generated_texts.append(generated_text)
306+
logging.debug(f"Content ({len(full_generated_text)} chars):\n{full_generated_text[:500]}...")
307+
generated_texts.append(full_generated_text.strip())
286308

287309
return generated_texts
288310

folktexts/qa_interface.py

Lines changed: 53 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -427,18 +427,18 @@ def get_answer_from_model_output(
427427

428428
# Regex patterns for extracting probability from generated text
429429
# Matches formats like: "Probability: 80%", "Probability: 0.80", "probability: 80 percent"
430+
# Patterns are ordered by specificity - more specific patterns first
430431
_PROBABILITY_PATTERNS = [
431-
# Match "Probability: X%" or "probability: X%"
432-
r"[Pp]robability:\s*(\d+(?:\.\d+)?)\s*%",
433-
# Match "Probability: 0.XX" or "probability: 0.XX"
434-
r"[Pp]robability:\s*(0?\.\d+)",
435-
# Match "Probability: X" where X is a whole number (interpreted as percentage)
436-
r"[Pp]robability:\s*(\d+)\s*(?:percent|%)",
437-
# Match standalone percentage at end of text: "... 80%" or "...0.80"
438-
r"(\d+(?:\.\d+)?)\s*%\s*$",
439-
r"(0?\.\d+)\s*$",
432+
# Match "Probability: X%" or "probability: X%" (with optional "is", "of", etc.)
433+
r"[Pp]robability(?:\s+(?:is|of|estimate)?)?[:\s]+(\d+(?:\.\d+)?)\s*%",
434+
# Match "Probability: 0.XX" or "probability: 0.XX" or "Probability: 1.0"
435+
r"[Pp]robability(?:\s+(?:is|of|estimate)?)?[:\s]+(\d*\.?\d+)(?![%\d])",
436+
# Match "X%" anywhere in text (prefer later matches in fallback)
437+
r"(\d+(?:\.\d+)?)\s*%",
440438
# Match "X percent" pattern
441439
r"(\d+(?:\.\d+)?)\s+percent",
440+
# Match standalone decimal (0.XX or .XX) that looks like probability
441+
r"(?<![.\d])(0?\.\d+)(?![.\d])",
442442
]
443443

444444

@@ -514,11 +514,25 @@ def extract_probability_from_text(generated_text: str) -> float | None:
514514
probability : float | None
515515
The extracted probability as a float between 0 and 1, or None if
516516
no valid probability was found.
517+
518+
Notes
519+
-----
520+
The extraction prioritizes:
521+
1. Explicit "Probability: X%" format (most reliable)
522+
2. Last percentage or probability value in text (likely the conclusion)
523+
3. Fallback to any decimal that looks like a probability
517524
"""
518-
# Try each pattern in order of specificity
519-
for pattern in _PROBABILITY_PATTERNS:
520-
match = re.search(pattern, generated_text)
521-
if match:
525+
# First, try to find explicit "Probability: X" format (most reliable)
526+
explicit_patterns = [
527+
r"[Pp]robability(?:\s+(?:is|of|estimate)?)?[:\s]+(\d+(?:\.\d+)?)\s*%",
528+
r"[Pp]robability(?:\s+(?:is|of|estimate)?)?[:\s]+(\d*\.?\d+)(?![%\d])",
529+
]
530+
531+
for pattern in explicit_patterns:
532+
# Find ALL matches and use the LAST one (likely the final answer)
533+
matches = list(re.finditer(pattern, generated_text))
534+
if matches:
535+
match = matches[-1] # Use last match
522536
value = float(match.group(1))
523537

524538
# Convert percentage to probability if > 1
@@ -532,24 +546,38 @@ def extract_probability_from_text(generated_text: str) -> float | None:
532546
else:
533547
logging.warning(f"Extracted value {value} is out of range [0, 1]")
534548

535-
# Fallback: try to find any number that could be a probability
536-
# Look for decimal numbers between 0 and 1
537-
decimal_matches = re.findall(r"0?\.\d+", generated_text)
538-
for match in reversed(decimal_matches): # Prefer later matches (likely the conclusion)
539-
value = float(match)
549+
# Second, look for percentage patterns (prefer last occurrence)
550+
percent_matches = re.findall(r"(\d+(?:\.\d+)?)\s*%", generated_text)
551+
if percent_matches:
552+
# Use the last percentage found (likely the final answer)
553+
value = float(percent_matches[-1]) / 100.0
540554
if 0 <= value <= 1:
541-
logging.warning(f"Using fallback decimal extraction: {value:.2%}")
555+
logging.debug(f"Using fallback percentage extraction: {value:.2%}")
542556
return value
543557

544-
# Look for percentages
545-
percent_matches = re.findall(r"(\d+(?:\.\d+)?)\s*%", generated_text)
546-
for match in reversed(percent_matches):
547-
value = float(match) / 100.0
558+
# Third, look for "X percent" pattern
559+
percent_word_matches = re.findall(r"(\d+(?:\.\d+)?)\s+percent", generated_text, re.IGNORECASE)
560+
if percent_word_matches:
561+
value = float(percent_word_matches[-1]) / 100.0
548562
if 0 <= value <= 1:
549-
logging.warning(f"Using fallback percentage extraction: {value:.2%}")
563+
logging.debug(f"Using fallback 'X percent' extraction: {value:.2%}")
550564
return value
551565

552-
logging.error(f"Could not extract probability from text: {generated_text[:200]}...")
566+
# Fourth, try to find decimal numbers between 0 and 1
567+
decimal_matches = re.findall(r"(?<![.\d])(0?\.\d+)(?![.\d])", generated_text)
568+
if decimal_matches:
569+
# Use the last decimal found
570+
value = float(decimal_matches[-1])
571+
if 0 <= value <= 1:
572+
logging.warning(f"Using fallback decimal extraction: {value:.2%}")
573+
return value
574+
575+
# Log a detailed error message for debugging
576+
if len(generated_text) > 500:
577+
snippet = generated_text[:250] + "..." + generated_text[-250:]
578+
else:
579+
snippet = generated_text
580+
logging.error(f"Could not extract probability from text:\n{snippet}")
553581
return None
554582

555583
def get_answer_from_model_output(

0 commit comments

Comments (0)