fix: make post-check scalable with word-boundary matching and locale awareness

antoniomtz · claude · antoniomtz · commit 189914ac21b8 · 2026-04-14T11:12:09.000-07:00
The programmatic post-check had two scalability issues:

1. Substring false positives: "at" matched inside "Nature", "10" matched
   inside "100". Fix: use regex word tokenization instead of the substring
   `in` operator. Both the user title and the enhanced title are tokenized
   into words, and each user word is checked for membership in the set of
   enhanced-title words.

2. Localization breakage: the original English user title was prepended to
   translated titles (e.g. "healthcare Aceite de Pescado..."). Fix: only
   apply the post-check for English locales (locale.startswith("en")). For
   non-English locales, the LLM translates user intent into the target
   language, so English word matching doesn't apply.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/src/backend/vlm.py b/src/backend/vlm.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import os
+import re
 import json
 import base64
 import logging
@@ -343,12 +344,15 @@ def _call_nemotron_enhance(
     enhanced = _call_nemotron_enhance_vlm(vlm_output, filtered_product_data, locale)
     logger.info("Step 1 complete (enhanced + localized to %s): enhanced_keys=%s", locale, list(enhanced.keys()))
 
-    # Post-check: guarantee user-provided title words survive the LLM pipeline
-    # Uses the ORIGINAL product_data (not filtered) — the user typed it, we keep it.
-    if product_data and product_data.get("title") and enhanced.get("title"):
+    # Post-check: guarantee user-provided title words survive the LLM pipeline.
+    # Only applies for English locales — for non-English, the LLM translates the
+    # user's words into the target language, so English word matching doesn't apply.
+    if product_data and product_data.get("title") and enhanced.get("title") and locale.startswith("en"):
         user_title = product_data["title"]
-        enhanced_lower = enhanced["title"].lower()
-        missing = [w for w in user_title.split() if w.lower() not in enhanced_lower]
+        # Word-boundary matching: tokenize both into word sets to avoid substring false positives
+        enhanced_words = set(re.findall(r'[a-zA-Z0-9]+(?:[-\'][a-zA-Z0-9]+)*', enhanced["title"].lower()))
+        user_words = re.findall(r'[a-zA-Z0-9]+(?:[-\'][a-zA-Z0-9]+)*', user_title.lower())
+        missing = [w for w in user_words if w not in enhanced_words]
         if missing:
             logger.info("Post-check: user words %s missing from title, prepending original user title", missing)
             enhanced["title"] = user_title + " " + enhanced["title"]