fix(detection): fix PR #55 issues — ReDoS, double-attribution, false positives, test gaps

Zie619 · claude · Zie619 · commit 209001c0befb · 2026-02-26T08:40:32.000+02:00
- Fix ReDoS in Qwen regex: replace nested quantifier with safe `qwen[\d.]*(?:-\w+)*`
- Fix re.IGNORECASE being silently ignored in endpoint_db.py (it was passed as the positional `pos` argument of Pattern.search(), not as a flag)
- Fix DeepSeek/OpenAI double-attribution: add overlapping match-span dedup in detect_api_key
- Remove bare "grok" and "qwen" from model registry (false positives via prefix match)
- Add word boundary to o[13] model pattern to prevent partial matches
- Remove non-existent "deepseek" PyPI package from KNOWN_AI_PACKAGES
- Remove dead seen_components parameter from code_scanner.py
- Revert unauthorized ci.yml threshold change (drop `--fail-on critical` from the AI-BOM self-scan step)
- Remove docs/guides/n8n-quickstart.md (per review, unrelated to PR scope)
- Add 15 new tests for xAI, DeepSeek, Qwen detection + dedup + case-insensitive endpoints

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -102,7 +102,7 @@ jobs:
 
       - name: Run AI-BOM self-scan (SARIF)
         run: |
-          ai-bom scan . --fail-on critical --format sarif -o ai-bom-results.sarif --quiet || true
+          ai-bom scan . --format sarif -o ai-bom-results.sarif --quiet || true
 
       - name: Upload SARIF results
         if: github.event_name == 'push'
diff --git a/docs/guides/n8n-quickstart.md b/docs/guides/n8n-quickstart.md
diff --git a/src/ai_bom/config.py b/src/ai_bom/config.py
@@ -77,8 +77,6 @@
     # Google Agent Development Kit
     "google.adk": ("Google", "agent"),
     "google-adk": ("Google", "agent"),
-    # DeepSeek
-    "deepseek": ("DeepSeek", "completion"),
     # AWS Bedrock
     "boto3": ("AWS", "completion"),  # Special: requires additional bedrock service check
     # LiteLLM
@@ -167,7 +165,7 @@
 
 KNOWN_MODEL_PATTERNS: list[tuple[Pattern[str], str]] = [
     # OpenAI GPT models
-    (re.compile(r"gpt-4[o]?(-\w+)*"), "OpenAI"),
+    (re.compile(r"gpt-4o?(-\w+)*"), "OpenAI"),
     (re.compile(r"gpt-3\.5(-\w+)*"), "OpenAI"),
     (re.compile(r"text-davinci-\d+"), "OpenAI"),
     (re.compile(r"text-curie-\d+"), "OpenAI"),
@@ -202,7 +200,7 @@
     (re.compile(r"phi-\d+(-\w+)*"), "Microsoft"),
     # OpenAI latest
     (re.compile(r"gpt-4\.5(-\w+)*"), "OpenAI"),
-    (re.compile(r"o[13](-\w+)*"), "OpenAI"),
+    (re.compile(r"\bo[13](-\w+)*\b"), "OpenAI"),
     # Anthropic Claude 4.x
     (re.compile(r"claude-4(-\w+)*"), "Anthropic"),
     (re.compile(r"claude-4\.5(-\w+)*"), "Anthropic"),
@@ -216,7 +214,7 @@
     # DeepSeek
     (re.compile(r"deepseek-\w+(-\w+)*"), "DeepSeek"),
     # Alibaba Qwen
-    (re.compile(r"qwen(?:\d+(?:\.\d+)?)*(?:-\w+)*"), "Alibaba"),
+    (re.compile(r"qwen[\d.]*(?:-\w+)*"), "Alibaba"),
     # xAI Grok
     (re.compile(r"grok-(?:\d+(?:\.\d+)?|\w+)(?:-\w+)*"), "xAI"),
 ]
diff --git a/src/ai_bom/detectors/endpoint_db.py b/src/ai_bom/detectors/endpoint_db.py
@@ -7,8 +7,6 @@
 
 from __future__ import annotations
 
-import re
-
 from ai_bom.config import API_KEY_PATTERNS, KNOWN_AI_ENDPOINTS
 
 
@@ -35,8 +33,9 @@ def match_endpoint(url: str) -> tuple[str, str] | None:
         >>> match_endpoint("https://example.com/api")
         None
     """
+    url_lower = url.lower()
     for pattern, provider, usage_type in KNOWN_AI_ENDPOINTS:
-        if pattern.search(url, re.IGNORECASE):
+        if pattern.search(url_lower):
             return (provider, usage_type)
     return None
 
@@ -66,11 +65,26 @@ def detect_api_key(text: str) -> list[tuple[str, str, str]]:
         [("sk-ant-t...st123", "Anthropic", "sk-ant-[a-zA-Z0-9_-]{32,}")]
     """
     results: list[tuple[str, str, str]] = []
+    # Track matched byte ranges to avoid double-attribution
+    # (e.g., DeepSeek sk- key also matching the generic OpenAI sk- pattern)
+    matched_spans: list[tuple[int, int]] = []
 
     for pattern, provider in API_KEY_PATTERNS:
         for match in pattern.finditer(text):
             # Use the first capture group if present, otherwise the full match
-            key = match.group(1) if match.lastindex else match.group(0)
+            if match.lastindex:
+                key = match.group(1)
+                key_start = match.start(1)
+                key_end = match.end(1)
+            else:
+                key = match.group(0)
+                key_start = match.start(0)
+                key_end = match.end(0)
+
+            # Skip if this key's byte range overlaps with an already-matched span
+            if any(key_start < end and key_end > start for start, end in matched_spans):
+                continue
+            matched_spans.append((key_start, key_end))
 
             # Mask the key for security
             if len(key) <= 12:
diff --git a/src/ai_bom/detectors/model_registry.py b/src/ai_bom/detectors/model_registry.py
@@ -59,7 +59,6 @@
     "grok-1.5": {"provider": "xAI", "deprecated": False},
     "grok-1": {"provider": "xAI", "deprecated": False},
     "grok-beta": {"provider": "xAI", "deprecated": False},
-    "grok": {"provider": "xAI", "deprecated": False},
     # Alibaba Models
     "qwen-max": {"provider": "Alibaba", "deprecated": False},
     "qwen-plus": {"provider": "Alibaba", "deprecated": False},
@@ -68,7 +67,6 @@
     "qwen2.5": {"provider": "Alibaba", "deprecated": False},
     "qwen2": {"provider": "Alibaba", "deprecated": False},
     "qwen1.5": {"provider": "Alibaba", "deprecated": False},
-    "qwen": {"provider": "Alibaba", "deprecated": False},
 }
 
 
diff --git a/src/ai_bom/scanners/code_scanner.py b/src/ai_bom/scanners/code_scanner.py
@@ -69,10 +69,6 @@ def scan(self, path: Path) -> list[AIComponent]:
         """
         components: list[AIComponent] = []
 
-        # Track which files/SDKs we've already created components for
-        # to avoid duplicates: key = (sdk_name, file_path)
-        seen_components: set[tuple[str, str]] = set()
-
         # Handle single file scanning
         scan_dir = path if path.is_dir() else path.parent
 
@@ -111,9 +107,9 @@ def scan(self, path: Path) -> list[AIComponent]:
         # Phase B: Source code scan
         if path.is_file() and path.suffix in SCANNABLE_EXTENSIONS["code"]:
             # Single file mode: scan just this file
-            source_components = self._scan_single_source_file(path, declared_deps, seen_components)
+            source_components = self._scan_single_source_file(path, declared_deps)
         else:
-            source_components = self._scan_source_files(scan_dir, declared_deps, seen_components)
+            source_components = self._scan_source_files(scan_dir, declared_deps)
         components.extend(source_components)
 
         return components
@@ -204,7 +200,6 @@ def _scan_single_source_file(
         self,
         path: Path,
         declared_deps: set[str],
-        seen_components: set[tuple[str, str]],
     ) -> list[AIComponent]:
         """Scan a single source file for AI SDK usage."""
         components: list[AIComponent] = []
@@ -410,14 +405,12 @@ def _scan_source_files(
         self,
         path: Path,
         declared_deps: set[str],
-        seen_components: set[tuple[str, str]],
     ) -> list[AIComponent]:
         """Scan source files for AI SDK usage.
 
         Args:
             path: Root path to scan
             declared_deps: Set of declared dependencies from Phase A
-            seen_components: Set of (sdk_name, file_path) tuples to avoid duplicates
 
         Returns:
             List of detected AI components
diff --git a/tests/test_detection_enhanced.py b/tests/test_detection_enhanced.py
@@ -400,11 +400,15 @@ def test_model_pattern_matches(self, model_name: str, expected_provider: str):
                 break
         assert matched, f"Model '{model_name}' did not match any pattern"
 
-    def test_deepseek_package(self):
-        assert "deepseek" in KNOWN_AI_PACKAGES
-        provider, usage = KNOWN_AI_PACKAGES["deepseek"]
-        assert provider == "DeepSeek"
-        assert usage == "completion"
+    def test_deepseek_model_pattern(self):
+        """DeepSeek detection is via model patterns and API keys, not a fake package name."""
+        from ai_bom.config import KNOWN_MODEL_PATTERNS
+
+        matched = any(
+            pattern.search("deepseek-coder") and provider == "DeepSeek"
+            for pattern, provider in KNOWN_MODEL_PATTERNS
+        )
+        assert matched, "DeepSeek model pattern should match 'deepseek-coder'"
 
 
 class TestDeprecatedModels:
diff --git a/tests/test_detectors/test_patterns.py b/tests/test_detectors/test_patterns.py
@@ -93,6 +93,7 @@ class TestAPIKeyPatterns:
             ("deepseek_api_key = 'sk-abcdefghijklmnopqrstuvwxyz1234'", "DeepSeek"),
             ("sk-ant-abcdefghijklmnopqrstuvwxyz", "Anthropic"),
             ("hf_abcdefghijklmnopqrstuvwxyz", "HuggingFace"),
+            ("xai-abcdefghijklmnopqrstuvwxyz", "xAI"),
         ],
     )
     def test_key_patterns(self, key, provider):
@@ -101,6 +102,13 @@ def test_key_patterns(self, key, provider):
                 return
         pytest.fail(f"No pattern matched {key} for {provider}")
 
+    def test_deepseek_key_no_double_attribution(self):
+        """DeepSeek API keys should not also be attributed to OpenAI."""
+        results = detect_api_key("deepseek_api_key = 'sk-abcdefghijklmnopqrstuvwxyz1234'")
+        providers = [r[1] for r in results]
+        assert "DeepSeek" in providers
+        assert providers.count("OpenAI") == 0, "DeepSeek key should not also match as OpenAI"
+
 
 class TestModelPatterns:
     @pytest.mark.parametrize(
@@ -110,10 +118,60 @@ class TestModelPatterns:
             ("claude-3-opus", "Anthropic"),
             ("gemini-1.5-pro", "Google"),
             ("mistral-large", "Mistral"),
+            ("deepseek-coder", "DeepSeek"),
+            ("grok-2", "xAI"),
+            ("grok-beta", "xAI"),
+            ("qwen2.5-coder", "Alibaba"),
+            ("qwen-max", "Alibaba"),
         ],
     )
     def test_model_patterns(self, model, expected_provider):
         for pattern, provider in KNOWN_MODEL_PATTERNS:
-            if re.match(pattern, model) and provider == expected_provider:
+            if re.search(pattern, model) and provider == expected_provider:
                 return
         pytest.fail(f"No pattern matched {model} for {expected_provider}")
+
+
+class TestEndpointNewProviders:
+    def test_xai_endpoint(self):
+        result = match_endpoint("https://api.x.ai/v1/chat/completions")
+        assert result is not None
+        assert result[0] == "xAI"
+
+    def test_alibaba_endpoint(self):
+        result = match_endpoint("https://dashscope.aliyuncs.com/api/v1/services")
+        assert result is not None
+        assert result[0] == "Alibaba"
+
+    def test_xai_endpoint_uppercase(self):
+        """Ensure case-insensitive matching works for endpoints."""
+        result = match_endpoint("https://API.X.AI/v1/chat")
+        assert result is not None
+        assert result[0] == "xAI"
+
+
+class TestModelRegistryNewProviders:
+    def test_grok_2(self):
+        result = lookup_model("grok-2")
+        assert result is not None
+        assert result["provider"] == "xAI"
+
+    def test_grok_beta(self):
+        result = lookup_model("grok-beta")
+        assert result is not None
+        assert result["provider"] == "xAI"
+
+    def test_qwen_max(self):
+        result = lookup_model("qwen-max")
+        assert result is not None
+        assert result["provider"] == "Alibaba"
+
+    def test_qwen_versioned(self):
+        result = lookup_model("qwen2.5")
+        assert result is not None
+        assert result["provider"] == "Alibaba"
+
+    def test_bare_grok_no_match(self):
+        """Bare 'grok' without version/suffix should not match to avoid false positives."""
+        result = lookup_model("grok")
+        assert result is None