Skip to content

Commit 209001c

Browse files
Zie619 and claude committed
fix(detection): fix PR #55 issues — ReDoS, double-attribution, false positives, test gaps
- Fix ReDoS in Qwen regex: replace nested quantifier with safe `qwen[\d.]*(?:-\w+)*`
- Fix re.IGNORECASE being silently ignored in endpoint_db.py (it was passed as the positional `pos` argument of the compiled pattern's `search`, not as a flag)
- Fix DeepSeek/OpenAI double-attribution: add byte-range dedup in detect_api_key
- Remove bare "grok" and "qwen" from model registry (false positives via prefix match)
- Add word boundary to o[13] model pattern to prevent partial matches
- Remove non-existent "deepseek" PyPI package from KNOWN_AI_PACKAGES
- Remove dead seen_components parameter from code_scanner.py
- Revert unauthorized ci.yml threshold change from --fail-on critical
- Remove docs/guides/n8n-quickstart.md (per review, unrelated to PR scope)
- Add 15 new tests for xAI, DeepSeek, Qwen detection + dedup + case-insensitive endpoints

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 0cc165f commit 209001c

File tree

8 files changed

+92
-53
lines changed

8 files changed

+92
-53
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ jobs:
102102

103103
- name: Run AI-BOM self-scan (SARIF)
104104
run: |
105-
ai-bom scan . --fail-on critical --format sarif -o ai-bom-results.sarif --quiet || true
105+
ai-bom scan . --format sarif -o ai-bom-results.sarif --quiet || true
106106
107107
- name: Upload SARIF results
108108
if: github.event_name == 'push'

docs/guides/n8n-quickstart.md

Lines changed: 0 additions & 26 deletions
This file was deleted.

src/ai_bom/config.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,6 @@
7777
# Google Agent Development Kit
7878
"google.adk": ("Google", "agent"),
7979
"google-adk": ("Google", "agent"),
80-
# DeepSeek
81-
"deepseek": ("DeepSeek", "completion"),
8280
# AWS Bedrock
8381
"boto3": ("AWS", "completion"), # Special: requires additional bedrock service check
8482
# LiteLLM
@@ -167,7 +165,7 @@
167165

168166
KNOWN_MODEL_PATTERNS: list[tuple[Pattern[str], str]] = [
169167
# OpenAI GPT models
170-
(re.compile(r"gpt-4[o]?(-\w+)*"), "OpenAI"),
168+
(re.compile(r"gpt-4o?(-\w+)*"), "OpenAI"),
171169
(re.compile(r"gpt-3\.5(-\w+)*"), "OpenAI"),
172170
(re.compile(r"text-davinci-\d+"), "OpenAI"),
173171
(re.compile(r"text-curie-\d+"), "OpenAI"),
@@ -202,7 +200,7 @@
202200
(re.compile(r"phi-\d+(-\w+)*"), "Microsoft"),
203201
# OpenAI latest
204202
(re.compile(r"gpt-4\.5(-\w+)*"), "OpenAI"),
205-
(re.compile(r"o[13](-\w+)*"), "OpenAI"),
203+
(re.compile(r"\bo[13](-\w+)*\b"), "OpenAI"),
206204
# Anthropic Claude 4.x
207205
(re.compile(r"claude-4(-\w+)*"), "Anthropic"),
208206
(re.compile(r"claude-4\.5(-\w+)*"), "Anthropic"),
@@ -216,7 +214,7 @@
216214
# DeepSeek
217215
(re.compile(r"deepseek-\w+(-\w+)*"), "DeepSeek"),
218216
# Alibaba Qwen
219-
(re.compile(r"qwen(?:\d+(?:\.\d+)?)*(?:-\w+)*"), "Alibaba"),
217+
(re.compile(r"qwen[\d.]*(?:-\w+)*"), "Alibaba"),
220218
# xAI Grok
221219
(re.compile(r"grok-(?:\d+(?:\.\d+)?|\w+)(?:-\w+)*"), "xAI"),
222220
]

src/ai_bom/detectors/endpoint_db.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77

88
from __future__ import annotations
99

10-
import re
11-
1210
from ai_bom.config import API_KEY_PATTERNS, KNOWN_AI_ENDPOINTS
1311

1412

@@ -35,8 +33,9 @@ def match_endpoint(url: str) -> tuple[str, str] | None:
3533
>>> match_endpoint("https://example.com/api")
3634
None
3735
"""
36+
url_lower = url.lower()
3837
for pattern, provider, usage_type in KNOWN_AI_ENDPOINTS:
39-
if pattern.search(url, re.IGNORECASE):
38+
if pattern.search(url_lower):
4039
return (provider, usage_type)
4140
return None
4241

@@ -66,11 +65,26 @@ def detect_api_key(text: str) -> list[tuple[str, str, str]]:
6665
[("sk-ant-t...st123", "Anthropic", "sk-ant-[a-zA-Z0-9_-]{32,}")]
6766
"""
6867
results: list[tuple[str, str, str]] = []
68+
# Track matched byte ranges to avoid double-attribution
69+
# (e.g., DeepSeek sk- key also matching the generic OpenAI sk- pattern)
70+
matched_spans: list[tuple[int, int]] = []
6971

7072
for pattern, provider in API_KEY_PATTERNS:
7173
for match in pattern.finditer(text):
7274
# Use the first capture group if present, otherwise the full match
73-
key = match.group(1) if match.lastindex else match.group(0)
75+
if match.lastindex:
76+
key = match.group(1)
77+
key_start = match.start(1)
78+
key_end = match.end(1)
79+
else:
80+
key = match.group(0)
81+
key_start = match.start(0)
82+
key_end = match.end(0)
83+
84+
# Skip if this key's byte range overlaps with an already-matched span
85+
if any(key_start < end and key_end > start for start, end in matched_spans):
86+
continue
87+
matched_spans.append((key_start, key_end))
7488

7589
# Mask the key for security
7690
if len(key) <= 12:

src/ai_bom/detectors/model_registry.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,6 @@
5959
"grok-1.5": {"provider": "xAI", "deprecated": False},
6060
"grok-1": {"provider": "xAI", "deprecated": False},
6161
"grok-beta": {"provider": "xAI", "deprecated": False},
62-
"grok": {"provider": "xAI", "deprecated": False},
6362
# Alibaba Models
6463
"qwen-max": {"provider": "Alibaba", "deprecated": False},
6564
"qwen-plus": {"provider": "Alibaba", "deprecated": False},
@@ -68,7 +67,6 @@
6867
"qwen2.5": {"provider": "Alibaba", "deprecated": False},
6968
"qwen2": {"provider": "Alibaba", "deprecated": False},
7069
"qwen1.5": {"provider": "Alibaba", "deprecated": False},
71-
"qwen": {"provider": "Alibaba", "deprecated": False},
7270
}
7371

7472

src/ai_bom/scanners/code_scanner.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,6 @@ def scan(self, path: Path) -> list[AIComponent]:
6969
"""
7070
components: list[AIComponent] = []
7171

72-
# Track which files/SDKs we've already created components for
73-
# to avoid duplicates: key = (sdk_name, file_path)
74-
seen_components: set[tuple[str, str]] = set()
75-
7672
# Handle single file scanning
7773
scan_dir = path if path.is_dir() else path.parent
7874

@@ -111,9 +107,9 @@ def scan(self, path: Path) -> list[AIComponent]:
111107
# Phase B: Source code scan
112108
if path.is_file() and path.suffix in SCANNABLE_EXTENSIONS["code"]:
113109
# Single file mode: scan just this file
114-
source_components = self._scan_single_source_file(path, declared_deps, seen_components)
110+
source_components = self._scan_single_source_file(path, declared_deps)
115111
else:
116-
source_components = self._scan_source_files(scan_dir, declared_deps, seen_components)
112+
source_components = self._scan_source_files(scan_dir, declared_deps)
117113
components.extend(source_components)
118114

119115
return components
@@ -204,7 +200,6 @@ def _scan_single_source_file(
204200
self,
205201
path: Path,
206202
declared_deps: set[str],
207-
seen_components: set[tuple[str, str]],
208203
) -> list[AIComponent]:
209204
"""Scan a single source file for AI SDK usage."""
210205
components: list[AIComponent] = []
@@ -410,14 +405,12 @@ def _scan_source_files(
410405
self,
411406
path: Path,
412407
declared_deps: set[str],
413-
seen_components: set[tuple[str, str]],
414408
) -> list[AIComponent]:
415409
"""Scan source files for AI SDK usage.
416410
417411
Args:
418412
path: Root path to scan
419413
declared_deps: Set of declared dependencies from Phase A
420-
seen_components: Set of (sdk_name, file_path) tuples to avoid duplicates
421414
422415
Returns:
423416
List of detected AI components

tests/test_detection_enhanced.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -400,11 +400,15 @@ def test_model_pattern_matches(self, model_name: str, expected_provider: str):
400400
break
401401
assert matched, f"Model '{model_name}' did not match any pattern"
402402

403-
def test_deepseek_package(self):
404-
assert "deepseek" in KNOWN_AI_PACKAGES
405-
provider, usage = KNOWN_AI_PACKAGES["deepseek"]
406-
assert provider == "DeepSeek"
407-
assert usage == "completion"
403+
def test_deepseek_model_pattern(self):
404+
"""DeepSeek detection is via model patterns and API keys, not a fake package name."""
405+
from ai_bom.config import KNOWN_MODEL_PATTERNS
406+
407+
matched = any(
408+
pattern.search("deepseek-coder") and provider == "DeepSeek"
409+
for pattern, provider in KNOWN_MODEL_PATTERNS
410+
)
411+
assert matched, "DeepSeek model pattern should match 'deepseek-coder'"
408412

409413

410414
class TestDeprecatedModels:

tests/test_detectors/test_patterns.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ class TestAPIKeyPatterns:
9393
("deepseek_api_key = 'sk-abcdefghijklmnopqrstuvwxyz1234'", "DeepSeek"),
9494
("sk-ant-abcdefghijklmnopqrstuvwxyz", "Anthropic"),
9595
("hf_abcdefghijklmnopqrstuvwxyz", "HuggingFace"),
96+
("xai-abcdefghijklmnopqrstuvwxyz", "xAI"),
9697
],
9798
)
9899
def test_key_patterns(self, key, provider):
@@ -101,6 +102,13 @@ def test_key_patterns(self, key, provider):
101102
return
102103
pytest.fail(f"No pattern matched {key} for {provider}")
103104

105+
def test_deepseek_key_no_double_attribution(self):
106+
"""DeepSeek API keys should not also be attributed to OpenAI."""
107+
results = detect_api_key("deepseek_api_key = 'sk-abcdefghijklmnopqrstuvwxyz1234'")
108+
providers = [r[1] for r in results]
109+
assert "DeepSeek" in providers
110+
assert providers.count("OpenAI") == 0, "DeepSeek key should not also match as OpenAI"
111+
104112

105113
class TestModelPatterns:
106114
@pytest.mark.parametrize(
@@ -110,10 +118,60 @@ class TestModelPatterns:
110118
("claude-3-opus", "Anthropic"),
111119
("gemini-1.5-pro", "Google"),
112120
("mistral-large", "Mistral"),
121+
("deepseek-coder", "DeepSeek"),
122+
("grok-2", "xAI"),
123+
("grok-beta", "xAI"),
124+
("qwen2.5-coder", "Alibaba"),
125+
("qwen-max", "Alibaba"),
113126
],
114127
)
115128
def test_model_patterns(self, model, expected_provider):
116129
for pattern, provider in KNOWN_MODEL_PATTERNS:
117-
if re.match(pattern, model) and provider == expected_provider:
130+
if re.search(pattern, model) and provider == expected_provider:
118131
return
119132
pytest.fail(f"No pattern matched {model} for {expected_provider}")
133+
134+
135+
class TestEndpointNewProviders:
136+
def test_xai_endpoint(self):
137+
result = match_endpoint("https://api.x.ai/v1/chat/completions")
138+
assert result is not None
139+
assert result[0] == "xAI"
140+
141+
def test_alibaba_endpoint(self):
142+
result = match_endpoint("https://dashscope.aliyuncs.com/api/v1/services")
143+
assert result is not None
144+
assert result[0] == "Alibaba"
145+
146+
def test_xai_endpoint_uppercase(self):
147+
"""Ensure case-insensitive matching works for endpoints."""
148+
result = match_endpoint("https://API.X.AI/v1/chat")
149+
assert result is not None
150+
assert result[0] == "xAI"
151+
152+
153+
class TestModelRegistryNewProviders:
154+
def test_grok_2(self):
155+
result = lookup_model("grok-2")
156+
assert result is not None
157+
assert result["provider"] == "xAI"
158+
159+
def test_grok_beta(self):
160+
result = lookup_model("grok-beta")
161+
assert result is not None
162+
assert result["provider"] == "xAI"
163+
164+
def test_qwen_max(self):
165+
result = lookup_model("qwen-max")
166+
assert result is not None
167+
assert result["provider"] == "Alibaba"
168+
169+
def test_qwen_versioned(self):
170+
result = lookup_model("qwen2.5")
171+
assert result is not None
172+
assert result["provider"] == "Alibaba"
173+
174+
def test_bare_grok_no_match(self):
175+
"""Bare 'grok' without version/suffix should not match to avoid false positives."""
176+
result = lookup_model("grok")
177+
assert result is None

0 commit comments

Comments
 (0)