Skip to content

Commit 9c53300

Browse files
committed
Add patent scan plugin with CLI support and extended detection tests
Signed-off-by: dikshaa2909 <dikshadeware@gmail.com>
1 parent 022ddc8 commit 9c53300

File tree

5 files changed

+261
-1
lines changed

5 files changed

+261
-1
lines changed

setup.cfg

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,9 +189,9 @@ scancode_scan =
189189
packages = packagedcode.plugin_package:PackageScanner
190190
emails = cluecode.plugin_email:EmailScanner
191191
urls = cluecode.plugin_url:UrlScanner
192+
patents = cluecode.plugin_patent:PatentScanner
192193
generated = summarycode.generated:GeneratedCodeDetector
193194

194-
195195
# scancode_post_scan is the entry point for post_scan plugins executed after the
196196
# scan plugins and before the output plugins. See also plugincode.post_scan
197197
# module for details and doc.
@@ -237,3 +237,5 @@ scancode_output =
237237
yaml = formattedcode.output_yaml:YamlOutput
238238
cyclonedx = formattedcode.output_cyclonedx:CycloneDxJsonOutput
239239
cyclonedx-xml = formattedcode.output_cyclonedx:CycloneDxXmlOutput
240+
241+

src/cluecode/patents.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import re
2+
3+
# Phrases whose presence marks a line as patent-related.
PATENT_KEYWORDS = [
    "patent pending",
    "patented",
    "patent application",
    "patent number",
]

# One precompiled pattern per phrase, in the same order as PATENT_KEYWORDS.
# Each pattern matches the phrase case-insensitively and only between word
# boundaries, so "unpatented" does not match "patented".
KEYWORD_REGEXES = [
    re.compile(r"\b" + re.escape(phrase) + r"\b", flags=re.IGNORECASE)
    for phrase in PATENT_KEYWORDS
]
]
16+
17+
# Regex for patent numbers and international formats.
#
# The pattern is compiled with IGNORECASE so that 'Patent', 'No.' and the kind
# codes match in any casing. The country codes are treated more strictly:
# several of them are ordinary English words in lower case ("in", "us"), so a
# bare code directly followed by digits must be upper-case ("IN 2019" matches,
# "in 2019" does not). A code in any casing is still accepted when it is
# followed by the word 'Patent' (e.g. "us patent no. 5,123,456").
#
# The match may end with trailing whitespace; callers are expected to strip it.
PATENT_NUMBER_REGEX = re.compile(
    r"""
    \b
    (?:
        (?-i:US|EP|WO|JP|CN|KR|GB|IN)   # Upper-case country code
        \s*
        (?:Patent(?:\s+No\.?)?\s*)?     # Optional 'Patent' or 'Patent No.'
    |
        (?:US|EP|WO|JP|CN|KR|GB|IN)     # Country code in any casing ...
        \s*
        Patent(?:\s+No\.?)?\s*          # ... requires 'Patent' or 'Patent No.'
    )
    \d+(?:[,/]\d+)*                     # Number part (allow commas/slashes)
    \s*(?:A1|A2|B1|B2)?                 # Optional kind codes
    \b
    """,
    re.IGNORECASE | re.VERBOSE,
)
32+
33+
34+
def find_patents(location):
    """
    Detect patent references and patent-related keywords in a file.

    The file at ``location`` is decoded as UTF-8 with undecodable bytes
    dropped, and is scanned one line at a time so the whole file is never
    held in memory.

    Return a list of tuples:
        (kind, value, line_number)
    where:
        kind: "number" or "keyword"
        value: matched text (original casing preserved)
        line_number: 1-based line where match occurred

    For each line, every PATENT_NUMBER_REGEX match is reported first, followed
    by at most one match per pattern of KEYWORD_REGEXES.

    Return an empty list when the file cannot be opened or read (OSError), or
    when ``location`` is not a usable path string (ValueError).
    """
    results = []

    try:
        # An explicit encoding keeps detection independent of the platform
        # locale; errors="ignore" means decoding itself never raises.
        with open(location, "r", encoding="utf-8", errors="ignore") as text:
            for line_num, line in enumerate(text, start=1):

                # Detect patent numbers. The regex may capture trailing
                # whitespace before a word boundary, hence the strip().
                for match in PATENT_NUMBER_REGEX.finditer(line):
                    results.append(("number", match.group().strip(), line_num))

                # Detect keyword references (first occurrence per keyword)
                for regex in KEYWORD_REGEXES:
                    match = regex.search(line)
                    if match:
                        results.append(("keyword", match.group(), line_num))
    except (OSError, ValueError):
        # Best effort: an unreadable file yields no detections at all rather
        # than a partial list.
        return []

    return results
66+

src/cluecode/plugin_patent.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright (c) nexB Inc. and others. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from functools import partial
5+
import attr
6+
7+
from commoncode.cliutils import PluggableCommandLineOption
8+
from commoncode.cliutils import OTHER_SCAN_GROUP
9+
from commoncode.cliutils import SCAN_OPTIONS_GROUP
10+
from plugincode.scan import ScanPlugin
11+
from plugincode.scan import scan_impl
12+
13+
14+
@scan_impl
class PatentScanner(ScanPlugin):
    """
    Scan plugin reporting the patent numbers and patent-related references
    found in a Resource.
    """

    # Attribute added to each scanned resource; it defaults to a new empty list.
    resource_attributes = {
        'patent_detections': attr.ib(default=attr.Factory(list)),
    }

    run_order = 8
    sort_order = 8

    options = [
        # Flag that turns this scan on.
        PluggableCommandLineOption(
            ('--patent',),
            is_flag=True,
            default=False,
            help='Scan <input> for patent references and patent numbers.',
            help_group=OTHER_SCAN_GROUP,
        ),
        # Per-file cap on reported detections; only valid together with --patent.
        PluggableCommandLineOption(
            ('--max-patent',),
            type=int,
            default=50,
            metavar='INT',
            show_default=True,
            required_options=['patent'],
            help='Report only up to INT patent references found in a file. Use 0 for no limit.',
            help_group=SCAN_OPTIONS_GROUP,
        ),
    ]

    def is_enabled(self, patent, **kwargs):
        """
        Return the value of the ``patent`` option unchanged.
        """
        return patent

    def get_scanner(self, max_patent=50, **kwargs):
        """
        Return ``scancode.api.get_patents`` with its ``threshold`` argument
        bound to ``max_patent``.
        """
        # Imported at call time rather than at module import time.
        from scancode.api import get_patents

        return partial(get_patents, threshold=max_patent)

src/scancode/api.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,3 +366,33 @@ def get_file_info(location, **kwargs):
366366
result['is_source'] = bool(collector.is_source)
367367
result['is_script'] = bool(collector.is_script)
368368
return result
369+
370+
def get_patents(location, threshold=50, **kwargs):
    """
    Return a mapping with a single "patent_detections" key holding the list
    of patent detections found in the file at `location`.

    Each detection is a mapping with the keys "type", "patent_reference",
    "start_line" and "end_line"; both line keys carry the same line number.
    Detections that are identical in type, text and line are reported once,
    in first-seen order. When `threshold` is a positive number only the first
    `threshold` unique detections are kept; any falsy or negative value means
    no limit.
    """
    from itertools import islice
    from cluecode.patents import find_patents

    # A dict keeps insertion order, so building one from the detection tuples
    # removes exact duplicates while preserving the first-seen order.
    unique = dict.fromkeys(
        (kind, value, line_num)
        for kind, value, line_num in find_patents(location)
    )

    if threshold and threshold > 0:
        selected = islice(unique, threshold)
    else:
        selected = unique

    detections = [
        {
            "type": kind,
            "patent_reference": value,
            "start_line": line_num,
            "end_line": line_num,
        }
        for kind, value, line_num in selected
    ]

    return dict(patent_detections=detections)
397+
398+
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
import json
2+
from scancode.cli_test_utils import run_scan_click
3+
from scancode.cli_test_utils import load_json_result
4+
5+
6+
def test_patent_detection_basic(tmp_path):
    """
    A line holding one patent number and one keyword yields exactly two
    detections, each carrying type and line information.
    """
    scanned = tmp_path / "test.txt"
    scanned.write_text("US Patent 8,123,456 B2 and patent pending.")
    output = tmp_path / "result.json"

    args = ["--patent", "--json", str(output), str(scanned)]
    run_scan_click(args)

    scan = load_json_result(output)
    found = scan["files"][0].get("patent_detections", [])
    assert len(found) == 2

    references = [entry["patent_reference"] for entry in found]
    assert "US Patent 8,123,456 B2" in references
    assert "patent pending" in references

    for entry in found:
        for field in ("type", "start_line", "end_line"):
            assert field in entry
31+
32+
33+
def test_patent_detection_none(tmp_path):
    """
    A file whose text only mentions the bare word "patent" yields no
    detections.
    """
    scanned = tmp_path / "test.txt"
    scanned.write_text("This file has no patent reference.")
    output = tmp_path / "result.json"

    args = ["--patent", "--json", str(output), str(scanned)]
    run_scan_click(args)

    scan = load_json_result(output)
    found = scan["files"][0].get("patent_detections", [])
    assert found == []
48+
49+
50+
def test_patent_international_formats(tmp_path):
    """
    EP, WO and US publication-style numbers, one per line, are each reported.
    """
    scanned = tmp_path / "test.txt"
    scanned.write_text("EP1234567B1\nWO 2019/123456\nUS20190012345A1")
    output = tmp_path / "result.json"

    args = ["--patent", "--json", str(output), str(scanned)]
    run_scan_click(args)

    scan = load_json_result(output)
    found = scan["files"][0].get("patent_detections", [])
    references = [entry["patent_reference"] for entry in found]

    # Substring checks: a reported reference only has to contain the number.
    for expected in ("EP1234567B1", "WO 2019/123456", "US20190012345A1"):
        assert any(expected in reference for reference in references)
71+
72+
73+
def test_patent_no_false_positive(tmp_path):
    """
    A keyword embedded in a longer word ("unpatented") is not reported.
    """
    scanned = tmp_path / "test.txt"
    scanned.write_text("This is unpatented technology.")
    output = tmp_path / "result.json"

    args = ["--patent", "--json", str(output), str(scanned)]
    run_scan_click(args)

    scan = load_json_result(output)
    found = scan["files"][0].get("patent_detections", [])
    assert found == []
88+
89+
90+
def test_patent_threshold(tmp_path):
    """
    With --max-patent 1 only the first of three detections is reported.
    """
    scanned = tmp_path / "test.txt"
    scanned.write_text("US Patent 1\nUS Patent 2\nUS Patent 3")
    output = tmp_path / "result.json"

    args = [
        "--patent",
        "--max-patent", "1",
        "--json", str(output),
        str(scanned),
    ]
    run_scan_click(args)

    scan = load_json_result(output)
    found = scan["files"][0].get("patent_detections", [])

    assert len(found) == 1
    first = found[0]
    assert first["patent_reference"] == "US Patent 1"
108+

0 commit comments

Comments
 (0)