Add patent scan plugin with CLI support and extended detection tests

dikshaa2909 · dikshaa2909 · commit 9c5330018831 · 2026-02-20T21:58:34.000+05:30
Signed-off-by: dikshaa2909 <dikshadeware@gmail.com>
diff --git a/setup.cfg b/setup.cfg
@@ -189,9 +189,9 @@ scancode_scan =
     packages = packagedcode.plugin_package:PackageScanner
     emails = cluecode.plugin_email:EmailScanner
     urls = cluecode.plugin_url:UrlScanner
+    patents = cluecode.plugin_patent:PatentScanner
     generated = summarycode.generated:GeneratedCodeDetector
 
-
 # scancode_post_scan is the entry point for post_scan plugins executed after the
 # scan plugins and before the output plugins. See also plugincode.post_scan
 # module for details and doc.
@@ -237,3 +237,5 @@ scancode_output =
     yaml = formattedcode.output_yaml:YamlOutput
     cyclonedx = formattedcode.output_cyclonedx:CycloneDxJsonOutput
     cyclonedx-xml = formattedcode.output_cyclonedx:CycloneDxXmlOutput
+
+    
diff --git a/src/cluecode/patents.py b/src/cluecode/patents.py
@@ -0,0 +1,66 @@
+import re
+
+# Phrases that signal a patent-related statement even when no number is given.
+PATENT_KEYWORDS = [
+    "patent pending",
+    "patented",
+    "patent application",
+    "patent number",
+]
+
+# One precompiled, case-insensitive, whole-word pattern per keyword. The \b
+# anchors keep "patented" from matching inside words such as "unpatented".
+KEYWORD_REGEXES = [
+    re.compile(rf"\b{re.escape(keyword)}\b", re.IGNORECASE)
+    for keyword in PATENT_KEYWORDS
+]
+
+# Patent numbers introduced by a two-letter office code, for example
+# "US Patent 8,123,456 B2", "EP1234567B1" or "WO 2019/123456".
+#
+# The office code is matched case-sensitively through the scoped (?-i:...)
+# group: several codes are also common lowercase English words, and with a
+# fully case-insensitive pattern ordinary text such as "in 2019" or "us 2"
+# is reported as a patent number. The rest of the pattern stays
+# case-insensitive.
+PATENT_NUMBER_REGEX = re.compile(
+    r"""
+    \b
+    (?:
+        (?-i:US|EP|WO|JP|CN|KR|GB|IN)    # Office code, uppercase only
+        \s*
+        (?:Patent(?:\s+No\.?)?\s*)?      # Optional 'Patent' or 'Patent No.'
+        \d+(?:[,\/]\d+)*                 # Number part (allow commas/slashes)
+        \s*(?:A1|A2|B1|B2)?              # Optional kind codes
+    )
+    \b
+    """,
+    re.IGNORECASE | re.VERBOSE,
+)
+
+
+def find_patents(location):
+    """
+    Detect patent numbers and patent-related keywords in the file at
+    ``location``.
+
+    Return a list of tuples:
+        (kind, value, line_number)
+    where:
+        kind: "number" or "keyword"
+        value: matched text (original casing preserved)
+        line_number: 1-based line where the match occurred
+
+    For each line, number matches are listed first, followed by at most one
+    match per keyword. Return an empty list if the file cannot be opened or
+    read.
+    """
+    results = []
+
+    try:
+        # Decode as UTF-8 regardless of the platform locale so the same file
+        # yields the same detections everywhere; undecodable bytes are
+        # dropped rather than aborting the scan.
+        with open(location, "r", encoding="utf-8", errors="ignore") as f:
+            # Iterate the file lazily instead of loading every line at once.
+            for line_num, line in enumerate(f, start=1):
+
+                # Detect patent numbers
+                for match in PATENT_NUMBER_REGEX.finditer(line):
+                    results.append(("number", match.group().strip(), line_num))
+
+                # Detect keyword references
+                for regex in KEYWORD_REGEXES:
+                    match = regex.search(line)
+                    if match:
+                        results.append(("keyword", match.group(), line_num))
+    except (OSError, ValueError):
+        # Best effort: an unreadable file or an invalid path yields no
+        # detections instead of failing the scan.
+        return []
+
+    return results
+    
diff --git a/src/cluecode/plugin_patent.py b/src/cluecode/plugin_patent.py
@@ -0,0 +1,54 @@
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from functools import partial
+import attr
+
+from commoncode.cliutils import PluggableCommandLineOption
+from commoncode.cliutils import OTHER_SCAN_GROUP
+from commoncode.cliutils import SCAN_OPTIONS_GROUP
+from plugincode.scan import ScanPlugin
+from plugincode.scan import scan_impl
+
+
+@scan_impl
+class PatentScanner(ScanPlugin):
+    """
+    Scan plugin that collects patent references and patent numbers found in
+    a Resource under its ``patent_detections`` attribute.
+    """
+    # Every resource gets its own list, empty until detections are stored.
+    resource_attributes = {
+        'patent_detections': attr.ib(default=attr.Factory(list)),
+    }
+
+    run_order = 8
+    sort_order = 8
+
+    options = [
+        PluggableCommandLineOption(
+            ('--patent',),
+            is_flag=True,
+            default=False,
+            help='Scan <input> for patent references and patent numbers.',
+            help_group=OTHER_SCAN_GROUP,
+        ),
+        PluggableCommandLineOption(
+            ('--max-patent',),
+            type=int,
+            default=50,
+            metavar='INT',
+            show_default=True,
+            required_options=['patent'],
+            help='Report only up to INT patent references found in a file. Use 0 for no limit.',
+            help_group=SCAN_OPTIONS_GROUP,
+        ),
+    ]
+
+    def is_enabled(self, patent, **kwargs):
+        """
+        Return the ``patent`` argument unchanged; other keyword arguments are
+        ignored.
+        """
+        return patent
+
+    def get_scanner(self, max_patent=50, **kwargs):
+        """
+        Return ``scancode.api.get_patents`` with its ``threshold`` argument
+        preset to ``max_patent``.
+        """
+        # Imported here rather than at module level, as in the original code.
+        from scancode.api import get_patents
+
+        scanner = partial(get_patents, threshold=max_patent)
+        return scanner
diff --git a/src/scancode/api.py b/src/scancode/api.py
@@ -366,3 +366,33 @@ def get_file_info(location, **kwargs):
     result['is_source'] = bool(collector.is_source)
     result['is_script'] = bool(collector.is_script)
     return result
+
+def get_patents(location, threshold=50, **kwargs):
+    """
+    Return a mapping with a single "patent_detections" key whose value is the
+    list of patent detections found in the file at ``location``.
+
+    Each detection is a mapping with the keys "type", "patent_reference",
+    "start_line" and "end_line"; both line keys hold the same line number.
+    Detections with the same type, text and line are reported once, in the
+    order they were first found. When ``threshold`` is greater than zero only
+    the first ``threshold`` detections are kept; a falsy or negative
+    ``threshold`` keeps all of them.
+    """
+    from itertools import islice
+    from cluecode.patents import find_patents
+
+    # Dict keys keep first-seen order, so this drops exact duplicates while
+    # preserving the order in which detections were found.
+    unique = dict.fromkeys(
+        (kind, value, line_num)
+        for kind, value, line_num in find_patents(location)
+    )
+
+    selected = unique
+    if threshold and threshold > 0:
+        selected = islice(unique, threshold)
+
+    detections = [
+        {
+            "type": kind,
+            "patent_reference": value,
+            "start_line": line_num,
+            "end_line": line_num,
+        }
+        for kind, value, line_num in selected
+    ]
+
+    return dict(patent_detections=detections)
+
+    
diff --git a/tests/cluecode/test_plugin_patent.py b/tests/cluecode/test_plugin_patent.py
@@ -0,0 +1,108 @@
+import json
+from scancode.cli_test_utils import run_scan_click
+from scancode.cli_test_utils import load_json_result
+
+
+def _scan_patent_detections(tmp_path, text, *extra_options):
+    """
+    Write ``text`` to a file under ``tmp_path``, scan that file with
+    ``--patent`` plus any ``extra_options`` and return the
+    "patent_detections" list reported for it (an empty list when the key is
+    absent from the scan result).
+    """
+    test_file = tmp_path / "test.txt"
+    test_file.write_text(text)
+
+    result_file = tmp_path / "result.json"
+
+    run_scan_click(
+        ["--patent", *extra_options, "--json", str(result_file), str(test_file)]
+    )
+
+    result = load_json_result(result_file)
+
+    return result["files"][0].get("patent_detections", [])
+
+
+def test_patent_detection_basic(tmp_path):
+    # One patent number and one keyword on the same line.
+    detections = _scan_patent_detections(
+        tmp_path, "US Patent 8,123,456 B2 and patent pending."
+    )
+
+    assert len(detections) == 2
+
+    values = [d["patent_reference"] for d in detections]
+
+    assert "US Patent 8,123,456 B2" in values
+    assert "patent pending" in values
+
+    for d in detections:
+        assert "type" in d
+        assert "start_line" in d
+        assert "end_line" in d
+
+
+def test_patent_detection_none(tmp_path):
+    # "patent reference" is not one of the detected keywords.
+    detections = _scan_patent_detections(
+        tmp_path, "This file has no patent reference."
+    )
+
+    assert detections == []
+
+
+def test_patent_international_formats(tmp_path):
+    detections = _scan_patent_detections(
+        tmp_path, "EP1234567B1\nWO 2019/123456\nUS20190012345A1"
+    )
+
+    values = [d["patent_reference"] for d in detections]
+
+    assert any("EP1234567B1" in v for v in values)
+    assert any("WO 2019/123456" in v for v in values)
+    assert any("US20190012345A1" in v for v in values)
+
+
+def test_patent_no_false_positive(tmp_path):
+    # "patented" inside "unpatented" must not be reported.
+    detections = _scan_patent_detections(
+        tmp_path, "This is unpatented technology."
+    )
+
+    assert detections == []
+
+
+def test_patent_threshold(tmp_path):
+    # Three detections in the file, but --max-patent keeps only the first.
+    detections = _scan_patent_detections(
+        tmp_path,
+        "US Patent 1\nUS Patent 2\nUS Patent 3",
+        "--max-patent", "1",
+    )
+
+    assert len(detections) == 1
+    assert detections[0]["patent_reference"] == "US Patent 1"
+