Skip to content

Commit 05b26dd

Browse files
authored
Merge pull request #1987 from ziadhany/kb-commits
Migrate Importer to Advisory v2 & Collect Existing Fix Commits for Project KB
2 parents df91a2c + f70b2a4 commit 05b26dd

File tree

16 files changed

+1644
-23405
lines changed

16 files changed

+1644
-23405
lines changed

vulnerabilities/importers/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,12 @@
5959
from vulnerabilities.pipelines.v2_importers import nvd_importer as nvd_importer_v2
6060
from vulnerabilities.pipelines.v2_importers import oss_fuzz as oss_fuzz_v2
6161
from vulnerabilities.pipelines.v2_importers import postgresql_importer as postgresql_importer_v2
62+
from vulnerabilities.pipelines.v2_importers import (
63+
project_kb_msr2019_importer as project_kb_msr2019_importer_v2,
64+
)
65+
from vulnerabilities.pipelines.v2_importers import (
66+
project_kb_statements_importer as project_kb_statements_importer_v2,
67+
)
6268
from vulnerabilities.pipelines.v2_importers import pypa_importer as pypa_importer_v2
6369
from vulnerabilities.pipelines.v2_importers import pysec_importer as pysec_importer_v2
6470
from vulnerabilities.pipelines.v2_importers import redhat_importer as redhat_importer_v2
@@ -87,6 +93,8 @@
8793
github_osv_importer_v2.GithubOSVImporterPipeline,
8894
redhat_importer_v2.RedHatImporterPipeline,
8995
aosp_importer_v2.AospImporterPipeline,
96+
project_kb_statements_importer_v2.ProjectKBStatementsPipeline,
97+
project_kb_msr2019_importer_v2.ProjectKBMSR2019Pipeline,
9098
ruby_importer_v2.RubyImporterPipeline,
9199
epss_importer_v2.EPSSImporterPipeline,
92100
mattermost_importer_v2.MattermostImporterPipeline,

vulnerabilities/pipelines/v2_importers/aosp_importer.py

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,9 @@
1616
from fetchcode.vcs import fetch_via_vcs
1717

1818
from vulnerabilities.importer import AdvisoryData
19-
from vulnerabilities.importer import AffectedPackageV2
20-
from vulnerabilities.importer import PackageCommitPatchData
21-
from vulnerabilities.importer import PatchData
22-
from vulnerabilities.importer import ReferenceV2
2319
from vulnerabilities.importer import VulnerabilitySeverity
2420
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2
25-
from vulnerabilities.pipes.advisory import classify_patch_source
21+
from vulnerabilities.pipes.advisory import append_patch_classifications
2622
from vulnerabilities.severity_systems import GENERIC
2723

2824

@@ -90,23 +86,14 @@ def collect_advisories(self):
9086
patch_url = commit_data.get("patchUrl")
9187
commit_id = commit_data.get("commitId")
9288

93-
base_purl, patch_objs = classify_patch_source(
89+
append_patch_classifications(
9490
url=patch_url,
9591
commit_hash=commit_id,
9692
patch_text=None,
93+
affected_packages=affected_packages,
94+
references=references,
95+
patches=patches,
9796
)
98-
for patch_obj in patch_objs:
99-
if isinstance(patch_obj, PackageCommitPatchData):
100-
fixed_commit = patch_obj
101-
affected_package = AffectedPackageV2(
102-
package=base_purl,
103-
fixed_by_commit_patches=[fixed_commit],
104-
)
105-
affected_packages.append(affected_package)
106-
elif isinstance(patch_obj, PatchData):
107-
patches.append(patch_obj)
108-
elif isinstance(patch_obj, ReferenceV2):
109-
references.append(patch_obj)
11097

11198
url = (
11299
"https://raw.githubusercontent.com/quarkslab/aosp_dataset/refs/heads/master/cves/"

vulnerabilities/pipelines/v2_importers/project_kb_msr2019_importer.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import csv
11+
from pathlib import Path
12+
from typing import Iterable
13+
14+
from fetchcode.vcs import fetch_via_vcs
15+
16+
from vulnerabilities.importer import AdvisoryData
17+
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2
18+
from vulnerabilities.pipes.advisory import append_patch_classifications
19+
20+
21+
class ProjectKBMSR2019Pipeline(VulnerableCodeBaseImporterPipelineV2):
22+
"""
23+
ProjectKB Importer Pipeline
24+
Collect advisory from ProjectKB data:
25+
- CSV database https://github.com/SAP/project-kb/blob/main/MSR2019/dataset/vulas_db_msr2019_release.csv
26+
"""
27+
28+
pipeline_id = "project-kb-msr-2019_v2"
29+
spdx_license_expression = "Apache-2.0"
30+
license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt"
31+
repo_url = "git+https://github.com/SAP/project-kb"
32+
33+
@classmethod
34+
def steps(cls):
35+
return (
36+
cls.clone,
37+
cls.collect_and_store_advisories,
38+
cls.clean_downloads,
39+
)
40+
41+
def clone(self):
42+
self.log("Cloning ProjectKB advisory data...")
43+
self.vcs_response = fetch_via_vcs(self.repo_url)
44+
45+
def advisories_count(self):
46+
csv_path = Path(self.vcs_response.dest_dir) / "MSR2019/dataset/vulas_db_msr2019_release.csv"
47+
48+
with open(csv_path, newline="", encoding="utf-8") as f:
49+
reader = csv.reader(f)
50+
next(reader, None)
51+
count = sum(1 for _ in reader)
52+
53+
self.log(f"Estimated advisories to process: {count}")
54+
return count
55+
56+
def collect_advisories(self) -> Iterable[AdvisoryData]:
57+
self.log("Collecting fix commits from ProjectKB ( vulas_db_msr2019_release )...")
58+
csv_path = Path(self.vcs_response.dest_dir) / "MSR2019/dataset/vulas_db_msr2019_release.csv"
59+
60+
with open(csv_path, newline="", encoding="utf-8") as f:
61+
reader = csv.reader(f)
62+
next(reader, None) # skip header
63+
64+
for row in reader:
65+
if len(row) != 4:
66+
continue
67+
68+
vuln_id, vcs_url, commit_hash, poc = row
69+
70+
if not vuln_id or not vcs_url or not commit_hash:
71+
continue
72+
73+
patches = []
74+
affected_packages = []
75+
references = []
76+
append_patch_classifications(
77+
url=vcs_url,
78+
commit_hash=commit_hash,
79+
patch_text=None,
80+
affected_packages=affected_packages,
81+
references=references,
82+
patches=patches,
83+
)
84+
85+
yield AdvisoryData(
86+
advisory_id=vuln_id,
87+
affected_packages=affected_packages,
88+
patches=patches,
89+
references_v2=references,
90+
url="https://github.com/SAP/project-kb/blob/main/MSR2019/dataset/vulas_db_msr2019_release.csv",
91+
)
92+
93+
def clean_downloads(self):
94+
"""Remove the cloned repository from disk."""
95+
self.log("Removing cloned repository...")
96+
if self.vcs_response:
97+
self.vcs_response.delete()
98+
99+
def on_failure(self):
100+
"""Ensure cleanup happens on pipeline failure."""
101+
self.clean_downloads()

vulnerabilities/pipelines/v2_importers/project_kb_statements_importer.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
from collections import defaultdict
10+
from pathlib import Path
11+
from typing import Iterable
12+
13+
import saneyaml
14+
from fetchcode.vcs import fetch_via_vcs
15+
from packageurl import PackageURL
16+
from univers.version_range import RANGE_CLASS_BY_SCHEMES
17+
from univers.versions import InvalidVersion
18+
19+
from vulnerabilities.importer import AdvisoryData
20+
from vulnerabilities.importer import AffectedPackageV2
21+
from vulnerabilities.importer import ReferenceV2
22+
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2
23+
from vulnerabilities.pipes.advisory import append_patch_classifications
24+
from vulnerabilities.utils import get_advisory_url
25+
from vulnerabilities.utils import is_commit
26+
27+
28+
class ProjectKBStatementsPipeline(VulnerableCodeBaseImporterPipelineV2):
29+
"""
30+
ProjectKB Importer Pipeline
31+
Collect advisory from ProjectKB data:
32+
- YAML statements: https://github.com/SAP/project-kb/blob/vulnerability-data/statements/*/*.yaml
33+
"""
34+
35+
pipeline_id = "project-kb-statements_v2"
36+
spdx_license_expression = "Apache-2.0"
37+
license_url = "https://github.com/SAP/project-kb/blob/main/LICENSE.txt"
38+
repo_url = "git+https://github.com/SAP/project-kb@vulnerability-data"
39+
40+
@classmethod
41+
def steps(cls):
42+
return (
43+
cls.clone,
44+
cls.collect_and_store_advisories,
45+
cls.clean_downloads,
46+
)
47+
48+
def clone(self):
49+
self.log("Cloning ProjectKB Statements advisory data...")
50+
self.vcs_response = fetch_via_vcs(self.repo_url)
51+
52+
def advisories_count(self):
53+
base_path = Path(self.vcs_response.dest_dir) / "statements"
54+
count = sum(1 for _ in base_path.rglob("*.yaml"))
55+
self.log(f"Estimated advisories to process: {count}")
56+
return count
57+
58+
def collect_advisories(self) -> Iterable[AdvisoryData]:
59+
self.log("Collecting fix commits from YAML statements under /statements....")
60+
base_path = Path(self.vcs_response.dest_dir) / "statements"
61+
62+
for yaml_file in base_path.rglob("*.yaml"):
63+
if yaml_file.name != "statement.yaml":
64+
continue
65+
66+
with open(yaml_file, encoding="utf-8") as f:
67+
yaml_data = saneyaml.load(f)
68+
69+
vulnerability_id = yaml_data.get("vulnerability_id")
70+
if not vulnerability_id:
71+
continue
72+
73+
note_texts = []
74+
references = []
75+
for note_entry in yaml_data.get("notes", []):
76+
text_content = note_entry.get("text")
77+
if not text_content:
78+
continue
79+
note_texts.append(text_content)
80+
81+
for link in note_entry.get("links", []):
82+
ref = ReferenceV2(url=link)
83+
references.append(ref)
84+
85+
description = "\n".join(note_texts)
86+
affected_packages = []
87+
patches = []
88+
for fix in yaml_data.get("fixes", []):
89+
for commit in fix.get("commits", []):
90+
commit_hash = commit.get("id")
91+
if not is_commit(commit_hash):
92+
commit_hash = None
93+
94+
vcs_url = commit.get("repository")
95+
append_patch_classifications(
96+
url=vcs_url,
97+
commit_hash=commit_hash,
98+
patch_text=None,
99+
affected_packages=affected_packages,
100+
references=references,
101+
patches=patches,
102+
)
103+
104+
purls_to_versions = defaultdict(lambda: [[], []])
105+
for artifact in yaml_data.get("artifacts", []):
106+
affected = artifact.get("affected")
107+
purl_str = artifact.get("id")
108+
109+
try:
110+
purl = PackageURL.from_string(purl_str)
111+
except ValueError:
112+
self.log(f"Invalid PackageURL: {purl_str!r}")
113+
continue
114+
115+
version_range_class = RANGE_CLASS_BY_SCHEMES.get(purl.type)
116+
if not version_range_class:
117+
continue
118+
119+
base_purl = PackageURL(
120+
type=purl.type,
121+
namespace=purl.namespace,
122+
name=purl.name,
123+
)
124+
125+
if affected:
126+
purls_to_versions[base_purl][0].append(purl.version)
127+
else:
128+
purls_to_versions[base_purl][1].append(purl.version)
129+
130+
for base_purl, (affected_versions, fixed_versions) in purls_to_versions.items():
131+
version_range_class = RANGE_CLASS_BY_SCHEMES.get(base_purl.type)
132+
133+
affected_range = None
134+
fixed_range = None
135+
136+
if affected_versions:
137+
try:
138+
affected_range = version_range_class.from_versions(affected_versions)
139+
except InvalidVersion as e:
140+
self.log(f"Invalid affected versions for {base_purl}: {e}")
141+
142+
if fixed_versions:
143+
try:
144+
fixed_range = version_range_class.from_versions(fixed_versions)
145+
except InvalidVersion as e:
146+
self.log(f"Invalid fixed versions for {base_purl}: {e}")
147+
148+
if affected_range or fixed_range:
149+
pkg = AffectedPackageV2(
150+
package=base_purl,
151+
affected_version_range=affected_range,
152+
fixed_version_range=fixed_range,
153+
)
154+
affected_packages.append(pkg)
155+
156+
advisory_url = get_advisory_url(
157+
file=yaml_file,
158+
base_path=base_path,
159+
url="https://github.com/SAP/project-kb/blob/vulnerability-data/statements/",
160+
)
161+
162+
yield AdvisoryData(
163+
advisory_id=vulnerability_id,
164+
summary=description,
165+
affected_packages=affected_packages,
166+
references_v2=references,
167+
patches=patches,
168+
url=advisory_url,
169+
)
170+
171+
def clean_downloads(self):
172+
"""Remove the cloned repository from disk."""
173+
self.log("Removing cloned repository...")
174+
175+
if self.vcs_response:
176+
self.vcs_response.delete()
177+
178+
def on_failure(self):
179+
"""Ensure cleanup happens on pipeline failure."""
180+
self.clean_downloads()

vulnerabilities/pipes/advisory.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
from aboutcode.hashid import get_core_purl
2727
from vulnerabilities.importer import AdvisoryData
28+
from vulnerabilities.importer import AffectedPackageV2
2829
from vulnerabilities.importer import PackageCommitPatchData
2930
from vulnerabilities.importer import PatchData
3031
from vulnerabilities.importer import ReferenceV2
@@ -501,3 +502,27 @@ def advisories_checksum(advisories: Union[Advisory, List[Advisory]]) -> str:
501502

502503
checksum = hashlib.sha1(combined_contents.encode())
503504
return checksum.hexdigest()
505+
506+
507+
def append_patch_classifications(
508+
url, commit_hash, patch_text, affected_packages, patches, references
509+
):
510+
"""Classify a patch source and append the results to affected_packages, patches, or references,
511+
assuming all provided commits are fixed commits."""
512+
513+
base_purl, patch_objs = classify_patch_source(
514+
url=url, commit_hash=commit_hash, patch_text=patch_text
515+
)
516+
517+
for patch_obj in patch_objs:
518+
if isinstance(patch_obj, PackageCommitPatchData):
519+
fixed_commit = patch_obj
520+
affected_package = AffectedPackageV2(
521+
package=base_purl,
522+
fixed_by_commit_patches=[fixed_commit],
523+
)
524+
affected_packages.append(affected_package)
525+
elif isinstance(patch_obj, PatchData):
526+
patches.append(patch_obj)
527+
elif isinstance(patch_obj, ReferenceV2):
528+
references.append(patch_obj)

0 commit comments

Comments
 (0)