Merge pull request #10 from cisagov/improvement/support-nvd-2.0-json

dav3r · web-flow · commit 91bdcf9a1b20 · 2025-09-03T11:10:44.000-04:00
Support NVD 2.0 JSON
diff --git a/README.md b/README.md
@@ -56,7 +56,7 @@ and then load the data into to your database.
 
 ```python
 import asyncio
-from cyhy_cvesync import DEFAULT_CVE_URL_PATTERN
+from cyhy_cvesync import DEFAULT_CVE_AUTHORITATIVE_SOURCE, DEFAULT_CVE_URL_PATTERN
 from cyhy_cvesync.cve_sync import process_urls
 from cyhy_db import initialize_db
 from cyhy_db.models import CVEDoc
@@ -73,7 +73,7 @@ async def main():
     cve_url = DEFAULT_CVE_URL_PATTERN.format(year=2024)
     print(f"Processing CVE data from: {cve_url}...")
     created_cve_docs_count, updated_cve_docs_count, deleted_cve_docs_count = await process_urls(
-        [cve_url], cve_data_gzipped=True, concurrency=1)
+        [cve_url], cve_data_gzipped=True, concurrency=1, cve_authoritative_source=DEFAULT_CVE_AUTHORITATIVE_SOURCE)
 
     print(f"Created CVE documents: {created_cve_docs_count}")
     print(f"Updated CVE documents: {updated_cve_docs_count}")
@@ -90,12 +90,12 @@ Output:
 
 ```console
 CVE documents in DB before sync: 20
-Processing CVE data from: https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2024.json.gz...
+Processing CVE data from: https://nvd.nist.gov/feeds/json/cve/2.0/nvdcve-2.0-2024.json.gz...
 Deleting outdated CVE docs ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
-Created CVE documents: 12174
+Created CVE documents: 18272
 Updated CVE documents: 0
 Deleted CVE documents: 0
-CVE documents in DB after sync: 12194
+CVE documents in DB after sync: 18292
 ```
 
 ### Environment Variables ###
diff --git a/src/cyhy_cvesync/__init__.py b/src/cyhy_cvesync/__init__.py
@@ -7,11 +7,12 @@
 #   directly used, it populates the value package_name.__version__, which is
 #   used to get version information about this Python package.
 
+DEFAULT_CVE_AUTHORITATIVE_SOURCE = "nvd@nist.gov"
 DEFAULT_CVE_URL_PATTERN = (
-    "https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-{year}.json.gz"
+    "https://nvd.nist.gov/feeds/json/cve/2.0/nvdcve-2.0-{year}.json.gz"
 )
 
 from ._version import __version__  # noqa: F401, E402
 from .main import do_cve_sync  # noqa: E402
 
-__all__ = [DEFAULT_CVE_URL_PATTERN, "do_cve_sync"]
+__all__ = ["DEFAULT_CVE_AUTHORITATIVE_SOURCE", "DEFAULT_CVE_URL_PATTERN", "do_cve_sync"]
diff --git a/src/cyhy_cvesync/_version.py b/src/cyhy_cvesync/_version.py
@@ -1,3 +1,3 @@
 """This file defines the version of this module."""
 
-__version__ = "1.2.1"
+__version__ = "2.0.0"
diff --git a/src/cyhy_cvesync/cve_sync.py b/src/cyhy_cvesync/cve_sync.py
@@ -20,6 +20,8 @@
 ALLOWED_URL_SCHEMES = ["http", "https"]
 CVE_URL_RETRY_WAIT_SEC = 5
 MAX_CVE_URL_RETRIES = 10
+# Preferred CVSS metrics listed in order of preference
+PREFERRED_CVSS_METRICS = ["cvssMetricV31", "cvssMetricV30", "cvssMetricV2"]
 
 # Map to track existing CVE documents that were not updated
 cve_map: Dict[str, CVEDoc] = {}
@@ -28,12 +30,15 @@
 logger = logging.getLogger(f"{CYHY_ROOT_LOGGER}.{__name__}")
 
 
-async def process_cve_json(cve_json: dict) -> Tuple[int, int]:
+async def process_cve_json(
+    cve_json: dict, cve_authoritative_source: str
+) -> Tuple[int, int]:
     """
     Process the provided CVEs JSON and update the database with their contents.
 
     Args:
         cve_json (dict): The JSON data containing information about CVEs.
+        cve_authoritative_source (str): The authoritative source for CVE data.
 
     Returns:
         Tuple[int, int]: A tuple containing the counts of created and updated
@@ -45,19 +50,21 @@ async def process_cve_json(cve_json: dict) -> Tuple[int, int]:
     created_cve_docs_count = 0
     updated_cve_docs_count = 0
 
-    if cve_json.get("CVE_data_type") != "CVE":
+    if cve_json.get("format") != "NVD_CVE":
         raise ValueError("JSON does not look like valid CVE data.")
 
-    cve_items = cve_json.get("CVE_Items", [])
+    cve_items = cve_json.get("vulnerabilities", [])
 
     logger.info(
         "Async task %d: Starting to process %d CVEs",
         id(asyncio.current_task()),
         len(cve_items),
     )
+    # Create a set of preferred CVSS metrics for quick lookup
+    preferred_cvss_metrics_set = set(PREFERRED_CVSS_METRICS)
     for cve in cve_items:
         try:
-            cve_id = cve["cve"]["CVE_data_meta"]["ID"]
+            cve_id = cve["cve"]["id"]
         except KeyError:
             # JSON might be malformed, so we'll log what the CVE object looks like
             # and then raise an error
@@ -67,21 +74,36 @@ async def process_cve_json(cve_json: dict) -> Tuple[int, int]:
         if not cve_id:
             raise ValueError("CVE ID is empty.")
 
-        # Only process CVEs that have CVSS V2 or V3 data
-        if any(k in cve["impact"] for k in ["baseMetricV2", "baseMetricV3"]):
+        # Only process CVEs that have our preferred CVSS metrics
+        metrics = cve.get("cve", {}).get("metrics", {}).keys()
+        if metrics & preferred_cvss_metrics_set:
             # Check if the CVE document already exists in the database
             global cve_map
             async with cve_map_lock:
                 cve_doc = cve_map.pop(cve_id, None)
 
-            version = "V3" if "baseMetricV3" in cve["impact"] else "V2"
+            # Grab newest CVSS metrics from the authoritative source
+            cvss_base_score = None
+            cvss_version_temp = None
             try:
-                cvss_base_score = cve["impact"]["baseMetric" + version][
-                    "cvss" + version
-                ]["baseScore"]
-                cvss_version_temp = cve["impact"]["baseMetric" + version][
-                    "cvss" + version
-                ]["version"]
+                for v in PREFERRED_CVSS_METRICS:
+                    if v in cve["cve"].get("metrics", {}):
+                        for metric in cve["cve"]["metrics"][v]:
+                            if metric.get("source") == cve_authoritative_source:
+                                cvss_base_score = metric["cvssData"]["baseScore"]
+                                cvss_version_temp = metric["cvssData"]["version"]
+                                break
+                    if cvss_base_score is not None:
+                        # Break out of outer loop
+                        break
+
+                if cvss_base_score is None or cvss_version_temp is None:
+                    logger.debug(
+                        "Skipping %s; no preferred CVSS metrics found from authoritative source (%s).",
+                        cve_id,
+                        cve_authoritative_source,
+                    )
+                    continue
             except KeyError:
                 logger.error("CVE object: %s", cve)
                 raise ValueError("JSON does not look like valid CVE data.")
@@ -168,6 +190,7 @@ async def process_urls(
     cve_urls: List[str],
     cve_data_gzipped: bool,
     concurrency: int,
+    cve_authoritative_source: str,
 ) -> Tuple[int, int, int]:
     """
     Process URLs containing CVE data.
@@ -180,6 +203,7 @@ async def process_urls(
         cve_urls (List[str]): A list of URLs containing CVE data.
         cve_data_gzipped (bool): A flag indicating whether the CVE data is gzipped.
         concurrency (int): The number of concurrent URL requests to make and process.
+        cve_authoritative_source (str): The authoritative source for CVE data.
 
     Returns:
         Tuple[int, int, int]: A tuple containing the counts of created, updated,
@@ -201,7 +225,9 @@ async def process_single_url(
         async with semaphore:
             logging.info("Processing URL: %s", cve_url)
             cve_json = await fetch_cve_data(session, cve_url, cve_data_gzipped)
-            created_count, updated_count = await process_cve_json(cve_json)
+            created_count, updated_count = await process_cve_json(
+                cve_json, cve_authoritative_source
+            )
             async with cve_docs_count_lock:
                 created_cve_docs_count += created_count
                 updated_cve_docs_count += updated_count
diff --git a/src/cyhy_cvesync/main.py b/src/cyhy_cvesync/main.py
@@ -60,7 +60,10 @@ async def do_cve_sync(
     # Fetch the CVE URLs and put the CVE data into the database
     created_cve_docs_count, updated_cve_docs_count, deleted_cve_docs_count = (
         await process_urls(
-            cve_urls, config.cvesync.json_url_gzipped, config.cvesync.url_concurrency
+            cve_urls,
+            config.cvesync.json_url_gzipped,
+            config.cvesync.url_concurrency,
+            config.cvesync.cve_authoritative_source,
         )
     )
 
diff --git a/src/cyhy_cvesync/models/config_model.py b/src/cyhy_cvesync/models/config_model.py
@@ -6,14 +6,18 @@
 # Third-Party Libraries
 from pydantic import BaseModel, ConfigDict, Field
 
-from .. import DEFAULT_CVE_URL_PATTERN
+from .. import DEFAULT_CVE_AUTHORITATIVE_SOURCE, DEFAULT_CVE_URL_PATTERN
 
 
 class CVESync(BaseModel):
     """Definition of a CVE Sync configuration."""
 
     model_config = ConfigDict(extra="forbid")
 
+    cve_authoritative_source: str = Field(
+        default=DEFAULT_CVE_AUTHORITATIVE_SOURCE,
+        description="The authoritative source for CVE data",
+    )
     db_auth_uri: str = Field(
         pattern=r"^mongodb://", description="MongoDB connection URI"
     )
diff --git a/tests/test_config_model.py b/tests/test_config_model.py
@@ -5,7 +5,11 @@
 import pytest
 
 # cisagov Libraries
-from cyhy_cvesync.models.config_model import DEFAULT_CVE_URL_PATTERN, CVESync
+from cyhy_cvesync.models.config_model import (
+    DEFAULT_CVE_AUTHORITATIVE_SOURCE,
+    DEFAULT_CVE_URL_PATTERN,
+    CVESync,
+)
 
 
 def test_set_json_url_pattern():
@@ -36,6 +40,15 @@ def test_default_url_concurrency():
     assert config.url_concurrency == 10
 
 
+def test_default_cve_authoritative_source():
+    """Test the default CVE authoritative source."""
+    config = CVESync(
+        db_auth_uri="mongodb://localhost:27017",
+        db_name="test_db",
+    )
+    assert config.cve_authoritative_source == DEFAULT_CVE_AUTHORITATIVE_SOURCE
+
+
 def test_invalid_db_auth_uri():
     """Test an invalid database authentication URI."""
     with pytest.raises(ValidationError):
diff --git a/tests/test_cvesync.py b/tests/test_cvesync.py
diff --git a/tests/test_main.py b/tests/test_main.py

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,3 @@`
`1`	`1`	`"""This file defines the version of this module."""`
`2`	`2`
`3`		`-__version__ = "1.2.1"`
	`3`	`+__version__ = "2.0.0"`
Original file line number	Diff line number	Diff line change
`@@ -60,7 +60,10 @@ async def do_cve_sync(`
`60`	`60`	`# Fetch the CVE URLs and put the CVE data into the database`
`61`	`61`	`created_cve_docs_count, updated_cve_docs_count, deleted_cve_docs_count = (`
`62`	`62`	`await process_urls(`
`63`		`- cve_urls, config.cvesync.json_url_gzipped, config.cvesync.url_concurrency`
	`63`	`+ cve_urls,`
	`64`	`+ config.cvesync.json_url_gzipped,`
	`65`	`+ config.cvesync.url_concurrency,`
	`66`	`+ config.cvesync.cve_authoritative_source,`
`64`	`67`	`)`
`65`	`68`	`)`
`66`	`69`