Skip to content

Commit b5c8960

Browse files
committed
Add more data about upload time, vulns
1 parent d06bcf6 commit b5c8960

File tree

2 files changed

+29
-33
lines changed

2 files changed

+29
-33
lines changed

README.md

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ CREATE TABLE packages (
4343
requires_python STRING,
4444
yanked BOOLEAN DEFAULT FALSE,
4545
has_binary_wheel BOOLEAN,
46-
uploaded_at TIMESTAMP,
46+
has_vulnerabilities BOOLEAN,
47+
first_uploaded_at TIMESTAMP,
48+
last_uploaded_at TIMESTAMP,
4749
recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
4850
downloads INTEGER,
4951
scorecard_overall FLOAT,
@@ -68,6 +70,7 @@ CREATE TABLE wheels (
6870
python STRING,
6971
abi STRING,
7072
platform STRING,
73+
uploaded_at TIMESTAMP,
7174
PRIMARY KEY (package_name, filename)
7275
);
7376

@@ -95,21 +98,7 @@ CREATE TABLE scorecard_checks (
9598

9699
### Download data
97100

98-
Downloads are grabbed manually from BigQuery with this query:
99-
100-
```sql
101-
SELECT file.project, COUNT(*) AS downloads
102-
FROM `bigquery-public-data.pypi.file_downloads`
103-
WHERE (
104-
DATE(timestamp)
105-
BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)
106-
AND CURRENT_DATE()
107-
)
108-
GROUP BY file.project
109-
ORDER BY downloads DESC;
110-
```
111-
112-
The results are stored in `downloads.csv`.
101+
Downloads are grabbed from https://github.com/hugovk/top-pypi-packages, but are only available for the top 5,000 packages.
113102

114103
## Running locally
115104

main.py

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
from __future__ import annotations
22
import contextlib
3-
import csv
43
import itertools
54
import json
6-
import logging
75
import os
86
import re
97
import sqlite3
@@ -27,6 +25,7 @@
2725

2826
MAX_WORKERS = 16
2927
GOOGLE_ASSURED_OSS_PACKAGES = set()
28+
DOWNLOADS_URL = "https://raw.githubusercontent.com/hugovk/top-pypi-packages/main/top-pypi-packages-30-days.min.json"
3029

3130
@contextlib.contextmanager
3231
def locked_db():
@@ -254,13 +253,14 @@ def update_data_for_package(package: str) -> None:
254253
yanked = []
255254

256255
releases = resp["releases"][str_version]
257-
uploaded_at = None if not releases else min(x["upload_time"] for x in releases)
256+
first_uploaded_at = None if not releases else min(x["upload_time"] for x in releases)
257+
last_uploaded_at = None if not releases else max(x["upload_time"] for x in releases)
258258
wheel_data = [
259-
(x["filename"], x["url"]) for x in releases if x["filename"].endswith(".whl")
259+
(x["filename"], x["url"], x["upload_time"]) for x in releases if x["filename"].endswith(".whl")
260260
]
261261
has_binary_wheel = False
262262

263-
for filename, _ in wheel_data:
263+
for filename, _, uploaded_at in wheel_data:
264264
try:
265265
whl = parse_wheel_filename(filename)
266266
except InvalidFilenameError:
@@ -276,31 +276,36 @@ def update_data_for_package(package: str) -> None:
276276
db.execute(
277277
"""
278278
INSERT INTO wheels (
279-
package_name, filename, build, python, abi, platform
280-
) VALUES (?, ?, ?, ?, ?, ?);
279+
package_name, filename, build, python, abi, platform, uploaded_at
280+
) VALUES (?, ?, ?, ?, ?, ?, ?);
281281
""",
282-
(package, filename, whl.build, py, abi, plat),
282+
(package, filename, whl.build, py, abi, plat, uploaded_at),
283283
)
284284

285285
if abi_tags == ["none"] and platform_tags == ["any"]:
286286
continue
287287

288288
has_binary_wheel = True
289289

290+
# Check if the package has any known vulnerabilities.
291+
has_vulnerabilities = bool(resp.get("vulnerabilities", []))
292+
290293
package_downloads = downloads.get(package, 0)
291294
with locked_db() as db:
292295
db.execute(
293296
"""
294297
INSERT OR IGNORE INTO packages (
295-
name, version, requires_python, has_binary_wheel, uploaded_at, downloads, scorecard_overall, in_google_assured_oss
296-
) VALUES (?, ?, ?, ?, ?, ?, ?, ?);
298+
name, version, requires_python, has_binary_wheel, has_vulnerabilities, first_uploaded_at, last_uploaded_at, downloads, scorecard_overall, in_google_assured_oss
299+
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
297300
""",
298301
(
299302
package,
300303
str_version,
301304
requires_python,
302305
has_binary_wheel,
303-
uploaded_at,
306+
has_vulnerabilities,
307+
first_uploaded_at,
308+
last_uploaded_at,
304309
package_downloads,
305310
scorecard_overall,
306311
package.lower() in GOOGLE_ASSURED_OSS_PACKAGES
@@ -460,11 +465,10 @@ def get_google_assured_oss_packages(http: urllib3.PoolManager) -> set[str]:
460465
pypi_deps_db = os.path.join(base_dir, "pypi.db")
461466

462467
downloads = {}
463-
with open(os.path.join(base_dir, "downloads.csv")) as f:
464-
csv = csv.reader(f)
465-
next(csv)
466-
for project, dls in csv:
467-
downloads[project] = int(dls)
468+
resp = http.request("GET", DOWNLOADS_URL)
469+
assert resp.status == 200
470+
for row in resp.json()["rows"]:
471+
downloads[row["project"]] = row["download_count"]
468472

469473
_DB = sqlite3.connect(os.path.join(base_dir, "pypi.db"), check_same_thread=False)
470474
_DB.execute(
@@ -475,7 +479,9 @@ def get_google_assured_oss_packages(http: urllib3.PoolManager) -> set[str]:
475479
requires_python TEXT,
476480
yanked BOOLEAN DEFAULT 0,
477481
has_binary_wheel BOOLEAN,
478-
uploaded_at TIMESTAMP,
482+
has_vulnerabilities BOOLEAN,
483+
first_uploaded_at TIMESTAMP,
484+
last_uploaded_at TIMESTAMP,
479485
recorded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
480486
downloads INTEGER,
481487
scorecard_overall FLOAT,
@@ -508,6 +514,7 @@ def get_google_assured_oss_packages(http: urllib3.PoolManager) -> set[str]:
508514
python TEXT,
509515
abi TEXT,
510516
platform TEXT,
517+
uploaded_at TIMESTAMP,
511518
FOREIGN KEY (package_name) REFERENCES packages(name)
512519
);
513520
"""

0 commit comments

Comments
 (0)