|
7 | 7 | import tempfile
|
8 | 8 | import time
|
9 | 9 | from contextlib import closing
|
| 10 | +from typing import NamedTuple |
| 11 | +from concurrent.futures import ProcessPoolExecutor |
10 | 12 |
|
11 | 13 | import urllib3
|
12 | 14 | from packaging.version import InvalidVersion, Version
|
|
70 | 72 | );
|
71 | 73 | """
|
72 | 74 | )
|
73 |
| -db.execute(""" |
| 75 | +db.execute( |
| 76 | + """ |
74 | 77 | CREATE TABLE IF NOT EXISTS maintainers (
|
75 | 78 | name STRING,
|
76 | 79 | package_name STRING
|
77 | 80 | );
|
78 |
| -""") |
| 81 | +""" |
| 82 | +) |
79 | 83 | db.commit()
|
| 84 | +pool = ProcessPoolExecutor() |
80 | 85 |
|
81 | 86 |
|
82 | 87 | def get_all_package_names():
|
@@ -195,171 +200,181 @@ def get_maintainers_from_pypi(package: str):
|
195 | 200 | return set()
|
196 | 201 | elif resp.status != 200:
|
197 | 202 | continue
|
198 |
| - return set(re.findall(r"<a href=\"/user/([^/]+)/\" aria-label=", resp.data.decode("utf-8"))) |
| 203 | + return set( |
| 204 | + re.findall( |
| 205 | + r"<a href=\"/user/([^/]+)/\" aria-label=", resp.data.decode("utf-8") |
| 206 | + ) |
| 207 | + ) |
199 | 208 | return set()
|
200 | 209 |
|
201 | 210 |
|
202 |
| -def update_data_from_pypi(): |
203 |
| - for package in tqdm(packages, unit="packages"): |
204 |
| - resp = http.request("GET", f"https://pypi.org/pypi/{package}/json") |
| 211 | +def update_data_for_package(package: str) -> None: |
| 212 | + resp = http.request("GET", f"https://pypi.org/pypi/{package}/json") |
| 213 | + |
| 214 | + if resp.status != 200: |
| 215 | + return |
| 216 | + try: |
| 217 | + resp = json.loads(resp.data.decode("utf-8")) |
| 218 | + except Exception: |
| 219 | + return |
| 220 | + try: |
| 221 | + version = Version(resp["info"]["version"]) |
| 222 | + except InvalidVersion: # The latest release has an invalid version, skip |
| 223 | + return |
| 224 | + latest_version = max(to_versions(resp["releases"].keys())) |
| 225 | + |
| 226 | + # Favor pre-releases over non-pre-releases |
| 227 | + if version < latest_version: |
| 228 | + version = latest_version |
| 229 | + new_resp = http.request( |
| 230 | + "GET", f"https://pypi.org/pypi/{package}/{latest_version}/json" |
| 231 | + ) |
| 232 | + if new_resp.status == 200: |
| 233 | + resp = json.loads(new_resp.data.decode("utf-8")) |
205 | 234 |
|
206 |
| - if resp.status != 200: |
207 |
| - continue |
| 235 | + # Get the exact string for the version that we found |
| 236 | + for strv in resp["releases"]: |
208 | 237 | try:
|
209 |
| - resp = json.loads(resp.data.decode("utf-8")) |
210 |
| - except Exception: |
| 238 | + if Version(strv) == version: |
| 239 | + str_version = strv |
| 240 | + break |
| 241 | + except InvalidVersion: |
211 | 242 | continue
|
| 243 | + else: |
| 244 | + raise ValueError("???") |
| 245 | + |
| 246 | + # Check to see if we already have this version or not |
| 247 | + with closing(db.cursor()) as cur: |
| 248 | + cur.execute( |
| 249 | + "SELECT name FROM packages WHERE name = ? AND version = ?;", |
| 250 | + (package, str_version), |
| 251 | + ) |
| 252 | + if cur.fetchone(): |
| 253 | + return |
| 254 | + |
| 255 | + maintainers = get_maintainers_from_pypi(package) |
| 256 | + |
| 257 | + requires_python = resp["info"]["requires_python"] or "" |
| 258 | + urequires_dist = [ |
| 259 | + normalize_requires_dist(x) for x in resp["info"]["requires_dist"] or [] |
| 260 | + ] |
| 261 | + urequires_dist = sorted(urequires_dist, key=requires_dist_sort_key) |
| 262 | + |
| 263 | + requires_dist = {"specifiers": [], "dists": []} |
| 264 | + requires_extras = {} |
| 265 | + yanked = [] |
| 266 | + |
| 267 | + releases = resp["releases"][str_version] |
| 268 | + uploaded_at = None if not releases else min(x["upload_time"] for x in releases) |
| 269 | + wheel_filenames = [ |
| 270 | + x["filename"] for x in releases if x["filename"].endswith(".whl") |
| 271 | + ] |
| 272 | + has_binary_wheel = False |
| 273 | + |
| 274 | + for filename in wheel_filenames: |
212 | 275 | try:
|
213 |
| - version = Version(resp["info"]["version"]) |
214 |
| - except InvalidVersion: # The latest release has an invalid version, skip |
| 276 | + whl = parse_wheel_filename(filename) |
| 277 | + except InvalidFilenameError: |
215 | 278 | continue
|
216 |
| - latest_version = max(to_versions(resp["releases"].keys())) |
217 |
| - |
218 |
| - # Favor pre-releases over non-pre-releases |
219 |
| - if version < latest_version: |
220 |
| - version = latest_version |
221 |
| - new_resp = http.request( |
222 |
| - "GET", f"https://pypi.org/pypi/{package}/{latest_version}/json" |
223 |
| - ) |
224 |
| - if new_resp.status == 200: |
225 |
| - resp = json.loads(new_resp.data.decode("utf-8")) |
226 |
| - |
227 |
| - # Get the exact string for the version that we found |
228 |
| - for strv in resp["releases"]: |
229 |
| - try: |
230 |
| - if Version(strv) == version: |
231 |
| - str_version = strv |
232 |
| - break |
233 |
| - except InvalidVersion: |
234 |
| - continue |
235 |
| - else: |
236 |
| - raise ValueError("???") |
| 279 | + python_tags, abi_tags, platform_tags = ( |
| 280 | + whl.python_tags, |
| 281 | + whl.abi_tags, |
| 282 | + whl.platform_tags, |
| 283 | + ) |
237 | 284 |
|
238 |
| - # Check to see if we already have this version or not |
239 |
| - with closing(db.cursor()) as cur: |
240 |
| - cur.execute( |
241 |
| - "SELECT name FROM packages WHERE name = ? AND version = ?;", |
242 |
| - (package, str_version), |
243 |
| - ) |
244 |
| - if cur.fetchone(): |
245 |
| - continue |
246 |
| - |
247 |
| - # If we don't have 'requires_dist' information install |
248 |
| - # locally and investigate the installed package |
249 |
| - if False and resp["info"]["requires_dist"] is None: |
250 |
| - new_resp = get_metadata_by_install(package, resp) |
251 |
| - if new_resp is not None: |
252 |
| - resp = new_resp |
253 |
| - |
254 |
| - requires_python = resp["info"]["requires_python"] or "" |
255 |
| - urequires_dist = [ |
256 |
| - normalize_requires_dist(x) for x in resp["info"]["requires_dist"] or [] |
257 |
| - ] |
258 |
| - urequires_dist = sorted(urequires_dist, key=requires_dist_sort_key) |
259 |
| - |
260 |
| - requires_dist = {"specifiers": [], "dists": []} |
261 |
| - requires_extras = {} |
262 |
| - yanked = [] |
263 |
| - |
264 |
| - releases = resp["releases"][str_version] |
265 |
| - uploaded_at = None if not releases else min(x["upload_time"] for x in releases) |
266 |
| - wheel_filenames = [ |
267 |
| - x["filename"] for x in releases if x["filename"].endswith(".whl") |
268 |
| - ] |
269 |
| - has_binary_wheel = False |
270 |
| - |
271 |
| - for filename in wheel_filenames: |
272 |
| - try: |
273 |
| - whl = parse_wheel_filename(filename) |
274 |
| - except InvalidFilenameError: |
275 |
| - continue |
276 |
| - python_tags, abi_tags, platform_tags = ( |
277 |
| - whl.python_tags, |
278 |
| - whl.abi_tags, |
279 |
| - whl.platform_tags, |
| 285 | + for wheel_data in itertools.product(python_tags, abi_tags, platform_tags): |
| 286 | + py, abi, plat = wheel_data |
| 287 | + db.execute( |
| 288 | + """ |
| 289 | + INSERT INTO wheels ( |
| 290 | + name, version, filename, python, abi, platform |
| 291 | + ) VALUES (?, ?, ?, ?, ?, ?); |
| 292 | + """, |
| 293 | + (package, str_version, filename, py, abi, plat), |
280 | 294 | )
|
281 | 295 |
|
282 |
| - for wheel_data in itertools.product(python_tags, abi_tags, platform_tags): |
283 |
| - py, abi, plat = wheel_data |
284 |
| - db.execute( |
285 |
| - """ |
286 |
| - INSERT INTO wheels ( |
287 |
| - name, version, filename, python, abi, platform |
288 |
| - ) VALUES (?, ?, ?, ?, ?, ?); |
289 |
| - """, |
290 |
| - (package, str_version, filename, py, abi, plat), |
291 |
| - ) |
| 296 | + if abi_tags == ["none"] and platform_tags == ["any"]: |
| 297 | + continue |
292 | 298 |
|
293 |
| - if abi_tags == ["none"] and platform_tags == ["any"]: |
294 |
| - continue |
| 299 | + has_binary_wheel = True |
295 | 300 |
|
296 |
| - has_binary_wheel = True |
| 301 | + db.execute( |
| 302 | + """ |
| 303 | + INSERT OR IGNORE INTO packages ( |
| 304 | + name, version, requires_python, has_binary_wheel, uploaded_at |
| 305 | + ) VALUES (?, ?, ?, ?, ?); |
| 306 | + """, |
| 307 | + (package, str_version, requires_python, has_binary_wheel, uploaded_at), |
| 308 | + ) |
297 | 309 |
|
| 310 | + for maintainer in maintainers: |
298 | 311 | db.execute(
|
299 | 312 | """
|
300 |
| - INSERT OR IGNORE INTO packages ( |
301 |
| - name, version, requires_python, has_binary_wheel, uploaded_at |
302 |
| - ) VALUES (?, ?, ?, ?, ?); |
| 313 | + INSERT OR IGNORE INTO maintainers (name, package_name) VALUES (?, ?); |
303 | 314 | """,
|
304 |
| - (package, str_version, requires_python, has_binary_wheel, uploaded_at), |
| 315 | + (maintainer, package), |
305 | 316 | )
|
306 |
| - db.commit() |
307 |
| - |
308 |
| - for maintainer in get_maintainers_from_pypi(package): |
309 |
| - db.execute(""" |
310 |
| - INSERT OR IGNORE INTO maintainers (name, package_name) VALUES (?, ?); |
311 |
| - """, (maintainer, package)) |
312 |
| - |
313 |
| - for req in urequires_dist: |
314 |
| - extras = get_extras(req) |
315 |
| - req_no_specifiers = dist_from_requires_dist(req) |
316 |
| - specifier = specifier_from_requires_dist(req).replace( |
317 |
| - req_no_specifiers + " ", "", 1 |
318 |
| - ) |
319 |
| - if extras: |
320 |
| - for extra in extras: |
321 |
| - db.execute( |
322 |
| - """ |
323 |
| - INSERT OR IGNORE INTO deps ( |
324 |
| - name, |
325 |
| - version, |
326 |
| - dep_name, |
327 |
| - dep_specifier, |
328 |
| - extra |
329 |
| - ) VALUES (?, ?, ?, ?, ?); |
330 |
| - """, |
331 |
| - (package, str_version, req_no_specifiers, specifier, extra), |
332 |
| - ) |
333 |
| - else: |
| 317 | + |
| 318 | + for req in urequires_dist: |
| 319 | + extras = get_extras(req) |
| 320 | + req_no_specifiers = dist_from_requires_dist(req) |
| 321 | + specifier = specifier_from_requires_dist(req).replace( |
| 322 | + req_no_specifiers + " ", "", 1 |
| 323 | + ) |
| 324 | + if extras: |
| 325 | + for extra in extras: |
334 | 326 | db.execute(
|
335 | 327 | """
|
336 | 328 | INSERT OR IGNORE INTO deps (
|
337 |
| - name, |
338 |
| - version, |
339 |
| - dep_name, |
340 |
| - dep_specifier |
341 |
| - ) VALUES (?, ?, ?, ?); |
| 329 | + name, |
| 330 | + version, |
| 331 | + dep_name, |
| 332 | + dep_specifier, |
| 333 | + extra |
| 334 | + ) VALUES (?, ?, ?, ?, ?); |
342 | 335 | """,
|
343 |
| - (package, str_version, req_no_specifiers, specifier), |
| 336 | + (package, str_version, req_no_specifiers, specifier, extra), |
344 | 337 | )
|
| 338 | + else: |
| 339 | + db.execute( |
| 340 | + """ |
| 341 | + INSERT OR IGNORE INTO deps ( |
| 342 | + name, |
| 343 | + version, |
| 344 | + dep_name, |
| 345 | + dep_specifier |
| 346 | + ) VALUES (?, ?, ?, ?); |
| 347 | + """, |
| 348 | + (package, str_version, req_no_specifiers, specifier), |
| 349 | + ) |
345 | 350 |
|
346 |
| - requires_dist["dists"] = sorted(set(requires_dist["dists"])) |
347 |
| - for extra, extra_info in list(requires_extras.items()): |
348 |
| - requires_extras[extra]["dists"] = sorted(set(extra_info["dists"])) |
| 351 | + requires_dist["dists"] = sorted(set(requires_dist["dists"])) |
| 352 | + for extra, extra_info in list(requires_extras.items()): |
| 353 | + requires_extras[extra]["dists"] = sorted(set(extra_info["dists"])) |
349 | 354 |
|
350 |
| - for relv, downloads in resp["releases"].items(): |
351 |
| - for download in downloads: |
352 |
| - if download["yanked"]: |
353 |
| - yanked.append(relv) |
354 |
| - break |
| 355 | + for relv, downloads in resp["releases"].items(): |
| 356 | + for download in downloads: |
| 357 | + if download["yanked"]: |
| 358 | + yanked.append(relv) |
| 359 | + break |
355 | 360 |
|
356 |
| - yanked = sorted_versions(set(yanked)) |
357 |
| - if yanked: |
358 |
| - db.execute( |
359 |
| - "UPDATE packages SET yanked=1 WHERE name=? AND version=?;", |
360 |
| - (package, str_version), |
361 |
| - ) |
| 361 | + yanked = sorted_versions(set(yanked)) |
| 362 | + if yanked: |
| 363 | + db.execute( |
| 364 | + "UPDATE packages SET yanked=1 WHERE name=? AND version=?;", |
| 365 | + (package, str_version), |
| 366 | + ) |
| 367 | + |
| 368 | + db.commit() |
| 369 | + |
| 370 | + return package |
| 371 | + |
| 372 | + |
| 373 | +def update_data_from_pypi(): |
| 374 | + results = pool.map(update_data_for_package, packages) |
| 375 | + for _ in tqdm(results, total=len(packages), unit="packages"): |
| 376 | + pass |
362 | 377 |
|
363 |
| - db.commit() |
364 | 378 |
|
365 |
| -update_data_from_pypi() |
| 379 | +if __name__ == "__main__": |
| 380 | + update_data_from_pypi() |
0 commit comments