Skip to content

Commit c7172a3

Browse files
committed
Parallelize workload
1 parent a665e58 commit c7172a3

File tree

1 file changed

+157
-142
lines changed

1 file changed

+157
-142
lines changed

main.py

+157-142
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import tempfile
88
import time
99
from contextlib import closing
10+
from typing import NamedTuple
11+
from concurrent.futures import ProcessPoolExecutor
1012

1113
import urllib3
1214
from packaging.version import InvalidVersion, Version
@@ -70,13 +72,16 @@
7072
);
7173
"""
7274
)
73-
db.execute("""
75+
db.execute(
76+
"""
7477
CREATE TABLE IF NOT EXISTS maintainers (
7578
name STRING,
7679
package_name STRING
7780
);
78-
""")
81+
"""
82+
)
7983
db.commit()
84+
pool = ProcessPoolExecutor()
8085

8186

8287
def get_all_package_names():
@@ -195,171 +200,181 @@ def get_maintainers_from_pypi(package: str):
195200
return set()
196201
elif resp.status != 200:
197202
continue
198-
return set(re.findall(r"<a href=\"/user/([^/]+)/\" aria-label=", resp.data.decode("utf-8")))
203+
return set(
204+
re.findall(
205+
r"<a href=\"/user/([^/]+)/\" aria-label=", resp.data.decode("utf-8")
206+
)
207+
)
199208
return set()
200209

201210

202-
def update_data_from_pypi():
203-
for package in tqdm(packages, unit="packages"):
204-
resp = http.request("GET", f"https://pypi.org/pypi/{package}/json")
211+
def update_data_for_package(package: str) -> "str | None":
    """Fetch one package's metadata from PyPI and record it in the database.

    Downloads the package's JSON document, selects the highest version listed
    under ``releases`` (so a newer pre-release wins over the version reported
    in ``info``) and, unless that exact name/version pair is already present
    in the ``packages`` table, inserts rows into ``wheels``, ``packages``,
    ``maintainers`` and ``deps`` before committing.

    Args:
        package: Project name as used in ``https://pypi.org/pypi/<name>/json``.

    Returns:
        ``package`` once its rows have been committed, or ``None`` when the
        package was skipped (non-200 response, undecodable JSON, an invalid
        or missing version, or the version is already stored).

    Raises:
        ValueError: If no key of ``releases`` parses to the selected version.
    """
    # NOTE(review): this function is dispatched through a process pool, while
    # `db` appears to be a single module-level connection. Confirm that the
    # connection is safe to use from several worker processes.
    resp = http.request("GET", f"https://pypi.org/pypi/{package}/json")
    if resp.status != 200:
        return None
    try:
        resp = json.loads(resp.data.decode("utf-8"))
    except Exception:
        # Best-effort crawl: an unreadable body just means the package is skipped.
        return None

    try:
        version = Version(resp["info"]["version"])
    except InvalidVersion:  # The latest release has an invalid version, skip
        return None

    # default=None keeps a package without any usable release from raising
    # ValueError out of max(); such a package is skipped instead.
    latest_version = max(to_versions(resp["releases"].keys()), default=None)
    if latest_version is None:
        return None

    # Favor pre-releases over non-pre-releases
    if version < latest_version:
        version = latest_version
        new_resp = http.request(
            "GET", f"https://pypi.org/pypi/{package}/{latest_version}/json"
        )
        if new_resp.status == 200:
            try:
                resp = json.loads(new_resp.data.decode("utf-8"))
            except Exception:
                # Same tolerance as the first request: keep the project-level
                # document, which already lists the release we selected.
                pass

    # Get the exact string for the version that we found
    for strv in resp["releases"]:
        try:
            if Version(strv) == version:
                str_version = strv
                break
        except InvalidVersion:
            continue
    else:
        raise ValueError(
            f"no release of {package!r} matches selected version {version}"
        )

    # Check to see if we already have this version or not
    with closing(db.cursor()) as cur:
        cur.execute(
            "SELECT name FROM packages WHERE name = ? AND version = ?;",
            (package, str_version),
        )
        if cur.fetchone():
            return None

    maintainers = get_maintainers_from_pypi(package)

    requires_python = resp["info"]["requires_python"] or ""
    urequires_dist = [
        normalize_requires_dist(x) for x in resp["info"]["requires_dist"] or []
    ]
    urequires_dist = sorted(urequires_dist, key=requires_dist_sort_key)

    releases = resp["releases"][str_version]
    uploaded_at = min(x["upload_time"] for x in releases) if releases else None
    wheel_filenames = [
        x["filename"] for x in releases if x["filename"].endswith(".whl")
    ]
    has_binary_wheel = False

    for filename in wheel_filenames:
        try:
            whl = parse_wheel_filename(filename)
        except InvalidFilenameError:
            continue
        python_tags, abi_tags, platform_tags = (
            whl.python_tags,
            whl.abi_tags,
            whl.platform_tags,
        )

        # One row per (python, abi, platform) combination of the wheel's tags.
        for wheel_data in itertools.product(python_tags, abi_tags, platform_tags):
            py, abi, plat = wheel_data
            db.execute(
                """
                INSERT INTO wheels (
                    name, version, filename, python, abi, platform
                ) VALUES (?, ?, ?, ?, ?, ?);
                """,
                (package, str_version, filename, py, abi, plat),
            )

        # A wheel tagged exactly none/any does not count as a binary wheel.
        if abi_tags == ["none"] and platform_tags == ["any"]:
            continue

        has_binary_wheel = True

    db.execute(
        """
        INSERT OR IGNORE INTO packages (
            name, version, requires_python, has_binary_wheel, uploaded_at
        ) VALUES (?, ?, ?, ?, ?);
        """,
        (package, str_version, requires_python, has_binary_wheel, uploaded_at),
    )

    for maintainer in maintainers:
        db.execute(
            """
            INSERT OR IGNORE INTO maintainers (name, package_name) VALUES (?, ?);
            """,
            (maintainer, package),
        )

    for req in urequires_dist:
        extras = get_extras(req)
        req_no_specifiers = dist_from_requires_dist(req)
        specifier = specifier_from_requires_dist(req).replace(
            req_no_specifiers + " ", "", 1
        )
        if extras:
            # One row per extra that pulls in this dependency.
            for extra in extras:
                db.execute(
                    """
                    INSERT OR IGNORE INTO deps (
                        name,
                        version,
                        dep_name,
                        dep_specifier,
                        extra
                    ) VALUES (?, ?, ?, ?, ?);
                    """,
                    (package, str_version, req_no_specifiers, specifier, extra),
                )
        else:
            db.execute(
                """
                INSERT OR IGNORE INTO deps (
                    name,
                    version,
                    dep_name,
                    dep_specifier
                ) VALUES (?, ?, ?, ?);
                """,
                (package, str_version, req_no_specifiers, specifier),
            )

    # NOTE(review): this marks the stored version as yanked when *any* release
    # of the package has a yanked file, not only `str_version`. Behavior is
    # kept as-is; confirm whether only the selected release should be checked.
    yanked = []
    for relv, downloads in resp["releases"].items():
        for download in downloads:
            if download["yanked"]:
                yanked.append(relv)
                break

    yanked = sorted_versions(set(yanked))
    if yanked:
        db.execute(
            "UPDATE packages SET yanked=1 WHERE name=? AND version=?;",
            (package, str_version),
        )

    db.commit()

    return package
371+
372+
373+
def update_data_from_pypi():
    """Update every name in ``packages`` through the process pool.

    Each name is handed to ``update_data_for_package`` via ``pool.map``. The
    returned iterator is consumed only so that tqdm can advance the progress
    bar once per result; the results themselves are discarded.
    """
    progress = tqdm(
        pool.map(update_data_for_package, packages),
        total=len(packages),
        unit="packages",
    )
    for _finished in progress:
        continue
362377

363-
db.commit()
364378

365-
update_data_from_pypi()
379+
# Start the crawl only when this file is executed as a script, so that merely
# importing the module (as process-pool workers may do) does not trigger it.
if __name__ == "__main__":
    update_data_from_pypi()

0 commit comments

Comments
 (0)