Skip to content

Commit 76653cb

Browse files
authored
Rewrote PyPI mirror endpoints due to change in the way PyPI serves package URLs (#613)
* Simple index is now located under '/pypi/index/'
* Package URL paths now mirror the Python package repo's URL structure ('[https://files.pythonhosted.org]/packages/path/to/python/package')
* Updated route manifest
1 parent 1c08889 commit 76653cb

File tree

2 files changed

+45
-103
lines changed

2 files changed

+45
-103
lines changed

src/murfey/server/api/bootstrap.py

Lines changed: 37 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -190,15 +190,15 @@ def get_murfey_wheel():
190190
murfey.bootstrap is compatible with all relevant versions of Python.
191191
This also ignores yanked releases, which again should be fine.
192192
"""
193-
full_path_response = http_session.get("https://pypi.org/simple/murfey")
193+
full_path_response = http_session.get(f"{pypi_index_url.rstrip('/')}/murfey")
194194
wheels = {}
195195

196196
for wheel_file in re.findall(
197197
b"<a [^>]*>([^<]*).whl</a>",
198198
full_path_response.content,
199199
):
200200
try:
201-
filename = wheel_file.decode("latin-1") + ".whl"
201+
filename = wheel_file.decode("utf-8") + ".whl"
202202
version = packaging.version.parse(filename.split("-")[1])
203203
wheels[version] = filename
204204
except Exception:
@@ -261,7 +261,7 @@ def find_cygwin_mirror() -> str:
261261

262262
mirror_priorities = {}
263263
for mirror in mirrors.content.split(b"\n"):
264-
mirror_line = mirror.decode("latin1").strip().split(";")
264+
mirror_line = mirror.decode("utf-8").strip().split(";")
265265
if not mirror_line or len(mirror_line) < 4:
266266
continue
267267
if not mirror_line[0].startswith("http"):
@@ -493,7 +493,7 @@ def get_msys2_main_index(
493493

494494
# Parse and rewrite package index content
495495
content: bytes = response.content # Get content in bytes
496-
content_text: str = content.decode("latin1") # Convert to strings
496+
content_text: str = content.decode("utf-8") # Convert to strings
497497
content_text_list = []
498498
for line in content_text.splitlines():
499499
if line.startswith("<a href"):
@@ -508,7 +508,7 @@ def get_msys2_main_index(
508508

509509
# Reconstruct content
510510
content_text_new = str("\n".join(content_text_list)) # Regenerate HTML structure
511-
content_new = content_text_new.encode("latin1") # Convert back to bytes
511+
content_new = content_text_new.encode("utf-8") # Convert back to bytes
512512
return Response(
513513
content=content_new,
514514
status_code=response.status_code,
@@ -538,7 +538,7 @@ def get_msys2_environment_index(
538538

539539
# Parse and rewrite package index content
540540
content: bytes = response.content # Get content in bytes
541-
content_text: str = content.decode("latin1") # Convert to strings
541+
content_text: str = content.decode("utf-8") # Convert to strings
542542
content_text_list = []
543543
for line in content_text.splitlines():
544544
if line.startswith("<a href="):
@@ -552,7 +552,7 @@ def get_msys2_environment_index(
552552

553553
# Reconstruct content
554554
content_text_new = str("\n".join(content_text_list)) # Regenerate HTML structure
555-
content_new = content_text_new.encode("latin1") # Convert back to bytes
555+
content_new = content_text_new.encode("utf-8") # Convert back to bytes
556556
return Response(
557557
content=content_new,
558558
status_code=response.status_code,
@@ -1066,6 +1066,9 @@ def get_rust_package_crate(
10661066
=======================================================================================
10671067
"""
10681068

1069+
python_repo_url = "https://files.pythonhosted.org"
1070+
pypi_index_url = "https://pypi.org/simple/"
1071+
10691072

10701073
def _get_full_pypi_path_response(package: str) -> requests.Response:
10711074
"""
@@ -1082,73 +1085,55 @@ def _get_full_pypi_path_response(package: str) -> requests.Response:
10821085
package_clean = quote(re.sub(r"[-_.]+", "-", package.lower()), safe="/")
10831086

10841087
# Get HTTP response
1085-
url = f"https://pypi.org/simple/{package_clean}"
1088+
url = f"{pypi_index_url.rstrip('/')}/{package_clean}"
10861089
response = http_session.get(url)
10871090
if response.status_code != 200:
10881091
raise HTTPException(status_code=response.status_code)
10891092
return response
10901093

10911094

1092-
@pypi.get("/", response_class=Response)
1095+
@pypi.get("/index/", response_class=Response)
10931096
def get_pypi_index():
10941097
"""
10951098
Obtain list of all PyPI packages via the simple API (PEP 503).
10961099
"""
10971100

1098-
response = http_session.get("https://pypi.org/simple/")
1101+
response = http_session.get(pypi_index_url)
10991102
return Response(
11001103
content=response.content,
11011104
status_code=response.status_code,
11021105
media_type=response.headers.get("Content-Type"),
11031106
)
11041107

11051108

1106-
@pypi.get("/{package}/", response_class=Response)
1109+
@pypi.get("/index/{package}/", response_class=Response)
11071110
def get_pypi_package_downloads_list(request: Request, package: str) -> Response:
11081111
"""
11091112
Obtain list of all package downloads from PyPI via the simple API (PEP 503), and
11101113
rewrite all download URLs to point to this server, under the current directory.
11111114
"""
11121115

1113-
def _rewrite_pypi_url(match):
1114-
"""
1115-
Use regular expression matching to rewrite the URLs. Points them from
1116-
pythonhosted.org to current server, and removes the hash from the URL as well
1117-
"""
1118-
# url = match.group(4) # Original
1119-
url = match.group(3)
1120-
return '<a href="' + url + '"' + match.group(2) + ">" + match.group(3) + "</a>"
1121-
11221116
logger.debug(f"Received request to access {str(request.url)!r}")
11231117

1118+
# Construct base URL to rewrite with
1119+
netloc = resolve_netloc(request)
1120+
scheme = request.headers.get("X-Forwarded-Proto", request.url.scheme)
1121+
router_path = request.url.path.removesuffix(f"/index/{package}/")
1122+
base_url = f"{scheme}://{netloc}{router_path}"
1123+
11241124
# Validate package and URL
11251125
full_path_response = _get_full_pypi_path_response(package)
11261126

11271127
# Process lines related to PyPI packages in response
11281128
content: bytes = full_path_response.content # In bytes
1129-
content_text: str = content.decode("latin1") # Convert to strings
1130-
content_text_list = []
1131-
for line in content_text.splitlines():
1132-
# Look for lines with hyperlinks
1133-
if "<a href" in line:
1134-
# Rewrite URL to point to current proxy server
1135-
line_new = re.sub(
1136-
'^<a href="([^">]*)"([^>]*)>([^<]*)</a>', # Regex search criteria
1137-
_rewrite_pypi_url, # Function to apply search criteria to
1138-
line,
1139-
)
1140-
content_text_list.append(line_new)
1141-
1142-
# Add entry for wheel metadata (PEP 658; see _expose_wheel_metadata)
1143-
if ".whl" in line_new:
1144-
line_metadata = line_new.replace(".whl", ".whl.metadata")
1145-
content_text_list.append(line_metadata)
1146-
else:
1147-
# Append other lines as normal
1148-
content_text_list.append(line)
1129+
content_text: str = content.decode("utf-8") # Convert to strings
11491130

1150-
content_text_new = str("\n".join(content_text_list)) # Regenerate HTML structure
1151-
content_new = content_text_new.encode("latin1") # Convert back to bytes
1131+
# PyPI's simple index now directly points to https://files.pythonhosted.org
1132+
# It also uses newlines partway through the '<a ...></a>' blocks now
1133+
# It's thus now better to use regex substitution on the page as a whole
1134+
content_text_new = re.sub(re.escape(python_repo_url), base_url, content_text)
1135+
1136+
content_new = content_text_new.encode("utf-8") # Convert back to bytes
11521137

11531138
return Response(
11541139
content=content_new,
@@ -1157,76 +1142,29 @@ def _rewrite_pypi_url(match):
11571142
)
11581143

11591144

1160-
@pypi.get("/{package}/{filename}", response_class=StreamingResponse)
1145+
@pypi.get("/packages/{a}/{b}/{c}/{filename}", response_class=StreamingResponse)
11611146
def get_pypi_file(
11621147
request: Request,
1163-
package: str,
1148+
a: str,
1149+
b: str,
1150+
c: str,
11641151
filename: str,
11651152
):
11661153
"""
11671154
Obtain and pass through a specific download for a PyPI package.
11681155
"""
1169-
1170-
def _expose_wheel_metadata(response_bytes: bytes) -> bytes:
1171-
"""
1172-
As of pip v22.3 (coinciding with PEP 658), pip expects to find an additional
1173-
".whl.metadata" file based on the URL of the ".whl" file present on the PyPI Simple
1174-
Index. However, because it is not listed on the webpage itself, it is not copied
1175-
across to the proxy. This function adds that URL to the proxy explicitly.
1176-
"""
1177-
1178-
# Analyse API response line-by-line
1179-
response_text: str = response_bytes.decode("latin1") # Convert to text
1180-
response_text_list = [] # Write line-by-line analysis to here
1181-
1182-
for line in response_text.splitlines():
1183-
# Process URLs
1184-
if r"<a href=" in line:
1185-
response_text_list.append(line) # Add to list
1186-
1187-
# Add new line to explicitly call for wheel metadata
1188-
if ".whl" in line:
1189-
# Add ".metadata" to URL and file name
1190-
line_new = line.replace(".whl", ".whl.metadata")
1191-
response_text_list.append(line_new) # Add to list
1192-
1193-
# Append all other lines as normal
1194-
else:
1195-
response_text_list.append(line)
1196-
1197-
# Recover original structure
1198-
response_text_new = str("\n".join(response_text_list))
1199-
response_bytes_new = bytes(response_text_new, encoding="latin-1")
1200-
1201-
return response_bytes_new
1202-
12031156
logger.debug(f"Received request to access {str(request.url)!r}")
12041157

1205-
# Validate package and URL
1206-
full_path_response = _get_full_pypi_path_response(package)
1207-
1208-
# Get filename in bytes
1209-
filename_bytes = re.escape(filename.encode("latin1"))
1210-
1211-
# Add explicit URLs for ".whl.metadata" files
1212-
content = _expose_wheel_metadata(full_path_response.content)
1213-
1214-
# Find package matching the specified filename
1215-
selected_package_link = re.search(
1216-
b'<a href="([^">]*)"[^>]*>' + filename_bytes + b"</a>",
1217-
content,
1218-
)
1219-
if not selected_package_link:
1220-
raise HTTPException(status_code=404, detail="File not found for package")
1221-
original_url = selected_package_link.group(1)
1222-
response = http_session.get(original_url)
1158+
package_url = f"{python_repo_url}/packages/{a}/{b}/{c}/{filename}"
1159+
logger.debug(f"Forwarding package request to {package_url!r}")
1160+
response = http_session.get(package_url, stream=True)
12231161

12241162
# Construct headers to return with response
12251163
headers: dict[str, str] = {}
1226-
if response.headers.get("Content-Length"):
1227-
headers["Content-Lengh"] = response.headers["Content-Length"]
1164+
if response.status_code != 200:
1165+
raise HTTPException(status_code=response.status_code)
12281166
return StreamingResponse(
1229-
content=response.iter_content(chunk_size=8192),
1167+
content=response.raw,
12301168
status_code=response.status_code,
12311169
headers=headers,
12321170
media_type=response.headers.get("Content-Type"),

src/murfey/util/route_manifest.yaml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -215,22 +215,26 @@ murfey.server.api.bootstrap.plugins:
215215
methods:
216216
- GET
217217
murfey.server.api.bootstrap.pypi:
218-
- path: /pypi/
218+
- path: /pypi/index/
219219
function: get_pypi_index
220220
path_params: []
221221
methods:
222222
- GET
223-
- path: /pypi/{package}/
223+
- path: /pypi/index/{package}/
224224
function: get_pypi_package_downloads_list
225225
path_params:
226226
- name: package
227227
type: str
228228
methods:
229229
- GET
230-
- path: /pypi/{package}/{filename}
230+
- path: /pypi/packages/{a}/{b}/{c}/{filename}
231231
function: get_pypi_file
232232
path_params:
233-
- name: package
233+
- name: a
234+
type: str
235+
- name: b
236+
type: str
237+
- name: c
234238
type: str
235239
- name: filename
236240
type: str

0 commit comments

Comments
 (0)