@@ -190,15 +190,15 @@ def get_murfey_wheel():
190190 murfey.bootstrap is compatible with all relevant versions of Python.
191191 This also ignores yanked releases, which again should be fine.
192192 """
193- full_path_response = http_session .get ("https://pypi.org/simple /murfey" )
193+ full_path_response = http_session .get (f" { pypi_index_url . rstrip ( '/' ) } /murfey" )
194194 wheels = {}
195195
196196 for wheel_file in re .findall (
197197 b"<a [^>]*>([^<]*).whl</a>" ,
198198 full_path_response .content ,
199199 ):
200200 try :
201- filename = wheel_file .decode ("latin-1 " ) + ".whl"
201+ filename = wheel_file .decode ("utf-8 " ) + ".whl"
202202 version = packaging .version .parse (filename .split ("-" )[1 ])
203203 wheels [version ] = filename
204204 except Exception :
@@ -261,7 +261,7 @@ def find_cygwin_mirror() -> str:
261261
262262 mirror_priorities = {}
263263 for mirror in mirrors .content .split (b"\n " ):
264- mirror_line = mirror .decode ("latin1 " ).strip ().split (";" )
264+ mirror_line = mirror .decode ("utf-8 " ).strip ().split (";" )
265265 if not mirror_line or len (mirror_line ) < 4 :
266266 continue
267267 if not mirror_line [0 ].startswith ("http" ):
@@ -493,7 +493,7 @@ def get_msys2_main_index(
493493
494494 # Parse and rewrite package index content
495495 content : bytes = response .content # Get content in bytes
496- content_text : str = content .decode ("latin1 " ) # Convert to strings
496+ content_text : str = content .decode ("utf-8 " ) # Convert to strings
497497 content_text_list = []
498498 for line in content_text .splitlines ():
499499 if line .startswith ("<a href" ):
@@ -508,7 +508,7 @@ def get_msys2_main_index(
508508
509509 # Reconstruct content
510510 content_text_new = str ("\n " .join (content_text_list )) # Regenerate HTML structure
511- content_new = content_text_new .encode ("latin1 " ) # Convert back to bytes
511+ content_new = content_text_new .encode ("utf-8 " ) # Convert back to bytes
512512 return Response (
513513 content = content_new ,
514514 status_code = response .status_code ,
@@ -538,7 +538,7 @@ def get_msys2_environment_index(
538538
539539 # Parse and rewrite package index content
540540 content : bytes = response .content # Get content in bytes
541- content_text : str = content .decode ("latin1 " ) # Convert to strings
541+ content_text : str = content .decode ("utf-8 " ) # Convert to strings
542542 content_text_list = []
543543 for line in content_text .splitlines ():
544544 if line .startswith ("<a href=" ):
@@ -552,7 +552,7 @@ def get_msys2_environment_index(
552552
553553 # Reconstruct content
554554 content_text_new = str ("\n " .join (content_text_list )) # Regenerate HTML structure
555- content_new = content_text_new .encode ("latin1 " ) # Convert back to bytes
555+ content_new = content_text_new .encode ("utf-8 " ) # Convert back to bytes
556556 return Response (
557557 content = content_new ,
558558 status_code = response .status_code ,
@@ -1066,6 +1066,9 @@ def get_rust_package_crate(
10661066=======================================================================================
10671067"""
10681068
1069+ python_repo_url = "https://files.pythonhosted.org"
1070+ pypi_index_url = "https://pypi.org/simple/"
1071+
10691072
10701073def _get_full_pypi_path_response (package : str ) -> requests .Response :
10711074 """
@@ -1082,73 +1085,55 @@ def _get_full_pypi_path_response(package: str) -> requests.Response:
10821085 package_clean = quote (re .sub (r"[-_.]+" , "-" , package .lower ()), safe = "/" )
10831086
10841087 # Get HTTP response
1085- url = f"https://pypi.org/simple /{ package_clean } "
1088+ url = f"{ pypi_index_url . rstrip ( '/' ) } /{ package_clean } "
10861089 response = http_session .get (url )
10871090 if response .status_code != 200 :
10881091 raise HTTPException (status_code = response .status_code )
10891092 return response
10901093
10911094
1092- @pypi .get ("/" , response_class = Response )
1095+ @pypi .get ("/index/ " , response_class = Response )
10931096def get_pypi_index ():
10941097 """
10951098 Obtain list of all PyPI packages via the simple API (PEP 503).
10961099 """
10971100
1098- response = http_session .get ("https://pypi.org/simple/" )
1101+ response = http_session .get (pypi_index_url )
10991102 return Response (
11001103 content = response .content ,
11011104 status_code = response .status_code ,
11021105 media_type = response .headers .get ("Content-Type" ),
11031106 )
11041107
11051108
1106- @pypi .get ("/{package}/" , response_class = Response )
1109+ @pypi .get ("/index/ {package}/" , response_class = Response )
11071110def get_pypi_package_downloads_list (request : Request , package : str ) -> Response :
11081111 """
11091112 Obtain list of all package downloads from PyPI via the simple API (PEP 503), and
11101113 rewrite all download URLs to point to this server, under the current directory.
11111114 """
11121115
1113- def _rewrite_pypi_url (match ):
1114- """
1115- Use regular expression matching to rewrite the URLs. Points them from
1116- pythonhosted.org to current server, and removes the hash from the URL as well
1117- """
1118- # url = match.group(4) # Original
1119- url = match .group (3 )
1120- return '<a href="' + url + '"' + match .group (2 ) + ">" + match .group (3 ) + "</a>"
1121-
11221116 logger .debug (f"Received request to access { str (request .url )!r} " )
11231117
1118+ # Construct base URL to rewrite with
1119+ netloc = resolve_netloc (request )
1120+ scheme = request .headers .get ("X-Forwarded-Proto" , request .url .scheme )
1121+ router_path = request .url .path .removesuffix (f"/index/{ package } /" )
1122+ base_url = f"{ scheme } ://{ netloc } { router_path } "
1123+
11241124 # Validate package and URL
11251125 full_path_response = _get_full_pypi_path_response (package )
11261126
11271127 # Process lines related to PyPI packages in response
11281128 content : bytes = full_path_response .content # In bytes
1129- content_text : str = content .decode ("latin1" ) # Convert to strings
1130- content_text_list = []
1131- for line in content_text .splitlines ():
1132- # Look for lines with hyperlinks
1133- if "<a href" in line :
1134- # Rewrite URL to point to current proxy server
1135- line_new = re .sub (
1136- '^<a href="([^">]*)"([^>]*)>([^<]*)</a>' , # Regex search criteria
1137- _rewrite_pypi_url , # Function to apply search criteria to
1138- line ,
1139- )
1140- content_text_list .append (line_new )
1141-
1142- # Add entry for wheel metadata (PEP 658; see _expose_wheel_metadata)
1143- if ".whl" in line_new :
1144- line_metadata = line_new .replace (".whl" , ".whl.metadata" )
1145- content_text_list .append (line_metadata )
1146- else :
1147- # Append other lines as normal
1148- content_text_list .append (line )
1129+ content_text : str = content .decode ("utf-8" ) # Convert to strings
11491130
1150- content_text_new = str ("\n " .join (content_text_list )) # Regenerate HTML structure
1151- content_new = content_text_new .encode ("latin1" ) # Convert back to bytes
1131+ # PyPI's simple index now directly points to https://pythonhosted.org
1132+ # It also uses newlines partway through the '<a ...></a>' blocks now
1133+ # It's thus now better to use regex substitution on the page as a whole
1134+ content_text_new = re .sub (re .escape (python_repo_url ), base_url , content_text )
1135+
1136+ content_new = content_text_new .encode ("utf-8" ) # Convert back to bytes
11521137
11531138 return Response (
11541139 content = content_new ,
@@ -1157,76 +1142,29 @@ def _rewrite_pypi_url(match):
11571142 )
11581143
11591144
1160- @pypi .get ("/{package }/{filename}" , response_class = StreamingResponse )
1145+ @pypi .get ("/packages/{a}/{b}/{c }/{filename}" , response_class = StreamingResponse )
11611146def get_pypi_file (
11621147 request : Request ,
1163- package : str ,
1148+ a : str ,
1149+ b : str ,
1150+ c : str ,
11641151 filename : str ,
11651152):
11661153 """
11671154 Obtain and pass through a specific download for a PyPI package.
11681155 """
1169-
1170- def _expose_wheel_metadata (response_bytes : bytes ) -> bytes :
1171- """
1172- As of pip v22.3 (coinciding with PEP 658), pip expects to find an additonal
1173- ".whl.metadata" file based on the URL of the ".whl" file present on the PyPI Simple
1174- Index. However, because it is not listed on the webpage itself, it is not copied
1175- across to the proxy. This function adds that URL to the proxy explicitly.
1176- """
1177-
1178- # Analyse API response line-by-line
1179- response_text : str = response_bytes .decode ("latin1" ) # Convert to text
1180- response_text_list = [] # Write line-by-line analysis to here
1181-
1182- for line in response_text .splitlines ():
1183- # Process URLs
1184- if r"<a href=" in line :
1185- response_text_list .append (line ) # Add to list
1186-
1187- # Add new line to explicitly call for wheel metadata
1188- if ".whl" in line :
1189- # Add ".metadata" to URL and file name
1190- line_new = line .replace (".whl" , ".whl.metadata" )
1191- response_text_list .append (line_new ) # Add to list
1192-
1193- # Append all other lines as normal
1194- else :
1195- response_text_list .append (line )
1196-
1197- # Recover original structure
1198- response_text_new = str ("\n " .join (response_text_list ))
1199- response_bytes_new = bytes (response_text_new , encoding = "latin-1" )
1200-
1201- return response_bytes_new
1202-
12031156 logger .debug (f"Received request to access { str (request .url )!r} " )
12041157
1205- # Validate package and URL
1206- full_path_response = _get_full_pypi_path_response (package )
1207-
1208- # Get filename in bytes
1209- filename_bytes = re .escape (filename .encode ("latin1" ))
1210-
1211- # Add explicit URLs for ".whl.metadata" files
1212- content = _expose_wheel_metadata (full_path_response .content )
1213-
1214- # Find package matching the specified filename
1215- selected_package_link = re .search (
1216- b'<a href="([^">]*)"[^>]*>' + filename_bytes + b"</a>" ,
1217- content ,
1218- )
1219- if not selected_package_link :
1220- raise HTTPException (status_code = 404 , detail = "File not found for package" )
1221- original_url = selected_package_link .group (1 )
1222- response = http_session .get (original_url )
1158+ package_url = f"{ python_repo_url } /packages/{ a } /{ b } /{ c } /{ filename } "
1159+ logger .debug (f"Forwarding package request to { package_url !r} " )
1160+ response = http_session .get (package_url , stream = True )
12231161
12241162 # Construct headers to return with response
12251163 headers : dict [str , str ] = {}
1226- if response .headers . get ( "Content-Length" ) :
1227- headers [ "Content-Lengh" ] = response .headers [ "Content-Length" ]
1164+ if response .status_code != 200 :
1165+ raise HTTPException ( status_code = response .status_code )
12281166 return StreamingResponse (
1229- content = response .iter_content ( chunk_size = 8192 ) ,
1167+ content = response .raw ,
12301168 status_code = response .status_code ,
12311169 headers = headers ,
12321170 media_type = response .headers .get ("Content-Type" ),
0 commit comments