Skip to content

Commit 7a38803

Browse files
authored
Extract index generation into a separate module from upload_package_repo.py (#3566)
## Motivation This PR extracts the HTML index generation logic from `upload_package_repo.py` into a separate module, `generate_package_indexes.py`. ## Technical Details - Moved S3-based index generation functions into a new script: - generate_index_from_s3 - generate_top_index_from_s3 - Updated upload_package_repo.py to import and call these functions from the new module. - The new script can also be executed standalone via CLI to regenerate indexes for an existing repository prefix. ## Test Plan Run on CI ## Test Result Test: https://github.com/ROCm/TheRock/actions/runs/22338748161/job/64637174767 ## Submission Checklist - [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
1 parent 0645768 commit 7a38803

2 files changed

Lines changed: 298 additions & 250 deletions

File tree

Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
#!/usr/bin/env python3
2+
3+
"""Generate index.html files for package repositories.
4+
5+
This script contains index generation logic.
6+
TODO(#3329) Enable running index generation at server-side.
7+
8+
It can be used as a standalone tool:
9+
10+
python3 build_tools/packaging/linux/generate_package_indexes.py \
11+
--s3-bucket <bucket> \
12+
--prefix <prefix>
13+
14+
Examples:
15+
--prefix deb/20260223-12345
16+
--prefix rpm/20260223-12345
17+
--top-prefix deb
18+
--top-prefix rpm
19+
"""
20+
21+
import argparse
22+
import boto3
23+
import os
24+
from pathlib import Path
25+
26+
27+
SVG_DEFS = """<svg xmlns="http://www.w3.org/2000/svg" style="display:none">
28+
<defs>
29+
<symbol id="file" viewBox="0 0 265 323">
30+
<path fill="#4582ec" d="M213 115v167a41 41 0 01-41 41H69a41 41 0 01-41-41V39a39 39 0 0139-39h127a39 39 0 0139 39v76z"/>
31+
<path fill="#77a4ff" d="M176 17v88a19 19 0 0019 19h88"/>
32+
</symbol>
33+
<symbol id="folder-shortcut" viewBox="0 0 265 216">
34+
<path fill="#4582ec" d="M18 54v-5a30 30 0 0130-30h75a28 28 0 0128 28v7h77a30 30 0 0130 30v84a30 30 0 01-30 30H33a30 30 0 01-30-30V54z"/>
35+
</symbol>
36+
</defs>
37+
</svg>
38+
"""
39+
40+
HTML_HEAD = f"""<!DOCTYPE html>
41+
<html>
42+
<head>
43+
<meta charset="utf-8">
44+
<title>artifacts</title>
45+
</head>
46+
<body>
47+
{SVG_DEFS}
48+
<table>
49+
<tbody>
50+
"""
51+
52+
HTML_FOOT = """
53+
</tbody>
54+
</table>
55+
</body>
56+
</html>
57+
"""
58+
59+
60+
def generate_index_html(directory: str) -> None:
    """Generate a local index.html listing the entries of *directory*.

    Hidden entries (leading dot) and any previously generated ``index.html``
    are skipped, so the index never lists itself; this matches the S3-based
    generators below, which also exclude ``index.html`` objects. Entries are
    sorted so the output is deterministic regardless of filesystem iteration
    order.

    Args:
        directory: Path of the directory to index; the file is written to
            ``<directory>/index.html``.

    Returns silently (writing nothing) if the directory cannot be scanned
    due to a PermissionError — index generation is best-effort.
    """
    try:
        names = sorted(
            entry.name
            for entry in os.scandir(directory)
            if not entry.name.startswith(".") and entry.name != "index.html"
        )
    except PermissionError:
        # Best-effort: unreadable directories simply get no index.
        return

    rows = [f'<tr><td><a href="{name}">{name}</a></td></tr>' for name in names]

    index_path = os.path.join(directory, "index.html")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(HTML_HEAD + "\n".join(rows) + HTML_FOOT)
74+
75+
76+
def generate_indexes_recursive(root: str) -> None:
    """Write an index.html into *root* and every directory beneath it."""
    for current_dir, _subdirs, _files in os.walk(root):
        generate_index_html(current_dir)
80+
81+
82+
def generate_top_index_from_s3(s3, bucket: str, prefix: str) -> None:
    """Build and upload index.html for the top level of *prefix*.

    Lists with an S3 ``Delimiter`` so only immediate children come back —
    one CommonPrefix per subdirectory plus the files directly under the
    prefix — which is far cheaper than enumerating every object recursively.

    Args:
        s3: boto3 S3 client
        bucket: S3 bucket name
        prefix: S3 prefix (e.g., 'deb' or 'rpm')
    """
    print(f"Generating top index from S3: s3://{bucket}/{prefix}/")

    strip = len(prefix) + 1  # Length of "<prefix>/" to drop from each key.
    table_rows: list[str] = []

    page_iter = s3.get_paginator("list_objects_v2").paginate(
        Bucket=bucket, Prefix=f"{prefix}/", Delimiter="/"
    )
    for page in page_iter:
        # With Delimiter set, subdirectories come back as CommonPrefixes.
        for common in page.get("CommonPrefixes", []):
            subdir = common["Prefix"][strip:].rstrip("/")
            table_rows.append(
                f'<tr><td><a href="{subdir}/index.html">{subdir}/</a></td></tr>'
            )

        # Files directly under the prefix; skip folder markers and any
        # previously uploaded index pages.
        for entry in page.get("Contents", []):
            object_key = entry["Key"]
            if object_key.endswith("/") or object_key.endswith("index.html"):
                continue
            basename = object_key[strip:]
            if "/" in basename:
                continue  # Nested object; belongs to a subdirectory index.
            table_rows.append(
                f'<tr><td><a href="{basename}">{basename}</a></td></tr>'
            )

    index_key = f"{prefix}/index.html"
    print(f"Uploading top index: {index_key}")
    s3.put_object(
        Bucket=bucket,
        Key=index_key,
        Body=(HTML_HEAD + "\n".join(table_rows) + HTML_FOOT).encode("utf-8"),
        ContentType="text/html",
    )
    print("✓ Successfully uploaded top-level index")
128+
129+
130+
def generate_index_from_s3(
    s3, bucket: str, prefix: str, max_depth: int | None = None
) -> None:
    """Generate index.html files based on what's actually in S3.

    This ensures index files accurately reflect the S3 repository state,
    including files from previous uploads that may have been deduplicated.

    Args:
        s3: boto3 S3 client
        bucket: S3 bucket name
        prefix: S3 prefix (e.g., 'deb/20260223-12345')
        max_depth: Maximum directory depth to generate indexes for.
            None = unlimited (recursive), 0 = only root level,
            1 = root + immediate children
    """
    depth_msg = (
        f" (max depth: {max_depth})" if max_depth is not None else " (recursive)"
    )
    print(f"Generating indexes from S3: s3://{bucket}/{prefix}/{depth_msg}")

    # Get all objects under the prefix.
    paginator = s3.get_paginator("list_objects_v2")
    all_objects = []

    try:
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            if "Contents" not in page:
                continue
            all_objects.extend(page["Contents"])
    except Exception as e:
        # Deliberate best-effort: a listing failure aborts index generation
        # without failing the caller (e.g., the upload pipeline).
        print(f"Error listing S3 objects: {e}")
        return

    if not all_objects:
        print(f"No objects found in s3://{bucket}/{prefix}/")
        return

    # Group objects by directory: dir path (relative to prefix, "" = root)
    # -> list of file basenames in that directory.
    directories: dict[str, list[str]] = {}
    for obj in all_objects:
        key = obj["Key"]

        # Skip existing index.html files so indexes never list themselves.
        if key.endswith("index.html"):
            continue

        # Get the directory path relative to prefix.
        if key.startswith(prefix):
            rel_path = key[len(prefix) :].lstrip("/")
        else:
            rel_path = key

        # Determine directory and filename.
        if "/" in rel_path:
            dir_path = "/".join(rel_path.split("/")[:-1])
            filename = rel_path.split("/")[-1]
        else:
            dir_path = ""
            filename = rel_path

        directories.setdefault(dir_path, []).append(filename)

        # Track all parent directories (even if they have no files, only
        # subdirectories) so every level gets an index page.
        parts = dir_path.split("/") if dir_path else []
        for i in range(len(parts)):
            parent = "/".join(parts[:i])  # "" for root, or partial path
            directories.setdefault(parent, [])

    # Ensure root directory exists even if every object sits in a subdir.
    directories.setdefault("", [])

    uploaded_indexes = 0
    # Deepest directories first, root ("" sorts with key component 1) last.
    for dir_path, files in sorted(
        directories.items(), key=lambda x: (-x[0].count("/") if x[0] else 1, x[0])
    ):
        # Check depth limit.
        # NOTE(review): depth is counted as dir_path.count("/"), so immediate
        # children ("a") get depth 0, same as root — this looks off by one
        # relative to the docstring ("0 = only root level"); confirm intent
        # with callers before changing.
        if max_depth is not None:
            depth = dir_path.count("/") if dir_path else 0
            if depth > max_depth:
                continue

        rows: list[str] = []

        # Add subdirectories first: collect the immediate child segment of
        # every known directory that lives under dir_path.
        subdirs: set[str] = set()
        for other_dir in directories.keys():
            if dir_path == "":
                if other_dir:
                    subdir = other_dir.split("/")[0] if "/" in other_dir else other_dir
                    subdirs.add(subdir)
            else:
                if other_dir.startswith(dir_path + "/") and other_dir != dir_path:
                    remainder = other_dir[len(dir_path) :].lstrip("/")
                    subdir = remainder.split("/")[0] if "/" in remainder else remainder
                    if subdir:
                        subdirs.add(subdir)

        for subdir in sorted(subdirs):
            rows.append(
                f'<tr><td><a href="{subdir}/index.html">{subdir}/</a></td></tr>'
            )

        # Add files. FIX: the row previously contained a literal placeholder
        # instead of interpolating the filename, producing broken links;
        # interpolate {filename} like the subdirectory rows above.
        for filename in sorted(files):
            rows.append(f'<tr><td><a href="{filename}">{filename}</a></td></tr>')

        index_content = HTML_HEAD + "\n".join(rows) + HTML_FOOT

        if dir_path:
            index_key = f"{prefix}/{dir_path}/index.html"
        else:
            index_key = f"{prefix}/index.html"

        try:
            print(f"Uploading index: {index_key}")
            s3.put_object(
                Bucket=bucket,
                Key=index_key,
                Body=index_content.encode("utf-8"),
                ContentType="text/html",
            )
            uploaded_indexes += 1
        except Exception as e:
            # Best-effort per index: report and continue with the rest.
            print(f"Error uploading index {index_key}: {e}")

    print(f"Generated and uploaded {uploaded_indexes} index files from S3 state")
256+
257+
258+
def _parse_args() -> argparse.Namespace:
259+
parser = argparse.ArgumentParser()
260+
parser.add_argument("--s3-bucket", required=True)
261+
parser.add_argument("--prefix", required=True, help="e.g. deb/20260223-12345")
262+
parser.add_argument(
263+
"--top-prefix",
264+
default=None,
265+
help="Optional top-level prefix to generate top index for (e.g., 'deb' or 'rpm').",
266+
)
267+
parser.add_argument(
268+
"--max-depth",
269+
type=int,
270+
default=None,
271+
help="Optional max depth for per-prefix index generation.",
272+
)
273+
return parser.parse_args()
274+
275+
276+
def main() -> None:
    """CLI entry point: regenerate indexes for a repository prefix in S3."""
    cli_args = _parse_args()
    client = boto3.client("s3")
    generate_index_from_s3(
        client, cli_args.s3_bucket, cli_args.prefix, max_depth=cli_args.max_depth
    )
    # Optionally refresh the top-level (e.g. 'deb' or 'rpm') index as well.
    if cli_args.top_prefix is not None:
        generate_top_index_from_s3(client, cli_args.s3_bucket, cli_args.top_prefix)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)