Skip to content

Commit b52aa57

Browse files
authored
Merge pull request #88 from jajreidy/refactor-cleanout
refactor(upload): typed models for gather/results and per-arch RpmUploadResult
2 parents e27bba1 + 1d3bb13 commit b52aa57

20 files changed

+1090
-256
lines changed

.cursor/rules/llm-development-guidelines.mdc

Lines changed: 79 additions & 77 deletions
Large diffs are not rendered by default.

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2121
- Username/password (Basic Auth) support for packages.redhat.com
2222

2323
### Changed
24+
- Upload orchestration uses `RpmUploadResult` per architecture instead of ad-hoc dicts; gather/collect uses `PulpContentRow`, `ExtraArtifactRef`, and `FileInfoMap` for clearer typed data flow
25+
- Upload flow populates `pulp_results.json` artifact entries incrementally as RPMs, logs, SBOMs, and generic files finish; final gather still reconciles via merge (keeps incremental entries when keys already exist)
2426
- Repository setup logs use the concrete repo slug (e.g. ``rpms-signed``) instead of a generic ``Rpms`` label; distribution creation logs state that ``name`` and ``base_path`` match the repository name on one line
2527
- `upload --target-arch-repo` with `--signed-by`: RPM paths remain `{arch}/` only (no `{arch}/rpms-signed`); signing is via `signed_by` label on content
2628
- `pull`: use each artifact's ``url`` from pulp_results.json when present instead of synthesizing download URLs from distribution entries

pulp_tool/api/pulp_client.py

Lines changed: 74 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@
4444
from ..utils.artifact_detection import rpm_packages_letter_and_basename
4545
from ..utils.constants import DEFAULT_CHUNK_SIZE, SUPPORTED_ARCHITECTURES
4646
from ..utils.validation import sanitize_build_id_for_repository, validate_build_id
47-
from ..utils.rpm_operations import parse_rpm_filename_to_nvr
47+
from ..utils.rpm_operations import calculate_sha256_checksum, parse_rpm_filename_to_nvr
48+
from ..models.artifacts import ContentData, ExtraArtifactRef, FileInfoMap, PulpContentRow
4849
from .auth import OAuth2ClientCredentialsAuth
4950

5051
# Resource-based mixins
@@ -1663,7 +1664,9 @@ async def _fetch_rpm_by_signed_by_then_filter_nvr(
16631664
request=_EMPTY_RESPONSE_REQUEST,
16641665
)
16651666

1666-
def gather_content_data(self, build_id: str, extra_artifacts: Optional[List[Dict[str, str]]] = None) -> Any:
1667+
def gather_content_data(
1668+
self, build_id: str, extra_artifacts: Optional[List[ExtraArtifactRef]] = None
1669+
) -> ContentData:
16671670
"""
16681671
Gather content data and artifacts for a build ID.
16691672
@@ -1674,10 +1677,8 @@ def gather_content_data(self, build_id: str, extra_artifacts: Optional[List[Dict
16741677
Returns:
16751678
ContentData containing content results and artifacts
16761679
"""
1677-
from ..models.artifacts import ContentData
1678-
1679-
content_results = []
1680-
artifacts: List[Dict[str, Any]] = []
1680+
raw_results: List[Dict[str, Any]] = []
1681+
artifacts: List[Dict[str, str]] = []
16811682

16821683
# Always use bulk query by build_id for efficiency
16831684
# This gets all content in a single API call instead of N individual calls
@@ -1689,48 +1690,48 @@ def gather_content_data(self, build_id: str, extra_artifacts: Optional[List[Dict
16891690
try:
16901691
resp = self.find_content("build_id", build_id)
16911692
resp_json = resp.json()
1692-
content_results = resp_json["results"]
1693+
raw_results = resp_json["results"]
16931694
except Exception:
16941695
logging.error("Failed to get content by build ID", exc_info=True)
16951696
raise
16961697

16971698
# If no results from build_id query and we have extra_artifacts, try querying by href
16981699
# This handles the case where content hasn't been indexed yet
1699-
if not content_results and extra_artifacts:
1700+
if not raw_results and extra_artifacts:
17001701
logging.warning(
17011702
"No content found by build_id, trying direct href query for %d artifacts", len(extra_artifacts)
17021703
)
17031704
try:
17041705
# Extract content hrefs from extra_artifacts
17051706
# Note: extra_artifacts contains content hrefs (not artifact hrefs)
1706-
href_list = [
1707-
artifact.get("pulp_href", "") for artifact in extra_artifacts if artifact.get("pulp_href", "")
1708-
]
1707+
href_list = [a.pulp_href for a in extra_artifacts if a.pulp_href]
17091708
if href_list:
17101709
href_query = ",".join(href_list)
17111710
resp = self.find_content("href", href_query)
17121711
resp_json = resp.json()
1713-
content_results = resp_json["results"]
1714-
logging.info("Found %d content items by href query", len(content_results))
1712+
raw_results = resp_json["results"]
1713+
logging.info("Found %d content items by href query", len(raw_results))
17151714
except Exception:
17161715
logging.error("Failed to get content by href", exc_info=True)
17171716
# Don't raise, just continue with empty results
17181717

1719-
if not content_results:
1718+
if not raw_results:
17201719
logging.warning("No content found for build ID: %s", build_id)
17211720
return ContentData()
17221721

1722+
content_results = [PulpContentRow.model_validate(r) for r in raw_results]
1723+
17231724
logging.info("Found %d content items for build_id: %s", len(content_results), build_id)
17241725

17251726
# Log details about what content was found
17261727
if content_results:
17271728
logging.info("Content types found:")
17281729
for idx, result in enumerate(content_results):
1729-
pulp_href = result.get("pulp_href", "")
1730+
pulp_href = result.pulp_href
17301731
content_type = self._get_content_type_from_href(pulp_href)
17311732

17321733
# Get relative paths from artifacts dict
1733-
artifacts_dict = result.get("artifacts", {})
1734+
artifacts_dict = result.artifacts or {}
17341735
if artifacts_dict:
17351736
relative_paths = list(artifacts_dict.keys())
17361737
logging.info(" - %s: %s", content_type, ", ".join(relative_paths))
@@ -1739,28 +1740,65 @@ def gather_content_data(self, build_id: str, extra_artifacts: Optional[List[Dict
17391740

17401741
# Log full structure for first item to help with debugging
17411742
if idx == 0:
1742-
logging.debug("First content item full structure: %s", json.dumps(result, indent=2, default=str))
1743+
logging.debug(
1744+
"First content item full structure: %s",
1745+
json.dumps(result.model_dump(), indent=2, default=str),
1746+
)
17431747

17441748
# Extract artifacts from content results
17451749
# Content structure has "artifacts" (plural) field which is a dict: {relative_path: artifact_href}
17461750
artifacts = [
17471751
{"artifact": artifact_href}
17481752
for result in content_results
1749-
for artifact_href in result.get("artifacts", {}).values()
1753+
for artifact_href in (result.artifacts or {}).values()
17501754
if artifact_href
17511755
]
17521756

17531757
logging.info("Extracted %d artifact hrefs from content results", len(artifacts))
17541758
return ContentData(content_results=content_results, artifacts=artifacts)
17551759

1760+
def add_uploaded_artifact_to_results_model(
1761+
self,
1762+
results_model: Any,
1763+
*,
1764+
local_path: str,
1765+
labels: Dict[str, str],
1766+
is_rpm: bool,
1767+
distribution_urls: Dict[str, str],
1768+
target_arch_repo: bool = False,
1769+
file_relative_path: Optional[str] = None,
1770+
) -> None:
1771+
"""
1772+
Add one uploaded artifact to PulpResultsModel using the same keys and URLs as gather/build.
1773+
1774+
Called after upload tasks succeed so results JSON can be built incrementally.
1775+
"""
1776+
relative_path = os.path.basename(local_path) if is_rpm else (file_relative_path or os.path.basename(local_path))
1777+
build_id = labels.get("build_id", "")
1778+
if is_rpm:
1779+
artifact_key = relative_path
1780+
else:
1781+
artifact_key = f"{build_id}/{relative_path}" if build_id else relative_path
1782+
1783+
sha256_hex = calculate_sha256_checksum(local_path)
1784+
artifact_url = self._build_artifact_distribution_url(
1785+
relative_path,
1786+
is_rpm,
1787+
labels,
1788+
distribution_urls,
1789+
target_arch_repo=target_arch_repo,
1790+
)
1791+
results_model.add_artifact(key=artifact_key, url=artifact_url, sha256=sha256_hex, labels=labels)
1792+
17561793
def build_results_structure(
17571794
self,
17581795
results_model: Any,
1759-
content_results: List[Dict[str, Any]],
1760-
file_info_map: Dict[str, Any],
1796+
content_results: List[PulpContentRow],
1797+
file_info_map: FileInfoMap,
17611798
distribution_urls: Optional[Dict[str, str]] = None,
17621799
*,
17631800
target_arch_repo: bool = False,
1801+
merge: bool = False,
17641802
) -> Any:
17651803
"""
17661804
Build the results structure from content and file info using optimized single-pass processing.
@@ -1771,6 +1809,7 @@ def build_results_structure(
17711809
file_info_map: Mapping of artifact hrefs to file info models
17721810
distribution_urls: Optional dictionary mapping repo_type to distribution base URL
17731811
target_arch_repo: When True, RPM URLs use per-arch distribution paths from labels
1812+
merge: When True, skip artifact keys already present (incremental upload + reconcile)
17741813
17751814
Returns:
17761815
Populated PulpResultsModel
@@ -1784,12 +1823,12 @@ def build_results_structure(
17841823
missing_file_info = 0
17851824

17861825
for idx, content in enumerate(content_results):
1787-
labels = content.get("pulp_labels", {})
1826+
labels = dict(content.pulp_labels or {})
17881827
build_id = labels.get("build_id", "")
1789-
pulp_href = content.get("pulp_href", "unknown")
1828+
pulp_href = content.pulp_href or "unknown"
17901829

17911830
# Content structure has "artifacts" (plural) field which is a dict: {relative_path: artifact_href}
1792-
artifacts_dict = content.get("artifacts", {})
1831+
artifacts_dict = content.artifacts or {}
17931832

17941833
if not artifacts_dict:
17951834
missing_artifacts += 1
@@ -1798,9 +1837,9 @@ def build_results_structure(
17981837
logging.warning(
17991838
"Content item %d structure (no artifacts field). Available fields: %s",
18001839
idx,
1801-
list(content.keys()),
1840+
list(content.model_dump(exclude_none=True).keys()),
18021841
)
1803-
logging.debug("Full content: %s", json.dumps(content, indent=2, default=str))
1842+
logging.debug("Full content: %s", json.dumps(content.model_dump(), indent=2, default=str))
18041843
continue
18051844

18061845
# Determine content type once per content item (cached via lru_cache)
@@ -1842,6 +1881,16 @@ def build_results_structure(
18421881
target_arch_repo=target_arch_repo,
18431882
)
18441883

1884+
if merge and artifact_key in results_model.artifacts:
1885+
existing = results_model.artifacts[artifact_key]
1886+
gi_sha = file_info.sha256 or ""
1887+
if existing.url != artifact_url or (existing.sha256 or "") != gi_sha:
1888+
logging.warning(
1889+
"Gathered artifact %s differs from incremental entry (keeping incremental)",
1890+
artifact_key,
1891+
)
1892+
continue
1893+
18451894
# Add artifact to results model
18461895
results_model.add_artifact(
18471896
key=artifact_key, url=artifact_url, sha256=file_info.sha256 or "", labels=labels

pulp_tool/models/artifacts.py

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,47 @@
11
"""Artifact-related models for Konflux Pulp."""
22

3-
from typing import Optional, Dict, Any, List
3+
from typing import Dict, List, Optional, Any
44

5-
from pydantic import Field
5+
from pydantic import BaseModel, ConfigDict, Field, model_validator
66

77
from .base import KonfluxBaseModel
88

99

10+
class PulpContentRow(BaseModel):
11+
"""One content unit from Pulp's content API (RPM package, file unit, etc.); fields vary by type."""
12+
13+
model_config = ConfigDict(extra="allow")
14+
15+
pulp_href: str = ""
16+
pulp_labels: Dict[str, Any] = Field(default_factory=dict)
17+
artifacts: Dict[str, Any] = Field(default_factory=dict)
18+
relative_path: Optional[str] = None
19+
20+
21+
class ExtraArtifactRef(BaseModel):
22+
"""Content href from upload `created_resources` used when gather-by-build_id returns empty."""
23+
24+
model_config = ConfigDict(extra="ignore")
25+
26+
pulp_href: Optional[str] = None
27+
file: Optional[str] = None
28+
29+
@model_validator(mode="before")
30+
@classmethod
31+
def _legacy_dict_href_keys(cls, data: Any) -> Any:
32+
"""Support legacy {"pulp_href"} and odd test shapes {"file": href} / {"extra": href}."""
33+
if isinstance(data, dict):
34+
d = dict(data)
35+
if not (d.get("pulp_href") or "").strip():
36+
for k in ("file", "extra"):
37+
v = d.get(k)
38+
if isinstance(v, str) and v.strip():
39+
d["pulp_href"] = v.strip()
40+
break
41+
return d
42+
return data
43+
44+
1045
class DownloadTask(KonfluxBaseModel):
1146
"""
1247
Information needed to download a single artifact.
@@ -262,7 +297,7 @@ class ContentData(KonfluxBaseModel):
262297
artifacts: List of artifact information dictionaries
263298
"""
264299

265-
content_results: List[Dict[str, Any]] = Field(default_factory=list)
300+
content_results: List[PulpContentRow] = Field(default_factory=list)
266301
artifacts: List[Dict[str, str]] = Field(default_factory=list)
267302

268303
@property
@@ -301,6 +336,9 @@ class FileInfoModel(KonfluxBaseModel):
301336
size: Optional[int] = None
302337

303338

339+
FileInfoMap = Dict[str, FileInfoModel]
340+
341+
304342
__all__ = [
305343
"DownloadTask",
306344
"ArtifactFile",
@@ -309,5 +347,8 @@ class FileInfoModel(KonfluxBaseModel):
309347
"ArtifactJsonResponse",
310348
"ArtifactData",
311349
"ContentData",
350+
"ExtraArtifactRef",
351+
"FileInfoMap",
312352
"FileInfoModel",
353+
"PulpContentRow",
313354
]

pulp_tool/pull/upload.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,8 @@ def _upload_rpms_to_repository(
112112
logging.warning("Uploading %d RPM file(s)", len(rpm_infos))
113113

114114
# Upload all RPMs in parallel using the consolidated function
115-
rpm_artifacts = upload_rpms_parallel(pulp_client, rpm_infos)
115+
rpm_pairs = upload_rpms_parallel(pulp_client, rpm_infos)
116+
rpm_artifacts = [href for _path, href in rpm_pairs]
116117

117118
# Add all successfully uploaded RPM artifacts to the repository
118119
if rpm_artifacts:

0 commit comments

Comments
 (0)