konflux-ci
diff --git a/‎.cursor/rules/llm-development-guidelines.mdc‎
Lines changed: 79 additions & 77 deletions b/‎.cursor/rules/llm-development-guidelines.mdc‎
Lines changed: 79 additions & 77 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 2 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎pulp_tool/api/pulp_client.py‎
Lines changed: 74 additions & 25 deletions b/‎pulp_tool/api/pulp_client.py‎
Lines changed: 74 additions & 25 deletions
diff --git a/‎pulp_tool/models/artifacts.py‎
Lines changed: 44 additions & 3 deletions b/‎pulp_tool/models/artifacts.py‎
Lines changed: 44 additions & 3 deletions
diff --git a/‎pulp_tool/pull/upload.py‎
Lines changed: 2 additions & 1 deletion b/‎pulp_tool/pull/upload.py‎
Lines changed: 2 additions & 1 deletion
@@ -21,6 +21,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Username/password (Basic Auth) support for packages.redhat.com
 
 ### Changed
+- Upload orchestration uses `RpmUploadResult` per architecture instead of ad-hoc dicts; gather/collect uses `PulpContentRow`, `ExtraArtifactRef`, and `FileInfoMap` for clearer typed data flow
+- Upload flow populates `pulp_results.json` artifact entries incrementally as RPMs, logs, SBOMs, and generic files finish; final gather still reconciles via merge (keeps incremental entries when keys already exist)
 - Repository setup logs use the concrete repo slug (e.g. ``rpms-signed``) instead of a generic ``Rpms`` label; distribution creation logs state that ``name`` and ``base_path`` match the repository name on one line
 - `upload --target-arch-repo` with `--signed-by`: RPM paths remain `{arch}/` only (no `{arch}/rpms-signed`); signing is via `signed_by` label on content
 - `pull`: use each artifact's ``url`` from pulp_results.json when present instead of synthesizing download URLs from distribution entries
 
@@ -44,7 +44,8 @@
 from ..utils.artifact_detection import rpm_packages_letter_and_basename
 from ..utils.constants import DEFAULT_CHUNK_SIZE, SUPPORTED_ARCHITECTURES
 from ..utils.validation import sanitize_build_id_for_repository, validate_build_id
-from ..utils.rpm_operations import parse_rpm_filename_to_nvr
+from ..utils.rpm_operations import calculate_sha256_checksum, parse_rpm_filename_to_nvr
+from ..models.artifacts import ContentData, ExtraArtifactRef, FileInfoMap, PulpContentRow
 from .auth import OAuth2ClientCredentialsAuth
 
 # Resource-based mixins
@@ -1663,7 +1664,9 @@ async def _fetch_rpm_by_signed_by_then_filter_nvr(
             request=_EMPTY_RESPONSE_REQUEST,
         )
 
-    def gather_content_data(self, build_id: str, extra_artifacts: Optional[List[Dict[str, str]]] = None) -> Any:
+    def gather_content_data(
+        self, build_id: str, extra_artifacts: Optional[List[ExtraArtifactRef]] = None
+    ) -> ContentData:
         """
         Gather content data and artifacts for a build ID.
 
@@ -1674,10 +1677,8 @@ def gather_content_data(self, build_id: str, extra_artifacts: Optional[List[Dict
         Returns:
             ContentData containing content results and artifacts
         """
-        from ..models.artifacts import ContentData
-
-        content_results = []
-        artifacts: List[Dict[str, Any]] = []
+        raw_results: List[Dict[str, Any]] = []
+        artifacts: List[Dict[str, str]] = []
 
         # Always use bulk query by build_id for efficiency
         # This gets all content in a single API call instead of N individual calls
@@ -1689,48 +1690,48 @@ def gather_content_data(self, build_id: str, extra_artifacts: Optional[List[Dict
         try:
             resp = self.find_content("build_id", build_id)
             resp_json = resp.json()
-            content_results = resp_json["results"]
+            raw_results = resp_json["results"]
         except Exception:
             logging.error("Failed to get content by build ID", exc_info=True)
             raise
 
         # If no results from build_id query and we have extra_artifacts, try querying by href
         # This handles the case where content hasn't been indexed yet
-        if not content_results and extra_artifacts:
+        if not raw_results and extra_artifacts:
             logging.warning(
                 "No content found by build_id, trying direct href query for %d artifacts", len(extra_artifacts)
             )
             try:
                 # Extract content hrefs from extra_artifacts
                 # Note: extra_artifacts contains content hrefs (not artifact hrefs)
-                href_list = [
-                    artifact.get("pulp_href", "") for artifact in extra_artifacts if artifact.get("pulp_href", "")
-                ]
+                href_list = [a.pulp_href for a in extra_artifacts if a.pulp_href]
                 if href_list:
                     href_query = ",".join(href_list)
                     resp = self.find_content("href", href_query)
                     resp_json = resp.json()
-                    content_results = resp_json["results"]
-                    logging.info("Found %d content items by href query", len(content_results))
+                    raw_results = resp_json["results"]
+                    logging.info("Found %d content items by href query", len(raw_results))
             except Exception:
                 logging.error("Failed to get content by href", exc_info=True)
                 # Don't raise, just continue with empty results
 
-        if not content_results:
+        if not raw_results:
             logging.warning("No content found for build ID: %s", build_id)
             return ContentData()
 
+        content_results = [PulpContentRow.model_validate(r) for r in raw_results]
+
         logging.info("Found %d content items for build_id: %s", len(content_results), build_id)
 
         # Log details about what content was found
         if content_results:
             logging.info("Content types found:")
             for idx, result in enumerate(content_results):
-                pulp_href = result.get("pulp_href", "")
+                pulp_href = result.pulp_href
                 content_type = self._get_content_type_from_href(pulp_href)
 
                 # Get relative paths from artifacts dict
-                artifacts_dict = result.get("artifacts", {})
+                artifacts_dict = result.artifacts or {}
                 if artifacts_dict:
                     relative_paths = list(artifacts_dict.keys())
                     logging.info("  - %s: %s", content_type, ", ".join(relative_paths))
@@ -1739,28 +1740,65 @@ def gather_content_data(self, build_id: str, extra_artifacts: Optional[List[Dict
 
                 # Log full structure for first item to help with debugging
                 if idx == 0:
-                    logging.debug("First content item full structure: %s", json.dumps(result, indent=2, default=str))
+                    logging.debug(
+                        "First content item full structure: %s",
+                        json.dumps(result.model_dump(), indent=2, default=str),
+                    )
 
         # Extract artifacts from content results
         # Content structure has "artifacts" (plural) field which is a dict: {relative_path: artifact_href}
         artifacts = [
             {"artifact": artifact_href}
             for result in content_results
-            for artifact_href in result.get("artifacts", {}).values()
+            for artifact_href in (result.artifacts or {}).values()
             if artifact_href
         ]
 
         logging.info("Extracted %d artifact hrefs from content results", len(artifacts))
         return ContentData(content_results=content_results, artifacts=artifacts)
 
+    def add_uploaded_artifact_to_results_model(
+        self,
+        results_model: Any,
+        *,
+        local_path: str,
+        labels: Dict[str, str],
+        is_rpm: bool,
+        distribution_urls: Dict[str, str],
+        target_arch_repo: bool = False,
+        file_relative_path: Optional[str] = None,
+    ) -> None:
+        """
+        Record a single uploaded file in the results model.
+
+        Intended to run once an upload task has succeeded, so the results JSON
+        grows incrementally; keys and URLs follow the same scheme as gather/build.
+
+        Args:
+            results_model: Object exposing ``add_artifact(key=, url=, sha256=, labels=)``
+            local_path: Path of the uploaded file on disk; hashed to obtain the SHA-256
+            labels: Labels attached to the artifact; ``build_id`` is read from here
+            is_rpm: True for RPM packages, False for any other file
+            distribution_urls: Passed through to the distribution URL builder
+            target_arch_repo: Passed through to the distribution URL builder
+            file_relative_path: Relative path for non-RPM files; ignored for RPMs and
+                replaced by the basename of ``local_path`` when empty
+        """
+        # RPMs are always addressed by bare filename; other files may carry a caller-supplied path.
+        if is_rpm:
+            relative_path = os.path.basename(local_path)
+        elif file_relative_path:
+            relative_path = file_relative_path
+        else:
+            relative_path = os.path.basename(local_path)
+
+        # Only non-RPM entries are namespaced by build_id, and only when one is present.
+        build_id = labels.get("build_id", "")
+        use_build_prefix = bool(build_id) and not is_rpm
+        artifact_key = f"{build_id}/{relative_path}" if use_build_prefix else relative_path
+
+        checksum = calculate_sha256_checksum(local_path)
+        url = self._build_artifact_distribution_url(
+            relative_path,
+            is_rpm,
+            labels,
+            distribution_urls,
+            target_arch_repo=target_arch_repo,
+        )
+        results_model.add_artifact(key=artifact_key, url=url, sha256=checksum, labels=labels)
+
     def build_results_structure(
         self,
         results_model: Any,
-        content_results: List[Dict[str, Any]],
-        file_info_map: Dict[str, Any],
+        content_results: List[PulpContentRow],
+        file_info_map: FileInfoMap,
         distribution_urls: Optional[Dict[str, str]] = None,
         *,
         target_arch_repo: bool = False,
+        merge: bool = False,
     ) -> Any:
         """
         Build the results structure from content and file info using optimized single-pass processing.
@@ -1771,6 +1809,7 @@ def build_results_structure(
             file_info_map: Mapping of artifact hrefs to file info models
             distribution_urls: Optional dictionary mapping repo_type to distribution base URL
             target_arch_repo: When True, RPM URLs use per-arch distribution paths from labels
+            merge: When True, skip artifact keys already present (incremental upload + reconcile)
 
         Returns:
             Populated PulpResultsModel
@@ -1784,12 +1823,12 @@ def build_results_structure(
         missing_file_info = 0
 
         for idx, content in enumerate(content_results):
-            labels = content.get("pulp_labels", {})
+            labels = dict(content.pulp_labels or {})
             build_id = labels.get("build_id", "")
-            pulp_href = content.get("pulp_href", "unknown")
+            pulp_href = content.pulp_href or "unknown"
 
             # Content structure has "artifacts" (plural) field which is a dict: {relative_path: artifact_href}
-            artifacts_dict = content.get("artifacts", {})
+            artifacts_dict = content.artifacts or {}
 
             if not artifacts_dict:
                 missing_artifacts += 1
@@ -1798,9 +1837,9 @@ def build_results_structure(
                     logging.warning(
                         "Content item %d structure (no artifacts field). Available fields: %s",
                         idx,
-                        list(content.keys()),
+                        list(content.model_dump(exclude_none=True).keys()),
                     )
-                    logging.debug("Full content: %s", json.dumps(content, indent=2, default=str))
+                    logging.debug("Full content: %s", json.dumps(content.model_dump(), indent=2, default=str))
                 continue
 
             # Determine content type once per content item (cached via lru_cache)
@@ -1842,6 +1881,16 @@ def build_results_structure(
                     target_arch_repo=target_arch_repo,
                 )
 
+                if merge and artifact_key in results_model.artifacts:
+                    existing = results_model.artifacts[artifact_key]
+                    gi_sha = file_info.sha256 or ""
+                    if existing.url != artifact_url or (existing.sha256 or "") != gi_sha:
+                        logging.warning(
+                            "Gathered artifact %s differs from incremental entry (keeping incremental)",
+                            artifact_key,
+                        )
+                    continue
+
                 # Add artifact to results model
                 results_model.add_artifact(
                     key=artifact_key, url=artifact_url, sha256=file_info.sha256 or "", labels=labels
 
@@ -1,12 +1,47 @@
 """Artifact-related models for Konflux Pulp."""
 
-from typing import Optional, Dict, Any, List
+from typing import Dict, List, Optional, Any
 
-from pydantic import Field
+from pydantic import BaseModel, ConfigDict, Field, model_validator
 
 from .base import KonfluxBaseModel
 
 
+class PulpContentRow(BaseModel):
+    """One content unit from Pulp's content API (RPM package, file unit, etc.); fields vary by type."""
+
+    model_config = ConfigDict(extra="allow")  # keep per-type fields that are not modelled below
+
+    pulp_href: str = ""  # content href; the client derives the content type from it
+    pulp_labels: Dict[str, Any] = Field(default_factory=dict)  # labels; "build_id" is read from here
+    artifacts: Dict[str, Any] = Field(default_factory=dict)  # {relative_path: artifact_href}
+    relative_path: Optional[str] = None  # NOTE(review): not read in the code shown here; confirm who uses it
+
+
+class ExtraArtifactRef(BaseModel):
+    """
+    Content href from upload `created_resources` used when gather-by-build_id returns empty.
+
+    Accepts a mapping with ``pulp_href`` or one of the legacy shapes
+    ``{"file": href}`` / ``{"extra": href}``; the before-validator folds the
+    legacy keys into ``pulp_href``. Keys that are not declared fields are dropped.
+    """
+
+    model_config = ConfigDict(extra="ignore")
+
+    pulp_href: Optional[str] = None
+    # Legacy key; its value doubles as the fallback href when ``pulp_href`` is blank.
+    file: Optional[str] = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def _legacy_dict_href_keys(cls, data: Any) -> Any:
+        """
+        Support legacy {"pulp_href"} and odd test shapes {"file": href} / {"extra": href}.
+
+        A string ``pulp_href`` is stored stripped, so a whitespace-only value becomes
+        empty (falsy) and cannot leak into an href query. When ``pulp_href`` is missing
+        or blank, the first non-blank string under ``file`` or ``extra`` is used instead.
+        A non-string ``pulp_href`` is left untouched for field validation to reject,
+        rather than failing here with an ``AttributeError``.
+
+        Args:
+            data: Raw input handed to the model; only dicts are rewritten
+
+        Returns:
+            A shallow copy of ``data`` with ``pulp_href`` normalised, or ``data`` itself
+            when it is not a dict
+        """
+        if not isinstance(data, dict):
+            return data
+
+        # Work on a copy so the caller's mapping is never mutated.
+        normalized = dict(data)
+        href = normalized.get("pulp_href")
+        if isinstance(href, str):
+            href = href.strip()
+            normalized["pulp_href"] = href
+
+        if not href:
+            for legacy_key in ("file", "extra"):
+                candidate = normalized.get(legacy_key)
+                if isinstance(candidate, str) and candidate.strip():
+                    normalized["pulp_href"] = candidate.strip()
+                    break
+        return normalized
+
+
 class DownloadTask(KonfluxBaseModel):
     """
     Information needed to download a single artifact.
@@ -262,7 +297,7 @@ class ContentData(KonfluxBaseModel):
         artifacts: List of artifact information dictionaries
     """
 
-    content_results: List[Dict[str, Any]] = Field(default_factory=list)
+    content_results: List[PulpContentRow] = Field(default_factory=list)
     artifacts: List[Dict[str, str]] = Field(default_factory=list)
 
     @property
@@ -301,6 +336,9 @@ class FileInfoModel(KonfluxBaseModel):
     size: Optional[int] = None
 
 
+FileInfoMap = Dict[str, FileInfoModel]
+
+
 __all__ = [
     "DownloadTask",
     "ArtifactFile",
@@ -309,5 +347,8 @@ class FileInfoModel(KonfluxBaseModel):
     "ArtifactJsonResponse",
     "ArtifactData",
     "ContentData",
+    "ExtraArtifactRef",
+    "FileInfoMap",
     "FileInfoModel",
+    "PulpContentRow",
 ]
@@ -112,7 +112,8 @@ def _upload_rpms_to_repository(
     logging.warning("Uploading %d RPM file(s)", len(rpm_infos))
 
     # Upload all RPMs in parallel using the consolidated function
-    rpm_artifacts = upload_rpms_parallel(pulp_client, rpm_infos)
+    rpm_pairs = upload_rpms_parallel(pulp_client, rpm_infos)
+    rpm_artifacts = [href for _path, href in rpm_pairs]
 
     # Add all successfully uploaded RPM artifacts to the repository
     if rpm_artifacts: