4444from ..utils .artifact_detection import rpm_packages_letter_and_basename
4545from ..utils .constants import DEFAULT_CHUNK_SIZE , SUPPORTED_ARCHITECTURES
4646from ..utils .validation import sanitize_build_id_for_repository , validate_build_id
47- from ..utils .rpm_operations import parse_rpm_filename_to_nvr
47+ from ..utils .rpm_operations import calculate_sha256_checksum , parse_rpm_filename_to_nvr
48+ from ..models .artifacts import ContentData , ExtraArtifactRef , FileInfoMap , PulpContentRow
4849from .auth import OAuth2ClientCredentialsAuth
4950
5051# Resource-based mixins
@@ -1663,7 +1664,9 @@ async def _fetch_rpm_by_signed_by_then_filter_nvr(
16631664 request = _EMPTY_RESPONSE_REQUEST ,
16641665 )
16651666
# NOTE(review): this span is a scraped unified diff (old/new line numbers are
# fused at the start of each row; '-' rows = removed code, '+' rows = added
# code, '@@' rows = hunk headers). Comments below describe the NEW (+) version
# of gather_content_data only.
1666- def gather_content_data (self , build_id : str , extra_artifacts : Optional [List [Dict [str , str ]]] = None ) -> Any :
1667+ def gather_content_data (
1668+ self , build_id : str , extra_artifacts : Optional [List [ExtraArtifactRef ]] = None
1669+ ) -> ContentData :
16671670 """
16681671 Gather content data and artifacts for a build ID.
16691672
@@ -1674,10 +1677,8 @@ def gather_content_data(self, build_id: str, extra_artifacts: Optional[List[Dict
16741677 Returns:
16751678 ContentData containing content results and artifacts
16761679 """
1677- from ..models .artifacts import ContentData
1678-
1679- content_results = []
1680- artifacts : List [Dict [str , Any ]] = []
# raw_results holds unvalidated JSON dicts; they are converted to
# PulpContentRow models only after the early-exit checks below.
1680+ raw_results : List [Dict [str , Any ]] = []
1681+ artifacts : List [Dict [str , str ]] = []
16811682
16821683 # Always use bulk query by build_id for efficiency
16831684 # This gets all content in a single API call instead of N individual calls
@@ -1689,48 +1690,48 @@ def gather_content_data(self, build_id: str, extra_artifacts: Optional[List[Dict
# Build-ID lookup failures are fatal (logged then re-raised); the href
# fallback further down deliberately swallows its errors instead.
16891690 try :
16901691 resp = self .find_content ("build_id" , build_id )
16911692 resp_json = resp .json ()
1692- content_results = resp_json ["results" ]
1693+ raw_results = resp_json ["results" ]
16931694 except Exception :
16941695 logging .error ("Failed to get content by build ID" , exc_info = True )
16951696 raise
16961697
16971698 # If no results from build_id query and we have extra_artifacts, try querying by href
16981699 # This handles the case where content hasn't been indexed yet
1699- if not content_results and extra_artifacts :
1700+ if not raw_results and extra_artifacts :
17001701 logging .warning (
17011702 "No content found by build_id, trying direct href query for %d artifacts" , len (extra_artifacts )
17021703 )
17031704 try :
17041705 # Extract content hrefs from extra_artifacts
17051706 # Note: extra_artifacts contains content hrefs (not artifact hrefs)
1706- href_list = [
1707- artifact .get ("pulp_href" , "" ) for artifact in extra_artifacts if artifact .get ("pulp_href" , "" )
1708- ]
1707+ href_list = [a .pulp_href for a in extra_artifacts if a .pulp_href ]
17091708 if href_list :
17101709 href_query = "," .join (href_list )
17111710 resp = self .find_content ("href" , href_query )
17121711 resp_json = resp .json ()
1713- content_results = resp_json ["results" ]
1714- logging .info ("Found %d content items by href query" , len (content_results ))
1712+ raw_results = resp_json ["results" ]
1713+ logging .info ("Found %d content items by href query" , len (raw_results ))
17151714 except Exception :
17161715 logging .error ("Failed to get content by href" , exc_info = True )
17171716 # Don't raise, just continue with empty results
17181717
1719- if not content_results :
1718+ if not raw_results :
17201719 logging .warning ("No content found for build ID: %s" , build_id )
17211720 return ContentData ()
17221721
# NOTE(review): assumes PulpContentRow.model_validate accepts every dict
# Pulp returns here (extra/unknown fields included) — confirm model config.
1722+ content_results = [PulpContentRow .model_validate (r ) for r in raw_results ]
1723+
17231724 logging .info ("Found %d content items for build_id: %s" , len (content_results ), build_id )
17241725
17251726 # Log details about what content was found
17261727 if content_results :
17271728 logging .info ("Content types found:" )
17281729 for idx , result in enumerate (content_results ):
1729- pulp_href = result .get ( " pulp_href" , "" )
1730+ pulp_href = result .pulp_href
17301731 content_type = self ._get_content_type_from_href (pulp_href )
17311732
17321733 # Get relative paths from artifacts dict
1733- artifacts_dict = result .get ( " artifacts" , {})
1734+ artifacts_dict = result .artifacts or {}
17341735 if artifacts_dict :
17351736 relative_paths = list (artifacts_dict .keys ())
17361737 logging .info (" - %s: %s" , content_type , ", " .join (relative_paths ))
# NOTE(review): hunk boundary below hides old lines 1737-1738 (presumably the
# else-branch of the artifacts_dict check) — not visible in this diff view.
@@ -1739,28 +1740,65 @@ def gather_content_data(self, build_id: str, extra_artifacts: Optional[List[Dict
17391740
17401741 # Log full structure for first item to help with debugging
17411742 if idx == 0 :
1742- logging .debug ("First content item full structure: %s" , json .dumps (result , indent = 2 , default = str ))
1743+ logging .debug (
1744+ "First content item full structure: %s" ,
1745+ json .dumps (result .model_dump (), indent = 2 , default = str ),
1746+ )
17431747
17441748 # Extract artifacts from content results
17451749 # Content structure has "artifacts" (plural) field which is a dict: {relative_path: artifact_href}
17461750 artifacts = [
17471751 {"artifact" : artifact_href }
17481752 for result in content_results
1749- for artifact_href in result .get ( " artifacts" , {}).values ()
1753+ for artifact_href in ( result .artifacts or {}).values ()
17501754 if artifact_href
17511755 ]
17521756
17531757 logging .info ("Extracted %d artifact hrefs from content results" , len (artifacts ))
17541758 return ContentData (content_results = content_results , artifacts = artifacts )
17551759
1760+ def add_uploaded_artifact_to_results_model (
1761+ self ,
1762+ results_model : Any ,
1763+ * ,
1764+ local_path : str ,
1765+ labels : Dict [str , str ],
1766+ is_rpm : bool ,
1767+ distribution_urls : Dict [str , str ],
1768+ target_arch_repo : bool = False ,
1769+ file_relative_path : Optional [str ] = None ,
1770+ ) -> None :
1771+ """
1772+ Add one uploaded artifact to PulpResultsModel using the same keys and URLs as gather/build.
1773+
1774+ Called after upload tasks succeed so results JSON can be built incrementally.
1775+ """
1776+ relative_path = os .path .basename (local_path ) if is_rpm else (file_relative_path or os .path .basename (local_path ))
1777+ build_id = labels .get ("build_id" , "" )
1778+ if is_rpm :
1779+ artifact_key = relative_path
1780+ else :
1781+ artifact_key = f"{ build_id } /{ relative_path } " if build_id else relative_path
1782+
1783+ sha256_hex = calculate_sha256_checksum (local_path )
1784+ artifact_url = self ._build_artifact_distribution_url (
1785+ relative_path ,
1786+ is_rpm ,
1787+ labels ,
1788+ distribution_urls ,
1789+ target_arch_repo = target_arch_repo ,
1790+ )
1791+ results_model .add_artifact (key = artifact_key , url = artifact_url , sha256 = sha256_hex , labels = labels )
1792+
# NOTE(review): scraped unified diff (fused old/new line numbers; '-' removed,
# '+' added, '@@' hunk headers). Two hunk gaps hide most of the loop body
# (old lines ~1806-1841), and the definition is truncated at the end of this
# view — annotations below cover only the visible NEW (+) code.
17561793 def build_results_structure (
17571794 self ,
17581795 results_model : Any ,
1759- content_results : List [Dict [ str , Any ] ],
1760- file_info_map : Dict [ str , Any ] ,
1796+ content_results : List [PulpContentRow ],
1797+ file_info_map : FileInfoMap ,
17611798 distribution_urls : Optional [Dict [str , str ]] = None ,
17621799 * ,
17631800 target_arch_repo : bool = False ,
1801+ merge : bool = False ,
17641802 ) -> Any :
17651803 """
17661804 Build the results structure from content and file info using optimized single-pass processing.
@@ -1771,6 +1809,7 @@ def build_results_structure(
17711809 file_info_map: Mapping of artifact hrefs to file info models
17721810 distribution_urls: Optional dictionary mapping repo_type to distribution base URL
17731811 target_arch_repo: When True, RPM URLs use per-arch distribution paths from labels
1812+ merge: When True, skip artifact keys already present (incremental upload + reconcile)
17741813
17751814 Returns:
17761815 Populated PulpResultsModel
@@ -1784,12 +1823,12 @@ def build_results_structure(
17841823 missing_file_info = 0
17851824
17861825 for idx , content in enumerate (content_results ):
# dict() copy ensures later label mutations cannot alias the model's field.
1787- labels = content .get ( " pulp_labels" , {})
1826+ labels = dict ( content .pulp_labels or {})
17881827 build_id = labels .get ("build_id" , "" )
1789- pulp_href = content .get ( " pulp_href" , "unknown" )
1828+ pulp_href = content .pulp_href or "unknown"
17901829
17911830 # Content structure has "artifacts" (plural) field which is a dict: {relative_path: artifact_href}
1792- artifacts_dict = content .get ( " artifacts" , {})
1831+ artifacts_dict = content .artifacts or {}
17931832
17941833 if not artifacts_dict :
17951834 missing_artifacts += 1
@@ -1798,9 +1837,9 @@ def build_results_structure(
17981837 logging .warning (
17991838 "Content item %d structure (no artifacts field). Available fields: %s" ,
18001839 idx ,
1801- list (content .keys ()),
1840+ list (content .model_dump ( exclude_none = True ). keys ()),
18021841 )
1803- logging .debug ("Full content: %s" , json .dumps (content , indent = 2 , default = str ))
1842+ logging .debug ("Full content: %s" , json .dumps (content . model_dump () , indent = 2 , default = str ))
18041843 continue
18051844
18061845 # Determine content type once per content item (cached via lru_cache)
# NOTE(review): hunk gap — old lines 1807-1841 (artifact_key / file_info /
# artifact_url derivation) are not visible in this diff view.
@@ -1842,6 +1881,16 @@ def build_results_structure(
18421881 target_arch_repo = target_arch_repo ,
18431882 )
18441883
# In merge mode the incrementally-recorded entry wins: a mismatching gathered
# entry is only warned about, never overwritten.
1884+ if merge and artifact_key in results_model .artifacts :
1885+ existing = results_model .artifacts [artifact_key ]
1886+ gi_sha = file_info .sha256 or ""
1887+ if existing .url != artifact_url or (existing .sha256 or "" ) != gi_sha :
1888+ logging .warning (
1889+ "Gathered artifact %s differs from incremental entry (keeping incremental)" ,
1890+ artifact_key ,
1891+ )
1892+ continue
1893+
18451894 # Add artifact to results model
18461895 results_model .add_artifact (
18471896 key = artifact_key , url = artifact_url , sha256 = file_info .sha256 or "" , labels = labels
# NOTE(review): diff truncated here — the close of the add_artifact call and
# the remainder of build_results_structure lie outside this view.
0 commit comments