|
| 1 | +// Copyright (c) 2024, QuantStack and Mamba Contributors |
| 2 | +// |
| 3 | +// Distributed under the terms of the BSD 3-Clause License. |
| 4 | +// |
| 5 | +// The full license is in the file LICENSE, distributed with this software. |
| 6 | + |
| 7 | +#ifndef MAMBA_CORE_SHARD_TYPES_HPP |
| 8 | +#define MAMBA_CORE_SHARD_TYPES_HPP |
| 9 | + |
#include <cstddef>
#include <cstdint>
#include <map>
#include <optional>
#include <string>
#include <vector>

#include "mamba/specs/package_info.hpp"
#include "mamba/specs/repo_data.hpp"
| 17 | + |
| 18 | +namespace mamba |
| 19 | +{ |
| 20 | + /** |
| 21 | + * Package record dictionary for shard data. |
| 22 | + * |
| 23 | + * This is a simplified representation of package metadata used in shards. |
| 24 | + * It exists separately from other package types for several reasons: |
| 25 | + * |
| 26 | + * **Comparison with specs::RepoDataPackage:** |
| 27 | + * - Uses primitive types (string for version, string for noarch) instead of |
| 28 | + * complex types (Version object, NoArchType enum), making direct msgpack |
| 29 | + * deserialization faster and more straightforward. |
| 30 | + * - Contains only fields needed for dependency traversal, reducing memory usage |
| 31 | + * when processing many shards. |
| 32 | + * - Conversion to RepoDataPackage happens lazily when building repodata for |
| 33 | + * the solver, deferring parsing costs until actually needed. |
| 34 | + * |
| 35 | + * **Comparison with specs::PackageInfo:** |
| 36 | + * - PackageInfo is the runtime representation used for installed packages, |
| 37 | + * transactions, and queries. It uses string for version (like ShardPackageRecord) |
| 38 | + * but NoArchType enum (like RepoDataPackage), and includes runtime-specific |
| 39 | + * fields like channel, package_url, platform, filename, signatures, etc. |
| 40 | + * - ShardPackageRecord is purely for parsing msgpack shards and contains only |
| 41 | + * the minimal fields needed for dependency traversal. |
| 42 | + * - PackageInfo is created from RepoDataPackage when packages are added to the |
| 43 | + * solver database, not directly from ShardPackageRecord. |
| 44 | + * |
| 45 | + * **Key design decisions:** |
| 46 | + * |
| 47 | + * 1. **Simpler msgpack parsing**: The msgpack format from Python shards uses simple |
| 48 | + * types that map directly to primitives, avoiding complex type parsing during |
| 49 | + * deserialization. |
| 50 | + * |
| 51 | + * 2. **Compact storage**: Carries the fields needed for dependency traversal (name, |
| 52 | + * version, build, depends, constrains) plus a few descriptive fields (checksums, size, |
| 53 | + * noarch, license, license_family, subdir, timestamp). Fields such as track_features are omitted. |
| 54 | + * |
| 55 | + * 3. **Lazy conversion**: Conversion to specs::RepoDataPackage happens only when |
| 56 | + * building repodata for the solver (via to_repo_data_package()), deferring |
| 57 | + * Version/NoArchType parsing costs until actually needed. |
| 58 | + * |
| 59 | + * 4. **Flexible msgpack handling**: Custom parsing handles various msgpack types |
| 60 | + * for sha256/md5 (strings, bytes, EXT types), easier with a dedicated structure. |
| 61 | + * |
| 62 | + * This structure supports all fields defined in the shard format specification. |
| 63 | + * See https://conda.org/learn/ceps/cep-0016 for the complete shard format specification. |
| 64 | + * |
| 65 | + * @see specs::RepoDataPackage The canonical package record type used for repodata.json |
| 66 | + * @see specs::PackageInfo The runtime package representation used throughout the codebase |
| 67 | + * @see to_repo_data_package() Conversion function to RepoDataPackage |
| 68 | + * @see from_repo_data_package() Conversion function from RepoDataPackage |
| 69 | + * @see https://conda.org/learn/ceps/cep-0016 CEP 16 - Sharded Repodata specification |
| 70 | + */ |
struct ShardPackageRecord
{
    /** Package name. */
    std::string name;

    /** Version, kept as an unparsed string rather than a Version object. */
    std::string version;

    /** Build string. */
    std::string build;

    /** Build number; stays 0 unless explicitly set. */
    std::size_t build_number{ 0 };

    /** SHA-256 checksum, when the record provides one. */
    std::optional<std::string> sha256;

    /** MD5 checksum, when the record provides one. */
    std::optional<std::string> md5;

    /** Entries of the record's dependency list. */
    std::vector<std::string> depends;

    /** Entries of the record's constraint list. */
    std::vector<std::string> constrains;

    /** noarch kind as a plain string rather than a NoArchType enum; unset when absent. */
    std::optional<std::string> noarch;

    /** Package size (presumably in bytes -- confirm against the shard format); 0 unless set. */
    std::size_t size{ 0 };

    /** License, when the record provides one. */
    std::optional<std::string> license;

    /** License family, when the record provides one. */
    std::optional<std::string> license_family;

    /** Subdirectory the record belongs to, when the record provides one. */
    std::optional<std::string> subdir;

    /** Timestamp, when the record provides one (unit is not fixed by this header). */
    std::optional<std::size_t> timestamp;
};
| 88 | + |
| 89 | + /** |
| 90 | + * A shard dictionary containing packages for a single package name. |
| 91 | + * |
| 92 | + * Maps to the Python ShardDict type. Contains all versions of a package |
| 93 | + * in both .tar.bz2 and .conda formats. |
| 94 | + */ |
| 95 | + struct ShardDict |
| 96 | + { |
| 97 | + /** Packages in .tar.bz2 format, keyed by filename. */ |
| 98 | + std::map<std::string, ShardPackageRecord> packages; |
| 99 | + |
| 100 | + /** Packages in .conda format, keyed by filename. */ |
| 101 | + std::map<std::string, ShardPackageRecord> conda_packages; |
| 102 | + }; |
| 103 | + |
/**
 * The "info" section of repodata.
 *
 * Holds channel-level metadata: the two base URLs and the subdir name.
 */
struct RepoMetadata
{
    /** Base URL under which package files are stored. */
    std::string base_url;

    /** Base URL under which shard files are stored. */
    std::string shards_base_url;

    /** Name of the subdirectory (platform). */
    std::string subdir;
};
| 120 | + |
| 121 | + /** |
| 122 | + * Shards index dictionary. |
| 123 | + * |
| 124 | + * This is the structure parsed from repodata_shards.msgpack.zst. |
| 125 | + * It maps package names to their shard hash (SHA256). |
| 126 | + */ |
| 127 | + struct ShardsIndexDict |
| 128 | + { |
| 129 | + /** Channel information. */ |
| 130 | + RepoMetadata info; |
| 131 | + |
| 132 | + /** Version of the shards index format. */ |
| 133 | + std::size_t version = 1; |
| 134 | + |
| 135 | + /** |
| 136 | + * Map of package names to their shard hash. |
| 137 | + * |
| 138 | + * The hash is stored as raw bytes (32 bytes for SHA256). |
| 139 | + */ |
| 140 | + std::map<std::string, std::vector<std::uint8_t>> shards; |
| 141 | + }; |
| 142 | + |
| 143 | + /** |
| 144 | + * Complete repodata dictionary. |
| 145 | + * |
| 146 | + * Combines shard data with repodata metadata. |
| 147 | + */ |
| 148 | + struct RepodataDict |
| 149 | + { |
| 150 | + /** Channel information. */ |
| 151 | + RepoMetadata info; |
| 152 | + |
| 153 | + /** Repodata version. */ |
| 154 | + std::size_t repodata_version = 2; |
| 155 | + |
| 156 | + /** Shard dictionary containing packages in both .tar.bz2 and .conda formats. */ |
| 157 | + ShardDict shard_dict; |
| 158 | + }; |
| 159 | + |
| 160 | + /** |
| 161 | + * Convert ShardPackageRecord to specs::RepoDataPackage. |
| 162 | + * |
| 163 | + * This conversion is used when building repodata for the solver. |
| 164 | + */ |
| 165 | + specs::RepoDataPackage to_repo_data_package(const ShardPackageRecord& record); |
| 166 | + |
| 167 | + /** |
| 168 | + * Convert specs::RepoDataPackage to ShardPackageRecord. |
| 169 | + * |
| 170 | + * This conversion is used when treating monolithic repodata as shards. |
| 171 | + */ |
| 172 | + ShardPackageRecord from_repo_data_package(const specs::RepoDataPackage& record); |
| 173 | + |
| 174 | + /** |
| 175 | + * Convert RepodataDict to specs::RepoData. |
| 176 | + * |
| 177 | + * This conversion is used when building repodata for the solver from shards. |
| 178 | + */ |
| 179 | + specs::RepoData to_repo_data(const RepodataDict& repodata); |
| 180 | + |
| 181 | + /** |
| 182 | + * Convert ShardPackageRecord to specs::PackageInfo. |
| 183 | + * |
| 184 | + * This conversion is used when loading packages from shards into the package database. |
| 185 | + * Requires additional metadata (filename, channel, platform, base_url) that is not |
| 186 | + * present in ShardPackageRecord but needed for PackageInfo. |
| 187 | + * |
| 188 | + * @param record The shard package record to convert |
| 189 | + * @param filename The package filename (e.g., "package-1.0.0-h123_0.tar.bz2") |
| 190 | + * @param channel_id The channel identifier |
| 191 | + * @param platform The platform for this package |
| 192 | + * @param base_url The base URL for constructing package_url |
| 193 | + * @return PackageInfo object with all fields populated |
| 194 | + */ |
| 195 | + specs::PackageInfo to_package_info( |
| 196 | + const ShardPackageRecord& record, |
| 197 | + const std::string& filename, |
| 198 | + const std::string& channel_id, |
| 199 | + const specs::DynamicPlatform& platform, |
| 200 | + const std::string& base_url |
| 201 | + ); |
| 202 | + |
| 203 | +} |
| 204 | + |
| 205 | +#endif |
0 commit comments