Skip to content

Commit b98c281

Browse files
feat: Shard types and utils (#4145)
Signed-off-by: Julien Jerphanion <git@jjerphan.xyz> Co-authored-by: Johan Mabille <johan.mabille@gmail.com>
1 parent 16ca83c commit b98c281

File tree

11 files changed

+1484
-6
lines changed

11 files changed

+1484
-6
lines changed

.github/workflows/brew.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ jobs:
3434
run: sudo chown -R linuxbrew .
3535

3636
- name: Install host and build dependencies
37-
run: brew install fmt libarchive libsolv lz4 openssl@3 reproc simdjson xz yaml-cpp zstd cmake cli11 nlohmann-json spdlog tl-expected curl pkgconfig python bzip2 krb5 zlib
37+
run: brew install fmt libarchive libsolv lz4 openssl@3 reproc simdjson xz yaml-cpp zstd cmake cli11 nlohmann-json spdlog tl-expected curl pkgconfig python bzip2 krb5 zlib msgpack
3838

3939
- name: Configure to build mamba
4040
run: cmake -S. -Bbuild -DBUILD_LIBMAMBA=ON -DBUILD_MAMBA=ON -DBUILD_LIBMAMBA_SPDLOG=ON -DBUILD_SHARED=ON -DBUILD_STATIC=OFF
@@ -56,7 +56,7 @@ jobs:
5656
run: >
5757
brew install --overwrite
5858
fmt libarchive libsolv lz4 openssl@3 reproc simdjson xz yaml-cpp zstd
59-
cli11 nlohmann-json spdlog tl-expected pkgconfig python
59+
cli11 nlohmann-json spdlog tl-expected pkgconfig python msgpack
6060
6161
- name: Configure to build mamba
6262
run: >

.github/workflows/static_build.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ jobs:
199199
yaml-cpp-static>=0.8.0
200200
libsolv-static>=0.7.24
201201
reproc-cpp-static>=14.2.4.post0
202+
msgpack-c
202203
- name: build micromamba
203204
shell: cmd /C call {0}
204205
run: |

dev/environment-dev.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ dependencies:
2121
- libcurl >=7.86
2222
- libsodium
2323
- libsolv >=0.7.18
24+
- msgpack-c
2425
- nlohmann_json
2526
- reproc-cpp >=14.2.4.post0
2627
- simdjson >=3.3.0

libmamba/CMakeLists.txt

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ set(
248248
${LIBMAMBA_SOURCE_DIR}/core/repo_checker_store.cpp
249249
${LIBMAMBA_SOURCE_DIR}/core/run.cpp
250250
${LIBMAMBA_SOURCE_DIR}/core/shell_init.cpp
251+
${LIBMAMBA_SOURCE_DIR}/core/shard_types.cpp
251252
${LIBMAMBA_SOURCE_DIR}/core/singletons.cpp
252253
${LIBMAMBA_SOURCE_DIR}/core/subdir_index.cpp
253254
${LIBMAMBA_SOURCE_DIR}/core/thread_utils.cpp
@@ -397,6 +398,7 @@ set(
397398
${LIBMAMBA_INCLUDE_DIR}/mamba/core/repo_checker_store.hpp
398399
${LIBMAMBA_INCLUDE_DIR}/mamba/core/run.hpp
399400
${LIBMAMBA_INCLUDE_DIR}/mamba/core/shell_init.hpp
401+
${LIBMAMBA_INCLUDE_DIR}/mamba/core/shard_types.hpp
400402
${LIBMAMBA_INCLUDE_DIR}/mamba/core/subdir_index.hpp
401403
${LIBMAMBA_INCLUDE_DIR}/mamba/core/tasksync.hpp
402404
${LIBMAMBA_INCLUDE_DIR}/mamba/core/thread_utils.hpp
@@ -436,6 +438,7 @@ find_package(yaml-cpp CONFIG REQUIRED)
436438
find_package(reproc CONFIG REQUIRED)
437439
find_package(reproc++ CONFIG REQUIRED)
438440
find_package(Libsolv MODULE REQUIRED)
441+
find_package(msgpack-c CONFIG REQUIRED)
439442
add_subdirectory(ext/solv-cpp)
440443

441444
macro(libmamba_create_target target_name linkage output_name)
@@ -455,7 +458,7 @@ macro(libmamba_create_target target_name linkage output_name)
455458
)
456459

457460
# Header only libraries are always linked the same way
458-
target_link_libraries(${target_name} PUBLIC tl::expected nlohmann_json::nlohmann_json)
461+
target_link_libraries(${target_name} PUBLIC tl::expected nlohmann_json::nlohmann_json msgpack-c)
459462

460463
target_compile_features(${target_name} PUBLIC cxx_std_20)
461464
set_target_properties(
@@ -478,7 +481,7 @@ macro(libmamba_create_target target_name linkage output_name)
478481

479482
target_link_libraries(
480483
${target_name}
481-
PUBLIC fmt::fmt-header-only yaml-cpp::yaml-cpp
484+
PUBLIC fmt::fmt-header-only yaml-cpp::yaml-cpp msgpack-c
482485
PRIVATE
483486
reproc
484487
reproc++
@@ -621,7 +624,8 @@ macro(libmamba_create_target target_name linkage output_name)
621624

622625
target_link_libraries(
623626
${target_name}
624-
PUBLIC ${LIBSOLV_LIBRARIES} ${LIBSOLVEXT_LIBRARIES} yaml-cpp::yaml-cpp fmt::fmt
627+
PUBLIC
628+
${LIBSOLV_LIBRARIES} ${LIBSOLVEXT_LIBRARIES} yaml-cpp::yaml-cpp fmt::fmt msgpack-c
625629
PRIVATE
626630
${LibArchive_LIBRARIES}
627631
${CURL_LIBRARIES}
Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
// Copyright (c) 2024, QuantStack and Mamba Contributors
2+
//
3+
// Distributed under the terms of the BSD 3-Clause License.
4+
//
5+
// The full license is in the file LICENSE, distributed with this software.
6+
7+
#ifndef MAMBA_CORE_SHARD_TYPES_HPP
8+
#define MAMBA_CORE_SHARD_TYPES_HPP
9+
10+
#include <cstddef>
#include <cstdint>
#include <map>
#include <optional>
#include <string>
#include <vector>
14+
15+
#include "mamba/specs/package_info.hpp"
16+
#include "mamba/specs/repo_data.hpp"
17+
18+
namespace mamba
19+
{
20+
/**
21+
* Package record dictionary for shard data.
22+
*
23+
* This is a simplified representation of package metadata used in shards.
24+
* It exists separately from other package types for several reasons:
25+
*
26+
* **Comparison with specs::RepoDataPackage:**
27+
* - Uses primitive types (string for version, string for noarch) instead of
28+
* complex types (Version object, NoArchType enum), making direct msgpack
29+
* deserialization faster and more straightforward.
30+
* - Contains a reduced set of fields (mainly those needed for dependency traversal),
31+
* reducing memory usage when processing many shards.
32+
* - Conversion to RepoDataPackage happens lazily when building repodata for
33+
* the solver, deferring parsing costs until actually needed.
34+
*
35+
* **Comparison with specs::PackageInfo:**
36+
* - PackageInfo is the runtime representation used for installed packages,
37+
* transactions, and queries. It uses string for version (like ShardPackageRecord)
38+
* but NoArchType enum (like RepoDataPackage), and includes runtime-specific
39+
* fields like channel, package_url, platform, filename, signatures, etc.
40+
* - ShardPackageRecord is purely for parsing msgpack shards and keeps a reduced
41+
* set of fields, centred on what dependency traversal needs.
42+
* - PackageInfo is created from RepoDataPackage when packages are added to the
43+
* solver database, not directly from ShardPackageRecord.
44+
*
45+
* **Key design decisions:**
46+
*
47+
* 1. **Simpler msgpack parsing**: The msgpack format from Python shards uses simple
48+
* types that map directly to primitives, avoiding complex type parsing during
49+
* deserialization.
50+
*
51+
* 2. **Minimal storage**: Mainly the fields needed for dependency traversal (name,
52+
* version, build, depends, constrains), plus a few descriptive fields (size, license,
53+
* license_family, subdir, timestamp). Fields such as track_features are not stored.
54+
*
55+
* 3. **Lazy conversion**: Conversion to specs::RepoDataPackage happens only when
56+
* building repodata for the solver (via to_repo_data_package()), deferring
57+
* Version/NoArchType parsing costs until actually needed.
58+
*
59+
* 4. **Flexible msgpack handling**: Custom parsing handles various msgpack types
60+
* for sha256/md5 (strings, bytes, EXT types), easier with a dedicated structure.
61+
*
62+
* This structure covers only a subset of the fields a shard record may carry (e.g. no track_features).
63+
* See https://conda.org/learn/ceps/cep-0016 for the complete shard format specification.
64+
*
65+
* @see specs::RepoDataPackage The canonical package record type used for repodata.json
66+
* @see specs::PackageInfo The runtime package representation used throughout the codebase
67+
* @see to_repo_data_package() Conversion function to RepoDataPackage
68+
* @see from_repo_data_package() Conversion function from RepoDataPackage
69+
* @see https://conda.org/learn/ceps/cep-0016 CEP 16 - Sharded Repodata specification
70+
*/
71+
struct ShardPackageRecord
{
    /** Package name. */
    std::string name;

    /** Version, kept as a plain string rather than a parsed Version object. */
    std::string version;

    /** Build string. */
    std::string build;

    /** Build number; 0 unless set. */
    std::size_t build_number{ 0 };

    /** SHA256 checksum as a string; empty optional when not set. */
    std::optional<std::string> sha256;

    /** MD5 checksum as a string; empty optional when not set. */
    std::optional<std::string> md5;

    /** Dependency specs, kept as raw strings. */
    std::vector<std::string> depends;

    /** Constraint specs, kept as raw strings. */
    std::vector<std::string> constrains;

    /** Noarch kind, kept as a plain string rather than a NoArchType enum. */
    std::optional<std::string> noarch;

    /** Package size; 0 unless set. */
    std::size_t size{ 0 };

    /** License identifier; empty optional when not set. */
    std::optional<std::string> license;

    /** License family; empty optional when not set. */
    std::optional<std::string> license_family;

    /** Subdirectory (platform) name; empty optional when not set. */
    std::optional<std::string> subdir;

    /** Timestamp; empty optional when not set. */
    std::optional<std::size_t> timestamp;
};
88+
89+
/**
90+
* A shard dictionary containing packages for a single package name.
91+
*
92+
* Maps to the Python ShardDict type. Contains all versions of a package,
93+
* split by archive format into two maps (.tar.bz2 and .conda). Both maps are
94+
* std::map keyed by package filename, so iteration follows filename order.
*/
95+
struct ShardDict
96+
{
97+
/** Records for packages in .tar.bz2 format, keyed by package filename. */
98+
std::map<std::string, ShardPackageRecord> packages;
99+
100+
/** Records for packages in .conda format, keyed by package filename. */
101+
std::map<std::string, ShardPackageRecord> conda_packages;
102+
};
103+
104+
/**
105+
* Information dictionary from repodata.
106+
*
107+
* Contains channel metadata including base URLs and subdir information.
* All three members are plain strings and are empty when default-constructed.
108+
*/
109+
struct RepoMetadata
110+
{
111+
/** Base URL where packages are stored. */
112+
std::string base_url;
113+
114+
/** Base URL where shards are stored. */
115+
std::string shards_base_url;
116+
117+
/** Subdirectory (platform) name. */
118+
std::string subdir;
119+
};
120+
121+
/**
122+
* Shards index dictionary.
123+
*
124+
* This is the structure parsed from repodata_shards.msgpack.zst.
125+
* It maps package names to their shard hash (SHA256).
126+
*/
127+
struct ShardsIndexDict
128+
{
129+
/** Channel information. */
130+
RepoMetadata info;
131+
132+
/** Version of the shards index format. */
133+
std::size_t version = 1;
134+
135+
/**
136+
* Map of package names to their shard hash.
137+
*
138+
* The hash is stored as raw bytes (32 bytes for SHA256).
139+
*/
140+
std::map<std::string, std::vector<std::uint8_t>> shards;
141+
};
142+
143+
/**
144+
* Complete repodata dictionary.
145+
*
146+
* Combines shard data with repodata metadata.
147+
*/
148+
struct RepodataDict
149+
{
150+
/** Channel information. */
151+
RepoMetadata info;
152+
153+
/** Repodata version. */
154+
std::size_t repodata_version = 2;
155+
156+
/** Shard dictionary containing packages in both .tar.bz2 and .conda formats. */
157+
ShardDict shard_dict;
158+
};
159+
160+
/**
161+
* Convert ShardPackageRecord to specs::RepoDataPackage.
162+
*
163+
* This conversion is used when building repodata for the solver.
164+
*/
165+
specs::RepoDataPackage to_repo_data_package(const ShardPackageRecord& record);
166+
167+
/**
168+
* Convert specs::RepoDataPackage to ShardPackageRecord.
169+
*
170+
* This conversion is used when treating monolithic repodata as shards.
171+
*/
172+
ShardPackageRecord from_repo_data_package(const specs::RepoDataPackage& record);
173+
174+
/**
175+
* Convert RepodataDict to specs::RepoData.
176+
*
177+
* This conversion is used when building repodata for the solver from shards.
178+
*/
179+
specs::RepoData to_repo_data(const RepodataDict& repodata);
180+
181+
/**
182+
* Convert ShardPackageRecord to specs::PackageInfo.
183+
*
184+
* This conversion is used when loading packages from shards into the package database.
185+
* Requires additional metadata (filename, channel, platform, base_url) that is not
186+
* present in ShardPackageRecord but needed for PackageInfo.
187+
*
188+
* @param record The shard package record to convert
189+
* @param filename The package filename (e.g., "package-1.0.0-h123_0.tar.bz2")
190+
* @param channel_id The channel identifier
191+
* @param platform The platform for this package
192+
* @param base_url The base URL for constructing package_url
193+
* @return PackageInfo object with all fields populated
194+
*/
195+
specs::PackageInfo to_package_info(
196+
const ShardPackageRecord& record,
197+
const std::string& filename,
198+
const std::string& channel_id,
199+
const specs::DynamicPlatform& platform,
200+
const std::string& base_url
201+
);
202+
203+
}
204+
205+
#endif

0 commit comments

Comments
 (0)