Skip to content

Commit 8c71b1c

Browse files
memsharded and grossag authored
Feature/builtin compression (#19337)
* Add support for zstd compression of binary packages
  This change adds zstd support to conan in the following ways:
  1. The person or build running `conan upload` can set a config value core.upload:compression_format = zstd to upload binaries using zstd instead of gzip.
  2. The zstd compression is done entirely in Python using a combination of tarfile and python-zstandard. Then the file is uploaded as normal.
  3. When downloading packages, if a .tar.zst file is encountered, the extraction code uses tarfile and python-zstandard to extract.
  I chose python-zstandard as the library because that is what urllib3 uses.
* Switch to include python-zstandard in the package requirements
  Because zstd decompression is expected to just work if the server has a .tar.zst file, I am including zstandard in requirements.txt. https://python-zstandard.readthedocs.io/en/latest/projectinfo.html#state-of-project recommends that we "Pin the package version to prevent unwanted breakage when this change occurs!", although I doubt that much will change before an eventual 1.0.
* Add a test case to cover zstd compress and decompress
* Downgrade to 0.20.0 to fix CI
  CI is unable to find 0.21.0
* Two small improvements
  1. Change requirements.txt to allow either zstandard 0.20 or 0.21. That prevents a downgrade for people who already have 0.21 installed, while also allowing CI to find 0.20.
  2. Move compressformat parameter earlier in compress_files() function. It made a bit more sense to have it earlier; as long as consumers are correctly using keyword arguments, it shouldn't break anyone.
* Address review feedback
* Add file missed by merge
* Fix typo in parameter which broke tests
* A few more small fixes in hopes of unbreaking the build
  1. Fix bad merge causing uploader.py change to still refer to `self._app.cache.new_config`, when now we are supposed to use `self._global_conf`.
  2. Change two output calls in uploader.py to only output the package file basename to be consistent with other existing log lines.
  3. Use double quotes instead of single quotes to be more consistent with existing code.
* Some more improvements
  1. Downgrade bufsize to 32KB because that performs well for compression and decompression. The values don't need to be the same, but it happened to be the best value in both compression and decompression tests.
  2. Use a context manager for stream_reader as I do for stream_writer.
  3. Add some comments about the bufsize value.
* Address some of the review feedback
  Still need to do some testing though.
* Flush zstd frames around every 128MB
* Fix DeprecationWarning
  Newer Python has this warning: DeprecationWarning: Python 3.14 will, by default, filter extracted tar archives and reject files or modify their metadata. Use the filter argument to control this behavior
* wip
* wip
* review
* compression for cache save/restore too
* fix unit test
* fix tests
* fix tests
* fix save/restore with Path
* last review
* fix tests
---------
Co-authored-by: Adam Gross <grossag@vmware.com>
1 parent b2f4a0b commit 8c71b1c

File tree

15 files changed

+393
-81
lines changed

15 files changed

+393
-81
lines changed

conan/api/subapi/cache.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,14 @@
66

77
from conan.api.model import PackagesList
88
from conan.api.output import ConanOutput
9-
from conan.internal.api.uploader import compress_files
9+
from conan.internal.api.uploader import compress_files, get_compress_level
1010
from conan.internal.cache.cache import PkgCache
1111
from conan.internal.cache.conan_reference_layout import (EXPORT_SRC_FOLDER, EXPORT_FOLDER,
1212
SRC_FOLDER, METADATA,
1313
DOWNLOAD_EXPORT_FOLDER)
1414
from conan.internal.cache.home_paths import HomePaths
1515
from conan.internal.cache.integrity_check import IntegrityChecker
16+
from conan.internal.paths import COMPRESSIONS
1617
from conan.internal.rest.download_cache import DownloadCache
1718
from conan.errors import ConanException
1819
from conan.api.model import PkgReference
@@ -148,7 +149,11 @@ def save(self, package_list: PackagesList, tgz_path, no_source=False) -> None:
148149
cache_folder = cache.store # Note, this is not the home, but the actual package cache
149150
out = ConanOutput()
150151
mkdir(os.path.dirname(tgz_path))
151-
compresslevel = global_conf.get("core.gzip:compresslevel", check_type=int)
152+
tgz_name = os.path.basename(tgz_path)
153+
compressformat = next((e for e in COMPRESSIONS if tgz_name.endswith(e)), None)
154+
if not compressformat:
155+
raise ConanException(f"Unsupported compression format for {tgz_name}")
156+
compresslevel = get_compress_level(compressformat, global_conf)
152157
tar_files: dict[str, str] = {} # {path_in_tar: abs_path}
153158

154159
for ref, packages in package_list.items():
@@ -191,9 +196,9 @@ def save(self, package_list: PackagesList, tgz_path, no_source=False) -> None:
191196
pkglist_path = os.path.join(tempfile.gettempdir(), "pkglist.json")
192197
save(pkglist_path, serialized)
193198
tar_files["pkglist.json"] = pkglist_path
194-
compress_files(tar_files, os.path.basename(tgz_path), os.path.dirname(tgz_path),
195-
compresslevel, recursive=True)
199+
compress_files(tar_files, tgz_name, os.path.dirname(tgz_path), compresslevel, recursive=True)
196200
remove(pkglist_path)
201+
ConanOutput().success(f"Created cache save file: {tgz_path}")
197202

198203
def restore(self, path) -> PackagesList:
199204
if not os.path.isfile(path):

conan/internal/api/uploader.py

Lines changed: 91 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import gzip
33
import os
44
import shutil
5+
import sys
56
import tarfile
67
import time
78

@@ -10,8 +11,8 @@
1011
from conan.internal.source import retrieve_exports_sources
1112
from conan.internal.errors import NotFoundException
1213
from conan.errors import ConanException
13-
from conan.internal.paths import (CONAN_MANIFEST, CONANFILE, EXPORT_SOURCES_TGZ_NAME,
14-
EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME, CONANINFO)
14+
from conan.internal.paths import CONAN_MANIFEST, CONANFILE, CONANINFO, COMPRESSIONS, \
15+
EXPORT_SOURCES_FILE_NAME, EXPORT_FILE_NAME, PACKAGE_FILE_NAME
1516
from conan.internal.util.files import (clean_dirty, is_dirty, gather_files,
1617
set_dirty_context_manager, mkdir, human_size)
1718

@@ -80,10 +81,37 @@ def _check_upstream_package(self, pref, prev_bundle, remote, force):
8081
prev_bundle["upload"] = False
8182

8283

84+
def get_compress_level(compressformat, global_conf):
    """Return the configured compression level for ``compressformat``.

    Emits an "experimental" warning for the ``xz`` and ``zst`` formats, and
    rejects ``zst`` on interpreters older than Python 3.14.

    :param compressformat: compression suffix, one of ``"gz"``, ``"xz"``, ``"zst"``.
    :param global_conf: configuration object queried through
        ``get(name, check_type=int)``.
    :return: the value of ``core:compresslevel``; for ``gz`` only, the legacy
        ``core.gzip:compresslevel`` is used when the former is ``None``.
        The result is whatever ``global_conf.get`` yields, so it may be ``None``.
    :raises ConanException: if ``zst`` is requested on Python < 3.14.
    """
    if compressformat == "xz":
        msg = ("The 'xz' compression is experimental. "
               "Consumers using older Conan versions will not be able to install these packages. "
               "Feedback is welcome, please report any issues as GitHub tickets.")
        ConanOutput().warning(msg, warn_tag="experimental")
    elif compressformat == "zst":
        msg = ("The 'zst' compression is experimental. "
               "Consumers installing packages created with this format must use Python >= 3.14. "
               "Consumers using older Conan or Python versions will not be able to install these "
               "packages. Feedback is welcome, please report any issues as GitHub tickets.")
        ConanOutput().warning(msg, warn_tag="experimental")

    # Compare the whole version tuple: looking only at ``minor`` would misjudge
    # any interpreter whose major version is not 3.
    if compressformat == "zst" and sys.version_info < (3, 14):
        raise ConanException("The 'core.upload:compression_format=zst' is only for Python>=3.14")
    compresslevel = global_conf.get("core:compresslevel", check_type=int)
    if compresslevel is None and compressformat == "gz":
        compresslevel = global_conf.get("core.gzip:compresslevel", check_type=int)
        # do not deprecate yet core.gzip:compresslevel, wait a bit to stabilize core:compresslevel
    return compresslevel
104+
105+
83106
class PackagePreparator:
84107
def __init__(self, app: ConanApp, global_conf):
    """Keep the app and configuration, and resolve the upload compression settings.

    :param app: the ConanApp, stored as ``self._app``.
    :param global_conf: global configuration; ``core.upload:compression_format``
        is read from it (default ``"gz"``, restricted to ``COMPRESSIONS``).
    """
    self._app = app
    self._global_conf = global_conf
    # Format and level are resolved once at construction time and stored
    # on the instance.
    self._compressformat = global_conf.get("core.upload:compression_format", default="gz",
                                           choices=COMPRESSIONS)
    self._compresslevel = get_compress_level(self._compressformat, global_conf)
87115

88116
def prepare(self, pkg_list, enabled_remotes):
89117
local_url = self._global_conf.get("core.scm:local_url", choices=["allow", "block"])
@@ -128,14 +156,6 @@ def _prepare_recipe(self, ref, ref_bundle, conanfile, remotes):
128156
def _compress_recipe_files(self, layout, ref):
129157
download_export_folder = layout.download_export()
130158

131-
output = ConanOutput(scope=str(ref))
132-
for f in (EXPORT_TGZ_NAME, EXPORT_SOURCES_TGZ_NAME):
133-
tgz_path = os.path.join(download_export_folder, f)
134-
if is_dirty(tgz_path):
135-
output.warning("Removing %s, marked as dirty" % f)
136-
os.remove(tgz_path)
137-
clean_dirty(tgz_path)
138-
139159
export_folder = layout.export()
140160
files, symlinked_folders = gather_files(export_folder)
141161
files.update(symlinked_folders)
@@ -159,18 +179,13 @@ def _compress_recipe_files(self, layout, ref):
159179
files.pop(CONANFILE)
160180
files.pop(CONAN_MANIFEST)
161181

162-
def add_tgz(tgz_name, tgz_files):
163-
tgz = os.path.join(download_export_folder, tgz_name)
164-
if os.path.isfile(tgz):
165-
result[tgz_name] = tgz
166-
elif tgz_files:
167-
compresslevel = self._global_conf.get("core.gzip:compresslevel", check_type=int)
168-
tgz = compress_files(tgz_files, tgz_name, download_export_folder,
169-
compresslevel=compresslevel, ref=ref)
170-
result[tgz_name] = tgz
171-
172-
add_tgz(EXPORT_TGZ_NAME, files)
173-
add_tgz(EXPORT_SOURCES_TGZ_NAME, src_files)
182+
if files:
183+
comp = self._compressed_file(EXPORT_FILE_NAME, files, download_export_folder, ref)
184+
result[comp] = os.path.join(download_export_folder, comp)
185+
if src_files:
186+
comp = self._compressed_file(EXPORT_SOURCES_FILE_NAME, src_files,
187+
download_export_folder, ref)
188+
result[comp] = os.path.join(download_export_folder, comp)
174189
return result
175190

176191
def _prepare_package(self, pref, prev_bundle):
@@ -181,14 +196,39 @@ def _prepare_package(self, pref, prev_bundle):
181196
cache_files = self._compress_package_files(pkg_layout, pref)
182197
prev_bundle["files"] = cache_files
183198

199+
def _compressed_file(self, filename, files, download_folder, ref):
    """Return the name of the compressed artifact for ``filename``, creating it if needed.

    An archive already present in ``download_folder`` (in any of the
    ``COMPRESSIONS`` formats) is reused as-is, even if its format differs from
    the configured one. Otherwise ``files`` are compressed with the configured
    format and level.

    :param filename: archive base name without the compression suffix.
    :param files: files to compress, forwarded to ``compress_files``.
    :param download_folder: folder where the archive lives or is created.
    :param ref: reference used as output scope and in error messages.
    :return: the archive file name (not the full path).
    :raises ConanException: if more than one archive exists for ``filename``.
    """
    output = ConanOutput(scope=str(ref))

    # Look for a previously generated archive, dropping any marked as dirty
    found = []
    for suffix in COMPRESSIONS:
        candidate = filename + suffix
        candidate_path = os.path.join(download_folder, candidate)
        if is_dirty(candidate_path):
            output.warning(f"Removing {candidate}, marked as dirty")
            os.remove(candidate_path)
            clean_dirty(candidate_path)
        if os.path.isfile(candidate_path):
            found.append(candidate)

    if len(found) > 1:
        raise ConanException(f"{ref}: Multiple package files found for {filename}: {found}")
    if found:
        existing = found[0]
        if not existing.endswith(self._compressformat):
            output.info(f"Existing {existing} compressed file, "
                        f"keeping it, not using '{self._compressformat}' format")
        return existing

    # Nothing reusable: compress with the configured format
    new_name = filename + self._compressformat
    expected_path = os.path.join(download_folder, new_name)
    compressed_path = compress_files(files, new_name, download_folder,
                                     compresslevel=self._compresslevel, scope=str(ref))
    assert compressed_path == expected_path
    assert os.path.exists(expected_path)
    return new_name
229+
184230
def _compress_package_files(self, layout, pref):
185-
output = ConanOutput(scope=str(pref))
186231
download_pkg_folder = layout.download_package()
187-
package_tgz = os.path.join(download_pkg_folder, PACKAGE_TGZ_NAME)
188-
if is_dirty(package_tgz):
189-
output.warning("Removing %s, marked as dirty" % PACKAGE_TGZ_NAME)
190-
os.remove(package_tgz)
191-
clean_dirty(package_tgz)
192232

193233
# Get all the files in that directory
194234
# existing package
@@ -209,15 +249,8 @@ def _compress_package_files(self, layout, pref):
209249
files.pop(CONANINFO)
210250
files.pop(CONAN_MANIFEST)
211251

212-
if not os.path.isfile(package_tgz):
213-
tgz_files = {f: path for f, path in files.items()}
214-
compresslevel = self._global_conf.get("core.gzip:compresslevel", check_type=int)
215-
tgz_path = compress_files(tgz_files, PACKAGE_TGZ_NAME, download_pkg_folder,
216-
compresslevel=compresslevel, ref=pref)
217-
assert tgz_path == package_tgz
218-
assert os.path.exists(package_tgz)
219-
220-
return {PACKAGE_TGZ_NAME: package_tgz,
252+
compressed_file = self._compressed_file(PACKAGE_FILE_NAME, files, download_pkg_folder, pref)
253+
return {compressed_file: os.path.join(download_pkg_folder, compressed_file),
221254
CONANINFO: os.path.join(download_pkg_folder, CONANINFO),
222255
CONAN_MANIFEST: os.path.join(download_pkg_folder, CONAN_MANIFEST)}
223256

@@ -282,21 +315,36 @@ def gzopen_without_timestamps(name, fileobj, compresslevel=None):
282315
return t
283316

284317

285-
def compress_files(files, name, dest_dir, compresslevel=None, scope=None, recursive=False):
    """Create the tar archive ``name`` inside ``dest_dir`` from ``files``.

    The compression is selected from the suffix of ``name``: ``zst`` and ``xz``
    use ``tarfile`` directly, anything else goes through
    ``gzopen_without_timestamps``.

    :param files: mapping ``{path_in_tar: abs_path}``; entries are added in
        sorted order.
    :param name: archive file name; its suffix selects the format.
    :param dest_dir: folder where the archive is written.
    :param compresslevel: forwarded as ``level`` (zst), ``preset`` (xz) or
        ``compresslevel`` (gz).
    :param scope: scope for the ``ConanOutput`` messages.
    :param recursive: forwarded to ``TarFile.add``.
    :return: the full path of the created archive.
    """
    t1 = time.time()
    tgz_path = os.path.join(dest_dir, name)

    out = ConanOutput(scope=scope)
    out.info(f"Compressing {name}")

    def _add_all(tar):
        for filename, abs_path in sorted(files.items()):
            # recursive is False by default in case it is a symlink to a folder
            tar.add(abs_path, filename, recursive=recursive)

    # Take the dirty marker for every format, not only gz. Callers check
    # is_dirty() for all the supported extensions before reusing an archive.
    # NOTE(review): presumably the marker is left behind when the body raises,
    # so an interrupted zst/xz compression is detected on the next run.
    # Confirm against set_dirty_context_manager.
    with set_dirty_context_manager(tgz_path):
        if name.endswith("zst"):
            with tarfile.open(tgz_path, "w:zst", level=compresslevel) as tar:  # noqa Py314 only
                _add_all(tar)
        elif name.endswith("xz"):
            # Request PAX_FORMAT explicitly; it is the tarfile default only from Python 3.8 on
            with tarfile.open(tgz_path, "w:xz", preset=compresslevel,
                              format=tarfile.PAX_FORMAT) as tar:
                _add_all(tar)
        else:
            with open(tgz_path, "wb") as tgz_handle:
                tgz = gzopen_without_timestamps(name, fileobj=tgz_handle,
                                                compresslevel=compresslevel)
                _add_all(tgz)
                tgz.close()

    out.debug(f"{name} compressed in {time.time() - t1} time")
    return tgz_path
301349

302350

conan/internal/model/conf.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,11 @@
6060
"core.net.http:cacert_path": "Path containing a custom Cacert file",
6161
"core.net.http:client_cert": "Path or tuple of files containing a client cert (and key)",
6262
"core.net.http:clean_system_proxy": "If defined, the proxies system env-vars will be discarded",
63-
# Gzip compression
63+
# Compression for `conan upload`
64+
"core.upload:compression_format": "The compression format used when uploading Conan packages. "
65+
"Possible values: 'zst', 'xz', 'gz' (default=gz)",
6466
"core.gzip:compresslevel": "The Gzip compression level for Conan artifacts (default=9)",
67+
"core:compresslevel": "The compression level for Conan artifacts (default zstd=3, gz=9)",
6568
# Excluded from revision_mode = "scm" dirty and Git().is_dirty() checks
6669
"core.scm:excluded": "List of excluded patterns for builtin git dirty checks",
6770
"core.scm:local_url": "By default allows to store local folders as remote url, but not upload them. Use 'allow' for allowing upload and 'block' to completely forbid it",

conan/internal/model/manifest.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import os
22
from collections import defaultdict
33

4-
from conan.internal.paths import CONAN_MANIFEST, EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME
4+
from conan.internal.paths import CONAN_MANIFEST, COMPRESSIONS, PACKAGE_FILE_NAME, EXPORT_FILE_NAME, \
5+
EXPORT_SOURCES_FILE_NAME
56
from conan.internal.util.dates import timestamp_now, timestamp_to_str
67
from conan.internal.util.files import load, md5, md5sum, save, gather_files
78

@@ -91,8 +92,10 @@ def create(cls, folder, exports_sources_folder=None):
9192
"""
9293
files, _ = gather_files(folder)
9394
# The folders symlinks are discarded for the manifest
94-
for f in (PACKAGE_TGZ_NAME, EXPORT_TGZ_NAME, CONAN_MANIFEST, EXPORT_SOURCES_TGZ_NAME):
95-
files.pop(f, None)
95+
for f in (PACKAGE_FILE_NAME, EXPORT_FILE_NAME, EXPORT_SOURCES_FILE_NAME):
96+
for e in COMPRESSIONS:
97+
files.pop(f + e, None)
98+
files.pop(CONAN_MANIFEST, None)
9699

97100
file_dict = {}
98101
for name, filepath in files.items():

conan/internal/paths.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ def _user_home_from_conanrc_file():
8686
CONANFILE_TXT = "conanfile.txt"
8787
CONAN_MANIFEST = "conanmanifest.txt"
8888
CONANINFO = "conaninfo.txt"
89-
PACKAGE_TGZ_NAME = "conan_package.tgz"
90-
EXPORT_TGZ_NAME = "conan_export.tgz"
91-
EXPORT_SOURCES_TGZ_NAME = "conan_sources.tgz"
89+
PACKAGE_FILE_NAME = "conan_package.t"
90+
EXPORT_FILE_NAME = "conan_export.t"
91+
EXPORT_SOURCES_FILE_NAME = "conan_sources.t"
92+
COMPRESSIONS = "gz", "xz", "zst"
9293
DATA_YML = "conandata.yml"

conan/internal/rest/remote_manager.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
import os
22
import shutil
3+
import sys
4+
35
from collections import namedtuple
46
from typing import List
57

68
from requests.exceptions import ConnectionError
79

810
from conan.api.model import LOCAL_RECIPES_INDEX
11+
from conan.internal.paths import CONANINFO, CONAN_MANIFEST, PACKAGE_FILE_NAME, EXPORT_FILE_NAME
912
from conan.internal.rest.rest_client_local_recipe_index import RestApiClientLocalRecipesIndex
1013
from conan.api.model import Remote
1114
from conan.api.output import ConanOutput
@@ -17,7 +20,6 @@
1720
from conan.api.model import PkgReference
1821
from conan.api.model import RecipeReference
1922
from conan.internal.util.files import rmdir, human_size
20-
from conan.internal.paths import EXPORT_SOURCES_TGZ_NAME, EXPORT_TGZ_NAME, PACKAGE_TGZ_NAME
2123
from conan.internal.util.files import mkdir, tar_extract
2224

2325

@@ -86,7 +88,8 @@ def get_recipe(self, ref, remote, metadata=None):
8688
self._cache.remove_recipe_layout(layout)
8789
raise
8890
export_folder = layout.export()
89-
tgz_file = zipped_files.pop(EXPORT_TGZ_NAME, None)
91+
export_file = next((f for f in zipped_files if f.startswith(EXPORT_FILE_NAME)), None)
92+
tgz_file = zipped_files.pop(export_file, None)
9093

9194
if tgz_file:
9295
uncompress_file(tgz_file, export_folder, scope=str(ref))
@@ -132,7 +135,8 @@ def get_recipe_sources(self, ref, layout, remote):
132135
return
133136

134137
self._signer.verify(ref, download_folder, files=zipped_files)
135-
tgz_file = zipped_files[EXPORT_SOURCES_TGZ_NAME]
138+
# Only 1 file is guaranteed
139+
tgz_file = next(iter(zipped_files.values()))
136140
uncompress_file(tgz_file, export_sources_folder, scope=str(ref))
137141

138142
def get_package(self, pref, remote, metadata=None):
@@ -178,12 +182,15 @@ def _get_package(self, layout, pref, remote, scoped_output, metadata):
178182
metadata, only_metadata=False)
179183
zipped_files = {k: v for k, v in zipped_files.items() if not k.startswith(METADATA)}
180184
# quick server package integrity check:
181-
for f in ("conaninfo.txt", "conanmanifest.txt", "conan_package.tgz"):
185+
for f in (CONANINFO, CONAN_MANIFEST):
182186
if f not in zipped_files:
183187
raise ConanException(f"Corrupted {pref} in '{remote.name}' remote: no {f}")
188+
189+
# This is guaranteed to exist, otherwise RestClient would have raised already
190+
package_file = next(f for f in zipped_files if PACKAGE_FILE_NAME in f)
184191
self._signer.verify(pref, download_pkg_folder, zipped_files)
185192

186-
tgz_file = zipped_files.pop(PACKAGE_TGZ_NAME, None)
193+
tgz_file = zipped_files.pop(package_file)
187194
package_folder = layout.package()
188195
uncompress_file(tgz_file, package_folder, scope=str(pref.ref))
189196
mkdir(package_folder) # Just in case it doesn't exist, because uncompress did nothing
@@ -337,6 +344,9 @@ def _call_remote(self, remote, method, *args, **kwargs):
337344

338345

339346
def uncompress_file(src_path, dest_folder, scope=None):
347+
if sys.version_info.minor < 14 and src_path.endswith("zst"):
348+
raise ConanException(f"File {os.path.basename(src_path)} compressed with 'zst', "
349+
f"unsupported for Python<3.14 ")
340350
try:
341351
filesize = os.path.getsize(src_path)
342352
big_file = filesize > 10000000 # 10 MB

0 commit comments

Comments
 (0)