Skip to content

Commit f1fcd05

Browse files
deeppcs, ayush-panta
authored and committed
feat: add parallel_pull_as_fallback config option
Add a new config option experimental_parallel_pull_as_fallback under [pull_modes.parallel_pull_unpack] that enables parallel-pull as an automatic fallback when lazy-load is the primary mode but no SOCI index is found for an image.

Today, lazy-load and parallel-pull are mutually exclusive at the daemon level. When lazy-load is enabled and no SOCI index exists, the snapshotter defers to the container runtime's sequential pull, which is 5-40% slower than Docker. This forces operators to choose between optimal performance for indexed images (lazy-load) or non-indexed images (parallel-pull), but not both.

With experimental_parallel_pull_as_fallback = true (and enable = false), the snapshotter first attempts lazy-load for every image. Only when no SOCI index is found does it fall back to parallel-pull instead of the slow sequential path. This gives optimal performance for both cases:
- Images WITH a SOCI index: lazy-load (83-96% faster than Docker)
- Images WITHOUT a SOCI index: parallel-pull (14-50% faster than Docker)

The option defaults to false, preserving existing behavior for all current users. When enable = true, the fallback is a no-op since parallel-pull is already the primary mode.

Tested on AL2 and AL2023 instances with both small (nginx) and large (20GB+) container images. All existing unit tests pass.

Signed-off-by: deeppcs <deeppcs@amazon.com>
1 parent ad6c319 commit f1fcd05

11 files changed

Lines changed: 308 additions & 10 deletions

File tree

config/config.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ skip_check_snapshotter_supported = false
7373
max_concurrent_unpacks_per_image = 1
7474
discard_unpacked_layers = false
7575
enable = false
76+
experimental_parallel_pull_as_fallback = false
7677

7778
[kubeconfig_keychain]
7879
enable_keychain = false

config/config_test.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,11 @@ func TestConfigDefaults(t *testing.T) {
4646
expected: DefaultParallelPullUnpackEnable,
4747
actual: cfg.PullModes.Parallel.Enable,
4848
},
49+
{
50+
name: "parallel pull as fallback",
51+
expected: DefaultExperimentalParallelPullAsFallback,
52+
actual: cfg.PullModes.Parallel.ExperimentalParallelPullAsFallback,
53+
},
4954
{
5055
name: "metrics network",
5156
expected: defaultMetricsNetwork,
@@ -287,6 +292,39 @@ concurrent_download_chunk_size = "badchunksize"
287292
}
288293
},
289294
},
295+
{
296+
name: "ParallelPullAsFallback",
297+
config: []byte(`
298+
[pull_modes.soci_v2]
299+
enable = true
300+
301+
[pull_modes.parallel_pull_unpack]
302+
enable = false
303+
experimental_parallel_pull_as_fallback = true
304+
max_concurrent_downloads_per_image = 10
305+
concurrent_download_chunk_size = "16mb"
306+
`),
307+
assert: func(t *testing.T, actual *Config, err error) {
308+
if err != nil {
309+
t.Errorf("Expected no error, got %v", err)
310+
}
311+
if !actual.PullModes.SOCIv2.Enable {
312+
t.Error("Expected soci_v2 to be enabled")
313+
}
314+
if actual.PullModes.Parallel.Enable {
315+
t.Error("Expected parallel_pull_unpack.enable to be false")
316+
}
317+
if !actual.PullModes.Parallel.ExperimentalParallelPullAsFallback {
318+
t.Error("Expected experimental_parallel_pull_as_fallback to be true")
319+
}
320+
if actual.PullModes.Parallel.MaxConcurrentDownloadsPerImage != 10 {
321+
t.Errorf("Expected max_concurrent_downloads_per_image to be 10, got %d", actual.PullModes.Parallel.MaxConcurrentDownloadsPerImage)
322+
}
323+
if actual.PullModes.Parallel.ConcurrentDownloadChunkSize != 16*1024*1024 {
324+
t.Errorf("Expected concurrent_download_chunk_size to be %d, got %d", 16*1024*1024, actual.PullModes.Parallel.ConcurrentDownloadChunkSize)
325+
}
326+
},
327+
},
290328
}
291329

292330
for _, test := range tests {

config/defaults.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,12 @@ const (
123123
// DefaultParallelPullUnpackEnable is the default value for whether parallel pull and unpack is enabled
124124
DefaultParallelPullUnpackEnable = false
125125

126+
// DefaultExperimentalParallelPullAsFallback is the default value for whether parallel pull is used
127+
// as a fallback when lazy-load finds no SOCI index.
128+
// This is EXPERIMENTAL — lazy-load with containerd content store may have
129+
// garbage collection edge cases. See https://github.com/awslabs/soci-snapshotter/issues/1843
130+
DefaultExperimentalParallelPullAsFallback = false
131+
126132
// Defaults for ParallelPullUnpack.
127133
// The default values should mirror default containerd values.
128134

config/pull_modes.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,19 @@ type V2 struct {
4444
type Parallel struct {
4545
ParallelConfig
4646
Enable bool `toml:"enable"`
47+
48+
// ExperimentalParallelPullAsFallback enables parallel-pull as an automatic fallback
49+
// when lazy-load is the primary mode but no SOCI index is found for an image.
50+
// When true (and Enable is false), the snapshotter will first attempt lazy-load;
51+
// if no SOCI index exists, it falls back to parallel-pull instead of deferring
52+
// to the container runtime's slower sequential pull.
53+
// If Enable is true, this option is a no-op (parallel-pull is already the primary mode).
54+
//
55+
// EXPERIMENTAL: This requires the containerd content store for both lazy-load
56+
// and parallel-pull (unless discard_unpacked_layers = true).
57+
// Lazy-load with the containerd content store may have
58+
// garbage collection edge cases. See https://github.com/awslabs/soci-snapshotter/issues/1843
59+
ExperimentalParallelPullAsFallback bool `toml:"experimental_parallel_pull_as_fallback"`
4760
}
4861

4962
func defaultPullModes(cfg *Config) error {

docs/config.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,24 @@ This set of variables must be at the top of your TOML file due to not belonging
9797

9898
## config/service.go
9999

100+
## config/pull_modes.go
101+
102+
### [pull_modes.soci_v1]
103+
- `enable` (bool) — Enables SOCI v1 index discovery via the OCI Referrers API. Default: false.
104+
105+
### [pull_modes.soci_v2]
106+
- `enable` (bool) — Enables SOCI v2 index discovery via image manifest annotations. Default: true.
107+
108+
### [pull_modes.parallel_pull_unpack]
109+
- `enable` (bool) — Enables parallel pull and unpack as the primary pull mode. When true, lazy-load is skipped entirely. Default: false.
110+
- `experimental_parallel_pull_as_fallback` (bool) — **[EXPERIMENTAL]** When true (and `enable` is false), uses parallel-pull as an automatic fallback when lazy-load is the primary mode but no SOCI index is found for an image. Requires containerd content store (unless `discard_unpacked_layers = true`). Lazy-load with the containerd content store may have garbage collection edge cases. See [#1843](https://github.com/awslabs/soci-snapshotter/issues/1843). Ignored when `enable` is true. Default: false.
111+
- `max_concurrent_downloads` (int) — Max concurrent downloads across all images. -1 for unlimited. Default: -1.
112+
- `max_concurrent_downloads_per_image` (int) — Max concurrent downloads per image. Default: 3.
113+
- `concurrent_download_chunk_size` (string) — Size of each download chunk (e.g. "8mb", "16mb"). Empty means full layer. Default: "".
114+
- `max_concurrent_unpacks` (int) — Max concurrent unpacks across all images. -1 for unlimited. Default: -1.
115+
- `max_concurrent_unpacks_per_image` (int) — Max concurrent unpacks per image. Default: 1.
116+
- `discard_unpacked_layers` (bool) — Discard layer blobs after unpacking to save disk space. Default: false.
117+
100118
### [snapshotter]
101119
- `min_layer_size` (int) — Sets the minimum threshold for lazy loading a layer. Any layer smaller than this value will ignore the zTOC for the layer and pull the entire layer ahead of time. We generally recommend setting it to 10MiB (10000000). Default: 0.
102120
- `allow_invalid_mounts_on_restart` (bool) — Allows the snapshotter to start even if preexisting snapshots cannot connect to their data source on startup. Useful on unexpected daemon crashes/corruption. Default: false.

docs/parallel-mode.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,32 @@ If you have any questions or need further assistance, please don't hesitate to r
9494

9595
## Known Limitations
9696

97+
### Parallel Pull as Fallback for Lazy-Load
98+
99+
When using lazy-load as the primary pull mode (`soci_v2.enable = true` or `soci_v1.enable = true`), images without a SOCI index normally fall back to the container runtime's sequential pull, which can be slower than a standard image pull. To avoid this behavior, you can enable `experimental_parallel_pull_as_fallback`:
100+
101+
```toml
102+
[pull_modes.soci_v2]
103+
enable = true
104+
105+
[pull_modes.parallel_pull_unpack]
106+
enable = false
107+
experimental_parallel_pull_as_fallback = true
108+
max_concurrent_downloads_per_image = 10
109+
concurrent_download_chunk_size = "16mb"
110+
max_concurrent_unpacks_per_image = 10
111+
discard_unpacked_layers = true
112+
```
113+
114+
With this configuration:
115+
- Images **with** a SOCI index use lazy-load
116+
- Images **without** a SOCI index use parallel-pull
117+
- No image falls through to the slow sequential containerd pull
118+
119+
> **EXPERIMENTAL**: This option requires the containerd content store (`type = "containerd"` under `[content_store]`) for both lazy-load and parallel-pull, unless `discard_unpacked_layers = true` is set (as in the example above). Lazy-load with the containerd content store may have garbage collection edge cases and does not carry the same stability guarantees as using either mode independently. See [#1843](https://github.com/awslabs/soci-snapshotter/issues/1843) for details.
120+
121+
Note: `experimental_parallel_pull_as_fallback` is ignored when `enable = true`, since parallel-pull is already the primary mode in that case.
122+
97123
### Registries
98124

99125
Any registry that supports ranged GET requests and has sufficient request limits should work with parallel pull mode. If a registry is rate limiting image pull requests, users can attempt to lower `max_concurrent_downloads` or `max_concurrent_downloads_per_image` and see if it alleviates the issue, however this will result in less of a performance benefit compared to regular pulling.

fs/adaptive_fetch_image_layers.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ func checkParallelPullUnpack(cfg *config.Parallel) error {
162162
if cfg == nil {
163163
return errors.New("parallel pull config is nil")
164164
}
165-
if !cfg.Enable {
165+
if !cfg.Enable && !cfg.ExperimentalParallelPullAsFallback {
166166
return ErrParallelPullIsDisabled
167167
}
168168
// If global concurrent downloads/unpacks are unlimited, any value for per-image concurrent downloads/unpacks are valid

fs/fs.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,12 @@ func NewFilesystem(ctx context.Context, root string, cfg config.FSConfig, opts .
242242
if pullModes.Parallel.Enable &&
243243
cfg.ContentStoreConfig.Type != config.ContainerdContentStoreType &&
244244
!pullModes.Parallel.DiscardUnpackedLayers {
245-
return nil, errors.New("parallel_pull_unpack mode requires containerd content store (type=\"containerd\" under [content_store])")
245+
return nil, errors.New("parallel_pull_unpack mode requires containerd content store (type=\"containerd\" under [content_store] or discard_unpacked_layers = true)")
246+
}
247+
if pullModes.Parallel.ExperimentalParallelPullAsFallback &&
248+
cfg.ContentStoreConfig.Type != config.ContainerdContentStoreType &&
249+
!pullModes.Parallel.DiscardUnpackedLayers {
250+
return nil, errors.New("experimental_parallel_pull_as_fallback requires containerd content store (type=\"containerd\" under [content_store] or discard_unpacked_layers = true)")
246251
}
247252
client := store.NewContainerdClient(cfg.ContentStoreConfig.ContainerdAddress)
248253

@@ -339,7 +344,7 @@ func NewFilesystem(ctx context.Context, root string, cfg config.FSConfig, opts .
339344
}
340345

341346
func createParallelPullStructs(ctx context.Context, storage LayerUnpackJobStorage, parallelConfig *config.Parallel) (*unpackJobs, error) {
342-
if !parallelConfig.Enable {
347+
if !parallelConfig.Enable && !parallelConfig.ExperimentalParallelPullAsFallback {
343348
return nil, nil
344349
}
345350

@@ -430,7 +435,7 @@ type filesystem struct {
430435
}
431436

432437
func (fs *filesystem) MountParallel(ctx context.Context, mountpoint string, labels map[string]string, mounts []mount.Mount) error {
433-
if !fs.pullModes.Parallel.Enable {
438+
if !fs.pullModes.Parallel.Enable && !fs.pullModes.Parallel.ExperimentalParallelPullAsFallback {
434439
return ErrParallelPullIsDisabled
435440
}
436441

@@ -727,7 +732,7 @@ func (fs *filesystem) getImageManifest(ctx context.Context, dgst string) (*ocisp
727732
// CleanImage stops all parallel operations for the specific image.
728733
// Generally this will be called when removing a snapshot for an image.
729734
func (fs *filesystem) CleanImage(ctx context.Context, imgDigest string) error {
730-
if !fs.pullModes.Parallel.Enable {
735+
if !fs.pullModes.Parallel.Enable && !fs.pullModes.Parallel.ExperimentalParallelPullAsFallback {
731736
return nil
732737
}
733738

integration/pull_modes_test.go

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,3 +364,159 @@ func testDanglingV2Annotation(t *testing.T, imgName string) {
364364
t.Fatalf("expected v2 index digest %s, got %s", v2IndexDigest, indexDigestUsed)
365365
}
366366
}
367+
368+
func TestExperimentalParallelPullAsFallback(t *testing.T) {
369+
for _, imgName := range pullModesImages {
370+
t.Run(imgName, func(t *testing.T) {
371+
testExperimentalParallelPullAsFallback(t, imgName)
372+
})
373+
}
374+
}
375+
376+
func testExperimentalParallelPullAsFallback(t *testing.T, imgName string) {
377+
regConfig := newRegistryConfig()
378+
sh, done := newShellWithRegistry(t, regConfig)
379+
defer done()
380+
381+
srcInfo := dockerhub(imgName)
382+
dstInfo := regConfig.mirror(imgName)
383+
sh.X("nerdctl", "login", "-u", regConfig.user, "-p", regConfig.pass, dstInfo.ref)
384+
385+
// Create a SOCI-indexed version of the image for lazy-load tests
386+
rebootContainerd(t, sh, "", "")
387+
v2IndexDigest := createAndPushV2Index(t, sh, srcInfo, dstInfo)
388+
389+
// Also push the original (non-indexed) image to a separate tag for fallback tests
390+
rebootContainerd(t, sh, "", "")
391+
noIndexInfo := regConfig.mirror(imgName)
392+
noIndexRef, err := reference.Parse(noIndexInfo.ref)
393+
if err != nil {
394+
t.Fatal(err)
395+
}
396+
noIndexRef.Object = "no-soci-index"
397+
sh.X("nerdctl", "pull", "-q", srcInfo.ref)
398+
sh.X("nerdctl", "image", "tag", srcInfo.ref, noIndexRef.String())
399+
sh.X("nerdctl", "push", noIndexRef.String())
400+
401+
tests := []struct {
402+
name string
403+
pullModes config.PullModes
404+
contentStoreType store.ContentStoreType
405+
imageRef string
406+
expectedDigest string
407+
// checkLocal means we expect local/parallel snapshots (not remote/lazy)
408+
checkLocal bool
409+
// checkDeferred means we expect deferred-to-runtime snapshots
410+
checkDeferred bool
411+
}{
412+
{
413+
// Fallback enabled + image WITH SOCI index → lazy-load should be used
414+
name: "fallback enabled with indexed image uses lazy-load",
415+
pullModes: config.PullModes{
416+
SOCIv1: config.V1{Enable: false},
417+
SOCIv2: config.V2{Enable: true},
418+
Parallel: config.Parallel{
419+
ExperimentalParallelPullAsFallback: true,
420+
},
421+
},
422+
contentStoreType: store.ContainerdContentStoreType,
423+
imageRef: dstInfo.ref,
424+
expectedDigest: v2IndexDigest,
425+
},
426+
{
427+
// Fallback enabled + image WITHOUT SOCI index → parallel-pull fallback
428+
name: "fallback enabled with non-indexed image uses parallel pull",
429+
pullModes: config.PullModes{
430+
SOCIv1: config.V1{Enable: false},
431+
SOCIv2: config.V2{Enable: true},
432+
Parallel: config.Parallel{
433+
ExperimentalParallelPullAsFallback: true,
434+
},
435+
},
436+
contentStoreType: store.ContainerdContentStoreType,
437+
imageRef: noIndexRef.String(),
438+
expectedDigest: "",
439+
checkLocal: true,
440+
},
441+
{
442+
// Fallback disabled + image WITHOUT SOCI index → defers to container runtime
443+
name: "fallback disabled with non-indexed image defers to runtime",
444+
pullModes: config.PullModes{
445+
SOCIv1: config.V1{Enable: false},
446+
SOCIv2: config.V2{Enable: true},
447+
},
448+
imageRef: noIndexRef.String(),
449+
expectedDigest: "",
450+
checkDeferred: true,
451+
},
452+
{
453+
// enable=true takes precedence over fallback — parallel pull is primary
454+
name: "enable true takes precedence over fallback",
455+
pullModes: config.PullModes{
456+
SOCIv1: config.V1{Enable: false},
457+
SOCIv2: config.V2{Enable: true},
458+
Parallel: config.Parallel{
459+
Enable: true,
460+
ExperimentalParallelPullAsFallback: true,
461+
},
462+
},
463+
contentStoreType: store.ContainerdContentStoreType,
464+
imageRef: dstInfo.ref,
465+
expectedDigest: "",
466+
checkLocal: true,
467+
},
468+
{
469+
// Fallback with SOCI content store + discard_unpacked_layers works
470+
name: "fallback with soci content store and discard unpacked layers",
471+
pullModes: config.PullModes{
472+
SOCIv1: config.V1{Enable: false},
473+
SOCIv2: config.V2{Enable: true},
474+
Parallel: config.Parallel{
475+
ExperimentalParallelPullAsFallback: true,
476+
ParallelConfig: config.ParallelConfig{
477+
DiscardUnpackedLayers: true,
478+
},
479+
},
480+
},
481+
// No contentStoreType set — defaults to SOCI content store
482+
imageRef: noIndexRef.String(),
483+
expectedDigest: "",
484+
checkLocal: true,
485+
},
486+
}
487+
488+
for _, test := range tests {
489+
t.Run(test.name, func(t *testing.T) {
490+
opts := []snapshotterConfigOpt{withPullModes(test.pullModes)}
491+
if test.contentStoreType == store.ContainerdContentStoreType {
492+
opts = append(opts, withContainerdContentStore())
493+
}
494+
var indexDigestUsed string
495+
monitorFunc := func(s string) {
496+
structuredLog := make(map[string]string)
497+
err := json.Unmarshal([]byte(s), &structuredLog)
498+
if err != nil {
499+
return
500+
}
501+
if structuredLog["msg"] == "fetching SOCI artifacts using index descriptor" {
502+
indexDigestUsed = structuredLog["digest"]
503+
}
504+
}
505+
rsm := testutil.NewRemoteSnapshotMonitor()
506+
m := rebootContainerd(t, sh, "", getSnapshotterConfigToml(t, opts...), rsm.MonitorFunc, monitorFunc)
507+
defer m.Cleanup(t)
508+
sh.X(append(imagePullCmd, test.imageRef)...)
509+
510+
if test.expectedDigest != "" {
511+
rsm.CheckAllRemoteSnapshots(t)
512+
} else if test.checkLocal {
513+
rsm.CheckAllLocalSnapshots(t)
514+
} else if test.checkDeferred {
515+
rsm.CheckAllDeferredSnapshots(t)
516+
}
517+
if indexDigestUsed != test.expectedDigest {
518+
t.Fatalf("expected digest %s, got %s", test.expectedDigest, indexDigestUsed)
519+
}
520+
})
521+
}
522+
}

service/service.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,15 @@ func NewSociSnapshotterService(ctx context.Context, root string, serviceCfg *con
124124
if serviceCfg.PullModes.Parallel.Enable {
125125
snOpts = append(snOpts, snbase.ParallelPullUnpack)
126126
}
127+
if serviceCfg.PullModes.Parallel.ExperimentalParallelPullAsFallback && !serviceCfg.PullModes.Parallel.Enable {
128+
log.G(ctx).Warn("EXPERIMENTAL: experimental_parallel_pull_as_fallback is enabled. " +
129+
"Lazy-load and parallel-pull will coexist using the same content store. " +
130+
"When using the containerd content store, this combination may have " +
131+
"garbage collection edge cases and does not carry the same stability " +
132+
"guarantees as using either mode independently. " +
133+
"See https://github.com/awslabs/soci-snapshotter/issues/1843")
134+
snOpts = append(snOpts, snbase.ParallelPullAsFallback)
135+
}
127136

128137
snapshotter, err = snbase.NewSnapshotter(ctx, snapshotterRoot(root), fs, snOpts...)
129138
if err != nil {

0 commit comments

Comments
 (0)