Skip to content

Commit ef10c66

Browse files
committed
Revise based on comments
Signed-off-by: Zhao Yuan <[email protected]>
1 parent eb21188 commit ef10c66

File tree

5 files changed

+170
-108
lines changed

5 files changed

+170
-108
lines changed

contrib/nydusify/cmd/nydusify.go

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -694,11 +694,6 @@ func main() {
694694
Usage: "Json configuration file for storage backend",
695695
EnvVars: []string{"BACKEND_CONFIG_FILE"},
696696
},
697-
&cli.StringFlag{
698-
Name: "push-chunk-size",
699-
Value: "0MB",
700-
Usage: "Chunk size for pushing a blob layer in chunked",
701-
},
702697

703698
&cli.StringFlag{
704699
Name: "work-dir",
@@ -731,13 +726,6 @@ func main() {
731726
if err != nil {
732727
return err
733728
}
734-
pushChunkSize, err := humanize.ParseBytes(c.String("push-chunk-size"))
735-
if err != nil {
736-
return errors.Wrap(err, "invalid --push-chunk-size option")
737-
}
738-
if pushChunkSize > 0 {
739-
logrus.Infof("will copy layer with chunk size %s", c.String("push-chunk-size"))
740-
}
741729

742730
_, arch, err := provider.ExtractOsArch(c.String("platform"))
743731
if err != nil {
@@ -759,8 +747,6 @@ func main() {
759747
ExpectedArch: arch,
760748
AllPlatforms: c.Bool("all-platforms"),
761749
Platforms: c.String("platform"),
762-
763-
PushChunkSize: int64(pushChunkSize),
764750
})
765751
if err != nil {
766752
return err

contrib/nydusify/pkg/chunkdict/generator/generator.go

Lines changed: 60 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,6 @@ type Opt struct {
5454

5555
AllPlatforms bool
5656
Platforms string
57-
58-
PushChunkSize int64
5957
}
6058

6159
// Generator generates chunkdict by deduplicating multiple nydus images
@@ -119,7 +117,8 @@ func (generator *Generator) Generate(ctx context.Context) error {
119117
return err
120118
}
121119

122-
return os.RemoveAll(generator.WorkDir)
120+
// return os.RemoveAll(generator.WorkDir)
121+
return nil
123122
}
124123

125124
// Pull the bootstrap of nydus image
@@ -147,7 +146,7 @@ func (generator *Generator) pull(ctx context.Context) ([]string, error) {
147146
}
148147

149148
func (generator *Generator) generate(_ context.Context, bootstrapSlice []string) (string, string, error) {
150-
// Invoke "nydus-image generate" command
149+
// Invoke "nydus-image chunkdict generate" command
151150
currentDir, _ := os.Getwd()
152151
builder := build.NewBuilder(generator.NydusImagePath)
153152

@@ -194,7 +193,7 @@ func (generator *Generator) push(ctx context.Context, chunkdictBootstrapPath str
194193
return err
195194
}
196195

197-
pvd, err := provider.New(generator.WorkDir, hosts(generator), 200, "v1", platformMC, generator.PushChunkSize)
196+
pvd, err := provider.New(generator.WorkDir, hosts(generator), 200, "v1", platformMC, 0)
198197
if err != nil {
199198
return err
200199
}
@@ -207,17 +206,20 @@ func (generator *Generator) push(ctx context.Context, chunkdictBootstrapPath str
207206
}
208207
}
209208

210-
// Pull a source image as a template
211-
if err := pvd.Pull(ctx, generator.Sources[0]); err != nil {
212-
if errdefs.NeedsRetryWithHTTP(err) {
213-
pvd.UsePlainHTTP()
214-
if err := pvd.Pull(ctx, generator.Sources[0]); err != nil {
215-
return errors.Wrap(err, "try to pull image")
209+
// Pull source image
210+
for index := range generator.Sources {
211+
if err := pvd.Pull(ctx, generator.Sources[index]); err != nil {
212+
if errdefs.NeedsRetryWithHTTP(err) {
213+
pvd.UsePlainHTTP()
214+
if err := pvd.Pull(ctx, generator.Sources[index]); err != nil {
215+
return errors.Wrap(err, "try to pull image")
216+
}
217+
} else {
218+
return errors.Wrap(err, "pull source image")
216219
}
217-
} else {
218-
return errors.Wrap(err, "pull source image")
219220
}
220221
}
222+
221223
logrus.Infof("pulled source image %s", generator.Sources[0])
222224
sourceImage, err := pvd.Image(ctx, generator.Sources[0])
223225
if err != nil {
@@ -239,18 +241,18 @@ func (generator *Generator) push(ctx context.Context, chunkdictBootstrapPath str
239241
defer sem.Release(1)
240242
sourceDesc := sourceDescs[idx]
241243
targetDesc := &sourceDesc
244+
242245
// Get the blob from backend
243-
if bkd != nil {
244-
descs, _targetDesc, err := pushBlobFromBackend(ctx, pvd, bkd, sourceDesc, *generator, chunkdictBootstrapPath, outputPath)
245-
if err != nil {
246-
return errors.Wrap(err, "get resolver")
247-
}
248-
if _targetDesc != nil {
249-
targetDesc = _targetDesc
250-
store := newStore(pvd.ContentStore(), descs)
251-
pvd.SetContentStore(store)
252-
}
246+
descs, _targetDesc, err := pushBlobFromBackend(ctx, pvd, bkd, sourceDesc, *generator, chunkdictBootstrapPath, outputPath)
247+
if err != nil {
248+
return errors.Wrap(err, "get resolver")
253249
}
250+
if _targetDesc != nil {
251+
targetDesc = _targetDesc
252+
store := newStore(pvd.ContentStore(), descs)
253+
pvd.SetContentStore(store)
254+
}
255+
254256
targetDescs[idx] = *targetDesc
255257

256258
if err := pvd.Push(ctx, *targetDesc, generator.Target); err != nil {
@@ -309,20 +311,45 @@ func pushBlobFromBackend(
309311
eg.Go(func() error {
310312
sem.Acquire(context.Background(), 1)
311313
defer sem.Release(1)
314+
312315
blobID := blobIDs[idx]
313316
blobDigest := digest.Digest("sha256:" + blobID)
314-
blobSize, err := bkd.Size(blobID)
315-
if err != nil {
316-
return errors.Wrap(err, "get blob size")
317-
}
318-
blobSizeStr := humanize.Bytes(uint64(blobSize))
319317

320-
logrus.WithField("digest", blobDigest).WithField("size", blobSizeStr).Infof("pushing blob from backend")
321-
rc, err := bkd.Reader(blobID)
322-
if err != nil {
323-
return errors.Wrap(err, "get blob reader")
318+
var blobSize int64
319+
var rc io.ReadCloser
320+
321+
if bkd != nil {
322+
rc, err = bkd.Reader(blobID)
323+
if err != nil {
324+
return errors.Wrap(err, "get blob reader")
325+
}
326+
blobSize, err = bkd.Size(blobID)
327+
if err != nil {
328+
return errors.Wrap(err, "get blob size")
329+
}
330+
} else {
331+
imageDesc, err := generator.sourcesParser[0].Remote.Resolve(ctx)
332+
if err != nil {
333+
if strings.Contains(err.Error(), "x509: certificate signed by unknown authority") {
334+
logrus.Warningln("try to enable \"--source-insecure\" / \"--target-insecure\" option")
335+
}
336+
return errors.Wrap(err, "resolve image")
337+
}
338+
rc, err = generator.sourcesParser[0].Remote.Pull(ctx, *imageDesc, true)
339+
if err != nil {
340+
return errors.Wrap(err, "get blob reader")
341+
}
342+
blobInfo, err := pvd.ContentStore().Info(ctx, blobDigest)
343+
if err != nil {
344+
return errors.Wrap(err, "get info from content store")
345+
}
346+
blobSize = blobInfo.Size
324347
}
325348
defer rc.Close()
349+
350+
blobSizeStr := humanize.Bytes(uint64(blobSize))
351+
logrus.WithField("digest", blobDigest).WithField("size", blobSizeStr).Infof("pushing blob from backend")
352+
326353
blobDescs[idx] = ocispec.Descriptor{
327354
Digest: blobDigest,
328355
Size: blobSize,
@@ -349,6 +376,7 @@ func pushBlobFromBackend(
349376
logrus.WithField("digest", blobDigest).WithField("size", blobSizeStr).Infof("pushed blob from backend")
350377

351378
return nil
379+
352380
})
353381
}(idx)
354382
}

docs/chunk-deduplication.md

Lines changed: 45 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
# Notice [WIP] Pending further revisionsNotice
2-
# Probntroduction
1+
# Introduction
32
In container images, there are often a large number of duplicate files or content, and these duplicate parts occupy a large amount of storage space, especially in high-density deployment scenarios. As the number of Nydus images grows, it will bring many problems such as low storage space utilization and excessive consumption of bandwidth resources. To do this, an effective deduplication mechanism (deduplication) needs to be designed to solve this problem.
43

54
Unlike traditional OCI, which distributes images at a layer-granular level, the smallest unit of a Nydus image is a chunk, so the deduplication algorithm needs to be deduplicated in chunk units. At the same time, we want to deduplicate multiple aspects of the Nydus image, including between Nydus images and between different versions of the same Nydus image. No matter which deduplication method is essentially to deduplicate the repeated chunks in the image, only one duplicate chunk is retained, and the reference to the chunk is used instead of other duplicate chunks to reduce the storage space occupation, so as to maximize the data transmission and storage capabilities of Nydus and improve the access speed and efficiency of the image.
5+
66
# General idea
77
The deduplication algorithm first needs to select the duplicate chunk in the image according to the image information such as the number of occurrences of chunk, chunk size, chunk image to which the chunk belongs and the corresponding version, and generate chunkdict, chunkdict records the unique identifier or fingerprint of chunk, only need to store chunkdict, other images can refer to chunk in chunkdict by reference.
88

@@ -13,32 +13,43 @@ The deduplication algorithm is divided into two parts, the first part is the DBS
1313
2. Extract the image information and call the DBSCAN clustering algorithm to deduplicate different images.
1414
3. Deduplicate the dictionary content in 2, and call the exponential smoothing algorithm for each image separately for image version deduplication.
1515
4. Get the deduplication dictionary generated by running the two algorithms and drop the disk.
16+
5. Generate a chunkdict image and push it to the remote repository
1617
# Algorithm detailed process
1718
## Overall Input
1819

1920
```shell
2021
nydusify chunkdict generate --sources \
21-
localhost:5000:redis:nydus_7.0.1, \
22-
localhost:5000:redis:nydus_7.0.2,\
23-
localhost:5000:redis:nydus_7.0.3 \
22+
registry.com/redis:nydus_7.0.1, \
23+
registry.com/redis:nydus_7.0.2, \
24+
registry.com/redis:nydus_7.0.3 \
25+
--target registry.com/redis:nydus_chunkdict \
26+
--source-insecure --target-insecure
27+
# Optional
28+
--backend-config-file /path/to/backend-config.json \
29+
--backend-type oss
30+
```
31+
32+
# Use the chunk dict image to reduce the incremental size of the new image
33+
```
34+
nydusify convert
35+
--source registry.com/redis:OCI_7.0.4 \
36+
--target registry.com/redis:nydus_7.0.4 \
37+
--chunk-dict registry.com/redis:nydus_chunkdict
2438
```
25-
***
26-
`nydusify chunkdict generate` calls two commands `nydus-image chunkdict save` and `nydus-image chunkdict generate` to store image information into the database and generate a list of chunks to be deduplicated
2739

28-
Download multiple Nydus images in advance and put them into the repository as datasets, such as selecting 10 consecutive versions of redis and alpine as the image dataset, and execute the command `nydus-image chunkdict save` to store the information of the chunk and blob in the chunk and blob table of the database.
40+
***
41+
`nydusify chunkdict generate` calls subcommand `nydus-image chunkdict generate` to store image information into the database and generate a new bootstrap as chunkdict bootstrap.
2942

43+
Download multiple Nydus images in advance and put them into the repository as datasets, such as selecting 10 consecutive versions of redis and alpine as the image dataset, and execute the command `nydus-image chunkdict generate` to store the information of the chunk and blob in the chunk and blob table of the database.
3044
```shell
3145
# Deposit multiple images into the database
32-
nydus-image chunkdict save --bootstrap \
33-
./output/localhost:5000:redis:nydus_7.0.1/nydus_bootstrap, \
34-
./output/localhost:5000:redis:nydus_7.0.2/nydus_bootstrap, \
35-
./output/localhost:5000:redis:nydus_7.0.3/nydus_bootstrap \
36-
```
37-
Execute the command `nydus-image chunkdict generate` to access the database and call the deduplication algorithm to generate the chunk list
38-
```shell
39-
# Call the deduplication algorithm to generate chunk list
40-
nydus-image chunkdict generate --database \
41-
sqlite:///path/imageservice/contrib/nydusify/chunkdict.db
46+
nydus-image chunkdict generate --source \
47+
/path/localhost:5000:redis:nydus_7.0.1/nydus_bootstrap, \
48+
/path/localhost:5000:redis:nydus_7.0.2/nydus_bootstrap, \
49+
/path/localhost:5000:redis:nydus_7.0.3/nydus_bootstrap \
50+
--bootstrap /path/to/chunkdict_bootstrap \
51+
--database /path/to/database.db \
52+
--output-json /path/to/nydus_bootstrap_output.json
4253
```
4354

4455
***
@@ -77,10 +88,9 @@ where $C(R_x)$ represents the unique chunk set of all training set images in the
7788
**6.** Remove the chunk in the chunk dictionary selected in 5 for all images (training set and test set), and then repeat the operation 1-5 to generate the chunk dictionary until the maximum number of cycles is reached 7, or the discrete image ratio is greater than 80% of the total number of images.
7889

7990
The principle of DBSCAN algorithm how to divide the cluster is shown in the diagram:
80-
![在这里插入图片描述](https://img-blog.csdnimg.cn/5fba149720a34620873a5a2cb304d668.png#pic_center)
81-
In this diagram, minPts = 4. Point A and the other red points are core points, because the area surrounding these points in an ε radius contain at least 4 points (including the point itself). Because they are all reachable from one another, they form a single cluster. Points B and C are not core points, but are reachable from A (via other core points) and thus belong to the cluster as well. Point N is a noise point that is neither a core point nor directly-reachable.
82-
91+
![](https://img-blog.csdnimg.cn/5fba149720a34620873a5a2cb304d668.png#pic_center)
8392
**Remark:** This section of the picture and the associated DBSCAN algorithm description are referenced from : [https://en.wikipedia.org/wiki/DBSCAN](https://en.wikipedia.org/wiki/DBSCAN)
93+
8494
#### Algorithm 2 Deduplication between different versions of the image (exponential smoothing algorithm)
8595
***
8696
**Basic principle:** Exponential smoothing algorithm is a method for time series data prediction and smoothing, the basic principle is to weighted average the data, give higher weight to the more recent repeated chunks, and constantly update the smoothing value, so the newer chunk has a greater impact on future forecasts, and the impact of older data will gradually weaken.
@@ -102,16 +112,20 @@ where, $\alpha=0.5$ , $Y_{t-1}$ indicates whether the chunk appeared in the prev
102112

103113
**5.** Choose a chunk dictionary that minimizes the test set's storage space.
104114
***
115+
116+
105117
### Exponential smoothing algorithm test table
118+
Step 1: Download 10 OCI versions of an image and count the total size
119+
Step 2: Convert OCI to nydus image, and then count the total size after conversion
120+
Step 3: Select three versions of the image to generate chunkdict, use chunkdict to convert the remaining seven versions of the image, and then count the total size
121+
deduplicating rate = (total_image_size(nydus) - total_image_size(nydus after deduplicating)) / total_image_size(nydus)
122+
123+
124+
125+
| image_name | version number | total_image_size(OCI) | total_image_size(nydus) | total_image_size (nydus after deduplicating) | chunkdict_image_size | deduplicating rate |
126+
|------------|----------------|-----------------------|-------------------------|---------------------------------------------|----------------------|-------------------|
127+
| redis | 10 | 341.78 | 419.37 | 319.48 | 41.87 | 23.82% |
128+
| ubuntu | 10 | 290.26 | 308.59 | 140.28 | 30.8 | 54.54% |
129+
| alpine | 10 | 26.9 | 27.55 | 24.7 | 2.74 | 10.34% |
106130

107-
| image_name | version number | total_size | train_size | test_size | test_size after dedulicating | chunkdict_size | dedulicating rate | threshold |
108-
|------------|----------------|------------|------------|-----------|------------------------------|----------------|-------------------|-----------|
109-
| redis | 10 | 382.03 | 266.7 | 115.33 | 31.56 | 42.33 | 72.63% | 0.8-0.5 |
110-
| python | 10 | 3509.91 | 2095.37 | 1414.54 | 123.33 | 588.61 | 91.28% | 0.8-0.5 |
111-
| ubuntu | 10 | 317.33 | 222.11 | 95.22 | 12.27 | 39.61 | 87.11% | 0.8-0.5 |
112-
| nginx | 10 | 396.86 | 284.4 | 112.46 | 50.54 | 83.54 | 55.06% | 0.8-0.5 |
113-
| postgres | 10 | 1360.31 | 956.42 | 403.89 | 381.54 | 19.66 | 5.53% | 0.8-0.5 |
114-
| alpine | 10 | 27.23 | 19.04 | 8.19 | 5.62 | 4.7 | 31.29% | 0.8-0.5 |
115-
| node | 10 | 3698.44 | 2598.59 | 1099.85 | 429.39 | 649.42 | 60.96% | 0.8-0.5 |
116-
| httpd | 10 | 561.99 | 385.79 | 176.2 | 85.7 | 54.15 | 51.36% | 0.8-0.5 |
117131
***

0 commit comments

Comments
 (0)