dragonflyoss
diff --git a/‎contrib/nydusify/cmd/nydusify.go
Lines changed: 0 additions & 14 deletions b/‎contrib/nydusify/cmd/nydusify.go
Lines changed: 0 additions & 14 deletions
diff --git a/‎contrib/nydusify/pkg/chunkdict/generator/generator.go
Lines changed: 60 additions & 32 deletions b/‎contrib/nydusify/pkg/chunkdict/generator/generator.go
Lines changed: 60 additions & 32 deletions
diff --git a/‎docs/chunk-deduplication.md
Lines changed: 45 additions & 31 deletions b/‎docs/chunk-deduplication.md
Lines changed: 45 additions & 31 deletions
@@ -694,11 +694,6 @@ func main() {
 							Usage:     "Json configuration file for storage backend",
 							EnvVars:   []string{"BACKEND_CONFIG_FILE"},
 						},
-						&cli.StringFlag{
-							Name:  "push-chunk-size",
-							Value: "0MB",
-							Usage: "Chunk size for pushing a blob layer in chunked",
-						},
 
 						&cli.StringFlag{
 							Name:    "work-dir",
@@ -731,13 +726,6 @@ func main() {
 						if err != nil {
 							return err
 						}
-						pushChunkSize, err := humanize.ParseBytes(c.String("push-chunk-size"))
-						if err != nil {
-							return errors.Wrap(err, "invalid --push-chunk-size option")
-						}
-						if pushChunkSize > 0 {
-							logrus.Infof("will copy layer with chunk size %s", c.String("push-chunk-size"))
-						}
 
 						_, arch, err := provider.ExtractOsArch(c.String("platform"))
 						if err != nil {
@@ -759,8 +747,6 @@ func main() {
 							ExpectedArch:   arch,
 							AllPlatforms:   c.Bool("all-platforms"),
 							Platforms:      c.String("platform"),
-
-							PushChunkSize: int64(pushChunkSize),
 						})
 						if err != nil {
 							return err
 
@@ -54,8 +54,6 @@ type Opt struct {
 
 	AllPlatforms bool
 	Platforms    string
-
-	PushChunkSize int64
 }
 
 // Generator generates chunkdict by deduplicating multiple nydus images
@@ -119,7 +117,8 @@ func (generator *Generator) Generate(ctx context.Context) error {
 		return err
 	}
 
-	return os.RemoveAll(generator.WorkDir)
+	// return os.RemoveAll(generator.WorkDir)
+	return nil 
 }
 
 // Pull the bootstrap of nydus image
@@ -147,7 +146,7 @@ func (generator *Generator) pull(ctx context.Context) ([]string, error) {
 }
 
 func (generator *Generator) generate(_ context.Context, bootstrapSlice []string) (string, string, error) {
-	// Invoke "nydus-image generate" command
+	// Invoke "nydus-image chunkdict generate" command
 	currentDir, _ := os.Getwd()
 	builder := build.NewBuilder(generator.NydusImagePath)
 
@@ -194,7 +193,7 @@ func (generator *Generator) push(ctx context.Context, chunkdictBootstrapPath str
 		return err
 	}
 
-	pvd, err := provider.New(generator.WorkDir, hosts(generator), 200, "v1", platformMC, generator.PushChunkSize)
+	pvd, err := provider.New(generator.WorkDir, hosts(generator), 200, "v1", platformMC, 0)
 	if err != nil {
 		return err
 	}
@@ -207,17 +206,20 @@ func (generator *Generator) push(ctx context.Context, chunkdictBootstrapPath str
 		}
 	}
 
-	// Pull a source image as a template
-	if err := pvd.Pull(ctx, generator.Sources[0]); err != nil {
-		if errdefs.NeedsRetryWithHTTP(err) {
-			pvd.UsePlainHTTP()
-			if err := pvd.Pull(ctx, generator.Sources[0]); err != nil {
-				return errors.Wrap(err, "try to pull image")
+	// Pull source image 
+	for index := range generator.Sources {
+		if err := pvd.Pull(ctx, generator.Sources[index]); err != nil {
+			if errdefs.NeedsRetryWithHTTP(err) {
+				pvd.UsePlainHTTP()
+				if err := pvd.Pull(ctx, generator.Sources[index]); err != nil {
+					return errors.Wrap(err, "try to pull image")
+				}
+			} else {
+				return errors.Wrap(err, "pull source image")
 			}
-		} else {
-			return errors.Wrap(err, "pull source image")
 		}
 	}
+	
 	logrus.Infof("pulled source image %s", generator.Sources[0])
 	sourceImage, err := pvd.Image(ctx, generator.Sources[0])
 	if err != nil {
@@ -239,18 +241,18 @@ func (generator *Generator) push(ctx context.Context, chunkdictBootstrapPath str
 				defer sem.Release(1)
 				sourceDesc := sourceDescs[idx]
 				targetDesc := &sourceDesc
+
 				// Get the blob from backend
-				if bkd != nil {
-					descs, _targetDesc, err := pushBlobFromBackend(ctx, pvd, bkd, sourceDesc, *generator, chunkdictBootstrapPath, outputPath)
-					if err != nil {
-						return errors.Wrap(err, "get resolver")
-					}
-					if _targetDesc != nil {
-						targetDesc = _targetDesc
-						store := newStore(pvd.ContentStore(), descs)
-						pvd.SetContentStore(store)
-					}
+				descs, _targetDesc, err := pushBlobFromBackend(ctx, pvd, bkd, sourceDesc, *generator, chunkdictBootstrapPath, outputPath)
+				if err != nil {
+					return errors.Wrap(err, "get resolver")
 				}
+				if _targetDesc != nil {
+					targetDesc = _targetDesc
+					store := newStore(pvd.ContentStore(), descs)
+					pvd.SetContentStore(store)
+				}
+
 				targetDescs[idx] = *targetDesc
 
 				if err := pvd.Push(ctx, *targetDesc, generator.Target); err != nil {
@@ -309,20 +311,45 @@ func pushBlobFromBackend(
 			eg.Go(func() error {
 				sem.Acquire(context.Background(), 1)
 				defer sem.Release(1)
+
 				blobID := blobIDs[idx]
 				blobDigest := digest.Digest("sha256:" + blobID)
-				blobSize, err := bkd.Size(blobID)
-				if err != nil {
-					return errors.Wrap(err, "get blob size")
-				}
-				blobSizeStr := humanize.Bytes(uint64(blobSize))
 
-				logrus.WithField("digest", blobDigest).WithField("size", blobSizeStr).Infof("pushing blob from backend")
-				rc, err := bkd.Reader(blobID)
-				if err != nil {
-					return errors.Wrap(err, "get blob reader")
+				var blobSize int64
+				var rc io.ReadCloser
+
+				if bkd != nil {
+					rc, err = bkd.Reader(blobID)
+					if err != nil {
+						return errors.Wrap(err, "get blob reader")
+					}
+					blobSize, err = bkd.Size(blobID)
+					if err != nil {
+						return errors.Wrap(err, "get blob size")
+					}
+				} else {
+					imageDesc, err := generator.sourcesParser[0].Remote.Resolve(ctx)
+					if err != nil {
+						if strings.Contains(err.Error(), "x509: certificate signed by unknown authority") {
+							logrus.Warningln("try to enable \"--source-insecure\" / \"--target-insecure\" option")
+						}
+						return errors.Wrap(err, "resolve image")
+					}
+					rc, err = generator.sourcesParser[0].Remote.Pull(ctx, *imageDesc, true)
+					if err != nil {
+						return errors.Wrap(err, "get blob reader")
+					}
+					blobInfo, err := pvd.ContentStore().Info(ctx, blobDigest)
+					if err != nil {
+						return errors.Wrap(err, "get info from content store")
+					}
+					blobSize = blobInfo.Size
 				}
 				defer rc.Close()
+
+				blobSizeStr := humanize.Bytes(uint64(blobSize))
+				logrus.WithField("digest", blobDigest).WithField("size", blobSizeStr).Infof("pushing blob from backend")
+
 				blobDescs[idx] = ocispec.Descriptor{
 					Digest:    blobDigest,
 					Size:      blobSize,
@@ -349,6 +376,7 @@ func pushBlobFromBackend(
 				logrus.WithField("digest", blobDigest).WithField("size", blobSizeStr).Infof("pushed blob from backend")
 
 				return nil
+
 			})
 		}(idx)
 	}
 
@@ -1,8 +1,8 @@
-# Notice [WIP] Pending further revisionsNotice
-# Probntroduction 
+# Problem Introduction
 In container images, there are often a large number of duplicate files or content, and these duplicate parts occupy a large amount of storage space, especially in high-density deployment scenarios. As the number of Nydus images grows, it will bring many problems such as low storage space utilization and excessive consumption of bandwidth resources. To address this, an effective deduplication mechanism needs to be designed.
 
 Unlike traditional OCI, which distributes images at layer granularity, the smallest unit of a Nydus image is a chunk, so the deduplication algorithm needs to operate in units of chunks. At the same time, we want to deduplicate along multiple dimensions, including between different Nydus images and between different versions of the same Nydus image. Whichever dimension is used, deduplication essentially removes the repeated chunks in the images: only one copy of each duplicate chunk is retained, and references to that copy replace the other duplicates. This reduces storage space usage, maximizes the data transmission and storage capabilities of Nydus, and improves the access speed and efficiency of images.
+
 # General idea
 The deduplication algorithm first selects the duplicate chunks in the images according to image information such as the number of occurrences of each chunk, the chunk size, the image to which the chunk belongs and the corresponding version, and generates a chunkdict. The chunkdict records the unique identifier (fingerprint) of each chunk; only the chunkdict needs to be stored, and other images can refer to the chunks in it by reference.
 
@@ -13,32 +13,43 @@ The deduplication algorithm is divided into two parts, the first part is the DBS
 2. Extract the image information and call the DBSCAN clustering algorithm to deduplicate different images.
 3. Deduplicate the dictionary content in 2, and call the exponential smoothing algorithm for each image separately for image version deduplication.
 4. Get the deduplication dictionary generated by running the two algorithms and write it to disk.
+5. Generate a chunkdict image and push it to the remote repository.
 # Algorithm detailed process
 ## Overall  Input
 
 ```shell
 nydusify chunkdict generate --sources \
-	localhost:5000:redis:nydus_7.0.1, \
-	localhost:5000:redis:nydus_7.0.2，\
-	localhost:5000:redis:nydus_7.0.3 \ 
+	registry.com/redis:nydus_7.0.1,  \
+	registry.com/redis:nydus_7.0.2,  \
+	registry.com/redis:nydus_7.0.3   \
+     --target registry.com/redis:nydus_chunkdict \
+     --source-insecure --target-insecure
+     # Optional
+     --backend-config-file /path/to/backend-config.json \
+     --backend-type oss
+```
+
+# Use the chunk dict image to reduce the incremental size of the new image
+```
+nydusify convert \
+	--source registry.com/redis:OCI_7.0.4 \
+	--target registry.com/redis:nydus_7.0.4 \
+	--chunk-dict registry.com/redis:nydus_chunkdict
 ```
-*** 
-`nydusify chunkdict generate` calls two commands `nydus-image  chunkdict save` and `nydus-image  chunkdict generate`  to store image information into the database and generate a list of chunks to be deduplicated
 
-Download multiple Nydus images in advance and put them into the repository as datasets, such as selecting 10 consecutive versions of redis and alpine as the image dataset, and execute the  command `nydus-image  chunkdict save` to store the information of the chunk and blob in the chunk and blob table of the database.
+*** 
+`nydusify chunkdict generate` calls the subcommand `nydus-image chunkdict generate` to store image information in the database and generate a new bootstrap that serves as the chunkdict bootstrap.
 
+Download multiple Nydus images in advance and put them into the repository as a dataset (for example, 10 consecutive versions of redis and alpine), then execute the command `nydus-image chunkdict generate` to store the chunk and blob information in the chunk and blob tables of the database.
 ```shell
 # Deposit multiple images into the database
-nydus-image chunkdict save --bootstrap \
-     ./output/localhost:5000:redis:nydus_7.0.1/nydus_bootstrap, \
-     ./output/localhost:5000:redis:nydus_7.0.2/nydus_bootstrap, \
-     ./output/localhost:5000:redis:nydus_7.0.3/nydus_bootstrap \
-```
-Execute the command `nydus-image chunkdict generate` to access the database and call the deduplication algorithm to generate the chunk list
-```shell
-# Call the deduplication algorithm to generate chunk list
-nydus-image chunkdict generate --database \
-     sqlite:///path/imageservice/contrib/nydusify/chunkdict.db
+nydus-image chunkdict generate --source \
+     /path/localhost:5000:redis:nydus_7.0.1/nydus_bootstrap, \
+     /path/localhost:5000:redis:nydus_7.0.2/nydus_bootstrap, \
+     /path/localhost:5000:redis:nydus_7.0.3/nydus_bootstrap  \
+     --bootstrap /path/to/chunkdict_bootstrap \
+     --database /path/to/database.db \
+     --output-json /path/to/nydus_bootstrap_output.json
 ```
 
 ***
@@ -77,10 +88,9 @@ where $C(R_x)$ represents the unique chunk set of all training set images in the
 **6.** Remove the chunks in the chunk dictionary selected in step 5 from all images (training set and test set), and then repeat steps 1-5 to generate the chunk dictionary until the maximum number of cycles (7) is reached, or the proportion of discrete images exceeds 80% of the total number of images.
 
 How the DBSCAN algorithm divides points into clusters is shown in the diagram:
-![在这里插入图片描述](https://img-blog.csdnimg.cn/5fba149720a34620873a5a2cb304d668.png#pic_center)
-In this diagram, minPts = 4. Point A and the other red points are core points, because the area surrounding these points in an ε radius contain at least 4 points (including the point itself). Because they are all reachable from one another, they form a single cluster. Points B and C are not core points, but are reachable from A (via other core points) and thus belong to the cluster as well. Point N is a noise point that is neither a core point nor directly-reachable.
-
+![DBSCAN clustering diagram](https://img-blog.csdnimg.cn/5fba149720a34620873a5a2cb304d668.png#pic_center)
 **Remark:** The picture in this section and the associated DBSCAN algorithm description are taken from: [https://en.wikipedia.org/wiki/DBSCAN](https://en.wikipedia.org/wiki/DBSCAN)
+
 #### Algorithm 2 Deduplication between different versions of the image (exponential smoothing algorithm)
 ***
 **Basic principle:** The exponential smoothing algorithm is a method for predicting and smoothing time series data. Its basic principle is to compute a weighted average of the data, giving higher weight to the more recently repeated chunks and constantly updating the smoothed value, so that newer chunks have a greater impact on future forecasts while the impact of older data gradually weakens.
@@ -102,16 +112,20 @@ where, $\alpha=0.5$ , $Y_{t-1}$ indicates whether the chunk appeared in the prev
 
 **5.** Choose a chunk dictionary that minimizes the test set's storage space.
 ***
+
+
 ### Exponential smoothing algorithm test table
+Step 1: Download 10 OCI versions of an image and count the total size.
+Step 2: Convert the OCI images to Nydus images, and then count the total size after conversion.
+Step 3: Select three versions of the image to generate the chunkdict, use the chunkdict to convert the remaining seven versions of the image, and then count the total size.
+deduplicating rate = (total_image_size(nydus) - total_image_size (nydus after deduplicating)) / total_image_size(nydus)
+
+
+
+| image_name | version number | total_image_size(OCI) | total_image_size(nydus) | total_image_size (nydus after deduplicating) | chunkdict_image_size | deduplicating rate |
+|------------|----------------|-----------------------|-------------------------|----------------------------------------------|----------------------|--------------------|
+| redis      | 10             | 341.78                | 419.37                  | 319.48                                       | 41.87                | 23.82%             |
+| ubuntu     | 10             | 290.26                | 308.59                  | 140.28                                       | 30.8                 | 54.54%             |
+| alpine     | 10             | 26.9                  | 27.55                   | 24.7                                         | 2.74                 | 10.34%             |
 
-| image_name | version number | total_size | train_size | test_size | test_size after dedulicating | chunkdict_size | dedulicating rate | threshold |
-|------------|----------------|------------|------------|-----------|------------------------------|----------------|-------------------|-----------|
-| redis      | 10             | 382.03     | 266.7      | 115.33    | 31.56                        | 42.33          | 72.63%            | 0.8-0.5   |
-| python     | 10             | 3509.91    | 2095.37    | 1414.54   | 123.33                       | 588.61         | 91.28%            | 0.8-0.5   |
-| ubuntu     | 10             | 317.33     | 222.11     | 95.22     | 12.27                        | 39.61          | 87.11%            | 0.8-0.5   |
-| nginx      | 10             | 396.86     | 284.4      | 112.46    | 50.54                        | 83.54          | 55.06%            | 0.8-0.5   |
-| postgres   | 10             | 1360.31    | 956.42     | 403.89    | 381.54                       | 19.66          | 5.53%             | 0.8-0.5   |
-| alpine     | 10             | 27.23      | 19.04      | 8.19      | 5.62                         | 4.7            | 31.29%            | 0.8-0.5   |
-| node       | 10             | 3698.44    | 2598.59    | 1099.85   | 429.39                       | 649.42         | 60.96%            | 0.8-0.5   |
-| httpd      | 10             | 561.99     | 385.79     | 176.2     | 85.7                         | 54.15          | 51.36%            | 0.8-0.5   |
 ***