Skip to content

Commit c7ea2d5

Browse files
Bhavya Bibra
authored and committed
Multiple improvements: seekable tests, zstdless, streaming dict example, seqBench hardening, DiB early termination
1 parent d7ee320 commit c7ea2d5

8 files changed

Lines changed: 8705 additions & 7692 deletions

File tree

contrib/seekable_format/tests/seekable_tests.c

Lines changed: 574 additions & 319 deletions
Large diffs are not rendered by default.

contrib/seqBench/seqBench.c

Lines changed: 131 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,147 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under both the BSD-style license (found in the
6+
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
7+
* in the COPYING file in the root directory of this source tree).
8+
*/
9+
10+
/* seqBench: roundtrip benchmark using sequence-level compression API.
11+
*
12+
* Usage: seqBench <file>
13+
*
14+
* Generates sequences from the input file, then recompresses them
15+
* using ZSTD_compressSequences() with explicit block delimiters,
16+
* finally validates the output by decompressing and comparing.
17+
*/
18+
119
#define ZSTD_STATIC_LINKING_ONLY
2-
#include <zstd.h>
20+
#include <assert.h>
321
#include <stdio.h>
422
#include <stdlib.h>
5-
#include <assert.h>
623
#include <string.h>
24+
#include <zstd.h>
725

826
int main(int argc, char *argv[]) {
9-
ZSTD_CCtx* zc = ZSTD_createCCtx();
27+
int ret = 0;
1028

11-
if (argc != 2) {
12-
printf("Usage: seqBench <file>\n"); // TODO provide the block delim option here
13-
return 1;
14-
}
29+
if (argc != 2) {
30+
fprintf(stderr, "Usage: seqBench <file>\n");
31+
fprintf(stderr,
32+
"\nRoundtrip benchmark using sequence-level compression.\n");
33+
fprintf(stderr, "Generates sequences with ZSTD_generateSequences(),\n");
34+
fprintf(stderr, "recompresses with ZSTD_compressSequences() using\n");
35+
fprintf(stderr,
36+
"explicit block delimiters, and validates by decompression.\n");
37+
return 1;
38+
}
39+
40+
ZSTD_CCtx *zc = ZSTD_createCCtx();
41+
if (zc == NULL) {
42+
fprintf(stderr, "ERROR: ZSTD_createCCtx() failed\n");
43+
return 1;
44+
}
1545

16-
FILE *f = fopen(argv[1], "rb");
17-
fseek(f, 0, SEEK_END);
18-
long inBufSize = ftell(f);
19-
fseek(f, 0, SEEK_SET);
46+
FILE *f = fopen(argv[1], "rb");
47+
if (f == NULL) {
48+
fprintf(stderr, "ERROR: could not open file '%s'\n", argv[1]);
49+
ZSTD_freeCCtx(zc);
50+
return 1;
51+
}
2052

21-
char *inBuf = malloc(inBufSize + 1);
22-
fread(inBuf, inBufSize, 1, f);
53+
fseek(f, 0, SEEK_END);
54+
long inBufSize = ftell(f);
55+
fseek(f, 0, SEEK_SET);
56+
57+
if (inBufSize <= 0) {
58+
fprintf(stderr, "ERROR: file '%s' is empty or unreadable\n", argv[1]);
59+
fclose(f);
60+
ZSTD_freeCCtx(zc);
61+
return 1;
62+
}
63+
64+
char *inBuf = (char *)malloc((size_t)inBufSize + 1);
65+
if (inBuf == NULL) {
66+
fprintf(stderr, "ERROR: could not allocate %ld bytes for input\n",
67+
inBufSize);
2368
fclose(f);
69+
ZSTD_freeCCtx(zc);
70+
return 1;
71+
}
2472

25-
size_t seqsSize = ZSTD_sequenceBound(inBufSize);
26-
ZSTD_Sequence *seqs = (ZSTD_Sequence*)malloc(seqsSize * sizeof(ZSTD_Sequence));
27-
char *outBuf = malloc(ZSTD_compressBound(inBufSize));
73+
size_t const bytesRead = fread(inBuf, 1, (size_t)inBufSize, f);
74+
fclose(f);
75+
if ((long)bytesRead != inBufSize) {
76+
fprintf(stderr, "ERROR: read only %zu of %ld bytes\n", bytesRead,
77+
inBufSize);
78+
free(inBuf);
79+
ZSTD_freeCCtx(zc);
80+
return 1;
81+
}
2882

29-
ZSTD_generateSequences(zc, seqs, seqsSize, inBuf, inBufSize);
30-
ZSTD_CCtx_setParameter(zc, ZSTD_c_blockDelimiters, ZSTD_sf_explicitBlockDelimiters);
31-
size_t outBufSize = ZSTD_compressSequences(zc, outBuf, inBufSize, seqs, seqsSize, inBuf, inBufSize);
32-
if (ZSTD_isError(outBufSize)) {
33-
printf("ERROR: %lu\n", outBufSize);
34-
return 1;
35-
}
83+
size_t seqsSize = ZSTD_sequenceBound((size_t)inBufSize);
84+
ZSTD_Sequence *seqs =
85+
(ZSTD_Sequence *)malloc(seqsSize * sizeof(ZSTD_Sequence));
86+
size_t const outBufCapacity = ZSTD_compressBound((size_t)inBufSize);
87+
char *outBuf = (char *)malloc(outBufCapacity);
88+
char *validationBuf = (char *)malloc((size_t)inBufSize);
89+
90+
if (seqs == NULL || outBuf == NULL || validationBuf == NULL) {
91+
fprintf(stderr, "ERROR: memory allocation failed\n");
92+
ret = 1;
93+
goto cleanup;
94+
}
95+
96+
ZSTD_generateSequences(zc, seqs, seqsSize, inBuf, (size_t)inBufSize);
97+
ZSTD_CCtx_setParameter(zc, ZSTD_c_blockDelimiters,
98+
ZSTD_sf_explicitBlockDelimiters);
99+
size_t outBufSize = ZSTD_compressSequences(
100+
zc, outBuf, outBufCapacity, seqs, seqsSize, inBuf, (size_t)inBufSize);
101+
if (ZSTD_isError(outBufSize)) {
102+
fprintf(stderr, "ERROR: ZSTD_compressSequences failed: %s\n",
103+
ZSTD_getErrorName(outBufSize));
104+
ret = 1;
105+
goto cleanup;
106+
}
107+
108+
size_t const decSize =
109+
ZSTD_decompress(validationBuf, (size_t)inBufSize, outBuf, outBufSize);
110+
if (ZSTD_isError(decSize)) {
111+
fprintf(stderr, "ERROR: ZSTD_decompress failed: %s\n",
112+
ZSTD_getErrorName(decSize));
113+
ret = 1;
114+
goto cleanup;
115+
}
116+
117+
if ((long)decSize != inBufSize) {
118+
fprintf(stderr, "ERROR: decompressed size (%zu) != original size (%ld)\n",
119+
decSize, inBufSize);
120+
ret = 1;
121+
goto cleanup;
122+
}
36123

37-
char *validationBuf = malloc(inBufSize);
38-
ZSTD_decompress(validationBuf, inBufSize, outBuf, outBufSize);
39-
40-
if (memcmp(inBuf, validationBuf, inBufSize) == 0) {
41-
printf("Compression and decompression were successful!\n");
42-
} else {
43-
printf("ERROR: input and validation buffers don't match!\n");
44-
for (int i = 0; i < inBufSize; i++) {
45-
if (inBuf[i] != validationBuf[i]) {
46-
printf("First bad index: %d\n", i);
47-
break;
48-
}
49-
}
124+
if (memcmp(inBuf, validationBuf, (size_t)inBufSize) == 0) {
125+
printf("Compression and decompression were successful!\n");
126+
printf(" Original size: %ld bytes\n", inBufSize);
127+
printf(" Compressed size: %zu bytes\n", outBufSize);
128+
printf(" Ratio: %.2f\n", (double)inBufSize / (double)outBufSize);
129+
} else {
130+
fprintf(stderr, "ERROR: input and validation buffers don't match!\n");
131+
for (long i = 0; i < inBufSize; i++) {
132+
if (inBuf[i] != validationBuf[i]) {
133+
fprintf(stderr, "First bad index: %ld\n", i);
134+
break;
135+
}
50136
}
137+
ret = 1;
138+
}
51139

52-
return 0;
140+
cleanup:
141+
free(validationBuf);
142+
free(outBuf);
143+
free(seqs);
144+
free(inBuf);
145+
ZSTD_freeCCtx(zc);
146+
return ret;
53147
}

examples/Makefile

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ all: simple_compression simple_decompression \
2121
multiple_simple_compression\
2222
dictionary_compression dictionary_decompression \
2323
streaming_compression streaming_decompression \
24-
multiple_streaming_compression streaming_memory_usage
24+
multiple_streaming_compression streaming_memory_usage \
25+
streaming_dictionary_compression
2526

2627
$(LIB) :
2728
$(MAKE) -C $(LIBDIR) libzstd.a
@@ -53,6 +54,9 @@ streaming_decompression : $(LIB)
5354
streaming_memory_usage.o: common.h
5455
streaming_memory_usage : $(LIB)
5556

57+
streaming_dictionary_compression.o: common.h
58+
streaming_dictionary_compression : $(LIB)
59+
5660

5761
.PHONY:clean
5862
clean:
@@ -61,7 +65,8 @@ clean:
6165
multiple_simple_compression \
6266
dictionary_compression dictionary_decompression \
6367
streaming_compression streaming_decompression \
64-
multiple_streaming_compression streaming_memory_usage
68+
multiple_streaming_compression streaming_memory_usage \
69+
streaming_dictionary_compression
6570
@echo Cleaning completed
6671

6772
.PHONY:test

examples/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,7 @@ Zstandard library : usage examples
4444
Decompress multiple files using the same dictionary.
4545
Result remains in memory.
4646
Introduces usage of : `ZSTD_createDDict()` and `ZSTD_decompress_usingDDict()`
47+
48+
- [Streaming dictionary compression](streaming_dictionary_compression.c) :
49+
Compress multiple files in streaming mode using the same dictionary.
50+
Introduces usage of : `ZSTD_CCtx_loadDictionary()` and `ZSTD_compressStream2()`
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under both the BSD-style license (found in the
6+
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
7+
* in the COPYING file in the root directory of this source tree).
8+
* You may select, at your option, one of the above-listed licenses.
9+
*/
10+
11+
/* This example demonstrates streaming compression with a dictionary.
12+
* It combines the streaming approach from streaming_compression.c
13+
* with the dictionary approach from dictionary_compression.c.
14+
*
15+
* This is useful when compressing many small files (e.g. database records,
16+
* JSON objects, log entries) in a streaming fashion using a pre-trained
17+
* dictionary for better compression ratios on small data.
18+
*
19+
* It uses the advanced API:
20+
* - ZSTD_CCtx_loadDictionary() to load the dictionary
21+
* - ZSTD_compressStream2() to stream-compress the input
22+
*
23+
* Usage: streaming_dictionary_compression DICT FILES...
24+
*/
25+
26+
#include "common.h" // Helper functions, CHECK(), and CHECK_ZSTD()
27+
#include <stdio.h> // printf
28+
#include <stdlib.h> // free
29+
#include <string.h> // memset, strcat, strlen
30+
#include <zstd.h> // presumes zstd library is installed
31+
32+
static void compressFile_orDie(const char *fname, const char *outName,
33+
const void *dictBuffer, size_t dictSize,
34+
int cLevel) {
35+
/* Open the input and output files. */
36+
FILE *const fin = fopen_orDie(fname, "rb");
37+
FILE *const fout = fopen_orDie(outName, "wb");
38+
39+
/* Create the input and output buffers.
40+
* They may be any size, but we recommend using these functions to size them.
41+
*/
42+
size_t const buffInSize = ZSTD_CStreamInSize();
43+
void *const buffIn = malloc_orDie(buffInSize);
44+
size_t const buffOutSize = ZSTD_CStreamOutSize();
45+
void *const buffOut = malloc_orDie(buffOutSize);
46+
47+
/* Create the compression context. */
48+
ZSTD_CCtx *const cctx = ZSTD_createCCtx();
49+
CHECK(cctx != NULL, "ZSTD_createCCtx() failed!");
50+
51+
/* Set compression parameters. */
52+
CHECK_ZSTD(ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, cLevel));
53+
CHECK_ZSTD(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1));
54+
55+
/* Load the dictionary.
56+
* The dictionary will be used for all subsequent compressions using this
57+
* context, until it is reset or a new dictionary is loaded.
58+
* ZSTD_CCtx_loadDictionary() makes an internal copy of the dictionary,
59+
* so we can free dictBuffer after this call if we wanted to.
60+
*/
61+
CHECK_ZSTD(ZSTD_CCtx_loadDictionary(cctx, dictBuffer, dictSize));
62+
63+
/* Stream-compress the input file. */
64+
size_t const toRead = buffInSize;
65+
for (;;) {
66+
size_t read = fread_orDie(buffIn, toRead, fin);
67+
int const lastChunk = (read < toRead);
68+
ZSTD_EndDirective const mode = lastChunk ? ZSTD_e_end : ZSTD_e_continue;
69+
ZSTD_inBuffer input = {buffIn, read, 0};
70+
int finished;
71+
do {
72+
ZSTD_outBuffer output = {buffOut, buffOutSize, 0};
73+
size_t const remaining =
74+
ZSTD_compressStream2(cctx, &output, &input, mode);
75+
CHECK_ZSTD(remaining);
76+
fwrite_orDie(buffOut, output.pos, fout);
77+
finished = lastChunk ? (remaining == 0) : (input.pos == input.size);
78+
} while (!finished);
79+
CHECK(input.pos == input.size, "Impossible: zstd only returns 0 when the "
80+
"input is completely consumed!");
81+
if (lastChunk) {
82+
break;
83+
}
84+
}
85+
86+
ZSTD_freeCCtx(cctx);
87+
fclose_orDie(fout);
88+
fclose_orDie(fin);
89+
free(buffIn);
90+
free(buffOut);
91+
}
92+
93+
/* Build the output name for `filename` by appending ".zst".
 * The result is allocated with malloc_orDie(); the caller releases it with
 * free().
 */
static char *createOutFilename_orDie(const char *filename) {
  static const char suffix[] = ".zst";
  size_t const nameLen = strlen(filename);
  /* sizeof(suffix) counts the terminating NUL: 4 characters + 1 = 5. */
  size_t const totalLen = nameLen + sizeof(suffix);
  char *const result = (char *)malloc_orDie(totalLen);
  memcpy(result, filename, nameLen);
  memcpy(result + nameLen, suffix, sizeof(suffix)); /* copies the NUL too */
  return result;
}
102+
103+
/* Entry point: streaming_dictionary_compression DICT FILES...
 *
 * Loads the dictionary file argv[1] into memory once, then stream-compresses
 * each remaining argument to "<name>.zst" with that dictionary at a fixed
 * compression level. Returns 1 after printing usage when fewer than one
 * DICT and one FILE are given, 0 after all files have been processed.
 */
int main(int argc, const char **argv) {
  const char *const exeName = argv[0];
  int const cLevel = 3;

  /* Need the dictionary plus at least one file to compress. */
  if (argc < 3) {
    fprintf(stderr, "wrong arguments\n");
    fprintf(stderr, "usage:\n");
    /* FILES is mandatory (see the argc check), so it is not bracketed. */
    fprintf(stderr, "%s DICT FILES...\n", exeName);
    fprintf(stderr,
            "\nCompress FILES using streaming mode with a dictionary.\n");
    fprintf(stderr, "DICT is a dictionary file created with `zstd --train`.\n");
    return 1;
  }

  /* Load dictionary into memory.
   * The dictionary is loaded once and reused for all files. */
  const char *const dictName = argv[1];
  size_t dictSize;
  void *const dictBuffer = mallocAndLoadFile_orDie(dictName, &dictSize);
  printf("loading dictionary %s (%zu bytes)\n", dictName, dictSize);

  /* Compress each file with the dictionary. */
  for (int u = 2; u < argc; u++) {
    const char *const inFilename = argv[u];
    char *const outFilename = createOutFilename_orDie(inFilename);
    compressFile_orDie(inFilename, outFilename, dictBuffer, dictSize, cLevel);
    printf("%25s : compressed with dictionary -> %s\n", inFilename,
           outFilename);
    free(outFilename);
  }

  free(dictBuffer);
  /* argc - 2 has type int, so it is printed with %d. */
  printf("All %d files compressed with dictionary. \n", argc - 2);
  return 0;
}

0 commit comments

Comments
 (0)