Skip to content

Commit f71fbdf

Browse files
author
lsleonard
committed
Vary string length in extended string mode
In tdString.c, make the maximum string length and the number of bits used to encode a string's length depend on the number of input values. For <= 64 values, the maximum length is 9 and the length is encoded in 3 bits. For > 64 values, the maximum length is 17 and the length is encoded in 4 bits. Longer strings are more likely to be found in larger data sets.
1 parent 674c364 commit f71fbdf

File tree

5 files changed

+18
-11
lines changed

5 files changed

+18
-11
lines changed

Tiny Data Compression with td512.docx

95 Bytes
Binary file not shown.

main.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,8 +218,6 @@ int main(int argc, char* argv[])
218218
{
219219
int32_t nRetBytes;
220220
nRetBytes = td512d(src+srcBlockOffset, dst+dstBlockOffset, &bytesProcessed);
221-
if (nRetBytes != 512)
222-
nRetBytes = nRetBytes;
223221
if (nRetBytes < 0)
224222
return nRetBytes;
225223
nBytesRemaining -= bytesProcessed;

td512.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,21 @@
2929
3. In main.c, after decompression, the input file is verified against
3030
the decompressed output file.
3131
*/
32+
// Note for version 2.1.2:
33+
/*
34+
1. In tdString.c, make the definition of string length and associated
35+
number of bits based on number of input values. For <= 64 values,
36+
length is 9 and bits are 3. For > 64 values, length is 17 and
37+
bits are 4. Longer strings are likely to be found in larger data sets.
38+
*/
3239
#ifndef td512_h
3340
#define td512_h
3441

3542
#include "td64.h"
3643
#include "tdString.h"
3744
#include <unistd.h>
3845

39-
#define TD512_VERSION "v2.1.1"
46+
#define TD512_VERSION "v2.1.2"
4047
#define MIN_VALUES_EXTENDED_MODE 128
4148
#define MIN_UNIQUES_SINGLE_VALUE_MODE_CHECK 14
4249
#define MIN_VALUES_TO_COMPRESS 16

td64.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
#define NDEBUG // disable asserts
2828
#include <assert.h>
2929

30-
#define TD64_VERSION "v2.1.1"
3130
#define MAX_TD64_BYTES 64 // max input vals supported
3231
#define MIN_TD64_BYTES 1 // min input vals supported
3332
#define MAX_UNIQUES 16 // max uniques supported in input

tdString.c

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,6 @@
2424
#include "tdString.h"
2525
#include "td64_internal.h"
2626

27-
#define STRING_LIMIT 9 // for 128+ values set to 17
28-
#define EXTENDED_STRING_LENGTH_BITS 3 // for 128+ values, use 4
2927
#define MAX_STRING_MODE_EXTENDED_VALUES 512
3028

3129
static inline void esmOutputBits(unsigned char *outValsT, const uint32_t nBits, const uint32_t bitVal, uint32_t *nextOutIx, uint32_t *nextOutBit)
@@ -59,7 +57,10 @@ int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
5957
unsigned char outValsT[MAX_STRING_MODE_EXTENDED_VALUES];
6058
uint32_t maxUniquesExceeded=0;
6159
uint32_t highBitClear;
62-
60+
// inputs of 64 or fewer values compress slightly better with a string limit of 9 than with 17
61+
const uint32_t string_limit=nValuesMax<=64 ? 9 : 17;
62+
const uint32_t extended_string_length_bits=nValuesMax<=64 ? 3 : 4;
63+
6364
if (nValuesMax > MAX_STRING_MODE_EXTENDED_VALUES)
6465
return -1;
6566
nextOutIx = 0; // start of encoding in outValsT
@@ -191,8 +192,8 @@ int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
191192
uint32_t strLimit = inPos - 1 - twoValsPos; // don't take string past current input pos
192193
if (nValuesMax-strPos < strLimit)
193194
strLimit = nValuesMax - strPos; // don't go past end of input
194-
if (strLimit > STRING_LIMIT-2)
195-
strLimit = STRING_LIMIT-2;
195+
if (strLimit > string_limit-2)
196+
strLimit = string_limit-2;
196197

197198
uint32_t strCount=0;
198199
while(strCount++ < strLimit && inVals[strPos] == inVals[twoValsPos])
@@ -201,7 +202,7 @@ int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
201202
twoValsPos++;
202203
}
203204
// output 11 plus string length bits
204-
esmOutputBits(outValsT, 2+EXTENDED_STRING_LENGTH_BITS, 3 | ((strCount-1)<<2), &nextOutIx, &nextOutBit);
205+
esmOutputBits(outValsT, 2+extended_string_length_bits, 3 | ((strCount-1)<<2), &nextOutIx, &nextOutBit);
205206
// output the position of string
206207
if (encodingBits512[inPos-1] > 8)
207208
{
@@ -313,6 +314,8 @@ int32_t decodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
313314
unsigned char uncompressedUniques[MAX_UNIQUES_EXTENDED_STRING_MODE];\
314315
uint32_t secondByte=inVals[1]; // bits= 0:5 nUniques 6 first encoding bit 7 compressed or not
315316
uint32_t nUniquesIn=(secondByte & 0x3f) + 1;
317+
// inputs of 64 or fewer values compress slightly better with a string limit of 9 than with 17, so their string length is coded in 3 bits instead of 4
318+
const uint32_t extended_string_length_bits=nOriginalValues<=64 ? 3 : 4;
316319

317320
// process one of three encodings:
318321
// 0 new unique value
@@ -405,7 +408,7 @@ int32_t decodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
405408
bitPos = 0;
406409
}
407410
// multi-character string: length, then location of values in bits needed to code current pos
408-
dsmGetBits2(inVals, EXTENDED_STRING_LENGTH_BITS, &thisInVal, &thisVal, &bitPos, &theBits);
411+
dsmGetBits2(inVals, extended_string_length_bits, &thisInVal, &thisVal, &bitPos, &theBits);
409412
uint32_t stringLen = (uint32_t)theBits + 2;
410413
assert(stringLen <= STRING_LIMIT);
411414
uint32_t nPosBits = encodingBits512[nextOutVal];

0 commit comments

Comments
 (0)