Skip to content

Commit b0a4427

Browse files
author
lsleonard
committed
Improve encoding speed of extended string mode
1. In tdstring.c, implemented 64-bit output of encoded values to improve compression speed.
1 parent 7cbe5ba commit b0a4427

File tree

4 files changed

+76
-43
lines changed

4 files changed

+76
-43
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# tiny-data-compression
2-
High-speed lossless tiny data compression of 16 to 512 bytes
2+
High-speed lossless data compression of 16 to 512 bytes
33

44
td512 filename [loopCount]
55

Tiny Data Compression with td512.docx

220 Bytes
Binary file not shown.

td512.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,18 @@
4545
/*
4646
1. In tdString.c, extended string mode: after 64 unique values have been encountered, continue processing for up to 64 additional non-unique values, which are output as if they were unique values but are not indexed for string matching. The initial 64 unique values continue to be used. This improves compression for files with larger numbers of unique values and reduces compression for some other files; the net result is better average compression.
4747
*/
48+
// Notes for version 2.1.5:
49+
/*
50+
1. In tdString.c, implemented 64-bit output of encoded values to improve compression speed.
51+
*/
4852
#ifndef td512_h
4953
#define td512_h
5054

5155
#include "td64.h"
5256
#include "tdString.h"
5357
#include <unistd.h>
5458

55-
#define TD512_VERSION "v2.1.4"
59+
#define TD512_VERSION "v2.1.5"
5660
#define MIN_VALUES_EXTENDED_MODE 128
5761
#define MIN_UNIQUES_SINGLE_VALUE_MODE_CHECK 14
5862
#define MIN_VALUES_TO_COMPRESS 16

tdString.c

Lines changed: 70 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,49 @@
2626

2727
#define MAX_STRING_MODE_EXTENDED_VALUES 512
2828

29-
static inline void esmOutputBits(unsigned char *outValsT, const uint32_t nBits, const uint32_t bitVal, uint32_t *nextOutIx, uint32_t *nextOutBit)
29+
static inline void esmOutputRemainder(unsigned char *outValsT, uint32_t *thisOutIx, uint32_t *nextOutBit, uint64_t *outBits)
3030
{
31-
// output 1 to 8 bits
32-
outValsT[*nextOutIx] |= (unsigned char)(bitVal << *nextOutBit);
31+
if (*nextOutBit == 0)
32+
return; // no bits to output
33+
uint32_t shiftPos=0;
34+
int32_t bitsRemaining=*nextOutBit-8;
35+
// output bits that remain
36+
outValsT[(*thisOutIx)++] = (unsigned char)*outBits;
37+
while (bitsRemaining > 0)
38+
{
39+
shiftPos += 8;
40+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits >> shiftPos);
41+
bitsRemaining -= 8;
42+
}
43+
*nextOutBit = 0;
44+
} // end esmOutputRemainder
45+
46+
static inline void esmOutputOutBits(unsigned char *outValsT, uint32_t *thisOutIx, uint64_t *outBits)
47+
{
48+
// copy 64 bits to output
49+
outValsT[(*thisOutIx)++] = (unsigned char)*outBits;
50+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>8);
51+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>16);
52+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>24);
53+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>32);
54+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>40);
55+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>48);
56+
outValsT[(*thisOutIx)++] = (unsigned char)(*outBits>>56);
57+
} // end esmOutputOutBits
58+
59+
static inline void thisOutIx2(unsigned char *outValsT, const uint32_t nBits, const uint64_t bitVal, uint32_t *thisOutIx, uint32_t *nextOutBit, uint64_t *outBits)
60+
{
61+
// output 1 to 64 bits
62+
*outBits |= bitVal << *nextOutBit;
3363
*nextOutBit += nBits;
34-
if (*nextOutBit >= 8)
64+
if (*nextOutBit >= 64)
3565
{
36-
*nextOutBit -= 8;
37-
outValsT[++(*nextOutIx)] = (unsigned char)bitVal >> (nBits - *nextOutBit);
66+
esmOutputOutBits(outValsT, thisOutIx, outBits);
67+
// init outBits with remainder of bits from current output
68+
*nextOutBit -= 64;
69+
*outBits = bitVal >> (nBits - *nextOutBit);
3870
}
39-
} // end esmOutputBits
71+
} // end thisOutIx2
4072

4173
int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *outVals, const uint32_t nValuesMax, uint32_t *nValuesOut)
4274
{
@@ -52,18 +84,19 @@ int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
5284
uint64_t twoVals[64]; // index is first unique val, with bit position of second unique value set to 1
5385
uint32_t twoValsPoss[64*64]; // position in input of first occurrence of corresponding two unique values of up to 64
5486
uint32_t twoValsPos;
55-
uint32_t nextOutIx;
87+
uint32_t thisOutIx;
5688
uint32_t nextOutBit=1; // start of encoding after first two inputs
5789
unsigned char outValsT[MAX_STRING_MODE_EXTENDED_VALUES];
5890
uint32_t maxUniquesExceeded=0;
5991
uint32_t highBitClear;
92+
uint64_t outBits; // accumulate 64 bits before output
6093
// smaller values compress slightly better with string limit of 9 versus 17
6194
const uint32_t string_limit=nValuesMax<=64 ? 9 : 17;
6295
const uint32_t extended_string_length_bits=nValuesMax<=64 ? 3 : 4;
6396
if (nValuesMax > MAX_STRING_MODE_EXTENDED_VALUES)
6497
return -100;
6598
outVals[1] = 0; // init second info byte
66-
nextOutIx = 0; // start of encoding in outValsT
99+
thisOutIx = 0; // start of encoding in outValsT
67100
// output encoding of first two values in outVals starting at third bit in second byte
68101
// first bit is last bit of unique count, second is whether
69102
// uniques are compressed
@@ -85,7 +118,7 @@ int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
85118
twoValsPoss[1] = 3; // set position to two past second value
86119
}
87120
// output 1 to indicate first unique value repeated
88-
outValsT[0] = 1; // 1 for first encoding bit
121+
outBits = 1; // 1 for first encoding bit
89122
}
90123
else
91124
{
@@ -111,9 +144,9 @@ int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
111144
// set up new two value in 2nd position
112145
twoVals[1] = 1 << UOinPos2;
113146
twoValsPoss[64 | UOinPos2] = 3; // set position to two past second value
114-
outValsT[0] = 0; // for first encoding bit
147+
outBits = 0; // for first encoding bit
115148
}
116-
uint32_t nUniqueBits=1; // bits to encode current number of uniques
149+
uint32_t nUniqueBits=1; // bits to encode current number of uniques (1 or 2)
117150
uint32_t nextInVal=inVals[2];
118151
inPos=2; // start loop after init of first two values
119152
const uint32_t lastPos=nValuesMax-1;
@@ -127,7 +160,7 @@ int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
127160
{
128161
// set up for new unique in this position
129162
// uniques > 64 are output as uniques but are not considered for processing
130-
if (nextOutIx+nUniques > lastPos)
163+
if (thisOutIx+nUniques > lastPos)
131164
{
132165
*nValuesOut = inPos - 1; // processed through last inPos
133166
return 0;
@@ -168,10 +201,11 @@ int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
168201
nUniques++;
169202
highBitClear |= inVal;
170203
// output a 0 to indicate new unique
171-
if (++nextOutBit == 8)
204+
if (++nextOutBit == 64)
172205
{
173-
// update out index and next out bit
174-
outValsT[++nextOutIx] = 0;
206+
// output outBits and init for next output
207+
esmOutputOutBits(outValsT, &thisOutIx, &outBits);
208+
outBits = 0;
175209
nextOutBit = 0;
176210
}
177211
outVals[nUniques+1] = (unsigned char)inVal; // save unique or any value encountered beyond 64 uniques in list at front of outVals starting in third position
@@ -189,7 +223,7 @@ int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
189223
twoValsPoss[(UOinVal<<6) | UOinValsInPosP1] = inPos + 1;
190224
}
191225
// output repeated value: 01 plus unique occurrence
192-
esmOutputBits(outValsT, 2 + nUniqueBits, 1|(UOinVal<<2), &nextOutIx, &nextOutBit);
226+
thisOutIx2(outValsT, 2 + nUniqueBits, (uint64_t)(1|(UOinVal<<2)), &thisOutIx, &nextOutBit, &outBits);
193227
continue;
194228
}
195229
const uint64_t TVuniqueOccurrence=twoVals[UOinVal];
@@ -203,7 +237,7 @@ int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
203237
{
204238
// two vals include first value so this is a repeat
205239
// output repeated value: 01 plus unique occurrence
206-
esmOutputBits(outValsT, 2+nUniqueBits, 1|(UOinVal<<2), &nextOutIx, &nextOutBit);
240+
thisOutIx2(outValsT, 2+nUniqueBits, (uint64_t)(1|(UOinVal<<2)), &thisOutIx, &nextOutBit, &outBits);
207241
continue;
208242
}
209243
uint32_t strPos=inPos+1;
@@ -218,18 +252,10 @@ int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
218252
strPos++;
219253
twoValsPos++;
220254
}
221-
// output 11 plus string length bits
222-
esmOutputBits(outValsT, 2+extended_string_length_bits, 3 | ((strCount-1)<<2), &nextOutIx, &nextOutBit);
223-
// output the position of string
224-
// output lowest bit and then remaining bits
225-
const uint32_t outVal9=twoValsPos-strCount-1;
226-
outValsT[nextOutIx] |= (outVal9 & 1) << nextOutBit;
227-
if (++nextOutBit == 8)
228-
{
229-
outValsT[++nextOutIx] = 0;
230-
nextOutBit = 0;
231-
}
232-
esmOutputBits(outValsT, encodingBits512[inPos-1]-1, outVal9>>1, &nextOutIx, &nextOutBit);
255+
// output 11 plus string length bits, then position of string
256+
const uint32_t stringBits=2+extended_string_length_bits;
257+
const uint64_t outVal9=twoValsPos-strCount-1;
258+
thisOutIx2(outValsT, stringBits+encodingBits512[inPos-1], (3 | ((strCount-1)<<2)) | (outVal9<<stringBits), &thisOutIx, &nextOutBit, &outBits);
233259
inPos += strCount;
234260
nextInVal = inVals[inPos];
235261
}
@@ -240,22 +266,24 @@ int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
240266
twoVals[UOinVal] |= 1llu << UOinValsInPosP1;
241267
twoValsPoss[(UOinVal<<6) | UOinValsInPosP1] = inPos + 1;
242268
// output repeated value: 01 plus unique occurrence
243-
esmOutputBits(outValsT, 2 + nUniqueBits, 1|(UOinVal<<2), &nextOutIx, &nextOutBit);
269+
thisOutIx2(outValsT, 2 + nUniqueBits, (uint64_t)(1|(UOinVal<<2)), &thisOutIx, &nextOutBit, &outBits);
244270
}
245271
}
246272
// output final bits
247-
if (nextOutBit > 0)
248-
nextOutIx++; // index past final bits
249273
if (inPos < nValuesMax)
250274
{
251275
// occurs for both end of input on last pos -1 and for max uniques exceeded
252276
if (maxUniquesExceeded)
253-
outValsT[nextOutIx++] = inVals[maxUniquesExceeded-1]; // output last byte that is last unique encountered
277+
thisOutIx2(outValsT, 8, (uint64_t)inVals[maxUniquesExceeded-1], &thisOutIx, &nextOutBit, &outBits);
254278
else
255-
outValsT[nextOutIx++] = inVals[lastPos]; // output last input byte
279+
thisOutIx2(outValsT, 8, (uint64_t)inVals[lastPos], &thisOutIx, &nextOutBit, &outBits);
280+
}
281+
if (nextOutBit > 0)
282+
{
283+
esmOutputRemainder(outValsT, &thisOutIx, &nextOutBit, &outBits); // index past final bits
256284
}
257285
*nValuesOut = maxUniquesExceeded ? maxUniquesExceeded : nValuesMax;
258-
if (nextOutIx + nUniques > *nValuesOut - 1)
286+
if (thisOutIx + nUniques > *nValuesOut - 1)
259287
return 0;
260288
// use 7-bit encoding on uniques if all high bits set
261289
int32_t uniqueOffset;
@@ -279,10 +307,10 @@ int32_t encodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
279307
{
280308
uniqueOffset = nUniques + 2;
281309
}
282-
memcpy(outVals+uniqueOffset, outValsT, nextOutIx);
310+
memcpy(outVals+uniqueOffset, outValsT, thisOutIx);
283311
outVals[0] = 0x7f; // indicate external string mode
284312
outVals[1] |= nUniques-1; // number uniques in first 7 bits then compressed uniques bit
285-
return (int32_t)(nextOutIx+uniqueOffset) * 8;
313+
return (int32_t)(thisOutIx+uniqueOffset) * 8;
286314
} // end encodeExtendedStringMode
287315

288316
static inline void dsmGetBits(const unsigned char *inVals, const uint32_t nBitsToGet, uint32_t *thisInVal, uint32_t *thisVal, uint32_t *bitPos, int32_t *theBits)
@@ -452,14 +480,15 @@ int32_t decodeExtendedStringMode(const unsigned char *inVals, unsigned char *out
452480
}
453481
}
454482
}
455-
if (bitPos > 0)
456-
thisInVal++; // inc past partial input value
457483
if (nextOutVal == nOrigMinus1)
458484
{
459485
// output last byte in input when not ending with a string
460486
// string at end will catch last byte
461-
outVals[nOrigMinus1] = inVals[thisInVal++];
487+
dsmGetBits(inVals, 8, &thisInVal, &thisVal, &bitPos, &theBits);
488+
outVals[nOrigMinus1] = (unsigned char)theBits;
462489
}
490+
if (bitPos > 0)
491+
thisInVal++; // inc past partial input value
463492
*bytesProcessed = thisInVal;
464493
return (int32_t)nOriginalValues;
465494
} // end decodeExtendedStringMode

0 commit comments

Comments
 (0)