lsleonard
diff --git a/‎README.md
Lines changed: 1 addition & 1 deletion b/‎README.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎Tiny Data Compression with td512.docx
-1.52 KB b/‎Tiny Data Compression with td512.docx
-1.52 KB
diff --git a/‎td512.h
Lines changed: 6 additions & 0 deletions b/‎td512.h
Lines changed: 6 additions & 0 deletions
diff --git a/‎td64.c
Lines changed: 115 additions & 34 deletions b/‎td64.c
Lines changed: 115 additions & 34 deletions
diff --git a/‎td64.h
Lines changed: 58 additions & 5 deletions b/‎td64.h
Lines changed: 58 additions & 5 deletions
@@ -7,7 +7,7 @@ td512 filename [loopCount]
 
 	loopCount (default 1) is the loop count to use for performance testing. Also see BENCHMARK_LOOP_COUNT macro in main.c.
 
-Tiny data compression is not usually supported by compression programs. Now with td512 you can compress data from 6 to 512 bytes. td512 is available under the GPL-3.0 License at https://github.com/lsleonard/tiny-data-compression. Although Zstandard and Snappy get better compression at 512 bytes than td512, Zstandard is very slow for tiny datasets and both programs steadily decline in compression ratio as the number of bytes decreases to 128. At 64 bytes, neither program produces compression. td512 combines the compressed output of td64 for each block of 64 bytes in the input, meaning that the compression achieved at 512 bytes is the same as that for 64 bytes. The td512 algorithm emphasizes speed, and running on a 2 GHz processor, gets 24% average compression at 272 Mbytes per second on the Squash benchmark test data (see https://quixdb.github.io/squash-benchmark/#). Although Huffman coding, with its optimal compression using frequency analysis of values, has been used effectively for many applications, for tiny datasets the compression modes used in td512 approach or exceed the results of using the Huffman algorithm. And with a focus on speed of execution, Huffman and arithmetic coding are not practical algorithms for applications of tiny data. Two areas where high-speed compression using td512 might be applied are small message text and programmatic objects.
+Tiny data compression is not usually supported by compression programs. Now with td512 you can compress data from 6 to 512 bytes. td512 is available under the GPL-3.0 License at https://github.com/lsleonard/tiny-data-compression. Although, for some types of data, the programs QuickLZ, Zstandard and Snappy can get better compression at 512 bytes than td512, all of them steadily decline in compression ratio as the number of bytes decreases to 128. At 64 bytes, none of these programs produces compression. td512 combines the compressed output of td64 for each block of 64 bytes in the input, meaning that the compression achieved at 512 bytes is the same as that for 64 bytes. The td512 algorithm emphasizes speed, and running on a 2 GHz processor, gets 24% average compression at 323 Mbytes per second on the Squash benchmark test data (see https://quixdb.github.io/squash-benchmark/#). Although Huffman coding, with its optimal compression using frequency analysis of values, has been used effectively for many applications, for tiny datasets the compression modes used in td512 approach or exceed the results of using the Huffman algorithm. And with a focus on speed of execution, Huffman and arithmetic coding are not practical algorithms for tiny-data applications. Two areas where high-speed compression using td512 might be applied are small message text and programmatic objects.
 
 You can call the td512 and td512d functions to compress and decompress 1 to 512 bytes. The td512 interface performs compression of 6 to 512 bytes, but accepts 1 to 5 bytes and stores them without compression. td512 acts as a wrapper that uses the td64 interface to compress blocks of 64 bytes until the final block of 64 or fewer bytes is compressed. Along with the number of bytes processed, a pass/fail bit is stored for each 64-byte (or smaller) block compressed, and the compressed or uncompressed data is output.
 
 
@@ -66,6 +66,12 @@
  3. Set the initial loop in td64 to 7/16 of input values for 24 or
     more inputs. This provides a better result for adaptive text mode.
  */
+// Notes for version 1.1.6
+/*
+ 1. Modified random data check and added a later check for random data.
+ 2. Added an early call to single value mode.
+ 3. Updated unused extended string mode to calculate high bit clear during processing and check for overflow as late as possible.
+*/
 #ifndef td512_h
 #define td512_h
 
 
@@ -7,6 +7,7 @@
 //  Copyright © 2021 L. Stevan Leonard. All rights reserved.
 #include "td64.h"
 #include "td64_internal.h"
+#include "tdString.h"
 
 #ifdef TD64_TEST_MODE
 // these globals can be used to collect info
@@ -15,6 +16,7 @@ uint32_t g_td64FailedStringMode=0;
 uint32_t g_td64MaxStringModeUniquesExceeded=0;
 uint32_t g_td64Text8bitCount=0;
 uint32_t g_td64AdaptiveText8bitCount=0;
+uint32_t g_td64StringBlocks=0;
 #endif
 
 // fixed bit compression (fbc): for the number of uniques in input, the minimum number of input values for 25% compression
@@ -661,8 +663,9 @@ int32_t encodeAdaptiveTextMode(unsigned char *inVals, unsigned char *outVals, co
 
     // save uniques for possible failure
     memcpy(saveUniques, outVals+1, nUniquesIn);
-    if (predefinedTextCharCnt > nValues * 3/4)
+    if (predefinedTextCharCnt)
     {
+        // predefined text char count is high enough to guarantee compression even if remainder of checked values contain no text chars
         // use standard text table, accept compression even if maxBytes exeeded
         outVals[0] = 0x7; // indicate text mode with standard text
         outVals[1] = 0; // init first value used by esmOutputBits
@@ -677,18 +680,19 @@ int32_t encodeAdaptiveTextMode(unsigned char *inVals, unsigned char *outVals, co
             else
             {
                 // output char not predefined or adaptive
+                if (nextOutIx > maxBytes)
+                {
+                    // main verifies up to 1/2 of data values looked at are text
+                    // reset uniques in output array
+                    memcpy(outVals+1, saveUniques, nUniquesIn);
+                    return 0; // requested compression not met
+                }
                 esmOutputBits(outVals, 3, 0x5, &nextOutIx, &nextOutBit);
                 esmOutputBits(outVals, 8, inVal, &nextOutIx, &nextOutBit); // output 8 bits
 #ifdef TD64_TEST_MODE
                 g_td64Text8bitCount++;
 #endif
             }
-            if (nextOutIx > maxBytes)
-            {
-                // reset uniques in output array
-                memcpy(outVals+1, saveUniques, nUniquesIn);
-                return 0; // requested compression not met
-            }
         }
     }
     else
@@ -707,20 +711,23 @@ int32_t encodeAdaptiveTextMode(unsigned char *inVals, unsigned char *outVals, co
             else
             {
                 // output char not predefined or adaptive
+                if (nextOutIx > maxBytes)
+                {
+                    // main verifies only 1/2 of data values looked at are text
+                    resetAdaptiveChars(adaptiveUsed); // prep for next time
+                    // reset uniques in output array
+                    memcpy(outVals+1, saveUniques, nUniquesIn);
+                    return 0; // requested compression not met
+                }
                 esmOutputBits(outVals, 3, 0x5, &nextOutIx, &nextOutBit);
                 esmOutputBits(outVals, 8, inVal, &nextOutIx, &nextOutBit); // output 8 bits
 #ifdef TD64_TEST_MODE
-                g_td64AdaptiveText8bitCount++;
+                if ((outVals[0] & 0x37) == 7)
+                    g_td64Text8bitCount++;
+                else
+                    g_td64AdaptiveText8bitCount++;
 #endif
             }
-            if (nextOutIx > maxBytes)
-            {
-                // main verifies only 1/2 of data values looked at are text
-                resetAdaptiveChars(adaptiveUsed); // prep for next time
-                // reset uniques in output array
-                memcpy(outVals+1, saveUniques, nUniquesIn);
-                return 0; // requested compression not met
-            }
         }
     }
     return nextOutIx * 8 + nextOutBit;
@@ -994,6 +1001,30 @@ int32_t encodeStringMode(const unsigned char *inVals, unsigned char *outVals, co
     return 0; // not compressible
 } // end encodeStringMode
 
+static inline uint32_t getNum2char(const unsigned char *inVals, const uint32_t *uniqueOccurrence, const uint32_t nValues)
+{
+    // Count adjacent value pairs that repeat the first pair recorded for their leading value; unique offsets must be preset from main loop
+    uint32_t n2char=0; // number of repeated pairs found so far (the return value)
+    uint32_t inPos=1; // first value preset
+    int32_t twoVals[32]; // twoVals[u]: first unique offset seen following offset u, or -1 if none recorded yet
+    uint32_t UOinVal; // unique offset of the leading value of the current pair
+    uint32_t UOnextIn=0; // first value is unique offset 0
+    memset(twoVals, 255, sizeof(twoVals)); // every byte 0xFF, so each int32_t entry reads as -1
+    // NOTE(review): loop stops at nValues-1, so inVals[nValues-1] is never read, and nValues-1 wraps if nValues is 0 -- confirm against callers
+    while (inPos < nValues-1)
+    {
+        UOinVal = UOnextIn; // previous trailing value becomes the leading value of the next pair
+        UOnextIn = uniqueOccurrence[inVals[inPos++]]; // table is indexed by the input byte value
+        if (UOinVal > 31 || UOnextIn > 31)
+            continue; // only support up to 32 unique values
+        if (twoVals[UOinVal] == -1)
+            twoVals[UOinVal] = UOnextIn; // first pair seen for this leading value: record it without counting
+        else if (twoVals[UOinVal] == UOnextIn)
+            n2char++; // same pair seen again
+    }
+    return n2char;
+} // end getNum2char
+
 int32_t td64(unsigned char *inVals, unsigned char *outVals, const uint32_t nValues)
 // td64: Compress nValues bytes. Return 0 if not compressible (no output bytes),
 //    -1 if error; otherwise, number of bits written to outVals.
@@ -1038,10 +1069,11 @@ int32_t td64(unsigned char *inVals, unsigned char *outVals, const uint32_t nValu
             highBitCheck |= inVal; // keep watch on high bit of unique values
         }
     }
-    if (nUniqueVals > nValsInitLoop * 7/8 + 1)
+    if (nUniqueVals > nValsInitLoop * 7/8 - 1)
     {
-        // supported unique values exceeded--skip this for < 16 values
-        if (nValues >= MIN_VALUES_7_BIT_MODE && (highBitCheck & 0x80) == 0)
+        // supported unique values exceeded
+        // check highBitCheck for high bit clear
+        if ((highBitCheck & 0x80) == 0 && nValues >= MIN_VALUES_7_BIT_MODE)
         {
             // attempt to compress based on high bit clear across all values
             // confirm remaining values have high bit clear
@@ -1053,10 +1085,10 @@ int32_t td64(unsigned char *inVals, unsigned char *outVals, const uint32_t nValu
         outVals[0] = 0; // indicate random data failure
         return 0; // too many uniques to compress with fixed bit coding
     }
-    if (nUniqueVals > uniqueLimit/2 && predefinedTextCharCnt > nValsInitLoop / 2)
+    if (nUniqueVals > uniqueLimit/2 && predefinedTextCharCnt > nValsInitLoop/2)
     {
         // encode in text mode if at least 11% compression expected
-        uint32_t retBits=encodeAdaptiveTextMode(inVals, outVals, nValues, val256, nUniqueVals, predefinedTextCharCnt, nValues-nValues/8);
+        uint32_t retBits=encodeAdaptiveTextMode(inVals, outVals, nValues, val256, nUniqueVals, predefinedTextCharCnt > nValsInitLoop*3/4, nValues-nValues/8);
         if (retBits != 0)
             return retBits;
 #ifdef TD64_TEST_MODE
@@ -1084,6 +1116,28 @@ int32_t td64(unsigned char *inVals, unsigned char *outVals, const uint32_t nValu
             break; // continue loop without further checking
         }
     }
+    if (singleValue >= 0 && nUniqueVals > uniqueLimit)
+    {
+        // early opportunity for single value mode
+        // single value mode is fast and set to get minimum 12% compression for 64 values
+        // only single value mode can have more than MAX_STRING_MODE_UNIQUES
+        return encodeSingleValueMode(inVals, outVals, nValues, singleValue);
+    }
+    const uint32_t nUniquesRandom=nValues*5/8 < MAX_STRING_MODE_UNIQUES ? nValues*5/8 : MAX_STRING_MODE_UNIQUES;
+    if (nUniqueVals > nUniquesRandom)
+    {
+        if ((highBitCheck & 0x80) == 0 && nValues >= MIN_VALUES_7_BIT_MODE)
+        {
+            // attempt to compress based on high bit clear across all values
+            // confirm remaining values have high bit clear
+            while (inPos < nValues)
+                highBitCheck |= inVals[inPos++];
+            if ((highBitCheck & 0x80) == 0)
+                return encode7bits(inVals, outVals, nValues);
+        }
+        outVals[0] = 0; // indicate random data failure
+        return 0; // too many uniques to compress with fixed bit coding
+    }
     if (nUniqueVals <= uniqueLimit) // confirm unique limit has not been exceeded
     {
         // continue fixed bit loop with checks for high bit set and repeat counts,
@@ -1105,22 +1159,41 @@ int32_t td64(unsigned char *inVals, unsigned char *outVals, const uint32_t nValu
         // fixed bit coding failed, try for other compression modes
         if (singleValue >= 0)
         {
+            // second chance for single value mode
+            // single value mode is fast and set to get minimum 12% compression for 64
+            // only single value mode can have more than MAX_STRING_MODE_UNIQUES
             return encodeSingleValueMode(inVals, outVals, nValues, singleValue);
         }
         if ((nValues >= MIN_VALUES_STRING_MODE))
         {
-            if ((nUniqueVals > MAX_STRING_MODE_UNIQUES))
+            uint32_t maxBits = ((highBitCheck & 0x80) == 0 && nValues >= MIN_VALUE_7_BIT_MODE_12_PERCENT) ?  nValues*7 : nValues*7+nValues/2 ;
+            int32_t retBits;
+#ifdef TD64_TEST_MODE
+            g_td64StringBlocks++;
+#endif
+            if (nUniqueVals > MAX_STRING_MODE_UNIQUES)
             {
+                // extended string mode supports up to 64 uniques but is slow and not guaranteed to achieve any particular compression, and is needed less than 5% of time in data tested; could be used if a quick metric to predict compression level can be found
+                // NOTE: more than 32 uniques is currently being labeled random data
 #ifdef TD64_TEST_MODE
                 g_td64MaxStringModeUniquesExceeded++;
 #endif
-            }
+/*
+                // extended string mode
+                uint32_t nValuesOut;
+                int32_t retBits=encodeStringModeExtended(inVals, outVals, nValues, &nValuesOut);
+                if (retBits < 0)
+                    return retBits;
+                if (retBits < maxBits)
+                    return retBits;
+#ifdef TD64_TEST_MODE
+                g_td64FailedStringMode++;
+#endif
+*/            }
             else
             {
-                // string mode for 32+ values with 32 or fewer uniques
-                int32_t retBits;
+                // string mode for 32+ values with 17 to 32 uniques
                 // max bits set to 12% if high bit clear and enough input values, else 6%
-                uint32_t maxBits = ((highBitCheck & 0x80) == 0 && nValues >= MIN_VALUE_7_BIT_MODE_12_PERCENT) ?  nValues*7 : nValues*7+nValues/2 ;
                 if ((retBits=encodeStringMode(inVals, outVals, nValues, nUniqueVals, uniqueOccurrence, (highBitCheck & 0x80) == 0, maxBits)) != 0)
                     return retBits;
 #ifdef TD64_TEST_MODE
@@ -1136,18 +1209,21 @@ int32_t td64(unsigned char *inVals, unsigned char *outVals, const uint32_t nValu
         outVals[0] = 1; // indicate general failure to compress
         return 0; // unable to compress
     }
-    else if (nUniqueVals > 8 && singleValue >= 0)
+    else if (nUniqueVals > 8)
     {
-        // check for benefit of single value mode when 4-bit fixed bit encoding
-        // requires at least 38 input values to have 9 or more uniques
-        const uint32_t singleValueOverFixexBitRepeats=nValues/2-nValues/16;
-        if (val256[singleValue] >= singleValueOverFixexBitRepeats)
+        if (singleValue >= 0)
         {
-            // favor single value over fixed 4-bit encoding
-            return encodeSingleValueMode(inVals, outVals, nValues, singleValue);
+            // check for benefit of single value mode when 4-bit fixed bit encoding
+            // requires at least 38 input values to have 9 or more uniques
+            // FUTURE: graduate based on number of uniques versus fixed 31%
+            const uint32_t singleValueOverFixexBitRepeats=nValues/2-nValues/16; //
+            if (val256[singleValue] >= singleValueOverFixexBitRepeats)
+            {
+                // favor single value over fixed 4-bit encoding
+                return encodeSingleValueMode(inVals, outVals, nValues, singleValue);
+            }
         }
     }
-    
     // process fixed bit coding
     uint32_t i;
     uint32_t nextOut;
@@ -1761,6 +1837,11 @@ int32_t td64d(const unsigned char *inVals, unsigned char *outVals, const uint32_
 
     // first bit of first byte 1: decode one of four modes
     const unsigned char firstByte=inVals[0];
+    if (firstByte == 0x7f)
+    {
+        // string mode extended
+        return decodeStringModeExtended(inVals, outVals, nOriginalValues, bytesProcessed);
+    }
     if ((firstByte & 7) == 0x01)
     {
         // string mode
 
@@ -15,21 +15,74 @@
     You should have received a copy of the GNU General Public License
     along with this program.  If not, see <https://www.gnu.org/licenses/>.//
 */
-// Notes for version 2.1.0:
+// Notes for version 1.1.0:
 /*
- 1. Modified the td64 interface after studying the results from compressing
-    up to 512 bytes in the td512 interface.
+ 1. Main program reads a file into memory that is compressed by
+    calling td512 repeatedly. When complete, the compressed data is
+    written to a file and read for decompression by calling td512d.
+      td512 filename [loopCount]
+        filename is required argument 1.
+        loopCount is optional argument 2 (default: 1). Looping is performed over the entire input file.
  */
+// Notes for version 1.1.1:
+/*
+ 1. Updated some descriptive comments.
+ */
+// Notes for version 1.1.2:
+/*
+ 1. Moved 7-bit mode defines to td64.h because they are used
+    outside of the 7-bit mode.
+ 2. When fewer than minimum values to use 7-bit mode of 16, don't
+    accumulate high bit when reasonable. Main loop keeps this in because
+    time required is minimal.
+ 3. When fewer than 24 input values, but greater than or equal to minimum
+    values of 16 to use 7-bit mode, use 6% as minimum compression for
+    compression modes used prior to 7-bit mode.
+ */
+// Notes for version 1.1.3:
+/*
+ 1. Fixed bugs in td5 and td5d functions.
+ 2. Recognize random data starting at 16 input values.
+ */
+// Notes for version 1.1.4:
+/*
+ 1. Added bit text mode that uses variable length encoding bits
+    to maximize compression. td5 still uses the fixed bit text mode.
+ 2. Changed the random data metric to use number values init
+    loop * 7/8 + 1 to be threshold for random data.
+ 3. Implemented a static global for decoding bit text mode and
+    string mode to limit reads of input values.
+*/
+// Notes for version 1.1.5
+/*
+ 1. Added adaptive text mode that looks for occurrences of characters
+    that are common to a particular data type when fewer than 3/4 of
+    the input values are matched by a predefined character. Defined
+    XML and HTML based on '<', '>', '/' and '"'. Defined C or other
+    code files based on '*', '=', ';' and '\t'. Eight characters
+    common to the text type are defined in the last 8 characters of
+    the characters encoded.
+ 2. Added compression of high bit in unique characters in string mode
+    when the high bit is 0 for all values.
+ 3. Set the initial loop in td64 to 7/16 of input values for 24 or
+    more inputs. This provides a better result for adaptive text mode.
+ */
+// Notes for version 1.1.6
+/*
+ 1. Modified random data check and added a later check for random data.
+ 2. Added an early call to single value mode.
+ 3. Updated unused extended string mode to calculate high bit clear during processing and check for overflow as late as possible.
+*/
 #ifndef td64_h
 #define td64_h
 
 #include <stdint.h>
 #include <string.h>
 #include <stdlib.h>
-//#define NDEBUG // disable asserts
+#define NDEBUG // disable asserts
 #include <assert.h>
 
-#define TD64_VERSION "v2.1.0"
+#define TD64_VERSION "v1.1.6"
 #define MAX_TD64_BYTES 64  // max input vals supported
 #define MIN_TD64_BYTES 1  // min input vals supported
 #define MAX_UNIQUES 16 // max uniques supported in input