Skip to content

Commit

Permalink
Fixed bugs with CLI and encode prefix code creation ⚡
Browse files Browse the repository at this point in the history
  • Loading branch information
typio committed Mar 12, 2024
1 parent 5394be0 commit c0166ed
Show file tree
Hide file tree
Showing 8 changed files with 110 additions and 82 deletions.
18 changes: 7 additions & 11 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,23 +35,19 @@ jobs:
include:
- name: linux-x86_64
target: x86_64-linux
strip: strip=true
optimize: optimize=ReleaseSafe
optimize: optimize=ReleaseFast

- name: windows-x86_64
target: x86_64-windows
strip: strip=true
optimize: optimize=ReleaseSafe
optimize: optimize=ReleaseFast

- name: macos-aarch64
target: aarch64-macos
strip: strip=true
optimize: optimize=ReleaseSafe
optimize: optimize=ReleaseFast

- name: macos-x86
target: x86_64-macos
strip: strip=true
optimize: optimize=ReleaseSafe
optimize: optimize=ReleaseFast
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
Expand All @@ -61,9 +57,9 @@ jobs:
with:
version: master

- name: zig build -D${{ matrix.strip }} -D${{ matrix.optimize }} -Dtarget=${{ matrix.target }}
- name: zig build -D${{ matrix.optimize }} -Dtarget=${{ matrix.target }}
run: |
zig build -D${{ matrix.strip }} -D${{ matrix.optimize }} -Dtarget=${{ matrix.target }}
zig build -D${{ matrix.optimize }} -Dtarget=${{ matrix.target }}
- uses: actions/upload-artifact@v3
with:
Expand All @@ -81,4 +77,4 @@ jobs:
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
gh release create v0.5.1 -t "0.5.1" entreepy/entreepy*
gh release create v1.0.0 -t "1.0.0" entreepy/entreepy*
27 changes: 16 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,22 @@ entreepy<br/>
[![Actions Status](https://github.com/typio/entreepy/workflows/release/badge.svg)](https://github.com/typio/entreepy/actions)
====

> ⚡ Fast huffman coding text compression
> ⚡ Fast text compression tool
The name is from entropy coding + binary trees.
The name is entropy coding + binary trees.

### Usage

```
$ entreepy [options] [command] [file] [command options]
Entreepy - Text compression tool
Usage: entreepy [options] [command] [file] [command options]
Options:
-h, --help show help
-p, --print print decompressed text to stdout
-t, --test test/dry run, does not write to file
-d, --debug print huffman code dictionary and performance times
-d, --debug print huffman code dictionary and performance times to stdout
Commands:
c compress a file
Expand All @@ -26,10 +28,10 @@ Command Options:
Examples:
entreepy -d c text.txt -o text.txt.et
entreepy -ptd d text.txt.et
entreepy -ptd d text.txt.et -o decoded_text.txt
```

Input file must be < 1 terabyte. I recommend keeping an uncompressed backup or testing the program's decompression before deleting the original, the program hasn't been robustly tested. Be sure to use the same version of the program to decompress as compress.
Input file must be < 1 terabyte. Be sure to use the same version of the program to decompress as compress.

### Performance

Expand All @@ -41,20 +43,23 @@ I use a decode map which is keyed by the integer value of the code and stores a

By utilizing this decode map, decoding can be performed much more quickly than by traversing a binary tree.

#### Performance on MacBook Air M2, 8 GB RAM - v0.5.0
#### Performance on MacBook Air M2, 8 GB RAM - v1.0.0
| File | Original File Size | Compressed Size | Compression Time | Decompression Time |
| ---- | :----------------: | :-------------: | :--------------: | :----------------: |
| [Macbeth, Act V, Scene V](https://github.com/typio/entreepy/blob/main/res/nice.shakespeare.txt) | 477 bytes | 374 bytes | 240μs | 950μs |
| [A Midsummer Night's Dream](https://github.com/typio/entreepy/blob/main/res/a_midsummer_nights_dream.txt) | ~ 115 KB | ~ 66 KB | 2.2ms | 150ms |
| [The Complete Works of Shakespeare](https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt) | ~ 5.5 MB | ~ 3.2 MB | 0.1s | 7s |
| [Macbeth, Act V, Scene V](https://github.com/typio/entreepy/blob/main/res/nice.shakespeare.txt) | 477 bytes | 374 bytes | 600μs | 3.2ms |
| [A Midsummer Night's Dream](https://github.com/typio/entreepy/blob/main/res/a_midsummer_nights_dream.txt) | ~ 112 KB | ~ 68 KB | 6.7ms | 262ms |
| [The Complete Works of Shakespeare](https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt) | ~ 5.2 MB | ~ 3.0 MB | 111ms | 11.8s |

Next I'll add block-based parallel decoding. After that I'm interested in exploring additional compression techniques to support non-text file formats.

### Compressed File Format

Uses the `.et` file format, identified by the magic number `e7 c0 de`.

```bf
| magic number -> 3 bytes |
| (length of dictionary - 1) -> 1 byte |
| file format version -> 1 byte |
| length of dictionary - 1 -> 1 byte |
| length of body -> 4 bytes |

for n symbols
Expand Down
2 changes: 1 addition & 1 deletion build.zig.zon
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
.{
.name = "entreepy",

.version = "0.5.1",
.version = "1.0.0",

.dependencies = .{},

Expand Down
20 changes: 7 additions & 13 deletions src/decode.zig
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ pub const DecodeFlags = struct {
debug: bool = false,
};

// TODO: Add checks that error if the input isn't in valid .et file format (min length)

pub fn decode(allocator: Allocator, compressed_text: []const u8, out_writer: anytype, std_out: std.fs.File, flags: DecodeFlags) !usize {
var bytes_written: u32 = 0;
const start_time = std.time.microTimestamp();
Expand All @@ -20,19 +18,15 @@ pub fn decode(allocator: Allocator, compressed_text: []const u8, out_writer: any
var reading_dict_code_len: bool = false;
var reading_dict_code: bool = false;

const decode_dictionary_length: u8 = compressed_text[3] + 1;

std.debug.print("decode_dictionary_length: {}\n", .{decode_dictionary_length});
const decode_dictionary_length: u8 = compressed_text[0] + 1;

var decode_body_length: u32 = compressed_text[4];
var decode_body_length: u32 = compressed_text[1];
decode_body_length <<= 8;
decode_body_length |= compressed_text[5];
decode_body_length |= compressed_text[2];
decode_body_length <<= 8;
decode_body_length |= compressed_text[6];
decode_body_length |= compressed_text[3];
decode_body_length <<= 8;
decode_body_length |= compressed_text[7];

std.debug.print("decode body length: {}\n", .{decode_body_length});
decode_body_length |= compressed_text[4];

var longest_code: u8 = 0;
var shortest_code: usize = std.math.maxInt(usize);
Expand All @@ -53,7 +47,7 @@ pub fn decode(allocator: Allocator, compressed_text: []const u8, out_writer: any
var build_bits: usize = 0b0;
var i: usize = 0; // bit pos in current read
var letters_read: u8 = 0;
for (compressed_text[8..]) |byte| {
for (compressed_text[5..]) |byte| {
pos = 0;

read: while (true) {
Expand Down Expand Up @@ -136,7 +130,7 @@ pub fn decode(allocator: Allocator, compressed_text: []const u8, out_writer: any
var testing_code: usize = 0;
var decoded_letters_read: usize = 0;

for (compressed_text[8 + global_pos ..]) |byte| {
for (compressed_text[5 + global_pos ..]) |byte| {
window <<= 8;
window |= byte;
window_len += 8;
Expand Down
56 changes: 44 additions & 12 deletions src/encode.zig
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,20 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
// alphabetically, 0 occurence ascii chars at the end
var sorted_letter_book = [_]u8{0} ** 256;

// my naive custom sort, <256 passes, ~100 microseconds
// simple custom sort, <256 passes, ~100 microseconds
var book_index: u8 = 0;
var min_value: usize = 1;
var next_min_value: usize = 0;
while (next_min_value != std.math.maxInt(usize)) {
next_min_value = std.math.maxInt(usize);
for (occurences_book, 0..) |o, c| {
if (o < next_min_value and o > min_value) {
next_min_value = o;
for (occurences_book, 0..) |occurences, char_code| {
if (occurences < next_min_value and occurences > min_value) {
next_min_value = occurences;
}
// occurences is definitionally sorted in ASCII alphabetical order
// so ties (1+ c's with same o) will be resolved alphabetically
if (o == min_value) {
sorted_letter_book[book_index] = @intCast(c);
// so ties (different char_codes with same occurences) will be resolved alphabetically
if (occurences == min_value) {
sorted_letter_book[book_index] = @intCast(char_code);
if (book_index < 255) book_index += 1;
}
}
Expand Down Expand Up @@ -174,17 +174,46 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
}

if (traverser.node.right == null and traverser.node.left == null) {
if (flags.debug) try std_out.writer().print("{c} - ", .{traverser.node.symbol orelse 0});
if (flags.debug) try std_out.writer().print("{c} {} - ", .{ traverser.node.symbol orelse 0, traverser.node.symbol orelse 0 });
var j: u8 = traverser.path.length;
while (j > 0) : (j -= 1) {
if (flags.debug) try std_out.writer().print("{b}", .{traverser.path.data >>
@as(u4, @truncate(j - 1)) & 1});
@as(u5, @truncate(j - 1)) & 1});
}
if (flags.debug) try std_out.writer().print("\n", .{});
dictionary[traverser.node.symbol orelse unreachable] = traverser.path;
}
}

// debug check that there are no colliding prefixes
if (flags.debug) {
for (dictionary, 0..) |code_1, i| {
for (dictionary, 0..) |code_2, j| {
if (code_1.length == 0 or code_2.length == 0 or i == j) continue;

var isPrefix = true;
const shorter = @min(code_1.length, code_2.length);
var k: usize = 0;

while (k <= shorter) : (k += 1) {
const code_1_bit = (code_1.data >> @as(u5, @truncate(code_1.length - k))) & 1;
const code_2_bit = (code_2.data >> @as(u5, @truncate(code_2.length - k))) & 1;

if (code_1_bit != code_2_bit) {
isPrefix = false;
break;
}
}

if (isPrefix) {
const l_i = @as(u8, @truncate(i));
const l_j = @as(u8, @truncate(j));
try std_out.writer().print("Found colliding prefix codes for {} {c} and {} {c}", .{ l_i, l_i, l_j, l_j });
}
}
}
}

// estimate of header length when every unique char is used
const max_header_length: usize = 7200;
var out_buffer = try allocator.alloc(u8, max_header_length + text.len);
Expand All @@ -198,6 +227,10 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
try bit_stream_writer.writeBits(@as(u24, 0xe7c0de), 24);
bits_written += 24;

// write format version
try bit_stream_writer.writeBits(@as(u8, 0x01), 8);
bits_written += 8;

// write dictionary length
var dictionary_length: usize = 0; // dictionary length - 1
for (dictionary) |code| {
Expand All @@ -209,7 +242,6 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o

// write body length
try bit_stream_writer.writeBits(text.len, 32);
std.debug.print("text.len {}", .{text.len});
bits_written += 32;

// write dictionary
Expand All @@ -223,7 +255,7 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
bits_written += 8;
var j: usize = code.length;
while (j > 0) : (j -= 1) {
try bit_stream_writer.writeBits((code.data >> @as(u4, @truncate(j - 1))) & 1, 1);
try bit_stream_writer.writeBits((code.data >> @as(u5, @truncate(j - 1))) & 1, 1);
bits_written += 1;
}
}
Expand All @@ -236,7 +268,7 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
const code = dictionary[char];
var j: usize = code.length;
while (j > 0) : (j -= 1) {
try bit_stream_writer.writeBits((code.data >> @as(u4, @truncate(j - 1))) & 1, 1);
try bit_stream_writer.writeBits((code.data >> @as(u5, @truncate(j - 1))) & 1, 1);
bits_written += 1;
}
}
Expand Down
Loading

1 comment on commit c0166ed

@typio
Copy link
Owner Author

@typio typio commented on c0166ed Mar 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixes #4
Fixes #2

Please sign in to comment.