Skip to content

Commit c0166ed

Browse files
committed
Fixed bugs with CLI and encode prefix code creation ⚡
1 parent 5394be0 commit c0166ed

File tree

8 files changed

+110
-82
lines changed

8 files changed

+110
-82
lines changed

.github/workflows/release.yml

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,23 +35,19 @@ jobs:
3535
include:
3636
- name: linux-x86_64
3737
target: x86_64-linux
38-
strip: strip=true
39-
optimize: optimize=ReleaseSafe
38+
optimize: optimize=ReleaseFast
4039

4140
- name: windows-x86_64
4241
target: x86_64-windows
43-
strip: strip=true
44-
optimize: optimize=ReleaseSafe
42+
optimize: optimize=ReleaseFast
4543

4644
- name: macos-aarch64
4745
target: aarch64-macos
48-
strip: strip=true
49-
optimize: optimize=ReleaseSafe
46+
optimize: optimize=ReleaseFast
5047

5148
- name: macos-x86
5249
target: x86_64-macos
53-
strip: strip=true
54-
optimize: optimize=ReleaseSafe
50+
optimize: optimize=ReleaseFast
5551
runs-on: ubuntu-latest
5652
steps:
5753
- uses: actions/checkout@v3
@@ -61,9 +57,9 @@ jobs:
6157
with:
6258
version: master
6359

64-
- name: zig build -D${{ matrix.strip }} -D${{ matrix.optimize }} -Dtarget=${{ matrix.target }}
60+
- name: zig build -D${{ matrix.optimize }} -Dtarget=${{ matrix.target }}
6561
run: |
66-
zig build -D${{ matrix.strip }} -D${{ matrix.optimize }} -Dtarget=${{ matrix.target }}
62+
zig build -D${{ matrix.optimize }} -Dtarget=${{ matrix.target }}
6763
6864
- uses: actions/upload-artifact@v3
6965
with:
@@ -81,4 +77,4 @@ jobs:
8177
env:
8278
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
8379
run: |
84-
gh release create v0.5.1 -t "0.5.1" entreepy/entreepy*
80+
gh release create v1.0.0 -t "1.0.0" entreepy/entreepy*

README.md

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,22 @@ entreepy<br/>
22
[![Actions Status](https://github.com/typio/entreepy/workflows/release/badge.svg)](https://github.com/typio/entreepy/actions)
33
====
44

5-
> ⚡ Fast huffman coding text compression
5+
> ⚡ Fast text compression tool
66
7-
The name is from entropy coding + binary trees.
7+
The name is a blend of entropy coding + binary trees.
88

99
### Usage
1010

1111
```
12-
$ entreepy [options] [command] [file] [command options]
12+
Entreepy - Text compression tool
13+
14+
Usage: entreepy [options] [command] [file] [command options]
1315
1416
Options:
1517
-h, --help show help
1618
-p, --print print decompressed text to stdout
1719
-t, --test test/dry run, does not write to file
18-
-d, --debug print huffman code dictionary and performance times
20+
-d, --debug print huffman code dictionary and performance times to stdout
1921
2022
Commands:
2123
c compress a file
@@ -26,10 +28,10 @@ Command Options:
2628
2729
Examples:
2830
entreepy -d c text.txt -o text.txt.et
29-
entreepy -ptd d text.txt.et
31+
entreepy -ptd d text.txt.et -o decoded_text.txt
3032
```
3133

32-
Input file must be < 1 terabyte. I recommend keeping an uncompressed backup or testing the program's decompression before deleting the original, the program hasn't been robustly tested. Be sure to use the same version of the program to decompress as compress.
34+
Input file must be < 1 terabyte. Be sure to decompress with the same version of the program that was used to compress.
3335

3436
### Performance
3537

@@ -41,20 +43,23 @@ I use a decode map which is keyed by the integer value of the code and stores a
4143

4244
By utilizing this decode map, decoding can be performed much more quickly than by traversing a binary tree.
4345

44-
#### Performance on MacBook Air M2, 8 GB RAM - v0.5.0
46+
#### Performance on MacBook Air M2, 8 GB RAM - v1.0.0
4547
| File | Original File Size | Compressed Size | Compression Time | Decompression Time |
4648
| ---- | :----------------: | :-------------: | :--------------: | :----------------: |
47-
| [Macbeth, Act V, Scene V](https://github.com/typio/entreepy/blob/main/res/nice.shakespeare.txt) | 477 bytes | 374 bytes | 240μs | 950μs |
48-
| [A Midsummer Night's Dream](https://github.com/typio/entreepy/blob/main/res/a_midsummer_nights_dream.txt) | ~ 115 KB | ~ 66 KB | 2.2ms | 150ms |
49-
| [The Complete Works of Shakespeare](https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt) | ~ 5.5 MB | ~ 3.2 MB | 0.1s | 7s |
49+
| [Macbeth, Act V, Scene V](https://github.com/typio/entreepy/blob/main/res/nice.shakespeare.txt) | 477 bytes | 374 bytes | 600μs | 3.2ms |
50+
| [A Midsummer Night's Dream](https://github.com/typio/entreepy/blob/main/res/a_midsummer_nights_dream.txt) | ~ 112 KB | ~ 68 KB | 6.7ms | 262ms |
51+
| [The Complete Works of Shakespeare](https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt) | ~ 5.2 MB | ~ 3.0 MB | 111ms | 11.8s |
52+
53+
Next I'll add block-based parallel decoding. After that, I'm interested in exploring additional compression techniques to support non-text file formats.
5054

5155
### Compressed File Format
5256

5357
Uses the `.et` file format, identified by the magic number `e7 c0 de`.
5458

5559
```bf
5660
| magic number -> 3 bytes |
57-
| (length of dictionary - 1) -> 1 byte |
61+
| file format version -> 1 byte |
62+
| length of dictionary - 1 -> 1 byte |
5863
| length of body -> 4 bytes |
5964

6065
for n symbols

build.zig.zon

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
.{
22
.name = "entreepy",
33

4-
.version = "0.5.1",
4+
.version = "1.0.0",
55

66
.dependencies = .{},
77

src/decode.zig

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ pub const DecodeFlags = struct {
88
debug: bool = false,
99
};
1010

11-
// TODO: Add checks for to error if it isnt in valid .et file format (min length)
12-
1311
pub fn decode(allocator: Allocator, compressed_text: []const u8, out_writer: anytype, std_out: std.fs.File, flags: DecodeFlags) !usize {
1412
var bytes_written: u32 = 0;
1513
const start_time = std.time.microTimestamp();
@@ -20,19 +18,15 @@ pub fn decode(allocator: Allocator, compressed_text: []const u8, out_writer: any
2018
var reading_dict_code_len: bool = false;
2119
var reading_dict_code: bool = false;
2220

23-
const decode_dictionary_length: u8 = compressed_text[3] + 1;
24-
25-
std.debug.print("decode_dictionary_length: {}\n", .{decode_dictionary_length});
21+
const decode_dictionary_length: u8 = compressed_text[0] + 1;
2622

27-
var decode_body_length: u32 = compressed_text[4];
23+
var decode_body_length: u32 = compressed_text[1];
2824
decode_body_length <<= 8;
29-
decode_body_length |= compressed_text[5];
25+
decode_body_length |= compressed_text[2];
3026
decode_body_length <<= 8;
31-
decode_body_length |= compressed_text[6];
27+
decode_body_length |= compressed_text[3];
3228
decode_body_length <<= 8;
33-
decode_body_length |= compressed_text[7];
34-
35-
std.debug.print("decode body length: {}\n", .{decode_body_length});
29+
decode_body_length |= compressed_text[4];
3630

3731
var longest_code: u8 = 0;
3832
var shortest_code: usize = std.math.maxInt(usize);
@@ -53,7 +47,7 @@ pub fn decode(allocator: Allocator, compressed_text: []const u8, out_writer: any
5347
var build_bits: usize = 0b0;
5448
var i: usize = 0; // bit pos in current read
5549
var letters_read: u8 = 0;
56-
for (compressed_text[8..]) |byte| {
50+
for (compressed_text[5..]) |byte| {
5751
pos = 0;
5852

5953
read: while (true) {
@@ -136,7 +130,7 @@ pub fn decode(allocator: Allocator, compressed_text: []const u8, out_writer: any
136130
var testing_code: usize = 0;
137131
var decoded_letters_read: usize = 0;
138132

139-
for (compressed_text[8 + global_pos ..]) |byte| {
133+
for (compressed_text[5 + global_pos ..]) |byte| {
140134
window <<= 8;
141135
window |= byte;
142136
window_len += 8;

src/encode.zig

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,20 +36,20 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
3636
// alphabetically, 0 occurence ascii chars at the end
3737
var sorted_letter_book = [_]u8{0} ** 256;
3838

39-
// my naive custom sort, <256 passes, ~100 microseconds
39+
// simple custom sort, <256 passes, ~100 microseconds
4040
var book_index: u8 = 0;
4141
var min_value: usize = 1;
4242
var next_min_value: usize = 0;
4343
while (next_min_value != std.math.maxInt(usize)) {
4444
next_min_value = std.math.maxInt(usize);
45-
for (occurences_book, 0..) |o, c| {
46-
if (o < next_min_value and o > min_value) {
47-
next_min_value = o;
45+
for (occurences_book, 0..) |occurences, char_code| {
46+
if (occurences < next_min_value and occurences > min_value) {
47+
next_min_value = occurences;
4848
}
4949
// occurences is definitionally sorted in ASCII alphabetical order
50-
// so ties (1+ c's with same o) with be resolved alphabetically
51-
if (o == min_value) {
52-
sorted_letter_book[book_index] = @intCast(c);
50+
// so ties (different char_codes with same occurences) will be resolved alphabetically
51+
if (occurences == min_value) {
52+
sorted_letter_book[book_index] = @intCast(char_code);
5353
if (book_index < 255) book_index += 1;
5454
}
5555
}
@@ -174,17 +174,46 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
174174
}
175175

176176
if (traverser.node.right == null and traverser.node.left == null) {
177-
if (flags.debug) try std_out.writer().print("{c} - ", .{traverser.node.symbol orelse 0});
177+
if (flags.debug) try std_out.writer().print("{c} {} - ", .{ traverser.node.symbol orelse 0, traverser.node.symbol orelse 0 });
178178
var j: u8 = traverser.path.length;
179179
while (j > 0) : (j -= 1) {
180180
if (flags.debug) try std_out.writer().print("{b}", .{traverser.path.data >>
181-
@as(u4, @truncate(j - 1)) & 1});
181+
@as(u5, @truncate(j - 1)) & 1});
182182
}
183183
if (flags.debug) try std_out.writer().print("\n", .{});
184184
dictionary[traverser.node.symbol orelse unreachable] = traverser.path;
185185
}
186186
}
187187

188+
// debug check that there are no colliding prefixes
189+
if (flags.debug) {
190+
for (dictionary, 0..) |code_1, i| {
191+
for (dictionary, 0..) |code_2, j| {
192+
if (code_1.length == 0 or code_2.length == 0 or i == j) continue;
193+
194+
var isPrefix = true;
195+
const shorter = @min(code_1.length, code_2.length);
196+
var k: usize = 0;
197+
198+
while (k <= shorter) : (k += 1) {
199+
const code_1_bit = (code_1.data >> @as(u5, @truncate(code_1.length - k))) & 1;
200+
const code_2_bit = (code_2.data >> @as(u5, @truncate(code_2.length - k))) & 1;
201+
202+
if (code_1_bit != code_2_bit) {
203+
isPrefix = false;
204+
break;
205+
}
206+
}
207+
208+
if (isPrefix) {
209+
const l_i = @as(u8, @truncate(i));
210+
const l_j = @as(u8, @truncate(j));
211+
try std_out.writer().print("Found colliding prefix codes for {} {c} and {} {c}", .{ l_i, l_i, l_j, l_j });
212+
}
213+
}
214+
}
215+
}
216+
188217
// estimate of header length when every unique char is used
189218
const max_header_length: usize = 7200;
190219
var out_buffer = try allocator.alloc(u8, max_header_length + text.len);
@@ -198,6 +227,10 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
198227
try bit_stream_writer.writeBits(@as(u24, 0xe7c0de), 24);
199228
bits_written += 24;
200229

230+
// write format version
231+
try bit_stream_writer.writeBits(@as(u8, 0x01), 8);
232+
bits_written += 8;
233+
201234
// write dictionary length
202235
var dictionary_length: usize = 0; // dictionary length - 1
203236
for (dictionary) |code| {
@@ -209,7 +242,6 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
209242

210243
// write body length
211244
try bit_stream_writer.writeBits(text.len, 32);
212-
std.debug.print("text.len {}", .{text.len});
213245
bits_written += 32;
214246

215247
// write dictionary
@@ -223,7 +255,7 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
223255
bits_written += 8;
224256
var j: usize = code.length;
225257
while (j > 0) : (j -= 1) {
226-
try bit_stream_writer.writeBits((code.data >> @as(u4, @truncate(j - 1))) & 1, 1);
258+
try bit_stream_writer.writeBits((code.data >> @as(u5, @truncate(j - 1))) & 1, 1);
227259
bits_written += 1;
228260
}
229261
}
@@ -236,7 +268,7 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
236268
const code = dictionary[char];
237269
var j: usize = code.length;
238270
while (j > 0) : (j -= 1) {
239-
try bit_stream_writer.writeBits((code.data >> @as(u4, @truncate(j - 1))) & 1, 1);
271+
try bit_stream_writer.writeBits((code.data >> @as(u5, @truncate(j - 1))) & 1, 1);
240272
bits_written += 1;
241273
}
242274
}

0 commit comments

Comments
 (0)