Skip to content

Commit

Permalink
Fixed bugs with CLI and encode prefix code creation ⚡
Browse files Browse the repository at this point in the history
  • Loading branch information
typio committed Mar 12, 2024
1 parent 5394be0 commit c0166ed
Show file tree
Hide file tree
Showing 8 changed files with 110 additions and 82 deletions.
18 changes: 7 additions & 11 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,23 +35,19 @@ jobs:
include:
- name: linux-x86_64
target: x86_64-linux
strip: strip=true
optimize: optimize=ReleaseSafe
optimize: optimize=ReleaseFast

- name: windows-x86_64
target: x86_64-windows
strip: strip=true
optimize: optimize=ReleaseSafe
optimize: optimize=ReleaseFast

- name: macos-aarch64
target: aarch64-macos
strip: strip=true
optimize: optimize=ReleaseSafe
optimize: optimize=ReleaseFast

- name: macos-x86
target: x86_64-macos
strip: strip=true
optimize: optimize=ReleaseSafe
optimize: optimize=ReleaseFast
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
Expand All @@ -61,9 +57,9 @@ jobs:
with:
version: master

- name: zig build -D${{ matrix.strip }} -D${{ matrix.optimize }} -Dtarget=${{ matrix.target }}
- name: zig build -D${{ matrix.optimize }} -Dtarget=${{ matrix.target }}
run: |
zig build -D${{ matrix.strip }} -D${{ matrix.optimize }} -Dtarget=${{ matrix.target }}
zig build -D${{ matrix.optimize }} -Dtarget=${{ matrix.target }}
- uses: actions/upload-artifact@v3
with:
Expand All @@ -81,4 +77,4 @@ jobs:
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
gh release create v0.5.1 -t "0.5.1" entreepy/entreepy*
gh release create v1.0.0 -t "1.0.0" entreepy/entreepy*
27 changes: 16 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,22 @@ entreepy<br/>
[![Actions Status](https://github.com/typio/entreepy/workflows/release/badge.svg)](https://github.com/typio/entreepy/actions)
====

> ⚡ Fast huffman coding text compression
> ⚡ Fast text compression tool
The name is from entropy coding + binary trees.
The name is entropy coding + binary trees.

### Usage

```
$ entreepy [options] [command] [file] [command options]
Entreepy - Text compression tool
Usage: entreepy [options] [command] [file] [command options]
Options:
-h, --help show help
-p, --print print decompressed text to stdout
-t, --test test/dry run, does not write to file
-d, --debug print huffman code dictionary and performance times
-d, --debug print huffman code dictionary and performance times to stdout
Commands:
c compress a file
Expand All @@ -26,10 +28,10 @@ Command Options:
Examples:
entreepy -d c text.txt -o text.txt.et
entreepy -ptd d text.txt.et
entreepy -ptd d text.txt.et -o decoded_text.txt
```

Input file must be < 1 terabyte. I recommend keeping an uncompressed backup or testing the program's decompression before deleting the original, the program hasn't been robustly tested. Be sure to use the same version of the program to decompress as compress.
Input file must be < 1 terabyte. Be sure to use the same version of the program to decompress as compress.

### Performance

Expand All @@ -41,20 +43,23 @@ I use a decode map which is keyed by the integer value of the code and stores a

By utilizing this decode map, decoding can be performed much more quickly than by traversing a binary tree.

#### Performance on MacBook Air M2, 8 GB RAM - v0.5.0
#### Performance on MacBook Air M2, 8 GB RAM - v1.0.0
| File | Original File Size | Compressed Size | Compression Time | Decompression Time |
| ---- | :----------------: | :-------------: | :--------------: | :----------------: |
| [Macbeth, Act V, Scene V](https://github.com/typio/entreepy/blob/main/res/nice.shakespeare.txt) | 477 bytes | 374 bytes | 240μs | 950μs |
| [A Midsummer Night's Dream](https://github.com/typio/entreepy/blob/main/res/a_midsummer_nights_dream.txt) | ~ 115 KB | ~ 66 KB | 2.2ms | 150ms |
| [The Complete Works of Shakespeare](https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt) | ~ 5.5 MB | ~ 3.2 MB | 0.1s | 7s |
| [Macbeth, Act V, Scene V](https://github.com/typio/entreepy/blob/main/res/nice.shakespeare.txt) | 477 bytes | 374 bytes | 600μs | 3.2ms |
| [A Midsummer Night's Dream](https://github.com/typio/entreepy/blob/main/res/a_midsummer_nights_dream.txt) | ~ 112 KB | ~ 68 KB | 6.7ms | 262ms |
| [The Complete Works of Shakespeare](https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt) | ~ 5.2 MB | ~ 3.0 MB | 111ms | 11.8s |

Next I'll add block-based parallel decoding. After that I'm interested in exploring additional compression techniques to support non-text file formats.

### Compressed File Format

Uses the `.et` file format, identified by the magic number `e7 c0 de`.

```bf
| magic number -> 3 bytes |
| (length of dictionary - 1) -> 1 byte |
| file format version -> 1 byte |
| length of dictionary - 1 -> 1 byte |
| length of body -> 4 bytes |

for n symbols
Expand Down
2 changes: 1 addition & 1 deletion build.zig.zon
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
.{
.name = "entreepy",

.version = "0.5.1",
.version = "1.0.0",

.dependencies = .{},

Expand Down
20 changes: 7 additions & 13 deletions src/decode.zig
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ pub const DecodeFlags = struct {
debug: bool = false,
};

// TODO: Add checks that error if the input isn't in valid .et file format (min length)

pub fn decode(allocator: Allocator, compressed_text: []const u8, out_writer: anytype, std_out: std.fs.File, flags: DecodeFlags) !usize {
var bytes_written: u32 = 0;
const start_time = std.time.microTimestamp();
Expand All @@ -20,19 +18,15 @@ pub fn decode(allocator: Allocator, compressed_text: []const u8, out_writer: any
var reading_dict_code_len: bool = false;
var reading_dict_code: bool = false;

const decode_dictionary_length: u8 = compressed_text[3] + 1;

std.debug.print("decode_dictionary_length: {}\n", .{decode_dictionary_length});
const decode_dictionary_length: u8 = compressed_text[0] + 1;

var decode_body_length: u32 = compressed_text[4];
var decode_body_length: u32 = compressed_text[1];
decode_body_length <<= 8;
decode_body_length |= compressed_text[5];
decode_body_length |= compressed_text[2];
decode_body_length <<= 8;
decode_body_length |= compressed_text[6];
decode_body_length |= compressed_text[3];
decode_body_length <<= 8;
decode_body_length |= compressed_text[7];

std.debug.print("decode body length: {}\n", .{decode_body_length});
decode_body_length |= compressed_text[4];

var longest_code: u8 = 0;
var shortest_code: usize = std.math.maxInt(usize);
Expand All @@ -53,7 +47,7 @@ pub fn decode(allocator: Allocator, compressed_text: []const u8, out_writer: any
var build_bits: usize = 0b0;
var i: usize = 0; // bit pos in current read
var letters_read: u8 = 0;
for (compressed_text[8..]) |byte| {
for (compressed_text[5..]) |byte| {
pos = 0;

read: while (true) {
Expand Down Expand Up @@ -136,7 +130,7 @@ pub fn decode(allocator: Allocator, compressed_text: []const u8, out_writer: any
var testing_code: usize = 0;
var decoded_letters_read: usize = 0;

for (compressed_text[8 + global_pos ..]) |byte| {
for (compressed_text[5 + global_pos ..]) |byte| {
window <<= 8;
window |= byte;
window_len += 8;
Expand Down
56 changes: 44 additions & 12 deletions src/encode.zig
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,20 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
// alphabetically, 0 occurence ascii chars at the end
var sorted_letter_book = [_]u8{0} ** 256;

// my naive custom sort, <256 passes, ~100 microseconds
// simple custom sort, <256 passes, ~100 microseconds
var book_index: u8 = 0;
var min_value: usize = 1;
var next_min_value: usize = 0;
while (next_min_value != std.math.maxInt(usize)) {
next_min_value = std.math.maxInt(usize);
for (occurences_book, 0..) |o, c| {
if (o < next_min_value and o > min_value) {
next_min_value = o;
for (occurences_book, 0..) |occurences, char_code| {
if (occurences < next_min_value and occurences > min_value) {
next_min_value = occurences;
}
// occurences is definitionally sorted in ASCII alphabetical order
// so ties (1+ c's with same o) will be resolved alphabetically
if (o == min_value) {
sorted_letter_book[book_index] = @intCast(c);
// so ties (different char_codes with same occurences) will be resolved alphabetically
if (occurences == min_value) {
sorted_letter_book[book_index] = @intCast(char_code);
if (book_index < 255) book_index += 1;
}
}
Expand Down Expand Up @@ -174,17 +174,46 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
}

if (traverser.node.right == null and traverser.node.left == null) {
if (flags.debug) try std_out.writer().print("{c} - ", .{traverser.node.symbol orelse 0});
if (flags.debug) try std_out.writer().print("{c} {} - ", .{ traverser.node.symbol orelse 0, traverser.node.symbol orelse 0 });
var j: u8 = traverser.path.length;
while (j > 0) : (j -= 1) {
if (flags.debug) try std_out.writer().print("{b}", .{traverser.path.data >>
@as(u4, @truncate(j - 1)) & 1});
@as(u5, @truncate(j - 1)) & 1});
}
if (flags.debug) try std_out.writer().print("\n", .{});
dictionary[traverser.node.symbol orelse unreachable] = traverser.path;
}
}

// debug check that there are no colliding prefixes
if (flags.debug) {
for (dictionary, 0..) |code_1, i| {
for (dictionary, 0..) |code_2, j| {
if (code_1.length == 0 or code_2.length == 0 or i == j) continue;

var isPrefix = true;
const shorter = @min(code_1.length, code_2.length);
var k: usize = 0;

while (k <= shorter) : (k += 1) {
const code_1_bit = (code_1.data >> @as(u5, @truncate(code_1.length - k))) & 1;
const code_2_bit = (code_2.data >> @as(u5, @truncate(code_2.length - k))) & 1;

if (code_1_bit != code_2_bit) {
isPrefix = false;
break;
}
}

if (isPrefix) {
const l_i = @as(u8, @truncate(i));
const l_j = @as(u8, @truncate(j));
try std_out.writer().print("Found colliding prefix codes for {} {c} and {} {c}", .{ l_i, l_i, l_j, l_j });
}
}
}
}

// estimate of header length when every unique char is used
const max_header_length: usize = 7200;
var out_buffer = try allocator.alloc(u8, max_header_length + text.len);
Expand All @@ -198,6 +227,10 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
try bit_stream_writer.writeBits(@as(u24, 0xe7c0de), 24);
bits_written += 24;

// write format version
try bit_stream_writer.writeBits(@as(u8, 0x01), 8);
bits_written += 8;

// write dictionary length
var dictionary_length: usize = 0; // dictionary length - 1
for (dictionary) |code| {
Expand All @@ -209,7 +242,6 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o

// write body length
try bit_stream_writer.writeBits(text.len, 32);
std.debug.print("text.len {}", .{text.len});
bits_written += 32;

// write dictionary
Expand All @@ -223,7 +255,7 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
bits_written += 8;
var j: usize = code.length;
while (j > 0) : (j -= 1) {
try bit_stream_writer.writeBits((code.data >> @as(u4, @truncate(j - 1))) & 1, 1);
try bit_stream_writer.writeBits((code.data >> @as(u5, @truncate(j - 1))) & 1, 1);
bits_written += 1;
}
}
Expand All @@ -236,7 +268,7 @@ pub fn encode(allocator: Allocator, text: []const u8, out_writer: anytype, std_o
const code = dictionary[char];
var j: usize = code.length;
while (j > 0) : (j -= 1) {
try bit_stream_writer.writeBits((code.data >> @as(u4, @truncate(j - 1))) & 1, 1);
try bit_stream_writer.writeBits((code.data >> @as(u5, @truncate(j - 1))) & 1, 1);
bits_written += 1;
}
}
Expand Down
Loading

1 comment on commit c0166ed

@typio
Copy link
Owner Author

@typio typio commented on c0166ed Mar 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixes #4
Fixes #2

Please sign in to comment.