Skip to content

Commit dc1491f

Browse files
committed
Fix/Improve handling of codeword issues
1 parent 401e7d3 commit dc1491f

File tree

2 files changed

+41
-5
lines changed

2 files changed

+41
-5
lines changed

dahuffman/huffmancodec.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import collections
22
import itertools
3+
import math
34
from io import IOBase
45
import sys
56
from heapq import heappush, heappop, heapify
@@ -65,20 +66,25 @@ def ensure_dir(path: Union[str, Path]) -> Path:
6566

6667
class CodeTable:
    """
    Code table: mapping a symbol to codewords (and vice versa).

    The symbols are the things you want to encode, usually characters in a string
    or byte sequence, but it can be anything hashable.
    The codewords are the corresponding bit sequences, represented as a tuple (bits, value)
    where `bits` is the number of bits and `value` the integer interpretation of these bits.
    """
    # TODO: use something like namedtuple or class with slots for codewords instead of tuples?

    def __init__(self, symbol_code_map: dict):
        """
        Build the symbol -> codeword and codeword -> symbol lookup tables.

        :param symbol_code_map: mapping of symbol to a `(bits, value)` codeword tuple
        :raises ValueError: if `bits` is not an integer >= 1, if `value` is not
            an integer >= 0, or if `value` does not fit in `bits` bits
        """
        self._symbol_map = {}
        self._code_map = {}
        for symbol, (bits, value) in symbol_code_map.items():
            if not (
                isinstance(bits, int) and bits >= 1
                and isinstance(value, int) and value >= 0
                # Exact integer test for "value fits in `bits` bits".
                # A floating point log2 rounds up for wide codewords
                # (e.g. 2**60 - 1 becomes 2.0**60) and would reject valid codes.
                and value.bit_length() <= bits
            ):
                raise ValueError("Invalid code: {b} bits, value {v}".format(b=bits, v=value))
            self._symbol_map[symbol] = (bits, value)
            self._code_map[(bits, value)] = symbol
        # TODO check if code table is actually a prefix code

tests/test_dahuffman.py

+31-1
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,47 @@
11
# coding=utf-8
22
import io
33
import re
4+
import textwrap
45
from io import StringIO
56
from pathlib import Path
67

78
import pytest
89

910
from dahuffman import HuffmanCodec
10-
from dahuffman.huffmancodec import PrefixCodec, _EOF
11+
from dahuffman.huffmancodec import PrefixCodec, _EOF, CodeTable
1112

1213

1314
# TODO test streaming
1415

16+
class TestCodeTable:
    """Tests for constructing, validating and printing a `CodeTable`."""

    def test_basic(self):
        """Lookups work in both directions and the table reports its size."""
        codes = {"a": (1, 0), "b": (1, 1)}
        code_table = CodeTable(codes)
        # Symbol -> codeword direction.
        for symbol, codeword in codes.items():
            assert code_table.get_code(symbol) == codeword
        # Codeword -> symbol direction.
        for symbol, (bits, value) in codes.items():
            assert code_table.get_symbol(bits, value) == symbol
        assert len(code_table) == 2

    @pytest.mark.parametrize("codes", [
        {"a": (0, 0), "b": (1, 1)},
        {"a": (1, 0), "b": (1, -1)},
        {"a": (1, 2), "b": (1, 1)},
    ])
    def test_invalid(self, codes):
        """Each parametrized mapping must make the constructor raise ValueError."""
        with pytest.raises(ValueError):
            CodeTable(codes)

    def test_print(self):
        """`print` writes one header row followed by one row per symbol."""
        # NOTE(review): the expected text is copied as it appears in this
        # rendering; column padding may have been collapsed -- confirm against
        # the repository before relying on the exact spacing.
        expected = textwrap.dedent("""\
            Bits Code Value Symbol
            2 00 0 'a'
            3 111 7 'b'
        """)
        code_table = CodeTable({"a": (2, 0), "b": (3, 7)})
        buffer = StringIO()
        code_table.print(buffer)
        assert buffer.getvalue() == expected

1545

1646
def test_prefix_codec():
1747
code_table = {'A': (2, 0), 'B': (2, 1), _EOF: (2, 3)}

0 commit comments

Comments
 (0)