-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathprintable_binary.exs
207 lines (175 loc) · 9.78 KB
/
printable_binary.exs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
### Design Goals ###
# The point of this code is to come up with a good visual representation/encoding of binary data that can be represented simply in a UTF-8 string;
# that is, the design priorities are:
# 1) visually distinct
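# 2) printable ASCII (33-126) maps to itself wherever possible, so embedded text is instantly recognizable; the only exceptions are the double quote and the backslash, which get look-alike substitutes so they never need escaping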
# 3) each representation is only the width of 1 character (although internally may be multibyte) so that printouts are as long as the
# underlying byte length, but no longer; this made solutions like \n and whatnot infeasible
# 4) for control characters and nonprintable characters and values from 128-255, use a representation that is ideally only 2 bytes, but if no workable
# substitute exists in the 2 byte Unicode character set, 3 bytes is "OK"
# 5) Easily copyable, pastable, and printable (possibly even OCR'able)
# 6) Less memory-consuming and more immediately useful (especially visually) than hexadecimal encoding
# 7) Ideally useful in debugging/testing environments but can also be used as a general binary representation in code (to be decoded right before use of course)
# 8) avoid use of emoji due to 4-byte requirement and "visually jarring"; so for example, this is not useful https://ayende.com/blog/177729/emoji-encoding-a-new-style-for-binary-encoding-for-the-web
defmodule PrintableBinary do
  @moduledoc """
  Renders arbitrary bytes as a string of single-width, visually distinct UTF-8
  characters, and converts such strings back into the original bytes.

  Every byte value 0..255 has exactly one symbol. Printable ASCII (33..126) is kept
  as-is, except for the double quote and the backslash, which get look-alike
  substitutes so that encoded text never needs escaping inside a string literal.
  """

  @doc """
  Encodes a byte, a list of bytes, or a binary into its printable representation.

    * an integer in 0..255 returns the symbol for that single byte
    * a list of integers in 0..255 is packed into a binary and encoded
    * a binary is encoded byte by byte

  Raises `RuntimeError` for integers greater than 255.
  """
  @spec encode(0..255 | [0..255] | binary()) :: String.t()
  def encode(0), do: "∅" # \0 or possibly ␀ but the spacing on that gets weird; see https://en.wikipedia.org/wiki/C0_and_C1_control_codes
  def encode(1), do: "¯" # Start of Heading
  def encode(2), do: "«" # Start of Text
  def encode(3), do: "»" # End of Text
  def encode(4), do: "ϟ" # control-D, or "end of transmission" signal. 3bytes: ⌁, 2bytes: ϟ
  def encode(5), do: "¿" # Enquiry (ENQ)
  def encode(6), do: "¡" # Acknowledge (ACK)
  def encode(7), do: "ª" # \a (bell). 2bytes: ª. 4 bytes: 🔔
  def encode(8), do: "⌫" # \b (backspace). 3bytes: ⌫
  def encode(9), do: "⇥" # \t (tab). 3 bytes: ⇥
  def encode(10), do: "⇩" # \n (newline or line feed). 3bytes: ⇩
  def encode(11), do: "↧" # \v (vertical tab). 3bytes: ↧
  def encode(12), do: "§" # \f 2bytes: § (could also use ↡ for form feed/page break, but that takes 3 bytes and is less visually distinct)
  def encode(13), do: "⏎" # \r (carriage return). 3bytes: ⏎
  def encode(14), do: "ȯ" # Shift Out
  def encode(15), do: "ʘ" # Shift Back In
  def encode(16), do: "Ɣ" # Data Link Escape
  def encode(17), do: "¹" # (XON) Device Control 1
  def encode(18), do: "²" # Device Control 2
  def encode(19), do: "º" # (XOFF) Device Control 3
  def encode(20), do: "³" # Device Control 4 (used 3 since I wanted 0 for XOFF and using ⁴ is 3 bytes)
  def encode(21), do: "Ͷ" # Negative Acknowledge (NAK)
  def encode(22), do: "ɨ" # Synchronous Idle
  def encode(23), do: "¬" # End of Transmission Block
  def encode(24), do: "©" # Cancel (cancel previous input)
  def encode(25), do: "¦" # End of Medium
  # control-Z, SIGTSTP/stop/suspend, "soft EOF", possibly consider 🛑 or ⏸? all 4 bytes tho
  # The symbol is "Ƶ" followed by the invisible variation selector U+FE0F. The selector is written
  # as an escape so editors and copy/paste cannot silently drop it; decode/1 skips it again.
  def encode(26), do: "Ƶ\uFE0F"
  def encode(27), do: "⎋" # \e (escape). 3 bytes: ⎋.
  def encode(28), do: "Ξ" # File Separator
  def encode(29), do: "ǁ" # Group Separator
  def encode(30), do: "ǀ" # Record Separator
  def encode(31), do: "¶" # Unit Separator

  # fallthrough to the 3 byte control character symbolic representations (hard to read/indistinct)
  # when no clear 2-byte substitute provided.
  # Note: this clause isn't currently reachable because all the values are accounted for above,
  # but you can comment those out if you want this type of representation instead.
  def encode(n) when is_integer(n) and n > -1 and n < 32 do
    <<(n + 9216)::utf8>>
  end

  # space. make it visible with a ␣. also prevents unwanted line breaks on actual spaces.
  def encode(32), do: "␣" # \s ?

  # encode double quote to utf8 757 so that it doesn't need to get escaped.
  # I wanted something that looked enough like a double quote to be recognizable
  # but different enough to not be mistaken for the normal " character.
  def encode(34), do: "˵"

  # encode backslash to Ʌ so that it doesn't need to get escaped
  def encode(92), do: "Ʌ"

  # remaining printable ASCII represents itself
  def encode(n) when is_integer(n) and n > 32 and n < 127 do
    <<n>>
  end

  # forward delete
  def encode(127), do: "⌦"

  # move encodings of Ø and ø (both 2 bytes) so that ∅ (3 bytes) is visually distinct for NULL
  def encode(152), do: "Ō"
  def encode(184), do: "ŏ"

  # 128..191 -> U+00C0..U+00FF (lead byte 195 followed by the byte itself)
  def encode(n) when is_integer(n) and n > 127 and n < 192 do
    <<195, n>>
  end

  # 192..255 -> U+0100..U+013F (lead byte 196 followed by a continuation byte)
  def encode(n) when is_integer(n) and n > 191 and n < 256 do
    <<196, (n - 192) + 0x80>>
  end

  def encode(n) when is_integer(n) and n > 255 do
    raise "Invalid byte to encode: #{n}"
  end

  def encode(n) when is_list(n) do
    encode(codepoint_list_to_binary(n))
  end

  # Walk the binary one byte at a time and append each symbol to the result; this avoids
  # re-copying the already encoded tail on every step.
  def encode(bytes) when is_binary(bytes) do
    for <<byte <- bytes>>, into: "", do: encode(byte)
  end

  @doc """
  Returns the bytes of `bin` as a list of integers in 0..255.
  """
  @spec to_bytelist(binary()) :: [0..255]
  def to_bytelist(bin) when is_binary(bin), do: :binary.bin_to_list(bin)

  @doc """
  Packs a list of integers in 0..255 into a binary, one byte per element.

  Fails with `FunctionClauseError` when an element is outside that range.
  """
  @spec codepoint_list_to_binary([0..255]) :: binary()
  def codepoint_list_to_binary(list) when is_list(list) do
    do_codepoint_list_to_binary("", list)
  end

  defp do_codepoint_list_to_binary(bin, []), do: bin

  defp do_codepoint_list_to_binary(bin, [cp | lst]) when cp > -1 and cp < 256 do
    do_codepoint_list_to_binary(<<bin::binary, cp>>, lst)
  end

  @doc """
  Decodes a string produced by `encode/1` back into the original bytes.

  Given a list, decodes every element and returns the list of results.
  The variation selector U+FE0F is ignored wherever it appears.

  Raises `RuntimeError` when a character is not one of the 256 symbols.
  """
  @spec decode(String.t() | list()) :: binary() | list()
  def decode(encoded) when is_binary(encoded) do
    do_decode(encoded, decode_table(), <<>>)
  end

  def decode(n) when is_list(n) do
    Enum.map(n, &decode/1)
  end

  defp do_decode(<<>>, _table, acc), do: acc

  # U+FE0F (65039) only ever trails the symbol for byte 26 and carries no data of its own.
  defp do_decode(<<0xFE0F::utf8, rest::binary>>, table, acc) do
    do_decode(rest, table, acc)
  end

  defp do_decode(<<codepoint::utf8, rest::binary>>, table, acc) do
    case table do
      %{^codepoint => byte} ->
        do_decode(rest, table, <<acc::binary, byte>>)

      _ ->
        symbol = <<codepoint::utf8>>
        raise "Character #{inspect(symbol)} with codepoint #{codepoint} and length #{byte_size(symbol)} was not in the decode list"
    end
  end

  # Maps the leading code point of every symbol back to its byte. Deriving the table from
  # encode/1 keeps the two directions in sync without a second hand-maintained character list.
  defp decode_table do
    for byte <- 0..255, into: %{} do
      <<codepoint::utf8, _rest::binary>> = encode(byte)
      {codepoint, byte}
    end
  end
end
# run this inline suite with "elixir #{__ENV__.file} test"
if System.argv |> List.first == "test" do
ExUnit.start
defmodule PrintableBinaryTest do
  use ExUnit.Case, async: true
  alias PrintableBinary, as: PB

  # some helper funcs

  # Infinite stream of pseudo-random integers drawn from min..max (both ends included).
  defp random_stream(min..max) when max >= min do
    span = max - min + 1

    Stream.unfold(:rand.seed_s(:exsplus), fn state ->
      {fraction, next_state} = :rand.uniform_s(state)
      {trunc(min + fraction * span), next_state}
    end)
  end

  # Binary of `len` pseudo-random bytes.
  defp random_binary_data(len) do
    random_stream(0..255) |> Enum.take(len) |> PB.codepoint_list_to_binary()
  end

  test "encoding control chars 0-32" do
    # The symbol for byte 26 is "Ƶ" plus the invisible variation selector U+FE0F; it is written
    # as an escape here so the expectation cannot lose it through an editor or copy/paste.
    expected = "∅¯«»ϟ¿¡ª⌫⇥⇩↧§⏎ȯʘƔ¹²º³Ͷɨ¬©¦Ƶ\uFE0F⎋Ξǁǀ¶␣"
    assert expected == (0..32) |> Enum.to_list() |> PB.codepoint_list_to_binary() |> PB.encode()
  end

  test "encoding printable chars 33-127" do
    assert "!˵#$%&'()*+,-./0123456789:;<=>?@" == (33..64) |> Enum.to_list() |> to_string() |> PB.encode()
    assert "ABCDEFGHIJKLMNOPQRSTUVWXYZ[Ʌ]^_`abcdefghijklmnopqrstuvwxyz{|}~⌦" == (65..127) |> Enum.to_list() |> PB.codepoint_list_to_binary() |> PB.encode()
  end

  test "encoding 128-191" do
    assert "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ŌÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷ŏùúûüýþÿ" == (128..191) |> Enum.to_list() |> PB.codepoint_list_to_binary() |> PB.encode()
  end

  test "encoding 192-255" do
    # Bytes 192..255 map onto the contiguous code points U+0100..U+013F. Generating the expected
    # string from that range avoids look-alike glyphs in a literal (U+0132 and U+0133 are
    # ligatures that are easily mistaken for the ASCII pairs "IJ" and "ij").
    expected = for codepoint <- 0x100..0x13F, into: "", do: <<codepoint::utf8>>
    assert expected == (192..255) |> Enum.to_list() |> PB.codepoint_list_to_binary() |> PB.encode()
  end

  test "making visible some text within some binary data" do
    prefix = random_binary_data(10)
    suffix = random_binary_data(10)
    test_binary = prefix <> "there once was a man in München\r\n" <> suffix
    # There's still an issue with cursor placement/spacing using this encoding, at least in my editor...
    # Tabling for now
    assert PB.encode(prefix) <> "there␣once␣was␣a␣man␣in␣Măünchen⏎⇩" <> PB.encode(suffix) == PB.encode(test_binary)
  end

  test "all encodings are unique" do
    all_symbols = (0..255) |> Enum.to_list() |> PB.codepoint_list_to_binary() |> PB.encode() |> String.split("", trim: true)
    assert 256 == all_symbols |> Enum.uniq() |> length()
  end

  test "encoding and then decoding random binary result gives same argument" do
    charlist = random_stream(0..255) |> Enum.take(2000)
    charlist_as_binary = PB.codepoint_list_to_binary(charlist)
    assert charlist_as_binary == charlist |> PB.encode() |> PB.decode()
    assert charlist_as_binary == charlist_as_binary |> PB.encode() |> PB.decode()
    # IO.inspect byte_size(bin |> PB.encode) # ~67% expansion in size for random binary data, reasonable?
  end

  test "encoding problematic binaries doesn't raise" do
    # These are the raw UTF-8 bytes of "Ƶ" followed by U+FE0F, i.e. the symbol used for byte 26.
    # Besides not raising, the bytes must survive a round trip unchanged.
    problematic = <<198, 181, 239, 184, 143>>
    assert problematic == problematic |> PB.encode() |> PB.decode()
  end
end
else
mode = System.argv |> List.first # either "encode" or "decode"; anything else is handled below
# Stdin is read lazily: only the encode/decode branches consume input, so the usage text and the
# unknown-mode error show up immediately instead of waiting for EOF on stdin.
read_input = fn -> IO.read(:stdio, :all) end
case mode do
  "encode" ->
    read_input.() |> PrintableBinary.encode |> IO.puts
  "decode" ->
    # Encoded text never contains raw whitespace (space, tab, CR and LF all have their own symbols),
    # so trailing whitespace is formatting only -- typically the newline that IO.puts appends in
    # encode mode. Dropping it lets the output of "encode" be piped straight into "decode" instead
    # of raising on the final newline.
    # NOTE(review): IO.puts appends a newline to the decoded bytes; confirm whether callers need
    # the exact original bytes on stdout.
    read_input.() |> String.trim_trailing |> PrintableBinary.decode |> IO.puts
  blank when blank in ["",nil] ->
    IO.puts "Usage: #{Path.basename(__ENV__.file)} [ test | encode|decode < input.bin ]\nAnother example: head -c 500 /dev/urandom | elixir printable_binary.exs encode"
  _ ->
    raise "Unknown mode argument '#{mode}', please use either 'encode' or 'decode', or run the test suite with 'test'"
end
end