Skip to content

Commit 6cadad3

Browse files
pattern/protobuf: Allow parsing of nested messages (#378)
Mostly this enables attempted recursive parsing of submessages. Note that it is inherently impossible to determine the underlying data type for LengthDelimited for sure, so this is a best-effort attempt. The user can disable recursive submessage parsing via Settings.

Other minor changes:
* added #pragma MIME and #pragma endian directives
* enabled UTF-8 display for LengthDelimited types
* added signed LEB128 display for Varint types (although this doesn't seem to be working on my test case)
* swapped if/else-if structure for match
* fail upon receiving unknown or unsupported WireType

Co-authored-by: Nik <[email protected]>
1 parent 7ad9cd4 commit 6cadad3

File tree

1 file changed

+57
-13
lines changed

1 file changed

+57
-13
lines changed

patterns/protobuf.hexpat

+57-13
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,25 @@
1-
#pragma author WerWolv
1+
#pragma author WerWolv and Glenn Hartmann
22
#pragma description Google Protobuf wire encoding (.pb)
33

4+
#pragma MIME application/protobuf
5+
#pragma MIME application/vnd.google.protobuf
6+
7+
#pragma endian little
8+
49
import std.core;
510
import std.io;
611
import std.mem;
12+
import std.string;
13+
import std.sys;
714

815
import type.leb128;
916

17+
// Attempting to recursively parse submessages is a guess-and-check process
18+
// since it's inherently impossible to tell for sure what type a
19+
// LengthDelimited field is. This could be imprecise and could be slow for
20+
// large or ambiguous files, so we give the user an option to disable it.
21+
bool disable_recursive_submessage_parsing in;
22+
1023
// A single u32 whose display is delegated to the `format_zigzag32` formatter.
// NOTE(review): the formatter is defined outside this excerpt; presumably it
// applies protobuf ZigZag decoding to `value` — confirm there.
struct ZigZag32 {
1124
u32 value;
1225
} [[sealed, format("format_zigzag32")]];
@@ -32,7 +45,6 @@ enum WireType : u8 {
3245
_32Bit = 5
3346
};
3447

35-
3648
struct Key {
3749
type::uLEB128 keyDec;
3850
u32 field_number = u32(keyDec) >> 3;
@@ -55,23 +67,55 @@ union _32Bit {
5567
float flt;
5668
};
5769

70+
// Forward declaration: Message holds Field entries, but Field itself is only
// defined further down in this file.
using Field;
71+
72+
// A run of consecutive Fields spanning `Size` bytes.
// Fields keep being read until std::mem::reached() reports that the address
// `Size` bytes past the start of this struct has been reached.
struct Message<auto Size> {
73+
Field fields[while(!std::mem::reached(addressof(this) + Size))];
74+
};
75+
76+
// `Length` chars that are both formatted and transformed through
// std::string::impl::format_string. Per the commit message this is what
// enables UTF-8 display for LengthDelimited payloads.
struct Utf8String<auto Length> {
77+
char data[Length];
78+
} [[sealed, format("std::string::impl::format_string"), transform("std::string::impl::format_string")]];
79+
80+
// Alternative views of one `Length`-byte payload: raw bytes, a text string,
// and, unless `disable_recursive_submessage_parsing` is set, a best-effort
// attempt at an embedded Message.
union _LengthDelimitedData<auto Length> {
81+
u8 bytes[Length];
82+
Utf8String<Length> utf8;
83+
84+
if (!disable_recursive_submessage_parsing) {
85+
try {
86+
// Attempt to parse binary data as an embedded Message. This is
87+
// expected to fail often, as the proto format uses LengthDelimited
88+
// for several different data types.
89+
Message<Length> msg;
90+
// Reject the attempt unless the nested Message covers exactly `Length` bytes.
// NOTE(review): presumably a failure inside this try block discards the
// attempt instead of aborting the whole pattern — confirm against the
// pattern language's try semantics.
std::assert(sizeof(msg) == Length, "Attempted parse of Message consumed wrong number of bytes.");
91+
}
92+
}
93+
};
94+
5895
// A length-prefixed payload: a uLEB128 `length` followed by `length` bytes
// of data.
struct LengthDelimited {
5996
type::uLEB128 length;
60-
char data[length];
97+
98+
// Fail early if the declared payload length would run past the end of the
// data, instead of attempting to parse the payload.
std::assert($ + length <= std::mem::size(), "Attempting to parse _LengthDelimitedData would exceed file length.");
99+
_LengthDelimitedData<length> data;
61100
};
62101

102+
// Varint payload exposed as both an unsigned and a signed LEB128 view.
// As a union, both members describe the same bytes.
union _LEB128 {
103+
type::uLEB128 uLEB128;
104+
type::sLEB128 sLEB128; // NOTE: the signed version doesn't seem to be working properly
105+
};
63106

64-
struct Entry {
107+
// One protobuf field: a Key followed by a `value` whose layout is selected
// by `key.wire_type`. StartGroup/EndGroup are reported as unimplemented and
// any other wire type raises "Unknown WireType.".
struct Field {
65108
Key key;
66109

67-
if (key.wire_type == WireType::Varint)
68-
type::uLEB128 value;
69-
else if (key.wire_type == WireType::_64Bit)
70-
_64Bit value;
71-
else if (key.wire_type == WireType::LengthDelimited)
72-
LengthDelimited value;
73-
else if (key.wire_type == WireType::_32Bit)
74-
_32Bit value;
110+
match (key.wire_type) {
111+
(WireType::Varint): _LEB128 value;
112+
(WireType::_64Bit): _64Bit value;
113+
(WireType::LengthDelimited): LengthDelimited value;
114+
(WireType::_32Bit): _32Bit value;
115+
(WireType::StartGroup | WireType::EndGroup): std::unimplemented();
116+
(_): std::error("Unknown WireType.");
117+
}
75118
};
76119

77-
Entry entries[while(!std::mem::eof())] @ 0x00;
120+
// Entry point: treat the entire input as one top-level Message placed at
// offset 0, then require that parsing stopped exactly at end-of-file.
Message<std::mem::size()> msg @ 0x00;
121+
std::assert(std::mem::eof(), "Parsing did not consume whole file.");

0 commit comments

Comments
 (0)