Skip to content

Commit 96db105

Browse files
authored
Merge pull request #3403 from trailofbits/kkaoudis/http-parsing
HTTP header parsing
2 parents f3f6c21 + 10ea1a0 commit 96db105

File tree

17 files changed

+1327
-9
lines changed

17 files changed

+1327
-9
lines changed

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
strategy:
1616
matrix:
1717
os: [ubuntu-latest] # windows-latest, macos-latest,
18-
python-version: ["3.7", "3.8", "3.9", "3.10"]
18+
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
1919

2020
runs-on: ${{ matrix.os }}
2121

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,6 @@ polyfile/defs/*
33
polyfile/trie_full.gz
44
polyfile/trie_partial.gz
55
~*
6-
*.pyc
6+
*.pyc
7+
.vscode/
8+
.vscode/*

polyfile/__init__.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1-
from . import nes, pdf, jpeg, zipmatcher, nitf, kaitaimatcher, languagematcher, polyfile
1+
from . import (
2+
nes,
3+
pdf,
4+
jpeg,
5+
zipmatcher,
6+
nitf,
7+
http,
8+
kaitaimatcher,
9+
languagematcher,
10+
polyfile
11+
)
12+
213
from .__main__ import main
314
from .polyfile import __version__

polyfile/ast.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union
2+
3+
from .polyfile import Match, Submatch
4+
5+
6+
class Node:
    """A named span in a tree of parsed bytes that can be converted into polyfile ``Submatch`` objects.

    ``offset`` and ``length`` may be supplied explicitly.  When they are ``None`` they are derived on first
    access (and then cached): the offset comes from the first child or, failing that, from the end of the
    older sibling; the length comes from the value or from the extent of the children.
    """

    def __init__(self,
                 name: str,
                 value: Optional[bytes] = None,
                 offset: Optional[int] = None,
                 length: Optional[int] = None,
                 older_sibling: Optional["Node"] = None,
                 children: Iterable["Node"] = ()
                 ):
        self.name: str = name
        self.value: Optional[bytes] = value
        self._offset: Optional[int] = offset
        self._length: Optional[int] = length
        self.older_sibling: Optional[Node] = older_sibling
        self.children: Tuple[Node, ...] = tuple(children)

    def __repr__(self):
        # `offset` and `length` are derived lazily and raise ValueError for a node that has no anchor.
        # repr() must not propagate that error, so unknown values are shown as they are stored.
        try:
            offset: Optional[int] = self.offset
        except ValueError:
            offset = None
        try:
            length: Optional[int] = self.length
        except ValueError:
            length = self._length
        r = f"{self.__class__.__name__}(name={self.name!r}"
        if self.value is not None:
            r = f"{r}, value={self.value!r}"
        r = f"{r}, offset={offset!r}, length={length!r}, older_sibling={self.older_sibling!r}, " \
            f"children={self.children!r})"
        return r

    @property
    def offset(self) -> int:
        """The node's offset, derived from its first child or its older sibling when not given explicitly.

        Raises:
            ValueError: if the node has no explicit offset, no children, and no older sibling.
        """
        if self._offset is None:
            if self.children:
                self._offset = self.children[0].offset
            elif self.older_sibling is not None:
                self._offset = self.older_sibling.offset + self.older_sibling.length
            else:
                # Do not format `self` with !r here: __repr__ reads this property, which previously caused
                # unbounded recursion instead of this ValueError.
                raise ValueError(
                    f"{self.__class__.__name__}(name={self.name!r}) must either have an explicit offset, "
                    "an older sibling, or a child!"
                )
        return self._offset

    @property
    def length(self) -> int:
        """The node's length: explicit, else ``len(value)``, else the span covered by its children (0 if none)."""
        if self._length is None:
            if self.value is not None:
                self._length = len(self.value)
            elif not self.children:
                self._length = 0
            else:
                self._length = self.children[-1].offset + self.children[-1].length - self.offset
        return self._length

    def to_matches(self, parent: Optional["Match"] = None) -> Iterator["Submatch"]:
        """Yield a ``Submatch`` for this node and every descendant, parents before children.

        Each submatch's ``relative_offset`` is the node's offset minus ``parent.offset`` (or minus zero when
        there is no parent).
        """
        stack: List[Tuple["Node", Optional["Match"]]] = [(self, parent)]
        while stack:
            node, parent = stack.pop()
            if parent is None:
                parent_offset = 0
            else:
                parent_offset = parent.offset
            match = Submatch(
                name=node.name,
                match_obj=node.value,
                relative_offset=node.offset - parent_offset,
                length=node.length,
                parent=parent
            )
            yield match
            # Push in reverse so that children are popped (and yielded) in their original order.
            for child in reversed(node.children):
                stack.append((child, match))

    @classmethod
    def load(cls, obj: Any) -> "Node":
        """Build a ``Node`` tree from any object tree exposing ``name`` and optionally ``value``, ``offset``,
        ``length`` and ``children`` attributes.

        ``str`` values are encoded as UTF-8.  A ``length`` attribute that is ``None`` or shorter than the value
        is ignored, so the length is derived from the value instead.  The traversal is iterative, so deep trees
        do not hit the recursion limit.

        Raises:
            ValueError: if any object in the tree lacks a ``name`` attribute.
        """
        # Stack of (object, children converted so far).  ``None`` in the second slot marks an object that has
        # not been expanded yet; a tuple marks a parent that is still collecting its converted children.
        ancestors: List[Tuple[Any, Optional[Tuple[Node, ...]]]] = [
            (obj, None)
        ]
        children: Optional[Tuple[Node, ...]]
        while True:
            obj, children = ancestors.pop()
            if children is None:
                if getattr(obj, "children", None):
                    # Revisit this object once all of its children have been converted.
                    ancestors.append((obj, ()))
                    ancestors.extend((child, None) for child in reversed(obj.children))
                    continue
                children = ()
            if not hasattr(obj, "name"):
                raise ValueError(f"{obj!r} does not have a `name` attribute!")
            name: str = obj.name
            value: Union[Optional[bytes], str] = getattr(obj, "value", None)
            if isinstance(value, str):
                value = value.encode("utf-8")
            offset: Optional[int] = getattr(obj, "offset", None)
            length: Optional[int] = getattr(obj, "length", None)
            if length is not None and value is not None and length < len(value):
                # A declared length shorter than the value is not trusted.
                length = None
            # The nearest stack entry that is collecting children is this node's parent; entries above it are
            # younger siblings that have not been expanded yet.
            parent_index: Optional[int] = None
            for index in range(len(ancestors) - 1, -1, -1):
                if ancestors[index][1] is not None:
                    parent_index = index
                    break
            older_sibling: Optional[Node] = None
            if parent_index is None:
                # Only the root has no parent on the stack.
                assert not ancestors
            else:
                siblings = ancestors[parent_index][1]
                if siblings:
                    older_sibling = siblings[-1]
            node = cls(name=name, value=value, offset=offset, length=length, older_sibling=older_sibling,
                       children=children)
            if parent_index is None:
                return node
            parent, siblings = ancestors[parent_index]
            ancestors[parent_index] = parent, siblings + (node,)

polyfile/http/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .matcher import parse_http_11

polyfile/http/defacto.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from abnf.grammars.misc import load_grammar_rules
2+
from abnf.grammars import rfc9110, rfc3986
3+
from abnf import Rule as _Rule
4+
from typing import List, Tuple
5+
6+
# Rules borrowed from the RFC grammars bundled with `abnf`, as (local rule name, source rule) pairs.
inherited_rulelist: List[Tuple[str, _Rule]] = [
    ("ipv4", rfc3986.Rule("IPv4address")),
    ("ipv6", rfc3986.Rule("IPv6address")),
    ("uri-host", rfc9110.Rule("uri-host")),
    # Every alternative of `defacto-header` references OWS. It is not an RFC 5234 core rule, so it is
    # imported explicitly; otherwise it would presumably be undefined in this rule class.
    ("OWS", rfc9110.Rule("OWS")),
]
11+
12+
13+
@load_grammar_rules(inherited_rulelist)
class Rule(_Rule):
    """Grammar for de facto *hop-by-hop* headers that have no RFC or other standard but are in common use.

    The definitions are, for now, primarily based on the Mozilla documentation linked next to each rule.

    These headers are well worth parsing and examining: they can be spoofed, and are untrustworthy unless
    they were added by a reverse proxy on a hop-by-hop request path.

    Several variants of the X-Forwarded-Proto header are included.
    """

    grammar: List[str] = [
        # Entry rule: one `"<Name>:" OWS <Name> OWS` alternative per supported header.
        "defacto-header = " + " / ".join(
            f'"{header}:" OWS {header} OWS'
            for header in (
                "X-Forwarded-For",
                "X-Forwarded-Host",
                "X-Forwarded-Proto",
                "Front-End-Https",
                "X-Forwarded-Protocol",
                "X-Forwarded-Ssl",
                "X-Url-Scheme",
            )
        ),
        # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-For
        'X-Forwarded-For = xff-client *( 0*1SP "," 0*1SP xff-proxy )',
        "xff-client = ipv4 / ipv6",
        "xff-proxy = ipv4 / ipv6",
        # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-Host
        # The value should be the domain name of the forwarded server.
        "X-Forwarded-Host = uri-host",
        # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-Proto
        'proto = "http" / "https"',
        "X-Forwarded-Proto = proto",
        'Front-End-Https = "on"',
        "X-Forwarded-Protocol = proto",
        'X-Forwarded-Ssl = "on"',
        "X-Url-Scheme = proto",
    ]

polyfile/http/deprecated.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from abnf.grammars.misc import load_grammar_rules
2+
from abnf.grammars import rfc9110, rfc3986, rfc9111
3+
from abnf import Rule as _Rule
4+
from typing import List, Tuple
5+
6+
# Rules borrowed from the RFC grammars bundled with `abnf`, as (local rule name, source rule) pairs.
inherited_rulelist: List[Tuple[str, _Rule]] = [
    ("Accept-Charset", rfc9110.Rule("Accept-Charset")),
    ("Authentication-Info", rfc9110.Rule("Authentication-Info")),
    ("token", rfc9110.Rule("token")),
    ("quoted-string", rfc9110.Rule("quoted-string")),
    ("HTTP-date", rfc9110.Rule("HTTP-date")),
    ("Host", rfc9110.Rule("Host")),
    # Every alternative of `deprecated-header` references OWS. It is not an RFC 5234 core rule, so it is
    # imported explicitly; otherwise it would presumably be undefined in this rule class.
    ("OWS", rfc9110.Rule("OWS")),
]
14+
15+
16+
@load_grammar_rules(inherited_rulelist)
class Rule(_Rule):
    """
    Grammar for request headers which are in general deprecated by modern browsers, but may still be
    included from spoofed user agents or unusual user agents.

    The entry rule is ``deprecated-header``; each of its alternatives has the form
    ``"<Name>:" OWS <Name> OWS``.
    """

    grammar: List[str] = [
        # Entry rule: one `"<Name>:" OWS <Name> OWS` alternative per supported header.
        "deprecated-header = " + " / ".join(
            f'"{header}:" OWS {header} OWS'
            for header in (
                "Accept-Charset",
                "Authentication-Info",
                "DNT",
                "DPR",
                "Expect-CT",
                "Pragma",
                "Viewport-Width",
                "Warning",
                "Width",
            )
        ),
        # https://www.w3.org/TR/tracking-dnt/#dnt-header-field
        'DNT = ( "0" / "1" ) *DNT-extension',
        # DNT-extension excludes CTL, SP, DQUOTE, comma, backslash
        "DNT-extension = %x21 / %x23-2B / %x2D-5B / %x5D-7E",
        # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/DPR
        'DPR = 1*DIGIT "." 1*DIGIT',
        # https://www.rfc-editor.org/rfc/rfc9163#section-2.1
        'Expect-CT = expect-ct-directive *( OWS "," OWS expect-ct-directive )',
        'expect-ct-directive = directive-name [ "=" directive-value ]',
        "directive-name = token",
        "directive-value = token / quoted-string",
        # https://httpwg.org/specs/rfc9111.html#field.pragma
        'Pragma = "no-cache"',
        # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Viewport-Width
        # The width of the user's viewport in CSS pixels, rounded up to the nearest integer.
        "Viewport-Width = 1*DIGIT",
        # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Warning
        # https://www.rfc-editor.org/rfc/rfc7234#section-5.5
        # ABNF concatenation does not imply whitespace, so the SP separators must be spelled out, and the
        # date is a single optional trailing element rather than a repetition.
        "Warning = warn-code SP warn-agent SP warn-text [ SP warn-date ]",
        # https://www.iana.org/assignments/http-warn-codes/http-warn-codes.xhtml
        'warn-code = "110" / "111" / "112" / "113" / "199" / "214" / "299"',
        "warn-agent = Host / pseudonym",
        "warn-text = quoted-string",
        # RFC 7234 wraps the date in double quotes.
        "warn-date = DQUOTE HTTP-date DQUOTE",
        "pseudonym = token",
        # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Width
        # The width of the resource in physical pixels, rounded up to the nearest integer.
        "Width = 1*DIGIT",
    ]

polyfile/http/experimental.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from abnf.grammars.misc import load_grammar_rules
2+
from abnf.grammars import rfc9110, rfc3986
3+
from . import structured_headers
4+
from abnf import Rule as _Rule
5+
from typing import List, Tuple
6+
7+
# Rules imported into the grammar below, as (local rule name, source rule) pairs: OWS from RFC 9110, then
# the structured-header primitives under their own names.
rulelist: List[Tuple[str, _Rule]] = [("OWS", rfc9110.Rule("OWS"))]
rulelist.extend(
    (rule_name, structured_headers.Rule(rule_name))
    for rule_name in ("sh-list", "sh-string", "sh-decimal", "sh-boolean")
)
14+
15+
16+
@load_grammar_rules(rulelist)
class Rule(_Rule):
    """Grammar for request headers defined as 'experimental' by the Mozilla developer documentation, which
    have partial cross browser support.

    Many of these headers convey client hints (indicated by the CH segment in some, but not all, related
    header names).

    Most of these also require only simple ABNF to string together the values, so they are collected here
    for brevity.

    The entry rule is ``experimental-header``; each of its alternatives has the form
    ``"<Name>:" OWS <Name> OWS``.
    """

    grammar: List[str] = [
        # Entry rule, generated so that every alternative has the same shape; the hand-written version
        # had dropped the trailing OWS after Sec-CH-UA-Form-Factor.
        # Sec-CH-UA and Sec-CH-UA-WoW64 have rules below but were not reachable from this entry rule.
        # NOTE(review): confirm they are not intentionally handled by another grammar.
        "experimental-header = " + " / ".join(
            f'"{header}:" OWS {header} OWS'
            for header in (
                "Device-Memory",
                "Downlink",
                "Early-Data",
                "ECT",
                "RTT",
                "Save-Data",
                "Sec-CH-UA",
                "Sec-CH-UA-Arch",
                "Sec-CH-UA-Bitness",
                "Sec-CH-UA-Form-Factor",
                "Sec-CH-UA-Full-Version",
                "Sec-CH-UA-Full-Version-List",
                "Sec-CH-UA-Mobile",
                "Sec-CH-UA-Model",
                "Sec-CH-UA-Platform",
                "Sec-CH-UA-Platform-Version",
                "Sec-CH-UA-WoW64",
                "Sec-GPC",
                "Sec-CH-Prefers-Reduced-Motion",
            )
        ),
        # https://www.w3.org/TR/device-memory/#iana-device-memory
        'Device-Memory = "0.25" / "0.5" / "1" / "2" / "4" / "8"',
        # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Downlink
        # https://wicg.github.io/netinfo/#dom-networkinformation-downlink
        "Downlink = sh-decimal",
        # https://httpwg.org/specs/rfc8470.html#header
        'Early-Data = "1"',
        # https://wicg.github.io/netinfo/#ect-request-header-field
        'ECT = "2g" / "3g" / "4g" / "slow-2g"',
        # https://wicg.github.io/netinfo/#rtt-request-header-field
        "RTT = 1*DIGIT",
        # https://wicg.github.io/savedata/#save-data-request-header-field TODO Structured Headers RFC for sh-list
        'Save-Data = "on" / sh-list',
        # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Sec-CH-UA
        # TODO sf-list vs sh-list
        "Sec-CH-UA = sh-list",
        # https://wicg.github.io/ua-client-hints/#sec-ch-ua-arch
        "Sec-CH-UA-Arch = sh-string",
        # https://wicg.github.io/ua-client-hints/#sec-ch-ua-bitness
        "Sec-CH-UA-Bitness = sh-string",
        # https://wicg.github.io/ua-client-hints/#sec-ch-ua-form-factor
        "Sec-CH-UA-Form-Factor = sh-string",
        # https://wicg.github.io/ua-client-hints/#sec-ch-ua-full-version
        "Sec-CH-UA-Full-Version = sh-string",
        # https://wicg.github.io/ua-client-hints/#sec-ch-ua-full-version-list
        "Sec-CH-UA-Full-Version-List = sh-list",
        # https://wicg.github.io/ua-client-hints/#sec-ch-ua-mobile
        "Sec-CH-UA-Mobile = sh-boolean",
        # https://wicg.github.io/ua-client-hints/#sec-ch-ua-model
        "Sec-CH-UA-Model = sh-string",
        # https://wicg.github.io/ua-client-hints/#sec-ch-ua-platform
        "Sec-CH-UA-Platform = sh-string",
        # https://wicg.github.io/ua-client-hints/#sec-ch-ua-platform-version
        "Sec-CH-UA-Platform-Version = sh-string",
        # https://wicg.github.io/ua-client-hints/#sec-ch-ua-wow64
        "Sec-CH-UA-WoW64 = sh-boolean",
        # https://privacycg.github.io/gpc-spec/#the-sec-gpc-header-field-for-http-requests
        'Sec-GPC = "1"',
        # https://wicg.github.io/user-preference-media-features-headers/#sec-ch-prefers-reduced-motion
        'Sec-CH-Prefers-Reduced-Motion = "no-preference" / "reduce"',
    ]

0 commit comments

Comments
 (0)