-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgrammar.pegn
115 lines (93 loc) · 4.26 KB
/
grammar.pegn
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# PEGN (v1.1.1) pegn.dev/spec/nodes.pegn
# Copyright 2021 Robert S Muhlestein ([email protected])
# Licensed under Apache-2
# Uses pegn.dev/spec/classes.pegn
# Uses pegn.dev/spec/tokens.pegn
# NOTE: assumes scanning by at least one byte at a time
# (except for StateDef, which is rhetorical)
# Spec is the root rule: optional Meta, Copyright and Licensed header lines,
# any number of Uses lines, any number of blank/comment lines, then one or
# more definitions, each followed by any number of blank/comment lines.
# Continuation lines are indented because Spacing (ComEndLine? SP+) requires
# at least one SP after a line break inside an expression.
Spec <-- Meta? Copyright? Licensed? Uses*
         ComEndLine*
         (Definition ComEndLine*)+
# Header lines. Each one starts with '# ', so it has the same shape as an
# ordinary comment line ('# ' Comment EndLine).
# Meta: the grammar name, its version in parentheses, then its home location.
Meta <-- '# ' Grammar ' (' Version ') ' Home EndLine
# Copyright: free text running to the end of the line.
Copyright <-- '# Copyright ' Comment EndLine
# Licensed: free text running to the end of the line.
Licensed <-- '# Licensed under ' Comment EndLine
# Uses: names one Path per line; Spec allows any number of these lines.
Uses <-- '# Uses ' Path EndLine
# Path: one or more unipoint characters, none of which is in the ws class.
Path <-- (!ws unipoint)+
# ComEndLine: the end of a line, optionally preceded by spaces and a
# '# ' comment. A blank line and a comment-only line both match.
ComEndLine <- SP* ('# ' Comment)? EndLine
# Definition: ordered choice of the five definition kinds. NodeDef is tried
# before ScanDef; both start with a CheckId and differ only in the arrow.
Definition <- StateDef / NodeDef / ScanDef / ClassDef / TokenDef
# Grammar: a Name, optionally followed by '-' and a NameExt.
Grammar <- Name ('-' NameExt)?
# Version: 'v' then major.minor.patch, optionally followed by '-' and PreVer.
Version <- 'v' MajorVer '.' MinorVer '.' PatchVer ('-' PreVer)?
# Home: same expression as Path (one or more non-ws unipoint characters).
Home <-- (!ws unipoint)+
# Comment: one or more unipoint characters, stopping before the EndLine.
Comment <-- (!EndLine unipoint)+
# The four single-line definition forms. Each is an identifier, one or more
# spaces, an arrow, one or more spaces, then a body.
# StateDef: a StateId with '<-' whose body is a '# ' comment, not an expression.
StateDef <-- StateId SP+ '<-' SP+ '# ' Comment # rhetorical
# ScanDef: a CheckId with the short arrow '<-' and an Expression body.
ScanDef <-- CheckId SP+ '<-' SP+ Expression # convenient
# NodeDef: a CheckId with the long arrow '<--' and an Expression body.
NodeDef <-- CheckId SP+ '<--' SP+ Expression # significant
# ClassDef: a ClassId with '<-' and a ClassExpr body.
ClassDef <-- ClassId SP+ '<-' SP+ ClassExpr
# TokenDef: a TokenId, '<-', then one or more TokenVal separated by Spacing.
# Unlike the other definition forms, it consumes its own trailing ComEndLine.
# Continuation lines are indented because Spacing (ComEndLine? SP+) requires
# at least one SP after a line break inside an expression.
TokenDef <-- TokenId SP+ '<-' SP+
             TokenVal (Spacing TokenVal)*
             ComEndLine
# lower-case of identifier must be unique within grammar
# Identifier: any of the four identifier kinds. No other rule visible in
# this file references it.
Identifier <- CheckId / ClassId / TokenId / StateId
# TokenVal: a Unicode, Binary, Hexadec or Octal literal, or a String
# enclosed in SQ tokens.
TokenVal <- Unicode / Binary / Hexadec / Octal / SQ String SQ
# Pieces of the Meta header line (see Grammar and Version).
# Name: 2 to 12 characters of the upper class.
Name <-- upper{2,12} # ascii, not unicode
# NameExt: 1 to 20 characters of the visible class.
NameExt <-- visible{1,20}
# MajorVer, MinorVer, PatchVer: one or more digits each.
MajorVer <-- digit+
MinorVer <-- digit+
PatchVer <-- digit+
# PreVer: one or more '.'-separated parts, each made of one or more
# word-class characters or DASH tokens.
PreVer <-- (word / DASH)+ ('.' (word / DASH)+)*
# Identifier kinds.
# CheckId: one or more groups of one upper followed by one or more lower,
# e.g. ComEndLine.
CheckId <-- (upper lower+)+
# ClassId: a reserved class name, or a lower followed by one or more of
# (lower, or UNDER then lower); a non-reserved id has at least two characters.
ClassId <-- ResClassId / lower (lower / UNDER lower)+
# TokenId: a reserved token name, or an upper followed by one or more of
# (upper, or UNDER then upper); a non-reserved id has at least two characters.
TokenId <-- ResTokenId / upper (upper / UNDER upper)+
# StateId: an UNDER token followed by one or more upper.
StateId <-- UNDER upper+
# Expression: one or more Sequence alternatives. Each '/' separator is
# preceded by Spacing and followed by one or more spaces.
Expression <-- Sequence (Spacing '/' SP+ Sequence)*
# ClassExpr: same shape as Expression, but each alternative is a single Simple.
ClassExpr <-- Simple (Spacing '/' SP+ Simple)*
# Simple: a numeric literal (Unicode, Binary, Hexadec, Octal), a ClassId,
# a TokenId, a Range, or a String enclosed in SQ tokens.
# The continuation line is indented because Spacing (ComEndLine? SP+)
# requires at least one SP after a line break inside an expression.
Simple <- Unicode / Binary / Hexadec / Octal
        / ClassId / TokenId / Range / SQ String SQ
# Spacing: an optional ComEndLine followed by one or more spaces. An
# expression can therefore continue on the next line only if that line
# starts with at least one space.
Spacing <- ComEndLine? SP+
# Sequence: one or more Rule separated by Spacing.
Sequence <-- Rule (Spacing Rule)*
# Rule: a positive lookahead, a negative lookahead, or a plain rule.
Rule <- PosLook / NegLook / Plain
# Plain: a Primary with an optional quantifier.
Plain <-- Primary Quant?
# PosLook: '&' followed by a Primary with an optional quantifier.
PosLook <-- '&' Primary Quant?
# NegLook: '!' followed by a Primary with an optional quantifier.
NegLook <-- '!' Primary Quant?
# Primary: a Simple, a CheckId, or a parenthesized Expression. No spaces
# are accepted directly inside the parentheses.
Primary <- Simple / CheckId / '(' Expression ')'
# Quant: ordered choice of the quantifier forms. MinMax is tried before
# Amount; both start with '{'.
Quant <- Optional / MinZero / MinOne / MinMax / Amount
# Optional, MinZero, MinOne: the single-character quantifiers.
Optional <-- '?'
MinZero <-- '*'
MinOne <-- '+'
# MinMax: '{' Min ',' then an optional Max and '}', e.g. {2,12} or {2,}.
MinMax <-- '{' Min ',' Max? '}'
Min <-- digit+
Max <-- digit+
# Amount: a single Count in braces, e.g. {4}. It uses '<-', unlike the
# other quantifier forms, which use '<--'.
Amount <- '{' Count '}'
Count <-- digit+
# Range: ordered choice of the six bracketed range forms.
# The continuation line is indented because Spacing (ComEndLine? SP+)
# requires at least one SP after a line break inside an expression.
Range <- AlphaRange / IntRange / UniRange
       / BinRange / HexRange / OctRange
# Range forms: '[' low '-' high ']' where both ends are the same kind of
# value. None of these rules accept spaces inside the brackets.
UniRange <-- '[' Unicode '-' Unicode ']' # [u0000-u10FFFF]
AlphaRange <-- '[' Letter '-' Letter ']' # [a-m] [A-Z]
IntRange <-- '[' Integer '-' Integer ']' # [0-108] [0-9]
BinRange <-- '[' Binary '-' Binary ']' # [b101-b1111111110101010]
HexRange <-- '[' Hexadec '-' Hexadec ']' # [x20-x2F] [xFFFFF-x1FFFFF]
OctRange <-- '[' Octal '-' Octal ']' # [o20-o37]
# Literal values.
# String: one or more characters of the quotable class.
String <-- quotable+
# Letter: a single character of the alpha class.
Letter <-- alpha
# Unicode: 'u' followed by either '10' and four uphex, or four to five uphex.
Unicode <-- 'u' ('10' uphex{4} / uphex{4,5})
# Integer: one or more digits.
Integer <-- digit+
# Binary: 'b' followed by one or more bindig.
Binary <-- 'b' bindig+ # b1 == b00000001 (0 pad to closest byte)
# Hexadec: 'x' followed by one or more uphex.
Hexadec <-- 'x' uphex+
# Octal: 'o' followed by one or more octdig.
Octal <-- 'o' octdig+
# EndLine: LF, CRLF or CR. CRLF is listed before CR, so under ordered
# choice a CR LF pair is matched as one line ending.
EndLine <-- LF / CRLF / CR
# ResClassId: the reserved class names, tried in order. A longer name is
# listed before a name that is its prefix ('alphanum' before 'alpha',
# 'lowerhex' before 'lower'), so under ordered choice the longer name
# is matched first. Keep that order when adding names.
# Continuation lines are indented because Spacing (ComEndLine? SP+) requires
# at least one SP after a line break inside an expression.
ResClassId <-- 'alphanum' / 'alpha' / 'any' / 'bindig' / 'control'
             / 'digit' / 'hexdig' / 'lowerhex' / 'lower' / 'octdig'
             / 'punct' / 'quotable' / 'sign' / 'uphex' / 'upper'
             / 'visible' / 'ws' / 'alnum' / 'ascii' / 'blank' / 'cntrl'
             / 'graph' / 'print' / 'space' / 'word' / 'xdigit' / 'unipoint'
# ResTokenId: the reserved token names, tried in order. A longer name is
# listed before a name that is its prefix ('CRLF' before 'CR', 'LFAT' before
# 'LF', 'LARROWF' before 'LARROW', 'RARROWF' before 'RARROW'), so under
# ordered choice the longer name is matched first. Keep that order when
# adding names.
# Continuation lines are indented because Spacing (ComEndLine? SP+) requires
# at least one SP after a line break inside an expression.
ResTokenId <-- 'TAB' / 'CRLF' / 'CR' / 'LFAT' / 'SP' / 'VT' / 'FF' / 'NOT'
             / 'BANG' / 'DQ' / 'HASH' / 'DOLLAR' / 'PERCENT' / 'AND'
             / 'SQ' / 'LPAREN' / 'RPAREN' / 'STAR' / 'PLUS' / 'COMMA'
             / 'DASH' / 'MINUS' / 'DOT' / 'SLASH' / 'COLON' / 'SEMI' / 'LT'
             / 'EQ' / 'GT' / 'QUERY' / 'QUESTION' / 'AT' / 'LBRAKT'
             / 'BKSLASH' / 'RBRAKT' / 'CARET' / 'UNDER' / 'BKTICK'
             / 'LCURLY' / 'LBRACE' / 'BAR' / 'PIPE' / 'RCURLY'
             / 'RBRACE' / 'TILDE' / 'UNKNOWN' / 'REPLACE' / 'MAXRUNE'
             / 'MAXASCII' / 'MAXLATIN' / 'LARROWF' / 'RARROWF' / 'LLARROW'
             / 'RLARROW' / 'LARROW' / 'LF' / 'RARROW' / 'RFAT'
             / 'WALRUS' / 'ENDOFDATA'