Skip to content

Commit d247d1b

Browse files
authored
Merge pull request #27 from DecimalTurn/abnf5
feat: simplify ABNF for v0.1 and introduce section in main document
2 parents b6bcb4b + b9cd304 commit d247d1b

7 files changed

Lines changed: 364 additions & 281 deletions

File tree

generate-railroad.js

Lines changed: 184 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,78 @@ const { spawnSync } = require("node:child_process");
55
const path = require("node:path");
66

77
// Customization section
8-
const DEFAULT_INPUT_ABNF = "grammar/jsonc.abnf";
8+
const DEFAULT_INPUT_ABNF = "grammar/JSONC.abnf";
99
const DEFAULT_PROCESSED_ABNF = "grammar/jsonc-processed.abnf";
1010
const DEFAULT_OUTPUT_HTML = "grammar/railroad-diagram.html";
11+
const FORCED_HTML_HEADER = "JSONC GRAMMAR";
1112

1213
// Rules to inline from their %x... definitions as literal ABNF strings.
1314
// Add more rule names here to apply the same transformation.
1415
const INLINE_HEX_RULES = [
1516
"multi-line-comment-start",
1617
"multi-line-comment-end",
1718
"asterisk",
18-
"escape"
19+
"escape",
20+
"single-line-comment-start",
21+
"quotation-mark",
22+
"decimal-point",
23+
"minus",
24+
"plus",
25+
"zero",
26+
];
27+
28+
// Inline selected rule references as quoted literals in specific target rules.
29+
// Add more mappings here to reuse this transformation pattern.
30+
const INLINE_LITERAL_REFS = [
31+
{
32+
targetRule: "value",
33+
referencedRules: ["false", "true", "null"],
34+
},
35+
];
36+
37+
// Move selected rule definitions after another rule in the processed ABNF.
38+
// Add more entries here to control rule ordering in generated output.
39+
const REPOSITION_RULES_AFTER = [
40+
{
41+
ruleName: "begin-array",
42+
afterRule: "array",
43+
},
44+
{
45+
ruleName: "end-array",
46+
afterRule: "begin-array",
47+
},
48+
{
49+
ruleName: "begin-object",
50+
afterRule: "object",
51+
},
52+
{
53+
ruleName: "end-object",
54+
afterRule: "begin-object",
55+
},
56+
{
57+
ruleName: "name-separator",
58+
afterRule: "member",
59+
},
60+
{
61+
ruleName: "value-separator",
62+
afterRule: "value",
63+
},
64+
{
65+
ruleName: "digit",
66+
afterRule: "unescaped",
67+
},
68+
{
69+
ruleName: "digit1-9",
70+
afterRule: "digit",
71+
},
72+
{
73+
ruleName: "hexdigit",
74+
afterRule: "digit1-9",
75+
},
76+
{
77+
ruleName: "four-hexdigits",
78+
afterRule: "hexdigit",
79+
}
1980
];
2081

2182
function escapeRegExp(value) {
@@ -36,6 +97,24 @@ function decodeAbnfHexSequence(value) {
3697
return String.fromCodePoint(...bytes);
3798
}
3899

100+
/**
 * Look up the `%x…` hex sequence that begins the definition of an ABNF rule.
 *
 * The rule is located with a multi-line regular expression of the form
 * `<ruleName> = %xHH[.HH…]`. Only the leading hex sequence is captured;
 * whatever follows it on the same line (alternatives, comments) is ignored.
 *
 * @param {string} source - ABNF grammar text to search.
 * @param {string} ruleName - Name of the rule to look up (escaped before use in the regex).
 * @returns {string} The captured hex sequence, e.g. `%x2F.2F`.
 * @throws {Error} When no line defines `ruleName` with a leading `%x` sequence.
 *   The message says "was not found" in that case too, even if the rule
 *   exists but starts with something other than a `%x` sequence.
 */
function getHexRuleSequence(source, ruleName) {
  const definitionPattern = new RegExp(
    `^\\s*${escapeRegExp(ruleName)}\\s*=\\s*(%x[0-9A-Fa-f]+(?:\\.[0-9A-Fa-f]+)*)\\b.*$`,
    "m",
  );

  // Group 1 is mandatory in the pattern, so it is defined whenever a match exists.
  const [, hexSequence] = source.match(definitionPattern) ?? [];
  if (hexSequence === undefined) {
    throw new Error(`Rule ${ruleName} was not found.`);
  }

  return hexSequence;
}
113+
114+
/**
 * Resolve a rule's `%x…` definition and decode it with `decodeAbnfHexSequence`.
 *
 * @param {string} source - ABNF grammar text to search.
 * @param {string} ruleName - Name of the rule whose hex definition is decoded.
 * @returns {string} The value produced by `decodeAbnfHexSequence` for the rule's hex sequence.
 * @throws {Error} Propagated from `getHexRuleSequence` when the rule cannot be located.
 */
function getHexRuleLiteral(source, ruleName) {
  const hexSequence = getHexRuleSequence(source, ruleName);
  return decodeAbnfHexSequence(hexSequence);
}
117+
39118
function inlineHexRuleAsLiteral(source, ruleName) {
40119
const escapedRuleName = escapeRegExp(ruleName);
41120
const ruleRegex = new RegExp(
@@ -50,10 +129,10 @@ function inlineHexRuleAsLiteral(source, ruleName) {
50129
const hexSequence = ruleMatch[1];
51130
const literalChars = decodeAbnfHexSequence(hexSequence);
52131

53-
// For backslash or other problematic characters, keep them as hex format
54-
// ABNF doesn't support backslash escaping in quoted strings
132+
// Keep hex format for characters that cannot be represented safely
133+
// as a single ABNF quoted string literal.
55134
let replacement;
56-
if (literalChars === "\\") {
135+
if (literalChars === "\\" || literalChars === '"') {
57136
replacement = hexSequence;
58137
} else {
59138
// For other characters, escape only double quotes (not backslashes)
@@ -90,16 +169,104 @@ function inlineHexRuleAsLiteral(source, ruleName) {
90169
.join("\n");
91170
}
92171

172+
/**
 * Replace references to selected rules inside one target rule with the
 * `%x…` hex sequences that define those rules.
 *
 * Only the first line of the target rule (everything after `=` up to the end
 * of that line, including any trailing comment) is rewritten. References are
 * matched as whole rule names: a name that is preceded or followed by a
 * letter, digit or hyphen is left untouched.
 *
 * @param {string} source - ABNF grammar text.
 * @param {string} targetRule - Name of the rule whose right-hand side is rewritten.
 * @param {string[]} referencedRules - Rule names to replace by their hex sequences.
 * @returns {string} The grammar text with the target rule rewritten.
 * @throws {Error} When `targetRule` is not defined in `source`, or (via
 *   `getHexRuleSequence`) when a referenced rule has no `%x…` definition.
 */
function inlineLiteralRefsInTargetRule(source, targetRule, referencedRules) {
  const escapedTargetRule = escapeRegExp(targetRule);
  const targetRuleRegex = new RegExp(`^(\\s*${escapedTargetRule}\\s*=\\s*)(.*)$`, "m");
  const match = source.match(targetRuleRegex);
  if (!match) {
    throw new Error(`Rule ${targetRule} was not found.`);
  }

  const [, targetRulePrefix, targetRuleRhs] = match;

  let updatedRhs = targetRuleRhs;
  for (const referencedRule of referencedRules) {
    const hexSequence = getHexRuleSequence(source, referencedRule);
    const referencedRuleRegex = new RegExp(
      `(?<![A-Za-z0-9-])${escapeRegExp(referencedRule)}(?![A-Za-z0-9-])`,
      "g",
    );
    // Replacer function: the inserted text is taken verbatim.
    updatedRhs = updatedRhs.replace(referencedRuleRegex, () => hexSequence);
  }

  // A replacer function is required here: the rewritten rule comes from the
  // grammar file, and a replacement *string* would expand `$&`, `$1`, `$$`, …
  // found in the rule text or its comment, corrupting the output.
  return source.replace(targetRuleRegex, () => `${targetRulePrefix}${updatedRhs}`);
}
195+
196+
/**
 * Remove the definitions of the given rules from an ABNF grammar.
 *
 * A definition is a line of the form `<name> = …` (optionally indented).
 * When such a line is removed, the indented, non-blank lines that directly
 * follow it are removed as well: they are continuation lines of the same
 * rule and would otherwise be left behind and attach to the preceding rule.
 * Blank lines, unindented comments and all other rules are kept.
 *
 * @param {string} source - ABNF grammar text (LF or CRLF line endings).
 * @param {Iterable<string>} ruleNames - Names of the rules to remove.
 * @returns {string} The remaining lines joined with `\n`.
 */
function removeRuleDefinitions(source, ruleNames) {
  const removalSet = new Set(ruleNames);
  const keptLines = [];
  // True while we are inside the body of a rule that is being removed.
  let droppingContinuation = false;

  for (const line of source.split(/\r?\n/)) {
    const definition = line.match(/^\s*([A-Za-z][A-Za-z0-9-]*)\s*=/);

    if (definition) {
      // A new rule starts here; it decides whether following continuation
      // lines belong to a removed rule.
      droppingContinuation = removalSet.has(definition[1]);
      if (!droppingContinuation) {
        keptLines.push(line);
      }
      continue;
    }

    // Indented, non-blank line right after a removed rule: part of that rule.
    if (droppingContinuation && /^[ \t]+\S/.test(line)) {
      continue;
    }

    // Anything else (blank line, unindented comment) ends the removed rule.
    droppingContinuation = false;
    keptLines.push(line);
  }

  return keptLines.join("\n");
}
210+
211+
/**
 * Locate the block of lines that makes up one rule definition.
 *
 * The block starts at the first line matching `<ruleName> =` (optionally
 * indented) and extends over every directly following line that begins with
 * a whitespace character. An empty line or an unindented line ends the block.
 *
 * @param {string[]} lines - Grammar text split into lines.
 * @param {string} ruleName - Name of the rule to locate.
 * @returns {{startIndex: number, endIndex: number, blockLines: string[]}}
 *   `startIndex` is the index of the definition line, `endIndex` is the index
 *   just past the block (exclusive), and `blockLines` is a copy of the block.
 * @throws {Error} When no line defines `ruleName`.
 */
function findRuleBlock(lines, ruleName) {
  const definitionStart = new RegExp(`^\\s*${escapeRegExp(ruleName)}\\s*=`);
  const startIndex = lines.findIndex((candidate) => definitionStart.test(candidate));
  if (startIndex < 0) {
    throw new Error(`Rule ${ruleName} was not found.`);
  }

  // Advance past every continuation line (one that begins with whitespace).
  let endIndex = startIndex + 1;
  for (; endIndex < lines.length; endIndex += 1) {
    if (!/^\s/.test(lines[endIndex])) {
      break;
    }
  }

  const blockLines = lines.slice(startIndex, endIndex);
  return { startIndex, endIndex, blockLines };
}
229+
230+
/**
 * Move rule definitions so that each one directly follows another rule.
 *
 * Entries are applied in order, each against the result of the previous one.
 * For every entry the rule's block is cut out first, and the anchor rule is
 * looked up afterwards, so the anchor position already reflects the removal.
 *
 * @param {string} source - ABNF grammar text (LF or CRLF line endings).
 * @param {Array<{ruleName: string, afterRule: string}>} reorderings -
 *   Which rule to move (`ruleName`) and the rule it must follow (`afterRule`).
 * @returns {string} The reordered grammar, lines joined with `\n`.
 * @throws {Error} Propagated from `findRuleBlock` when a rule is not found.
 */
function repositionRulesAfter(source, reorderings) {
  const lines = source.split(/\r?\n/);

  for (const reordering of reorderings) {
    const { ruleName, afterRule } = reordering;

    // Cut the rule's block out of the line list; splice returns the removed lines.
    const { startIndex, endIndex } = findRuleBlock(lines, ruleName);
    const movedLines = lines.splice(startIndex, endIndex - startIndex);

    // Re-insert it right behind the last line of the anchor rule's block.
    const anchor = findRuleBlock(lines, afterRule);
    lines.splice(anchor.endIndex, 0, ...movedLines);
  }

  return lines.join("\n");
}
243+
93244
/**
 * Apply the configured transformations to the ABNF grammar text.
 *
 * The steps run in this order:
 *   1. `inlineHexRuleAsLiteral` for every name in `INLINE_HEX_RULES`;
 *   2. for every entry of `INLINE_LITERAL_REFS`, inline the referenced rules
 *      into the target rule, then remove the referenced rules' definitions;
 *   3. reorder rules as described by `REPOSITION_RULES_AFTER`.
 *
 * @param {string} source - Original ABNF grammar text.
 * @returns {string} The processed grammar text.
 */
function processAbnfSource(source) {
  const withHexRulesInlined = INLINE_HEX_RULES.reduce(
    (abnf, ruleName) => inlineHexRuleAsLiteral(abnf, ruleName),
    source,
  );

  const withLiteralRefsInlined = INLINE_LITERAL_REFS.reduce(
    (abnf, { targetRule, referencedRules }) => {
      const inlined = inlineLiteralRefsInTargetRule(abnf, targetRule, referencedRules);
      return removeRuleDefinitions(inlined, referencedRules);
    },
    withHexRulesInlined,
  );

  return repositionRulesAfter(withLiteralRefsInlined, REPOSITION_RULES_AFTER);
}
102260

261+
/**
 * Force the heading of the generated HTML file to `FORCED_HTML_HEADER`.
 *
 * Reads the file, replaces the first `<h1>…</h1>` element whose content
 * contains no `<` with the forced heading, and writes the file back only when
 * the text actually changed. A file without such an element is left untouched.
 *
 * @param {string} htmlPath - Path of the HTML file to rewrite in place.
 * @returns {void}
 */
function postProcessGeneratedHtml(htmlPath) {
  const originalHtml = fs.readFileSync(htmlPath, "utf8");
  const forcedHeading = `<h1>${FORCED_HTML_HEADER}</h1>`;
  const rewrittenHtml = originalHtml.replace(/<h1>[^<]*<\/h1>/, forcedHeading);

  // Nothing to do when the heading is absent or already has the forced text.
  if (rewrittenHtml === originalHtml) {
    return;
  }

  fs.writeFileSync(htmlPath, rewrittenHtml, "utf8");
}
269+
103270
const args = process.argv.slice(2);
104271
const titleIndex = args.indexOf("--title");
105272

@@ -173,4 +340,15 @@ if (result.error) {
173340
process.exit(1);
174341
}
175342

176-
process.exit(result.status === null ? 1 : result.status);
343+
if (result.status !== 0) {
344+
process.exit(result.status === null ? 1 : result.status);
345+
}
346+
347+
try {
348+
postProcessGeneratedHtml(outputPath);
349+
} catch (error) {
350+
console.error(`Failed to post-process generated HTML: ${error.message}`);
351+
process.exit(1);
352+
}
353+
354+
process.exit(0);
Lines changed: 26 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,29 @@
11
; JSONC grammar with comments support (RFC 8259 extended with JavaScript-style comments)
22
;
33
; Notes:
4-
; - Rule names and structure follow RFC 8259 ABNF snippets.
5-
; - DIGIT and HEXDIG are core rules from RFC 5234.
6-
; - comments are an extension not in RFC 8259.
4+
; - Rule names and structure follow RFC 8259 ABNF.
5+
; - Comments are an extension not in RFC 8259.
76
; - Trailing commas are NOT supported in this grammar.
87

98
; A JSONC-text is a serialized value surrounded by optional whitespace and comments.
109
; Comments can appear anywhere insignificant whitespace is allowed in JSON.
1110
JSONC-text = wsc value wsc
1211

1312
; Whitespace with Comments: zero or more whitespace characters or comments
14-
wsc = *(ws-char / comment)
13+
wsc = *(ws-char / comment) ; Whitespace and/or comments
1514

1615
; Single whitespace character (space, tab, line feed, carriage return)
1716
ws-char = %x20 / %x09 / %x0A / %x0D ; space / tab / LF / CR
1817

1918
; Comments: single-line or multi-line
2019
comment = single-line-comment / multi-line-comment
2120

22-
; Source character: any Unicode code point, as per ECMAScript.
23-
source-character = %x00-10FFFF
24-
25-
; Comment terminators and sequences (based on ECMAScript line terminators)
26-
comment-terminator = %x0A / %x0D / %x2028 / %x2029 ; LF / CR / LS / PS
27-
comment-terminator-sequence = %x0D.0A / %x0A / %x0D / %x2028 / %x2029
28-
2921
; Single-line comment: starts with //, continues until line ending
30-
; Terminator is not part of the comment body.
3122
; Note that the single-line-comment-end is optional, allowing comments to end at the end of the file without a line terminator.
3223
single-line-comment-start = %x2F.2F ; // double solidus
33-
single-line-comment-end = comment-terminator-sequence
24+
single-line-comment-end = %x0D.0A / %x0A / %x0D
3425
single-line-comment = single-line-comment-start *single-line-comment-char [ single-line-comment-end ]
35-
single-line-comment-char = %x00-09 / %x0B-0C / %x0E-2027 / %x202A-10FFFF ; Any source character except comment terminators
26+
single-line-comment-char = %x00-09 / %x0B-0C / %x0E-10FFFF ; Any source character except CR and LF (line terminator)
3627

3728
; Multi-line comment: /* ... */
3829
; Cannot be nested. The first */ closes the comment.
@@ -58,7 +49,7 @@ name-separator = wsc %x3A wsc ; : colon
5849
value-separator = wsc %x2C wsc ; , comma
5950

6051
; Any JSON value
61-
value = false / null / true / object / array / number / string
52+
value = object / array / number / string / true / false / null
6253

6354
; Literal names (boolean values and null)
6455
false = %x66.61.6C.73.65 ; false
@@ -73,31 +64,37 @@ member = string name-separator value
7364
array = begin-array [ value *( value-separator value ) ] end-array
7465

7566
; Numbers
76-
number = [ minus ] int [ frac ] [ exp ]
67+
number = [ minus ] ( zero / ( digit1-9 *digit ) ) [ decimal-point 1*digit ] [ ( %x65 / %x45 ) [ minus / plus ] 1*digit ]
7768
decimal-point = %x2E ; .
69+
digit = %x30-39 ; 0-9
7870
digit1-9 = %x31-39 ; 1-9
79-
e = %x65 / %x45 ; e E
80-
exp = e [ minus / plus ] 1*DIGIT
81-
frac = decimal-point 1*DIGIT
82-
int = zero / ( digit1-9 *DIGIT )
71+
8372
minus = %x2D ; -
8473
plus = %x2B ; +
8574
zero = %x30 ; 0
75+
hexdigit = digit /
76+
%x41 / %x61 / ; A a
77+
%x42 / %x62 / ; B b
78+
%x43 / %x63 / ; C c
79+
%x44 / %x64 / ; D d
80+
%x45 / %x65 / ; E e
81+
%x46 / %x66 ; F f
82+
four-hexdigits = 4hexdigit
8683

8784
; Strings
8885
string = quotation-mark *char quotation-mark
8986

9087
char = unescaped /
9188
escape (
92-
%x22 / ; " quotation mark U+0022
93-
%x5C / ; \ reverse solidus U+005C
94-
%x2F / ; / solidus U+002F
95-
%x62 / ; b backspace U+0008
96-
%x66 / ; f form feed U+000C
97-
%x6E / ; n line feed U+000A
98-
%x72 / ; r carriage return U+000D
99-
%x74 / ; t tab U+0009
100-
%x75 4HEXDIG ; uXXXX U+XXXX
89+
%x22 / ; " quotation mark U+0022
90+
%x5C / ; \ reverse solidus U+005C
91+
%x2F / ; / solidus U+002F
92+
%x62 / ; b backspace U+0008
93+
%x66 / ; f form feed U+000C
94+
%x6E / ; n line feed U+000A
95+
%x72 / ; r carriage return U+000D
96+
%x74 / ; t tab U+0009
97+
%x75 four-hexdigits ; uXXXX U+XXXX
10198
)
10299

103100
escape = %x5C ; \

grammar/README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ This directory contains the ABNF grammar for JSONC, along with plans for generat
44

55
## Railroad Diagram Generation Plan
66

7-
Generate railroad diagrams from `grammar/jsonc.abnf` using a simple one-file Node.js script.
7+
Generate railroad diagrams from `grammar/JSONC.abnf` using a simple one-file Node.js script.
88

99
Instead of building a custom ABNF parser and converter to Tab Atkins constructor calls, use:
1010

@@ -18,7 +18,7 @@ The wrapper script should:
1818

1919
1. Accept input ABNF path and optional output HTML path.
2020
2. Default to:
21-
- input: `grammar/jsonc.abnf`
21+
- input: `grammar/JSONC.abnf`
2222
- output: `grammar/railroad-diagram.html`
2323
3. Optionally accept `--title` to set the HTML title.
2424
4. Execute the upstream CLI from our installed dependency.
@@ -53,13 +53,13 @@ npm run railroad
5353
Generate from a specific input and output:
5454

5555
```bash
56-
npm run railroad -- grammar/jsonc.abnf grammar/railroad-diagram.html
56+
npm run railroad -- grammar/JSONC.abnf grammar/railroad-diagram.html
5757
```
5858

5959
Generate with a custom title:
6060

6161
```bash
62-
npm run railroad -- grammar/jsonc.abnf grammar/railroad-diagram.html --title "JSONC Grammar"
62+
npm run railroad -- grammar/JSONC.abnf grammar/railroad-diagram.html --title "JSONC Grammar"
6363
```
6464

6565
### Notes on EOF for single-line comments

0 commit comments

Comments
 (0)