Skip to content

Commit 22acd7c

Browse files
authored
Improve Lexer handling by removing TOKEN_TEXT_CONTENT (#47)
This pull request removes the `TOKEN_TEXT_CONTENT` token in favor of falling back to a new `TOKEN_CHARACTER` for any remaining characters that didn't match. #### Summary * Removed `TOKEN_TEXT_CONTENT` * Added new tokens `TOKEN_NBSP`, `TOKEN_SEMICOLON`, `TOKEN_AMPERSAND`, and `TOKEN_CHARACTER`. * Instead of explicitly listing which tokens build up the Text Content AST Node, roll up any remaining tokens into the Text Content AST Node in the Parser.
1 parent 56f2531 commit 22acd7c

File tree

14 files changed

+353
-40
lines changed

14 files changed

+353
-40
lines changed

.vitepress/config.mts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,6 @@ export default defineConfig({
138138
collapsed: true,
139139
items: [
140140
{ text: "TOKEN_NEWLINE", link: "/tokens/TOKEN_NEWLINE" },
141-
{ text: "TOKEN_TEXT_CONTENT", link: "/tokens/TOKEN_TEXT_CONTENT" },
142141
{ text: "TOKEN_WHITESPACE", link: "/tokens/TOKEN_WHITESPACE" },
143142
]
144143
}

docs/tokens/TOKEN_TEXT_CONTENT.md

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/include/token_struct.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66

77
typedef enum {
88
TOKEN_WHITESPACE, // ' '
9+
TOKEN_NBSP, // \xC2\xA0
910
TOKEN_NEWLINE, // \n
1011
TOKEN_IDENTIFIER,
11-
TOKEN_TEXT_CONTENT,
1212

1313
TOKEN_HTML_DOCTYPE, // <!DOCTYPE, <!doctype, <!DoCtYpE, <!dOcTyPe
1414

@@ -31,9 +31,12 @@ typedef enum {
3131
TOKEN_DASH, // -
3232
TOKEN_UNDERSCORE, // _
3333
TOKEN_EXCLAMATION, // !
34+
TOKEN_SEMICOLON, // ;
3435
TOKEN_COLON, // :
3536
TOKEN_PERCENT, // %
37+
TOKEN_AMPERSAND, // &
3638

39+
TOKEN_CHARACTER,
3740
TOKEN_ERROR,
3841
TOKEN_EOF,
3942
} token_type_T;

src/lexer.c

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,8 @@ static token_T* lexer_match_and_advance(lexer_T* lexer, const char* value, token
9696
static token_T* lexer_parse_whitespace(lexer_T* lexer) {
9797
buffer_T buffer = buffer_new();
9898

99-
while (isspace(lexer->current_character) && lexer->current_character != '\n' && lexer->current_character != '\r') {
99+
while (isspace(lexer->current_character) && lexer->current_character != '\n' && lexer->current_character != '\r'
100+
&& !lexer_eof(lexer)) {
100101
buffer_append_char(&buffer, lexer->current_character);
101102
lexer_advance(lexer);
102103
}
@@ -109,7 +110,7 @@ static token_T* lexer_parse_identifier(lexer_T* lexer) {
109110

110111
while ((isalnum(lexer->current_character) || lexer->current_character == '-' || lexer->current_character == '_'
111112
|| lexer->current_character == ':')
112-
&& !lexer_peek_for_html_comment_end(lexer, 0)) {
113+
&& !lexer_peek_for_html_comment_end(lexer, 0) && !lexer_eof(lexer)) {
113114

114115
buffer_append_char(&buffer, lexer->current_character);
115116
lexer_advance(lexer);
@@ -118,17 +119,6 @@ static token_T* lexer_parse_identifier(lexer_T* lexer) {
118119
return token_init(buffer.value, TOKEN_IDENTIFIER, lexer);
119120
}
120121

121-
static token_T* lexer_parse_text_content(lexer_T* lexer) {
122-
buffer_T buffer = buffer_new();
123-
124-
while (lexer->current_character != '<' && lexer->current_character != '>' && !lexer_eof(lexer)) {
125-
buffer_append_char(&buffer, lexer->current_character);
126-
lexer_advance(lexer);
127-
}
128-
129-
return token_init(buffer.value, TOKEN_TEXT_CONTENT, lexer);
130-
}
131-
132122
// ===== ERB Parsing
133123

134124
static token_T* lexer_parse_erb_open(lexer_T* lexer) {
@@ -180,6 +170,9 @@ token_T* lexer_next_token(lexer_T* lexer) {
180170

181171
if (lexer->current_character == '\n') { return lexer_advance_current(lexer, TOKEN_NEWLINE); }
182172
if (isspace(lexer->current_character)) { return lexer_parse_whitespace(lexer); }
173+
if (lexer->current_character == '\xC2' && lexer_peek(lexer, 1) == '\xA0') {
174+
return lexer_advance_with(lexer, "\xC2\xA0", TOKEN_NBSP);
175+
}
183176

184177
switch (lexer->current_character) {
185178
case '<': {
@@ -211,6 +204,8 @@ token_T* lexer_next_token(lexer_T* lexer) {
211204
case '>': return lexer_advance_current(lexer, TOKEN_HTML_TAG_END);
212205
case '_': return lexer_advance_current(lexer, TOKEN_UNDERSCORE);
213206
case ':': return lexer_advance_current(lexer, TOKEN_COLON);
207+
case ';': return lexer_advance_current(lexer, TOKEN_SEMICOLON);
208+
case '&': return lexer_advance_current(lexer, TOKEN_AMPERSAND);
214209
case '!': return lexer_advance_current(lexer, TOKEN_EXCLAMATION);
215210
case '=': return lexer_advance_current(lexer, TOKEN_EQUALS);
216211
case '%': return lexer_advance_current(lexer, TOKEN_PERCENT);
@@ -219,11 +214,9 @@ token_T* lexer_next_token(lexer_T* lexer) {
219214
case '\'': return lexer_advance_current(lexer, TOKEN_QUOTE);
220215

221216
default: {
222-
if (isalnum(lexer->current_character) || lexer->current_character == '_') {
223-
return lexer_parse_identifier(lexer);
224-
}
217+
if (isalnum(lexer->current_character)) { return lexer_parse_identifier(lexer); }
225218

226-
return lexer_parse_text_content(lexer);
219+
return lexer_advance_current(lexer, TOKEN_CHARACTER);
227220
}
228221
}
229222
}

src/parser.c

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -203,30 +203,15 @@ static AST_NODE_T* parser_parse_text_content(parser_T* parser, AST_NODE_T* eleme
203203
start_location = parser->current_token->start;
204204
} break;
205205

206-
case TOKEN_IDENTIFIER: {
207-
token_T* identifier = parser_consume(parser, TOKEN_IDENTIFIER, text_content_node);
208-
209-
buffer_append(&content, identifier->value);
210-
} break;
211-
212-
case TOKEN_WHITESPACE: {
213-
token_T* whitespace = parser_consume(parser, TOKEN_WHITESPACE, text_content_node);
214-
buffer_append(&content, whitespace->value);
215-
} break;
216-
217-
case TOKEN_NEWLINE: {
218-
token_T* newline = parser_consume(parser, TOKEN_NEWLINE, text_content_node);
219-
buffer_append(&content, newline->value);
220-
} break;
221-
222206
case TOKEN_EOF:
223207
case TOKEN_HTML_TAG_START:
224208
case TOKEN_HTML_TAG_START_CLOSE: {
225209
break;
226210
}
227211

228212
default: {
229-
parser_append_unexpected_token_from_token(parser, parser->current_token->type, text_content_node);
213+
token_T* token = parser_consume(parser, parser->current_token->type, text_content_node);
214+
buffer_append(&content, token->value);
230215
}
231216
}
232217
}

src/token.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@ token_T* token_init(const char* value, token_type_T type, lexer_T* lexer) {
3939
const char* token_type_to_string(token_type_T type) {
4040
switch (type) {
4141
case TOKEN_WHITESPACE: return "TOKEN_WHITESPACE";
42+
case TOKEN_NBSP: return "TOKEN_NBSP";
4243
case TOKEN_NEWLINE: return "TOKEN_NEWLINE";
4344
case TOKEN_IDENTIFIER: return "TOKEN_IDENTIFIER";
44-
case TOKEN_TEXT_CONTENT: return "TOKEN_TEXT_CONTENT";
4545
case TOKEN_HTML_DOCTYPE: return "TOKEN_HTML_DOCTYPE";
4646
case TOKEN_HTML_TAG_START: return "TOKEN_HTML_TAG_START";
4747
case TOKEN_HTML_TAG_END: return "TOKEN_HTML_TAG_END";
@@ -55,12 +55,15 @@ const char* token_type_to_string(token_type_T type) {
5555
case TOKEN_UNDERSCORE: return "TOKEN_UNDERSCORE";
5656
case TOKEN_EXCLAMATION: return "TOKEN_EXCLAMATION";
5757
case TOKEN_SLASH: return "TOKEN_SLASH";
58+
case TOKEN_SEMICOLON: return "TOKEN_SEMICOLON";
5859
case TOKEN_COLON: return "TOKEN_COLON";
5960
case TOKEN_LT: return "TOKEN_LT";
6061
case TOKEN_PERCENT: return "TOKEN_PERCENT";
62+
case TOKEN_AMPERSAND: return "TOKEN_AMPERSAND";
6163
case TOKEN_ERB_START: return "TOKEN_ERB_START";
6264
case TOKEN_ERB_CONTENT: return "TOKEN_ERB_CONTENT";
6365
case TOKEN_ERB_END: return "TOKEN_ERB_END";
66+
case TOKEN_CHARACTER: return "TOKEN_CHARACTER";
6467
case TOKEN_ERROR: return "TOKEN_ERROR";
6568
case TOKEN_EOF: return "TOKEN_EOF";
6669
}

test/lexer/attributes_test.rb

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,5 +318,84 @@ class AttributesTest < Minitest::Spec
318318

319319
assert_equal expected, result.array.items.map(&:type)
320320
end
321+
322+
test "attribute value with a period" do
323+
result = ERBX.lex(%(<div value="hello. world."></div>))
324+
325+
expected = %w[
326+
TOKEN_HTML_TAG_START
327+
TOKEN_IDENTIFIER
328+
TOKEN_WHITESPACE
329+
TOKEN_IDENTIFIER
330+
TOKEN_EQUALS
331+
TOKEN_QUOTE
332+
TOKEN_IDENTIFIER
333+
TOKEN_CHARACTER
334+
TOKEN_WHITESPACE
335+
TOKEN_IDENTIFIER
336+
TOKEN_CHARACTER
337+
TOKEN_QUOTE
338+
TOKEN_HTML_TAG_END
339+
TOKEN_HTML_TAG_START_CLOSE
340+
TOKEN_IDENTIFIER
341+
TOKEN_HTML_TAG_END
342+
TOKEN_EOF
343+
]
344+
345+
assert_equal expected, result.array.items.map(&:type)
346+
end
347+
348+
test "attribute value with a slash" do
349+
result = ERBX.lex(%(<div value="hello/ world/"></div>))
350+
351+
expected = %w[
352+
TOKEN_HTML_TAG_START
353+
TOKEN_IDENTIFIER
354+
TOKEN_WHITESPACE
355+
TOKEN_IDENTIFIER
356+
TOKEN_EQUALS
357+
TOKEN_QUOTE
358+
TOKEN_IDENTIFIER
359+
TOKEN_SLASH
360+
TOKEN_WHITESPACE
361+
TOKEN_IDENTIFIER
362+
TOKEN_SLASH
363+
TOKEN_QUOTE
364+
TOKEN_HTML_TAG_END
365+
TOKEN_HTML_TAG_START_CLOSE
366+
TOKEN_IDENTIFIER
367+
TOKEN_HTML_TAG_END
368+
TOKEN_EOF
369+
]
370+
371+
assert_equal expected, result.array.items.map(&:type)
372+
end
373+
374+
test "attribute value with an URL" do
375+
result = ERBX.lex(%(<a href="https://example.com"></div>))
376+
377+
expected = %w[
378+
TOKEN_HTML_TAG_START
379+
TOKEN_IDENTIFIER
380+
TOKEN_WHITESPACE
381+
TOKEN_IDENTIFIER
382+
TOKEN_EQUALS
383+
TOKEN_QUOTE
384+
TOKEN_IDENTIFIER
385+
TOKEN_SLASH
386+
TOKEN_SLASH
387+
TOKEN_IDENTIFIER
388+
TOKEN_CHARACTER
389+
TOKEN_IDENTIFIER
390+
TOKEN_QUOTE
391+
TOKEN_HTML_TAG_END
392+
TOKEN_HTML_TAG_START_CLOSE
393+
TOKEN_IDENTIFIER
394+
TOKEN_HTML_TAG_END
395+
TOKEN_EOF
396+
]
397+
398+
assert_equal expected, result.array.items.map(&:type)
399+
end
321400
end
322401
end

test/lexer/doctype_test.rb

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ class DoctypeTest < Minitest::Spec
5353
TOKEN_WHITESPACE
5454
TOKEN_IDENTIFIER
5555
TOKEN_WHITESPACE
56+
5657
TOKEN_QUOTE
5758
TOKEN_DASH
5859
TOKEN_SLASH
@@ -65,7 +66,34 @@ class DoctypeTest < Minitest::Spec
6566
TOKEN_IDENTIFIER
6667
TOKEN_WHITESPACE
6768
TOKEN_IDENTIFIER
68-
TOKEN_TEXT_CONTENT
69+
TOKEN_CHARACTER
70+
TOKEN_IDENTIFIER
71+
TOKEN_SLASH
72+
TOKEN_SLASH
73+
TOKEN_IDENTIFIER
74+
TOKEN_QUOTE
75+
76+
TOKEN_WHITESPACE
77+
78+
TOKEN_QUOTE
79+
TOKEN_IDENTIFIER
80+
TOKEN_SLASH
81+
TOKEN_SLASH
82+
TOKEN_IDENTIFIER
83+
TOKEN_CHARACTER
84+
TOKEN_IDENTIFIER
85+
TOKEN_CHARACTER
86+
TOKEN_IDENTIFIER
87+
TOKEN_SLASH
88+
TOKEN_IDENTIFIER
89+
TOKEN_SLASH
90+
TOKEN_IDENTIFIER
91+
TOKEN_SLASH
92+
TOKEN_IDENTIFIER
93+
TOKEN_CHARACTER
94+
TOKEN_IDENTIFIER
95+
TOKEN_QUOTE
96+
6997
TOKEN_HTML_TAG_END
7098
TOKEN_EOF
7199
]

test/lexer/html_entities_test.rb

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# frozen_string_literal: true
2+
3+
require_relative "../test_helper"
4+
5+
module Lexer
6+
class HTMLEntitiesTest < Minitest::Spec
7+
test "&lt;" do
8+
result = ERBX.lex("&lt;")
9+
10+
expected = %w[
11+
TOKEN_AMPERSAND
12+
TOKEN_IDENTIFIER
13+
TOKEN_SEMICOLON
14+
TOKEN_EOF
15+
]
16+
17+
assert_equal expected, result.array.items.map(&:type)
18+
end
19+
20+
test "&gt;" do
21+
result = ERBX.lex("&gt;")
22+
23+
expected = %w[
24+
TOKEN_AMPERSAND
25+
TOKEN_IDENTIFIER
26+
TOKEN_SEMICOLON
27+
TOKEN_EOF
28+
]
29+
30+
assert_equal expected, result.array.items.map(&:type)
31+
end
32+
33+
test "&nbsp;" do
34+
result = ERBX.lex("&nbsp;")
35+
36+
expected = %w[
37+
TOKEN_AMPERSAND
38+
TOKEN_IDENTIFIER
39+
TOKEN_SEMICOLON
40+
TOKEN_EOF
41+
]
42+
43+
assert_equal expected, result.array.items.map(&:type)
44+
end
45+
46+
test "&quot;" do
47+
result = ERBX.lex("&quot;")
48+
49+
expected = %w[
50+
TOKEN_AMPERSAND
51+
TOKEN_IDENTIFIER
52+
TOKEN_SEMICOLON
53+
TOKEN_EOF
54+
]
55+
56+
assert_equal expected, result.array.items.map(&:type)
57+
end
58+
59+
test "&apos;" do
60+
result = ERBX.lex("&apos;")
61+
62+
expected = %w[
63+
TOKEN_AMPERSAND
64+
TOKEN_IDENTIFIER
65+
TOKEN_SEMICOLON
66+
TOKEN_EOF
67+
]
68+
69+
assert_equal expected, result.array.items.map(&:type)
70+
end
71+
72+
test "ampersand" do
73+
result = ERBX.lex("&amp;")
74+
75+
expected = %w[
76+
TOKEN_AMPERSAND
77+
TOKEN_IDENTIFIER
78+
TOKEN_SEMICOLON
79+
TOKEN_EOF
80+
]
81+
82+
assert_equal expected, result.array.items.map(&:type)
83+
end
84+
85+
test "literal ampersand" do
86+
result = ERBX.lex("&")
87+
88+
expected = %w[
89+
TOKEN_AMPERSAND
90+
TOKEN_EOF
91+
]
92+
93+
assert_equal expected, result.array.items.map(&:type)
94+
end
95+
end
96+
end

0 commit comments

Comments
 (0)