Skip to content

Commit 22acd7c

Browse files
authored
Improve Lexer handling by removing TOKEN_TEXT_CONTENT (#47)
This pull request removes the `TOKEN_TEXT_CONTENT` token in favor of falling back to a new `TOKEN_CHARACTER` for any remaining characters that didn't match. #### Summary * Removed `TOKEN_TEXT_CONTENT` * Added new tokens `TOKEN_NBSP`, `TOKEN_SEMICOLON`, `TOKEN_AMPERSAND`, and `TOKEN_CHARACTER`. * Instead of explicitly listing which tokens build up the Text Content AST Node, roll up any remaining tokens into the Text Content AST Node in the Parser.
1 parent 56f2531 commit 22acd7c

File tree

14 files changed

+353
-40
lines changed

14 files changed

+353
-40
lines changed

.vitepress/config.mts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,6 @@ export default defineConfig({
138138
collapsed: true,
139139
items: [
140140
{ text: "TOKEN_NEWLINE", link: "/tokens/TOKEN_NEWLINE" },
141-
{ text: "TOKEN_TEXT_CONTENT", link: "/tokens/TOKEN_TEXT_CONTENT" },
142141
{ text: "TOKEN_WHITESPACE", link: "/tokens/TOKEN_WHITESPACE" },
143142
]
144143
}

docs/tokens/TOKEN_TEXT_CONTENT.md

Lines changed: 0 additions & 1 deletion
This file was deleted.

src/include/token_struct.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66

77
typedef enum {
88
TOKEN_WHITESPACE, // ' '
9+
TOKEN_NBSP, // \xC2\xA0
910
TOKEN_NEWLINE, // \n
1011
TOKEN_IDENTIFIER,
11-
TOKEN_TEXT_CONTENT,
1212

1313
TOKEN_HTML_DOCTYPE, // <!DOCTYPE, <!doctype, <!DoCtYpE, <!dOcTyPe
1414

@@ -31,9 +31,12 @@ typedef enum {
3131
TOKEN_DASH, // -
3232
TOKEN_UNDERSCORE, // _
3333
TOKEN_EXCLAMATION, // !
34+
TOKEN_SEMICOLON, // ;
3435
TOKEN_COLON, // :
3536
TOKEN_PERCENT, // %
37+
TOKEN_AMPERSAND, // &
3638

39+
TOKEN_CHARACTER,
3740
TOKEN_ERROR,
3841
TOKEN_EOF,
3942
} token_type_T;

src/lexer.c

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,8 @@ static token_T* lexer_match_and_advance(lexer_T* lexer, const char* value, token
9696
static token_T* lexer_parse_whitespace(lexer_T* lexer) {
9797
buffer_T buffer = buffer_new();
9898

99-
while (isspace(lexer->current_character) && lexer->current_character != '\n' && lexer->current_character != '\r') {
99+
while (isspace(lexer->current_character) && lexer->current_character != '\n' && lexer->current_character != '\r'
100+
&& !lexer_eof(lexer)) {
100101
buffer_append_char(&buffer, lexer->current_character);
101102
lexer_advance(lexer);
102103
}
@@ -109,7 +110,7 @@ static token_T* lexer_parse_identifier(lexer_T* lexer) {
109110

110111
while ((isalnum(lexer->current_character) || lexer->current_character == '-' || lexer->current_character == '_'
111112
|| lexer->current_character == ':')
112-
&& !lexer_peek_for_html_comment_end(lexer, 0)) {
113+
&& !lexer_peek_for_html_comment_end(lexer, 0) && !lexer_eof(lexer)) {
113114

114115
buffer_append_char(&buffer, lexer->current_character);
115116
lexer_advance(lexer);
@@ -118,17 +119,6 @@ static token_T* lexer_parse_identifier(lexer_T* lexer) {
118119
return token_init(buffer.value, TOKEN_IDENTIFIER, lexer);
119120
}
120121

121-
static token_T* lexer_parse_text_content(lexer_T* lexer) {
122-
buffer_T buffer = buffer_new();
123-
124-
while (lexer->current_character != '<' && lexer->current_character != '>' && !lexer_eof(lexer)) {
125-
buffer_append_char(&buffer, lexer->current_character);
126-
lexer_advance(lexer);
127-
}
128-
129-
return token_init(buffer.value, TOKEN_TEXT_CONTENT, lexer);
130-
}
131-
132122
// ===== ERB Parsing
133123

134124
static token_T* lexer_parse_erb_open(lexer_T* lexer) {
@@ -180,6 +170,9 @@ token_T* lexer_next_token(lexer_T* lexer) {
180170

181171
if (lexer->current_character == '\n') { return lexer_advance_current(lexer, TOKEN_NEWLINE); }
182172
if (isspace(lexer->current_character)) { return lexer_parse_whitespace(lexer); }
173+
if (lexer->current_character == '\xC2' && lexer_peek(lexer, 1) == '\xA0') {
174+
return lexer_advance_with(lexer, "\xC2\xA0", TOKEN_NBSP);
175+
}
183176

184177
switch (lexer->current_character) {
185178
case '<': {
@@ -211,6 +204,8 @@ token_T* lexer_next_token(lexer_T* lexer) {
211204
case '>': return lexer_advance_current(lexer, TOKEN_HTML_TAG_END);
212205
case '_': return lexer_advance_current(lexer, TOKEN_UNDERSCORE);
213206
case ':': return lexer_advance_current(lexer, TOKEN_COLON);
207+
case ';': return lexer_advance_current(lexer, TOKEN_SEMICOLON);
208+
case '&': return lexer_advance_current(lexer, TOKEN_AMPERSAND);
214209
case '!': return lexer_advance_current(lexer, TOKEN_EXCLAMATION);
215210
case '=': return lexer_advance_current(lexer, TOKEN_EQUALS);
216211
case '%': return lexer_advance_current(lexer, TOKEN_PERCENT);
@@ -219,11 +214,9 @@ token_T* lexer_next_token(lexer_T* lexer) {
219214
case '\'': return lexer_advance_current(lexer, TOKEN_QUOTE);
220215

221216
default: {
222-
if (isalnum(lexer->current_character) || lexer->current_character == '_') {
223-
return lexer_parse_identifier(lexer);
224-
}
217+
if (isalnum(lexer->current_character)) { return lexer_parse_identifier(lexer); }
225218

226-
return lexer_parse_text_content(lexer);
219+
return lexer_advance_current(lexer, TOKEN_CHARACTER);
227220
}
228221
}
229222
}

src/parser.c

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -203,30 +203,15 @@ static AST_NODE_T* parser_parse_text_content(parser_T* parser, AST_NODE_T* eleme
203203
start_location = parser->current_token->start;
204204
} break;
205205

206-
case TOKEN_IDENTIFIER: {
207-
token_T* identifier = parser_consume(parser, TOKEN_IDENTIFIER, text_content_node);
208-
209-
buffer_append(&content, identifier->value);
210-
} break;
211-
212-
case TOKEN_WHITESPACE: {
213-
token_T* whitespace = parser_consume(parser, TOKEN_WHITESPACE, text_content_node);
214-
buffer_append(&content, whitespace->value);
215-
} break;
216-
217-
case TOKEN_NEWLINE: {
218-
token_T* newline = parser_consume(parser, TOKEN_NEWLINE, text_content_node);
219-
buffer_append(&content, newline->value);
220-
} break;
221-
222206
case TOKEN_EOF:
223207
case TOKEN_HTML_TAG_START:
224208
case TOKEN_HTML_TAG_START_CLOSE: {
225209
break;
226210
}
227211

228212
default: {
229-
parser_append_unexpected_token_from_token(parser, parser->current_token->type, text_content_node);
213+
token_T* token = parser_consume(parser, parser->current_token->type, text_content_node);
214+
buffer_append(&content, token->value);
230215
}
231216
}
232217
}

src/token.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@ token_T* token_init(const char* value, token_type_T type, lexer_T* lexer) {
3939
const char* token_type_to_string(token_type_T type) {
4040
switch (type) {
4141
case TOKEN_WHITESPACE: return "TOKEN_WHITESPACE";
42+
case TOKEN_NBSP: return "TOKEN_NBSP";
4243
case TOKEN_NEWLINE: return "TOKEN_NEWLINE";
4344
case TOKEN_IDENTIFIER: return "TOKEN_IDENTIFIER";
44-
case TOKEN_TEXT_CONTENT: return "TOKEN_TEXT_CONTENT";
4545
case TOKEN_HTML_DOCTYPE: return "TOKEN_HTML_DOCTYPE";
4646
case TOKEN_HTML_TAG_START: return "TOKEN_HTML_TAG_START";
4747
case TOKEN_HTML_TAG_END: return "TOKEN_HTML_TAG_END";
@@ -55,12 +55,15 @@ const char* token_type_to_string(token_type_T type) {
5555
case TOKEN_UNDERSCORE: return "TOKEN_UNDERSCORE";
5656
case TOKEN_EXCLAMATION: return "TOKEN_EXCLAMATION";
5757
case TOKEN_SLASH: return "TOKEN_SLASH";
58+
case TOKEN_SEMICOLON: return "TOKEN_SEMICOLON";
5859
case TOKEN_COLON: return "TOKEN_COLON";
5960
case TOKEN_LT: return "TOKEN_LT";
6061
case TOKEN_PERCENT: return "TOKEN_PERCENT";
62+
case TOKEN_AMPERSAND: return "TOKEN_AMPERSAND";
6163
case TOKEN_ERB_START: return "TOKEN_ERB_START";
6264
case TOKEN_ERB_CONTENT: return "TOKEN_ERB_CONTENT";
6365
case TOKEN_ERB_END: return "TOKEN_ERB_END";
66+
case TOKEN_CHARACTER: return "TOKEN_CHARACTER";
6467
case TOKEN_ERROR: return "TOKEN_ERROR";
6568
case TOKEN_EOF: return "TOKEN_EOF";
6669
}

test/lexer/attributes_test.rb

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,5 +318,84 @@ class AttributesTest < Minitest::Spec
318318

319319
assert_equal expected, result.array.items.map(&:type)
320320
end
321+
322+
test "attribute value with a period" do
323+
result = ERBX.lex(%(<div value="hello. world."></div>))
324+
325+
expected = %w[
326+
TOKEN_HTML_TAG_START
327+
TOKEN_IDENTIFIER
328+
TOKEN_WHITESPACE
329+
TOKEN_IDENTIFIER
330+
TOKEN_EQUALS
331+
TOKEN_QUOTE
332+
TOKEN_IDENTIFIER
333+
TOKEN_CHARACTER
334+
TOKEN_WHITESPACE
335+
TOKEN_IDENTIFIER
336+
TOKEN_CHARACTER
337+
TOKEN_QUOTE
338+
TOKEN_HTML_TAG_END
339+
TOKEN_HTML_TAG_START_CLOSE
340+
TOKEN_IDENTIFIER
341+
TOKEN_HTML_TAG_END
342+
TOKEN_EOF
343+
]
344+
345+
assert_equal expected, result.array.items.map(&:type)
346+
end
347+
348+
test "attribute value with a slash" do
349+
result = ERBX.lex(%(<div value="hello/ world/"></div>))
350+
351+
expected = %w[
352+
TOKEN_HTML_TAG_START
353+
TOKEN_IDENTIFIER
354+
TOKEN_WHITESPACE
355+
TOKEN_IDENTIFIER
356+
TOKEN_EQUALS
357+
TOKEN_QUOTE
358+
TOKEN_IDENTIFIER
359+
TOKEN_SLASH
360+
TOKEN_WHITESPACE
361+
TOKEN_IDENTIFIER
362+
TOKEN_SLASH
363+
TOKEN_QUOTE
364+
TOKEN_HTML_TAG_END
365+
TOKEN_HTML_TAG_START_CLOSE
366+
TOKEN_IDENTIFIER
367+
TOKEN_HTML_TAG_END
368+
TOKEN_EOF
369+
]
370+
371+
assert_equal expected, result.array.items.map(&:type)
372+
end
373+
374+
test "attribute value with an URL" do
375+
result = ERBX.lex(%(<a href="https://example.com"></div>))
376+
377+
expected = %w[
378+
TOKEN_HTML_TAG_START
379+
TOKEN_IDENTIFIER
380+
TOKEN_WHITESPACE
381+
TOKEN_IDENTIFIER
382+
TOKEN_EQUALS
383+
TOKEN_QUOTE
384+
TOKEN_IDENTIFIER
385+
TOKEN_SLASH
386+
TOKEN_SLASH
387+
TOKEN_IDENTIFIER
388+
TOKEN_CHARACTER
389+
TOKEN_IDENTIFIER
390+
TOKEN_QUOTE
391+
TOKEN_HTML_TAG_END
392+
TOKEN_HTML_TAG_START_CLOSE
393+
TOKEN_IDENTIFIER
394+
TOKEN_HTML_TAG_END
395+
TOKEN_EOF
396+
]
397+
398+
assert_equal expected, result.array.items.map(&:type)
399+
end
321400
end
322401
end

test/lexer/doctype_test.rb

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ class DoctypeTest < Minitest::Spec
5353
TOKEN_WHITESPACE
5454
TOKEN_IDENTIFIER
5555
TOKEN_WHITESPACE
56+
5657
TOKEN_QUOTE
5758
TOKEN_DASH
5859
TOKEN_SLASH
@@ -65,7 +66,34 @@ class DoctypeTest < Minitest::Spec
6566
TOKEN_IDENTIFIER
6667
TOKEN_WHITESPACE
6768
TOKEN_IDENTIFIER
68-
TOKEN_TEXT_CONTENT
69+
TOKEN_CHARACTER
70+
TOKEN_IDENTIFIER
71+
TOKEN_SLASH
72+
TOKEN_SLASH
73+
TOKEN_IDENTIFIER
74+
TOKEN_QUOTE
75+
76+
TOKEN_WHITESPACE
77+
78+
TOKEN_QUOTE
79+
TOKEN_IDENTIFIER
80+
TOKEN_SLASH
81+
TOKEN_SLASH
82+
TOKEN_IDENTIFIER
83+
TOKEN_CHARACTER
84+
TOKEN_IDENTIFIER
85+
TOKEN_CHARACTER
86+
TOKEN_IDENTIFIER
87+
TOKEN_SLASH
88+
TOKEN_IDENTIFIER
89+
TOKEN_SLASH
90+
TOKEN_IDENTIFIER
91+
TOKEN_SLASH
92+
TOKEN_IDENTIFIER
93+
TOKEN_CHARACTER
94+
TOKEN_IDENTIFIER
95+
TOKEN_QUOTE
96+
6997
TOKEN_HTML_TAG_END
7098
TOKEN_EOF
7199
]

test/lexer/html_entities_test.rb

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# frozen_string_literal: true
2+
3+
require_relative "../test_helper"
4+
5+
module Lexer
6+
class HTMLEntitiesTest < Minitest::Spec
7+
test "&lt;" do
8+
result = ERBX.lex("&lt;")
9+
10+
expected = %w[
11+
TOKEN_AMPERSAND
12+
TOKEN_IDENTIFIER
13+
TOKEN_SEMICOLON
14+
TOKEN_EOF
15+
]
16+
17+
assert_equal expected, result.array.items.map(&:type)
18+
end
19+
20+
test "&gt;" do
21+
result = ERBX.lex("&gt;")
22+
23+
expected = %w[
24+
TOKEN_AMPERSAND
25+
TOKEN_IDENTIFIER
26+
TOKEN_SEMICOLON
27+
TOKEN_EOF
28+
]
29+
30+
assert_equal expected, result.array.items.map(&:type)
31+
end
32+
33+
test "&nbsp;" do
34+
result = ERBX.lex("&nbsp;")
35+
36+
expected = %w[
37+
TOKEN_AMPERSAND
38+
TOKEN_IDENTIFIER
39+
TOKEN_SEMICOLON
40+
TOKEN_EOF
41+
]
42+
43+
assert_equal expected, result.array.items.map(&:type)
44+
end
45+
46+
test "&quot;" do
47+
result = ERBX.lex("&quot;")
48+
49+
expected = %w[
50+
TOKEN_AMPERSAND
51+
TOKEN_IDENTIFIER
52+
TOKEN_SEMICOLON
53+
TOKEN_EOF
54+
]
55+
56+
assert_equal expected, result.array.items.map(&:type)
57+
end
58+
59+
test "&apos;" do
60+
result = ERBX.lex("&apos;")
61+
62+
expected = %w[
63+
TOKEN_AMPERSAND
64+
TOKEN_IDENTIFIER
65+
TOKEN_SEMICOLON
66+
TOKEN_EOF
67+
]
68+
69+
assert_equal expected, result.array.items.map(&:type)
70+
end
71+
72+
test "ampersand" do
73+
result = ERBX.lex("&amp;")
74+
75+
expected = %w[
76+
TOKEN_AMPERSAND
77+
TOKEN_IDENTIFIER
78+
TOKEN_SEMICOLON
79+
TOKEN_EOF
80+
]
81+
82+
assert_equal expected, result.array.items.map(&:type)
83+
end
84+
85+
test "literal ampersand" do
86+
result = ERBX.lex("&")
87+
88+
expected = %w[
89+
TOKEN_AMPERSAND
90+
TOKEN_EOF
91+
]
92+
93+
assert_equal expected, result.array.items.map(&:type)
94+
end
95+
end
96+
end

0 commit comments

Comments
 (0)