@@ -96,7 +96,8 @@ static token_T* lexer_match_and_advance(lexer_T* lexer, const char* value, token
9696static token_T * lexer_parse_whitespace (lexer_T * lexer ) {
9797 buffer_T buffer = buffer_new ();
9898
99- while (isspace (lexer -> current_character ) && lexer -> current_character != '\n' && lexer -> current_character != '\r' ) {
99+ while (isspace (lexer -> current_character ) && lexer -> current_character != '\n' && lexer -> current_character != '\r'
100+ && !lexer_eof (lexer )) {
100101 buffer_append_char (& buffer , lexer -> current_character );
101102 lexer_advance (lexer );
102103 }
@@ -109,7 +110,7 @@ static token_T* lexer_parse_identifier(lexer_T* lexer) {
109110
110111 while ((isalnum (lexer -> current_character ) || lexer -> current_character == '-' || lexer -> current_character == '_'
111112 || lexer -> current_character == ':' )
112- && !lexer_peek_for_html_comment_end (lexer , 0 )) {
113+ && !lexer_peek_for_html_comment_end (lexer , 0 ) && ! lexer_eof ( lexer ) ) {
113114
114115 buffer_append_char (& buffer , lexer -> current_character );
115116 lexer_advance (lexer );
@@ -118,17 +119,6 @@ static token_T* lexer_parse_identifier(lexer_T* lexer) {
118119 return token_init (buffer .value , TOKEN_IDENTIFIER , lexer );
119120}
120121
121- static token_T * lexer_parse_text_content (lexer_T * lexer ) {
122- buffer_T buffer = buffer_new ();
123-
124- while (lexer -> current_character != '<' && lexer -> current_character != '>' && !lexer_eof (lexer )) {
125- buffer_append_char (& buffer , lexer -> current_character );
126- lexer_advance (lexer );
127- }
128-
129- return token_init (buffer .value , TOKEN_TEXT_CONTENT , lexer );
130- }
131-
132122// ===== ERB Parsing
133123
134124static token_T * lexer_parse_erb_open (lexer_T * lexer ) {
@@ -180,6 +170,9 @@ token_T* lexer_next_token(lexer_T* lexer) {
180170
181171 if (lexer -> current_character == '\n' ) { return lexer_advance_current (lexer , TOKEN_NEWLINE ); }
182172 if (isspace (lexer -> current_character )) { return lexer_parse_whitespace (lexer ); }
173+ if (lexer -> current_character == '\xC2' && lexer_peek (lexer , 1 ) == '\xA0' ) {
174+ return lexer_advance_with (lexer , "\xC2\xA0" , TOKEN_NBSP );
175+ }
183176
184177 switch (lexer -> current_character ) {
185178 case '<' : {
@@ -211,6 +204,8 @@ token_T* lexer_next_token(lexer_T* lexer) {
211204 case '>' : return lexer_advance_current (lexer , TOKEN_HTML_TAG_END );
212205 case '_' : return lexer_advance_current (lexer , TOKEN_UNDERSCORE );
213206 case ':' : return lexer_advance_current (lexer , TOKEN_COLON );
207+ case ';' : return lexer_advance_current (lexer , TOKEN_SEMICOLON );
208+ case '&' : return lexer_advance_current (lexer , TOKEN_AMPERSAND );
214209 case '!' : return lexer_advance_current (lexer , TOKEN_EXCLAMATION );
215210 case '=' : return lexer_advance_current (lexer , TOKEN_EQUALS );
216211 case '%' : return lexer_advance_current (lexer , TOKEN_PERCENT );
@@ -219,11 +214,9 @@ token_T* lexer_next_token(lexer_T* lexer) {
219214 case '\'' : return lexer_advance_current (lexer , TOKEN_QUOTE );
220215
221216 default : {
222- if (isalnum (lexer -> current_character ) || lexer -> current_character == '_' ) {
223- return lexer_parse_identifier (lexer );
224- }
217+ if (isalnum (lexer -> current_character )) { return lexer_parse_identifier (lexer ); }
225218
226- return lexer_parse_text_content (lexer );
219+ return lexer_advance_current (lexer , TOKEN_CHARACTER );
227220 }
228221 }
229222}
0 commit comments