Add identifiers, Completed chapter 4, the Scanner

RoelAdriaans · RoelAdriaans · commit f55c50b62de6 · 2020-08-01T20:53:08.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,20 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+
+## [0.0.1] - 2020-08-01
+
 ### Added
 
 - Completing chapter 4
-- Completed including chapter 4.6.2
 - Created CHANGELOG.md, README.md, STATUS.md
 - Created first structure of the project, including tox, flake8, mypy, black and
   other utilities
 - Implemented `run`, `run_file` and `run_prompt` methods
 - Added `Token`, `Scanner`, `TokenType`
-- Support strings and numbers
-
-## [0.0.1] - 2020-00-00
-
-Todo, this is just a placeholder
+- Support `strings`, `numbers` and `identifiers`
 
 [Unreleased]: https://github.com/RoelAdriaans/yaplox/compare/v0.0.1...HEAD
 [0.0.1]: https://github.com/RoelAdriaans/yaplox/releases/tag/v0.0.1
diff --git a/src/yaplox/scanner.py b/src/yaplox/scanner.py
@@ -9,6 +9,24 @@ class Scanner:
     start: int = 0
     current: int = 0
     line: int = 1
+    keywords = {
+        "and": TokenType.AND,
+        "class": TokenType.CLASS,
+        "else": TokenType.ELSE,
+        "false": TokenType.FALSE,
+        "for": TokenType.FOR,
+        "fun": TokenType.FUN,
+        "if": TokenType.IF,
+        "nil": TokenType.NIL,
+        "or": TokenType.OR,
+        "print": TokenType.PRINT,
+        "return": TokenType.RETURN,
+        "super": TokenType.SUPER,
+        "this": TokenType.THIS,
+        "true": TokenType.TRUE,
+        "var": TokenType.VAR,
+        "while": TokenType.WHILE,
+    }
 
     def __init__(self, source: str, on_error=None):
         """
@@ -76,6 +94,16 @@ def _number(self):
         number_value = self.source[self.start : self.current]
         self._add_token(TokenType.NUMBER, float(number_value))
 
+    def _identifier(self):
+        while self._peek().isalnum() or self._peek() == "_":
+            self._advance()
+
+        # See if the identifier is a reserved word
+        text = self.source[self.start : self.current]
+        token_type = self.keywords.get(text, TokenType.IDENTIFIER)
+
+        self._add_token(token_type=token_type)
+
     def _scan_token(self):
         """ Scan tokens"""
         c = self._advance()
@@ -126,8 +154,12 @@ def _scan_token(self):
             if c.isdigit():
                 # An digit encountered, consume the number
                 self._number()
-            # If we have an on_error callback, run this, otherwise raise the error again
+            elif c.isalpha() or c == "_":
+                # A letter or underscore encountered, consume the identifier
+                self._identifier()
             elif self.on_error:
+                # If we have an on_error callback, run this, otherwise raise the
+                # error again
                 self.on_error(self.line, f"Unexpected character: {c}")
             else:
                 raise
diff --git a/tests/test_scanner.py b/tests/test_scanner.py
@@ -176,3 +176,59 @@ def test_scanner_with_number(self, mocker):
         assert tokens[5].literal == 13.0
 
         assert not on_error_mock.called
+
+    def test_scanner_identifier(self, mocker):
+        source = "appelflap or nil if while _foo_bar_1_2"
+
+        on_error_mock = mocker.MagicMock()
+        scanner = Scanner(source, on_error=on_error_mock)
+
+        tokens = scanner.scan_tokens()
+
+        assert tokens[0].token_type == TokenType.IDENTIFIER
+        assert tokens[0].lexeme == "appelflap"
+
+        assert tokens[1].token_type == TokenType.OR
+        assert tokens[2].token_type == TokenType.NIL
+        assert tokens[3].token_type == TokenType.IF
+        assert tokens[4].token_type == TokenType.WHILE
+
+        assert tokens[5].token_type == TokenType.IDENTIFIER
+        assert tokens[5].lexeme == "_foo_bar_1_2"
+
+        assert not on_error_mock.called
+
+    def test_scanner_invalid_identifier(self, mocker):
+        # The bit of source code below is completely wrong: the identifiers and
+        # numbers in here will result in valid tokens, but not the tokens you
+        # would expect. This is not a problem of the scanner, it just does as it's
+        # told.
+        source = "123foo_bar bar-stool spam_egg_1.3_chickens"
+
+        on_error_mock = mocker.MagicMock()
+        scanner = Scanner(source, on_error=on_error_mock)
+
+        tokens = scanner.scan_tokens()
+
+        assert tokens[0].literal == 123.0
+
+        assert tokens[1].lexeme == "foo_bar"
+        assert tokens[1].token_type == TokenType.IDENTIFIER
+
+        assert tokens[2].lexeme == "bar"
+        assert tokens[2].token_type == TokenType.IDENTIFIER
+
+        assert tokens[3].token_type == TokenType.MINUS
+
+        assert tokens[4].lexeme == "stool"
+        assert tokens[5].lexeme == "spam_egg_1"
+        assert tokens[6].token_type == TokenType.DOT
+
+        # This number does not include the preceding 1, since that was still part
+        # of the identifier. The dot ended the identifier, and then a number started
+        assert tokens[7].token_type == TokenType.NUMBER
+        assert tokens[7].literal == 3.0
+
+        assert tokens[8].lexeme == "_chickens"
+
+        assert not on_error_mock.called