# -*- coding: utf-8 -*-
# Lexer for the front end (src/front/lexer.py).
#
# The Tag and Token classes that used to be defined in this module now live
# in token.py; they are brought into scope by the star import below, which
# is also expected to provide ValueToken.
import re
from token import *


class Lexer:
    """Line-oriented tokenizer that hands out one token per ``scan()`` call.

    The source text is split into lines up front.  ``scan()`` consumes the
    current line from the left, token by token, and moves on to the next
    line once the current one is exhausted.
    """

    # Compiled once at class creation instead of on every scan() call.
    _NUMBER_RE = re.compile(r"^([0-9]+)")
    _OPERATOR_RE = re.compile(r"^(<=|==|>=|&&|\|\||<|>|\+|-|\*|/)")
    _IDENT_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9]*)")

    def __init__(self, source, symbols):
        """Prepare the lexer for ``source``.

        source  -- complete program text as one string.
        symbols -- symbol table; must offer ``getOrPut(name)``, whose result
                   becomes the value of every IDENT token.
        """
        self.symbols = symbols

        # Lines are stored reversed so that the next line is a cheap pop()
        # from the end of the list.
        self.source = source.splitlines()
        self.source.reverse()
        self.line = 0
        # True while at least one token has been produced since the last
        # NEWLINE; prevents emitting two NEWLINE tokens in a row.
        self.doubleNewlineCheck = False
        self.currentLine = ''

        # Reserved words and punctuation.  Identifiers that have already
        # been seen are added here as well (see scan()), so the same token
        # object is returned for every occurrence of a name.
        self.reservedWords = {'True': Token(Tag.TRUE),
                              'False': Token(Tag.FALSE),
                              '[': Token(Tag.LBRAK),
                              ']': Token(Tag.RBRAK),
                              '(': Token(Tag.LPAREN),
                              ')': Token(Tag.RPAREN),
                              ',': Token(Tag.COMMA),
                              'while': Token(Tag.WHILE),
                              'if': Token(Tag.IF),
                              'else': Token(Tag.ELSE),
                              'fun': Token(Tag.FUN),
                              'end': Token(Tag.END)}

    def reserve(self, word, token):
        """Register ``token`` as the token returned whenever ``word`` is read."""
        self.reservedWords[word] = token

    def scan(self):
        """Return the next token, or ``None`` once the source is exhausted.

        A NEWLINE token is produced when the lexer advances to a new line
        after having produced at least one other token since the previous
        NEWLINE.  Blank lines and comment-only lines therefore produce no
        NEWLINE of their own, and no NEWLINE is produced after the last line.

        Raises ``Exception`` when the remaining text of the current line
        does not start with any known token.
        """
        # Step 1: make currentLine start at the next token.  This is a loop
        # rather than a recursive call so that long runs of comment or blank
        # lines cannot exhaust the recursion limit.
        while True:
            if not self.currentLine:
                # Current line used up: fetch the next one, if any.
                if not self.source:
                    return None

                self.line += 1
                self.currentLine = self.source.pop()

                # Report the line break, but only once per run of tokens.
                if self.doubleNewlineCheck:
                    self.doubleNewlineCheck = False
                    return Token(Tag.NEWLINE)

            # Drop surrounding whitespace.
            self.currentLine = self.currentLine.strip()

            if self.currentLine and not self.currentLine.startswith('#'):
                break

            # Nothing left but whitespace, or a comment: ignore the rest of
            # the line and continue with the next one.  (Previously an empty
            # remainder fell through to the syntax-error branch, so blank
            # lines and trailing whitespace were rejected.)
            self.currentLine = ''

        # A real token follows, so the next line break must be reported.
        self.doubleNewlineCheck = True

        line = self.currentLine

        # Step 2: recognise the token at the start of the line.
        if line.startswith('@'):
            self.currentLine = line[1:]
            return Token(Tag.RETURN)

        # Reserved words, punctuation and already-seen identifiers.
        # items() is used because iteritems() exists only on Python 2.
        for reservedWord, token in self.reservedWords.items():
            if not line.startswith(reservedWord):
                continue

            length = len(reservedWord)
            # Accept the word if it ends the line, if it is punctuation
            # (first character not alphanumeric), or if the character after
            # it is not alphanumeric -- so 'if' does not match inside 'ifx'.
            if (len(line) <= length
                    or not line[0].isalnum()
                    or not line[length].isalnum()):
                self.currentLine = line[length:]
                return token

        # Integer literals.
        match = self._NUMBER_RE.match(line)
        if match:
            self.currentLine = line[match.end(0):]
            return ValueToken(Tag.NUMBER, int(match.group(0)))

        # Operators; '==' is tried here before the plain '=' check below.
        match = self._OPERATOR_RE.match(line)
        if match:
            self.currentLine = line[match.end(0):]
            return ValueToken(Tag.OPERATOR, match.group(0))

        # Identifiers: look the name up in (or add it to) the symbol table
        # and cache the token so later occurrences reuse the same object.
        match = self._IDENT_RE.match(line)
        if match:
            self.currentLine = line[match.end(0):]
            name = match.group(0)
            token = ValueToken(Tag.IDENT, self.symbols.getOrPut(name))
            self.reserve(name, token)
            return token

        # Assignment.
        if line.startswith('='):
            self.currentLine = line[1:]
            return Token(Tag.ASSIGNMENT)

        # Reaching this point means no token pattern matched.
        raise Exception("Syntax Error in line: %d at: '%s'" % (self.line, self.currentLine))