From fb9e6a20ac7393b6cc27949ad2d2af7a305c96ba Mon Sep 17 00:00:00 2001 From: Alexander Sulfrian Date: Wed, 20 May 2009 04:19:52 +0200 Subject: implemented lexer (with tokens and symbolTable) todo: beautify code, implement token classes for parser implemented test function with testcode moved token class to single file (token.py) --- src/front/__init__.py | 32 ++++++++++- src/front/lexer.py | 149 ++++++++++++++++++++++++++++++++------------------ src/front/symbols.py | 29 ++++++++-- src/front/token.py | 66 ++++++++++++++++++++++ 4 files changed, 215 insertions(+), 61 deletions(-) create mode 100644 src/front/token.py (limited to 'src/front') diff --git a/src/front/__init__.py b/src/front/__init__.py index 63529ef..426a16e 100644 --- a/src/front/__init__.py +++ b/src/front/__init__.py @@ -1,7 +1,33 @@ from front.lexer import Lexer from front.parser import Parser +from front.symbols import SymbolTable def main(): - lex = Lexer() - parse = Parser(lex) - parse.program() + source = '''fun fib[a] + if a < 2 + @1 + @( fib[a-1] + fib[a-2] ) +end + +# main function +fun main[] + sum = 0 + i = 0 + while (i < 10) + sum = sum + fib[i = i + 1] + end + @sum +end''' + + symbols = SymbolTable() + lex = Lexer(source, symbols) + + # testing + while True: + token = lex.scan() + print token.__repr__() + if not token: + break + + # parse = Parser(lex) + # parse.program() diff --git a/src/front/lexer.py b/src/front/lexer.py index aca7473..fe798a9 100644 --- a/src/front/lexer.py +++ b/src/front/lexer.py @@ -1,62 +1,103 @@ +# -*- coding: utf-8 -*- +import re +from token import * + class Lexer: - line = 1 + + def __init__(self, source, symbols): + self.symbols = symbols - def __init__(self): - return + self.source = source.splitlines() + self.source.reverse() + self.line = 0 + self.doubleNewlineCheck = False + self.currentLine = '' - def reserve(self, word): + # reservierte Wörter initialisieren + self.reservedWords = {'True': Token(Tag.TRUE), + 'False': Token(Tag.FALSE), + '[': Token(Tag.LBRAK), + ']': Token(Tag.RBRAK), + '(': Token(Tag.LPAREN), + ')': Token(Tag.RPAREN), + ',': Token(Tag.COMMA), + 'while': Token(Tag.WHILE), + 'if': Token(Tag.IF), + 'else': Token(Tag.ELSE), + 'fun': Token(Tag.FUN), + 'end': Token(Tag.END)} return - def scan(): + def reserve(self, word, token): + self.reservedWords[word] = token return -class Tag: - # lexer tokens - NUMBER = 1 - TRUE = 2 - FALSE = 3 - IDENT = 4 - WHILE = 5 - IF = 6 - ELSE = 7 - END = 8 - LBRAK = 9 - RBRAK = 10 - LPAREN = 11 - RPAREN = 12 - NEWLINE = 13 - COMMA = 14 - FUN = 15 - ASSIGNMENT = 16 - RETURN = 17 - OPERATOR = 18 - - # parser tokens - BOOL = 19 - JOIN = 20 - EQUALITY = 21 - RELATION = 22 - EXPRESSION = 23 - TERM = 24 - UNARY = 25 - FACTOR = 26 - IDENT_LIST = 27 - EXPRESSION_LIST = 28 - PROGRAM = 29 - FUNCTION = 30 - STATEMENT = 31 - STATEMENTS = 32 - IF_STATEMENT = 33 - WHILE_STATEMENT = 34 - RETURN_STATEMENT = 35 - ASSIGN_STATEMENT = 36 - FUNCTION_CALL = 37 - -class Token: - tag = None - - def __init__(self, tag): - return + def scan(self): + # wenn in der aktuellen Zeile nichts mehr steht + if (len(self.currentLine) == 0): + # wenn source zuende, dann None zurückgeben + if (len(self.source) <= 0): + return None - def __str__(self): - return + # nächste Zeile auslesen + self.line = self.line + 1 + self.currentLine = self.source.pop() + + # newline zurückgeben + if self.doubleNewlineCheck: + self.doubleNewlineCheck = False + return Token(Tag.NEWLINE) + + # leerzeichen entfernen + self.currentLine = self.currentLine.strip() + + # bei Kommentar, Rest der Zeile ignorieren + if self.currentLine.startswith('#'): + self.currentLine = '' + return self.scan() + + # keine doppelten Newlines + self.doubleNewlineCheck = True + + # Token parsen + if self.currentLine.startswith('@'): + self.currentLine = self.currentLine[1:] + return Token(Tag.RETURN) + + # reservierte Wörter (da stehen auch schon erkannte Identifyer drine) + for reservedWord, token in self.reservedWords.iteritems(): + if self.currentLine.startswith(reservedWord): + length = len(reservedWord) + + if len(self.currentLine) <= length or not self.currentLine[0].isalnum() or not self.currentLine[length].isalnum(): + self.currentLine = self.currentLine[length:] + return token + + # zahlen matchen + match = re.match(r"^([0-9]+)", self.currentLine) + if match: + self.currentLine = self.currentLine[match.end(0):] + return ValueToken(Tag.NUMBER, int(match.group(0))) + + # operatoren matchen + match = re.match(r"^(<=|==|>=|&&|\|\||<|>|\+|-|\*|/)", self.currentLine) + if match: + self.currentLine = self.currentLine[match.end(0):] + return ValueToken(Tag.OPERATOR, match.group(0)) + + # idents matchen + match = re.match(r"^([a-zA-Z][a-zA-Z0-9]*)", self.currentLine) + if match: + self.currentLine = self.currentLine[match.end(0):] + token = ValueToken(Tag.IDENT, self.symbols.getOrPut(match.group(0))) + self.reserve(match.group(0), token) + return token + + # assignments + if self.currentLine.startswith('='): + self.currentLine = self.currentLine[1:] + return Token(Tag.ASSIGNMENT) + + # wenn die programmausführung hier ist, + # ist ein syntaxfehler aufgetreten + raise Exception("Syntax Error in line: %d at: '%s'" % (self.line, self.currentLine)) diff --git a/src/front/symbols.py b/src/front/symbols.py index f4ab40e..a868d77 100644 --- a/src/front/symbols.py +++ b/src/front/symbols.py @@ -1,9 +1,30 @@ class SymbolTable: def __init__(self): + self.symbols = {} return - def put(token, id): - return + def put(self, token, id=None): + if id and not id.isdigit(): + raise Exception("Only digits as id possible. '%s' is not a number" % id) - def get(token): - return + if id == None: + if len(self.symbols) <= 0: + id = 0 + else: + id = max(self.symbols.values()) + 1 + + self.symbols[token] = id + return id + + + def get(self, token): + if token in self.symbols: + return self.symbols[token] + + return None + + def getOrPut(self, token): + if self.get(token): + return self.get(token) + + return self.put(token) diff --git a/src/front/token.py b/src/front/token.py new file mode 100644 index 0000000..def0236 --- /dev/null +++ b/src/front/token.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +# date: 20 Mai 2009 + +class Tag: + # lexer tokens + NUMBER = 1 + TRUE = 2 + FALSE = 3 + IDENT = 4 + WHILE = 5 + IF = 6 + ELSE = 7 + END = 8 + LBRAK = 9 + RBRAK = 10 + LPAREN = 11 + RPAREN = 12 + NEWLINE = 13 + COMMA = 14 + FUN = 15 + ASSIGNMENT = 16 + RETURN = 17 + OPERATOR = 18 + + # parser tokens + BOOL = 19 + JOIN = 20 + EQUALITY = 21 + RELATION = 22 + EXPRESSION = 23 + TERM = 24 + UNARY = 25 + FACTOR = 26 + IDENT_LIST = 27 + EXPRESSION_LIST = 28 + PROGRAM = 29 + FUNCTION = 30 + STATEMENT = 31 + STATEMENTS = 32 + IF_STATEMENT = 33 + WHILE_STATEMENT = 34 + RETURN_STATEMENT = 35 + ASSIGN_STATEMENT = 36 + FUNCTION_CALL = 37 + +class Token: + tag = None + + def __init__(self, tag): + self.tag = tag + return + + def __repr__(self): + return "" % self.tag + +class ValueToken(Token): + value = None + + def __init__(self, tag, value): + Token.__init__(self, tag) + self.value = value + return + + def __repr__(self): + return "" % (self.tag, self.value.__str__()) + -- cgit v1.2.3