author     Alexander Sulfrian <alexander@sulfrian.net>  2009-05-20 04:19:52 +0200
committer  Alexander Sulfrian <alexander@sulfrian.net>  2009-05-20 04:19:52 +0200
commit     fb9e6a20ac7393b6cc27949ad2d2af7a305c96ba (patch)
tree       2be8d1ea500651ae7415deb9ba718473969468e0 /src/front/lexer.py
parent     b649933b98691a43e7e3adf158109ced285e802c (diff)
implemented lexer (with tokens and symbolTable)

todo: beautify code, implement token classes for parser
implemented test function with testcode
moved token class to single file (token.py)
Diffstat (limited to 'src/front/lexer.py')
-rw-r--r--  src/front/lexer.py  149
1 file changed, 95 insertions, 54 deletions
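
The new lexer imports Token, ValueToken and Tag from token.py, which the commit message says now holds the token classes but which is outside this diff. Purely as a reading aid, here is a minimal sketch of what that module might look like: the Tag values are copied from the Tag class removed in the diff below, while the Token and ValueToken bodies (attribute names, __str__ output) are assumptions inferred from how the lexer uses them.

# -*- coding: utf-8 -*-
# Hypothetical token.py sketch; the real module is not part of this diff.

class Tag:
    # lexer tokens (values taken from the Tag class removed in the diff below)
    NUMBER = 1
    TRUE = 2
    FALSE = 3
    IDENT = 4
    WHILE = 5
    IF = 6
    ELSE = 7
    END = 8
    LBRAK = 9
    RBRAK = 10
    LPAREN = 11
    RPAREN = 12
    NEWLINE = 13
    COMMA = 14
    FUN = 15
    ASSIGNMENT = 16
    RETURN = 17
    OPERATOR = 18
    # parser tags (BOOL = 19 .. FUNCTION_CALL = 37) omitted here for brevity

class Token:
    # a plain token only carries its tag
    def __init__(self, tag):
        self.tag = tag

    def __str__(self):
        return "<Token %d>" % self.tag

class ValueToken(Token):
    # a token that also carries a value, e.g. a number, an operator string
    # or a symbol table id for identifiers
    def __init__(self, tag, value):
        Token.__init__(self, tag)
        self.value = value

    def __str__(self):
        return "<Token %d: %s>" % (self.tag, self.value)
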
diff --git a/src/front/lexer.py b/src/front/lexer.py
index aca7473..fe798a9 100644
--- a/src/front/lexer.py
+++ b/src/front/lexer.py
@@ -1,62 +1,103 @@
+# -*- coding: utf-8 -*-
+import re
+from token import *
+
class Lexer:
-    line = 1
+
+    def __init__(self, source, symbols):
+        self.symbols = symbols
-    def __init__(self):
-        return
+        self.source = source.splitlines()
+        self.source.reverse()
+        self.line = 0
+        self.doubleNewlineCheck = False
+        self.currentLine = ''
-    def reserve(self, word):
+        # initialize the reserved words
+        self.reservedWords = {'True': Token(Tag.TRUE),
+                              'False': Token(Tag.FALSE),
+                              '[': Token(Tag.LBRAK),
+                              ']': Token(Tag.RBRAK),
+                              '(': Token(Tag.LPAREN),
+                              ')': Token(Tag.RPAREN),
+                              ',': Token(Tag.COMMA),
+                              'while': Token(Tag.WHILE),
+                              'if': Token(Tag.IF),
+                              'else': Token(Tag.ELSE),
+                              'fun': Token(Tag.FUN),
+                              'end': Token(Tag.END)}
        return
-    def scan():
+    def reserve(self, word, token):
+        self.reservedWords[word] = token
        return
-class Tag:
-    # lexer tokens
-    NUMBER = 1
-    TRUE = 2
-    FALSE = 3
-    IDENT = 4
-    WHILE = 5
-    IF = 6
-    ELSE = 7
-    END = 8
-    LBRAK = 9
-    RBRAK = 10
-    LPAREN = 11
-    RPAREN = 12
-    NEWLINE = 13
-    COMMA = 14
-    FUN = 15
-    ASSIGNMENT = 16
-    RETURN = 17
-    OPERATOR = 18
-
-    # parser tokens
-    BOOL = 19
-    JOIN = 20
-    EQUALITY = 21
-    RELATION = 22
-    EXPRESSION = 23
-    TERM = 24
-    UNARY = 25
-    FACTOR = 26
-    IDENT_LIST = 27
-    EXPRESSION_LIST = 28
-    PROGRAM = 29
-    FUNCTION = 30
-    STATEMENT = 31
-    STATEMENTS = 32
-    IF_STATEMENT = 33
-    WHILE_STATEMENT = 34
-    RETURN_STATEMENT = 35
-    ASSIGN_STATEMENT = 36
-    FUNCTION_CALL = 37
-
-class Token:
-    tag = None
-
-    def __init__(self, tag):
-        return
+    def scan(self):
+        # nothing left on the current line
+        if (len(self.currentLine) == 0):
+            # if the source is exhausted, return None
+            if (len(self.source) <= 0):
+                return None
-    def __str__(self):
-        return
+            # read the next line
+            self.line = self.line + 1
+            self.currentLine = self.source.pop()
+
+            # emit a newline token
+            if self.doubleNewlineCheck:
+                self.doubleNewlineCheck = False
+                return Token(Tag.NEWLINE)
+
+        # strip surrounding whitespace
+        self.currentLine = self.currentLine.strip()
+
+        # on a comment, ignore the rest of the line
+        if self.currentLine.startswith('#'):
+            self.currentLine = ''
+            return self.scan()
+
+        # no duplicate newline tokens
+        self.doubleNewlineCheck = True
+
+        # parse tokens
+        if self.currentLine.startswith('@'):
+            self.currentLine = self.currentLine[1:]
+            return Token(Tag.RETURN)
+
+        # reserved words (already recognized identifiers are stored here as well)
+        for reservedWord, token in self.reservedWords.iteritems():
+            if self.currentLine.startswith(reservedWord):
+                length = len(reservedWord)
+
+                if len(self.currentLine) <= length or not self.currentLine[0].isalnum() or not self.currentLine[length].isalnum():
+                    self.currentLine = self.currentLine[length:]
+                    return token
+
+        # match numbers
+        match = re.match(r"^([0-9]+)", self.currentLine)
+        if match:
+            self.currentLine = self.currentLine[match.end(0):]
+            return ValueToken(Tag.NUMBER, int(match.group(0)))
+
+        # match operators
+        match = re.match(r"^(<=|==|>=|&&|\|\||<|>|\+|-|\*|/)", self.currentLine)
+        if match:
+            self.currentLine = self.currentLine[match.end(0):]
+            return ValueToken(Tag.OPERATOR, match.group(0))
+
+        # match identifiers
+        match = re.match(r"^([a-zA-Z][a-zA-Z0-9]*)", self.currentLine)
+        if match:
+            self.currentLine = self.currentLine[match.end(0):]
+            token = ValueToken(Tag.IDENT, self.symbols.getOrPut(match.group(0)))
+            self.reserve(match.group(0), token)
+            return token
+
+        # assignments
+        if self.currentLine.startswith('='):
+            self.currentLine = self.currentLine[1:]
+            return Token(Tag.ASSIGNMENT)
+
+        # if execution reaches this point,
+        # a syntax error has occurred
+        raise Exception("Syntax Error in line: %d at: '%s'" % (self.line, self.currentLine))
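
A small driver sketch showing how the new Lexer might be exercised (not part of the commit): the SymbolTable class is a hypothetical stand-in for whatever object the project actually passes in, since the diff only requires it to expose a getOrPut(name) method, and the imports assume lexer.py and token.py are on the module path.

# -*- coding: utf-8 -*-
# Minimal driver sketch (assumption, not part of this commit).
from lexer import Lexer

class SymbolTable:
    # hypothetical symbol table mapping identifier names to numeric ids
    def __init__(self):
        self.names = {}

    def getOrPut(self, name):
        # return the existing id or assign the next free one
        return self.names.setdefault(name, len(self.names))

source = "x = 1\nwhile x <= 10\n    x = x + 2\nend\n"

lexer = Lexer(source, SymbolTable())
tokens = []
token = lexer.scan()
while token is not None:      # scan() returns None once the source is consumed
    tokens.append(token)
    token = lexer.scan()

Two design details worth noting from the code above: the identifier branch calls reserve(), so a repeated identifier is served from reservedWords with the same ValueToken on later scans, and the doubleNewlineCheck flag collapses consecutive blank or comment-only lines into a single NEWLINE token.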