author    Alexander Sulfrian <alexander@sulfrian.net>  2009-05-20 04:19:52 +0200
committer Alexander Sulfrian <alexander@sulfrian.net>  2009-05-20 04:19:52 +0200
commit    fb9e6a20ac7393b6cc27949ad2d2af7a305c96ba (patch)
tree      2be8d1ea500651ae7415deb9ba718473969468e0
parent    b649933b98691a43e7e3adf158109ced285e802c (diff)
download  swppy-fb9e6a20ac7393b6cc27949ad2d2af7a305c96ba.tar.gz
          swppy-fb9e6a20ac7393b6cc27949ad2d2af7a305c96ba.tar.xz
          swppy-fb9e6a20ac7393b6cc27949ad2d2af7a305c96ba.zip
implemented lexer (with tokens and symbolTable)
todo: beautify code, implement token classes for parser
implemented test function with testcode
moved token class to single file (token.py)
-rw-r--r--   .gitignore               1
-rw-r--r--   src/front/__init__.py   32
-rw-r--r--   src/front/lexer.py     149
-rw-r--r--   src/front/symbols.py    29
-rw-r--r--   src/front/token.py      66
5 files changed, 216 insertions, 61 deletions
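
For reviewers who want to try the patch: the test function mentioned in the commit message can be exercised roughly as follows. This is only a sketch of an assumed local setup (Python 2, run from the src/ directory so that the front package and the existing front/parser.py are importable); it is not part of the patch.

    import front

    # runs the embedded fib/main sample through the new lexer and
    # prints one token repr per call to Lexer.scan()
    front.main()
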
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/src/front/__init__.py b/src/front/__init__.py
index 63529ef..426a16e 100644
--- a/src/front/__init__.py
+++ b/src/front/__init__.py
@@ -1,7 +1,33 @@
from front.lexer import Lexer
from front.parser import Parser
+from front.symbols import SymbolTable
def main():
- lex = Lexer()
- parse = Parser(lex)
- parse.program()
+ source = '''fun fib[a]
+ if a < 2
+ @1
+ @( fib[a-1] + fib[a-2] )
+end
+
+# main function
+fun main[]
+ sum = 0
+ i = 0
+ while (i < 10)
+ sum = sum + fib[i = i + 1]
+ end
+ @sum
+end'''
+
+ symbols = SymbolTable()
+ lex = Lexer(source, symbols)
+
+ # testing
+ while True:
+ token = lex.scan()
+ if not token:
+ break
+ print repr(token)
+
+ # parse = Parser(lex)
+ # parse.program()
diff --git a/src/front/lexer.py b/src/front/lexer.py
index aca7473..fe798a9 100644
--- a/src/front/lexer.py
+++ b/src/front/lexer.py
@@ -1,62 +1,103 @@
+# -*- coding: utf-8 -*-
+import re
+from token import *
+
class Lexer:
- line = 1
+
+ def __init__(self, source, symbols):
+ self.symbols = symbols
- def __init__(self):
- return
+ self.source = source.splitlines()
+ self.source.reverse()
+ self.line = 0
+ self.doubleNewlineCheck = False
+ self.currentLine = ''
- def reserve(self, word):
+ # initialize the reserved words
+ self.reservedWords = {'True': Token(Tag.TRUE),
+ 'False': Token(Tag.FALSE),
+ '[': Token(Tag.LBRAK),
+ ']': Token(Tag.RBRAK),
+ '(': Token(Tag.LPAREN),
+ ')': Token(Tag.RPAREN),
+ ',': Token(Tag.COMMA),
+ 'while': Token(Tag.WHILE),
+ 'if': Token(Tag.IF),
+ 'else': Token(Tag.ELSE),
+ 'fun': Token(Tag.FUN),
+ 'end': Token(Tag.END)}
return
- def scan():
+ def reserve(self, word, token):
+ self.reservedWords[word] = token
return
-class Tag:
- # lexer tokens
- NUMBER = 1
- TRUE = 2
- FALSE = 3
- IDENT = 4
- WHILE = 5
- IF = 6
- ELSE = 7
- END = 8
- LBRAK = 9
- RBRAK = 10
- LPAREN = 11
- RPAREN = 12
- NEWLINE = 13
- COMMA = 14
- FUN = 15
- ASSIGNMENT = 16
- RETURN = 17
- OPERATOR = 18
-
- # parser tokens
- BOOL = 19
- JOIN = 20
- EQUALITY = 21
- RELATION = 22
- EXPRESSION = 23
- TERM = 24
- UNARY = 25
- FACTOR = 26
- IDENT_LIST = 27
- EXPRESSION_LIST = 28
- PROGRAM = 29
- FUNCTION = 30
- STATEMENT = 31
- STATEMENTS = 32
- IF_STATEMENT = 33
- WHILE_STATEMENT = 34
- RETURN_STATEMENT = 35
- ASSIGN_STATEMENT = 36
- FUNCTION_CALL = 37
-
-class Token:
- tag = None
-
- def __init__(self, tag):
- return
+ def scan(self):
+ # if there is nothing left on the current line
+ if (len(self.currentLine) == 0):
+ # if the source is exhausted, return None
+ if (len(self.source) <= 0):
+ return None
- def __str__(self):
- return
+ # read the next line
+ self.line = self.line + 1
+ self.currentLine = self.source.pop()
+
+ # return a newline token
+ if self.doubleNewlineCheck:
+ self.doubleNewlineCheck = False
+ return Token(Tag.NEWLINE)
+
+ # strip whitespace
+ self.currentLine = self.currentLine.strip()
+
+ # on a comment, skip the rest of the line
+ if self.currentLine.startswith('#'):
+ self.currentLine = ''
+ return self.scan()
+
+ # avoid duplicate newline tokens
+ self.doubleNewlineCheck = True
+
+ # parse the next token
+ if self.currentLine.startswith('@'):
+ self.currentLine = self.currentLine[1:]
+ return Token(Tag.RETURN)
+
+ # reserved words (already recognized identifiers end up in here as well)
+ for reservedWord, token in self.reservedWords.iteritems():
+ if self.currentLine.startswith(reservedWord):
+ length = len(reservedWord)
+
+ if len(self.currentLine) <= length or not self.currentLine[0].isalnum() or not self.currentLine[length].isalnum():
+ self.currentLine = self.currentLine[length:]
+ return token
+
+ # match numbers
+ match = re.match(r"^([0-9]+)", self.currentLine)
+ if match:
+ self.currentLine = self.currentLine[match.end(0):]
+ return ValueToken(Tag.NUMBER, int(match.group(0)))
+
+ # match operators
+ match = re.match(r"^(<=|==|>=|&&|\|\||<|>|\+|-|\*|/)", self.currentLine)
+ if match:
+ self.currentLine = self.currentLine[match.end(0):]
+ return ValueToken(Tag.OPERATOR, match.group(0))
+
+ # match identifiers
+ match = re.match(r"^([a-zA-Z][a-zA-Z0-9]*)", self.currentLine)
+ if match:
+ self.currentLine = self.currentLine[match.end(0):]
+ token = ValueToken(Tag.IDENT, self.symbols.getOrPut(match.group(0)))
+ self.reserve(match.group(0), token)
+ return token
+
+ # assignments
+ if self.currentLine.startswith('='):
+ self.currentLine = self.currentLine[1:]
+ return Token(Tag.ASSIGNMENT)
+
+ # if execution reaches this point,
+ # a syntax error has occurred
+ raise Exception("Syntax Error in line: %d at: '%s'" % (self.line, self.currentLine))
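
As an illustration of what Lexer.scan() produces, a minimal standalone sketch (same assumed setup as above; the input string and the expected output are examples for orientation, not taken from the patch):

    from front.lexer import Lexer
    from front.symbols import SymbolTable

    symbols = SymbolTable()
    lex = Lexer("x = x + 1", symbols)

    # scan() returns one token per call and None once the source is exhausted
    token = lex.scan()
    while token:
        print repr(token)
        token = lex.scan()

    # this should print roughly (Tag.IDENT = 4, Tag.ASSIGNMENT = 16,
    # Tag.OPERATOR = 18, Tag.NUMBER = 1; the ident 'x' gets symbol id 0):
    #   <Token: 4 Value: 0>
    #   <Token: 16>
    #   <Token: 4 Value: 0>
    #   <Token: 18 Value: +>
    #   <Token: 1 Value: 1>
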
diff --git a/src/front/symbols.py b/src/front/symbols.py
index f4ab40e..a868d77 100644
--- a/src/front/symbols.py
+++ b/src/front/symbols.py
@@ -1,9 +1,30 @@
class SymbolTable:
def __init__(self):
+ self.symbols = {}
return
- def put(token, id):
- return
+ def put(self, token, id=None):
+ if id is not None and not str(id).isdigit():
+ raise Exception("Only numeric ids are allowed. '%s' is not a number" % id)
- def get(token):
- return
+ if id is None:
+ if len(self.symbols) <= 0:
+ id = 0
+ else:
+ id = max(self.symbols.values()) + 1
+
+ self.symbols[token] = id
+ return id
+
+
+ def get(self, token):
+ if token in self.symbols:
+ return self.symbols[token]
+
+ return None
+
+ def getOrPut(self, token):
+ if self.get(token) is not None:
+ return self.get(token)
+
+ return self.put(token)
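
A quick sketch of the SymbolTable contract as implemented above (ids are handed out in insertion order starting at 0, which is why getOrPut compares against None rather than truth-testing; standalone example, not part of the patch):

    from front.symbols import SymbolTable

    symbols = SymbolTable()
    print symbols.getOrPut('fib')    # 0 - first symbol gets the first free id
    print symbols.getOrPut('main')   # 1 - next free id
    print symbols.getOrPut('fib')    # 0 - existing entries are reused
    print symbols.get('unknown')     # None - get() does not create entries
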
diff --git a/src/front/token.py b/src/front/token.py
new file mode 100644
index 0000000..def0236
--- /dev/null
+++ b/src/front/token.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+ # date: 20 May 2009
+
+class Tag:
+ # lexer tokens
+ NUMBER = 1
+ TRUE = 2
+ FALSE = 3
+ IDENT = 4
+ WHILE = 5
+ IF = 6
+ ELSE = 7
+ END = 8
+ LBRAK = 9
+ RBRAK = 10
+ LPAREN = 11
+ RPAREN = 12
+ NEWLINE = 13
+ COMMA = 14
+ FUN = 15
+ ASSIGNMENT = 16
+ RETURN = 17
+ OPERATOR = 18
+
+ # parser tokens
+ BOOL = 19
+ JOIN = 20
+ EQUALITY = 21
+ RELATION = 22
+ EXPRESSION = 23
+ TERM = 24
+ UNARY = 25
+ FACTOR = 26
+ IDENT_LIST = 27
+ EXPRESSION_LIST = 28
+ PROGRAM = 29
+ FUNCTION = 30
+ STATEMENT = 31
+ STATEMENTS = 32
+ IF_STATEMENT = 33
+ WHILE_STATEMENT = 34
+ RETURN_STATEMENT = 35
+ ASSIGN_STATEMENT = 36
+ FUNCTION_CALL = 37
+
+class Token:
+ tag = None
+
+ def __init__(self, tag):
+ self.tag = tag
+ return
+
+ def __repr__(self):
+ return "<Token: %d>" % self.tag
+
+class ValueToken(Token):
+ value = None
+
+ def __init__(self, tag, value):
+ Token.__init__(self, tag)
+ self.value = value
+ return
+
+ def __repr__(self):
+ return "<Token: %d Value: %s>" % (self.tag, self.value.__str__())
+
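
And a minimal illustration of the two token flavours defined in token.py (again standalone and only for orientation, assuming src/ is on the Python path):

    from front.token import Tag, Token, ValueToken

    print repr(Token(Tag.IF))                  # <Token: 6>
    print repr(ValueToken(Tag.NUMBER, 42))     # <Token: 1 Value: 42>
    print repr(ValueToken(Tag.OPERATOR, '+'))  # <Token: 18 Value: +>
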