From fb9e6a20ac7393b6cc27949ad2d2af7a305c96ba Mon Sep 17 00:00:00 2001
From: Alexander Sulfrian <alexander@sulfrian.net>
Date: Wed, 20 May 2009 04:19:52 +0200
Subject: implemented lexer (with tokens and symbolTable)

todo: beautify code, implement token classes for the parser
implemented a test function with test code
moved the token class into its own file (token.py)
---
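A minimal usage sketch for the new lexer (placed below the --- marker, so
not part of the commit message), assuming the interfaces added in this
patch: Lexer.scan() returning None once the input is exhausted,
SymbolTable.getOrPut(), and the Tag numbering from token.py. The input
string is only an illustrative example:

    from front.lexer import Lexer
    from front.symbols import SymbolTable

    symbols = SymbolTable()
    lex = Lexer("sum = sum + fib[i]", symbols)

    tokens = []
    while True:
        token = lex.scan()
        if token is None:
            break
        tokens.append(token)

    # the first token is the identifier "sum", printed as
    # <Token: 4 Value: 0>  (Tag.IDENT == 4, first symbol table id == 0)
    print tokens
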
 .gitignore            |   1 +
 src/front/__init__.py |  32 ++++++++++-
 src/front/lexer.py    | 149 ++++++++++++++++++++++++++++++++------------------
 src/front/symbols.py  |  29 ++++++++--
 src/front/token.py    |  66 ++++++++++++++++++++++
 5 files changed, 216 insertions(+), 61 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 src/front/token.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/src/front/__init__.py b/src/front/__init__.py
index 63529ef..426a16e 100644
--- a/src/front/__init__.py
+++ b/src/front/__init__.py
@@ -1,7 +1,33 @@
 from front.lexer import Lexer
 from front.parser import Parser
+from front.symbols import SymbolTable
 
 def main():
-    lex = Lexer()
-    parse = Parser(lex)
-    parse.program()
+    source = '''fun fib[a]
+	if a < 2
+		@1
+	@( fib[a-1] + fib[a-2] )
+end
+
+# main function
+fun main[]
+	sum = 0
+	i = 0
+	while (i < 10)
+		sum = sum + fib[i = i + 1]
+	end
+	@sum
+end'''
+
+    symbols = SymbolTable()
+    lex = Lexer(source, symbols)
+
+    # testing: dump the token stream until the lexer is exhausted
+    while True:
+        token = lex.scan()
+        if token is None:
+            break
+        print repr(token)
+
+    # parse = Parser(lex)
+    # parse.program()
diff --git a/src/front/lexer.py b/src/front/lexer.py
index aca7473..fe798a9 100644
--- a/src/front/lexer.py
+++ b/src/front/lexer.py
@@ -1,62 +1,103 @@
+# -*- coding: utf-8 -*-
+import re
+from front.token import *
+
 class Lexer:
-    line = 1
+
+    def __init__(self, source, symbols):
+        self.symbols = symbols
 
-    def __init__(self):
-        return
+        self.source = source.splitlines()
+        self.source.reverse()
+        self.line = 0
+        self.doubleNewlineCheck = False
+        self.currentLine = ''
 
-    def reserve(self, word):
+        # initialize the reserved words
+        self.reservedWords = {'True': Token(Tag.TRUE),
+                              'False': Token(Tag.FALSE),
+                              '[': Token(Tag.LBRAK),
+                              ']': Token(Tag.RBRAK),
+                              '(': Token(Tag.LPAREN),
+                              ')': Token(Tag.RPAREN),
+                              ',': Token(Tag.COMMA),
+                              'while': Token(Tag.WHILE),
+                              'if': Token(Tag.IF),
+                              'else': Token(Tag.ELSE),
+                              'fun': Token(Tag.FUN),
+                              'end': Token(Tag.END)}
         return
 
-    def scan():
+    def reserve(self, word, token):
+        self.reservedWords[word] = token
         return
 
-class Tag:
-    # lexer tokens
-    NUMBER           = 1
-    TRUE             = 2
-    FALSE            = 3
-    IDENT            = 4
-    WHILE            = 5
-    IF               = 6
-    ELSE             = 7
-    END              = 8
-    LBRAK            = 9
-    RBRAK            = 10
-    LPAREN           = 11
-    RPAREN           = 12
-    NEWLINE          = 13
-    COMMA            = 14
-    FUN              = 15
-    ASSIGNMENT       = 16
-    RETURN           = 17
-    OPERATOR         = 18
-
-    # parser tokens
-    BOOL             = 19
-    JOIN             = 20
-    EQUALITY         = 21
-    RELATION         = 22
-    EXPRESSION       = 23
-    TERM             = 24
-    UNARY            = 25
-    FACTOR           = 26
-    IDENT_LIST       = 27
-    EXPRESSION_LIST  = 28
-    PROGRAM          = 29
-    FUNCTION         = 30
-    STATEMENT        = 31
-    STATEMENTS       = 32
-    IF_STATEMENT     = 33
-    WHILE_STATEMENT  = 34
-    RETURN_STATEMENT = 35
-    ASSIGN_STATEMENT = 36
-    FUNCTION_CALL    = 37
-
-class Token:
-    tag = None
-
-    def __init__(self, tag):
-        return
+    def scan(self):
+        # if nothing is left on the current line
+        if (len(self.currentLine) == 0):
+            # if the source is exhausted, return None
+            if (len(self.source) <= 0):
+                return None
 
-    def __str__(self):
-        return
+            # read the next line
+            self.line = self.line + 1
+            self.currentLine = self.source.pop()
+
+            # return a NEWLINE token (at most one in a row)
+            if self.doubleNewlineCheck:
+                self.doubleNewlineCheck = False
+                return Token(Tag.NEWLINE)
+
+        # strip surrounding whitespace
+        self.currentLine = self.currentLine.strip()
+
+        # skip blank lines and comments (ignore the rest of the line)
+        if self.currentLine == '' or self.currentLine.startswith('#'):
+            self.currentLine = ''
+            return self.scan()
+
+        # allow a NEWLINE at the next line break (but never two in a row)
+        self.doubleNewlineCheck = True
+
+        # parse the next token
+        if self.currentLine.startswith('@'):
+            self.currentLine = self.currentLine[1:]
+            return Token(Tag.RETURN)
+
+        # reserved words (already recognized identifiers end up here as well)
+        for reservedWord, token in self.reservedWords.iteritems():
+            if self.currentLine.startswith(reservedWord):
+                length = len(reservedWord)
+
+                if len(self.currentLine) <= length or not self.currentLine[0].isalnum() or not self.currentLine[length].isalnum():
+                    self.currentLine = self.currentLine[length:]
+                    return token
+
+        # match numbers
+        match = re.match(r"^([0-9]+)", self.currentLine)
+        if match:
+            self.currentLine = self.currentLine[match.end(0):]
+            return ValueToken(Tag.NUMBER, int(match.group(0)))
+
+        # match operators
+        match = re.match(r"^(<=|==|>=|&&|\|\||<|>|\+|-|\*|/)", self.currentLine)
+        if match:
+            self.currentLine = self.currentLine[match.end(0):]
+            return ValueToken(Tag.OPERATOR, match.group(0))
+
+        # match identifiers
+        match = re.match(r"^([a-zA-Z][a-zA-Z0-9]*)", self.currentLine)
+        if match:
+            self.currentLine = self.currentLine[match.end(0):]
+            token = ValueToken(Tag.IDENT, self.symbols.getOrPut(match.group(0)))
+            self.reserve(match.group(0), token)
+            return token
+
+        # assignments
+        if self.currentLine.startswith('='):
+            self.currentLine = self.currentLine[1:]
+            return Token(Tag.ASSIGNMENT)
+
+        # if execution reaches this point,
+        # a syntax error has occurred
+        raise Exception("Syntax error in line %d at: '%s'" % (self.line, self.currentLine))
diff --git a/src/front/symbols.py b/src/front/symbols.py
index f4ab40e..a868d77 100644
--- a/src/front/symbols.py
+++ b/src/front/symbols.py
@@ -1,9 +1,30 @@
 class SymbolTable:
     def __init__(self):
+        self.symbols = {}
         return
 
-    def put(token, id):
-        return
+    def put(self, token, id=None):
+        if id is not None and not str(id).isdigit():
+            raise Exception("Id must be numeric: '%s' is not a number" % id)
 
-    def get(token):
-        return
+        if id is None:
+            if len(self.symbols) == 0:
+                id = 0
+            else:
+                id = max(self.symbols.values()) + 1
+
+        self.symbols[token] = id
+        return id
+
+
+    def get(self, token):
+        if token in self.symbols:
+            return self.symbols[token]
+
+        return None
+
+    def getOrPut(self, token):
+        if self.get(token) is not None:
+            return self.get(token)
+
+        return self.put(token)
diff --git a/src/front/token.py b/src/front/token.py
new file mode 100644
index 0000000..def0236
--- /dev/null
+++ b/src/front/token.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+# date: 20 May 2009
+
+class Tag:
+    # lexer tokens
+    NUMBER           = 1
+    TRUE             = 2
+    FALSE            = 3
+    IDENT            = 4
+    WHILE            = 5
+    IF               = 6
+    ELSE             = 7
+    END              = 8
+    LBRAK            = 9
+    RBRAK            = 10
+    LPAREN           = 11
+    RPAREN           = 12
+    NEWLINE          = 13
+    COMMA            = 14
+    FUN              = 15
+    ASSIGNMENT       = 16
+    RETURN           = 17
+    OPERATOR         = 18
+
+    # parser tokens
+    BOOL             = 19
+    JOIN             = 20
+    EQUALITY         = 21
+    RELATION         = 22
+    EXPRESSION       = 23
+    TERM             = 24
+    UNARY            = 25
+    FACTOR           = 26
+    IDENT_LIST       = 27
+    EXPRESSION_LIST  = 28
+    PROGRAM          = 29
+    FUNCTION         = 30
+    STATEMENT        = 31
+    STATEMENTS       = 32
+    IF_STATEMENT     = 33
+    WHILE_STATEMENT  = 34
+    RETURN_STATEMENT = 35
+    ASSIGN_STATEMENT = 36
+    FUNCTION_CALL    = 37
+
+class Token:
+    tag = None
+
+    def __init__(self, tag):
+        self.tag = tag
+        return
+
+    def __repr__(self):
+        return "<Token: %d>" % self.tag
+
+class ValueToken(Token):
+    value = None
+
+    def __init__(self, tag, value):
+        Token.__init__(self, tag)
+        self.value = value
+        return
+
+    def __repr__(self):
+        return "<Token: %d Value: %s>" % (self.tag, self.value.__str__())
+
-- 
cgit v1.2.3