- Improved tokenizer

- GUI shows unicode errors (i really can't wait for debian to have pyqt in python 3) git-svn-id: http://galileo.dmi.unict.it/svn/relational/trunk@339 014f5005-505e-4b48-8d0a-63407b615a7c
2011-11-01 18:47:03 +00:00
parent f6a6947218
commit 48bde3ea84
5 changed files with 30 additions and 23 deletions
--- a/relational/parser.py
+++ b/relational/parser.py
@@ -40,7 +40,7 @@
 #
 # Language definition here:
 # https://galileo.dmi.unict.it/wiki/relational/doku.php?id=language
-
+import re

 RELATION=0
 UNARY=1
@@ -65,6 +65,9 @@ u_operators=(PROJECTION,SELECTION,RENAME) # List of unary operators

 op_functions={PRODUCT:'product',DIFFERENCE:'difference',UNION:'union',INTERSECTION:'intersection',DIVISION:'division',JOIN:'join',JOIN_LEFT:'outer_left',JOIN_RIGHT:'outer_right',JOIN_FULL:'outer',PROJECTION:'projection',SELECTION:'selection',RENAME:'rename'} # Associates operator with python method

+class ParseException (Exception):
+    pass
+
 class node (object):
    '''This class is a node of a relational expression. Leaves are relations and internal nodes are operations.
    
@@ -276,15 +279,18 @@ def tokenize(expression):
    '''

    while len(expression)>0:
+        
        if expression.startswith('('): #Parenthesis state
            state=2
            end=_find_matching_parenthesis(expression)
+            if end==None:
+                raise ParseException("Missing matching ')' in '%s'" %expression)
            #Appends the tokenization of the content of the parenthesis
            items.append(tokenize(expression[1:end]))
            #Removes the entire parentesis and content from the expression
            expression=expression[end+1:].strip()
        
-        elif expression.startswith(u"σ") or expression.startswith(u"π") or expression.startswith(u"ρ"): #Unary 2 bytes
+        elif expression.startswith((u"σ",u"π",u"ρ")): #Unary 2 bytes
            items.append(expression[0:1]) #Adding operator in the top of the list
            expression=expression[1:].strip() #Removing operator from the expression
            
@@ -295,24 +301,19 @@ def tokenize(expression):
            
            items.append(expression[:par].strip()) #Inserting parameter of the operator
            expression=expression[par:].strip() #Removing parameter from the expression
-        elif expression.startswith("*") or expression.startswith("-"): # Binary 1 byte
+        elif expression.startswith((u"÷",u"ᑎ",u"ᑌ",u"*",u"-")):
            items.append(expression[0])
            expression=expression[1:].strip() #1 char from the expression
            state=4
-        elif expression.startswith(u"ᑎ") or expression.startswith(u"ᑌ"): #Binary short 3 bytes
-            items.append(expression[0:1]) #Adding operator in the top of the list
-            expression=expression[1:].strip() #Removing operator from the expression
-            state=4
-        elif expression.startswith(u"÷"): #Binary short 2 bytes
-            items.append(expression[0:1]) #Adding operator in the top of the list
-            expression=expression[1:].strip() #Removing operator from the expression
-            state=4
        elif expression.startswith(u"ᐅ"): #Binary long
            i=expression.find(u"ᐊ")
+            if i==-1:
+                raise ParseException(u"Expected ᐊ in %s" % (expression,))
            items.append(expression[:i+1])
            expression=expression[i+1:].strip()
-            
            state=4
+        elif re.match(r'[_0-9A-Za-z]',expression[0])==None: #At this point we only have relation names, so we raise errors for anything else
+            raise ParseException("Unexpected '%c' in '%s'" % (expression[0],expression))
        else: #Relation (hopefully)
            if state==1: #Previous was a relation, appending to the last token
                i=items.pop()
@@ -352,7 +353,7 @@ def parse(expr):
    
    You can use parenthesis to change priority: a ᐅᐊ (q ᑌ d).
    
-    IMPORTANT: The encoding used by this module is UTF-8 (all strings must be UTF-8)
+    IMPORTANT: all strings must be unicode
    
    EXAMPLES
    σage > 25 and rank == weight(A)
@@ -365,10 +366,11 @@ def parse(expr):
    return tree(expr).toPython()
    
 if __name__=="__main__":
-    #while True:
-    #    e=raw_input("Expression: ")
-    #    print parse(e)
-    b=u"σ age>1 and skill=='C' (peopleᐅᐊskills)"
-    print b[0]
-    parse(b)
-    
+    while True:
+        e=unicode(raw_input("Expression: "),'utf-8')
+        print parse(e)
+    
+    #b=u"σ age>1 and skill=='C' (peopleᐅᐊskills)"
+    #print b[0]
+    #parse(b)
+    pass