# Relational
# Copyright (C) 2008-2020  Salvo "LtWorf" Tomaselli
#
# Relational is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# author Salvo "LtWorf" Tomaselli
#
#
#
# This module implements a parser for relational algebra, and can be used
# to convert expressions into python expressions and to get the parse-tree
# of the expression.
#
# Language definition here:
# http://ltworf.github.io/relational/grammar.html
from typing import Optional, Union, List, Any, Dict
from dataclasses import dataclass

from relational import rtypes

PRODUCT = '*'
DIFFERENCE = '-'
UNION = '∪'
INTERSECTION = '∩'
DIVISION = '÷'
JOIN = '⋈'
JOIN_LEFT = '⧑'
JOIN_RIGHT = '⧒'
JOIN_FULL = '⧓'
PROJECTION = 'π'
SELECTION = 'σ'
RENAME = 'ρ'
ARROW = '➡'

# List of binary operators
b_operators = (PRODUCT, DIFFERENCE, UNION, INTERSECTION, DIVISION,
               JOIN, JOIN_LEFT, JOIN_RIGHT, JOIN_FULL)
# List of unary operators
u_operators = (PROJECTION, SELECTION, RENAME)

# Associates operator with python method
op_functions = {
    PRODUCT: 'product',
    DIFFERENCE: 'difference',
    UNION: 'union',
    INTERSECTION: 'intersection',
    DIVISION: 'division',
    JOIN: 'join',
    JOIN_LEFT: 'outer_left',
    JOIN_RIGHT: 'outer_right',
    JOIN_FULL: 'outer',
    PROJECTION: 'projection',
    SELECTION: 'selection',
    RENAME: 'rename',
}


class TokenizerException (Exception):
    '''Raised by tokenize() when the expression can't be split in tokens.'''
    pass


class ParserException (Exception):
    '''Raised by parse_tokens() when the tokens don't form a valid tree.'''
    pass


class CallableString(str):

    '''
    This is a string. However it is also callable.

    For example:
    CallableString('1+1')()
    returns 2

    It is used to contain Python expressions and print
    or execute them.
    '''

    def __call__(self, context=None):
        '''
        Evaluates the string as a Python expression and returns the result.

        context is a dictionary where to
        each name is associated the relative relation. It is passed to
        eval() as the globals dictionary.
        '''
        # NOTE(security): eval() runs arbitrary Python code. This must only
        # be called on strings generated from trusted expressions.
        return eval(self, context)


@dataclass
class Node:
    '''This class is a node of a relational expression. Leaves are relations
    and internal nodes are operations.

    Node itself is abstract (its constructor always raises); the concrete
    nodes are the subclasses defined in this module:

    Variable: a leaf, 'name' is the name of the relation.
    Binary: 'name' is a binary operator, 'left' and 'right' are the operands.
    Unary: 'name' is a unary operator, 'child' is the operand and 'prop' is
    the string with the parameters of the operation.

    Equality between nodes is the one generated by @dataclass: two nodes are
    equal when they are of the same class and all their fields are equal.

    This class is used to convert an expression into python code.'''
    name: str

    def __init__(self, name: str) -> None:
        raise NotImplementedError('This is supposed to be an abstract class')

    def toCode(self):  #FIXME return type
        '''This method converts the AST into a python code object'''
        code = self._toPython()
        return compile(code, '', 'eval')

    def toPython(self) -> CallableString:
        '''This method converts the AST into a python code string, which
        will require the relation module to be executed.

        The return value is a CallableString, which means that it can be
        directly called.'''
        return CallableString(self._toPython())

    def _toPython(self) -> str:
        '''Returns the python expression for this subtree.
        Must be implemented by the subclasses.'''
        raise NotImplementedError()

    def printtree(self, level: int = 0) -> str:
        '''returns a representation of the tree using indentation

        level is the depth of this node; every level adds one space of
        indentation.'''
        r = ' ' * level + self.name
        if self.name in b_operators:
            r += self.left.printtree(level + 1)
            r += self.right.printtree(level + 1)
        elif self.name in u_operators:
            r += '\t%s\n' % self.prop
            r += self.child.printtree(level + 1)
        return '\n' + r

    def get_left_leaf(self) -> 'Node':
        '''Returns the leftmost leaf of this subtree.
        Must be implemented by the subclasses.'''
        raise NotImplementedError()

    def result_format(self, rels: dict) -> list:  #FIXME types
        '''This function returns a list containing the fields that the resulting relation will have.
        It requires a dictionary where keys are the names of the relations and the values are
        the relation objects.

        Raises TypeError if rels is not a dictionary and ValueError if the
        node is of an unknown kind.'''
        if not isinstance(rels, dict):
            raise TypeError("Can't be of None type")

        if isinstance(self, Variable):  #FIXME this is ugly
            return list(rels[self.name].header)
        elif isinstance(self, Binary) and self.name in (DIFFERENCE, UNION, INTERSECTION):
            # Both operands have the same fields, the left one is used
            return self.left.result_format(rels)
        elif isinstance(self, Binary) and self.name == DIVISION:
            return list(set(self.left.result_format(rels)) - set(self.right.result_format(rels)))
        elif self.name == PROJECTION:
            return [i.strip() for i in self.prop.split(',')]
        elif self.name == PRODUCT:
            return self.left.result_format(rels) + self.right.result_format(rels)
        elif self.name == SELECTION:
            return self.child.result_format(rels)
        elif self.name == RENAME:
            # Maps every old field name to its new name
            _vars = {}
            for i in self.prop.split(','):
                q = i.split(ARROW)
                _vars[q[0].strip()] = q[1].strip()

            # Fields that are not being renamed are kept as they are
            return [_vars.get(field, field) for field in self.child.result_format(rels)]
        elif self.name in (JOIN, JOIN_LEFT, JOIN_RIGHT, JOIN_FULL):
            return list(set(self.left.result_format(rels)).union(set(self.right.result_format(rels))))
        raise ValueError('What kind of alien object is this?')


@dataclass
class Variable(Node):
    '''Leaf of the tree: a relation, identified by its name.'''

    def _toPython(self) -> str:
        return self.name

    def __str__(self):
        return self.name

    def get_left_leaf(self) -> Node:
        return self


@dataclass
class Binary(Node):
    '''Internal node for a binary operator, with its two operands.'''
    left: Node
    right: Node

    def get_left_leaf(self) -> Node:
        return self.left.get_left_leaf()

    def _toPython(self) -> str:
        return '%s.%s(%s)' % (self.left._toPython(), op_functions[self.name], self.right._toPython())

    def __str__(self):
        le = str(self.left)
        # A binary operation on the right gets parenthesis, because the
        # parser would otherwise make it part of the left operand
        if isinstance(self.right, Binary):
            re = f'({self.right})'
        else:
            re = str(self.right)
        return f'{le}{self.name}{re}'


@dataclass
class Unary(Node):
    '''Internal node for a unary operator. prop is the string with the
    parameters of the operator and child is its operand.'''
    prop: str
    child: Node

    def get_left_leaf(self) -> Node:
        return self.child.get_left_leaf()

    def __str__(self):
        return f'{self.name} {self.prop} ({self.child})'

    def _toPython(self) -> str:
        # Converting parameters
        if self.name == PROJECTION:
            # a, b becomes "a","b": every field is a separate string
            prop = '"%s"' % self.prop.replace(' ', '').replace(',', '","')
        elif self.name == RENAME:
            prop = repr(self.get_rename_prop())
        else:  # Selection
            prop = repr(self.prop)

        return '%s.%s(%s)' % (self.child._toPython(), op_functions[self.name], prop)

    def get_projection_prop(self) -> List[str]:
        '''
        Returns the list of fields of the projection

        Raises ValueError if this is not a projection node.
        '''
        if self.name != PROJECTION:
            raise ValueError('This is only supported on projection nodes')
        return [i.strip() for i in self.prop.split(',')]

    def set_projection_prop(self, p: List[str]) -> None:
        '''
        Sets the prop field based on the list of fields

        Raises ValueError if this is not a projection node.
        '''
        if self.name != PROJECTION:
            raise ValueError('This is only supported on projection nodes')
        self.prop = ','.join(p)

    def get_rename_prop(self) -> Dict[str, str]:
        '''
        Returns the dictionary that the rename operation wants

        Raises ValueError if this is not a rename node.
        '''
        if self.name != RENAME:
            raise ValueError('This is only supported on rename nodes')
        pairs = (i.split(ARROW) for i in self.prop.split(','))
        return {q[0].strip(): q[1].strip() for q in pairs}

    def set_rename_prop(self, renames: Dict[str, str]) -> None:
        '''
        Sets the prop field based on the dictionary for renames

        Raises ValueError if this is not a rename node.
        '''
        if self.name != RENAME:
            raise ValueError('This is only supported on rename nodes')
        self.prop = ','.join(f'{k}{ARROW}{v}' for k, v in renames.items())


def parse_tokens(expression: List[Union[list, str]]) -> Node:
    '''Generates the tree from the tokenized expression.

    expression is the list of tokens, as returned by tokenize().

    Raises ParserException if the tokens are not a valid expression (this
    includes the case of an empty list of tokens).'''

    # If the list contains only a list, it will consider the lower level list.
    # This will allow things like ((((((a))))) to work
    while len(expression) == 1 and isinstance(expression[0], list):
        expression = expression[0]

    # The list contains only 1 string. Means it is the name of a relation
    if len(expression) == 1:
        if not rtypes.is_valid_relation_name(expression[0]):
            raise ParserException(
                u"'%s' is not a valid relation name" % expression[0])
        return Variable(expression[0])  #FIXME Move validation in the object

    # Expression from right to left, searching for binary operators
    # this means that binary operators have lesser priority than
    # unary operators.
    # It finds the operator with lesser priority, uses it as root of this
    # (sub)tree using everything on its left as left parameter (so building
    # a left subtree with the part of the list located on left) and doing
    # the same on right.
    # Since it searches for strings, and expressions into parenthesis are
    # within sub-lists, they won't be found here, ensuring that they will
    # have highest priority.
    for i in range(len(expression) - 1, -1, -1):
        if expression[i] in b_operators:  # Binary operator
            left = expression[:i]
            right = expression[i + 1:]
            if len(left) == 0:
                raise ParserException(
                    u"Expected left operand for '%s'" % expression[i])

            if len(right) == 0:
                raise ParserException(
                    u"Expected right operand for '%s'" % expression[i])
            return Binary(expression[i], parse_tokens(left), parse_tokens(right))

    # Searches for unary operators, parsing from right to left
    for i in range(len(expression) - 1, -1, -1):
        if expression[i] in u_operators:  # Unary operator
            # The operator must be followed by its parameter and its operand
            if len(expression) <= i + 2:
                raise ParserException(
                    u"Expected more tokens in '%s'" % expression[i])

            return Unary(
                expression[i],
                prop=expression[1 + i].strip(),
                child=parse_tokens(expression[2 + i])
            )
    raise ParserException('Parse error')  #FIXME more details


def _find_matching_parenthesis(expression: str, start=0, openpar='(', closepar=')') -> Optional[int]:
    '''This function returns the position of the matching
    close parenthesis to the 1st open parenthesis found
    starting from start (0 by default)

    Parenthesis within a string literal (delimited by ') are ignored.
    Returns None if there is no matching parenthesis.'''
    par_count = 0  # Count of parenthesis

    string = False  # True while within a string literal
    escape = False  # True if the previous character was an unescaped \

    for i in range(start, len(expression)):
        if expression[i] == '\'' and not escape:
            string = not string
        if expression[i] == '\\' and not escape:
            escape = True
        else:
            escape = False
        if string:
            continue

        if expression[i] == openpar:
            par_count += 1
        elif expression[i] == closepar:
            par_count -= 1
            if par_count == 0:
                return i  # Closing parenthesis of the parameter
    return None


def _find_token(haystack: str, needle: str) -> int:
    '''
    Like the string function find, but
    ignores tokens that are within a string
    literal.

    Returns -1 if needle is not found.
    '''
    string = False  # True while within a string literal
    escape = False  # True if the previous character was an unescaped \

    for i in range(len(haystack)):
        if haystack[i] == '\'' and not escape:
            string = not string
        if haystack[i] == '\\' and not escape:
            escape = True
        else:
            escape = False
        if string:
            continue

        if haystack.startswith(needle, i):
            return i
    return -1


def tokenize(expression: str) -> list:
    '''This function converts a relational expression into a list where
    every token of the expression is an item of a list. Expressions into
    parenthesis will be converted into sublists.

    Raises TokenizerException if an open parenthesis is never closed.'''

    # List for the tokens
    items = []  # type: List[Union[str,list]]

    expression = expression.strip()  # Removes initial and ending spaces

    while len(expression) > 0:
        if expression.startswith('('):  # Parenthesis state
            end = _find_matching_parenthesis(expression)
            if end is None:
                raise TokenizerException(
                    "Missing matching ')' in '%s'" % expression)
            # Appends the tokenization of the content of the parenthesis
            items.append(tokenize(expression[1:end]))
            # Removes the entire parentesis and content from the expression
            expression = expression[end + 1:].strip()

        elif expression.startswith((SELECTION, RENAME, PROJECTION)):  # Unary operators
            items.append(expression[0:1])
            # Adding operator in the top of the list
            expression = expression[1:].strip()
            # Removing operator from the expression

            if expression.startswith('('):
                # Expression with parenthesis, so adding what's between open
                # and close without tokenization
                par = expression.find(
                    '(', _find_matching_parenthesis(expression))
            else:
                # Expression without parenthesis, so adding what's between
                # start and parenthesis as whole
                par = _find_token(expression, '(')

            # NOTE(review): par is -1 when the parameter is not followed by
            # an open parenthesis; the slices below then split off the last
            # character of the expression. Confirm whether that input should
            # be rejected here instead.
            items.append(expression[:par].strip())
            # Inserting parameter of the operator
            expression = expression[par:].strip()
            # Removing parameter from the expression
        else:  # Relation (hopefully)
            expression += ' '  # To avoid the special case of the ending

            # Initial part is a relation, stop when the name of the relation is
            # over
            for r in range(1, len(expression)):
                if rtypes.RELATION_NAME_REGEXP.match(expression[:r + 1]) is None:
                    break
            # At least the first character is always consumed, so that
            # operators and invalid characters become tokens as well
            items.append(expression[:r])
            expression = expression[r:].strip()
    return items


def tree(expression: str) -> Node:
    '''This function parses a relational algebra expression into a AST and returns
    the root node using the Node class.'''
    return parse_tokens(tokenize(expression))


def parse(expr: str) -> CallableString:
    '''This function parses a relational algebra expression, and returns a
    CallableString (a string that can be called) whith the corresponding
    Python expression.
    '''
    return tree(expr).toPython()