# Relational
# Copyright (C) 2008-2020 Salvo "LtWorf" Tomaselli
#
# Relational is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# author Salvo "LtWorf" Tomaselli
#
#
#
# This module implements a parser for relational algebra, and can be used
# to convert expressions into python expressions and to get the parse-tree
# of the expression.
#
# Language definition here:
# http://ltworf.github.io/relational/grammar.html
from typing import Optional, Union, List, Any, Dict
from dataclasses import dataclass
from relational import rtypes
PRODUCT = '*'
DIFFERENCE = '-'
UNION = '∪'
INTERSECTION = '∩'
DIVISION = '÷'
JOIN = '⋈'
JOIN_LEFT = '⧑'
JOIN_RIGHT = '⧒'
JOIN_FULL = '⧓'
PROJECTION = 'π'
SELECTION = 'σ'
RENAME = 'ρ'
ARROW = '➡'
b_operators = (PRODUCT, DIFFERENCE, UNION, INTERSECTION, DIVISION,
JOIN, JOIN_LEFT, JOIN_RIGHT, JOIN_FULL) # List of binary operators
u_operators = (PROJECTION, SELECTION, RENAME) # List of unary operators
# Associates operator with python method
op_functions = {
PRODUCT: 'product', DIFFERENCE: 'difference', UNION: 'union', INTERSECTION: 'intersection', DIVISION: 'division', JOIN: 'join',
JOIN_LEFT: 'outer_left', JOIN_RIGHT: 'outer_right', JOIN_FULL: 'outer', PROJECTION: 'projection', SELECTION: 'selection', RENAME: 'rename'}
class TokenizerException(Exception):
    '''Error raised when an expression can't be split into tokens.'''
class ParserException(Exception):
    '''Error raised when a list of tokens can't be turned into a tree.'''
class CallableString(str):
    '''
    A string holding a Python expression, that can also be called
    to evaluate that expression.

    For example:
    CallableString('1+1')()
    returns 2

    Being a normal string it can be printed or stored, and being
    callable it can be executed when needed.
    '''

    def __call__(self, context=None):
        '''
        Evaluates the expression contained in the string and returns
        its value.

        context is a dictionary used as namespace for the evaluation:
        each name used in the expression is looked up in it (for
        example relation names associated to their relation).
        '''
        # NOTE: eval() executes arbitrary Python code, so this must only
        # be called on expressions that come from a trusted source.
        result = eval(self, context)
        return result
@dataclass
class Node:
    '''This class is a node of a relational expression. Leaves are relations
    and internal nodes are operations.

    This class is abstract and can't be instantiated directly; the kind of
    node is given by the subclass that is used:

    A relation is a leaf, so it has no attribute for children.

    A binary operator has the left and right attributes, pointing to the
    nodes of its operands.

    A unary operator has a child attribute, pointing to the child node, and
    a prop attribute containing the string with the parameters of the
    operation.

    The name attribute contains the symbol of the operator, or the name of
    the relation for leaves.

    This class is used to convert an expression into python code.'''
    name: str

    def __init__(self, name: str) -> None:
        raise NotImplementedError('This is supposed to be an abstract class')

    def toCode(self):
        '''This method converts the AST into a python code object, as
        returned by the compile() builtin in 'eval' mode.'''
        code = self._toPython()
        return compile(code, '', 'eval')

    def toPython(self) -> 'CallableString':
        '''This method converts the AST into a python code string, which
        will require the relation module to be executed.

        The return value is a CallableString, which means that it can be
        directly called.'''
        return CallableString(self._toPython())

    def _toPython(self) -> str:
        '''Returns the python expression for this node as a plain string.

        Must be implemented by the subclasses.'''
        raise NotImplementedError()

    def printtree(self, level: int = 0) -> str:
        '''returns a representation of the tree using indentation

        level is the depth of this node and decides how much its line
        is indented.'''
        r = ' ' * level + self.name
        if self.name in b_operators:
            # Every subtree starts with its own newline
            r += self.left.printtree(level + 1)
            r += self.right.printtree(level + 1)
        elif self.name in u_operators:
            r += '\t%s\n' % self.prop
            r += self.child.printtree(level + 1)
        return '\n' + r

    def get_left_leaf(self) -> 'Node':
        '''Returns the leftmost leaf of the tree.

        Must be implemented by the subclasses.'''
        raise NotImplementedError()

    def result_format(self, rels: Dict[str, Any]) -> List[str]:
        '''This function returns a list containing the fields that the resulting relation will have.
        It requires a dictionary where keys are the names of the relations and the values are
        the relation objects.

        Raises TypeError if rels is not a dictionary and ValueError if the
        node is of a kind that is not known.'''
        if not isinstance(rels, dict):
            raise TypeError('Can\'t be of None type')

        if isinstance(self, Variable):  # FIXME this is ugly
            return list(rels[self.name].header)
        if isinstance(self, Binary) and self.name in (DIFFERENCE, UNION, INTERSECTION):
            # Both operands have the same fields, the left ones are used
            return self.left.result_format(rels)
        if isinstance(self, Binary) and self.name == DIVISION:
            return list(set(self.left.result_format(rels)) - set(self.right.result_format(rels)))
        if self.name == PROJECTION:
            return [i.strip() for i in self.prop.split(',')]
        if self.name == PRODUCT:
            return self.left.result_format(rels) + self.right.result_format(rels)
        if self.name == SELECTION:
            return self.child.result_format(rels)
        if self.name == RENAME:
            # Maps the old names to the new ones
            _vars = {}
            for i in self.prop.split(','):
                q = i.split(ARROW)
                _vars[q[0].strip()] = q[1].strip()
            # Fields that are not renamed are left untouched
            return [_vars.get(f, f) for f in self.child.result_format(rels)]
        if self.name in (JOIN, JOIN_LEFT, JOIN_RIGHT, JOIN_FULL):
            # Going through a set, so the order of the fields is not defined
            return list(set(self.left.result_format(rels)).union(set(self.right.result_format(rels))))
        raise ValueError('What kind of alien object is this?')

    def __eq__(self, other):
        '''Two nodes are equal when they are of the same class and all
        their attributes (name, parameters and children) are equal.

        Children are nodes too, so the comparison is recursive.'''
        if not isinstance(other, Node):
            return NotImplemented
        # Subclasses decorated with @dataclass get their own generated
        # __eq__; this one is used by the ones that don't.
        return type(self) is type(other) and vars(self) == vars(other)
@dataclass
class Variable(Node):
    '''Leaf of the tree: a relation, referred to by its name.'''

    def _toPython(self) -> str:
        '''The python expression for a relation is just its name.'''
        return self.name

    def __str__(self) -> str:
        return self.name

    def get_left_leaf(self) -> Node:
        '''A leaf is its own leftmost leaf.'''
        return self
@dataclass
class Binary(Node):
    '''Internal node for a binary operator, having a left and a right
    operand. The name attribute holds the symbol of the operator.'''
    left: Node
    right: Node

    def get_left_leaf(self) -> Node:
        '''Returns the leftmost leaf, which is found in the left subtree.'''
        return self.left.get_left_leaf()

    def _toPython(self) -> str:
        '''Returns the python expression: the method associated to the
        operator is called on the left operand, passing the right one.'''
        return f'{self.left._toPython()}.{op_functions[self.name]}({self.right._toPython()})'

    def __str__(self) -> str:
        lhs = str(self.left)
        rhs = str(self.right)
        # A binary operation on the right needs parenthesis to keep
        # its meaning when the expression is parsed again.
        if isinstance(self.right, Binary):
            rhs = f'({rhs})'
        return f'{lhs}{self.name}{rhs}'
@dataclass
class Unary(Node):
    '''Internal node for a unary operator. The name attribute holds the
    symbol of the operator, prop holds the string with its parameters
    and child is the node it is applied to.'''
    prop: str
    child: Node

    def get_left_leaf(self) -> Node:
        '''Returns the leftmost leaf, which is found in the child subtree.'''
        return self.child.get_left_leaf()

    def __str__(self) -> str:
        return f'{self.name} {self.prop} ({self.child.__str__()})'

    def _toPython(self) -> str:
        '''Returns the python expression: the method associated to the
        operator is called on the child, passing the converted parameters.'''
        if self.name == PROJECTION:
            # Every field becomes a separate double quoted argument
            fields = self.prop.replace(' ', '').replace(',', '","')
            args = f'"{fields}"'
        elif self.name == RENAME:
            # The rename wants a dictionary
            args = repr(self.get_rename_prop())
        else:
            # Selection: the condition is passed as a single string
            args = repr(self.prop)
        return f'{self.child._toPython()}.{op_functions[self.name]}({args})'

    def get_projection_prop(self) -> List[str]:
        '''
        Returns the list of the fields of the projection

        Raises ValueError if this is not a projection node.
        '''
        if self.name != PROJECTION:
            raise ValueError('This is only supported on projection nodes')
        return [field.strip() for field in self.prop.split(',')]

    def set_projection_prop(self, p: List[str]) -> None:
        '''
        Sets the prop field based on the list of fields

        Raises ValueError if this is not a projection node.
        '''
        if self.name != PROJECTION:
            raise ValueError('This is only supported on projection nodes')
        self.prop = ','.join(p)

    def get_rename_prop(self) -> Dict[str, str]:
        '''
        Returns the dictionary that the rename operation wants,
        having old names as keys and new names as values

        Raises ValueError if this is not a rename node.
        '''
        if self.name != RENAME:
            raise ValueError('This is only supported on rename nodes')
        pairs = (item.split(ARROW) for item in self.prop.split(','))
        return {pair[0].strip(): pair[1].strip() for pair in pairs}

    def set_rename_prop(self, renames: Dict[str, str]) -> None:
        '''
        Sets the prop field based on the dictionary for renames

        Raises ValueError if this is not a rename node.
        '''
        if self.name != RENAME:
            raise ValueError('This is only supported on rename nodes')
        self.prop = ','.join(f'{old}{ARROW}{new}' for old, new in renames.items())
def parse_tokens(expression: List[Union[list, str]]) -> Node:
    '''Generates the tree from the tokenized expression

    expression is a list of tokens, where the content of every
    parenthesis is a sub-list of tokens.

    Returns the root node of the tree.

    Raises ParserException if the tokens are not a valid expression
    (that includes the case of no tokens at all).'''

    # If the list contains only a list, it will consider the lower level list.
    # This will allow things like ((((((a))))) to work
    while len(expression) == 1 and isinstance(expression[0], list):
        expression = expression[0]

    # The list contains only 1 string. Means it is the name of a relation
    if len(expression) == 1:
        if not rtypes.is_valid_relation_name(expression[0]):
            raise ParserException(
                f"'{expression[0]}' is not a valid relation name")
        return Variable(expression[0])  # FIXME Move validation in the object

    # Expression from right to left, searching for binary operators
    # this means that binary operators have lesser priority than
    # unary operators.
    # It finds the operator with lesser priority, uses it as root of this
    # (sub)tree using everything on its left as left parameter (so building
    # a left subtree with the part of the list located on left) and doing
    # the same on right.
    # Since it searches for strings, and expressions into parenthesis are
    # within sub-lists, they won't be found here, ensuring that they will
    # have highest priority.
    for i in range(len(expression) - 1, -1, -1):
        token = expression[i]
        if token in b_operators:  # Binary operator
            left = expression[:i]
            right = expression[i + 1:]
            if not left:
                raise ParserException(
                    f"Expected left operand for '{token}'")
            if not right:
                raise ParserException(
                    f"Expected right operand for '{token}'")
            return Binary(token, parse_tokens(left), parse_tokens(right))

    # Searches for unary operators, parsing from right to left
    for i in range(len(expression) - 1, -1, -1):
        token = expression[i]
        if token in u_operators:  # Unary operator
            # The operator must be followed by its parameters and its child
            if len(expression) <= i + 2:
                raise ParserException(
                    f"Expected more tokens in '{token}'")
            return Unary(
                token,
                prop=expression[1 + i].strip(),
                child=parse_tokens(expression[2 + i])
            )
    raise ParserException('Parse error')  # FIXME more details
def _find_matching_parenthesis(expression: str, start=0, openpar='(', closepar=')') -> Optional[int]:
'''This function returns the position of the matching
close parenthesis to the 1st open parenthesis found
starting from start (0 by default)'''
par_count = 0 # Count of parenthesis
string = False
escape = False
for i in range(start, len(expression)):
if expression[i] == '\'' and not escape:
string = not string
if expression[i] == '\\' and not escape:
escape = True
else:
escape = False
if string:
continue
if expression[i] == openpar:
par_count += 1
elif expression[i] == closepar:
par_count -= 1
if par_count == 0:
return i # Closing parenthesis of the parameter
return None
def _find_token(haystack: str, needle: str) -> int:
'''
Like the string function find, but
ignores tokens that are within a string
literal.
'''
r = -1
string = False
escape = False
for i in range(len(haystack)):
if haystack[i] == '\'' and not escape:
string = not string
if haystack[i] == '\\' and not escape:
escape = True
else:
escape = False
if string:
continue
if haystack[i:].startswith(needle):
return i
return r
def tokenize(expression: str) -> list:
    '''This function converts a relational expression into a list where
    every token of the expression is an item of a list. Expressions into
    parenthesis will be converted into sublists.

    A unary operator generates two items: the operator itself and the
    string with its parameters, that is not tokenized further.

    Raises TokenizerException if a parenthesis is not closed or if the
    parameters of a unary operator are not followed by an expression
    in parenthesis.'''

    # List for the tokens
    items: List[Union[str, list]] = []

    expression = expression.strip()  # Removes initial and ending spaces

    while expression:
        if expression.startswith('('):  # Parenthesis state
            end = _find_matching_parenthesis(expression)
            if end is None:
                raise TokenizerException(
                    "Missing matching ')' in '%s'" % expression)
            # Appends the tokenization of the content of the parenthesis
            items.append(tokenize(expression[1:end]))
            # Removes the entire parenthesis and content from the expression
            expression = expression[end + 1:].strip()

        elif expression.startswith((SELECTION, RENAME, PROJECTION)):  # Unary operators
            operator = expression[0:1]
            items.append(operator)
            # Removing operator from the expression
            expression = expression[1:].strip()

            if expression.startswith('('):
                # Parameters within parenthesis: they end where the next
                # parenthesis is opened, after the matching one is closed
                closing = _find_matching_parenthesis(expression)
                if closing is None:
                    raise TokenizerException(
                        "Missing matching ')' in '%s'" % expression)
                par = expression.find('(', closing)
            else:
                # Parameters without parenthesis: they end where the first
                # parenthesis outside of a string literal is opened
                par = _find_token(expression, '(')

            # Without this check the -1 would be used as index, silently
            # cutting the last character off the parameters
            if par == -1:
                raise TokenizerException(
                    f"Expected '(' after the parameters of '{operator}'")

            # Inserting parameter of the operator
            items.append(expression[:par].strip())
            # Removing parameter from the expression
            expression = expression[par:].strip()

        else:  # Relation (hopefully)
            expression += ' '  # To avoid the special case of the ending

            # Initial part is a relation, stop when the name of the relation is
            # over. At least one character is always consumed, so anything
            # that is not a relation name (e.g. a binary operator) becomes
            # a token of one character.
            for r in range(1, len(expression)):
                if rtypes.RELATION_NAME_REGEXP.match(expression[:r + 1]) is None:
                    break
            items.append(expression[:r])
            expression = expression[r:].strip()
    return items
def tree(expression: str) -> Node:
    '''Parses a relational algebra expression into an AST.

    The expression is split into tokens, that are then used to
    build the tree, whose root node is returned.'''
    tokens = tokenize(expression)
    return parse_tokens(tokens)
def parse(expr: str) -> CallableString:
    '''Converts a relational algebra expression into the corresponding
    Python expression.

    The return value is a CallableString (a string that can be called),
    so it can be either printed or directly executed.
    '''
    root = tree(expr)
    return root.toPython()