diff --git a/CHANGELOG b/CHANGELOG index eaf8226..5168f1d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,7 @@ -2.6 +3.0 +- Refactored parser to use better typing +- Refactored and fixed some optimizations +- Added more test cases - Improved survey sending - Prevent relation/field names from being reserved keywords - Fixed issue in cli where loading an invalid file would lead to a crash diff --git a/relational/optimizations.py b/relational/optimizations.py index 42c141e..2f3cfe3 100644 --- a/relational/optimizations.py +++ b/relational/optimizations.py @@ -1,5 +1,5 @@ # Relational -# Copyright (C) 2009-2018 Salvo "LtWorf" Tomaselli +# Copyright (C) 2009-2020 Salvo "LtWorf" Tomaselli # # Relational is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -30,8 +30,9 @@ from io import StringIO from tokenize import generate_tokens +from typing import Tuple, Dict - +from relational.relation import Relation from relational import parser sel_op = ( @@ -98,36 +99,7 @@ def replace_node(replace, replacement): replace.left = replacement.left -def recoursive_scan(function, node, rels=None): - '''Does a recoursive optimization on the tree. - - This function will recoursively execute the function given - as "function" parameter starting from node to all the tree. - if rels is provided it will be passed as argument to the function. - Otherwise the function will be called just on the node. - - Result value: function is supposed to return the amount of changes - it has performed on the tree. - The various result will be added up and this final value will be the - returned value.''' - changes = 0 - # recoursive scan - if node.kind == parser.UNARY: - if rels != None: - changes += function(node.child, rels) - else: - changes += function(node.child) - elif node.kind == parser.BINARY: - if rels != None: - changes += function(node.right, rels) - changes += function(node.left, rels) - else: - changes += function(node.right) - changes += function(node.left) - return changes - - -def duplicated_select(n: parser.Node) -> int: +def duplicated_select(n: parser.Node) -> Tuple[parser.Node, int]: '''This function locates and deletes things like σ a ( σ a(C)) and the ones like σ a ( σ b(C)) replacing the 1st one with a single select and @@ -135,243 +107,189 @@ def duplicated_select(n: parser.Node) -> int: in and ''' changes = 0 - if n.name == SELECTION and n.child.name == SELECTION: + while n.name == SELECTION and n.child.name == SELECTION: + changes += 1 + prop = n.prop + if n.prop != n.child.prop: # Nested but different, joining them - n.prop = n.prop + " and " + n.child.prop + prop = n.prop + " and " + n.child.prop # This adds parenthesis if they are needed if n.child.prop.startswith('(') or n.prop.startswith('('): - n.prop = '(%s)' % n.prop - - n.child = n.child.child - changes = 1 - changes += duplicated_select(n) - - return changes + recoursive_scan(duplicated_select, n) + prop = '(%s)' % prop + n = parser.Unary( + SELECTION, + prop, + n.child.child, + ) + return n, changes -def futile_union_intersection_subtraction(n: parser.Node) -> int: - '''This function locates things like r ᑌ r, and replaces them with r. - R ᑌ R --> R - R ᑎ R --> R +def futile_union_intersection_subtraction(n: parser.Node) -> Tuple[parser.Node, int]: + '''This function locates things like r ∪ r, and replaces them with r. + R ∪ R --> R + R ∩ R --> R R - R --> σ False (R) σ k (R) - R --> σ False (R) R - σ k (R) --> σ not k (R) - σ k (R) ᑌ R --> R - σ k (R) ᑎ R --> σ k (R) + σ k (R) ∪ R --> R + σ k (R) ∩ R --> σ k (R) ''' changes = 0 # Union and intersection of the same thing if n.name in (UNION, INTERSECTION, JOIN, JOIN_LEFT, JOIN_RIGHT, JOIN_FULL) and n.left == n.right: - changes = 1 - replace_node(n, n.left) + return n.left, 1 # selection and union of the same thing elif (n.name == UNION): if n.left.name == SELECTION and n.left.child == n.right: - changes = 1 - replace_node(n, n.right) + return n.right, 1 elif n.right.name == SELECTION and n.right.child == n.left: - changes = 1 - replace_node(n, n.left) + return n.left, 1 # selection and intersection of the same thing elif n.name == INTERSECTION: if n.left.name == SELECTION and n.left.child == n.right: - changes = 1 - replace_node(n, n.left) + return n.left, 1 elif n.right.name == SELECTION and n.right.child == n.left: - changes = 1 - replace_node(n, n.right) + return n.right, 1 # Subtraction and selection of the same thing elif n.name == DIFFERENCE and \ n.right.name == SELECTION and \ n.right.child == n.left: - n.name = n.right.name - n.kind = n.right.kind - n.child = n.right.child - n.prop = '(not (%s))' % n.right.prop - n.left = n.right = None + return parser.Unary( + SELECTION, + '(not (%s))' % n.right.prop, + n.right.child), 1 # Subtraction of the same thing or with selection on the left child elif n.name == DIFFERENCE and (n.left == n.right or (n.left.name == SELECTION and n.left.child == n.right)): - changes = 1 - n.kind = parser.UNARY - n.name = SELECTION - n.prop = 'False' - n.child = n.left.get_left_leaf() - # n.left=n.right=None - - return changes + recoursive_scan(futile_union_intersection_subtraction, n) + return parser.Unary( + SELECTION, + 'False', + n.get_left_leaf() + ), 1 + return n, 0 -def down_to_unions_subtractions_intersections(n: parser.Node) -> int: - '''This funcion locates things like σ i==2 (c ᑌ d), where the union +def down_to_unions_subtractions_intersections(n: parser.Node) -> Tuple[parser.Node, int]: + '''This funcion locates things like σ i==2 (c ∪ d), where the union can be a subtraction and an intersection and replaces them with - σ i==2 (c) ᑌ σ i==2(d). + σ i==2 (c) ∪ σ i==2(d). ''' changes = 0 _o = (UNION, DIFFERENCE, INTERSECTION) if n.name == SELECTION and n.child.name in _o: + l = parser.Unary(SELECTION, n.prop, n.child.left) + r = parser.Unary(SELECTION, n.prop, n.child.right) - left = parser.Node() - left.prop = n.prop - left.name = n.name - left.child = n.child.left - left.kind = parser.UNARY - right = parser.Node() - right.prop = n.prop - right.name = n.name - right.child = n.child.right - right.kind = parser.UNARY - - n.name = n.child.name - n.left = left - n.right = right - n.child = None - n.prop = None - n.kind = parser.BINARY - changes += 1 - - return changes + recoursive_scan(down_to_unions_subtractions_intersections, n) + return parser.Binary(n.child.name, l, r), 1 + return n, 0 -def duplicated_projection(n: parser.Node) -> int: +def duplicated_projection(n: parser.Node) -> Tuple[parser.Node, int]: '''This function locates thing like π i ( π j (R)) and replaces them with π i (R)''' - changes = 0 if n.name == PROJECTION and n.child.name == PROJECTION: - n.child = n.child.child - changes += 1 - - return changes + recoursive_scan(duplicated_projection, n) + return parser.Unary( + PROJECTION, + n.prop, + n.child.child), 1 + return n, 0 -def selection_inside_projection(n: parser.Node) -> int: +def selection_inside_projection(n: parser.Node) -> Tuple[parser.Node, int]: '''This function locates things like σ j (π k(R)) and converts them into π k(σ j (R))''' - changes = 0 - if n.name == SELECTION and n.child.name == PROJECTION: - changes = 1 - temp = n.prop - n.prop = n.child.prop - n.child.prop = temp - n.name = PROJECTION - n.child.name = SELECTION + child = parser.Unary( + SELECTION, + n.prop, + n.child.child + ) - return changes + recoursive_scan(selection_inside_projection, n) + return parser.Unary(PROJECTION, n.child.prop, child), 0 + return n, 0 -def swap_union_renames(n: parser.Node) -> int: +def swap_union_renames(n: parser.Node) -> Tuple[parser.Node, int]: '''This function locates things like - ρ a➡b(R) ᑌ ρ a➡b(Q) + ρ a➡b(R) ∪ ρ a➡b(Q) and replaces them with - ρ a➡b(R ᑌ Q). + ρ a➡b(R ∪ Q). Does the same with subtraction and intersection''' - changes = 0 - - if n.name in (DIFFERENCE, UNION, INTERSECTION) and n.left.name == n.right.name and n.left.name == RENAME: - l_vars = {} - for i in n.left.prop.split(','): - q = i.split(ARROW) - l_vars[q[0].strip()] = q[1].strip() - - r_vars = {} - for i in n.right.prop.split(','): - q = i.split(ARROW) - r_vars[q[0].strip()] = q[1].strip() - + if n.name in (DIFFERENCE, UNION, INTERSECTION) and n.left.name == RENAME and n.right.name == RENAME: + l_vars = n.left.get_rename_prop() + r_vars = n.right.get_rename_prop() if r_vars == l_vars: - changes = 1 - - # Copying self, but child will be child of renames - q = parser.Node() - q.name = n.name - q.kind = parser.BINARY - q.left = n.left.child - q.right = n.right.child - - n.name = RENAME - n.kind = parser.UNARY - n.child = q - n.prop = n.left.prop - n.left = n.right = None - - return changes + recoursive_scan(swap_union_renames, n) + child = parser.Binary(n.name, n.left.child, n.right.child) + return parser.Unary(RENAME, n.left.prop, child), 1 + return n, 0 -def futile_renames(n: parser.Node) -> int: - '''This function purges renames like id->id''' - changes = 0 +def futile_renames(n: parser.Node) -> Tuple[parser.Node, int]: + '''This function purges renames like + ρ id->id,a->q (A) + into + ρ a->q (A) + or removes the operation entirely if they all get removed + ''' if n.name == RENAME: - # Located two nested renames. - changes = 1 + renames = n.get_rename_prop() + changes = False + for k, v in renames.items(): + if k == v: + changes = True + del renames[k] + if len(renames) == 0: # Nothing to rename, removing the rename + return n.child, 1 + elif changes: + # Changing the node in place, no need to return to cause a recursive step + n.set_rename_prop(renames) - # Creating a dictionary with the attributes - _vars = {} - for i in n.prop.split(','): - q = i.split(ARROW) - _vars[q[0].strip()] = q[1].strip() - # Scans dictionary to locate things like "a->b,b->c" and replace them - # with "a->c" - for key in list(_vars.keys()): - value = _vars.get(key) - if key == value: - _vars.pop(value) # Removes the unused one - - if len(_vars) == 0: # Nothing to rename, removing the rename op - replace_node(n, n.child) - else: - n.prop = ','.join('%s%s%s' % (i[0], ARROW, i[1]) for i in _vars.items()) - - return changes + recoursive_scan(futile_renames, n) + return n, 0 -def subsequent_renames(n: parser.Node) -> int: - '''This function removes redoundant subsequent renames joining them into one''' - - '''Purges renames like id->id Since it's needed to be performed BEFORE this one - so it is not in the list with the other optimizations''' - futile_renames(n) - changes = 0 - +def subsequent_renames(n: parser.Node) -> Tuple[parser.Node, int]: + '''This function removes redundant subsequent renames joining them into one + ρ .. ρ .. (A) + into + ρ ... (A) + ''' if n.name == RENAME and n.child.name == RENAME: # Located two nested renames. - changes = 1 - # Joining the attribute into one - n.prop += ',' + n.child.prop - n.child = n.child.child + prop = n.prop + ',' + n.child.prop + child = n.child.child + n = parser.Unary(RENAME, prop, child) # Creating a dictionary with the attributes - _vars = {} - for i in n.prop.split(','): - q = i.split(ARROW) - _vars[q[0].strip()] = q[1].strip() + renames = n.get_rename_prop() + # Scans dictionary to locate things like "a->b,b->c" and replace them # with "a->c" - for key in list(_vars.keys()): - value = _vars.get(key) - if value in _vars.keys(): - if _vars[value] != key: + for key, value in tuple(renames.items()): + + if value in renames: + if renames[value] != key: # Double rename on attribute - _vars[key] = _vars[_vars[key]] # Sets value - _vars.pop(value) # Removes the unused one + renames[key] = renames[renames[key]] # Sets value + del renames[value] # Removes the unused one else: # Cycle rename a->b,b->a - _vars.pop(value) # Removes the unused one - _vars.pop(key) # Removes the unused one + del renames[value] # Removes the unused one + del renames[key] # Removes the unused one - if len(_vars) == 0: # Nothing to rename, removing the rename op - replace_node(n, n.child) + if len(renames) == 0: # Nothing to rename, removing the rename op + return n.child, 1 else: - n.prop = ','.join('%s%s%s' % (i[0], ARROW, i[1]) for i in _vars.items()) + n.set_rename_prop(renames) + return n, 1 - return changes + recoursive_scan(subsequent_renames, n) + return n, 0 class level_string(str): @@ -411,101 +329,84 @@ def tokenize_select(expression): return l -def swap_rename_projection(n: parser.Node) -> int: - '''This function locates things like π k(ρ j(R)) - and replaces them with ρ j(π k(R)). +def swap_rename_projection(n: parser.Node) -> Tuple[parser.Node, int]: + '''This function locates things like + π k(ρ j(R)) + and replaces them with + ρ j(π k(R)). This will let rename work on a hopefully smaller set and more important, will hopefully allow further optimizations. - Will also eliminate fields in the rename that are cutted in the projection. + + Will also eliminate fields in the rename that are cut in the projection. ''' - changes = 0 if n.name == PROJECTION and n.child.name == RENAME: - changes = 1 - # π index,name(ρ id➡index(R)) - _vars = {} - for i in n.child.prop.split(','): - q = i.split(ARROW) - _vars[q[1].strip()] = q[0].strip() + renames = n.child.get_rename_prop() + projections = set(n.get_projection_prop()) - _pr = n.prop.split(',') - for i in range(len(_pr)): - try: - _pr[i] = _vars[_pr[i].strip()] - except: - pass + # Use pre-rename names in the projection + for k, v in renames.items(): + if v in projections: + projections.remove(v) + projections.add(k) - _pr_reborn = n.prop.split(',') - for i in list(_vars.keys()): - if i not in _pr_reborn: - _vars.pop(i) - n.name = n.child.name + # Eliminate fields + for i in list(renames.keys()): + if i not in projections: + del renames[i] - n.prop = ','.join('%s%s%s' % (i[1], ARROW, i[0]) for i in _vars.items()) + child = parser.Unary(PROJECTION,'' , n.child.child) + child.set_projection_prop(projections) + n = parser.Unary(RENAME, '', child) + n.set_rename_prop(renames) + return n, 1 - n.child.name = PROJECTION - n.child.prop = '' - for i in _pr: - n.child.prop += i + ',' - n.child.prop = n.child.prop[:-1] - - return changes + recoursive_scan(swap_rename_projection, n) + return n, 0 def swap_rename_select(n: parser.Node) -> int: - '''This function locates things like σ k(ρ j(R)) and replaces - them with ρ j(σ k(R)). Renaming the attributes used in the + '''This function locates things like + σ k(ρ j(R)) + and replaces them with + ρ j(σ k(R)). + Renaming the attributes used in the selection, so the operation is still valid.''' - changes = 0 if n.name == SELECTION and n.child.name == RENAME: - changes = 1 - # Dictionary containing attributes of rename - _vars = {} - for i in n.child.prop.split(','): - q = i.split(ARROW) - _vars[q[1].strip()] = q[0].strip() + # This is an inverse mapping for the rename + renames = {v: k for k, v in n.child.get_rename_prop().items()} # tokenizes expression in select - _tokens = tokenize_select(n.prop) + tokens = tokenize_select(n.prop) - # Renaming stuff - for i in range(len(_tokens)): - splitted = _tokens[i].split('.', 1) - if splitted[0] in _vars: - if len(splitted) == 1: - _tokens[i] = _vars[_tokens[i].split('.')[0]] - else: - _tokens[i] = _vars[ - _tokens[i].split('.')[0]] + '.' + splitted[1] + # Renaming stuff, no enum because I edit the tokens + for i in range(len(tokens)): + splitted = tokens[i].split('.', 1) + if splitted[0] in renames: + tokens[i] = renames[splitted[0]] + if len(splitted) > 1: + tokens[i] += '.' + splitted[1] - # Swapping operators - n.name = RENAME - n.child.name = SELECTION - - n.prop = n.child.prop - n.child.prop = ' '.join(_tokens) - - return changes + recoursive_scan(swap_rename_select, n) + child = parser.Unary(SELECTION, ' '.join(tokens), n.child.child) + return parser.Unary(RENAME, n.child.prop, child), 1 + return n, 0 def select_union_intersect_subtract(n: parser.Node) -> int: - '''This function locates things like σ i(a) ᑌ σ q(a) - and replaces them with σ (i OR q) (a) + '''This function locates things like + σ i(a) ∪ σ q(a) + and replaces them with + σ (i OR q) (a) Removing a O(n²) operation like the union''' - changes = 0 if n.name in {UNION, INTERSECTION, DIFFERENCE} and \ n.left.name == SELECTION and \ n.right.name == SELECTION and \ n.left.child == n.right.child: - changes = 1 d = {UNION: 'or', INTERSECTION: 'and', DIFFERENCE: 'and not'} op = d[n.name] - newnode = parser.Node() - if n.left.prop.startswith('(') or n.right.prop.startswith('('): t_str = '(' if n.left.prop.startswith('('): @@ -519,54 +420,34 @@ def select_union_intersect_subtract(n: parser.Node) -> int: t_str += '%s' t_str += ')' - newnode.prop = t_str % (n.left.prop, op, n.right.prop) + prop = t_str % (n.left.prop, op, n.right.prop) else: - newnode.prop = '%s %s %s' % (n.left.prop, op, n.right.prop) - newnode.name = SELECTION - newnode.child = n.left.child - newnode.kind = parser.UNARY - replace_node(n, newnode) - - return changes + recoursive_scan(select_union_intersect_subtract, n) + prop = '%s %s %s' % (n.left.prop, op, n.right.prop) + return parser.Unary(SELECTION, prop, n.left.child), 1 + return n, 0 -def union_and_product(n: parser.Node) -> int: +def union_and_product(n: parser.Node) -> Tuple[parser.Node, int]: ''' A * B ∪ A * C = A * (B ∪ C) Same thing with inner join ''' - - changes = 0 if n.name == UNION and n.left.name in {PRODUCT, JOIN} and n.left.name == n.right.name: - newnode = parser.Node() - newnode.kind = parser.BINARY - newnode.name = n.left.name - - newchild = parser.Node() - newchild.kind = parser.BINARY - newchild.name = UNION - if n.left.left == n.right.left or n.left.left == n.right.right: - newnode.left = n.left.left - newnode.right = newchild - - newchild.left = n.left.right - newchild.right = n.right.left if n.left.left == n.right.right else n.right.right - replace_node(n, newnode) - changes = 1 + l = n.left.right + r = n.right.left if n.left.left == n.right.right else n.right.right + newchild = parser.Binary(UNION, l, r) + return parser.Binary(n.left.name, n.left.left, newchild), 1 elif n.left.right == n.right.left or n.left.left == n.right.right: - newnode.left = n.left.right - newnode.right = newchild - - newchild.left = n.left.left - newchild.right = n.right.left if n.right.left == n.right.right else n.right.right - replace_node(n, newnode) - changes = 1 - return changes + recoursive_scan(union_and_product, n) + l = n.left.left + r = n.right.left if n.right.left == n.right.right else n.right.right + newchild = parser.Binary(UNION, l, r) + return parser.Binary(n.left.name, n.left.right, newchild), 1 + return n, 0 -def projection_and_union(n, rels): +def projection_and_union(n: parser.Node, rels: Dict[str, Relation]) -> Tuple[parser.Node, int]: ''' Turns π a,b,c(A) ∪ π a,b,c(B) @@ -581,28 +462,16 @@ def projection_and_union(n, rels): n.left.name == PROJECTION and \ n.right.name == PROJECTION and \ set(n.left.child.result_format(rels)) == set(n.right.child.result_format(rels)): - newchild = parser.Node() - newchild.kind = parser.BINARY - newchild.name = UNION - newchild.left = n.left.child - newchild.right = n.right.child - - newnode = parser.Node() - newnode.child = newchild - newnode.kind = parser.UNARY - newnode.name = PROJECTION - newnode.prop = n.right.prop - replace_node(n, newnode) - changes = 1 - return changes + recoursive_scan(projection_and_union, n, rels) + child = parser.Binary(UNION, n.left.child, n.right.child) + return parser.Unary(PROJECTION, n.right.prop, child), 0 + return n, 0 -def selection_and_product(n, rels): +def selection_and_product(n: parser.Node, rels: Dict[str, Relation]) -> parser.Node: '''This function locates things like σ k (R*Q) and converts them into σ l (σ j (R) * σ i (Q)). Where j contains only attributes belonging to R, i contains attributes belonging to Q and l contains attributes belonging to both''' - changes = 0 if n.name == SELECTION and n.child.name in (PRODUCT, JOIN): l_attr = n.child.left.result_format(rels) @@ -637,76 +506,71 @@ def selection_and_product(n, rels): if j in r_attr: # Field in right r_fields = True - if l_fields and r_fields: # Fields in both - both.append(i) - elif l_fields: + if l_fields and not r_fields: left.append(i) - elif r_fields: + elif r_fields and not l_fields: right.append(i) else: # Unknown.. adding in both both.append(i) # Preparing left selection - if len(left) > 0: - changes = 1 - l_node = parser.Node() - l_node.name = SELECTION - l_node.kind = parser.UNARY - l_node.child = n.child.left - l_node.prop = '' - n.child.left = l_node + if left: + l_prop = '' while len(left) > 0: c = left.pop(0) for i in c: - l_node.prop += i + ' ' + l_prop += i + ' ' if len(left) > 0: - l_node.prop += ' and ' - if '(' in l_node.prop: - l_node.prop = '(%s)' % l_node.prop + l_prop += ' and ' + if '(' in l_prop: + l_prop = '(%s)' % l_prop + l_node = parser.Unary(SELECTION, l_prop, n.child.left) + else: + l_node = n.child.left # Preparing right selection - if len(right) > 0: - changes = 1 - r_node = parser.Node() - r_node.name = SELECTION - r_node.prop = '' - r_node.kind = parser.UNARY - r_node.child = n.child.right - n.child.right = r_node + if right: + r_prop = '' while len(right) > 0: c = right.pop(0) - r_node.prop += ' '.join(c) + r_prop += ' '.join(c) if len(right) > 0: - r_node.prop += ' and ' - if '(' in r_node.prop: - r_node.prop = '(%s)' % r_node.prop + r_prop += ' and ' + if '(' in r_prop: + r_prop = '(%s)' % r_prop + r_node = parser.Unary(SELECTION, r_prop, n.child.right) + else: + r_node = n.child.right + + b_node = parser.Binary(n.child.name, l_node, r_node) + # Changing main selection - n.prop = '' - if len(both) != 0: + if both: + both_prop = '' while len(both) > 0: c = both.pop(0) - n.prop += ' '.join(c) + both_prop += ' '.join(c) if len(both) > 0: - n.prop += ' and ' - if '(' in n.prop: - n.prop = '(%s)' % n.prop + both_prop += ' and ' + if '(' in both_prop: + both_prop = '(%s)' % both_prop + r = parser.Unary(SELECTION, both_prop, b_node) + return r, len(left) + len(right) else: # No need for general select - replace_node(n, n.child) + return b_node, 1 - return changes + recoursive_scan(selection_and_product, n, rels) + return n, 0 -def useless_projection(n, rels) -> int: +def useless_projection(n: parser.Node, rels: Dict[str, Relation]) -> Tuple[parser.Node, int]: ''' Removes projections that are over all the fields ''' - changes = 0 if n.name == PROJECTION and \ set(n.child.result_format(rels)) == set(i.strip() for i in n.prop.split(',')): - changes = 1 - replace_node(n, n.child) + return n.child, 1 - return changes + recoursive_scan(useless_projection, n, rels) + return n, 0 general_optimizations = [ duplicated_select, @@ -714,6 +578,7 @@ general_optimizations = [ duplicated_projection, selection_inside_projection, subsequent_renames, + futile_renames, swap_rename_select, futile_union_intersection_subtraction, swap_union_renames, @@ -726,6 +591,3 @@ specific_optimizations = [ projection_and_union, useless_projection, ] - -if __name__ == "__main__": - print (tokenize_select("skill == 'C' and id % 2 == 0")) diff --git a/relational/optimizer.py b/relational/optimizer.py index 3e8ccd5..7a4c6d7 100644 --- a/relational/optimizer.py +++ b/relational/optimizer.py @@ -1,5 +1,5 @@ # Relational -# Copyright (C) 2008-2016 Salvo "LtWorf" Tomaselli +# Copyright (C) 2008-2020 Salvo "LtWorf" Tomaselli # # Relational is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -22,23 +22,22 @@ # relational query, or it can be a parse tree for a relational expression (ie: class parser.node). # The functions will always return a string with the optimized query, but if a parse tree was provided, # the parse tree itself will be modified accordingly. -from typing import Union, Optional, Dict, Any +from typing import Union, Optional, Dict, Any, Tuple +from relational.relation import Relation from relational import optimizations -from relational.parser import Node, RELATION, UNARY, BINARY, op_functions, tokenize, tree +from relational.parser import Node, Variable, Unary, Binary, op_functions, tokenize, tree from relational import querysplit from relational.maintenance import UserInterface -ContextDict = Dict[str,Any] - -def optimize_program(code, rels: ContextDict): +def optimize_program(code, rels: Dict[str, Relation]): ''' Optimize an entire program, composed by multiple expressions and assignments. ''' lines = code.split('\n') - context = {} # type: ContextDict + context = {} for line in lines: line = line.strip() @@ -53,7 +52,7 @@ def optimize_program(code, rels: ContextDict): return querysplit.split(node, rels) -def optimize_all(expression: Union[str, Node], rels: ContextDict, specific: bool = True, general: bool = True, debug: Optional[list] = None, tostr: bool = True) -> Union[str, Node]: +def optimize_all(expression: Union[str, Node], rels: Dict[str, Relation], specific: bool = True, general: bool = True, debug: Optional[list] = None, tostr: bool = True) -> Union[str, Node]: '''This function performs all the available optimizations. expression : see documentation of this module @@ -82,23 +81,23 @@ def optimize_all(expression: Union[str, Node], rels: ContextDict, specific: bool total = 0 if specific: for i in optimizations.specific_optimizations: - res = i(n, rels) # Performs the optimization - if res != 0 and dbg: + n, c = recursive_scan(i, n, rels) + if c != 0 and dbg: debug.append(str(n)) - total += res + total += c if general: for i in optimizations.general_optimizations: - res = i(n) # Performs the optimization - if res != 0 and dbg: + n, c = recursive_scan(i, n, None) + if c != 0 and dbg: debug.append(str(n)) - total += res + total += c if tostr: return str(n) else: return n -def specific_optimize(expression, rels: ContextDict): +def specific_optimize(expression, rels: Dict[str, Relation]): '''This function performs specific optimizations. Means that it will need to know the fields used by the relations. @@ -117,3 +116,35 @@ def general_optimize(expression): Return value: this will return an optimized version of the expression''' return optimize_all(expression, None, specific=False, general=True) + + +def recursive_scan(function, node, rels) -> Tuple[Node, int]: + '''Does a recursive optimization on the tree. + + This function will recursively execute the function given + as "function" parameter starting from node to all the tree. + if rels is provided it will be passed as argument to the function. + Otherwise the function will be called just on the node. + + Result value: function is supposed to return the amount of changes + it has performed on the tree. + The various result will be added up and this final value will be the + returned value.''' + + args = [] + if rels: + args.append(rels) + + changes = 0 + node, c = function(node, *args) + changes += c + + if isinstance(node, Unary): + node.child, c = recursive_scan(function, node.child, rels) + changes += c + elif isinstance(node, Binary): + node.left, c = recursive_scan(function, node.left, rels) + changes += c + node.right, c = recursive_scan(function, node.right, rels) + changes += c + return node, changes diff --git a/relational/parser.py b/relational/parser.py index c09eb46..a5d6d34 100644 --- a/relational/parser.py +++ b/relational/parser.py @@ -1,5 +1,5 @@ # Relational -# Copyright (C) 2008-2017 Salvo "LtWorf" Tomaselli +# Copyright (C) 2008-2020 Salvo "LtWorf" Tomaselli # # Relational is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -24,14 +24,11 @@ # # Language definition here: # http://ltworf.github.io/relational/grammar.html -from typing import Optional, Union, List, Any +from typing import Optional, Union, List, Any, Dict +from dataclasses import dataclass from relational import rtypes -RELATION = 0 -UNARY = 1 -BINARY = 2 - PRODUCT = '*' DIFFERENCE = '-' UNION = '∪' @@ -84,9 +81,8 @@ class CallableString(str): ''' return eval(self, context) - +@dataclass class Node: - '''This class is a node of a relational expression. Leaves are relations and internal nodes are operations. @@ -102,72 +98,12 @@ class Node: operation. This class is used to convert an expression into python code.''' - kind = None # type: Optional[int] - __hash__ = None # type: None + name: str - def __init__(self, expression: Optional[list] = None) -> None: - '''Generates the tree from the tokenized expression - If no expression is specified then it will create an empty node''' - if expression is None or len(expression) == 0: - return + def __init__(self, name: str) -> None: + raise NotImplementedError('This is supposed to be an abstract class') - # If the list contains only a list, it will consider the lower level list. - # This will allow things like ((((((a))))) to work - while len(expression) == 1 and isinstance(expression[0], list): - expression = expression[0] - - # The list contains only 1 string. Means it is the name of a relation - if len(expression) == 1: - self.kind = RELATION - self.name = expression[0] - if not rtypes.is_valid_relation_name(self.name): - raise ParserException( - u"'%s' is not a valid relation name" % self.name) - return - - # Expression from right to left, searching for binary operators - # this means that binary operators have lesser priority than - # unary operators. - # It finds the operator with lesser priority, uses it as root of this - # (sub)tree using everything on its left as left parameter (so building - # a left subtree with the part of the list located on left) and doing - # the same on right. - # Since it searches for strings, and expressions into parenthesis are - # within sub-lists, they won't be found here, ensuring that they will - # have highest priority. - for i in range(len(expression) - 1, -1, -1): - if expression[i] in b_operators: # Binary operator - self.kind = BINARY - self.name = expression[i] - - if len(expression[:i]) == 0: - raise ParserException( - u"Expected left operand for '%s'" % self.name) - - if len(expression[i + 1:]) == 0: - raise ParserException( - u"Expected right operand for '%s'" % self.name) - - self.left = node(expression[:i]) - self.right = node(expression[i + 1:]) - return - '''Searches for unary operators, parsing from right to left''' - for i in range(len(expression) - 1, -1, -1): - if expression[i] in u_operators: # Unary operator - self.kind = UNARY - self.name = expression[i] - - if len(expression) <= i + 2: - raise ParserException( - u"Expected more tokens in '%s'" % self.name) - - self.prop = expression[1 + i].strip() - self.child = node(expression[2 + i]) - - return - raise ParserException("Expected operator in '%s'" % expression) - - def toCode(self): + def toCode(self): #FIXME return type '''This method converts the AST into a python code object''' code = self._toPython() return compile(code, '', 'eval') @@ -181,25 +117,7 @@ class Node: return CallableString(self._toPython()) def _toPython(self) -> str: - ''' - Same as toPython but returns a regular string - ''' - if self.name in b_operators: - return '%s.%s(%s)' % (self.left.toPython(), op_functions[self.name], self.right.toPython()) - elif self.name in u_operators: - prop = self.prop - - # Converting parameters - if self.name == PROJECTION: - prop = '\"%s\"' % prop.replace(' ', '').replace(',', '\",\"') - elif self.name == RENAME: - prop = '{\"%s\"}' % prop.replace( - ',', '\",\"').replace(ARROW, '\":\"').replace(' ', '') - else: # Selection - prop = repr(prop) - - return '%s.%s(%s)' % (self.child.toPython(), op_functions[self.name], prop) - return self.name + raise NotImplementedError() def printtree(self, level: int = 0) -> str: '''returns a representation of the tree using indentation''' @@ -216,27 +134,20 @@ class Node: return '\n' + r def get_left_leaf(self) -> 'Node': - '''This function returns the leftmost leaf in the tree.''' - if self.kind == RELATION: - return self - elif self.kind == UNARY: - return self.child.get_left_leaf() - elif self.kind == BINARY: - return self.left.get_left_leaf() - raise ValueError('What kind of alien object is this?') + raise NotImplementedError() - def result_format(self, rels: dict) -> list: + def result_format(self, rels: dict) -> list: #FIXME types '''This function returns a list containing the fields that the resulting relation will have. It requires a dictionary where keys are the names of the relations and the values are the relation objects.''' if not isinstance(rels, dict): raise TypeError('Can\'t be of None type') - if self.kind == RELATION: + if isinstance(self, Variable): #FIXME this is ugly return list(rels[self.name].header) - elif self.kind == BINARY and self.name in (DIFFERENCE, UNION, INTERSECTION): + elif isinstance(self, Binary) and self.name in (DIFFERENCE, UNION, INTERSECTION): return self.left.result_format(rels) - elif self.kind == BINARY and self.name == DIVISION: + elif isinstance(self, Binary) and self.name == DIVISION: return list(set(self.left.result_format(rels)) - set(self.right.result_format(rels))) elif self.name == PROJECTION: return [i.strip() for i in self.prop.split(',')] @@ -259,7 +170,7 @@ class Node: return list(set(self.left.result_format(rels)).union(set(self.right.result_format(rels)))) raise ValueError('What kind of alien object is this?') - def __eq__(self, other): + def __eq__(self, other): #FIXME if not (isinstance(other, node) and self.name == other.name and self.kind == other.kind): return False @@ -271,22 +182,151 @@ class Node: return self.left == other.left and self.right == other.right return True + +@dataclass +class Variable(Node): + def _toPython(self) -> str: + return self.name + def __str__(self): - if (self.kind == RELATION): - return self.name - elif (self.kind == UNARY): - return self.name + " " + self.prop + " (" + self.child.__str__() + ")" - elif (self.kind == BINARY): - le = self.left.__str__() - if self.right.kind != BINARY: - re = self.right.__str__() - else: - re = "(" + self.right.__str__() + ")" - return (le + self.name + re) - raise ValueError('What kind of alien object is this?') + return self.name + + def get_left_leaf(self) -> Node: + return self -def _find_matching_parenthesis(expression: str, start=0, openpar=u'(', closepar=u')') -> Optional[int]: +@dataclass +class Binary(Node): + left: Node + right: Node + + def get_left_leaf(self) -> Node: + return self.left.get_left_leaf() + + def _toPython(self) -> str: + return '%s.%s(%s)' % (self.left._toPython(), op_functions[self.name], self.right._toPython()) + + def __str__(self): + le = self.left.__str__() + if isinstance(self.right, Binary): + re = "(" + self.right.__str__() + ")" + else: + re = self.right.__str__() + return (le + self.name + re) #TODO use fstrings + + +@dataclass +class Unary(Node): + prop: str + child: Node + + def get_left_leaf(self) -> Node: + return self.child.get_left_leaf() + + def __str__(self): + return self.name + " " + self.prop + " (" + self.child.__str__() + ")" #TODO use fstrings + + def _toPython(self) -> str: + prop = self.prop + + # Converting parameters + if self.name == PROJECTION: + prop = '\"%s\"' % prop.replace(' ', '').replace(',', '\",\"') + elif self.name == RENAME: + prop = repr(self.get_rename_prop()) + else: # Selection + prop = repr(prop) + + return '%s.%s(%s)' % (self.child._toPython(), op_functions[self.name], prop) + + def get_projection_prop(self) -> List[str]: + if self.name != PROJECTION: + raise ValueError('This is only supported on projection nodes') + return [i.strip() for i in self.prop.split(',')] + + def set_projection_prop(self, p: List[str]) -> None: + if self.name != PROJECTION: + raise ValueError('This is only supported on projection nodes') + self.prop = ','.join(p) + + def get_rename_prop(self) -> Dict[str, str]: + ''' + Returns the dictionary that the rename operation wants + ''' + if self.name != RENAME: + raise ValueError('This is only supported on rename nodes') + r = {} + for i in self.prop.split(','): + q = i.split(ARROW) + r[q[0].strip()] = q[1].strip() + return r + + def set_rename_prop(self, renames: Dict[str, str]) -> None: + ''' + Sets the prop field based on the dictionary for renames + ''' + if self.name != RENAME: + raise ValueError('This is only supported on rename nodes') + self.prop = ','.join(f'{k}{ARROW}{v}' for k, v in renames.items()) + + + + +def parse_tokens(expression: List[Union[list, str]]) -> Node: + '''Generates the tree from the tokenized expression + If no expression is specified then it will create an empty node''' + + # If the list contains only a list, it will consider the lower level list. + # This will allow things like ((((((a))))) to work + while len(expression) == 1 and isinstance(expression[0], list): + expression = expression[0] + + # The list contains only 1 string. Means it is the name of a relation + if len(expression) == 1: + + if not rtypes.is_valid_relation_name(expression[0]): + raise ParserException( + u"'%s' is not a valid relation name" % expression[0]) + return Variable(expression[0]) #FIXME Move validation in the object + + # Expression from right to left, searching for binary operators + # this means that binary operators have lesser priority than + # unary operators. + # It finds the operator with lesser priority, uses it as root of this + # (sub)tree using everything on its left as left parameter (so building + # a left subtree with the part of the list located on left) and doing + # the same on right. + # Since it searches for strings, and expressions into parenthesis are + # within sub-lists, they won't be found here, ensuring that they will + # have highest priority. + for i in range(len(expression) - 1, -1, -1): + if expression[i] in b_operators: # Binary operator + + + if len(expression[:i]) == 0: + raise ParserException( + u"Expected left operand for '%s'" % self.name) + + if len(expression[i + 1:]) == 0: + raise ParserException( + u"Expected right operand for '%s'" % self.name) + return Binary(expression[i], parse_tokens(expression[:i]), parse_tokens(expression[i + 1:])) + '''Searches for unary operators, parsing from right to left''' + for i in range(len(expression) - 1, -1, -1): + if expression[i] in u_operators: # Unary operator + if len(expression) <= i + 2: + raise ParserException( + u"Expected more tokens in '%s'" % self.name) + + return Unary( + expression[i], + prop=expression[1 + i].strip(), + child=parse_tokens(expression[2 + i]) + ) + raise ParserException('Parse error') #FIXME more details + + +def _find_matching_parenthesis(expression: str, start=0, openpar='(', closepar=')') -> Optional[int]: '''This function returns the position of the matching close parenthesis to the 1st open parenthesis found starting from start (0 by default)''' @@ -391,7 +431,7 @@ def tokenize(expression: str) -> list: def tree(expression: str) -> Node: '''This function parses a relational algebra expression into a AST and returns the root node using the Node class.''' - return Node(tokenize(expression)) + return parse_tokens(tokenize(expression)) def parse(expr: str) -> CallableString: @@ -400,11 +440,3 @@ def parse(expr: str) -> CallableString: Python expression. ''' return tree(expr).toPython() - -if __name__ == "__main__": - while True: - e = input("Expression: ") - print (parse(e)) - -# Backwards compatibility -node = Node diff --git a/tests_dir/people_rename.query b/tests_dir/people_rename.query new file mode 100644 index 0000000..b2fba90 --- /dev/null +++ b/tests_dir/people_rename.query @@ -0,0 +1 @@ +ρ name➡n,age➡a(σTrue(people)) ∪ ρ age➡a,name➡n(people) diff --git a/tests_dir/people_rename.result b/tests_dir/people_rename.result new file mode 100644 index 0000000..6e4a1ba --- /dev/null +++ b/tests_dir/people_rename.result @@ -0,0 +1,9 @@ +id,n,chief,a +0,jack,0,22 +1,carl,0,20 +2,john,1,30 +3,dean,1,33 +4,eve,0,25 +5,duncan,4,30 +6,paul,4,30 +7,alia,1,28 diff --git a/tests_dir/people_rename_select.query b/tests_dir/people_rename_select.query new file mode 100644 index 0000000..d728fc8 --- /dev/null +++ b/tests_dir/people_rename_select.query @@ -0,0 +1 @@ +σ i%2==0 (ρ id➡i (people)) diff --git a/tests_dir/people_rename_select.result b/tests_dir/people_rename_select.result new file mode 100644 index 0000000..83a690c --- /dev/null +++ b/tests_dir/people_rename_select.result @@ -0,0 +1,5 @@ +i,name,chief,age +0,jack,0,22 +2,john,1,30 +4,eve,0,25 +6,paul,4,30