From 3b0392e8671e0fa90ce0f9bba9f4cab8f3333767 Mon Sep 17 00:00:00 2001 From: LtWorf Date: Thu, 3 Mar 2011 08:46:04 +0000 Subject: [PATCH] - Float type recognition is more robust, now handled using a regexp - Date type recognition is more robust, now using a combination of regexp plus date object - Parsing of strings representing dates is now cached, eliminating the need for double parse git-svn-id: http://galileo.dmi.unict.it/svn/relational/trunk@270 014f5005-505e-4b48-8d0a-63407b615a7c --- CHANGELOG | 3 ++ relational/parser.py | 2 +- relational/relation.py | 16 ++++--- relational/rtypes.py | 97 +++++++++++++++++++++++------------------- samples/ratings.csv | 9 ++++ 5 files changed, 75 insertions(+), 52 deletions(-) create mode 100644 samples/ratings.csv diff --git a/CHANGELOG b/CHANGELOG index 6a74818..a5ed7b0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -12,6 +12,9 @@ - Module parallel does something, can execute queries in parallel - Set hash method for the classes - Implemented select_union_intersect_subtract general optimization +- Float type recognition is more robust, now handled using a regexp +- Date type recognition is more robust, now using a combination of regexp plus date object +- Parsing of strings representing dates is now cached, eliminating the need for double parse 0.11 - Font is set only on windows (Rev 206) diff --git a/relational/parser.py b/relational/parser.py index f9e2564..1da85ed 100644 --- a/relational/parser.py +++ b/relational/parser.py @@ -118,7 +118,7 @@ class node (object): return self.name pass def printtree(self,level=0): - '''Prints a representation of the tree using indentation''' + '''returns a representation of the tree using indentation''' r='' for i in range(level): r+=' ' diff --git a/relational/relation.py b/relational/relation.py index 6475fd2..aaab379 100644 --- a/relational/relation.py +++ b/relational/relation.py @@ -102,14 +102,16 @@ class relation (object): for i in self.content: #Fills the attributes dictionary with 
the values of the tuple for j in range(len(self.header.attributes)): - if len(i[j])>0 and i[j].isdigit(): - attributes[self.header.attributes[j]]=int(i[j]) - elif len(i[j])>0 and rstring(i[j]).isFloat(): - attributes[self.header.attributes[j]]=float(i[j]) - elif len(i[j])>0 and isDate(i[j]): - attributes[self.header.attributes[j]]=rdate(i[j]) + tmpstring=rstring(i[j]) + + if len(tmpstring)>0 and tmpstring.isdigit(): + attributes[self.header.attributes[j]]=int(tmpstring) + elif len(tmpstring)>0 and tmpstring.isFloat(): + attributes[self.header.attributes[j]]=float(tmpstring) + elif len(tmpstring)>0 and tmpstring.isDate(): + attributes[self.header.attributes[j]]=rdate(tmpstring) else: - attributes[self.header.attributes[j]]=i[j] + attributes[self.header.attributes[j]]=tmpstring try: if eval(expr,attributes): diff --git a/relational/rtypes.py b/relational/rtypes.py index 0f38bf4..9d2ff8d 100644 --- a/relational/rtypes.py +++ b/relational/rtypes.py @@ -22,33 +22,66 @@ Purpose of this module is having the isFloat function and implementing dates to use in selection.''' import datetime +import re class rstring (str): '''String subclass with some custom methods''' def isFloat(self): - '''True if the string is a float number, false otherwise''' - lst=('0','1','2','3','4','5','6','7','8','9','.') - for i in self: - if i not in lst: - return False; - return True; + '''Returns true if the string represents a float number + it only considers as float numbers, the strings matching + the following regexp: + r'^[0-9]+(\.([0-9])+)?$' + ''' + if re.match(r'^[0-9]+(\.([0-9])+)?$',self)==None: + return False + else: + return True + + def isDate(self): + '''Returns true if the string represents a date, + in the format YYYY-MM-DD. as separators '-' , '\', '/' are allowed. 
+ As side-effect, the date object will be stored for future usage, so + no more parsings are needed + ''' + try: + return self._isdate + except: + pass + + r= re.match(r'^([0-9]{1,4})(\\|-|/)([0-9]{1,2})(\\|-|/)([0-9]{1,2})$',self) + if r==None: + self._isdate=False + self._date=None + return False + + try: #Any of the following operations can generate an exception, if it happens, we aren't dealing with a date + year=int(r.group(1)) + month=int(r.group(3)) + day=int(r.group(5)) + d=datetime.date(year,month,day) + self._isdate=True + self._date=d + return True + except: + self._isdate=False + self._date=None + return False + def getDate(self): + '''Returns the datetime.date object or None''' + try: + return self._date + except: + self.isDate() + return self._date class rdate (object): '''Represents a date''' def __init__(self,date): - sep=('-','/','\\') - splitter=None - for i in sep: - if i in date: - splitter=i - break; - elems=date.split(splitter) + '''date: A string representing a date''' + if not isinstance(date,rstring): + date=rstring(date) - year=int(elems[0]) - month=int(elems[1]) - day=int(elems[2]) - - self.intdate=datetime.date(year,month,day) + self.intdate=date.getDate() self.day= self.intdate.day self.month=self.intdate.month self.weekday=self.intdate.weekday() @@ -75,29 +108,5 @@ class rdate (object): return self.intdate!=other.intdate def __sub__ (self,other): return (self.intdate-other.intdate).days -def isDate(date): - sep=('-','/','\\') - splitter=None - for i in sep: - if i in date: - splitter=i - break; - elems=date.split(splitter) - if len(elems)!=3: - return False #Wrong number of elements - year=elems[0] - month=elems[1] - day=elems[2] - if not (year.isdigit() and month.isdigit() and day.isdigit()): - return False - year=int(year) - month=int(month) - day=int(day) - - if year<datetime.MINYEAR or year>datetime.MAXYEAR: - return False - if month<1 or month>12: - return False - if day<1 or day >31: - return False - return True \ No newline at end of file + + \ No 
newline at end of file diff --git a/samples/ratings.csv b/samples/ratings.csv new file mode 100644 index 0000000..39fd952 --- /dev/null +++ b/samples/ratings.csv @@ -0,0 +1,9 @@ +id,rating +0,5.3 +1,6 +2,5.7 +3,3.3 +4,9.1 +5,4.4 +6,5.1.1 +7,4.9