1- import copy
21import csv
32import io
43import logging
76
87logger = logging .getLogger (__name__ )
98
10- TABLE_MIN_ROWS = 20
11-
12- DELIMITERS = {
13- '\t ' : 'tab' ,
14- ' ' : 'space' ,
15- ';' : 'semicolon' ,
16- ',' : 'comma' ,
17- }
18- LINETERMINATORS = {
19- '\r \n ' : '\\ r\\ n' ,
20- '\r ' : '\\ r' ,
21- '\n ' : '\\ n' ,
22- }
23-
249
2510class CSVReader (Reader ):
2611 identifier = 'csv_reader'
2712 priority = 100
2813
14+ empty_values = ['' , 'n.a.' ]
15+ table_min_rows = 20
16+ delimiters = {
17+ '\t ' : 'tab' ,
18+ ' ' : 'space' ,
19+ ';' : 'semicolon' ,
20+ ',' : 'comma' ,
21+ }
22+ lineterminators = {
23+ '\r \n ' : '\\ r\\ n' ,
24+ '\r ' : '\\ r' ,
25+ '\n ' : '\\ n' ,
26+ }
27+
2928 def check (self ):
3029 logger .debug ('file_name=%s content_type=%s mime_type=%s encoding=%s' ,
3130 self .file_name , self .content_type , self .mime_type , self .encoding )
@@ -35,7 +34,7 @@ def check(self):
3534 file_string = self .file_content .decode (self .encoding )
3635
3736 # check different delimiters one by one
38- for delimiter in DELIMITERS .keys ():
37+ for delimiter in self . delimiters .keys ():
3938 try :
4039 self .dialect = csv .Sniffer ().sniff (file_string , delimiters = delimiter )
4140 result = True
@@ -44,9 +43,8 @@ def check(self):
4443 pass
4544
4645 if result :
47- io_string = io .StringIO (file_string )
48- self .lines = list (copy .copy (io_string ))
49- self .rows = list (csv .reader (io_string , self .dialect ))
46+ self .rows = list (csv .reader (io .StringIO (file_string ), self .dialect ))
47+ self .lines = file_string .splitlines ()
5048
5149 logger .debug ('result=%s' , result )
5250 return result
@@ -69,7 +67,7 @@ def get_tables(self):
6967 # loop over blocks and sort into header, table, and metadata
7068 prev_block = None
7169 for block in blocks :
72- if len (block ['indexes' ]) < TABLE_MIN_ROWS or not block ['shape' ]:
70+ if len (block ['indexes' ]) < self . table_min_rows or not block ['shape' ]:
7371 # this is the header
7472 if table ['rows' ]:
7573 # if a table is already there, this must be a new header
@@ -92,7 +90,7 @@ def get_tables(self):
9290 # remove the colum line from the header
9391 table ['header' ] = table ['header' ][:- 1 ]
9492
95- table ['rows' ] += [self .rows [index ] for index in block ['indexes' ]]
93+ table ['rows' ] += [[ self .get_value ( value ) for value in self . rows [index ] ] for index in block ['indexes' ]]
9694
9795 prev_block = block
9896
@@ -114,25 +112,25 @@ def get_tables(self):
114112
115113 def get_metadata (self ):
116114 metadata = super ().get_metadata ()
117- metadata ['lineterminator' ] = LINETERMINATORS .get (self .dialect .lineterminator , self .dialect .lineterminator )
115+ metadata ['lineterminator' ] = self . lineterminators .get (self .dialect .lineterminator , self .dialect .lineterminator )
118116 metadata ['quoting' ] = self .dialect .quoting
119117 metadata ['doublequote' ] = self .dialect .doublequote
120- metadata ['delimiter' ] = DELIMITERS .get (self .dialect .delimiter , self .dialect .delimiter )
118+ metadata ['delimiter' ] = self . delimiters .get (self .dialect .delimiter , self .dialect .delimiter )
121119 metadata ['quotechar' ] = self .dialect .quotechar
122120 metadata ['skipinitialspace' ] = self .dialect .skipinitialspace
123121 return metadata
124122
125123 def get_shape (self , row ):
126124 shape = []
127125 for cell in row :
128- if cell .strip () == '' :
126+ value = cell .strip ()
127+ if value in self .empty_values :
129128 shape .append ('' )
129+ elif self .float_pattern .match (value ):
130+ shape .append ('f' )
130131 else :
131- try :
132- float (cell .replace (',' , '.' ))
133- shape .append ('f' )
134- except ValueError :
135- shape .append ('s' )
132+ shape .append ('s' )
133+
136134 return shape
137135
138136 def compare_shape (self , shape_a , shape_b ):
0 commit comments