77
logger = logging.getLogger(__name__)

# A block of consecutive, similarly shaped rows must contain at least this
# many rows to be treated as a table; shorter blocks are treated as header.
TABLE_MIN_ROWS = 20

# Candidate CSV delimiters mapped to a human-readable name.
# The keys are tried in insertion order when sniffing the dialect, and the
# names are what gets reported in the reader's metadata.
DELIMITERS = {
    '\t': 'tab',
    ' ': 'space',
    ';': 'semicolon',
    ',': 'comma',
}

# Line terminators mapped to a printable (escaped) representation, so that
# the metadata shows e.g. the two characters "\n" instead of a raw newline.
LINETERMINATORS = {
    '\r\n': '\\r\\n',
    '\r': '\\r',
    '\n': '\\n',
}
23+
1024
1125class CSVReader (Reader ):
1226 identifier = 'csv_reader'
@@ -16,86 +30,103 @@ def check(self):
1630 logger .debug ('file_name=%s content_type=%s mime_type=%s encoding=%s' ,
1731 self .file_name , self .content_type , self .mime_type , self .encoding )
1832
19- if self .encoding == 'binary' :
20- result = False
21- else :
33+ result = False
34+ if self .encoding != 'binary' :
2235 file_string = self .file_content .decode (self .encoding )
2336
24- try :
25- self .dialect = csv .Sniffer ().sniff (file_string , delimiters = ';,\t ' )
26- except csv .Error :
27- result = False
28- else :
29- io_string = io .StringIO (file_string )
30- self .lines = copy .copy (io_string )
31- self .reader = csv .reader (io_string , self .dialect )
32- result = True
37+ # check different delimiters one by one
38+ for delimiter in DELIMITERS .keys ():
39+ try :
40+ self .dialect = csv .Sniffer ().sniff (file_string , delimiters = delimiter )
41+ result = True
42+ break
43+ except csv .Error :
44+ pass
45+
46+ if result :
47+ io_string = io .StringIO (file_string )
48+ self .lines = list (copy .copy (io_string ))
49+ self .rows = list (csv .reader (io_string , self .dialect ))
3350
3451 logger .debug ('result=%s' , result )
3552 return result
3653
def get_tables(self):
    """Split the parsed rows into tables, each with header, metadata and columns.

    Consecutive rows whose shapes are compatible (see ``compare_shape``) are
    grouped into blocks. A block with fewer than ``TABLE_MIN_ROWS`` rows, or
    with an empty shape, is treated as header text; any other block supplies
    the rows of a table. A header block that follows table rows starts a new
    table.

    Reads ``self.rows`` (parsed cells) and ``self.lines`` (raw lines), which
    are indexed in parallel.

    Returns:
        The list of table dicts. Each table ends up with the keys
        ``header``, ``rows``, ``metadata`` and ``columns``.
    """
    # NOTE(review): append_table is defined outside this block; presumably it
    # appends a fresh, empty table dict to `tables` and returns it - confirm.
    tables = []
    table = self.append_table(tables)

    # loop over rows and sort them into blocks of similar shape
    blocks = []
    block = None
    for index, row in enumerate(self.rows):
        shape = self.get_shape(row)
        if block is None or not self.compare_shape(shape, block['shape']):
            # the shape changed: start a new block with this row's shape
            block = {'indexes': [], 'shape': shape}
            blocks.append(block)

        block['indexes'].append(index)

    # loop over blocks and sort them into header, table, and metadata
    prev_block = None
    for block in blocks:
        if len(block['indexes']) < TABLE_MIN_ROWS or not block['shape']:
            # too short (or made of empty rows) to be a table: this is header
            if table['rows']:
                # a table is already filled, so this header opens a new one
                table = self.append_table(tables)

            table['header'] += [self.lines[index] for index in block['indexes']]
        else:
            # this is the table
            if not table['rows'] and prev_block:
                # first block of rows for this table: the last row of the
                # preceding block may hold the column names
                this_row = self.rows[block['indexes'][0]]
                prev_row = self.rows[prev_block['indexes'][-1]]

                if 0 < len(prev_row) <= len(this_row):
                    # add the column names as metadata
                    table['metadata'] = {
                        'column_{:02d}'.format(idx): str(value) for idx, value in enumerate(prev_row)
                    }
                    # remove the column line from the header
                    table['header'] = table['header'][:-1]

            table['rows'] += [self.rows[index] for index in block['indexes']]

        prev_block = block

    # build columns
    for table in tables:
        table['columns'] = []
        if table['rows']:
            # the first row determines the number of columns
            for idx in range(len(table['rows'][0])):
                name = table['metadata'].get('column_{:02d}'.format(idx))
                table['columns'].append({
                    'key': str(idx),
                    'name': 'Column #{} ({})'.format(idx, name) if name else 'Column #{}'.format(idx)
                })

        table['metadata']['rows'] = len(table['rows'])
        table['metadata']['columns'] = len(table['columns'])

    return tables
114+
def get_metadata(self):
    """Return the parent reader's metadata extended with the CSV dialect.

    The delimiter and line terminator are translated through ``DELIMITERS``
    and ``LINETERMINATORS`` into readable names; values without an entry in
    those tables are passed through unchanged. The remaining dialect
    attributes are copied as they are.

    Reads ``self.dialect``, which ``check`` assigns when sniffing succeeds.
    """
    dialect = self.dialect

    metadata = super().get_metadata()
    metadata['lineterminator'] = LINETERMINATORS.get(dialect.lineterminator, dialect.lineterminator)
    metadata['quoting'] = dialect.quoting
    metadata['doublequote'] = dialect.doublequote
    metadata['delimiter'] = DELIMITERS.get(dialect.delimiter, dialect.delimiter)
    metadata['quotechar'] = dialect.quotechar
    metadata['skipinitialspace'] = dialect.skipinitialspace
    return metadata
93124
94125 def get_shape (self , row ):
95126 shape = []
96127 for cell in row :
97128 if cell .strip () == '' :
98- shape .append ('s ' )
129+ shape .append ('' )
99130 else :
100131 try :
101132 float (cell .replace (',' , '.' ))
@@ -104,12 +135,29 @@ def get_shape(self, row):
104135 shape .append ('s' )
105136 return shape
106137
107- def get_metadata (self ):
108- metadata = super ().get_metadata ()
109- metadata ['lineterminator' ] = self .dialect .lineterminator
110- metadata ['quoting' ] = self .dialect .quoting
111- metadata ['doublequote' ] = self .dialect .doublequote
112- metadata ['delimiter' ] = self .dialect .delimiter
113- metadata ['quotechar' ] = self .dialect .quotechar
114- metadata ['skipinitialspace' ] = self .dialect .skipinitialspace
115- return metadata
def compare_shape(self, shape_a, shape_b):
    """Return True if two row shapes are compatible with each other.

    A shape is a list with one marker per cell, as built by ``get_shape``;
    the empty string marks a blank cell. Blank cells act as wildcards,
    since a blank could stand for either kind of value.

    Args:
        shape_a: shape of the first row.
        shape_b: shape of the second row.

    Returns:
        True when the shapes are identical, or when they have the same
        length, each contains at least one non-blank cell, and every
        position where both cells are non-blank carries the same marker.
        False otherwise.
    """
    if shape_a == shape_b:
        # both shapes are identical (this includes two all-blank shapes)
        return True

    if len(shape_a) != len(shape_b):
        # shapes have different length
        return False

    if not any(shape_a) or not any(shape_b):
        # one of the shapes consists only of blanks; such a row only
        # matches an identical shape, which was handled above
        return False

    # only positions where both cells are non-blank can disagree
    return all(a == b for a, b in zip(shape_a, shape_b) if a and b)
0 commit comments