Skip to content

Commit 785bdb1

Browse files
authored
Refactor readers (#18)
* Add metadata to NTUPLES output * Refactor readers, fix csv reader
1 parent 8157bdb commit 785bdb1

8 files changed

Lines changed: 187 additions & 136 deletions

File tree

converter_app/readers/ascii.py

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def check(self):
2929

3030
def get_tables(self):
3131
tables = []
32-
self.append_table(tables)
32+
table = self.append_table(tables)
3333

3434
# loop over lines of the file
3535
previous_count = None
@@ -40,12 +40,12 @@ def get_tables(self):
4040
# try to match text for the header
4141
text_match = PATTERNS['text'].search(row)
4242
if text_match:
43-
if tables[-1]['rows']:
43+
if table['rows']:
4444
# if a table is already there, this must be a new header
45-
self.append_table(tables)
45+
table = self.append_table(tables)
4646

4747
# append header line to last table
48-
tables[-1]['header'].append(row)
48+
table['header'].append(row)
4949
else:
5050
# try to match columns of floats
5151
row = row.replace('n.a.','0')
@@ -55,27 +55,23 @@ def get_tables(self):
5555
float_match = [float_str.replace(',', '.') for float_str in float_match]
5656
count = len(float_match)
5757

58-
if tables[-1]['rows'] and count != previous_count:
58+
if table['rows'] and count != previous_count:
5959
# start a new table if the shape has changed
6060
self.append_table(tables)
6161

62-
tables[-1]['rows'].append(float_match)
62+
table['rows'].append(float_match)
6363

6464
previous_count = count
6565

66-
# loop over tables and append rows
66+
# loop over tables and append columns
6767
for table in tables:
6868
if table['rows']:
6969
table['columns'] = [{
7070
'key': str(idx),
7171
'name': 'Column #{}'.format(idx)
7272
} for idx, value in enumerate(table['rows'][0])]
7373

74-
return tables
74+
table['metadata']['rows'] = len(table['rows'])
75+
table['metadata']['columns'] = len(table['columns'])
7576

76-
def append_table(self, tables):
77-
tables.append({
78-
'header': [],
79-
'columns': [],
80-
'rows': []
81-
})
77+
return tables

converter_app/readers/base.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,19 @@ def get_metadata(self):
4141
'extension': self.extension,
4242
'reader': self.__class__.__name__
4343
}
44+
45+
def append_table(self, tables):
46+
table = {
47+
'header': [],
48+
'metadata': {},
49+
'columns': [],
50+
'rows': []
51+
}
52+
tables.append(table)
53+
return table
54+
55+
def get_value(self, value):
56+
try:
57+
return float(value.replace(',', '.'))
58+
except ValueError:
59+
return value

converter_app/readers/brml.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,7 @@ def get_tables(self):
3939
raw_data = ET.fromstring(rd.read())
4040

4141
for data_route in raw_data.findall('./DataRoutes/DataRoute'):
42-
table = {
43-
'header': [],
44-
'columns': [],
45-
'rows': []
46-
}
42+
table = self.append_table(tables)
4743

4844
first = True
4945
for datum in data_route.findall('./Datum'):
@@ -64,6 +60,7 @@ def get_tables(self):
6460

6561
table['rows'].append(row)
6662

67-
tables.append(table)
63+
table['metadata']['rows'] = len(table['rows'])
64+
table['metadata']['columns'] = len(table['columns'])
6865

6966
return tables

converter_app/readers/csv.py

Lines changed: 123 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,20 @@
77

88
logger = logging.getLogger(__name__)
99

10+
TABLE_MIN_ROWS = 20
11+
12+
DELIMITERS = {
13+
'\t': 'tab',
14+
' ': 'space',
15+
';': 'semicolon',
16+
',': 'comma',
17+
}
18+
LINETERMINATORS = {
19+
'\r\n': '\\r\\n',
20+
'\r': '\\r',
21+
'\n': '\\n',
22+
}
23+
1024

1125
class CSVReader(Reader):
1226
identifier = 'csv_reader'
@@ -16,86 +30,103 @@ def check(self):
1630
logger.debug('file_name=%s content_type=%s mime_type=%s encoding=%s',
1731
self.file_name, self.content_type, self.mime_type, self.encoding)
1832

19-
if self.encoding == 'binary':
20-
result = False
21-
else:
33+
result = False
34+
if self.encoding != 'binary':
2235
file_string = self.file_content.decode(self.encoding)
2336

24-
try:
25-
self.dialect = csv.Sniffer().sniff(file_string, delimiters=';,\t')
26-
except csv.Error:
27-
result = False
28-
else:
29-
io_string = io.StringIO(file_string)
30-
self.lines = copy.copy(io_string)
31-
self.reader = csv.reader(io_string, self.dialect)
32-
result = True
37+
# check different delimiters one by one
38+
for delimiter in DELIMITERS.keys():
39+
try:
40+
self.dialect = csv.Sniffer().sniff(file_string, delimiters=delimiter)
41+
result = True
42+
break
43+
except csv.Error:
44+
pass
45+
46+
if result:
47+
io_string = io.StringIO(file_string)
48+
self.lines = list(copy.copy(io_string))
49+
self.rows = list(csv.reader(io_string, self.dialect))
3350

3451
logger.debug('result=%s', result)
3552
return result
3653

3754
def get_tables(self):
38-
table = {
39-
'header': [],
40-
'metadata': {},
41-
'columns': [],
42-
'rows': []
43-
}
44-
table_shape = None
45-
header = False
46-
47-
# loop through the rows in reverse order from bottom to top
48-
reverse_lines = reversed(list(self.lines))
49-
reverse_rows = reversed(list(self.reader))
50-
for line, row in zip(reverse_lines, reverse_rows):
55+
tables = []
56+
table = self.append_table(tables)
57+
58+
# loop over rows and sort into blocks of similar shape
59+
blocks = []
60+
block = {}
61+
for index, row in enumerate(self.rows):
5162
shape = self.get_shape(row)
52-
if table_shape is None:
53-
# store shape of the first row from the end of the table
54-
table_shape = shape
55-
56-
if header:
57-
table['header'].append(line)
58-
elif shape != table_shape:
59-
# this is the last row of the header, check if these are column names
60-
if list(map(bool, shape)) == list(map(bool, table_shape)):
61-
# add the column names as metadata
62-
table['metadata'] = {
63-
'column_{:02d}'.format(idx): str(value) for idx, value in enumerate(row)
64-
}
65-
66-
else:
67-
# add the line as a regular header line
68-
table['header'].append(line)
69-
70-
# set the header switch to true
71-
header = True
63+
if block.get('shape') is None or not self.compare_shape(shape, block.get('shape', [])):
64+
block = {'indexes': [], 'shape': shape}
65+
blocks.append(block)
66+
67+
block['indexes'].append(index)
68+
69+
# loop over blocks and sort into header, table, and metadata
70+
prev_block = None
71+
for block in blocks:
72+
if len(block['indexes']) < TABLE_MIN_ROWS or not block['shape']:
73+
# this is the header
74+
if table['rows']:
75+
# if a table is already there, this must be a new header
76+
table = self.append_table(tables)
77+
78+
table['header'] += [self.lines[index] for index in block['indexes']]
7279
else:
73-
table['rows'].append(row)
74-
75-
table['header'] = list(reversed(table['header']))
76-
table['rows'] = list(reversed(table['rows']))
77-
table['columns'] = self.get_columns(table['metadata'].values(), len(table_shape))
78-
79-
return [table]
80-
81-
def get_columns(self, column_names, n_columns):
82-
if column_names:
83-
return [{
84-
'key': str(idx),
85-
'name': 'Column #{} ({})'.format(idx, column_name)
86-
} for idx, column_name in enumerate(column_names)]
87-
else:
88-
# add the row as columns
89-
return [{
90-
'key': str(idx),
91-
'name': 'Column #{}'.format(idx)
92-
} for idx in range(n_columns)]
80+
# this is the table
81+
if not table['rows']:
82+
# if the table has no rows yet, try to find the column names in the previous line
83+
if prev_block:
84+
this_row = self.rows[block['indexes'][0]]
85+
prev_row = self.rows[prev_block['indexes'][-1]]
86+
87+
if len(prev_row) > 0 and len(prev_row) <= len(this_row):
88+
# add the column names as metadata
89+
table['metadata'] = {
90+
'column_{:02d}'.format(idx): str(value) for idx, value in enumerate(prev_row)
91+
}
92+
# remove the column line from the header
93+
table['header'] = table['header'][:-1]
94+
95+
table['rows'] += [self.rows[index] for index in block['indexes']]
96+
97+
prev_block = block
98+
99+
# build columns
100+
for table in tables:
101+
table['columns'] = []
102+
if table['rows']:
103+
for idx in range(len(table['rows'][0])):
104+
name = table['metadata'].get('column_{:02d}'.format(idx))
105+
table['columns'].append({
106+
'key': str(idx),
107+
'name': 'Column #{} ({})'.format(idx, name) if name else 'Column #{}'.format(idx)
108+
})
109+
110+
table['metadata']['rows'] = len(table['rows'])
111+
table['metadata']['columns'] = len(table['columns'])
112+
113+
return tables
114+
115+
def get_metadata(self):
116+
metadata = super().get_metadata()
117+
metadata['lineterminator'] = LINETERMINATORS.get(self.dialect.lineterminator, self.dialect.lineterminator)
118+
metadata['quoting'] = self.dialect.quoting
119+
metadata['doublequote'] = self.dialect.doublequote
120+
metadata['delimiter'] = DELIMITERS.get(self.dialect.delimiter, self.dialect.delimiter)
121+
metadata['quotechar'] = self.dialect.quotechar
122+
metadata['skipinitialspace'] = self.dialect.skipinitialspace
123+
return metadata
93124

94125
def get_shape(self, row):
95126
shape = []
96127
for cell in row:
97128
if cell.strip() == '':
98-
shape.append('s')
129+
shape.append('')
99130
else:
100131
try:
101132
float(cell.replace(',', '.'))
@@ -104,12 +135,29 @@ def get_shape(self, row):
104135
shape.append('s')
105136
return shape
106137

107-
def get_metadata(self):
108-
metadata = super().get_metadata()
109-
metadata['lineterminator'] = self.dialect.lineterminator
110-
metadata['quoting'] = self.dialect.quoting
111-
metadata['doublequote'] = self.dialect.doublequote
112-
metadata['delimiter'] = self.dialect.delimiter
113-
metadata['quotechar'] = self.dialect.quotechar
114-
metadata['skipinitialspace'] = self.dialect.skipinitialspace
115-
return metadata
138+
def compare_shape(self, shape_a, shape_b):
139+
# this function compares two shapes, shapes are considered equal if
140+
# floats or strings are at the same positions, spaces are considered wildcards
141+
# since they could be either floats or strings
142+
if shape_a == shape_b:
143+
# both shapes are identical
144+
return True
145+
146+
if len(shape_a) != len(shape_b):
147+
# shapes have different length
148+
return False
149+
150+
if not any(shape_a):
151+
# shape_a consists only of spaces
152+
return False
153+
154+
if not any(shape_b):
155+
# shape_b consits only of spaces
156+
return False
157+
158+
for a, b in zip(shape_a, shape_b):
159+
if a and b and a != b:
160+
# both cells are non-empty and differ
161+
return False
162+
163+
return True

0 commit comments

Comments
 (0)