Skip to content

Commit 8d437e3

Browse files
authored
Add JascoReader and refactor readers (get_value) for different number locales (#19)
* Cleanup setup.py * Refactor CSVReader to handle "one-line" files * Refactor readers (get_value) for different number locales * Add jasco reader and revert csv reader * Fix get_value
1 parent 785bdb1 commit 8d437e3

8 files changed

Lines changed: 122 additions & 51 deletions

File tree

converter_app/readers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from .brml import BrmlReader
88
from .dta import DtaReader
99
from .pssession import PsSessionReader
10+
from .jasco import JascoReader
1011

1112
logger = logging.getLogger(__name__)
1213

@@ -47,3 +48,4 @@ def match_reader(self, file, file_name, content_type):
4748
registry.register(BrmlReader)
4849
registry.register(DtaReader)
4950
registry.register(PsSessionReader)
51+
registry.register(JascoReader)

converter_app/readers/ascii.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,14 @@
55

66
logger = logging.getLogger(__name__)
77

8-
PATTERNS = {
9-
'text': re.compile(r'[A-Za-z]{2,}'), # two or more chars in row
10-
'floats': re.compile(r'(-?\d+[,.]*\d*[eE+\-\d]*)\S*') # e.g. 1.00001E-10
11-
}
12-
138

149
class AsciiReader(Reader):
1510
identifier = 'ascii_reader'
1611
priority = 1000
1712

13+
# two or more chars in row
14+
text_pattern = re.compile(r'[A-Za-z]{2,}')
15+
1816
def check(self):
1917
logger.debug('file_name=%s content_type=%s mime_type=%s encoding=%s',
2018
self.file_name, self.content_type, self.mime_type, self.encoding)
@@ -38,7 +36,7 @@ def get_tables(self):
3836
count = None
3937

4038
# try to match text for the header
41-
text_match = PATTERNS['text'].search(row)
39+
text_match = self.text_pattern.search(row)
4240
if text_match:
4341
if table['rows']:
4442
# if a table is already there, this must be a new header
@@ -48,11 +46,10 @@ def get_tables(self):
4846
table['header'].append(row)
4947
else:
5048
# try to match columns of floats
51-
row = row.replace('n.a.','0')
52-
float_match = PATTERNS['floats'].findall(row)
49+
row = row.replace('n.a.', '')
50+
float_match = self.float_pattern.findall(row)
5351
if float_match:
54-
# replace , by . in floats
55-
float_match = [float_str.replace(',', '.') for float_str in float_match]
52+
float_match = [self.get_value(float_str) for float_str in float_match]
5653
count = len(float_match)
5754

5855
if table['rows'] and count != previous_count:

converter_app/readers/base.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import re
23
from pathlib import Path
34

45
import magic
@@ -8,6 +9,10 @@
89

910
class Reader(object):
1011

12+
float_pattern = re.compile(r'(-?\d+[,.]*\d*[eE+\-\d]*)\S*')
13+
float_de_pattern = re.compile(r'(-?[\d.]+,\d*[eE+\-\d]*)')
14+
float_us_pattern = re.compile(r'(-?[\d,]+.\d*[eE+\-\d]*)')
15+
1116
def __init__(self, file, file_name, content_type):
1217
self.file = file
1318
self.file_name = file_name
@@ -52,8 +57,25 @@ def append_table(self, tables):
5257
tables.append(table)
5358
return table
5459

60+
def get_shape(self, row):
61+
shape = []
62+
for cell in row:
63+
value = cell.strip()
64+
if value in self.empty_values:
65+
shape.append('')
66+
elif self.float_pattern.match(value):
67+
shape.append('f')
68+
else:
69+
shape.append('s')
70+
71+
return shape
72+
5573
def get_value(self, value):
56-
try:
57-
return float(value.replace(',', '.'))
58-
except ValueError:
74+
if self.float_de_pattern.match(value):
75+
# remove any digit group separators and replace the comma with a period
76+
return value.replace('.', '').replace(',', '.')
77+
if self.float_us_pattern.match(value):
78+
# just remove the digit group separators
79+
return value.replace(',', '')
80+
else:
5981
return value

converter_app/readers/csv.py

Lines changed: 27 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import copy
21
import csv
32
import io
43
import logging
@@ -7,25 +6,25 @@
76

87
logger = logging.getLogger(__name__)
98

10-
TABLE_MIN_ROWS = 20
11-
12-
DELIMITERS = {
13-
'\t': 'tab',
14-
' ': 'space',
15-
';': 'semicolon',
16-
',': 'comma',
17-
}
18-
LINETERMINATORS = {
19-
'\r\n': '\\r\\n',
20-
'\r': '\\r',
21-
'\n': '\\n',
22-
}
23-
249

2510
class CSVReader(Reader):
2611
identifier = 'csv_reader'
2712
priority = 100
2813

14+
empty_values = ['', 'n.a.']
15+
table_min_rows = 20
16+
delimiters = {
17+
'\t': 'tab',
18+
' ': 'space',
19+
';': 'semicolon',
20+
',': 'comma',
21+
}
22+
lineterminators = {
23+
'\r\n': '\\r\\n',
24+
'\r': '\\r',
25+
'\n': '\\n',
26+
}
27+
2928
def check(self):
3029
logger.debug('file_name=%s content_type=%s mime_type=%s encoding=%s',
3130
self.file_name, self.content_type, self.mime_type, self.encoding)
@@ -35,7 +34,7 @@ def check(self):
3534
file_string = self.file_content.decode(self.encoding)
3635

3736
# check different delimiters one by one
38-
for delimiter in DELIMITERS.keys():
37+
for delimiter in self.delimiters.keys():
3938
try:
4039
self.dialect = csv.Sniffer().sniff(file_string, delimiters=delimiter)
4140
result = True
@@ -44,9 +43,8 @@ def check(self):
4443
pass
4544

4645
if result:
47-
io_string = io.StringIO(file_string)
48-
self.lines = list(copy.copy(io_string))
49-
self.rows = list(csv.reader(io_string, self.dialect))
46+
self.rows = list(csv.reader(io.StringIO(file_string), self.dialect))
47+
self.lines = file_string.splitlines()
5048

5149
logger.debug('result=%s', result)
5250
return result
@@ -69,7 +67,7 @@ def get_tables(self):
6967
# loop over blocks and sort into header, table, and metadata
7068
prev_block = None
7169
for block in blocks:
72-
if len(block['indexes']) < TABLE_MIN_ROWS or not block['shape']:
70+
if len(block['indexes']) < self.table_min_rows or not block['shape']:
7371
# this is the header
7472
if table['rows']:
7573
# if a table is already there, this must be a new header
@@ -92,7 +90,7 @@ def get_tables(self):
9290
# remove the column line from the header
9391
table['header'] = table['header'][:-1]
9492

95-
table['rows'] += [self.rows[index] for index in block['indexes']]
93+
table['rows'] += [[self.get_value(value) for value in self.rows[index]] for index in block['indexes']]
9694

9795
prev_block = block
9896

@@ -114,25 +112,25 @@ def get_tables(self):
114112

115113
def get_metadata(self):
116114
metadata = super().get_metadata()
117-
metadata['lineterminator'] = LINETERMINATORS.get(self.dialect.lineterminator, self.dialect.lineterminator)
115+
metadata['lineterminator'] = self.lineterminators.get(self.dialect.lineterminator, self.dialect.lineterminator)
118116
metadata['quoting'] = self.dialect.quoting
119117
metadata['doublequote'] = self.dialect.doublequote
120-
metadata['delimiter'] = DELIMITERS.get(self.dialect.delimiter, self.dialect.delimiter)
118+
metadata['delimiter'] = self.delimiters.get(self.dialect.delimiter, self.dialect.delimiter)
121119
metadata['quotechar'] = self.dialect.quotechar
122120
metadata['skipinitialspace'] = self.dialect.skipinitialspace
123121
return metadata
124122

125123
def get_shape(self, row):
126124
shape = []
127125
for cell in row:
128-
if cell.strip() == '':
126+
value = cell.strip()
127+
if value in self.empty_values:
129128
shape.append('')
129+
elif self.float_pattern.match(value):
130+
shape.append('f')
130131
else:
131-
try:
132-
float(cell.replace(',', '.'))
133-
shape.append('f')
134-
except ValueError:
135-
shape.append('s')
132+
shape.append('s')
133+
136134
return shape
137135

138136
def compare_shape(self, shape_a, shape_b):

converter_app/readers/excel.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,9 @@ def get_shape(self, row):
8686
if cell is None:
8787
shape.append(None)
8888
else:
89-
try:
90-
float(cell)
89+
90+
if isinstance(cell, (int, float)):
9191
shape.append('f')
92-
except (ValueError, TypeError):
92+
else:
9393
shape.append('s')
9494
return shape

converter_app/readers/jasco.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import io
2+
import logging
3+
import os
4+
5+
from .base import Reader
6+
7+
logger = logging.getLogger(__name__)
8+
9+
10+
class JascoReader(Reader):
11+
identifier = 'jasco_reader'
12+
priority = 99
13+
header_length = 8
14+
15+
def check(self):
16+
logger.debug('file_name=%s content_type=%s mime_type=%s encoding=%s',
17+
self.file_name, self.content_type, self.mime_type, self.encoding)
18+
19+
result = False
20+
if self.encoding != 'binary':
21+
file_string = self.file_content.decode(self.encoding)
22+
if len(file_string.splitlines()) == 1:
23+
file_lines = file_string.split(',')
24+
if file_lines[self.header_length - 1] == str(len(file_lines) - self.header_length):
25+
result = True
26+
if result:
27+
self.lines = file_lines
28+
29+
logger.debug('result=%s', result)
30+
return result
31+
32+
def get_tables(self):
33+
tables = []
34+
table = self.append_table(tables)
35+
36+
for i, line in enumerate(self.lines):
37+
if i < self.header_length:
38+
table['header'].append(line)
39+
else:
40+
x, y = line.split()
41+
table['rows'].append((self.get_value(x), self.get_value(y)))
42+
43+
# build columns
44+
for table in tables:
45+
table['columns'] = []
46+
if table['rows']:
47+
for idx in range(len(table['rows'][0])):
48+
table['columns'].append({
49+
'key': str(idx),
50+
'name': 'Column #{}'.format(idx)
51+
})
52+
53+
table['metadata']['rows'] = len(table['rows'])
54+
table['metadata']['columns'] = len(table['columns'])
55+
56+
return tables

converter_app/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def check_uuid(string):
3939
except ValueError:
4040
return False
4141

42+
4243
def checkpw(password, hashed_password):
4344
m = hashlib.sha1()
4445
m.update(password)

setup.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,5 @@
2929
'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)'
3030
],
3131
packages=find_packages(),
32-
include_package_data=True,
33-
entry_points={
34-
'console_scripts': [
35-
'chemotion-converter=converter_app.scripts:converter',
36-
]
37-
}
32+
include_package_data=True
3833
)

0 commit comments

Comments
 (0)