
Commit 362b232

Rework read_tfs parsing with pandas C engine for speedup (#85)
1 parent 1be79fe commit 362b232

File tree: .zenodo.json, CHANGELOG.md, tfs/__init__.py, tfs/handler.py

4 files changed: +82 -32 lines changed

.zenodo.json

Lines changed: 5 additions & 0 deletions
@@ -29,6 +29,11 @@
       "affiliation": "CERN",
       "orcid": "0000-0002-8146-2340"
     },
+    {
+      "name": "Jaime Maria Coello De Portugal - Martinez Vazquez",
+      "affiliation": "CERN",
+      "orcid": "0000-0002-6899-3809"
+    },
     {
       "name": "Rogelio Tomas Garcia",
       "affiliation": "CERN",

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -1,5 +1,10 @@
 # TFS-Pandas Changelog
 
+## Version 2.1.0
+
+- Changes:
+  - The parsing in `read_tfs` has been reworked to make use of `pandas`'s C engine, resulting in drastic performance improvements when loading files. No functionality was lost or changed.
+
 ## Version 2.0.3
 
 - Fixed:
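As the changelog entry notes, the public API is unchanged by this rework; existing calls to `read_tfs` keep working, only the loading gets faster. A minimal usage sketch (the file name `twiss.tfs` and the `NAME` index column are illustrative, not part of this commit):

    import tfs

    # Load a TFS file into a TfsDataFrame; the header lines end up in the `headers` attribute
    data_frame = tfs.read_tfs("twiss.tfs", index="NAME")

    print(data_frame.headers)  # parsed header lines, as an OrderedDict
    print(data_frame.dtypes)   # column types as declared in the file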

tfs/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 __title__ = "tfs-pandas"
 __description__ = "Read and write tfs files."
 __url__ = "https://github.com/pylhc/tfs"
-__version__ = "2.0.3"
+__version__ = "2.1.0"
 __author__ = "pylhc"
 __author_email__ = "[email protected]"
 __license__ = "MIT"

tfs/handler.py

Lines changed: 71 additions & 31 deletions
@@ -93,14 +93,19 @@ def _str_items(items_list):
 
 
 def read_tfs(
-    tfs_file_path: Union[pathlib.Path, str],
-    index: str = None,
-    non_unique_behavior: str = "warn",
+    tfs_file_path: Union[pathlib.Path, str], index: str = None, non_unique_behavior: str = "warn"
 ) -> TfsDataFrame:
     """
     Parses the TFS table present in **tfs_file_path** and returns a customized version of a Pandas
     DataFrame (a TfsDataFrame).
 
+    Methodology: This function parses the first lines of the file until it reaches the `types` line.
+    While doing so, it gathers the relevant information (headers content, column names & types,
+    number of lines parsed). After reaching the types line, the rest of the file is handed over to
+    ``pandas.read_csv`` with the right options to make use of its C engine's speed. Afterwards, the
+    result is converted to a ``TfsDataFrame``, proper types are applied to the columns, the index is
+    set and the frame is validated before being returned.
+
     Args:
         tfs_file_path (Union[pathlib.Path, str]): PosixPath object to the output TFS file. Can be
             a string, in which case it will be cast to a PosixPath object.
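The methodology described in the new docstring assumes the usual TFS layout: header lines introduced by the `HEADER` marker, a names line (`NAMES` marker), a types line (`TYPES` marker), then whitespace-separated data rows. A minimal sketch of such a file (contents made up; in standard MAD-X style TFS the markers are `@`, `*` and `$`):

    @ TITLE  %s  "Example table"
    @ ORIGIN %s  "MAD-X"
    * NAME       S      BETX
    $ %s         %le    %le
      "BPM.1"    0.0    12.34
      "BPM 2"    1.5    56.78

Only the lines up to and including the types line are parsed line by line; everything after is handed to `pandas.read_csv`, as the next hunk shows.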
@@ -109,55 +114,69 @@ def read_tfs(
         non_unique_behavior (str): behavior to adopt if non-unique indices or columns are found in the
             dataframe. Accepts **warn** and **raise** as values, case-insensitively, which dictates
             to respectively issue a warning or raise an error if non-unique elements are found.
+
     Returns:
-        A TfsDataFrame object.
+        A TfsDataFrame object with the loaded data from the file.
     """
     tfs_file_path = pathlib.Path(tfs_file_path)
     headers = OrderedDict()
-    rows_list = []
+    non_data_lines: int = 0
     column_names = column_types = None
 
     LOGGER.debug(f"Reading path: {tfs_file_path.absolute()}")
     with tfs_file_path.open("r") as tfs_data:
         for line in tfs_data:
+            non_data_lines += 1
             line_components = shlex.split(line)
             if not line_components:
                 continue
             if line_components[0] == HEADER:
                 name, value = _parse_header(line_components[1:])
                 headers[name] = value
             elif line_components[0] == NAMES:
-                LOGGER.debug("Setting column names.")
+                LOGGER.debug("Parsing column names.")
                 column_names = np.array(line_components[1:])
             elif line_components[0] == TYPES:
-                LOGGER.debug("Setting column types.")
+                LOGGER.debug("Parsing column types.")
                 column_types = _compute_types(line_components[1:])
             elif line_components[0] == COMMENTS:
                 continue
-            else:
-                if column_names is None:
-                    LOGGER.error(f"No column names in file {tfs_file_path.absolute()}, aborting")
-                    raise TfsFormatError("Column names have not been set.")
-                if column_types is None:
-                    LOGGER.error(f"No column types in file {tfs_file_path.absolute()}, aborting")
-                    raise TfsFormatError("Column types have not been set.")
-                line_components = [part.strip('"') for part in line_components]
-                rows_list.append(line_components)
-    data_frame = _create_data_frame(column_names, column_types, rows_list, headers)
-
-    if index:  # Use given column as index
-        data_frame = data_frame.set_index(index)
-    else:  # Try to find Index automatically
-        index_column = [colname for colname in data_frame.columns if colname.startswith(INDEX_ID)]
-        if index_column:
-            data_frame = data_frame.set_index(index_column)
-            index_name = index_column[0].replace(INDEX_ID, "")
-            if index_name == "":
-                index_name = None  # to remove it completely (Pandas makes a difference)
-            data_frame = data_frame.rename_axis(index_name)
-
-    _validate(data_frame, f"from file {tfs_file_path.absolute()}", non_unique_behavior)
-    return data_frame
+            else:  # After all the previous cases only data lines should be left; if not, the file is malformed
+                break  # Break instead of going over all lines, saves a lot of time on big files
+
+    if column_names is None:
+        LOGGER.error(f"No column names in file {tfs_file_path.absolute()}, aborting")
+        raise TfsFormatError("Column names have not been set.")
+    if column_types is None:
+        LOGGER.error(f"No column types in file {tfs_file_path.absolute()}, aborting")
+        raise TfsFormatError("Column types have not been set.")
+
+    LOGGER.debug("Parsing data part of the file")
+    # DO NOT use comment=COMMENTS here: if the symbol appears inside an element for some
+    # reason, the entire parsing will crash
+    data_frame = pd.read_csv(
+        tfs_file_path,
+        engine="c",  # faster, and we do not need the features of the python engine
+        skiprows=non_data_lines - 1,  # the counter was already incremented for the first data line in the loop above
+        delim_whitespace=True,  # understands ' ' is our delimiter
+        skipinitialspace=True,  # understands ' ' and '  ' are both valid delimiters
+        quotechar='"',  # elements surrounded by " are one entry -> correct parsing of strings with spaces
+        names=column_names,  # column names we have determined, avoids using the first read row for columns
+    )
+
+    LOGGER.debug("Converting to TfsDataFrame")
+    tfs_data_frame = TfsDataFrame(data_frame, headers=headers)
+    _assign_column_types(tfs_data_frame, column_names, column_types)  # ensure proper types
+
+    if index:
+        LOGGER.debug(f"Setting '{index}' column as index")
+        tfs_data_frame = tfs_data_frame.set_index(index)
+    else:
+        LOGGER.debug("Attempting to find index identifier in columns")
+        tfs_data_frame = _find_and_set_index(tfs_data_frame)
+
+    _validate(tfs_data_frame, f"from file {tfs_file_path.absolute()}", non_unique_behavior)
+    return tfs_data_frame
 
 
 def write_tfs(
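To see how the `pandas.read_csv` options above behave on their own, here is a standalone sketch applied to an in-memory data portion (column names and values are made up):

    from io import StringIO

    import pandas as pd

    data_portion = '"BPM 1" 0.0 12.34\n"BPM.2" 1.5 56.78\n'
    data_frame = pd.read_csv(
        StringIO(data_portion),
        engine="c",               # same fast engine as used in read_tfs
        delim_whitespace=True,    # whitespace-separated columns
        skipinitialspace=True,    # runs of spaces between fields are fine
        quotechar='"',            # "BPM 1" stays a single field despite the space
        names=["NAME", "S", "BETX"],
    )
    print(data_frame)  # NAME holds strings, S and BETX are parsed as floats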
@@ -334,6 +353,27 @@ def quote_strings(s):
     return data_frame
 
 
+def _find_and_set_index(data_frame: TfsDataFrame) -> TfsDataFrame:
+    """
+    Looks for a column with a name starting with the index identifier, and sets it as index if found.
+    The index identifier will be stripped from the column name first.
+
+    Args:
+        data_frame (TfsDataFrame): the TfsDataFrame to look for an index in.
+
+    Returns:
+        The TfsDataFrame after the operation, whether an index was found or not.
+    """
+    index_column = [colname for colname in data_frame.columns if colname.startswith(INDEX_ID)]
+    if index_column:
+        data_frame = data_frame.set_index(index_column)
+        index_name = index_column[0].replace(INDEX_ID, "")
+        if index_name == "":
+            index_name = None  # to remove it completely (Pandas makes a difference)
+        data_frame = data_frame.rename_axis(index=index_name)
+    return data_frame
+
+
 def _create_data_frame(column_names, column_types, rows_list, headers) -> TfsDataFrame:
     data = np.array(rows_list) if rows_list else None  # case of empty dataframe
     tfs_data_frame = TfsDataFrame(data=data, columns=column_names, headers=headers)