Commit bad4b7e

Merge pull request #47 from JuBiotech/drop-nonmonotonic-time
Drop filtersets with non-monotonic time
2 parents: 2006f0a + 167a139

5 files changed: +2803 additions, -10 deletions

bletl/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -20,4 +20,4 @@
     NoMeasurementData,
 )
 
-__version__ = "1.4.0"
+__version__ = "1.4.1"

bletl/parsing/bl1.py

Lines changed: 12 additions & 3 deletions

@@ -196,8 +196,8 @@ def parse(
         metadata = extract_metadata(headerlines)
         process_parameters = extract_process_parameters(headerlines)
         filtersets = extract_filtersets(headerlines)
-        comments = extract_comments(data)
         references = extract_references(data)
+        comments = extract_comments(data, references)
         measurements = extract_measurements(data)
 
         data = BLData(

@@ -377,16 +377,25 @@ def extract_process_parameters(headerlines):
     return process_parameters
 
 
-def extract_comments(dfraw):
+def extract_comments(dfraw: pandas.DataFrame, references: pandas.DataFrame) -> pandas.DataFrame:
+    """This adds cycle numbers using timestamps from references."""
     ocol_ncol_type = [
         ("TIME [h]", "time", float),
         ("COMMENTS", "user_comment", str),
     ]
     df = utils.__to_typed_cols__(dfraw[dfraw["READING"] == "K"], ocol_ncol_type)
+
+    # Get the times when each cycle started
+    start_times = references.reset_index().drop_duplicates("cycle", keep="first").set_index("cycle").time
+    start_times.loc[1] = 0
+    # Add cycle numbers based on cycle start times and comment timestamps
+    df["cycle"] = [start_times[t > start_times].index[-1] for t in df["time"]]
+
     # TODO: automatically separate comments into user/sys
     df["sys_comment"] = None
    df.index = range(len(df))
-    return df
+    # Change column order
+    return df[["cycle", "time", "user_comment", "sys_comment"]]
 
 
 def extract_references(dfraw):
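
The new cycle lookup in extract_comments maps each comment timestamp to the last cycle that started before it. A minimal sketch of that indexing trick, using made-up start times and comment timestamps rather than values from a real BioLector file:

    import pandas

    # Hypothetical cycle start times [h], as derived from the references table
    start_times = pandas.Series({1: 0.0, 2: 0.25, 3: 0.50}, name="time")
    start_times.index.name = "cycle"

    comment_times = [0.10, 0.30, 0.55]

    # For each comment, keep the cycles that started before it and take the last one
    cycles = [start_times[t > start_times].index[-1] for t in comment_times]
    print(cycles)  # [1, 2, 3]

Pinning start_times.loc[1] = 0 in the patch guarantees that every comment with a positive timestamp matches at least cycle 1, so the index[-1] lookup cannot hit an empty selection.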

bletl/parsing/blpro.py

Lines changed: 27 additions & 6 deletions

@@ -8,7 +8,7 @@
 import re
 import warnings
 import xml.etree.ElementTree
-from typing import Optional, Union
+from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union
 
 import numpy
 import pandas

@@ -20,6 +20,7 @@
     BLDParser,
     FilterTimeSeries,
     FluidicsSource,
+    IncompatibleFileError,
     InvalidLotNumberError,
 )
 
@@ -128,26 +129,29 @@ def _parse_datalines(datalines) -> pandas.DataFrame:
     return dfraw
 
 
-def parse_metadata_data(fp):
+def parse_metadata_data(fp) -> Tuple[Dict[str, Any], pandas.DataFrame]:
     with open(fp, "r", encoding="utf-8") as f:
         lines = f.readlines()
 
-    metadata = collections.defaultdict(dict)
-    datalines = collections.defaultdict(list)
+    metadata: DefaultDict[str, Any] = collections.defaultdict(dict)
     section = None
-    data_start = None
+    data_start: Optional[int] = None
 
     for l, line in enumerate(lines):
         if line.startswith("="):
             # any section header encountered
             section = line.strip().strip("=").strip()
             if not data_start and section == "data":
                 data_start = l + 1
+        elif section is None:
+            raise IncompatibleFileError("No metadata section header before first setting.")
         elif line.startswith("["):
             # register the value
             key, value = line.split("]")
             key = key.strip("[")
             metadata[section][key] = value.strip()
+    if data_start is None:
+        raise IncompatibleFileError("Section header 'data' not found.")
 
     # standardize the metadata keys
     metadata["date_start"] = datetime.datetime.strptime(
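
The two new IncompatibleFileError guards reject inputs that do not look like BioLector Pro result files: a key/value line appearing before any section header, or a file with no data section at all. A stripped-down rehearsal of just the guard logic (the "[cycletime]" setting and "process" section name are hypothetical, and the value-registration branch is omitted since only the guards matter here):

    class IncompatibleFileError(Exception):
        """Stands in for bletl's error type in this sketch."""

    def check_sections(lines):
        section = None
        data_start = None
        for l, line in enumerate(lines):
            if line.startswith("="):
                # any section header encountered, e.g. "=====data====="
                section = line.strip().strip("=").strip()
                if not data_start and section == "data":
                    data_start = l + 1
            elif section is None:
                raise IncompatibleFileError("No metadata section header before first setting.")
        if data_start is None:
            raise IncompatibleFileError("Section header 'data' not found.")
        return data_start

    print(check_sections(["=====process=====\n", "[cycletime] 600\n", "=====data=====\n"]))  # 3
    check_sections(["[cycletime] 600\n"])  # raises: setting before any section header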
@@ -195,7 +199,7 @@ def parse_metadata_data(fp):
         f"{fp} contains defects in lines {defect_lines}. Be extra skeptical about the parsed results."
     )
 
-    return metadata, dfraw[list(dfraw.columns)[:-1]]
+    return dict(metadata), dfraw[list(dfraw.columns)[:-1]]
 
 
 def standardize(df):
@@ -258,6 +262,7 @@ def extract_filtersets(metadata):
 
 def extract_comments(dfraw):
     ocol_ncol_type = [
+        ("Cycle", "cycle", int),
         ("Time", "time", float),
         ("User_Comment", "user_comment", str),
         ("Sys_Comment", "sys_comment", str),
@@ -315,6 +320,22 @@ def extract_measurements(dfraw):
     )
     df_M = df_M[~mask]
 
+    # Drop filtersets with non-monotonically increasing time
+    drop_idxs = []
+    for idx, fsblock in df_M.groupby(["Cycle", "Filterset"]):
+        t = fsblock["Time"].astype(int).to_numpy()
+        if any(t[1:] < t[:-1]):
+            drop_idxs.append(idx)
+    ndrop = len(drop_idxs)
+    if ndrop:
+        for dropC, dropF in drop_idxs:
+            mask = numpy.logical_and(df_M["Cycle"] == dropC, df_M["Filterset"] == dropF)
+            df_M = df_M[~mask]
+            warnings.warn(
+                f"Dropped cycle {dropC} filterset {dropF} because of non-monotonically increasing time values.",
+                UserWarning,
+            )
+
     # Convert to the expected data types
     df = utils.__to_typed_cols__(df_M, ocol_ncol_type)
     df = df.set_index(["filterset", "cycle", "well"])

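The monotonicity check inspects the integer timestamps inside every (Cycle, Filterset) block and flags a block as soon as any timestamp is smaller than its predecessor. A self-contained sketch on a toy frame (the column names follow the raw BioLector Pro table; all values, including the Amp_1 readout column, are made up):

    import numpy
    import pandas

    # Made-up raw measurements; the second filterset of cycle 1 has a time glitch.
    df_M = pandas.DataFrame({
        "Cycle":     [1, 1, 1, 1],
        "Filterset": [1, 1, 2, 2],
        "Time":      [100, 101, 205, 203],  # 205 -> 203 runs backwards
        "Amp_1":     [0.5, 0.6, 1.2, 1.1],
    })

    drop_idxs = []
    for idx, fsblock in df_M.groupby(["Cycle", "Filterset"]):
        t = fsblock["Time"].astype(int).to_numpy()
        # Flag the block if any timestamp is smaller than the one before it
        if any(t[1:] < t[:-1]):
            drop_idxs.append(idx)

    # Remove every flagged (cycle, filterset) block, as the patch does
    for dropC, dropF in drop_idxs:
        mask = numpy.logical_and(df_M["Cycle"] == dropC, df_M["Filterset"] == dropF)
        df_M = df_M[~mask]

    print(drop_idxs)   # [(1, 2)]
    print(len(df_M))   # 2 rows remain

Dropping the whole block rather than single samples mirrors the patch's behavior: a filterset with any backwards time step is treated as unreliable and removed for that cycle, with a UserWarning naming the affected cycle and filterset.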