|
8 | 8 | import re |
9 | 9 | import warnings |
10 | 10 | import xml.etree.ElementTree |
11 | | -from typing import Optional, Union |
| 11 | +from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union |
12 | 12 |
|
13 | 13 | import numpy |
14 | 14 | import pandas |
|
20 | 20 | BLDParser, |
21 | 21 | FilterTimeSeries, |
22 | 22 | FluidicsSource, |
| 23 | + IncompatibleFileError, |
23 | 24 | InvalidLotNumberError, |
24 | 25 | ) |
25 | 26 |
|
@@ -128,26 +129,29 @@ def _parse_datalines(datalines) -> pandas.DataFrame: |
128 | 129 | return dfraw |
129 | 130 |
|
130 | 131 |
|
131 | | -def parse_metadata_data(fp): |
| 132 | +def parse_metadata_data(fp) -> Tuple[Dict[str, Any], pandas.DataFrame]: |
132 | 133 | with open(fp, "r", encoding="utf-8") as f: |
133 | 134 | lines = f.readlines() |
134 | 135 |
|
135 | | - metadata = collections.defaultdict(dict) |
136 | | - datalines = collections.defaultdict(list) |
| 136 | + metadata: DefaultDict[str, Any] = collections.defaultdict(dict) |
137 | 137 | section = None |
138 | | - data_start = None |
| 138 | + data_start: Optional[int] = None |
139 | 139 |
|
140 | 140 | for l, line in enumerate(lines): |
141 | 141 | if line.startswith("="): |
142 | 142 | # any section header encountered |
143 | 143 | section = line.strip().strip("=").strip() |
144 | 144 | if not data_start and section == "data": |
145 | 145 | data_start = l + 1 |
| 146 | + elif section is None: |
| 147 | + raise IncompatibleFileError("No metadata section header before first setting.") |
146 | 148 | elif line.startswith("["): |
147 | 149 | # register the value |
148 | 150 | key, value = line.split("]") |
149 | 151 | key = key.strip("[") |
150 | 152 | metadata[section][key] = value.strip() |
| 153 | + if data_start is None: |
| 154 | + raise IncompatibleFileError("Section header 'data' not found.") |
151 | 155 |
|
152 | 156 | # standardize the metadata keys |
153 | 157 | metadata["date_start"] = datetime.datetime.strptime( |
@@ -195,7 +199,7 @@ def parse_metadata_data(fp): |
195 | 199 | f"{fp} contains defects in lines {defect_lines}. Be extra skeptical about the parsed results." |
196 | 200 | ) |
197 | 201 |
|
198 | | - return metadata, dfraw[list(dfraw.columns)[:-1]] |
| 202 | + return dict(metadata), dfraw[list(dfraw.columns)[:-1]] |
199 | 203 |
|
200 | 204 |
|
201 | 205 | def standardize(df): |
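
With these changes, `parse_metadata_data` fails fast with `IncompatibleFileError` on files that lack the expected section structure, instead of crashing later on a half-filled `metadata` dict. A minimal sketch of the new behavior, assuming `parse_metadata_data` and `IncompatibleFileError` are importable from the parser module (the file contents are made up):

```python
import tempfile

# Hypothetical metadata-only file: one section header, one setting, no "data" section.
with tempfile.NamedTemporaryFile(
    "w", suffix=".csv", delete=False, encoding="utf-8"
) as f:
    f.write("=====general=====\n[device]BioLector\n")
    fp = f.name

try:
    metadata, dfraw = parse_metadata_data(fp)
except IncompatibleFileError as ex:
    print(ex)  # Section header 'data' not found.
```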
@@ -258,6 +262,7 @@ def extract_filtersets(metadata): |
258 | 262 |
|
259 | 263 | def extract_comments(dfraw): |
260 | 264 | ocol_ncol_type = [ |
| 265 | + ("Cycle", "cycle", int), |
261 | 266 | ("Time", "time", float), |
262 | 267 | ("User_Comment", "user_comment", str), |
263 | 268 | ("Sys_Comment", "sys_comment", str), |
@@ -315,6 +320,22 @@ def extract_measurements(dfraw): |
315 | 320 | ) |
316 | 321 | df_M = df_M[~mask] |
317 | 322 |
|
| 323 | + # Drop filtersets with non-monotonically increasing time |
| 324 | + drop_idxs = [] |
| 325 | + for idx, fsblock in df_M.groupby(["Cycle", "Filterset"]): |
| 326 | + t = fsblock["Time"].astype(int).to_numpy() |
| 327 | + if any(t[1:] < t[:-1]): |
| 328 | + drop_idxs.append(idx) |
| 329 | + ndrop = len(drop_idxs) |
| 330 | + if ndrop: |
| 331 | + for dropC, dropF in drop_idxs: |
| 332 | + mask = numpy.logical_and(df_M["Cycle"] == dropC, df_M["Filterset"] == dropF) |
| 333 | + df_M = df_M[~mask] |
| 334 | + warnings.warn( |
| 335 | + f"Dropped cycle {dropC} filterset {dropF} because of non-monotonically increasing time values.", |
| 336 | + UserWarning, |
| 337 | + ) |
| 338 | + |
318 | 339 | # Convert to the expected data types |
319 | 340 | df = utils.__to_typed_cols__(df_M, ocol_ncol_type) |
320 | 341 | df = df.set_index(["filterset", "cycle", "well"]) |
|
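The drop criterion compares each cycle/filterset block's time vector against its shifted self, so a single backwards step marks the whole block for removal. A toy demonstration with made-up values:

```python
import numpy

t = numpy.array([3, 7, 5, 9])     # time steps backwards at index 2
print(numpy.any(t[1:] < t[:-1]))  # True -> this cycle/filterset block is dropped
```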