Skip to content

Commit 2cf93e1

Browse files
authored
Linfile cleaning keep bpms and limit-based cleaning (#439)
* Implementation and test * Changelog and version bump
1 parent e42b082 commit 2cf93e1

File tree

4 files changed

+180
-13
lines changed

4 files changed

+180
-13
lines changed

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
# OMC3 Changelog
22

3+
#### 2024-03-18 - v0.14.0 - _jdilly_
4+
5+
- Added:
6+
- Linfile Updater: `keep`-flag to keep names and option to clean manually between limits.
7+
8+
#### 2024-03-08 - v0.13.1 - _jdilly_, _awegsche_, _mlegarre_, _fesoubel_
9+
10+
- Added:
11+
- Knob Extractor: Lumiscan Knob
12+
13+
- Fixed:
14+
- BBS converter: fixed closed orbit units
15+
- Optics: Pandas indexing error in DPP
16+
317
#### 2023-12-07 - v0.13.0 - _awegsche_
418

519
- Added:

omc3/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
__title__ = "omc3"
1212
__description__ = "An accelerator physics tools package for the OMC team at CERN."
1313
__url__ = "https://github.com/pylhc/omc3"
14-
__version__ = "0.13.1"
14+
__version__ = "0.14.0"
1515
__author__ = "pylhc"
1616
__author_email__ = "[email protected]"
1717
__license__ = "MIT"

omc3/scripts/linfile_clean.py

Lines changed: 71 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
Performs an automated cleaning of different columns in the lin-file
55
as a standalone script to allow for manual refinement after harpy is done.
66
7+
The type of cleaning is determined by the number of values in the ``limit``
8+
parameter. When no ``limit`` is given or a single number is given,
9+
auto-cleaning is performed:
10+
711
All data is assumed to be gaussian distributed around a "true" value,
812
and outliers are cleaned by calculating average and standard deviation
913
of the given data.
@@ -17,6 +21,9 @@
1721
cleaned. The limit is given in whatever units the data itself is in and
1822
is an absolute value.
1923
24+
If two values are given for the ``limit`` parameter, all data-points in between
25+
these limits are kept and all data-points outside of these limits are cleaned.
26+
2027
Cleaning is done per given file independently
2128
i.e. removed BPMs from one file can be present in the next.
2229
The columns are iterated on the same file, i.e. the cleaning is done
@@ -53,10 +60,15 @@
5360
Columns to clean on.
5461
5562
63+
- **keep** *(str)*:
64+
65+
Do not clean BPMs with given names.
66+
67+
5668
- **limit** *(float)*:
5769
58-
Do not clean data-points deviating less than this limit from the
59-
average.
70+
Two values: Do not clean data-points in between these values.
71+
Single value (auto-clean): Do not clean data-points deviating less than this limit from the average.
6072
6173
default: ``0.0``
6274
@@ -117,8 +129,14 @@ def get_params():
117129
),
118130
limit=dict(
119131
type=float,
120-
help="Do not clean data-points deviating less than this limit from the average.",
121-
default=0.0,
132+
nargs='+',
133+
help="Two values: Do not clean data-points in between these values. "
134+
"Single value (auto-clean): Do not clean data-points deviating less than this limit from the average.",
135+
),
136+
keep=dict(
137+
nargs='+',
138+
type=str,
139+
help="Do not clean BPMs with given names.",
122140
),
123141
backup=dict(
124142
action="store_true",
@@ -140,7 +158,7 @@ def main(opt):
140158

141159
if opt.columns is None:
142160
raise ValueError("The option 'columns' is required for cleaning.")
143-
clean_columns(opt.files, opt.columns, opt.limit, opt.backup)
161+
clean_columns(opt.files, opt.columns, opt.limit, opt.keep, opt.backup)
144162

145163

146164
# Restore ----------------------------------------------------------------------
@@ -184,15 +202,30 @@ def _restore_file(file):
184202

185203
# Clean ------------------------------------------------------------------------
186204

187-
def clean_columns(files: Sequence[Union[Path, str]], columns: Sequence[str],
188-
limit: float = 0.0, backup: bool = True):
205+
def clean_columns(files: Sequence[Union[Path, str]],
206+
columns: Sequence[str],
207+
limit: float = None, # default set in _check_limits
208+
keep: Sequence[str] = None, # default set below
209+
backup: bool = True):
189210
""" Clean the columns in the given files."""
190211
for file in files:
191212
file = Path(file)
192213
LOG.info(f"Cleaning {file.name}.")
214+
215+
# check limits
216+
limit = _check_limits(limit)
217+
218+
# read and check file
193219
df = tfs.read_tfs(file, index=COL_NAME)
220+
if keep is None:
221+
keep = ()
222+
not_found_bpms = set(keep) - set(df.index)
223+
if len(not_found_bpms):
224+
LOG.warning(f"The following BPMs to keep were not found in {file.name}:\n{not_found_bpms}")
225+
226+
# clean
194227
for column in columns:
195-
df = _filter_by_column(df, column, limit)
228+
df = _filter_by_column(df, column, limit, keep)
196229
df.headers.update(_compute_headers(df))
197230

198231
if backup:
@@ -201,13 +234,41 @@ def clean_columns(files: Sequence[Union[Path, str]], columns: Sequence[str],
201234
tfs.write_tfs(file, df, save_index=COL_NAME)
202235

203236

204-
def _filter_by_column(df: pd.DataFrame, column: str, limit: Number) -> pd.DataFrame:
237+
def _check_limits(limit: Union[Sequence[Number], Number]) -> Sequence[Number]:
238+
""" Check that one or two limits are given and convert them into a tuple if needed."""
239+
if limit is None:
240+
limit = (0.0,)
241+
242+
try:
243+
len(limit)
244+
except TypeError:
245+
limit = (limit,)
246+
247+
if len(limit) == 1:
248+
LOG.info("Performing auto-cleaning.")
249+
250+
elif len(limit) == 2:
251+
LOG.info(f"Performing cleaning between the limits {limit}.")
252+
limit = tuple(sorted(limit))
253+
254+
else:
255+
raise ValueError(f"Expected 1 or 2 limits, got {len(limit)}.")
256+
257+
return limit
258+
259+
260+
def _filter_by_column(df: pd.DataFrame, column: str, limit: Sequence[Number], keep: Sequence[str]) -> pd.DataFrame:
205261
"""Get the dataframe with all rows dropped filtered by the given column."""
206262
if column not in df.columns:
207263
LOG.info(f"{column} not in current file. Skipping cleaning.")
208264
return df
209265

210-
good_bpms = get_filter_mask(data=df[column], limit=limit)
266+
keep_bpms = df.index.isin(keep)
267+
if len(limit) == 1:
268+
good_bpms = get_filter_mask(data=df[column], limit=limit[0]) | keep_bpms
269+
else:
270+
good_bpms = df[column].between(*limit) | keep_bpms
271+
211272
n_good, n_total = sum(good_bpms), len(good_bpms)
212273
LOG.info(f"Cleaned {n_total-n_good:d} of {n_total:d} elements in {column} ({n_good:d} remaining).")
213274
return df.loc[good_bpms, :]

tests/unit/test_linfile_clean.py

Lines changed: 94 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def test_filter_tune(tmp_path):
2929

3030
for plane in PLANES:
3131
assert len(filtered[plane]) == 4
32-
assert unfiltered[plane][COL_NAME][2] not in filtered[plane][COL_NAME]
32+
assert unfiltered[plane][COL_NAME][2] not in filtered[plane][COL_NAME].to_list()
3333

3434

3535
def test_filter_tune_limit(tmp_path):
@@ -48,6 +48,49 @@ def test_filter_tune_limit(tmp_path):
4848
assert_frame_equal(unfiltered[plane], filtered[plane])
4949

5050

51+
@pytest.mark.basic
52+
def test_keep_bpms(tmp_path):
53+
""" Test that keeping BPMs works. """
54+
columns = [COL_TUNE]
55+
plane_columns = [f"{col}{p}" for col in columns for p in PLANES]
56+
57+
# To be filtered BPMS are (due to the values in the example linfiles)
58+
filtered_bpms = {
59+
"X": ["BPM.10L4.B1", "BPM.10L2.B1"],
60+
"Y": ["BPM.10L1.B1", "BPM.10L2.B1"],
61+
}
62+
63+
# Test that all BPMs are filtered without the keep-flag --------------------
64+
linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns)
65+
unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}
66+
67+
# if limit not given, filters two elements in X
68+
clean_columns(files=linfiles.values(), columns=plane_columns)
69+
filtered = {p: tfs.read(f) for p, f in linfiles.items()}
70+
71+
72+
for plane in PLANES:
73+
assert len(filtered[plane]) == len(unfiltered[plane]) - 2
74+
for bpm in filtered_bpms[plane]:
75+
assert bpm not in filtered[plane][COL_NAME].to_list()
76+
assert bpm in unfiltered[plane][COL_NAME].to_list()
77+
78+
# Now with keeping one of them ---------------------------------------------
79+
linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns)
80+
unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}
81+
82+
# if limit not given, filters two elements in X
83+
clean_columns(files=linfiles.values(), columns=plane_columns, keep=[filtered_bpms["X"][1]])
84+
filtered = {p: tfs.read(f) for p, f in linfiles.items()}
85+
for plane in PLANES:
86+
assert len(filtered[plane]) == len(unfiltered[plane]) - 1
87+
for bpm in filtered_bpms[plane]:
88+
assert bpm in unfiltered[plane][COL_NAME].to_list()
89+
90+
assert filtered_bpms[plane][0] not in filtered[plane][COL_NAME].to_list()
91+
assert filtered_bpms[plane][1] in filtered[plane][COL_NAME].to_list()
92+
93+
5194
@pytest.mark.basic
5295
def test_filter_tune_nattune(tmp_path):
5396
"""Tests that filtering works for two columns."""
@@ -64,6 +107,55 @@ def test_filter_tune_nattune(tmp_path):
64107
assert len(filtered[plane]) == 2 # empirically determined
65108

66109

110+
@pytest.mark.basic
111+
def test_filter_between_limits(tmp_path):
112+
""" Test filtering works on outlier created by modify linfiles function. """
113+
columns = [COL_TUNE]
114+
plane_columns = [f"{col}{p}" for col in columns for p in PLANES]
115+
116+
# Test that no BPMs are filtered by the auto-clean (sanity check) ----------
117+
linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns, index=[2, 3], by=0.1)
118+
unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}
119+
120+
clean_columns(files=linfiles.values(), columns=plane_columns)
121+
122+
filtered = {p: tfs.read(f) for p, f in linfiles.items()}
123+
124+
for plane in PLANES:
125+
assert_frame_equal(unfiltered[plane], filtered[plane])
126+
127+
# Test that the two BPMs are filtered by the limits-clean ------------------
128+
linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns, index=[2, 3], by=0.1)
129+
unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}
130+
131+
# choosing values so that both planes are filtered
132+
# X tunes are 0.26 + 0.1, Y tunes are 0.32 + 0.1
133+
clean_columns(files=linfiles.values(), columns=plane_columns, limit=(0.20, 0.35))
134+
135+
filtered = {p: tfs.read(f) for p, f in linfiles.items()}
136+
137+
for plane in PLANES:
138+
assert len(filtered[plane]) == 3
139+
assert unfiltered[plane][COL_NAME][2] not in filtered[plane][COL_NAME].to_list()
140+
assert unfiltered[plane][COL_NAME][3] not in filtered[plane][COL_NAME].to_list()
141+
142+
143+
# Test that keep flag is also respected in the limits-clean ----------------
144+
linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns, index=[2, 3], by=0.1)
145+
unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}
146+
147+
# choosing values so that both planes are filtered
148+
# X tunes are 0.26 + 0.1, Y tunes are 0.32 + 0.1
149+
clean_columns(files=linfiles.values(), columns=plane_columns, limit=(0.20, 0.35), keep=[unfiltered["X"][COL_NAME][2]])
150+
151+
filtered = {p: tfs.read(f) for p, f in linfiles.items()}
152+
153+
for plane in PLANES:
154+
assert len(filtered[plane]) == 4
155+
assert unfiltered[plane][COL_NAME][2] in filtered[plane][COL_NAME].to_list()
156+
assert unfiltered[plane][COL_NAME][3] not in filtered[plane][COL_NAME].to_list()
157+
158+
67159
@pytest.mark.basic
68160
def test_backup_and_restore(tmp_path):
69161
"""Test that the backup and restore functionality works."""
@@ -106,7 +198,7 @@ def test_main(tmp_path):
106198
unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}
107199

108200
# if limit not given, would filter two elements in X
109-
main(files=list(linfiles.values()), columns=plane_columns, limit=0.01, backup=True)
201+
main(files=list(linfiles.values()), columns=plane_columns, limit=[0.01], backup=True)
110202
_assert_nlinfiles(tmp_path, 2)
111203

112204
filtered = {p: tfs.read(f) for p, f in linfiles.items()}

0 commit comments

Comments
 (0)