Linfile cleaning keep bpms and limit-based cleaning (#439)

JoschD · web-flow · commit 2cf93e1af074 · 2024-03-18T21:08:48.000+01:00
* Implementation and test
* Changelog and version bump
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,19 @@
 # OMC3 Changelog
 
+#### 2024-03-18 - v0.14.0 - _jdilly_
+
+- Added:
+  - Linfile Updater: `keep`-flag to keep names and option to clean manually between limits.
+
+#### 2024-03-08 - v0.13.1 - _jdilly_, _awegsche_, _mlegarre_, _fesoubel_
+
+- Added:
+  - Knob Extractor: Lumiscan Knob
+
+- Fixed:
+  - BBS converter: fixed closed orbit units
+  - Optics: Pandas indexing error in DPP
+
 #### 2023-12-07 - v0.13.0 - _awegsche_
 
 - Added:
diff --git a/omc3/__init__.py b/omc3/__init__.py
@@ -11,7 +11,7 @@
 __title__ = "omc3"
 __description__ = "An accelerator physics tools package for the OMC team at CERN."
 __url__ = "https://github.com/pylhc/omc3"
-__version__ = "0.13.1"
+__version__ = "0.14.0"
 __author__ = "pylhc"
 __author_email__ = "pylhc@github.com"
 __license__ = "MIT"
diff --git a/omc3/scripts/linfile_clean.py b/omc3/scripts/linfile_clean.py
@@ -4,6 +4,10 @@
 Performs an automated cleaning of different columns in the lin-file
 as a standalone script to allow for manual refinement after harpy is done.
 
+The type of cleaning is determined by the number of values in the ``limit``
+parameter. When no ``limit`` is given or a single number is given, 
+auto-cleaning is performed:
+
 All data is assumed to be gaussian distributed around a "true" value,
 and outliers are cleaned by calculating average and standard deviation
 of the given data.
@@ -17,6 +21,9 @@
 cleaned. The limit is given in whatever units the data itself is in and
 is an absolute value.
 
+If two values are given for the ``limit`` parameter, all data-points in between 
+these limits are kept and all data-points outside of these limits are cleaned.
+
 Cleaning is done per given file independently
 i.e. removed BPMs from one file can be present in the next.
 The columns are iterated on the same file, i.e. the cleaning is done
@@ -53,10 +60,15 @@
     Columns to clean on.
 
 
+- **keep** *(str)*:
+
+    Do not clean BPMs with given names.
+
+
 - **limit** *(float)*:
 
-    Do not clean data-points deviating less than this limit from the
-    average.
+    Two values: Do not clean data-points in between these values.
+    Single value (auto-clean): Do not clean data-points deviating less than this limit from the average.
 
     default: ``0.0``
 
@@ -117,8 +129,14 @@ def get_params():
         ),
         limit=dict(
             type=float,
-            help="Do not clean data-points deviating less than this limit from the average.",
-            default=0.0,
+            nargs='+',
+            help="Two values: Do not clean data-points in between these values. "
+                 "Single value (auto-clean): Do not clean data-points deviating less than this limit from the average.",
+        ),
+        keep=dict(
+            nargs='+',
+            type=str,
+            help="Do not clean BPMs with given names.",
         ),
         backup=dict(
             action="store_true",
@@ -140,7 +158,7 @@ def main(opt):
 
     if opt.columns is None:
         raise ValueError("The option 'columns' is required for cleaning.")
-    clean_columns(opt.files, opt.columns, opt.limit, opt.backup)
+    clean_columns(opt.files, opt.columns, opt.limit, opt.keep, opt.backup)
 
 
 # Restore ----------------------------------------------------------------------
@@ -184,15 +202,30 @@ def _restore_file(file):
 
 # Clean ------------------------------------------------------------------------
 
-def clean_columns(files: Sequence[Union[Path, str]], columns: Sequence[str],
-                  limit: float = 0.0, backup: bool = True):
+def clean_columns(files: Sequence[Union[Path, str]], 
+                  columns: Sequence[str],
+                  limit: float = None,   # default set in _check_limits
+                  keep: Sequence[str] = None,  # default set below
+                  backup: bool = True):
     """ Clean the columns in the given files."""
     for file in files:
         file = Path(file)
         LOG.info(f"Cleaning {file.name}.")
+
+        # check limits
+        limit = _check_limits(limit)
+
+        # read and check file
         df = tfs.read_tfs(file, index=COL_NAME)
+        if keep is None:
+            keep = ()
+        not_found_bpms = set(keep) - set(df.index)
+        if len(not_found_bpms):
+            LOG.warning(f"The following BPMs to keep were not found in {file.name}:\n{not_found_bpms}")
+
+        # clean
         for column in columns:
-            df = _filter_by_column(df, column, limit)
+            df = _filter_by_column(df, column, limit, keep)
         df.headers.update(_compute_headers(df))
 
         if backup:
@@ -201,13 +234,41 @@ def clean_columns(files: Sequence[Union[Path, str]], columns: Sequence[str],
         tfs.write_tfs(file, df, save_index=COL_NAME)
 
 
-def _filter_by_column(df: pd.DataFrame, column: str, limit: Number) -> pd.DataFrame:
+def _check_limits(limit: Union[Sequence[Number], Number]) -> Sequence[Number]:
+    """ Check that one or two limits are given and convert them into a tuple if needed."""
+    if limit is None:
+        limit = (0.0,)
+
+    try:
+        len(limit)
+    except TypeError:
+        limit = (limit,)
+
+    if len(limit) == 1:
+        LOG.info("Performing auto-cleaning.")
+
+    elif len(limit) == 2:
+        LOG.info(f"Performing cleaning between the limits {limit}.")
+        limit = tuple(sorted(limit))
+
+    else:
+        raise ValueError(f"Expected 1 or 2 limits, got {len(limit)}.")
+    
+    return limit
+
+
+def _filter_by_column(df: pd.DataFrame, column: str, limit: Sequence[Number], keep: Sequence[str]) -> pd.DataFrame:
     """Get the dataframe with all rows dropped filtered by the given column."""
     if column not in df.columns:
         LOG.info(f"{column} not in current file. Skipping cleaning.")
         return df
 
-    good_bpms = get_filter_mask(data=df[column], limit=limit)
+    keep_bpms = df.index.isin(keep)
+    if len(limit) == 1:
+        good_bpms = get_filter_mask(data=df[column], limit=limit[0]) | keep_bpms
+    else:
+        good_bpms = df[column].between(*limit) | keep_bpms
+    
     n_good, n_total = sum(good_bpms), len(good_bpms)
     LOG.info(f"Cleaned {n_total-n_good:d} of {n_total:d} elements in {column} ({n_good:d} remaining).")
     return df.loc[good_bpms, :]
diff --git a/tests/unit/test_linfile_clean.py b/tests/unit/test_linfile_clean.py
@@ -29,7 +29,7 @@ def test_filter_tune(tmp_path):
 
     for plane in PLANES:
         assert len(filtered[plane]) == 4
-        assert unfiltered[plane][COL_NAME][2] not in filtered[plane][COL_NAME]
+        assert unfiltered[plane][COL_NAME][2] not in filtered[plane][COL_NAME].to_list()
 
 
 def test_filter_tune_limit(tmp_path):
@@ -48,6 +48,49 @@ def test_filter_tune_limit(tmp_path):
         assert_frame_equal(unfiltered[plane], filtered[plane])
 
 
+@pytest.mark.basic
+def test_keep_bpms(tmp_path):
+    """ Test that keeping BPMs works. """
+    columns = [COL_TUNE]
+    plane_columns = [f"{col}{p}" for col in columns for p in PLANES]
+    
+    # To be filtered BPMS are (due to the values in the example linfiles)
+    filtered_bpms = {
+        "X": ["BPM.10L4.B1", "BPM.10L2.B1"],
+        "Y": ["BPM.10L1.B1", "BPM.10L2.B1"],
+    }
+
+    # Test that all BPMs are filtered without the keep-flag --------------------
+    linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns)
+    unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}
+
+    # if limit not given, filters two elements in X
+    clean_columns(files=linfiles.values(), columns=plane_columns)
+    filtered = {p: tfs.read(f) for p, f in linfiles.items()}
+    
+
+    for plane in PLANES:
+        assert len(filtered[plane]) == len(unfiltered[plane]) - 2
+        for bpm in filtered_bpms[plane]:
+            assert bpm not in filtered[plane][COL_NAME].to_list()
+            assert bpm in unfiltered[plane][COL_NAME].to_list()
+
+    # Now with keeping one of them ---------------------------------------------
+    linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns)
+    unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}
+
+    # if limit not given, filters two elements in X
+    clean_columns(files=linfiles.values(), columns=plane_columns, keep=[filtered_bpms["X"][1]])
+    filtered = {p: tfs.read(f) for p, f in linfiles.items()}
+    for plane in PLANES:
+        assert len(filtered[plane]) == len(unfiltered[plane]) - 1
+        for bpm in filtered_bpms[plane]:
+            assert bpm in unfiltered[plane][COL_NAME].to_list()
+
+        assert filtered_bpms[plane][0] not in filtered[plane][COL_NAME].to_list()
+        assert filtered_bpms[plane][1] in filtered[plane][COL_NAME].to_list()
+
+
 @pytest.mark.basic
 def test_filter_tune_nattune(tmp_path):
     """Tests that filtering works for two columns."""
@@ -64,6 +107,55 @@ def test_filter_tune_nattune(tmp_path):
         assert len(filtered[plane]) == 2  # empirically determined
 
 
+@pytest.mark.basic
+def test_filter_between_limits(tmp_path):
+    """ Test filtering works on outlier created by modify linfiles function. """
+    columns = [COL_TUNE]
+    plane_columns = [f"{col}{p}" for col in columns for p in PLANES]
+    
+    # Test that no BPMs are filtered by the auto-clean (sanity check) ----------
+    linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns, index=[2, 3], by=0.1)
+    unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}
+
+    clean_columns(files=linfiles.values(), columns=plane_columns)
+
+    filtered = {p: tfs.read(f) for p, f in linfiles.items()}
+
+    for plane in PLANES:
+        assert_frame_equal(unfiltered[plane], filtered[plane])
+    
+    # Test that the two BPMs are filtered by the limits-clean ------------------
+    linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns, index=[2, 3], by=0.1)
+    unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}
+    
+    # choosing values so that both planes are filtered
+    # X tunes are 0.26 + 0.1, Y tunes are 0.32 + 0.1
+    clean_columns(files=linfiles.values(), columns=plane_columns, limit=(0.20, 0.35))
+
+    filtered = {p: tfs.read(f) for p, f in linfiles.items()}
+
+    for plane in PLANES:
+        assert len(filtered[plane]) == 3
+        assert unfiltered[plane][COL_NAME][2] not in filtered[plane][COL_NAME].to_list()
+        assert unfiltered[plane][COL_NAME][3] not in filtered[plane][COL_NAME].to_list()
+    
+    
+    # Test that keep flag is also respected in the limits-clean ----------------
+    linfiles = _copy_and_modify_linfiles(tmp_path, columns=columns, index=[2, 3], by=0.1)
+    unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}
+    
+    # choosing values so that both planes are filtered
+    # X tunes are 0.26 + 0.1, Y tunes are 0.32 + 0.1
+    clean_columns(files=linfiles.values(), columns=plane_columns, limit=(0.20, 0.35), keep=[unfiltered["X"][COL_NAME][2]])
+
+    filtered = {p: tfs.read(f) for p, f in linfiles.items()}
+
+    for plane in PLANES:
+        assert len(filtered[plane]) == 4
+        assert unfiltered[plane][COL_NAME][2] in filtered[plane][COL_NAME].to_list()
+        assert unfiltered[plane][COL_NAME][3] not in filtered[plane][COL_NAME].to_list()
+
+
 @pytest.mark.basic
 def test_backup_and_restore(tmp_path):
     """Test that the backup and restore functionality works."""
@@ -106,7 +198,7 @@ def test_main(tmp_path):
     unfiltered = {p: tfs.read(f) for p, f in linfiles.items()}
 
     # if limit not given, would filter two elements in X
-    main(files=list(linfiles.values()), columns=plane_columns, limit=0.01, backup=True)
+    main(files=list(linfiles.values()), columns=plane_columns, limit=[0.01], backup=True)
     _assert_nlinfiles(tmp_path, 2)
 
     filtered = {p: tfs.read(f) for p, f in linfiles.items()}