
Commit 776d7e0

Unique indices Check Option (#83)
1 parent 6c2416d commit 776d7e0

4 files changed: +122 −42 lines

CHANGELOG.md

Lines changed: 13 additions & 4 deletions
@@ -1,9 +1,18 @@
 # TFS-Pandas Changelog
+
+## Version 2.0.3
+
+- Fixed:
+  - Took care of a numpy deprecation warning when using `np.str`, which should no longer appear for users.
+
+- Changes:
+  - Prior to version `2.0.3`, reading and writing would raise a `TfsFormatError` in case of non-unique indices or columns. From now on, this behavior is an option in `read_tfs` and `write_tfs` called `non_unique_behavior`, which defaults to logging a warning. If explicitly requested by the user, the failed check will raise a `TfsFormatError`.
+
 ## Version 2.0.2
 - Fixed:
   - Proper error on non-string columns
   - Writing numeric-only mixed type dataframes bug
-
+
 ## Version 2.0.1
 - Fixed:
   - No longer warns on MAD-X styled string column types (`%[num]s`).
@@ -31,7 +40,7 @@
   - Bug with testing for headers, also in pandas DataFrames
   - Same testing method for all data-frame comparisons
   - Some minor fixes
-
+
 - Added:
   - Testing of writing of pandas DataFrames

@@ -46,7 +55,7 @@

 - Removed:
   - `.indx` from class (use `index="NAME"` instead)
-
+
 - Fixed:
   - Writing of empty dataframes
   - Doc imports
@@ -69,7 +78,7 @@
 ## Version 1.0.1
 - Fixed:
   - Metaclass-Bug in Collections
-
+
 - Added:
   - Additional Unit Tests
   - Versioning
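In user code, the new option reads roughly as follows. This is a minimal sketch, not part of the commit: "example.tfs" is a hypothetical output path, and the top-level re-exports of TfsDataFrame and write_tfs plus the TfsFormatError import from tfs.handler are assumed.

    import tfs
    from tfs.handler import TfsFormatError

    # A dataframe with a duplicated column name, which trips the uniqueness check.
    df = tfs.TfsDataFrame(columns=["A", "B", "A"])

    # Default ("warn"): the duplicate is only logged as a warning and writing proceeds.
    tfs.write_tfs("example.tfs", df)  # hypothetical output path

    # Opt-in ("raise"): the same check now aborts with a TfsFormatError.
    try:
        tfs.write_tfs("example.tfs", df, non_unique_behavior="raise")
    except TfsFormatError:
        print("non-unique column names rejected")

read_tfs accepts the same non_unique_behavior keyword when parsing a file back in.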

tests/test_handler.py

Lines changed: 66 additions & 23 deletions
@@ -2,6 +2,7 @@
 import pathlib
 import random
 import string
+import sys
 import tempfile

 import numpy
@@ -143,27 +144,20 @@ def test_tfs_write_empty_index_dataframe(self, _test_file: str):
         assert_dict_equal(df.headers, new.headers, compare_keys=True)

     def test_write_int_float_str_columns(self, _test_file: str):
-        """ This test is more of an extension of the test below
-        (this dataframe was not affected by the bug) """
+        """This test is more of an extension of the test below
+        (this dataframe was not affected by the bug)"""
         df = TfsDataFrame(
-            data=[[1, 1., "one"],
-                  [2, 2., "two"],
-                  [3, 3., "three"]],
-            columns=["Int", "Float", "String"]
+            data=[[1, 1.0, "one"], [2, 2.0, "two"], [3, 3.0, "three"]],
+            columns=["Int", "Float", "String"],
         )
         write_tfs(_test_file, df)
         new = read_tfs(_test_file)
         assert_frame_equal(df, new)

     def test_write_int_float_columns(self, _test_file: str):
-        """ This test is here because of numeric conversion bug
-        upon writing back in v2.0.1 """
-        df = TfsDataFrame(
-            data=[[1, 1.],
-                  [2, 2.],
-                  [3, 3.]],
-            columns=["Int", "Float"]
-        )
+        """This test is here because of numeric conversion bug
+        upon writing back in v2.0.1"""
+        df = TfsDataFrame(data=[[1, 1.0], [2, 2.0], [3, 3.0]], columns=["Int", "Float"])
         write_tfs(_test_file, df)
         new = read_tfs(_test_file)
         assert_frame_equal(df, new)
@@ -178,24 +172,33 @@ def test_absent_attributes_and_keys(self, _tfs_file_str: str):
         with pytest.raises(KeyError):
             _ = test_file["Not_HERE"]

-    def test_fail_on_non_unique_columns(self, caplog):
+    def test_raising_on_non_unique_columns(self, caplog):
         df = TfsDataFrame(columns=["A", "B", "A"])
         with pytest.raises(TfsFormatError):
-            write_tfs("", df)
+            write_tfs("", df, non_unique_behavior="raise")

         for record in caplog.records:
-            assert record.levelname == "ERROR"
+            assert record.levelname == "WARNING"
         assert "Non-unique column names found" in caplog.text

-    def test_fail_on_non_unique_index(self, caplog):
+    def test_raising_on_non_unique_index(self, caplog):
         df = TfsDataFrame(index=["A", "B", "A"])
         with pytest.raises(TfsFormatError):
-            write_tfs("", df)
+            write_tfs("", df, non_unique_behavior="raise")

         for record in caplog.records:
-            assert record.levelname == "ERROR"
+            assert record.levelname == "WARNING"
         assert "Non-unique indices found" in caplog.text

+    def test_raising_on_non_unique_both(self, caplog):
+        df = TfsDataFrame(index=["A", "B", "A"], columns=["A", "B", "A"])
+        with pytest.raises(TfsFormatError):
+            write_tfs("", df, non_unique_behavior="raise")
+
+        for record in caplog.records:
+            assert record.levelname == "WARNING"
+        assert "Non-unique indices found" in caplog.text  # first checked and raised
+
     def test_fail_on_wrong_column_type(self, caplog):
         df = TfsDataFrame(columns=range(5))
         with pytest.raises(TfsFormatError):
@@ -291,6 +294,11 @@ def test_id_to_type_handles_typo_str_id(self):
         with pytest.raises(TfsFormatError):
             _ = tfs.handler._id_to_type(typoed_str_id)

+    def test_validate_raises_on_wrong_unique_behavior(self, caplog):
+        df = TfsDataFrame(index=["A", "B", "A"], columns=["A", "B", "A"])
+        with pytest.raises(KeyError):
+            tfs.handler._validate(df, "", non_unique_behavior="invalid")
+

 def test_id_to_type_handles_madx_string_identifier():
     madx_str_id = "%20s"
@@ -305,14 +313,44 @@ def test_warn_unphysical_values(self, caplog):
             assert record.levelname == "WARNING"
         assert "contains non-physical values at Index:" in caplog.text

+    @pytest.mark.skipif(
+        sys.version_info >= (3, 7),
+        reason="Our workers on 3.7+ install pandas >= 1.3.0 which has fixed the .convert_dtypes() bug "
+        "we try...except in _autoset_pandas_types and test here",
+    )
     def test_empty_df_warns_on_types_inference(self, caplog):
         empty_df = pandas.DataFrame()
         converted_df = tfs.handler._autoset_pandas_types(empty_df)
         assert_frame_equal(converted_df, empty_df)

         for record in caplog.records:
             assert record.levelname == "WARNING"
-        assert "An empty dataframe was provided, no types were infered" in caplog.text
+        assert "An empty dataframe was provided, no types were inferred" in caplog.text
+
+    def test_warning_on_non_unique_columns(self, tmp_path, caplog):
+        df = TfsDataFrame(columns=["A", "B", "A"])
+        write_tfs(tmp_path / "temporary.tfs", df)
+
+        for record in caplog.records:
+            assert record.levelname == "WARNING"
+        assert "Non-unique column names found" in caplog.text
+
+    def test_warning_on_non_unique_index(self, tmp_path, caplog):
+        df = TfsDataFrame(index=["A", "B", "A"])
+        write_tfs(tmp_path / "temporary.tfs", df)
+
+        for record in caplog.records:
+            assert record.levelname == "WARNING"
+        assert "Non-unique indices found" in caplog.text
+
+    def test_warning_on_non_unique_both(self, tmp_path, caplog):
+        df = TfsDataFrame(index=["A", "B", "A"], columns=["A", "B", "A"])
+        write_tfs(tmp_path / "temporary.tfs", df)
+
+        for record in caplog.records:
+            assert record.levelname == "WARNING"
+        assert "Non-unique column names found" in caplog.text
+        assert "Non-unique indices found" in caplog.text


 class TestPrinting:
@@ -359,7 +397,10 @@ def _tfs_dataframe() -> TfsDataFrame:
 @pytest.fixture()
 def _dataframe_empty_headers() -> TfsDataFrame:
     return TfsDataFrame(
-        index=range(3), columns="a b c d e".split(), data=numpy.random.rand(3, 5), headers={},
+        index=range(3),
+        columns="a b c d e".split(),
+        data=numpy.random.rand(3, 5),
+        headers={},
     )


@@ -431,7 +472,9 @@ def _no_colnames_tfs_path() -> pathlib.Path:
 @pytest.fixture()
 def _pd_dataframe() -> pandas.DataFrame:
     return pandas.DataFrame(
-        index=range(3), columns="a b c d e".split(), data=numpy.random.rand(3, 5),
+        index=range(3),
+        columns="a b c d e".split(),
+        data=numpy.random.rand(3, 5),
     )

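The validation entry point these tests exercise can also be poked at directly. The following is a minimal pytest-style sketch mirroring the new tests above; it assumes the private helpers TfsDataFrame, TfsFormatError and _validate are imported from tfs.handler, where the diff below defines them.

    import pytest
    from tfs.handler import TfsDataFrame, TfsFormatError, _validate

    # Duplicated labels on both axes, as in test_raising_on_non_unique_both.
    df = TfsDataFrame(index=["A", "B", "A"], columns=["A", "B", "A"])

    # "raise" turns the duplicate check into a hard failure; the index is checked first.
    with pytest.raises(TfsFormatError):
        _validate(df, "", non_unique_behavior="raise")

    # Anything other than "warn" or "raise" (case-insensitive) is rejected with a KeyError.
    with pytest.raises(KeyError):
        _validate(df, "", non_unique_behavior="invalid")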

tfs/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 __title__ = "tfs-pandas"
 __description__ = "Read and write tfs files."
 __url__ = "https://github.com/pylhc/tfs"
-__version__ = "2.0.2"
+__version__ = "2.0.3"
 __author__ = "pylhc"
 __author_email__ = "[email protected]"
 __license__ = "MIT"

tfs/handler.py

Lines changed: 42 additions & 14 deletions
@@ -92,7 +92,11 @@ def _str_items(items_list):
         return f"{s}{super().__repr__()}"


-def read_tfs(tfs_file_path: Union[pathlib.Path, str], index: str = None) -> TfsDataFrame:
+def read_tfs(
+    tfs_file_path: Union[pathlib.Path, str],
+    index: str = None,
+    non_unique_behavior: str = "warn",
+) -> TfsDataFrame:
     """
     Parses the TFS table present in **tfs_file_path** and returns a customized version of a Pandas
     DataFrame (a TfsDataFrame).
@@ -102,7 +106,9 @@ def read_tfs(tfs_file_path: Union[pathlib.Path, str], index: str = None) -> TfsDataFrame:
             a string, in which case it will be cast to a PosixPath object.
         index (str): Name of the column to set as index. If not given, looks in **tfs_file_path**
             for a column starting with `INDEX&&&`.
-
+        non_unique_behavior (str): behavior to adopt if non-unique indices or columns are found in the
+            dataframe. Accepts **warn** and **raise** as values, case-insensitively, which dictates
+            to respectively issue a warning or raise an error if non-unique elements are found.
     Returns:
         A TfsDataFrame object.
     """
@@ -150,7 +156,7 @@ def read_tfs(tfs_file_path: Union[pathlib.Path, str], index: str = None) -> TfsDataFrame:
                 index_name = None  # to remove it completely (Pandas makes a difference)
            data_frame = data_frame.rename_axis(index_name)

-    _validate(data_frame, f"from file {tfs_file_path.absolute()}")
+    _validate(data_frame, f"from file {tfs_file_path.absolute()}", non_unique_behavior)
     return data_frame


@@ -161,6 +167,7 @@ def write_tfs(
     save_index: Union[str, bool] = False,
     colwidth: int = DEFAULT_COLUMN_WIDTH,
     headerswidth: int = DEFAULT_COLUMN_WIDTH,
+    non_unique_behavior: str = "warn",
 ) -> None:
     """
     Writes the DataFrame into **tfs_file_path** with the `headers_dict` as headers dictionary. If
@@ -178,10 +185,13 @@ def write_tfs(
             saves the index of the data_frame to a column named by the provided value.
         colwidth (int): Column width, can not be smaller than `MIN_COLUMN_WIDTH`.
         headerswidth (int): Used to format the header width for both keys and values.
+        non_unique_behavior (str): behavior to adopt if non-unique indices or columns are found in the
+            dataframe. Accepts **warn** and **raise** as values, case-insensitively, which dictates
+            to respectively issue a warning or raise an error if non-unique elements are found.
     """
     left_align_first_column = False
     tfs_file_path = pathlib.Path(tfs_file_path)
-    _validate(data_frame, f"to be written in {tfs_file_path.absolute()}")
+    _validate(data_frame, f"to be written in {tfs_file_path.absolute()}", non_unique_behavior)

     if headers_dict is None:  # tries to get headers from TfsDataFrame
         try:
@@ -219,19 +229,25 @@ def _autoset_pandas_types(
     convert_dtypes() to internally use concat) and then return only a copy of the original
     dataframe. Otherwise, raise the exception given by pandas.

+    NOTE: Starting with pandas 1.3.0, this behavior, which was a bug, has been fixed. This means no
+    ValueError is raised by calling .convert_dtypes() on an empty DataFrame, and from this function
+    no warning is logged. Testing of this behavior is disabled for Python 3.7+ workers, but the
+    function is kept so as not to force a new min version requirement on pandas or Python for users.
+    See my comment at https://github.com/pylhc/tfs/pull/83#issuecomment-874208869
+
     Args:
         data_frame (Union[TfsDataFrame, pd.DataFrame]): TfsDataFrame or pandas.DataFrame to
             determine the types of.

     Returns:
-        The dataframe with dtypes infered as much as possible to the pandas dtypes.
+        The dataframe with dtypes inferred as much as possible to the pandas dtypes.
     """
     LOGGER.debug("Attempting conversion of dataframe to pandas dtypes")
     try:
         return data_frame.copy().convert_dtypes(convert_integer=False)  # do not force floats to int
     except ValueError as pd_convert_error:  # If used on empty dataframes (uses concat internally)
         if not data_frame.size and "No objects to concatenate" in pd_convert_error.args[0]:
-            LOGGER.warning("An empty dataframe was provided, no types were infered")
+            LOGGER.warning("An empty dataframe was provided, no types were inferred")
             return data_frame.copy()  # since it's empty anyway, nothing to convert
         else:
             raise pd_convert_error
@@ -435,14 +451,24 @@ class TfsFormatError(Exception):
     pass


-def _validate(data_frame: Union[TfsDataFrame, pd.DataFrame], info_str: str = "") -> None:
+def _validate(
+    data_frame: Union[TfsDataFrame, pd.DataFrame],
+    info_str: str = "",
+    non_unique_behavior: str = "warn",
+) -> None:
     """
-    Check if Dataframe contains finite values only and both indices and columns are unique.
+    Check if Dataframe contains finite values only, strings as column names and no empty headers
+    or column names.

     Args:
         data_frame (Union[TfsDataFrame, pd.DataFrame]): the dataframe to check on.
         info_str (str): additional information to include in logging statements.
+        non_unique_behavior (str): behavior to adopt if non-unique indices or columns are found in the
+            dataframe. Accepts **warn** and **raise** as values, case-insensitively, which dictates
+            to respectively issue a warning or raise an error if non-unique elements are found.
     """
+    if non_unique_behavior.lower() not in ("warn", "raise"):
+        raise KeyError("Invalid value for parameter 'validate_unique'")

     def is_not_finite(x):
         try:
@@ -461,13 +487,15 @@ def is_not_finite(x):
            f"{boolean_df.index[boolean_df.any(axis='columns')].tolist()}"
        )

-    if len(set(data_frame.index)) != len(data_frame.index):
-        LOGGER.error(f"Non-unique indices found, dataframe {info_str} is invalid")
-        raise TfsFormatError("Indices are not Unique.")
+    if data_frame.index.has_duplicates:
+        LOGGER.warning("Non-unique indices found.")
+        if non_unique_behavior.lower() == "raise":
+            raise TfsFormatError("The dataframe contains non-unique indices")

-    if len(set(data_frame.columns)) != len(data_frame.columns):
-        LOGGER.error(f"Non-unique column names found, dataframe {info_str} is invalid")
-        raise TfsFormatError("Column names not Unique.")
+    if data_frame.columns.has_duplicates:
+        LOGGER.warning("Non-unique column names found.")
+        if non_unique_behavior.lower() == "raise":
+            raise TfsFormatError("The dataframe contains non-unique columns.")

     if any(not isinstance(c, str) for c in data_frame.columns):
         LOGGER.error(f"Some column-names are not of string-type, dataframe {info_str} is invalid.")
