
Commit 776d7e0

Unique indices Check Option (#83)
1 parent 6c2416d commit 776d7e0

4 files changed: +122 −42 lines

CHANGELOG.md

Lines changed: 13 additions & 4 deletions
@@ -1,9 +1,18 @@
 # TFS-Pandas Changelog
+
+## Version 2.0.3
+
+- Fixed:
+  - Took care of a numpy deprecation warning when using `np.str`, which should no longer appear for users.
+
+- Changes:
+  - Prior to version `2.0.3`, reading and writing would raise a `TfsFormatError` in case of non-unique indices or columns. From now on, this behavior is an option in `read_tfs` and `write_tfs` called `non_unique_behavior`, which defaults to logging a warning. If explicitly requested by the user, the failed check will raise a `TfsFormatError`.
+
 ## Version 2.0.2
 - Fixed:
   - Proper error on non-string columns
   - Writing numeric-only mixed type dataframes bug
-
+
 ## Version 2.0.1
 - Fixed:
   - No longer warns on MAD-X styled string column types (`%[num]s`).
@@ -31,7 +40,7 @@
   - Bug with testing for headers, also in pandas DataFrames
   - Same testing method for all data-frame comparisons
   - Some minor fixes
-
+
 - Added:
   - Testing of writing of pandas DataFrames

@@ -46,7 +55,7 @@

 - Removed:
   - `.indx` from class (use `index="NAME"` instead)
-
+
 - Fixed:
   - Writing of empty dataframes
   - Doc imports
@@ -69,7 +78,7 @@
 ## Version 1.0.1
 - Fixed:
   - Metaclass-Bug in Collections
-
+
 - Added:
   - Additional Unit Tests
   - Versioning
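In user code, the new option reads roughly as follows. This is a minimal sketch, not part of the commit: "example.tfs" is a hypothetical output path, and the top-level re-exports of TfsDataFrame and write_tfs plus the TfsFormatError import from tfs.handler are assumed.

    import tfs
    from tfs.handler import TfsFormatError

    # A dataframe with a duplicated column name, which trips the uniqueness check.
    df = tfs.TfsDataFrame(columns=["A", "B", "A"])

    # Default ("warn"): the duplicate is only logged as a warning and writing proceeds.
    tfs.write_tfs("example.tfs", df)  # hypothetical output path

    # Opt-in ("raise"): the same check now aborts with a TfsFormatError.
    try:
        tfs.write_tfs("example.tfs", df, non_unique_behavior="raise")
    except TfsFormatError:
        print("non-unique column names rejected")

read_tfs accepts the same non_unique_behavior keyword when parsing a file back in.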

tests/test_handler.py

Lines changed: 66 additions & 23 deletions
@@ -2,6 +2,7 @@
 import pathlib
 import random
 import string
+import sys
 import tempfile

 import numpy
@@ -143,27 +144,20 @@ def test_tfs_write_empty_index_dataframe(self, _test_file: str):
         assert_dict_equal(df.headers, new.headers, compare_keys=True)

     def test_write_int_float_str_columns(self, _test_file: str):
-        """ This test is more of an extension of the test below
-        (this dataframe was not affected by the bug) """
+        """This test is more of an extension of the test below
+        (this dataframe was not affected by the bug)"""
         df = TfsDataFrame(
-            data=[[1, 1., "one"],
-                  [2, 2., "two"],
-                  [3, 3., "three"]],
-            columns=["Int", "Float", "String"]
+            data=[[1, 1.0, "one"], [2, 2.0, "two"], [3, 3.0, "three"]],
+            columns=["Int", "Float", "String"],
         )
         write_tfs(_test_file, df)
         new = read_tfs(_test_file)
         assert_frame_equal(df, new)

     def test_write_int_float_columns(self, _test_file: str):
-        """ This test is here because of numeric conversion bug
-        upon writing back in v2.0.1 """
-        df = TfsDataFrame(
-            data=[[1, 1.],
-                  [2, 2.],
-                  [3, 3.]],
-            columns=["Int", "Float"]
-        )
+        """This test is here because of numeric conversion bug
+        upon writing back in v2.0.1"""
+        df = TfsDataFrame(data=[[1, 1.0], [2, 2.0], [3, 3.0]], columns=["Int", "Float"])
         write_tfs(_test_file, df)
         new = read_tfs(_test_file)
         assert_frame_equal(df, new)
@@ -178,24 +172,33 @@ def test_absent_attributes_and_keys(self, _tfs_file_str: str):
         with pytest.raises(KeyError):
             _ = test_file["Not_HERE"]

-    def test_fail_on_non_unique_columns(self, caplog):
+    def test_raising_on_non_unique_columns(self, caplog):
         df = TfsDataFrame(columns=["A", "B", "A"])
         with pytest.raises(TfsFormatError):
-            write_tfs("", df)
+            write_tfs("", df, non_unique_behavior="raise")

         for record in caplog.records:
-            assert record.levelname == "ERROR"
+            assert record.levelname == "WARNING"
         assert "Non-unique column names found" in caplog.text

-    def test_fail_on_non_unique_index(self, caplog):
+    def test_raising_on_non_unique_index(self, caplog):
         df = TfsDataFrame(index=["A", "B", "A"])
         with pytest.raises(TfsFormatError):
-            write_tfs("", df)
+            write_tfs("", df, non_unique_behavior="raise")

         for record in caplog.records:
-            assert record.levelname == "ERROR"
+            assert record.levelname == "WARNING"
         assert "Non-unique indices found" in caplog.text

+    def test_raising_on_non_unique_both(self, caplog):
+        df = TfsDataFrame(index=["A", "B", "A"], columns=["A", "B", "A"])
+        with pytest.raises(TfsFormatError):
+            write_tfs("", df, non_unique_behavior="raise")
+
+        for record in caplog.records:
+            assert record.levelname == "WARNING"
+        assert "Non-unique indices found" in caplog.text  # first checked and raised
+
     def test_fail_on_wrong_column_type(self, caplog):
         df = TfsDataFrame(columns=range(5))
         with pytest.raises(TfsFormatError):
@@ -291,6 +294,11 @@ def test_id_to_type_handles_typo_str_id(self):
         with pytest.raises(TfsFormatError):
             _ = tfs.handler._id_to_type(typoed_str_id)

+    def test_validate_raises_on_wrong_unique_behavior(self, caplog):
+        df = TfsDataFrame(index=["A", "B", "A"], columns=["A", "B", "A"])
+        with pytest.raises(KeyError):
+            tfs.handler._validate(df, "", non_unique_behavior="invalid")
+

 def test_id_to_type_handles_madx_string_identifier():
     madx_str_id = "%20s"
@@ -305,14 +313,44 @@ def test_warn_unphysical_values(self, caplog):
             assert record.levelname == "WARNING"
         assert "contains non-physical values at Index:" in caplog.text

+    @pytest.mark.skipif(
+        sys.version_info >= (3, 7),
+        reason="Our workers on 3.7+ install pandas >= 1.3.0 which has fixed the .convert_dtypes() bug "
+        "we try...except in _autoset_pandas_types and test here",
+    )
     def test_empty_df_warns_on_types_inference(self, caplog):
         empty_df = pandas.DataFrame()
         converted_df = tfs.handler._autoset_pandas_types(empty_df)
         assert_frame_equal(converted_df, empty_df)

         for record in caplog.records:
             assert record.levelname == "WARNING"
-        assert "An empty dataframe was provided, no types were infered" in caplog.text
+        assert "An empty dataframe was provided, no types were inferred" in caplog.text
+
+    def test_warning_on_non_unique_columns(self, tmp_path, caplog):
+        df = TfsDataFrame(columns=["A", "B", "A"])
+        write_tfs(tmp_path / "temporary.tfs", df)
+
+        for record in caplog.records:
+            assert record.levelname == "WARNING"
+        assert "Non-unique column names found" in caplog.text
+
+    def test_warning_on_non_unique_index(self, tmp_path, caplog):
+        df = TfsDataFrame(index=["A", "B", "A"])
+        write_tfs(tmp_path / "temporary.tfs", df)
+
+        for record in caplog.records:
+            assert record.levelname == "WARNING"
+        assert "Non-unique indices found" in caplog.text
+
+    def test_warning_on_non_unique_both(self, tmp_path, caplog):
+        df = TfsDataFrame(index=["A", "B", "A"], columns=["A", "B", "A"])
+        write_tfs(tmp_path / "temporary.tfs", df)
+
+        for record in caplog.records:
+            assert record.levelname == "WARNING"
+        assert "Non-unique column names found" in caplog.text
+        assert "Non-unique indices found" in caplog.text


 class TestPrinting:
@@ -359,7 +397,10 @@ def _tfs_dataframe() -> TfsDataFrame:
 @pytest.fixture()
 def _dataframe_empty_headers() -> TfsDataFrame:
     return TfsDataFrame(
-        index=range(3), columns="a b c d e".split(), data=numpy.random.rand(3, 5), headers={},
+        index=range(3),
+        columns="a b c d e".split(),
+        data=numpy.random.rand(3, 5),
+        headers={},
     )


@@ -431,7 +472,9 @@ def _no_colnames_tfs_path() -> pathlib.Path:
 @pytest.fixture()
 def _pd_dataframe() -> pandas.DataFrame:
     return pandas.DataFrame(
-        index=range(3), columns="a b c d e".split(), data=numpy.random.rand(3, 5),
+        index=range(3),
+        columns="a b c d e".split(),
+        data=numpy.random.rand(3, 5),
     )

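The validation entry point these tests exercise can also be poked at directly. The following is a minimal pytest-style sketch mirroring the new tests above; it assumes the private helpers TfsDataFrame, TfsFormatError and _validate are imported from tfs.handler, where the diff below defines them.

    import pytest
    from tfs.handler import TfsDataFrame, TfsFormatError, _validate

    # Duplicated labels on both axes, as in test_raising_on_non_unique_both.
    df = TfsDataFrame(index=["A", "B", "A"], columns=["A", "B", "A"])

    # "raise" turns the duplicate check into a hard failure; the index is checked first.
    with pytest.raises(TfsFormatError):
        _validate(df, "", non_unique_behavior="raise")

    # Anything other than "warn" or "raise" (case-insensitive) is rejected with a KeyError.
    with pytest.raises(KeyError):
        _validate(df, "", non_unique_behavior="invalid")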

tfs/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 __title__ = "tfs-pandas"
 __description__ = "Read and write tfs files."
 __url__ = "https://github.com/pylhc/tfs"
-__version__ = "2.0.2"
+__version__ = "2.0.3"
 __author__ = "pylhc"
 __author_email__ = "[email protected]"
 __license__ = "MIT"

tfs/handler.py

Lines changed: 42 additions & 14 deletions
@@ -92,7 +92,11 @@ def _str_items(items_list):
         return f"{s}{super().__repr__()}"


-def read_tfs(tfs_file_path: Union[pathlib.Path, str], index: str = None) -> TfsDataFrame:
+def read_tfs(
+    tfs_file_path: Union[pathlib.Path, str],
+    index: str = None,
+    non_unique_behavior: str = "warn",
+) -> TfsDataFrame:
     """
     Parses the TFS table present in **tfs_file_path** and returns a customized version of a Pandas
     DataFrame (a TfsDataFrame).
@@ -102,7 +106,9 @@ def read_tfs(tfs_file_path: Union[pathlib.Path, str], index: str = None) -> TfsDataFrame:
             a string, in which case it will be cast to a PosixPath object.
         index (str): Name of the column to set as index. If not given, looks in **tfs_file_path**
             for a column starting with `INDEX&&&`.
-
+        non_unique_behavior (str): behavior to adopt if non-unique indices or columns are found in the
+            dataframe. Accepts **warn** and **raise** as values, case-insensitively, which dictates
+            to respectively issue a warning or raise an error if non-unique elements are found.
     Returns:
         A TfsDataFrame object.
     """
@@ -150,7 +156,7 @@ def read_tfs(tfs_file_path: Union[pathlib.Path, str], index: str = None) -> TfsDataFrame:
                 index_name = None  # to remove it completely (Pandas makes a difference)
            data_frame = data_frame.rename_axis(index_name)

-    _validate(data_frame, f"from file {tfs_file_path.absolute()}")
+    _validate(data_frame, f"from file {tfs_file_path.absolute()}", non_unique_behavior)
     return data_frame


@@ -161,6 +167,7 @@ def write_tfs(
     save_index: Union[str, bool] = False,
     colwidth: int = DEFAULT_COLUMN_WIDTH,
     headerswidth: int = DEFAULT_COLUMN_WIDTH,
+    non_unique_behavior: str = "warn",
 ) -> None:
     """
     Writes the DataFrame into **tfs_file_path** with the `headers_dict` as headers dictionary. If
@@ -178,10 +185,13 @@ def write_tfs(
             saves the index of the data_frame to a column named by the provided value.
         colwidth (int): Column width, can not be smaller than `MIN_COLUMN_WIDTH`.
         headerswidth (int): Used to format the header width for both keys and values.
+        non_unique_behavior (str): behavior to adopt if non-unique indices or columns are found in the
+            dataframe. Accepts **warn** and **raise** as values, case-insensitively, which dictates
+            to respectively issue a warning or raise an error if non-unique elements are found.
     """
     left_align_first_column = False
     tfs_file_path = pathlib.Path(tfs_file_path)
-    _validate(data_frame, f"to be written in {tfs_file_path.absolute()}")
+    _validate(data_frame, f"to be written in {tfs_file_path.absolute()}", non_unique_behavior)

     if headers_dict is None:  # tries to get headers from TfsDataFrame
         try:
@@ -219,19 +229,25 @@ def _autoset_pandas_types(
     convert_dtypes() to internally use concat) and then return only a copy of the original
     dataframe. Otherwise, raise the exception given by pandas.

+    NOTE: Starting with pandas 1.3.0, this behavior, which was a bug, has been fixed. This means no
+    ValueError is raised by calling .convert_dtypes() on an empty DataFrame, and from this function
+    no warning is logged. Testing of this behavior is disabled for Python 3.7+ workers, but the
+    function is kept so as not to force a new min version requirement on pandas or Python for users.
+    See my comment at https://github.com/pylhc/tfs/pull/83#issuecomment-874208869
+
     Args:
         data_frame (Union[TfsDataFrame, pd.DataFrame]): TfsDataFrame or pandas.DataFrame to
             determine the types of.

     Returns:
-        The dataframe with dtypes infered as much as possible to the pandas dtypes.
+        The dataframe with dtypes inferred as much as possible to the pandas dtypes.
     """
     LOGGER.debug("Attempting conversion of dataframe to pandas dtypes")
     try:
         return data_frame.copy().convert_dtypes(convert_integer=False)  # do not force floats to int
     except ValueError as pd_convert_error:  # If used on empty dataframes (uses concat internally)
         if not data_frame.size and "No objects to concatenate" in pd_convert_error.args[0]:
-            LOGGER.warning("An empty dataframe was provided, no types were infered")
+            LOGGER.warning("An empty dataframe was provided, no types were inferred")
             return data_frame.copy()  # since it's empty anyway, nothing to convert
         else:
             raise pd_convert_error
@@ -435,14 +451,24 @@ class TfsFormatError(Exception):
     pass


-def _validate(data_frame: Union[TfsDataFrame, pd.DataFrame], info_str: str = "") -> None:
+def _validate(
+    data_frame: Union[TfsDataFrame, pd.DataFrame],
+    info_str: str = "",
+    non_unique_behavior: str = "warn",
+) -> None:
     """
-    Check if Dataframe contains finite values only and both indices and columns are unique.
+    Check if Dataframe contains finite values only, strings as column names and no empty headers
+    or column names.

     Args:
         data_frame (Union[TfsDataFrame, pd.DataFrame]): the dataframe to check on.
         info_str (str): additional information to include in logging statements.
+        non_unique_behavior (str): behavior to adopt if non-unique indices or columns are found in the
+            dataframe. Accepts **warn** and **raise** as values, case-insensitively, which dictates
+            to respectively issue a warning or raise an error if non-unique elements are found.
     """
+    if non_unique_behavior.lower() not in ("warn", "raise"):
+        raise KeyError("Invalid value for parameter 'validate_unique'")

     def is_not_finite(x):
         try:
@@ -461,13 +487,15 @@ def is_not_finite(x):
            f"{boolean_df.index[boolean_df.any(axis='columns')].tolist()}"
        )

-    if len(set(data_frame.index)) != len(data_frame.index):
-        LOGGER.error(f"Non-unique indices found, dataframe {info_str} is invalid")
-        raise TfsFormatError("Indices are not Unique.")
+    if data_frame.index.has_duplicates:
+        LOGGER.warning("Non-unique indices found.")
+        if non_unique_behavior.lower() == "raise":
+            raise TfsFormatError("The dataframe contains non-unique indices")

-    if len(set(data_frame.columns)) != len(data_frame.columns):
-        LOGGER.error(f"Non-unique column names found, dataframe {info_str} is invalid")
-        raise TfsFormatError("Column names not Unique.")
+    if data_frame.columns.has_duplicates:
+        LOGGER.warning("Non-unique column names found.")
+        if non_unique_behavior.lower() == "raise":
+            raise TfsFormatError("The dataframe contains non-unique columns.")

     if any(not isinstance(c, str) for c in data_frame.columns):
         LOGGER.error(f"Some column-names are not of string-type, dataframe {info_str} is invalid.")
