Use dicts instead of OrderedDicts for headers (#133)

fsoubelet · web-flow · commit c0c54ba9202a · 2024-08-15T17:22:23.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,9 @@
 # TFS-Pandas Changelog
 
-## IN PROGRESS - 3.9.0
+## Version 3.8.2
+
+- Changed:
+  - The headers of a `TfsDataFrame` are now stored as a `dict` rather than an `OrderedDict`. This is transparent to the user.
 
 - Fixed:
   - Removed a workaround function which is no longer necessary due to the higher minimum `pandas` version.
diff --git a/README.md b/README.md
@@ -4,31 +4,34 @@
 [![Code Climate coverage](https://img.shields.io/codeclimate/coverage/pylhc/tfs.svg?style=popout)](https://codeclimate.com/github/pylhc/tfs)
 [![Code Climate maintainability (percentage)](https://img.shields.io/codeclimate/maintainability-percentage/pylhc/tfs.svg?style=popout)](https://codeclimate.com/github/pylhc/tfs)
 <!-- [![GitHub last commit](https://img.shields.io/github/last-commit/pylhc/tfs.svg?style=popout)](https://github.com/pylhc/tfs/) -->
-[![PyPI Version](https://img.shields.io/pypi/v/tfs-pandas?label=PyPI&logo=pypi)](https://pypi.org/project/tfs-pandas/)
 [![GitHub release](https://img.shields.io/github/v/release/pylhc/tfs?logo=github)](https://github.com/pylhc/tfs/)
+[![PyPI Version](https://img.shields.io/pypi/v/tfs-pandas?label=PyPI&logo=pypi)](https://pypi.org/project/tfs-pandas/)
 [![Conda-forge Version](https://img.shields.io/conda/vn/conda-forge/tfs-pandas?color=orange&logo=anaconda)](https://anaconda.org/conda-forge/tfs-pandas)
 [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5070986.svg)](https://doi.org/10.5281/zenodo.5070986)
 
-This package provides reading and writing functionality for [**Table Format System (TFS)** files](http://mad.web.cern.ch/mad/madx.old/Introduction/tfs.html). 
-Files are read into a `TfsDataFrame`, a class built on top of the famous `pandas.DataFrame`, which in addition to the normal behavior attaches an `OrderedDict` of headers to the `DataFrame`.
+This package provides reading and writing functionality for [**Table Format System (TFS)** files](http://mad.web.cern.ch/mad/madx.old/Introduction/tfs.html).
+Files are read into a `TfsDataFrame`, a class built on top of the famous `pandas.DataFrame`, which in addition to the normal behavior attaches a dictionary of headers to the `DataFrame`.
 
 See the [API documentation](https://pylhc.github.io/tfs/) for details.
 
 ## Installing
 
 Installation is easily done via `pip`:
+
 ```bash
 python -m pip install tfs-pandas
 ```
 
 One can also install in a `conda`/`mamba` environment via the `conda-forge` channel with:
+
 ```bash
 conda install -c conda-forge tfs-pandas
 ```
 
 ## Example Usage
 
 The package is imported as `tfs`, and exports top-level functions for reading and writing:
+
 ```python
 import tfs
 
@@ -50,6 +53,7 @@ tfs.write("path_to_output.tfs", data_frame, save_index="index_column")
 ```
 
 Reading and writing compressed files is also supported, and done automatically based on the provided file extension:
+
 ```python
 import tfs
 
diff --git a/tests/test_frame.py b/tests/test_frame.py
@@ -1,5 +1,4 @@
 import pathlib
-from collections import OrderedDict
 from functools import partial, reduce
 
 import pandas as pd
@@ -21,8 +20,8 @@ def test_validate_raises_on_wrong_unique_behavior(self):
 
     @pytest.mark.parametrize("how", ["invalid", "not_left", "not_right"])
     def test_merge_headers_raises_on_invalid_how_key(self, how):
-        headers_left = OrderedDict()
-        headers_right = OrderedDict()
+        headers_left = {}
+        headers_right = {}
 
         with pytest.raises(ValueError, match="Invalid 'how' argument"):
             merge_headers(headers_left, headers_right, how=how)
@@ -49,7 +48,7 @@ def test_correct_merging(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib, how_hea
         result = dframe_x.merge(dframe_y, how_headers=how_headers, how=how, on=on)
 
         assert isinstance(result, TfsDataFrame)
-        assert isinstance(result.headers, OrderedDict)
+        assert isinstance(result.headers, dict)
         assert_dict_equal(result.headers, merge_headers(dframe_x.headers, dframe_y.headers, how=how_headers))
         assert_frame_equal(result, pd.DataFrame(dframe_x).merge(pd.DataFrame(dframe_y), how=how, on=on))
 
@@ -64,10 +63,10 @@ def test_merging_accepts_pandas_dataframe(
         result = dframe_x.merge(dframe_y, how_headers=how_headers, how=how, on=on)
 
         assert isinstance(result, TfsDataFrame)
-        assert isinstance(result.headers, OrderedDict)
+        assert isinstance(result.headers, dict)
 
-        # using empty OrderedDict here as it's what dframe_y is getting when converted in the call
-        assert_dict_equal(result.headers, merge_headers(dframe_x.headers, OrderedDict(), how=how_headers))
+        # using empty dict here as it's what dframe_y is getting when converted in the call
+        assert_dict_equal(result.headers, merge_headers(dframe_x.headers, headers_right={}, how=how_headers))
         assert_frame_equal(result, pd.DataFrame(dframe_x).merge(pd.DataFrame(dframe_y), how=how, on=on))
 
 
@@ -78,7 +77,7 @@ def test_headers_merging_left(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib, ho
         headers_right = tfs.read(_tfs_file_y_pathlib).headers
         result = merge_headers(headers_left, headers_right, how=how)
 
-        assert isinstance(result, OrderedDict)
+        assert isinstance(result, dict)
         assert len(result) >= len(headers_left)  # no key disappeared
         assert len(result) >= len(headers_right)  # no key disappeared
         for key in result:  # check that we prioritized headers_left's contents
@@ -91,7 +90,7 @@ def test_headers_merging_right(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib, h
         headers_right = tfs.read(_tfs_file_y_pathlib).headers
         result = merge_headers(headers_left, headers_right, how=how)
 
-        assert isinstance(result, OrderedDict)
+        assert isinstance(result, dict)
         assert len(result) >= len(headers_left)  # no key disappeared
         assert len(result) >= len(headers_right)  # no key disappeared
         for key in result:  # check that we prioritized headers_right's contents
@@ -103,17 +102,17 @@ def test_headers_merging_none_returns_empty_dict(self, _tfs_file_x_pathlib, _tfs
         headers_left = tfs.read(_tfs_file_x_pathlib).headers
         headers_right = tfs.read(_tfs_file_y_pathlib).headers
         result = merge_headers(headers_left, headers_right, how=how)
-        assert result == OrderedDict()  # giving None returns empty headers
+        assert result == {}  # giving None returns empty headers
 
     def test_providing_new_headers_overrides_merging(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib):
         dframe_x = tfs.read(_tfs_file_x_pathlib)
         dframe_y = tfs.read(_tfs_file_y_pathlib)
 
-        assert dframe_x.merge(right=dframe_y, new_headers={}).headers == OrderedDict()
-        assert dframe_y.merge(right=dframe_x, new_headers={}).headers == OrderedDict()
+        assert dframe_x.merge(right=dframe_y, new_headers={}).headers == {}
+        assert dframe_y.merge(right=dframe_x, new_headers={}).headers == {}
 
-        assert tfs.concat([dframe_x, dframe_y], new_headers={}).headers == OrderedDict()
-        assert tfs.concat([dframe_y, dframe_x], new_headers={}).headers == OrderedDict()
+        assert tfs.concat([dframe_x, dframe_y], new_headers={}).headers == {}
+        assert tfs.concat([dframe_y, dframe_x], new_headers={}).headers == {}
 
 
 class TestPrinting:
@@ -157,7 +156,7 @@ def test_correct_concatenating(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib, h
         merger = partial(merge_headers, how=how_headers)
         all_headers = (tfsdframe.headers for tfsdframe in objs)
         assert isinstance(result, TfsDataFrame)
-        assert isinstance(result.headers, OrderedDict)
+        assert isinstance(result.headers, dict)
         assert_dict_equal(result.headers, reduce(merger, all_headers))
         assert_frame_equal(result, pd.concat(objs, axis=axis, join=join))
 
@@ -175,10 +174,10 @@ def test_concatenating_accepts_pandas_dataframes(
         merger = partial(merge_headers, how=how_headers)
         # all_headers = (tfsdframe.headers for tfsdframe in objs)
         assert isinstance(result, TfsDataFrame)
-        assert isinstance(result.headers, OrderedDict)
+        assert isinstance(result.headers, dict)
 
-        all_headers = [  # empty OrderedDicts here as it's what objects are getting when converted in the call
-            dframe.headers if isinstance(dframe, TfsDataFrame) else OrderedDict() for dframe in objs
+        all_headers = [  # empty dicts here as it's what objects are getting when converted in the call
+            dframe.headers if isinstance(dframe, TfsDataFrame) else {} for dframe in objs
         ]
         assert_dict_equal(result.headers, reduce(merger, all_headers))
         assert_frame_equal(result, pd.concat(objs, axis=axis, join=join))
diff --git a/tfs/__init__.py b/tfs/__init__.py
@@ -11,7 +11,7 @@
 __title__ = "tfs-pandas"
 __description__ = "Read and write tfs files."
 __url__ = "https://github.com/pylhc/tfs"
-__version__ = "3.8.1"
+__version__ = "3.8.2"
 __author__ = "pylhc"
 __author_email__ = "pylhc@github.com"
 __license__ = "MIT"
diff --git a/tfs/frame.py b/tfs/frame.py
@@ -9,7 +9,6 @@
 from __future__ import annotations
 
 import logging
-from collections import OrderedDict
 from contextlib import suppress
 from functools import partial, reduce
 from typing import TYPE_CHECKING, ClassVar
@@ -147,23 +146,25 @@ def merge(
         return TfsDataFrame(data=dframe, headers=new_headers)
 
 
-def merge_headers(headers_left: dict, headers_right: dict, how: str) -> OrderedDict:
+def merge_headers(headers_left: dict, headers_right: dict, how: str) -> dict:
     """
     Merge headers of two ``TfsDataFrames`` together.
 
     Args:
-        headers_left (dict): Headers of caller (left) ``TfsDataFrame`` when calling ``.append``, ``.join`` or
-            ``.merge``. Headers of the left (preceeding) ``TfsDataFrame`` when calling ``tfs.frame.concat``.
-        headers_right (dict): Headers of other (right) ``TfsDataFrame`` when calling ``.append``, ``.join``
-            or ``.merge``. Headers of the left (preceeding) ``TfsDataFrame`` when calling
-            ``tfs.frame.concat``.
-        how (str): Type of merge to be performed, either **left** or **right**. If **left*, prioritize keys
-            from **headers_left** in case of duplicate keys. If **right**, prioritize keys from
-            **headers_right** in case of duplicate keys. Case insensitive. If ``None`` is given,
-            an empty dictionary will be returned.
+        headers_left (dict): Headers of caller (left) ``TfsDataFrame`` when calling
+            ``.append``, ``.join`` or ``.merge``. Headers of the left (preceding)
+            ``TfsDataFrame`` when calling ``tfs.frame.concat``.
+        headers_right (dict): Headers of other (right) ``TfsDataFrame`` when calling
+            ``.append``, ``.join`` or ``.merge``. Headers of the right (following)
+            ``TfsDataFrame`` when calling ``tfs.frame.concat``.
+        how (str): Type of merge to be performed, either **left** or **right**. If
+            **left**, prioritize keys from **headers_left** in case of duplicate keys.
+            If **right**, prioritize keys from **headers_right** in case of duplicate
+            keys. Case-insensitive. If ``None`` is given, an empty dictionary will be
+            returned.
 
     Returns:
-        A new ``OrderedDict`` as the merge of the two provided dictionaries.
+        A new dictionary as the merge of the two provided dictionaries.
     """
     accepted_merges: set[str] = {"left", "right", "none"}
     if str(how).lower() not in accepted_merges:  # handles being given None
@@ -172,14 +173,14 @@ def merge_headers(headers_left: dict, headers_right: dict, how: str) -> OrderedD
 
     LOGGER.debug(f"Merging headers with method '{how}'")
     if str(how).lower() == "left":  # we prioritize the contents of headers_left
-        result = headers_right.copy()
+        result: dict = headers_right.copy()
         result.update(headers_left)
     elif str(how).lower() == "right":  # we prioritize the contents of headers_right
-        result = headers_left.copy()
+        result: dict = headers_left.copy()
         result.update(headers_right)
     else:  # we were given None, result will be an empty dict
         result = {}
-    return OrderedDict(result)  # so that the TfsDataFrame still has an OrderedDict as header
+    return result
 
 
 def concat(
diff --git a/tfs/reader.py b/tfs/reader.py
@@ -10,7 +10,6 @@
 import logging
 import pathlib
 import shlex
-from collections import OrderedDict
 from dataclasses import dataclass
 
 import numpy as np
@@ -168,7 +167,7 @@ def read_tfs(
     return tfs_data_frame
 
 
-def read_headers(tfs_file_path: pathlib.Path | str) -> OrderedDict:
+def read_headers(tfs_file_path: pathlib.Path | str) -> dict:
     """
     Parses the top of the **tfs_file_path** and returns the headers.
 
@@ -178,7 +177,7 @@ def read_headers(tfs_file_path: pathlib.Path | str) -> OrderedDict:
             a Path object.
 
     Returns:
-        An ``OrderedDict`` with the headers read from the file.
+        A dictionary with the headers read from the file.
 
 
     Examples:
@@ -207,7 +206,7 @@ def read_headers(tfs_file_path: pathlib.Path | str) -> OrderedDict:
 class _TfsMetaData:
     """A dataclass to encapsulate the metadata read from a TFS file."""
 
-    headers: OrderedDict
+    headers: dict
     non_data_lines: int
     column_names: np.ndarray
     column_types: np.ndarray
@@ -234,7 +233,7 @@ def _read_metadata(tfs_file_path: pathlib.Path | str) -> _TfsMetaData:
     """
     LOGGER.debug("Reading headers and metadata from file")
     tfs_file_path = pathlib.Path(tfs_file_path)
-    headers = OrderedDict()
+    headers = {}
     column_names = column_types = None
 
     # Read the headers, chunk by chunk (line by line) with pandas.read_csv as a
diff --git a/tfs/writer.py b/tfs/writer.py
@@ -9,7 +9,6 @@
 
 import logging
 import pathlib
-from collections import OrderedDict
 
 import numpy as np
 import pandas as pd
@@ -112,7 +111,7 @@ def write_tfs(
         try:
             headers_dict = data_frame.headers
         except AttributeError:
-            headers_dict = OrderedDict()
+            headers_dict = {}
 
     data_frame = data_frame.convert_dtypes(convert_integer=False)