Fixes all local file loader metadata to have a uniform shape

elijahbenizzy · elijahbenizzy · commit 4c0e5de6f048 · 2024-02-05T20:12:46.000-08:00
Previously the metadata shape differed depending on whether the data
was a dataframe. We need a uniform shape for downstream parsing. The new
format contains one or more of:
1. sql_metadata
2. file_metadata
3. dataframe_metadata

The utils functions now return dicts already nested under the matching
key, and consumers just have to merge them. This is all internal-facing
(used only for diagnostics), so it's OK to change it.
diff --git a/hamilton/io/utils.py b/hamilton/io/utils.py
@@ -22,10 +22,12 @@ def get_file_metadata(path: str) -> Dict[str, Any]:
     - the current time
     """
     return {
-        "size": os.path.getsize(path),
-        "path": path,
-        "last_modified": os.path.getmtime(path),
-        "timestamp": datetime.now().utcnow().timestamp(),
+        FILE_METADATA: {
+            "size": os.path.getsize(path),
+            "path": path,
+            "last_modified": os.path.getmtime(path),
+            "timestamp": datetime.now().utcnow().timestamp(),
+        }
     }
 
 
@@ -42,10 +44,12 @@ def get_dataframe_metadata(df: pd.DataFrame) -> Dict[str, Any]:
     - the data types
     """
     return {
-        "rows": len(df),
-        "columns": len(df.columns),
-        "column_names": list(df.columns),
-        "datatypes": [str(t) for t in list(df.dtypes)],  # for serialization purposes
+        DATAFRAME_METADATA: {
+            "rows": len(df),
+            "columns": len(df.columns),
+            "column_names": list(df.columns),
+            "datatypes": [str(t) for t in list(df.dtypes)],  # for serialization purposes
+        }
     }
 
 
@@ -67,7 +71,7 @@ def get_file_and_dataframe_metadata(path: str, df: pd.DataFrame) -> Dict[str, An
         - the column names
         - the data types
     """
-    return {FILE_METADATA: get_file_metadata(path), DATAFRAME_METADATA: get_dataframe_metadata(df)}
+    return {**get_file_metadata(path), **get_dataframe_metadata(df)}
 
 
 def get_sql_metadata(query_or_table: str, results: Union[int, pd.DataFrame]) -> Dict[str, Any]:
@@ -91,8 +95,10 @@ def get_sql_metadata(query_or_table: str, results: Union[int, pd.DataFrame]) ->
     else:
         rows = None
     return {
-        "rows": rows,
-        "query": query,
-        "table_name": table_name,
-        "timestamp": datetime.now().utcnow().timestamp(),
+        SQL_METADATA: {
+            "rows": rows,
+            "query": query,
+            "table_name": table_name,
+            "timestamp": datetime.now().utcnow().timestamp(),
+        }
     }
diff --git a/hamilton/plugins/pandas_extensions.py b/hamilton/plugins/pandas_extensions.py
@@ -733,11 +733,7 @@ def load_data(self, type_: Type) -> Tuple[DATAFRAME_TYPE, Dict[str, Any]]:
         df = pd.read_sql(self.query_or_table, self.db_connection, **self._get_loading_kwargs())
         sql_metadata = utils.get_sql_metadata(self.query_or_table, df)
         df_metadata = utils.get_dataframe_metadata(df)
-        metadata = {
-            utils.SQL_METADATA: sql_metadata,
-            utils.DATAFRAME_METADATA: df_metadata,
-        }
-        return df, metadata
+        return df, {**sql_metadata, **df_metadata}
 
     @classmethod
     def name(cls) -> str:
@@ -793,11 +789,7 @@ def save_data(self, data: DATAFRAME_TYPE) -> Dict[str, Any]:
         results = data.to_sql(self.table_name, self.db_connection, **self._get_saving_kwargs())
         sql_metadata = utils.get_sql_metadata(self.table_name, results)
         df_metadata = utils.get_dataframe_metadata(data)
-        metadata = {
-            utils.SQL_METADATA: sql_metadata,
-            utils.DATAFRAME_METADATA: df_metadata,
-        }
-        return metadata
+        return {**sql_metadata, **df_metadata}
 
     @classmethod
     def name(cls) -> str:
diff --git a/tests/io/test_utils.py b/tests/io/test_utils.py
@@ -1,16 +1,16 @@
 import pandas as pd
 
-from hamilton.io.utils import get_sql_metadata
+from hamilton.io.utils import SQL_METADATA, get_sql_metadata
 
 
 def test_get_sql_metadata():
     results = 5
     table = "foo"
     query = "SELECT foo FROM bar"
     df = pd.DataFrame({"foo": ["bar"]})
-    metadata1 = get_sql_metadata(table, df)
-    metadata2 = get_sql_metadata(query, results)
-    metadata3 = get_sql_metadata(query, "foo")
+    metadata1 = get_sql_metadata(table, df)[SQL_METADATA]
+    metadata2 = get_sql_metadata(query, results)[SQL_METADATA]
+    metadata3 = get_sql_metadata(query, "foo")[SQL_METADATA]
     assert metadata1["table_name"] == table
     assert metadata1["rows"] == 1
     assert metadata2["query"] == query
diff --git a/tests/plugins/test_lightgbm_extensions.py b/tests/plugins/test_lightgbm_extensions.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pytest
 
+from hamilton.io.utils import FILE_METADATA
 from hamilton.plugins.lightgbm_extensions import LightGBMFileReader, LightGBMFileWriter
 
 
@@ -40,7 +41,7 @@ def test_lightgbm_file_writer(
     metadata = writer.save_data(fitted_lightgbm)
 
     assert model_path.exists()
-    assert metadata["path"] == model_path
+    assert metadata[FILE_METADATA]["path"] == model_path
 
 
 @pytest.mark.parametrize(
diff --git a/tests/plugins/test_matplotlib_extensions.py b/tests/plugins/test_matplotlib_extensions.py
@@ -4,6 +4,7 @@
 import matplotlib.pyplot as plt
 import pytest
 
+from hamilton.io.utils import FILE_METADATA
 from hamilton.plugins.matplotlib_extensions import MatplotlibWriter
 
 
@@ -27,4 +28,4 @@ def test_plotly_static_writer(figure: matplotlib.figure.Figure, tmp_path: pathli
     metadata = writer.save_data(figure)
 
     assert file_path.exists()
-    assert metadata["path"] == file_path
+    assert metadata[FILE_METADATA]["path"] == file_path
diff --git a/tests/plugins/test_numpy_extensions.py b/tests/plugins/test_numpy_extensions.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 
+from hamilton.io.utils import FILE_METADATA
 from hamilton.plugins.numpy_extensions import NumpyNpyReader, NumpyNpyWriter
 
 
@@ -18,7 +19,7 @@ def test_numpy_file_writer(array: np.ndarray, tmp_path: pathlib.Path) -> None:
     metadata = writer.save_data(array)
 
     assert file_path.exists()
-    assert metadata["path"] == file_path
+    assert metadata[FILE_METADATA]["path"] == file_path
 
 
 def test_numpy_file_reader(array: np.ndarray, tmp_path: pathlib.Path) -> None:
diff --git a/tests/plugins/test_plotly_extensions.py b/tests/plugins/test_plotly_extensions.py
@@ -3,6 +3,7 @@
 import plotly.graph_objects as go
 import pytest
 
+from hamilton.io.utils import FILE_METADATA
 from hamilton.plugins.plotly_extensions import PlotlyInteractiveWriter, PlotlyStaticWriter
 
 
@@ -18,7 +19,7 @@ def test_plotly_static_writer(figure: go.Figure, tmp_path: pathlib.Path) -> None
     metadata = writer.save_data(figure)
 
     assert file_path.exists()
-    assert metadata["path"] == file_path
+    assert metadata[FILE_METADATA]["path"] == file_path
 
 
 def test_plotly_interactive_writer(figure: go.Figure, tmp_path: pathlib.Path) -> None:
@@ -28,4 +29,4 @@ def test_plotly_interactive_writer(figure: go.Figure, tmp_path: pathlib.Path) ->
     metadata = writer.save_data(figure)
 
     assert file_path.exists()
-    assert metadata["path"] == file_path
+    assert metadata[FILE_METADATA]["path"] == file_path
diff --git a/tests/plugins/test_sklearn_plot_extensions.py b/tests/plugins/test_sklearn_plot_extensions.py
@@ -15,6 +15,7 @@
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
 
+from hamilton.io.utils import FILE_METADATA
 from hamilton.plugins.sklearn_plot_extensions import SklearnPlotSaver
 
 if hasattr(metrics, "PredictionErrorDisplay"):
@@ -191,7 +192,7 @@ def test_cm_plot_saver(
     metadata = writer.save_data(confusion_matrix_display)
 
     assert plot_path.exists()
-    assert metadata["path"] == plot_path
+    assert metadata[FILE_METADATA]["path"] == plot_path
 
 
 def test_det_curve_display(
@@ -203,7 +204,7 @@ def test_det_curve_display(
     metadata = writer.save_data(det_curve_display)
 
     assert plot_path.exists()
-    assert metadata["path"] == plot_path
+    assert metadata[FILE_METADATA]["path"] == plot_path
 
 
 def test_precision_recall_display(
@@ -215,7 +216,7 @@ def test_precision_recall_display(
     metadata = writer.save_data(precision_recall_display)
 
     assert plot_path.exists()
-    assert metadata["path"] == plot_path
+    assert metadata[FILE_METADATA]["path"] == plot_path
 
 
 @pytest.mark.skipif(sys.version_info < (3, 8), reason="requires python3.8 or higher")
@@ -228,7 +229,7 @@ def test_prediction_error_display(
     metadata = writer.save_data(prediction_error_display)
 
     assert plot_path.exists()
-    assert metadata["path"] == plot_path
+    assert metadata[FILE_METADATA]["path"] == plot_path
 
 
 def test_roc_curve_display(
@@ -240,7 +241,7 @@ def test_roc_curve_display(
     metadata = writer.save_data(roc_curve_display)
 
     assert plot_path.exists()
-    assert metadata["path"] == plot_path
+    assert metadata[FILE_METADATA]["path"] == plot_path
 
 
 def test_calibration_display(
@@ -252,7 +253,7 @@ def test_calibration_display(
     metadata = writer.save_data(calibration_display)
 
     assert plot_path.exists()
-    assert metadata["path"] == plot_path
+    assert metadata[FILE_METADATA]["path"] == plot_path
 
 
 @pytest.mark.skipif(sys.version_info < (3, 8), reason="requires python3.8 or higher")
@@ -265,7 +266,7 @@ def test_decision_boundary_display(
     metadata = writer.save_data(decision_boundary_display)
 
     assert plot_path.exists()
-    assert metadata["path"] == plot_path
+    assert metadata[FILE_METADATA]["path"] == plot_path
 
 
 @pytest.mark.skipif(sys.version_info < (3, 8), reason="requires python3.8 or higher")
@@ -278,7 +279,7 @@ def test_partial_dependence_display(
     metadata = writer.save_data(partial_dependence_display)
 
     assert plot_path.exists()
-    assert metadata["path"] == plot_path
+    assert metadata[FILE_METADATA]["path"] == plot_path
 
 
 @pytest.mark.skipif(sys.version_info < (3, 8), reason="requires python3.8 or higher")
@@ -291,7 +292,7 @@ def test_learning_curve_display(
     metadata = writer.save_data(learning_curve_display)
 
     assert plot_path.exists()
-    assert metadata["path"] == plot_path
+    assert metadata[FILE_METADATA]["path"] == plot_path
 
 
 @pytest.mark.skipif(sys.version_info < (3, 8), reason="requires python3.8 or higher")
@@ -304,4 +305,4 @@ def test_validation_curve_display(
     metadata = writer.save_data(validation_curve_display)
 
     assert plot_path.exists()
-    assert metadata["path"] == plot_path
+    assert metadata[FILE_METADATA]["path"] == plot_path
diff --git a/tests/plugins/test_xgboost_extensions.py b/tests/plugins/test_xgboost_extensions.py
@@ -4,6 +4,7 @@
 import xgboost
 from sklearn.utils.validation import check_is_fitted
 
+from hamilton.io.utils import FILE_METADATA
 from hamilton.plugins.xgboost_extensions import XGBoostJsonReader, XGBoostJsonWriter
 
 
@@ -30,7 +31,7 @@ def test_xgboost_model_json_writer(
     metadata = writer.save_data(fitted_xgboost_model)
 
     assert model_path.exists()
-    assert metadata["path"] == model_path
+    assert metadata[FILE_METADATA]["path"] == model_path
 
 
 def test_xgboost_model_json_reader(
@@ -55,7 +56,7 @@ def test_xgboost_booster_json_writer(
     metadata = writer.save_data(fitted_xgboost_booster)
 
     assert booster_path.exists()
-    assert metadata["path"] == booster_path
+    assert metadata[FILE_METADATA]["path"] == booster_path
 
 
 def test_xgboost_booster_json_reader(