Update pytests for job_stats/file_stats to parse and validate CSV output

Shane Snyder · Shane Snyder · commit 9ab0d49a97ac · 2025-04-28T21:40:17.000-05:00
diff --git a/darshan-util/pydarshan/darshan/tests/test_file_stats.py b/darshan-util/pydarshan/darshan/tests/test_file_stats.py
@@ -2,13 +2,17 @@
 from unittest import mock
 from darshan.log_utils import get_log_path
 from darshan.cli import file_stats
+import pandas as pd
+import io
 import pytest
+
 @pytest.mark.parametrize(
     "argv", [
-        [get_log_path("e3sm_io_heatmap_only.darshan"),
-         "-mSTDIO",
-         "-oSTDIO_BYTES_READ",
-         "-n5"],
+        [get_log_path("shane_macsio_id29959_5-22-32552-7035573431850780836_1590156158.darshan"),
+         "--csv",
+         "--module=POSIX",
+         "--order_by=bytes_written",
+         "--limit=5"],
     ]
 )
 def test_file_stats(argv, capsys):
@@ -19,7 +23,38 @@ def test_file_stats(argv, capsys):
         file_stats.setup_parser(parser=parser)
         # parse the input arguments
         args = parser.parse_args(argv)
+    # run once with CSV output and spot check some of the output
     file_stats.main(args=args)
     captured = capsys.readouterr()
-    assert "15920181672442173319" in captured.out
-
+    assert not captured.err
+    assert captured.out
+    df = pd.read_csv(io.StringIO(captured.out))
+    assert len(df) == 3
+    # check the first file (most bytes written)
+    expected_first = {
+        'file': '/tmp/test/macsio_hdf5_000.h5',
+        'bytes_read': 39816960,
+        'bytes_written': 54579416,
+        'reads': 6,
+        'writes': 7699,
+        'total_jobs': 1
+    }
+    row = df.iloc[0]
+    for key, value in expected_first.items():
+        assert row[key] == value
+    # check the last file (least bytes written)
+    expected_last = {
+        'file': '/tmp/test/macsio-timings.log',
+        'bytes_read': 0,
+        'bytes_written': 12460,
+        'reads': 0,
+        'writes': 51,
+        'total_jobs': 1
+    }
+    row = df.iloc[-1]
+    for key, value in expected_last.items():
+        assert row[key] == value
+    assert expected_first['bytes_written'] > expected_last['bytes_written']
+    # run again with CSV disabled to ensure the default Rich print mode runs without error
+    args.csv = False
+    file_stats.main(args=args)
diff --git a/darshan-util/pydarshan/darshan/tests/test_job_stats.py b/darshan-util/pydarshan/darshan/tests/test_job_stats.py
@@ -2,13 +2,18 @@
 from unittest import mock
 from darshan.log_utils import get_log_path
 from darshan.cli import job_stats
+from numpy.testing import assert_allclose
+import pandas as pd
+import io
 import pytest
+
 @pytest.mark.parametrize(
     "argv", [
-        [get_log_path("e3sm_io_heatmap_only.darshan"),
-         "-mSTDIO",
-         "-ototal_bytes",
-         "-n5"],
+        [get_log_path("sample-badost.darshan"),
+         "--csv",
+         "--module=STDIO",
+         "--order_by=total_bytes",
+         "--limit=5"],
     ]
 )
 def test_job_stats(argv, capsys):
@@ -19,6 +24,30 @@ def test_job_stats(argv, capsys):
         job_stats.setup_parser(parser=parser)
         # parse the input arguments
         args = parser.parse_args(argv)
+    # run once with CSV output and spot check some of the output
     job_stats.main(args=args)
     captured = capsys.readouterr()
-    assert "3.258853" in captured.out
+    assert not captured.err
+    assert captured.out
+    df = pd.read_csv(io.StringIO(captured.out))
+    assert len(df) == 1
+    expected = {
+        'log_file': 'sample-badost.darshan',
+        'job_id': 6265799,
+        'nprocs': 2048,
+        'run_time': 780.0,
+        'perf_by_slowest': 8.249708e+06,
+        'time_by_slowest': 0.200828,
+        'total_bytes': 1656773,
+        'total_files': 3,
+        'partial_flag': False
+    }
+    row = df.iloc[0]
+    for key, value in expected.items():
+        if key == 'perf_by_slowest' or key == 'time_by_slowest':
+            assert_allclose(row[key], value, rtol=1e-5, atol=1e-8)
+        else:
+            assert row[key] == value
+    # run again with CSV disabled to ensure the default Rich print mode runs without error
+    args.csv = False
+    job_stats.main(args=args)