ENH: WAIT_TIME in Python summary

tylerjereddy · tylerjereddy · commit 141d97b1a368 · 2022-12-17T16:20:45.000-07:00
Fixes #860

* include `PNETCDF_FILE_F_WAIT_TIME` in the I/O cost plot in the Python summary reports, along with regression testing and caption changes
* I would have felt more comfortable if the issue had pointed me to a log file with prominent non-zero wait times, because all-zero values will also be the default for the `Wait` category when it isn't valid (even when `PNETCDF` is not used); I mostly decided not to exclude the category when `PNETCDF` is absent because that was easier to code and is accounted for in the caption adjustment anyway
* I'll provide some samples of the new I/O cost plots; note that the x-label squishing is handled separately in gh-883.
diff --git a/darshan-util/pydarshan/darshan/cli/summary.py b/darshan-util/pydarshan/darshan/cli/summary.py
@@ -405,7 +405,8 @@ def register_figures(self):
             "Average (across all ranks) amount of run time that each process "
             "spent performing I/O, broken down by access type. See the right "
             "edge bar graph on heat maps in preceding section to indicate if "
-            "I/O activity was balanced across processes."
+            "I/O activity was balanced across processes. The 'Wait' category "
+            "is only meaningful for PNETCDF asynchronous I/O operations."
         )
         io_cost_params = {
             "section_title": "Cross-Module Comparisons",
diff --git a/darshan-util/pydarshan/darshan/experimental/plots/plot_io_cost.py b/darshan-util/pydarshan/darshan/experimental/plots/plot_io_cost.py
@@ -30,7 +30,7 @@ def get_by_avg_series(df: Any, mod_key: str, nprocs: int) -> Any:
     Returns
     -------
     by_avg_series: a ``pd.Series`` containing the
-    average read, write, and meta times.
+    average read, write, meta, and wait times.
 
     """
     # filter out all except the following columns
@@ -39,11 +39,16 @@ def get_by_avg_series(df: Any, mod_key: str, nprocs: int) -> Any:
         f"{mod_key}_F_WRITE_TIME",
         f"{mod_key}_F_META_TIME",
     ]
+    if "PNETCDF_FILE" in mod_key:
+        cols.append("PNETCDF_FILE_F_WAIT_TIME")
+    else:
+        cols.append("Wait")
     by_avg_series = df.filter(cols, axis=1).sum(axis=0) / nprocs
     # reindex to ensure 3 rows are always created
     by_avg_series = by_avg_series.reindex(cols, fill_value=0.0)
     # rename the columns so the labels are automatically generated when plotting
     name_dict = {cols[0]: "Read", cols[1]: "Write", cols[2]: "Meta"}
+    name_dict[cols[3]] = "Wait"
     by_avg_series.rename(index=name_dict, inplace=True)
     return by_avg_series
 
diff --git a/darshan-util/pydarshan/darshan/tests/test_plot_io_cost.py b/darshan-util/pydarshan/darshan/tests/test_plot_io_cost.py
@@ -21,37 +21,37 @@
             "ior_hdf5_example.darshan",
             pd.DataFrame(
                 np.array([
-                    [0.0196126699, 0.1342029571533203, 0.0074423551],
-                    [0.0196372866, 0.13425052165985107, 0.0475],
-                    [0.016869, 0.086689, 0.097160],
-                    [0.0, 2.5570392608642578e-05, 0.0],
+                    [0.0196126699, 0.1342029571533203, 0.0074423551, 0.0],
+                    [0.0196372866, 0.13425052165985107, 0.0475, 0.0],
+                    [0.016869, 0.086689, 0.097160, 0.0],
+                    [0.0, 2.5570392608642578e-05, 0.0, 0.0],
                 ]),
                 ["POSIX", "MPIIO", "HDF5", "STDIO"],
-                ["Read", "Write", "Meta"],
+                ["Read", "Write", "Meta", "Wait"],
             ),
         ),
         (
             "sample-badost.darshan",
             pd.DataFrame(
                 np.array([
-                    [0.0, 33.48587587394286, 0.5547398688504472],
-                    [0.011203573201783001, 4.632166e-07, 0.135187],
+                    [0.0, 33.48587587394286, 0.5547398688504472, 0.0],
+                    [0.011203573201783001, 4.632166e-07, 0.135187, 0.0],
                 ]),
                 ["POSIX", "STDIO"],
-                ["Read", "Write", "Meta"],
+                ["Read", "Write", "Meta", "Wait"],
             ),
         ),
         (
             "shane_ior-PNETCDF_id438100-438100_11-9-41525-10280033558448664385_1.darshan",
             pd.DataFrame(
                 np.array([
-                [0.000378787518, 0.002514898777, 0.000068306923],
-                [0.000397562981, 0.002540826797, 0.001559376717],
-                [0.000402510166, 0.002579867840, 0.001994967461],
-                [0.000000000000, 0.000120997429, 0.000000000000],
+                [0.000378787518, 0.002514898777, 0.000068306923, 0.0],
+                [0.000397562981, 0.002540826797, 0.001559376717, 0.0],
+                [0.000402510166, 0.002579867840, 0.001994967461, 0.0],
+                [0.000000000000, 0.000120997429, 0.000000000000, 0.0],
                 ]),
                 ["POSIX", "MPIIO", "PNETCDF", "STDIO"],
-                ["Read", "Write", "Meta"],
+                ["Read", "Write", "Meta", "Wait"],
             ),
         ),
     ],
@@ -158,8 +158,8 @@ def test_plot_io_cost_y_ticks_and_labels(logname, expected_yticks):
             ],
         ),
         pd.Series(
-            data=[1.2, .6, 3.0],
-            index=["Read", "Write", "Meta"],
+            data=[1.2, .6, 3.0, 0.0],
+            index=["Read", "Write", "Meta", "Wait"],
         ),
     ),
     (
@@ -177,8 +177,8 @@ def test_plot_io_cost_y_ticks_and_labels(logname, expected_yticks):
             ],
         ),
         pd.Series(
-            data=[3000.0, 300.0, 30.0],
-            index=["Read", "Write", "Meta"],
+            data=[3000.0, 300.0, 30.0, 0.0],
+            index=["Read", "Write", "Meta", "Wait"],
         ),
     ),
     (
@@ -197,8 +197,8 @@ def test_plot_io_cost_y_ticks_and_labels(logname, expected_yticks):
             ],
         ),
         pd.Series(
-            data=[3001.2, 300.6, 33.0],
-            index=["Read", "Write", "Meta"],
+            data=[3001.2, 300.6, 33.0, 0.0],
+            index=["Read", "Write", "Meta", "Wait"],
         ),
     )
 ])
@@ -215,11 +215,11 @@ def test_get_by_avg_series(mod_key, input_df, expected_series):
             "nonmpi_dxt_anonymized.darshan",
             pd.DataFrame(
                 np.array([
-                    [0.281718, 0.504260, 0.170138],
-                    [0.232386, 0.165982, 0.072751],
+                    [0.281718, 0.504260, 0.170138, 0.0],
+                    [0.232386, 0.165982, 0.072751, 0.0],
                 ]),
                 ["POSIX", "STDIO"],
-                ["Read", "Write", "Meta"],
+                ["Read", "Write", "Meta", "Wait"],
             ),
         ),
     ])