darshan-hpc · shanedsnyder · Apr 30, 2025 · Mar 11, 2025 · Apr 25, 2025 · Apr 25, 2025
diff --git a/darshan-util/pydarshan/darshan/cli/summary.py b/darshan-util/pydarshan/darshan/cli/summary.py
@@ -7,7 +7,7 @@
 from collections import OrderedDict
 import importlib.resources as importlib_resources
 
-from typing import Any, Union, Callable
+from typing import Any, Union, Callable, List, Optional
 
 import pandas as pd
 from mako.template import Template
@@ -124,7 +124,7 @@ def generate_fig(self):
         elif isinstance(fig, plot_common_access_table.DarshanReportTable):
             # retrieve html table from `DarshanReportTable`
             self.fig_html = fig.html
-        else:
+        elif fig is not None:
             err_msg = f"Figure of type {type(fig)} not supported."
             raise NotImplementedError(err_msg)
 
@@ -137,21 +137,24 @@ class ReportData:
     ----------
     log_path: path to a darshan log file.
     enable_dxt_heatmap: flag indicating whether DXT heatmaps should be enabled
+    filter_patterns: regex patterns for names to exclude/include
+    filter_mode: whether to "exclude" or "include" the filter patterns
 
     """
-    def __init__(self, log_path: str, enable_dxt_heatmap: bool = False):
+    def __init__(self, log_path: str, enable_dxt_heatmap: bool = False,
+                 filter_patterns: Optional[List[str]] = None, filter_mode: str = "exclude"):
         # store the log path and use it to generate the report
         self.log_path = log_path
         self.enable_dxt_heatmap = enable_dxt_heatmap
         # store the report
         self.report = darshan.DarshanReport(log_path, read_all=False)
         # read only generic module data and heatmap data by default
-        self.report.read_all_generic_records()
+        self.report.read_all_generic_records(filter_patterns=filter_patterns, filter_mode=filter_mode)
         if "HEATMAP" in self.report.data['modules']:
             self.report.read_all_heatmap_records()
         # if DXT heatmaps requested, additionally read-in DXT data
         if self.enable_dxt_heatmap:
-            self.report.read_all_dxt_records()
+            self.report.read_all_dxt_records(filter_patterns=filter_patterns, filter_mode=filter_mode)
         # create the header/footer
         self.get_header()
         self.get_footer()
@@ -496,7 +499,11 @@ def register_figures(self):
         elif "PNETCDF_FILE" in self.report.modules:
             opcounts_mods.append("PNETCDF_FILE")
 
-        for mod in self.report.modules:
+        for mod in self.report.records:
+            # skip over modules with no records -- this likely means
+            # records in the log were filtered out via name exclusions
+            if len(self.report.records[mod]) == 0:
+                continue
 
             if "H5" in mod:
                 sect_title = "Per-Module Statistics: HDF5"
@@ -633,6 +640,12 @@ def build_sections(self):
         """
         self.sections = {}
         for fig in self.figures:
+            # skip empty figures that can be generated by report sections
+            # "Data Access by Category" and "Cross-Module Comparisons"
+            if (fig.fig_html == None and
+                (fig.section_title == "Data Access by Category" or
+                 fig.section_title == "Cross-Module Comparisons")):
+                continue
             # if a section title is not already in sections, add
             # the section title and a corresponding empty list
             # to store its figures
@@ -669,6 +682,16 @@ def setup_parser(parser: argparse.ArgumentParser):
         action="store_true",
         help="Enable DXT-based versions of I/O activity heatmaps."
     )
+    parser.add_argument(
+        "--exclude_names",
+        action='append',
+        help="regex patterns for file record names to exclude in summary report"
+    )
+    parser.add_argument(
+        "--include_names",
+        action='append',
+        help="regex patterns for file record names to include in summary report"
+     )
 
 
 def main(args: Union[Any, None] = None):
@@ -687,6 +710,17 @@ def main(args: Union[Any, None] = None):
 
     log_path = args.log_path
     enable_dxt_heatmap = args.enable_dxt_heatmap
+    filter_patterns=None
+    filter_mode="exclude"
+    if args.exclude_names and args.include_names:
+        print('Error: only one of --exclude_names and --include_names may be used.')
+        sys.exit(1)
+    elif args.exclude_names:
+        filter_patterns = args.exclude_names
+        filter_mode = "exclude"
+    elif args.include_names:
+        filter_patterns = args.include_names
+        filter_mode = "include"
 
     if args.output is None:
         # if no output is provided, use the log file
@@ -699,7 +733,9 @@ def main(args: Union[Any, None] = None):
     # collect the report data to feed into the template
     report_data = ReportData(
         log_path=log_path,
-        enable_dxt_heatmap=enable_dxt_heatmap
+        enable_dxt_heatmap=enable_dxt_heatmap,
+        filter_patterns=filter_patterns,
+        filter_mode=filter_mode
     )
 
     with importlib_resources.path(darshan.cli, "base.html") as base_path:

diff --git a/darshan-util/pydarshan/darshan/experimental/aggregators/agg_ioops.py b/darshan-util/pydarshan/darshan/experimental/aggregators/agg_ioops.py
@@ -28,7 +28,7 @@ def agg_ioops(self, mode='append'):
     for mod in mods:
 
         # check records for module are present
-        if mod not in recs:
+        if mod not in recs or len(recs[mod]) == 0:
             continue
 
         agg = None

diff --git a/darshan-util/pydarshan/darshan/experimental/plots/data_access_by_filesystem.py b/darshan-util/pydarshan/darshan/experimental/plots/data_access_by_filesystem.py
@@ -208,11 +208,11 @@ def rec_to_rw_counter_dfs(report: Any,
     rec_counters = pd.DataFrame()
     df_reads = pd.DataFrame()
     df_writes = pd.DataFrame()
-    if "POSIX" in report.modules:
+    if "POSIX" in report.modules and len(report.records["POSIX"]) > 0:
         rec_counters = pd.concat(objs=(rec_counters, report.records["POSIX"].to_df()['counters']))
         df_reads = pd.concat(objs=(df_reads, rec_counters.loc[rec_counters[f'POSIX_BYTES_READ'] >= 1]))
         df_writes = pd.concat(objs=(df_writes, rec_counters.loc[rec_counters[f'POSIX_BYTES_WRITTEN'] >= 1]))
-    if "STDIO" in report.modules:
+    if "STDIO" in report.modules and len(report.records["STDIO"]) > 0:
         rec_counters = pd.concat(objs=(rec_counters, report.records["STDIO"].to_df()['counters']))
         df_reads = pd.concat(objs=(df_reads, rec_counters.loc[rec_counters[f'STDIO_BYTES_READ'] >= 1]))
         df_writes = pd.concat(objs=(df_writes, rec_counters.loc[rec_counters[f'STDIO_BYTES_WRITTEN'] >= 1]))
@@ -632,7 +632,7 @@ def plot_with_report(report: darshan.DarshanReport,
     Returns
     -------
 
-    fig: matplotlib figure object
+    fig: matplotlib figure object or None if no data to plot
     """
     fig = plt.figure()
     file_id_dict = report.data["name_records"]
@@ -648,6 +648,10 @@ def plot_with_report(report: darshan.DarshanReport,
             for ident in allowed_ids:
                 allowed_file_id_dict[ident] = file_id_dict[ident]
 
+    if len(allowed_file_id_dict) == 0:
+        # no data, likely because all records have been filtered out
+        return None
+
     filesystem_roots = identify_filesystems(file_id_dict=allowed_file_id_dict,
                                             verbose=verbose)
     # NOTE: this is a bit ugly, STDIO and POSIX are both combined

diff --git a/darshan-util/pydarshan/darshan/experimental/plots/plot_io_cost.py b/darshan-util/pydarshan/darshan/experimental/plots/plot_io_cost.py
@@ -111,7 +111,7 @@ def get_io_cost_df(report: darshan.DarshanReport) -> Any:
     io_cost_dict = {}
     supported_modules = ["POSIX", "MPI-IO", "STDIO", "H5F", "H5D", "PNETCDF_FILE", "PNETCDF_VAR"]
     for mod_key in report.modules:
-        if mod_key in supported_modules:
+        if mod_key in supported_modules and len(report.records[mod_key]) > 0:
             # collect the records in dataframe form
             recs = report.records[mod_key].to_df(attach=None)
             # correct the MPI module key
@@ -150,13 +150,18 @@ def plot_io_cost(report: darshan.DarshanReport) -> Any:
     Returns
     -------
     io_cost_fig: a ``matplotlib.pyplot.figure`` object containing a
-    stacked bar graph of the average read, write, and metadata times.
+    stacked bar graph of the average read, write, and metadata times --
+    or None when there is no data to plot
 
     """
     # get the run time from the report metadata
     runtime = report.metadata["job"]["run_time"]
     # get the I/O cost dataframe
     io_cost_df = get_io_cost_df(report=report)
+    if io_cost_df.empty:
+        # return None (no figure) if there's no data
+        # this typically occurs when all module records have been filtered out
+        return None
     # generate a figure with 2 y axes
     io_cost_fig = plt.figure(figsize=(4.5, 4))
     ax_raw = io_cost_fig.add_subplot(111)