|
# Creates a DataFrame with two columns ("filename_glob" and "glob_count") based on the files read by the job.
# It uses sequence matching and grouping techniques to group similar file paths together and generate glob-style patterns with counts.
| 3 | +# Command to run python glob_feature.py -p path/to/log/file.darshan |
| 4 | + |
| 5 | + |
| 6 | +import argparse |
| 7 | +import pandas as pd |
| 8 | +import difflib |
| 9 | +import darshan |
| 10 | +import re |
| 11 | +import os |
| 12 | + |
| 13 | + |
def generalize_filename_glob(df):
    """Group similar file paths and collapse each group into a glob-like pattern.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "filename_glob" column of path strings.

    Returns
    -------
    list[tuple[str, int]]
        One ``(pattern, count)`` tuple per group. For multi-path groups the
        representative path has the last digit after the common prefix
        replaced with a literal ``\\d``; singleton groups keep their path
        unchanged with a count of 1.
    """
    paths = df["filename_glob"].tolist()
    grouped_paths = []

    # Greedy single-pass grouping: each path joins the first existing group
    # whose representative (first member) is at least 80% similar.
    for path in paths:
        for idx, group in enumerate(grouped_paths):
            similarity_ratio = difflib.SequenceMatcher(None, path, group[0]).ratio()
            if similarity_ratio >= 0.8:
                grouped_paths[idx] = group + (path,)
                break
        else:
            grouped_paths.append((path,))

    print("grouped paths list is", grouped_paths)

    new_paths = []
    for group in grouped_paths:
        if len(group) > 1:
            common_prefix = os.path.commonprefix(group)
            # Escape the prefix before embedding it in the pattern: real paths
            # can contain regex metacharacters ("(", "+", "[", ...) which would
            # otherwise corrupt the capture groups or raise re.error.
            pattern = r"({}.*)\d(.*)".format(re.escape(common_prefix))
            modified_path = re.sub(pattern, r"\1\\d\2", group[0])
            new_paths.append((modified_path, len(group)))
        else:
            new_paths.append((group[0], 1))

    # Drop any empty patterns and never return more rows than the input had.
    new_paths = [entry for entry in new_paths if entry[0]]
    if len(new_paths) > len(df):
        new_paths = new_paths[:len(df)]

    print("new paths are", new_paths)
    return new_paths
| 52 | + |
| 53 | + |
| 54 | + |
| 55 | + |
def main(log_path, output_path):
    """Read a darshan log, derive glob patterns, and write a styled HTML table.

    Parameters
    ----------
    log_path : str
        Path to the ``.darshan`` log file.
    output_path : str
        Destination path for the rendered HTML report.
    """
    report = darshan.DarshanReport(log_path)

    # One row per name record; keep only entries that look like file paths.
    df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"])
    df = df[df["filename_glob"].str.contains(r"/.*")]
    df.reset_index(drop=True, inplace=True)

    # Collapse similar paths into (pattern, count) rows, most frequent first.
    df = pd.DataFrame(generalize_filename_glob(df), columns=["filename_glob", "glob_count"])
    df = df.reset_index(drop=True).sort_values(by="glob_count", ascending=False)

    # Color rows by value and right-align the counts; add grid borders.
    styler = df.style.background_gradient(axis=0, cmap="viridis")
    styler.set_properties(subset=["glob_count"], **{"text-align": "right"})
    styler.set_table_styles([
        {"selector": "", "props": [("border", "1px solid grey")]},
        {"selector": "tbody td", "props": [("border", "1px solid grey")]},
        {"selector": "th", "props": [("border", "1px solid grey")]},
    ])

    # html = styler.render() # use this when running python glob_feature.py -p /path/to/logfile

    html = styler.to_html()  # use when running pytest

    with open(output_path, "w") as html_file:
        html_file.write(html)
| 91 | + |
if __name__ == "__main__":
    # CLI entry point: collect the log/output paths and hand off to main().
    cli = argparse.ArgumentParser()
    cli.add_argument('-p', '--log-path', type=str, help="Path to the log file")
    cli.add_argument('-o', '--output_path', type=str, help="Path to the output file")
    parsed = cli.parse_args()
    main(log_path=parsed.log_path, output_path=parsed.output_path)
| 98 | + |
0 commit comments