Commit edc12c3

Refactored the glob_feature.py script and improved DataFrame creation for the e3sm_io_heatmap_only.darshan log file. Also relocated the script to a more suitable spot among the project's other Python files.
1 parent 397780a commit edc12c3

2 files changed (+89 additions, -97 deletions)

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
# Creates a DataFrame with two columns ("filename_glob" and "glob_count") based on the files read by a .darshan file.
# It uses sequence matching and grouping techniques to group similar file paths together and generates an HTML report of the grouped paths and their counts.
# Command to run: python glob_feature.py -p path/to/log/file.darshan -o path/to/output.html


import argparse
import pandas as pd
import difflib
import darshan
import re
import os


def generalize_filename_glob(df):
    paths = df["filename_glob"].tolist()
    grouped_paths = []

    # Add each path to the first existing group whose representative
    # (the group's first path) has a similarity ratio of at least 0.8.
    for i in range(len(paths)):
        if not grouped_paths:
            grouped_paths.append((paths[i],))
        else:
            is_grouped = False
            for j, group in enumerate(grouped_paths):
                matcher = difflib.SequenceMatcher(None, paths[i], group[0])
                similarity_ratio = matcher.ratio()
                if similarity_ratio >= 0.8:
                    grouped_paths[j] = group + (paths[i],)
                    is_grouped = True
                    break
            if not is_grouped:
                grouped_paths.append((paths[i],))

    print("grouped paths list is", grouped_paths)

    # Collapse each multi-path group into a single glob-like pattern by
    # replacing the digit that follows the common prefix with a literal "\d".
    new_paths = []
    for group in grouped_paths:
        if len(group) > 1:
            common_prefix = os.path.commonprefix(group)
            pattern = r"({}.*)\d(.*)".format(common_prefix)
            modified_path = re.sub(pattern, r"\1\\d\2", group[0])
            new_paths.append((modified_path, len(group)))
        else:
            new_paths.append((group[0], 1))

    new_paths = [path for path in new_paths if path[0]]

    if len(new_paths) > len(df):
        new_paths = new_paths[:len(df)]

    print("new paths are", new_paths)
    return new_paths


def main(log_path, output_path):
    report = darshan.DarshanReport(log_path)

    df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"])

    # Keep only name records that look like file paths.
    df = df[df["filename_glob"].str.contains(r"/.*")]
    df.reset_index(drop=True, inplace=True)  # Reset the index

    new_paths = generalize_filename_glob(df)
    df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"])
    df = df.reset_index(drop=True)
    df = df.sort_values(by="glob_count", ascending=False)

    style = df.style.background_gradient(axis=0, cmap="viridis")
    style.set_table_styles([
        {"selector": "", "props": [("border", "1px solid grey")]},
        {"selector": "tbody td", "props": [("border", "1px solid grey")]},
        {"selector": "th", "props": [("border", "1px solid grey")]}
    ])

    style = style.hide_index()
    html = style.render()

    with open(output_path, "w") as html_file:
        html_file.write(html)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--log-path', type=str, help="Path to the log file")
    parser.add_argument('-o', '--output_path', type=str, help="Path to the output file")
    args = parser.parse_args()
    main(log_path=args.log_path, output_path=args.output_path)
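
For context (not part of the diff above), a minimal sketch of the grouping and generalization idea the script relies on, using made-up paths; it exercises only the stdlib calls that already appear in the new file:

import difflib
import os
import re

# Hypothetical paths: two that differ only in a step number, plus one unrelated file.
paths = ["/out/step1.nc", "/out/step2.nc", "/cfg/settings.yaml"]

a, b, c = paths
print(difflib.SequenceMatcher(None, a, b).ratio() >= 0.8)   # True -> a and b fall into one group
print(difflib.SequenceMatcher(None, a, c).ratio() >= 0.8)   # False -> c stays in its own group

prefix = os.path.commonprefix([a, b])                        # "/out/step"
pattern = r"({}.*)\d(.*)".format(prefix)
print(re.sub(pattern, r"\1\\d\2", a))                        # "/out/step\d.nc"

Note that Styler.hide_index() and Styler.render() used in the script are deprecated in recent pandas releases; the equivalents there are Styler.hide(axis="index") and Styler.to_html().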

git_project/glob_feature/glob_feature.py

Lines changed: 0 additions & 97 deletions
This file was deleted.
