Skip to content

Commit 8277396

Browse files
Updated glob_feature.py, which creates a DataFrame with filename_glob and glob_count columns
1 parent dce2c0a commit 8277396

File tree

1 file changed

+52
-13
lines changed

1 file changed

+52
-13
lines changed

git_project/glob_feature/glob_feature.py

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -7,42 +7,80 @@
77
import pandas as pd
88
import difflib
99
import darshan
10+
import re
11+
import os
1012

1113

def make_path_grouper(threshold=0.8):
    """Build a stateful function that maps similar consecutive paths to one representative.

    The returned callable keeps the current representative path inside a
    ``difflib.SequenceMatcher``. Each new path is compared against that
    representative only, so the grouping depends on the order in which paths
    are fed in.

    Args:
        threshold: Minimum ``SequenceMatcher.ratio()`` (0.0 to 1.0) for a path
            to be folded into the current representative. Raise it for
            stricter grouping.

    Returns:
        A function ``group_paths(path) -> str`` that returns the current
        representative when ``path`` is similar enough, and otherwise makes
        ``path`` the new representative and returns it unchanged.
    """
    matcher = difflib.SequenceMatcher()

    def group_paths(paths):
        # No representative yet (seq1 is still the empty default): adopt this path.
        if not matcher.a:
            matcher.set_seq1(paths)
            return paths

        matcher.set_seq2(paths)
        if matcher.ratio() >= threshold:
            return matcher.a

        # Too different from the current representative: start a new group.
        matcher.set_seq1(paths)
        return paths

    return group_paths
2829

2930

3031
def regex_df_condenser(df, paths):
    """Condense the paths in ``df["filename_glob"]`` into directory-level globs.

    Paths are assigned to groups by ``make_path_grouper`` in row order. Each
    group is reduced to the longest directory prefix shared by *all* of its
    member paths, and groups that end up with the same prefix are merged so
    that every glob appears once with its total count.

    Args:
        df: DataFrame with a ``filename_glob`` column of path strings. It is
            not modified.
        paths: Unused; kept so existing callers keep working.

    Returns:
        A new DataFrame with the columns ``filename_glob`` (shared prefix),
        ``glob_count`` (number of input paths covered by that prefix) and
        ``regex`` (the escaped prefix followed by ``.*``), sorted by
        ``glob_count`` in descending order.
    """

    def find_common_prefix(paths):
        # commonprefix works character by character, so cut the result back to
        # the last separator to avoid ending in the middle of a path component.
        common_prefix = os.path.commonprefix(list(paths))
        last_separator = common_prefix.rfind(os.path.sep)
        if last_separator >= 0:
            return common_prefix[:last_separator + 1]
        return common_prefix

    def as_count_frame(counts):
        # Explicit dtypes keep the columns well-typed even when there are no rows.
        frame = pd.DataFrame(
            {
                "filename_glob": pd.Series(list(counts), dtype="object"),
                "glob_count": pd.Series(list(counts.values()), dtype="int64"),
            }
        )
        return frame.sort_values(by="glob_count", ascending=False, kind="stable")

    path_grouper_func = make_path_grouper()
    original_paths = df["filename_glob"]
    # Compute the group key on the side instead of overwriting the caller's column.
    group_keys = original_paths.apply(path_grouper_func)
    print("Paths after grouping:")
    print(group_keys)

    # Remember every original path per group; the shared prefix must be taken
    # over the real members, not over the single representative key.
    members = {}
    for key, path in zip(group_keys, original_paths):
        members.setdefault(key, []).append(path)

    grouped = as_count_frame({key: len(group) for key, group in members.items()})
    print("Paths after grouping and counting:")
    print(grouped)

    # Different groups can share a directory prefix; add their counts together
    # so each glob is reported exactly once.
    prefix_counts = {}
    for group in members.values():
        prefix = find_common_prefix(group)
        prefix_counts[prefix] = prefix_counts.get(prefix, 0) + len(group)

    result = as_count_frame(prefix_counts)
    print("Paths after modifying filename_glob:")
    print(result)

    result["regex"] = result["filename_glob"].map(lambda prefix: re.escape(prefix) + r".*")
    print("Paths after applying regex:")
    print(result)

    return result
3672

3773

38-
def main(log_path):
74+
75+
def main(log_path, output_path):
3976
report = darshan.DarshanReport(log_path)
77+
78+
4079
df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"])
80+
4181
df = df[df["filename_glob"].str.contains(r"/.*")]
4282
df["glob_count"] = 1
4383
df = regex_df_condenser(df, df["filename_glob"])
44-
df.sort_values(by="glob_count", inplace=True, ascending=False)
45-
4684

4785
style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"])
4886
style.hide(axis="index")
@@ -53,14 +91,15 @@ def main(log_path):
5391
])
5492
html = style.to_html()
5593

56-
# can change name of the output html report here
57-
with open("name_record_table.html", "w") as html_file:
94+
# NOTE: the report file name is hard-coded here; main()'s output_path argument is not used yet
95+
with open("name_record_glob.html", "w") as html_file:
5896
html_file.write(html)
5997

60-
98+
# go back to hdf5_diagonal dxt
6199
if __name__ == "__main__":
    # Command-line entry point: read the log and report paths, then build the report.
    parser = argparse.ArgumentParser()
    # The log path is mandatory; without it main() would receive None.
    parser.add_argument('-p', '--log-path', type=str, required=True, help="Path to the log file")
    parser.add_argument('-o', '--output-path', type=str, help="Path to the output HTML file")
    args = parser.parse_args()
    main(log_path=args.log_path, output_path=args.output_path)
66105

0 commit comments

Comments
 (0)