77import pandas as pd
88import difflib
99import darshan
10+ import re
11+ import os
1012
1113
def make_path_grouper(threshold=0.8):
    """Build a stateful callable that collapses runs of similar paths.

    The returned function is meant to be applied to paths one at a time
    (for example through ``Series.apply``).  It remembers one
    "representative" path; every subsequent path whose
    ``difflib.SequenceMatcher`` similarity ratio against the representative
    is at least *threshold* is replaced by that representative.  A path
    that is not similar enough becomes the new representative and is
    returned unchanged.

    Because only the most recent representative is remembered, the result
    depends on the order in which paths are presented.

    Args:
        threshold: Minimum ``SequenceMatcher.ratio()`` (0.0 - 1.0) for a
            path to be folded into the current representative.  Raise it
            for tighter, more precise groups.  Defaults to 0.8.

    Returns:
        A function taking a single path string and returning either that
        path or the current representative path.
    """
    matcher = difflib.SequenceMatcher()

    def group_paths(paths):
        # ``matcher.a`` is the empty string until a representative is stored.
        if not matcher.a:
            matcher.set_seq1(paths)
            return paths

        matcher.set_seq2(paths)
        if matcher.ratio() >= threshold:
            return matcher.a

        # Not similar enough: this path starts a new group.
        matcher.set_seq1(paths)
        return paths

    return group_paths
2829
2930
def _directory_prefix(paths):
    """Return the shared leading directory of *paths*, separator included.

    ``os.path.commonprefix`` compares character by character, so its
    result may stop in the middle of a path component; the prefix is cut
    back to just after its last path separator.  A prefix that contains no
    separator at all is returned untouched.
    """
    shared = os.path.commonprefix(list(paths))
    cut = shared.rfind(os.path.sep)
    return shared[:cut + 1] if cut >= 0 else shared


def regex_df_condenser(df, paths):
    """Condense similar file paths into directory prefixes with counts.

    Paths in ``df["filename_glob"]`` are grouped with the callable built by
    ``make_path_grouper``.  Each group is then summarised by the directory
    prefix shared by *all* of its member paths, so the generated regex
    matches every path that was folded into the group.

    Args:
        df: DataFrame with a ``"filename_glob"`` column of path strings.
            It is read only; the caller's frame is not modified.
        paths: Unused; kept so existing callers keep working.

    Returns:
        A new DataFrame with one row per group and the columns
        ``"filename_glob"`` (shared directory prefix), ``"glob_count"``
        (number of paths in the group) and ``"regex"`` (the escaped prefix
        followed by ``.*``), sorted by ``"glob_count"`` in descending order.
    """
    originals = df["filename_glob"]
    if originals.empty:
        return pd.DataFrame(columns=["filename_glob", "glob_count", "regex"])

    # Label every path with its group's representative while keeping the
    # original paths, so the prefix can be taken over the real members.
    representatives = originals.apply(make_path_grouper())

    rows = [
        (_directory_prefix(members), len(members))
        for _, members in originals.groupby(representatives.to_numpy())
    ]
    condensed = pd.DataFrame(rows, columns=["filename_glob", "glob_count"])

    # Stable sort keeps equally sized groups in a deterministic order.
    condensed = condensed.sort_values(
        by="glob_count", ascending=False, kind="stable"
    )
    condensed["regex"] = [
        re.escape(prefix) + r".*" for prefix in condensed["filename_glob"]
    ]
    return condensed
3672
3773
38- def main (log_path ):
74+
75+ def main (log_path , output_path ):
3976 report = darshan .DarshanReport (log_path )
77+
78+
4079 df = pd .DataFrame .from_dict (report .name_records , orient = "index" , columns = ["filename_glob" ])
80+
4181 df = df [df ["filename_glob" ].str .contains (r"/.*" )]
4282 df ["glob_count" ] = 1
4383 df = regex_df_condenser (df , df ["filename_glob" ])
44- df .sort_values (by = "glob_count" , inplace = True , ascending = False )
45-
4684
4785 style = df .style .background_gradient (axis = 0 , cmap = "viridis" , gmap = df ["glob_count" ])
4886 style .hide (axis = "index" )
@@ -53,14 +91,15 @@ def main(log_path):
5391 ])
5492 html = style .to_html ()
5593
56- # can change name of the output html report here
57- with open ("name_record_table .html" , "w" ) as html_file :
94+ # can change name of the output html report here
95+ with open ("name_record_glob .html" , "w" ) as html_file :
5896 html_file .write (html )
5997
60-
98+ # go back to hdf5_diagonal dxt
if __name__ == "__main__":
    # Command-line entry point: parse the log and output locations and
    # hand them to main().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-p",
        "--log-path",
        type=str,
        # Without this, omitting the flag would pass None on to main().
        required=True,
        help="Path to the log file",
    )
    parser.add_argument(
        "-o",
        "--output-path",
        type=str,
        help="Path to the output HTML file",
    )
    args = parser.parse_args()
    main(log_path=args.log_path, output_path=args.output_path)
66105
0 commit comments