|
# Creates a DataFrame with two columns ("filename_glob" and "glob_count") based on the files read by the job.
# It uses sequence matching and grouping techniques to group similar file paths together and generate glob-style patterns with counts.
| 3 | +# Command to run python glob_feature.py -p path/to/log/file.darshan |
| 4 | + |
| 5 | + |
| 6 | +import argparse |
| 7 | +import pandas as pd |
| 8 | +import difflib |
| 9 | +import darshan |
| 10 | +import re |
| 11 | +import os |
| 12 | + |
| 13 | + |
def generalize_filename_glob(df):
    """Group similar file paths and collapse each group into a glob-like pattern.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "filename_glob" column of path strings.

    Returns
    -------
    list[tuple[str, int]]
        One ``(pattern, count)`` tuple per group. For multi-path groups the
        representative path has the last digit after the common prefix
        replaced with a literal ``\\d``; singleton groups keep their path
        unchanged with a count of 1.
    """
    paths = df["filename_glob"].tolist()
    grouped_paths = []

    # Greedy single-pass grouping: each path joins the first existing group
    # whose representative (first member) is at least 80% similar.
    for path in paths:
        for idx, group in enumerate(grouped_paths):
            similarity_ratio = difflib.SequenceMatcher(None, path, group[0]).ratio()
            if similarity_ratio >= 0.8:
                grouped_paths[idx] = group + (path,)
                break
        else:
            grouped_paths.append((path,))

    print("grouped paths list is", grouped_paths)

    new_paths = []
    for group in grouped_paths:
        if len(group) > 1:
            common_prefix = os.path.commonprefix(group)
            # Escape the prefix before embedding it in the pattern: real paths
            # can contain regex metacharacters ("(", "+", "[", ...) which would
            # otherwise corrupt the capture groups or raise re.error.
            pattern = r"({}.*)\d(.*)".format(re.escape(common_prefix))
            modified_path = re.sub(pattern, r"\1\\d\2", group[0])
            new_paths.append((modified_path, len(group)))
        else:
            new_paths.append((group[0], 1))

    # Drop any empty patterns and never return more rows than the input had.
    new_paths = [entry for entry in new_paths if entry[0]]
    if len(new_paths) > len(df):
        new_paths = new_paths[:len(df)]

    print("new paths are", new_paths)
    return new_paths
| 52 | + |
| 53 | + |
| 54 | + |
| 55 | + |
def main(log_path, output_path):
    """Read a darshan log, derive glob patterns, and write a styled HTML table.

    Parameters
    ----------
    log_path : str
        Path to the ``.darshan`` log file.
    output_path : str
        Destination path for the rendered HTML report.
    """
    report = darshan.DarshanReport(log_path)

    # One row per name record; keep only entries that look like file paths.
    df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"])
    df = df[df["filename_glob"].str.contains(r"/.*")]
    df.reset_index(drop=True, inplace=True)

    # Collapse similar paths into (pattern, count) rows, most frequent first.
    df = pd.DataFrame(generalize_filename_glob(df), columns=["filename_glob", "glob_count"])
    df = df.reset_index(drop=True).sort_values(by="glob_count", ascending=False)

    # Color rows by value and right-align the counts; add grid borders.
    styler = df.style.background_gradient(axis=0, cmap="viridis")
    styler.set_properties(subset=["glob_count"], **{"text-align": "right"})
    styler.set_table_styles([
        {"selector": "", "props": [("border", "1px solid grey")]},
        {"selector": "tbody td", "props": [("border", "1px solid grey")]},
        {"selector": "th", "props": [("border", "1px solid grey")]},
    ])

    # html = styler.render() # use this when running python glob_feature.py -p /path/to/logfile

    html = styler.to_html()  # use when running pytest

    with open(output_path, "w") as html_file:
        html_file.write(html)
| 91 | + |
if __name__ == "__main__":
    # CLI entry point: collect the log/output paths and hand off to main().
    cli = argparse.ArgumentParser()
    cli.add_argument('-p', '--log-path', type=str, help="Path to the log file")
    cli.add_argument('-o', '--output_path', type=str, help="Path to the output file")
    parsed = cli.parse_args()
    main(log_path=parsed.log_path, output_path=parsed.output_path)
| 98 | + |
0 commit comments