Skip to content

Commit 9b757d5

Browse files
Rearranged glob_feature.py and added test_glob_feature.py to the test directory.
1 parent edc12c3 commit 9b757d5

File tree

2 files changed

+147
-0
lines changed

2 files changed

+147
-0
lines changed
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import os
import re

import darshan
from darshan.log_utils import get_log_path
import pandas as pd
from pandas.testing import assert_frame_equal
import pytest

import glob_feature

# NOTE: the original file had debug statements here (print(pd.__version__),
# print(sys.path), print("hello")); print(sys.path) raised NameError because
# `sys` was never imported, which broke pytest collection of this module.
13+
@pytest.mark.parametrize("log_name, expected_df", [
    # grow this with more logs...
    ("e3sm_io_heatmap_only.darshan",
     pd.DataFrame({"filename_glob":
                   # NOTE: usage of \\d or r"\d" for a literal backslash followed by "d"
                   ["/projects/radix-io/snyder/e3sm/can_I_out_h\\[.*]d.nc",
                    "/projects/radix-io/E3SM-IO-inputs/i_case_1344p.nc"],
                   "glob_count": [2, 1]})),
])
def test_glob_tables(tmpdir, log_name, expected_df):
    """Check the glob table HTML produced by ``glob_feature.main``.

    Runs the glob feature on a log from the darshan-logs repo, reads the
    HTML it writes back into a DataFrame, and compares it against the
    expected (filename_glob, glob_count) table.
    """
    # test the glob table HTML outputs for various
    # log files in the logs repo (and new log files
    # that you creatively design yourself)
    log_path = get_log_path(log_name)
    with tmpdir.as_cwd():
        cwd = os.getcwd()
        # TODO: you shouldn't have a hardcoded HTML filename
        # like this...
        outfile = os.path.join(cwd, "name_record_glob_hd5f.html")
        glob_feature.main(log_path, outfile)
        actual_table = pd.read_html(outfile)[0]
        # read_html materializes the written index as an extra unnamed
        # column; drop it so only the data columns are compared.
        actual_table.drop("Unnamed: 0", axis=1, inplace=True)
        # assert_frame_equal reports a detailed diff on failure, so the
        # previous manual .compare()/print() diagnostics were removed.
        assert_frame_equal(actual_table, expected_df)
48+
49+
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# Creates a DataFrame with two columns ("filename_glob" and "glob_count") based on the files read by a Darshan log.
# It uses sequence matching and grouping techniques to group similar file paths together and generate glob-style patterns.
# Command to run: python glob_feature.py -p path/to/log/file.darshan -o output.html
4+
5+
6+
import argparse
7+
import pandas as pd
8+
import difflib
9+
import darshan
10+
import re
11+
import os
12+
13+
14+
def generalize_filename_glob(df):
    """Group similar file paths and collapse each group into a glob-like pattern.

    Paths are greedily grouped by ``difflib.SequenceMatcher`` similarity
    (ratio >= 0.8 against each group's first member). For multi-member
    groups, the last digit after the group's common prefix in the first
    member is replaced with a literal ``\\d`` marker.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``filename_glob`` column of path strings.

    Returns
    -------
    list of (str, int)
        One ``(pattern, member_count)`` tuple per group, with empty
        patterns dropped.
    """
    paths = df["filename_glob"].tolist()

    # Greedy single-pass clustering: each path joins the first existing
    # group whose representative (first member) is similar enough,
    # otherwise it starts a new group.
    grouped_paths = []
    for path in paths:
        is_grouped = False
        for j, group in enumerate(grouped_paths):
            similarity_ratio = difflib.SequenceMatcher(None, path, group[0]).ratio()
            if similarity_ratio >= 0.8:
                grouped_paths[j] = group + (path,)
                is_grouped = True
                break
        if not is_grouped:
            grouped_paths.append((path,))

    new_paths = []
    for group in grouped_paths:
        if len(group) > 1:
            common_prefix = os.path.commonprefix(group)
            # BUG FIX: escape the prefix before embedding it in a regex.
            # Paths may contain regex metacharacters ("[", "+", ...),
            # which previously raised re.error or matched incorrectly.
            pattern = r"({}.*)\d(.*)".format(re.escape(common_prefix))
            # \\d in the replacement emits a literal backslash + "d".
            modified_path = re.sub(pattern, r"\1\\d\2", group[0])
            new_paths.append((modified_path, len(group)))
        else:
            new_paths.append((group[0], 1))

    # Drop entries whose pattern collapsed to an empty string.
    new_paths = [path for path in new_paths if path[0]]

    # Defensive cap: never return more rows than the input had.
    if len(new_paths) > len(df):
        new_paths = new_paths[:len(df)]

    return new_paths
52+
53+
54+
55+
56+
def main(log_path, output_path):
    """Build the filename-glob summary table for a Darshan log and write it as HTML.

    Parameters
    ----------
    log_path : str
        Path to the ``.darshan`` log file to analyze.
    output_path : str
        Path of the HTML file to write.
    """

    report = darshan.DarshanReport(log_path)

    # One row per name record; the record names become the "filename_glob" column.
    df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"])

    # Keep only entries containing a "/" (i.e. ones that look like
    # filesystem paths), discarding other name records.
    df = df[df["filename_glob"].str.contains(r"/.*")]

    df.reset_index(drop=True, inplace=True) # Reset the index

    # Collapse similar paths into (pattern, count) pairs.
    new_paths = generalize_filename_glob(df)
    df = pd.DataFrame(new_paths, columns=["filename_glob", "glob_count"])
    df = df.reset_index(drop=True)
    # NOTE(review): sorting AFTER the reset leaves a shuffled index, which
    # shows up as an extra unnamed column when the HTML is read back —
    # the companion test drops "Unnamed: 0" for exactly this reason.
    df = df.sort_values(by="glob_count", ascending=False)

    # Shade rows by glob_count and right-align the counts for readability.
    style = df.style.background_gradient(axis=0, cmap="viridis")
    style.set_properties(subset=["glob_count"], **{"text-align": "right"})

    # Draw a grey border around the table, body cells, and headers.
    style.set_table_styles([
        {"selector": "", "props": [("border", "1px solid grey")]},
        {"selector": "tbody td", "props": [("border", "1px solid grey")]},
        {"selector": "th", "props": [("border", "1px solid grey")]}
    ])

    # html = style.render() # use this when running python glob_feature.py -p /path/to/logfile
    # NOTE(review): Styler.render() was removed in pandas 2.0; to_html() is
    # the supported API — confirm the commented line above is dead and drop it.
    html = style.to_html() #use when running pytest

    with open(output_path, "w") as html_file:
        html_file.write(html)
90+
91+
92+
if __name__ == "__main__":
    # Command-line entry point: parse the log/output paths, then delegate
    # all of the real work to main().
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-p', '--log-path', type=str, help="Path to the log file")
    arg_parser.add_argument('-o', '--output_path', type=str, help="Path to the output file")
    cli_args = arg_parser.parse_args()
    main(log_path=cli_args.log_path, output_path=cli_args.output_path)
98+

0 commit comments

Comments
 (0)