
Commit c8e387d

WIP, ENH: memory efficient DXT segs
Fixes #779

* At the moment on `main`, DXT record data is effectively stored as a list of dictionaries of lists of dictionaries that look like this:

  ```
  DXT_list -> [rec0, rec1, ..., recN]
  recN -> {"id": ..., "rank": ..., "write_segments": ..., ...}
  recN["write_segments"] -> [seg0, seg1, ..., segN]
  segN -> {"offset": int, "length": int, "start_time": float, "end_time": float}
  ```

* The list of segments is extremely memory inefficient: even the smallest file in the matching issue exceeds 20 GB of physical memory in `mod_read_all_dxt_records`:

  ```
  Line #    Mem usage       Increment    Occurrences   Line Contents
     852                                               # fetch records
     853     92.484 MiB     18.820 MiB             1   rec = backend.log_get_dxt_record(self.log, mod, dtype=dtype)
     854  20295.188 MiB      0.773 MiB          1025   while rec != None:
     855  20295.188 MiB      0.000 MiB          1024       self.records[mod].append(rec)
     856  20295.188 MiB      0.000 MiB          1024       self.data['modules'][mod]['num_records'] += 1
     857
     858                                               # fetch next
     859  20295.188 MiB  20201.930 MiB          1024   rec = backend.log_get_dxt_record(self.log, mod, reads=reads, writes=writes, dtype=dtype)
  ```

* If we switch to NumPy arrays, the memory footprint drops substantially (see below), and performance informally seems similar (36 seconds vs. 33 seconds on `main` to produce a `report` object for the smallest file in the matching issue):

  ```
  Line #    Mem usage       Increment    Occurrences   Line Contents
     859   3222.547 MiB   3146.344 MiB          1024   rec = backend.log_get_dxt_record(self.log, mod, reads=reads, writes=writes, dtype=dtype)
  ```

* This branch currently uses NumPy record arrays, because I thought they'd be a better fit for a data structure with 2 int columns and 2 float columns; however, there is a big performance hit over regular NumPy arrays (almost 6 minutes vs. 33 seconds for the smallest file in the matching issue). So, if we could live without the extra dtype structuring of a recarray, maybe that would be best (we could also try a pandas DataFrame, which is another natural fit for dtyped columns).
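To make the footprint argument concrete, here is a rough, self-contained sketch (not part of the commit; the one-million-segment count and the byte accounting are illustrative assumptions) comparing a list of per-segment dicts with one contiguous record array:

```python
import sys
import numpy as np

n = 1_000_000  # hypothetical segment count, for illustration only

# current representation on `main`: one dict per segment
segs_dicts = [{"offset": 0, "length": 4096,
               "start_time": 0.0, "end_time": 1.0}
              for _ in range(n)]
# rough lower bound: the list's slots plus each dict's own overhead
# (shared keys and cached small ints mean real totals vary)
list_bytes = sys.getsizeof(segs_dicts) + n * sys.getsizeof(segs_dicts[0])

# proposed representation: one contiguous record array
seg_dtype = np.dtype([("offset", int), ("length", int),
                      ("start_time", float), ("end_time", float)])
segs_arr = np.recarray(n, dtype=seg_dtype)

print(f"list of dicts: >= {list_bytes / 1e6:.0f} MB")
print(f"recarray:         {segs_arr.nbytes / 1e6:.0f} MB")  # 32 MB: 4 fields * 8 bytes * 1e6
```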
1 parent 6b75b46 commit c8e387d

File tree

3 files changed: +74 −20 lines changed


darshan-util/pydarshan/darshan/backend/cffi_backend.py

Lines changed: 21 additions & 19 deletions

```diff
@@ -565,29 +565,31 @@ def log_get_dxt_record(log, mod_name, reads=True, writes=True, dtype='dict'):
 
     size_of = ffi.sizeof("struct dxt_file_record")
     segments = ffi.cast("struct segment_info *", buf[0] + size_of )
-
+    arr_write = np.recarray(wcnt, dtype=[("offset", int),
+                                         ("length", int),
+                                         ("start_time", float),
+                                         ("end_time", float)])
+    arr_read = np.recarray(rcnt, dtype=[("offset", int),
+                                        ("length", int),
+                                        ("start_time", float),
+                                        ("end_time", float)])
 
     for i in range(wcnt):
-        seg = {
-            "offset": segments[i].offset,
-            "length": segments[i].length,
-            "start_time": segments[i].start_time,
-            "end_time": segments[i].end_time
-        }
-        rec['write_segments'].append(seg)
-
-
-    for i in range(rcnt):
-        i = i + wcnt
-        seg = {
-            "offset": segments[i].offset,
-            "length": segments[i].length,
-            "start_time": segments[i].start_time,
-            "end_time": segments[i].end_time
-        }
-        rec['read_segments'].append(seg)
+        arr_write[i, ...] = (segments[i].offset,
+                             segments[i].length,
+                             segments[i].start_time,
+                             segments[i].end_time)
+
+    for k in range(rcnt):
+        i = k + wcnt
+        arr_read[k, ...] = (segments[i].offset,
+                            segments[i].length,
+                            segments[i].start_time,
+                            segments[i].end_time)
 
 
+    rec['write_segments'] = arr_write
+    rec['read_segments'] = arr_read
     if dtype == "pandas":
         rec['read_segments'] = pd.DataFrame(rec['read_segments'])
         rec['write_segments'] = pd.DataFrame(rec['write_segments'])
```
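One middle ground, not tested in this commit: a plain structured ndarray keeps the named int/float columns while dropping the `np.recarray` attribute-access wrapper, which may avoid some of the recarray overhead noted in the commit message. A minimal sketch under that assumption (`wcnt` and the segment tuples below are made-up stand-ins for the cffi values in the diff above):

```python
import numpy as np

seg_dtype = np.dtype([("offset", np.int64), ("length", np.int64),
                      ("start_time", np.float64), ("end_time", np.float64)])

# hypothetical stand-ins for the cffi segment data used in the diff
wcnt = 2
segments = [(0, 4096, 0.1, 0.2), (4096, 4096, 0.3, 0.4)]

# plain structured array: same dtype structuring, no recarray wrapper
arr_write = np.zeros(wcnt, dtype=seg_dtype)
for i in range(wcnt):
    arr_write[i] = segments[i]  # assign a whole row as a tuple

# field access uses indexing rather than attributes
print(arr_write["offset"])      # [   0 4096]
print(arr_write["start_time"])  # [0.1 0.3]
```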

darshan-util/pydarshan/darshan/experimental/plots/heatmap_handling.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -132,6 +132,10 @@ def get_rd_wr_dfs(
         # ignore for the same reason as above
         seg_df = _dict[seg_key] # type: ignore
         if seg_df.size:
+            seg_df.columns = ["offset",
+                              "length",
+                              "start_time",
+                              "end_time"]
         # drop unused columns from the dataframe
         seg_df = seg_df.drop(columns=drop_columns)
         # create new column for the ranks
```
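In isolation, the new `seg_df.columns` assignment relabels whatever columns the segment DataFrame arrived with to the four canonical DXT names before the `drop` below. A minimal sketch of the mechanics, with made-up values:

```python
import pandas as pd

# hypothetical segment frame whose columns arrive as positional labels
seg_df = pd.DataFrame([(0, 4096, 0.1, 0.2),
                       (4096, 4096, 0.3, 0.4)])
print(seg_df.columns.tolist())  # [0, 1, 2, 3]

if seg_df.size:
    seg_df.columns = ["offset", "length", "start_time", "end_time"]
print(seg_df.columns.tolist())  # ['offset', 'length', 'start_time', 'end_time']
```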

darshan-util/pydarshan/darshan/tests/test_moddxt.py

Lines changed: 49 additions & 1 deletion

```diff
@@ -1,6 +1,8 @@
 import os
 
 import pytest
+import numpy as np
+from numpy.testing import assert_allclose
 import darshan.backend.cffi_backend as backend
 from darshan.log_utils import get_log_path
 
@@ -33,7 +35,53 @@
                          'read_segments': []})])
 def test_dxt_records(logfile, mod, expected_dict):
     # regression guard for DXT records values
+    # write_segments and read_segments are now NumPy
+    # recarrays, to save considerable memory
+    # per gh-779
+    # TODO: refactor for simplicity--we can probably
+    # just initialize the expected values via
+    # np.array() with the appropriate structured dtypes
+    expected_write_segs = np.recarray(1, dtype=[("offset", int),
+                                                ("length", int),
+                                                ("start_time", float),
+                                                ("end_time", float)])
+    expected_read_segs = np.recarray(1, dtype=[("offset", int),
+                                               ("length", int),
+                                               ("start_time", float),
+                                               ("end_time", float)])
+    if expected_dict["write_segments"]:
+        expected_write_segs.offset = expected_dict["write_segments"][0]["offset"]
+        expected_write_segs.length = expected_dict["write_segments"][0]["length"]
+        expected_write_segs.start_time = expected_dict["write_segments"][0]["start_time"]
+        expected_write_segs.end_time = expected_dict["write_segments"][0]["end_time"]
+    else:
+        expected_write_segs = np.recarray(0, dtype=[("offset", int),
+                                                    ("length", int),
+                                                    ("start_time", float),
+                                                    ("end_time", float)])
+    if expected_dict["read_segments"]:
+        expected_read_segs.offset = expected_dict["read_segments"][0]["offset"]
+        expected_read_segs.length = expected_dict["read_segments"][0]["length"]
+        expected_read_segs.start_time = expected_dict["read_segments"][0]["start_time"]
+        expected_read_segs.end_time = expected_dict["read_segments"][0]["end_time"]
+    else:
+        expected_read_segs = np.recarray(0, dtype=[("offset", int),
+                                                   ("length", int),
+                                                   ("start_time", float),
+                                                   ("end_time", float)])
+    expected_dict["write_segments"] = expected_write_segs
+    expected_dict["read_segments"] = expected_read_segs
+
     logfile = get_log_path(logfile)
     log = backend.log_open(logfile)
     rec = backend.log_get_record(log, mod)
-    assert rec == expected_dict
+    for key in expected_dict.keys():
+        if "segments" in key:
+            # careful, can't use assert_allclose directly
+            # on recarrays
+            assert_allclose(rec[key].offset, expected_dict[key].offset)
+            assert_allclose(rec[key].length, expected_dict[key].length)
+            assert_allclose(rec[key].start_time, expected_dict[key].start_time)
+            assert_allclose(rec[key].end_time, expected_dict[key].end_time)
+        else:
+            assert rec[key] == expected_dict[key]
```
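Since `assert_allclose` cannot consume structured/record arrays whole, the per-field comparisons above could be collapsed into a small helper; a hedged sketch (the helper name and the field tuple are assumptions, not part of this commit):

```python
import numpy as np
from numpy.testing import assert_allclose

SEG_FIELDS = ("offset", "length", "start_time", "end_time")

def assert_segments_allclose(actual, expected):
    # assert_allclose rejects structured dtypes, so compare
    # one named field at a time; field indexing works on both
    # recarrays and plain structured arrays
    for field in SEG_FIELDS:
        assert_allclose(actual[field], expected[field])
```

With that helper, the segment branch of the test loop would reduce to a single `assert_segments_allclose(rec[key], expected_dict[key])` call.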
