Skip to content

Commit ca4f3ea

Browse files
committed
MAINT: improve C accum error handling
* Returning `-1` from a public C API function isn't sufficient to provide useful error information when working in Python/CFFI: it only tells you that something went wrong, and only if you check the return code in the first place (which isn't normally done in Python anyway; normally you end execution at the error point and `exit()` with an appropriate error message).
* When there are multiple conditions that can trigger the `-1` return value, the situation is even worse: one literally has to sprinkle `printf` calls through the source to figure out what went wrong and where.
* As a compromise, I'll leave the `-1` approach in, since that is quite common in standard C, but I'm going to add prints to `stderr` so that Python can intercept the `-1` and refer the user to `stderr`.
* Also, `darshan_accumulator_inject()` assumed that the `module_id` was reasonable because it was set/checked in `darshan_accumulator_create()`. However, my experience in darshan-hpc gh-839 was that the accumulator memory location can get freed, or not be properly set at the CFFI boundary after calling the creation function, so I think the assumption that a previous function was called and worked perfectly is too fragile. I'm adding error handling to prevent a hard segfault on nonsense values of that structure member as a result.
1 parent 045b9bd commit ca4f3ea

File tree

3 files changed

+78
-1
lines changed

3 files changed

+78
-1
lines changed

darshan-util/darshan-logutils-accumulator.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <stdlib.h>
1212
#include <assert.h>
13+
#include <stdio.h>
1314

1415
#include "darshan-logutils.h"
1516
#include "uthash-1.9.2/src/uthash.h"
@@ -107,10 +108,16 @@ int darshan_accumulator_inject(darshan_accumulator acc,
107108
int ret;
108109
file_hash_entry_t *hfile = NULL;
109110

111+
if(acc->module_id >= DARSHAN_KNOWN_MODULE_COUNT || acc->module_id < 0) {
112+
fprintf(stderr, "darshan_accumulator_inject received an accumulator struct with an id that is likely corrupted\n");
113+
return(-1);
114+
}
115+
110116
if(!mod_logutils[acc->module_id]->log_agg_records ||
111117
!mod_logutils[acc->module_id]->log_sizeof_record ||
112118
!mod_logutils[acc->module_id]->log_record_metrics) {
113119
/* this module doesn't support this operation */
120+
fprintf(stderr, "darshan_accumulator_inject is operating on a module that doesn't support this operation");
114121
return(-1);
115122
}
116123

@@ -126,8 +133,10 @@ int darshan_accumulator_inject(darshan_accumulator acc,
126133
ret = mod_logutils[acc->module_id]->log_record_metrics( new_record,
127134
&rec_id, &r_bytes, &w_bytes, &max_offset, &io_total_time,
128135
&md_only_time, &rw_only_time, &rank, &nprocs);
129-
if(ret < 0)
136+
if(ret < 0) {
137+
fprintf(stderr, "darshan_accumulator_inject was unable to retrieve generic metrics from record");
130138
return(-1);
139+
}
131140

132141
/* accumulate performance metrics */
133142

darshan-util/pydarshan/darshan/backend/api_def_c.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,34 @@
88

99

1010
header = """/* from darshan-logutils.h */
11+
12+
struct darshan_derived_metrics {
13+
int64_t total_bytes;
14+
double unique_io_total_time_by_slowest;
15+
double unique_rw_only_time_by_slowest;
16+
double unique_md_only_time_by_slowest;
17+
int unique_io_slowest_rank;
18+
double shared_io_total_time_by_slowest;
19+
double agg_perf_by_slowest;
20+
double agg_time_by_slowest;
21+
struct darshan_file_category_counters;
22+
};
23+
24+
25+
26+
struct darshan_accumulator {
27+
int64_t module_id;
28+
int64_t job_nprocs;
29+
void* agg_record;
30+
int num_records;
31+
void *file_hash_table;
32+
double shared_io_total_time_by_slowest;
33+
int64_t total_bytes;
34+
double *rank_cumul_io_total_time;
35+
double *rank_cumul_rw_only_time;
36+
double *rank_cumul_md_only_time;
37+
};
38+
1139
struct darshan_mnt_info
1240
{
1341
char mnt_type[3031];
@@ -23,6 +51,11 @@
2351
int partial_flag;
2452
};
2553
54+
int darshan_accumulator_emit(struct darshan_accumulator, struct darshan_derived_metrics*, void* aggregation_record);
55+
int darshan_accumulator_destroy(struct darshan_accumulator);
56+
int darshan_accumulator_create(enum darshan_module_id, int64_t, struct darshan_accumulator*);
57+
int darshan_accumulator_inject(struct darshan_accumulator, void*, int);
58+
2659
/* from darshan-log-format.h */
2760
typedef uint64_t darshan_record_id;
2861

darshan-util/pydarshan/darshan/tests/test_cffi_misc.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,38 @@ def test_log_get_generic_record(dtype):
159159
# make sure the returned key/column names agree
160160
assert actual_counter_names == expected_counter_names
161161
assert actual_fcounter_names == expected_fcounter_names
162+
163+
164+
@pytest.mark.parametrize("log_path", [
165+
"imbalanced-io.darshan",
166+
])
167+
def test_accumulator_invalid_id(capfdbinary, log_path):
168+
# check for proper error handling of invalid
169+
# id in darshan_accumulator_inject() C function
170+
log_path = get_log_path(log_path)
171+
log = backend.log_open(log_path)
172+
jobrec = backend.ffi.new("struct darshan_job *")
173+
backend.libdutil.darshan_log_get_job(log['handle'], jobrec)
174+
modules = backend.log_get_modules(log)
175+
for mod_name in modules:
176+
mod_type = backend._structdefs[mod_name]
177+
darshan_accumulator = backend.ffi.new("struct darshan_accumulator *")
178+
buf = backend.ffi.new("void **")
179+
r = backend.libdutil.darshan_log_get_record(log['handle'], modules[mod_name]['idx'], buf)
180+
if r < 1:
181+
continue
182+
rbuf = backend.ffi.cast(mod_type, buf)
183+
ret_create = backend.libdutil.darshan_accumulator_create(modules[mod_name]['idx'],
184+
jobrec[0].nprocs,
185+
darshan_accumulator)
186+
# creation should work just fine
187+
assert ret_create == 0
188+
# the memory handling around the CFFI interface/struct opacity
189+
# is such that we expect an error here:
190+
r = backend.libdutil.darshan_accumulator_inject(darshan_accumulator[0], rbuf[0], 1)
191+
if r == -1:
192+
# stderr should provide something useful
193+
captured = capfdbinary.readouterr()
194+
assert b"id that is likely corrupted" in captured.err
195+
else:
196+
pytest.skip("darshan_accumulator_inject() is working in this scenario")

0 commit comments

Comments
 (0)