Skip to content

Commit 1ee0208

Browse files
gdsjaar authored and gsjaardema committed
IOSS: CGNS - workaround cgns fpp/parallel-io bug
1 parent 811184d commit 1ee0208

File tree

2 files changed

+88
-2
lines changed

2 files changed

+88
-2
lines changed

packages/seacas/libraries/ioss/src/cgns/Iocgns_DatabaseIO.C

Lines changed: 85 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
// * Single Base.
55
// * ZoneGridConnectivity is 1to1 with point lists for unstructured
66

7-
// Copyright(C) 1999-2024 National Technology & Engineering Solutions
7+
// Copyright(C) 1999-2025 National Technology & Engineering Solutions
88
// of Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with
99
// NTESS, the U.S. Government retains certain rights in this software.
1010
//
@@ -79,6 +79,78 @@
7979
// extern char hdf5_access[64];
8080

8181
namespace {
82+
extern "C" {
83+
// From private CGNS header: `cgio_internal_type.h`
84+
typedef struct _cgns_io_ctx_t {
85+
/* Flag indicating if HDF5 file access is PARALLEL or NATIVE */
86+
char hdf5_access[64];
87+
#if CG_BUILD_PARALLEL
88+
/* MPI-2 info object */
89+
MPI_Comm pcg_mpi_comm;
90+
int pcg_mpi_comm_size;
91+
int pcg_mpi_comm_rank;
92+
/* flag indicating if mpi_initialized was called */
93+
int pcg_mpi_initialized;
94+
MPI_Info pcg_mpi_info;
95+
int64_t default_pio_mode;
96+
#endif
97+
} cgns_io_ctx_t;
98+
99+
extern cgns_io_ctx_t ctx_cgio; /* located in cgns_io.c */
100+
}
101+
102+
// There is a bug in the CGNS library (4.4.0 and before) where it
103+
// has a global symbol `ctx_cgio` which controls whether
104+
// file-per-rank access is being used, or parallel io (single file,
105+
// multiple ranks). In an application (like many IOSS uses) that
106+
// uses both access methods in the same execution, this can result
107+
// in hangs and corruption due to the wrong access type being used
108+
// at the wrong time (collective for a file-per-rank typically).
109+
//
110+
// The code below is a kluge to work around this shortcoming in the
111+
// CGNS library. Before each file-per-rank access of the underlying
112+
// file, the code below accesses the CGNS global `ctx_cgio` and sets
113+
// it to non-parallel access. After the access, the destructor of
114+
// the class sets the global back to its previous value.
115+
//
116+
// The CGNS developers are aware of the issue and are looking at
117+
// options. See https://github.com/CGNS/CGNS/issues/835
118+
struct ParallelGuard
119+
{
120+
ParallelGuard(IOSS_MAYBE_UNUSED bool yes_no)
121+
{
122+
#if CG_BUILD_PARALLEL
123+
m_wasSet = strcmp(ctx_cgio.hdf5_access, "PARALLEL") == 0;
124+
if (m_wasSet != yes_no) {
125+
m_changed = true;
126+
if (yes_no) {
127+
strcpy(ctx_cgio.hdf5_access,"PARALLEL");
128+
}
129+
else {
130+
strcpy(ctx_cgio.hdf5_access,"NATIVE");
131+
}
132+
}
133+
#endif
134+
}
135+
~ParallelGuard()
136+
{
137+
#if CG_BUILD_PARALLEL
138+
if (m_changed) {
139+
if (m_wasSet) {
140+
strcpy(ctx_cgio.hdf5_access,"PARALLEL");
141+
}
142+
else {
143+
strcpy(ctx_cgio.hdf5_access,"NATIVE");
144+
}
145+
}
146+
#endif
147+
}
148+
#if CG_BUILD_PARALLEL
149+
bool m_wasSet{false};
150+
bool m_changed{false};
151+
#endif
152+
};
153+
82154
size_t global_to_zone_local_idx(size_t i, const Ioss::Map *block_map, const Ioss::Map &nodeMap,
83155
bool isParallel)
84156
{
@@ -996,7 +1068,7 @@ namespace Iocgns {
9961068
Ioss::IJK_t global_ijk;
9971069
Ioss::IJK_t offset_ijk;
9981070

999-
zone_data[id++]; // proc field. Not currently used.
1071+
id++; // proc field. Not currently used.
10001072
unpack(id, Data(zone_data), local_ijk.data(), 3);
10011073
unpack(id, Data(zone_data), global_ijk.data(), 3);
10021074
unpack(id, Data(zone_data), offset_ijk.data(), 3);
@@ -1533,6 +1605,7 @@ namespace Iocgns {
15331605
"ERROR: CGNS: Too many bases; only support files with a single bases at this time");
15341606
}
15351607

1608+
ParallelGuard serial(0);
15361609
get_step_times_nl();
15371610

15381611
if (open_create_behavior() == Ioss::DB_APPEND) {
@@ -1862,6 +1935,7 @@ namespace Iocgns {
18621935
};
18631936
// End of lambda...
18641937

1938+
ParallelGuard serial(0);
18651939
if (role == Ioss::Field::MESH) {
18661940
if (field.get_name() == "mesh_model_coordinates_x") {
18671941
// Use the lambda...
@@ -2000,6 +2074,7 @@ namespace Iocgns {
20002074
return 0;
20012075
}
20022076

2077+
ParallelGuard serial(0);
20032078
Ioss::Field::RoleType role = field.get_role();
20042079
if (role == Ioss::Field::TRANSIENT) {
20052080
// Get the StructuredBlock that this NodeBlock is contained in:
@@ -2069,6 +2144,7 @@ namespace Iocgns {
20692144
int64_t DatabaseIO::get_field_internal(const Ioss::ElementBlock *eb, const Ioss::Field &field,
20702145
void *data, size_t data_size) const
20712146
{
2147+
ParallelGuard serial(0);
20722148
size_t num_to_get = field.verify(data_size);
20732149
if (num_to_get > 0) {
20742150

@@ -2210,6 +2286,7 @@ namespace Iocgns {
22102286
int zone = Iocgns::Utils::get_db_zone(sb);
22112287

22122288
cgsize_t num_to_get = field.verify(data_size);
2289+
ParallelGuard serial(0);
22132290

22142291
// In this routine, if isParallel, then reading file-per-processor; not parallel io from single
22152292
// file.
@@ -2398,6 +2475,7 @@ namespace Iocgns {
23982475
int64_t DatabaseIO::get_field_internal(const Ioss::SideBlock *sb, const Ioss::Field &field,
23992476
void *data, size_t data_size) const
24002477
{
2478+
ParallelGuard serial(0);
24012479
int base = sb->get_property("base").get_int();
24022480
int zone = Iocgns::Utils::get_db_zone(sb);
24032481
int sect = sb->get_property("section").get_int();
@@ -2554,6 +2632,7 @@ namespace Iocgns {
25542632

25552633
// In this routine, if isParallel, then writing file-per-processor; not parallel io to single
25562634
// file.
2635+
ParallelGuard serial(0);
25572636
if (isParallel && num_to_get == 0) {
25582637
return 0;
25592638
}
@@ -2664,6 +2743,7 @@ namespace Iocgns {
26642743
int64_t DatabaseIO::put_field_internal(const Ioss::ElementBlock *eb, const Ioss::Field &field,
26652744
void *data, size_t data_size) const
26662745
{
2746+
ParallelGuard serial(0);
26672747
size_t num_to_get = field.verify(data_size);
26682748
if (num_to_get > 0) {
26692749

@@ -2835,6 +2915,7 @@ namespace Iocgns {
28352915
return put_field_internal_sub_nb(nb, field, data, data_size);
28362916
}
28372917

2918+
ParallelGuard serial(0);
28382919
// Instead of outputting a global nodeblock's worth of data,
28392920
// the data is output a "zone" at a time.
28402921
// The m_globalToBlockLocalNodeMap[zone] map is used (Ioss::Map pointer)
@@ -3007,6 +3088,7 @@ namespace Iocgns {
30073088
int zone = Iocgns::Utils::get_db_zone(sb);
30083089
cgsize_t num_to_get = field.verify(data_size);
30093090

3091+
ParallelGuard serial(0);
30103092
// In this routine, if isParallel, then writing file-per-processor; not parallel io to single
30113093
// file.
30123094
if (isParallel && num_to_get == 0) {
@@ -3070,6 +3152,7 @@ namespace Iocgns {
30703152
int64_t DatabaseIO::put_field_internal(const Ioss::SideBlock *sb, const Ioss::Field &field,
30713153
void *data, size_t data_size) const
30723154
{
3155+
ParallelGuard serial(0);
30733156
const Ioss::EntityBlock *parent_block = sb->parent_block();
30743157
if (parent_block == nullptr) {
30753158
IOSS_ERROR(fmt::format(

packages/seacas/libraries/ioss/src/cgns/Iocgns_Utils.C

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1266,6 +1266,9 @@ size_t Iocgns::Utils::common_write_metadata(int file_ptr, const Ioss::Region &re
12661266
for (size_t i = 0; i < 3; i++) {
12671267
bc_range[idx++] = bc.m_rangeEnd[i];
12681268
}
1269+
if (is_parallel_io) {
1270+
region.get_database()->progress(fmt::format("\t\tBC Range calculation, {}.", bc.m_bcName));
1271+
}
12691272
}
12701273

12711274
if (is_parallel_io) {

0 commit comments

Comments
 (0)