|
4 | 4 | // * Single Base. |
5 | 5 | // * ZoneGridConnectivity is 1to1 with point lists for unstructured |
6 | 6 |
|
7 | | -// Copyright(C) 1999-2024 National Technology & Engineering Solutions |
| 7 | +// Copyright(C) 1999-2025 National Technology & Engineering Solutions |
8 | 8 | // of Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with |
9 | 9 | // NTESS, the U.S. Government retains certain rights in this software. |
10 | 10 | // |
|
79 | 79 | // extern char hdf5_access[64]; |
80 | 80 |
|
81 | 81 | namespace { |
| 82 | + extern "C" { |
| 83 | + // Local copy of the struct declared in the private CGNS header `cgio_internal_type.h`; it is repeated here only so that the CGNS global `ctx_cgio` can be declared below. NOTE(review): presumably this must stay in sync with the definition inside the linked CGNS library -- confirm when upgrading CGNS. |
| 84 | + typedef struct _cgns_io_ctx_t { |
| 85 | + /* Flag string indicating whether HDF5 file access is "PARALLEL" or "NATIVE" */ |
| 86 | + char hdf5_access[64]; |
| 87 | +#if CG_BUILD_PARALLEL |
| 88 | + /* MPI-2 info object */ |
| 89 | + MPI_Comm pcg_mpi_comm; |
| 90 | + int pcg_mpi_comm_size; |
| 91 | + int pcg_mpi_comm_rank; |
| 92 | + /* flag indicating if mpi_initialized was called */ |
| 93 | + int pcg_mpi_initialized; |
| 94 | + MPI_Info pcg_mpi_info; |
| 95 | + int64_t default_pio_mode; |
| 96 | +#endif |
| 97 | + } cgns_io_ctx_t; |
| 98 | + |
| 99 | + extern cgns_io_ctx_t ctx_cgio; /* global defined inside the CGNS library (located in cgns_io.c) */ |
| 100 | + } |
| 101 | + |
| 102 | + // There is a bug in the CGNS library (4.4.0 and before) where it |
| 103 | + // has a global symbol `ctx_cgio` which controls whether |
| 104 | + // file-per-rank access is being used, or parallel io (single file, |
| 105 | + // multiple ranks). In an application (like many IOSS uses) that |
| 106 | + // uses both access methods in the same execution, this can result |
| 107 | + // in hangs and corruption due to the wrong access type being used |
| 108 | + // at the wrong time (collective for a file-per-rank typically). |
| 109 | + // |
| 110 | + // The code below is a kluge to workaround this shortcoming in the |
| 111 | + // CGNS library. Before each file-per-rank access of the underlying |
| 112 | + // file, the code below accesses the CGNS global `ctx_cgio` and sets |
| 113 | + // it to non-parallel access. After the access, the destructor of |
| 114 | + // the class sets the global back to its previous value. |
| 115 | + // |
| 116 | + // The CGNS developers are aware of the issue and are looking at |
| 117 | + // options. See https://github.com/CGNS/CGNS/issues/835 |
// Scoped override of the CGNS global access-mode string `ctx_cgio.hdf5_access`.
//
// Constructing the guard with `yes_no == true` forces the string to
// "PARALLEL"; `yes_no == false` forces it to "NATIVE".  If the string
// had to be changed, the destructor writes back exactly what was
// there before.  When `CG_BUILD_PARALLEL` is not enabled the guard
// compiles to a no-op.
//
// Typical use: `ParallelGuard serial(false);` at the top of a scope
// that performs file-per-rank access.
struct ParallelGuard
{
  // `explicit` so that a stray bool/int can never silently convert into a guard.
  explicit ParallelGuard(bool yes_no)
  {
#if CG_BUILD_PARALLEL
    const bool is_parallel = strcmp(ctx_cgio.hdf5_access, "PARALLEL") == 0;
    if (is_parallel != yes_no) {
      // Keep the previous contents verbatim so the destructor restores
      // precisely that value, even if it was neither "PARALLEL" nor "NATIVE".
      strcpy(m_previous, ctx_cgio.hdf5_access);
      strcpy(ctx_cgio.hdf5_access, yes_no ? "PARALLEL" : "NATIVE");
      m_changed = true;
    }
#else
    static_cast<void>(yes_no); // Nothing to switch without parallel CGNS support.
#endif
  }

  ~ParallelGuard()
  {
#if CG_BUILD_PARALLEL
    if (m_changed) {
      strcpy(ctx_cgio.hdf5_access, m_previous);
    }
#endif
  }

  // A copied or moved guard would run the restoring destructor twice and
  // could overwrite the global with stale state, so forbid both.
  ParallelGuard(const ParallelGuard &)            = delete;
  ParallelGuard &operator=(const ParallelGuard &) = delete;
  ParallelGuard(ParallelGuard &&)                 = delete;
  ParallelGuard &operator=(ParallelGuard &&)      = delete;

#if CG_BUILD_PARALLEL
  // Contents of `ctx_cgio.hdf5_access` before this guard modified it
  // (only meaningful when `m_changed` is true).
  char m_previous[sizeof(ctx_cgio.hdf5_access)]{};
  // True if the constructor changed the global and it must be restored.
  bool m_changed{false};
#endif
};
| 153 | + |
82 | 154 | size_t global_to_zone_local_idx(size_t i, const Ioss::Map *block_map, const Ioss::Map &nodeMap, |
83 | 155 | bool isParallel) |
84 | 156 | { |
@@ -996,7 +1068,7 @@ namespace Iocgns { |
996 | 1068 | Ioss::IJK_t global_ijk; |
997 | 1069 | Ioss::IJK_t offset_ijk; |
998 | 1070 |
|
999 | | - zone_data[id++]; // proc field. Not currently used. |
| 1071 | + id++; // proc field. Not currently used. |
1000 | 1072 | unpack(id, Data(zone_data), local_ijk.data(), 3); |
1001 | 1073 | unpack(id, Data(zone_data), global_ijk.data(), 3); |
1002 | 1074 | unpack(id, Data(zone_data), offset_ijk.data(), 3); |
@@ -1533,6 +1605,7 @@ namespace Iocgns { |
1533 | 1605 | "ERROR: CGNS: Too many bases; only support files with a single bases at this time"); |
1534 | 1606 | } |
1535 | 1607 |
|
| 1608 | + ParallelGuard serial(0); |
1536 | 1609 | get_step_times_nl(); |
1537 | 1610 |
|
1538 | 1611 | if (open_create_behavior() == Ioss::DB_APPEND) { |
@@ -1862,6 +1935,7 @@ namespace Iocgns { |
1862 | 1935 | }; |
1863 | 1936 | // End of lambda... |
1864 | 1937 |
|
| 1938 | + ParallelGuard serial(0); |
1865 | 1939 | if (role == Ioss::Field::MESH) { |
1866 | 1940 | if (field.get_name() == "mesh_model_coordinates_x") { |
1867 | 1941 | // Use the lambda... |
@@ -2000,6 +2074,7 @@ namespace Iocgns { |
2000 | 2074 | return 0; |
2001 | 2075 | } |
2002 | 2076 |
|
| 2077 | + ParallelGuard serial(0); |
2003 | 2078 | Ioss::Field::RoleType role = field.get_role(); |
2004 | 2079 | if (role == Ioss::Field::TRANSIENT) { |
2005 | 2080 | // Get the StructuredBlock that this NodeBlock is contained in: |
@@ -2069,6 +2144,7 @@ namespace Iocgns { |
2069 | 2144 | int64_t DatabaseIO::get_field_internal(const Ioss::ElementBlock *eb, const Ioss::Field &field, |
2070 | 2145 | void *data, size_t data_size) const |
2071 | 2146 | { |
| 2147 | + ParallelGuard serial(0); |
2072 | 2148 | size_t num_to_get = field.verify(data_size); |
2073 | 2149 | if (num_to_get > 0) { |
2074 | 2150 |
|
@@ -2210,6 +2286,7 @@ namespace Iocgns { |
2210 | 2286 | int zone = Iocgns::Utils::get_db_zone(sb); |
2211 | 2287 |
|
2212 | 2288 | cgsize_t num_to_get = field.verify(data_size); |
| 2289 | + ParallelGuard serial(0); |
2213 | 2290 |
|
2214 | 2291 | // In this routine, if isParallel, then reading file-per-processor; not parallel io from single |
2215 | 2292 | // file. |
@@ -2398,6 +2475,7 @@ namespace Iocgns { |
2398 | 2475 | int64_t DatabaseIO::get_field_internal(const Ioss::SideBlock *sb, const Ioss::Field &field, |
2399 | 2476 | void *data, size_t data_size) const |
2400 | 2477 | { |
| 2478 | + ParallelGuard serial(0); |
2401 | 2479 | int base = sb->get_property("base").get_int(); |
2402 | 2480 | int zone = Iocgns::Utils::get_db_zone(sb); |
2403 | 2481 | int sect = sb->get_property("section").get_int(); |
@@ -2554,6 +2632,7 @@ namespace Iocgns { |
2554 | 2632 |
|
2555 | 2633 | // In this routine, if isParallel, then writing file-per-processor; not parallel io to single |
2556 | 2634 | // file. |
| 2635 | + ParallelGuard serial(0); |
2557 | 2636 | if (isParallel && num_to_get == 0) { |
2558 | 2637 | return 0; |
2559 | 2638 | } |
@@ -2664,6 +2743,7 @@ namespace Iocgns { |
2664 | 2743 | int64_t DatabaseIO::put_field_internal(const Ioss::ElementBlock *eb, const Ioss::Field &field, |
2665 | 2744 | void *data, size_t data_size) const |
2666 | 2745 | { |
| 2746 | + ParallelGuard serial(0); |
2667 | 2747 | size_t num_to_get = field.verify(data_size); |
2668 | 2748 | if (num_to_get > 0) { |
2669 | 2749 |
|
@@ -2835,6 +2915,7 @@ namespace Iocgns { |
2835 | 2915 | return put_field_internal_sub_nb(nb, field, data, data_size); |
2836 | 2916 | } |
2837 | 2917 |
|
| 2918 | + ParallelGuard serial(0); |
2838 | 2919 | // Instead of outputting a global nodeblock's worth of data, |
2839 | 2920 | // the data is output a "zone" at a time. |
2840 | 2921 | // The m_globalToBlockLocalNodeMap[zone] map is used (Ioss::Map pointer) |
@@ -3007,6 +3088,7 @@ namespace Iocgns { |
3007 | 3088 | int zone = Iocgns::Utils::get_db_zone(sb); |
3008 | 3089 | cgsize_t num_to_get = field.verify(data_size); |
3009 | 3090 |
|
| 3091 | + ParallelGuard serial(0); |
3010 | 3092 | // In this routine, if isParallel, then writing file-per-processor; not parallel io to single |
3011 | 3093 | // file. |
3012 | 3094 | if (isParallel && num_to_get == 0) { |
@@ -3070,6 +3152,7 @@ namespace Iocgns { |
3070 | 3152 | int64_t DatabaseIO::put_field_internal(const Ioss::SideBlock *sb, const Ioss::Field &field, |
3071 | 3153 | void *data, size_t data_size) const |
3072 | 3154 | { |
| 3155 | + ParallelGuard serial(0); |
3073 | 3156 | const Ioss::EntityBlock *parent_block = sb->parent_block(); |
3074 | 3157 | if (parent_block == nullptr) { |
3075 | 3158 | IOSS_ERROR(fmt::format( |
|
0 commit comments