
Commit 419d112

FIX: fixing various OpenMP pragma loops
1 parent ae76ed3 commit 419d112

1 file changed: src/functions.cu
Lines changed: 95 additions & 105 deletions

```diff
@@ -1415,117 +1415,97 @@ __host__ void do_gridding(std::vector<Field>& fields,
       // ====================================================================
       // Process each visibility twice: once as-is, once as Hermitian symmetric
       // (Measurement sets only contain half the data due to symmetry)
-      // Use thread-local accumulation to eliminate critical section bottleneck
-      int grid_size = M * N;
-
-#pragma omp parallel num_threads(gridding) \
+#pragma omp parallel for schedule(static) num_threads(gridding) \
     shared(g_weights, g_weights_aux, g_Vo, freq, center_j, center_k, \
-           support_x, support_y, num_visibilities, grid_size) \
-    private(j_fp, k_fp, j, k, grid_pos_x, grid_pos_y, uvw, w, Vo, \
+           support_x, support_y) private( \
+        j_fp, k_fp, j, k, grid_pos_x, grid_pos_y, uvw, w, Vo, \
            shifted_j, shifted_k, kernel_i, kernel_j, ckernel_result)
-      {
-        // Allocate thread-local accumulation arrays
-        std::vector<float> local_weights(grid_size, 0.0f);
-        std::vector<float> local_weights_aux(grid_size, 0.0f);
-        std::vector<cufftComplex> local_Vo(grid_size, floatComplexZero());
+      for (int z = 0; z < 2 * num_visibilities; z++) {
+        // Determine which visibility to use (first half: original, second half: Hermitian)
+        int vis_index = (z < num_visibilities) ? z : (z - num_visibilities);

-        // Parallel loop over visibilities
-#pragma omp for schedule(static)
-        for (int z = 0; z < 2 * num_visibilities; z++) {
-          // Determine which visibility to use (first half: original, second half: Hermitian)
-          int vis_index = (z < num_visibilities) ? z : (z - num_visibilities);
-
-          // Load visibility data
-          uvw = fields[f].visibilities[i][s].uvw[vis_index];
-          w = fields[f].visibilities[i][s].weight[vis_index];
-          Vo = fields[f].visibilities[i][s].Vo[vis_index];
+        // Load visibility data
+        uvw = fields[f].visibilities[i][s].uvw[vis_index];
+        w = fields[f].visibilities[i][s].weight[vis_index];
+        Vo = fields[f].visibilities[i][s].Vo[vis_index];

-          // ================================================================
-          // Step 2a: Create Hermitian symmetric visibility for second half
-          // ================================================================
-          // For V(u,v): V(-u,-v) = V*(u,v) where * denotes complex conjugate
-          if (z >= num_visibilities) {
-            uvw.x *= -1.0;  // Negate u coordinate
-            uvw.y *= -1.0;  // Negate v coordinate
-            uvw.z *= -1.0;  // Negate w coordinate
-            Vo.y *= -1.0f;  // Negate imaginary part (complex conjugate)
-          }
+        // ================================================================
+        // Step 2a: Create Hermitian symmetric visibility for second half
+        // ================================================================
+        // For V(u,v): V(-u,-v) = V*(u,v) where * denotes complex conjugate
+        if (z >= num_visibilities) {
+          uvw.x *= -1.0;  // Negate u coordinate
+          uvw.y *= -1.0;  // Negate v coordinate
+          uvw.z *= -1.0;  // Negate w coordinate
+          Vo.y *= -1.0f;  // Negate imaginary part (complex conjugate)
+        }

-          // ================================================================
-          // Step 2b: Convert UVW coordinates from meters to lambda units
-          // ================================================================
-          uvw.x = metres_to_lambda(uvw.x, freq);
-          uvw.y = metres_to_lambda(uvw.y, freq);
-          uvw.z = metres_to_lambda(uvw.z, freq);
+        // ================================================================
+        // Step 2b: Convert UVW coordinates from meters to lambda units
+        // ================================================================
+        uvw.x = metres_to_lambda(uvw.x, freq);
+        uvw.y = metres_to_lambda(uvw.y, freq);
+        uvw.z = metres_to_lambda(uvw.z, freq);

-          // ================================================================
-          // Step 2c: Calculate grid pixel coordinates
-          // ================================================================
-          grid_pos_x = uvw.x / deltau;  // Grid position in u direction
-          grid_pos_y = uvw.y / deltav;  // Grid position in v direction
-
-          // Center the grid: center pixel is at floor(N/2) for both even and odd N
-          // The +0.5 ensures proper rounding when converting to integer pixel index
-          j_fp = grid_pos_x + center_j + 0.5;
-          k_fp = grid_pos_y + center_k + 0.5;
-          j = int(j_fp);  // Column index in grid
-          k = int(k_fp);  // Row index in grid
+        // ================================================================
+        // Step 2c: Calculate grid pixel coordinates
+        // ================================================================
+        grid_pos_x = uvw.x / deltau;  // Grid position in u direction
+        grid_pos_y = uvw.y / deltav;  // Grid position in v direction
+
+        // Center the grid: center pixel is at floor(N/2) for both even and odd N
+        // The +0.5 ensures proper rounding when converting to integer pixel index
+        j_fp = grid_pos_x + center_j + 0.5;
+        k_fp = grid_pos_y + center_k + 0.5;
+        j = int(j_fp);  // Column index in grid
+        k = int(k_fp);  // Row index in grid

-          // ================================================================
-          // Step 2d: Apply convolution kernel to grid this visibility
-          // ================================================================
-          // The kernel spreads each visibility over neighboring grid pixels
-          // Accumulate into thread-local arrays (no synchronization needed)
-          for (int m = -support_y; m <= support_y; m++) {
-            for (int n = -support_x; n <= support_x; n++) {
-              // Calculate shifted grid position
-              shifted_j = j + n;
-              shifted_k = k + m;
+        // ================================================================
+        // Step 2d: Apply convolution kernel to grid this visibility
+        // ================================================================
+        // The kernel spreads each visibility over neighboring grid pixels
+        for (int m = -support_y; m <= support_y; m++) {
+          for (int n = -support_x; n <= support_x; n++) {
+            // Calculate shifted grid position
+            shifted_j = j + n;
+            shifted_k = k + m;
+
+            // Calculate kernel array indices (kernel is centered at support)
+            kernel_j = n + support_x;
+            kernel_i = m + support_y;
+
+            // Check bounds: grid must be valid AND kernel indices must be valid
+            if (shifted_k >= 0 && shifted_k < M &&
+                shifted_j >= 0 && shifted_j < N &&
+                kernel_i >= 0 && kernel_i < ckernel->getm() &&
+                kernel_j >= 0 && kernel_j < ckernel->getn()) {
+
+              // Get kernel value at this position
+              ckernel_result = ckernel->getKernelValue(kernel_i, kernel_j);

-              // Calculate kernel array indices (kernel is centered at support)
-              kernel_j = n + support_x;
-              kernel_i = m + support_y;
+              // Cache squared kernel value to avoid recomputation
+              float ckernel_result_sq = ckernel_result * ckernel_result;

-              // Check bounds: grid must be valid AND kernel indices must be valid
-              if (shifted_k >= 0 && shifted_k < M &&
-                  shifted_j >= 0 && shifted_j < N &&
-                  kernel_i >= 0 && kernel_i < ckernel->getm() &&
-                  kernel_j >= 0 && kernel_j < ckernel->getn()) {
-
-                // Get kernel value at this position
-                ckernel_result = ckernel->getKernelValue(kernel_i, kernel_j);
-
-                // Accumulate into thread-local arrays (no synchronization needed)
-                int grid_idx = N * shifted_k + shifted_j;
-
-                // Accumulate weighted visibility: V_grid += w * V * kernel
-                local_Vo[grid_idx].x += w * Vo.x * ckernel_result;
-                local_Vo[grid_idx].y += w * Vo.y * ckernel_result;
-
-                // Accumulate weights: sum of w * kernel
-                local_weights[grid_idx] += w * ckernel_result;
-
-                // Accumulate auxiliary weights: sum of w * kernel^2 (for effective weight)
-                local_weights_aux[grid_idx] += w * ckernel_result * ckernel_result;
+              int grid_idx = N * shifted_k + shifted_j;
+
+              // Use atomic operations for simple weight accumulations (faster than critical)
+#pragma omp atomic
+              g_weights[grid_idx] += w * ckernel_result;
+
+#pragma omp atomic
+              g_weights_aux[grid_idx] += w * ckernel_result_sq;
+
+              // Critical section only for complex number updates (g_Vo)
+              // Complex numbers require synchronized updates of both real and imaginary parts
+#pragma omp critical
+              {
+                g_Vo[grid_idx].x += w * Vo.x * ckernel_result;
+                g_Vo[grid_idx].y += w * Vo.y * ckernel_result;
               }
             }
           }
         }
-
-        // ================================================================
-        // Step 2e: Merge thread-local arrays into global arrays
-        // ================================================================
-        // Use atomic operations for simple increments, critical for complex numbers
-#pragma omp critical
-        {
-          for (int idx = 0; idx < grid_size; idx++) {
-            g_Vo[idx].x += local_Vo[idx].x;
-            g_Vo[idx].y += local_Vo[idx].y;
-            g_weights[idx] += local_weights[idx];
-            g_weights_aux[idx] += local_weights_aux[idx];
-          }
-        }
-      } // End parallel region
+      }

       // ====================================================================
       // Step 3: Normalize gridded visibilities and calculate effective weights
```
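
To make the new synchronization scheme concrete, here is a minimal, self-contained sketch (not taken from this commit) of the same pattern: scalar weight accumulators updated under `#pragma omp atomic`, and a two-component complex accumulator updated inside `#pragma omp critical`. The grid dimensions, weights, kernel value, and the `Complexf` stand-in for `cufftComplex` are all invented for illustration.

```cpp
#include <cstdio>
#include <vector>

struct Complexf { float x, y; };  // hypothetical stand-in for cufftComplex

int main() {
  const int M = 4, N = 4;  // toy grid, not the real image size
  std::vector<float> g_weights(M * N, 0.0f);
  std::vector<Complexf> g_Vo(M * N, Complexf{0.0f, 0.0f});

#pragma omp parallel for schedule(static)
  for (int z = 0; z < 100000; z++) {
    int grid_idx = z % (M * N);    // hypothetical target cell
    float w = 1.0f, kval = 0.5f;   // hypothetical weight and kernel value
    Complexf Vo = {1.0f, -1.0f};   // hypothetical visibility

    // A scalar += touches a single memory word, so omp atomic is enough.
#pragma omp atomic
    g_weights[grid_idx] += w * kval;

    // The complex accumulator spans two words; the commit wraps the pair
    // of updates in a critical section so they are applied together.
#pragma omp critical
    {
      g_Vo[grid_idx].x += w * Vo.x * kval;
      g_Vo[grid_idx].y += w * Vo.y * kval;
    }
  }

  printf("w[0]=%.1f  V[0]=(%.1f, %.1f)\n", g_weights[0], g_Vo[0].x, g_Vo[0].y);
  return 0;
}
```

Worth noting: since the gridded values are plain sums, the real and imaginary parts never need to be updated as a unit for the final totals to come out right, so two per-component `#pragma omp atomic` updates would also be correct and would avoid the unnamed `critical`, which serializes against every other unnamed critical section in the program. That is a performance trade-off to measure, not a correctness requirement.
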
```diff
@@ -1679,17 +1659,27 @@ __host__ void calc_sBeam(std::vector<double3> uvw,
                          double* s_vv,
                          double* s_uv) {
   double u_lambda, v_lambda;
-#pragma omp parallel for shared(uvw, weight) private(u_lambda, v_lambda)
+  double local_s_uu = 0.0;
+  double local_s_vv = 0.0;
+  double local_s_uv = 0.0;
+
+  // Use reduction for efficient parallel accumulation
+#pragma omp parallel for shared(uvw, weight) private(u_lambda, v_lambda) \
+    reduction(+ : local_s_uu, local_s_vv, local_s_uv)
   for (int i = 0; i < uvw.size(); i++) {
     u_lambda = metres_to_lambda(uvw[i].x, nu);
     v_lambda = metres_to_lambda(uvw[i].y, nu);
-#pragma omp critical
-    {
-      *s_uu += u_lambda * u_lambda * weight[i];
-      *s_vv += v_lambda * v_lambda * weight[i];
-      *s_uv += u_lambda * v_lambda * weight[i];
-    }
+
+    // Accumulate into reduction variables (no synchronization needed)
+    local_s_uu += u_lambda * u_lambda * weight[i];
+    local_s_vv += v_lambda * v_lambda * weight[i];
+    local_s_uv += u_lambda * v_lambda * weight[i];
   }
+
+  // Update output pointers after parallel reduction
+  *s_uu += local_s_uu;
+  *s_vv += local_s_vv;
+  *s_uv += local_s_uv;
 }

 __host__ double3 calc_beamSize(double s_uu, double s_vv, double s_uv) {
```
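
The `calc_sBeam` change swaps a per-iteration critical section for an OpenMP reduction. Here is a minimal sketch of that pattern on made-up inputs (the `u`/`v`/`weight` vectors below are invented): each thread accumulates into its own zero-initialized private copy of the reduction variables, and OpenMP combines the copies once when the loop ends.

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Invented sample data standing in for the uvw coordinates and weights.
  std::vector<double> u = {1.0, 2.0, 3.0};
  std::vector<double> v = {4.0, 5.0, 6.0};
  std::vector<double> weight = {1.0, 0.5, 2.0};

  double s_uu = 0.0, s_vv = 0.0, s_uv = 0.0;

  // Each thread gets private copies of s_uu/s_vv/s_uv initialized to 0;
  // the partial sums are combined once after the loop, with no locking
  // inside the loop body.
#pragma omp parallel for reduction(+ : s_uu, s_vv, s_uv)
  for (int i = 0; i < (int)u.size(); i++) {
    s_uu += u[i] * u[i] * weight[i];
    s_vv += v[i] * v[i] * weight[i];
    s_uv += u[i] * v[i] * weight[i];
  }

  printf("s_uu=%.2f s_vv=%.2f s_uv=%.2f\n", s_uu, s_vv, s_uv);
  return 0;
}
```

This also explains the `local_s_uu`/`local_s_vv`/`local_s_uv` temporaries in the patch: an OpenMP reduction list item must be a named variable, so the dereferenced output pointers (`*s_uu` and friends) cannot appear in the `reduction` clause directly and are instead updated once after the loop.
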
