@@ -1415,117 +1415,97 @@ __host__ void do_gridding(std::vector<Field>& fields,
 // ====================================================================
 // Process each visibility twice: once as-is, once as Hermitian symmetric
 // (Measurement sets only contain half the data due to symmetry)
-// Use thread-local accumulation to eliminate critical section bottleneck
-int grid_size = M * N;
-
-#pragma omp parallel num_threads(gridding) \
+#pragma omp parallel for schedule(static) num_threads(gridding) \
     shared(g_weights, g_weights_aux, g_Vo, freq, center_j, center_k, \
-    support_x, support_y, num_visibilities, grid_size) \
-    private(j_fp, k_fp, j, k, grid_pos_x, grid_pos_y, uvw, w, Vo, \
+    support_x, support_y) private( \
+    j_fp, k_fp, j, k, grid_pos_x, grid_pos_y, uvw, w, Vo, \
     shifted_j, shifted_k, kernel_i, kernel_j, ckernel_result)
-{
-    // Allocate thread-local accumulation arrays
-    std::vector<float> local_weights(grid_size, 0.0f);
-    std::vector<float> local_weights_aux(grid_size, 0.0f);
-    std::vector<cufftComplex> local_Vo(grid_size, floatComplexZero());
+for (int z = 0; z < 2 * num_visibilities; z++) {
+    // Determine which visibility to use (first half: original, second half: Hermitian)
+    int vis_index = (z < num_visibilities) ? z : (z - num_visibilities);

-    // Parallel loop over visibilities
-    #pragma omp for schedule(static)
-    for (int z = 0; z < 2 * num_visibilities; z++) {
-        // Determine which visibility to use (first half: original, second half: Hermitian)
-        int vis_index = (z < num_visibilities) ? z : (z - num_visibilities);
-
-        // Load visibility data
-        uvw = fields[f].visibilities[i][s].uvw[vis_index];
-        w = fields[f].visibilities[i][s].weight[vis_index];
-        Vo = fields[f].visibilities[i][s].Vo[vis_index];
+    // Load visibility data
+    uvw = fields[f].visibilities[i][s].uvw[vis_index];
+    w = fields[f].visibilities[i][s].weight[vis_index];
+    Vo = fields[f].visibilities[i][s].Vo[vis_index];

-        // ================================================================
-        // Step 2a: Create Hermitian symmetric visibility for second half
-        // ================================================================
-        // Hermitian symmetry: V(-u,-v) = V*(u,v), where * denotes complex conjugate
-        if (z >= num_visibilities) {
-            uvw.x *= -1.0;  // Negate u coordinate
-            uvw.y *= -1.0;  // Negate v coordinate
-            uvw.z *= -1.0;  // Negate w coordinate
-            Vo.y *= -1.0f;  // Negate imaginary part (complex conjugate)
-        }
+    // ================================================================
+    // Step 2a: Create Hermitian symmetric visibility for second half
+    // ================================================================
+    // Hermitian symmetry: V(-u,-v) = V*(u,v), where * denotes complex conjugate
+    if (z >= num_visibilities) {
+        uvw.x *= -1.0;  // Negate u coordinate
+        uvw.y *= -1.0;  // Negate v coordinate
+        uvw.z *= -1.0;  // Negate w coordinate
+        Vo.y *= -1.0f;  // Negate imaginary part (complex conjugate)
+    }

-        // ================================================================
-        // Step 2b: Convert UVW coordinates from meters to lambda units
-        // ================================================================
-        uvw.x = metres_to_lambda(uvw.x, freq);
-        uvw.y = metres_to_lambda(uvw.y, freq);
-        uvw.z = metres_to_lambda(uvw.z, freq);
+    // ================================================================
+    // Step 2b: Convert UVW coordinates from meters to lambda units
+    // ================================================================
+    uvw.x = metres_to_lambda(uvw.x, freq);
+    uvw.y = metres_to_lambda(uvw.y, freq);
+    uvw.z = metres_to_lambda(uvw.z, freq);

-        // ================================================================
-        // Step 2c: Calculate grid pixel coordinates
-        // ================================================================
-        grid_pos_x = uvw.x / deltau;  // Grid position in u direction
-        grid_pos_y = uvw.y / deltav;  // Grid position in v direction
-
-        // Center the grid: center pixel is at floor(N/2) for both even and odd N
-        // The +0.5 ensures proper rounding when converting to integer pixel index
-        j_fp = grid_pos_x + center_j + 0.5;
-        k_fp = grid_pos_y + center_k + 0.5;
-        j = int(j_fp);  // Column index in grid
-        k = int(k_fp);  // Row index in grid
+    // ================================================================
+    // Step 2c: Calculate grid pixel coordinates
+    // ================================================================
+    grid_pos_x = uvw.x / deltau;  // Grid position in u direction
+    grid_pos_y = uvw.y / deltav;  // Grid position in v direction
+
+    // Center the grid: center pixel is at floor(N/2) for both even and odd N
+    // The +0.5 ensures proper rounding when converting to integer pixel index
+    j_fp = grid_pos_x + center_j + 0.5;
+    k_fp = grid_pos_y + center_k + 0.5;
+    j = int(j_fp);  // Column index in grid
+    k = int(k_fp);  // Row index in grid

-        // ================================================================
-        // Step 2d: Apply convolution kernel to grid this visibility
-        // ================================================================
-        // The kernel spreads each visibility over neighboring grid pixels
-        // Accumulate into thread-local arrays (no synchronization needed)
-        for (int m = -support_y; m <= support_y; m++) {
-            for (int n = -support_x; n <= support_x; n++) {
-                // Calculate shifted grid position
-                shifted_j = j + n;
-                shifted_k = k + m;
+    // ================================================================
+    // Step 2d: Apply convolution kernel to grid this visibility
+    // ================================================================
+    // The kernel spreads each visibility over neighboring grid pixels
+    for (int m = -support_y; m <= support_y; m++) {
+        for (int n = -support_x; n <= support_x; n++) {
+            // Calculate shifted grid position
+            shifted_j = j + n;
+            shifted_k = k + m;
+
+            // Calculate kernel array indices (kernel is centered at support)
+            kernel_j = n + support_x;
+            kernel_i = m + support_y;
+
+            // Check bounds: grid must be valid AND kernel indices must be valid
+            if (shifted_k >= 0 && shifted_k < M &&
+                shifted_j >= 0 && shifted_j < N &&
+                kernel_i >= 0 && kernel_i < ckernel->getm() &&
+                kernel_j >= 0 && kernel_j < ckernel->getn()) {
+
+                // Get kernel value at this position
+                ckernel_result = ckernel->getKernelValue(kernel_i, kernel_j);

-                // Calculate kernel array indices (kernel is centered at support)
-                kernel_j = n + support_x;
-                kernel_i = m + support_y;
+                // Cache squared kernel value to avoid recomputation
+                float ckernel_result_sq = ckernel_result * ckernel_result;

-                // Check bounds: grid must be valid AND kernel indices must be valid
-                if (shifted_k >= 0 && shifted_k < M &&
-                    shifted_j >= 0 && shifted_j < N &&
-                    kernel_i >= 0 && kernel_i < ckernel->getm() &&
-                    kernel_j >= 0 && kernel_j < ckernel->getn()) {
-
-                    // Get kernel value at this position
-                    ckernel_result = ckernel->getKernelValue(kernel_i, kernel_j);
-
-                    // Accumulate into thread-local arrays (no synchronization needed)
-                    int grid_idx = N * shifted_k + shifted_j;
-
-                    // Accumulate weighted visibility: V_grid += w * V * kernel
-                    local_Vo[grid_idx].x += w * Vo.x * ckernel_result;
-                    local_Vo[grid_idx].y += w * Vo.y * ckernel_result;
-
-                    // Accumulate weights: sum of w * kernel
-                    local_weights[grid_idx] += w * ckernel_result;
-
-                    // Accumulate auxiliary weights: sum of w * kernel^2 (for effective weight)
-                    local_weights_aux[grid_idx] += w * ckernel_result * ckernel_result;
+                int grid_idx = N * shifted_k + shifted_j;
+
+                // Use atomic operations for the scalar weight accumulations
+                // (cheaper than funneling them through a critical section)
+                #pragma omp atomic
+                g_weights[grid_idx] += w * ckernel_result;
+
+                #pragma omp atomic
+                g_weights_aux[grid_idx] += w * ckernel_result_sq;
+
+                // Critical section only for the complex accumulation (g_Vo),
+                // so the real and imaginary parts are updated under a single lock
+                #pragma omp critical
+                {
+                    g_Vo[grid_idx].x += w * Vo.x * ckernel_result;
+                    g_Vo[grid_idx].y += w * Vo.y * ckernel_result;
                 }
             }
         }
     }
-
-    // ================================================================
-    // Step 2e: Merge thread-local arrays into global arrays
-    // ================================================================
-    // Use atomic operations for simple increments, critical for complex numbers
-    #pragma omp critical
-    {
-        for (int idx = 0; idx < grid_size; idx++) {
-            g_Vo[idx].x += local_Vo[idx].x;
-            g_Vo[idx].y += local_Vo[idx].y;
-            g_weights[idx] += local_weights[idx];
-            g_weights_aux[idx] += local_weights_aux[idx];
-        }
-    }
-} // End parallel region
+}

 // ====================================================================
 // Step 3: Normalize gridded visibilities and calculate effective weights
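
The synchronization pattern adopted in this hunk (atomics for independent scalar sums, one critical section for the paired complex components) can be exercised in isolation. Below is a minimal standalone sketch, not part of this patch: `Complexf` is a stand-in for `cufftComplex`, and the grid size, scatter pattern, weight, and kernel value are made-up placeholders.

// Standalone sketch of the accumulation pattern above (illustrative only).
#include <vector>

struct Complexf { float x, y; };  // stand-in for cufftComplex

int main() {
    const int grid_size = 1024;  // placeholder grid size
    std::vector<float> weights(grid_size, 0.0f);
    std::vector<Complexf> vis(grid_size, Complexf{0.0f, 0.0f});

    #pragma omp parallel for schedule(static)
    for (int z = 0; z < 100000; z++) {
        int idx = z % grid_size;  // toy scatter; real code derives idx from (u,v)
        float w = 1.0f;           // placeholder visibility weight
        float kern = 0.5f;        // placeholder convolution-kernel value

        // Independent scalar sum: a single atomic update is enough
        #pragma omp atomic
        weights[idx] += w * kern;

        // Complex sum: both components updated under one lock,
        // mirroring the g_Vo update in the patch
        #pragma omp critical
        {
            vis[idx].x += w * 0.1f * kern;
            vis[idx].y += w * 0.2f * kern;
        }
    }
    return 0;
}

Per-component atomics on vis[idx].x and vis[idx].y would also keep the sums correct, since each component accumulates independently; the single critical section shown here simply mirrors the patch's choice.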
@@ -1679,17 +1659,27 @@ __host__ void calc_sBeam(std::vector<double3> uvw,
                          double *s_vv,
                          double *s_uv) {
     double u_lambda, v_lambda;
-    #pragma omp parallel for shared(uvw, weight) private(u_lambda, v_lambda)
+    double local_s_uu = 0.0;
+    double local_s_vv = 0.0;
+    double local_s_uv = 0.0;
+
+    // Use a reduction for efficient parallel accumulation
+    #pragma omp parallel for shared(uvw, weight) private(u_lambda, v_lambda) \
+        reduction(+ : local_s_uu, local_s_vv, local_s_uv)
     for (int i = 0; i < uvw.size(); i++) {
         u_lambda = metres_to_lambda(uvw[i].x, nu);
         v_lambda = metres_to_lambda(uvw[i].y, nu);
-        #pragma omp critical
-        {
-            *s_uu += u_lambda * u_lambda * weight[i];
-            *s_vv += v_lambda * v_lambda * weight[i];
-            *s_uv += u_lambda * v_lambda * weight[i];
-        }
+
+        // Accumulate into reduction variables (no synchronization needed)
+        local_s_uu += u_lambda * u_lambda * weight[i];
+        local_s_vv += v_lambda * v_lambda * weight[i];
+        local_s_uv += u_lambda * v_lambda * weight[i];
     }
+
+    // Add the reduced partial sums to the output values
+    *s_uu += local_s_uu;
+    *s_vv += local_s_vv;
+    *s_uv += local_s_uv;
 }

 __host__ double3 calc_beamSize(double s_uu, double s_vv, double s_uv) {
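
For reference, the reduction idiom that replaces the critical section in calc_sBeam can be tried on its own. A minimal sketch under made-up data follows; the vectors and values below are placeholders, not taken from the repository.

// Standalone sketch of the OpenMP reduction pattern used in calc_sBeam.
#include <cstdio>
#include <vector>

int main() {
    const int n = 1000;
    std::vector<double> u(n), v(n), w(n);
    for (int i = 0; i < n; i++) { u[i] = 0.01 * i; v[i] = 0.02 * i; w[i] = 1.0; }

    double s_uu = 0.0, s_vv = 0.0, s_uv = 0.0;

    // Each thread keeps private partial sums; OpenMP combines them at the
    // end of the loop, so no per-iteration critical section is needed
    #pragma omp parallel for reduction(+ : s_uu, s_vv, s_uv)
    for (int i = 0; i < n; i++) {
        s_uu += u[i] * u[i] * w[i];  // weighted second moment in u
        s_vv += v[i] * v[i] * w[i];  // weighted second moment in v
        s_uv += u[i] * v[i] * w[i];  // weighted cross moment
    }

    std::printf("s_uu=%.3f s_vv=%.3f s_uv=%.3f\n", s_uu, s_vv, s_uv);
    return 0;
}

Note that floating-point reduction order varies with the thread count, so the last bits of the sums can differ between runs; the patched calc_sBeam has the same property.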