@@ -65,8 +65,8 @@ extern varsPerGPU *vars_gpu;
6565
6666typedef float (*FnPtr)(float , float , float , float );
6767
68- __device__ FnPtr beam_maps[2 ] = {
69- AiryDiskBeam, GaussianBeam
68+ __device__ FnPtr beam_maps[2 ] = {
69+ AiryDiskBeam, GaussianBeam
7070};
7171
7272
@@ -1007,7 +1007,7 @@ __host__ void degridding(std::vector<Field>& fields, MSData data, double deltau,
10071007
10081008 int nthreads = gridding;
10091009
1010- std::vector<std::vector<cufftComplex>> gridded_visibilities (nthreads, std::vector<cufftComplex> (M*N));
1010+ std::vector<std::vector<cufftComplex> > gridded_visibilities (nthreads, std::vector<cufftComplex> (M*N));
10111011
10121012 if (num_gpus == 1 ) {
10131013 cudaSetDevice (selected);
@@ -2031,13 +2031,10 @@ __device__ float calculateDNormL1(float *I, float lambda, float noise, float noi
20312031 float dL1 = 0 .0f ;
20322032
20332033 float c = I[N*M*index+N*i+j];
2034- float normI = normf (1 , &c);
2035- if (noise <= noise_cut) {
2036- if (normI > 0 .0f )
2037- dL1 = c / normI;
2038- else
2039- dL1 = 0 .0f ;
2040- }
2034+ float sign = copysignf (1 .0f , c);
2035+ if (noise <= noise_cut)
2036+ dL1 = sign;
2037+
20412038
20422039 dL1 *= lambda;
20432040 return dL1;
@@ -2523,25 +2520,21 @@ __global__ void DChi2_2I(float *noise, float *chain, float *I, float *dchi2, flo
25232520}
25242521
25252522
2526- __global__ void I_nu_0_Noise (float *noise_I, float *images, float *noise, float noise_cut, float nu, float nu_0, float *w, float antenna_diameter, float pb_factor, float pb_cutoff, float xobs, float yobs, double DELTAX, double DELTAY, long numVisibilities , long N, long M, int primary_beam)
2523+ __global__ void I_nu_0_Noise (float *noise_I, float *images, float *noise, float noise_cut, float nu, float nu_0, float *w, float antenna_diameter, float pb_factor, float pb_cutoff, float xobs, float yobs, double DELTAX, double DELTAY, float sum_weights , long N, long M, int primary_beam)
25272524{
25282525 int j = threadIdx .x + blockDim .x * blockIdx .x ;
25292526 int i = threadIdx .y + blockDim .y * blockIdx .y ;
25302527
2531- float alpha, nudiv, nudiv_pow_alpha, sum_noise, atten;
2528+ float alpha, nudiv, nudiv_pow_alpha, atten;
25322529
25332530 atten = attenuation (antenna_diameter, pb_factor, pb_cutoff, nu, xobs, yobs, DELTAX, DELTAY, primary_beam);
25342531
25352532 nudiv = nu/nu_0;
25362533 alpha = images[N*M+N*i+j];
25372534 nudiv_pow_alpha = powf (nudiv, 2 .0f *alpha);
25382535
2539- sum_noise = 0 .0f ;
25402536 if (noise[N*i+j] <= noise_cut) {
2541- for (int k=0 ; k<numVisibilities; k++) {
2542- sum_noise += w[k];
2543- }
2544- noise_I[N*i+j] += atten * atten * sum_noise * nudiv_pow_alpha;
2537+ noise_I[N*i+j] += atten * atten * sum_weights * nudiv_pow_alpha;
25452538 }else {
25462539 noise_I[N*i+j] = 0 .0f ;
25472540 }
@@ -2637,7 +2630,6 @@ __host__ float chi2(float *I, VirtualImageProcessor *ip)
26372630 ip->calculateInu (vars_gpu[0 ].device_I_nu , I, datasets[d].fields [f].nu [i]);
26382631
26392632 ip->apply_beam (vars_gpu[0 ].device_I_nu , datasets[d].antennas [0 ].antenna_diameter , datasets[d].antennas [0 ].pb_factor , datasets[d].antennas [0 ].pb_cutoff , datasets[d].fields [f].ref_xobs , datasets[d].fields [f].ref_yobs , datasets[d].fields [f].nu [i], datasets[d].antennas [0 ].primary_beam );
2640- gpuErrchk (cudaDeviceSynchronize ());
26412633
26422634 // FFT 2D
26432635 FFT2D (vars_gpu[0 ].device_V , vars_gpu[0 ].device_I_nu , vars_gpu[0 ].plan , M, N, false );
@@ -2695,11 +2687,8 @@ __host__ float chi2(float *I, VirtualImageProcessor *ip)
26952687 cudaGetDevice (&gpu_id);
26962688 ip->calculateInu (vars_gpu[i % num_gpus].device_I_nu , I, datasets[d].fields [f].nu [i]);
26972689
2698- // apply_beam<<<numBlocksNN, threadsPerBlockNN>>>(beam_fwhm, beam_freq, beam_cutoff, vars_gpu[i%num_gpus].device_I_nu, device_fg_image, N, fields[f].ref_xobs, fields[f].ref_yobs, fg_scale, fields[f].visibilities[i].freq, DELTAX, DELTAY);
26992690 ip->apply_beam (vars_gpu[i % num_gpus].device_I_nu , datasets[d].antennas [0 ].antenna_diameter , datasets[d].antennas [0 ].pb_factor , datasets[d].antennas [0 ].pb_cutoff , datasets[d].fields [f].ref_xobs ,
27002691 datasets[d].fields [f].ref_yobs , datasets[d].fields [f].nu [i], datasets[d].antennas [0 ].primary_beam );
2701- gpuErrchk (cudaDeviceSynchronize ());
2702-
27032692
27042693 // FFT 2D
27052694 FFT2D (vars_gpu[i % num_gpus].device_V , vars_gpu[i % num_gpus].device_I_nu , vars_gpu[i % num_gpus].plan , M, N, false );
@@ -3067,16 +3056,17 @@ __host__ void calculateErrors(Image *image){
30673056
30683057 gpuErrchk (cudaMalloc ((void **)&errors, sizeof (float )*M*N*image->getImageCount ()));
30693058 gpuErrchk (cudaMemset (errors, 0 , sizeof (float )*M*N*image->getImageCount ()));
3070-
3059+ float sum_weights;
30713060 for (int d=0 ; d<nMeasurementSets; d++) {
30723061 if (num_gpus == 1 ) {
30733062 cudaSetDevice (selected);
30743063 for (int f=0 ; f<datasets[d].data .nfields ; f++) {
30753064 for (int i=0 ; i<datasets[d].data .total_frequencies ; i++) {
30763065 for (int s=0 ; s<datasets[d].data .nstokes ; s++) {
30773066 if (datasets[d].fields [f].numVisibilitiesPerFreq [i] > 0 ) {
3067+ sum_weights = deviceReduce<float >(datasets[d].fields [f].device_visibilities [i][s].weight , datasets[d].fields [f].numVisibilitiesPerFreqPerStoke [i][s]);
30783068 I_nu_0_Noise <<< numBlocksNN, threadsPerBlockNN >>>
3079- (errors, image->getImage (), device_noise_image, noise_cut, datasets[d].fields [f].nu [i], nu_0, datasets[d].fields [f].device_visibilities [i][s].weight , datasets[d].antennas [0 ].antenna_diameter , datasets[d].antennas [0 ].pb_factor , datasets[d].antennas [0 ].pb_cutoff , datasets[d].fields [f].ref_xobs , datasets[d].fields [f].ref_yobs , DELTAX, DELTAY, datasets[d]. fields [f]. numVisibilitiesPerFreqPerStoke [i][s] , N, M, datasets[d].antennas [0 ].primary_beam );
3069+ (errors, image->getImage (), device_noise_image, noise_cut, datasets[d].fields [f].nu [i], nu_0, datasets[d].fields [f].device_visibilities [i][s].weight , datasets[d].antennas [0 ].antenna_diameter , datasets[d].antennas [0 ].pb_factor , datasets[d].antennas [0 ].pb_cutoff , datasets[d].fields [f].ref_xobs , datasets[d].fields [f].ref_yobs , DELTAX, DELTAY, sum_weights , N, M, datasets[d].antennas [0 ].primary_beam );
30803070 gpuErrchk (cudaDeviceSynchronize ());
30813071 alpha_Noise <<< numBlocksNN, threadsPerBlockNN >>>
30823072 (errors, image->getImage (), datasets[d].fields [f].nu [i], nu_0, datasets[d].fields [f].device_visibilities [i][s].weight , datasets[d].fields [f].device_visibilities [i][s].uvw , datasets[d].fields [f].device_visibilities [i][s].Vr , device_noise_image, noise_cut, DELTAX, DELTAY, datasets[d].fields [f].ref_xobs , datasets[d].fields [f].ref_yobs , datasets[d].antennas [0 ].antenna_diameter , datasets[d].antennas [0 ].pb_factor , datasets[d].antennas [0 ].pb_cutoff , fg_scale, datasets[d].fields [f].numVisibilitiesPerFreqPerStoke [i][s], N, M, datasets[d].antennas [0 ].primary_beam );
@@ -3087,7 +3077,7 @@ __host__ void calculateErrors(Image *image){
30873077 }
30883078 }else {
30893079 for (int f=0 ; f<datasets[d].data .nfields ; f++) {
3090- #pragma omp parallel for schedule(static,1)
3080+ #pragma omp parallel for private(sum_weights) schedule(static,1)
30913081 for (int i = 0 ; i < datasets[d].data .total_frequencies ; i++)
30923082 {
30933083 unsigned int j = omp_get_thread_num ();
@@ -3098,11 +3088,12 @@ __host__ void calculateErrors(Image *image){
30983088 cudaGetDevice (&gpu_id);
30993089 for (int s=0 ; s<datasets[d].data .nstokes ; s++) {
31003090 if (datasets[d].fields [f].numVisibilitiesPerFreq [i] > 0 ) {
3091+ sum_weights = deviceReduce<float >(datasets[d].fields [f].device_visibilities [i][s].weight , datasets[d].fields [f].numVisibilitiesPerFreqPerStoke [i][s]);
31013092
31023093 #pragma omp critical
31033094 {
31043095 I_nu_0_Noise << < numBlocksNN, threadsPerBlockNN >> >
3105- (errors, image->getImage (), device_noise_image, noise_cut, datasets[d].fields [f].nu [i], nu_0, datasets[d].fields [f].device_visibilities [i][s].weight , datasets[d].antennas [0 ].antenna_diameter , datasets[d].antennas [0 ].pb_factor , datasets[d].antennas [0 ].pb_cutoff , datasets[d].fields [f].ref_xobs , datasets[d].fields [f].ref_yobs , DELTAX, DELTAY, datasets[d]. fields [f]. numVisibilitiesPerFreqPerStoke [i][s] , N, M, datasets[d].antennas [0 ].primary_beam );
3096+ (errors, image->getImage (), device_noise_image, noise_cut, datasets[d].fields [f].nu [i], nu_0, datasets[d].fields [f].device_visibilities [i][s].weight , datasets[d].antennas [0 ].antenna_diameter , datasets[d].antennas [0 ].pb_factor , datasets[d].antennas [0 ].pb_cutoff , datasets[d].fields [f].ref_xobs , datasets[d].fields [f].ref_yobs , DELTAX, DELTAY, sum_weights , N, M, datasets[d].antennas [0 ].primary_beam );
31063097 gpuErrchk (cudaDeviceSynchronize ());
31073098 alpha_Noise << < numBlocksNN, threadsPerBlockNN >> >
31083099 (errors, image->getImage (), datasets[d].fields [f].nu [i], nu_0, datasets[d].fields [f].device_visibilities [i][s].weight , datasets[d].fields [f].device_visibilities [i][s].uvw , datasets[d].fields [f].device_visibilities [i][s].Vr , device_noise_image, noise_cut, DELTAX, DELTAY, datasets[d].fields [f].ref_xobs , datasets[d].fields [f].ref_yobs , datasets[d].antennas [0 ].antenna_diameter , datasets[d].antennas [0 ].pb_factor , datasets[d].antennas [0 ].pb_cutoff , fg_scale, datasets[d].fields [f].numVisibilitiesPerFreqPerStoke [i][s], N, M, datasets[d].antennas [0 ].primary_beam );
0 commit comments