Skip to content

Commit e07ffc9

Browse files
minor fixes
1 parent 35c5a56 commit e07ffc9

1 file changed

Lines changed: 16 additions & 25 deletions

File tree

src/functions.cu

Lines changed: 16 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ extern varsPerGPU *vars_gpu;
6565

6666
typedef float (*FnPtr)(float, float, float, float);
6767

68-
__device__ FnPtr beam_maps[2] = {
69-
AiryDiskBeam, GaussianBeam
68+
__device__ FnPtr beam_maps[2] = {
69+
AiryDiskBeam, GaussianBeam
7070
};
7171

7272

@@ -1007,7 +1007,7 @@ __host__ void degridding(std::vector<Field>& fields, MSData data, double deltau,
10071007

10081008
int nthreads = gridding;
10091009

1010-
std::vector<std::vector<cufftComplex>> gridded_visibilities(nthreads, std::vector<cufftComplex> (M*N));
1010+
std::vector<std::vector<cufftComplex> > gridded_visibilities(nthreads, std::vector<cufftComplex> (M*N));
10111011

10121012
if(num_gpus == 1) {
10131013
cudaSetDevice(selected);
@@ -2031,13 +2031,10 @@ __device__ float calculateDNormL1(float *I, float lambda, float noise, float noi
20312031
float dL1 = 0.0f;
20322032

20332033
float c = I[N*M*index+N*i+j];
2034-
float normI = normf(1, &c);
2035-
if(noise <= noise_cut) {
2036-
if(normI > 0.0f)
2037-
dL1 = c / normI;
2038-
else
2039-
dL1 = 0.0f;
2040-
}
2034+
float sign = copysignf(1.0f, c);
2035+
if(noise <= noise_cut)
2036+
dL1 = sign;
2037+
20412038

20422039
dL1 *= lambda;
20432040
return dL1;
@@ -2523,25 +2520,21 @@ __global__ void DChi2_2I(float *noise, float *chain, float *I, float *dchi2, flo
25232520
}
25242521

25252522

2526-
__global__ void I_nu_0_Noise(float *noise_I, float *images, float *noise, float noise_cut, float nu, float nu_0, float *w, float antenna_diameter, float pb_factor, float pb_cutoff, float xobs, float yobs, double DELTAX, double DELTAY, long numVisibilities, long N, long M, int primary_beam)
2523+
__global__ void I_nu_0_Noise(float *noise_I, float *images, float *noise, float noise_cut, float nu, float nu_0, float *w, float antenna_diameter, float pb_factor, float pb_cutoff, float xobs, float yobs, double DELTAX, double DELTAY, float sum_weights, long N, long M, int primary_beam)
25272524
{
25282525
int j = threadIdx.x + blockDim.x * blockIdx.x;
25292526
int i = threadIdx.y + blockDim.y * blockIdx.y;
25302527

2531-
float alpha, nudiv, nudiv_pow_alpha, sum_noise, atten;
2528+
float alpha, nudiv, nudiv_pow_alpha, atten;
25322529

25332530
atten = attenuation(antenna_diameter, pb_factor, pb_cutoff, nu, xobs, yobs, DELTAX, DELTAY, primary_beam);
25342531

25352532
nudiv = nu/nu_0;
25362533
alpha = images[N*M+N*i+j];
25372534
nudiv_pow_alpha = powf(nudiv, 2.0f*alpha);
25382535

2539-
sum_noise = 0.0f;
25402536
if(noise[N*i+j] <= noise_cut) {
2541-
for(int k=0; k<numVisibilities; k++) {
2542-
sum_noise += w[k];
2543-
}
2544-
noise_I[N*i+j] += atten * atten * sum_noise * nudiv_pow_alpha;
2537+
noise_I[N*i+j] += atten * atten * sum_weights * nudiv_pow_alpha;
25452538
}else{
25462539
noise_I[N*i+j] = 0.0f;
25472540
}
@@ -2637,7 +2630,6 @@ __host__ float chi2(float *I, VirtualImageProcessor *ip)
26372630
ip->calculateInu(vars_gpu[0].device_I_nu, I, datasets[d].fields[f].nu[i]);
26382631

26392632
ip->apply_beam(vars_gpu[0].device_I_nu, datasets[d].antennas[0].antenna_diameter, datasets[d].antennas[0].pb_factor, datasets[d].antennas[0].pb_cutoff, datasets[d].fields[f].ref_xobs, datasets[d].fields[f].ref_yobs, datasets[d].fields[f].nu[i], datasets[d].antennas[0].primary_beam);
2640-
gpuErrchk(cudaDeviceSynchronize());
26412633

26422634
//FFT 2D
26432635
FFT2D(vars_gpu[0].device_V, vars_gpu[0].device_I_nu, vars_gpu[0].plan, M, N, false);
@@ -2695,11 +2687,8 @@ __host__ float chi2(float *I, VirtualImageProcessor *ip)
26952687
cudaGetDevice(&gpu_id);
26962688
ip->calculateInu(vars_gpu[i % num_gpus].device_I_nu, I, datasets[d].fields[f].nu[i]);
26972689

2698-
//apply_beam<<<numBlocksNN, threadsPerBlockNN>>>(beam_fwhm, beam_freq, beam_cutoff, vars_gpu[i%num_gpus].device_I_nu, device_fg_image, N, fields[f].ref_xobs, fields[f].ref_yobs, fg_scale, fields[f].visibilities[i].freq, DELTAX, DELTAY);
26992690
ip->apply_beam(vars_gpu[i % num_gpus].device_I_nu, datasets[d].antennas[0].antenna_diameter, datasets[d].antennas[0].pb_factor, datasets[d].antennas[0].pb_cutoff, datasets[d].fields[f].ref_xobs,
27002691
datasets[d].fields[f].ref_yobs, datasets[d].fields[f].nu[i], datasets[d].antennas[0].primary_beam);
2701-
gpuErrchk(cudaDeviceSynchronize());
2702-
27032692

27042693
//FFT 2D
27052694
FFT2D(vars_gpu[i % num_gpus].device_V, vars_gpu[i % num_gpus].device_I_nu, vars_gpu[i % num_gpus].plan, M, N, false);
@@ -3067,16 +3056,17 @@ __host__ void calculateErrors(Image *image){
30673056

30683057
gpuErrchk(cudaMalloc((void**)&errors, sizeof(float)*M*N*image->getImageCount()));
30693058
gpuErrchk(cudaMemset(errors, 0, sizeof(float)*M*N*image->getImageCount()));
3070-
3059+
float sum_weights;
30713060
for(int d=0; d<nMeasurementSets; d++) {
30723061
if(num_gpus == 1) {
30733062
cudaSetDevice(selected);
30743063
for(int f=0; f<datasets[d].data.nfields; f++) {
30753064
for(int i=0; i<datasets[d].data.total_frequencies; i++) {
30763065
for(int s=0; s<datasets[d].data.nstokes; s++) {
30773066
if (datasets[d].fields[f].numVisibilitiesPerFreq[i] > 0) {
3067+
sum_weights = deviceReduce<float>(datasets[d].fields[f].device_visibilities[i][s].weight, datasets[d].fields[f].numVisibilitiesPerFreqPerStoke[i][s]);
30783068
I_nu_0_Noise <<< numBlocksNN, threadsPerBlockNN >>>
3079-
(errors, image->getImage(), device_noise_image, noise_cut, datasets[d].fields[f].nu[i], nu_0, datasets[d].fields[f].device_visibilities[i][s].weight, datasets[d].antennas[0].antenna_diameter, datasets[d].antennas[0].pb_factor, datasets[d].antennas[0].pb_cutoff, datasets[d].fields[f].ref_xobs, datasets[d].fields[f].ref_yobs, DELTAX, DELTAY, datasets[d].fields[f].numVisibilitiesPerFreqPerStoke[i][s], N, M, datasets[d].antennas[0].primary_beam);
3069+
(errors, image->getImage(), device_noise_image, noise_cut, datasets[d].fields[f].nu[i], nu_0, datasets[d].fields[f].device_visibilities[i][s].weight, datasets[d].antennas[0].antenna_diameter, datasets[d].antennas[0].pb_factor, datasets[d].antennas[0].pb_cutoff, datasets[d].fields[f].ref_xobs, datasets[d].fields[f].ref_yobs, DELTAX, DELTAY, sum_weights, N, M, datasets[d].antennas[0].primary_beam);
30803070
gpuErrchk(cudaDeviceSynchronize());
30813071
alpha_Noise <<< numBlocksNN, threadsPerBlockNN >>>
30823072
(errors, image->getImage(), datasets[d].fields[f].nu[i], nu_0, datasets[d].fields[f].device_visibilities[i][s].weight, datasets[d].fields[f].device_visibilities[i][s].uvw, datasets[d].fields[f].device_visibilities[i][s].Vr, device_noise_image, noise_cut, DELTAX, DELTAY, datasets[d].fields[f].ref_xobs, datasets[d].fields[f].ref_yobs, datasets[d].antennas[0].antenna_diameter, datasets[d].antennas[0].pb_factor, datasets[d].antennas[0].pb_cutoff, fg_scale, datasets[d].fields[f].numVisibilitiesPerFreqPerStoke[i][s], N, M, datasets[d].antennas[0].primary_beam);
@@ -3087,7 +3077,7 @@ __host__ void calculateErrors(Image *image){
30873077
}
30883078
}else{
30893079
for(int f=0; f<datasets[d].data.nfields; f++) {
3090-
#pragma omp parallel for schedule(static,1)
3080+
#pragma omp parallel for private(sum_weights) schedule(static,1)
30913081
for (int i = 0; i < datasets[d].data.total_frequencies; i++)
30923082
{
30933083
unsigned int j = omp_get_thread_num();
@@ -3098,11 +3088,12 @@ __host__ void calculateErrors(Image *image){
30983088
cudaGetDevice(&gpu_id);
30993089
for(int s=0; s<datasets[d].data.nstokes; s++) {
31003090
if (datasets[d].fields[f].numVisibilitiesPerFreq[i] > 0) {
3091+
sum_weights = deviceReduce<float>(datasets[d].fields[f].device_visibilities[i][s].weight, datasets[d].fields[f].numVisibilitiesPerFreqPerStoke[i][s]);
31013092

31023093
#pragma omp critical
31033094
{
31043095
I_nu_0_Noise << < numBlocksNN, threadsPerBlockNN >> >
3105-
(errors, image->getImage(), device_noise_image, noise_cut, datasets[d].fields[f].nu[i], nu_0, datasets[d].fields[f].device_visibilities[i][s].weight, datasets[d].antennas[0].antenna_diameter, datasets[d].antennas[0].pb_factor, datasets[d].antennas[0].pb_cutoff, datasets[d].fields[f].ref_xobs, datasets[d].fields[f].ref_yobs, DELTAX, DELTAY, datasets[d].fields[f].numVisibilitiesPerFreqPerStoke[i][s], N, M, datasets[d].antennas[0].primary_beam);
3096+
(errors, image->getImage(), device_noise_image, noise_cut, datasets[d].fields[f].nu[i], nu_0, datasets[d].fields[f].device_visibilities[i][s].weight, datasets[d].antennas[0].antenna_diameter, datasets[d].antennas[0].pb_factor, datasets[d].antennas[0].pb_cutoff, datasets[d].fields[f].ref_xobs, datasets[d].fields[f].ref_yobs, DELTAX, DELTAY, sum_weights, N, M, datasets[d].antennas[0].primary_beam);
31063097
gpuErrchk(cudaDeviceSynchronize());
31073098
alpha_Noise << < numBlocksNN, threadsPerBlockNN >> >
31083099
(errors, image->getImage(), datasets[d].fields[f].nu[i], nu_0, datasets[d].fields[f].device_visibilities[i][s].weight, datasets[d].fields[f].device_visibilities[i][s].uvw, datasets[d].fields[f].device_visibilities[i][s].Vr, device_noise_image, noise_cut, DELTAX, DELTAY, datasets[d].fields[f].ref_xobs, datasets[d].fields[f].ref_yobs, datasets[d].antennas[0].antenna_diameter, datasets[d].antennas[0].pb_factor, datasets[d].antennas[0].pb_cutoff, fg_scale, datasets[d].fields[f].numVisibilitiesPerFreqPerStoke[i][s], N, M, datasets[d].antennas[0].primary_beam);

0 commit comments

Comments
 (0)