|
| 1 | +const int Nthreads = 1024, NrankMax = 6, maxFR = 10000, nt0max=81, NchanMax = 17, nsizes = 5; |
| 2 | + |
| 3 | + |
| 4 | +////////////////////////////////////////////////////////////////////////////////////////// |
| 5 | +__global__ void Conv1D(const double *Params, const float *data, const float *W, float *conv_sig){ |
| 6 | + volatile __shared__ float sW[81*NrankMax], sdata[(Nthreads+81)]; |
| 7 | + float y; |
| 8 | + int tid, tid0, bid, i, nid, Nrank, NT, nt0, Nchan; |
| 9 | + |
| 10 | + tid = threadIdx.x; |
| 11 | + bid = blockIdx.x; |
| 12 | + |
| 13 | + NT = (int) Params[0]; |
| 14 | + Nchan = (int) Params[1]; |
| 15 | + nt0 = (int) Params[2]; |
| 16 | + Nrank = (int) Params[4]; |
| 17 | + |
| 18 | + if(tid<nt0*Nrank) |
| 19 | + sW[tid]= W[tid]; |
| 20 | + __syncthreads(); |
| 21 | + |
| 22 | + tid0 = 0; |
| 23 | + while (tid0<NT-Nthreads-nt0+1){ |
| 24 | + if (tid<nt0) |
| 25 | + sdata[tid] = data[tid0 + tid + NT*bid]; |
| 26 | + sdata[tid + nt0] = data[nt0+tid0 + tid+ NT*bid]; |
| 27 | + __syncthreads(); |
| 28 | + |
| 29 | + for(nid=0;nid<Nrank;nid++){ |
| 30 | + y = 0.0f; |
| 31 | + #pragma unroll 4 |
| 32 | + for(i=0;i<nt0;i++) |
| 33 | + y += sW[i + nid*nt0] * sdata[i+tid]; |
| 34 | + conv_sig[tid0 + tid + NT*bid + nid * NT * Nchan] = y; |
| 35 | + } |
| 36 | + tid0+=Nthreads; |
| 37 | + __syncthreads(); |
| 38 | + } |
| 39 | +} |
| 40 | + |
| 41 | +////////////////////////////////////////////////////////////////////////////////////////// |
| 42 | +__global__ void sumChannels(const double *Params, const float *data, |
| 43 | + float *datasum, int *kkmax, const int *iC2, const float *dist, const float *v2){ |
| 44 | + |
| 45 | + int tid, tid0,t,k, kmax, bidx, bidy, NT, Nchan, NchanNear,j,iChan, Nsum, Nrank; |
| 46 | + float Cmax, C0; |
| 47 | + float a[nsizes], d2; |
| 48 | + float sigma; |
| 49 | + volatile __shared__ float sA[nsizes * 20]; |
| 50 | + |
| 51 | + |
| 52 | + tid = threadIdx.x; |
| 53 | + bidx = blockIdx.x; |
| 54 | + bidy = blockIdx.y; |
| 55 | + NT = (int) Params[0]; |
| 56 | + Nchan = (int) Params[1]; |
| 57 | + NchanNear = (int) Params[3]; |
| 58 | + Nrank = (int) Params[4]; |
| 59 | + Nsum = (int) Params[3]; |
| 60 | + sigma = (float) Params[9]; |
| 61 | + |
| 62 | + if (tid<nsizes*NchanNear){ |
| 63 | + d2 = dist[tid/nsizes + NchanNear * bidy]; |
| 64 | + k = tid%nsizes; |
| 65 | + sA[tid] = expf( - (d2 * d2)/((1+k)*(1+k)*sigma*sigma)); |
| 66 | + } |
| 67 | + __syncthreads(); |
| 68 | + |
| 69 | + tid0 = tid + bidx * blockDim.x; |
| 70 | + while (tid0<NT){ |
| 71 | + Cmax = 0.0f; |
| 72 | + kmax = 0; |
| 73 | + |
| 74 | + for (t=0;t<Nrank;t++){ |
| 75 | + for(k=0; k<nsizes; k++) |
| 76 | + a[k] = 0.; |
| 77 | + |
| 78 | + for(j=0; j<Nsum; j++){ |
| 79 | + iChan = iC2[j + NchanNear * bidy]; |
| 80 | + for(k=0; k<nsizes; k++) |
| 81 | + a[k] += sA[k + nsizes * j] * |
| 82 | + data[tid0 + NT * iChan + t * NT * Nchan]; |
| 83 | + } |
| 84 | + for(k=0; k<nsizes; k++){ |
| 85 | + a[k] = max(a[k], 0.); |
| 86 | + if (a[k]*a[k] / v2[k + nsizes*bidy] > Cmax){ |
| 87 | + Cmax = a[k]*a[k]/v2[k + nsizes*bidy]; |
| 88 | + kmax = t + k*Nrank; |
| 89 | + } |
| 90 | + } |
| 91 | + } |
| 92 | + datasum[tid0 + NT * bidy] = Cmax; |
| 93 | + kkmax[tid0 + NT * bidy] = kmax; |
| 94 | + |
| 95 | + tid0 += blockDim.x * gridDim.x; |
| 96 | + } |
| 97 | +} |
| 98 | + |
| 99 | +////////////////////////////////////////////////////////////////////////////////////////// |
| 100 | +__global__ void max1D(const double *Params, const float *data, float *conv_sig){ |
| 101 | + |
| 102 | + volatile __shared__ float sdata[Nthreads+81]; |
| 103 | + float y, spkTh; |
| 104 | + int tid, tid0, bid, i, NT, nt0, nt0min; |
| 105 | + |
| 106 | + NT = (int) Params[0]; |
| 107 | + nt0 = (int) Params[2]; |
| 108 | + nt0min = (int) Params[5]; |
| 109 | + spkTh = (float) Params[6]; |
| 110 | + |
| 111 | + tid = threadIdx.x; |
| 112 | + bid = blockIdx.x; |
| 113 | + |
| 114 | + tid0 = 0; |
| 115 | + while (tid0<NT-Nthreads-nt0+1){ |
| 116 | + if (tid<nt0) |
| 117 | + sdata[tid] = data[tid0 + tid + NT*bid]; |
| 118 | + sdata[tid + nt0] = data[nt0+tid0 + tid+ NT*bid]; |
| 119 | + __syncthreads(); |
| 120 | + |
| 121 | + y = 0.0f; |
| 122 | + #pragma unroll 4 |
| 123 | + for(i=0;i<2*nt0min;i++) |
| 124 | + y = max(y, sdata[tid+i]); |
| 125 | + |
| 126 | + if (y>spkTh*spkTh) |
| 127 | + conv_sig[tid0 + 1*(nt0min) + tid + NT*bid] = y; |
| 128 | + |
| 129 | + tid0+=Nthreads; |
| 130 | + __syncthreads(); |
| 131 | + } |
| 132 | +} |
| 133 | + |
| 134 | +////////////////////////////////////////////////////////////////////////////////////////// |
| 135 | +__global__ void maxChannels(const double *Params, const float *dataraw, const float *data, |
| 136 | + const int *iC, const int *iC2, const float *dist2, const int *kkmax, |
| 137 | + const float *dfilt, int *st, int *counter, float *cF){ |
| 138 | + |
| 139 | + int nt0, indx, tid, tid0, i, bid, NT, j,iChan, nt0min, Nrank, kfilt; |
| 140 | + int Nchan, NchanNear, NchanUp, NchanNearUp, bidy ; |
| 141 | + double Cf, d; |
| 142 | + float spkTh, d2; |
| 143 | + bool flag; |
| 144 | + |
| 145 | + NT = (int) Params[0]; |
| 146 | + Nchan = (int) Params[1]; |
| 147 | + NchanNear = (int) Params[3]; |
| 148 | + NchanUp = (int) Params[7]; |
| 149 | + NchanNearUp = (int) Params[8]; |
| 150 | + nt0 = (int) Params[2]; |
| 151 | + nt0min = (int) Params[5]; |
| 152 | + spkTh = (float) Params[6]; |
| 153 | + Nrank = (int) Params[4]; |
| 154 | + |
| 155 | + tid = threadIdx.x; |
| 156 | + bid = blockIdx.x; |
| 157 | + bidy = blockIdx.y; |
| 158 | + |
| 159 | + tid0 = tid + bid * blockDim.x; |
| 160 | + while (tid0<NT-nt0-nt0min){ |
| 161 | + i = bidy; |
| 162 | + Cf = (double) data[tid0 + NT * i]; |
| 163 | + flag = true; |
| 164 | + for(j=1; j<NchanNearUp; j++){ |
| 165 | + if (dist2[j + NchanNearUp * i] < 100.){ |
| 166 | + iChan = iC2[j+ NchanNearUp * i]; |
| 167 | + if (data[tid0 + NT * iChan] > Cf){ |
| 168 | + flag = false; |
| 169 | + break; |
| 170 | + } |
| 171 | + } |
| 172 | + } |
| 173 | + |
| 174 | + if (flag){ |
| 175 | + if (Cf>spkTh*spkTh){ |
| 176 | + d = (double) dataraw[tid0+0 * (nt0min-1) + NT*i]; // |
| 177 | + if (d > Cf-1e-6){ |
| 178 | + // this is a hit, atomicAdd and return spikes |
| 179 | + indx = atomicAdd(&counter[0], 1); |
| 180 | + if (indx<maxFR){ |
| 181 | + st[0+4*indx] = tid0; |
| 182 | + st[1+4*indx] = i; |
| 183 | + st[2+4*indx] = sqrt(d); |
| 184 | + st[3+4*indx] = kkmax[tid0+0*(nt0min-1) + NT*i]; |
| 185 | + kfilt = st[3+4*indx]%Nrank; |
| 186 | + for(j=0; j<NchanNear; j++){ |
| 187 | + iChan = iC[j+ NchanNear * i]; |
| 188 | + cF[j + NchanNear * indx] = dfilt[tid0+0*(nt0min-1) + NT * iChan + kfilt * Nchan*NT]; |
| 189 | + } |
| 190 | + } |
| 191 | + } |
| 192 | + } |
| 193 | + } |
| 194 | + |
| 195 | + tid0 += blockDim.x * gridDim.x; |
| 196 | + } |
| 197 | +} |
0 commit comments