-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunified_mds_stochastic.cu
243 lines (217 loc) · 13.8 KB
/
unified_mds_stochastic.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#define OBSTRUCTION 1.4
#define WALK_SPEED 1.3
#define INF 0x7f800000
texture<int, 1, cudaReadModeElementType> tex_near_stations;
texture<int, 2, cudaReadModeElementType> tex_station_coords;
texture<int, 2, cudaReadModeElementType> tex_matrix;
// preprocess to replace blockDim.x/y
/*
* [ stations kernel ]
*
* Finds nearby stations to each thread block and saves them.
*/
__global__ void stations (int *glb_near_stations, int max_x, int max_y)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int ds_x, ds_y;
int ds_idx;
float dist;
int blk_near_idx [@N_NEAR_STATIONS];
float blk_near_dist[@N_NEAR_STATIONS];
if (x < max_x && y < max_y) { // Check that this thread is inside the map.
int slots_filled = 0;
float max_dist = 0;
int max_slot;
for (ds_idx = 0; ds_idx < @N_STATIONS; ds_idx++) { // For every station:
ds_x = tex2D(tex_station_coords, ds_idx, 0); // Get the station's geographic x coordinate.
ds_y = tex2D(tex_station_coords, ds_idx, 1); // Get the station's geographic y coordinate.
dist = sqrt( pow(float(ds_x - x), 2) + pow(float(ds_y - y), 2)) * 100; // Find the geographic distance from the station to this texel.
if (slots_filled < @N_NEAR_STATIONS) { // First, fill up all the nearby station slots, keeping track of the 'worst' station.
blk_near_idx [slots_filled] = ds_idx;
blk_near_dist[slots_filled] = dist;
if (dist > max_dist) {
max_dist = dist;
max_slot = slots_filled;
}
slots_filled++;
} else { // Then, keep replacing the worst station each time a closer one is found.
if (dist < max_dist) {
blk_near_idx [max_slot] = ds_idx;
blk_near_dist[max_slot] = dist;
max_dist = 0;
for (int slot = 0; slot < @N_NEAR_STATIONS; slot++) { // Scan through the list to find the new worst.
if (blk_near_dist[slot] > max_dist) {
max_dist = blk_near_dist[slot];
max_slot = slot;
}
}
}
}
}
// mark this cell for uselessness
float min_dist = INF;
for (int i = 0; i < @N_NEAR_STATIONS; i++) min_dist = min(min_dist, blk_near_dist[i]);
if (min_dist > 2000) blk_near_idx[0] = -1;
int *p = glb_near_stations + (x * max_y + y) * @N_NEAR_STATIONS * 2;
// should index the pointer or increment? constant array subscripts in unrolled loop would be better.
for (int i = 0; i < @N_NEAR_STATIONS; i++) {
*(p++) = blk_near_idx[i];
*(p++) = int(blk_near_dist[i]);
}
}
}
/*
* [ forces kernel ]
*
* Finds both network travel times and forces on a texel-by-texel basis.
*/
__global__ void forces (
int n_stations, //replace with constant
int s_low,
int s_high,
int max_x,
int max_y,
int *active_cells, // currently unused
float *glb_coords, // make them a texture? test performance - this is an optimisation.
float *glb_forces,
float *glb_weights,
float *glb_errors,
int *debug,
float *debug_img)
{
float coord [@DIM];
float vector[@DIM];
float force [@DIM];
float *glb_coord;
float *glb_error;
float *glb_force;
float *glb_weight;
float adjust;
float error = 0; // must initialize because error is cumulative per pass
float error_here;
float error_max = 0;
float norm;
float tt;
float weight = 0;
float weight_here;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int os_x, os_y, os_idx, cell_idx, o, c;
float blk_origin_coord [@DIM];
float blk_near_dist[@N_NEAR_STATIONS]; // Dist from this texel to the destination station
int blk_near_idx [@N_NEAR_STATIONS];
if (isnan(glb_coords[(x * max_y + y) * @DIM])) return; // should kill off useless threads, but also those that should be loading shared memory. shared mem should be exchanged for registers anyway.
// actually, it doesn't help. memory accesses must be really slow. calculation is faster.
// when changing code, I see slight differences in the evolution of error images - this is probably due to random initial configuration of points.
if ( x < max_x && y < max_y ) { // Check that this thread is inside the map.
@unroll @N_NEAR_STATIONS
blk_near_idx [@I] = tex1Dfetch(tex_near_stations, (x * max_y + y) * @N_NEAR_STATIONS * 2 + @I * 2 + 0);
if (blk_near_idx[0] == -1) return; // this cell was marked useless because it is too far from transit
@unroll @N_NEAR_STATIONS
blk_near_dist [@I] = tex1Dfetch(tex_near_stations, (x * max_y + y) * @N_NEAR_STATIONS * 2 + @I * 2 + 1);
glb_coord = &( glb_coords[(x * max_y + y) * @DIM] ); // Make a pointer to this texel's time-space coordinate in global memory.
@unroll @DIM
coord[@I] = glb_coord[@I]; // Copy the time-space coordinate for this texel from global to local memory.
@unroll @DIM
force[@I] = 0; // Initialize this timestep's force to 0. (Do this before calculating forces, outside the loops!)
for (c = s_low; c < s_high; c++) { // For every origin station:
//int cell_idx = c + ((x * max_y + y) * 200) % 100000; // try to offset a bit and use more random values. this could be done better.
int cell_idx = c + ((blockIdx.x * gridDim.y + blockIdx.y) * 75) % 100000; // try to offset a bit and use more random values. this could be done better.
os_x = active_cells[cell_idx*2]; // Get the origin station's geographic x coordinate.
os_y = active_cells[cell_idx*2 + 1]; // Get the origin station's geographic y coordinate.
float *glb_origin_coord = &(glb_coords[(os_x*max_y + os_y) * @DIM]); // Make a pointer to the gobal time-space coordinates for this origin station.
@unroll @DIM
blk_origin_coord[@I] = glb_origin_coord[@I]; // Copy origin time-space coordinates from global to block-shared memory. Maybe it should be in register.
tt = INF; // Set the current best travel time to +infinity.
// Using manhattan distance does not noticeably speed up computation.
// Global memory is the bottleneck, so computation is better than a lookup table.
// Is it OK to unroll long loops, like the nearby stations code, as below?
// might matrix times be flipped, depending on element ordering conventions?
for (o=0; o < @N_NEAR_STATIONS; o++) {
int os_idx = tex1Dfetch(tex_near_stations, (os_x * max_y + os_y) * @N_NEAR_STATIONS * 2 + o * 2 + 0);
float dist = tex1Dfetch(tex_near_stations, (os_x * max_y + os_y) * @N_NEAR_STATIONS * 2 + o * 2 + 1);
@unroll @N_NEAR_STATIONS
tt = min(tt, dist + blk_near_dist[@I] + tex2D(tex_matrix, os_idx, blk_near_idx[@I])); // obstruction and walk speed removed
}
// inaccessible stations are coded as 99999999
// this is infinity for all practical purposes since the earth circumference is only 40k km
// if (tt >= 40000000) continue;
// maybe should just set weight_here to 0 (done below)
norm = 0; // Init here, just before accumulation in inner loop
@unroll @DIM
vector[@I] = coord[@I] - blk_origin_coord[@I]; norm += pow(vector[@I], 2); // Find the vector from the origin station to this texel in time-space.
// Accumulate terms to find the norm.
norm = sqrt(norm); // Take the square root to find the norm, i.e. the distance in time-space.
adjust = tt - norm ; // How much would the point need to move to match the best travel time?
// global influence scaling like gravity, relative to tt - scale adjustment according to travel time to point
// weight_here = (tt < 120*60) * (1 - 1 / (120*60 - (tt-1)));
// turn off weighting
if (tt >= 40000000) weight_here = 0;
else weight_here = 1;
// global influence cutoff above T minutes
// if (tt > 60 * 120) weight_here = 0; else weight_here = 1;
// weight_here = 1 / (tt + 1);
weight += weight_here;
// use __isnan() ? anyway, this is in the outer loop, and should almost never diverge within a warp.
if (norm != 0) {
@unroll @DIM // Avoid propagating nans through division by zero. Force should be 0, so add skip this step / add nothing.
force[@I] += ((vector[@I] / norm) * adjust * weight_here); // Find a unit vector, scale it by the desired 'correct' time-space distance. (weighted)
// why is this in the if block?
error_here = abs(adjust) * weight_here; // Accumulate error to each texel, so we can observe progress as the program runs. (weighted)
error += error_here; // now using rms error - should weight be inside or outside square? for now it's always 1.
error_max = max(error_max, error_here);
}
}
// DEBUGGING OUTPUT
// visualize travel times for last origin station
// debug_img[ x * max_y + y ] = tt;
// Force computation for all origin stations is now finished for this timestep.
// We should perform an Euler integration to move the coordinates.
// However, when the number of executing blocks exceeds number of processors on the device,
// some points will move before others finish (or even start) calculating.
// Therefore integration has been split off into another kernel to allow global, device-level thread synchronization.
glb_force = glb_forces + (x * max_y + y) * @DIM; // Make a pointer to a force record in global memory. You could do this with C array notation
glb_error = glb_errors + (x * max_y + y) ; // Make a pointer to an error record in global memory.
glb_weight = glb_weights + (x * max_y + y) ; // Make a pointer to an error record in global memory.
if (s_low > 0) {
@unroll @DIM
force[@I] += glb_force[@I]; // ADD Output forces to device global memory.
error += *glb_error;
weight += *glb_weight;
// visualize max error per cell
error_max = max(error_max, debug_img[ x * max_y + y ]);
}
@unroll @DIM
glb_force[@I] = force[@I]; // SET Output forces to device global memory.
*glb_error = error; // SET Output this texel's cumulative error to device global memory.
*glb_weight = weight; // SET Output this texel's cumulative error to device global memory.
debug_img[ x * max_y + y ] = error_max;
}
}
/*
* "integrate" CUDA kernel.
* Applies forces accumulated in the unified kernel to time-space coordinates in global device memory.
* It has been split out to allow synchronization of all threads before moving any points.
*/
__global__ void integrate (
int max_x,
int max_y,
float *glb_coords,
float *glb_forces,
float *glb_weights)
{
int d;
float *glb_coord;
float *glb_force;
float *glb_weight;
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if ( x < max_x && y < max_y ) { // Check that this thread is inside the map
glb_coord = &( glb_coords [(x * max_y + y) * @DIM] ); // Make a pointer to time-space coordinates in global memory
glb_force = &( glb_forces [(x * max_y + y) * @DIM] ); // Make a pointer to forces in global memory
glb_weight = &( glb_weights[(x * max_y + y) ] ); // Make a pointer to forces in global memory
for (d = 0; d < @DIM; d++) glb_force[d] /= *glb_weight; // Scale forces by the sum of all weights acting on this cell
for (d = 0; d < @DIM; d++) glb_coord[d] += glb_force [d]; // Euler integration, 1 timestep
}
}