-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathCudaSortTOP.cpp
334 lines (253 loc) · 8.8 KB
/
CudaSortTOP.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
/* Shared Use License: This file is owned by Derivative Inc. (Derivative) and
* can only be used, and/or modified for use, in conjunction with
* Derivative's TouchDesigner software, and only if you are a licensee who has
* accepted Derivative's TouchDesigner license or assignment agreement (which
* also govern the use of this file). You may share a modified version of this
* file with another authorized licensee of Derivative's TouchDesigner software.
* Otherwise, no redistribution or sharing of this file, with or without
* modification, is permitted.
*/
#include "CudaSortTOP.h"
#include <assert.h>
#ifdef __APPLE__
#include <OpenGL/gl3.h>
#include <string.h>
#endif
#include <cstdio>
#include <iostream>
// These functions are basic C function, which the DLL loader can find
// much easier than finding a C++ Class.
// The DLLEXPORT prefix is needed so the compile exports these functions from the .dll
// you are creating
extern "C"
{
DLLEXPORT
void
FillTOPPluginInfo(TOP_PluginInfo *info)
{
// This must always be set to this constant
info->apiVersion = TOPCPlusPlusAPIVersion;
// Change this to change the executeMode behavior of this plugin.
info->executeMode = TOP_ExecuteMode::CUDA;
// The opType is the unique name for this TOP. It must start with a
// capital A-Z character, and all the following characters must lower case
// or numbers (a-z, 0-9)
info->customOPInfo.opType->setString("Cudasort");
// The opLabel is the text that will show up in the OP Create Dialog
info->customOPInfo.opLabel->setString("CUDA Sort");
// Will be turned into a 3 letter icon on the nodes
info->customOPInfo.opIcon->setString("CDA");
// Information about the author of this OP
info->customOPInfo.authorName->setString("Vincent Houze");
//info->customOPInfo.authorEmail->setString("[email protected]");
// This TOP works with 0 or 1 inputs connected
info->customOPInfo.minInputs = 0;
info->customOPInfo.maxInputs = 1;
}
DLLEXPORT
TOP_CPlusPlusBase*
CreateTOPInstance(const OP_NodeInfo* info, TOP_Context *context)
{
// Return a new instance of your class every time this is called.
// It will be called once per TOP that is using the .dll
// Note we can't do any OpenGL work during instantiation
return new CudaSortTOP(info, context);
}
DLLEXPORT
void
DestroyTOPInstance(TOP_CPlusPlusBase* instance, TOP_Context *context)
{
// Delete the instance here, this will be called when
// Touch is shutting down, when the TOP using that instance is deleted, or
// if the TOP loads a different DLL
// We do some OpenGL teardown on destruction, so ask the TOP_Context
// to set up our OpenGL context
delete (CudaSortTOP*)instance;
}
};
CudaSortTOP::CudaSortTOP(const OP_NodeInfo* info, TOP_Context *context)
: myNodeInfo(info), myExecuteCount(0), myError(nullptr)
{
previousWidth = -1;
previousHeight = -1;
}
CudaSortTOP::~CudaSortTOP()
{
}
void
CudaSortTOP::getGeneralInfo(TOP_GeneralInfo* ginfo, const OP_Inputs *inputs, void* reserved)
{
// Setting cookEveryFrame to true causes the TOP to cook every frame even
// if none of its inputs/parameters are changing. Set it to false if it
// only needs to cook when inputs/parameters change.
ginfo->cookEveryFrame = false;
}
bool
CudaSortTOP::getOutputFormat(TOP_OutputFormat* format, const OP_Inputs *inputs, void* reserved)
{
/*format->redChannel = true;
format->greenChannel = false;
format->blueChannel = false;
format->alphaChannel = false;
format->bitsPerChannel = 32;
format->floatPrecision = true;
return true;*/
return false;
}
void CudaSortTOP::initBuffers(int width, int height) {
if (previousWidth != -1) {
deleteBuffers();
}
else {
printf("***FIRST_INIT***\n");
}
printf("***CHANGE_RESOLUTION***\n");
cudaMalloc((void**)&m_sortKeys, sizeof(float)*width*height);
cudaMalloc((void**)&m_indices, sizeof(uint)*width*height);
//cudaMalloc((void**)&out_data, sizeof(float4)*width*height);
//init = 1;
previousWidth = width;
previousHeight = height;
}
void CudaSortTOP::deleteBuffers() {
cudaFree(m_sortKeys);
cudaFree(m_indices);
//cudaFree(out_data);
}
//extern cudaError_t doCUDAOperation(int width, int height, cudaArray *input, cudaArray *output);
////////////////////////////////////////////////////////////////////////////////
extern "C" void
sortParticles(float *sortKeys, uint *indices, uint numParticles);
//launch_cudaProcess(dim3 grid, dim3 block, cudaArray *g_data_array, float4 *g_odata, int imgw, int imgh, float incr);
extern "C" void
launch_arrayToData(dim3 grid, dim3 block, cudaArray *g_data_array, float *g_odata, uint* keys, int imgw, int imgh);
extern "C" void
launch_dataToTex(dim3 grid, dim3 block, uint* in_indices, float4 *g_odata, int imgw, int imgh);
extern "C" void
launch_dataToArray(dim3 grid, dim3 block, uint *mIndices, cudaArray *output, int imgw, int imgh);
void
CudaSortTOP::execute(TOP_OutputFormatSpecs* outputFormat ,
const OP_Inputs* inputs,
TOP_Context* context,
void* reserved)
{
myExecuteCount++;
myError = nullptr;
int width = outputFormat->width;
int height = outputFormat->height;
cudaArray *inputMem = nullptr;
if (inputs->getNumInputs() > 0)
{
const OP_TOPInput* topInput = inputs->getInputTOP(0);
//const GLint format = outputFormat->pixelFormat;
//std::cout << "format :" << format << std::endl;
//printf("format %d", format);
if (outputFormat->redBits != 32 ||
outputFormat->greenBits != 0 ||
outputFormat->blueBits != 0 ||
outputFormat->alphaBits != 0)
{
myError = "TOP Pixel Format should be set to 32-bit float (Mono)";
return;
}
if (topInput->cudaInput == nullptr)
{
myError = "CUDA memory for input TOP was not mapped correctly.";
return;
}
//matching texture<float4, cudaTextureType2D, cudaReadModeElementType> inTex;
if (topInput->pixelFormat != GL_RGBA16F && topInput->pixelFormat != GL_RGBA32F) {
myError = "Input Texture must be 16-bit float (RGBA) or 32-bit float (RGBA).";
return;
}
inputMem = topInput->cudaInput;
if (outputFormat->width != previousWidth || outputFormat->height != previousHeight) {
initBuffers(outputFormat->width, outputFormat->height);
}
int nThreads = 16;
// calculate grid size
dim3 block(nThreads, nThreads, 1);
int width = outputFormat->width;
int height = outputFormat->height;
//dim3 grid(outputFormat->width / block.x, outputFormat->width / block.y, 1);
dim3 grid;
grid.x = width / nThreads + (!(width%nThreads) ? 0 : 1);
grid.y = height / nThreads + (!(height%nThreads) ? 0 : 1);
grid.z = 1;
// execute CUDA kernel
launch_arrayToData(grid, block, inputMem, m_sortKeys, m_indices, outputFormat->width, outputFormat->height);
int numKeys = outputFormat->width * outputFormat->height;
sortParticles(m_sortKeys, m_indices, numKeys);
//launch_dataToTex(grid, block, m_indices, out_data, outputFormat->width, outputFormat->height);
//cudaMemcpyToArray(outputFormat->cudaOutput[0], 0, 0, out_data, outputFormat->width*outputFormat->height * sizeof(float4), cudaMemcpyDeviceToDevice);
launch_dataToArray(grid, block, m_indices, outputFormat->cudaOutput[0], outputFormat->width, outputFormat->height);
}
}
int32_t
CudaSortTOP::getNumInfoCHOPChans(void* reserved)
{
// We return the number of channel we want to output to any Info CHOP
// connected to the TOP. In this example we are just going to send one channel.
return 1;
}
void
CudaSortTOP::getInfoCHOPChan(int32_t index,
OP_InfoCHOPChan* chan,
void* reserved)
{
// This function will be called once for each channel we said we'd want to return
// In this example it'll only be called once.
if (index == 0)
{
chan->name->setString("executeCount");
chan->value = (float)myExecuteCount;
}
}
bool
CudaSortTOP::getInfoDATSize(OP_InfoDATSize* infoSize, void* reserved)
{
infoSize->rows = 1;
infoSize->cols = 2;
// Setting this to false means we'll be assigning values to the table
// one row at a time. True means we'll do it one column at a time.
infoSize->byColumn = false;
return true;
}
void
CudaSortTOP::getInfoDATEntries(int32_t index,
int32_t nEntries,
OP_InfoDATEntries* entries,
void* reserved)
{
char tempBuffer[4096];
if (index == 0)
{
// Set the value for the first column
#ifdef _WIN32
strcpy_s(tempBuffer, "executeCount");
#else // macOS
strlcpy(tempBuffer, "executeCount", sizeof(tempBuffer));
#endif
entries->values[0]->setString(tempBuffer);
// Set the value for the second column
#ifdef _WIN32
sprintf_s(tempBuffer, "%d", myExecuteCount);
#else // macOS
snprintf(tempBuffer, sizeof(tempBuffer), "%d", myExecuteCount);
#endif
entries->values[1]->setString(tempBuffer);
}
}
void
CudaSortTOP::getErrorString(OP_String *error, void* reserved)
{
error->setString(myError);
}
void
CudaSortTOP::setupParameters(OP_ParameterManager* manager, void* reserved)
{
}
void
CudaSortTOP::pulsePressed(const char* name, void* reserved)
{
}