@@ -176,31 +176,57 @@ TEST(SamplingTests, RandomizedSamplingTopPCpu) {
   }
 }
 
+void SoftMax(std::span<float> scores, float temperature) {
+  float const max_score = *std::max_element(scores.begin(), scores.end());
+
+  // Subtract the max score (for numerical stability) and scale by temperature, then exponentiate
+  std::transform(scores.begin(), scores.end(), scores.begin(), [max_score, temperature](float score) { return std::exp((score - max_score) / temperature); });
+
+  // Compute the sum of the exponentials
+  float const exp_sum = std::accumulate(scores.begin(), scores.end(), 0.0f);
+
+  // Divide each exponential by the sum to normalize the scores into a probability distribution
+  std::transform(scores.begin(), scores.end(), scores.begin(), [exp_sum](float score) { return score / exp_sum; });
+}
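+// For example, SoftMax over {1, 2} with temperature 1 rescales the scores to
+// {exp(1), exp(2)} / (exp(1) + exp(2)) ≈ {0.269, 0.731}.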
+
 TEST(SamplingTests, RandomizedSamplingTopKCpu) {
   auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32");
-  int batch_size = 5;
-  int k = 5;
-  std::vector<int32_t> input_ids{0, 1, 2, 3, 4};
+  const int batch_size = 5;
+  const int k = 5;
 
   Generators::Config config;
-  config.model.vocab_size = 32000;  // vocab size of llama
+  const int vocab_size = 13;  // use a small vocab size so the sampled token distribution is easy to verify
+  config.model.vocab_size = vocab_size;
 
+  // Create a generator
   auto params = Generators::CreateGeneratorParams(config);
   params->search.max_length = 10;
   params->search.do_sample = true;
   params->search.top_k = k;
   params->search.batch_size = batch_size;
   params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CPU);
   params->device_type = Generators::DeviceType::CPU;
-  std::vector<float> logits_cpu(config.model.vocab_size * batch_size);
+
+  // Create data structures for testing
   std::random_device rd;
   std::mt19937 engine(rd());
-  std::uniform_int_distribution<> dist(5, 25);
-  int num_iter = 100;
+  std::vector<int> indices(vocab_size);
+  std::vector<float> logits_cpu(vocab_size * batch_size);
+  const int num_iter = 100;
+  std::map<float, int> logit_to_count;
+
+  // Run test
   for (int i = 0; i < num_iter; i++) {
-    int num_large = dist(engine);
     auto generator = Generators::CreateGenerator(*model, *params);
-    CreateRandomLogits(logits_cpu.data(), num_large, config.model.vocab_size, batch_size, engine);
+    logits_cpu = std::vector<float>(vocab_size * batch_size, 0.0f);
+    // Shuffle integers 1 to k randomly into logits_cpu
+    for (int b = 0; b < batch_size; b++) {
+      std::iota(indices.begin(), indices.end(), 0);
+      std::shuffle(indices.begin(), indices.end(), engine);
+      for (int j = 0; j < k; j++)
+        logits_cpu[indices[j] + vocab_size * b] = float(k - j);
+    }
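+    // Each row of logits_cpu now holds the values k..1 at k random positions, with zeros elsewhere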
+    // Set logits and get generated token
     auto logits_copy = logits_cpu;
     auto logits = params->p_device->WrapMemory<float>(logits_copy);
     generator->SetLogits(logits);
@@ -209,10 +235,22 @@ TEST(SamplingTests, RandomizedSamplingTopKCpu) {
     // Verify outputs match expected outputs
     for (int b = 0; b < batch_size; b++) {
       auto next_token = next_tokens[b];
-      auto next_token_score = logits_cpu[next_token + config.model.vocab_size * b];
-      EXPECT_GT(next_token_score, 10.0f);
+      auto next_token_score = logits_cpu[next_token + vocab_size * b];
+      logit_to_count[next_token_score]++;
+      EXPECT_GT(next_token_score, 0.0f);
     }
   }
+  // Calculate expected distribution of tokens by softmaxing given logits (integers 1 through k)
+  std::vector<float> expected_distributions(k);
+  for (int i = 0; i < k; i++)
+    expected_distributions[i] = float(i + 1);
+  SoftMax(expected_distributions, 1.0f);
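+  // For k = 5 this is approximately {0.012, 0.032, 0.086, 0.234, 0.636}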
+  // Check that the distribution of tokens generated by the model is close to the expected distribution
+  const int total_count = batch_size * num_iter;
+  for (auto& [logit, count] : logit_to_count) {
+    const float expected_distribution = expected_distributions[int(logit) - 1];
+    EXPECT_NEAR(count / float(total_count), expected_distribution, 0.1);
+  }
 }
 
 TEST(SamplingTests, RandomizedSamplingTopPAndKCpu) {
@@ -396,43 +434,65 @@ TEST(SamplingTests, RandomizedSamplingTopPCuda) {
 
 TEST(SamplingTests, RandomizedSamplingTopKCuda) {
   auto model = Generators::CreateModel(Generators::GetOrtEnv(), MODEL_PATH "hf-internal-testing/tiny-random-gpt2-fp32");
-  int batch_size = 5;
-  int k = 5;
-  std::vector<int32_t> input_ids{0, 1, 2, 3, 4};
+  const int batch_size = 5;
+  const int k = 5;
 
   Generators::Config config;
-  config.model.vocab_size = 32000;  // vocab size of llama
+  const int vocab_size = 17;  // use a small vocab size so the sampled token distribution is easy to verify
+  config.model.vocab_size = vocab_size;
 
+  // Create a generator
   auto params = Generators::CreateGeneratorParams(config);
   params->search.max_length = 10;
   params->search.do_sample = true;
   params->search.top_k = k;
   params->search.batch_size = batch_size;
   params->p_device = Generators::GetDeviceInterface(Generators::DeviceType::CUDA);
   params->device_type = Generators::DeviceType::CUDA;
-  auto logits_gpu = params->p_device->Allocate<float>(config.model.vocab_size * batch_size);
-  auto indices_buffer = params->p_device->Allocate<int>(config.model.vocab_size * batch_size);
 
+  // Create data structures for testing
   std::random_device rd;
   std::mt19937 engine(rd());
-  std::uniform_int_distribution<> dist(1, 25);
-  int num_iter = 100;
+  std::vector<int> indices(vocab_size);
+  const int num_iter = 100;
+  std::map<float, int> logit_to_count;
+
+  // Run test
   for (int i = 0; i < num_iter; i++) {
-    int num_large = dist(engine);
-    LaunchGeometricDecayKernel(logits_gpu.Span().data(), config.model.vocab_size, batch_size, num_large, 20.0f, params->cuda_stream);
-    LaunchFisherYatesKernel(logits_gpu.Span().data(), indices_buffer.Span().data(), config.model.vocab_size, batch_size, params->cuda_stream);
     auto generator = Generators::CreateGenerator(*model, *params);
+    Generators::DeviceSpan<float> logits_gpu = params->p_device->Allocate<float>(vocab_size * batch_size);
+    auto cpu_span = logits_gpu.CpuSpan();
+    // Shuffle integers 1 to k randomly into cpu_span
+    for (int b = 0; b < batch_size; b++) {
+      std::iota(indices.begin(), indices.end(), 0);
+      std::shuffle(indices.begin(), indices.end(), engine);
+      for (int j = 0; j < k; j++)
+        cpu_span[indices[j] + vocab_size * b] = float(k - j);
+    }
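+    // As in the CPU test, each row now holds k..1 at k random positions; the remaining
+    // entries are assumed to start at zero in the freshly allocated buffer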
+    // Copy logits onto device, set logits, and get generated token
+    logits_gpu.CopyCpuToDevice();
     generator->SetLogits(logits_gpu);
     generator->GenerateNextToken();
     auto next_tokens = generator->search_->GetNextTokens().CopyDeviceToCpu();
-    auto logits_cpu = logits_gpu.CopyDeviceToCpu();
     // Verify outputs match expected outputs
     for (int b = 0; b < batch_size; b++) {
       auto next_token = next_tokens[b];
-      auto next_token_score = logits_cpu[next_token + config.model.vocab_size * b];
-      EXPECT_GT(next_token_score, 10.0f);
+      auto next_token_score = cpu_span[next_token + vocab_size * b];
+      logit_to_count[next_token_score]++;
+      EXPECT_GT(next_token_score, 0.0f);
     }
   }
+  // Calculate expected distribution of tokens by softmaxing given logits (integers 1 through k)
+  std::vector<float> expected_distributions(k);
+  for (int i = 0; i < k; i++)
+    expected_distributions[i] = float(i + 1);
+  SoftMax(expected_distributions, 1.0f);
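+  // For k = 5 this is again approximately {0.012, 0.032, 0.086, 0.234, 0.636}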
+  const int total_count = batch_size * num_iter;
+  // Check that the distribution of tokens generated by the model is close to the expected distribution
+  for (auto& [logit, count] : logit_to_count) {
+    const float expected_distribution = expected_distributions[int(logit) - 1];
+    EXPECT_NEAR(count / float(total_count), expected_distribution, 0.1);
+  }
 }
 
 TEST(SamplingTests, RandomizedSamplingTopPAndKCuda) {