|
8 | 8 | #include <math.h> // for sqrt
|
9 | 9 | #include <algorithm> // for sort, min
|
10 | 10 | #include <string>
|
| 11 | +#include <sstream> |
11 | 12 |
|
12 | 13 | #include <unordered_map>
|
13 | 14 | #include <parallel_hashmap/phmap.h>
|
14 | 15 | #include <functional>
|
15 | 16 |
|
// COUNT_OF(x): number of elements in the built-in array `x`.
//   * `0[x]` is the same as `x[0]` for built-in arrays and pointers, but does not
//     compile for class types that merely overload operator[].
//   * The divisor is 1 when sizeof(x) is a whole multiple of the element size and
//     0 otherwise, so a size that cannot belong to a real array turns into a
//     division by zero instead of a silently wrong count.
#define COUNT_OF(x) ((sizeof(x)/sizeof(0[x])) / ((size_t)(!(sizeof(x) % sizeof(0[x])))))
// ARRAY_END(x): pointer one past the last element of the built-in array `x`.
// The argument is parenthesized so that an expression argument keeps its meaning.
#define ARRAY_END(x) ((x) + COUNT_OF(x))
| 19 | + |
16 | 20 | typedef std::unordered_map<std::string, int,
|
17 | 21 | std::function<size_t (const std::string &key)>> std_hashmap;
|
18 | 22 | typedef phmap::flat_hash_map<std::string, int,
|
@@ -240,7 +244,7 @@ double SpeedTest ( pfHash hash, uint32_t seed, const int trials, const int block
|
240 | 244 |
|
241 | 245 | double t;
|
242 | 246 |
|
243 |
| - if(blocksize < 100) |
| 247 | + if(blocksize <= TIMEHASH_SMALL_LEN_MAX) |
244 | 248 | {
|
245 | 249 | t = (double)timehash_small(hash,block,blocksize,itrial);
|
246 | 250 | }
|
@@ -305,6 +309,107 @@ double TinySpeedTest ( pfHash hash, int hashsize, int keysize, uint32_t seed, bo
|
305 | 309 | return cycles;
|
306 | 310 | }
|
307 | 311 |
|
// Prints the unweighted arithmetic mean of cph[minkey..maxkey] (both inclusive).
//
//   cph     cycles/hash measurements, indexed by key length
//   minkey  first index included in the average
//   maxkey  last index included in the average
//
// An empty, negative or out-of-bounds range is reported instead of being
// averaged: it would otherwise index past the end of `cph` or divide by a
// non-positive element count.
static void ReportAverage ( const std::vector<double>& cph, int minkey, int maxkey )
{
  if (minkey < 0 || maxkey < minkey || static_cast<size_t>(maxkey) >= cph.size()) {
    printf("Average       n/a cycles/hash (invalid key range %d..%d)\n", minkey, maxkey);
    return;
  }
  double sum = 0;
  for (int i = minkey; i <= maxkey; i++)
    sum += cph[i];
  sum /= (maxkey - minkey + 1);
  printf("Average %9.3f cycles/hash\n",sum);
}
| 320 | + |
// Prints the average of cph[minkey..maxkey] (both inclusive) where element i is
// weighted by weights[i], together with the share of the total weight
// (over all of `weights`) that falls inside the selected range.
//
//   cph      cycles/hash measurements, indexed by key length
//   weights  weight per key length; must have at least cph.size() entries
//   minkey   first index included in the average
//   maxkey   last index included in the average
//   name     dataset label used in the printed line
//
// Nothing is averaged, and a SKIP line is printed instead, when
//   * the range is empty, negative or reaches past the end of `cph`,
//   * `weights` is shorter than `cph`,
//   * the weights inside the range do not add up to a positive value
//     (the average would be a division by zero).
static void ReportWeighted ( const std::vector<double>& cph, const std::vector<double>& weights, int minkey, int maxkey, const char *name )
{
  // Checked at run time (not with assert) so that release builds are protected too.
  if (minkey < 0 || maxkey < minkey || static_cast<size_t>(maxkey) >= cph.size()) {
    printf("Average, weighted by key length, SKIP %s dataset, invalid key range %d..%d\n",
           name, minkey, maxkey);
    return;
  }
  if (weights.size() < cph.size()) {
    printf("Average, weighted by key length, SKIP %s dataset, need %zu more weights\n",
           name, cph.size() - weights.size());
    return;
  }
  const size_t first = static_cast<size_t>(minkey);
  const size_t last = static_cast<size_t>(maxkey);
  double tot = 0.0, use = 0.0, sum = 0.0;
  // Single ascending pass: `tot` covers every weight, `use` and `sum` only the
  // selected range.
  for (size_t i = 0; i < weights.size(); i++) {
    if (first <= i && i <= last) {
      sum += weights[i] * cph[i];
      use += weights[i];
    }
    tot += weights[i];
  }
  if (!(use > 0.0)) {
    printf("Average, weighted by key length, SKIP %s dataset, no weight within key range %d..%d\n",
           name, minkey, maxkey);
    return;
  }
  printf("Average, weighted by key length freq. %9.3f cycles/hash (using %.1f%% of %s dataset)\n",
         sum / use, 100. * use / tot, name);
}
| 342 | + |
| 343 | +// These are lengths of top 7,073,200 domain names from Tranco. The list represents "popular" domain |
| 344 | +// names. The dataset was downloaded from https://tranco-list.eu/list/LJ5W4/1000000 on 2024-Sep-05 |
| 345 | +// SHA256(tranco_LJ5W4.csv) = 4593f2a162697946f36ef7bbe7c8b434eec42e0e93c4298517c4a3966b08c054 |
| 346 | +// |
| 347 | +// Victor Le Pochat, Tom Van Goethem, Samaneh Tajalizadehkhoob, Maciej Korczyński, and Wouter |
| 348 | +// Joosen. 2019. "Tranco: A Research-Oriented Top Sites Ranking Hardened Against Manipulation", |
| 349 | +// Proceedings of the 26th Annual Network and Distributed System Security Symposium (NDSS 2019). |
| 350 | +// https://doi.org/10.14722/ndss.2019.23386 |
| 351 | +// Entry i is used by ReportTinySpeedTest as the weight of key length i (presumably the number of names of that length). |
| 352 | +static const unsigned TrancoDNSNameLengths[] = { 0, 0, 5, 0, 326, 5568, 41632, 88175, 151138, 253649, |
| 353 | + 386024, 416786, 458718, 482490, 491891, 473417, 450606, 413517, 371676, 327361, 288868, 251641, |
| 354 | + 213514, 178542, 159986, 132611, 113222, 101498, 82455, 67296, 67906, 56843, 53731, 49744, 36404, |
| 355 | + 32346, 30329, 26978, 24359, 24345, 19161, 16914, 16370, 13708, 13714, 10832, 13548, 9635, 8125, |
| 356 | + 15536, 6273, 8207, 7490, 5196, 7330, 6202, 3801, 4455, 3756, 3709, 4142, 3989, 3593, 4783, 5052, |
| 357 | + 1403, 1580, 2072, 1998, 1420, 1836, 1872, 1135, 2664, 1172, 837, 998, 1063, 685, 566, 2020, 627, |
| 358 | + 2146, 1144, 635, 618, 569, 756, 411, 361, 362, 1138, 218, 278, 182, 185, 175, 220, 3205, 143, 353, |
| 359 | + 131, 132, 199, 134, 139, 130, 168, 135, 169, 630, 155, 137, 129, 229, 154, 166, 205, 204, 203, 208, |
| 360 | + 201, 211, 141, 157, 147, 172, 183, 134, 155, 123, 159, 148, 165, 145, 143, 112, 111, 112, 115, 128, |
| 361 | + 120, 116, 119, 137, 123, 106, 118, 105, 125, 126, 106, 99, 124, 102, 94, 95, 113, 105, 103, 118, 81, |
| 362 | + 103, 86, 78, 80, 82, 70, 72, 74, 52, 58, 71, 46, 67, 65, 70, 74, 75, 66, 59, 81, 110, 97, 107, 116, |
| 363 | + 109, 72, 67, 89, 82, 79, 73, 82, 83, 73, 71, 89, 98, 103, 90, 118, 120, 67, 63, 50, 71, 57, 67, 64, |
| 364 | + 54, 55, 65, 53, 73, 65, 63, 60, 83, 80, 61, 87, 82, 55, 74, 66, 38, 41, 22, 47, 27, 36, 30, 38, 33, |
| 365 | + 46, 33, 36, 58, 50, 61, 71, 99, 46, 50, 54, 38, 17, 15, 4, 3, 0, 0, 116, 0, 0 }; |
| 366 | + |
| 367 | +// These are lengths of 1,000,000 calls to umash_full() during the batch hash table phase. |
| 368 | +// The lengths are arguably off by one, since NUL terminators are included in the hashed data. |
| 369 | +// |
| 370 | +// All the lengths are clamped to 256 bytes per TIMEHASH_SMALL_LEN_MAX. |
| 371 | +// The last bin UmashStartupLengths[256] is essentially the long tail that is never used. |
| 372 | +// |
| 373 | +// startup-1M.2020-08-28.trace.bz2 @ https://github.com/backtrace-labs/umash/wiki/Execution-traces |
| 374 | +// SHA256(trace.bz2) = 02bae7f0e07880bf24fdd67b6d5fc2a675c6ca05b534081925a16f06c11659c0 |
| 375 | +// Entry i is used by ReportTinySpeedTest as the weight of key length i (presumably the number of calls with that length). |
| 376 | +static const unsigned UmashStartupLengths[] = { 0, 7, 51, 396, 1312, 3110, 5616, 7887, 11145, 68172, |
| 377 | + 14618, 16670, 9502, 8275, 7444, 8088, 105451, 246, 100, 117, 116, 487, 367, 179, 293, 58, 56, 124, |
| 378 | + 191, 340, 323, 333, 303, 274, 238, 202, 246, 409961, 235, 10119, 239, 171, 128, 100, 5217, 51, 62, |
| 379 | + 53, 42, 69, 63, 89, 38, 52, 102, 84, 90, 75, 61, 90, 55, 57, 60, 71, 106, 92520, 54, 57, 101, 316, |
| 380 | + 961, 1873, 1714, 290, 88, 185, 600, 1038, 1762, 3228, 3174, 284, 266, 292, 752, 1381, 1331, 145, |
| 381 | + 161, 177, 1517, 304, 176, 9464, 342, 1809, 286, 962, 116, 390, 383, 244, 50, 54, 46, 88, 191, 74, |
| 382 | + 54, 91, 110, 11347, 4310, 5021, 51, 189, 902, 60, 3476, 44543, 275, 5960, 58, 1705, 84, 15, 34, 68, |
| 383 | + 1113, 43, 55, 27, 126, 15, 33, 1512, 14, 359, 13, 43, 7604, 78108, 43, 27, 7, 23, 140, 5, 3, 0, 13, |
| 384 | + 6, 8, 33, 54, 3, 0, 0, 13, 10, 13, 0, 6, 5, 11, 0, 11, 25, 11, 9, 0, 12, 13, 0, 0, 41, 3, 4, 8, 49, |
| 385 | + 29, 25, 17, 10, 3, 29, 7, 9, 2, 20, 17, 17, 5, 35, 3, 5, 0, 13, 0, 149, 17, 6, 8, 3, 11, 17, 0, 1, |
| 386 | + 780, 0, 0, 14, 29, 10, 3, 14, 20, 9, 12, 29, 11, 6, 10, 6, 12, 0, 10, 7, 22, 13, 6, 10, 14, 167, 0, |
| 387 | + 3, 0, 11, 7, 5, 9, 35, 4, 5, 7, 2, 14, 6, 7, 2, 16, 5, 6, 8, 0, 4, 1022 }; |
| 388 | + |
| 389 | +// Weighted average exist under assumption that hash speed does not depend on input, |
| 390 | +// which is not true due to multiplication instruction having certain amount of variance. |
| 391 | +void ReportTinySpeedTest ( const std::vector<double>& cycles_per_hash, int minkey, int maxkey ) |
| 392 | +{ |
| 393 | + ReportAverage(cycles_per_hash, minkey, maxkey); |
| 394 | + |
| 395 | + std::vector<double> w(TrancoDNSNameLengths, ARRAY_END(TrancoDNSNameLengths)); |
| 396 | + ReportWeighted(cycles_per_hash, w, minkey, maxkey, "top-7m Tranco DNS names"); |
| 397 | + w.clear(); |
| 398 | + |
| 399 | + w.insert(w.begin(), UmashStartupLengths, ARRAY_END(UmashStartupLengths)); |
| 400 | + ReportWeighted(cycles_per_hash, w, minkey, maxkey, "startup-1M UMASH trace"); |
| 401 | + w.clear(); |
| 402 | + |
| 403 | + if (const char *ew = getenv("SMHASHER_SMALLKEY_WEIGHTS")) |
| 404 | + { |
| 405 | + std::istringstream ssws(ew); |
| 406 | + for (double flt; ssws >> flt; ) |
| 407 | + w.push_back(flt); |
| 408 | + ReportWeighted(cycles_per_hash, w, minkey, maxkey, "${SMHASHER_SMALLKEY_WEIGHTS}"); |
| 409 | + w.clear(); |
| 410 | + } |
| 411 | +} |
| 412 | + |
308 | 413 | double HashMapSpeedTest ( pfHash pfhash, const int hashbits,
|
309 | 414 | std::vector<std::string> words,
|
310 | 415 | const uint32_t seed, const int trials, bool verbose )
|
@@ -453,4 +558,3 @@ double HashMapSpeedTest ( pfHash pfhash, const int hashbits,
|
453 | 558 | return mean;
|
454 | 559 | }
|
455 | 560 |
|
456 |
| -//----------------------------------------------------------------------------- |
|
0 commit comments