Skip to content

Commit f6177e7

Browse files
committed
Add weighted average to SpeedSmall report
Weights derived from two datasets are hard-coded: DNS domain name lengths and UMASH traces. Custom weights can be passed via the SMHASHER_SMALLKEY_WEIGHTS environment variable. This partly addresses the question at rurban#113: what is the "real" average cycles/hash value for a given hash function? We can't know, but we can estimate it better if we assume that the function's timing does not depend on the input (which is not true for hashes based on multiplication) and that we know the distribution of key lengths in advance (it may be roughly known for certain classes of inputs, but the distribution varies measurably across classes).
1 parent 139a0f8 commit f6177e7

File tree

3 files changed

+115
-8
lines changed

3 files changed

+115
-8
lines changed

SpeedTest.cpp

+106-2
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,15 @@
88
#include <math.h> // for sqrt
99
#include <algorithm> // for sort, min
1010
#include <string>
11+
#include <sstream>
1112

1213
#include <unordered_map>
1314
#include <parallel_hashmap/phmap.h>
1415
#include <functional>
1516

17+
// COUNT_OF(x): number of elements in the built-in array x.
// The second factor is 1 when sizeof(x) is a whole multiple of the element
// size and 0 otherwise, so some accidental uses on a pointer turn into a
// constant division by zero that the compiler can diagnose. The 0[x] spelling
// only works for built-in arrays/pointers, not for classes with operator[].
#define COUNT_OF(x) ((sizeof(x)/sizeof(0[x])) / ((size_t)(!(sizeof(x) % sizeof(0[x])))))
// ARRAY_END(x): one-past-the-end pointer of the built-in array x.
// The argument is parenthesized so that any expression can be passed safely.
#define ARRAY_END(x) ((x) + COUNT_OF(x))
19+
1620
typedef std::unordered_map<std::string, int,
1721
std::function<size_t (const std::string &key)>> std_hashmap;
1822
typedef phmap::flat_hash_map<std::string, int,
@@ -240,7 +244,7 @@ double SpeedTest ( pfHash hash, uint32_t seed, const int trials, const int block
240244

241245
double t;
242246

243-
if(blocksize < 100)
247+
if(blocksize <= TIMEHASH_SMALL_LEN_MAX)
244248
{
245249
t = (double)timehash_small(hash,block,blocksize,itrial);
246250
}
@@ -305,6 +309,107 @@ double TinySpeedTest ( pfHash hash, int hashsize, int keysize, uint32_t seed, bo
305309
return cycles;
306310
}
307311

312+
// Prints the unweighted mean of cph[minkey..maxkey] (both ends inclusive),
// i.e. the plain average of cycles/hash over the tested key lengths.
static void ReportAverage ( const std::vector<double>& cph, int minkey, int maxkey )
{
  const int nkeys = maxkey - minkey + 1;
  double total = 0;
  int len = minkey;
  while (len <= maxkey) {
    total += cph[len];
    ++len;
  }
  printf("Average %9.3f cycles/hash\n", total / nkeys);
}
320+
321+
// Prints the average of cph[minkey..maxkey] weighted by key-length frequency,
// plus the share of the dataset's total weight that lies inside that range.
//
//   cph     - cycles/hash indexed by key length; only [minkey, maxkey] is read
//   weights - weights[i] is the weight of key length i; needs at least
//             cph.size() entries, otherwise the dataset is skipped
//   minkey, maxkey - inclusive key-length range, 0 <= minkey <= maxkey < cph.size()
//   name    - dataset label used in the printed line
static void ReportWeighted ( const std::vector<double>& cph, const std::vector<double>& weights, int minkey, int maxkey, const char *name )
{
  // cph[maxkey] is read below, so maxkey has to be a valid index into cph.
  assert(0 <= minkey && minkey <= maxkey && static_cast<size_t>(maxkey) < cph.size());
  if (weights.size() < cph.size()) {
    printf("Average, weighted by key length, SKIP %s dataset, need %zu more weights\n",
           name, cph.size() - weights.size());
    return;
  }

  // Total weight of the whole dataset, including lengths outside the range.
  double tot = 0.0;
  for (size_t i = 0; i < weights.size(); i++)
    tot += weights[i];

  // Weight and weighted cycle count of the key lengths that were measured.
  double use = 0.0, sum = 0.0;
  for (int i = minkey; i <= maxkey; i++) {
    sum += weights[i] * cph[i];
    use += weights[i];
  }

  // Without any weight inside the range the average would be 0/0.
  if (!(use > 0.0)) {
    printf("Average, weighted by key length, SKIP %s dataset, no weight within key lengths %d..%d\n",
           name, minkey, maxkey);
    return;
  }

  printf("Average, weighted by key length freq. %9.3f cycles/hash (using %.1f%% of %s dataset)\n",
         sum / use, 100. * use / tot, name);
}
342+
343+
// These are lengths of top 7,073,200 domain names from Tranco. The list represents "popular" domain
344+
// names. The dataset was downloaded from https://tranco-list.eu/list/LJ5W4/1000000 on 2024-Sep-05
345+
// SHA256(tranco_LJ5W4.csv) = 4593f2a162697946f36ef7bbe7c8b434eec42e0e93c4298517c4a3966b08c054
346+
//
347+
// Victor Le Pochat, Tom Van Goethem, Samaneh Tajalizadehkhoob, Maciej Korczyński, and Wouter
348+
// Joosen. 2019. "Tranco: A Research-Oriented Top Sites Ranking Hardened Against Manipulation",
349+
// Proceedings of the 26th Annual Network and Distributed System Security Symposium (NDSS 2019).
350+
// https://doi.org/10.14722/ndss.2019.23386
351+
//
352+
static const unsigned TrancoDNSNameLengths[] = { 0, 0, 5, 0, 326, 5568, 41632, 88175, 151138, 253649,
353+
386024, 416786, 458718, 482490, 491891, 473417, 450606, 413517, 371676, 327361, 288868, 251641,
354+
213514, 178542, 159986, 132611, 113222, 101498, 82455, 67296, 67906, 56843, 53731, 49744, 36404,
355+
32346, 30329, 26978, 24359, 24345, 19161, 16914, 16370, 13708, 13714, 10832, 13548, 9635, 8125,
356+
15536, 6273, 8207, 7490, 5196, 7330, 6202, 3801, 4455, 3756, 3709, 4142, 3989, 3593, 4783, 5052,
357+
1403, 1580, 2072, 1998, 1420, 1836, 1872, 1135, 2664, 1172, 837, 998, 1063, 685, 566, 2020, 627,
358+
2146, 1144, 635, 618, 569, 756, 411, 361, 362, 1138, 218, 278, 182, 185, 175, 220, 3205, 143, 353,
359+
131, 132, 199, 134, 139, 130, 168, 135, 169, 630, 155, 137, 129, 229, 154, 166, 205, 204, 203, 208,
360+
201, 211, 141, 157, 147, 172, 183, 134, 155, 123, 159, 148, 165, 145, 143, 112, 111, 112, 115, 128,
361+
120, 116, 119, 137, 123, 106, 118, 105, 125, 126, 106, 99, 124, 102, 94, 95, 113, 105, 103, 118, 81,
362+
103, 86, 78, 80, 82, 70, 72, 74, 52, 58, 71, 46, 67, 65, 70, 74, 75, 66, 59, 81, 110, 97, 107, 116,
363+
109, 72, 67, 89, 82, 79, 73, 82, 83, 73, 71, 89, 98, 103, 90, 118, 120, 67, 63, 50, 71, 57, 67, 64,
364+
54, 55, 65, 53, 73, 65, 63, 60, 83, 80, 61, 87, 82, 55, 74, 66, 38, 41, 22, 47, 27, 36, 30, 38, 33,
365+
46, 33, 36, 58, 50, 61, 71, 99, 46, 50, 54, 38, 17, 15, 4, 3, 0, 0, 116, 0, 0 };
366+
367+
// These are lengths of 1,000,000 calls to umash_full() during the batch hash table phase.
368+
// The lengths are arguably off by one, since NUL terminators are included in the hashed data.
369+
//
370+
// All the lengths are clamped to 256 bytes per TIMEHASH_SMALL_LEN_MAX.
371+
// The last bin UmashStartupLengths[256] is essentially the long tail that is never used.
372+
//
373+
// startup-1M.2020-08-28.trace.bz2 @ https://github.com/backtrace-labs/umash/wiki/Execution-traces
374+
// SHA256(trace.bz2) = 02bae7f0e07880bf24fdd67b6d5fc2a675c6ca05b534081925a16f06c11659c0
375+
//
376+
static const unsigned UmashStartupLengths[] = { 0, 7, 51, 396, 1312, 3110, 5616, 7887, 11145, 68172,
377+
14618, 16670, 9502, 8275, 7444, 8088, 105451, 246, 100, 117, 116, 487, 367, 179, 293, 58, 56, 124,
378+
191, 340, 323, 333, 303, 274, 238, 202, 246, 409961, 235, 10119, 239, 171, 128, 100, 5217, 51, 62,
379+
53, 42, 69, 63, 89, 38, 52, 102, 84, 90, 75, 61, 90, 55, 57, 60, 71, 106, 92520, 54, 57, 101, 316,
380+
961, 1873, 1714, 290, 88, 185, 600, 1038, 1762, 3228, 3174, 284, 266, 292, 752, 1381, 1331, 145,
381+
161, 177, 1517, 304, 176, 9464, 342, 1809, 286, 962, 116, 390, 383, 244, 50, 54, 46, 88, 191, 74,
382+
54, 91, 110, 11347, 4310, 5021, 51, 189, 902, 60, 3476, 44543, 275, 5960, 58, 1705, 84, 15, 34, 68,
383+
1113, 43, 55, 27, 126, 15, 33, 1512, 14, 359, 13, 43, 7604, 78108, 43, 27, 7, 23, 140, 5, 3, 0, 13,
384+
6, 8, 33, 54, 3, 0, 0, 13, 10, 13, 0, 6, 5, 11, 0, 11, 25, 11, 9, 0, 12, 13, 0, 0, 41, 3, 4, 8, 49,
385+
29, 25, 17, 10, 3, 29, 7, 9, 2, 20, 17, 17, 5, 35, 3, 5, 0, 13, 0, 149, 17, 6, 8, 3, 11, 17, 0, 1,
386+
780, 0, 0, 14, 29, 10, 3, 14, 20, 9, 12, 29, 11, 6, 10, 6, 12, 0, 10, 7, 22, 13, 6, 10, 14, 167, 0,
387+
3, 0, 11, 7, 5, 9, 35, 4, 5, 7, 2, 14, 6, 7, 2, 16, 5, 6, 8, 0, 4, 1022 };
388+
389+
// Weighted average exist under assumption that hash speed does not depend on input,
390+
// which is not true due to multiplication instruction having certain amount of variance.
391+
void ReportTinySpeedTest ( const std::vector<double>& cycles_per_hash, int minkey, int maxkey )
392+
{
393+
ReportAverage(cycles_per_hash, minkey, maxkey);
394+
395+
std::vector<double> w(TrancoDNSNameLengths, ARRAY_END(TrancoDNSNameLengths));
396+
ReportWeighted(cycles_per_hash, w, minkey, maxkey, "top-7m Tranco DNS names");
397+
w.clear();
398+
399+
w.insert(w.begin(), UmashStartupLengths, ARRAY_END(UmashStartupLengths));
400+
ReportWeighted(cycles_per_hash, w, minkey, maxkey, "startup-1M UMASH trace");
401+
w.clear();
402+
403+
if (const char *ew = getenv("SMHASHER_SMALLKEY_WEIGHTS"))
404+
{
405+
std::istringstream ssws(ew);
406+
for (double flt; ssws >> flt; )
407+
w.push_back(flt);
408+
ReportWeighted(cycles_per_hash, w, minkey, maxkey, "${SMHASHER_SMALLKEY_WEIGHTS}");
409+
w.clear();
410+
}
411+
}
412+
308413
double HashMapSpeedTest ( pfHash pfhash, const int hashbits,
309414
std::vector<std::string> words,
310415
const uint32_t seed, const int trials, bool verbose )
@@ -453,4 +558,3 @@ double HashMapSpeedTest ( pfHash pfhash, const int hashbits,
453558
return mean;
454559
}
455560

456-
//-----------------------------------------------------------------------------

SpeedTest.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22

33
#include "Types.h"
44

5+
// Largest block size (in bytes) that SpeedTest() times via timehash_small().
// main.cpp also passes it to getenvlong() when reading SMHASHER_SMALLKEY_MIN /
// SMHASHER_SMALLKEY_MAX -- presumably as the upper bound; confirm in getenvlong().
constexpr int TIMEHASH_SMALL_LEN_MAX = 255;
6+
57
void BulkSpeedTest ( pfHash hash, uint32_t seed );
68
double TinySpeedTest ( pfHash hash, int hashsize, int keysize, uint32_t seed, bool verbose );
79
double HashMapSpeedTest ( pfHash pfhash, int hashbits, std::vector<std::string> words,
810
const uint32_t seed, const int trials, bool verbose );
9-
//-----------------------------------------------------------------------------
11+
// Prints the plain average of cycles_per_hash[minkey..maxkey] followed by
// averages weighted by key-length frequency (built-in datasets, plus the
// weights from the SMHASHER_SMALLKEY_WEIGHTS environment variable when set).
// cycles_per_hash is indexed by key length.
void ReportTinySpeedTest ( const std::vector<double>& cycles_per_hash, int minkey, int maxkey );

main.cpp

+6-5
Original file line numberDiff line numberDiff line change
@@ -1027,7 +1027,6 @@ void test ( hashfunc<hashtype> hash, HashInfo* info )
10271027

10281028
if(g_testSpeedBulk || g_testSpeedSmall || g_testAll)
10291029
{
1030-
double sum = 0.0;
10311030
printf("[[[ Speed Tests ]]]\n\n");
10321031
if (timer_counts_ns())
10331032
printf("WARNING: no cycle counter, cycle == 1ns\n");
@@ -1053,13 +1052,15 @@ void test ( hashfunc<hashtype> hash, HashInfo* info )
10531052
const int dflmax = g_testExtra ? 64 : 32;
10541053
const int minkey = getenvlong("SMHASHER_SMALLKEY_MIN", 1, 1, TIMEHASH_SMALL_LEN_MAX);
10551054
const int maxkey = getenvlong("SMHASHER_SMALLKEY_MAX", minkey, dflmax, TIMEHASH_SMALL_LEN_MAX);
1056-
for(int i = minkey; i <= maxkey; i++)
1055+
std::vector<double> cph(maxkey+1, NAN);
1056+
for(int i = minkey, g_speed = 0.0; i <= maxkey; i++)
10571057
{
10581058
volatile int j = i;
1059-
sum += TinySpeedTest(hashfunc<hashtype>(info->hash),sizeof(hashtype),j,info->verification,true);
1059+
cph[j] = TinySpeedTest(hashfunc<hashtype>(info->hash),sizeof(hashtype),j,info->verification,true);
1060+
g_speed += cph[j];
10601061
}
1061-
g_speed = sum = sum / (maxkey - minkey + 1);
1062-
printf("Average %6.3f cycles/hash\n",sum);
1062+
g_speed /= (maxkey - minkey + 1);
1063+
ReportTinySpeedTest(cph, minkey, maxkey);
10631064
printf("\n");
10641065
fflush(NULL);
10651066
}

0 commit comments

Comments
 (0)