hashWords_phf_cfd.nasm
global hashWords
;; perfect hashing algorithm using the perfect hashfunction
;; right aligned codes from before counting
;; 5 arrays: {codes, index, links} are aligned elements. hashtable is 65780 dwords with indexes into the arrays, to fit the perfect hashfunction.
;; hashtable is cleared to -1 to account for the zero indexes being valid. takes .93µs
;; input words are needed for hashtable lookup; codes are only used once for the hashfunction. overwrite input codes with first-unique codes for the count.
;; 5th array is for first-unique words for remaking the codes after the count and ranking.
;; start of memory areas assumed to be 64-byte aligned. hashtable needs to be writeable to 65792nd dword.
; total time for 10000 input words: 6.33µs expected, 12.2µs measured
; (again, presumably gather/scatter being slow, though this time hashtable is l2-sized)
; 4 inputs: rdi = codes, rsi = hashspace, rdx = linkspace, ecx = n_words
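;; comment-only sketch of the scalar scheme this routine vectorises (illustrative,
;; not part of the build; names are descriptive only):
;;   h            = phf(codes[i])   ; collision-free rank of the letter set
;;   links[i]     = hashtable[h]    ; -1 if the code is new, else the previous holder's index
;;   hashtable[h] = i               ; newest index for this code takes the slot
;;   if links[i] == -1: append codes[i] to the first-unique output
;; anagrams share a code and therefore a hash, so duplicates end up chained through links[].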
section .data
align 64
dwid: dd 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
bit1vals: dw 0,0,0,0,0,0,10626,19481,26796,32781,37626,41502,44562,46942,48762,50127,51128,51843,52338,52668,52878,53004,53074,53109,53124,53129,53130,0,0,0,0,0
bit2vals: dw 0,0,0,0,0,0,1771,3311,4641,5781,6750,7566,8246,8806,9261,9625,9911,10131,10296,10416,10500,10556,10591,10611,10621,10625,10626,0,0,0,0,0
bit3vals: dw 0,0,0,0,0,0,231,441,631,802,955,1091,1211,1316,1407,1485,1551,1606,1651,1687,1715,1736,1751,1761,1767,1770,1771,0,0,0,0,0
bit4vals: dw 0,0,0,0,0,0,21,41,60,78,95,111,126,140,153,165,176,186,195,203,210,216,221,225,228,230,231,0,0,0,0,0
bit5vals: dw 0,0,0,0,0,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,0,0,0,0 ;; adjusted for A=1 (empty first bit means all pcnt(x-1) has 1 more)
evencomb: dq 0x5555555555555555
one: dd 1
thirtyone: dd 31
sixteen: dd 16
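;; note: the bitNvals entries appear to be cumulative binomial coefficients
;; (e.g. bit1vals[6] = C(24,4) = 10626, bit2vals[6] = C(23,3) = 1771, bit3vals[6] = C(22,2) = 231),
;; so the hash below is effectively the combinatorial rank of the 5 set letter bits,
;; giving every distinct 5-letter set its own slot in [0, C(26,5)) = [0, 65780).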
section .text
align 16
hashWords:
lea r10, [rdx + rcx * 4] ; linkspace in r10
lea rdi, [rdi + rcx * 4] ; adjust pointers for end-referencing negative indexes
vmovdqa32 zmm21, [bit1vals]
vmovdqa32 zmm22, [bit2vals]
vmovdqa32 zmm23, [bit3vals]
vmovdqa32 zmm24, [bit4vals]
vmovdqa32 zmm25, [bit5vals]
vmovdqa32 zmm10, [dwid]
vpbroadcastd zmm20, [one]
vpbroadcastd zmm31, [thirtyone]
vpbroadcastd zmm26, [sixteen]
vpternlogd zmm27, zmm27, zmm27, 0xff ; all1s
;; first clear(set) the hashmemory
mov edx, 65760 ; covers top block and a bit more. make sure 65792 is writeable.
.clearloop:
vmovdqa32 [rsi + rdx * 4 + 64], zmm27 ; set hashspace to all1s
vmovdqa32 [rsi + rdx * 4], zmm27 ; set hashspace to all1s
sub edx, 32
jns .clearloop ; loop should manage 1 write/cycle
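;; the two stores per iteration clear dwords [edx, edx+32); starting at 65760 and stepping
;; down by 32 covers dwords 0..65791, i.e. the whole 65780-slot table plus the padding above it.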
kxorq k0, k0, k0
knotq k0, k0 ; k0 isn't all 1s until you set it. for the gather/scatter ops.
kmovq k7, [evencomb] ; the comb mask actually isn't required since upper-half of dwords will always be zero after vpopcntd
; and selecting 0 from these tables will always give 0 in this specific case. but I left it in.
neg rcx ; negative read index
mov rdx, rcx ; duplicate for the also-negative write index to reference same pointer for codes
.loop:
vmovdqu32 zmm5, [rdi + rcx * 4] ; get codes
vplzcntd zmm3, zmm5 ; lzcnt gives different numbers but still useable
vpermw zmm4{k7z}, zmm3, zmm25 ; bit 5 values from 27 to 6 (0,0,0,0),0,1,2,3,4,5,6,7....20,21,(0,0,0,0,0,0) (w)(r2l)
vpsubd zmm2, zmm5, zmm20 ; zmm20 = 1s (dw)
vpandd zmm0, zmm5, zmm2 ; unmodified codes remains in zmm5
vpopcntd zmm3, zmm2 ; using without removing the extra bits just increases popcnt return by 4
vpermw zmm3{k7z}, zmm3, zmm21 ; bit 1 values from 4 to 25 in zmm21, k7=0x55555555
vpaddd zmm4, zmm4, zmm3
vpsubd zmm2, zmm0, zmm20 ; zmm20 = 1s
vpandd zmm0, zmm0, zmm2
vpopcntd zmm3, zmm2 ; using without removing the extra bits just increases popcnt return by 3
vpermw zmm3{k7z}, zmm3, zmm22 ; bit 2 values from 4 to 25
vpaddd zmm4, zmm4, zmm3
vpsubd zmm2, zmm0, zmm20 ; zmm20 = 1s
vpandd zmm0, zmm0, zmm2
vpopcntd zmm3, zmm2 ; using without removing the extra bits just increases popcnt return by 2
vpermw zmm3{k7z}, zmm3, zmm23 ; bit 3 values from 4 to 25
vpaddd zmm4, zmm4, zmm3
vpsubd zmm2, zmm0, zmm20 ; zmm20 = 1s
vpopcntd zmm3, zmm2 ; using without removing the extra bits just increases popcnt return by 1
vpermw zmm3{k7z}, zmm3, zmm24 ; bit 4 values from 4 to 25
vpaddd zmm4, zmm4, zmm3 ; latency 14c 9p5 1p0 11p05, 21total µops
;; we now have a perfect hash of the codes, that will prevent the need for word-checking and trees on each hash entry
;; we need to build the chain directly. normally, the link elements would get the current hashtable entry, and replace it with their own index.
;; if there is a conflict, the rightmost entry gets the current hashtable contents, but the leftmost conflicting entry gets its index in the hashtable.
;; so the link entry for each conflicting element needs to be the index of the conflicting element to its right.
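;; comment-only sketch of the fix-up below (illustrative): vpconflictd flags lanes whose
;; hash already appeared in a lower lane; for those lanes the gathered table value is
;; replaced by the index of the nearest such lower lane, so the resulting links look
;; exactly as if the 16 words had been inserted into the hashtable one at a time.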
mov rax, rcx ; keep a copy first because we still need it for a write yet
add rcx, 16 ; increment to see if there's overread before continuing
jns .lastfew
kmovw k1, k0 ; k0=0xffffffffffffffff
vpgatherdd zmm0{k1}, [rsi + zmm4 * 4] ; read current hashtable contents
kmovw k1, k0 ; k0=0xffffffffffffffff
vpscatterdd [rsi + zmm4 * 4]{k1}, zmm10 ; write indexes of current 16 codes/words to hashtable, leaving the leftmost in case of conflicts
vpconflictd zmm1, zmm4 ; detect identical hashes in table (=anagrams)
vptestmd k1, zmm1, zmm1 ; indicates conflicting elements that need their link adjusted
vplzcntd zmm1, zmm1 ; find leftmost bit (nearest conflict to right) in conflicted elements
vpsubd zmm1, zmm31, zmm1 ; zmm31 has 31s (a 1-bit in the rightmost position gives 31 - we need that to select element 0)
vpermd zmm0{k1}, zmm1, zmm10 ; replace links/blanks from hashtable with indexes of the conflicting elements from this table
vmovdqa32 [r10 + rax * 4], zmm0 ; write links to the link array, using the pre-increment copy of rcx in rax
vpcmpeqd k1, zmm0, zmm27 ; zmm27=-1s (dw) mask of new entries. also excludes in-register conflicted elements.
vpcompressd zmm5{k1}, zmm5 ; first-unique codes. this is actually what we're hashing *for*.
vmovdqu32 [rdi + rdx * 4], zmm5 ; split writes for zen4
kmovw eax, k1
popcnt eax, eax ; count how many first-uniques there were
vpaddd zmm10, zmm10, zmm26 ; zmm26 has 16's: increment the indexes
add rdx, rax ; add the first-unique total to the first-unique write index
jmp .loop ; 17p5+(17p5) 6p0+(11p0) 14p05+(9p05)
; 36p5 + 17p0 + 23p05. min loop is 38c. 10000 input words is 625 loops, so 5.40µs
;zen4: conflictd is 2 µops 1fp01,1fp12
; permdi2d is 1fp12 x 5
; scatterdd is 89µops 2fp01,4fp12,8fp123,4fp23,34fp45 (52 + 37 write ops?)
; gatherdd is 81µops 1fp01,7fp0123,3fp12,9fp123,2fp23,18fp45 (40 + 41read ops?)
; vpadd,and,andn: 1fp0123 x 21
; vpopcntd: 1fp01 x 5
; vplzcntd: 1fp01 x 1
; compress: 1fp01,1fp12 x 1
; movdqa/u32: (write) 1fp45+1memwr
; kmov: 1fp45 (k-k:1fp23) 1 of each
; testnm: 1fp01 x 2
; assuming core can do all of 0,1,2,3,4,5 each cycle...
; memory reads and writes don't go via the ports, but are still counted. fp45 is write address calc
; zen4 compress is weird - more mem-ops than scatter if direct→mem...
; 28p0123 17p123 7p23 13p01 7p12 57p45 72tp0123 ... p45 limits at 28.5c? 4.05µs?
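; (working that estimate through: 625 loops x 38c ≈ 23750 cycles, which is the quoted
; 5.40µs at the ~4.4GHz clock the figures imply)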
.lastfew:
mov r11d, 0xffff
shrx r11d, r11d, ecx ; make mask of valid (non-overread) columns
kmovw k2, r11d
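;; ecx here is 16 minus the leftover word count, so the shift leaves exactly that many low
;; bits set; the gather/scatter, conflict fix-up and link store below are all masked with
;; k2 (directly or via k1), so the junk lanes never reach the hashtable or the link array.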
kmovw k1, k2
vpgatherdd zmm0{k1}, [rsi + zmm4 * 4] ; read current hashtable contents
kmovw k1, k2
vpscatterdd [rsi + zmm4 * 4]{k1}, zmm10 ; write indexes to hashtable, leaving the leftmost in case of conflicts
vpconflictd zmm1{k2}, zmm4 ; detect identical hashes in table
vptestmd k1{k2}, zmm1, zmm1 ; indicates conflicting elements that need their link overwritten with the next right conflict
vplzcntd zmm1{k1}{z}, zmm1 ; find leftmost bit in conflicted elements. the conflictd is still valid because the junk is leftward
vpsubd zmm1{k1}{z}, zmm31, zmm1 ; zmm31 has 31s (a 1-bit in the rightmost position gives 31 - we need that to select element 0)
vpermd zmm0{k1}, zmm1, zmm10 ; replace link with index of the next right conflicting element from this group
vmovdqa32 [r10 + rax * 4]{k2}, zmm0 ; write links to the link array
vpcmpeqd k1{k2}, zmm0, zmm27 ; zmm27=-1s (dw) mask of new entries
vpcompressd zmm5{k1}, zmm5 ; first-unique codes
vmovdqu32 [rdi + rdx * 4], zmm5 ; split writes for zen4
kmovd eax, k1
popcnt eax, eax
add rax, rdx ; return value is the number that were expunged, still negative. the caller adds it to the previous n_words value to get the new count
;; no pops needed
ret