;; linearSearch splitHistory.nasm
;; ***************** the Search: 345µs expected for a search space of 223000; actual search space is 54000, and actual time is 147.4µs
;; (this is both quicker and slower than expected: quicker because the search space is much smaller,
;; slower because, scaled to the expected search space, it is equivalent to 665µs; put another way, the expectation should be 83.5µs.
;; this is in line with other results where random access to a dataset larger than L1 is about 2x slower)
;; (the difference in search-space size is due to my having taken a trace of an old scalar version of the algorithm at some point,
;; one that was fundamentally broken in some way {the reason for the trace}; some time later I used the trace to tell me
;; the expected number of iterations and the proportions/distribution of different search lengths, without considering
;; that the trace was of a broken version that had given garbage results)
global search
section .data align=64
pileindex:
dd 218940,68940,18940,8940,4940,2940,1940,1440
dd 1240,1140,1090,1050,1030,1010,1000,0 ; top of pile0 258940
dq 0,0,0,0,0,0,0,0
pileindexcopy:
dd 218940,68940,18940,8940,4940,2940,1940,1440
dd 1240,1140,1090,1050,1030,1010,1000,0 ; top of pile0 258940
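;; each dword above is the base qword index (within the pile space at r15)
;; of one of 16 piles: pile n collects work whose next search group needs n
;; 16-candidate test iterations (pile 0 is never processed; winners collect
;; in pile 15). piles grow upward from bases spaced to fit the expected
;; distribution of search lengths.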
;64-byte loads
;z16: dd 4,5,6,7,0,1,2,3, 12,13,14,15,8,9,10,11
;z18: dd 16,24,17,25,18,26,19,27, 20,28,21,29,22,30,23,31
z19: ;dd 0,8,1,9,2,10,3,11, 4,12,5,13,6,14,7,15
dd 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
z24: db 0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46
db 16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62
db 1,33,3,35,5,37,7,39,9,41,11,43,13,45,15,47
db 17,49,19,51,21,53,23,55,25,57,27,59,29,61,31,63
z25: dq 4,4,-4,-4,4,4,-4,-4
z26: dq 2,-2,2,-2,2,-2,2,-2
z28: dq 0xf0f0f0f0f0f0f0f0,0xf0f0f0f0f0f0f0f0,0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
dq 0xf0f0f0f0f0f0f0f0,0xf0f0f0f0f0f0f0f0,0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
z31: dd 1,3,7,0xf,0x1f,0x3f,0x7f,0xff
dd 0x1ff,0x3ff,0x7ff,0xfff,0x1fff,0x3fff,0x7fff, 0xffff
z15: dd 0x1ffff,0x3ffff,0x7ffff,0xfffff,0x1fffff,0x3fffff,0x7fffff,0xffffff
dd 0x1ffffff,0x3ffffff,0x7ffffff,0xfffffff,0x1fffffff,0x3fffffff,0x7fffffff,0xffffffff
;; rightward relevance masks include the self bit in order to pre-increment as the
;; scalar code does
;16-byte broadcasts
z17: dq 0x0504070601000302,0x0d0c0f0e09080b0a
z29: dq 0xcccccccccccccccc,0x3333333333333333
;qword broadcasts
z27: dq 0x19110901372f271f
z30: dq 0x55555555aaaaaaaa
xaaaaa: dq 0xaaaaaaaaaaaaaaaa
xccccc: dq 0xcccccccccccccccc
xfffff: dq 0xffffffffffffffff
;dword broadcasts
z11: dd 1
z13: dd 0x80000000
z14: dd 32
z18: dd 16
z20: dd 0xfc0
z21: dd 0xffe00000
maskffff: dd 0x0000ffff
xfffe: dd 0xfffe
;word broadcasts (keep 2 for broadcastd)
z12: dw 14,14
z22: dw 15,15
z23: dw 1,1
section .text
;; rdi = ptr to sorted codes, rsi = ptr to index-length array,
;; rdx = ptr to mask space, rcx = pointer to index history space (HUGE malloc ... 4Gb?)
;; search array entry: 63:32=parent index 31:6=mask 5=skipped 2:0 = layer?
;; splitting history means 63:32 and 31:0 are now separate dword arrays
; TODO: ensure all writepointers are now indexes
align 16
search:
push rbp
push rbx
push r12
push r13
push r14
push r15
mov rbp, rcx ; history ptr now in rbp
mov word [rsi+(32*63+27)*4], 16 ; puts winners in pile -1 (i.e. 15, beyond the saturation at 14)
movsx rcx, word [rsi] ; negative length of first set (position is assumed 0)
movzx r8d, word [rsi + 2] ; working write index, but it is also positive length of first group
lea r9, [rdi + r8 * 4] ; read endpoint
lea r10, [rdx + r8 * 4] ; masks write endpoint
lea r11, [rbp + r8 * 4] ; history write endpoint
vpxord zmm0, zmm0, zmm0 ; to erase the history
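;; rcx counts up from -length to 0, so [endpoint + rcx*4] walks the set left
;; to right; the up-to-15 elements written past the endpoint are junk but
;; harmless, since r8d holds the true count. history starts as zero: these
;; entries are roots for the caller's traceback.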
.initl1:
vmovdqu32 zmm1, [r9 + rcx * 4]
vmovdqu32 [r10 + rcx * 4], zmm1
vmovdqu32 [r11 + rcx * 4], zmm0
add rcx, 16
js .initl1
; we don't have to care about the junk at the end, r8d has right number
; 2c/l, 600/16l, 17ns
;;; now we begin to search, using rdx + r9 * 8 as readpointer
kmovq k7, [xaaaaa]
knotq k6, k7 ; 0x5555555555555555
kmovq k5, [xccccc]
kmovq k0, [xfffff]
lea r10, [pileindex] ; needed for PIE
vmovdqa32 zmm10, [pileindexcopy]
vmovdqa32 [r10], zmm10 ; reset piles between speedtest runs
lea r15, [rsp - (258940 * 8)] ; space for piles
vmovdqa32 ymm10, [r10 + 32] ; load initial pile indexes
kmovw k1, k0
vpscatterdq [r15 + ymm10 * 8]{k1}, zmm0 ; write zeros to pile bottoms
vmovdqa32 zmm10, [r10] ; load initial pile indexes
kmovw k1, k0
vpscatterdq [r15 + ymm10 * 8]{k1}, zmm0 ; write zeros to pile bottoms
; loading register constants
vpbroadcastd zmm11, [z11] ; 1s(dw)
vpbroadcastd zmm12, [z22] ; 15s(w)
vpbroadcastd zmm13, [z13] ; 0x80000000s(dw)
vpbroadcastd zmm14, [z14] ; 32s(dw)
vmovdqa32 zmm15, [z15]
vbroadcasti32x4 zmm17, [z17]
vpbroadcastd zmm18, [z18] ; dw 16s
vmovdqa32 zmm19, [z19] ; dw ID
vpbroadcastd zmm20, [z20]
vpbroadcastd zmm21, [z21]
vpbroadcastd zmm22, [z22]
vpbroadcastd zmm23, [z23]
vmovdqa32 zmm24, [z24]
vmovdqa32 zmm25, [z25]
vmovdqa32 zmm26, [z26]
vpbroadcastq zmm27, [z27]
vmovdqa32 zmm28, [z28]
vbroadcasti32x4 zmm29, [z29]
vpbroadcastq zmm30, [z30]
vmovdqa32 zmm31, [z31]
xor r9d, r9d ; working read index
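;; main search ladder: each sortwork pass bins the outstanding work into
;; piles by remaining search depth (~5 bits of progress per pass), and each
;; processpiles pass runs every pile's linear searches, appending survivors
;; to the mask/history arrays as work for the next pass.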
call .sortwork ;5bits
call .processpiles ;>10bits
call .sortwork ;10bits
call .processpiles ;>15bits
call .sortwork ;15bits
call .processpiles ;>20bits
call .sortwork ;20bits
call .processpiles ;>25bits (all noskip winners found)
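;; phase 2: append the second seed group (with zeroed history) below, then
;; rerun the ladder with the skip bit added, so a search may carry one
;; skipped letter.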
movsx rcx, word [rsi + 4] ; negative length of second set
movzx r9, word [rsi + 6] ; end index
lea r9, [rdi + r9 * 4] ; read endpointer
sub r8d, ecx ; maintain write index
lea rax, [rdx + r8 * 4] ; masks write endpoint
lea rbx, [rbp + r8 * 4] ; history write endpoint
vpxord zmm0, zmm0, zmm0 ; to erase the history
.initl2:
vmovdqu32 zmm1, [r9 + rcx * 4]
vmovdqu32 [rax + rcx * 4], zmm1
vmovdqu32 [rbx + rcx * 4], zmm0
add rcx, 16
js .initl2
; we don't have to care about the junk at the end, r8d has right number
; 2c/l, 600/16l, 17ns
xor r9d, r9d ; start again reading from zero
vpbroadcastd zmm12, [z12] ; 14s(w) for saturate. pile 15 for winners from now on
call .sortworkaddskip ;>6-26bits we only need to add skips once since we
;only add skips to unskipped searches and all
;unskipped searches are already in the history
call .processpiles ;>11-26bits
call .sortwork ;11-26bits
call .processpiles ;>16-26bits
call .sortwork ;16-26bits
call .processpiles ;>21-26bits
call .sortwork ;21-26bits
call .processpiles ;>26bits (all skip winners found)
call .sortwork ;26bits - to get the last skip winners all in pile15
;**************************** search is done ************************
;; all winners end up in pile 15, we can just copy them to the top of the working space
mov r13d, [r10 + 60] ; starting from the top of pile 15
mov dword [rdx + r8 * 4], 0 ; terminate the main masks pile with a null
mov dword [rbp + r8 * 4], 0 ; terminate the main history pile with a null
.winnerloop:
mov eax, [r15 + r13 * 8] ; transferring the winner "masks" is a waste of time, they're all the same and never read
mov ebx, [r15 + r13 * 8 + 4] ; what's interesting is not the result but how we got there...
test eax, eax
jz .donewinners
inc r8d
dec r13d
mov [rdx + r8 * 4], eax
mov [rbp + r8 * 4], ebx
jmp .winnerloop
.donewinners:
mov eax, r8d ; return value is end of winners list
pop r15
pop r14
pop r13
pop r12
pop rbx
pop rbp
ret ;; returns index to top of solutions found
;; (written to end of working space with a zero separating them from the rest)
;; caller uses the indexes to trace back the solution and uses xor/andn to recover the codes
.sortworkaddskip:
mov r13d, 76543 ; magic number flagging this sortwork pass to add the skip bit
.sortwork:
vmovdqa32 zmm10, [r10] ; (re)load pile indexes
kmovw k4, [xfffe] ; re-load mask for non-update of zero pile
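;; sortwork: take 32 work items per iteration; for each, find the next
;; letter to search (the highest clear mask bit), look up that group's
;; length, convert it to an iteration count (= pile number), then scatter
;; the {history,mask} qword into its pile.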
.vsearchreadloop:
vmovdqu32 zmm0, [rdx + r9 * 4] ; get 16 work items for sorting to piles (0-15)
vmovdqu32 zmm4, [rdx + r9 * 4 + 64] ; get 16 more (16-31)
vpternlogd zmm3, zmm0, zmm0, 0x55 ; a=!c
vplzcntd zmm3, zmm3 ; count leading 1s to find next search letter
vpternlogd zmm7, zmm4, zmm4, 0x55 ; a=!c
vplzcntd zmm7, zmm7 ; count leading 1s to find next search letter
cmp r13d, 76543 ; r13d only becomes this number if set exactly
jnz .noaddskip ; to prevent bypass of the skip adding
vpsrlvd zmm3, zmm13, zmm3 ; z13 is top bit only (0x80000000s (dw))
vpternlogd zmm2, zmm3, zmm14, 0xfe ; orABC z14 is just the skip bit (32s(dw))
vpternlogd zmm3, zmm2, zmm2, 0x55 ; a=!c
vplzcntd zmm3, zmm3 ; count leading 1s to find next search letter
vpsrlvd zmm7, zmm13, zmm7 ; z13 is top bit only (0x80000000s (dw))
vpternlogd zmm4, zmm7, zmm14, 0xfe ; orABC z14 is just the skip bit (32s(dw))
vpternlogd zmm7, zmm4, zmm4, 0x55 ; a=!c
vplzcntd zmm7, zmm7 ; count leading 1s to find next search letter
.noaddskip:
vpslld zmm3, zmm3, 1 ; line up 5-bit lzcnt with bottom of pop6 bits
vpternlogd zmm3, zmm0, zmm20, 0xd8 ; c?b:a z20 has 0x00000fc0 to pull pop6 out
kmovw k1, k0
vpgatherdd zmm8{k1}, [rsi + zmm3 * 2] ; ** get group lengths for next step
vpslld zmm7, zmm7, 1 ; line up with pop6 bits
vpternlogd zmm7, zmm4, zmm20, 0xd8 ; c?b:a z20 has 0x00000fc0 to pull pop6 out
kmovw k1, k0
vpgatherdd zmm9{k1}, [rsi + zmm7 * 2] ; ** get group lengths for next step
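;; each group record at rsi is one dword: low word = negative group length,
;; high word = group end index. zmm3/zmm7 hold 2*(11-bit group number), so
;; the *2 gather scale gives the same group*4 byte offset the scalar
;; [rsi + rax*4] addressing uses.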
vpslld zmm1, zmm3, 20 ; bits [11:1] to top of dword [31:21]
vpbroadcastd zmm5, r9d ; new history tags from current read index
vpaddd zmm5, zmm5, zmm19 ; z19 has 0,1,2,3,4,5,6...(dw)(l2r)
vpternlogd zmm1, zmm5, zmm21, 0xe4 ; c?a:b z21 has 0xffe00000 to pull history in
vmovdqa32 zmm2, zmm0
vpshufd zmm0{k7}, zmm1, 0xb1 ; new history tags and search group merged with ;k7=aaaaaaaa
vpshufd zmm1{k6}, zmm2, 0xb1 ; current mask ready for scatter to piles ;k6=55555555
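;; the even/odd interleave turns each work item into one qword: low dword =
;; its merged mask, high dword = next group number in [31:21] over a 21-bit
;; history tag in [20:0] (the item's index in the history array, used for
;; traceback).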
vmovdqu32 [rsp - 72], zmm0 ; for DEBUG
vmovdqu32 [rsp - 136], zmm1 ; for DEBUG (distinct slot so the zmm0 dump isn't overwritten)
vpslld zmm7, zmm7, 20 ; to top of dword
vpaddd zmm5, zmm5, zmm18 ; z18 has 16s(dw)(l2r)
vpternlogd zmm5, zmm7, zmm21, 0xd8 ; c?b:a z21 has 0xffe00000 to pull group in
vmovdqa32 zmm6, zmm4
vpshufd zmm4{k7}, zmm5, 0xb1 ; new history tags and search group merged with
vpshufd zmm5{k6}, zmm6, 0xb1 ; current mask ready for scatter to piles
vpshufb zmm8{k5}, zmm9, zmm17 ; z17 has 0x0d0c0f0e09080b0a0504070601000302 x4
; k5 = 0xcccccccccccccccc = swap words and merge
; this merges the two to appear as alternating
; bits in the rotation, but this can be fixed in
; the initial permb of the bmt, or with the mrr bit order
vpsubw zmm8, zmm22, zmm8 ; z22 has 15s (w) << adjusted for negative len
vpsraw zmm8, zmm8, 4 ; floor((len+15)/16) = search cycle count
vpminsw zmm8, zmm8, zmm12 ; saturate at 15/14: = pile number
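; e.g. a gathered length of -40: 15-(-40) = 55, 55>>4 = 3 -> pile 3
; (three 16-candidate test iterations to run)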
; vpshufb zmm9, zmm8, zmm17 ; z17 has 0x0d0c0f0e09080b0a0504070601000302 x4
vprold zmm9, zmm8, 16 ; alternate method of word swap, not on p5 and not word filtering
;***it is possible to get this sorting to pull out the winners into a separate pile
; eg: winners will have lzcnt>21(<4 letters left to fill) and <27, so if those (unused)
; search groups all have negative lengths (-16 is good), and minsw used to saturate to 14,
; they'll end up on their own in pile 15... but then we'll need to use a different shift
; that filters the bits (which is fine).
; and even if not found when doing the unskipped search due to one missing bit higher up,
; it will be "found" by the skipped version later on once it's filled in the missing bit.
; ... but will we still find the word? yes, because the skipped bit is the only thing added
; and the pile has the original as its history. possible there will be double-reports due
; to this. if negative lengths are only in groups for lzcnt >26 (skip-len gap) ie 27-28
; then all will be reported only on the skip pass, and will be in zero pile on unskip pass.
; Indeed, the layer count will be obsolete.
; only one positive length is needed: lzcnt=27, g15.3
;***
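;; the sequence below is a bit-matrix transpose: word w of zmm6 starts as
;; the one-hot value 1<<pile(w); after the permute/rotate/ternlog/multishift
;; network, dword lane d holds bit w set iff work item w targets pile d.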
vmovdqa32 zmm6, zmm23 ; z23=1s(w)
vpshldvw zmm6, zmm23, zmm8 ; filter needed because -1 must be wrapped to 15
; vpbmtwd zmm6, zmm6 ; instruction not found... so...
vpermb zmm6, zmm24, zmm6 ; 0,32,2,34..30,62;1,33,3,35,5..31,63(b)(l2r)
vpermq zmm7, zmm6, 0x4e ; swap lanes
vprolvq zmm7, zmm7, zmm25 ; 4,4,-4,-4,4,4,-4,-4(qw)(l2r) alternate rotate
vpternlogd zmm6, zmm7, zmm28, 0xd8 ; c?b:a 0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f,f(r2l)
vpalignr zmm7, zmm6, zmm6, 8 ; swap qwords
vprolvq zmm7, zmm7, zmm26 ; 2,-2,2,-2,2,-2,2,-2(qw)(l2r)
vpternlogd zmm6, zmm7, zmm29, 0xd8 ; c?b:a 3333333333333333cccccccccccccccc x4
vpmultishiftqb zmm7, zmm27, zmm6 ; 0x19110901372f271f
vpternlogd zmm6, zmm7, zmm30, 0xd8 ; c?b:a 55555555aaaaaaaa55555555aaaaaaaa x4
; result is all 32 word bit0's in dword 0
; in order
add r9d, 32
cmp r9d, r8d ; is ok to always run, should never have no work
jnl .vsearchdone ; and even if it did it would do no harm
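;; conflict-free scatter: vpermd by pile number hands each lane the
;; occupancy bits of its own pile; masking to the bits at or before the lane
;; (self bit included, to pre-increment as the scalar code does) and
;; popcounting gives each lane's slot within its pile, while the unmasked
;; per-pile popcount is the total increment for that pile's pointer.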
vpermd zmm2, zmm8, zmm6 ; 1st half conflicts->scatter columns
vpermd zmm3, zmm9, zmm6 ; 2nd half conflicts->scatter columns
vpopcntd zmm6, zmm6 ; source increments
vpandd zmm2, zmm2, zmm31 ; 1st half filter for rightward relevance
; 1,3,7,f,1f...ffff(l2r)
vpandd zmm3, zmm3, zmm15 ; 2nd half filter for rightward relevance
; 1ffff,3ffff,7ffff...ffffffff(l2r)
vpopcntd zmm2, zmm2 ; 1st half scatter preincrements
vpopcntd zmm3, zmm3 ; 2nd half scatter preincrements
vpermd zmm8, zmm8, zmm10 ; select 16 dword indexes from z10
vpermd zmm9, zmm9, zmm10 ; select 16 dword indexes from z10
vpaddd zmm10{k4}, zmm10, zmm6 ; increment used indexes, except zero pile <<possible bug? dirty end data might cause unintended increments
vpaddd zmm8, zmm8, zmm2 ; avoid conflicts << also here
vpaddd zmm9, zmm9, zmm3 ; avoid conflicts
vmovdqa32 zmm6{k6z}, zmm8 ; just 8 (0,2,4,6,8,10,12,14)
kmovw k1, k0
vpscatterqq [r15 + zmm6 * 8]{k1}, zmm0 ; at a time
vpshufd zmm6{k6z}, zmm8, 0xb1 ; but each (1,3,5,7,9,11,13,15)
kmovw k1, k0
vpscatterqq [r15 + zmm6 * 8]{k1}, zmm1 ; in right place
vmovdqa32 zmm6{k6z}, zmm9 ; just 8 (16,18,20,22,24,26,28,30)
kmovw k1, k0
vpscatterqq [r15 + zmm6 * 8]{k1}, zmm4 ; at a time
vpshufd zmm6{k6z}, zmm9, 0xb1 ; but each (17,19,21,23,25,27,29,31)
kmovw k1, k0
vpscatterqq [r15 + zmm6 * 8]{k1}, zmm5 ; in right place
jmp .vsearchreadloop ; recount: scatterqq(z,z) has 2p0,1p0156,8p49,8p78.gatherdd has 1p0,1p5,16p23
; loop: 28p0,20p5,20p05 68 on p05, 96 memops, 8 others (p15,p06,p0156 ops) 172µop total
; (write min 32c)(vec alu min 34c)(aloc. min 34.4c) so 7082l in 55.4µs
.vsearchdone:
sub r9d, r8d ; how many extras were read
mov eax, 0xffffffff
shrx eax, eax, r9d ; full-width valid element mask
vpbroadcastd zmm16, eax ; to filter increments by
mov ebx, 0x55555555 ; evens
pext ebx, eax, ebx
kmovb k1, ebx
shr ebx, 8
kmovb k3, ebx
mov ebx, 0xaaaaaaaa ; odds
pext ebx, eax, ebx
kmovb k2, ebx
shr ebx, 8
; kmovb k4, ebx ; moved below final use of k4=0xfffe
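;; pext splits the 32-bit lane-valid mask into its even bits (items
;; 0,2,..,30) and odd bits (items 1,3,..,31), yielding the four 8-bit
;; k-masks the tail scatters need.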
vpandd zmm6, zmm6, zmm16 ; remove bits from overread columns
vpermd zmm2, zmm8, zmm6 ; 1st half conflicts->scatter columns
vpermd zmm3, zmm9, zmm6 ; 2nd half conflicts->scatter columns
vpopcntd zmm6, zmm6 ; source increments
vpandd zmm2, zmm2, zmm31 ; 1st half filter for rightward relevance
; 1,3,7,f,1f...ffff(l2r)
vpandd zmm3, zmm3, zmm15 ; 2nd half filter for rightward relevance
; 1ffff,3ffff,7ffff...ffffffff(l2r)
vpopcntd zmm2, zmm2 ; 1st half scatter preincrements
vpopcntd zmm3, zmm3 ; 2nd half scatter preincrements
vpermd zmm8, zmm8, zmm10 ; select 16 dword indexes from z10
vpermd zmm9, zmm9, zmm10 ; select 16 dword indexes from z10
vpaddd zmm10{k4}, zmm10, zmm6 ; increment used indexes, except zero pile
vpaddd zmm8, zmm8, zmm2 ; avoid conflicts
vpaddd zmm9, zmm9, zmm3 ; avoid conflicts
kmovb k4, ebx
vmovdqa32 zmm6{k6z}, zmm8 ; just 8
vpscatterqq [r15 + zmm6 * 8]{k1}, zmm0 ; at a time
vpshufd zmm6{k6z}, zmm8, 0xb1 ; but each
vpscatterqq [r15 + zmm6 * 8]{k2}, zmm1 ; in right place
vmovdqa32 zmm6{k6z}, zmm9 ; just 8
vpscatterqq [r15 + zmm6 * 8]{k3}, zmm4 ; at a time
vpshufd zmm6{k6z}, zmm9, 0xb1 ; but each
vpscatterqq [r15 + zmm6 * 8]{k4}, zmm5 ; in right place
mov r9d, r8d ; r9 (readpoint) went too far, set it right
vmovdqa32 [r10], zmm10 ; store back pile indexes
xor r13d, r13d ; clear the addskip magic number so we don't do it again next time
ret ; return from call to sortwork/sortworkaddskip
.pilesfinished:
ret ; return from call to processpiles
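;; processpiles: walk the piles from the deepest down to pile 1 (pile 0 is
;; never processed), scanning each pile top-down until the zero sentinel at
;; its bottom; each entry's mask is broadcast, tested against its whole
;; candidate group, and the survivors are compressed and appended to the
;; mask/history arrays.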
.processpiles:
mov r14d, 0xffff ; for maskmaking
vmovd r12d, xmm12 ; start with last pile as used by last sortwork
and r12d, 15 ; zmm has top-pile as words for word saturation. get rid of the second word.
jmp .pileloopstart ; (z12 still holds the saturation limit on write pile)
.pileloop:
mov [r10 + r12 * 4], r11d ; reset pile index to starting place after finishing
sub r12d, 1 ; working down through the piles
jz .pilesfinished ; don't process pile zero
.pileloopstart:
mov r11d, [r10 + r12 * 4] ; get a pile index
.pileouterloop:
mov rax, [r15 + r11 * 8] ; get the work from the pile
test rax, rax ; null = pile finished
jz .pileloop ; so get the next pile
shr rax, 53 ; 11-bit search group
movsx rcx, word [rsi + rax * 4] ; negative length of search
movzx ebx, word [rsi + rax * 4 + 2] ; end index of search
lea rbx, [rdi + rbx * 4] ; index -> read endpointer
vpbroadcastd zmm1, [r15 + r11 * 8] ; mask to test
vpbroadcastd zmm3, [r15 + r11 * 8 + 4] ; history for merge
; outer loop: 3c/l (1p5, 15µop, 5 reads) 192621 in 131µs
; pre-neg,enddex: 13µop, 5 read, 2.6c/l: 192621 in 113µs
.writeinnerloop:
vmovdqu32 zmm0, [rbx + rcx * 4] ; get candidates
vptestnmd k1, zmm0, zmm1 ; test candidates (0==success)
kmovw eax, k1 ; retrieve for count
vpord zmm0, zmm0, zmm1 ; merge candidates
add rcx, 16 ; increment readpointer
jnl .plastfew ; non-negative == read past the end. the 1-iteration pile always exits here
popcnt eax, eax ; count successes
vpcompressd zmm0{k1z}, zmm0 ; discard failures
vmovdqu32 [rbp + r8 * 4], zmm3 ; write out history
vmovdqu32 [rdx + r8 * 4], zmm0 ; write out masks
add r8d, eax ; increment writepointer by number written
jmp .writeinnerloop ; 14ops, 3p5, 3c/l 281000l in 192µs
; can eat 1op/il from outerloop: -12.8µs
; can eat 1op/ol from outerloop: -8.7µs
; 113+192+55-21.5=339µs
.plastfew:
shrx r13d, r14d, ecx ; rcx = lanes read past the end: keep 0xffff>>rcx
and eax, r13d ; apply to test result mask
kmovw k1, eax ; return modified mask
;****
popcnt eax, eax ; count successes
vpcompressd zmm0{k1z}, zmm0 ; discard failures
vmovdqu32 [rbp + r8 * 4], zmm3 ; write out history
vmovdqu32 [rdx + r8 * 4], zmm0 ; write out masks
add r8d, eax ; increment writepointer by number written
;**** between stars should be counted as innerloop final iteration
sub r11d, 1 ; scan down through the pile
jmp .pileouterloop ; break test is at start of loop so always go back