Skip to content

Commit 8e80393

Browse files
committed
Minor performance improvements
Change-Id: I6552b7dc082b97c28bc889a378208d0588da755b
1 parent 5d68ae4 commit 8e80393

File tree

3 files changed

+39
-33
lines changed

3 files changed

+39
-33
lines changed

Changes

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
0.2.0 2023-09-05
1+
0.2.1 2023-09-05
22
- Add English tokenizer.
33
- Fix buffer bug.
44
- Improve Readme.
5+
- Minor performance improvements.
56

67
0.1.7 2023-02-28
78
- Add dependabot checks.

datok.go

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ func (auto *Automaton) ToDoubleArray() *DaTokenizer {
113113
var atrans *edge
114114
var s, s1 int
115115
var t, t1 uint32
116+
var diff int
116117

117118
// Create a mapping from s (in Ms aka Intermediate FSA)
118119
// to t (in Mt aka Double Array FSA)
@@ -215,9 +216,10 @@ func (auto *Automaton) ToDoubleArray() *DaTokenizer {
215216
// Store a final transition
216217
dat.array[base+uint32(dat.final)].setCheck(t)
217218

218-
if dat.maxSize < int(base)+dat.final {
219-
dat.maxSize = int(base) + dat.final
220-
}
219+
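// Find max without a branch (only valid while the difference fits in a signed 32-bit value, because the sign mask is built with >> 31)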
220+
// see https://dev.to/jobinrjohnson/branchless-programming-does-it-really-matter-20j4
221+
diff = dat.maxSize - (int(base) + dat.final)
222+
dat.maxSize -= (diff & (diff >> 31))
221223
}
222224
}
223225
}
@@ -461,6 +463,8 @@ func (dat *DaTokenizer) TransCount() int {
461463

462464
dat.transCount = 0
463465
for x := 1; x < len(dat.array); x++ {
466+
467+
// Hopefully branchless
464468
if dat.array[x].getBase() != 0 {
465469
dat.transCount++
466470
}
@@ -512,9 +516,12 @@ func (dat *DaTokenizer) WriteTo(w io.Writer) (n int64, err error) {
512516
max := 0
513517
for sym, num := range dat.sigma {
514518
sigmalist[num] = sym
515-
if num > max {
516-
max = num
517-
}
519+
520+
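// Find max without a branch (only valid while max - num fits in a signed 32-bit value, because the sign mask is built with >> 31)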
521+
max -= ((max - num) & ((max - num) >> 31))
522+
// if num > max {
523+
// max = num
524+
// }
518525
}
519526

520527
sigmalist = sigmalist[:max+1]
@@ -852,9 +859,7 @@ PARSECHAR:
852859
// Better not repeatedly check for a!
853860
// Possibly keep a buffer with a.
854861
if int(char) < 256 {
855-
if int(char) == EOT {
856-
eot = true
857-
}
862+
eot = int(char) == EOT
858863
a = dat.sigmaASCII[int(char)]
859864
} else {
860865
a, ok = dat.sigma[char]
@@ -933,6 +938,7 @@ PARSECHAR:
933938
// token and start blank at the root node of the automaton for the remaining data.
934939
// It may be beneficial to have something like a "drop()" event to capture these cases,
935940
// as they are likely the result of a bad automaton design.
941+
// Hopefully this is branchless code
936942
if buffc-bufft <= 0 {
937943
buffc++
938944
if buffc == 0 {
@@ -953,9 +959,7 @@ PARSECHAR:
953959
log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
954960
}
955961

956-
for x, i := range buffer[buffc:buffi] {
957-
buffer[x] = i
958-
}
962+
copy(buffer[0:], buffer[buffc:buffi])
959963

960964
buffi -= buffc
961965
epsilonState = 0
@@ -986,6 +990,7 @@ PARSECHAR:
986990
buffc++
987991

988992
// Transition does not produce a character
993+
// Hopefully this is branchless
989994
if buffc-bufft == 1 && ta.isNonToken() {
990995
if DEBUG {
991996
log.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
@@ -1028,10 +1033,7 @@ PARSECHAR:
10281033
}
10291034

10301035
// TODO: Better as a ring buffer
1031-
// buffer = buffer[buffc:] !slower
1032-
for x, i := range buffer[buffc:buffi] {
1033-
buffer[x] = i
1034-
}
1036+
copy(buffer[0:], buffer[buffc:buffi])
10351037

10361038
buffi -= buffc
10371039
// epsilonOffset -= buffo

matrix.go

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,13 @@ func (auto *Automaton) ToMatrix() *MatrixTokenizer {
5555
if num > auto.sigmaCount {
5656
panic("sigmaCount is smaller")
5757
}
58-
if num > max {
59-
max = num
60-
}
58+
59+
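// Find max without a branch (only valid while max - num fits in a signed 32-bit value, because the sign mask is built with >> 31)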
60+
// see https://dev.to/jobinrjohnson/branchless-programming-does-it-really-matter-20j4
61+
max -= ((max - num) & ((max - num) >> 31))
62+
// if num > max {
63+
// max = num
64+
// }
6165
}
6266
// Add final entry to the list (maybe not necessary actually)
6367

@@ -137,9 +141,13 @@ func (mat *MatrixTokenizer) WriteTo(w io.Writer) (n int64, err error) {
137141
max := 0
138142
for sym, num := range mat.sigma {
139143
sigmalist[num] = sym
140-
if num > max {
141-
max = num
142-
}
144+
145+
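// Find max without a branch (only valid while max - num fits in a signed 32-bit value, because the sign mask is built with >> 31)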
146+
// see https://dev.to/jobinrjohnson/branchless-programming-does-it-really-matter-20j4
147+
max -= ((max - num) & ((max - num) >> 31))
148+
// if num > max {
149+
// max = num
150+
// }
143151
}
144152

145153
// Add final entry to the list (maybe not necessary actually)
@@ -411,9 +419,7 @@ PARSECHARM:
411419
// Better not repeatedly check for a!
412420
// Possibly keep a buffer with a.
413421
if int(char) < 256 {
414-
if int(char) == EOT {
415-
eot = true
416-
}
422+
eot = int(char) == EOT
417423

418424
// mat.SigmaASCII[] is initialized with mat.identity
419425
a = mat.sigmaASCII[int(char)]
@@ -513,6 +519,7 @@ PARSECHARM:
513519
break
514520
}
515521
}
522+
// Hopefully the compiler generates branchless code for this
516523

517524
if DEBUG {
518525
log.Println("-> Flush buffer: [", string(buffer[bufft:buffc]), "]", showBufferNew(buffer, bufft, buffc, buffi))
@@ -527,9 +534,7 @@ PARSECHARM:
527534
log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
528535
}
529536

530-
for x, i := range buffer[buffc:buffi] {
531-
buffer[x] = i
532-
}
537+
copy(buffer[0:], buffer[buffc:buffi])
533538

534539
buffi -= buffc
535540
epsilonState = 0
@@ -575,6 +580,7 @@ PARSECHARM:
575580
buffc++
576581

577582
// Transition does not produce a character
583+
// Hopefully generated branchless code
578584
if buffc-bufft == 1 && (t&FIRSTBIT) != 0 {
579585
if DEBUG {
580586
log.Println("Nontoken forward", showBufferNew(buffer, bufft, buffc, buffi))
@@ -601,10 +607,7 @@ PARSECHARM:
601607
log.Println("-> Rewind buffer", bufft, buffc, buffi, epsilonOffset)
602608
}
603609

604-
// buffer = buffer[buffc:]
605-
for x, i := range buffer[buffc:buffi] {
606-
buffer[x] = i
607-
}
610+
copy(buffer[0:], buffer[buffc:buffi])
608611

609612
buffi -= buffc
610613
// epsilonOffset -= buffo

0 commit comments

Comments
 (0)