Skip to content

Commit e6d5d66

Browse files
committed
perf: optimize character filtering function
1 parent 437069b commit e6d5d66

File tree

1 file changed

+61
-51
lines changed

1 file changed

+61
-51
lines changed

probe/charset_probe.go

Lines changed: 61 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"bytes"
55

66
"github.com/wlynxg/chardet/consts"
7-
"github.com/wlynxg/chardet/util"
87
)
98

109
type CharSetProbe struct {
@@ -77,47 +76,46 @@ by markers. This function works to filter all words that contain at
7776
least one international character. All contiguous sequences of markers
7877
are replaced by a single ASCII space character.
7978
80-
This filter applies to all scripts which do not use English characters.
79+
This filter applies to all scripts that do not use English characters.
8180
*/
8281
func (p *CharSetProbe) FilterInternationalWords(buf []byte) []byte {
83-
var filtered bytes.Buffer
84-
85-
var word bytes.Buffer
86-
hasInternational := false
82+
var (
83+
filtered = make([]byte, 0, len(buf))
84+
word = make([]byte, 0, 16)
85+
hasInternational bool
86+
)
8787

8888
for i := 0; i < len(buf); i++ {
8989
b := buf[i]
9090

9191
// Check if the byte is an English alphabet
9292
if (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') {
93-
word.WriteByte(b)
93+
word = append(word, b)
9494
} else if b >= 0x80 { // Check if the byte is an international character
95-
word.WriteByte(b)
95+
word = append(word, b)
9696
hasInternational = true
9797
} else { // It's a marker character
98-
if hasInternational && word.Len() > 0 {
99-
filtered.Write(word.Bytes())
98+
if hasInternational {
99+
filtered = append(filtered, word...)
100100
// Append a space if the word's last byte is not an English letter; otherwise append that last byte again
101-
if word.Len() > 0 {
102-
lastChar := word.Bytes()[word.Len()-1]
103-
if (lastChar < 'a' || lastChar > 'z') && (lastChar < 'A' || lastChar > 'Z') {
104-
filtered.WriteByte(' ')
105-
} else {
106-
filtered.WriteByte(lastChar)
107-
}
101+
if lastChar := word[len(word)-1]; (lastChar < 'a' || lastChar > 'z') &&
102+
(lastChar < 'A' || lastChar > 'Z') {
103+
filtered = append(filtered, ' ')
104+
} else {
105+
filtered = append(filtered, lastChar)
108106
}
109107
}
110-
word.Reset() // Reset the word buffer
108+
word = word[:0] // Reset the word buffer
111109
hasInternational = false // Reset international flag
112110
}
113111
}
114112

115113
// Handle the last word if it was not followed by a marker
116-
if hasInternational && word.Len() > 0 {
117-
filtered.Write(word.Bytes())
114+
if hasInternational && len(word) > 0 {
115+
filtered = append(filtered, word...)
118116
}
119117

120-
return filtered.Bytes()
118+
return filtered
121119
}
122120

123121
/*
@@ -127,16 +125,15 @@ alphabet and high byte characters that are not between <> characters.
127125
Also retains English alphabet and high byte characters immediately
128126
before occurrences of >.
129127
130-
This filter can be applied to all scripts which contain both English
128+
This filter can be applied to all scripts that contain both English
131129
characters and extended ASCII characters, but is currently only used by
132130
"Latin1Probe".
133131
*/
134132
func (p *CharSetProbe) FilterWithEnglishLetters(buf []byte) []byte {
135-
var (
136-
filtered bytes.Buffer
137-
inTag bool
138-
prev int
139-
)
133+
// Pre-allocate a buffer based on an estimate of the filtered content size
134+
filtered := make([]byte, 0, len(buf))
135+
inTag := false
136+
prev := 0
140137

141138
for curr := 0; curr < len(buf); curr++ {
142139
bufChar := buf[curr]
@@ -147,56 +144,69 @@ func (p *CharSetProbe) FilterWithEnglishLetters(buf []byte) []byte {
147144
inTag = true
148145
}
149146

150-
// If the current character is not extended-ASCII and not alphabetic...
151-
if bufChar < 0x80 && !util.IsAlpha(bufChar) {
152-
// ...and we're not in a tag
147+
// Inline the check for alphabetic characters
148+
if bufChar < 0x80 && !((bufChar >= 'a' && bufChar <= 'z') || (bufChar >= 'A' && bufChar <= 'Z')) {
149+
// If we're not in a tag, and we've found some text to keep
153150
if curr > prev && !inTag {
154-
// Keep everything after last non-extended-ASCII, non-alphabetic character
155-
filtered.Write(buf[prev:curr])
156-
// Output a space to delimit stretch we kept
157-
filtered.WriteByte(' ')
151+
// Append the slice from prev to curr
152+
filtered = append(filtered, buf[prev:curr]...)
153+
filtered = append(filtered, ' ')
158154
}
159155
prev = curr + 1
160156
}
161157
}
162158

163-
// If we're not in a tag...
164-
if !inTag {
165-
// Keep everything after last non-extended-ASCII, non-alphabetic character
166-
filtered.Write(buf[prev:])
159+
// If we're not in a tag, and we've got some remaining text to keep
160+
if !inTag && prev < len(buf) {
161+
filtered = append(filtered, buf[prev:]...)
162+
}
163+
164+
// If nothing was kept (the filtered output is empty), fall back to returning the original buffer
165+
if len(filtered) == 0 {
166+
return buf
167167
}
168168

169-
return filtered.Bytes()
169+
return filtered
170170
}
171171

172172
// RemoveXMLTags removes XML tags from the input buffer and retains only the sequences
173173
// of English alphabet and high byte characters that are not between <> characters.
174174
func (p *CharSetProbe) RemoveXMLTags(buf []byte) []byte {
175-
var filtered bytes.Buffer
175+
// Pre-allocate a buffer based on an estimate of the filtered content size
176+
filtered := make([]byte, 0, len(buf))
176177
inTag := false
177178
prev := 0
178179

179180
for curr := 0; curr < len(buf); curr++ {
180181
bufChar := buf[curr]
181182

182-
if bufChar == '>' { // End of a tag
183-
prev = curr + 1
183+
// Check for end of tag
184+
if bufChar == '>' {
184185
inTag = false
185-
} else if bufChar == '<' { // Start of a tag
186+
prev = curr + 1
187+
continue
188+
}
189+
190+
// Check for start of tag
191+
if bufChar == '<' {
186192
if curr > prev && !inTag {
187-
// Keep everything after last non-extended-ASCII, non-alphabetic character
188-
filtered.Write(buf[prev:curr])
189-
// Output a space to delimit stretch we kept
190-
filtered.WriteByte(' ')
193+
// Append the slice from prev to curr
194+
filtered = append(filtered, buf[prev:curr]...)
195+
filtered = append(filtered, ' ')
191196
}
192197
inTag = true
193198
}
194199
}
195200

196-
// If we're not in a tag, keep everything after last non-extended-ASCII, non-alphabetic character
197-
if !inTag {
198-
filtered.Write(buf[prev:])
201+
// If we're not in a tag, and we've got some remaining text to keep
202+
if !inTag && prev < len(buf) {
203+
filtered = append(filtered, buf[prev:]...)
204+
}
205+
206+
// If nothing was kept (the filtered output is empty), fall back to returning the original buffer
207+
if len(filtered) == 0 {
208+
return buf
199209
}
200210

201-
return filtered.Bytes()
211+
return filtered
202212
}

0 commit comments

Comments
 (0)