4
4
"bytes"
5
5
6
6
"github.com/wlynxg/chardet/consts"
7
- "github.com/wlynxg/chardet/util"
8
7
)
9
8
10
9
type CharSetProbe struct {
@@ -77,47 +76,46 @@ by markers. This function works to filter all words that contain at
77
76
least one international character. All contiguous sequences of markers
78
77
are replaced by a single space ascii character.
79
78
80
- This filter applies to all scripts which do not use English characters.
79
+ This filter applies to all scripts that do not use English characters.
81
80
*/
82
81
func (p * CharSetProbe ) FilterInternationalWords (buf []byte ) []byte {
83
- var filtered bytes.Buffer
84
-
85
- var word bytes.Buffer
86
- hasInternational := false
82
+ var (
83
+ filtered = make ([]byte , 0 , len (buf ))
84
+ word = make ([]byte , 0 , 16 )
85
+ hasInternational bool
86
+ )
87
87
88
88
for i := 0 ; i < len (buf ); i ++ {
89
89
b := buf [i ]
90
90
91
91
// Check if the byte is an English alphabet
92
92
if (b >= 'a' && b <= 'z' ) || (b >= 'A' && b <= 'Z' ) {
93
- word . WriteByte ( b )
93
+ word = append ( word , b )
94
94
} else if b >= 0x80 { // Check if the byte is an international character
95
- word . WriteByte ( b )
95
+ word = append ( word , b )
96
96
hasInternational = true
97
97
} else { // It's a marker character
98
- if hasInternational && word . Len () > 0 {
99
- filtered . Write ( word .Bytes () )
98
+ if hasInternational {
99
+ filtered = append ( filtered , word ... )
100
100
// Replace the last character with a space if it's a marker
101
- if word .Len () > 0 {
102
- lastChar := word .Bytes ()[word .Len ()- 1 ]
103
- if (lastChar < 'a' || lastChar > 'z' ) && (lastChar < 'A' || lastChar > 'Z' ) {
104
- filtered .WriteByte (' ' )
105
- } else {
106
- filtered .WriteByte (lastChar )
107
- }
101
+ if lastChar := word [len (word )- 1 ]; (lastChar < 'a' || lastChar > 'z' ) &&
102
+ (lastChar < 'A' || lastChar > 'Z' ) {
103
+ filtered = append (filtered , ' ' )
104
+ } else {
105
+ filtered = append (filtered , lastChar )
108
106
}
109
107
}
110
- word . Reset () // Reset the word buffer
108
+ word = word [: 0 ] // Reset the word buffer
111
109
hasInternational = false // Reset international flag
112
110
}
113
111
}
114
112
115
113
// Handle the last word if it was not followed by a marker
116
- if hasInternational && word . Len ( ) > 0 {
117
- filtered . Write ( word .Bytes () )
114
+ if hasInternational && len ( word ) > 0 {
115
+ filtered = append ( filtered , word ... )
118
116
}
119
117
120
- return filtered . Bytes ()
118
+ return filtered
121
119
}
122
120
123
121
/*
@@ -127,16 +125,15 @@ alphabet and high byte characters that are not between <> characters.
127
125
Also retains English alphabet and high byte characters immediately
128
126
before occurrences of >.
129
127
130
- This filter can be applied to all scripts which contain both English
128
+ This filter can be applied to all scripts that contain both English
131
129
characters and extended ASCII characters, but is currently only used by
132
130
"Latin1Probe".
133
131
*/
134
132
func (p * CharSetProbe ) FilterWithEnglishLetters (buf []byte ) []byte {
135
- var (
136
- filtered bytes.Buffer
137
- inTag bool
138
- prev int
139
- )
133
+ // Pre-allocate a buffer based on an estimate of the filtered content size
134
+ filtered := make ([]byte , 0 , len (buf ))
135
+ inTag := false
136
+ prev := 0
140
137
141
138
for curr := 0 ; curr < len (buf ); curr ++ {
142
139
bufChar := buf [curr ]
@@ -147,56 +144,69 @@ func (p *CharSetProbe) FilterWithEnglishLetters(buf []byte) []byte {
147
144
inTag = true
148
145
}
149
146
150
- // If the current character is not extended-ASCII and not alphabetic...
151
- if bufChar < 0x80 && ! util . IsAlpha ( bufChar ) {
152
- // ...and we're not in a tag
147
+ // Inline the check for alphabetic characters
148
+ if bufChar < 0x80 && ! (( bufChar >= 'a' && bufChar <= 'z' ) || ( bufChar >= 'A' && bufChar <= 'Z' ) ) {
149
+ // If we're not in a tag, and we've found some text to keep
153
150
if curr > prev && ! inTag {
154
- // Keep everything after last non-extended-ASCII, non-alphabetic character
155
- filtered .Write (buf [prev :curr ])
156
- // Output a space to delimit stretch we kept
157
- filtered .WriteByte (' ' )
151
+ // Append the slice from prev to curr
152
+ filtered = append (filtered , buf [prev :curr ]... )
153
+ filtered = append (filtered , ' ' )
158
154
}
159
155
prev = curr + 1
160
156
}
161
157
}
162
158
163
- // If we're not in a tag...
164
- if ! inTag {
165
- // Keep everything after last non-extended-ASCII, non-alphabetic character
166
- filtered .Write (buf [prev :])
159
+ // If we're not in a tag, and we've got some remaining text to keep
160
+ if ! inTag && prev < len (buf ) {
161
+ filtered = append (filtered , buf [prev :]... )
162
+ }
163
+
164
+ // If no filtering occurred, return the original buffer
165
+ if len (filtered ) == 0 {
166
+ return buf
167
167
}
168
168
169
- return filtered . Bytes ()
169
+ return filtered
170
170
}
171
171
172
172
// RemoveXMLTags removes XML tags from the input buffer and retains only the sequences
173
173
// of English alphabet and high byte characters that are not between <> characters.
174
174
func (p * CharSetProbe ) RemoveXMLTags (buf []byte ) []byte {
175
- var filtered bytes.Buffer
175
+ // Pre-allocate a buffer based on an estimate of the filtered content size
176
+ filtered := make ([]byte , 0 , len (buf ))
176
177
inTag := false
177
178
prev := 0
178
179
179
180
for curr := 0 ; curr < len (buf ); curr ++ {
180
181
bufChar := buf [curr ]
181
182
182
- if bufChar == '>' { // End of a tag
183
- prev = curr + 1
183
+ // Check for end of tag
184
+ if bufChar == '>' {
184
185
inTag = false
185
- } else if bufChar == '<' { // Start of a tag
186
+ prev = curr + 1
187
+ continue
188
+ }
189
+
190
+ // Check to begin tag
191
+ if bufChar == '<' {
186
192
if curr > prev && ! inTag {
187
- // Keep everything after last non-extended-ASCII, non-alphabetic character
188
- filtered .Write (buf [prev :curr ])
189
- // Output a space to delimit stretch we kept
190
- filtered .WriteByte (' ' )
193
+ // Append the slice from prev to curr
194
+ filtered = append (filtered , buf [prev :curr ]... )
195
+ filtered = append (filtered , ' ' )
191
196
}
192
197
inTag = true
193
198
}
194
199
}
195
200
196
- // If we're not in a tag, keep everything after last non-extended-ASCII, non-alphabetic character
197
- if ! inTag {
198
- filtered .Write (buf [prev :])
201
+ // If we're not in a tag, and we've got some remaining text to keep
202
+ if ! inTag && prev < len (buf ) {
203
+ filtered = append (filtered , buf [prev :]... )
204
+ }
205
+
206
+ // If no filtering occurred, return the original buffer
207
+ if len (filtered ) == 0 {
208
+ return buf
199
209
}
200
210
201
- return filtered . Bytes ()
211
+ return filtered
202
212
}
0 commit comments