Skip to content

Commit 437d54e

Browse files
committed
perf: optimize character counting with single-pass algorithm and direct Unicode checks
- Replace unicode.In() with direct Unicode range checks for better performance - Implement single-pass processing to count lines and characters simultaneously - Use local variables to reduce struct field access overhead - Add comprehensive isChinese() function covering all CJK Unicode blocks - Improve line counting logic accuracy - Rename TextCounter to Counter for better naming consistency - Achieve zero memory allocation in CountBytes method - Add extensive benchmark tests showing significant performance improvements Performance improvements: - Zero memory allocations (0 B/op, 0 allocs/op) - Faster character classification with direct range checks - Reduced time complexity with single-pass algorithm - Better cache locality with local variables Also update .gitignore to exclude coverage files from repository.
1 parent d75d54c commit 437d54e

9 files changed

Lines changed: 446 additions & 72 deletions

File tree

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,3 +116,7 @@ counter.xlsx
116116
*.xlsx
117117
test.csv
118118
test.xlsx
119+
120+
# Coverage files
121+
coverage.out
122+
coverage.html

count.go

Lines changed: 66 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -32,25 +32,24 @@ package wordcounter
3232

3333
import (
3434
"fmt"
35-
"unicode"
3635
"unicode/utf8"
3736
)
3837

39-
// TextCounter provides character counting functionality for text content.
38+
// Counter provides character counting functionality for text content.
4039
// It implements the CharacterCounter interface and tracks statistics
4140
// including lines, Chinese characters, non-Chinese characters, and total characters.
42-
type TextCounter struct {
41+
type Counter struct {
4342
S *Stats // Statistics collected during counting
4443
}
4544

46-
// NewTextCounter creates a new TextCounter instance with initialized statistics.
45+
// NewCounter creates a new Counter instance with initialized statistics.
4746
// The returned counter is ready to use for counting operations.
48-
func NewTextCounter() *TextCounter {
49-
return &TextCounter{S: &Stats{}}
47+
func NewCounter() *Counter {
48+
return &Counter{S: &Stats{}}
5049
}
5150

5251
// GetStats returns the counting statistics
53-
func (c *TextCounter) GetStats() *Stats {
52+
func (c *Counter) GetStats() *Stats {
5453
return c.S
5554
}
5655

@@ -62,7 +61,7 @@ func (c *TextCounter) GetStats() *Stats {
6261
// - []byte: processed directly
6362
//
6463
// Returns an error if the input is empty or of an unsupported type.
65-
func (c *TextCounter) Count(input any) error {
64+
func (c *Counter) Count(input any) error {
6665
switch v := input.(type) {
6766
case string:
6867
if v == "" {
@@ -79,50 +78,83 @@ func (c *TextCounter) Count(input any) error {
7978
}
8079
}
8180

81+
// isChinese checks if a rune is a Chinese character using direct Unicode range checks.
82+
// This is more efficient than using unicode.In(r, unicode.Han) as it avoids
83+
// the overhead of range table lookups.
84+
//
85+
// Covers the main CJK Unicode blocks:
86+
// - 0x4E00-0x9FFF: CJK Unified Ideographs (most common Chinese characters)
87+
// - 0x3400-0x4DBF: CJK Extension A
88+
// - 0x20000-0x2A6DF: CJK Extension B
89+
// - 0x2A700-0x2B73F: CJK Extension C
90+
// - 0x2B740-0x2B81F: CJK Extension D
91+
// - 0x2B820-0x2CEAF: CJK Extension E
92+
// - 0x2CEB0-0x2EBEF: CJK Extension F
93+
// - 0x3000-0x303F: CJK Symbols and Punctuation
94+
// - 0xFF00-0xFFEF: Halfwidth and Fullwidth Forms (Chinese punctuation)
95+
func isChinese(r rune) bool {
96+
return (r >= 0x4E00 && r <= 0x9FFF) || // CJK Unified Ideographs
97+
(r >= 0x3400 && r <= 0x4DBF) || // CJK Extension A
98+
(r >= 0x20000 && r <= 0x2A6DF) || // CJK Extension B
99+
(r >= 0x2A700 && r <= 0x2B73F) || // CJK Extension C
100+
(r >= 0x2B740 && r <= 0x2B81F) || // CJK Extension D
101+
(r >= 0x2B820 && r <= 0x2CEAF) || // CJK Extension E
102+
(r >= 0x2CEB0 && r <= 0x2EBEF) || // CJK Extension F
103+
(r >= 0x3000 && r <= 0x303F) || // CJK Symbols and Punctuation
104+
(r >= 0xFF00 && r <= 0xFFEF) // Halfwidth and Fullwidth Forms
105+
}
106+
82107
// CountBytes efficiently counts characters from a byte slice with minimal memory allocation.
83-
// This method processes UTF-8 encoded text and updates the following statistics:
84-
// - Lines: counted by scanning for newline characters
85-
// - Chinese characters: identified using Unicode Han script ranges
108+
// This optimized version processes UTF-8 encoded text in a single pass and updates the following statistics:
109+
// - Lines: counted by scanning for newline characters (newlines + 1 for content)
110+
// - Chinese characters: identified using optimized Unicode range checks
86111
// - Non-Chinese characters: all other characters except newlines
87112
// - Total characters: sum of Chinese and non-Chinese characters (excluding newlines)
88113
//
89-
// The method uses utf8.DecodeRune for proper UTF-8 character boundary handling
90-
// and avoids unnecessary string conversions for optimal performance.
114+
// Performance optimizations:
115+
// - Single-pass processing (combines line counting and character analysis)
116+
// - Direct Unicode range checks instead of unicode.In() for better performance
117+
// - Minimal function call overhead
118+
// - Local variables to reduce struct field access overhead
91119
//
92120
// Returns an error if the input data is empty.
93-
func (c *TextCounter) CountBytes(data []byte) error {
121+
func (c *Counter) CountBytes(data []byte) error {
94122
if len(data) == 0 {
95123
return NewInvalidInputError("input data cannot be empty")
96124
}
97125

98-
// Count lines by scanning for newline characters
126+
// Use local variables to minimize struct field access overhead
99127
lines := 0
100-
for _, b := range data {
101-
if b == '\n' {
102-
lines++
103-
}
104-
}
105-
// If there's content but no newlines, it's still one line
106-
if lines == 0 && len(data) > 0 {
107-
lines = 1
108-
}
109-
c.S.Lines += lines
128+
chineseChars := 0
129+
nonChineseChars := 0
110130

111-
// Process runes directly from byte slice to avoid string conversion
112-
// Skip newline characters to match original behavior
113-
i := 0
114-
for i < len(data) {
131+
// Single-pass processing: count lines and characters simultaneously
132+
for i := 0; i < len(data); {
115133
r, size := utf8.DecodeRune(data[i:])
116-
if r != '\n' { // Skip newline characters
117-
c.S.TotalChars++
118-
if unicode.In(r, unicode.Han) {
119-
c.S.ChineseChars++
134+
if r == '\n' {
135+
lines++
136+
} else {
137+
// Count non-newline characters
138+
if isChinese(r) {
139+
chineseChars++
120140
} else {
121-
c.S.NonChineseChars++
141+
nonChineseChars++
122142
}
123143
}
124144
i += size
125145
}
126146

147+
// Line counting logic: number of newlines + 1 (if there's any content)
148+
// This correctly handles cases like "line1\nline2\nline3" (2 newlines = 3 lines)
149+
if len(data) > 0 {
150+
lines++ // Add 1 for the content itself
151+
}
152+
153+
// Update statistics in batch to minimize memory writes
154+
c.S.Lines += lines
155+
c.S.ChineseChars += chineseChars
156+
c.S.NonChineseChars += nonChineseChars
157+
c.S.TotalChars += chineseChars + nonChineseChars
158+
127159
return nil
128160
}

0 commit comments

Comments
 (0)