@@ -32,25 +32,24 @@ package wordcounter
3232
3333import (
3434 "fmt"
35- "unicode"
3635 "unicode/utf8"
3736)
3837
39- // TextCounter provides character counting functionality for text content.
38+ // Counter provides character counting functionality for text content.
4039// It implements the CharacterCounter interface and tracks statistics
4140// including lines, Chinese characters, non-Chinese characters, and total characters.
42- type TextCounter struct {
41+ type Counter struct {
4342 S * Stats // Statistics collected during counting
4443}
4544
46- // NewTextCounter creates a new TextCounter instance with initialized statistics.
45+ // NewCounter creates a new Counter instance with initialized statistics.
4746// The returned counter is ready to use for counting operations.
48- func NewTextCounter () * TextCounter {
49- return & TextCounter {S : & Stats {}}
47+ func NewCounter () * Counter {
48+ return & Counter {S : & Stats {}}
5049}
5150
5251// GetStats returns the counting statistics
53- func (c * TextCounter ) GetStats () * Stats {
52+ func (c * Counter ) GetStats () * Stats {
5453 return c .S
5554}
5655
@@ -62,7 +61,7 @@ func (c *TextCounter) GetStats() *Stats {
6261// - []byte: processed directly
6362//
6463// Returns an error if the input is empty or of an unsupported type.
65- func (c * TextCounter ) Count (input any ) error {
64+ func (c * Counter ) Count (input any ) error {
6665 switch v := input .(type ) {
6766 case string :
6867 if v == "" {
@@ -79,50 +78,83 @@ func (c *TextCounter) Count(input any) error {
7978 }
8079}
8180
// isChinese reports whether r lies in one of the Unicode blocks this package
// counts as Chinese text. It uses direct range comparisons rather than a
// range-table lookup.
//
// Ideograph blocks:
//   - 0x3400-0x4DBF:   CJK Unified Ideographs Extension A
//   - 0x4E00-0x9FFF:   CJK Unified Ideographs (most common characters)
//   - 0xF900-0xFAFF:   CJK Compatibility Ideographs
//   - 0x20000-0x2A6DF: CJK Unified Ideographs Extension B
//   - 0x2A700-0x2EE5F: CJK Unified Ideographs Extensions C, D, E, F and I
//     (these blocks are contiguous)
//   - 0x2F800-0x2FA1F: CJK Compatibility Ideographs Supplement
//   - 0x30000-0x323AF: CJK Unified Ideographs Extensions G and H (contiguous)
//
// Punctuation blocks, counted as Chinese on purpose:
//   - 0x3000-0x303F: CJK Symbols and Punctuation
//   - 0xFF00-0xFFEF: Halfwidth and Fullwidth Forms
//
// NOTE: the Halfwidth and Fullwidth Forms block is matched as a whole, so
// fullwidth Latin letters and halfwidth Katakana/Hangul are also reported as
// Chinese.
func isChinese(r rune) bool {
	switch {
	case r < 0x3000:
		// Every accepted range starts at or above U+3000, so ASCII and
		// other low code points are rejected with a single comparison.
		return false
	case r <= 0x303F: // CJK Symbols and Punctuation
		return true
	case r >= 0x3400 && r <= 0x4DBF: // Extension A
		return true
	case r >= 0x4E00 && r <= 0x9FFF: // CJK Unified Ideographs
		return true
	case r >= 0xF900 && r <= 0xFAFF: // CJK Compatibility Ideographs
		return true
	case r >= 0xFF00 && r <= 0xFFEF: // Halfwidth and Fullwidth Forms
		return true
	case r >= 0x20000 && r <= 0x2A6DF: // Extension B
		return true
	case r >= 0x2A700 && r <= 0x2EE5F: // Extensions C, D, E, F, I
		return true
	case r >= 0x2F800 && r <= 0x2FA1F: // Compatibility Ideographs Supplement
		return true
	case r >= 0x30000 && r <= 0x323AF: // Extensions G, H
		return true
	}
	return false
}
106+
82107// CountBytes efficiently counts characters from a byte slice with minimal memory allocation.
83- // This method processes UTF-8 encoded text and updates the following statistics:
84- // - Lines: counted by scanning for newline characters
85- // - Chinese characters: identified using Unicode Han script ranges
108+ // This optimized version processes UTF-8 encoded text in a single pass and updates the following statistics:
109+ // - Lines: counted by scanning for newline characters (newlines + 1 for content)
110+ // - Chinese characters: identified using optimized Unicode range checks
86111// - Non-Chinese characters: all other characters except newlines
87112// - Total characters: sum of Chinese and non-Chinese characters (excluding newlines)
88113//
89- // The method uses utf8.DecodeRune for proper UTF-8 character boundary handling
90- // and avoids unnecessary string conversions for optimal performance.
114+ // Performance optimizations:
115+ // - Single-pass processing (combines line counting and character analysis)
116+ // - Direct Unicode range checks instead of unicode.In() for better performance
117+ // - Minimal function call overhead
118+ // - Local variables to reduce struct field access overhead
91119//
92120// Returns an error if the input data is empty.
93- func (c * TextCounter ) CountBytes (data []byte ) error {
121+ func (c * Counter ) CountBytes (data []byte ) error {
94122 if len (data ) == 0 {
95123 return NewInvalidInputError ("input data cannot be empty" )
96124 }
97125
98- // Count lines by scanning for newline characters
126+ // Use local variables to minimize struct field access overhead
99127 lines := 0
100- for _ , b := range data {
101- if b == '\n' {
102- lines ++
103- }
104- }
105- // If there's content but no newlines, it's still one line
106- if lines == 0 && len (data ) > 0 {
107- lines = 1
108- }
109- c .S .Lines += lines
128+ chineseChars := 0
129+ nonChineseChars := 0
110130
111- // Process runes directly from byte slice to avoid string conversion
112- // Skip newline characters to match original behavior
113- i := 0
114- for i < len (data ) {
131+ // Single-pass processing: count lines and characters simultaneously
132+ for i := 0 ; i < len (data ); {
115133 r , size := utf8 .DecodeRune (data [i :])
116- if r != '\n' { // Skip newline characters
117- c .S .TotalChars ++
118- if unicode .In (r , unicode .Han ) {
119- c .S .ChineseChars ++
134+ if r == '\n' {
135+ lines ++
136+ } else {
137+ // Count non-newline characters
138+ if isChinese (r ) {
139+ chineseChars ++
120140 } else {
121- c . S . NonChineseChars ++
141+ nonChineseChars ++
122142 }
123143 }
124144 i += size
125145 }
126146
147+ // Line counting logic: number of newlines + 1 (if there's any content)
148+ // This correctly handles cases like "line1\nline2\nline3" (2 newlines = 3 lines)
149+ if len (data ) > 0 {
150+ lines ++ // Add 1 for the content itself
151+ }
152+
153+ // Update statistics in batch to minimize memory writes
154+ c .S .Lines += lines
155+ c .S .ChineseChars += chineseChars
156+ c .S .NonChineseChars += nonChineseChars
157+ c .S .TotalChars += chineseChars + nonChineseChars
158+
127159 return nil
128160}
0 commit comments