1414
1515package contentbuffer
1616
// chunkBoundary stores metadata for a single data chunk.
type chunkBoundary struct {
	// start is where the chunk begins.
	// NOTE(review): presumably a byte offset into the buffer — confirm against the code that fills it.
	start int
	// end is where the chunk ends.
	// NOTE(review): presumably an exclusive byte offset — confirm against the code that fills it.
	end int
	// writeTimes is incremented by Write for the most recent boundary and is
	// summed into the event count reported with completed results.
	writeTimes int
}
2322
24- // SplitResult encapsulates the result of a split operation.
2523type SplitResult struct {
2624 Chunks []string
2725 CompletedEvents int
@@ -42,25 +40,13 @@ type ContentBuffer struct {
4240 currEventCounter int // Event counter for the current chunk.
4341 overlapEventCounter int // Event counter for the current overlap area (delays the next chunk's count).
4442
45- initialCapacity int // The initial capacity of the buffer.
46- shrinkFactor float64 // The threshold factor that triggers buffer shrinkage.
47- resizeFactor float64
43+ initialCapacity int // The initial capacity of the buffer.
4844
4945 counter CharCounter
5046}
5147
52- // BufferOption is a function type for configuring a ContentBuffer.
5348type BufferOption func (* ContentBuffer )
5449
55- // WithInitialCapacity sets the initial capacity of the buffer.
56- func WithInitialCapacity (capacity int ) BufferOption {
57- return func (c * ContentBuffer ) {
58- if capacity > 0 {
59- c .initialCapacity = capacity
60- }
61- }
62- }
63-
6450func WithMaxChars (maxChars int ) BufferOption {
6551 return func (c * ContentBuffer ) {
6652 c .maxChars = maxChars
@@ -73,7 +59,6 @@ func WithOverlapCharNum(overlapCharNum int) BufferOption {
7359 }
7460}
7561
76- // NewContentBuffer creates and initializes a new ContentBuffer.
7762func NewContentBuffer (opts ... BufferOption ) * ContentBuffer {
7863 c := & ContentBuffer {
7964 maxChars : 100 ,
@@ -83,15 +68,13 @@ func NewContentBuffer(opts ...BufferOption) *ContentBuffer {
8368 currStart : 0 ,
8469 currChars : 0 ,
8570 overlapCountDelayed : true ,
86- initialCapacity : 2049 ,
87- shrinkFactor : 2 ,
88- resizeFactor : 1.3 ,
8971 }
9072
9173 for _ , opt := range opts {
9274 opt (c )
9375 }
9476
77+ c .initialCapacity = 2 * c .counter .MaxBytesForChars (c .maxChars )
9578 c .buffer = make ([]byte , 0 , c .initialCapacity )
9679 return c
9780}
@@ -115,49 +98,22 @@ func (c *ContentBuffer) startNewChunk(disableOverlap bool) {
11598 c .overlapEventCounter = 0
11699
117100 if c .overlapCharNum > 0 && ! disableOverlap {
118- overlapStart := c .counter .TailStartIndex (c .buffer , c .overlapCharNum ) // Better implementation?
101+ overlapStart := c .counter .TailStartIndex (c .buffer , c .overlapCharNum )
119102 c .currStart = overlapStart
120103 c .currChars = c .overlapCharNum
121104 } else {
122- c .currStart = len ( c . buffer )
105+ c .currStart = end
123106 c .currChars = 0
124107 }
125108}
126109
127- // shrinkIfNeeded checks if the buffer's capacity needs to be reduced and performs the shrink if necessary.
128- func (c * ContentBuffer ) shrinkIfNeeded () {
129- currentCap := cap (c .buffer )
130- currentLen := len (c .buffer )
131-
132- // If the buffer is empty and its capacity is greater than the initial capacity,
133- // shrink it back to the initial capacity.
134- if currentLen == 0 && currentCap > c .initialCapacity {
135- c .buffer = make ([]byte , 0 , c .initialCapacity )
136- return
137- }
138-
139- // Only consider shrinking when the capacity is greater than the initial capacity.
140- if currentCap > c .initialCapacity {
141- targetShrinkCapacity := int (float64 (currentLen ) * c .shrinkFactor )
142- if targetShrinkCapacity < c .initialCapacity {
143- targetShrinkCapacity = c .initialCapacity
144- }
145-
146- if currentCap > targetShrinkCapacity {
147- newBuf := make ([]byte , currentLen , int (float64 (currentLen )* c .resizeFactor ))
148- copy (newBuf , c .buffer )
149- c .buffer = newBuf
150- }
151- }
152- }
153-
154110// Write adds data to the buffer.
155111func (c * ContentBuffer ) Write (data []byte ) {
156112 i := 0
157113 for i < len (data ) {
158- _ , size , err := c .counter .DecodeChar (data [i :])
114+ size , err := c .counter .DecodeOne (data [i :])
159115 if err != nil {
160- // As a fault-tolerance strategy, skip invalid UTF-8 bytes.
116+ // skip invalid bytes.
161117 i ++
162118 continue
163119 }
@@ -167,9 +123,9 @@ func (c *ContentBuffer) Write(data []byte) {
167123 i += size
168124
169125 if c .currChars == c .maxChars {
170- // Processing is complete and the buffered text has reached the upper limit.
126+ // Processing is complete, and the buffered text has reached the upper limit.
171127 c .startNewChunk (false )
172- if i = = len (data ) && c .overlapCharNum == 0 {
128+ if i > = len (data ) && c .overlapCharNum == 0 {
173129 c .boundaries [len (c .boundaries )- 1 ].writeTimes ++
174130 return
175131 }
@@ -186,7 +142,6 @@ func (c *ContentBuffer) Write(data []byte) {
186142// Flush commits the currently ongoing chunk.
187143func (c * ContentBuffer ) Flush () {
188144 if c .currChars > 0 {
189- // Avoid missing event counts.
190145 c .currEventCounter += c .overlapEventCounter
191146 c .startNewChunk (true )
192147 }
@@ -212,7 +167,6 @@ func (c *ContentBuffer) GetCompletedResult() SplitResult {
212167 eventCount += boundary .writeTimes
213168 }
214169
215- // Clean completed result
216170 if c .currStart > 0 {
217171 remainingSize := len (c .buffer ) - c .currStart
218172 copy (c .buffer , c .buffer [c .currStart :])
@@ -221,7 +175,6 @@ func (c *ContentBuffer) GetCompletedResult() SplitResult {
221175 c .currStart = 0
222176 c .outputIndex = 0
223177 }
224- c .shrinkIfNeeded ()
225178
226179 return SplitResult {Chunks : chunks , CompletedEvents : eventCount }
227180}
0 commit comments