@@ -66,7 +66,8 @@ export class LocalDocumentResult extends LocalDocument {
6666 startPos : startPos + offset ,
6767 endPos : startPos + offset + chunkLength - 1 ,
6868 score : chunk . score ,
69- tokenCount : chunkLength
69+ tokenCount : chunkLength ,
70+ isBm25 : false
7071 } ) ;
7172 offset += chunkLength ;
7273 }
@@ -103,7 +104,8 @@ export class LocalDocumentResult extends LocalDocument {
103104 return {
104105 text : text ,
105106 tokenCount : section . tokenCount ,
106- score : section . score
107+ score : section . score ,
108+ isBm25 : false ,
107109 } ;
108110 } ) ;
109111 }
@@ -127,7 +129,8 @@ export class LocalDocumentResult extends LocalDocument {
127129 return [ {
128130 text,
129131 tokenCount : length ,
130- score : 1.0
132+ score : 1.0 ,
133+ isBm25 : false ,
131134 } ] ;
132135 }
133136
@@ -148,7 +151,8 @@ export class LocalDocumentResult extends LocalDocument {
148151 startPos,
149152 endPos,
150153 score : chunk . score ,
151- tokenCount : this . _tokenizer . encode ( chunkText ) . length
154+ tokenCount : this . _tokenizer . encode ( chunkText ) . length ,
155+ isBm25 : Boolean ( chunk . item . metadata . isBm25 ) ,
152156 } ;
153157 } ) . filter ( chunk => chunk . tokenCount <= maxTokens ) . sort ( ( a , b ) => a . startPos - b . startPos ) ;
154158
@@ -163,36 +167,63 @@ export class LocalDocumentResult extends LocalDocument {
163167 return [ {
164168 text : this . _tokenizer . decode ( tokens . slice ( 0 , maxTokens ) ) ,
165169 tokenCount : maxTokens ,
166- score : topChunk . score
170+ score : topChunk . score ,
171+ isBm25 : false ,
167172 } ] ;
168173 }
169174
170- // Generate sections
175+ // Generate semantic sections
171176 const sections : Section [ ] = [ ] ;
172177 for ( let i = 0 ; i < chunks . length ; i ++ ) {
173178 const chunk = chunks [ i ] ;
174179 let section = sections [ sections . length - 1 ] ;
175- if ( ! section || section . tokenCount + chunk . tokenCount > maxTokens ) {
176- section = {
177- chunks : [ ] ,
178- score : 0 ,
179- tokenCount : 0
180- } ;
181- sections . push ( section ) ;
180+ if ( ! chunk . isBm25 ) {
181+ if ( ! section || section . tokenCount + chunk . tokenCount > maxTokens ) {
182+ section = {
183+ chunks : [ ] ,
184+ score : 0 ,
185+ tokenCount : 0
186+ } ;
187+ sections . push ( section ) ;
188+ }
189+ section . chunks . push ( chunk ) ;
190+ section . score += chunk . score ;
191+ section . tokenCount += chunk . tokenCount ;
182192 }
183- section . chunks . push ( chunk ) ;
184- section . score += chunk . score ;
185- section . tokenCount += chunk . tokenCount ;
186193 }
187194
195+ // Generate bm25 sections
196+ const bm25Sections : Section [ ] = [ ] ;
197+ for ( let i = 0 ; i < chunks . length ; i ++ ) {
198+ const chunk = chunks [ i ] ;
199+ let section = bm25Sections [ bm25Sections . length - 1 ] ;
200+ if ( chunk . isBm25 ) {
201+ if ( ! section || section . tokenCount + chunk . tokenCount > maxTokens ) {
202+ section = {
203+ chunks : [ ] ,
204+ score : 0 ,
205+ tokenCount : 0
206+ } ;
207+ bm25Sections . push ( section ) ;
208+ }
209+ section . chunks . push ( chunk ) ;
210+ section . score += chunk . score ;
211+ section . tokenCount += chunk . tokenCount ;
212+ }
213+ }
188214 // Normalize section scores
189215 sections . forEach ( section => section . score /= section . chunks . length ) ;
216+ bm25Sections . forEach ( section => section . score /= section . chunks . length ) ;
190217
191218 // Sort sections by score and limit to maxSections
192219 sections . sort ( ( a , b ) => b . score - a . score ) ;
220+ bm25Sections . sort ( ( a , b ) => b . score - a . score ) ;
193221 if ( sections . length > maxSections ) {
194222 sections . splice ( maxSections , sections . length - maxSections ) ;
195223 }
224+ if ( bm25Sections . length > maxSections ) {
225+ bm25Sections . splice ( maxSections , bm25Sections . length - maxSections ) ;
226+ }
196227
197228 // Combine adjacent chunks of text
198229 sections . forEach ( section => {
@@ -216,7 +247,8 @@ export class LocalDocumentResult extends LocalDocument {
216247 startPos : - 1 ,
217248 endPos : - 1 ,
218249 score : 0 ,
219- tokenCount : this . _tokenizer . encode ( '\n\n...\n\n' ) . length
250+ tokenCount : this . _tokenizer . encode ( '\n\n...\n\n' ) . length ,
251+ isBm25 : false ,
220252 } ;
221253 sections . forEach ( section => {
222254 // Insert connectors between chunks
@@ -242,7 +274,8 @@ export class LocalDocumentResult extends LocalDocument {
242274 startPos : sectionStart - beforeBudget ,
243275 endPos : sectionStart - 1 ,
244276 score : 0 ,
245- tokenCount : beforeBudget
277+ tokenCount : beforeBudget ,
278+ isBm25 : false ,
246279 } ;
247280 section . chunks . unshift ( chunk ) ;
248281 section . tokenCount += chunk . tokenCount ;
@@ -258,7 +291,8 @@ export class LocalDocumentResult extends LocalDocument {
258291 startPos : sectionEnd + 1 ,
259292 endPos : sectionEnd + afterBudget ,
260293 score : 0 ,
261- tokenCount : afterBudget
294+ tokenCount : afterBudget ,
295+ isBm25 : false ,
262296 } ;
263297 section . chunks . push ( chunk ) ;
264298 section . tokenCount += chunk . tokenCount ;
@@ -268,16 +302,29 @@ export class LocalDocumentResult extends LocalDocument {
268302 } ) ;
269303 }
270304
271- // Return final rendered sections
272- return sections . map ( section => {
305+ const semanticDocTextSections = sections . map ( section => {
306+ let text = '' ;
307+ section . chunks . forEach ( chunk => text += chunk . text ) ;
308+ return {
309+ text : text ,
310+ tokenCount : section . tokenCount ,
311+ score : section . score ,
312+ isBm25 : false ,
313+ } ;
314+ } ) ;
315+ const bm25DocTextSections = bm25Sections . map ( section => {
273316 let text = '' ;
274317 section . chunks . forEach ( chunk => text += chunk . text ) ;
275318 return {
276319 text : text ,
277320 tokenCount : section . tokenCount ,
278- score : section . score
321+ score : section . score ,
322+ isBm25 : true ,
279323 } ;
280324 } ) ;
325+
326+ // Return final rendered sections
327+ return [ ...semanticDocTextSections , ...bm25DocTextSections ] ;
281328 }
282329
283330 private encodeBeforeText ( text : string , budget : number ) : number [ ] {
@@ -300,6 +347,7 @@ interface SectionChunk {
300347 endPos : number ;
301348 score : number ;
302349 tokenCount : number ;
350+ isBm25 : boolean ;
303351}
304352
305353interface Section {
0 commit comments