1010using Microsoft . Extensions . DependencyInjection ;
1111using Microsoft . Extensions . Logging ;
1212using System ;
13+ using System . Collections . Generic ;
1314using System . IO ;
1415using System . Linq ;
1516using System . Net . Http ;
1617using System . Net . Http . Headers ;
1718using System . Security . Cryptography ;
19+ using System . Text ;
1820using System . Text . Json ;
21+ using System . Text . RegularExpressions ;
1922using System . Threading ;
2023using System . Threading . Tasks ;
2124
@@ -64,7 +67,17 @@ public Task HandleAsync(FileDeletedEvent data, MetaData metaData, CancellationTo
6467
6568 private async Task ProcessFileAsync ( FileEntry fileEntry , CancellationToken cancellationToken )
6669 {
67- if ( string . IsNullOrEmpty ( fileEntry ? . FileLocation ) )
70+ if ( fileEntry == null )
71+ {
72+ return ;
73+ }
74+
75+ if ( string . IsNullOrEmpty ( fileEntry . FileLocation ) )
76+ {
77+ return ;
78+ }
79+
80+ if ( fileEntry . Encrypted && string . IsNullOrEmpty ( fileEntry . EncryptionKey ) )
6881 {
6982 return ;
7083 }
@@ -78,8 +91,22 @@ private async Task ProcessFileAsync(FileEntry fileEntry, CancellationToken cance
7891 fileExtension == ".md" ||
7992 fileExtension == ".markdown" )
8093 {
81- // TODO: xxx
82- return ;
94+
95+ var bytes = await GetBytesAsync ( fileStorageManager , fileEntry , cancellationToken ) ;
96+
97+ var chunks = TextChunkingService . ChunkSentences ( Encoding . UTF8 . GetString ( bytes ) ) ;
98+
99+ var chunksFolder = Path . Combine ( _configuration [ "Storage:TempFolderPath" ] , "Chunks" , fileEntry . Id . ToString ( ) ) ;
100+
101+ if ( ! Directory . Exists ( chunksFolder ) )
102+ {
103+ Directory . CreateDirectory ( chunksFolder ) ;
104+ }
105+
106+ foreach ( var chunk in chunks )
107+ {
108+ await File . WriteAllTextAsync ( Path . Combine ( chunksFolder , $ "{ chunk . StartIndex } _{ chunk . EndIndex } .txt") , chunk . Text , cancellationToken ) ;
109+ }
83110 }
84111 else if ( fileExtension == ".pdf" ||
85112 fileExtension == ".docx" ||
@@ -101,6 +128,20 @@ private async Task ProcessFileAsync(FileEntry fileEntry, CancellationToken cance
101128 var markdown = await ConvertToMarkdownAsync ( fileStorageManager , fileEntry , cancellationToken ) ;
102129 await File . WriteAllTextAsync ( markdownFile , markdown , cancellationToken ) ;
103130 }
131+
132+ var chunks = TextChunkingService . ChunkSentences ( await File . ReadAllTextAsync ( markdownFile , cancellationToken ) ) ;
133+
134+ var chunksFolder = Path . Combine ( _configuration [ "Storage:TempFolderPath" ] , "Chunks" , fileEntry . Id . ToString ( ) ) ;
135+
136+ if ( ! Directory . Exists ( chunksFolder ) )
137+ {
138+ Directory . CreateDirectory ( chunksFolder ) ;
139+ }
140+
141+ foreach ( var chunk in chunks )
142+ {
143+ await File . WriteAllTextAsync ( Path . Combine ( chunksFolder , $ "{ chunk . StartIndex } _{ chunk . EndIndex } .txt") , chunk . Text , cancellationToken ) ;
144+ }
104145 }
105146 else if ( fileExtension == ".jpg" ||
106147 fileExtension == ".png" )
@@ -234,3 +275,173 @@ class Caption
234275 public float Confidence { get ; set ; }
235276 }
236277}
278+
/// <summary>
/// A slice of a larger text together with the character positions it was taken from.
/// All three members must be supplied in the object initializer and cannot be
/// changed afterwards.
/// </summary>
public class Chunk
{
    /// <summary>The textual content of the chunk.</summary>
    public required string Text { get; init; }

    /// <summary>Zero-based position in the source text at which the chunk begins.</summary>
    public required int StartIndex { get; init; }

    /// <summary>Zero-based position in the source text at which the chunk ends.</summary>
    public required int EndIndex { get; init; }
}
287+
/// <summary>
/// Groups the sentences of a text into chunks that stay within an approximate token budget.
/// </summary>
/// <remarks>
/// Sentences are detected with a simple punctuation heuristic (a run of characters ending in
/// '.', '!' or '?' plus any trailing whitespace), so abbreviations and decimal numbers are
/// split as if they ended a sentence. Token counts are estimates, not real tokenizer output.
/// </remarks>
public class TextChunkingService
{
    /// <summary>
    /// Average sentence size, in tokens, assumed when converting the overlap token budget
    /// into a number of sentences to repeat.
    /// </summary>
    private const int AssumedTokensPerSentence = 20;

    /// <summary>
    /// Matches one sentence: any run of non-terminator characters, one terminator
    /// ('.', '!' or '?') and the whitespace that follows it. Cached so the pattern is
    /// not re-parsed on every call.
    /// </summary>
    private static readonly Regex SentenceRegex = new Regex(@"[^\.!\?]*[\.!\?]\s*", RegexOptions.Compiled);

    /// <summary>
    /// Splits <paramref name="text"/> into consecutive, non-overlapping chunks made of whole sentences.
    /// </summary>
    /// <param name="text">Text to split. <c>null</c> or empty text produces no chunks.</param>
    /// <param name="maxTokens">
    /// Upper bound on the number of whitespace-delimited words per chunk. A single sentence
    /// that is larger than the budget is still emitted as one (oversized) chunk.
    /// </param>
    /// <returns>
    /// A lazily evaluated sequence of chunks. <see cref="Chunk.StartIndex"/> is the position of the
    /// first character of the first sentence and <see cref="Chunk.EndIndex"/> the inclusive position
    /// of the last character of the last sentence (including whitespace captured after the
    /// terminator), while <see cref="Chunk.Text"/> is trimmed.
    /// </returns>
    public static IEnumerable<Chunk> ChunkSentences(string text, int maxTokens = 800)
    {
        var current = new StringBuilder();
        int tokenCount = 0;
        int chunkStartIndex = -1;
        int chunkEndIndex = -1;

        foreach (var (content, start, end) in SplitSentences(text))
        {
            // Count words on any whitespace: .txt/.md input separates words with newlines
            // and tabs as well as spaces.
            int sentenceTokens = CountWords(content);

            // Flush before the budget would be exceeded, but never emit an empty chunk.
            if (tokenCount + sentenceTokens > maxTokens && current.Length > 0)
            {
                yield return CreateChunk(current, chunkStartIndex, chunkEndIndex);

                current.Clear();
                tokenCount = 0;
            }

            if (current.Length == 0)
            {
                chunkStartIndex = start;
            }

            current.Append(content);
            tokenCount += sentenceTokens;
            chunkEndIndex = end;
        }

        if (current.Length > 0)
        {
            yield return CreateChunk(current, chunkStartIndex, chunkEndIndex);
        }
    }

    /// <summary>
    /// Splits <paramref name="text"/> into chunks made of whole sentences, where each chunk after
    /// the first starts with the last few sentences of the previous chunk.
    /// </summary>
    /// <param name="text">Text to split. <c>null</c> or empty text produces no chunks.</param>
    /// <param name="maxTokens">
    /// Approximate token budget per chunk, using the 4-characters-per-token estimate. The repeated
    /// sentences count towards the budget; a chunk can still exceed it when the overlap plus one
    /// new sentence is larger than the budget.
    /// </param>
    /// <param name="overlapRatio">
    /// Fraction of <paramref name="maxTokens"/> to repeat from the previous chunk. The resulting
    /// token amount is converted to a sentence count assuming 20 tokens per sentence, with a minimum
    /// of one sentence. A value of zero or less disables overlap.
    /// </param>
    /// <returns>
    /// A lazily evaluated sequence of chunks. <see cref="Chunk.Text"/> is rebuilt from trimmed
    /// sentences joined by single spaces, so it is not necessarily an exact substring of the input.
    /// <see cref="Chunk.StartIndex"/> points at the first repeated sentence when overlap is present.
    /// </returns>
    public static IEnumerable<Chunk> ChunkSentencesOverlapping(string text, int maxTokens = 800, double overlapRatio = 0.1)
    {
        int estimatedOverlapTokens = (int)(maxTokens * overlapRatio);

        // A non-positive ratio means "no overlap"; otherwise always repeat at least one sentence.
        int overlapSentenceCount = overlapRatio > 0
            ? Math.Max(1, estimatedOverlapTokens / AssumedTokensPerSentence)
            : 0;

        var current = new StringBuilder();
        int tokenCount = 0;
        var lastChunkSentences = new List<(string content, int start, int end)>();
        int chunkStartIndex = -1;
        int chunkEndIndex = -1;

        foreach (var (rawContent, start, end) in SplitSentences(text))
        {
            // Sentences are stored trimmed and re-joined with single spaces below.
            string content = rawContent.Trim();
            int sentenceTokens = EstimateTokens(content);

            // If adding this sentence exceeds the limit, emit the current chunk first.
            if (tokenCount + sentenceTokens > maxTokens && current.Length > 0)
            {
                yield return CreateChunk(current, chunkStartIndex, chunkEndIndex);

                current.Clear();
                tokenCount = 0;

                // Seed the next chunk with the tail of the chunk that was just emitted.
                if (overlapSentenceCount > 0)
                {
                    var overlapSentences = lastChunkSentences.TakeLast(overlapSentenceCount).ToList();
                    if (overlapSentences.Count > 0)
                    {
                        var overlap = string.Join(" ", overlapSentences.Select(s => s.content));
                        current.Append(overlap).Append(' ');
                        tokenCount = EstimateTokens(overlap);
                        chunkStartIndex = overlapSentences[0].start;
                    }
                }

                lastChunkSentences.Clear();
            }

            if (current.Length == 0)
            {
                chunkStartIndex = start;
            }

            current.Append(content).Append(' ');
            tokenCount += sentenceTokens;
            chunkEndIndex = end;
            lastChunkSentences.Add((content, start, end));
        }

        if (current.Length > 0)
        {
            yield return CreateChunk(current, chunkStartIndex, chunkEndIndex);
        }
    }

    /// <summary>
    /// Splits text into sentences, keeping the start and inclusive end position of each one.
    /// Text after the last terminator is returned as a final sentence unless it is only whitespace.
    /// </summary>
    private static List<(string content, int start, int end)> SplitSentences(string text)
    {
        var sentences = new List<(string content, int start, int end)>();

        if (string.IsNullOrEmpty(text))
        {
            return sentences;
        }

        int lastEnd = 0;
        foreach (Match match in SentenceRegex.Matches(text))
        {
            sentences.Add((match.Value, match.Index, match.Index + match.Length - 1));
            lastEnd = match.Index + match.Length;
        }

        // Handle any remaining text that doesn't end with sentence punctuation.
        if (lastEnd < text.Length)
        {
            string remaining = text.Substring(lastEnd);
            if (!string.IsNullOrWhiteSpace(remaining))
            {
                sentences.Add((remaining, lastEnd, text.Length - 1));
            }
        }

        return sentences;
    }

    /// <summary>Builds a chunk from the accumulated text, trimming surrounding whitespace.</summary>
    private static Chunk CreateChunk(StringBuilder buffer, int startIndex, int endIndex)
    {
        return new Chunk
        {
            Text = buffer.ToString().Trim(),
            StartIndex = startIndex,
            EndIndex = endIndex
        };
    }

    /// <summary>Counts the runs of non-whitespace characters in <paramref name="text"/>.</summary>
    private static int CountWords(string text)
    {
        int words = 0;
        bool insideWord = false;

        foreach (char c in text)
        {
            if (char.IsWhiteSpace(c))
            {
                insideWord = false;
            }
            else if (!insideWord)
            {
                insideWord = true;
                words++;
            }
        }

        return words;
    }

    /// <summary>Estimates the token count of <paramref name="text"/>; never returns less than 1.</summary>
    private static int EstimateTokens(string text)
    {
        // Rough heuristic: 1 token ≈ 4 chars
        return Math.Max(1, text.Length / 4);
    }
}
0 commit comments