Skip to content

Commit b090394

Browse files
committed
(#271) Semantic/Vector Search
1 parent c91af1c commit b090394

File tree

1 file changed

+214
-3
lines changed

1 file changed

+214
-3
lines changed

src/Monolith/ClassifiedAds.Background/MessageBusConsumers/FileEmbeddingConsumer.cs

Lines changed: 214 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,15 @@
1010
using Microsoft.Extensions.DependencyInjection;
1111
using Microsoft.Extensions.Logging;
1212
using System;
13+
using System.Collections.Generic;
1314
using System.IO;
1415
using System.Linq;
1516
using System.Net.Http;
1617
using System.Net.Http.Headers;
1718
using System.Security.Cryptography;
19+
using System.Text;
1820
using System.Text.Json;
21+
using System.Text.RegularExpressions;
1922
using System.Threading;
2023
using System.Threading.Tasks;
2124

@@ -64,7 +67,17 @@ public Task HandleAsync(FileDeletedEvent data, MetaData metaData, CancellationTo
6467

6568
private async Task ProcessFileAsync(FileEntry fileEntry, CancellationToken cancellationToken)
6669
{
67-
if (string.IsNullOrEmpty(fileEntry?.FileLocation))
70+
if (fileEntry == null)
71+
{
72+
return;
73+
}
74+
75+
if (string.IsNullOrEmpty(fileEntry.FileLocation))
76+
{
77+
return;
78+
}
79+
80+
if (fileEntry.Encrypted && string.IsNullOrEmpty(fileEntry.EncryptionKey))
6881
{
6982
return;
7083
}
@@ -78,8 +91,22 @@ private async Task ProcessFileAsync(FileEntry fileEntry, CancellationToken cance
7891
fileExtension == ".md" ||
7992
fileExtension == ".markdown")
8093
{
81-
// TODO: xxx
82-
return;
94+
95+
var bytes = await GetBytesAsync(fileStorageManager, fileEntry, cancellationToken);
96+
97+
var chunks = TextChunkingService.ChunkSentences(Encoding.UTF8.GetString(bytes));
98+
99+
var chunksFolder = Path.Combine(_configuration["Storage:TempFolderPath"], "Chunks", fileEntry.Id.ToString());
100+
101+
if (!Directory.Exists(chunksFolder))
102+
{
103+
Directory.CreateDirectory(chunksFolder);
104+
}
105+
106+
foreach (var chunk in chunks)
107+
{
108+
await File.WriteAllTextAsync(Path.Combine(chunksFolder, $"{chunk.StartIndex}_{chunk.EndIndex}.txt"), chunk.Text, cancellationToken);
109+
}
83110
}
84111
else if (fileExtension == ".pdf" ||
85112
fileExtension == ".docx" ||
@@ -101,6 +128,20 @@ private async Task ProcessFileAsync(FileEntry fileEntry, CancellationToken cance
101128
var markdown = await ConvertToMarkdownAsync(fileStorageManager, fileEntry, cancellationToken);
102129
await File.WriteAllTextAsync(markdownFile, markdown, cancellationToken);
103130
}
131+
132+
var chunks = TextChunkingService.ChunkSentences(await File.ReadAllTextAsync(markdownFile, cancellationToken));
133+
134+
var chunksFolder = Path.Combine(_configuration["Storage:TempFolderPath"], "Chunks", fileEntry.Id.ToString());
135+
136+
if (!Directory.Exists(chunksFolder))
137+
{
138+
Directory.CreateDirectory(chunksFolder);
139+
}
140+
141+
foreach (var chunk in chunks)
142+
{
143+
await File.WriteAllTextAsync(Path.Combine(chunksFolder, $"{chunk.StartIndex}_{chunk.EndIndex}.txt"), chunk.Text, cancellationToken);
144+
}
104145
}
105146
else if (fileExtension == ".jpg" ||
106147
fileExtension == ".png")
@@ -234,3 +275,173 @@ class Caption
234275
public float Confidence { get; set; }
235276
}
236277
}
278+
279+
/// <summary>
/// A contiguous slice of a source text produced by <see cref="TextChunkingService"/>.
/// </summary>
public class Chunk
{
    /// <summary>
    /// The chunk's text with leading and trailing whitespace removed.
    /// </summary>
    public required string Text { get; init; }

    /// <summary>
    /// Zero-based index, in the source text, of the first character of the first
    /// sentence contained in the chunk.
    /// </summary>
    public required int StartIndex { get; init; }

    /// <summary>
    /// Zero-based, inclusive index, in the source text, of the last character of the
    /// last sentence contained in the chunk. Whitespace that directly follows the
    /// sentence terminator is part of the sentence span, so this may point past the
    /// end of the trimmed <see cref="Text"/>.
    /// </summary>
    public required int EndIndex { get; init; }
}

/// <summary>
/// Splits text into sentence-aligned chunks that stay within a token budget.
/// </summary>
public class TextChunkingService
{
    // Average sentence size (in estimated tokens) assumed when converting the
    // overlap token budget into a number of sentences to carry over.
    private const int AssumedTokensPerSentence = 20;

    // A sentence is any run of non-terminator characters followed by '.', '!' or '?'
    // plus the whitespace right after it. Built once instead of on every call.
    private static readonly Regex SentencePattern = new(@"[^\.!\?]*[\.!\?]\s*", RegexOptions.Compiled);

    /// <summary>
    /// Groups consecutive sentences of <paramref name="text"/> into chunks of at most
    /// <paramref name="maxTokens"/> whitespace-separated words.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <param name="maxTokens">Maximum number of words per chunk.</param>
    /// <returns>
    /// The chunks in document order. A single sentence longer than
    /// <paramref name="maxTokens"/> is never split; it is returned as its own
    /// oversized chunk. Empty or whitespace-only text yields no chunks.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public static IEnumerable<Chunk> ChunkSentences(string text, int maxTokens = 800)
    {
        // Validate here rather than inside the iterator so that a null argument
        // fails at the call site instead of on first enumeration.
        ArgumentNullException.ThrowIfNull(text);
        return ChunkSentencesIterator(text, maxTokens);
    }

    /// <summary>
    /// Groups consecutive sentences of <paramref name="text"/> into chunks of at most
    /// <paramref name="maxTokens"/> estimated tokens (one token per four characters),
    /// repeating the last sentences of each chunk at the start of the next one.
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <param name="maxTokens">Token budget per chunk.</param>
    /// <param name="overlapRatio">
    /// Fraction of <paramref name="maxTokens"/> to repeat between neighbouring chunks.
    /// The resulting token budget is converted to a sentence count assuming 20 tokens
    /// per sentence, with a minimum of one sentence. A ratio that produces a budget of
    /// zero tokens or less disables overlap.
    /// </param>
    /// <returns>
    /// The chunks in document order. Sentences are trimmed and joined with a single
    /// space. The carried-over sentences count against the next chunk's budget and
    /// <see cref="Chunk.StartIndex"/> points at the first carried-over sentence.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    public static IEnumerable<Chunk> ChunkSentencesOverlapping(string text, int maxTokens = 800, double overlapRatio = 0.1)
    {
        ArgumentNullException.ThrowIfNull(text);
        return ChunkSentencesOverlappingIterator(text, maxTokens, overlapRatio);
    }

    // Iterator behind ChunkSentences: accumulates sentences until the word budget would be exceeded.
    private static IEnumerable<Chunk> ChunkSentencesIterator(string text, int maxTokens)
    {
        var current = new StringBuilder();
        int tokenCount = 0;
        int chunkStartIndex = -1;
        int chunkEndIndex = -1;

        foreach (var (content, start, end) in SplitSentences(text))
        {
            int sentenceTokens = CountWords(content);

            // Flush before adding a sentence that would push the chunk over budget.
            // The "current.Length > 0" guard lets an oversized sentence through alone.
            if (tokenCount + sentenceTokens > maxTokens && current.Length > 0)
            {
                yield return BuildChunk(current, chunkStartIndex, chunkEndIndex);
                current.Clear();
                tokenCount = 0;
            }

            if (current.Length == 0)
            {
                chunkStartIndex = start;
            }

            current.Append(content);
            tokenCount += sentenceTokens;
            chunkEndIndex = end;
        }

        if (current.Length > 0)
        {
            yield return BuildChunk(current, chunkStartIndex, chunkEndIndex);
        }
    }

    // Iterator behind ChunkSentencesOverlapping: same accumulation, but each new chunk is
    // seeded with the trailing sentences of the chunk that was just emitted.
    private static IEnumerable<Chunk> ChunkSentencesOverlappingIterator(string text, int maxTokens, double overlapRatio)
    {
        int overlapTokenBudget = (int)(maxTokens * overlapRatio);

        // No budget means no overlap; otherwise carry at least one sentence.
        int overlapSentenceCount = overlapTokenBudget > 0
            ? Math.Max(1, overlapTokenBudget / AssumedTokensPerSentence)
            : 0;

        var current = new StringBuilder();
        var sentencesInChunk = new List<(string Content, int Start, int End)>();
        int tokenCount = 0;
        int chunkStartIndex = -1;
        int chunkEndIndex = -1;

        foreach (var (rawContent, start, end) in SplitSentences(text))
        {
            string content = rawContent.Trim();
            int sentenceTokens = EstimateTokens(content);

            if (tokenCount + sentenceTokens > maxTokens && current.Length > 0)
            {
                yield return BuildChunk(current, chunkStartIndex, chunkEndIndex);

                current.Clear();
                tokenCount = 0;

                if (overlapSentenceCount > 0)
                {
                    // Only sentences added since the previous flush are candidates, so
                    // already-repeated text is not carried forward a second time.
                    var carried = sentencesInChunk.TakeLast(overlapSentenceCount).ToList();
                    if (carried.Count > 0)
                    {
                        string overlap = string.Join(" ", carried.Select(s => s.Content));
                        current.Append(overlap).Append(' ');
                        tokenCount = EstimateTokens(overlap);
                        chunkStartIndex = carried[0].Start;
                    }
                }

                sentencesInChunk.Clear();
            }

            if (current.Length == 0)
            {
                chunkStartIndex = start;
            }

            current.Append(content).Append(' ');
            tokenCount += sentenceTokens;
            chunkEndIndex = end;
            sentencesInChunk.Add((content, start, end));
        }

        if (current.Length > 0)
        {
            yield return BuildChunk(current, chunkStartIndex, chunkEndIndex);
        }
    }

    // Splits text into sentences, returning each sentence's raw text together with the
    // inclusive start/end indices it occupies in the original string.
    private static List<(string Content, int Start, int End)> SplitSentences(string text)
    {
        var sentences = new List<(string Content, int Start, int End)>();

        int lastEnd = 0;
        foreach (Match match in SentencePattern.Matches(text))
        {
            sentences.Add((match.Value, match.Index, match.Index + match.Length - 1));
            lastEnd = match.Index + match.Length;
        }

        // Trailing text without a sentence terminator still forms a final sentence.
        if (lastEnd < text.Length)
        {
            string remaining = text.Substring(lastEnd);
            if (!string.IsNullOrWhiteSpace(remaining))
            {
                sentences.Add((remaining, lastEnd, text.Length - 1));
            }
        }

        return sentences;
    }

    // Creates a chunk from the accumulated text; the text is trimmed, the indices are not.
    private static Chunk BuildChunk(StringBuilder content, int startIndex, int endIndex)
    {
        return new Chunk
        {
            Text = content.ToString().Trim(),
            StartIndex = startIndex,
            EndIndex = endIndex
        };
    }

    // Counts words separated by any whitespace (spaces, tabs, line breaks), so text
    // that is wrapped across lines is not counted as a single word per line run.
    private static int CountWords(string text)
    {
        int count = 0;
        bool insideWord = false;

        foreach (char c in text)
        {
            if (char.IsWhiteSpace(c))
            {
                insideWord = false;
            }
            else if (!insideWord)
            {
                insideWord = true;
                count++;
            }
        }

        return count;
    }

    private static int EstimateTokens(string text)
    {
        // Rough heuristic: 1 token ≈ 4 chars
        return Math.Max(1, text.Length / 4);
    }
}

0 commit comments

Comments
 (0)