Skip to content

Commit 33f46ea

Browse files
committed
(#271) Semantic/Vector Search
1 parent ca1945c commit 33f46ea

File tree

3 files changed

+33
-17
lines changed

3 files changed

+33
-17
lines changed

src/Monolith/ClassifiedAds.Background/MessageBusConsumers/FileEmbeddingConsumer.cs

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,25 @@ public sealed class FileEmbeddingConsumer :
2929
private readonly IConfiguration _configuration;
3030
private readonly IServiceProvider _serviceProvider;
3131

32+
private readonly string _tempFolder;
33+
private readonly string _markdownFolder;
34+
private readonly string _imageAnalysisFolder;
35+
private readonly string _chunkFolder;
36+
private readonly string _embeddingFolder;
37+
3238
/// <summary>
/// Creates the consumer, stores its dependencies, and resolves the temp
/// sub-folder paths (Markdown, ImageAnalysis, Chunks, Embeddings) from the
/// <c>Storage:TempFolderPath</c> configuration value.
/// </summary>
/// <param name="logger">Logger stored for use by this consumer.</param>
/// <param name="configuration">Configuration that must provide <c>Storage:TempFolderPath</c>.</param>
/// <param name="serviceProvider">Service provider stored for later use by this consumer.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when <c>Storage:TempFolderPath</c> is missing or blank.
/// </exception>
public FileEmbeddingConsumer(ILogger<FileEmbeddingConsumer> logger,
    IConfiguration configuration,
    IServiceProvider serviceProvider)
{
    _logger = logger;
    _configuration = configuration;
    _serviceProvider = serviceProvider;

    // Validate once, up front: a missing key would make Path.Combine throw an
    // ArgumentNullException that does not name the setting, and a blank value
    // would silently yield relative folders under the working directory.
    var tempFolder = _configuration["Storage:TempFolderPath"];
    if (string.IsNullOrWhiteSpace(tempFolder))
    {
        throw new InvalidOperationException(
            "Configuration value 'Storage:TempFolderPath' is required but was missing or empty.");
    }

    _tempFolder = tempFolder;
    _markdownFolder = Path.Combine(_tempFolder, "Markdown");
    _imageAnalysisFolder = Path.Combine(_tempFolder, "ImageAnalysis");
    _chunkFolder = Path.Combine(_tempFolder, "Chunks");
    _embeddingFolder = Path.Combine(_tempFolder, "Embeddings");
}
4052

4153
public async Task HandleAsync(FileCreatedEvent data, MetaData metaData, CancellationToken cancellationToken = default)
@@ -107,9 +119,9 @@ private async Task ProcessFileAsync(FileEntry fileEntry, CancellationToken cance
107119

108120
var chunks = TextChunkingService.ChunkSentences(Encoding.UTF8.GetString(bytes));
109121

110-
var chunksFolder = CreateDirectoryIfNotExist(Path.Combine(_configuration["Storage:TempFolderPath"], "Chunks", fileEntry.Id.ToString()));
122+
var chunksFolder = CreateDirectoryIfNotExist(Path.Combine(_chunkFolder, fileEntry.Id.ToString()));
111123

112-
var embeddingsFolder = CreateDirectoryIfNotExist(Path.Combine(_configuration["Storage:TempFolderPath"], "Embeddings", fileEntry.Id.ToString()));
124+
var embeddingsFolder = CreateDirectoryIfNotExist(Path.Combine(_embeddingFolder, fileEntry.Id.ToString()));
113125

114126
foreach (var chunk in chunks)
115127
{
@@ -121,6 +133,7 @@ private async Task ProcessFileAsync(FileEntry fileEntry, CancellationToken cance
121133
var fileEntryEmbedding = new FileEntryEmbedding
122134
{
123135
ChunkName = $"{chunk.StartIndex}_{chunk.EndIndex}.txt",
136+
ChunkLocation = Path.Combine("Chunks", fileEntry.Id.ToString(), $"{chunk.StartIndex}_{chunk.EndIndex}.txt"),
124137
FileEntryId = fileEntry.Id,
125138
Embedding = JsonSerializer.Serialize(embedding.EmbeddingVector),
126139
TokenDetails = JsonSerializer.Serialize(embedding.UsageDetails)
@@ -141,19 +154,17 @@ private async Task ProcessFileAsync(FileEntry fileEntry, CancellationToken cance
141154
{
142155
_logger.LogInformation("Converting file to markdown for FileEntry Id: {FileEntryId}", fileEntry?.Id);
143156

144-
var markdownFolder = CreateDirectoryIfNotExist(Path.Combine(_configuration["Storage:TempFolderPath"], "Markdown"));
157+
var markdownFolder = CreateDirectoryIfNotExist(_markdownFolder);
145158

146-
var markdownFile = Path.Combine(markdownFolder, fileEntry.Id + ".md");
159+
var markdownFile = Path.Combine(markdownFolder, $"{fileEntry.Id}.md");
147160

148-
if (!File.Exists(markdownFile))
149-
{
150-
var bytes = await GetBytesAsync(fileStorageManager, fileEntry, cancellationToken);
151-
var markdown = await markdownService.ConvertToMarkdownAsync(bytes, fileEntry.FileName, cancellationToken);
152-
await File.WriteAllTextAsync(markdownFile, markdown, cancellationToken);
153-
}
161+
var bytes = await GetBytesAsync(fileStorageManager, fileEntry, cancellationToken);
162+
var markdown = await markdownService.ConvertToMarkdownAsync(bytes, fileEntry.FileName, cancellationToken);
163+
await File.WriteAllTextAsync(markdownFile, markdown, cancellationToken);
154164

155165
fileEntryText = new FileEntryText
156166
{
167+
TextLocation = Path.Combine("Markdown", $"{fileEntry.Id}.md"),
157168
FileEntryId = fileEntry.Id,
158169
};
159170

@@ -165,15 +176,13 @@ private async Task ProcessFileAsync(FileEntry fileEntry, CancellationToken cance
165176

166177
if (!hasFileEntryEmbeddings)
167178
{
168-
var markdownFolder = CreateDirectoryIfNotExist(Path.Combine(_configuration["Storage:TempFolderPath"], "Markdown"));
169-
170-
var markdownFile = Path.Combine(markdownFolder, fileEntry.Id + ".md");
179+
var markdownFile = Path.Combine(_markdownFolder, $"{fileEntry.Id}.md");
171180

172181
var chunks = TextChunkingService.ChunkSentences(await File.ReadAllTextAsync(markdownFile, cancellationToken));
173182

174-
var chunksFolder = CreateDirectoryIfNotExist(Path.Combine(_configuration["Storage:TempFolderPath"], "Chunks", fileEntry.Id.ToString()));
183+
var chunksFolder = CreateDirectoryIfNotExist(Path.Combine(_chunkFolder, fileEntry.Id.ToString()));
175184

176-
var embeddingsFolder = CreateDirectoryIfNotExist(Path.Combine(_configuration["Storage:TempFolderPath"], "Embeddings", fileEntry.Id.ToString()));
185+
var embeddingsFolder = CreateDirectoryIfNotExist(Path.Combine(_embeddingFolder, fileEntry.Id.ToString()));
177186

178187
foreach (var chunk in chunks)
179188
{
@@ -185,6 +194,7 @@ private async Task ProcessFileAsync(FileEntry fileEntry, CancellationToken cance
185194
var fileEntryEmbedding = new FileEntryEmbedding
186195
{
187196
ChunkName = $"{chunk.StartIndex}_{chunk.EndIndex}.txt",
197+
ChunkLocation = Path.Combine("Chunks", fileEntry.Id.ToString(), $"{chunk.StartIndex}_{chunk.EndIndex}.txt"),
188198
FileEntryId = fileEntry.Id,
189199
Embedding = JsonSerializer.Serialize(embedding.EmbeddingVector),
190200
TokenDetails = JsonSerializer.Serialize(embedding.UsageDetails)
@@ -201,9 +211,9 @@ private async Task ProcessFileAsync(FileEntry fileEntry, CancellationToken cance
201211
{
202212
_logger.LogInformation("Processing image file for FileEntry Id: {FileEntryId}", fileEntry?.Id);
203213

204-
var imageAnalysisFolder = CreateDirectoryIfNotExist(Path.Combine(_configuration["Storage:TempFolderPath"], "ImageAnalysis"));
214+
var imageAnalysisFolder = CreateDirectoryIfNotExist(_imageAnalysisFolder);
205215

206-
var embeddingsFolder = CreateDirectoryIfNotExist(Path.Combine(_configuration["Storage:TempFolderPath"], "Embeddings", fileEntry.Id.ToString()));
216+
var embeddingsFolder = CreateDirectoryIfNotExist(Path.Combine(_embeddingFolder, fileEntry.Id.ToString()));
207217

208218
var imageAnalysisFile = Path.Combine(imageAnalysisFolder, $"{fileEntry.Id}.json");
209219
var embeddingFile = Path.Combine(embeddingsFolder, $"{fileEntry.Id}.json");
@@ -222,6 +232,7 @@ private async Task ProcessFileAsync(FileEntry fileEntry, CancellationToken cance
222232

223233
fileEntryText = new FileEntryText
224234
{
235+
TextLocation = Path.Combine("ImageAnalysis", $"{fileEntry.Id}.json"),
225236
FileEntryId = fileEntry.Id,
226237
};
227238

@@ -240,6 +251,7 @@ private async Task ProcessFileAsync(FileEntry fileEntry, CancellationToken cance
240251
var fileEntryEmbedding = new FileEntryEmbedding
241252
{
242253
ChunkName = $"{fileEntry.Id}.json",
254+
ChunkLocation = Path.Combine("ImageAnalysis", $"{fileEntry.Id}.json"),
243255
FileEntryId = fileEntry.Id,
244256
Embedding = JsonSerializer.Serialize(embedding.EmbeddingVector),
245257
TokenDetails = JsonSerializer.Serialize(embedding.UsageDetails)

src/Monolith/ClassifiedAds.Domain/Entities/FileEntryEmbedding.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ public class FileEntryEmbedding : Entity<Guid>, IAggregateRoot
66
{
77
public string ChunkName { get; set; }
88

9+
public string ChunkLocation { get; set; }
10+
911
public string Embedding { get; set; }
1012

1113
public string TokenDetails { get; set; }

src/Monolith/ClassifiedAds.Domain/Entities/FileEntryText.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ namespace ClassifiedAds.Domain.Entities;
44

55
public class FileEntryText : Entity<Guid>, IAggregateRoot
66
{
7+
public string TextLocation { get; set; }
8+
79
public Guid FileEntryId { get; set; }
810

911
public FileEntry FileEntry { get; set; }

0 commit comments

Comments
 (0)