// Extract memory from a legacy Word (.doc) file.
// Uploads "file10-sample-legacy-word.doc" as document "doc01" unless the memory
// already reports that document as ready, then records the document ID in s_toDelete.
private static async Task StoreLegacyWordDoc()
{
    // Single source of truth for the document ID used by the check, the upload and the cleanup list.
    const string DocumentId = "doc01";

    if (await s_memory.IsDocumentReadyAsync(documentId: DocumentId))
    {
        Console.WriteLine("doc01 already uploaded.");
    }
    else
    {
        Console.WriteLine("Uploading Legacy Word Doc file");
        var docId = await s_memory.ImportDocumentAsync(
            new Document(DocumentId)
                .AddFiles(["file10-sample-legacy-word.doc"]));

        Console.WriteLine($"- Document Id: {docId}");
    }

    // Record the ID exactly once, whichever branch ran, so the same document is not
    // queued twice (s_toDelete is presumably consumed by the sample's cleanup step).
    s_toDelete.Add(DocumentId);
}
// Ask a question whose answer is contained in the legacy Word (.doc) sample,
// restricting the search to document "doc01".
private static async Task AskQuestionsAboutLegacyWordDocFile()
{
    var question = "Can we use Hybrid search in Azure AI Search for implementing the RAG pattern?";
    Console.WriteLine($"Question: {question}");

    // Label the reference answer so it cannot be mistaken for the model's reply printed below.
    Console.WriteLine("Expected result: Yes, you can use hybrid search in Azure AI Search for implementing the Retrieval Augmented Generation (RAG) pattern");

    var answer = await s_memory.AskAsync(question, filter: MemoryFilters.ByDocument("doc01"));
    Console.WriteLine($"\nAnswer: {answer.Result}");

    Console.WriteLine("\n====================================\n");

    /* OUTPUT

    Question: Can we use Hybrid search in Azure AI Search for implementing the RAG pattern?
    Expected result: Yes, you can use hybrid search in Azure AI Search for implementing the Retrieval Augmented Generation (RAG) pattern

    Answer: Yes, you can use hybrid search in Azure AI Search for implementing the Retrieval Augmented Generation (RAG) pattern.
    Azure AI Search supports both keyword (term-based) and vector queries, allowing you to create a search index that contains vector fields, non-vector content, or both.
    This capability enables hybrid search, where the system can leverage traditional keyword search alongside vector similarity search to retrieve relevant information.

    */
}
using System;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.KernelMemory.Diagnostics;
using Microsoft.KernelMemory.Pipeline;
using Microsoft.KernelMemory.Text;
using NPOI.HWPF;
using NPOI.HWPF.Extractor;

namespace Microsoft.KernelMemory.DataFormats.Office;

/// <summary>
/// Content decoder for legacy MS Word (.doc) files. Paragraph text is read with
/// NPOI's HWPF <see cref="WordExtractor"/> and grouped into plain-text sections.
/// </summary>
/// <remarks>
/// The section numbers are synthetic: the decoder does not detect real page breaks,
/// it simply starts a new section once the accumulated text passes a size threshold.
/// </remarks>
[Experimental("KMEXP00")]
public sealed class MsWordLegacyDecoder : IContentDecoder
{
    /// <summary>
    /// Soft limit, in characters, for the text accumulated into one section.
    /// The limit is checked after a paragraph has been appended, so paragraphs are
    /// never split and a section may exceed the limit by up to one paragraph.
    /// </summary>
    private const int MaxSectionLength = 2000;

    private readonly ILogger<MsWordLegacyDecoder> _log;

    static MsWordLegacyDecoder()
    {
        // Ensure legacy encodings are available: https://nicolaiarocci.com/how-to-read-windows-1252-encoded-files-with-.netcore-and-.net5-/
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    }

    /// <summary>
    /// Creates a new decoder.
    /// </summary>
    /// <param name="loggerFactory">Optional logger factory; <see cref="DefaultLogger.Factory"/> is used when null.</param>
    public MsWordLegacyDecoder(ILoggerFactory? loggerFactory = null)
    {
        this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger<MsWordLegacyDecoder>();
    }

    /// <inheritdoc />
    public bool SupportsMimeType(string mimeType)
    {
        return mimeType != null && mimeType.StartsWith(MimeTypes.MsWord, StringComparison.OrdinalIgnoreCase);
    }

    /// <inheritdoc />
    public async Task<FileContent> DecodeAsync(string filename, CancellationToken cancellationToken = default)
    {
        // Await inside the using scope so the stream is guaranteed to stay open until
        // decoding has finished, even if the stream overload becomes truly asynchronous.
        using var stream = File.OpenRead(filename);
        return await this.DecodeAsync(stream, cancellationToken).ConfigureAwait(false);
    }

    /// <inheritdoc />
    public async Task<FileContent> DecodeAsync(BinaryData data, CancellationToken cancellationToken = default)
    {
        // Same reasoning as the filename overload: do not dispose before the task completes.
        using var stream = data.ToStream();
        return await this.DecodeAsync(stream, cancellationToken).ConfigureAwait(false);
    }

    /// <inheritdoc />
    public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellationToken = default)
    {
        this._log.LogDebug("Extracting text from MS Word legacy (.doc) file");

        var result = new FileContent(MimeTypes.PlainText);

        try
        {
            cancellationToken.ThrowIfCancellationRequested();

            var document = new HWPFDocument(data);
            var extractor = new WordExtractor(document);

            int pageNumber = 1;
            var sb = new StringBuilder();

            foreach (string paragraph in extractor.ParagraphText)
            {
                // Extraction is synchronous, so this is the only place cancellation can be observed.
                cancellationToken.ThrowIfCancellationRequested();

                if (string.IsNullOrWhiteSpace(paragraph))
                {
                    continue;
                }

                sb.AppendLineNix(paragraph.Trim());

                // For legacy .doc files, we treat the size threshold as a potential page break.
                // This is a simplified approach since no page break detection is performed here.
                if (sb.Length > MaxSectionLength)
                {
                    result.Sections.Add(CreateSection(sb, pageNumber));
                    sb.Clear();
                    pageNumber++;
                }
            }

            // Add any remaining content
            if (sb.Length > 0)
            {
                result.Sections.Add(CreateSection(sb, pageNumber));
            }

            return Task.FromResult(result);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            // Cancellation is not an extraction failure, so it is excluded from error logging.
            this._log.LogError(ex, "Error extracting text from MS Word legacy file");
            throw;
        }
    }

    // Builds a section from the buffered text; sections always end on a paragraph boundary.
    private static Chunk CreateSection(StringBuilder buffer, int number)
    {
        string content = buffer.ToString().NormalizeNewlines(false);
        return new Chunk(content, number, Chunk.Meta(sentencesAreComplete: true));
    }
}
using Microsoft.Extensions.Configuration;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KernelMemory.DataFormats.Office;
using Microsoft.KM.TestHelpers;

namespace Microsoft.KM.Core.FunctionalTests.DataFormats.Office;

/// <summary>
/// Tests for <see cref="MsWordLegacyDecoder"/>: text extraction from a sample .doc file
/// and MIME type matching.
/// </summary>
public class MsWordLegacyDecoderTest : BaseFunctionalTestCase
{
    public MsWordLegacyDecoderTest(IConfiguration cfg, ITestOutputHelper output) : base(cfg, output)
    {
    }

    /// <summary>
    /// Decodes the sample legacy Word file and checks that a known sentence is present.
    /// </summary>
    [Fact]
    [Trait("Category", "UnitTest")]
    [Trait("Category", "DataFormats")]
    [Trait("Category", "WordLegacy")]
    public async Task ItExtractsTextFromDocFile()
    {
        // Arrange
        var target = new MsWordLegacyDecoder();
        const string SamplePath = "file4-sample-legacy-word.doc";

        // Act
        FileContent decoded = await target.DecodeAsync(SamplePath);

        // Every section's content followed by a newline, in document order
        string text = string.Concat(decoded.Sections.Select(section => section.Content + "\n"));
        Console.WriteLine(text);

        // Assert
        Assert.NotEmpty(text);
        Assert.NotEmpty(decoded.Sections);
        Assert.Contains("Retrieval Augmented Generation (RAG) is an architecture", text);
    }

    /// <summary>
    /// The decoder accepts the legacy Word MIME type and rejects .docx and PDF types.
    /// </summary>
    [Fact]
    [Trait("Category", "UnitTest")]
    [Trait("Category", "DataFormats")]
    [Trait("Category", "WordLegacy")]
    public void ItSupportsMsWordMimeType()
    {
        // Arrange
        var target = new MsWordLegacyDecoder();

        // Act
        bool acceptsLegacyWord = target.SupportsMimeType("application/msword");
        bool acceptsDocx = target.SupportsMimeType("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
        bool acceptsPdf = target.SupportsMimeType("application/pdf");

        // Assert
        Assert.True(acceptsLegacyWord);
        Assert.False(acceptsDocx);
        Assert.False(acceptsPdf);
    }
}