diff --git a/Directory.Packages.props b/Directory.Packages.props
index dea329a59..efda8b785 100644
--- a/Directory.Packages.props
+++ b/Directory.Packages.props
@@ -33,6 +33,9 @@
+
+
+
diff --git a/examples/002-dotnet-Serverless/002-dotnet-Serverless.csproj b/examples/002-dotnet-Serverless/002-dotnet-Serverless.csproj
index 568eb0d5f..ffbf2ad66 100644
--- a/examples/002-dotnet-Serverless/002-dotnet-Serverless.csproj
+++ b/examples/002-dotnet-Serverless/002-dotnet-Serverless.csproj
@@ -56,6 +56,10 @@
Always
+
+
+ Always
+
diff --git a/examples/002-dotnet-Serverless/Program.cs b/examples/002-dotnet-Serverless/Program.cs
index c38c980af..e37a95723 100644
--- a/examples/002-dotnet-Serverless/Program.cs
+++ b/examples/002-dotnet-Serverless/Program.cs
@@ -302,6 +302,27 @@ private static async Task StoreJson()
s_toDelete.Add("json01");
}
+    // Extract memory from a legacy Word (.doc) file.
+    // Imports "file10-sample-legacy-word.doc" as document "doc01" unless that document is already ready.
+    private static async Task StoreLegacyWordDoc()
+    {
+        if (!await s_memory.IsDocumentReadyAsync(documentId: "doc01"))
+        {
+            Console.WriteLine("Uploading Legacy Word Doc file");
+            var docId = await s_memory.ImportDocumentAsync(
+                new Document("doc01")
+                    .AddFiles(["file10-sample-legacy-word.doc"]));
+            Console.WriteLine($"- Document Id: {docId}");
+        }
+        else
+        {
+            Console.WriteLine("doc01 already uploaded.");
+        }
+
+        // Register the document for cleanup exactly once, whether or not the upload branch ran.
+        s_toDelete.Add("doc01");
+    }
+
// =======================
// === RETRIEVAL =========
// =======================
@@ -614,6 +635,27 @@ Azure Active Directory (AAD) authentication mechanism. To test this locally, you
*/
}
+    // Question about the legacy Word .doc file; the search is filtered to document "doc01".
+    private static async Task AskQuestionsAboutLegacyWordDocFile()
+    {
+        var question = "Can we use Hybrid search in Azure AI Search for implementing the RAG pattern?";
+        Console.WriteLine($"Question: {question}");
+        Console.WriteLine("Expected result: Yes, you can use hybrid search in Azure AI Search for implementing the Retrieval Augmented Generation (RAG) pattern");
+
+        var answer = await s_memory.AskAsync(question, filter: MemoryFilters.ByDocument("doc01"));
+        Console.WriteLine($"\nAnswer: {answer.Result}");
+        Console.WriteLine("\n====================================\n");
+
+        /* OUTPUT
+        Question: Can we use Hybrid search in Azure AI Search for implementing the RAG pattern?
+        Expected result: Yes, you can use hybrid search in Azure AI Search for implementing the Retrieval Augmented Generation (RAG) pattern
+
+        Answer: Yes, you can use hybrid search in Azure AI Search for implementing the Retrieval Augmented Generation (RAG) pattern.
+        Azure AI Search supports both keyword (term-based) and vector queries, allowing you to create a search index that contains vector fields, non-vector content, or both.
+        This capability enables hybrid search, where the system can leverage traditional keyword search alongside vector similarity search to retrieve relevant information.
+        */
+    }
+
// Download file and print details
private static async Task DownloadFile()
{
diff --git a/examples/002-dotnet-Serverless/file10-sample-legacy-word.doc b/examples/002-dotnet-Serverless/file10-sample-legacy-word.doc
new file mode 100644
index 000000000..47d7b1c93
Binary files /dev/null and b/examples/002-dotnet-Serverless/file10-sample-legacy-word.doc differ
diff --git a/service/Core/Core.csproj b/service/Core/Core.csproj
index 8f5f0d6ed..7054d6f39 100644
--- a/service/Core/Core.csproj
+++ b/service/Core/Core.csproj
@@ -20,6 +20,9 @@
+
+
+
diff --git a/service/Core/DataFormats/DependencyInjection.cs b/service/Core/DataFormats/DependencyInjection.cs
index 97f87564b..30a3756d5 100644
--- a/service/Core/DataFormats/DependencyInjection.cs
+++ b/service/Core/DataFormats/DependencyInjection.cs
@@ -89,6 +89,7 @@ public static IServiceCollection AddDefaultContentDecoders(
services.AddSingleton();
services.AddSingleton();
services.AddSingleton();
+ services.AddSingleton();
return services;
}
diff --git a/service/Core/DataFormats/Office/MsWordLegacyDecoder.cs b/service/Core/DataFormats/Office/MsWordLegacyDecoder.cs
new file mode 100644
index 000000000..bf73074ee
--- /dev/null
+++ b/service/Core/DataFormats/Office/MsWordLegacyDecoder.cs
@@ -0,0 +1,103 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System;
+using System.Diagnostics.CodeAnalysis;
+using System.IO;
+using System.Text;
+using System.Threading;
+using System.Threading.Tasks;
+using Microsoft.Extensions.Logging;
+using Microsoft.KernelMemory.Diagnostics;
+using Microsoft.KernelMemory.Pipeline;
+using Microsoft.KernelMemory.Text;
+using NPOI.HWPF;
+using NPOI.HWPF.Extractor;
+
+namespace Microsoft.KernelMemory.DataFormats.Office;
+
+/// <summary>Extracts plain text from legacy MS Word (.doc) files using NPOI's HWPF reader.</summary>
+[Experimental("KMEXP00")]
+public sealed class MsWordLegacyDecoder : IContentDecoder
+{
+    // Flush threshold in characters: .doc has no reliable page breaks, so numbered chunks stand in for pages.
+    private const int MaxChunkLength = 2000;
+
+    private readonly ILogger<MsWordLegacyDecoder> _log;
+
+    static MsWordLegacyDecoder()
+    {
+        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); // Ensure legacy encodings are available: https://nicolaiarocci.com/how-to-read-windows-1252-encoded-files-with-.netcore-and-.net5-/
+    }
+
+    public MsWordLegacyDecoder(ILoggerFactory? loggerFactory = null)
+    {
+        this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger<MsWordLegacyDecoder>();
+    }
+
+    /// <inheritdoc />
+    public bool SupportsMimeType(string mimeType)
+    {
+        return mimeType != null && mimeType.StartsWith(MimeTypes.MsWord, StringComparison.OrdinalIgnoreCase);
+    }
+
+    /// <inheritdoc />
+    public Task<FileContent> DecodeAsync(string filename, CancellationToken cancellationToken = default)
+    {
+        // Disposing on return is safe: the Stream overload reads everything before it returns its task.
+        using var stream = File.OpenRead(filename);
+        return this.DecodeAsync(stream, cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public Task<FileContent> DecodeAsync(BinaryData data, CancellationToken cancellationToken = default)
+    {
+        using var stream = data.ToStream();
+        return this.DecodeAsync(stream, cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellationToken = default)
+    {
+        this._log.LogDebug("Extracting text from MS Word legacy (.doc) file");
+        var result = new FileContent(MimeTypes.PlainText);
+        try
+        {
+            string[] paragraphs = new WordExtractor(new HWPFDocument(data)).ParagraphText;
+            var sb = new StringBuilder();
+            int pageNumber = 1;
+
+            foreach (string paragraph in paragraphs)
+            {
+                if (!string.IsNullOrWhiteSpace(paragraph))
+                {
+                    sb.AppendLineNix(paragraph.Trim());
+                    if (sb.Length > MaxChunkLength)
+                    {
+                        AddChunk(result, sb, pageNumber++);
+                    }
+                }
+            }
+
+            // Add any remaining content
+            if (sb.Length > 0)
+            {
+                AddChunk(result, sb, pageNumber);
+            }
+
+            return Task.FromResult(result);
+        }
+        catch (Exception ex)
+        {
+            this._log.LogError(ex, "Error extracting text from MS Word legacy file");
+            throw;
+        }
+    }
+
+    // Moves the buffered text into a new numbered section of the result and empties the buffer.
+    private static void AddChunk(FileContent result, StringBuilder sb, int number)
+    {
+        string content = sb.ToString().NormalizeNewlines(false);
+        result.Sections.Add(new Chunk(content, number, Chunk.Meta(sentencesAreComplete: true)));
+        sb.Clear();
+    }
+}
diff --git a/service/tests/Core.FunctionalTests/Core.FunctionalTests.csproj b/service/tests/Core.FunctionalTests/Core.FunctionalTests.csproj
index e13bda732..32112da57 100644
--- a/service/tests/Core.FunctionalTests/Core.FunctionalTests.csproj
+++ b/service/tests/Core.FunctionalTests/Core.FunctionalTests.csproj
@@ -48,6 +48,10 @@
Always
+
+
+ Always
+
diff --git a/service/tests/Core.FunctionalTests/DataFormats/Office/MsWordLegacyDecoderTest.cs b/service/tests/Core.FunctionalTests/DataFormats/Office/MsWordLegacyDecoderTest.cs
new file mode 100644
index 000000000..0aa64b102
--- /dev/null
+++ b/service/tests/Core.FunctionalTests/DataFormats/Office/MsWordLegacyDecoderTest.cs
@@ -0,0 +1,51 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Microsoft.Extensions.Configuration;
+using Microsoft.KernelMemory.DataFormats;
+using Microsoft.KernelMemory.DataFormats.Office;
+using Microsoft.KM.TestHelpers;
+
+namespace Microsoft.KM.Core.FunctionalTests.DataFormats.Office;
+
+public class MsWordLegacyDecoderTest : BaseFunctionalTestCase
+{
+    public MsWordLegacyDecoderTest(IConfiguration cfg, ITestOutputHelper output) : base(cfg, output)
+    {
+    }
+
+    // Decoding the bundled sample .doc must yield at least one section containing a known sentence.
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    [Trait("Category", "DataFormats")]
+    [Trait("Category", "WordLegacy")]
+    public async Task ItExtractsTextFromDocFile()
+    {
+        // Arrange
+        var sut = new MsWordLegacyDecoder();
+
+        // Act
+        FileContent decoded = await sut.DecodeAsync("file4-sample-legacy-word.doc");
+        string text = string.Concat(decoded.Sections.Select(section => section.Content + "\n"));
+        Console.WriteLine(text);
+
+        // Assert
+        Assert.NotEmpty(text);
+        Assert.NotEmpty(decoded.Sections);
+        Assert.Contains("Retrieval Augmented Generation (RAG) is an architecture", text);
+    }
+
+    // The decoder accepts "application/msword" and rejects the .docx and PDF MIME types.
+    [Fact]
+    [Trait("Category", "UnitTest")]
+    [Trait("Category", "DataFormats")]
+    [Trait("Category", "WordLegacy")]
+    public void ItSupportsMsWordMimeType()
+    {
+        // Arrange
+        var sut = new MsWordLegacyDecoder();
+        // Act & Assert
+        Assert.True(sut.SupportsMimeType("application/msword"));
+        Assert.False(sut.SupportsMimeType("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+        Assert.False(sut.SupportsMimeType("application/pdf"));
+    }
+}
diff --git a/service/tests/Core.FunctionalTests/file4-sample-legacy-word.doc b/service/tests/Core.FunctionalTests/file4-sample-legacy-word.doc
new file mode 100644
index 000000000..47d7b1c93
Binary files /dev/null and b/service/tests/Core.FunctionalTests/file4-sample-legacy-word.doc differ