Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Directory.Packages.props
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@
<PackageVersion Include="Newtonsoft.Json" Version="13.0.3" />
<PackageVersion Include="OllamaSharp" Version="5.1.14" />
<PackageVersion Include="PdfPig" Version="0.1.10" />
<PackageVersion Include="NPOI" Version="2.5.6" />
<PackageVersion Include="ScratchPad.NPOI.HWPF" Version="2.5.7" />
<PackageVersion Include="System.Text.Encoding.CodePages" Version="9.0.0" />
<PackageVersion Include="Polly.Core" Version="8.5.2" />
<PackageVersion Include="RabbitMQ.Client" Version="7.1.2" />
<PackageVersion Include="ReadLine" Version="2.0.1" />
Expand Down
4 changes: 4 additions & 0 deletions examples/002-dotnet-Serverless/002-dotnet-Serverless.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@
<Content Include="file8-data.xlsx">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<None Remove="file10-sample-legacy-word.doc" />
<Content Include="file10-sample-legacy-word.doc">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
</ItemGroup>

</Project>
42 changes: 42 additions & 0 deletions examples/002-dotnet-Serverless/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,27 @@ private static async Task StoreJson()
s_toDelete.Add("json01");
}

// Import the legacy Word (.doc) sample file as document "doc01", unless it is already ready
private static async Task StoreLegacyWordDoc()
{
    var alreadyImported = await s_memory.IsDocumentReadyAsync(documentId: "doc01");
    if (alreadyImported)
    {
        Console.WriteLine("doc01 already uploaded.");
    }
    else
    {
        Console.WriteLine("Uploading Legacy Word Doc file");
        var legacyDoc = new Document("doc01").AddFiles(["file10-sample-legacy-word.doc"]);
        var importedId = await s_memory.ImportDocumentAsync(legacyDoc);

        s_toDelete.Add(importedId);
        Console.WriteLine($"- Document Id: {importedId}");
    }

    // Added to s_toDelete on both paths, whether or not the upload ran in this execution
    s_toDelete.Add("doc01");
}

// =======================
// === RETRIEVAL =========
// =======================
Expand Down Expand Up @@ -614,6 +635,27 @@ Azure Active Directory (AAD) authentication mechanism. To test this locally, you
*/
}

// Ask a question about the legacy Word (.doc) sample, restricting the search to document "doc01",
// and print the question, the expected result and the generated answer.
private static async Task AskQuestionsAboutLegacyWordDocFile()
{
    var question = "Can we use Hybrid search in Azure AI Search for implementing the RAG pattern?";
    Console.WriteLine($"Question: {question}");

    // Label the reference answer explicitly so it cannot be mistaken for the generated answer printed below
    Console.WriteLine("Expected result: Yes, you can use hybrid search in Azure AI Search for implementing the Retrieval Augmented Generation (RAG) pattern");

    var answer = await s_memory.AskAsync(question, filter: MemoryFilters.ByDocument("doc01"));
    Console.WriteLine($"\nAnswer: {answer.Result}");

    Console.WriteLine("\n====================================\n");

    /* OUTPUT

    Question: Can we use Hybrid search in Azure AI Search for implementing the RAG pattern?
    Expected result: Yes, you can use hybrid search in Azure AI Search for implementing the Retrieval Augmented Generation (RAG) pattern

    Answer: Yes, you can use hybrid search in Azure AI Search for implementing the Retrieval Augmented Generation (RAG) pattern.
    Azure AI Search supports both keyword (term-based) and vector queries, allowing you to create a search index that contains vector fields, non-vector content, or both.
    This capability enables hybrid search, where the system can leverage traditional keyword search alongside vector similarity search to retrieve relevant information.
    */
}

// Download file and print details
private static async Task DownloadFile()
{
Expand Down
Binary file not shown.
3 changes: 3 additions & 0 deletions service/Core/Core.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
<PackageReference Include="HtmlAgilityPack" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Http" />
<PackageReference Include="NPOI" />
<PackageReference Include="ScratchPad.NPOI.HWPF" />
<PackageReference Include="System.Text.Encoding.CodePages" />
<PackageReference Include="PdfPig" />
<PackageReference Include="Polly.Core" />
<PackageReference Include="System.Linq.Async" />
Expand Down
1 change: 1 addition & 0 deletions service/Core/DataFormats/DependencyInjection.cs
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ public static IServiceCollection AddDefaultContentDecoders(
services.AddSingleton<IContentDecoder, MsExcelDecoder>();
services.AddSingleton<IContentDecoder, MsPowerPointDecoder>();
services.AddSingleton<IContentDecoder, MsWordDecoder>();
services.AddSingleton<IContentDecoder, MsWordLegacyDecoder>();

return services;
}
Expand Down
103 changes: 103 additions & 0 deletions service/Core/DataFormats/Office/MsWordLegacyDecoder.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.KernelMemory.Diagnostics;
using Microsoft.KernelMemory.Pipeline;
using Microsoft.KernelMemory.Text;
using NPOI.HWPF;
using NPOI.HWPF.Extractor;

namespace Microsoft.KernelMemory.DataFormats.Office;

/// <summary>
/// Content decoder for legacy MS Word (.doc) files. Paragraph text is read through
/// NPOI's HWPF <see cref="WordExtractor"/> and returned as plain-text sections.
/// </summary>
[Experimental("KMEXP00")]
public sealed class MsWordLegacyDecoder : IContentDecoder
{
    /// <summary>
    /// Number of accumulated characters after which the buffered text is emitted as a section.
    /// The value is an arbitrary chunk size, not a property of the .doc format.
    /// </summary>
    private const int SectionSizeThreshold = 2000;

    private readonly ILogger<MsWordLegacyDecoder> _log;

    static MsWordLegacyDecoder()
    {
        // Ensure legacy encodings are available: https://nicolaiarocci.com/how-to-read-windows-1252-encoded-files-with-.netcore-and-.net5-/
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
    }

    /// <summary>
    /// Create a new decoder instance.
    /// </summary>
    /// <param name="loggerFactory">Optional logger factory; <see cref="DefaultLogger.Factory"/> is used when null.</param>
    public MsWordLegacyDecoder(ILoggerFactory? loggerFactory = null)
    {
        this._log = (loggerFactory ?? DefaultLogger.Factory).CreateLogger<MsWordLegacyDecoder>();
    }

    /// <inheritdoc />
    public bool SupportsMimeType(string mimeType)
    {
        return mimeType != null && mimeType.StartsWith(MimeTypes.MsWord, StringComparison.OrdinalIgnoreCase);
    }

    /// <inheritdoc />
    public async Task<FileContent> DecodeAsync(string filename, CancellationToken cancellationToken = default)
    {
        // Await inside the using scope so the stream stays open until decoding has completed,
        // regardless of whether the stream overload finishes synchronously.
        using var stream = File.OpenRead(filename);
        return await this.DecodeAsync(stream, cancellationToken).ConfigureAwait(false);
    }

    /// <inheritdoc />
    public async Task<FileContent> DecodeAsync(BinaryData data, CancellationToken cancellationToken = default)
    {
        // Same reasoning as the filename overload: do not dispose the stream before the task completes.
        using var stream = data.ToStream();
        return await this.DecodeAsync(stream, cancellationToken).ConfigureAwait(false);
    }

    /// <inheritdoc />
    public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellationToken = default)
    {
        this._log.LogDebug("Extracting text from MS Word legacy (.doc) file");

        var result = new FileContent(MimeTypes.PlainText);

        try
        {
            cancellationToken.ThrowIfCancellationRequested();

            var document = new HWPFDocument(data);
            var extractor = new WordExtractor(document);

            string[] paragraphs = extractor.ParagraphText;

            // Sections are numbered sequentially starting from 1. The number is a chunk counter
            // produced by the length-based split below, not a real page number of the document.
            int pageNumber = 1;
            var sb = new StringBuilder();

            foreach (string paragraph in paragraphs)
            {
                cancellationToken.ThrowIfCancellationRequested();

                // Skip empty or whitespace-only paragraphs
                if (string.IsNullOrWhiteSpace(paragraph))
                {
                    continue;
                }

                sb.AppendLineNix(paragraph.Trim());

                // Once enough text has been buffered, emit it as a section and start a new one.
                // Splits always happen at paragraph boundaries, so a section can exceed the threshold.
                if (sb.Length > SectionSizeThreshold)
                {
                    string content = sb.ToString().NormalizeNewlines(false);
                    result.Sections.Add(new Chunk(content, pageNumber, Chunk.Meta(sentencesAreComplete: true)));
                    sb.Clear();
                    pageNumber++;
                }
            }

            // Add any remaining content
            if (sb.Length > 0)
            {
                string content = sb.ToString().NormalizeNewlines(false);
                result.Sections.Add(new Chunk(content, pageNumber, Chunk.Meta(sentencesAreComplete: true)));
            }

            return Task.FromResult(result);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            // Cancellation is not an extraction failure, so it is excluded from error logging
            this._log.LogError(ex, "Error extracting text from MS Word legacy file");
            throw;
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@
<Content Include="file3-data.xlsx">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<None Remove="file4-sample-legacy-word.doc" />
<Content Include="file4-sample-legacy-word.doc">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.Extensions.Configuration;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KernelMemory.DataFormats.Office;
using Microsoft.KM.TestHelpers;

namespace Microsoft.KM.Core.FunctionalTests.DataFormats.Office;

/// <summary>
/// Tests for <see cref="MsWordLegacyDecoder"/>: text extraction from a sample .doc file
/// and MIME type matching.
/// </summary>
public class MsWordLegacyDecoderTest : BaseFunctionalTestCase
{
    public MsWordLegacyDecoderTest(IConfiguration cfg, ITestOutputHelper output)
        : base(cfg, output)
    {
    }

    /// <summary>
    /// Decodes the sample legacy Word file and checks that a known sentence is present in the extracted text.
    /// </summary>
    [Fact]
    [Trait("Category", "UnitTest")]
    [Trait("Category", "DataFormats")]
    [Trait("Category", "WordLegacy")]
    public async Task ItExtractsTextFromDocFile()
    {
        // Arrange
        var target = new MsWordLegacyDecoder();

        // Act
        FileContent decoded = await target.DecodeAsync("file4-sample-legacy-word.doc");
        string text = string.Concat(decoded.Sections.Select(section => section.Content + "\n"));
        Console.WriteLine(text);

        // Assert
        Assert.NotEmpty(text);
        Assert.True(decoded.Sections.Count > 0);
        Assert.Contains("Retrieval Augmented Generation (RAG) is an architecture", text);
    }

    /// <summary>
    /// The decoder accepts the legacy Word MIME type and rejects the .docx and PDF MIME types.
    /// </summary>
    [Fact]
    [Trait("Category", "UnitTest")]
    [Trait("Category", "DataFormats")]
    [Trait("Category", "WordLegacy")]
    public void ItSupportsMsWordMimeType()
    {
        // Arrange
        var target = new MsWordLegacyDecoder();

        // Act & Assert
        Assert.True(target.SupportsMimeType("application/msword"));
        Assert.False(target.SupportsMimeType("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
        Assert.False(target.SupportsMimeType("application/pdf"));
    }
}
Binary file not shown.