-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathPdfDocumentLoader.cs
More file actions
48 lines (41 loc) · 1.76 KB
/
Copy pathPdfDocumentLoader.cs
File metadata and controls
48 lines (41 loc) · 1.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
using System;
using System.Collections.Generic;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using Mythosia.Documents;
using Mythosia.Documents.Elements;
namespace Mythosia.Documents.Pdf
{
    /// <summary>
    /// Loads a PDF file into a <see cref="DoclingDocument"/> by delegating to an
    /// <see cref="IDocumentParser"/> and then stamping the parsed document with
    /// its source path and descriptive metadata.
    /// </summary>
    public class PdfDocumentLoader : IDocumentLoader
    {
        private readonly IDocumentParser _parser;

        /// <summary>
        /// Creates a loader around <paramref name="parser"/>, or around a new
        /// <see cref="PdfPigParser"/> built from <paramref name="options"/> when no parser is supplied.
        /// </summary>
        /// <param name="parser">Parser to delegate to; when <c>null</c>, a <see cref="PdfPigParser"/> is created.</param>
        /// <param name="options">
        /// Options forwarded to the default <see cref="PdfPigParser"/>. Must be <c>null</c> when
        /// <paramref name="parser"/> is supplied.
        /// </param>
        /// <exception cref="ArgumentException">
        /// Both <paramref name="parser"/> and <paramref name="options"/> were supplied.
        /// </exception>
        public PdfDocumentLoader(
            IDocumentParser? parser = null,
            PdfParserOptions? options = null)
        {
            // Options only feed the default parser, so combining them with an
            // explicit parser is rejected rather than silently ignored.
            if (parser != null && options != null)
                throw new ArgumentException("Pass options to the parser instance instead of the loader.");

            _parser = parser ?? new PdfPigParser(options);
        }

        /// <summary>
        /// Parses the file at <paramref name="source"/> and returns it as a single-element list.
        /// </summary>
        /// <param name="source">Path of the file to load.</param>
        /// <param name="ct">Cancellation token forwarded to the parser.</param>
        /// <returns>
        /// A list containing exactly one <see cref="DoclingDocument"/> whose <c>Source</c> is set to
        /// <paramref name="source"/> and whose metadata carries the keys <c>type</c>, <c>filename</c>,
        /// <c>extension</c> and <c>parser</c>.
        /// </returns>
        /// <exception cref="ArgumentException"><paramref name="source"/> is null, empty or whitespace.</exception>
        /// <exception cref="FileNotFoundException">No file exists at <paramref name="source"/>.</exception>
        /// <exception cref="NotSupportedException">The configured parser reports that it cannot parse <paramref name="source"/>.</exception>
        public async Task<IReadOnlyList<DoclingDocument>> LoadAsync(string source, CancellationToken ct = default)
        {
            if (string.IsNullOrWhiteSpace(source))
                throw new ArgumentException("Source path is required.", nameof(source));

            if (!File.Exists(source))
                throw new FileNotFoundException($"Document file not found: {source}", source);

            if (!_parser.CanParse(source))
                throw new NotSupportedException($"Parser '{_parser.GetType().Name}' cannot parse '{source}'.");

            // Library code: nothing below needs the caller's synchronization context,
            // so do not capture it. This avoids deadlocks when a caller blocks on the task.
            var doclingDoc = await _parser.ParseAsync(source, ct).ConfigureAwait(false);

            doclingDoc.Source = source;
            doclingDoc.Metadata["type"] = "pdf";
            doclingDoc.Metadata["filename"] = Path.GetFileName(source);
            // Lower-cased so the stored extension does not depend on the file name's casing.
            doclingDoc.Metadata["extension"] = Path.GetExtension(source).ToLowerInvariant();
            doclingDoc.Metadata["parser"] = _parser.GetType().Name;

            return new[] { doclingDoc };
        }
    }
}