diff --git a/src/UmbracoExamine.PDF/IPdfTextExtractor.cs b/src/UmbracoExamine.PDF/IPdfTextExtractor.cs
index 0b1a7a2..7bd1221 100644
--- a/src/UmbracoExamine.PDF/IPdfTextExtractor.cs
+++ b/src/UmbracoExamine.PDF/IPdfTextExtractor.cs
@@ -5,5 +5,10 @@ namespace UmbracoExamine.PDF
     public interface IPdfTextExtractor
     {
         string GetTextFromPdf(Stream pdfFileStream);
+
+        /// <summary>
+        /// Returns the hyperlinks found in the PDF read from <paramref name="pdfFileStream"/> as text.
+        /// </summary>
+        string GetLinkFromPdf(Stream pdfFileStream);
     }
 }
diff --git a/src/UmbracoExamine.PDF/PdfIndexConstants.cs b/src/UmbracoExamine.PDF/PdfIndexConstants.cs
index 62b65b9..585c33a 100644
--- a/src/UmbracoExamine.PDF/PdfIndexConstants.cs
+++ b/src/UmbracoExamine.PDF/PdfIndexConstants.cs
@@ -4,6 +4,7 @@ public static class PdfIndexConstants
     {
         public const string PdfIndexName = "PDFIndex";
         public const string PdfContentFieldName = "fileTextContent";
+        public const string PdfLinksFieldName = "fileLinks";
         public const string UmbracoMediaExtensionPropertyAlias = "umbracoExtension";
         public const string PdfFileExtension = "pdf";
         public const string PdfCategory = "pdf";
diff --git a/src/UmbracoExamine.PDF/PdfIndexValueSetBuilder.cs b/src/UmbracoExamine.PDF/PdfIndexValueSetBuilder.cs
index 37fdd85..6779f59 100644
--- a/src/UmbracoExamine.PDF/PdfIndexValueSetBuilder.cs
+++ b/src/UmbracoExamine.PDF/PdfIndexValueSetBuilder.cs
@@ -31,9 +31,11 @@ public IEnumerable GetValueSets(params IMedia[] content)
                 if (string.IsNullOrWhiteSpace(umbracoFile)) continue;
 
                 string fileTextContent;
+                string fileLinks;
                 try
                 {
                     fileTextContent = ExtractTextFromFile(umbracoFile);
+                    fileLinks = ExtractLinkFromFile(umbracoFile);
                 }
                 catch (Exception ex)
                 {
@@ -45,7 +47,8 @@ public IEnumerable GetValueSets(params IMedia[] content)
                     ["nodeName"] = item.Name,
                     ["id"] = item.Id,
                     ["path"] = item.Path,
-                    [PdfIndexConstants.PdfContentFieldName] = fileTextContent
+                    [PdfIndexConstants.PdfContentFieldName] = fileTextContent,
+                    [PdfIndexConstants.PdfLinksFieldName] = fileLinks
                 };
 
                 var valueSet = new ValueSet(item.Id.ToString(), PdfIndexConstants.PdfCategory, item.ContentType.Alias, indexValues);
@@ -66,5 +69,22 @@ private string ExtractTextFromFile(string filePath)
                 return string.Empty;
             }
         }
+
+        /// <summary>
+        /// Asks the PDF text service for the links of the PDF at <paramref name="filePath"/>.
+        /// </summary>
+        /// <returns>The service result (null when it could not open the file), or an empty string when extraction throws.</returns>
+        private string ExtractLinkFromFile(string filePath)
+        {
+            try
+            {
+                return _pdfTextService.ExtractLink(filePath);
+            }
+            catch (Exception ex)
+            {
+                _logger.LogError(ex, "Could not extract links from PDF {PdfFilePath}", filePath);
+                return string.Empty;
+            }
+        }
     }
 }
diff --git a/src/UmbracoExamine.PDF/PdfPigTextExtractor.cs b/src/UmbracoExamine.PDF/PdfPigTextExtractor.cs
index ace1eb9..def9809 100644
--- a/src/UmbracoExamine.PDF/PdfPigTextExtractor.cs
+++ b/src/UmbracoExamine.PDF/PdfPigTextExtractor.cs
@@ -25,5 +25,27 @@ public string GetTextFromPdf(Stream pdfFileStream)
                 return result.ToString();
             }
         }
+
+        /// <summary>
+        /// Collects the hyperlinks of every page of the PDF read from <paramref name="pdfFileStream"/>.
+        /// </summary>
+        /// <returns>One line per page, holding that page's hyperlinks separated by single spaces.</returns>
+        public string GetLinkFromPdf(Stream pdfFileStream)
+        {
+            using (PdfDocument document = PdfDocument.Open(pdfFileStream))
+            {
+                var result = new StringBuilder();
+
+                foreach (Page page in document.GetPages())
+                {
+                    // 'var' keeps the element type of the hyperlink collection, so string.Join
+                    // formats each hyperlink individually rather than the collection object itself.
+                    var links = page.GetHyperlinks();
+                    result.Append(string.Join(" ", links));
+                    result.AppendLine();
+                }
+                return result.ToString();
+            }
+        }
     }
 }
diff --git a/src/UmbracoExamine.PDF/PdfTextService.cs b/src/UmbracoExamine.PDF/PdfTextService.cs
index fe3c685..e27a778 100644
--- a/src/UmbracoExamine.PDF/PdfTextService.cs
+++ b/src/UmbracoExamine.PDF/PdfTextService.cs
@@ -47,6 +47,27 @@ public string ExtractText(string filePath)
             }
         }
 
+        /// <summary>
+        /// Extract links from a PDF file at the given path
+        /// </summary>
+        /// <param name="filePath">Path of the PDF, opened through the media file system.</param>
+        /// <returns>The extracted links after filtering through <c>ExceptChars</c>; <c>null</c> when the file could not be opened.</returns>
+        public string ExtractLink(string filePath)
+        {
+            using (var fs = _mediaFileSystem.FileSystem.OpenFile(filePath))
+            {
+                if (fs != null)
+                {
+                    return ExceptChars(_pdfTextExtractor.GetLinkFromPdf(fs), UnsupportedRange.Value, ReplaceWithSpace);
+                }
+                else
+                {
+                    _logger.LogError(new Exception($"Unable to open PDF file {filePath}"), "Unable to Open PDF file");
+                    return null;
+                }
+            }
+        }
+
         /// <summary>
         /// Stores the unsupported range of character
         /// </summary>