Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/UmbracoExamine.PDF/IPdfTextExtractor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ namespace UmbracoExamine.PDF
/// <summary>
/// Abstraction over a PDF parsing library: turns the content of a PDF stream into strings.
/// </summary>
public interface IPdfTextExtractor
{
/// <summary>
/// Returns a string produced from the PDF read from <paramref name="pdfFileStream"/>.
/// NOTE(review): presumably the document's text content, judging by the method name - confirm against implementations.
/// </summary>
/// <param name="pdfFileStream">Stream containing the PDF document.</param>
/// <returns>The extracted text as a single string.</returns>
string GetTextFromPdf(Stream pdfFileStream);
/// <summary>
/// Returns a string describing the links of the PDF read from <paramref name="pdfFileStream"/>.
/// NOTE(review): the exact format (separator, per-link representation) is decided by the implementation.
/// </summary>
/// <param name="pdfFileStream">Stream containing the PDF document.</param>
/// <returns>The extracted links as a single string.</returns>
string GetLinkFromPdf(Stream pdfFileStream);
}
}
1 change: 1 addition & 0 deletions src/UmbracoExamine.PDF/PdfIndexConstants.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ public static class PdfIndexConstants
{
public const string PdfIndexName = "PDFIndex";
public const string PdfContentFieldName = "fileTextContent";
public const string PdfLinksFieldName = "fileLinks";
public const string UmbracoMediaExtensionPropertyAlias = "umbracoExtension";
public const string PdfFileExtension = "pdf";
public const string PdfCategory = "pdf";
Expand Down
18 changes: 17 additions & 1 deletion src/UmbracoExamine.PDF/PdfIndexValueSetBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,11 @@ public IEnumerable<ValueSet> GetValueSets(params IMedia[] content)
if (string.IsNullOrWhiteSpace(umbracoFile)) continue;

string fileTextContent;
string fileLinks;
try
{
fileTextContent = ExtractTextFromFile(umbracoFile);
fileLinks = ExtractLinkFromFile(umbracoFile);
}
catch (Exception ex)
{
Expand All @@ -45,7 +47,8 @@ public IEnumerable<ValueSet> GetValueSets(params IMedia[] content)
["nodeName"] = item.Name,
["id"] = item.Id,
["path"] = item.Path,
[PdfIndexConstants.PdfContentFieldName] = fileTextContent
[PdfIndexConstants.PdfContentFieldName] = fileTextContent,
[PdfIndexConstants.PdfLinksFieldName] = fileLinks
};

var valueSet = new ValueSet(item.Id.ToString(), PdfIndexConstants.PdfCategory, item.ContentType.Alias, indexValues);
Expand All @@ -66,5 +69,18 @@ private string ExtractTextFromFile(string filePath)
return string.Empty;
}
}

/// <summary>
/// Extracts the links of the PDF at <paramref name="filePath"/> through the PDF text service.
/// </summary>
/// <param name="filePath">Path of the PDF file; passed unchanged to the text service.</param>
/// <returns>
/// Whatever the text service returns for the file, or <see cref="string.Empty"/> when extraction throws.
/// </returns>
private string ExtractLinkFromFile(string filePath)
{
    try
    {
        return _pdfTextService.ExtractLink(filePath);
    }
    catch (Exception ex)
    {
        // Best effort: a PDF whose links cannot be read is logged and yields an empty value
        // instead of propagating the exception to the caller.
        // The message names "links" so this failure can be told apart from a text-extraction failure.
        _logger.LogError(ex, "Could not extract links from PDF {PdfFilePath}", filePath);
        return string.Empty;
    }
}
}
}
18 changes: 18 additions & 0 deletions src/UmbracoExamine.PDF/PdfPigTextExtractor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,23 @@ public string GetTextFromPdf(Stream pdfFileStream)
return result.ToString();
}
}

/// <summary>
/// Collects the hyperlinks of every page of a PDF into one string.
/// </summary>
/// <param name="pdfFileStream">Stream the PDF document is opened from.</param>
/// <returns>
/// One line per page; each line holds that page's hyperlinks, formatted via their
/// <c>ToString()</c> and separated by a single space. A page without hyperlinks yields an empty line.
/// </returns>
public string GetLinkFromPdf(Stream pdfFileStream)
{
    var collected = new StringBuilder();

    using (PdfDocument pdf = PdfDocument.Open(pdfFileStream))
    {
        foreach (Page currentPage in pdf.GetPages())
        {
            // string.Join formats each Hyperlink through its ToString().
            IEnumerable<Hyperlink> pageLinks = currentPage.GetHyperlinks();
            string joinedLinks = string.Join(" ", pageLinks);

            // Always terminate the page's entry with a newline, even when it is empty.
            collected.AppendLine(joinedLinks);
        }
    }

    return collected.ToString();
}
}
}
21 changes: 21 additions & 0 deletions src/UmbracoExamine.PDF/PdfTextService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,27 @@ public string ExtractText(string filePath)
}
}

/// <summary>
/// Opens the PDF at the given path through the media file system and extracts its links.
/// </summary>
/// <param name="filePath">Path handed to the media file system's <c>OpenFile</c>.</param>
/// <returns>
/// The extractor's link output after being passed through <c>ExceptChars</c>,
/// or <c>null</c> when the file could not be opened.
/// </returns>
public string ExtractLink(string filePath)
{
    using (var pdfStream = _mediaFileSystem.FileSystem.OpenFile(filePath))
    {
        // Guard: no stream means the file could not be opened; log it and report "no result".
        if (pdfStream == null)
        {
            _logger.LogError(new Exception($"Unable to open PDF file {filePath}"), "Unable to Open PDF file");
            return null;
        }

        var rawLinks = _pdfTextExtractor.GetLinkFromPdf(pdfStream);

        // NOTE(review): presumably replaces characters in the unsupported range with spaces - confirm in ExceptChars.
        return ExceptChars(rawLinks, UnsupportedRange.Value, ReplaceWithSpace);
    }
}

/// <summary>
/// Stores the unsupported range of characters
/// </summary>
Expand Down