Skip to content

Commit 301f8c4

Browse files
committed
Remove all unwanted chars from PDF extract
Fix j0k3r/f43.me#63
1 parent 1851af3 commit 301f8c4

File tree

1 file changed

+8
-1
lines changed

1 file changed

+8
-1
lines changed

src/Graby.php

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,14 @@ private function handleMimeAction($mimeInfo, $effective_url, $body = '')
471471
if ($mimeInfo['mime'] == 'application/pdf') {
472472
$parser = new PdfParser();
473473
$pdf = $parser->parseFile($effective_url);
474-
$infos['html'] = Encoding::toUTF8(nl2br($pdf->getText()));
474+
475+
$html = Encoding::toUTF8(nl2br($pdf->getText()));
476+
477+
// strip away unwanted chars (that usually come from PDF extracted content)
478+
// @see http://www.phpwact.org/php/i18n/charsets#common_problem_areas_with_utf-8
479+
$html = preg_replace('/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u', ' ', $html);
480+
481+
$infos['html'] = $html;
475482

476483
// update title in case details are present
477484
$details = $pdf->getDetails();

0 commit comments

Comments
 (0)