Skip to content

Commit 8a82361

Browse files
committed
Fix error when trying to persist non-utf-8 title
1 parent d746018 commit 8a82361

File tree

3 files changed

+94
-17
lines changed

3 files changed

+94
-17
lines changed

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this
4141
* [#2565](https://github.com/shlinkio/shlink/issues/2565) Remove explicit dependency in ext-json, since it's part of PHP since v8.0
4242

4343
### Fixed
44-
* *Nothing*
44+
* [#2564](https://github.com/shlinkio/shlink/issues/2564) Fix error when trying to persist non-utf-8 title without being able to determine its original charset for parsing.
45+
46+
Now, when resolving a website's charset, two improvements have been introduced:
47+
48+
1. If the `Content-Type` header does not define the charset, we fall back to `<meta charset>` or `<meta http-equiv="Content-Type">`.
49+
2. If it's still not possible to determine the charset, we ignore the auto-resolved title, to avoid other encoding errors further down the line.
4550

4651

4752
## [4.6.0] - 2025-11-01

module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php

Lines changed: 48 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
use GuzzleHttp\RequestOptions;
1111
use Laminas\Stdlib\ErrorHandler;
1212
use Psr\Http\Message\ResponseInterface;
13+
use Psr\Http\Message\StreamInterface;
1314
use Psr\Log\LoggerInterface;
1415
use Shlinkio\Shlink\Core\Config\Options\UrlShortenerOptions;
1516
use Throwable;
@@ -30,10 +31,12 @@
3031
public const string CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
3132
. 'Chrome/121.0.0.0 Safari/537.36';
3233

33-
// Matches the value inside a html title tag
34+
/** Matches the value inside a html title tag */
3435
private const string TITLE_TAG_VALUE = '/<title[^>]*>(.*?)<\/title>/i';
35-
// Matches the charset inside a Content-Type header
36+
/** Matches the charset inside a Content-Type header */
3637
private const string CHARSET_VALUE = '/charset=([^;]+)/i';
38+
/** Matches the charset from charset-related <meta /> tags */
39+
private const string CHARSET_FROM_META = '/<meta\b[^>]*\bcharset\s*=\s*(?:["\']?)([^"\'\s>;]+)/i';
3740

3841
/**
3942
* @param (Closure(): bool)|null $isIconvInstalled
@@ -83,7 +86,8 @@ private function fetchUrl(string $url): ResponseInterface|null
8386
RequestOptions::IDN_CONVERSION => true,
8487
// Making the request with a browser's user agent results in responses closer to a real user
8588
RequestOptions::HEADERS => ['User-Agent' => self::CHROME_USER_AGENT],
86-
RequestOptions::STREAM => true, // This ensures large files are not fully downloaded if not needed
89+
// This ensures large files are not fully downloaded if not needed
90+
RequestOptions::STREAM => true,
8791
]);
8892
} catch (Throwable) {
8993
return null;
@@ -102,24 +106,56 @@ private function tryToResolveTitle(ResponseInterface $response, string $contentT
102106

103107
// Try to match the title from the <title /> tag
104108
preg_match(self::TITLE_TAG_VALUE, $collectedBody, $titleMatches);
105-
if (! isset($titleMatches[1])) {
109+
$titleInOriginalEncoding = $titleMatches[1] ?? null;
110+
if ($titleInOriginalEncoding === null) {
106111
return null;
107112
}
108-
109-
$titleInOriginalEncoding = $titleMatches[1];
110-
111-
// Get the page's charset from Content-Type header, or return title as is if not found
112-
preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches);
113-
if (! isset($charsetMatches[1])) {
114-
return $titleInOriginalEncoding;
113+
;
114+
$pageCharset = $this->resolvePageCharset($contentType, $body, $collectedBody);
115+
if ($pageCharset === null) {
116+
// If it was not possible to determine the page's charset, ignore the title to avoid the risk of encoding
117+
// errors when the value is persisted
118+
return null;
115119
}
116120

117-
$pageCharset = $charsetMatches[1];
118121
return $this->encodeToUtf8WithMbString($titleInOriginalEncoding, $pageCharset)
119122
?? $this->encodeToUtf8WithIconv($titleInOriginalEncoding, $pageCharset)
120123
?? $titleInOriginalEncoding;
121124
}
122125

126+
/**
127+
* Tries to resolve the page's charset by looking into the:
128+
* 1. Content-Type header
129+
* 2. <meta charset="???"> tag
130+
* 3. <meta http-equiv="Content-Type" content="text/html; charset=???"> tag
131+
*
132+
* @param StreamInterface $body - The body stream, in case we need to continue reading from it
133+
* @param string $collectedBody - The part of the body that has already been read while looking for the title
134+
*/
135+
private function resolvePageCharset(string $contentType, StreamInterface $body, string $collectedBody): string|null
136+
{
137+
// First try to resolve the charset from the `Content-Type` header
138+
preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches);
139+
$pageCharset = $charsetMatches[1] ?? null;
140+
if ($pageCharset !== null) {
141+
return $pageCharset;
142+
}
143+
144+
$readCharsetFromMeta = static function (string $collectedBody): string|null {
145+
preg_match(self::CHARSET_FROM_META, $collectedBody, $charsetFromMetaMatches);
146+
return $charsetFromMetaMatches[1] ?? null;
147+
};
148+
149+
// Continue reading the body, looking for any of the charset meta tags
150+
$charsetFromMeta = $readCharsetFromMeta($collectedBody);
151+
while ($charsetFromMeta === null && ! $body->eof()) {
152+
$collectedBody .= $body->read(1024);
153+
$charsetFromMeta = $readCharsetFromMeta($collectedBody);
154+
}
155+
156+
return $charsetFromMeta;
157+
}
158+
123159
private function encodeToUtf8WithMbString(string $titleInOriginalEncoding, string $pageCharset): string|null
124160
{
125161
try {

module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
use Laminas\Diactoros\Response;
1212
use Laminas\Diactoros\Response\JsonResponse;
1313
use Laminas\Diactoros\Stream;
14+
use PHPUnit\Framework\Attributes\AllowMockObjectsWithoutExpectations;
1415
use PHPUnit\Framework\Attributes\Test;
1516
use PHPUnit\Framework\Attributes\TestWith;
1617
use PHPUnit\Framework\MockObject\InvocationStubber;
@@ -23,7 +24,7 @@
2324

2425
class ShortUrlTitleResolutionHelperTest extends TestCase
2526
{
26-
private const string LONG_URL = 'http://foobar.com/12345/hello?foo=bar';
27+
private const string LONG_URL = 'https://foobar.com/12345/hello?foo=bar';
2728

2829
private MockObject & ClientInterface $httpClient;
2930
private MockObject & LoggerInterface $logger;
@@ -98,7 +99,6 @@ public function dataIsReturnedAsIsWhenTitleCannotBeResolvedFromResponse(): void
9899
}
99100

100101
#[Test]
101-
#[TestWith(['TEXT/html', false], 'no charset')]
102102
#[TestWith(['TEXT/html; charset=utf-8', false], 'mbstring-supported charset')]
103103
#[TestWith(['TEXT/html; charset=Windows-1255', true], 'mbstring-unsupported charset')]
104104
public function titleIsUpdatedWhenItCanBeResolvedFromResponse(string $contentType, bool $expectsWarning): void
@@ -120,6 +120,37 @@ public function titleIsUpdatedWhenItCanBeResolvedFromResponse(string $contentTyp
120120
self::assertEquals('Resolved "title"', $result->title);
121121
}
122122

123+
#[Test, AllowMockObjectsWithoutExpectations]
124+
public function resolvedTitleIsIgnoredWhenCharsetCannotBeResolved(): void
125+
{
126+
$this->expectRequestToBeCalled()->willReturn($this->respWithTitle('text/html'));
127+
128+
$data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]);
129+
$result = $this->helper(autoResolveTitles: true, iconvEnabled: true)->processTitle($data);
130+
131+
self::assertSame($data, $result);
132+
self::assertNull($result->title);
133+
}
134+
135+
#[Test, AllowMockObjectsWithoutExpectations]
136+
#[TestWith(['<meta charset="utf-8">'])]
137+
#[TestWith(['<meta charset="utf-8" />'])]
138+
#[TestWith(['<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'])]
139+
#[TestWith(['<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'])]
140+
public function pageCharsetCanBeReadFromMeta(string $extraContent): void
141+
{
142+
$this->expectRequestToBeCalled()->willReturn($this->respWithTitle(
143+
contentType: 'text/html',
144+
extraContent: $extraContent,
145+
));
146+
147+
$data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]);
148+
$result = $this->helper(autoResolveTitles: true, iconvEnabled: true)->processTitle($data);
149+
150+
self::assertNotSame($data, $result);
151+
self::assertEquals('Resolved "title"', $result->title);
152+
}
153+
123154
#[Test]
124155
#[TestWith([
125156
'contentType' => 'text/html; charset=Windows-1255',
@@ -178,9 +209,14 @@ private function respWithoutTitle(): Response
178209
return new Response($body, 200, ['Content-Type' => 'text/html']);
179210
}
180211

181-
private function respWithTitle(string $contentType): Response
212+
private function respWithTitle(string $contentType, string|null $extraContent = null): Response
182213
{
183-
$body = $this->createStreamWithContent('<title data-foo="bar"> Resolved &quot;title&quot; </title>');
214+
$content = '<title data-foo="bar"> Resolved &quot;title&quot; </title>';
215+
if ($extraContent !== null) {
216+
$content .= $extraContent;
217+
}
218+
219+
$body = $this->createStreamWithContent($content);
184220
return new Response($body, 200, ['Content-Type' => $contentType]);
185221
}
186222

0 commit comments

Comments
 (0)