Merge pull request #2569 from acelaya-forks/charset-error

Fix error when trying to persist non-utf-8 title
This commit is contained in:
Alejandro Celaya
2026-01-09 17:12:37 +01:00
committed by GitHub
3 changed files with 95 additions and 18 deletions

View File

@@ -41,7 +41,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this
* [#2565](https://github.com/shlinkio/shlink/issues/2565) Remove explicit dependency in ext-json, since it's part of PHP since v8.0
### Fixed
* *Nothing*
* [#2564](https://github.com/shlinkio/shlink/issues/2564) Fix error when trying to persist non-utf-8 title without being able to determine its original charset for parsing.
Now, when resolving a website's charset, two improvements have been introduced:
1. If the `Content-Type` header does not define the charset, we fall back to `<meta charset>` or `<meta http-equiv="Content-Type">`.
2. If it's still not possible to determine the charset, we ignore the auto-resolved title, to avoid other encoding errors further down the line.
## [4.6.0] - 2025-11-01

View File

@@ -10,6 +10,7 @@ use GuzzleHttp\ClientInterface;
use GuzzleHttp\RequestOptions;
use Laminas\Stdlib\ErrorHandler;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\StreamInterface;
use Psr\Log\LoggerInterface;
use Shlinkio\Shlink\Core\Config\Options\UrlShortenerOptions;
use Throwable;
@@ -30,10 +31,12 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
public const string CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
. 'Chrome/121.0.0.0 Safari/537.36';
// Matches the value inside a html title tag
/** Matches the value inside a html title tag */
private const string TITLE_TAG_VALUE = '/<title[^>]*>(.*?)<\/title>/i';
// Matches the charset inside a Content-Type header
/** Matches the charset inside a Content-Type header */
private const string CHARSET_VALUE = '/charset=([^;]+)/i';
/** Matches the charset from charset-related <meta /> tags */
private const string CHARSET_FROM_META = '/<meta\b[^>]*\bcharset\s*=\s*(?:["\']?)([^"\'\s>;]+)/i';
/**
* @param (Closure(): bool)|null $isIconvInstalled
@@ -83,7 +86,8 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
RequestOptions::IDN_CONVERSION => true,
// Making the request with a browser's user agent results in responses closer to a real user
RequestOptions::HEADERS => ['User-Agent' => self::CHROME_USER_AGENT],
RequestOptions::STREAM => true, // This ensures large files are not fully downloaded if not needed
// This ensures large files are not fully downloaded if not needed
RequestOptions::STREAM => true,
]);
} catch (Throwable) {
return null;
@@ -102,24 +106,56 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
// Try to match the title from the <title /> tag
preg_match(self::TITLE_TAG_VALUE, $collectedBody, $titleMatches);
if (! isset($titleMatches[1])) {
$titleInOriginalEncoding = $titleMatches[1] ?? null;
if ($titleInOriginalEncoding === null) {
return null;
}
;
$pageCharset = $this->resolvePageCharset($contentType, $body, $collectedBody);
if ($pageCharset === null) {
// If it was not possible to determine the page's charset, ignore the title to avoid the risk of encoding
// errors when the value is persisted
return null;
}
$titleInOriginalEncoding = $titleMatches[1];
// Get the page's charset from Content-Type header, or return title as is if not found
preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches);
if (! isset($charsetMatches[1])) {
return $titleInOriginalEncoding;
}
$pageCharset = $charsetMatches[1];
return $this->encodeToUtf8WithMbString($titleInOriginalEncoding, $pageCharset)
?? $this->encodeToUtf8WithIconv($titleInOriginalEncoding, $pageCharset)
?? $titleInOriginalEncoding;
}
/**
* Tries to resolve the page's charset by looking into the:
* 1. Content-Type header
* 2. <meta charset="???"> tag
* 3. <meta http-equiv="Content-Type" content="text/html; charset=???"> tag
*
* @param StreamInterface $body - The body stream, in case we need to continue reading from it
* @param string $collectedBody - The part of the body that has already been read while looking for the title
*/
private function resolvePageCharset(string $contentType, StreamInterface $body, string $collectedBody): string|null
{
// First try to resolve the charset from the `Content-Type` header
preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches);
$pageCharset = $charsetMatches[1] ?? null;
if ($pageCharset !== null) {
return $pageCharset;
}
$readCharsetFromMeta = static function (string $collectedBody): string|null {
preg_match(self::CHARSET_FROM_META, $collectedBody, $charsetFromMetaMatches);
return $charsetFromMetaMatches[1] ?? null;
};
// Continue reading the body, looking for any of the charset meta tags
$charsetFromMeta = $readCharsetFromMeta($collectedBody);
while ($charsetFromMeta === null && ! $body->eof()) {
$collectedBody .= $body->read(1024);
$charsetFromMeta = $readCharsetFromMeta($collectedBody);
}
return $charsetFromMeta;
}
private function encodeToUtf8WithMbString(string $titleInOriginalEncoding, string $pageCharset): string|null
{
try {

View File

@@ -11,6 +11,7 @@ use GuzzleHttp\RequestOptions;
use Laminas\Diactoros\Response;
use Laminas\Diactoros\Response\JsonResponse;
use Laminas\Diactoros\Stream;
use PHPUnit\Framework\Attributes\AllowMockObjectsWithoutExpectations;
use PHPUnit\Framework\Attributes\Test;
use PHPUnit\Framework\Attributes\TestWith;
use PHPUnit\Framework\MockObject\InvocationStubber;
@@ -23,7 +24,7 @@ use Shlinkio\Shlink\Core\ShortUrl\Model\ShortUrlCreation;
class ShortUrlTitleResolutionHelperTest extends TestCase
{
private const string LONG_URL = 'http://foobar.com/12345/hello?foo=bar';
private const string LONG_URL = 'https://foobar.com/12345/hello?foo=bar';
private MockObject & ClientInterface $httpClient;
private MockObject & LoggerInterface $logger;
@@ -98,7 +99,6 @@ class ShortUrlTitleResolutionHelperTest extends TestCase
}
#[Test]
#[TestWith(['TEXT/html', false], 'no charset')]
#[TestWith(['TEXT/html; charset=utf-8', false], 'mbstring-supported charset')]
#[TestWith(['TEXT/html; charset=Windows-1255', true], 'mbstring-unsupported charset')]
public function titleIsUpdatedWhenItCanBeResolvedFromResponse(string $contentType, bool $expectsWarning): void
@@ -120,6 +120,37 @@ class ShortUrlTitleResolutionHelperTest extends TestCase
self::assertEquals('Resolved "title"', $result->title);
}
#[Test, AllowMockObjectsWithoutExpectations]
public function resolvedTitleIsIgnoredWhenCharsetCannotBeResolved(): void
{
$this->expectRequestToBeCalled()->willReturn($this->respWithTitle('text/html'));
$data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]);
$result = $this->helper(autoResolveTitles: true, iconvEnabled: true)->processTitle($data);
self::assertSame($data, $result);
self::assertNull($result->title);
}
#[Test, AllowMockObjectsWithoutExpectations]
#[TestWith(['<meta charset="utf-8">'])]
#[TestWith(['<meta charset="utf-8" />'])]
#[TestWith(['<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'])]
#[TestWith(['<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'])]
public function pageCharsetCanBeReadFromMeta(string $extraContent): void
{
$this->expectRequestToBeCalled()->willReturn($this->respWithTitle(
contentType: 'text/html',
extraContent: $extraContent,
));
$data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]);
$result = $this->helper(autoResolveTitles: true, iconvEnabled: true)->processTitle($data);
self::assertNotSame($data, $result);
self::assertEquals('Resolved "title"', $result->title);
}
#[Test]
#[TestWith([
'contentType' => 'text/html; charset=Windows-1255',
@@ -178,9 +209,14 @@ class ShortUrlTitleResolutionHelperTest extends TestCase
return new Response($body, 200, ['Content-Type' => 'text/html']);
}
private function respWithTitle(string $contentType): Response
private function respWithTitle(string $contentType, string|null $extraContent = null): Response
{
$body = $this->createStreamWithContent('<title data-foo="bar"> Resolved &quot;title&quot; </title>');
$content = '<title data-foo="bar"> Resolved &quot;title&quot; </title>';
if ($extraContent !== null) {
$content .= $extraContent;
}
$body = $this->createStreamWithContent($content);
return new Response($body, 200, ['Content-Type' => $contentType]);
}