diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c684a4e..991679d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,7 +41,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this * [#2565](https://github.com/shlinkio/shlink/issues/2565) Remove explicit dependency in ext-json, since it's part of PHP since v8.0 ### Fixed -* *Nothing* +* [#2564](https://github.com/shlinkio/shlink/issues/2564) Fix error when trying to persist non-utf-8 title without being able to determine its original charset for parsing. + + Now, when resolving a website's charset, two improvements have been introduced: + + 1. If the `Content-Type` header does not define the charset, we fall back to `` or ``. + 2. If it's still not possible to determine the charset, we ignore the auto-resolved title, to avoid other encoding errors further down the line. ## [4.6.0] - 2025-11-01 diff --git a/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php b/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php index f870bb87..496708d9 100644 --- a/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php +++ b/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php @@ -10,6 +10,7 @@ use GuzzleHttp\ClientInterface; use GuzzleHttp\RequestOptions; use Laminas\Stdlib\ErrorHandler; use Psr\Http\Message\ResponseInterface; +use Psr\Http\Message\StreamInterface; use Psr\Log\LoggerInterface; use Shlinkio\Shlink\Core\Config\Options\UrlShortenerOptions; use Throwable; @@ -30,10 +31,12 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH public const string CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' . 'Chrome/121.0.0.0 Safari/537.36'; - // Matches the value inside a html title tag + /** Matches the value inside a html title tag */ private const string TITLE_TAG_VALUE = '/]*>(.*?)<\/title>/i'; - // Matches the charset inside a Content-Type header + /** Matches the charset inside a Content-Type header */ private const string CHARSET_VALUE = '/charset=([^;]+)/i'; + /** Matches the charset from charset-related tags */ + private const string CHARSET_FROM_META = '/]*\bcharset\s*=\s*(?:["\']?)([^"\'\s>;]+)/i'; /** * @param (Closure(): bool)|null $isIconvInstalled @@ -83,7 +86,8 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH RequestOptions::IDN_CONVERSION => true, // Making the request with a browser's user agent results in responses closer to a real user RequestOptions::HEADERS => ['User-Agent' => self::CHROME_USER_AGENT], - RequestOptions::STREAM => true, // This ensures large files are not fully downloaded if not needed + // This ensures large files are not fully downloaded if not needed + RequestOptions::STREAM => true, ]); } catch (Throwable) { return null; @@ -102,24 +106,56 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH // Try to match the title from the tag preg_match(self::TITLE_TAG_VALUE, $collectedBody, $titleMatches); - if (! isset($titleMatches[1])) { + $titleInOriginalEncoding = $titleMatches[1] ?? null; + if ($titleInOriginalEncoding === null) { + return null; + } + ; + $pageCharset = $this->resolvePageCharset($contentType, $body, $collectedBody); + if ($pageCharset === null) { + // If it was not possible to determine the page's charset, ignore the title to avoid the risk of encoding + // errors when the value is persisted return null; } - $titleInOriginalEncoding = $titleMatches[1]; - - // Get the page's charset from Content-Type header, or return title as is if not found - preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches); - if (! isset($charsetMatches[1])) { - return $titleInOriginalEncoding; - } - - $pageCharset = $charsetMatches[1]; return $this->encodeToUtf8WithMbString($titleInOriginalEncoding, $pageCharset) ?? $this->encodeToUtf8WithIconv($titleInOriginalEncoding, $pageCharset) ?? $titleInOriginalEncoding; } + /** + * Tries to resolve the page's charset by looking into the: + * 1. Content-Type header + * 2. <meta charset="???"> tag + * 3. <meta http-equiv="Content-Type" content="text/html; charset=???"> tag + * + * @param StreamInterface $body - The body stream, in case we need to continue reading from it + * @param string $collectedBody - The part of the body that has already been read while looking for the title + */ + private function resolvePageCharset(string $contentType, StreamInterface $body, string $collectedBody): string|null + { + // First try to resolve the charset from the `Content-Type` header + preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches); + $pageCharset = $charsetMatches[1] ?? null; + if ($pageCharset !== null) { + return $pageCharset; + } + + $readCharsetFromMeta = static function (string $collectedBody): string|null { + preg_match(self::CHARSET_FROM_META, $collectedBody, $charsetFromMetaMatches); + return $charsetFromMetaMatches[1] ?? null; + }; + + // Continue reading the body, looking for any of the charset meta tags + $charsetFromMeta = $readCharsetFromMeta($collectedBody); + while ($charsetFromMeta === null && ! $body->eof()) { + $collectedBody .= $body->read(1024); + $charsetFromMeta = $readCharsetFromMeta($collectedBody); + } + + return $charsetFromMeta; + } + private function encodeToUtf8WithMbString(string $titleInOriginalEncoding, string $pageCharset): string|null { try { diff --git a/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php b/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php index 857c4291..b1f2e0d0 100644 --- a/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php +++ b/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php @@ -11,6 +11,7 @@ use GuzzleHttp\RequestOptions; use Laminas\Diactoros\Response; use Laminas\Diactoros\Response\JsonResponse; use Laminas\Diactoros\Stream; +use PHPUnit\Framework\Attributes\AllowMockObjectsWithoutExpectations; use PHPUnit\Framework\Attributes\Test; use PHPUnit\Framework\Attributes\TestWith; use PHPUnit\Framework\MockObject\InvocationStubber; @@ -23,7 +24,7 @@ use Shlinkio\Shlink\Core\ShortUrl\Model\ShortUrlCreation; class ShortUrlTitleResolutionHelperTest extends TestCase { - private const string LONG_URL = 'http://foobar.com/12345/hello?foo=bar'; + private const string LONG_URL = 'https://foobar.com/12345/hello?foo=bar'; private MockObject & ClientInterface $httpClient; private MockObject & LoggerInterface $logger; @@ -98,7 +99,6 @@ class ShortUrlTitleResolutionHelperTest extends TestCase } #[Test] - #[TestWith(['TEXT/html', false], 'no charset')] #[TestWith(['TEXT/html; charset=utf-8', false], 'mbstring-supported charset')] #[TestWith(['TEXT/html; charset=Windows-1255', true], 'mbstring-unsupported charset')] public function titleIsUpdatedWhenItCanBeResolvedFromResponse(string $contentType, bool $expectsWarning): void @@ -120,6 +120,37 @@ class ShortUrlTitleResolutionHelperTest extends TestCase self::assertEquals('Resolved "title"', $result->title); } + #[Test, AllowMockObjectsWithoutExpectations] + public function resolvedTitleIsIgnoredWhenCharsetCannotBeResolved(): void + { + $this->expectRequestToBeCalled()->willReturn($this->respWithTitle('text/html')); + + $data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]); + $result = $this->helper(autoResolveTitles: true, iconvEnabled: true)->processTitle($data); + + self::assertSame($data, $result); + self::assertNull($result->title); + } + + #[Test, AllowMockObjectsWithoutExpectations] + #[TestWith(['<meta charset="utf-8">'])] + #[TestWith(['<meta charset="utf-8" />'])] + #[TestWith(['<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'])] + #[TestWith(['<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'])] + public function pageCharsetCanBeReadFromMeta(string $extraContent): void + { + $this->expectRequestToBeCalled()->willReturn($this->respWithTitle( + contentType: 'text/html', + extraContent: $extraContent, + )); + + $data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]); + $result = $this->helper(autoResolveTitles: true, iconvEnabled: true)->processTitle($data); + + self::assertNotSame($data, $result); + self::assertEquals('Resolved "title"', $result->title); + } + #[Test] #[TestWith([ 'contentType' => 'text/html; charset=Windows-1255', @@ -178,9 +209,14 @@ class ShortUrlTitleResolutionHelperTest extends TestCase return new Response($body, 200, ['Content-Type' => 'text/html']); } - private function respWithTitle(string $contentType): Response + private function respWithTitle(string $contentType, string|null $extraContent = null): Response { - $body = $this->createStreamWithContent('<title data-foo="bar"> Resolved "title" '); + $content = ' Resolved "title" '; + if ($extraContent !== null) { + $content .= $extraContent; + } + + $body = $this->createStreamWithContent($content); return new Response($body, 200, ['Content-Type' => $contentType]); }