diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6c684a4e..991679d2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -41,7 +41,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com), and this
* [#2565](https://github.com/shlinkio/shlink/issues/2565) Remove explicit dependency in ext-json, since it's part of PHP since v8.0
### Fixed
-* *Nothing*
+* [#2564](https://github.com/shlinkio/shlink/issues/2564) Fix error when trying to persist non-utf-8 title without being able to determine its original charset for parsing.
+
+ Now, when resolving a website's charset, two improvements have been introduced:
+
+ 1. If the `Content-Type` header does not define the charset, we fall back to `` or ``.
+ 2. If it's still not possible to determine the charset, we ignore the auto-resolved title, to avoid other encoding errors further down the line.
## [4.6.0] - 2025-11-01
diff --git a/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php b/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php
index f870bb87..496708d9 100644
--- a/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php
+++ b/module/Core/src/ShortUrl/Helper/ShortUrlTitleResolutionHelper.php
@@ -10,6 +10,7 @@ use GuzzleHttp\ClientInterface;
use GuzzleHttp\RequestOptions;
use Laminas\Stdlib\ErrorHandler;
use Psr\Http\Message\ResponseInterface;
+use Psr\Http\Message\StreamInterface;
use Psr\Log\LoggerInterface;
use Shlinkio\Shlink\Core\Config\Options\UrlShortenerOptions;
use Throwable;
@@ -30,10 +31,12 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
public const string CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
. 'Chrome/121.0.0.0 Safari/537.36';
- // Matches the value inside a html title tag
+ /** Matches the value inside a html title tag */
private const string TITLE_TAG_VALUE = '/
]*>(.*?)<\/title>/i';
- // Matches the charset inside a Content-Type header
+ /** Matches the charset inside a Content-Type header */
private const string CHARSET_VALUE = '/charset=([^;]+)/i';
+ /** Matches the charset from charset-related tags */
+ private const string CHARSET_FROM_META = '/]*\bcharset\s*=\s*(?:["\']?)([^"\'\s>;]+)/i';
/**
* @param (Closure(): bool)|null $isIconvInstalled
@@ -83,7 +86,8 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
RequestOptions::IDN_CONVERSION => true,
// Making the request with a browser's user agent results in responses closer to a real user
RequestOptions::HEADERS => ['User-Agent' => self::CHROME_USER_AGENT],
- RequestOptions::STREAM => true, // This ensures large files are not fully downloaded if not needed
+ // This ensures large files are not fully downloaded if not needed
+ RequestOptions::STREAM => true,
]);
} catch (Throwable) {
return null;
@@ -102,24 +106,56 @@ readonly class ShortUrlTitleResolutionHelper implements ShortUrlTitleResolutionH
// Try to match the title from the tag
preg_match(self::TITLE_TAG_VALUE, $collectedBody, $titleMatches);
- if (! isset($titleMatches[1])) {
+ $titleInOriginalEncoding = $titleMatches[1] ?? null;
+ if ($titleInOriginalEncoding === null) {
+ return null;
+ }
+ ;
+ $pageCharset = $this->resolvePageCharset($contentType, $body, $collectedBody);
+ if ($pageCharset === null) {
+ // If it was not possible to determine the page's charset, ignore the title to avoid the risk of encoding
+ // errors when the value is persisted
return null;
}
- $titleInOriginalEncoding = $titleMatches[1];
-
- // Get the page's charset from Content-Type header, or return title as is if not found
- preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches);
- if (! isset($charsetMatches[1])) {
- return $titleInOriginalEncoding;
- }
-
- $pageCharset = $charsetMatches[1];
return $this->encodeToUtf8WithMbString($titleInOriginalEncoding, $pageCharset)
?? $this->encodeToUtf8WithIconv($titleInOriginalEncoding, $pageCharset)
?? $titleInOriginalEncoding;
}
+ /**
+ * Tries to resolve the page's charset by looking into the:
+ * 1. Content-Type header
+ * 2. tag
+ * 3. tag
+ *
+ * @param StreamInterface $body - The body stream, in case we need to continue reading from it
+ * @param string $collectedBody - The part of the body that has already been read while looking for the title
+ */
+ private function resolvePageCharset(string $contentType, StreamInterface $body, string $collectedBody): string|null
+ {
+ // First try to resolve the charset from the `Content-Type` header
+ preg_match(self::CHARSET_VALUE, $contentType, $charsetMatches);
+ $pageCharset = $charsetMatches[1] ?? null;
+ if ($pageCharset !== null) {
+ return $pageCharset;
+ }
+
+ $readCharsetFromMeta = static function (string $collectedBody): string|null {
+ preg_match(self::CHARSET_FROM_META, $collectedBody, $charsetFromMetaMatches);
+ return $charsetFromMetaMatches[1] ?? null;
+ };
+
+ // Continue reading the body, looking for any of the charset meta tags
+ $charsetFromMeta = $readCharsetFromMeta($collectedBody);
+ while ($charsetFromMeta === null && ! $body->eof()) {
+ $collectedBody .= $body->read(1024);
+ $charsetFromMeta = $readCharsetFromMeta($collectedBody);
+ }
+
+ return $charsetFromMeta;
+ }
+
private function encodeToUtf8WithMbString(string $titleInOriginalEncoding, string $pageCharset): string|null
{
try {
diff --git a/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php b/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php
index 857c4291..b1f2e0d0 100644
--- a/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php
+++ b/module/Core/test/ShortUrl/Helper/ShortUrlTitleResolutionHelperTest.php
@@ -11,6 +11,7 @@ use GuzzleHttp\RequestOptions;
use Laminas\Diactoros\Response;
use Laminas\Diactoros\Response\JsonResponse;
use Laminas\Diactoros\Stream;
+use PHPUnit\Framework\Attributes\AllowMockObjectsWithoutExpectations;
use PHPUnit\Framework\Attributes\Test;
use PHPUnit\Framework\Attributes\TestWith;
use PHPUnit\Framework\MockObject\InvocationStubber;
@@ -23,7 +24,7 @@ use Shlinkio\Shlink\Core\ShortUrl\Model\ShortUrlCreation;
class ShortUrlTitleResolutionHelperTest extends TestCase
{
- private const string LONG_URL = 'http://foobar.com/12345/hello?foo=bar';
+ private const string LONG_URL = 'https://foobar.com/12345/hello?foo=bar';
private MockObject & ClientInterface $httpClient;
private MockObject & LoggerInterface $logger;
@@ -98,7 +99,6 @@ class ShortUrlTitleResolutionHelperTest extends TestCase
}
#[Test]
- #[TestWith(['TEXT/html', false], 'no charset')]
#[TestWith(['TEXT/html; charset=utf-8', false], 'mbstring-supported charset')]
#[TestWith(['TEXT/html; charset=Windows-1255', true], 'mbstring-unsupported charset')]
public function titleIsUpdatedWhenItCanBeResolvedFromResponse(string $contentType, bool $expectsWarning): void
@@ -120,6 +120,37 @@ class ShortUrlTitleResolutionHelperTest extends TestCase
self::assertEquals('Resolved "title"', $result->title);
}
+ #[Test, AllowMockObjectsWithoutExpectations]
+ public function resolvedTitleIsIgnoredWhenCharsetCannotBeResolved(): void
+ {
+ $this->expectRequestToBeCalled()->willReturn($this->respWithTitle('text/html'));
+
+ $data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]);
+ $result = $this->helper(autoResolveTitles: true, iconvEnabled: true)->processTitle($data);
+
+ self::assertSame($data, $result);
+ self::assertNull($result->title);
+ }
+
+ #[Test, AllowMockObjectsWithoutExpectations]
+ #[TestWith([''])]
+ #[TestWith([''])]
+ #[TestWith([''])]
+ #[TestWith([''])]
+ public function pageCharsetCanBeReadFromMeta(string $extraContent): void
+ {
+ $this->expectRequestToBeCalled()->willReturn($this->respWithTitle(
+ contentType: 'text/html',
+ extraContent: $extraContent,
+ ));
+
+ $data = ShortUrlCreation::fromRawData(['longUrl' => self::LONG_URL]);
+ $result = $this->helper(autoResolveTitles: true, iconvEnabled: true)->processTitle($data);
+
+ self::assertNotSame($data, $result);
+ self::assertEquals('Resolved "title"', $result->title);
+ }
+
#[Test]
#[TestWith([
'contentType' => 'text/html; charset=Windows-1255',
@@ -178,9 +209,14 @@ class ShortUrlTitleResolutionHelperTest extends TestCase
return new Response($body, 200, ['Content-Type' => 'text/html']);
}
- private function respWithTitle(string $contentType): Response
+ private function respWithTitle(string $contentType, string|null $extraContent = null): Response
{
- $body = $this->createStreamWithContent(' Resolved "title" ');
+ $content = ' Resolved "title" ';
+ if ($extraContent !== null) {
+ $content .= $extraContent;
+ }
+
+ $body = $this->createStreamWithContent($content);
return new Response($body, 200, ['Content-Type' => $contentType]);
}