Added detection of visits from potential bots

This commit is contained in:
Alejandro Celaya
2021-05-22 15:09:14 +02:00
parent 663ae9f6bb
commit 9fa32b5b6b
16 changed files with 123 additions and 19 deletions

View File

@@ -65,4 +65,9 @@ return static function (ClassMetadata $metadata, array $emConfig): void {
->columnName('type')
->length(255)
->build();
$builder->createField('potentialBot', Types::BOOLEAN)
->columnName('potential_bot')
->option('default', false)
->build();
};

View File

@@ -7,6 +7,7 @@ namespace Shlinkio\Shlink\Core;
use Cake\Chronos\Chronos;
use DateTimeInterface;
use Fig\Http\Message\StatusCodeInterface;
use Jaybizzle\CrawlerDetect\CrawlerDetect;
use Laminas\InputFilter\InputFilter;
use PUGX\Shortid\Factory as ShortIdFactory;
use Shlinkio\Shlink\Common\Util\DateRange;
@@ -128,3 +129,13 @@ function kebabCaseToCamelCase(string $name): string
{
return lcfirst(str_replace(' ', '', ucwords(str_replace('-', ' ', $name))));
}
/**
 * Tells whether the provided user agent string is detected as a crawler.
 *
 * The CrawlerDetect instance is created lazily on the first call and then reused by every
 * subsequent call, so it is only instantiated once.
 */
function isCrawler(string $userAgent): bool
{
    static $crawlerDetect = null;
    $crawlerDetect ??= new CrawlerDetect();

    return $crawlerDetect->isCrawler($userAgent);
}

View File

@@ -13,6 +13,8 @@ use Shlinkio\Shlink\Core\Model\Visitor;
use Shlinkio\Shlink\Core\Visit\Model\VisitLocationInterface;
use Shlinkio\Shlink\Importer\Model\ImportedShlinkVisit;
use function Shlinkio\Shlink\Core\isCrawler;
class Visit extends AbstractEntity implements JsonSerializable
{
public const TYPE_VALID_SHORT_URL = 'valid_short_url';
@@ -29,6 +31,7 @@ class Visit extends AbstractEntity implements JsonSerializable
private string $type;
private ?ShortUrl $shortUrl;
private ?VisitLocation $visitLocation = null;
private bool $potentialBot;
private function __construct(?ShortUrl $shortUrl, string $type)
{
@@ -49,6 +52,7 @@ class Visit extends AbstractEntity implements JsonSerializable
{
$instance = new self($shortUrl, self::TYPE_IMPORTED);
$instance->userAgent = $importedVisit->userAgent();
$instance->potentialBot = isCrawler($instance->userAgent);
$instance->referer = $importedVisit->referer();
$instance->date = Chronos::instance($importedVisit->date());
@@ -88,6 +92,7 @@ class Visit extends AbstractEntity implements JsonSerializable
$this->referer = $visitor->getReferer();
$this->remoteAddr = $this->processAddress($anonymize, $visitor->getRemoteAddress());
$this->visitedUrl = $visitor->getVisitedUrl();
$this->potentialBot = $visitor->isPotentialBot();
}
private function processAddress(bool $anonymize, ?string $address): ?string
@@ -166,6 +171,7 @@ class Visit extends AbstractEntity implements JsonSerializable
'date' => $this->date->toAtomString(),
'userAgent' => $this->userAgent,
'visitLocation' => $this->visitLocation,
'potentialBot' => $this->potentialBot,
];
}

View File

@@ -8,6 +8,7 @@ use Psr\Http\Message\ServerRequestInterface;
use Shlinkio\Shlink\Common\Middleware\IpAddressMiddlewareFactory;
use Shlinkio\Shlink\Core\Options\TrackingOptions;
use function Shlinkio\Shlink\Core\isCrawler;
use function substr;
final class Visitor
@@ -21,6 +22,7 @@ final class Visitor
private string $referer;
private string $visitedUrl;
private ?string $remoteAddress;
private bool $potentialBot;
public function __construct(string $userAgent, string $referer, ?string $remoteAddress, string $visitedUrl)
{
@@ -28,6 +30,7 @@ final class Visitor
$this->referer = $this->cropToLength($referer, self::REFERER_MAX_LENGTH);
$this->visitedUrl = $this->cropToLength($visitedUrl, self::VISITED_URL_MAX_LENGTH);
$this->remoteAddress = $this->cropToLength($remoteAddress, self::REMOTE_ADDRESS_MAX_LENGTH);
$this->potentialBot = isCrawler($userAgent);
}
private function cropToLength(?string $value, int $length): ?string
@@ -70,14 +73,22 @@ final class Visitor
return $this->visitedUrl;
}
/**
 * Tells whether this visitor has been flagged as a potential bot.
 *
 * The flag is computed by the constructor, by passing the user agent to isCrawler().
 */
public function isPotentialBot(): bool
{
return $this->potentialBot;
}
/**
 * Builds a new visitor in which every field whose tracking is disabled has been blanked out.
 *
 * The user agent and referer become empty strings, and the remote address becomes null, when the
 * corresponding tracking option is disabled. The visited URL is always carried over.
 *
 * @param TrackingOptions $options Options telling which pieces of information must not be tracked
 * @return self A separate instance; this object is not modified
 */
public function normalizeForTrackingOptions(TrackingOptions $options): self
{
    // Start from an empty instance and assign the fields directly, instead of going through the
    // constructor: every field is set below anyway, and the constructor would call isCrawler()
    // on a possibly blanked user agent only for that result to be discarded.
    // NOTE(review): emptyInstance() is assumed to return a blank Visitor; confirm against its definition.
    $instance = self::emptyInstance();

    $instance->userAgent = $options->disableUaTracking() ? '' : $this->userAgent;
    $instance->referer = $options->disableReferrerTracking() ? '' : $this->referer;
    $instance->remoteAddress = $options->disableIpTracking() ? null : $this->remoteAddress;
    $instance->visitedUrl = $this->visitedUrl;

    // Keep the fact that the visit was a potential bot, even if we no longer save the user agent
    $instance->potentialBot = $this->potentialBot;

    return $instance;
}

View File

@@ -12,19 +12,35 @@ use Shlinkio\Shlink\Core\Model\Visitor;
class VisitTest extends TestCase
{
/**
 * Checks that a visit is serialized with the expected fields, including the "potentialBot" flag
 * matching what is expected for the visitor's user agent.
 *
 * @test
 * @dataProvider provideUserAgents
 */
public function isProperlyJsonSerialized(string $userAgent, bool $expectedToBePotentialBot): void
{
    $visit = Visit::forValidShortUrl(ShortUrl::createEmpty(), new Visitor($userAgent, 'some site', '1.2.3.4', ''));

    self::assertEquals([
        'referer' => 'some site',
        'date' => $visit->getDate()->toAtomString(),
        'userAgent' => $userAgent,
        'visitLocation' => null,
        'potentialBot' => $expectedToBePotentialBot,
    ], $visit->jsonSerialize());
}
/**
 * Provides user agent strings together with whether each one is expected to be flagged as a
 * potential bot.
 *
 * @return iterable<string, array{string, bool}> Named data sets: [user agent, expected flag]
 */
public function provideUserAgents(): iterable
{
    yield from [
        'Chrome' => [
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
            false,
        ],
        'Firefox' => [
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
            false,
        ],
        'Facebook' => ['cf-facebook', true],
        'Twitter' => ['IDG Twitter Links Resolver', true],
        'Guzzle' => ['guzzlehttp', true],
    ];
}
/**
* @test
* @dataProvider provideAddresses

View File

@@ -66,6 +66,7 @@ class MercureUpdatesGeneratorTest extends TestCase
'userAgent' => '',
'visitLocation' => null,
'date' => $visit->getDate()->toAtomString(),
'potentialBot' => false,
],
], json_decode($update->getData()));
}
@@ -91,6 +92,7 @@ class MercureUpdatesGeneratorTest extends TestCase
'userAgent' => '',
'visitLocation' => null,
'date' => $orphanVisit->getDate()->toAtomString(),
'potentialBot' => false,
'visitedUrl' => $orphanVisit->visitedUrl(),
'type' => $orphanVisit->type(),
],

View File

@@ -42,6 +42,7 @@ class OrphanVisitDataTransformerTest extends TestCase
'date' => $visit->getDate()->toAtomString(),
'userAgent' => '',
'visitLocation' => null,
'potentialBot' => false,
'visitedUrl' => '',
'type' => Visit::TYPE_BASE_URL,
],
@@ -57,6 +58,7 @@ class OrphanVisitDataTransformerTest extends TestCase
'date' => $visit->getDate()->toAtomString(),
'userAgent' => 'foo',
'visitLocation' => null,
'potentialBot' => false,
'visitedUrl' => 'https://example.com/foo',
'type' => Visit::TYPE_INVALID_SHORT_URL,
],
@@ -74,6 +76,7 @@ class OrphanVisitDataTransformerTest extends TestCase
'date' => $visit->getDate()->toAtomString(),
'userAgent' => 'user-agent',
'visitLocation' => $location,
'potentialBot' => false,
'visitedUrl' => 'https://doma.in/foo/bar',
'type' => Visit::TYPE_REGULAR_404,
],