Download PDF instead of creating HTML snapshot if URL points at PDF (#1271)

* basic pdf snapshots

* cleanup website_loader tests

* cleanup asset tests

* cleanup asset service tests

* use PDF download as display name

* update new snapshot name

* update docs

* update e2e test

* update test
This commit is contained in:
Sascha Ißbrücker
2026-01-06 10:29:31 +01:00
committed by GitHub
parent 4f26c3483b
commit 7333b283cf
11 changed files with 448 additions and 41 deletions

View File

@@ -131,6 +131,7 @@ class BookmarkAsset(models.Model):
TYPE_UPLOAD = "upload"
CONTENT_TYPE_HTML = "text/html"
CONTENT_TYPE_PDF = "application/pdf"
STATUS_PENDING = "pending"
STATUS_COMPLETE = "complete"
@@ -148,11 +149,11 @@ class BookmarkAsset(models.Model):
@property
def download_name(self):
    """Return the file name to use when this asset is downloaded.

    Snapshot assets only carry a human-readable display name, so a file
    extension is appended based on the stored content type: ``.pdf`` for
    PDF snapshots and ``.html`` for every other snapshot. Assets that are
    not snapshots use their display name unchanged.
    """
    if self.asset_type != BookmarkAsset.TYPE_SNAPSHOT:
        return self.display_name
    if self.content_type == BookmarkAsset.CONTENT_TYPE_PDF:
        return f"{self.display_name}.pdf"
    return f"{self.display_name}.html"
def save(self, *args, **kwargs):
if self.file:

View File

@@ -3,27 +3,35 @@ import logging
import os
import shutil
import requests
from django.conf import settings
from django.core.files.uploadedfile import UploadedFile
from django.utils import formats, timezone
from bookmarks.models import Bookmark, BookmarkAsset
from bookmarks.services import singlefile
from bookmarks.services.website_loader import (
detect_content_type,
fake_request_headers,
is_pdf_content_type,
)
MAX_ASSET_FILENAME_LENGTH = 192
logger = logging.getLogger(__name__)
class PdfTooLargeError(Exception):
    """Raised when a PDF snapshot exceeds ``settings.LD_SNAPSHOT_PDF_MAX_SIZE``."""

    pass
def create_snapshot_asset(bookmark: Bookmark) -> BookmarkAsset:
    """Build a pending snapshot asset for *bookmark* without saving it.

    The content type and the final display name depend on what the bookmark
    URL turns out to serve, which is only determined when the snapshot is
    created. Until then the asset carries an empty content type and a
    placeholder display name.

    Args:
        bookmark: The bookmark the snapshot belongs to.

    Returns:
        A new ``BookmarkAsset`` in ``STATUS_PENDING``. This function does not
        call ``save()`` on it.
    """
    return BookmarkAsset(
        bookmark=bookmark,
        asset_type=BookmarkAsset.TYPE_SNAPSHOT,
        date_created=timezone.now(),
        content_type="",
        display_name="New snapshot",
        status=BookmarkAsset.STATUS_PENDING,
    )
@@ -31,37 +39,109 @@ def create_snapshot_asset(bookmark: Bookmark) -> BookmarkAsset:
def create_snapshot(asset: BookmarkAsset):
    """Create the snapshot file for *asset*, picking the format from the URL.

    The bookmark URL's content type is detected first. URLs that serve a PDF
    are downloaded as-is; everything else is rendered into an HTML snapshot.

    Args:
        asset: The pending snapshot asset to fill in.

    Raises:
        Exception: Any error raised while detecting the content type or
            creating the snapshot is re-raised after the asset has been
            saved with ``STATUS_FAILURE``.
    """
    try:
        content_type = detect_content_type(asset.bookmark.url)

        if is_pdf_content_type(content_type):
            _create_pdf_snapshot(asset)
        else:
            # Also taken when detection failed (content_type is None), so an
            # unreachable HEAD/GET probe still results in an HTML snapshot attempt.
            _create_html_snapshot(asset)
    except Exception:
        asset.status = BookmarkAsset.STATUS_FAILURE
        asset.save()
        # Bare raise re-raises the active exception with its original traceback.
        raise
def _create_html_snapshot(asset: BookmarkAsset):
    """Create a gzip-compressed HTML snapshot of the bookmark and attach it to *asset*.

    singlefile is asked to write the page to a temporary file in the asset
    folder, which is then compressed into the final ``.html.gz`` file. On
    success the asset is marked complete and becomes the bookmark's latest
    snapshot.

    Args:
        asset: The pending snapshot asset to complete.

    Raises:
        Exception: Errors from singlefile or from file handling propagate to
            the caller; the asset is not modified in that case.
    """
    url = asset.bookmark.url

    # Create snapshot into temporary file
    temp_filename = _generate_asset_filename(asset, url, "tmp")
    temp_filepath = os.path.join(settings.LD_ASSET_FOLDER, temp_filename)

    try:
        singlefile.create_snapshot(url, temp_filepath)

        # Store as gzip in asset folder
        filename = _generate_asset_filename(asset, url, "html.gz")
        filepath = os.path.join(settings.LD_ASSET_FOLDER, filename)
        with (
            open(temp_filepath, "rb") as temp_file,
            gzip.open(filepath, "wb") as gz_file,
        ):
            shutil.copyfileobj(temp_file, gz_file)
    finally:
        # Always remove the temporary file, including when snapshot creation
        # or compression failed, so failed attempts do not accumulate files.
        try:
            os.remove(temp_filepath)
        except FileNotFoundError:
            # Nothing was written before the failure; nothing to clean up.
            pass

    # Update display name for HTML
    timestamp = formats.date_format(asset.date_created, "SHORT_DATE_FORMAT")

    asset.status = BookmarkAsset.STATUS_COMPLETE
    asset.content_type = BookmarkAsset.CONTENT_TYPE_HTML
    asset.display_name = f"HTML snapshot from {timestamp}"
    asset.file = filename
    asset.gzip = True
    asset.save()

    asset.bookmark.latest_snapshot = asset
    asset.bookmark.date_modified = timezone.now()
    asset.bookmark.save()
def _create_pdf_snapshot(asset: BookmarkAsset):
    """Download the bookmarked PDF and attach it to *asset* as a gzip file.

    The PDF is streamed into a temporary file in the asset folder while its
    size is checked against ``settings.LD_SNAPSHOT_PDF_MAX_SIZE``, then
    compressed into the final ``.pdf.gz`` file. On success the asset is
    marked complete and becomes the bookmark's latest snapshot.

    Args:
        asset: The pending snapshot asset to complete.

    Raises:
        PdfTooLargeError: If the declared ``Content-Length`` or the number of
            bytes actually received exceeds the configured limit.
        requests.RequestException: If the request fails or the server
            responds with an error status.
    """
    url = asset.bookmark.url
    max_size = settings.LD_SNAPSHOT_PDF_MAX_SIZE

    # Download PDF to temporary file
    temp_filename = _generate_asset_filename(asset, url, "tmp")
    temp_filepath = os.path.join(settings.LD_ASSET_FOLDER, temp_filename)

    headers = fake_request_headers()
    timeout = 60

    try:
        with requests.get(
            url, headers=headers, stream=True, timeout=timeout
        ) as response:
            response.raise_for_status()

            # Check Content-Length header if available. A missing or malformed
            # value is treated as unknown; the streaming check below still
            # enforces the limit.
            content_length = response.headers.get("Content-Length")
            try:
                declared_size = int(content_length) if content_length else None
            except ValueError:
                declared_size = None
            if declared_size is not None and declared_size > max_size:
                raise PdfTooLargeError(
                    f"PDF size ({content_length} bytes) exceeds limit ({max_size} bytes)"
                )

            # Download in chunks, tracking size
            downloaded_size = 0
            with open(temp_filepath, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    downloaded_size += len(chunk)
                    if downloaded_size > max_size:
                        raise PdfTooLargeError(
                            f"PDF size exceeds limit ({max_size} bytes)"
                        )
                    f.write(chunk)

        # Store as gzip in asset folder
        filename = _generate_asset_filename(asset, url, "pdf.gz")
        filepath = os.path.join(settings.LD_ASSET_FOLDER, filename)
        with (
            open(temp_filepath, "rb") as temp_file,
            gzip.open(filepath, "wb") as gz_file,
        ):
            shutil.copyfileobj(temp_file, gz_file)
    finally:
        # Always remove the temporary file so that aborted or oversized
        # downloads do not leave partial files in the asset folder.
        try:
            os.remove(temp_filepath)
        except FileNotFoundError:
            # The download failed before anything was written.
            pass

    # Update display name for PDF
    timestamp = formats.date_format(asset.date_created, "SHORT_DATE_FORMAT")

    asset.status = BookmarkAsset.STATUS_COMPLETE
    asset.content_type = BookmarkAsset.CONTENT_TYPE_PDF
    asset.display_name = f"PDF download from {timestamp}"
    asset.file = filename
    asset.gzip = True
    asset.save()

    asset.bookmark.latest_snapshot = asset
    asset.bookmark.date_modified = timezone.now()
    asset.bookmark.save()
def upload_snapshot(bookmark: Bookmark, html: bytes):
asset = create_snapshot_asset(bookmark)
filename = _generate_asset_filename(asset, asset.bookmark.url, "html.gz")
@@ -71,7 +151,11 @@ def upload_snapshot(bookmark: Bookmark, html: bytes):
gz_file.write(html)
# Only save the asset if the file was written successfully
timestamp = formats.date_format(asset.date_created, "SHORT_DATE_FORMAT")
asset.status = BookmarkAsset.STATUS_COMPLETE
asset.content_type = BookmarkAsset.CONTENT_TYPE_HTML
asset.display_name = f"HTML snapshot from {timestamp}"
asset.file = filename
asset.gzip = True
asset.save()

View File

@@ -139,3 +139,42 @@ def fake_request_headers():
"Upgrade-Insecure-Requests": "1",
"User-Agent": DEFAULT_USER_AGENT,
}
def detect_content_type(url: str, timeout: int = 10) -> str | None:
    """Probe *url* and return the media type it serves.

    A HEAD request is tried first. If it raises a request error or does not
    answer with status 200, a streaming GET request is tried instead.

    Args:
        url: The URL to probe.
        timeout: Timeout passed to each request.

    Returns:
        The lower-cased ``Content-Type`` value without parameters such as
        ``charset`` (an empty string if a 200 response has no such header),
        or ``None`` if neither request produced a 200 response.
    """

    def media_type(response) -> str:
        # "text/html; charset=utf-8" -> "text/html"
        raw_value = response.headers.get("Content-Type", "")
        return raw_value.split(";")[0].strip().lower()

    request_headers = fake_request_headers()

    # First attempt: HEAD
    try:
        head_response = requests.head(
            url, headers=request_headers, timeout=timeout, allow_redirects=True
        )
        if head_response.status_code == 200:
            return media_type(head_response)
    except requests.RequestException:
        # Deliberately ignored: fall through to the GET attempt.
        pass

    # Second attempt: GET with stream=True, only the response headers are read here
    try:
        with requests.get(
            url,
            headers=request_headers,
            timeout=timeout,
            stream=True,
            allow_redirects=True,
        ) as get_response:
            if get_response.status_code == 200:
                return media_type(get_response)
    except requests.RequestException:
        # Deliberately ignored: detection failure is reported as None.
        pass

    return None
def is_pdf_content_type(content_type: str | None) -> bool:
"""Check if the content type indicates a PDF."""
if not content_type:
return False
return content_type in ("application/pdf", "application/x-pdf")

View File

@@ -327,6 +327,7 @@ LD_SINGLEFILE_UBLOCK_OPTIONS = os.getenv(
)
LD_SINGLEFILE_OPTIONS = os.getenv("LD_SINGLEFILE_OPTIONS", "")
LD_SINGLEFILE_TIMEOUT_SEC = float(os.getenv("LD_SINGLEFILE_TIMEOUT_SEC", 120))
# Size limit in bytes, overridable via the environment variable of the same name.
LD_SNAPSHOT_PDF_MAX_SIZE = int(os.getenv("LD_SNAPSHOT_PDF_MAX_SIZE", 15728640))  # 15 MiB (15 * 1024 * 1024 bytes)
# Monolith isn't used at the moment, as the local snapshot implementation
# switched to single-file after the prototype. Keeping this around in case

View File

@@ -6,7 +6,7 @@ from pathlib import Path
from unittest import mock
from django.core.files.uploadedfile import SimpleUploadedFile
from django.test import TestCase
from django.test import TestCase, override_settings
from django.utils import timezone
from bookmarks.models import BookmarkAsset
@@ -20,6 +20,8 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
self.get_or_create_test_user()
self.html_content = "<html><body><h1>Hello, World!</h1></body></html>"
self.pdf_content = b"%PDF-1.4 test pdf content"
self.mock_singlefile_create_snapshot_patcher = mock.patch(
"bookmarks.services.singlefile.create_snapshot",
)
@@ -30,8 +32,24 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
Path(filepath).write_text(self.html_content)
)
# Mock detect_content_type to return text/html by default
self.mock_detect_content_type_patcher = mock.patch(
"bookmarks.services.assets.detect_content_type",
)
self.mock_detect_content_type = self.mock_detect_content_type_patcher.start()
self.mock_detect_content_type.return_value = "text/html"
# Mock is_pdf_content_type to return False by default
self.mock_is_pdf_content_type_patcher = mock.patch(
"bookmarks.services.assets.is_pdf_content_type",
)
self.mock_is_pdf_content_type = self.mock_is_pdf_content_type_patcher.start()
self.mock_is_pdf_content_type.return_value = False
def tearDown(self) -> None:
    """Stop the patchers held on this test case."""
    active_patchers = (
        self.mock_singlefile_create_snapshot_patcher,
        self.mock_detect_content_type_patcher,
        self.mock_is_pdf_content_type_patcher,
    )
    for patcher in active_patchers:
        patcher.stop()
def get_saved_snapshot_file(self):
# look up first file in the asset folder
@@ -39,6 +57,20 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
if files:
return files[0]
def create_mock_pdf_response(self, content=None, content_length=None):
    """Build a mock response object that serves a PDF.

    Args:
        content: Bytes returned as the single chunk of ``iter_content``;
            defaults to ``self.pdf_content``.
        content_length: If given, added as a ``Content-Length`` header
            (converted to a string).

    Returns:
        A ``mock.Mock`` with status 200 and a PDF content type that can also
        be used as a context manager yielding itself.
    """
    body = self.pdf_content if content is None else content

    response_headers = {"Content-Type": "application/pdf"}
    if content_length is not None:
        response_headers["Content-Length"] = str(content_length)

    response = mock.Mock()
    response.status_code = 200
    response.headers = response_headers
    response.raise_for_status = mock.Mock()
    response.iter_content = mock.Mock(return_value=[body])
    # Support `with requests.get(...) as response:`
    response.__enter__ = mock.Mock(return_value=response)
    response.__exit__ = mock.Mock(return_value=False)
    return response
def test_create_snapshot_asset(self):
bookmark = self.setup_bookmark()
@@ -47,8 +79,8 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
self.assertIsNotNone(asset)
self.assertEqual(asset.bookmark, bookmark)
self.assertEqual(asset.asset_type, BookmarkAsset.TYPE_SNAPSHOT)
self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_HTML)
self.assertIn("HTML snapshot from", asset.display_name)
self.assertEqual(asset.content_type, "")
self.assertEqual(asset.display_name, "New snapshot")
self.assertEqual(asset.status, BookmarkAsset.STATUS_PENDING)
# asset is not saved to the database
@@ -91,6 +123,8 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
# should update asset status and file
asset.refresh_from_db()
self.assertEqual(asset.status, BookmarkAsset.STATUS_COMPLETE)
self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_HTML)
self.assertIn("HTML snapshot from", asset.display_name)
self.assertEqual(asset.file, expected_filename)
self.assertTrue(asset.gzip)
@@ -127,6 +161,119 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
self.assertTrue(saved_file.startswith("snapshot_"))
self.assertTrue(saved_file.endswith("aaaa.html.gz"))
def test_create_pdf_snapshot(self):
    """A URL detected as PDF is downloaded and stored as a gzip-compressed asset."""
    bookmark = self.setup_bookmark(url="https://example.com/doc.pdf")
    asset = assets.create_snapshot_asset(bookmark)
    asset.save()
    # Fixed creation date so that the generated file name is predictable
    asset.date_created = timezone.datetime(
        2023, 8, 11, 21, 45, 11, tzinfo=datetime.UTC
    )

    self.mock_detect_content_type.return_value = "application/pdf"
    self.mock_is_pdf_content_type.return_value = True

    pdf_response = self.create_mock_pdf_response()
    with mock.patch(
        "bookmarks.services.assets.requests.get", return_value=pdf_response
    ):
        assets.create_snapshot(asset)

    expected_filename = (
        "snapshot_2023-08-11_214511_https___example.com_doc.pdf.pdf.gz"
    )
    expected_filepath = os.path.join(self.assets_dir, expected_filename)

    # should create gzip file in asset folder
    self.assertTrue(os.path.exists(expected_filepath))

    # gzip file should contain the correct content
    with gzip.open(expected_filepath, "rb") as stored_file:
        stored_content = stored_file.read()
    self.assertEqual(stored_content, self.pdf_content)

    # should update asset status and file
    asset.refresh_from_db()
    self.assertEqual(asset.status, BookmarkAsset.STATUS_COMPLETE)
    self.assertEqual(asset.file, expected_filename)
    self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_PDF)
    self.assertIn("PDF download from", asset.display_name)
    self.assertTrue(asset.gzip)

    # should update bookmark
    bookmark.refresh_from_db()
    self.assertEqual(bookmark.latest_snapshot, asset)
def test_create_snapshot_falls_back_to_singlefile_when_detection_fails(self):
    """When content type detection returns None, an HTML snapshot is created."""
    pending_asset = assets.create_snapshot_asset(
        self.setup_bookmark(url="https://example.com")
    )
    pending_asset.save()

    # Detection failed
    self.mock_detect_content_type.return_value = None

    assets.create_snapshot(pending_asset)

    pending_asset.refresh_from_db()
    self.assertEqual(pending_asset.status, BookmarkAsset.STATUS_COMPLETE)
    self.assertEqual(pending_asset.content_type, BookmarkAsset.CONTENT_TYPE_HTML)
    self.mock_singlefile_create_snapshot.assert_called()
@override_settings(LD_SNAPSHOT_PDF_MAX_SIZE=100)
def test_create_pdf_snapshot_fails_when_content_length_exceeds_limit(self):
    """A Content-Length header above the limit fails the snapshot."""
    pending_asset = assets.create_snapshot_asset(
        self.setup_bookmark(url="https://example.com/doc.pdf")
    )
    pending_asset.save()

    self.mock_detect_content_type.return_value = "application/pdf"
    self.mock_is_pdf_content_type.return_value = True

    # Declares 1000 bytes, which exceeds the 100 byte limit
    oversized_response = self.create_mock_pdf_response(content_length=1000)

    with (
        mock.patch(
            "bookmarks.services.assets.requests.get",
            return_value=oversized_response,
        ),
        self.assertRaises(assets.PdfTooLargeError),
    ):
        assets.create_snapshot(pending_asset)

    pending_asset.refresh_from_db()
    self.assertEqual(pending_asset.status, BookmarkAsset.STATUS_FAILURE)
@override_settings(LD_SNAPSHOT_PDF_MAX_SIZE=100)
def test_create_pdf_snapshot_fails_when_download_exceeds_limit(self):
    """Without a Content-Length header, the limit is enforced while streaming."""
    pending_asset = assets.create_snapshot_asset(
        self.setup_bookmark(url="https://example.com/doc.pdf")
    )
    pending_asset.save()

    self.mock_detect_content_type.return_value = "application/pdf"
    self.mock_is_pdf_content_type.return_value = True

    # 150 bytes of content exceed the 100 byte limit; the response carries no
    # Content-Length header, so the failure happens during streaming
    oversized_body = b"x" * 150
    streaming_response = self.create_mock_pdf_response(content=oversized_body)

    with (
        mock.patch(
            "bookmarks.services.assets.requests.get",
            return_value=streaming_response,
        ),
        self.assertRaises(assets.PdfTooLargeError),
    ):
        assets.create_snapshot(pending_asset)

    pending_asset.refresh_from_db()
    self.assertEqual(pending_asset.status, BookmarkAsset.STATUS_FAILURE)
def test_create_pdf_snapshot_failure(self):
    """A failing PDF download re-raises the error and marks the asset as failed."""
    import requests

    pending_asset = assets.create_snapshot_asset(
        self.setup_bookmark(url="https://example.com/doc.pdf")
    )
    pending_asset.save()

    self.mock_detect_content_type.return_value = "application/pdf"
    self.mock_is_pdf_content_type.return_value = True

    download_error = requests.RequestException("Download failed")
    with (
        mock.patch(
            "bookmarks.services.assets.requests.get", side_effect=download_error
        ),
        self.assertRaises(requests.RequestException),
    ):
        assets.create_snapshot(pending_asset)

    pending_asset.refresh_from_db()
    self.assertEqual(pending_asset.status, BookmarkAsset.STATUS_FAILURE)
def test_upload_snapshot(self):
initial_modified = timezone.datetime(2025, 1, 1, 0, 0, 0, tzinfo=datetime.UTC)
bookmark = self.setup_bookmark(

View File

@@ -3,6 +3,7 @@ import os
from django.conf import settings
from django.test import TestCase
from bookmarks.models import BookmarkAsset
from bookmarks.services import bookmarks
from bookmarks.tests.helpers import BookmarkFactoryMixin
@@ -79,3 +80,33 @@ class BookmarkAssetsTestCase(TestCase, BookmarkFactoryMixin):
# Create asset with initial file
asset = self.setup_asset(bookmark=bookmark, file="temp.html.gz")
self.assertEqual(asset.file_size, 4)
def test_download_name_for_html_snapshot(self):
    """An HTML snapshot's download name is its display name plus ``.html``."""
    html_snapshot = self.setup_asset(
        bookmark=self.setup_bookmark(),
        asset_type=BookmarkAsset.TYPE_SNAPSHOT,
        content_type=BookmarkAsset.CONTENT_TYPE_HTML,
        display_name="HTML snapshot from Jan 1, 2025",
    )

    expected_name = "HTML snapshot from Jan 1, 2025.html"
    self.assertEqual(html_snapshot.download_name, expected_name)
def test_download_name_for_pdf_snapshot(self):
    """A PDF snapshot's download name is its display name plus ``.pdf``."""
    pdf_snapshot = self.setup_asset(
        bookmark=self.setup_bookmark(),
        asset_type=BookmarkAsset.TYPE_SNAPSHOT,
        content_type=BookmarkAsset.CONTENT_TYPE_PDF,
        display_name="PDF download from Jan 1, 2025",
    )

    expected_name = "PDF download from Jan 1, 2025.pdf"
    self.assertEqual(pdf_snapshot.download_name, expected_name)
def test_download_name_for_upload(self):
    """An uploaded asset's download name is its display name, unchanged."""
    uploaded_file = self.setup_asset(
        bookmark=self.setup_bookmark(),
        asset_type=BookmarkAsset.TYPE_UPLOAD,
        content_type="text/plain",
        display_name="document.txt",
    )

    expected_name = "document.txt"
    self.assertEqual(uploaded_file.download_name, expected_name)

View File

@@ -482,8 +482,8 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
for asset in assets:
self.assertEqual(asset.bookmark, bookmark)
self.assertEqual(asset.asset_type, BookmarkAsset.TYPE_SNAPSHOT)
self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_HTML)
self.assertIn("HTML snapshot", asset.display_name)
self.assertEqual(asset.content_type, "")
self.assertIn("New snapshot", asset.display_name)
self.assertEqual(asset.status, BookmarkAsset.STATUS_PENDING)
self.mock_assets_create_snapshot.assert_not_called()

View File

@@ -201,3 +201,101 @@ class WebsiteLoaderTestCase(TestCase):
"https://example.com", ignore_cache=True
)
self.assertEqual(mock_load_page.call_count, 2)
class ContentTypeDetectionTestCase(TestCase):
    """Tests for ``detect_content_type`` and ``is_pdf_content_type``."""

    PDF_URL = "https://example.com/doc.pdf"

    @staticmethod
    def _fake_response(status_code=200, content_type=None, as_context_manager=False):
        """Build a mock response; headers are only set when a content type is given."""
        response = mock.Mock()
        response.status_code = status_code
        if content_type is not None:
            response.headers = {"Content-Type": content_type}
        if as_context_manager:
            # requests.get is used in a `with` statement by detect_content_type
            response.__enter__ = mock.Mock(return_value=response)
            response.__exit__ = mock.Mock(return_value=False)
        return response

    def test_detect_content_type_returns_content_type_from_head_request(self):
        head_response = self._fake_response(content_type="application/pdf")

        with mock.patch("requests.head", return_value=head_response) as mock_head:
            detected = website_loader.detect_content_type(self.PDF_URL)

            self.assertEqual(detected, "application/pdf")
            mock_head.assert_called_once()

    def test_detect_content_type_strips_charset(self):
        head_response = self._fake_response(content_type="text/html; charset=utf-8")

        with mock.patch("requests.head", return_value=head_response):
            detected = website_loader.detect_content_type("https://example.com")

            self.assertEqual(detected, "text/html")

    def test_detect_content_type_returns_lowercase(self):
        head_response = self._fake_response(content_type="Application/PDF")

        with mock.patch("requests.head", return_value=head_response):
            detected = website_loader.detect_content_type(self.PDF_URL)

            self.assertEqual(detected, "application/pdf")

    def test_detect_content_type_falls_back_to_get_when_head_fails(self):
        import requests

        get_response = self._fake_response(
            content_type="application/pdf", as_context_manager=True
        )

        with (
            mock.patch(
                "requests.head",
                side_effect=requests.RequestException("HEAD failed"),
            ) as mock_head,
            mock.patch("requests.get", return_value=get_response) as mock_get,
        ):
            detected = website_loader.detect_content_type(self.PDF_URL)

            self.assertEqual(detected, "application/pdf")
            mock_head.assert_called_once()
            mock_get.assert_called_once()

    def test_detect_content_type_returns_none_when_both_head_and_get_fail(self):
        import requests

        with (
            mock.patch(
                "requests.head",
                side_effect=requests.RequestException("HEAD failed"),
            ),
            mock.patch(
                "requests.get",
                side_effect=requests.RequestException("GET failed"),
            ),
        ):
            detected = website_loader.detect_content_type(self.PDF_URL)

            self.assertIsNone(detected)

    def test_detect_content_type_returns_none_for_non_200_status(self):
        head_response = self._fake_response(status_code=404)
        get_response = self._fake_response(status_code=404, as_context_manager=True)

        with (
            mock.patch("requests.head", return_value=head_response),
            mock.patch("requests.get", return_value=get_response),
        ):
            detected = website_loader.detect_content_type(self.PDF_URL)

            self.assertIsNone(detected)

    def test_is_pdf_content_type(self):
        for pdf_type in ("application/pdf", "application/x-pdf"):
            self.assertTrue(website_loader.is_pdf_content_type(pdf_type))

        for other_value in ("text/html", None, ""):
            self.assertFalse(website_loader.is_pdf_content_type(other_value))

View File

@@ -152,11 +152,11 @@ class BookmarkDetailsModalE2ETestCase(LinkdingE2ETestCase):
asset_list = details_modal.locator(".assets")
# No snapshots initially
snapshot = asset_list.get_by_text("HTML snapshot from", exact=False)
snapshot = asset_list.get_by_text("snapshot", exact=False)
expect(snapshot).not_to_be_visible()
# Create snapshot
details_modal.get_by_text("Create HTML snapshot", exact=False).click()
details_modal.get_by_text("snapshot", exact=False).click()
self.assertReloads(0)
# Has new snapshots

View File

@@ -35,6 +35,8 @@ def view(request, asset_id: int):
response["Content-Disposition"] = f'inline; filename="{asset.download_name}"'
if asset.content_type and asset.content_type.startswith("video/"):
response["Content-Security-Policy"] = "default-src 'none'; media-src 'self';"
elif asset.content_type == "application/pdf":
response["Content-Security-Policy"] = "default-src 'none'; object-src 'self';"
else:
response["Content-Security-Policy"] = "sandbox allow-scripts"
return response

View File

@@ -11,6 +11,10 @@ Linkding can automatically create HTML snapshots whenever a bookmark is added. T
The snapshots are created using [singlefile-cli](https://github.com/gildas-lormeau/single-file-cli), which effectively runs a headless Chromium instance on the server to convert the web page into a single HTML file. Linkding will also load the [uBlock Origin Lite extension](https://github.com/uBlockOrigin/uBOL-home) into Chromium to attempt to block ads and other unwanted content.
<!--
When bookmarking a URL that points directly to a PDF file, linkding will download the PDF instead of creating an HTML snapshot. This happens automatically based on the content type of the URL, and the downloaded PDF will be stored as an asset alongside the bookmark, just like HTML snapshots.
-->
This method is fairly easy to set up, but also has several downsides:
- The Docker image is significantly larger than the base image, as it includes a Chromium installation.
- Running Chromium requires significantly more memory, at least 1 GB of RAM.