diff --git a/bookmarks/models.py b/bookmarks/models.py
index 745806b..dfa4614 100644
--- a/bookmarks/models.py
+++ b/bookmarks/models.py
@@ -131,6 +131,7 @@ class BookmarkAsset(models.Model):
TYPE_UPLOAD = "upload"
CONTENT_TYPE_HTML = "text/html"
+ CONTENT_TYPE_PDF = "application/pdf"
STATUS_PENDING = "pending"
STATUS_COMPLETE = "complete"
@@ -148,11 +149,11 @@ class BookmarkAsset(models.Model):
@property
def download_name(self):
- return (
- f"{self.display_name}.html"
- if self.asset_type == BookmarkAsset.TYPE_SNAPSHOT
- else self.display_name
- )
+ # File name reported for this asset: snapshots get an extension derived
+ # from their content type (".pdf" for PDF snapshots, ".html" otherwise);
+ # every other asset type uses its display name unchanged.
+ if self.asset_type == BookmarkAsset.TYPE_SNAPSHOT:
+ if self.content_type == BookmarkAsset.CONTENT_TYPE_PDF:
+ return f"{self.display_name}.pdf"
+ return f"{self.display_name}.html"
+ return self.display_name
def save(self, *args, **kwargs):
if self.file:
diff --git a/bookmarks/services/assets.py b/bookmarks/services/assets.py
index 7c0887d..2f635a8 100644
--- a/bookmarks/services/assets.py
+++ b/bookmarks/services/assets.py
@@ -3,27 +3,35 @@ import logging
import os
import shutil
+import requests
from django.conf import settings
from django.core.files.uploadedfile import UploadedFile
from django.utils import formats, timezone
from bookmarks.models import Bookmark, BookmarkAsset
from bookmarks.services import singlefile
+from bookmarks.services.website_loader import (
+ detect_content_type,
+ fake_request_headers,
+ is_pdf_content_type,
+)
MAX_ASSET_FILENAME_LENGTH = 192
logger = logging.getLogger(__name__)
+class PdfTooLargeError(Exception):
+ """Raised when a PDF snapshot exceeds settings.LD_SNAPSHOT_PDF_MAX_SIZE."""
+ pass
+
+
def create_snapshot_asset(bookmark: Bookmark) -> BookmarkAsset:
+ """Build a pending snapshot asset for the bookmark without saving it.
+
+ The content type is left empty and the display name is a placeholder;
+ both are set by the code that actually produces the snapshot file.
+ """
- date_created = timezone.now()
- timestamp = formats.date_format(date_created, "SHORT_DATE_FORMAT")
asset = BookmarkAsset(
bookmark=bookmark,
asset_type=BookmarkAsset.TYPE_SNAPSHOT,
- date_created=date_created,
- content_type=BookmarkAsset.CONTENT_TYPE_HTML,
- display_name=f"HTML snapshot from {timestamp}",
+ date_created=timezone.now(),
+ content_type="",
+ display_name="New snapshot",
status=BookmarkAsset.STATUS_PENDING,
)
return asset
@@ -31,37 +39,109 @@ def create_snapshot_asset(bookmark: Bookmark) -> BookmarkAsset:
def create_snapshot(asset: BookmarkAsset):
+ """Create the snapshot file for a pending asset.
+
+ Probes the bookmark URL's content type and either downloads the PDF
+ or renders an HTML snapshot. On any exception the asset is saved with
+ STATUS_FAILURE and the exception is re-raised to the caller.
+ """
try:
- # Create snapshot into temporary file
- temp_filename = _generate_asset_filename(asset, asset.bookmark.url, "tmp")
- temp_filepath = os.path.join(settings.LD_ASSET_FOLDER, temp_filename)
- singlefile.create_snapshot(asset.bookmark.url, temp_filepath)
+ url = asset.bookmark.url
+ content_type = detect_content_type(url)
- # Store as gzip in asset folder
- filename = _generate_asset_filename(asset, asset.bookmark.url, "html.gz")
- filepath = os.path.join(settings.LD_ASSET_FOLDER, filename)
- with (
- open(temp_filepath, "rb") as temp_file,
- gzip.open(filepath, "wb") as gz_file,
- ):
- shutil.copyfileobj(temp_file, gz_file)
-
- # Remove temporary file
- os.remove(temp_filepath)
-
- asset.status = BookmarkAsset.STATUS_COMPLETE
- asset.file = filename
- asset.gzip = True
- asset.save()
-
- asset.bookmark.latest_snapshot = asset
- asset.bookmark.date_modified = timezone.now()
- asset.bookmark.save()
+ # A failed detection (None) is not a PDF, so it takes the HTML path
+ if is_pdf_content_type(content_type):
+ _create_pdf_snapshot(asset)
+ else:
+ _create_html_snapshot(asset)
except Exception as error:
asset.status = BookmarkAsset.STATUS_FAILURE
asset.save()
raise error
+def _create_html_snapshot(asset: BookmarkAsset):
+ """Render the bookmark URL with singlefile and store it as a gzipped HTML asset.
+
+ Marks the asset complete, sets its HTML content type and display name,
+ and records it as the bookmark's latest snapshot.
+ """
+ # NOTE(review): if singlefile or the gzip copy raises, the temporary file
+ # is not removed here - confirm whether cleanup happens elsewhere.
+ # Create snapshot into temporary file
+ temp_filename = _generate_asset_filename(asset, asset.bookmark.url, "tmp")
+ temp_filepath = os.path.join(settings.LD_ASSET_FOLDER, temp_filename)
+ singlefile.create_snapshot(asset.bookmark.url, temp_filepath)
+
+ # Store as gzip in asset folder
+ filename = _generate_asset_filename(asset, asset.bookmark.url, "html.gz")
+ filepath = os.path.join(settings.LD_ASSET_FOLDER, filename)
+ with (
+ open(temp_filepath, "rb") as temp_file,
+ gzip.open(filepath, "wb") as gz_file,
+ ):
+ shutil.copyfileobj(temp_file, gz_file)
+
+ # Remove temporary file
+ os.remove(temp_filepath)
+
+ # Update display name for HTML
+ timestamp = formats.date_format(asset.date_created, "SHORT_DATE_FORMAT")
+
+ asset.status = BookmarkAsset.STATUS_COMPLETE
+ asset.content_type = BookmarkAsset.CONTENT_TYPE_HTML
+ asset.display_name = f"HTML snapshot from {timestamp}"
+ asset.file = filename
+ asset.gzip = True
+ asset.save()
+
+ asset.bookmark.latest_snapshot = asset
+ asset.bookmark.date_modified = timezone.now()
+ asset.bookmark.save()
+
+
+def _create_pdf_snapshot(asset: BookmarkAsset):
+    """Download the bookmarked PDF and store it as a gzipped snapshot asset.
+
+    The response is streamed into a temporary file so that the size limit
+    (``settings.LD_SNAPSHOT_PDF_MAX_SIZE``) can be enforced while downloading.
+    The temporary file is always removed, whether the download succeeds or
+    not. On success the asset is marked complete, gets the PDF content type
+    and display name, and becomes the bookmark's latest snapshot.
+
+    Raises:
+        PdfTooLargeError: If the announced or the actually downloaded size
+            exceeds the configured limit.
+        requests.RequestException: If the request fails or the server
+            answers with an HTTP error status.
+    """
+    url = asset.bookmark.url
+    max_size = settings.LD_SNAPSHOT_PDF_MAX_SIZE
+
+    # Download PDF to temporary file
+    temp_filename = _generate_asset_filename(asset, url, "tmp")
+    temp_filepath = os.path.join(settings.LD_ASSET_FOLDER, temp_filename)
+
+    headers = fake_request_headers()
+    timeout = 60
+
+    try:
+        with requests.get(
+            url, headers=headers, stream=True, timeout=timeout
+        ) as response:
+            response.raise_for_status()
+
+            # Content-Length is optional and server-controlled: use it only
+            # for an early rejection and ignore values that are not integers.
+            # The streamed byte count below stays the authoritative check.
+            content_length = response.headers.get("Content-Length")
+            try:
+                declared_size = int(content_length) if content_length else None
+            except ValueError:
+                declared_size = None
+            if declared_size is not None and declared_size > max_size:
+                raise PdfTooLargeError(
+                    f"PDF size ({content_length} bytes) exceeds limit ({max_size} bytes)"
+                )
+
+            # Download in chunks, tracking size
+            downloaded_size = 0
+            with open(temp_filepath, "wb") as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    downloaded_size += len(chunk)
+                    if downloaded_size > max_size:
+                        raise PdfTooLargeError(
+                            f"PDF size exceeds limit ({max_size} bytes)"
+                        )
+                    f.write(chunk)
+
+        # Store as gzip in asset folder
+        filename = _generate_asset_filename(asset, url, "pdf.gz")
+        filepath = os.path.join(settings.LD_ASSET_FOLDER, filename)
+        with (
+            open(temp_filepath, "rb") as temp_file,
+            gzip.open(filepath, "wb") as gz_file,
+        ):
+            shutil.copyfileobj(temp_file, gz_file)
+    finally:
+        # Remove temporary file, also when the download was aborted, so that
+        # failed or oversized downloads do not pile up in the asset folder
+        if os.path.exists(temp_filepath):
+            os.remove(temp_filepath)
+
+    # Update display name for PDF
+    timestamp = formats.date_format(asset.date_created, "SHORT_DATE_FORMAT")
+
+    asset.status = BookmarkAsset.STATUS_COMPLETE
+    asset.content_type = BookmarkAsset.CONTENT_TYPE_PDF
+    asset.display_name = f"PDF download from {timestamp}"
+    asset.file = filename
+    asset.gzip = True
+    asset.save()
+
+    asset.bookmark.latest_snapshot = asset
+    asset.bookmark.date_modified = timezone.now()
+    asset.bookmark.save()
+
+
def upload_snapshot(bookmark: Bookmark, html: bytes):
asset = create_snapshot_asset(bookmark)
filename = _generate_asset_filename(asset, asset.bookmark.url, "html.gz")
@@ -71,7 +151,11 @@ def upload_snapshot(bookmark: Bookmark, html: bytes):
gz_file.write(html)
# Only save the asset if the file was written successfully
+ timestamp = formats.date_format(asset.date_created, "SHORT_DATE_FORMAT")
+
asset.status = BookmarkAsset.STATUS_COMPLETE
+ asset.content_type = BookmarkAsset.CONTENT_TYPE_HTML
+ asset.display_name = f"HTML snapshot from {timestamp}"
asset.file = filename
asset.gzip = True
asset.save()
diff --git a/bookmarks/services/website_loader.py b/bookmarks/services/website_loader.py
index a3e8c36..33cc913 100644
--- a/bookmarks/services/website_loader.py
+++ b/bookmarks/services/website_loader.py
@@ -139,3 +139,42 @@ def fake_request_headers():
"Upgrade-Insecure-Requests": "1",
"User-Agent": DEFAULT_USER_AGENT,
}
+
+
+def _response_content_type(response) -> str:
+    """Return the response's media type, lower-cased and without parameters.
+
+    Yields an empty string when the response has no Content-Type header.
+    """
+    return response.headers.get("Content-Type", "").split(";")[0].strip().lower()
+
+
+def detect_content_type(url: str, timeout: int = 10) -> str | None:
+    """Detect the content type served at *url*.
+
+    Tries a HEAD request first. If that fails, does not answer with status
+    200, or answers without a Content-Type header, a streamed GET request
+    is used instead, reading only its headers.
+
+    Returns:
+        The media type in lower case without parameters such as the charset
+        (for example ``"text/html"``), an empty string if the GET response
+        carries no Content-Type, or ``None`` if neither request succeeded.
+    """
+    headers = fake_request_headers()
+
+    try:
+        response = requests.head(
+            url, headers=headers, timeout=timeout, allow_redirects=True
+        )
+        if response.status_code == 200:
+            content_type = _response_content_type(response)
+            # A HEAD answer without Content-Type tells us nothing, so fall
+            # through to the GET request instead of returning early
+            if content_type:
+                return content_type
+    except requests.RequestException:
+        # Detection is best effort: fall back to GET below
+        pass
+
+    try:
+        with requests.get(
+            url, headers=headers, timeout=timeout, stream=True, allow_redirects=True
+        ) as response:
+            if response.status_code == 200:
+                return _response_content_type(response)
+    except requests.RequestException:
+        # Detection is best effort: report failure as None
+        pass
+
+    return None
+
+
+def is_pdf_content_type(content_type: str | None) -> bool:
+    """Tell whether *content_type* is one of the PDF media types.
+
+    ``None`` and the empty string are never considered a PDF.
+    """
+    pdf_types = ("application/pdf", "application/x-pdf")
+    return bool(content_type) and content_type in pdf_types
diff --git a/bookmarks/settings/base.py b/bookmarks/settings/base.py
index 6759abf..92172ed 100644
--- a/bookmarks/settings/base.py
+++ b/bookmarks/settings/base.py
@@ -327,6 +327,7 @@ LD_SINGLEFILE_UBLOCK_OPTIONS = os.getenv(
)
LD_SINGLEFILE_OPTIONS = os.getenv("LD_SINGLEFILE_OPTIONS", "")
LD_SINGLEFILE_TIMEOUT_SEC = float(os.getenv("LD_SINGLEFILE_TIMEOUT_SEC", 120))
+LD_SNAPSHOT_PDF_MAX_SIZE = int(os.getenv("LD_SNAPSHOT_PDF_MAX_SIZE", 15728640)) # Size limit in bytes for PDF snapshot downloads; default is 15 MiB (15 * 1024 * 1024)
# Monolith isn't used at the moment, as the local snapshot implementation
# switched to single-file after the prototype. Keeping this around in case
diff --git a/bookmarks/tests/test_assets_service.py b/bookmarks/tests/test_assets_service.py
index 8bddd27..848639a 100644
--- a/bookmarks/tests/test_assets_service.py
+++ b/bookmarks/tests/test_assets_service.py
@@ -6,7 +6,7 @@ from pathlib import Path
from unittest import mock
from django.core.files.uploadedfile import SimpleUploadedFile
-from django.test import TestCase
+from django.test import TestCase, override_settings
from django.utils import timezone
from bookmarks.models import BookmarkAsset
@@ -20,6 +20,8 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
self.get_or_create_test_user()
self.html_content = "
Hello, World!
"
+ self.pdf_content = b"%PDF-1.4 test pdf content"
+
self.mock_singlefile_create_snapshot_patcher = mock.patch(
"bookmarks.services.singlefile.create_snapshot",
)
@@ -30,8 +32,24 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
Path(filepath).write_text(self.html_content)
)
+ # Mock detect_content_type to return text/html by default
+ self.mock_detect_content_type_patcher = mock.patch(
+ "bookmarks.services.assets.detect_content_type",
+ )
+ self.mock_detect_content_type = self.mock_detect_content_type_patcher.start()
+ self.mock_detect_content_type.return_value = "text/html"
+
+ # Mock is_pdf_content_type to return False by default
+ self.mock_is_pdf_content_type_patcher = mock.patch(
+ "bookmarks.services.assets.is_pdf_content_type",
+ )
+ self.mock_is_pdf_content_type = self.mock_is_pdf_content_type_patcher.start()
+ self.mock_is_pdf_content_type.return_value = False
+
def tearDown(self) -> None:
+ # Stop the started patchers so the mocks do not leak into other tests
self.mock_singlefile_create_snapshot_patcher.stop()
+ self.mock_detect_content_type_patcher.stop()
+ self.mock_is_pdf_content_type_patcher.stop()
def get_saved_snapshot_file(self):
# look up first file in the asset folder
@@ -39,6 +57,20 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
if files:
return files[0]
+ def create_mock_pdf_response(self, content=None, content_length=None):
+ """Build a mock streamed HTTP response that serves a PDF.
+
+ content defaults to self.pdf_content and is delivered as a single
+ chunk; a Content-Length header is only set when content_length is given.
+ """
+ if content is None:
+ content = self.pdf_content
+ mock_response = mock.Mock()
+ mock_response.status_code = 200
+ mock_response.headers = {"Content-Type": "application/pdf"}
+ if content_length is not None:
+ mock_response.headers["Content-Length"] = str(content_length)
+ mock_response.iter_content = mock.Mock(return_value=[content])
+ mock_response.raise_for_status = mock.Mock()
+ # Allow the mock to be used in a "with" statement
+ mock_response.__enter__ = mock.Mock(return_value=mock_response)
+ mock_response.__exit__ = mock.Mock(return_value=False)
+ return mock_response
+
def test_create_snapshot_asset(self):
bookmark = self.setup_bookmark()
@@ -47,8 +79,8 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
self.assertIsNotNone(asset)
self.assertEqual(asset.bookmark, bookmark)
self.assertEqual(asset.asset_type, BookmarkAsset.TYPE_SNAPSHOT)
- self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_HTML)
- self.assertIn("HTML snapshot from", asset.display_name)
+ self.assertEqual(asset.content_type, "")
+ self.assertEqual(asset.display_name, "New snapshot")
self.assertEqual(asset.status, BookmarkAsset.STATUS_PENDING)
# asset is not saved to the database
@@ -91,6 +123,8 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
# should update asset status and file
asset.refresh_from_db()
self.assertEqual(asset.status, BookmarkAsset.STATUS_COMPLETE)
+ self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_HTML)
+ self.assertIn("HTML snapshot from", asset.display_name)
self.assertEqual(asset.file, expected_filename)
self.assertTrue(asset.gzip)
@@ -127,6 +161,119 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
self.assertTrue(saved_file.startswith("snapshot_"))
self.assertTrue(saved_file.endswith("aaaa.html.gz"))
+ def test_create_pdf_snapshot(self):
+ """A URL detected as PDF is downloaded, gzipped and stored as a PDF asset."""
+ bookmark = self.setup_bookmark(url="https://example.com/doc.pdf")
+ asset = assets.create_snapshot_asset(bookmark)
+ asset.save()
+ asset.date_created = timezone.datetime(
+ 2023, 8, 11, 21, 45, 11, tzinfo=datetime.UTC
+ )
+
+ self.mock_detect_content_type.return_value = "application/pdf"
+ self.mock_is_pdf_content_type.return_value = True
+
+ with mock.patch("bookmarks.services.assets.requests.get") as mock_get:
+ mock_get.return_value = self.create_mock_pdf_response()
+ assets.create_snapshot(asset)
+
+ expected_filename = (
+ "snapshot_2023-08-11_214511_https___example.com_doc.pdf.pdf.gz"
+ )
+ expected_filepath = os.path.join(self.assets_dir, expected_filename)
+
+ # should create gzip file in asset folder
+ self.assertTrue(os.path.exists(expected_filepath))
+
+ # gzip file should contain the correct content
+ with gzip.open(expected_filepath, "rb") as gz_file:
+ self.assertEqual(gz_file.read(), self.pdf_content)
+
+ # should update asset status and file
+ asset.refresh_from_db()
+ self.assertEqual(asset.status, BookmarkAsset.STATUS_COMPLETE)
+ self.assertEqual(asset.file, expected_filename)
+ self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_PDF)
+ self.assertIn("PDF download from", asset.display_name)
+ self.assertTrue(asset.gzip)
+
+ # should update bookmark
+ bookmark.refresh_from_db()
+ self.assertEqual(bookmark.latest_snapshot, asset)
+
+ def test_create_snapshot_falls_back_to_singlefile_when_detection_fails(self):
+ """When content type detection returns None, an HTML snapshot is created."""
+ bookmark = self.setup_bookmark(url="https://example.com")
+ asset = assets.create_snapshot_asset(bookmark)
+ asset.save()
+
+ self.mock_detect_content_type.return_value = None # Detection failed
+
+ assets.create_snapshot(asset)
+
+ asset.refresh_from_db()
+ self.assertEqual(asset.status, BookmarkAsset.STATUS_COMPLETE)
+ self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_HTML)
+ self.mock_singlefile_create_snapshot.assert_called()
+
+ @override_settings(LD_SNAPSHOT_PDF_MAX_SIZE=100)
+ def test_create_pdf_snapshot_fails_when_content_length_exceeds_limit(self):
+ """A Content-Length header above the limit raises PdfTooLargeError and fails the asset."""
+ bookmark = self.setup_bookmark(url="https://example.com/doc.pdf")
+ asset = assets.create_snapshot_asset(bookmark)
+ asset.save()
+
+ self.mock_detect_content_type.return_value = "application/pdf"
+ self.mock_is_pdf_content_type.return_value = True
+
+ with mock.patch("bookmarks.services.assets.requests.get") as mock_get:
+ mock_get.return_value = self.create_mock_pdf_response(
+ content_length=1000 # Exceeds 100 byte limit
+ )
+
+ with self.assertRaises(assets.PdfTooLargeError):
+ assets.create_snapshot(asset)
+
+ asset.refresh_from_db()
+ self.assertEqual(asset.status, BookmarkAsset.STATUS_FAILURE)
+
+ @override_settings(LD_SNAPSHOT_PDF_MAX_SIZE=100)
+ def test_create_pdf_snapshot_fails_when_download_exceeds_limit(self):
+ """Without a Content-Length header, the limit is enforced while streaming the body."""
+ bookmark = self.setup_bookmark(url="https://example.com/doc.pdf")
+ asset = assets.create_snapshot_asset(bookmark)
+ asset.save()
+
+ large_content = b"x" * 150 # Exceeds 100 byte limit
+
+ self.mock_detect_content_type.return_value = "application/pdf"
+ self.mock_is_pdf_content_type.return_value = True
+
+ with mock.patch("bookmarks.services.assets.requests.get") as mock_get:
+ # Response without Content-Length header, will fail during streaming
+ mock_get.return_value = self.create_mock_pdf_response(content=large_content)
+
+ with self.assertRaises(assets.PdfTooLargeError):
+ assets.create_snapshot(asset)
+
+ asset.refresh_from_db()
+ self.assertEqual(asset.status, BookmarkAsset.STATUS_FAILURE)
+
+ def test_create_pdf_snapshot_failure(self):
+ """A request error during the PDF download is re-raised and the asset is marked failed."""
+ bookmark = self.setup_bookmark(url="https://example.com/doc.pdf")
+ asset = assets.create_snapshot_asset(bookmark)
+ asset.save()
+
+ self.mock_detect_content_type.return_value = "application/pdf"
+ self.mock_is_pdf_content_type.return_value = True
+
+ with mock.patch("bookmarks.services.assets.requests.get") as mock_get:
+ import requests
+
+ mock_get.side_effect = requests.RequestException("Download failed")
+
+ with self.assertRaises(requests.RequestException):
+ assets.create_snapshot(asset)
+
+ asset.refresh_from_db()
+ self.assertEqual(asset.status, BookmarkAsset.STATUS_FAILURE)
+
def test_upload_snapshot(self):
initial_modified = timezone.datetime(2025, 1, 1, 0, 0, 0, tzinfo=datetime.UTC)
bookmark = self.setup_bookmark(
diff --git a/bookmarks/tests/test_bookmark_assets.py b/bookmarks/tests/test_bookmark_assets.py
index 87c5119..ce894a0 100644
--- a/bookmarks/tests/test_bookmark_assets.py
+++ b/bookmarks/tests/test_bookmark_assets.py
@@ -3,6 +3,7 @@ import os
from django.conf import settings
from django.test import TestCase
+from bookmarks.models import BookmarkAsset
from bookmarks.services import bookmarks
from bookmarks.tests.helpers import BookmarkFactoryMixin
@@ -79,3 +80,33 @@ class BookmarkAssetsTestCase(TestCase, BookmarkFactoryMixin):
# Create asset with initial file
asset = self.setup_asset(bookmark=bookmark, file="temp.html.gz")
self.assertEqual(asset.file_size, 4)
+
+ def test_download_name_for_html_snapshot(self):
+ """HTML snapshots get ".html" appended to their display name."""
+ bookmark = self.setup_bookmark()
+ asset = self.setup_asset(
+ bookmark=bookmark,
+ asset_type=BookmarkAsset.TYPE_SNAPSHOT,
+ content_type=BookmarkAsset.CONTENT_TYPE_HTML,
+ display_name="HTML snapshot from Jan 1, 2025",
+ )
+ self.assertEqual(asset.download_name, "HTML snapshot from Jan 1, 2025.html")
+
+ def test_download_name_for_pdf_snapshot(self):
+ """PDF snapshots get ".pdf" appended to their display name."""
+ bookmark = self.setup_bookmark()
+ asset = self.setup_asset(
+ bookmark=bookmark,
+ asset_type=BookmarkAsset.TYPE_SNAPSHOT,
+ content_type=BookmarkAsset.CONTENT_TYPE_PDF,
+ display_name="PDF download from Jan 1, 2025",
+ )
+ self.assertEqual(asset.download_name, "PDF download from Jan 1, 2025.pdf")
+
+ def test_download_name_for_upload(self):
+ """Uploaded assets keep their display name unchanged as download name."""
+ bookmark = self.setup_bookmark()
+ asset = self.setup_asset(
+ bookmark=bookmark,
+ asset_type=BookmarkAsset.TYPE_UPLOAD,
+ content_type="text/plain",
+ display_name="document.txt",
+ )
+ self.assertEqual(asset.download_name, "document.txt")
diff --git a/bookmarks/tests/test_bookmarks_tasks.py b/bookmarks/tests/test_bookmarks_tasks.py
index fd77004..2b2ce31 100644
--- a/bookmarks/tests/test_bookmarks_tasks.py
+++ b/bookmarks/tests/test_bookmarks_tasks.py
@@ -482,8 +482,8 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
for asset in assets:
self.assertEqual(asset.bookmark, bookmark)
self.assertEqual(asset.asset_type, BookmarkAsset.TYPE_SNAPSHOT)
- self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_HTML)
- self.assertIn("HTML snapshot", asset.display_name)
+ self.assertEqual(asset.content_type, "")
+ self.assertIn("New snapshot", asset.display_name)
self.assertEqual(asset.status, BookmarkAsset.STATUS_PENDING)
self.mock_assets_create_snapshot.assert_not_called()
diff --git a/bookmarks/tests/test_website_loader.py b/bookmarks/tests/test_website_loader.py
index 5a6f009..c497b84 100644
--- a/bookmarks/tests/test_website_loader.py
+++ b/bookmarks/tests/test_website_loader.py
@@ -201,3 +201,101 @@ class WebsiteLoaderTestCase(TestCase):
"https://example.com", ignore_cache=True
)
self.assertEqual(mock_load_page.call_count, 2)
+
+
+class ContentTypeDetectionTestCase(TestCase):
+ """Tests for website_loader.detect_content_type and is_pdf_content_type.
+
+ requests.head and requests.get are mocked, so no network access happens.
+ """
+
+ def test_detect_content_type_returns_content_type_from_head_request(self):
+ with mock.patch("requests.head") as mock_head:
+ mock_response = mock.Mock()
+ mock_response.status_code = 200
+ mock_response.headers = {"Content-Type": "application/pdf"}
+ mock_head.return_value = mock_response
+
+ result = website_loader.detect_content_type("https://example.com/doc.pdf")
+
+ self.assertEqual(result, "application/pdf")
+ mock_head.assert_called_once()
+
+ def test_detect_content_type_strips_charset(self):
+ with mock.patch("requests.head") as mock_head:
+ mock_response = mock.Mock()
+ mock_response.status_code = 200
+ mock_response.headers = {"Content-Type": "text/html; charset=utf-8"}
+ mock_head.return_value = mock_response
+
+ result = website_loader.detect_content_type("https://example.com")
+
+ self.assertEqual(result, "text/html")
+
+ def test_detect_content_type_returns_lowercase(self):
+ with mock.patch("requests.head") as mock_head:
+ mock_response = mock.Mock()
+ mock_response.status_code = 200
+ mock_response.headers = {"Content-Type": "Application/PDF"}
+ mock_head.return_value = mock_response
+
+ result = website_loader.detect_content_type("https://example.com/doc.pdf")
+
+ self.assertEqual(result, "application/pdf")
+
+ def test_detect_content_type_falls_back_to_get_when_head_fails(self):
+ with (
+ mock.patch("requests.head") as mock_head,
+ mock.patch("requests.get") as mock_get,
+ ):
+ import requests
+
+ mock_head.side_effect = requests.RequestException("HEAD failed")
+
+ mock_response = mock.Mock()
+ mock_response.status_code = 200
+ mock_response.headers = {"Content-Type": "application/pdf"}
+ # The GET response is used as a context manager by the code under test
+ mock_response.__enter__ = mock.Mock(return_value=mock_response)
+ mock_response.__exit__ = mock.Mock(return_value=False)
+ mock_get.return_value = mock_response
+
+ result = website_loader.detect_content_type("https://example.com/doc.pdf")
+
+ self.assertEqual(result, "application/pdf")
+ mock_head.assert_called_once()
+ mock_get.assert_called_once()
+
+ def test_detect_content_type_returns_none_when_both_head_and_get_fail(self):
+ with (
+ mock.patch("requests.head") as mock_head,
+ mock.patch("requests.get") as mock_get,
+ ):
+ import requests
+
+ mock_head.side_effect = requests.RequestException("HEAD failed")
+ mock_get.side_effect = requests.RequestException("GET failed")
+
+ result = website_loader.detect_content_type("https://example.com/doc.pdf")
+
+ self.assertIsNone(result)
+
+ def test_detect_content_type_returns_none_for_non_200_status(self):
+ with (
+ mock.patch("requests.head") as mock_head,
+ mock.patch("requests.get") as mock_get,
+ ):
+ mock_head_response = mock.Mock()
+ mock_head_response.status_code = 404
+ mock_head.return_value = mock_head_response
+
+ mock_get_response = mock.Mock()
+ mock_get_response.status_code = 404
+ mock_get_response.__enter__ = mock.Mock(return_value=mock_get_response)
+ mock_get_response.__exit__ = mock.Mock(return_value=False)
+ mock_get.return_value = mock_get_response
+
+ result = website_loader.detect_content_type("https://example.com/doc.pdf")
+
+ self.assertIsNone(result)
+
+ def test_is_pdf_content_type(self):
+ self.assertTrue(website_loader.is_pdf_content_type("application/pdf"))
+ self.assertTrue(website_loader.is_pdf_content_type("application/x-pdf"))
+ self.assertFalse(website_loader.is_pdf_content_type("text/html"))
+ # Missing or empty content types are never treated as PDF
+ self.assertFalse(website_loader.is_pdf_content_type(None))
+ self.assertFalse(website_loader.is_pdf_content_type(""))
diff --git a/bookmarks/tests_e2e/e2e_test_bookmark_details_modal.py b/bookmarks/tests_e2e/e2e_test_bookmark_details_modal.py
index 3d87efb..fda3be6 100644
--- a/bookmarks/tests_e2e/e2e_test_bookmark_details_modal.py
+++ b/bookmarks/tests_e2e/e2e_test_bookmark_details_modal.py
@@ -152,11 +152,11 @@ class BookmarkDetailsModalE2ETestCase(LinkdingE2ETestCase):
asset_list = details_modal.locator(".assets")
# No snapshots initially
- snapshot = asset_list.get_by_text("HTML snapshot from", exact=False)
+ snapshot = asset_list.get_by_text("snapshot", exact=False)
expect(snapshot).not_to_be_visible()
# Create snapshot
- details_modal.get_by_text("Create HTML snapshot", exact=False).click()
+ details_modal.get_by_text("snapshot", exact=False).click()
self.assertReloads(0)
# Has new snapshots
diff --git a/bookmarks/views/assets.py b/bookmarks/views/assets.py
index 4048875..9176f0e 100644
--- a/bookmarks/views/assets.py
+++ b/bookmarks/views/assets.py
@@ -35,6 +35,8 @@ def view(request, asset_id: int):
response["Content-Disposition"] = f'inline; filename="{asset.download_name}"'
if asset.content_type and asset.content_type.startswith("video/"):
response["Content-Security-Policy"] = "default-src 'none'; media-src 'self';"
+ elif asset.content_type == "application/pdf":
+ response["Content-Security-Policy"] = "default-src 'none'; object-src 'self';"
else:
response["Content-Security-Policy"] = "sandbox allow-scripts"
return response
diff --git a/docs/src/content/docs/archiving.md b/docs/src/content/docs/archiving.md
index 77d2146..0763832 100644
--- a/docs/src/content/docs/archiving.md
+++ b/docs/src/content/docs/archiving.md
@@ -11,6 +11,10 @@ Linkding can automatically create HTML snapshots whenever a bookmark is added. T
The snapshots are created using [singlefile-cli](https://github.com/gildas-lormeau/single-file-cli), which effectively runs a headless Chromium instance on the server to convert the web page into a single HTML file. Linkding will also load the [uBlock Origin Lite extension](https://github.com/uBlockOrigin/uBOL-home) into Chromium to attempt to block ads and other unwanted content.
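+If the bookmarked URL points to a PDF document, linkding downloads the PDF file directly instead of creating an HTML snapshot. PDFs larger than the limit configured with `LD_SNAPSHOT_PDF_MAX_SIZE` (in bytes, 15 MB by default) are not stored.
+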
This method is fairly easy to set up, but also has several downsides:
- The Docker image is significantly larger than the base image, as it includes a Chromium installation.
- Running Chromium requires significantly more memory, at least 1 GB of RAM.