@property
def download_name(self):
    """File name offered when this asset is downloaded.

    Non-snapshot assets keep their display name unchanged. Snapshots get an
    extension appended to the display name: ``.pdf`` when the stored content
    type is ``CONTENT_TYPE_PDF``, ``.html`` for every other content type.
    """
    # Guard clause: only snapshots get an extension appended.
    if self.asset_type != BookmarkAsset.TYPE_SNAPSHOT:
        return self.display_name

    is_pdf = self.content_type == BookmarkAsset.CONTENT_TYPE_PDF
    extension = "pdf" if is_pdf else "html"
    return f"{self.display_name}.{extension}"
def create_snapshot(asset: BookmarkAsset):
    """Create the snapshot file for ``asset`` and mark the asset as complete.

    The bookmark URL's content type is probed first. PDFs are downloaded
    as-is; everything else is rendered to a single HTML file. This includes
    the case where the probe fails and ``detect_content_type`` returns
    ``None``.

    Raises:
        Exception: any error from detection or snapshot creation. Before it
            propagates, the asset is saved with ``STATUS_FAILURE``.
    """
    try:
        url = asset.bookmark.url
        content_type = detect_content_type(url)

        if is_pdf_content_type(content_type):
            _create_pdf_snapshot(asset)
        else:
            _create_html_snapshot(asset)
    except Exception:
        asset.status = BookmarkAsset.STATUS_FAILURE
        asset.save()
        # Bare raise re-raises the active exception without adding this
        # handler as an extra traceback frame.
        raise
def _download_pdf(url: str, target_path: str, max_size: int):
    """Stream ``url`` into ``target_path``, enforcing a ``max_size`` byte limit.

    Raises:
        PdfTooLargeError: the declared or the actually downloaded size
            exceeds ``max_size``.
        requests.RequestException: the request failed or returned an error
            status (via ``raise_for_status``).
    """
    headers = fake_request_headers()
    timeout = 60

    with requests.get(url, headers=headers, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        # Reject early when the server announces an oversized body. The header
        # is advisory only (it may be missing or malformed), so an unparsable
        # value is ignored and the streamed size below stays authoritative.
        content_length = response.headers.get("Content-Length")
        try:
            declared_size = int(content_length) if content_length else None
        except ValueError:
            declared_size = None
        if declared_size is not None and declared_size > max_size:
            raise PdfTooLargeError(
                f"PDF size ({content_length} bytes) exceeds limit ({max_size} bytes)"
            )

        # Download in chunks, tracking size
        downloaded_size = 0
        with open(target_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                downloaded_size += len(chunk)
                if downloaded_size > max_size:
                    raise PdfTooLargeError(f"PDF size exceeds limit ({max_size} bytes)")
                f.write(chunk)


def _create_pdf_snapshot(asset: BookmarkAsset):
    """Download the bookmarked PDF and store it gzip-compressed as the asset file.

    The PDF is streamed into a temporary file inside
    ``settings.LD_ASSET_FOLDER`` and limited to
    ``settings.LD_SNAPSHOT_PDF_MAX_SIZE`` bytes. It is then gzipped into the
    final asset file. On success the asset is saved as complete with the PDF
    content type, and the bookmark's latest snapshot and modification date
    are updated.

    The temporary file is always removed, and a partially written archive is
    deleted if compression fails, so a failed attempt leaves no stray files.

    Raises:
        PdfTooLargeError: the PDF exceeds the configured size limit.
        requests.RequestException: the download failed.
    """
    url = asset.bookmark.url
    max_size = settings.LD_SNAPSHOT_PDF_MAX_SIZE

    # Download PDF to temporary file
    temp_filename = _generate_asset_filename(asset, url, "tmp")
    temp_filepath = os.path.join(settings.LD_ASSET_FOLDER, temp_filename)

    try:
        _download_pdf(url, temp_filepath, max_size)

        # Store as gzip in asset folder
        filename = _generate_asset_filename(asset, url, "pdf.gz")
        filepath = os.path.join(settings.LD_ASSET_FOLDER, filename)
        try:
            with (
                open(temp_filepath, "rb") as temp_file,
                gzip.open(filepath, "wb") as gz_file,
            ):
                shutil.copyfileobj(temp_file, gz_file)
        except Exception:
            # Do not leave a truncated archive behind
            if os.path.exists(filepath):
                os.remove(filepath)
            raise
    finally:
        # Remove temporary file, whether or not the download succeeded
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)

    # Update display name for PDF
    timestamp = formats.date_format(asset.date_created, "SHORT_DATE_FORMAT")

    asset.status = BookmarkAsset.STATUS_COMPLETE
    asset.content_type = BookmarkAsset.CONTENT_TYPE_PDF
    asset.display_name = f"PDF download from {timestamp}"
    asset.file = filename
    asset.gzip = True
    asset.save()

    asset.bookmark.latest_snapshot = asset
    asset.bookmark.date_modified = timezone.now()
    asset.bookmark.save()
Returns None on failure.""" + headers = fake_request_headers() + + try: + response = requests.head( + url, headers=headers, timeout=timeout, allow_redirects=True + ) + if response.status_code == 200: + return ( + response.headers.get("Content-Type", "").split(";")[0].strip().lower() + ) + except requests.RequestException: + pass + + try: + with requests.get( + url, headers=headers, timeout=timeout, stream=True, allow_redirects=True + ) as response: + if response.status_code == 200: + return ( + response.headers.get("Content-Type", "") + .split(";")[0] + .strip() + .lower() + ) + except requests.RequestException: + pass + + return None + + +def is_pdf_content_type(content_type: str | None) -> bool: + """Check if the content type indicates a PDF.""" + if not content_type: + return False + return content_type in ("application/pdf", "application/x-pdf") diff --git a/bookmarks/settings/base.py b/bookmarks/settings/base.py index 6759abf..92172ed 100644 --- a/bookmarks/settings/base.py +++ b/bookmarks/settings/base.py @@ -327,6 +327,7 @@ LD_SINGLEFILE_UBLOCK_OPTIONS = os.getenv( ) LD_SINGLEFILE_OPTIONS = os.getenv("LD_SINGLEFILE_OPTIONS", "") LD_SINGLEFILE_TIMEOUT_SEC = float(os.getenv("LD_SINGLEFILE_TIMEOUT_SEC", 120)) +LD_SNAPSHOT_PDF_MAX_SIZE = int(os.getenv("LD_SNAPSHOT_PDF_MAX_SIZE", 15728640)) # 15MB # Monolith isn't used at the moment, as the local snapshot implementation # switched to single-file after the prototype. 
Keeping this around in case diff --git a/bookmarks/tests/test_assets_service.py b/bookmarks/tests/test_assets_service.py index 8bddd27..848639a 100644 --- a/bookmarks/tests/test_assets_service.py +++ b/bookmarks/tests/test_assets_service.py @@ -6,7 +6,7 @@ from pathlib import Path from unittest import mock from django.core.files.uploadedfile import SimpleUploadedFile -from django.test import TestCase +from django.test import TestCase, override_settings from django.utils import timezone from bookmarks.models import BookmarkAsset @@ -20,6 +20,8 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin): self.get_or_create_test_user() self.html_content = "

Hello, World!

" + self.pdf_content = b"%PDF-1.4 test pdf content" + self.mock_singlefile_create_snapshot_patcher = mock.patch( "bookmarks.services.singlefile.create_snapshot", ) @@ -30,8 +32,24 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin): Path(filepath).write_text(self.html_content) ) + # Mock detect_content_type to return text/html by default + self.mock_detect_content_type_patcher = mock.patch( + "bookmarks.services.assets.detect_content_type", + ) + self.mock_detect_content_type = self.mock_detect_content_type_patcher.start() + self.mock_detect_content_type.return_value = "text/html" + + # Mock is_pdf_content_type to return False by default + self.mock_is_pdf_content_type_patcher = mock.patch( + "bookmarks.services.assets.is_pdf_content_type", + ) + self.mock_is_pdf_content_type = self.mock_is_pdf_content_type_patcher.start() + self.mock_is_pdf_content_type.return_value = False + def tearDown(self) -> None: self.mock_singlefile_create_snapshot_patcher.stop() + self.mock_detect_content_type_patcher.stop() + self.mock_is_pdf_content_type_patcher.stop() def get_saved_snapshot_file(self): # look up first file in the asset folder @@ -39,6 +57,20 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin): if files: return files[0] + def create_mock_pdf_response(self, content=None, content_length=None): + if content is None: + content = self.pdf_content + mock_response = mock.Mock() + mock_response.status_code = 200 + mock_response.headers = {"Content-Type": "application/pdf"} + if content_length is not None: + mock_response.headers["Content-Length"] = str(content_length) + mock_response.iter_content = mock.Mock(return_value=[content]) + mock_response.raise_for_status = mock.Mock() + mock_response.__enter__ = mock.Mock(return_value=mock_response) + mock_response.__exit__ = mock.Mock(return_value=False) + return mock_response + def test_create_snapshot_asset(self): bookmark = self.setup_bookmark() @@ -47,8 +79,8 @@ class AssetServiceTestCase(TestCase, 
def test_create_pdf_snapshot(self):
    """A PDF URL is downloaded, gzipped into the asset folder and linked."""
    bookmark = self.setup_bookmark(url="https://example.com/doc.pdf")
    asset = assets.create_snapshot_asset(bookmark)
    asset.save()
    # Fixed creation date so the generated file name is predictable
    asset.date_created = timezone.datetime(
        2023, 8, 11, 21, 45, 11, tzinfo=datetime.UTC
    )

    # Route create_snapshot into the PDF branch
    self.mock_detect_content_type.return_value = "application/pdf"
    self.mock_is_pdf_content_type.return_value = True

    pdf_response = self.create_mock_pdf_response()
    with mock.patch(
        "bookmarks.services.assets.requests.get", return_value=pdf_response
    ):
        assets.create_snapshot(asset)

    expected_filename = (
        "snapshot_2023-08-11_214511_https___example.com_doc.pdf.pdf.gz"
    )
    expected_filepath = os.path.join(self.assets_dir, expected_filename)

    # should create gzip file in asset folder
    self.assertTrue(os.path.exists(expected_filepath))

    # gzip file should contain the correct content
    with gzip.open(expected_filepath, "rb") as gz_file:
        stored_content = gz_file.read()
    self.assertEqual(stored_content, self.pdf_content)

    # should update asset status and file
    asset.refresh_from_db()
    self.assertEqual(asset.status, BookmarkAsset.STATUS_COMPLETE)
    self.assertEqual(asset.file, expected_filename)
    self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_PDF)
    self.assertIn("PDF download from", asset.display_name)
    self.assertTrue(asset.gzip)

    # should update bookmark
    bookmark.refresh_from_db()
    self.assertEqual(bookmark.latest_snapshot, asset)
def test_create_pdf_snapshot_failure(self):
    """A failing PDF download propagates the error and marks the asset failed."""
    import requests

    bookmark = self.setup_bookmark(url="https://example.com/doc.pdf")
    asset = assets.create_snapshot_asset(bookmark)
    asset.save()

    # Route create_snapshot into the PDF branch
    self.mock_detect_content_type.return_value = "application/pdf"
    self.mock_is_pdf_content_type.return_value = True

    download_error = requests.RequestException("Download failed")
    with (
        mock.patch(
            "bookmarks.services.assets.requests.get", side_effect=download_error
        ),
        self.assertRaises(requests.RequestException),
    ):
        assets.create_snapshot(asset)

    asset.refresh_from_db()
    self.assertEqual(asset.status, BookmarkAsset.STATUS_FAILURE)
def test_download_name_for_html_snapshot(self):
    """HTML snapshots get ``.html`` appended to the display name."""
    display_name = "HTML snapshot from Jan 1, 2025"
    asset = self.setup_asset(
        bookmark=self.setup_bookmark(),
        asset_type=BookmarkAsset.TYPE_SNAPSHOT,
        content_type=BookmarkAsset.CONTENT_TYPE_HTML,
        display_name=display_name,
    )

    self.assertEqual(asset.download_name, "HTML snapshot from Jan 1, 2025.html")


def test_download_name_for_pdf_snapshot(self):
    """PDF snapshots get ``.pdf`` appended to the display name."""
    display_name = "PDF download from Jan 1, 2025"
    asset = self.setup_asset(
        bookmark=self.setup_bookmark(),
        asset_type=BookmarkAsset.TYPE_SNAPSHOT,
        content_type=BookmarkAsset.CONTENT_TYPE_PDF,
        display_name=display_name,
    )

    self.assertEqual(asset.download_name, "PDF download from Jan 1, 2025.pdf")


def test_download_name_for_upload(self):
    """Uploads keep their display name unchanged."""
    display_name = "document.txt"
    asset = self.setup_asset(
        bookmark=self.setup_bookmark(),
        asset_type=BookmarkAsset.TYPE_UPLOAD,
        content_type="text/plain",
        display_name=display_name,
    )

    self.assertEqual(asset.download_name, "document.txt")
class ContentTypeDetectionTestCase(TestCase):
    """Tests for ``detect_content_type`` and ``is_pdf_content_type``."""

    @staticmethod
    def _response(status_code, content_type=None):
        """Build a mock HTTP response.

        The response also works as a context manager, as needed for the
        streamed GET request. The ``Content-Type`` header is only set when
        ``content_type`` is given.
        """
        response = mock.Mock()
        response.status_code = status_code
        if content_type is not None:
            response.headers = {"Content-Type": content_type}
        response.__enter__ = mock.Mock(return_value=response)
        response.__exit__ = mock.Mock(return_value=False)
        return response

    def test_detect_content_type_returns_content_type_from_head_request(self):
        with mock.patch("requests.head") as mock_head:
            mock_head.return_value = self._response(200, "application/pdf")

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertEqual(result, "application/pdf")
            mock_head.assert_called_once()

    def test_detect_content_type_strips_charset(self):
        with mock.patch("requests.head") as mock_head:
            mock_head.return_value = self._response(200, "text/html; charset=utf-8")

            result = website_loader.detect_content_type("https://example.com")

            self.assertEqual(result, "text/html")

    def test_detect_content_type_returns_lowercase(self):
        with mock.patch("requests.head") as mock_head:
            mock_head.return_value = self._response(200, "Application/PDF")

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertEqual(result, "application/pdf")

    def test_detect_content_type_falls_back_to_get_when_head_fails(self):
        import requests

        with (
            mock.patch("requests.head") as mock_head,
            mock.patch("requests.get") as mock_get,
        ):
            mock_head.side_effect = requests.RequestException("HEAD failed")
            mock_get.return_value = self._response(200, "application/pdf")

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertEqual(result, "application/pdf")
            mock_head.assert_called_once()
            mock_get.assert_called_once()

    def test_detect_content_type_returns_none_when_both_head_and_get_fail(self):
        import requests

        with (
            mock.patch("requests.head") as mock_head,
            mock.patch("requests.get") as mock_get,
        ):
            mock_head.side_effect = requests.RequestException("HEAD failed")
            mock_get.side_effect = requests.RequestException("GET failed")

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertIsNone(result)

    def test_detect_content_type_returns_none_for_non_200_status(self):
        with (
            mock.patch("requests.head") as mock_head,
            mock.patch("requests.get") as mock_get,
        ):
            mock_head.return_value = self._response(404)
            mock_get.return_value = self._response(404)

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertIsNone(result)

    def test_is_pdf_content_type(self):
        for pdf_type in ("application/pdf", "application/x-pdf"):
            self.assertTrue(website_loader.is_pdf_content_type(pdf_type))
        for other_type in ("text/html", None, ""):
            self.assertFalse(website_loader.is_pdf_content_type(other_type))
b/bookmarks/tests_e2e/e2e_test_bookmark_details_modal.py @@ -152,11 +152,11 @@ class BookmarkDetailsModalE2ETestCase(LinkdingE2ETestCase): asset_list = details_modal.locator(".assets") # No snapshots initially - snapshot = asset_list.get_by_text("HTML snapshot from", exact=False) + snapshot = asset_list.get_by_text("snapshot", exact=False) expect(snapshot).not_to_be_visible() # Create snapshot - details_modal.get_by_text("Create HTML snapshot", exact=False).click() + details_modal.get_by_text("snapshot", exact=False).click() self.assertReloads(0) # Has new snapshots diff --git a/bookmarks/views/assets.py b/bookmarks/views/assets.py index 4048875..9176f0e 100644 --- a/bookmarks/views/assets.py +++ b/bookmarks/views/assets.py @@ -35,6 +35,8 @@ def view(request, asset_id: int): response["Content-Disposition"] = f'inline; filename="{asset.download_name}"' if asset.content_type and asset.content_type.startswith("video/"): response["Content-Security-Policy"] = "default-src 'none'; media-src 'self';" + elif asset.content_type == "application/pdf": + response["Content-Security-Policy"] = "default-src 'none'; object-src 'self';" else: response["Content-Security-Policy"] = "sandbox allow-scripts" return response diff --git a/docs/src/content/docs/archiving.md b/docs/src/content/docs/archiving.md index 77d2146..0763832 100644 --- a/docs/src/content/docs/archiving.md +++ b/docs/src/content/docs/archiving.md @@ -11,6 +11,10 @@ Linkding can automatically create HTML snapshots whenever a bookmark is added. T The snapshots are created using [singlefile-cli](https://github.com/gildas-lormeau/single-file-cli), which effectively runs a headless Chromium instance on the server to convert the web page into a single HTML file. Linkding will also load the [uBlock Origin Lite extension](https://github.com/uBlockOrigin/uBOL-home) into Chromium to attempt to block ads and other unwanted content. 
+ + This method is fairly easy to set up, but also has several downsides: - The Docker image is significantly larger than the base image, as it includes a Chromium installation. - Running Chromium requires significantly more memory, at least 1 GB of RAM.