mirror of
https://github.com/sissbruecker/linkding.git
synced 2026-02-28 06:53:12 +08:00
Download PDF instead of creating HTML snapshot if URL points at PDF (#1271)
* basic pdf snapshots
* cleanup website_loader tests
* cleanup asset tests
* cleanup asset service tests
* use PDF download as display name
* update new snapshot name
* update docs
* update e2e test
* update test
This commit is contained in:
@@ -131,6 +131,7 @@ class BookmarkAsset(models.Model):
|
||||
TYPE_UPLOAD = "upload"
|
||||
|
||||
CONTENT_TYPE_HTML = "text/html"
|
||||
CONTENT_TYPE_PDF = "application/pdf"
|
||||
|
||||
STATUS_PENDING = "pending"
|
||||
STATUS_COMPLETE = "complete"
|
||||
@@ -148,11 +149,11 @@ class BookmarkAsset(models.Model):
|
||||
|
||||
@property
def download_name(self):
    """Return the file name offered to the browser when this asset is downloaded.

    Snapshot assets only store a human-readable display name, so an extension
    matching the stored content type is appended: ``.pdf`` for PDF snapshots
    and ``.html`` for every other snapshot. Non-snapshot assets return their
    display name unchanged.
    """
    if self.asset_type != BookmarkAsset.TYPE_SNAPSHOT:
        return self.display_name

    # Any snapshot that is not explicitly a PDF is served as HTML.
    extension = (
        "pdf" if self.content_type == BookmarkAsset.CONTENT_TYPE_PDF else "html"
    )
    return f"{self.display_name}.{extension}"
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
if self.file:
|
||||
|
||||
@@ -3,27 +3,35 @@ import logging
|
||||
import os
|
||||
import shutil
|
||||
|
||||
import requests
|
||||
from django.conf import settings
|
||||
from django.core.files.uploadedfile import UploadedFile
|
||||
from django.utils import formats, timezone
|
||||
|
||||
from bookmarks.models import Bookmark, BookmarkAsset
|
||||
from bookmarks.services import singlefile
|
||||
from bookmarks.services.website_loader import (
|
||||
detect_content_type,
|
||||
fake_request_headers,
|
||||
is_pdf_content_type,
|
||||
)
|
||||
|
||||
MAX_ASSET_FILENAME_LENGTH = 192
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PdfTooLargeError(Exception):
    """Raised when a PDF is larger than the configured snapshot size limit."""
|
||||
|
||||
|
||||
def create_snapshot_asset(bookmark: Bookmark) -> BookmarkAsset:
    """Build an unsaved, pending snapshot asset for ``bookmark``.

    The content type and display name are placeholders: whether the snapshot
    ends up as HTML or PDF is only known once ``create_snapshot`` has
    inspected the URL, and the final values are written at that point.

    Args:
        bookmark: The bookmark the snapshot belongs to.

    Returns:
        A ``BookmarkAsset`` in pending state. It is not saved by this function.
    """
    return BookmarkAsset(
        bookmark=bookmark,
        asset_type=BookmarkAsset.TYPE_SNAPSHOT,
        date_created=timezone.now(),
        content_type="",
        display_name="New snapshot",
        status=BookmarkAsset.STATUS_PENDING,
    )
|
||||
@@ -31,37 +39,109 @@ def create_snapshot_asset(bookmark: Bookmark) -> BookmarkAsset:
|
||||
|
||||
def create_snapshot(asset: BookmarkAsset):
    """Create the snapshot file for a pending asset.

    The content type of the bookmark URL is detected first. URLs that serve a
    PDF are downloaded as-is; everything else, including URLs whose content
    type could not be detected, is rendered into an HTML snapshot.

    Args:
        asset: The pending snapshot asset to fill in.

    Raises:
        Exception: Any error from detection, download or rendering is
            re-raised after the asset has been marked as failed and saved.
    """
    try:
        url = asset.bookmark.url
        content_type = detect_content_type(url)

        if is_pdf_content_type(content_type):
            _create_pdf_snapshot(asset)
        else:
            _create_html_snapshot(asset)
    except Exception:
        # Record the failure on the asset, then let the caller see the
        # original exception with its traceback untouched.
        asset.status = BookmarkAsset.STATUS_FAILURE
        asset.save()
        raise
|
||||
|
||||
|
||||
def _create_html_snapshot(asset: BookmarkAsset):
    """Render the bookmark URL with single-file and store it as gzipped HTML.

    The page is first written to a temporary file in the asset folder, then
    compressed into the final ``html.gz`` file. On success the asset is marked
    complete and becomes the bookmark's latest snapshot.

    Args:
        asset: The pending snapshot asset to fill in.

    Raises:
        Exception: Whatever single-file or the file operations raise. The
            temporary file is removed before the error propagates.
    """
    url = asset.bookmark.url

    # Create snapshot into temporary file
    temp_filename = _generate_asset_filename(asset, url, "tmp")
    temp_filepath = os.path.join(settings.LD_ASSET_FOLDER, temp_filename)

    try:
        singlefile.create_snapshot(url, temp_filepath)

        # Store as gzip in asset folder
        filename = _generate_asset_filename(asset, url, "html.gz")
        filepath = os.path.join(settings.LD_ASSET_FOLDER, filename)
        with (
            open(temp_filepath, "rb") as temp_file,
            gzip.open(filepath, "wb") as gz_file,
        ):
            shutil.copyfileobj(temp_file, gz_file)
    finally:
        # Remove the temporary file on failure too, so aborted snapshots do
        # not accumulate in the asset folder.
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)

    # Update display name for HTML
    timestamp = formats.date_format(asset.date_created, "SHORT_DATE_FORMAT")

    asset.status = BookmarkAsset.STATUS_COMPLETE
    asset.content_type = BookmarkAsset.CONTENT_TYPE_HTML
    asset.display_name = f"HTML snapshot from {timestamp}"
    asset.file = filename
    asset.gzip = True
    asset.save()

    asset.bookmark.latest_snapshot = asset
    asset.bookmark.date_modified = timezone.now()
    asset.bookmark.save()
|
||||
|
||||
|
||||
def _create_pdf_snapshot(asset: BookmarkAsset):
    """Download the PDF behind the bookmark URL and store it as a gzipped asset.

    The download is streamed into a temporary file and aborted as soon as it
    exceeds ``settings.LD_SNAPSHOT_PDF_MAX_SIZE``. On success the file is
    compressed into the final ``pdf.gz`` file, the asset is marked complete
    and it becomes the bookmark's latest snapshot.

    Args:
        asset: The pending snapshot asset to fill in.

    Raises:
        PdfTooLargeError: If the announced or the actually downloaded size
            exceeds the configured limit.
        requests.RequestException: If the request fails or the server answers
            with an error status.
    """
    url = asset.bookmark.url
    max_size = settings.LD_SNAPSHOT_PDF_MAX_SIZE

    # Download PDF to temporary file
    temp_filename = _generate_asset_filename(asset, url, "tmp")
    temp_filepath = os.path.join(settings.LD_ASSET_FOLDER, temp_filename)

    headers = fake_request_headers()
    timeout = 60

    try:
        with requests.get(
            url, headers=headers, stream=True, timeout=timeout
        ) as response:
            response.raise_for_status()

            # Check Content-Length header if available. A malformed value is
            # ignored; the streaming check below still enforces the limit.
            content_length = response.headers.get("Content-Length")
            try:
                announced_size = int(content_length) if content_length else None
            except ValueError:
                announced_size = None
            if announced_size is not None and announced_size > max_size:
                raise PdfTooLargeError(
                    f"PDF size ({content_length} bytes) exceeds limit ({max_size} bytes)"
                )

            # Download in chunks, tracking size
            downloaded_size = 0
            with open(temp_filepath, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    downloaded_size += len(chunk)
                    if downloaded_size > max_size:
                        raise PdfTooLargeError(
                            f"PDF size exceeds limit ({max_size} bytes)"
                        )
                    f.write(chunk)

        # Store as gzip in asset folder
        filename = _generate_asset_filename(asset, url, "pdf.gz")
        filepath = os.path.join(settings.LD_ASSET_FOLDER, filename)
        with (
            open(temp_filepath, "rb") as temp_file,
            gzip.open(filepath, "wb") as gz_file,
        ):
            shutil.copyfileobj(temp_file, gz_file)
    finally:
        # Remove the temporary file on failure too, so oversized or aborted
        # downloads do not pile up in the asset folder.
        if os.path.exists(temp_filepath):
            os.remove(temp_filepath)

    # Update display name for PDF
    timestamp = formats.date_format(asset.date_created, "SHORT_DATE_FORMAT")

    asset.status = BookmarkAsset.STATUS_COMPLETE
    asset.content_type = BookmarkAsset.CONTENT_TYPE_PDF
    asset.display_name = f"PDF download from {timestamp}"
    asset.file = filename
    asset.gzip = True
    asset.save()

    asset.bookmark.latest_snapshot = asset
    asset.bookmark.date_modified = timezone.now()
    asset.bookmark.save()
|
||||
|
||||
|
||||
def upload_snapshot(bookmark: Bookmark, html: bytes):
|
||||
asset = create_snapshot_asset(bookmark)
|
||||
filename = _generate_asset_filename(asset, asset.bookmark.url, "html.gz")
|
||||
@@ -71,7 +151,11 @@ def upload_snapshot(bookmark: Bookmark, html: bytes):
|
||||
gz_file.write(html)
|
||||
|
||||
# Only save the asset if the file was written successfully
|
||||
timestamp = formats.date_format(asset.date_created, "SHORT_DATE_FORMAT")
|
||||
|
||||
asset.status = BookmarkAsset.STATUS_COMPLETE
|
||||
asset.content_type = BookmarkAsset.CONTENT_TYPE_HTML
|
||||
asset.display_name = f"HTML snapshot from {timestamp}"
|
||||
asset.file = filename
|
||||
asset.gzip = True
|
||||
asset.save()
|
||||
|
||||
@@ -139,3 +139,42 @@ def fake_request_headers():
|
||||
"Upgrade-Insecure-Requests": "1",
|
||||
"User-Agent": DEFAULT_USER_AGENT,
|
||||
}
|
||||
|
||||
|
||||
def _media_type_from_response(response) -> str | None:
    """Return the lower-cased media type of a 200 response, or None if unavailable."""
    if response.status_code != 200:
        return None
    header_value = response.headers.get("Content-Type", "")
    # Drop parameters such as "; charset=utf-8" and normalise the rest.
    media_type = header_value.split(";")[0].strip().lower()
    return media_type or None


def detect_content_type(url: str, timeout: int = 10) -> str | None:
    """Detect the content type served at ``url``.

    A HEAD request is tried first. If it fails, answers with a non-200 status
    or carries no Content-Type header, a streamed GET request is used as a
    fallback; only its headers are inspected.

    Args:
        url: The URL to probe.
        timeout: Timeout in seconds applied to each request.

    Returns:
        The lower-cased media type without parameters (e.g. ``"text/html"``),
        or None if it could not be determined.
    """
    headers = fake_request_headers()

    try:
        response = requests.head(
            url, headers=headers, timeout=timeout, allow_redirects=True
        )
        content_type = _media_type_from_response(response)
        if content_type:
            return content_type
    except requests.RequestException:
        # Detection is best-effort: fall through to the GET attempt.
        pass

    try:
        with requests.get(
            url, headers=headers, timeout=timeout, stream=True, allow_redirects=True
        ) as response:
            return _media_type_from_response(response)
    except requests.RequestException:
        # Best-effort again: the caller treats None as "unknown".
        pass

    return None
|
||||
|
||||
|
||||
def is_pdf_content_type(content_type: str | None) -> bool:
|
||||
"""Check if the content type indicates a PDF."""
|
||||
if not content_type:
|
||||
return False
|
||||
return content_type in ("application/pdf", "application/x-pdf")
|
||||
|
||||
@@ -327,6 +327,7 @@ LD_SINGLEFILE_UBLOCK_OPTIONS = os.getenv(
|
||||
)
|
||||
LD_SINGLEFILE_OPTIONS = os.getenv("LD_SINGLEFILE_OPTIONS", "")
|
||||
LD_SINGLEFILE_TIMEOUT_SEC = float(os.getenv("LD_SINGLEFILE_TIMEOUT_SEC", 120))
|
||||
LD_SNAPSHOT_PDF_MAX_SIZE = int(os.getenv("LD_SNAPSHOT_PDF_MAX_SIZE", 15728640)) # 15MB
|
||||
|
||||
# Monolith isn't used at the moment, as the local snapshot implementation
|
||||
# switched to single-file after the prototype. Keeping this around in case
|
||||
|
||||
@@ -6,7 +6,7 @@ from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
from django.core.files.uploadedfile import SimpleUploadedFile
|
||||
from django.test import TestCase
|
||||
from django.test import TestCase, override_settings
|
||||
from django.utils import timezone
|
||||
|
||||
from bookmarks.models import BookmarkAsset
|
||||
@@ -20,6 +20,8 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
|
||||
self.get_or_create_test_user()
|
||||
|
||||
self.html_content = "<html><body><h1>Hello, World!</h1></body></html>"
|
||||
self.pdf_content = b"%PDF-1.4 test pdf content"
|
||||
|
||||
self.mock_singlefile_create_snapshot_patcher = mock.patch(
|
||||
"bookmarks.services.singlefile.create_snapshot",
|
||||
)
|
||||
@@ -30,8 +32,24 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
|
||||
Path(filepath).write_text(self.html_content)
|
||||
)
|
||||
|
||||
# Mock detect_content_type to return text/html by default
|
||||
self.mock_detect_content_type_patcher = mock.patch(
|
||||
"bookmarks.services.assets.detect_content_type",
|
||||
)
|
||||
self.mock_detect_content_type = self.mock_detect_content_type_patcher.start()
|
||||
self.mock_detect_content_type.return_value = "text/html"
|
||||
|
||||
# Mock is_pdf_content_type to return False by default
|
||||
self.mock_is_pdf_content_type_patcher = mock.patch(
|
||||
"bookmarks.services.assets.is_pdf_content_type",
|
||||
)
|
||||
self.mock_is_pdf_content_type = self.mock_is_pdf_content_type_patcher.start()
|
||||
self.mock_is_pdf_content_type.return_value = False
|
||||
|
||||
def tearDown(self) -> None:
    """Stop the patchers held on the test case so they do not outlive the test."""
    active_patchers = (
        self.mock_singlefile_create_snapshot_patcher,
        self.mock_detect_content_type_patcher,
        self.mock_is_pdf_content_type_patcher,
    )
    for patcher in active_patchers:
        patcher.stop()
|
||||
|
||||
def get_saved_snapshot_file(self):
|
||||
# look up first file in the asset folder
|
||||
@@ -39,6 +57,20 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
|
||||
if files:
|
||||
return files[0]
|
||||
|
||||
def create_mock_pdf_response(self, content=None, content_length=None):
    """Build a mock of a streamed ``requests`` response that serves a PDF.

    ``content`` defaults to ``self.pdf_content`` and is yielded as a single
    chunk. A Content-Length header is only set when ``content_length`` is
    given. The mock can be used as a context manager.
    """
    body = self.pdf_content if content is None else content

    response_headers = {"Content-Type": "application/pdf"}
    if content_length is not None:
        response_headers["Content-Length"] = str(content_length)

    mock_response = mock.Mock()
    mock_response.status_code = 200
    mock_response.headers = response_headers
    mock_response.iter_content = mock.Mock(return_value=[body])
    mock_response.raise_for_status = mock.Mock()
    # Support `with requests.get(...) as response:` in the code under test.
    mock_response.__enter__ = mock.Mock(return_value=mock_response)
    mock_response.__exit__ = mock.Mock(return_value=False)
    return mock_response
|
||||
|
||||
def test_create_snapshot_asset(self):
|
||||
bookmark = self.setup_bookmark()
|
||||
|
||||
@@ -47,8 +79,8 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
|
||||
self.assertIsNotNone(asset)
|
||||
self.assertEqual(asset.bookmark, bookmark)
|
||||
self.assertEqual(asset.asset_type, BookmarkAsset.TYPE_SNAPSHOT)
|
||||
self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_HTML)
|
||||
self.assertIn("HTML snapshot from", asset.display_name)
|
||||
self.assertEqual(asset.content_type, "")
|
||||
self.assertEqual(asset.display_name, "New snapshot")
|
||||
self.assertEqual(asset.status, BookmarkAsset.STATUS_PENDING)
|
||||
|
||||
# asset is not saved to the database
|
||||
@@ -91,6 +123,8 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
|
||||
# should update asset status and file
|
||||
asset.refresh_from_db()
|
||||
self.assertEqual(asset.status, BookmarkAsset.STATUS_COMPLETE)
|
||||
self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_HTML)
|
||||
self.assertIn("HTML snapshot from", asset.display_name)
|
||||
self.assertEqual(asset.file, expected_filename)
|
||||
self.assertTrue(asset.gzip)
|
||||
|
||||
@@ -127,6 +161,119 @@ class AssetServiceTestCase(TestCase, BookmarkFactoryMixin):
|
||||
self.assertTrue(saved_file.startswith("snapshot_"))
|
||||
self.assertTrue(saved_file.endswith("aaaa.html.gz"))
|
||||
|
||||
def test_create_pdf_snapshot(self):
    """A URL detected as PDF is downloaded and stored as a gzipped PDF asset."""
    bookmark = self.setup_bookmark(url="https://example.com/doc.pdf")
    asset = assets.create_snapshot_asset(bookmark)
    asset.save()
    # Pin the creation date so the generated file name is predictable.
    asset.date_created = timezone.datetime(
        2023, 8, 11, 21, 45, 11, tzinfo=datetime.UTC
    )

    self.mock_detect_content_type.return_value = "application/pdf"
    self.mock_is_pdf_content_type.return_value = True

    pdf_response = self.create_mock_pdf_response()
    with mock.patch(
        "bookmarks.services.assets.requests.get", return_value=pdf_response
    ):
        assets.create_snapshot(asset)

    expected_filename = (
        "snapshot_2023-08-11_214511_https___example.com_doc.pdf.pdf.gz"
    )
    expected_filepath = os.path.join(self.assets_dir, expected_filename)

    # the gzip file exists in the asset folder and holds the PDF bytes
    self.assertTrue(os.path.exists(expected_filepath))
    with gzip.open(expected_filepath, "rb") as stored_file:
        stored_bytes = stored_file.read()
    self.assertEqual(stored_bytes, self.pdf_content)

    # the asset reflects the finished PDF download
    asset.refresh_from_db()
    self.assertEqual(asset.status, BookmarkAsset.STATUS_COMPLETE)
    self.assertEqual(asset.file, expected_filename)
    self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_PDF)
    self.assertIn("PDF download from", asset.display_name)
    self.assertTrue(asset.gzip)

    # the bookmark points at the new snapshot
    bookmark.refresh_from_db()
    self.assertEqual(bookmark.latest_snapshot, asset)
|
||||
|
||||
def test_create_snapshot_falls_back_to_singlefile_when_detection_fails(self):
    """An undetectable content type results in a regular HTML snapshot."""
    # Detection failed
    self.mock_detect_content_type.return_value = None

    bookmark = self.setup_bookmark(url="https://example.com")
    asset = assets.create_snapshot_asset(bookmark)
    asset.save()

    assets.create_snapshot(asset)
    asset.refresh_from_db()

    self.assertEqual(asset.status, BookmarkAsset.STATUS_COMPLETE)
    self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_HTML)
    self.mock_singlefile_create_snapshot.assert_called()
|
||||
|
||||
@override_settings(LD_SNAPSHOT_PDF_MAX_SIZE=100)
def test_create_pdf_snapshot_fails_when_content_length_exceeds_limit(self):
    """A Content-Length above the limit aborts the snapshot and marks it failed."""
    bookmark = self.setup_bookmark(url="https://example.com/doc.pdf")
    asset = assets.create_snapshot_asset(bookmark)
    asset.save()

    self.mock_detect_content_type.return_value = "application/pdf"
    self.mock_is_pdf_content_type.return_value = True

    # Exceeds 100 byte limit
    oversized_response = self.create_mock_pdf_response(content_length=1000)
    with mock.patch(
        "bookmarks.services.assets.requests.get", return_value=oversized_response
    ):
        with self.assertRaises(assets.PdfTooLargeError):
            assets.create_snapshot(asset)

    asset.refresh_from_db()
    self.assertEqual(asset.status, BookmarkAsset.STATUS_FAILURE)
|
||||
|
||||
@override_settings(LD_SNAPSHOT_PDF_MAX_SIZE=100)
def test_create_pdf_snapshot_fails_when_download_exceeds_limit(self):
    """A body that outgrows the limit while streaming marks the snapshot failed."""
    bookmark = self.setup_bookmark(url="https://example.com/doc.pdf")
    asset = assets.create_snapshot_asset(bookmark)
    asset.save()

    self.mock_detect_content_type.return_value = "application/pdf"
    self.mock_is_pdf_content_type.return_value = True

    # Exceeds 100 byte limit
    large_content = b"x" * 150
    # Response without Content-Length header, will fail during streaming
    streamed_response = self.create_mock_pdf_response(content=large_content)

    with mock.patch(
        "bookmarks.services.assets.requests.get", return_value=streamed_response
    ):
        with self.assertRaises(assets.PdfTooLargeError):
            assets.create_snapshot(asset)

    asset.refresh_from_db()
    self.assertEqual(asset.status, BookmarkAsset.STATUS_FAILURE)
|
||||
|
||||
def test_create_pdf_snapshot_failure(self):
    """A failing PDF download propagates the error and marks the asset failed."""
    import requests

    bookmark = self.setup_bookmark(url="https://example.com/doc.pdf")
    asset = assets.create_snapshot_asset(bookmark)
    asset.save()

    self.mock_detect_content_type.return_value = "application/pdf"
    self.mock_is_pdf_content_type.return_value = True

    download_error = requests.RequestException("Download failed")
    with mock.patch(
        "bookmarks.services.assets.requests.get", side_effect=download_error
    ):
        with self.assertRaises(requests.RequestException):
            assets.create_snapshot(asset)

    asset.refresh_from_db()
    self.assertEqual(asset.status, BookmarkAsset.STATUS_FAILURE)
|
||||
|
||||
def test_upload_snapshot(self):
|
||||
initial_modified = timezone.datetime(2025, 1, 1, 0, 0, 0, tzinfo=datetime.UTC)
|
||||
bookmark = self.setup_bookmark(
|
||||
|
||||
@@ -3,6 +3,7 @@ import os
|
||||
from django.conf import settings
|
||||
from django.test import TestCase
|
||||
|
||||
from bookmarks.models import BookmarkAsset
|
||||
from bookmarks.services import bookmarks
|
||||
from bookmarks.tests.helpers import BookmarkFactoryMixin
|
||||
|
||||
@@ -79,3 +80,33 @@ class BookmarkAssetsTestCase(TestCase, BookmarkFactoryMixin):
|
||||
# Create asset with initial file
|
||||
asset = self.setup_asset(bookmark=bookmark, file="temp.html.gz")
|
||||
self.assertEqual(asset.file_size, 4)
|
||||
|
||||
def test_download_name_for_html_snapshot(self):
    """HTML snapshots get an ``.html`` extension appended to the display name."""
    display_name = "HTML snapshot from Jan 1, 2025"
    html_asset = self.setup_asset(
        bookmark=self.setup_bookmark(),
        asset_type=BookmarkAsset.TYPE_SNAPSHOT,
        content_type=BookmarkAsset.CONTENT_TYPE_HTML,
        display_name=display_name,
    )

    self.assertEqual(
        html_asset.download_name, "HTML snapshot from Jan 1, 2025.html"
    )
|
||||
|
||||
def test_download_name_for_pdf_snapshot(self):
    """PDF snapshots get a ``.pdf`` extension appended to the display name."""
    display_name = "PDF download from Jan 1, 2025"
    pdf_asset = self.setup_asset(
        bookmark=self.setup_bookmark(),
        asset_type=BookmarkAsset.TYPE_SNAPSHOT,
        content_type=BookmarkAsset.CONTENT_TYPE_PDF,
        display_name=display_name,
    )

    self.assertEqual(pdf_asset.download_name, "PDF download from Jan 1, 2025.pdf")
|
||||
|
||||
def test_download_name_for_upload(self):
    """Uploaded assets keep their display name as the download name."""
    display_name = "document.txt"
    uploaded_asset = self.setup_asset(
        bookmark=self.setup_bookmark(),
        asset_type=BookmarkAsset.TYPE_UPLOAD,
        content_type="text/plain",
        display_name=display_name,
    )

    self.assertEqual(uploaded_asset.download_name, "document.txt")
|
||||
|
||||
@@ -482,8 +482,8 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
|
||||
for asset in assets:
|
||||
self.assertEqual(asset.bookmark, bookmark)
|
||||
self.assertEqual(asset.asset_type, BookmarkAsset.TYPE_SNAPSHOT)
|
||||
self.assertEqual(asset.content_type, BookmarkAsset.CONTENT_TYPE_HTML)
|
||||
self.assertIn("HTML snapshot", asset.display_name)
|
||||
self.assertEqual(asset.content_type, "")
|
||||
self.assertIn("New snapshot", asset.display_name)
|
||||
self.assertEqual(asset.status, BookmarkAsset.STATUS_PENDING)
|
||||
|
||||
self.mock_assets_create_snapshot.assert_not_called()
|
||||
|
||||
@@ -201,3 +201,101 @@ class WebsiteLoaderTestCase(TestCase):
|
||||
"https://example.com", ignore_cache=True
|
||||
)
|
||||
self.assertEqual(mock_load_page.call_count, 2)
|
||||
|
||||
|
||||
class ContentTypeDetectionTestCase(TestCase):
    """Tests for ``detect_content_type`` and ``is_pdf_content_type``."""

    @staticmethod
    def _fake_response(status_code, content_type=None, context_manager=False):
        """Build a mocked response; headers are only set if a type is given."""
        response = mock.Mock()
        response.status_code = status_code
        if content_type is not None:
            response.headers = {"Content-Type": content_type}
        if context_manager:
            # The GET fallback uses the response in a `with` statement.
            response.__enter__ = mock.Mock(return_value=response)
            response.__exit__ = mock.Mock(return_value=False)
        return response

    def test_detect_content_type_returns_content_type_from_head_request(self):
        with mock.patch("requests.head") as mock_head:
            mock_head.return_value = self._fake_response(200, "application/pdf")

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertEqual(result, "application/pdf")
            mock_head.assert_called_once()

    def test_detect_content_type_strips_charset(self):
        with mock.patch("requests.head") as mock_head:
            mock_head.return_value = self._fake_response(
                200, "text/html; charset=utf-8"
            )

            result = website_loader.detect_content_type("https://example.com")

            self.assertEqual(result, "text/html")

    def test_detect_content_type_returns_lowercase(self):
        with mock.patch("requests.head") as mock_head:
            mock_head.return_value = self._fake_response(200, "Application/PDF")

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertEqual(result, "application/pdf")

    def test_detect_content_type_falls_back_to_get_when_head_fails(self):
        import requests

        with (
            mock.patch("requests.head") as mock_head,
            mock.patch("requests.get") as mock_get,
        ):
            mock_head.side_effect = requests.RequestException("HEAD failed")
            mock_get.return_value = self._fake_response(
                200, "application/pdf", context_manager=True
            )

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertEqual(result, "application/pdf")
            mock_head.assert_called_once()
            mock_get.assert_called_once()

    def test_detect_content_type_returns_none_when_both_head_and_get_fail(self):
        import requests

        with (
            mock.patch("requests.head") as mock_head,
            mock.patch("requests.get") as mock_get,
        ):
            mock_head.side_effect = requests.RequestException("HEAD failed")
            mock_get.side_effect = requests.RequestException("GET failed")

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertIsNone(result)

    def test_detect_content_type_returns_none_for_non_200_status(self):
        with (
            mock.patch("requests.head") as mock_head,
            mock.patch("requests.get") as mock_get,
        ):
            mock_head.return_value = self._fake_response(404)
            mock_get.return_value = self._fake_response(404, context_manager=True)

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertIsNone(result)

    def test_is_pdf_content_type(self):
        for pdf_type in ("application/pdf", "application/x-pdf"):
            self.assertTrue(website_loader.is_pdf_content_type(pdf_type))
        for other_type in ("text/html", None, ""):
            self.assertFalse(website_loader.is_pdf_content_type(other_type))
|
||||
|
||||
@@ -152,11 +152,11 @@ class BookmarkDetailsModalE2ETestCase(LinkdingE2ETestCase):
|
||||
asset_list = details_modal.locator(".assets")
|
||||
|
||||
# No snapshots initially
|
||||
snapshot = asset_list.get_by_text("HTML snapshot from", exact=False)
|
||||
snapshot = asset_list.get_by_text("snapshot", exact=False)
|
||||
expect(snapshot).not_to_be_visible()
|
||||
|
||||
# Create snapshot
|
||||
details_modal.get_by_text("Create HTML snapshot", exact=False).click()
|
||||
details_modal.get_by_text("snapshot", exact=False).click()
|
||||
self.assertReloads(0)
|
||||
|
||||
# Has new snapshots
|
||||
|
||||
@@ -35,6 +35,8 @@ def view(request, asset_id: int):
|
||||
response["Content-Disposition"] = f'inline; filename="{asset.download_name}"'
|
||||
if asset.content_type and asset.content_type.startswith("video/"):
|
||||
response["Content-Security-Policy"] = "default-src 'none'; media-src 'self';"
|
||||
elif asset.content_type == "application/pdf":
|
||||
response["Content-Security-Policy"] = "default-src 'none'; object-src 'self';"
|
||||
else:
|
||||
response["Content-Security-Policy"] = "sandbox allow-scripts"
|
||||
return response
|
||||
|
||||
@@ -11,6 +11,10 @@ Linkding can automatically create HTML snapshots whenever a bookmark is added. T
|
||||
|
||||
The snapshots are created using [singlefile-cli](https://github.com/gildas-lormeau/single-file-cli), which effectively runs a headless Chromium instance on the server to convert the web page into a single HTML file. Linkding will also load the [uBlock Origin Lite extension](https://github.com/uBlockOrigin/uBOL-home) into Chromium to attempt to block ads and other unwanted content.
|
||||
|
||||
<!--
|
||||
When bookmarking a URL that points directly to a PDF file, linkding will download the PDF instead of creating an HTML snapshot. This happens automatically based on the content type of the URL, and the downloaded PDF will be stored as an asset alongside the bookmark, just like HTML snapshots.
|
||||
-->
|
||||
|
||||
This method is fairly easy to set up, but also has several downsides:
|
||||
- The Docker image is significantly larger than the base image, as it includes a Chromium installation.
|
||||
- Running Chromium requires significantly more memory, at least 1 GB of RAM.
|
||||
|
||||
Reference in New Issue
Block a user