Files
linkding/bookmarks/tests/test_website_loader.py
Sascha Ißbrücker 7333b283cf Download PDF instead of creating HTML snapshot if URL points at PDF (#1271)
* basic pdf snapshots

* cleanup website_loader tests

* cleanup asset tests

* cleanup asset service tests

* use PDF download as display name

* update new snapshot name

* update docs

* update e2e test

* update test
2026-01-06 10:29:31 +01:00

302 lines
12 KiB
Python

from unittest import mock
from django.test import TestCase
from bookmarks.services import website_loader
class MockStreamingResponse:
    """Minimal stand-in for a streamed HTTP response used by the loader tests.

    The instance holds a list of byte chunks, hands that list out from
    ``iter_content`` and can be used as a context manager.

    Args:
        num_chunks: Number of filler chunks to generate.
        chunk_size: Length in bytes of every filler chunk; each chunk
            consists solely of ASCII ``0`` characters.
        insert_head_after_chunk: Zero-based index of the filler chunk after
            which an extra ``</head>`` chunk is appended. ``None`` (the
            default) inserts no such chunk.
    """

    def __init__(self, num_chunks, chunk_size, insert_head_after_chunk=None):
        filler = b"0" * chunk_size
        self.chunks = []
        for position in range(num_chunks):
            self.chunks.append(filler)
            if position == insert_head_after_chunk:
                self.chunks.append(b"</head>")

    def iter_content(self, **kwargs):
        """Return the prepared chunk list; keyword arguments are ignored."""
        return self.chunks

    def __enter__(self):
        """Enter the context and return this response."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Leave the context; nothing to release, exceptions propagate."""
        return None
class WebsiteLoaderTestCase(TestCase):
    """Tests for ``website_loader.load_page`` and ``load_website_metadata``."""

    def setUp(self):
        # Start each test with an empty metadata cache.
        website_loader._load_website_metadata_cached.cache_clear()

    def render_html_document(
        self, title, description="", og_description="", og_image=""
    ):
        """Return a small HTML page carrying the given metadata.

        The ``<title>`` element is always emitted. Each of the three meta
        tags is emitted only when its argument is non-empty; otherwise an
        empty line takes its place.
        """
        meta_description = ""
        if description:
            meta_description = f'<meta name="description" content="{description}">'

        meta_og_description = ""
        if og_description:
            meta_og_description = (
                f'<meta property="og:description" content="{og_description}">'
            )

        meta_og_image = ""
        if og_image:
            meta_og_image = f'<meta property="og:image" content="{og_image}">'

        # The template lines are kept flush-left on purpose so that the
        # generated text stays exactly as it was.
        return f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{title}</title>
{meta_description}
{meta_og_description}
{meta_og_image}
</head>
<body></body>
</html>
"""

    def test_load_page_returns_content(self):
        """Ten 1 KiB chunks yield content of 10 * 1024 characters."""
        response = MockStreamingResponse(num_chunks=10, chunk_size=1024)
        with mock.patch("requests.get", return_value=response):
            content = website_loader.load_page("https://example.com")

        expected_content_size = 10 * 1024
        self.assertEqual(expected_content_size, len(content))

    def test_load_page_limits_large_documents(self):
        """Reading stops once the accumulated content exceeds the size limit."""
        response = MockStreamingResponse(num_chunks=10, chunk_size=1024 * 1000)
        with mock.patch("requests.get", return_value=response):
            content = website_loader.load_page("https://example.com")

        # Should have read six chunks, after which content exceeds the max of 5MB
        expected_content_size = 6 * 1024 * 1000
        self.assertEqual(expected_content_size, len(content))

    def test_load_page_stops_reading_at_end_of_head(self):
        """Reading stops with the chunk that contains the closing head tag."""
        response = MockStreamingResponse(
            num_chunks=10, chunk_size=1024 * 1000, insert_head_after_chunk=0
        )
        with mock.patch("requests.get", return_value=response):
            content = website_loader.load_page("https://example.com")

        # Should have read first chunk, and second chunk containing closing head tag
        expected_content_size = 1 * 1024 * 1000 + len("</head>")
        self.assertEqual(expected_content_size, len(content))

    def test_load_page_removes_bytes_after_end_of_head(self):
        """Bytes following ``</head>`` are dropped before decoding."""
        # a single byte that can't be decoded to utf-8
        undecodable_byte = b"\xff"
        response = MockStreamingResponse(num_chunks=1, chunk_size=0)
        response.chunks[0] = "<head>人</head>".encode() + undecodable_byte

        with mock.patch("requests.get", return_value=response):
            content = website_loader.load_page("https://example.com")

        # verify that byte after head was removed, content parsed as utf-8
        self.assertEqual(content, "<head>人</head>")

    def test_load_website_metadata(self):
        """Title and description are extracted; no preview image is reported."""
        html = self.render_html_document("test title", "test description")
        with mock.patch(
            "bookmarks.services.website_loader.load_page", return_value=html
        ):
            metadata = website_loader.load_website_metadata("https://example.com")

        self.assertEqual("test title", metadata.title)
        self.assertEqual("test description", metadata.description)
        self.assertIsNone(metadata.preview_image)

    def test_load_website_metadata_trims_title_and_description(self):
        """Surrounding whitespace is removed from title and description."""
        html = self.render_html_document(" test title ", " test description ")
        with mock.patch(
            "bookmarks.services.website_loader.load_page", return_value=html
        ):
            metadata = website_loader.load_website_metadata("https://example.com")

        self.assertEqual("test title", metadata.title)
        self.assertEqual("test description", metadata.description)

    def test_load_website_metadata_using_og_description(self):
        """The og:description is used when no plain description exists."""
        html = self.render_html_document(
            "test title", "", og_description="test og description"
        )
        with mock.patch(
            "bookmarks.services.website_loader.load_page", return_value=html
        ):
            metadata = website_loader.load_website_metadata("https://example.com")

        self.assertEqual("test title", metadata.title)
        self.assertEqual("test og description", metadata.description)

    def test_load_website_metadata_using_og_image(self):
        """An absolute og:image URL is returned as the preview image."""
        html = self.render_html_document(
            "test title", og_image="http://example.com/image.jpg"
        )
        with mock.patch(
            "bookmarks.services.website_loader.load_page", return_value=html
        ):
            metadata = website_loader.load_website_metadata("https://example.com")

        self.assertEqual("http://example.com/image.jpg", metadata.preview_image)

    def test_load_website_metadata_gets_absolute_og_image_path_when_path_starts_with_dots(
        self,
    ):
        """A ``../`` og:image path is resolved against the page URL."""
        html = self.render_html_document("test title", og_image="../image.jpg")
        with mock.patch(
            "bookmarks.services.website_loader.load_page", return_value=html
        ):
            metadata = website_loader.load_website_metadata(
                "https://example.com/a/b/page.html"
            )

        self.assertEqual("https://example.com/a/image.jpg", metadata.preview_image)

    def test_load_website_metadata_gets_absolute_og_image_path_when_path_starts_with_slash(
        self,
    ):
        """A root-relative og:image path is resolved against the page's host."""
        html = self.render_html_document("test title", og_image="/image.jpg")
        with mock.patch(
            "bookmarks.services.website_loader.load_page", return_value=html
        ):
            metadata = website_loader.load_website_metadata(
                "https://example.com/a/b/page.html"
            )

        self.assertEqual("https://example.com/image.jpg", metadata.preview_image)

    def test_load_website_metadata_prefers_description_over_og_description(self):
        """The plain description wins when both descriptions are present."""
        html = self.render_html_document(
            "test title", "test description", og_description="test og description"
        )
        with mock.patch(
            "bookmarks.services.website_loader.load_page", return_value=html
        ):
            metadata = website_loader.load_website_metadata("https://example.com")

        self.assertEqual("test title", metadata.title)
        self.assertEqual("test description", metadata.description)

    def test_website_metadata_ignore_cache(self):
        """``ignore_cache=True`` triggers a fresh page load for a cached URL."""
        expected_html = (
            "<html><head><title>Test Title</title>"
            '<meta name="description" content="Test Description">'
            '<meta property="og:image" content="/images/test.jpg">'
            "</head></html>"
        )
        url = "https://example.com"

        with mock.patch.object(
            website_loader, "load_page", return_value=expected_html
        ) as mock_load_page:
            # First lookup loads the page.
            website_loader.load_website_metadata(url)
            mock_load_page.assert_called_once()

            # Second lookup for the same URL must not load the page again.
            website_loader.load_website_metadata(url)
            mock_load_page.assert_called_once()

            # Bypassing the cache loads the page a second time.
            website_loader.load_website_metadata(url, ignore_cache=True)
            self.assertEqual(mock_load_page.call_count, 2)
class ContentTypeDetectionTestCase(TestCase):
    """Tests for ``website_loader.detect_content_type`` and ``is_pdf_content_type``."""

    @staticmethod
    def _fake_response(status_code, content_type=None, context_manager=False):
        """Build a ``mock.Mock`` posing as an HTTP response.

        Args:
            status_code: Value assigned to ``status_code``.
            content_type: When given, ``headers`` is set to a dict holding
                this value under ``Content-Type``; otherwise ``headers`` is
                left as an auto-created mock attribute.
            context_manager: When true, ``__enter__`` returns the response
                itself and ``__exit__`` returns ``False``.
        """
        response = mock.Mock()
        response.status_code = status_code
        if content_type is not None:
            response.headers = {"Content-Type": content_type}
        if context_manager:
            response.__enter__ = mock.Mock(return_value=response)
            response.__exit__ = mock.Mock(return_value=False)
        return response

    def test_detect_content_type_returns_content_type_from_head_request(self):
        """The content type of a successful HEAD response is returned."""
        head_response = self._fake_response(200, "application/pdf")
        with mock.patch("requests.head", return_value=head_response) as mock_head:
            result = website_loader.detect_content_type("https://example.com/doc.pdf")

        self.assertEqual(result, "application/pdf")
        mock_head.assert_called_once()

    def test_detect_content_type_strips_charset(self):
        """Parameters such as ``charset`` are removed from the content type."""
        head_response = self._fake_response(200, "text/html; charset=utf-8")
        with mock.patch("requests.head", return_value=head_response):
            result = website_loader.detect_content_type("https://example.com")

        self.assertEqual(result, "text/html")

    def test_detect_content_type_returns_lowercase(self):
        """A mixed-case content type is returned in lower case."""
        head_response = self._fake_response(200, "Application/PDF")
        with mock.patch("requests.head", return_value=head_response):
            result = website_loader.detect_content_type("https://example.com/doc.pdf")

        self.assertEqual(result, "application/pdf")

    def test_detect_content_type_falls_back_to_get_when_head_fails(self):
        """A GET request is issued when the HEAD request raises."""
        import requests

        head_error = requests.RequestException("HEAD failed")
        get_response = self._fake_response(
            200, "application/pdf", context_manager=True
        )
        with (
            mock.patch("requests.head", side_effect=head_error) as mock_head,
            mock.patch("requests.get", return_value=get_response) as mock_get,
        ):
            result = website_loader.detect_content_type("https://example.com/doc.pdf")

        self.assertEqual(result, "application/pdf")
        mock_head.assert_called_once()
        mock_get.assert_called_once()

    def test_detect_content_type_returns_none_when_both_head_and_get_fail(self):
        """``None`` is returned when HEAD and GET both raise."""
        import requests

        head_error = requests.RequestException("HEAD failed")
        get_error = requests.RequestException("GET failed")
        with (
            mock.patch("requests.head", side_effect=head_error),
            mock.patch("requests.get", side_effect=get_error),
        ):
            result = website_loader.detect_content_type("https://example.com/doc.pdf")

        self.assertIsNone(result)

    def test_detect_content_type_returns_none_for_non_200_status(self):
        """``None`` is returned when HEAD and GET both answer with 404."""
        head_response = self._fake_response(404)
        get_response = self._fake_response(404, context_manager=True)
        with (
            mock.patch("requests.head", return_value=head_response),
            mock.patch("requests.get", return_value=get_response),
        ):
            result = website_loader.detect_content_type("https://example.com/doc.pdf")

        self.assertIsNone(result)

    def test_is_pdf_content_type(self):
        """Only the two PDF content types are recognised as PDF."""
        is_pdf = website_loader.is_pdf_content_type

        self.assertTrue(is_pdf("application/pdf"))
        self.assertTrue(is_pdf("application/x-pdf"))

        self.assertFalse(is_pdf("text/html"))
        self.assertFalse(is_pdf(None))
        self.assertFalse(is_pdf(""))