from unittest import mock

import requests
from django.test import TestCase

from bookmarks.services import website_loader

# Closing head tag, shared by the streaming mock and by the tests that
# compute how many bytes are expected to have been read.
END_OF_HEAD = b"</head>"


class MockStreamingResponse:
    """Minimal stand-in for a streamed ``requests`` response.

    Holds ``num_chunks`` chunks of ``chunk_size`` zero characters each. When
    ``insert_head_after_chunk`` is given, a chunk containing only the closing
    head tag is inserted right after the chunk with that index.
    """

    def __init__(self, num_chunks, chunk_size, insert_head_after_chunk=None):
        self.chunks = []
        for index in range(num_chunks):
            self.chunks.append(b"0" * chunk_size)

            if index == insert_head_after_chunk:
                self.chunks.append(END_OF_HEAD)

    def iter_content(self, **kwargs):
        """Return the prepared chunks; keyword arguments are ignored."""
        return self.chunks

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Returning a falsy value lets exceptions from the with-body propagate.
        return False


class WebsiteLoaderTestCase(TestCase):
    """Tests for ``website_loader.load_page`` and ``load_website_metadata``."""

    def setUp(self):
        # clear cached metadata before test run
        website_loader._load_website_metadata_cached.cache_clear()

    def render_html_document(
        self, title, description="", og_description="", og_image=""
    ):
        """Render an HTML document whose head carries the given metadata.

        A meta tag is only emitted for arguments that are non-empty.
        """
        meta_description = (
            f'<meta name="description" content="{description}">'
            if description
            else ""
        )
        meta_og_description = (
            f'<meta property="og:description" content="{og_description}">'
            if og_description
            else ""
        )
        meta_og_image = (
            f'<meta property="og:image" content="{og_image}">' if og_image else ""
        )
        return f"""
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <title>{title}</title>
            {meta_description}
            {meta_og_description}
            {meta_og_image}
        </head>
        <body></body>
        </html>
        """

    def test_load_page_returns_content(self):
        with mock.patch("requests.get") as mock_get:
            mock_get.return_value = MockStreamingResponse(
                num_chunks=10, chunk_size=1024
            )
            content = website_loader.load_page("https://example.com")

            expected_content_size = 10 * 1024
            self.assertEqual(expected_content_size, len(content))

    def test_load_page_limits_large_documents(self):
        with mock.patch("requests.get") as mock_get:
            mock_get.return_value = MockStreamingResponse(
                num_chunks=10, chunk_size=1024 * 1000
            )
            content = website_loader.load_page("https://example.com")

            # Should have read six chunks, after which content exceeds the max of 5MB
            expected_content_size = 6 * 1024 * 1000
            self.assertEqual(expected_content_size, len(content))

    def test_load_page_stops_reading_at_end_of_head(self):
        with mock.patch("requests.get") as mock_get:
            mock_get.return_value = MockStreamingResponse(
                num_chunks=10, chunk_size=1024 * 1000, insert_head_after_chunk=0
            )
            content = website_loader.load_page("https://example.com")

            # Should have read first chunk, and second chunk containing closing head tag
            expected_content_size = 1 * 1024 * 1000 + len(END_OF_HEAD)
            self.assertEqual(expected_content_size, len(content))

    def test_load_page_removes_bytes_after_end_of_head(self):
        with mock.patch("requests.get") as mock_get:
            mock_response = MockStreamingResponse(num_chunks=1, chunk_size=0)
            mock_response.chunks[0] = "<head>人</head>".encode()
            # add a single byte that can't be decoded to utf-8
            mock_response.chunks[0] += b"\xff"
            mock_get.return_value = mock_response
            content = website_loader.load_page("https://example.com")

            # verify that byte after head was removed, content parsed as utf-8
            self.assertEqual(content, "<head>人</head>")

    def test_load_website_metadata(self):
        with mock.patch(
            "bookmarks.services.website_loader.load_page"
        ) as mock_load_page:
            mock_load_page.return_value = self.render_html_document(
                "test title", "test description"
            )
            metadata = website_loader.load_website_metadata("https://example.com")
            self.assertEqual("test title", metadata.title)
            self.assertEqual("test description", metadata.description)
            self.assertIsNone(metadata.preview_image)

    def test_load_website_metadata_trims_title_and_description(self):
        with mock.patch(
            "bookmarks.services.website_loader.load_page"
        ) as mock_load_page:
            mock_load_page.return_value = self.render_html_document(
                "  test title  ", "  test description  "
            )
            metadata = website_loader.load_website_metadata("https://example.com")
            self.assertEqual("test title", metadata.title)
            self.assertEqual("test description", metadata.description)

    def test_load_website_metadata_using_og_description(self):
        with mock.patch(
            "bookmarks.services.website_loader.load_page"
        ) as mock_load_page:
            mock_load_page.return_value = self.render_html_document(
                "test title", "", og_description="test og description"
            )
            metadata = website_loader.load_website_metadata("https://example.com")
            self.assertEqual("test title", metadata.title)
            self.assertEqual("test og description", metadata.description)

    def test_load_website_metadata_using_og_image(self):
        with mock.patch(
            "bookmarks.services.website_loader.load_page"
        ) as mock_load_page:
            mock_load_page.return_value = self.render_html_document(
                "test title", og_image="http://example.com/image.jpg"
            )
            metadata = website_loader.load_website_metadata("https://example.com")
            self.assertEqual("http://example.com/image.jpg", metadata.preview_image)

    def test_load_website_metadata_gets_absolute_og_image_path_when_path_starts_with_dots(
        self,
    ):
        with mock.patch(
            "bookmarks.services.website_loader.load_page"
        ) as mock_load_page:
            mock_load_page.return_value = self.render_html_document(
                "test title", og_image="../image.jpg"
            )
            metadata = website_loader.load_website_metadata(
                "https://example.com/a/b/page.html"
            )
            self.assertEqual("https://example.com/a/image.jpg", metadata.preview_image)

    def test_load_website_metadata_gets_absolute_og_image_path_when_path_starts_with_slash(
        self,
    ):
        with mock.patch(
            "bookmarks.services.website_loader.load_page"
        ) as mock_load_page:
            mock_load_page.return_value = self.render_html_document(
                "test title", og_image="/image.jpg"
            )
            metadata = website_loader.load_website_metadata(
                "https://example.com/a/b/page.html"
            )
            self.assertEqual("https://example.com/image.jpg", metadata.preview_image)

    def test_load_website_metadata_prefers_description_over_og_description(self):
        with mock.patch(
            "bookmarks.services.website_loader.load_page"
        ) as mock_load_page:
            mock_load_page.return_value = self.render_html_document(
                "test title", "test description", og_description="test og description"
            )
            metadata = website_loader.load_website_metadata("https://example.com")
            self.assertEqual("test title", metadata.title)
            self.assertEqual("test description", metadata.description)

    def test_website_metadata_ignore_cache(self):
        # Only the number of load_page calls is asserted here; the returned
        # metadata itself is never inspected.
        expected_html = 'Test Title'

        with mock.patch.object(
            website_loader, "load_page", return_value=expected_html
        ) as mock_load_page:
            website_loader.load_website_metadata("https://example.com")
            mock_load_page.assert_called_once()

            # Second call with the same URL must not trigger another load
            website_loader.load_website_metadata("https://example.com")
            mock_load_page.assert_called_once()

            # Explicitly ignoring the cache must trigger a second load
            website_loader.load_website_metadata(
                "https://example.com", ignore_cache=True
            )
            self.assertEqual(mock_load_page.call_count, 2)


class ContentTypeDetectionTestCase(TestCase):
    """Tests for ``website_loader.detect_content_type`` and ``is_pdf_content_type``."""

    @staticmethod
    def _make_response(status_code=200, content_type=None, context_manager=False):
        """Build a mocked response object.

        ``headers`` is only set when ``content_type`` is given. With
        ``context_manager=True`` the mock can be used in a ``with`` statement
        and yields itself.
        """
        response = mock.Mock()
        response.status_code = status_code
        if content_type is not None:
            response.headers = {"Content-Type": content_type}
        if context_manager:
            response.__enter__ = mock.Mock(return_value=response)
            response.__exit__ = mock.Mock(return_value=False)
        return response

    def test_detect_content_type_returns_content_type_from_head_request(self):
        with mock.patch("requests.head") as mock_head:
            mock_head.return_value = self._make_response(
                content_type="application/pdf"
            )

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertEqual(result, "application/pdf")
            mock_head.assert_called_once()

    def test_detect_content_type_strips_charset(self):
        with mock.patch("requests.head") as mock_head:
            mock_head.return_value = self._make_response(
                content_type="text/html; charset=utf-8"
            )

            result = website_loader.detect_content_type("https://example.com")

            self.assertEqual(result, "text/html")

    def test_detect_content_type_returns_lowercase(self):
        with mock.patch("requests.head") as mock_head:
            mock_head.return_value = self._make_response(
                content_type="Application/PDF"
            )

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertEqual(result, "application/pdf")

    def test_detect_content_type_falls_back_to_get_when_head_fails(self):
        with (
            mock.patch("requests.head") as mock_head,
            mock.patch("requests.get") as mock_get,
        ):
            mock_head.side_effect = requests.RequestException("HEAD failed")
            mock_get.return_value = self._make_response(
                content_type="application/pdf", context_manager=True
            )

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertEqual(result, "application/pdf")
            mock_head.assert_called_once()
            mock_get.assert_called_once()

    def test_detect_content_type_returns_none_when_both_head_and_get_fail(self):
        with (
            mock.patch("requests.head") as mock_head,
            mock.patch("requests.get") as mock_get,
        ):
            mock_head.side_effect = requests.RequestException("HEAD failed")
            mock_get.side_effect = requests.RequestException("GET failed")

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertIsNone(result)

    def test_detect_content_type_returns_none_for_non_200_status(self):
        with (
            mock.patch("requests.head") as mock_head,
            mock.patch("requests.get") as mock_get,
        ):
            mock_head.return_value = self._make_response(status_code=404)
            mock_get.return_value = self._make_response(
                status_code=404, context_manager=True
            )

            result = website_loader.detect_content_type("https://example.com/doc.pdf")

            self.assertIsNone(result)

    def test_is_pdf_content_type(self):
        self.assertTrue(website_loader.is_pdf_content_type("application/pdf"))
        self.assertTrue(website_loader.is_pdf_content_type("application/x-pdf"))
        self.assertFalse(website_loader.is_pdf_content_type("text/html"))
        self.assertFalse(website_loader.is_pdf_content_type(None))
        self.assertFalse(website_loader.is_pdf_content_type(""))