@@ -0,0 +1,79 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import unittest
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from page_importer.dates import normalize_date
|
||||
from page_importer.scraper import CATEGORY_SELECTORS, TAG_SELECTORS, extract_terms, sanitize_html
|
||||
from page_importer.wxr import build_wxr
|
||||
from page_importer.models import ScrapedPost
|
||||
|
||||
|
||||
class DateNormalizationTests(unittest.TestCase):
    """Checks on the string returned by ``normalize_date``."""

    def test_preserves_timezone_offset_in_normalized_value(self) -> None:
        # The ISO-8601 "T" separator is expected to become a space while the
        # "-07:00" offset suffix is carried through unchanged.
        raw_timestamp = "2024-05-01T09:30:00-07:00"
        expected = "2024-05-01 09:30:00-07:00"

        normalized = normalize_date(raw_timestamp)

        self.assertEqual(normalized, expected)
|
||||
|
||||
|
||||
class WxrSerializationTests(unittest.TestCase):
    """Checks on the XML text produced by ``build_wxr``."""

    def test_writes_local_and_gmt_dates_from_offset_timestamp(self) -> None:
        # A post stamped 09:30 at UTC-07:00 must keep its wall-clock time in
        # wp:post_date and be shifted to 16:30 in the GMT and pubDate fields.
        post = ScrapedPost(
            source_url="https://example.com/post",
            title="Example",
            body_html="<p>Body</p>",
            publish_date="2024-05-01 09:30:00-07:00",
            success=True,
        )

        xml = build_wxr([post])

        expected_fragments = (
            "<wp:post_date><![CDATA[2024-05-01 09:30:00]]></wp:post_date>",
            "<wp:post_date_gmt><![CDATA[2024-05-01 16:30:00]]></wp:post_date_gmt>",
            "<pubDate>Wed, 01 May 2024 16:30:00 +0000</pubDate>",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, xml)

    def test_splits_cdata_terminators_in_content(self) -> None:
        # A literal "]]>" inside a field would end the CDATA section early, so
        # the output is expected to split it across two adjacent sections.
        post = ScrapedPost(
            source_url="https://example.com/post",
            title="Example",
            body_html="<p>alpha ]]> omega</p>",
            author="Jane ]]> Doe",
            success=True,
        )

        xml = build_wxr([post])

        escaped_fragments = (
            "alpha ]]]]><![CDATA[> omega",
            "Jane ]]]]><![CDATA[> Doe",
        )
        for fragment in escaped_fragments:
            self.assertIn(fragment, xml)
|
||||
|
||||
|
||||
class HtmlSanitizationTests(unittest.TestCase):
    """Checks on the markup returned by ``sanitize_html``."""

    def test_removes_inline_event_handlers_and_script_uris(self) -> None:
        # The fixture carries two inline event-handler attributes and one
        # javascript: link; none of them may appear in the sanitized output.
        unsafe_markup = (
            '<div onclick="alert(1)"><a href="javascript:alert(1)">x</a><img src="x" onerror="alert(1)"></div>'
        )

        sanitized = sanitize_html(unsafe_markup)

        for forbidden in ("onclick", "onerror", "javascript:"):
            self.assertNotIn(forbidden, sanitized)
|
||||
|
||||
|
||||
class TaxonomySelectorTests(unittest.TestCase):
    """Checks which selector set picks up terms from a tag-field container."""

    def test_drupal_tag_field_is_not_treated_as_category(self) -> None:
        # A link inside a "field--name-field-tags" container must be reported
        # by the tag selectors only, never by the category selectors.
        markup = '<div class="field--name-field-tags"><a href="/tags/example">Example Tag</a></div>'
        soup = BeautifulSoup(markup, "html.parser")

        categories = extract_terms(soup, CATEGORY_SELECTORS)
        self.assertEqual(categories, [])

        tags = extract_terms(soup, TAG_SELECTORS)
        self.assertEqual(tags, ["Example Tag"])
|
||||
|
||||
|
||||
# Allow running this test module directly as a script, in addition to
# discovery through a test runner.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user