from __future__ import annotations import unittest from bs4 import BeautifulSoup from page_importer.dates import normalize_date from page_importer.scraper import CATEGORY_SELECTORS, TAG_SELECTORS, extract_terms, sanitize_html from page_importer.wxr import build_wxr from page_importer.models import ScrapedPost class DateNormalizationTests(unittest.TestCase): def test_preserves_timezone_offset_in_normalized_value(self) -> None: self.assertEqual( normalize_date("2024-05-01T09:30:00-07:00"), "2024-05-01 09:30:00-07:00", ) class WxrSerializationTests(unittest.TestCase): def test_writes_local_and_gmt_dates_from_offset_timestamp(self) -> None: xml = build_wxr( [ ScrapedPost( source_url="https://example.com/post", title="Example", body_html="

Body

", publish_date="2024-05-01 09:30:00-07:00", success=True, ) ] ) self.assertIn("", xml) self.assertIn("", xml) self.assertIn("Wed, 01 May 2024 16:30:00 +0000", xml) def test_splits_cdata_terminators_in_content(self) -> None: xml = build_wxr( [ ScrapedPost( source_url="https://example.com/post", title="Example", body_html="

alpha ]]> omega

", author="Jane ]]> Doe", success=True, ) ] ) self.assertIn("alpha ]]]]> omega", xml) self.assertIn("Jane ]]]]> Doe", xml) class HtmlSanitizationTests(unittest.TestCase): def test_removes_inline_event_handlers_and_script_uris(self) -> None: sanitized = sanitize_html( '

' ) self.assertNotIn("onclick", sanitized) self.assertNotIn("onerror", sanitized) self.assertNotIn("javascript:", sanitized) class TaxonomySelectorTests(unittest.TestCase): def test_drupal_tag_field_is_not_treated_as_category(self) -> None: soup = BeautifulSoup( '

Example Tag

', "html.parser", ) self.assertEqual(extract_terms(soup, CATEGORY_SELECTORS), []) self.assertEqual(extract_terms(soup, TAG_SELECTORS), ["Example Tag"]) if __name__ == "__main__": unittest.main()