Files
WDW-Sitemap-and-Scraper-Docker/Page Importer/tests/test_regressions.py
T
wdwalrus ead872a0a5
Build Docker Image / docker (push) Successful in 44s
first commit
2026-04-09 10:42:10 -07:00

80 lines
2.7 KiB
Python

from __future__ import annotations
import unittest
from bs4 import BeautifulSoup
from page_importer.dates import normalize_date
from page_importer.scraper import CATEGORY_SELECTORS, TAG_SELECTORS, extract_terms, sanitize_html
from page_importer.wxr import build_wxr
from page_importer.models import ScrapedPost
class DateNormalizationTests(unittest.TestCase):
    """Regression checks for ``normalize_date``."""

    def test_preserves_timezone_offset_in_normalized_value(self) -> None:
        """An input carrying a ``-07:00`` offset keeps that offset in the result."""
        raw_timestamp = "2024-05-01T09:30:00-07:00"
        # The "T" separator becomes a space; the offset suffix must survive.
        expected = "2024-05-01 09:30:00-07:00"

        self.assertEqual(normalize_date(raw_timestamp), expected)
class WxrSerializationTests(unittest.TestCase):
    """Regression checks on the output of ``build_wxr``."""

    def test_writes_local_and_gmt_dates_from_offset_timestamp(self) -> None:
        """A publish date with a ``-07:00`` offset produces local, GMT and pubDate values."""
        post = ScrapedPost(
            source_url="https://example.com/post",
            title="Example",
            body_html="<p>Body</p>",
            publish_date="2024-05-01 09:30:00-07:00",
            success=True,
        )

        xml = build_wxr([post])

        # 09:30 at -07:00 is 16:30 UTC; the local element keeps wall-clock time.
        expected_fragments = (
            "<wp:post_date><![CDATA[2024-05-01 09:30:00]]></wp:post_date>",
            "<wp:post_date_gmt><![CDATA[2024-05-01 16:30:00]]></wp:post_date_gmt>",
            "<pubDate>Wed, 01 May 2024 16:30:00 +0000</pubDate>",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, xml)

    def test_splits_cdata_terminators_in_content(self) -> None:
        """A literal ``]]>`` in body or author text is split across CDATA sections."""
        post = ScrapedPost(
            source_url="https://example.com/post",
            title="Example",
            body_html="<p>alpha ]]> omega</p>",
            author="Jane ]]> Doe",
            success=True,
        )

        xml = build_wxr([post])

        # "]]>" would end a CDATA section early, so the expected output closes
        # the section after "]]" and reopens a new one before ">".
        self.assertIn("alpha ]]]]><![CDATA[> omega", xml)
        self.assertIn("Jane ]]]]><![CDATA[> Doe", xml)
class HtmlSanitizationTests(unittest.TestCase):
    """Regression checks for ``sanitize_html``."""

    def test_removes_inline_event_handlers_and_script_uris(self) -> None:
        """Inline ``on*`` handlers and ``javascript:`` URIs are absent from the output."""
        dirty_markup = '<div onclick="alert(1)"><a href="javascript:alert(1)">x</a><img src="x" onerror="alert(1)"></div>'

        sanitized = sanitize_html(dirty_markup)

        # Checked in the same order as before so the first failure is unchanged.
        for forbidden in ("onclick", "onerror", "javascript:"):
            self.assertNotIn(forbidden, sanitized)
class TaxonomySelectorTests(unittest.TestCase):
    """Regression checks for the category and tag selector lists."""

    def test_drupal_tag_field_is_not_treated_as_category(self) -> None:
        """A ``field--name-field-tags`` block yields a tag and no categories."""
        markup = '<div class="field--name-field-tags"><a href="/tags/example">Example Tag</a></div>'
        soup = BeautifulSoup(markup, "html.parser")

        # The category selectors must not match the tag field at all.
        category_terms = extract_terms(soup, CATEGORY_SELECTORS)
        self.assertEqual(category_terms, [])

        # The tag selectors must pick up the link text.
        tag_terms = extract_terms(soup, TAG_SELECTORS)
        self.assertEqual(tag_terms, ["Example Tag"])
# Allow running this module directly as a script, in addition to
# discovery by a test runner.
if __name__ == "__main__":
    unittest.main()