# File listing header (from the source viewer): 80 lines, 2.7 KiB, Python
from __future__ import annotations
|
|
|
|
import unittest
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from page_importer.dates import normalize_date
|
|
from page_importer.scraper import CATEGORY_SELECTORS, TAG_SELECTORS, extract_terms, sanitize_html
|
|
from page_importer.wxr import build_wxr
|
|
from page_importer.models import ScrapedPost
|
|
|
|
|
|
class DateNormalizationTests(unittest.TestCase):
    """Checks on the string value returned by ``normalize_date``."""

    def test_preserves_timezone_offset_in_normalized_value(self) -> None:
        """An ISO timestamp with a ``-07:00`` offset must keep that offset.

        The expected value differs from the input only in the date/time
        separator (``T`` becomes a space).
        """
        normalized = normalize_date("2024-05-01T09:30:00-07:00")
        expected = "2024-05-01 09:30:00-07:00"

        self.assertEqual(normalized, expected)
|
|
|
|
|
|
class WxrSerializationTests(unittest.TestCase):
    """Checks on the XML text returned by ``build_wxr`` for a single post."""

    def test_writes_local_and_gmt_dates_from_offset_timestamp(self) -> None:
        """A ``-07:00`` publish date must yield local and GMT date elements.

        The test expects ``wp:post_date`` to carry the wall-clock time
        (09:30) and both ``wp:post_date_gmt`` and ``pubDate`` to carry the
        time shifted by seven hours (16:30, ``+0000``).
        """
        post = ScrapedPost(
            source_url="https://example.com/post",
            title="Example",
            body_html="<p>Body</p>",
            publish_date="2024-05-01 09:30:00-07:00",
            success=True,
        )

        xml = build_wxr([post])

        expected_fragments = (
            "<wp:post_date><![CDATA[2024-05-01 09:30:00]]></wp:post_date>",
            "<wp:post_date_gmt><![CDATA[2024-05-01 16:30:00]]></wp:post_date_gmt>",
            "<pubDate>Wed, 01 May 2024 16:30:00 +0000</pubDate>",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, xml)

    def test_splits_cdata_terminators_in_content(self) -> None:
        """A literal ``]]>`` in body or author must be split across CDATA sections.

        The expected output replaces each ``]]>`` with ``]]]]><![CDATA[>``,
        so the terminator never appears intact inside one section.
        """
        post = ScrapedPost(
            source_url="https://example.com/post",
            title="Example",
            body_html="<p>alpha ]]> omega</p>",
            author="Jane ]]> Doe",
            success=True,
        )

        xml = build_wxr([post])

        # Body first, then author: same order as the checks always ran.
        for escaped in ("alpha ]]]]><![CDATA[> omega", "Jane ]]]]><![CDATA[> Doe"):
            self.assertIn(escaped, xml)
|
|
|
|
|
|
class HtmlSanitizationTests(unittest.TestCase):
    """Checks on the markup returned by ``sanitize_html``."""

    def test_removes_inline_event_handlers_and_script_uris(self) -> None:
        """``on*`` attributes and ``javascript:`` URIs must not survive sanitizing."""
        dirty_markup = (
            '<div onclick="alert(1)"><a href="javascript:alert(1)">x</a><img src="x" onerror="alert(1)"></div>'
        )

        sanitized = sanitize_html(dirty_markup)

        # None of these substrings may appear anywhere in the output.
        for forbidden in ("onclick", "onerror", "javascript:"):
            self.assertNotIn(forbidden, sanitized)
|
|
|
|
|
|
class TaxonomySelectorTests(unittest.TestCase):
    """Checks that category and tag selectors pick up different markup."""

    def test_drupal_tag_field_is_not_treated_as_category(self) -> None:
        """A ``field--name-field-tags`` block must yield a tag and no category."""
        markup = '<div class="field--name-field-tags"><a href="/tags/example">Example Tag</a></div>'
        soup = BeautifulSoup(markup, "html.parser")

        categories = extract_terms(soup, CATEGORY_SELECTORS)
        self.assertEqual(categories, [])

        tags = extract_terms(soup, TAG_SELECTORS)
        self.assertEqual(tags, ["Example Tag"])
|
|
|
|
|
|
if __name__ == "__main__":
    # Let the module be run directly as a script to execute its tests.
    unittest.main()
|