87 lines
3.5 KiB
Python
87 lines
3.5 KiB
Python
from __future__ import annotations
|
|
|
|
import importlib.util
|
|
import pathlib
|
|
import sys
|
|
import tempfile
|
|
import unittest
|
|
|
|
|
|
# The repository root is taken to be two directories above the directory that
# holds this test file; the script under test is addressed relative to it.
ROOT_DIR = pathlib.Path(__file__).resolve().parents[2]
MODULE_PATH = ROOT_DIR.joinpath("Sitemap Builder", "sitemap_builder.py")

# Load the script by file path rather than with an ``import`` statement
# (presumably because "Sitemap Builder" contains a space and is therefore not
# an importable package name -- confirm against the repository layout).
SPEC = importlib.util.spec_from_file_location(
    "sitemap_builder_test",
    MODULE_PATH,
)
assert SPEC is not None
assert SPEC.loader is not None

MODULE = importlib.util.module_from_spec(SPEC)
# Register the module under its spec name before executing it, following the
# importlib recipe for importing a source file directly.
sys.modules[SPEC.name] = MODULE
SPEC.loader.exec_module(MODULE)
|
|
|
|
|
|
class SitemapBuilderTests(unittest.TestCase):
    """Tests for the sitemap builder script loaded above as ``MODULE``."""

    def test_cleanup_run_files_deletes_generated_artifacts(self) -> None:
        # cleanup_run_files() must delete the CSV output together with the
        # state and log files derived from it, and report each removed path.
        with tempfile.TemporaryDirectory() as workdir:
            csv_file = pathlib.Path(workdir) / "crawl.csv"
            state_file = MODULE.get_state_path(csv_file)
            log_file = MODULE.get_log_path(csv_file)
            artifacts = [csv_file, state_file, log_file]

            # Create all three files so there is something to clean up.
            for artifact, content in zip(artifacts, ("csv", "state", "log")):
                artifact.write_text(content, encoding="utf-8")

            removed = MODULE.cleanup_run_files(csv_file)

            self.assertCountEqual(removed, artifacts)
            for artifact in artifacts:
                self.assertFalse(artifact.exists())

    def test_run_crawl_rejects_resume_with_changed_subdomain_setting(self) -> None:
        # A state file saved with include_subdomains=False must not be resumed
        # by a run that asks for include_subdomains=True.
        with tempfile.TemporaryDirectory() as workdir:
            csv_file = pathlib.Path(workdir) / "crawl.csv"
            state_file = MODULE.get_state_path(csv_file)
            saved_state = MODULE.initialize_state(
                "https://example.com",
                include_subdomains=False,
                include_documents=False,
            )
            MODULE.save_state(saved_state, state_file, csv_file)

            expect_rejection = self.assertRaisesRegex(
                ValueError,
                "different subdomain setting",
            )
            with expect_rejection:
                MODULE.run_crawl(
                    start_url="https://example.com",
                    output_path=csv_file,
                    include_subdomains=True,
                    include_documents=False,
                    resume=True,
                    fresh=False,
                )

    def test_crawl_site_removes_dequeued_urls_from_queued_set(self) -> None:
        # After a one-page crawl whose fetch yields no links, both the queue
        # and the ``queued`` set of the returned state must be empty.
        def fetch_without_links(url, timeout, user_agent, delay):
            # Stand-in for the real fetch: reports the page with no links.
            return MODULE.CrawlResult(url=url, links=[])

        real_fetch = MODULE.fetch_page_with_delay
        try:
            MODULE.fetch_page_with_delay = fetch_without_links

            with tempfile.TemporaryDirectory() as workdir:
                csv_file = pathlib.Path(workdir) / "crawl.csv"
                state_file = MODULE.get_state_path(csv_file)
                log_file = MODULE.get_log_path(csv_file)
                initial_state = MODULE.initialize_state(
                    "https://example.com",
                    include_subdomains=False,
                    include_documents=False,
                )

                outcome = MODULE.crawl_site(
                    state=initial_state,
                    max_pages=1,
                    delay=0.0,
                    timeout=1.0,
                    user_agent=MODULE.DEFAULT_USER_AGENT,
                    state_path=state_file,
                    output_path=csv_file,
                    log_path=log_file,
                    save_every=1,
                    workers=1,
                )
                final_state, user_stopped = outcome

                self.assertFalse(user_stopped)
                self.assertEqual(list(final_state.queue), [])
                self.assertEqual(final_state.queued, set())
        finally:
            # Always restore the real fetch function so the stub cannot leak
            # into other tests that use the same loaded module.
            MODULE.fetch_page_with_delay = real_fetch
|
|
|
|
|
|
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|