from __future__ import annotations import importlib.util import pathlib import sys import tempfile import unittest ROOT_DIR = pathlib.Path(__file__).resolve().parents[2] MODULE_PATH = ROOT_DIR / "Sitemap Builder" / "sitemap_builder.py" SPEC = importlib.util.spec_from_file_location("sitemap_builder_test", MODULE_PATH) assert SPEC is not None and SPEC.loader is not None MODULE = importlib.util.module_from_spec(SPEC) sys.modules["sitemap_builder_test"] = MODULE SPEC.loader.exec_module(MODULE) class SitemapBuilderTests(unittest.TestCase): def test_cleanup_run_files_deletes_generated_artifacts(self) -> None: with tempfile.TemporaryDirectory() as temp_dir: output_path = pathlib.Path(temp_dir) / "crawl.csv" state_path = MODULE.get_state_path(output_path) log_path = MODULE.get_log_path(output_path) output_path.write_text("csv", encoding="utf-8") state_path.write_text("state", encoding="utf-8") log_path.write_text("log", encoding="utf-8") removed_paths = MODULE.cleanup_run_files(output_path) self.assertCountEqual(removed_paths, [output_path, state_path, log_path]) self.assertFalse(output_path.exists()) self.assertFalse(state_path.exists()) self.assertFalse(log_path.exists()) def test_run_crawl_rejects_resume_with_changed_subdomain_setting(self) -> None: with tempfile.TemporaryDirectory() as temp_dir: output_path = pathlib.Path(temp_dir) / "crawl.csv" state_path = MODULE.get_state_path(output_path) state = MODULE.initialize_state("https://example.com", include_subdomains=False, include_documents=False) MODULE.save_state(state, state_path, output_path) with self.assertRaisesRegex(ValueError, "different subdomain setting"): MODULE.run_crawl( start_url="https://example.com", output_path=output_path, include_subdomains=True, include_documents=False, resume=True, fresh=False, ) def test_crawl_site_removes_dequeued_urls_from_queued_set(self) -> None: original_fetch = MODULE.fetch_page_with_delay try: MODULE.fetch_page_with_delay = lambda url, timeout, user_agent, delay: MODULE.CrawlResult(url=url, links=[]) with tempfile.TemporaryDirectory() as temp_dir: output_path = pathlib.Path(temp_dir) / "crawl.csv" state_path = MODULE.get_state_path(output_path) log_path = MODULE.get_log_path(output_path) state = MODULE.initialize_state("https://example.com", include_subdomains=False, include_documents=False) final_state, user_stopped = MODULE.crawl_site( state=state, max_pages=1, delay=0.0, timeout=1.0, user_agent=MODULE.DEFAULT_USER_AGENT, state_path=state_path, output_path=output_path, log_path=log_path, save_every=1, workers=1, ) self.assertFalse(user_stopped) self.assertEqual(list(final_state.queue), []) self.assertEqual(final_state.queued, set()) finally: MODULE.fetch_page_with_delay = original_fetch if __name__ == "__main__": unittest.main()