Edge-case fixes, bug fixes, and UI cleanup.
Build Docker Image / docker (push) Successful in 6s

This commit is contained in:
2026-04-09 11:21:23 -07:00
parent 8667f547e6
commit 0e410a1f6c
5 changed files with 256 additions and 6 deletions
+24 -2
View File
@@ -2,6 +2,7 @@ from __future__ import annotations
import csv import csv
import datetime as dt import datetime as dt
import hashlib
import io import io
import re import re
from dataclasses import replace from dataclasses import replace
@@ -13,6 +14,7 @@ from page_importer.models import ScrapeOptions, ScrapedPost
from page_importer.scraper import Scraper from page_importer.scraper import Scraper
from page_importer.wxr import build_wxr from page_importer.wxr import build_wxr
def load_csv(file_data: bytes) -> tuple[list[str], list[dict[str, str]]]: def load_csv(file_data: bytes) -> tuple[list[str], list[dict[str, str]]]:
text = file_data.decode("utf-8-sig", errors="replace") text = file_data.decode("utf-8-sig", errors="replace")
reader = csv.DictReader(io.StringIO(text)) reader = csv.DictReader(io.StringIO(text))
@@ -20,6 +22,20 @@ def load_csv(file_data: bytes) -> tuple[list[str], list[dict[str, str]]]:
return reader.fieldnames or [], rows return reader.fieldnames or [], rows
def build_upload_fingerprint(file_data: bytes) -> str:
    """Return the SHA-256 hex digest of the uploaded file's raw bytes.

    The digest acts as a content fingerprint: the same bytes always produce
    the same string.

    Args:
        file_data: Raw bytes of the uploaded file.

    Returns:
        The lowercase hexadecimal SHA-256 digest of ``file_data``.
    """
    digest = hashlib.sha256()
    digest.update(file_data)
    return digest.hexdigest()
def sync_uploaded_file_state(session_state: dict[str, object], upload_fingerprint: str) -> None:
    """Drop cached scrape state when the uploaded file's fingerprint changes.

    If the fingerprint stored under ``"uploaded_csv_fingerprint"`` equals
    ``upload_fingerprint``, nothing is touched. Otherwise the cached
    ``"results"``, ``"input_rows"``, ``"input_headers"`` and
    ``"scrape_context"`` entries are removed (missing keys are ignored) and
    the new fingerprint is recorded.

    Args:
        session_state: Mutable mapping holding the per-session cache; it is
            modified in place.
        upload_fingerprint: Fingerprint of the currently uploaded file.
    """
    is_same_upload = session_state.get("uploaded_csv_fingerprint") == upload_fingerprint
    if not is_same_upload:
        # Everything derived from the previous upload is stale now.
        stale_keys = ("results", "input_rows", "input_headers", "scrape_context")
        for stale_key in stale_keys:
            session_state.pop(stale_key, None)
        session_state["uploaded_csv_fingerprint"] = upload_fingerprint
def render_app() -> None: def render_app() -> None:
st.title("Page Importer") st.title("Page Importer")
st.caption("Scrape blog posts from CSV URLs and export a WordPress WXR file.") st.caption("Scrape blog posts from CSV URLs and export a WordPress WXR file.")
@@ -47,7 +63,10 @@ def render_app() -> None:
st.info("Upload a CSV to begin.") st.info("Upload a CSV to begin.")
return return
headers, rows = load_csv(uploaded.getvalue()) uploaded_bytes = uploaded.getvalue()
sync_uploaded_file_state(st.session_state, build_upload_fingerprint(uploaded_bytes))
headers, rows = load_csv(uploaded_bytes)
if not rows: if not rows:
st.error("The CSV did not contain any rows.") st.error("The CSV did not contain any rows.")
return return
@@ -88,6 +107,7 @@ def render_app() -> None:
results = scrape_rows(rows, context, phase_label="Scraping") results = scrape_rows(rows, context, phase_label="Scraping")
st.session_state["results"] = results st.session_state["results"] = results
st.session_state["input_rows"] = rows st.session_state["input_rows"] = rows
st.session_state["input_headers"] = headers
st.session_state["scrape_context"] = context st.session_state["scrape_context"] = context
results = st.session_state.get("results", []) results = st.session_state.get("results", [])
@@ -176,7 +196,9 @@ def render_app() -> None:
st.write(f"**Author:** {selected.author or '(missing)'}") st.write(f"**Author:** {selected.author or '(missing)'}")
st.write(f"**Post Type:** {selected.post_type}") st.write(f"**Post Type:** {selected.post_type}")
st.write(selected.body_html, unsafe_allow_html=True) st.write(selected.body_html, unsafe_allow_html=True)
render_export_sidebar(successful, rows, headers) stored_rows = st.session_state.get("input_rows", rows)
stored_headers = st.session_state.get("input_headers", headers)
render_export_sidebar(successful, stored_rows, stored_headers)
def build_scrape_context( def build_scrape_context(
+108
View File
@@ -0,0 +1,108 @@
from __future__ import annotations
import importlib.util
import pathlib
import sys
import types
import unittest
# ROOT_DIR is the third ancestor of this file, i.e. two levels above the
# directory that contains it.
ROOT_DIR = pathlib.Path(__file__).resolve().parents[2]
PAGE_IMPORTER_DIR = ROOT_DIR.joinpath("Page Importer")

# Put the "Page Importer" directory on the import path once, at the front.
if sys.path.count(str(PAGE_IMPORTER_DIR)) == 0:
    sys.path.insert(0, str(PAGE_IMPORTER_DIR))

# Placeholder; rebound to the loaded app module further down in this file.
APP_MODULE = None
def load_app_module():
    """Load ``app.py`` as ``page_importer_app_test`` with its imports stubbed.

    Placeholder modules for ``streamlit`` and the ``page_importer.*``
    submodules are registered in ``sys.modules`` while ``app.py`` executes.
    Whatever was registered under those names beforehand (or nothing at all)
    is put back in the ``finally`` clause, whether or not loading succeeded.

    Returns:
        The executed module object created from ``PAGE_IMPORTER_DIR / "app.py"``.
    """
    touched_names = (
        "streamlit",
        "page_importer.dates",
        "page_importer.models",
        "page_importer.scraper",
        "page_importer.wxr",
        "page_importer_app_test",
    )
    # Snapshot the current registrations so they can be reinstated afterwards.
    saved_modules = {name: sys.modules.get(name) for name in touched_names}

    try:
        class ScrapeOptions:
            pass

        class ScrapedPost:
            pass

        streamlit_stub = types.ModuleType("streamlit")

        dates_stub = types.ModuleType("page_importer.dates")
        dates_stub.parse_datetime = lambda value: None

        models_stub = types.ModuleType("page_importer.models")
        models_stub.ScrapeOptions = ScrapeOptions
        models_stub.ScrapedPost = ScrapedPost

        scraper_stub = types.ModuleType("page_importer.scraper")
        scraper_stub.Scraper = object

        wxr_stub = types.ModuleType("page_importer.wxr")
        wxr_stub.build_wxr = lambda posts: ""

        # Register every stub before app.py runs its own import statements.
        sys.modules.update(
            {
                "streamlit": streamlit_stub,
                "page_importer.dates": dates_stub,
                "page_importer.models": models_stub,
                "page_importer.scraper": scraper_stub,
                "page_importer.wxr": wxr_stub,
            }
        )

        app_spec = importlib.util.spec_from_file_location(
            "page_importer_app_test",
            PAGE_IMPORTER_DIR / "app.py",
        )
        assert app_spec is not None and app_spec.loader is not None
        app_module = importlib.util.module_from_spec(app_spec)
        sys.modules["page_importer_app_test"] = app_module
        app_spec.loader.exec_module(app_module)
        return app_module
    finally:
        for name, previous in saved_modules.items():
            if previous is not None:
                sys.modules[name] = previous
            else:
                # Nothing was registered under this name before; remove ours.
                sys.modules.pop(name, None)
APP_MODULE = load_app_module()
class UploadStateTests(unittest.TestCase):
    """Checks how ``sync_uploaded_file_state`` treats cached scrape state."""

    def test_sync_uploaded_file_state_clears_stale_results_for_new_file(self) -> None:
        # A fingerprint that differs from the stored one must drop every
        # cached key and record the new fingerprint.
        cached_keys = ("results", "input_rows", "input_headers", "scrape_context")
        session_state = {
            "uploaded_csv_fingerprint": "old",
            "results": ["stale"],
            "input_rows": [{"url": "https://example.com"}],
            "input_headers": ["url"],
            "scrape_context": {"url_column": "url"},
        }

        APP_MODULE.sync_uploaded_file_state(session_state, "new")

        self.assertEqual(session_state["uploaded_csv_fingerprint"], "new")
        for cached_key in cached_keys:
            self.assertNotIn(cached_key, session_state)

    def test_sync_uploaded_file_state_keeps_results_for_same_file(self) -> None:
        # A matching fingerprint must leave the cached entries untouched.
        cached_rows = [{"url": "https://example.com"}]
        session_state = {
            "uploaded_csv_fingerprint": "same",
            "results": ["keep"],
            "input_rows": list(cached_rows),
        }

        APP_MODULE.sync_uploaded_file_state(session_state, "same")

        self.assertEqual(session_state["results"], ["keep"])
        self.assertEqual(session_state["input_rows"], cached_rows)
if __name__ == "__main__":
unittest.main()
@@ -0,0 +1,86 @@
from __future__ import annotations
import importlib.util
import pathlib
import sys
import tempfile
import unittest
# ROOT_DIR is the third ancestor of this file, i.e. two levels above the
# directory that contains it.
ROOT_DIR = pathlib.Path(__file__).resolve().parents[2]
MODULE_PATH = ROOT_DIR.joinpath("Sitemap Builder", "sitemap_builder.py")

# Load sitemap_builder.py directly from its file path under a test-only name.
SPEC = importlib.util.spec_from_file_location("sitemap_builder_test", MODULE_PATH)
assert SPEC is not None and SPEC.loader is not None
MODULE = importlib.util.module_from_spec(SPEC)
# Registered before execution so the module is findable by name while its
# body runs (presumably needed by code that looks itself up in sys.modules).
sys.modules[SPEC.name] = MODULE
SPEC.loader.exec_module(MODULE)
class SitemapBuilderTests(unittest.TestCase):
    """Tests for sitemap_builder file cleanup, resume validation, and queue bookkeeping."""

    def test_cleanup_run_files_deletes_generated_artifacts(self) -> None:
        with tempfile.TemporaryDirectory() as temp_dir:
            csv_file = pathlib.Path(temp_dir, "crawl.csv")
            state_file = MODULE.get_state_path(csv_file)
            log_file = MODULE.get_log_path(csv_file)
            artifacts = [csv_file, state_file, log_file]

            # Create all three files so cleanup has something to remove.
            for artifact, content in zip(artifacts, ("csv", "state", "log")):
                artifact.write_text(content, encoding="utf-8")

            removed_paths = MODULE.cleanup_run_files(csv_file)

            self.assertCountEqual(removed_paths, artifacts)
            for artifact in artifacts:
                self.assertFalse(artifact.exists())

    def test_run_crawl_rejects_resume_with_changed_subdomain_setting(self) -> None:
        with tempfile.TemporaryDirectory() as temp_dir:
            csv_file = pathlib.Path(temp_dir, "crawl.csv")

            # Persist a state saved with include_subdomains=False ...
            saved_state = MODULE.initialize_state(
                "https://example.com",
                include_subdomains=False,
                include_documents=False,
            )
            MODULE.save_state(saved_state, MODULE.get_state_path(csv_file), csv_file)

            # ... then try to resume it with include_subdomains=True.
            resume_kwargs = {
                "start_url": "https://example.com",
                "output_path": csv_file,
                "include_subdomains": True,
                "include_documents": False,
                "resume": True,
                "fresh": False,
            }
            with self.assertRaisesRegex(ValueError, "different subdomain setting"):
                MODULE.run_crawl(**resume_kwargs)

    def test_crawl_site_removes_dequeued_urls_from_queued_set(self) -> None:
        def fake_fetch(url, timeout, user_agent, delay):
            # Stand-in fetcher: every page yields a result with no links.
            return MODULE.CrawlResult(url=url, links=[])

        real_fetch = MODULE.fetch_page_with_delay
        try:
            MODULE.fetch_page_with_delay = fake_fetch

            with tempfile.TemporaryDirectory() as temp_dir:
                csv_file = pathlib.Path(temp_dir, "crawl.csv")
                initial_state = MODULE.initialize_state(
                    "https://example.com",
                    include_subdomains=False,
                    include_documents=False,
                )

                final_state, user_stopped = MODULE.crawl_site(
                    state=initial_state,
                    max_pages=1,
                    delay=0.0,
                    timeout=1.0,
                    user_agent=MODULE.DEFAULT_USER_AGENT,
                    state_path=MODULE.get_state_path(csv_file),
                    output_path=csv_file,
                    log_path=MODULE.get_log_path(csv_file),
                    save_every=1,
                    workers=1,
                )

                self.assertFalse(user_stopped)
                self.assertEqual(list(final_state.queue), [])
                self.assertEqual(final_state.queued, set())
        finally:
            # Always restore the real fetcher so other tests are unaffected.
            MODULE.fetch_page_with_delay = real_fetch
if __name__ == "__main__":
unittest.main()
+16
View File
@@ -175,6 +175,15 @@ def get_log_path(output_path: Path) -> Path:
return output_path.with_suffix(output_path.suffix + DEFAULT_LOG_SUFFIX) return output_path.with_suffix(output_path.suffix + DEFAULT_LOG_SUFFIX)
def cleanup_run_files(output_path: Path) -> list[Path]:
    """Delete the output, state, and log files that belong to one crawl run.

    The state and log locations are derived from ``output_path`` through
    ``get_state_path`` and ``get_log_path``. Files that do not exist are
    skipped silently.

    Args:
        output_path: Path of the run's output file.

    Returns:
        The paths that were actually removed, in the order output, state, log.

    Raises:
        OSError: If a path exists but cannot be unlinked (for example, it is
            a directory or permissions forbid removal).
    """
    base_path = Path(output_path)
    removed_paths: list[Path] = []
    for path in (base_path, get_state_path(base_path), get_log_path(base_path)):
        # Attempt the unlink directly instead of checking exists() first:
        # this closes the check-then-delete race and also removes broken
        # symlinks, which exists() reports as missing.
        try:
            path.unlink()
        except FileNotFoundError:
            continue
        removed_paths.append(path)
    return removed_paths
def log_message(log_path: Path, message: str) -> None: def log_message(log_path: Path, message: str) -> None:
log_path.parent.mkdir(parents=True, exist_ok=True) log_path.parent.mkdir(parents=True, exist_ok=True)
timestamp = time.strftime("%Y-%m-%d %H:%M:%S") timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
@@ -599,6 +608,7 @@ def crawl_site(
break break
current = resolve_alias(state.queue.popleft(), state.alias_to_canonical) current = resolve_alias(state.queue.popleft(), state.alias_to_canonical)
state.queued.discard(current)
if current in state.visited: if current in state.visited:
continue continue
@@ -647,6 +657,7 @@ def crawl_site(
while state.queue and len(pending) < workers and len(state.visited) < max_pages: while state.queue and len(pending) < workers and len(state.visited) < max_pages:
current = resolve_alias(state.queue.popleft(), state.alias_to_canonical) current = resolve_alias(state.queue.popleft(), state.alias_to_canonical)
state.queued.discard(current)
if current in state.visited: if current in state.visited:
continue continue
@@ -790,6 +801,11 @@ def run_crawl(
"The saved crawl state uses a different document setting. " "The saved crawl state uses a different document setting. "
"Keep the same choice or start a fresh crawl." "Keep the same choice or start a fresh crawl."
) )
if state.include_subdomains != include_subdomains:
raise ValueError(
"The saved crawl state uses a different subdomain setting. "
"Keep the same choice or start a fresh crawl."
)
else: else:
state = initialize_state(normalized_start, include_subdomains, include_documents) state = initialize_state(normalized_start, include_subdomains, include_documents)
+22 -4
View File
@@ -137,6 +137,7 @@ def render_sitemap_tab() -> None:
st.info("Run a crawl to generate a sitemap CSV.") st.info("Run a crawl to generate a sitemap CSV.")
return return
sitemap_builder = get_sitemap_module()
summary = result_data["summary"] summary = result_data["summary"]
csv_path = Path(result_data["output_path"]) csv_path = Path(result_data["output_path"])
state_path = Path(result_data["state_path"]) state_path = Path(result_data["state_path"])
@@ -184,6 +185,19 @@ def render_sitemap_tab() -> None:
mime="text/plain", mime="text/plain",
) )
cleanup_targets = [path for path in (csv_path, state_path, log_path) if path.exists()]
if cleanup_targets:
st.caption("Cleanup removes the sitemap CSV, crawl state, and crawl log for this run.")
if st.button("Delete Crawl Files"):
removed_paths = sitemap_builder.cleanup_run_files(csv_path)
st.session_state.pop("sitemap_result", None)
if removed_paths:
removed_names = ", ".join(path.name for path in removed_paths)
st.success(f"Deleted: {removed_names}")
else:
st.info("No crawl files were present to delete.")
return
crawl_output = (result_data.get("stdout") or "").strip() crawl_output = (result_data.get("stdout") or "").strip()
if crawl_output: if crawl_output:
st.text_area("Crawler Output", value=crawl_output, height=220, disabled=True) st.text_area("Crawler Output", value=crawl_output, height=220, disabled=True)
@@ -196,12 +210,16 @@ def render_sitemap_tab() -> None:
def main() -> None: def main() -> None:
st.set_page_config(page_title="WDW Tools", layout="wide") st.set_page_config(page_title="WDW Tools", layout="wide")
st.header("WDW Sitemap And Import Tools") st.header("WDW Sitemap And Import Tools")
sitemap_tab, importer_tab = st.tabs(["Sitemap Generator", "Page Importer"]) selected_tool = st.radio(
"Tool",
["Sitemap Generator", "Page Importer"],
horizontal=True,
label_visibility="collapsed",
)
with sitemap_tab: if selected_tool == "Sitemap Generator":
render_sitemap_tab() render_sitemap_tab()
else:
with importer_tab:
page_importer_app = get_page_importer_module() page_importer_app = get_page_importer_module()
page_importer_app.render_app() page_importer_app.render_app()