first commit

2026-04-09 10:42:10 -07:00
commit ead872a0a5
19 changed files with 2783 additions and 0 deletions
@@ -0,0 +1,13 @@
+.git
+.gitignore
+.codex
+**/.git
+**/.venv
+**/__pycache__
+**/*.pyc
+**/*.pyo
+**/*.pyd
+**/.pytest_cache
+**/.mypy_cache
+**/.DS_Store
+.data
@@ -0,0 +1,42 @@
+# Builds the application image on every push to main (or on manual dispatch)
+# and pushes it to a registry only when all registry secrets are configured.
+name: Build Docker Image
+
+on:
+  push:
+    branches:
+      - main
+  workflow_dispatch:
+
+# Registry settings come from repository secrets; unset secrets expand to ''
+# and cause the login/push steps below to be skipped.
+env:
+  IMAGE_NAME: wdw-sitemap-and-importer
+  REGISTRY: ${{ secrets.REGISTRY_URL }}
+  REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
+  REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
+
+jobs:
+  docker:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      # The image is tagged with the commit SHA so each build is traceable.
+      - name: Build image
+        run: docker build -t "${IMAGE_NAME}:${GITHUB_SHA}" .
+
+      - name: Tag latest image
+        run: docker tag "${IMAGE_NAME}:${GITHUB_SHA}" "${IMAGE_NAME}:latest"
+
+      # The password is piped via stdin so it never appears on the command line.
+      - name: Log in to registry
+        if: ${{ env.REGISTRY != '' && env.REGISTRY_USERNAME != '' && env.REGISTRY_PASSWORD != '' }}
+        run: echo "${REGISTRY_PASSWORD}" | docker login "${REGISTRY}" -u "${REGISTRY_USERNAME}" --password-stdin
+
+      - name: Push commit image
+        if: ${{ env.REGISTRY != '' && env.REGISTRY_USERNAME != '' && env.REGISTRY_PASSWORD != '' }}
+        run: |
+          docker tag "${IMAGE_NAME}:${GITHUB_SHA}" "${REGISTRY}/${IMAGE_NAME}:${GITHUB_SHA}"
+          docker push "${REGISTRY}/${IMAGE_NAME}:${GITHUB_SHA}"
+
+      - name: Push latest image
+        if: ${{ env.REGISTRY != '' && env.REGISTRY_USERNAME != '' && env.REGISTRY_PASSWORD != '' }}
+        run: |
+          docker tag "${IMAGE_NAME}:latest" "${REGISTRY}/${IMAGE_NAME}:latest"
+          docker push "${REGISTRY}/${IMAGE_NAME}:latest"
@@ -0,0 +1,15 @@
+.codex
+.data/
+__pycache__/
+*.py[cod]
+
+.venv/
+**/.venv/
+**/__pycache__/
+.pytest_cache/
+.mypy_cache/
+
+*.crawl.log
+*.crawlstate.json
+
+streamlit_uploads/
@@ -0,0 +1,22 @@
+# Container image for the combined Streamlit application.
+FROM python:3.14-slim
+
+# Python/pip housekeeping plus Streamlit server settings; APP_DATA_DIR points
+# at the /data directory created below.
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    STREAMLIT_SERVER_HEADLESS=true \
+    STREAMLIT_SERVER_PORT=8501 \
+    STREAMLIT_SERVER_ADDRESS=0.0.0.0 \
+    APP_DATA_DIR=/data
+
+WORKDIR /app
+
+# Install dependencies before copying the source so this layer is reused
+# whenever only application code changes.
+COPY requirements.txt ./requirements.txt
+RUN pip install -r requirements.txt
+
+COPY . .
+
+RUN mkdir -p /data
+
+# Matches STREAMLIT_SERVER_PORT above.
+EXPOSE 8501
+
+CMD ["streamlit", "run", "app.py"]
@@ -0,0 +1,12 @@
+.venv/
+__pycache__/
+*.py[cod]
+*$py.class
+
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+
+.streamlit/secrets.toml
+
+*.log
@@ -0,0 +1,63 @@
+# Page Importer
+
+This folder contains the WordPress import tool used by the combined application in the repository root.
+
+The importer still uses Streamlit internally, but it is now rendered as the `Page Importer` tab inside the shared app rather than being the main entrypoint for the repository.
+
+## Features
+
+- Upload a CSV of submitted URLs
+- Choose the URL column and optional title override column
+- Optionally map post type from the CSV or force a single post type
+- Scrape only the listed URLs
+- Extract title, publish date, author, body HTML, categories, and tags
+- Retry failed rows
+- Export a WordPress WXR XML file
+
+## Recommended Usage
+
+Run the root application:
+
+```bash
+streamlit run ../app.py
+```
+
+Or run the combined Docker container from the repository root.
+
+## Standalone Usage
+
+If you need to run this importer by itself:
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+streamlit run app.py
+```
+
+On Windows PowerShell:
+
+```powershell
+python -m venv .venv
+.venv\Scripts\Activate.ps1
+pip install -r requirements.txt
+streamlit run app.py
+```
+
+## CSV Input
+
+The app accepts CSV files with any columns. You choose:
+
+- the URL column to scrape
+- an optional title or name column to override the scraped title
+- an optional post type column with values like `post` or `page`
+- an optional category column whose values are appended during export
+
+You can also add manual categories in the sidebar to append them to every exported item.
+
+## Notes
+
+- Exported posts default to `draft` unless changed in the UI
+- Image and link URLs remain pointed at the source site
+- Some themes need heuristic fallback. The `Force heuristic scraping` option skips JSON-LD-first extraction and relies on page structure
+- In the combined app, dependencies come from the root `requirements.txt`
@@ -0,0 +1,475 @@
+from __future__ import annotations
+
+import csv
+import datetime as dt
+import io
+import re
+from dataclasses import replace
+
+import streamlit as st
+
+from page_importer.dates import parse_datetime
+from page_importer.models import ScrapeOptions, ScrapedPost
+from page_importer.scraper import Scraper
+from page_importer.wxr import build_wxr
+
+def load_csv(file_data: bytes) -> tuple[list[str], list[dict[str, str]]]:
+    """Decode uploaded CSV bytes and return ``(header names, data rows)``.
+
+    The bytes are decoded as UTF-8 (a leading BOM is dropped); bytes that
+    cannot be decoded are replaced instead of raising. Every data row is a
+    dict keyed by header name. A file without a header yields ``[]`` headers.
+    """
+    decoded = file_data.decode("utf-8-sig", errors="replace")
+    dict_reader = csv.DictReader(io.StringIO(decoded))
+    records: list[dict[str, str]] = []
+    records.extend(dict_reader)
+    header_names = dict_reader.fieldnames or []
+    return header_names, records
+
+
+def render_app() -> None:
+    """Render the Page Importer UI: upload, column mapping, scrape, preview, export.
+
+    Scrape results, the rows they were scraped from, and the scrape context are
+    kept in ``st.session_state`` so they survive Streamlit reruns. Retry and
+    export both read the stored rows, so row numbers in the results always
+    refer to the CSV that was actually scraped.
+    """
+    st.title("Page Importer")
+    st.caption("Scrape blog posts from CSV URLs and export a WordPress WXR file.")
+
+    with st.sidebar:
+        st.header("Options")
+        include_author = st.checkbox("Include author", value=True)
+        include_categories = st.checkbox("Include categories", value=True)
+        include_tags = st.checkbox("Include tags", value=True)
+        force_heuristics = st.checkbox("Force heuristic scraping", value=False)
+        test_run = st.checkbox(
+            "Test run only",
+            value=False,
+            help="Scrape only the first 10 rows that contain a URL.",
+        )
+        post_type_mode = st.selectbox(
+            "WordPress post type mode",
+            ["Single type for all rows", "Use a CSV column"],
+            index=0,
+        )
+        default_post_type = st.selectbox("Default WordPress post type", ["post", "page"], index=0)
+
+    uploaded = st.file_uploader("Upload CSV", type=["csv"])
+    if not uploaded:
+        st.info("Upload a CSV to begin.")
+        return
+
+    headers, rows = load_csv(uploaded.getvalue())
+    if not rows:
+        st.error("The CSV did not contain any rows.")
+        return
+
+    # Column pickers pre-select a likely header when one is present.
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        url_column = st.selectbox("URL column", headers, index=_safe_index(headers, ["url", "link"]))
+    with col2:
+        title_column = st.selectbox(
+            "Optional title override column",
+            ["(none)", *headers],
+            index=_safe_index(["(none)", *headers], ["name", "title"]),
+        )
+    with col3:
+        post_type_column = st.selectbox(
+            "Optional post type column",
+            ["(none)", *headers],
+            index=_safe_index(["(none)", *headers], ["post_type", "type"]),
+            disabled=post_type_mode != "Use a CSV column",
+        )
+    st.write(f"Loaded {len(rows)} row(s). Only the selected URL column will be scraped.")
+    if test_run:
+        st.caption("Test run is enabled. Only the first 10 rows with a URL will be scraped.")
+
+    if st.button("Scrape URLs", type="primary"):
+        context = build_scrape_context(
+            include_author=include_author,
+            include_categories=include_categories,
+            include_tags=include_tags,
+            force_heuristics=force_heuristics,
+            test_run=test_run,
+            post_type_mode=post_type_mode,
+            post_type_column=post_type_column,
+            default_post_type=default_post_type,
+            url_column=url_column,
+            title_column=title_column,
+        )
+        results = scrape_rows(rows, context, phase_label="Scraping")
+        # Persist everything needed to retry or export on later reruns.
+        st.session_state["results"] = results
+        st.session_state["input_rows"] = rows
+        st.session_state["scrape_context"] = context
+
+    results = st.session_state.get("results", [])
+    if not results:
+        return
+
+    successful = [post for post in results if post.success]
+    failed = [post for post in results if not post.success]
+
+    st.subheader("Results")
+    st.write(f"Successful: {len(successful)} | Failed: {len(failed)}")
+
+    if failed and st.button("Retry failed items"):
+        stored_rows = st.session_state.get("input_rows", rows)
+        context = st.session_state.get("scrape_context")
+        if context:
+            retried = scrape_rows(
+                stored_rows,
+                context,
+                row_numbers=[post.row_number for post in failed if post.row_number],
+                phase_label="Retrying",
+            )
+            results = merge_retry_results(results, retried)
+            st.session_state["results"] = results
+            successful = [post for post in results if post.success]
+            failed = [post for post in results if not post.success]
+
+    preview_rows = []
+    for post in results:
+        preview_rows.append(
+            {
+                "Row": post.row_number,
+                "URL": post.source_url,
+                "CMS": post.cms,
+                "Success": post.success,
+                "Title": post.title,
+                "Publish Date": post.publish_date,
+                "Author": post.author,
+                "Categories": ", ".join(post.categories),
+                "Tags": ", ".join(post.tags),
+                "Post Type": post.post_type,
+                "Error": post.error,
+            }
+        )
+    st.dataframe(
+        preview_rows,
+        width="stretch",
+        hide_index=True,
+        column_config={
+            "Row": st.column_config.NumberColumn(width="small"),
+            "URL": st.column_config.TextColumn(width="medium"),
+            "Title": st.column_config.TextColumn(width="medium"),
+            "Publish Date": st.column_config.TextColumn(width="medium"),
+            "Categories": st.column_config.TextColumn(width="medium"),
+            "Tags": st.column_config.TextColumn(width="medium"),
+            "Error": st.column_config.TextColumn(width="large"),
+        },
+    )
+
+    if failed:
+        selected_failed = st.selectbox(
+            "Failed row details",
+            failed,
+            format_func=lambda post: f"Row {post.row_number}: {post.source_url or '(missing URL)'}",
+        )
+        st.text_area(
+            "Error details",
+            value=selected_failed.error_details or selected_failed.error,
+            height=180,
+            disabled=True,
+        )
+
+    if successful:
+        selected_index = st.number_input(
+            "Preview successful row",
+            min_value=1,
+            max_value=len(successful),
+            value=1,
+            step=1,
+        )
+        selected = successful[selected_index - 1]
+        st.markdown("### Content Preview")
+        st.write(f"**Title:** {selected.title}")
+        st.write(f"**Source URL:** {selected.source_url}")
+        st.write(f"**Publish Date:** {selected.publish_date or '(missing)'}")
+        st.write(f"**Author:** {selected.author or '(missing)'}")
+        st.write(f"**Post Type:** {selected.post_type}")
+        st.write(selected.body_html, unsafe_allow_html=True)
+        # Export resolves per-row categories by row number, so it must use the
+        # rows that were scraped, not whatever file is currently uploaded.
+        export_rows = st.session_state.get("input_rows", rows)
+        render_export_sidebar(successful, export_rows, headers)
+
+
+def build_scrape_context(
+    *,
+    include_author: bool,
+    include_categories: bool,
+    include_tags: bool,
+    force_heuristics: bool,
+    test_run: bool,
+    post_type_mode: str,
+    post_type_column: str,
+    default_post_type: str,
+    url_column: str,
+    title_column: str,
+) -> dict[str, object]:
+    """Bundle the UI selections into the context dict used by ``scrape_rows``.
+
+    The extraction toggles are wrapped in a ``ScrapeOptions`` under the
+    ``"options"`` key; the remaining selections are stored under keys that
+    match their parameter names.
+    """
+    scrape_options = ScrapeOptions(
+        include_author=include_author,
+        include_categories=include_categories,
+        include_tags=include_tags,
+        force_heuristics=force_heuristics,
+    )
+    context: dict[str, object] = {"options": scrape_options}
+    context.update(
+        test_run=test_run,
+        post_type_mode=post_type_mode,
+        post_type_column=post_type_column,
+        default_post_type=default_post_type,
+        url_column=url_column,
+        title_column=title_column,
+    )
+    return context
+
+
+def scrape_rows(
+    rows: list[dict[str, str]],
+    context: dict[str, object],
+    row_numbers: list[int] | None = None,
+    phase_label: str = "Scraping",
+) -> list[ScrapedPost]:
+    """Scrape the selected rows and return one ``ScrapedPost`` per row.
+
+    Args:
+        rows: All CSV rows; row numbers are 1-based positions in this list.
+        context: Dict produced by ``build_scrape_context``.
+        row_numbers: When given, only these row numbers are scraped (used for
+            retries). Otherwise, if the context's ``test_run`` flag is set,
+            only the first 10 rows containing a URL are scraped.
+        phase_label: Prefix for the progress/status messages.
+
+    Raises:
+        TypeError: If ``context["options"]`` is not a ``ScrapeOptions``.
+    """
+    options = context["options"]
+    if not isinstance(options, ScrapeOptions):
+        raise TypeError("Invalid scrape options in session state.")
+
+    # Resolve the column key once so the test-run filter and the main loop
+    # always look up the URL the same way.
+    url_column = str(context["url_column"])
+
+    def row_url(row: dict[str, str]) -> str:
+        return (row.get(url_column) or "").strip()
+
+    scraper = Scraper(options)
+    targets = list(enumerate(rows, start=1))
+    if row_numbers is not None:
+        requested_rows = set(row_numbers)
+        targets = [(row_number, row) for row_number, row in targets if row_number in requested_rows]
+    elif bool(context.get("test_run")):
+        targets = [(row_number, row) for row_number, row in targets if row_url(row)][:10]
+
+    results: list[ScrapedPost] = []
+    progress = st.progress(0.0)
+    status = st.empty()
+
+    # Guard against division by zero when nothing was selected.
+    total = len(targets) or 1
+    for index, (row_number, row) in enumerate(targets, start=1):
+        url = row_url(row)
+        status.write(f"{phase_label} {index}/{len(targets)}: {url or f'row {row_number} has no URL'}")
+
+        if url:
+            post = scraper.scrape(url)
+        else:
+            # Rows without a URL are reported as failures instead of being skipped.
+            post = ScrapedPost(
+                source_url="",
+                row_number=row_number,
+                error="Missing URL in the selected URL column.",
+                error_details=f"Row {row_number} does not contain a URL in column '{url_column}'.",
+            )
+
+        post.row_number = row_number
+        apply_row_overrides(post, row, context)
+        results.append(post)
+        progress.progress(index / total)
+
+    status.write(f"{phase_label} complete.")
+    return results
+
+
+def apply_row_overrides(post: ScrapedPost, row: dict[str, str], context: dict[str, object]) -> None:
+    """Apply per-row CSV overrides (title, post type) to ``post`` in place.
+
+    The title is replaced only when a title column is selected and the row's
+    cell contains non-whitespace text. The post type is always re-resolved
+    from the context's mode, column and default.
+    """
+    title_column = context["title_column"]
+    if isinstance(title_column, str) and title_column != "(none)":
+        # Strip before testing so a blank-but-padded cell cannot wipe the
+        # scraped title.
+        override_title = (row.get(title_column) or "").strip()
+        if override_title:
+            post.title = override_title
+
+    post.post_type = resolve_post_type(
+        row=row,
+        mode=str(context["post_type_mode"]),
+        column=str(context["post_type_column"]),
+        default_value=str(context["default_post_type"]),
+    )
+
+
+def resolve_export_categories(
+    row: dict[str, str],
+    category_column: str,
+    manual_categories: list[str],
+) -> list[str]:
+    """Return the row's CSV categories followed by the manual ones, de-duplicated.
+
+    No CSV categories are read when ``category_column`` is ``"(none)"``.
+    """
+    if category_column == "(none)":
+        from_csv: list[str] = []
+    else:
+        from_csv = parse_terms(row.get(category_column, ""))
+    return merge_unique_terms(from_csv, manual_categories)
+
+
+def parse_terms(value: str) -> list[str]:
+    """Split ``value`` on ``,``, ``|`` or ``>`` and return the non-empty, trimmed parts."""
+    pieces = re.split(r"[,|>]", value or "")
+    trimmed = (piece.strip() for piece in pieces)
+    return [piece for piece in trimmed if piece]
+
+
+def merge_unique_terms(*groups: list[str]) -> list[str]:
+    """Concatenate ``groups`` into one list of trimmed, non-empty, unique terms.
+
+    First-seen order is preserved and comparison is case-sensitive.
+    """
+    merged: list[str] = []
+    # Track membership in a set so de-duplication stays linear in the number
+    # of terms instead of rescanning the output list for each one.
+    seen: set[str] = set()
+    for group in groups:
+        for term in group:
+            cleaned = term.strip()
+            if cleaned and cleaned not in seen:
+                seen.add(cleaned)
+                merged.append(cleaned)
+    return merged
+
+
+def merge_retry_results(existing: list[ScrapedPost], replacements: list[ScrapedPost]) -> list[ScrapedPost]:
+    """Swap retried posts into ``existing`` by row number and sort by row number.
+
+    Posts without a replacement are kept as-is; a missing row number sorts as 0.
+    """
+    by_row = {}
+    for retried in replacements:
+        by_row[retried.row_number] = retried
+
+    combined = [by_row.get(current.row_number, current) for current in existing]
+    combined.sort(key=lambda post: post.row_number or 0)
+    return combined
+
+
+def build_export_posts(
+    posts: list[ScrapedPost],
+    rows: list[dict[str, str]],
+    category_column: str,
+    manual_categories: list[str],
+    post_status: str,
+    custom_post_type_slug: str,
+) -> list[ScrapedPost]:
+    """Return export-ready copies of ``posts``; the originals are not modified.
+
+    Each copy gets ``post_status``, the custom post type slug when one is set
+    (otherwise its own post type), and its scraped categories extended with
+    the row's CSV categories and the manual categories.
+    """
+
+    def source_row(post: ScrapedPost) -> dict[str, str]:
+        # Row numbers are 1-based; anything out of range maps to an empty row.
+        position = post.row_number
+        if 0 < position <= len(rows):
+            return rows[position - 1]
+        return {}
+
+    return [
+        replace(
+            post,
+            status=post_status,
+            post_type=custom_post_type_slug or post.post_type,
+            categories=merge_unique_terms(
+                post.categories,
+                resolve_export_categories(source_row(post), category_column, manual_categories),
+            ),
+        )
+        for post in posts
+    ]
+
+
+def render_export_sidebar(
+    successful: list[ScrapedPost],
+    rows: list[dict[str, str]],
+    headers: list[str],
+) -> None:
+    """Render the sidebar export controls and the WXR download button.
+
+    Args:
+        successful: Successfully scraped posts to export.
+        rows: CSV rows used to look up per-row categories by row number.
+        headers: CSV header names offered as category column choices.
+    """
+    with st.sidebar:
+        st.markdown("---")
+        st.subheader("Export")
+        post_status = st.selectbox(
+            "Imported post status",
+            ["draft", "publish", "private"],
+            index=0,
+            key="export_post_status",
+        )
+        category_column = st.selectbox(
+            "CSV category column",
+            ["(none)", *headers],
+            index=_safe_index(["(none)", *headers], ["category", "categories", "department"]),
+            key="export_category_column",
+        )
+        manual_categories = parse_terms(
+            st.text_input(
+                "Additional export categories",
+                value="",
+                help="Comma-separated categories to append to every exported item.",
+                key="export_manual_categories",
+            )
+        )
+        output_name = st.text_input(
+            "Output filename",
+            value="wordpress-import.xml",
+            key="export_output_name",
+        )
+        custom_post_type_slug = normalize_post_type_slug(
+            st.text_input(
+                "Custom post type slug",
+                value="",
+                help="Optional. If set, all exported items will use this WordPress post type slug.",
+                key="export_custom_post_type_slug",
+            )
+        )
+
+        export_posts = build_export_posts(
+            successful,
+            rows,
+            category_column,
+            manual_categories,
+            post_status,
+            custom_post_type_slug,
+        )
+        if custom_post_type_slug:
+            st.caption(f"Exporting all items as post type `{custom_post_type_slug}`.")
+        # Pair each post with its parsed publish date; posts whose date cannot
+        # be parsed are left out of this list.
+        dated_export_posts = [(post, publish_date) for post in export_posts if (publish_date := parse_publish_date(post.publish_date))]
+
+        if dated_export_posts:
+            # The date pickers are bounded by the earliest and latest parsed dates.
+            min_date = min(publish_date for _, publish_date in dated_export_posts)
+            max_date = max(publish_date for _, publish_date in dated_export_posts)
+            filter_by_publish_date = st.checkbox(
+                "Filter export by publish date",
+                value=False,
+                key="export_filter_by_publish_date",
+            )
+
+            if filter_by_publish_date:
+                export_start = st.date_input(
+                    "Export start date",
+                    value=min_date,
+                    min_value=min_date,
+                    max_value=max_date,
+                    format="MM/DD/YYYY",
+                    key="export_start_date",
+                )
+                export_end = st.date_input(
+                    "Export end date",
+                    value=max_date,
+                    min_value=min_date,
+                    max_value=max_date,
+                    format="MM/DD/YYYY",
+                    key="export_end_date",
+                )
+
+                if export_start > export_end:
+                    # An inverted range exports nothing; the download button
+                    # below is disabled when the list is empty.
+                    st.error("Export start date must be on or before the end date.")
+                    export_posts = []
+                else:
+                    # Keep only posts with a parseable date inside the
+                    # inclusive range; undated posts are dropped here.
+                    export_posts = [
+                        post
+                        for post in export_posts
+                        if (publish_date := parse_publish_date(post.publish_date)) and export_start <= publish_date <= export_end
+                    ]
+                    st.caption(
+                        "Date filter: "
+                        f"{export_start.strftime('%m/%d/%Y')} to {export_end.strftime('%m/%d/%Y')}."
+                    )
+                    undated_count = len(successful) - len(dated_export_posts)
+                    if undated_count:
+                        st.caption(f"Excluded {undated_count} successful item(s) with no publish date.")
+        else:
+            st.caption("No successful items have a publish date, so export date filtering is unavailable.")
+
+        st.caption(f"Ready to export {len(export_posts)} post(s).")
+        # The XML is built on every rerun, including when the list is empty.
+        xml_data = build_wxr(export_posts)
+        st.download_button(
+            label="Download WXR XML",
+            data=xml_data,
+            file_name=output_name,
+            mime="application/xml",
+            disabled=not export_posts,
+        )
+
+
+def parse_publish_date(value: str) -> dt.date | None:
+    """Return the calendar date parsed from ``value``, or ``None`` if it cannot be parsed."""
+    moment = parse_datetime(value)
+    return None if moment is None else moment.date()
+
+
+def _safe_index(values: list[str], candidates: list[str]) -> int:
+    """Return the index in ``values`` of the first matching candidate, else 0.
+
+    Candidates are tried in order, so earlier candidates take priority.
+    Matching ignores case and surrounding whitespace on both sides. When
+    several values normalize to the same name, the first one wins.
+    """
+    first_index: dict[str, int] = {}
+    for idx, value in enumerate(values):
+        # setdefault keeps the earliest position for duplicate names.
+        first_index.setdefault(value.strip().lower(), idx)
+
+    for candidate in candidates:
+        key = candidate.strip().lower()
+        if key in first_index:
+            return first_index[key]
+    return 0
+
+
+def resolve_post_type(
+    row: dict[str, str],
+    mode: str,
+    column: str,
+    default_value: str,
+) -> str:
+    """Pick the post type for a row.
+
+    The CSV cell is used only in ``"Use a CSV column"`` mode with a real
+    column selected, and only if it normalizes to a non-empty slug. In every
+    other case ``default_value`` is returned.
+    """
+    reads_from_csv = mode == "Use a CSV column" and column != "(none)"
+    if not reads_from_csv:
+        return default_value
+
+    slug = normalize_post_type_slug(row.get(column) or "")
+    return slug if slug else default_value
+
+
+def normalize_post_type_slug(value: str) -> str:
+    """Trim and lowercase ``value``, then drop every character outside ``a-z0-9_-``."""
+    lowered = (value or "").strip().lower()
+    return re.sub(r"[^a-z0-9_-]", "", lowered)
+
+
+# Standalone entry point: configure the Streamlit page, then render the importer.
+if __name__ == "__main__":
+    st.set_page_config(page_title="Page Importer", layout="wide")
+    render_app()
@@ -0,0 +1 @@
+
@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+import datetime as dt
+
+from dateutil import parser as date_parser
+
+
+def parse_datetime(value: str | None) -> dt.datetime | None:
+    """Parse ``value`` into a datetime, returning ``None`` when it cannot be parsed.
+
+    A strict parse is tried first; if that fails, a fuzzy parse is tried,
+    which lets the parser skip text it does not recognize.
+    """
+    if not value:
+        return None
+    for fuzzy in (False, True):
+        try:
+            return date_parser.parse(value, fuzzy=fuzzy)
+        except (TypeError, ValueError, OverflowError):
+            continue
+    return None
+
+
+def normalize_date(value: str | None) -> str:
+    """Return ``value`` as a normalized date string, or ``""`` if unparseable.
+
+    Naive datetimes are formatted as ``YYYY-MM-DD HH:MM:SS``. Timezone-aware
+    datetimes use ISO format with a space separator, which keeps the UTC offset.
+    """
+    moment = parse_datetime(value)
+    if moment is None:
+        return ""
+    has_offset = moment.tzinfo is not None and moment.utcoffset() is not None
+    if has_offset:
+        return moment.isoformat(sep=" ", timespec="seconds")
+    return moment.strftime("%Y-%m-%d %H:%M:%S")
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class ScrapeOptions:
+    """Settings that control how a page is fetched and which fields are extracted."""
+
+    # Toggles for the optional metadata fields.
+    include_author: bool = True
+    include_categories: bool = True
+    include_tags: bool = True
+    # When True, JSON-LD article data is ignored in favour of page-structure heuristics.
+    force_heuristics: bool = False
+    # Passed as the HTTP request timeout (presumably seconds — confirm against requests).
+    request_timeout: int = 20
+    # Sent as the User-Agent header on every request.
+    user_agent: str = (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0 Safari/537.36"
+    )
+
+
+@dataclass
+class ScrapedPost:
+    """Result of scraping one URL: extracted content plus success/error state."""
+
+    source_url: str
+    # 1-based position of the source row in the CSV; 0 means not assigned.
+    row_number: int = 0
+    cms: str = "unknown"
+    title: str = ""
+    # Normalized date string; empty when no date was found.
+    publish_date: str = ""
+    author: str = ""
+    body_html: str = ""
+    categories: list[str] = field(default_factory=list)
+    tags: list[str] = field(default_factory=list)
+    status: str = "draft"
+    post_type: str = "post"
+    # True only when scraping completed with the required fields present.
+    success: bool = False
+    # Short error summary and a longer diagnostic, both empty on success.
+    error: str = ""
+    error_details: str = ""
@@ -0,0 +1,555 @@
+from __future__ import annotations
+
+import json
+import re
+import traceback
+from html import unescape
+from typing import Iterable
+
+import requests
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString, Tag
+
+from page_importer.dates import normalize_date
+from page_importer.models import ScrapeOptions, ScrapedPost
+
+# Lower-cased JSON-LD ``@type`` values that are treated as article payloads.
+JSON_ARTICLE_TYPES = {
+    "article",
+    "blogposting",
+    "newsarticle",
+    "report",
+    "webpage",
+}
+
+# CSS selectors tried in order when locating the post body; more specific
+# selectors come first, with broad containers as the last resort.
+BODY_SELECTORS = [
+    "article .entry-content",
+    "article .post-content",
+    "article .node__content",
+    "article .node .content",
+    "article .node-content",
+    "article .field-name-body .field-item",
+    "article .field-name-body",
+    "article .field--name-body",
+    "article .article-body",
+    "article .content",
+    ".post-content",
+    ".entry-content",
+    ".node__content",
+    ".node .content",
+    ".node-content",
+    ".field-name-body .field-item",
+    ".field-name-body",
+    ".field--name-body",
+    ".article-body",
+    "#content-area .node .content",
+    "article",
+    "main article",
+    "main",
+]
+
+# Selectors whose link text is collected as category names.
+CATEGORY_SELECTORS = [
+    ".cat-links a",
+    ".post-categories a",
+    ".field--name-field-category a",
+    ".tags a[rel='category tag']",
+    ".terms a",
+    ".taxonomy a",
+]
+
+# Selectors whose link text is collected as tag names.
+# NOTE(review): ".terms a" also appears in CATEGORY_SELECTORS, so the same
+# links can be reported as both categories and tags — confirm this is intended.
+TAG_SELECTORS = [
+    ".tags-links a",
+    ".post-tags a",
+    ".field--name-field-tags a",
+    "a[rel='tag']",
+    ".terms a",
+]
+
+# Selectors tried in order for the author name; the first match wins.
+AUTHOR_SELECTORS = [
+    "[rel='author']",
+    ".author a",
+    ".byline a",
+    ".submitted a",
+    ".node__submitted a",
+    ".node-info a",
+    ".createdby",
+]
+
+# Selectors tried in order for the publish date; the first parseable value wins.
+DATE_SELECTORS = [
+    "time[datetime]",
+    "meta[property='article:published_time']",
+    "meta[name='publish_date']",
+    "meta[name='pubdate']",
+    ".date-display-single",
+    ".submitted",
+    ".node-info",
+]
+
+# Matches dates written as "<Weekday>, <Month> <day>, <year>",
+# e.g. "Monday, January 5, 2015".
+DRUPAL_TITLE_DATE_PATTERN = re.compile(
+    r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+"
+    r"([A-Za-z]+)\s+\d{1,2},\s+\d{4}"
+)
+
+
+class Scraper:
+    """Fetch a URL and extract a ``ScrapedPost`` from the returned HTML.
+
+    One HTTP session, configured with the options' User-Agent, is reused for
+    every request made through the instance.
+    """
+
+    def __init__(self, options: ScrapeOptions) -> None:
+        self.options = options
+        self.session = requests.Session()
+        self.session.headers.update({"User-Agent": options.user_agent})
+
+    def scrape(self, url: str) -> ScrapedPost:
+        """Scrape ``url`` and return the result; this method never raises.
+
+        JSON-LD article data is applied first (unless heuristics are forced)
+        and any missing fields are then filled from the page structure. A
+        post counts as successful only when both a title and body HTML were
+        found. On any failure the returned post has ``success`` False and
+        carries ``error`` / ``error_details``.
+        """
+        post = ScrapedPost(source_url=url)
+        response: requests.Response | None = None
+        try:
+            response = self.session.get(url, timeout=self.options.request_timeout)
+            response.raise_for_status()
+            # Without a declared charset, requests decodes text/* responses as
+            # ISO-8859-1, which garbles UTF-8 pages. Use the encoding detected
+            # from the body in that case.
+            if "charset" not in response.headers.get("Content-Type", "").lower():
+                response.encoding = response.apparent_encoding
+            soup = BeautifulSoup(response.text, "html.parser")
+            post.cms = detect_cms(soup)
+
+            article_data = extract_article_json_ld(soup)
+            if article_data and not self.options.force_heuristics:
+                apply_article_data(post, article_data, soup, self.options)
+
+            merge_fallback_data(post, soup, self.options)
+            post.body_html = sanitize_html(post.body_html)
+
+            missing_fields = [field for field, value in {"title": post.title, "body_html": post.body_html}.items() if not value]
+            if missing_fields:
+                raise ValueError(
+                    "Unable to extract required field(s): "
+                    f"{', '.join(missing_fields)}. "
+                    f"Detected CMS: {post.cms}. "
+                    f"Publish date found: {'yes' if post.publish_date else 'no'}. "
+                    f"Author found: {'yes' if post.author else 'no'}."
+                )
+
+            post.success = True
+            return post
+        except Exception as exc:
+            # Deliberate catch-all: one bad page must not abort the batch, so
+            # every failure is recorded on the post instead of propagating.
+            post.error = format_error_summary(url, exc, response, self.options.request_timeout)
+            post.error_details = format_error_details(url, exc, response)
+            return post
+
+
+def detect_cms(soup: BeautifulSoup) -> str:
+    """Guess the CMS behind a page: ``wordpress``, ``drupal``, ``joomla`` or ``unknown``.
+
+    The ``<meta name="generator">`` tag is checked first. If it is absent or
+    names none of the known systems, characteristic markers are searched for
+    in the lower-cased page markup.
+    """
+    generator = meta_content(soup, "meta", {"name": "generator"})
+    if generator:
+        g = generator.lower()
+        for cms in ("wordpress", "drupal", "joomla"):
+            if cms in g:
+                return cms
+
+    # Serialising the whole document is the expensive step, so it is done
+    # only when the generator tag did not settle the answer.
+    html = str(soup).lower()
+    if "/wp-content/" in html:
+        return "wordpress"
+    if "drupal-settings-json" in html or "sites/default/files" in html:
+        return "drupal"
+    if "com_content" in html or "joomla" in html:
+        return "joomla"
+    return "unknown"
+
+
+def extract_article_json_ld(soup: BeautifulSoup) -> dict | None:
+    """Return the first article-like JSON-LD object found in the page, or ``None``.
+
+    ``application/ld+json`` script tags are scanned in document order; empty
+    scripts and scripts that fail to parse are skipped.
+    """
+    for script in soup.select("script[type='application/ld+json']"):
+        raw = script.string or script.get_text(" ", strip=True)
+        if not raw:
+            continue
+        match = next(
+            (
+                found
+                for payload in parse_json_candidates(raw)
+                if (found := find_article_payload(payload))
+            ),
+            None,
+        )
+        if match:
+            return match
+    return None
+
+
+def parse_json_candidates(raw: str) -> Iterable[dict | list]:
+    """Yield at most one decoded JSON value from ``raw``.
+
+    The text is decoded as-is first. If that fails, runs of ASCII control
+    characters are replaced by a single space and decoding is retried.
+    Nothing is yielded when both attempts fail.
+    """
+    attempts = (
+        lambda: raw,
+        lambda: re.sub(r"[\x00-\x1f]+", " ", raw).strip(),
+    )
+    for make_text in attempts:
+        try:
+            decoded = json.loads(make_text())
+        except json.JSONDecodeError:
+            continue
+        yield decoded
+        return
+
+
+def find_article_payload(payload: dict | list) -> dict | None:
+    """Search a decoded JSON-LD value for the first article-like node.
+
+    Lists are searched element by element. For a dict, a nested ``@graph`` is
+    searched before the dict's own ``@type`` is considered. A node matches
+    when any of its ``@type`` values, lower-cased, is in ``JSON_ARTICLE_TYPES``.
+    Returns the matching dict, or ``None``.
+    """
+    if isinstance(payload, list):
+        for item in payload:
+            found = find_article_payload(item)
+            if found:
+                return found
+        return None
+    if not isinstance(payload, dict):
+        return None
+    if "@graph" in payload:
+        found = find_article_payload(payload["@graph"])
+        if found:
+            return found
+
+    node_type = payload.get("@type")
+    if isinstance(node_type, str):
+        declared = [node_type]
+    elif isinstance(node_type, (list, tuple)):
+        declared = node_type
+    else:
+        # Malformed markup (e.g. a numeric @type) is treated as "no type"
+        # rather than raising and failing the whole scrape.
+        declared = []
+    types = {item.lower() for item in declared if isinstance(item, str)}
+    if types & JSON_ARTICLE_TYPES:
+        return payload
+    return None
+
+
+def apply_article_data(
+    post: ScrapedPost,
+    article: dict,
+    soup: BeautifulSoup,
+    options: ScrapeOptions,
+) -> None:
+    """Copy fields from a JSON-LD article node onto ``post`` in place.
+
+    Values already on ``post`` are kept whenever the article has nothing for
+    that field. Author, categories and tags are touched only when the
+    matching option is enabled.
+    """
+    headline = article.get("headline") or article.get("name")
+    post.title = headline or post.title
+
+    raw_date = article.get("datePublished") or article.get("dateCreated") or post.publish_date
+    post.publish_date = normalize_date(raw_date)
+
+    if options.include_author:
+        post.author = extract_author_from_json_ld(article) or post.author
+    if options.include_categories:
+        post.categories = normalize_terms(article.get("articleSection")) or post.categories
+    if options.include_tags:
+        post.tags = normalize_terms(article.get("keywords")) or post.tags
+
+    body = extract_body_from_article(article, soup)
+    post.body_html = body or post.body_html
+
+
+def merge_fallback_data(post: ScrapedPost, soup: BeautifulSoup, options: ScrapeOptions) -> None:
+    """Fill fields still missing on ``post`` using page-structure heuristics.
+
+    Title, date, author, body and tags are extracted only when empty.
+    Categories are always merged with those found in the page, plus the
+    department categories for pages detected as Drupal.
+    """
+    post.title = post.title or extract_title(soup)
+    post.publish_date = post.publish_date or extract_date(soup, post.cms)
+    if options.include_author:
+        post.author = post.author or extract_author(soup)
+    post.body_html = post.body_html or extract_body(soup)
+
+    if options.include_categories:
+        post.categories = merge_terms(post.categories, extract_terms(soup, CATEGORY_SELECTORS))
+        if post.cms == "drupal":
+            post.categories = merge_terms(post.categories, extract_drupal_department_categories(soup))
+
+    if options.include_tags:
+        post.tags = post.tags or extract_terms(soup, TAG_SELECTORS)
+
+
+def extract_title(soup: BeautifulSoup) -> str:
+    """Return the page title, or ``""`` when none is found.
+
+    Sources are tried in order: the ``og:title`` meta tag, a series of
+    heading selectors, then the document ``<title>``. Every source is passed
+    through ``clean_text`` so the result is normalized the same way whichever
+    one supplied it.
+    """
+    og_title = meta_content(soup, "meta", {"property": "og:title"})
+    if og_title:
+        # Cleaned like the heading and <title> paths below.
+        return clean_text(og_title)
+    for selector in ("article h1", "h1.entry-title", "h1.page-title", "h1.title", "h1"):
+        node = soup.select_one(selector)
+        if node:
+            return clean_text(node.get_text(" ", strip=True))
+    return clean_text(soup.title.get_text(" ", strip=True)) if soup.title else ""
+
+
+def extract_date(soup: BeautifulSoup, cms: str = "unknown") -> str:
+    """Return the first parseable publish date found in the page, normalized.
+
+    For each date selector the first matching node is read, preferring its
+    ``datetime`` attribute, then ``content``, then its text. Drupal pages get
+    one extra fallback that looks next to the title. Returns ``""`` when
+    nothing parseable is found.
+    """
+    for selector in DATE_SELECTORS:
+        node = soup.select_one(selector)
+        if node:
+            raw = node.get("datetime") or node.get("content") or node.get_text(" ", strip=True)
+            found = normalize_date(raw)
+            if found:
+                return found
+    return extract_drupal_title_adjacent_date(soup) if cms == "drupal" else ""
+
+
+def extract_author(soup: BeautifulSoup) -> str:
+    """Return the author name from the ``author`` meta tag or the first matching selector.
+
+    Returns ``""`` when neither source yields a node.
+    """
+    meta_author = meta_content(soup, "meta", {"name": "author"})
+    if meta_author:
+        return clean_text(meta_author)
+
+    matches = (soup.select_one(selector) for selector in AUTHOR_SELECTORS)
+    first = next((node for node in matches if node), None)
+    return clean_text(first.get_text(" ", strip=True)) if first else ""
+
+
+def extract_body(soup: BeautifulSoup) -> str:
+    """Return the inner HTML of the best body container.
+
+    Selectors in ``BODY_SELECTORS`` are tried in order. The first container
+    whose visible text is at least 120 characters (after unwanted elements are
+    stripped from a copy) wins. If none is that long, the first container with
+    any meaningful content is returned, or ``""``.
+    """
+    short_candidate = ""
+    for selector in BODY_SELECTORS:
+        match = soup.select_one(selector)
+        if match:
+            # Work on a copy so stripping does not mutate the caller's soup.
+            working_copy = clone_tag(match)
+            strip_unwanted(working_copy)
+            inner_html = working_copy.decode_contents().strip()
+            visible_text = BeautifulSoup(inner_html, "html.parser").get_text(" ", strip=True)
+            if len(visible_text) >= 120:
+                return inner_html
+            if not short_candidate and has_meaningful_body_content(inner_html):
+                short_candidate = inner_html
+    return short_candidate
+
+
+def extract_terms(soup: BeautifulSoup, selectors: list[str]) -> list[str]:
+    """Collect the cleaned text of every node matched by ``selectors``,
+    skipping blanks and duplicates while keeping first-seen order."""
+    collected: list[str] = []
+    for selector in selectors:
+        labels = (clean_text(node.get_text(" ", strip=True)) for node in soup.select(selector))
+        for label in labels:
+            if not label or label in collected:
+                continue
+            collected.append(label)
+    return collected
+
+
+def extract_drupal_title_adjacent_date(soup: BeautifulSoup) -> str:
+    """Look for a date printed right after the page title.
+
+    The title node's following siblings are checked first. If none carries a
+    date, the text of the nearest ``header``/``div``/``section`` ancestor is
+    checked with the leading title text removed. Returns ``""`` when no date
+    is found or the page has no title node.
+    """
+    heading = find_title_node(soup)
+    if not heading:
+        return ""
+
+    sibling_dates = (normalize_drupal_date(text_from_node(sibling)) for sibling in heading.next_siblings)
+    from_sibling = next((value for value in sibling_dates if value), "")
+    if from_sibling:
+        return from_sibling
+
+    container = heading.find_parent(["header", "div", "section"])
+    if not container:
+        return ""
+
+    container_text = clean_text(container.get_text(" ", strip=True))
+    heading_text = clean_text(heading.get_text(" ", strip=True))
+    if heading_text and container_text.startswith(heading_text):
+        # Drop the title so digits inside it are not mistaken for the date.
+        container_text = clean_text(container_text[len(heading_text):])
+    return normalize_drupal_date(container_text) or ""
+
+
+def extract_drupal_department_categories(soup: BeautifulSoup) -> list[str]:
+    """Derive categories from "Department:" labels on the page.
+
+    Two passes feed the same de-duplicated list:
+
+    1. Text nodes that are exactly ``Department:``. The value is read from the
+       label's parent element, or failing that from the first following
+       sibling of the parent that yields an acceptable value.
+    2. ``p``/``li``/``span``/``dt``/``dd`` elements whose text starts with
+       ``department:``.
+    """
+    found: list[str] = []
+    exact_label = re.compile(r"^\s*Department:\s*$", re.IGNORECASE)
+
+    for label_text in soup.find_all(string=exact_label):
+        holder = label_text.parent
+        if not isinstance(holder, Tag):
+            continue
+
+        same_element_value = normalize_department_category(
+            extract_labeled_value(holder.get_text(" ", strip=True), "Department")
+        )
+        if same_element_value:
+            found = merge_terms(found, [same_element_value])
+            continue
+
+        following_values = (
+            normalize_department_category(text_from_node(sibling)) for sibling in holder.next_siblings
+        )
+        first_value = next((value for value in following_values if value), "")
+        if first_value:
+            found = merge_terms(found, [first_value])
+
+    for element in soup.find_all(["p", "li", "span", "dt", "dd"]):
+        element_text = clean_text(element.get_text(" ", strip=True))
+        if element_text.lower().startswith("department:"):
+            value = normalize_department_category(extract_labeled_value(element_text, "Department"))
+            if value:
+                found = merge_terms(found, [value])
+
+    return found
+
+
+def extract_author_from_json_ld(article: dict) -> str:
+    """Return the author name(s) from a JSON-LD article object.
+
+    ``author`` may be a string, an object with a ``name``, or a list mixing
+    both. List entries are joined with ``", "``. Plain-string entries inside a
+    list are now honoured instead of being dropped, and a non-string ``name``
+    is ignored instead of raising.
+
+    Returns:
+        The cleaned author string, or ``""`` when no usable name is present.
+    """
+
+    def _name_of(entry: object) -> str:
+        # One author entry -> cleaned display name ("" when unusable).
+        if isinstance(entry, str):
+            return clean_text(entry)
+        if isinstance(entry, dict):
+            name = entry.get("name", "")
+            return clean_text(name) if isinstance(name, str) else ""
+        return ""
+
+    author = article.get("author")
+    if isinstance(author, list):
+        names = [_name_of(item) for item in author]
+        return ", ".join(name for name in names if name)
+    return _name_of(author)
+
+
+def extract_body_from_article(article: dict, soup: BeautifulSoup) -> str:
+    """Use the JSON-LD ``articleBody`` when it is a string longer than 120
+    characters (entity-unescaped and wrapped in ``<p>``); otherwise fall back
+    to selector-based extraction from ``soup``."""
+    raw_body = article.get("articleBody")
+    if not isinstance(raw_body, str):
+        return extract_body(soup)
+
+    trimmed = raw_body.strip()
+    if len(trimmed) <= 120:
+        return extract_body(soup)
+    return f"<p>{unescape(trimmed)}</p>"
+
+
+def normalize_terms(value: object) -> list[str]:
+    """Turn a JSON-LD keyword/category value into a clean list of terms.
+
+    A string is split on ``,``, ``|`` and ``>``; a list contributes its string
+    items. Terms are whitespace-normalized, blanks are dropped, and duplicates
+    are removed (first occurrence wins) for both input shapes. The string
+    branch previously kept duplicates, unlike the list branch.
+
+    Returns:
+        The ordered unique terms, or ``[]`` for any other input type.
+    """
+    if isinstance(value, str):
+        candidates = re.split(r"[,|>]", value)
+    elif isinstance(value, list):
+        candidates = [item for item in value if isinstance(item, str)]
+    else:
+        return []
+
+    result: list[str] = []
+    for candidate in candidates:
+        cleaned = clean_text(candidate)
+        if cleaned and cleaned not in result:
+            result.append(cleaned)
+    return result
+
+
+def merge_terms(*groups: list[str]) -> list[str]:
+    """Concatenate term lists in argument order, cleaning each term and
+    dropping blanks and repeats."""
+    merged: list[str] = []
+    for raw_term in (item for group in groups for item in group):
+        term = clean_text(raw_term)
+        if not term or term in merged:
+            continue
+        merged.append(term)
+    return merged
+
+
+def normalize_drupal_date(value: str | None) -> str:
+    """Normalize the first ``DRUPAL_TITLE_DATE_PATTERN`` match inside
+    ``value``; return ``""`` for empty input or when nothing matches."""
+    found = DRUPAL_TITLE_DATE_PATTERN.search(value) if value else None
+    if found is None:
+        return ""
+    return normalize_date(found.group(0))
+
+
+def meta_content(soup: BeautifulSoup, tag_name: str, attrs: dict[str, str]) -> str:
+    """Return the stripped ``content`` attribute of the first tag matching
+    ``tag_name``/``attrs``, or ``""`` when the tag or attribute is missing."""
+    node = soup.find(tag_name, attrs=attrs)
+    if not node:
+        return ""
+    content = node.get("content")
+    return node["content"].strip() if content else ""
+
+
+def clean_text(value: str) -> str:
+    """Collapse every whitespace run to one space and trim the ends.
+
+    Falsy input (``None`` or ``""``) yields ``""``.
+    """
+    collapsed = re.sub(r"\s+", " ", value if value else "")
+    return collapsed.strip()
+
+
+def text_from_node(node: object) -> str:
+    """Return the cleaned text of a ``NavigableString`` or ``Tag``; any other
+    object yields ``""``."""
+    if isinstance(node, NavigableString):
+        raw_text = str(node)
+    elif isinstance(node, Tag):
+        raw_text = node.get_text(" ", strip=True)
+    else:
+        return ""
+    return clean_text(raw_text)
+
+
+def sanitize_html(html: str) -> str:
+    """Parse ``html``, remove unwanted elements and dangerous attributes, and
+    return the re-serialized markup (``""`` for empty input)."""
+    if not html:
+        return ""
+    document = BeautifulSoup(html, "html.parser")
+    for scrub in (strip_unwanted, strip_dangerous_attributes):
+        scrub(document)
+    return document.decode_contents().strip()
+
+
+def has_meaningful_body_content(html: str) -> bool:
+    """Report whether ``html`` has visible text or at least an image, link,
+    embed or object tag."""
+    if not html:
+        return False
+    if BeautifulSoup(html, "html.parser").get_text(" ", strip=True):
+        return True
+    lowered = html.lower()
+    return any(marker in lowered for marker in ("<img", "<a ", "<embed", "<object"))
+
+
+def strip_unwanted(node: BeautifulSoup | Tag) -> None:
+    """Remove scripts, styles, frames, forms, navigation and share widgets
+    from ``node`` in place."""
+    unwanted_selectors = ("script", "style", "noscript", "iframe", "form", "nav", ".share", ".social-share")
+    # The generator is lazy: each selector is queried only after the matches
+    # of the previous one have been removed, as in a nested loop.
+    doomed = (child for selector in unwanted_selectors for child in node.select(selector))
+    for child in doomed:
+        child.decompose()
+
+
+def strip_dangerous_attributes(node: BeautifulSoup | Tag) -> None:
+    """Delete attributes that can execute script, in place.
+
+    Removed from every descendant of ``node``:
+
+    * any attribute whose name starts with ``on`` (event handlers) and
+      ``srcdoc``;
+    * URL-bearing attributes (``href``, ``src``, ``action``, ``formaction``,
+      ``xlink:href``) whose value resolves to a ``javascript:``,
+      ``vbscript:`` or ``data:text/html`` URL.
+
+    Browsers drop ASCII tab/newline characters anywhere in a URL and ignore
+    leading C0 control characters and spaces, so the scheme check runs on a
+    value normalized the same way. Otherwise ``java\\tscript:...`` would pass.
+    """
+    url_attributes = {"href", "src", "action", "formaction", "xlink:href"}
+    blocked_prefixes = ("javascript:", "vbscript:", "data:text/html")
+
+    for child in node.find_all(True):
+        # Iterate over a snapshot because attributes are deleted in the loop.
+        for attr_name in list(child.attrs):
+            normalized_name = attr_name.lower()
+            if normalized_name.startswith("on") or normalized_name == "srcdoc":
+                del child.attrs[attr_name]
+                continue
+
+            if normalized_name not in url_attributes:
+                continue
+
+            raw_value = child.attrs.get(attr_name)
+            if isinstance(raw_value, list):
+                candidate = " ".join(str(item) for item in raw_value)
+            else:
+                candidate = str(raw_value or "")
+
+            # Mirror URL parsing: remove tab/CR/LF everywhere, then leading
+            # control characters and whitespace, before looking at the scheme.
+            compact = re.sub(r"[\t\n\r]+", "", candidate)
+            compact = re.sub(r"^[\x00-\x20]+", "", compact).strip().lower()
+            if compact.startswith(blocked_prefixes):
+                del child.attrs[attr_name]
+
+
+def clone_tag(node: Tag) -> BeautifulSoup:
+    """Return an independent copy of ``node`` by re-parsing its markup."""
+    markup = str(node)
+    return BeautifulSoup(markup, "html.parser")
+
+
+def find_title_node(soup: BeautifulSoup) -> Tag | None:
+    """Return the first heading matched by the title selectors, most specific
+    first, or ``None`` when the page has no ``h1``."""
+    heading_selectors = ("article h1", "h1.entry-title", "h1.page-title", "h1.title", "h1")
+    # Lazy ``map`` keeps the short-circuit: later selectors are not queried
+    # once one has matched.
+    return next((found for found in map(soup.select_one, heading_selectors) if found), None)
+
+
+def extract_labeled_value(text: str, label: str) -> str:
+    """Return the value that follows ``"<label>:"`` in ``text``.
+
+    Matching is case-insensitive and runs on the whitespace-normalized text.
+    The value ends before the next ``Word:`` label or at the end of the text.
+    Returns ``""`` when the label is absent or ``text`` is empty.
+    """
+    if not text:
+        return ""
+
+    label_regex = re.compile(
+        rf"{re.escape(label)}:\s*(.+?)(?=\s+(?:[A-Z][a-z]+:)|\s{{2,}}|$)",
+        re.IGNORECASE,
+    )
+    found = label_regex.search(clean_text(text))
+    return clean_text(found.group(1)) if found else ""
+
+
+def normalize_department_category(value: str) -> str:
+    """Return the cleaned department name, or ``""`` when it is implausible.
+
+    A value is rejected when it is empty, longer than 80 characters or 8
+    words, or contains contact-block markers (PO box, "contact us", an ``@``
+    or a URL scheme).
+    """
+    candidate = clean_text(value)
+    if not candidate:
+        return ""
+
+    too_long = len(candidate) > 80 or len(candidate.split()) > 8
+    lowered = candidate.lower()
+    looks_like_contact_info = any(
+        marker in lowered for marker in ("p.o. box", "contact us", "@", "http://", "https://")
+    )
+    return "" if too_long or looks_like_contact_info else candidate
+
+
+def format_error_summary(
+    url: str,
+    exc: Exception,
+    response: requests.Response | None,
+    timeout_seconds: int,
+) -> str:
+    """Build a one-line, human-readable description of a fetch failure.
+
+    Args:
+        url: The URL that was requested.
+        exc: The exception raised while fetching or processing it.
+        response: The response received before the failure, if any.
+        timeout_seconds: The timeout that was in effect, used in the message
+            for timeout errors.
+
+    Returns:
+        An HTTP status line for ``HTTPError``, a timeout message for
+        ``Timeout``, a generic line for other request errors, and
+        ``"<Type>: <message>"`` for anything else.
+    """
+    if isinstance(exc, requests.HTTPError):
+        # ``requests.Response`` is falsy for 4xx/5xx, so ``exc.response or
+        # response`` would throw away exactly the responses we care about.
+        failing_response = exc.response if exc.response is not None else response
+        if failing_response is not None:
+            return (
+                f"HTTP {failing_response.status_code} {failing_response.reason} "
+                f"while fetching {failing_response.url or url}"
+            )
+    if isinstance(exc, requests.Timeout):
+        return f"Request timed out after {timeout_seconds}s while fetching {url}"
+    if isinstance(exc, requests.RequestException):
+        return f"{type(exc).__name__} while fetching {url}: {exc}"
+    return f"{type(exc).__name__}: {exc}"
+
+
+def format_error_details(
+    url: str,
+    exc: Exception,
+    response: requests.Response | None,
+) -> str:
+    """Build a multi-line diagnostic block for a fetch failure.
+
+    Args:
+        url: The URL that was requested.
+        exc: The exception raised while fetching or processing it.
+        response: The response received before the failure, if any. It is
+            used when ``exc`` carries no response of its own.
+
+    Returns:
+        Newline-separated ``Label: value`` lines: URL, error type, message,
+        HTTP status and resolved URL when a response is available, and the
+        formatted exception.
+    """
+    details = [
+        f"URL: {url}",
+        f"Error Type: {type(exc).__name__}",
+        f"Message: {exc}",
+    ]
+
+    # Compare against ``None`` explicitly: ``requests.Response`` evaluates to
+    # ``False`` for 4xx/5xx, so an ``or`` chain would drop error responses.
+    failing_response = getattr(exc, "response", None)
+    if failing_response is None:
+        failing_response = response
+    if failing_response is not None:
+        details.extend(
+            [
+                f"HTTP Status: {failing_response.status_code} {failing_response.reason}",
+                f"Resolved URL: {failing_response.url}",
+            ]
+        )
+
+    trace = "".join(traceback.format_exception_only(type(exc), exc)).strip()
+    if trace:
+        details.append(f"Exception: {trace}")
+
+    return "\n".join(details)
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+from email.utils import format_datetime
+from io import StringIO
+from xml.sax.saxutils import escape
+import datetime as dt
+
+from page_importer.dates import parse_datetime
+from page_importer.models import ScrapedPost
+
+
+def build_wxr(posts: list[ScrapedPost], channel_title: str = "Imported Content") -> str:
+    """Serialize ``posts`` as a WordPress WXR 1.2 export document.
+
+    Each post becomes one ``<item>``. Posts without a parseable publish date
+    get empty ``wp:post_date`` fields and the generation time as ``pubDate``.
+    Free-text fields are wrapped in CDATA; titles and links are XML-escaped.
+
+    Returns:
+        The complete XML document as a string.
+    """
+    generated_at = dt.datetime.now(dt.timezone.utc)
+    chunks: list[str] = [
+        '<?xml version="1.0" encoding="UTF-8" ?>\n',
+        '<rss version="2.0" xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/" '
+        'xmlns:content="http://purl.org/rss/1.0/modules/content/" '
+        'xmlns:wfw="http://wellformedweb.org/CommentAPI/" '
+        'xmlns:dc="http://purl.org/dc/elements/1.1/" '
+        'xmlns:wp="http://wordpress.org/export/1.2/">\n',
+        "<channel>\n",
+        f"<title>{escape(channel_title)}</title>\n",
+        "<link>http://localhost/</link>\n",
+        "<description>Generated by Page Importer</description>\n",
+        f"<pubDate>{format_datetime(generated_at)}</pubDate>\n",
+        "<language>en-US</language>\n",
+        "<wp:wxr_version>1.2</wp:wxr_version>\n",
+    ]
+
+    for post in posts:
+        local_date, gmt_date, item_pub_date = _resolve_post_dates(post.publish_date, generated_at)
+        source_link = escape(post.source_url)
+        chunks.extend(
+            [
+                "<item>\n",
+                f"<title>{escape(post.title)}</title>\n",
+                f"<link>{source_link}</link>\n",
+                f"<pubDate>{format_datetime(item_pub_date)}</pubDate>\n",
+                f"<dc:creator>{cdata(post.author or 'importer')}</dc:creator>\n",
+                f"<guid isPermaLink=\"false\">{source_link}</guid>\n",
+                "<description></description>\n",
+                f"<content:encoded>{cdata(post.body_html)}</content:encoded>\n",
+                f"<excerpt:encoded>{cdata('')}</excerpt:encoded>\n",
+                f"<wp:post_date>{cdata(local_date)}</wp:post_date>\n",
+                f"<wp:post_date_gmt>{cdata(gmt_date)}</wp:post_date_gmt>\n",
+                "<wp:comment_status><![CDATA[closed]]></wp:comment_status>\n",
+                "<wp:ping_status><![CDATA[closed]]></wp:ping_status>\n",
+                "<wp:post_name><![CDATA[]]></wp:post_name>\n",
+                f"<wp:status>{cdata(post.status)}</wp:status>\n",
+                "<wp:post_parent>0</wp:post_parent>\n",
+                "<wp:menu_order>0</wp:menu_order>\n",
+                f"<wp:post_type>{cdata(post.post_type or 'post')}</wp:post_type>\n",
+                "<wp:post_password><![CDATA[]]></wp:post_password>\n",
+                "<wp:is_sticky>0</wp:is_sticky>\n",
+            ]
+        )
+        # Taxonomy terms: categories first, then tags, in the post's order.
+        chunks.extend(
+            f'<category domain="category" nicename="{escape(slugify(category))}">{cdata(category)}</category>\n'
+            for category in post.categories
+        )
+        chunks.extend(
+            f'<category domain="post_tag" nicename="{escape(slugify(tag))}">{cdata(tag)}</category>\n'
+            for tag in post.tags
+        )
+        chunks.append("</item>\n")
+
+    chunks.append("</channel>\n</rss>\n")
+    return "".join(chunks)
+
+
+def slugify(value: str) -> str:
+    """Return a lowercase, dash-separated slug for ``value``.
+
+    Every run of non-alphanumeric characters becomes a single ``-`` and no
+    dash is left at either end, so ``"News & Events"`` gives ``"news-events"``
+    rather than ``"news---events"``.
+    """
+    dashed = "".join(ch.lower() if ch.isalnum() else "-" for ch in value)
+    # Splitting on "-" and dropping empty pieces collapses repeated dashes and
+    # trims leading/trailing ones in one step.
+    return "-".join(piece for piece in dashed.split("-") if piece)
+
+
+def cdata(value: str) -> str:
+    """Wrap ``value`` in a CDATA section.
+
+    Any ``]]>`` inside the text is split across two adjacent sections so it
+    cannot end the section early. Falsy input produces an empty section.
+    """
+    text = value or ""
+    protected = "]]]]><![CDATA[>".join(text.split("]]>"))
+    return "<![CDATA[" + protected + "]]>"
+
+
+def _resolve_post_dates(value: str, fallback: dt.datetime) -> tuple[str, str, dt.datetime]:
+    """Derive the WXR date fields for one post.
+
+    Returns:
+        ``(local_date, gmt_date, pub_date)``. An unparseable ``value`` gives
+        two empty strings and ``fallback``. A naive timestamp is treated as
+        UTC, so local and GMT strings are equal. An aware timestamp keeps its
+        wall-clock time as the local date and is converted to UTC for the
+        other two.
+    """
+    parsed = parse_datetime(value)
+    if parsed is None:
+        return "", "", fallback
+
+    wall_clock = _format_wp_date(parsed)
+    is_naive = parsed.tzinfo is None or parsed.utcoffset() is None
+    if is_naive:
+        return wall_clock, wall_clock, parsed.replace(tzinfo=dt.timezone.utc)
+
+    as_utc = parsed.astimezone(dt.timezone.utc)
+    return wall_clock, _format_wp_date(as_utc), as_utc
+
+
+def _format_wp_date(value: dt.datetime) -> str:
+    """Format ``value`` as ``YYYY-MM-DD HH:MM:SS`` with any timezone info
+    dropped (the wall-clock fields are kept as they are)."""
+    naive = value.replace(tzinfo=None)
+    return f"{naive:%Y-%m-%d %H:%M:%S}"
@@ -0,0 +1,4 @@
+streamlit>=1.43,<2
+requests>=2.32,<3
+beautifulsoup4>=4.12,<5
+python-dateutil>=2.9,<3
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+import unittest
+
+from bs4 import BeautifulSoup
+
+from page_importer.dates import normalize_date
+from page_importer.scraper import CATEGORY_SELECTORS, TAG_SELECTORS, extract_terms, sanitize_html
+from page_importer.wxr import build_wxr
+from page_importer.models import ScrapedPost
+
+
+class DateNormalizationTests(unittest.TestCase):
+    """Checks on ``normalize_date`` output formatting."""
+
+    def test_preserves_timezone_offset_in_normalized_value(self) -> None:
+        """An ISO timestamp with an offset keeps that offset once normalized."""
+        self.assertEqual(
+            normalize_date("2024-05-01T09:30:00-07:00"),
+            "2024-05-01 09:30:00-07:00",
+        )
+
+
+class WxrSerializationTests(unittest.TestCase):
+    """Checks on the XML produced by ``build_wxr``."""
+
+    def test_writes_local_and_gmt_dates_from_offset_timestamp(self) -> None:
+        """An offset timestamp yields the wall-clock local date, the UTC date
+        and a UTC ``pubDate``."""
+        xml = build_wxr(
+            [
+                ScrapedPost(
+                    source_url="https://example.com/post",
+                    title="Example",
+                    body_html="<p>Body</p>",
+                    publish_date="2024-05-01 09:30:00-07:00",
+                    success=True,
+                )
+            ]
+        )
+
+        self.assertIn("<wp:post_date><![CDATA[2024-05-01 09:30:00]]></wp:post_date>", xml)
+        self.assertIn("<wp:post_date_gmt><![CDATA[2024-05-01 16:30:00]]></wp:post_date_gmt>", xml)
+        self.assertIn("<pubDate>Wed, 01 May 2024 16:30:00 +0000</pubDate>", xml)
+
+    def test_splits_cdata_terminators_in_content(self) -> None:
+        """A literal ``]]>`` in body or author is split across CDATA sections
+        instead of closing the section early."""
+        xml = build_wxr(
+            [
+                ScrapedPost(
+                    source_url="https://example.com/post",
+                    title="Example",
+                    body_html="<p>alpha ]]> omega</p>",
+                    author="Jane ]]> Doe",
+                    success=True,
+                )
+            ]
+        )
+
+        self.assertIn("alpha ]]]]><![CDATA[> omega", xml)
+        self.assertIn("Jane ]]]]><![CDATA[> Doe", xml)
+
+
+class HtmlSanitizationTests(unittest.TestCase):
+    """Checks on ``sanitize_html``."""
+
+    def test_removes_inline_event_handlers_and_script_uris(self) -> None:
+        """``on*`` handlers and ``javascript:`` URLs are absent from the
+        sanitized markup."""
+        sanitized = sanitize_html(
+            '<div onclick="alert(1)"><a href="javascript:alert(1)">x</a><img src="x" onerror="alert(1)"></div>'
+        )
+
+        self.assertNotIn("onclick", sanitized)
+        self.assertNotIn("onerror", sanitized)
+        self.assertNotIn("javascript:", sanitized)
+
+
+class TaxonomySelectorTests(unittest.TestCase):
+    """Checks that category and tag selectors do not overlap."""
+
+    def test_drupal_tag_field_is_not_treated_as_category(self) -> None:
+        """A ``field--name-field-tags`` block is matched by the tag selectors
+        only, not by the category selectors."""
+        soup = BeautifulSoup(
+            '<div class="field--name-field-tags"><a href="/tags/example">Example Tag</a></div>',
+            "html.parser",
+        )
+
+        self.assertEqual(extract_terms(soup, CATEGORY_SELECTORS), [])
+        self.assertEqual(extract_terms(soup, TAG_SELECTORS), ["Example Tag"])
+
+
+if __name__ == "__main__":
+    unittest.main()
@@ -0,0 +1,110 @@
+# WDW Sitemap And Import Tools
+
+This repository combines two internal tools into one web application and one Docker image:
+
+- `Sitemap Generator`
+- `Page Importer`
+
+The application uses Streamlit and presents both tools behind a single URL with two tabs at the top of the page.
+
+## What It Does
+
+### Sitemap Generator
+
+- Crawls a site from a starting URL
+- Discovers URLs from page links and XML sitemaps
+- Exports a sitemap CSV
+- Saves crawl state and logs so a crawl can be resumed later
+
+### Page Importer
+
+- Reads a CSV of submitted URLs
+- Scrapes page content
+- Lets you review the extracted content
+- Exports a WordPress WXR XML import file
+
+## Project Layout
+
+- `app.py`: top-level Streamlit app with both tabs
+- `requirements.txt`: shared Python dependencies for the combined app
+- `Dockerfile`: single image for the combined tool
+- `.gitea/workflows/docker-image.yml`: Gitea Actions workflow for Docker builds
+- `Sitemap Builder/`: sitemap crawler logic
+- `Page Importer/`: WordPress import logic
+
+## Run Locally
+
+### Linux or macOS
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+streamlit run app.py
+```
+
+### Windows PowerShell
+
+```powershell
+python -m venv .venv
+.venv\Scripts\Activate.ps1
+pip install -r requirements.txt
+streamlit run app.py
+```
+
+Then open:
+
+```text
+http://localhost:8501
+```
+
+## Docker
+
+Build the image:
+
+```bash
+docker build -t wdw-sitemap-and-importer .
+```
+
+Run the container:
+
+```bash
+docker run --rm -p 8501:8501 -v wdw-tools-data:/data wdw-sitemap-and-importer
+```
+
+Then open:
+
+```text
+http://localhost:8501
+```
+
+The mounted `/data` volume stores sitemap CSV files, crawl state files, and crawl logs so sitemap jobs can survive container restarts.
+
+## Gitea Automation
+
+The workflow file is:
+
+```text
+.gitea/workflows/docker-image.yml
+```
+
+It runs on pushes to `main` and on manual workflow dispatch.
+
+The workflow always builds the Docker image. If these secrets are configured in Gitea, it also logs in and pushes the image to your registry:
+
+- `REGISTRY_URL`
+- `REGISTRY_USERNAME`
+- `REGISTRY_PASSWORD`
+
+Published tags:
+
+- `${REGISTRY}/wdw-sitemap-and-importer:<commit-sha>`
+- `${REGISTRY}/wdw-sitemap-and-importer:latest`
+
+If the registry secrets are not configured, the workflow still performs the build as validation but skips the push steps.
+
+## Notes
+
+- Sitemap output files are written under `/data` in Docker.
+- The sitemap crawler can resume previous runs when a matching crawl state file exists.
+- The importer keeps its existing scraping and WordPress export behavior, but it now runs inside the shared interface instead of as a separate app.
@@ -0,0 +1,80 @@
+# Sitemap Builder
+
+This folder contains the sitemap crawler used by the combined web application in the repository root.
+
+The crawler can still be used directly from Python, but the primary supported experience is now the shared Streamlit interface in the root project:
+
+```text
+../app.py
+```
+
+## Current Role In The Combined App
+
+The root application uses this module to:
+
+- crawl a site from a submitted starting URL
+- discover internal URLs from HTML links and XML sitemaps
+- export a sitemap CSV
+- save crawl state and crawl logs for resume support
+
+## Output
+
+The crawler writes:
+
+- a CSV file
+- a sidecar crawl state file ending in `.crawlstate.json`
+- a crawl log file ending in `.crawl.log`
+
+The CSV contains these columns:
+
+- `URL`
+- `Title`
+- `Canonical URL`
+- `Type`
+
+## Standalone CLI Usage
+
+Interactive mode:
+
+```bash
+python3 sitemap_builder.py
+```
+
+Command line mode:
+
+```bash
+python3 sitemap_builder.py https://example.com -o ./sitemap.csv
+```
+
+On Windows:
+
+```powershell
+python .\sitemap_builder.py https://example.com -o .\sitemap.csv
+```
+
+## Useful Options
+
+```bash
+python3 sitemap_builder.py https://example.com --max-pages 20000 --delay 0.25 --include-subdomains
+```
+
+- `--max-pages`: stop after the given number of visited pages. Default: `10000`
+- `--delay`: wait between requests to reduce load on the site
+- `--timeout`: request timeout in seconds
+- `--include-subdomains`: crawl subdomains of the starting host
+- `--include-documents`: include document links such as PDF, CSV, DOC, DOCX, XLSX, and similar files
+- `--workers`: number of worker threads to use. Set `1` to disable multithreading
+- `--save-every`: save progress after every N pages. Default: `25`
+- `--resume`: resume from an existing state file
+- `--fresh`: ignore the existing state file and start over
+
+## Discovery And Behavior
+
+- The crawler checks `robots.txt` for sitemap references and also tries `/sitemap.xml`
+- XML sitemap URLs are added to the crawl queue before page crawling begins
+- HTML pages store page title and canonical URL in the CSV when available
+- On Windows CLI runs, `P` pauses, `R` resumes, and `Q` stops cleanly and saves progress
+
+## Recommendation
+
+For normal use, run the root application or Docker container instead of calling this script directly. That is now the intended user interface for this repository.
@@ -0,0 +1,947 @@
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+import os
+import sys
+import time
+import xml.etree.ElementTree as ET
+from collections import deque
+from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
+from dataclasses import dataclass
+from html.parser import HTMLParser
+from pathlib import Path
+from typing import Iterable
+from urllib.error import HTTPError, URLError
+from urllib.parse import urljoin, urlsplit, urlunsplit
+from urllib.request import Request, urlopen
+
+if os.name == "nt":
+    import msvcrt
+
+
+# Module-level defaults. NOTE(review): the code that consumes most of these
+# (argument parsing, crawl loop) is outside this excerpt; the notes below
+# are hedged where the usage is not visible here.
+
+# Presumably the default ``User-Agent`` header for crawler requests.
+DEFAULT_USER_AGENT = "SitemapBuilder/1.0 (+local script)"
+# Presumably the CSV file name used when no output path is given.
+DEFAULT_OUTPUT_NAME = "sitemap.csv"
+# Appended to the output file's suffix by ``get_state_path``.
+DEFAULT_STATE_SUFFIX = ".crawlstate.json"
+# Appended to the output file's suffix by ``get_log_path``.
+DEFAULT_LOG_SUFFIX = ".crawl.log"
+# Presumably the default page cap for a crawl — confirm against the CLI.
+DEFAULT_MAX_PAGES = 10000
+# Presumably the extra page budget granted when resuming — TODO confirm.
+DEFAULT_RESUME_PAGE_INCREMENT = 10000
+# Presumably how many pages are processed between state saves — confirm.
+DEFAULT_SAVE_EVERY = 25
+# Presumably the default worker-thread count — confirm against the CLI.
+DEFAULT_WORKERS = 8
+# Directory containing this script.
+SCRIPT_DIR = Path(__file__).resolve().parent
+# Lower-case path suffixes that ``is_document_url`` treats as documents.
+DOCUMENT_EXTENSIONS = {
+    ".pdf",
+    ".csv",
+    ".doc",
+    ".docx",
+    ".xls",
+    ".xlsx",
+    ".ppt",
+    ".pptx",
+    ".txt",
+    ".rtf",
+    ".zip",
+    ".xml",
+    ".json",
+}
+
+
+@dataclass
+class CrawlResult:
+    """Outcome of fetching one URL, as produced by ``fetch_page``."""
+
+    # The URL that was requested.
+    url: str
+    # Raw ``href`` values of the ``<a>`` tags found on the page (unresolved).
+    links: list[str]
+    # Text of the page's ``<title>``; empty when absent.
+    title: str = ""
+    # Normalized absolute canonical URL; empty when none was declared.
+    canonical_url: str = ""
+    # True when the response was not HTML/XHTML and the body was not parsed.
+    skipped: bool = False
+    # Short failure description (e.g. "HTTP 404"); None on success.
+    error: str | None = None
+
+
+@dataclass
+class CrawlState:
+    """Crawl progress that ``save_state``/``load_state`` persist as JSON."""
+
+    # Normalized URL the crawl started from.
+    start_url: str
+    # Crawl options captured with the state.
+    include_subdomains: bool
+    include_documents: bool
+    # URLs recorded as visited — presumably those already fetched.
+    visited: set[str]
+    # URLs that have been enqueued; seeded with the start URL.
+    queued: set[str]
+    # Pending URLs in crawl order; seeded with the start URL.
+    queue: deque[str]
+    # URL -> {"title", "canonical_url", "type"}, maintained by ``register_record``.
+    records: dict[str, dict[str, str]]
+    # Alias URL -> canonical URL, followed by ``resolve_alias``.
+    alias_to_canonical: dict[str, str]
+    # Error entries collected during the crawl (schema not visible here).
+    errors: list[dict[str, str]]
+    # Counters persisted with the state.
+    skipped_count: int
+    discovered_from_sitemaps: int
+
+
+@dataclass
+class RuntimeControl:
+    """Mutable pause/stop flags for a running crawl.
+
+    NOTE(review): presumably toggled by the interactive pause/resume/stop
+    controls — the consuming code is outside this excerpt.
+    """
+
+    paused: bool = False
+    stop_requested: bool = False
+
+
+@dataclass
+class CrawlRunResult:
+    """Bundle of a crawl's state, file paths and settings.
+
+    NOTE(review): presumably returned by the crawl entry point — the code
+    that builds it is outside this excerpt.
+    """
+
+    # Crawl state associated with the run.
+    state: CrawlState
+    # Presumably True when the user asked the crawl to stop — confirm.
+    user_stopped: bool
+    # Paths of the CSV, state and log files for the run.
+    output_path: Path
+    state_path: Path
+    log_path: Path
+    # Page limit and worker count associated with the run.
+    max_pages: int
+    workers: int
+
+
+class HTMLPageParser(HTMLParser):
+    """Collect link targets, the page title and the canonical URL from HTML.
+
+    After ``feed()``:
+
+    * ``links`` holds the raw ``href`` of every ``<a>`` that has one;
+    * ``title`` is the text of the first ``<title>`` element;
+    * ``canonical_href`` is the ``href`` of the last
+      ``<link rel="canonical">`` seen, or ``""``.
+
+    Only the first ``<title>`` is captured. Later ones (typically
+    accessibility titles inside inline ``<svg>`` icons) used to be appended
+    to the page title.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.links: list[str] = []
+        self.title_parts: list[str] = []
+        self.in_title = False
+        self.canonical_href = ""
+        # Set once the first <title> element has been closed.
+        self._title_captured = False
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        """Record anchors, the start of the first title, and canonical links."""
+        attrs_map = {key.lower(): value for key, value in attrs}
+        lower_tag = tag.lower()
+
+        if lower_tag == "a":
+            href = attrs_map.get("href")
+            if href:
+                self.links.append(href)
+
+        if lower_tag == "title" and not self._title_captured:
+            self.in_title = True
+
+        if lower_tag == "link":
+            rel = (attrs_map.get("rel") or "").lower()
+            href = attrs_map.get("href") or ""
+            if "canonical" in rel and href:
+                self.canonical_href = href
+
+    def handle_endtag(self, tag: str) -> None:
+        """Stop collecting title text when the first ``<title>`` closes."""
+        if tag.lower() == "title" and self.in_title:
+            self.in_title = False
+            self._title_captured = True
+
+    def handle_data(self, data: str) -> None:
+        """Accumulate text while inside the title element."""
+        if self.in_title:
+            self.title_parts.append(data)
+
+    @property
+    def title(self) -> str:
+        """The collected title text, pieces trimmed and joined by spaces."""
+        return " ".join(part.strip() for part in self.title_parts if part.strip()).strip()
+
+
+def normalize_url(url: str) -> str:
+    """Return a canonical form of ``url`` for de-duplication.
+
+    The scheme and host are lower-cased, a missing scheme defaults to
+    ``https``, an empty path becomes ``/``, a trailing slash is removed from
+    any other path, and the fragment is dropped. The query string is kept.
+
+    Scheme-less input such as ``example.com/docs`` is treated as a host plus
+    path. It used to be parsed as a bare path, giving
+    ``https:///example.com/docs`` with an empty host.
+    """
+    cleaned = url.strip()
+    parts = urlsplit(cleaned)
+    if cleaned and not parts.scheme and not parts.netloc and not cleaned.startswith(("/", "?", "#")):
+        # No scheme and no "//host": re-parse so the first segment is the host.
+        parts = urlsplit(f"https://{cleaned}")
+
+    scheme = parts.scheme.lower() or "https"
+    netloc = parts.netloc.lower()
+    path = parts.path or "/"
+
+    if path != "/" and path.endswith("/"):
+        path = path.rstrip("/")
+
+    return urlunsplit((scheme, netloc, path, parts.query, ""))
+
+
+def is_http_url(url: str) -> bool:
+    """Report whether ``url`` uses the ``http`` or ``https`` scheme."""
+    scheme = urlsplit(url).scheme
+    return scheme == "http" or scheme == "https"
+
+
+def build_allowed_hosts(start_url: str) -> set[str]:
+    """Return a one-element set holding the lower-cased network location
+    (host plus any port) of ``start_url``."""
+    start_host = urlsplit(start_url).netloc.lower()
+    return {start_host}
+
+
+def should_visit(url: str, allowed_hosts: set[str], include_subdomains: bool) -> bool:
+    """Decide whether ``url`` is in scope for the crawl.
+
+    Only http(s) URLs qualify. The host must be one of ``allowed_hosts`` or,
+    when ``include_subdomains`` is set, a subdomain of one of them.
+    """
+    if not is_http_url(url):
+        return False
+
+    host = urlsplit(url).netloc.lower()
+    if not include_subdomains:
+        return host in allowed_hosts
+
+    for allowed in allowed_hosts:
+        if host == allowed or host.endswith(f".{allowed}"):
+            return True
+    return False
+
+
+def is_document_url(url: str) -> bool:
+    """Report whether the URL path ends in one of ``DOCUMENT_EXTENSIONS``
+    (compared case-insensitively)."""
+    extension = Path(urlsplit(url).path).suffix.lower()
+    return extension in DOCUMENT_EXTENSIONS
+
+
+def should_record_url(url: str) -> bool:
+    """Return ``False`` only for URLs whose whole query string is ``page=1``
+    (case-insensitive); every other URL is recorded."""
+    return urlsplit(url).query.lower() != "page=1"
+
+
+def get_state_path(output_path: Path) -> Path:
+    """Return the crawl-state path next to ``output_path``: the same name
+    with ``DEFAULT_STATE_SUFFIX`` appended after its existing suffix."""
+    combined_suffix = f"{output_path.suffix}{DEFAULT_STATE_SUFFIX}"
+    return output_path.with_suffix(combined_suffix)
+
+
+def get_log_path(output_path: Path) -> Path:
+    """Return the crawl-log path next to ``output_path``: the same name with
+    ``DEFAULT_LOG_SUFFIX`` appended after its existing suffix."""
+    combined_suffix = f"{output_path.suffix}{DEFAULT_LOG_SUFFIX}"
+    return output_path.with_suffix(combined_suffix)
+
+
+def log_message(log_path: Path, message: str) -> None:
+    """Append ``message`` to ``log_path`` as one timestamped line, creating
+    the parent directory if needed."""
+    log_path.parent.mkdir(parents=True, exist_ok=True)
+    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
+    line = f"[{stamp}] {message}\n"
+    with log_path.open("a", encoding="utf-8") as handle:
+        handle.write(line)
+
+
+def resolve_alias(url: str, alias_to_canonical: dict[str, str]) -> str:
+    """Follow ``alias_to_canonical`` from ``url`` until an unmapped URL is
+    reached, stopping at the first already-visited URL so cycles end."""
+    visited: set[str] = set()
+    current = url
+    while True:
+        if current not in alias_to_canonical or current in visited:
+            return current
+        visited.add(current)
+        current = alias_to_canonical[current]
+
+
+def register_record(
+    state: CrawlState,
+    url: str,
+    record_type: str,
+    title: str = "",
+    canonical_url: str = "",
+) -> None:
+    """Create or enrich the record for ``url`` in ``state.records``.
+
+    URLs rejected by ``should_record_url`` are ignored. Existing data is never
+    overwritten, with one exception: a record typed ``"document"`` is upgraded
+    to ``"page"``. Title and canonical URL are filled in only when the record
+    has none yet.
+    """
+    if not should_record_url(url):
+        return
+
+    if url in state.records:
+        record = state.records[url]
+    else:
+        record = {"title": "", "canonical_url": "", "type": record_type}
+
+    current_type = record.get("type")
+    if not current_type or (current_type == "document" and record_type == "page"):
+        record["type"] = record_type
+
+    if title and not record.get("title"):
+        record["title"] = title
+    if canonical_url and not record.get("canonical_url"):
+        record["canonical_url"] = canonical_url
+
+    # Guarantee both keys exist even on records loaded from older state files.
+    record.setdefault("canonical_url", canonical_url)
+    record.setdefault("title", title)
+    state.records[url] = record
+
+
+def save_state(state: CrawlState, state_path: Path, output_path: Path) -> None:
+    """Persist ``state`` as JSON at ``state_path``.
+
+    Sets are stored as sorted lists and the queue as a list, together with a
+    ``saved_at`` timestamp and the CSV ``output_path``. The parent directory
+    is created if needed.
+
+    The payload is written to a sibling ``.tmp`` file and then moved into
+    place with ``os.replace``. An interruption mid-write therefore leaves the
+    previous state file intact rather than a truncated one that cannot be
+    resumed from.
+    """
+    state_path.parent.mkdir(parents=True, exist_ok=True)
+    payload = {
+        "start_url": state.start_url,
+        "include_subdomains": state.include_subdomains,
+        "include_documents": state.include_documents,
+        "visited": sorted(state.visited),
+        "queued": sorted(state.queued),
+        "queue": list(state.queue),
+        "records": state.records,
+        "alias_to_canonical": state.alias_to_canonical,
+        "errors": state.errors,
+        "skipped_count": state.skipped_count,
+        "discovered_from_sitemaps": state.discovered_from_sitemaps,
+        "saved_at": time.strftime("%Y-%m-%d %H:%M:%S"),
+        "output_path": str(output_path),
+    }
+    temp_path = state_path.with_name(state_path.name + ".tmp")
+    temp_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+    os.replace(temp_path, state_path)
+
+
+def load_state(state_path: Path) -> CrawlState:
+    """Rebuild a ``CrawlState`` from the JSON file at ``state_path``.
+
+    ``start_url`` is required; every other field falls back to an empty or
+    zero value when missing from the file.
+    """
+    data = json.loads(state_path.read_text(encoding="utf-8"))
+    fields = {
+        "start_url": data["start_url"],
+        "include_subdomains": bool(data.get("include_subdomains", False)),
+        "include_documents": bool(data.get("include_documents", False)),
+        "visited": set(data.get("visited", [])),
+        "queued": set(data.get("queued", [])),
+        "queue": deque(data.get("queue", [])),
+        "records": dict(data.get("records", {})),
+        "alias_to_canonical": dict(data.get("alias_to_canonical", {})),
+        "errors": list(data.get("errors", [])),
+        "skipped_count": int(data.get("skipped_count", 0)),
+        "discovered_from_sitemaps": int(data.get("discovered_from_sitemaps", 0)),
+    }
+    return CrawlState(**fields)
+
+
+def initialize_state(start_url: str, include_subdomains: bool, include_documents: bool) -> CrawlState:
+    """Create the state for a fresh crawl, with the normalized start URL as
+    the only queued entry and every collection and counter empty."""
+    seed = normalize_url(start_url)
+    return CrawlState(
+        start_url=seed,
+        include_subdomains=include_subdomains,
+        include_documents=include_documents,
+        visited=set(),
+        queued={seed},
+        queue=deque((seed,)),
+        records={},
+        alias_to_canonical={},
+        errors=[],
+        skipped_count=0,
+        discovered_from_sitemaps=0,
+    )
+
+
+def prompt_if_missing(value: str | None, prompt_text: str) -> str:
+    """Return ``value`` when it is non-empty; otherwise ask on stdin using
+    ``prompt_text`` and return the trimmed answer."""
+    return value if value else input(prompt_text).strip()
+
+
+def prompt_yes_no(prompt_text: str, default: bool) -> bool:
+    """Ask a yes/no question on stdin.
+
+    An empty answer returns ``default``. Otherwise only ``y`` or ``yes``
+    (any case) count as yes.
+    """
+    hint = "Y/n" if default else "y/N"
+    reply = input(f"{prompt_text} [{hint}]: ").strip().lower()
+    return reply in {"y", "yes"} if reply else default
+
+
+def write_csv(records: dict[str, dict[str, str]], output_path: Path) -> None:
+    """Write ``records`` to ``output_path`` as a CSV sorted by URL.
+
+    Columns are URL, Title, Canonical URL and Type; missing record fields are
+    written as empty cells. The parent directory is created if needed.
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    header = ["URL", "Title", "Canonical URL", "Type"]
+    rows = (
+        [
+            url,
+            records[url].get("title", ""),
+            records[url].get("canonical_url", ""),
+            records[url].get("type", ""),
+        ]
+        for url in sorted(records)
+    )
+    with output_path.open("w", newline="", encoding="utf-8") as csv_file:
+        writer = csv.writer(csv_file)
+        writer.writerow(header)
+        writer.writerows(rows)
+
+
+def fetch_text(url: str, timeout: float, user_agent: str, accept: str) -> tuple[str | None, str | None]:
+    """Download ``url`` and decode the body as text.
+
+    The charset declared by the response is used, defaulting to UTF-8;
+    undecodable bytes are replaced.
+
+    Returns:
+        ``(text, None)`` on success or ``(None, reason)`` on failure. This
+        function reports errors through the tuple instead of raising.
+    """
+    request = Request(url, headers={"User-Agent": user_agent, "Accept": accept})
+    try:
+        with urlopen(request, timeout=timeout) as response:
+            raw_body = response.read()
+            charset = response.headers.get_content_charset() or "utf-8"
+            text = raw_body.decode(charset, errors="replace")
+    except HTTPError as exc:
+        return None, f"HTTP {exc.code}"
+    except URLError as exc:
+        return None, str(exc.reason)
+    except TimeoutError:
+        return None, "request timed out"
+    except Exception as exc:  # pragma: no cover
+        return None, str(exc)
+    return text, None
+
+
+def fetch_page(url: str, timeout: float, user_agent: str) -> CrawlResult:
+    """Fetch ``url`` and extract its title, canonical URL, and links.
+
+    Returns a ``CrawlResult`` that is one of:
+    * skipped (``skipped=True``) when the Content-Type is neither HTML nor XHTML;
+    * failed (``error`` set) when the request raised;
+    * parsed, carrying the links, title, and normalized canonical URL (an
+      empty string when the page declares none) found by ``HTMLPageParser``.
+    """
+    accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
+    # Built outside the try block so a malformed URL still raises to the caller.
+    request = Request(url, headers={"User-Agent": user_agent, "Accept": accept})
+    html_markers = ("text/html", "application/xhtml+xml")
+
+    failure = None
+    try:
+        with urlopen(request, timeout=timeout) as response:
+            content_type = response.headers.get("Content-Type", "").lower()
+            if not any(marker in content_type for marker in html_markers):
+                # Non-HTML responses are reported without reading the body.
+                return CrawlResult(url=url, links=[], skipped=True)
+
+            raw_body = response.read()
+            markup = raw_body.decode(response.headers.get_content_charset() or "utf-8", errors="replace")
+    except HTTPError as exc:
+        failure = f"HTTP {exc.code}"
+    except URLError as exc:
+        failure = str(exc.reason)
+    except TimeoutError:
+        failure = "request timed out"
+    except Exception as exc:  # pragma: no cover
+        failure = str(exc)
+
+    if failure is not None:
+        return CrawlResult(url=url, links=[], error=failure)
+
+    page = HTMLPageParser()
+    page.feed(markup)
+    canonical_url = ""
+    if page.canonical_href:
+        # The canonical href may be relative, so resolve it against the page URL.
+        canonical_url = normalize_url(urljoin(url, page.canonical_href))
+    return CrawlResult(url=url, links=page.links, title=page.title, canonical_url=canonical_url)
+
+
+def fetch_page_with_delay(url: str, timeout: float, user_agent: str, delay: float) -> CrawlResult:
+    """Sleep for ``delay`` seconds (only when positive), then fetch ``url``.
+
+    The positional parameter order lets ``crawl_site`` submit this function
+    directly to its thread pool.
+    """
+    wait_first = delay > 0
+    if wait_first:
+        time.sleep(delay)
+    outcome = fetch_page(url, timeout=timeout, user_agent=user_agent)
+    return outcome
+
+
+def print_progress(state: CrawlState, max_pages: int, current_url: str) -> None:
+    """Print a one-line status report for the URL about to be fetched.
+
+    The line shows how many pages have been visited out of ``max_pages``, how
+    many URLs are recorded, and how many are still queued.
+    """
+    visited_total = len(state.visited)
+    recorded_total = len(state.records)
+    waiting_total = len(state.queue)
+    print(
+        f"[{visited_total}/{max_pages}] Found {recorded_total} URL(s), "
+        f"queued {waiting_total} more: {current_url}"
+    )
+
+
+def poll_runtime_control(control: RuntimeControl, log_path: Path) -> None:
+    """Apply any pending pause/resume/stop key presses to ``control``.
+
+    Only active on Windows (``os.name == "nt"``), where ``msvcrt`` offers a
+    non-blocking keyboard check; elsewhere this returns immediately. All
+    buffered keys are drained on each call:
+    * P pauses (when not already paused),
+    * R resumes (when paused),
+    * Q requests a stop.
+    """
+    if os.name == "nt":
+        while msvcrt.kbhit():
+            pressed = msvcrt.getwch().lower()
+            if pressed == "q":
+                control.stop_requested = True
+                log_message(log_path, "Stop requested by user")
+            elif pressed == "p":
+                if not control.paused:
+                    control.paused = True
+                    print("Paused. Press R to resume or Q to stop.")
+                    log_message(log_path, "Crawl paused by user")
+            elif pressed == "r":
+                if control.paused:
+                    control.paused = False
+                    print("Resuming crawl.")
+                    log_message(log_path, "Crawl resumed by user")
+
+
+def discover_robots_sitemaps(
+    start_url: str,
+    timeout: float,
+    user_agent: str,
+    log_path: Path,
+) -> set[str]:
+    """Return the normalized sitemap URLs advertised in the site's robots.txt.
+
+    ``/robots.txt`` is resolved against ``start_url`` and fetched. Every
+    ``Sitemap:`` line contributes one URL. Matching is case-insensitive and
+    tolerates whitespace before the field name and around the colon, and a
+    trailing ``# comment`` is dropped, following the robots.txt line syntax.
+
+    Returns an empty set (after logging the reason) when robots.txt cannot be
+    fetched.
+    """
+    robots_url = normalize_url(urljoin(start_url, "/robots.txt"))
+    content, error = fetch_text(robots_url, timeout, user_agent, "text/plain,*/*;q=0.8")
+    if error or content is None:
+        log_message(log_path, f"robots.txt not available at {robots_url}: {error}")
+        return set()
+
+    sitemap_urls: set[str] = set()
+    for line in content.splitlines():
+        # Split on the first colon only: the URL itself contains "://".
+        field, separator, value = line.partition(":")
+        if not separator or field.strip().lower() != "sitemap":
+            continue
+        # "#" starts a comment in robots.txt, so it is not part of the URL.
+        raw_url = value.split("#", 1)[0].strip()
+        if raw_url:
+            sitemap_urls.add(normalize_url(raw_url))
+
+    if sitemap_urls:
+        log_message(log_path, f"Discovered {len(sitemap_urls)} sitemap reference(s) from robots.txt")
+    return sitemap_urls
+
+
+def xml_local_name(tag: str) -> str:
+    """Return ``tag`` without its ``{namespace}`` prefix.
+
+    Everything up to and including the last ``}`` is removed; a tag that
+    contains no ``}`` is returned unchanged.
+    """
+    _, closing_brace, local_part = tag.rpartition("}")
+    return local_part if closing_brace else tag
+
+
+def _sitemap_entry_locs(root: ET.Element, entry_name: str) -> list[str]:
+    """Return the stripped ``<loc>`` text of each direct ``entry_name`` child of ``root``."""
+    locations: list[str] = []
+    for entry in root:
+        if xml_local_name(entry.tag) != entry_name:
+            continue
+        for child in entry:
+            # Only the entry's own <loc> counts. Extension elements such as
+            # <image:loc> sit one level deeper and are not page URLs.
+            if xml_local_name(child.tag) == "loc" and child.text and child.text.strip():
+                locations.append(child.text.strip())
+    return locations
+
+
+def parse_sitemap_urls(
+    sitemap_url: str,
+    allowed_hosts: set[str],
+    include_subdomains: bool,
+    timeout: float,
+    user_agent: str,
+    log_path: Path,
+    seen_sitemaps: set[str],
+) -> set[str]:
+    """Fetch one XML sitemap and return the crawlable page URLs it lists.
+
+    A ``<urlset>`` document contributes the ``<loc>`` of every ``<url>`` entry
+    that passes ``should_visit``. A ``<sitemapindex>`` document is followed
+    recursively through the ``<loc>`` of every ``<sitemap>`` entry.
+
+    ``seen_sitemaps`` is mutated: each sitemap is recorded before it is
+    fetched, so repeated or mutually referencing sitemaps are processed once.
+    Sitemaps outside the allowed hosts, fetch failures, parse failures, and
+    unknown root elements all yield an empty set (the failures are logged).
+    """
+    normalized_sitemap = normalize_url(sitemap_url)
+    if normalized_sitemap in seen_sitemaps:
+        return set()
+    seen_sitemaps.add(normalized_sitemap)
+
+    if not should_visit(normalized_sitemap, allowed_hosts, include_subdomains):
+        return set()
+
+    content, error = fetch_text(normalized_sitemap, timeout, user_agent, "application/xml,text/xml;q=0.9,*/*;q=0.8")
+    if error or content is None:
+        log_message(log_path, f"Sitemap fetch failed for {normalized_sitemap}: {error}")
+        return set()
+
+    # NOTE: the document comes from the crawled site; xml.etree is not
+    # hardened against maliciously constructed XML.
+    try:
+        root = ET.fromstring(content)
+    except ET.ParseError as exc:
+        log_message(log_path, f"Sitemap parse failed for {normalized_sitemap}: {exc}")
+        return set()
+
+    tag_name = xml_local_name(root.tag)
+    discovered_urls: set[str] = set()
+
+    if tag_name == "urlset":
+        for location in _sitemap_entry_locs(root, "url"):
+            normalized = normalize_url(location)
+            if should_visit(normalized, allowed_hosts, include_subdomains):
+                discovered_urls.add(normalized)
+    elif tag_name == "sitemapindex":
+        for location in _sitemap_entry_locs(root, "sitemap"):
+            discovered_urls.update(
+                parse_sitemap_urls(
+                    normalize_url(location),
+                    allowed_hosts,
+                    include_subdomains,
+                    timeout,
+                    user_agent,
+                    log_path,
+                    seen_sitemaps,
+                )
+            )
+    else:
+        log_message(log_path, f"Unsupported sitemap format at {normalized_sitemap}")
+
+    return discovered_urls
+
+
+def seed_from_xml_sitemaps(
+    state: CrawlState,
+    timeout: float,
+    user_agent: str,
+    log_path: Path,
+) -> None:
+    """Seed the crawl state with URLs listed in the site's XML sitemaps.
+
+    Sitemap candidates are those referenced by robots.txt plus the
+    conventional ``/sitemap.xml``. Every discovered URL is resolved through
+    the alias table and then:
+    * document URLs are recorded only when ``state.include_documents`` is set
+      and are never queued;
+    * page URLs are recorded and queued unless already visited or queued.
+
+    ``state.discovered_from_sitemaps`` grows by the number of URLs handled.
+    Candidates and URLs are processed in sorted order so the seeded queue,
+    and therefore which pages fit under the page limit, is the same on every
+    run instead of depending on set iteration order.
+    """
+    allowed_hosts = build_allowed_hosts(state.start_url)
+    sitemap_candidates = discover_robots_sitemaps(state.start_url, timeout, user_agent, log_path)
+    sitemap_candidates.add(normalize_url(urljoin(state.start_url, "/sitemap.xml")))
+
+    seen_sitemaps: set[str] = set()
+    discovered_urls: set[str] = set()
+    for sitemap_url in sorted(sitemap_candidates):
+        discovered_urls.update(
+            parse_sitemap_urls(
+                sitemap_url,
+                allowed_hosts,
+                state.include_subdomains,
+                timeout,
+                user_agent,
+                log_path,
+                seen_sitemaps,
+            )
+        )
+
+    added = 0
+    for url in sorted(discovered_urls):
+        canonical_url = resolve_alias(url, state.alias_to_canonical)
+        if is_document_url(canonical_url):
+            if state.include_documents:
+                register_record(state, canonical_url, "document")
+                added += 1
+            continue
+
+        register_record(state, canonical_url, "page")
+        if canonical_url not in state.visited and canonical_url not in state.queued:
+            state.queue.append(canonical_url)
+            state.queued.add(canonical_url)
+        added += 1
+
+    state.discovered_from_sitemaps += added
+    log_message(log_path, f"Added {added} URL(s) from XML sitemap discovery")
+
+
+def process_crawl_result(
+    state: CrawlState,
+    result: CrawlResult,
+    allowed_hosts: set[str],
+    log_path: Path,
+) -> None:
+    """Fold one fetched page into the crawl state.
+
+    * A failed fetch is appended to ``state.errors`` and logged.
+    * A skipped (non-HTML) response bumps ``state.skipped_count`` and is
+      recorded as a document.
+    * Otherwise the page is recorded. An in-scope canonical URL is also
+      recorded, stored as the alias target of the fetched URL, and queued.
+      Each in-scope link is then recorded and, unless it is a document,
+      queued. Document links are recorded only when
+      ``state.include_documents`` is set.
+    """
+
+    def enqueue(url: str) -> None:
+        # Queue a URL at most once across the whole crawl.
+        if url in state.visited or url in state.queued:
+            return
+        state.queue.append(url)
+        state.queued.add(url)
+
+    if result.error:
+        state.errors.append({"url": result.url, "error": result.error})
+        log_message(log_path, f"Error fetching {result.url}: {result.error}")
+        return
+
+    if result.skipped:
+        state.skipped_count += 1
+        register_record(state, result.url, "document")
+        return
+
+    canonical_url = ""
+    if result.canonical_url and should_visit(result.canonical_url, allowed_hosts, state.include_subdomains):
+        canonical_url = resolve_alias(result.canonical_url, state.alias_to_canonical)
+        state.alias_to_canonical[result.url] = canonical_url
+        register_record(state, canonical_url, "page", title=result.title, canonical_url=canonical_url)
+        enqueue(canonical_url)
+    register_record(state, result.url, "page", title=result.title, canonical_url=canonical_url)
+
+    for raw_link in result.links:
+        target = normalize_url(urljoin(result.url, raw_link))
+        if should_visit(target, allowed_hosts, state.include_subdomains):
+            target = resolve_alias(target, state.alias_to_canonical)
+            if not is_document_url(target):
+                register_record(state, target, "page")
+                enqueue(target)
+            elif state.include_documents:
+                register_record(state, target, "document")
+
+
+def crawl_site(
+    state: CrawlState,
+    max_pages: int,
+    delay: float,
+    timeout: float,
+    user_agent: str,
+    state_path: Path,
+    output_path: Path,
+    log_path: Path,
+    save_every: int,
+    workers: int,
+) -> tuple[CrawlState, bool]:
+    """Crawl queued URLs until the queue empties, the page limit is hit, or the user stops.
+
+    With ``workers <= 1`` pages are fetched one at a time in the calling
+    thread; otherwise up to ``workers`` fetches run concurrently in a thread
+    pool while all state updates stay in the calling thread. The CSV and the
+    state file are rewritten after every ``save_every`` processed pages and
+    once more before returning.
+
+    Returns ``(state, user_stopped)``, where ``user_stopped`` is True when a
+    stop was requested through the runtime keyboard controls.
+    """
+    allowed_hosts = build_allowed_hosts(state.start_url)
+    processed_since_save = 0
+    user_stopped = False
+    control = RuntimeControl()
+
+    if workers <= 1:
+        # Sequential mode: fetch and process a single page per iteration.
+        while state.queue and len(state.visited) < max_pages:
+            poll_runtime_control(control, log_path)
+            if control.stop_requested:
+                user_stopped = True
+                print("Stop requested. Saving progress and finishing cleanly...")
+                break
+
+            # While paused, keep polling so a resume or stop key is noticed.
+            while control.paused and not control.stop_requested:
+                time.sleep(0.2)
+                poll_runtime_control(control, log_path)
+
+            # A stop may have been requested while the crawl was paused.
+            if control.stop_requested:
+                user_stopped = True
+                print("Stop requested. Saving progress and finishing cleanly...")
+                break
+
+            # A queued URL may have become an alias of a canonical URL since
+            # it was queued, so resolve it again before visiting.
+            current = resolve_alias(state.queue.popleft(), state.alias_to_canonical)
+            if current in state.visited:
+                continue
+
+            state.visited.add(current)
+            register_record(state, current, "page")
+            print_progress(state, max_pages, current)
+
+            result = fetch_page_with_delay(current, timeout=timeout, user_agent=user_agent, delay=delay)
+            process_crawl_result(state, result, allowed_hosts, log_path)
+
+            processed_since_save += 1
+            if processed_since_save >= save_every:
+                write_csv(state.records, output_path)
+                save_state(state, state_path, output_path)
+                log_message(log_path, f"Saved progress after {len(state.visited)} visited page(s)")
+                processed_since_save = 0
+    else:
+        with ThreadPoolExecutor(max_workers=workers) as executor:
+            # Maps each in-flight future to the URL it is fetching.
+            pending: dict[object, str] = {}
+
+            while pending or (state.queue and len(state.visited) < max_pages):
+                poll_runtime_control(control, log_path)
+
+                if control.stop_requested:
+                    user_stopped = True
+                    print("Stop requested. No new pages will be queued. Waiting for active requests to finish...")
+                    break
+
+                if control.paused:
+                    # Paused: submit nothing new, but keep collecting the
+                    # requests that were already in flight.
+                    if pending:
+                        completed, _ = wait(pending.keys(), timeout=0.2, return_when=FIRST_COMPLETED)
+                        for future in completed:
+                            pending.pop(future, None)
+                            result = future.result()
+                            process_crawl_result(state, result, allowed_hosts, log_path)
+                            processed_since_save += 1
+                    else:
+                        time.sleep(0.2)
+
+                    if processed_since_save >= save_every:
+                        write_csv(state.records, output_path)
+                        save_state(state, state_path, output_path)
+                        log_message(log_path, f"Saved progress after {len(state.visited)} visited page(s)")
+                        processed_since_save = 0
+                    continue
+
+                # Top up the pool to at most `workers` in-flight requests.
+                # A URL counts as visited as soon as it is submitted.
+                while state.queue and len(pending) < workers and len(state.visited) < max_pages:
+                    current = resolve_alias(state.queue.popleft(), state.alias_to_canonical)
+                    if current in state.visited:
+                        continue
+
+                    state.visited.add(current)
+                    register_record(state, current, "page")
+                    print_progress(state, max_pages, current)
+                    future = executor.submit(fetch_page_with_delay, current, timeout, user_agent, delay)
+                    pending[future] = current
+
+                if not pending:
+                    continue
+
+                # The short timeout keeps the loop polling the keyboard
+                # controls even when no request has finished yet.
+                completed, _ = wait(pending.keys(), timeout=0.2, return_when=FIRST_COMPLETED)
+                for future in completed:
+                    pending.pop(future, None)
+                    result = future.result()
+                    process_crawl_result(state, result, allowed_hosts, log_path)
+                    processed_since_save += 1
+
+                    if processed_since_save >= save_every:
+                        write_csv(state.records, output_path)
+                        save_state(state, state_path, output_path)
+                        log_message(log_path, f"Saved progress after {len(state.visited)} visited page(s)")
+                        processed_since_save = 0
+
+            # After a stop request, wait for the in-flight requests so their
+            # results are included in the final save.
+            if user_stopped and pending:
+                completed, _ = wait(pending.keys())
+                for future in completed:
+                    pending.pop(future, None)
+                    result = future.result()
+                    process_crawl_result(state, result, allowed_hosts, log_path)
+
+    # Final save, regardless of how the crawl ended.
+    write_csv(state.records, output_path)
+    save_state(state, state_path, output_path)
+    log_message(log_path, f"Final save completed with {len(state.records)} URL(s) recorded")
+    return state, user_stopped
+
+
+def parse_args() -> argparse.Namespace:
+    """Define the crawler's command-line options and parse the process arguments.
+
+    Options are registered in the order they should appear in ``--help``.
+    """
+    cli = argparse.ArgumentParser(
+        description="Crawl a website and export discovered internal URLs to a CSV sitemap.",
+    )
+    add = cli.add_argument
+
+    add("url", nargs="?", help="Starting URL to crawl, for example https://example.com")
+    add("-o", "--output", help=f"Output CSV path. Defaults to {DEFAULT_OUTPUT_NAME} in the script folder.")
+    add(
+        "--max-pages",
+        type=int,
+        default=DEFAULT_MAX_PAGES,
+        help=f"Maximum number of pages to crawl before stopping. Default: {DEFAULT_MAX_PAGES}",
+    )
+    add("--delay", type=float, default=0.0, help="Delay in seconds between requests. Default: 0")
+    add("--timeout", type=float, default=15.0, help="Request timeout in seconds. Default: 15")
+    add("--include-subdomains", action="store_true", help="Also crawl subdomains of the starting host.")
+    add(
+        "--include-documents",
+        action="store_true",
+        help="Include document links like PDF, CSV, DOC, and DOCX in the sitemap output.",
+    )
+    add(
+        "--save-every",
+        type=int,
+        default=DEFAULT_SAVE_EVERY,
+        help=f"Save progress after this many pages. Default: {DEFAULT_SAVE_EVERY}",
+    )
+    add(
+        "--resume",
+        action="store_true",
+        help="Resume from the saved crawl state if a state file already exists.",
+    )
+    add("--fresh", action="store_true", help="Ignore any saved crawl state and start over.")
+    # 0 means "not chosen on the command line"; main() then asks interactively.
+    add(
+        "--workers",
+        type=int,
+        default=0,
+        help=f"Number of worker threads. Use 1 to disable multithreading. Default when prompted on: {DEFAULT_WORKERS}",
+    )
+    return cli.parse_args()
+
+
+def run_crawl(
+    *,
+    start_url: str,
+    output_path: Path,
+    max_pages: int = DEFAULT_MAX_PAGES,
+    delay: float = 0.0,
+    timeout: float = 15.0,
+    include_subdomains: bool = False,
+    include_documents: bool = False,
+    save_every: int = DEFAULT_SAVE_EVERY,
+    workers: int = DEFAULT_WORKERS,
+    resume: bool = True,
+    fresh: bool = False,
+    user_agent: str = DEFAULT_USER_AGENT,
+) -> CrawlRunResult:
+    """Run one crawl without any prompting and return its outcome.
+
+    A URL without a scheme gets ``https://`` prepended. When a state file
+    exists next to ``output_path``, ``resume`` is true, and ``fresh`` is
+    false, the saved state is loaded and must match the start URL and the
+    document setting. Otherwise a new state is created. A state with no
+    visited pages is seeded from the site's XML sitemaps; a state that
+    already has visited pages gets its page limit raised so the run can make
+    progress.
+
+    Numeric settings are clamped to sane minimums (at least 1 worker, 1 page,
+    1 second timeout, 1 page per save, and a non-negative delay).
+
+    Raises:
+        ValueError: for a missing or non-HTTP(S) start URL, or a saved state
+            that does not match the requested crawl.
+    """
+    if not start_url:
+        raise ValueError("A starting URL is required.")
+
+    candidate = start_url if "://" in start_url else f"https://{start_url}"
+    normalized_start = normalize_url(candidate)
+    if not is_http_url(normalized_start):
+        raise ValueError("Only http and https URLs are supported.")
+
+    output_path = Path(output_path)
+    state_path = get_state_path(output_path)
+    log_path = get_log_path(output_path)
+
+    resuming = state_path.exists() and not fresh and resume
+    if not resuming:
+        state = initialize_state(normalized_start, include_subdomains, include_documents)
+    else:
+        state = load_state(state_path)
+        if state.start_url != normalized_start:
+            raise ValueError(
+                "The saved crawl state belongs to a different starting URL. "
+                "Use a different output name or start a fresh crawl."
+            )
+        if state.include_documents != include_documents:
+            raise ValueError(
+                "The saved crawl state uses a different document setting. "
+                "Keep the same choice or start a fresh crawl."
+            )
+
+    effective_workers = max(int(workers), 1)
+    effective_max_pages = max(int(max_pages), 1)
+    if not state.visited:
+        # Nothing crawled yet: widen the initial queue with sitemap URLs.
+        seed_from_xml_sitemaps(state, max(timeout, 1.0), user_agent, log_path)
+    else:
+        # Continuing an earlier crawl: make room beyond what was already visited.
+        resumed_limit = len(state.visited) + DEFAULT_RESUME_PAGE_INCREMENT
+        effective_max_pages = max(effective_max_pages, resumed_limit)
+
+    for startup_line in (
+        f"Starting crawl for {state.start_url}",
+        f"Output CSV: {output_path.resolve()}",
+        f"State file: {state_path.resolve()}",
+        f"Multithreading workers: {effective_workers}",
+        f"Include documents: {state.include_documents}",
+    ):
+        log_message(log_path, startup_line)
+
+    state, user_stopped = crawl_site(
+        state=state,
+        max_pages=effective_max_pages,
+        delay=max(delay, 0.0),
+        timeout=max(timeout, 1.0),
+        user_agent=user_agent,
+        state_path=state_path,
+        output_path=output_path,
+        log_path=log_path,
+        save_every=max(save_every, 1),
+        workers=effective_workers,
+    )
+
+    if user_stopped:
+        outcome = "Crawl stopped by user"
+    elif not state.queue:
+        outcome = "Crawl completed with empty queue"
+    elif len(state.visited) >= effective_max_pages:
+        outcome = "Crawl stopped at max page limit"
+    else:
+        outcome = "Crawl stopped before queue emptied"
+    log_message(log_path, outcome)
+
+    return CrawlRunResult(
+        state=state,
+        user_stopped=user_stopped,
+        output_path=output_path,
+        state_path=state_path,
+        log_path=log_path,
+        max_pages=effective_max_pages,
+        workers=effective_workers,
+    )
+
+
+def main() -> int:
+    """Run the interactive command-line crawler and return a process exit code.
+
+    Settings missing from the command line are asked for interactively (start
+    URL, output path, document handling, multithreading, and whether to
+    resume a saved crawl). The crawl itself is delegated to ``run_crawl``,
+    after which a summary is printed.
+
+    Returns 0 on success and 1 when the input is invalid or ``run_crawl``
+    rejects the request with ``ValueError``.
+    """
+    args = parse_args()
+
+    start_url = prompt_if_missing(args.url, "Enter the website URL to crawl: ")
+    if not start_url:
+        print("A starting URL is required.", file=sys.stderr)
+        return 1
+
+    if "://" not in start_url:
+        start_url = f"https://{start_url}"
+
+    normalized_start = normalize_url(start_url)
+    if not is_http_url(normalized_start):
+        print("Only http and https URLs are supported.", file=sys.stderr)
+        return 1
+
+    output_value = prompt_if_missing(args.output, f"Enter output CSV path [{DEFAULT_OUTPUT_NAME}]: ")
+    output_path = Path(output_value) if output_value else SCRIPT_DIR / DEFAULT_OUTPUT_NAME
+    state_path = get_state_path(output_path)
+    log_path = get_log_path(output_path)
+    include_documents = args.include_documents or prompt_yes_no(
+        "Include document links such as PDF, CSV, DOC, and DOCX in the sitemap?",
+        default=False,
+    )
+    workers = args.workers
+    if workers <= 0:
+        # No explicit worker count on the command line: ask the user.
+        enable_multithreading = prompt_yes_no(
+            f"Enable multithreading for faster scanning? {DEFAULT_WORKERS} worker threads will be used.",
+            default=True,
+        )
+        workers = DEFAULT_WORKERS if enable_multithreading else 1
+
+    print(f"Crawling {normalized_start}")
+    print(f"Output file: {output_path.resolve()}")
+    print(f"State file: {state_path.resolve()}")
+    print(f"Log file: {log_path.resolve()}")
+    resume_existing = False
+    if state_path.exists() and not args.fresh:
+        resume_existing = args.resume or prompt_yes_no(
+            f"Found saved crawl state at {state_path.name}. Resume from where it left off?",
+            default=True,
+        )
+
+    # The keyboard controls only work while the crawl is running, so the hint
+    # has to be shown before it starts.
+    if os.name == "nt":
+        print("Press P to pause, R to resume, or Q to stop cleanly and save progress.")
+
+    try:
+        run_result = run_crawl(
+            start_url=normalized_start,
+            output_path=output_path,
+            max_pages=args.max_pages,
+            delay=args.delay,
+            timeout=args.timeout,
+            include_subdomains=args.include_subdomains,
+            include_documents=include_documents,
+            save_every=args.save_every,
+            workers=workers,
+            resume=resume_existing,
+            fresh=args.fresh,
+            user_agent=DEFAULT_USER_AGENT,
+        )
+    except ValueError as exc:
+        print(str(exc), file=sys.stderr)
+        return 1
+
+    state = run_result.state
+    user_stopped = run_result.user_stopped
+    effective_max_pages = run_result.max_pages
+
+    print(f"Max pages: {effective_max_pages}")
+    print(f"Include documents: {'Yes' if state.include_documents else 'No'}")
+    print(f"Multithreading: {'Yes' if run_result.workers > 1 else 'No'}")
+    print(f"Worker threads: {run_result.workers}")
+    if resume_existing:
+        print("Resumed from the existing crawl state file.")
+        log_message(log_path, "Resumed from existing crawl state")
+
+    print(f"Found {len(state.records)} unique URL(s).")
+    print(f"Visited pages: {len(state.visited)}")
+    print(f"Queued pages remaining: {len(state.queue)}")
+    print(f"URLs added from XML sitemaps: {state.discovered_from_sitemaps}")
+    if state.errors:
+        print(f"Pages with errors: {len(state.errors)}")
+        # Only the first few errors are shown on the console.
+        for result in state.errors[:10]:
+            print(f"  {result['url']} -> {result['error']}")
+    if state.skipped_count:
+        print(f"Non-HTML pages skipped while crawling: {state.skipped_count}")
+
+    # run_crawl has already written the outcome to the log file, so it is
+    # only reported on the console here.
+    if user_stopped:
+        print("Stopped by user. Run it again to continue from the saved state.")
+    elif state.queue and len(state.visited) >= effective_max_pages:
+        print("Stopped because the max page limit was reached. Run it again to continue.")
+    elif state.queue:
+        print("Stopped before the queue was empty. Run it again to continue.")
+    else:
+        print("Crawl complete. No queued pages remain.")
+
+    print("Done.")
+    return 0
+
+
+# Run the command-line crawler only when this file is executed as a script;
+# the integer returned by main() becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -0,0 +1,210 @@
+from __future__ import annotations
+
+import contextlib
+import csv
+import importlib.util
+import io
+import os
+import re
+import sys
+from pathlib import Path
+
+import streamlit as st
+
+
+# Directory that contains this Streamlit entry-point script.
+ROOT_DIR = Path(__file__).resolve().parent
+# Locations of the two bundled tools, relative to ROOT_DIR.
+PAGE_IMPORTER_DIR = ROOT_DIR / "Page Importer"
+SITEMAP_BUILDER_PATH = ROOT_DIR / "Sitemap Builder" / "sitemap_builder.py"
+# Data directory: the APP_DATA_DIR environment variable overrides the default
+# of ROOT_DIR/.data.
+APP_DATA_DIR = Path(os.environ.get("APP_DATA_DIR", ROOT_DIR / ".data")).resolve()
+# Directory where render_sitemap_tab places the crawl CSV for each job.
+SITEMAP_OUTPUT_DIR = APP_DATA_DIR / "sitemaps"
+
+
+def load_module(module_name: str, file_path: Path):
+    """Import the Python source file at ``file_path`` under ``module_name``.
+
+    The module is registered in ``sys.modules`` before it is executed, so it
+    can be found by name while its own top-level code runs. The source is
+    executed on every call, even when the name is already registered.
+
+    If execution fails, the registration is rolled back: a previously
+    registered module of the same name is restored, otherwise the name is
+    removed. This keeps a half-initialised module out of ``sys.modules``.
+
+    Raises:
+        RuntimeError: when no import spec or loader can be created for
+            ``file_path``.
+        Exception: whatever the module's top-level code raises.
+    """
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"Unable to load module from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    previous = sys.modules.get(module_name)
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Undo the registration so later lookups never see a broken module.
+        if previous is None:
+            sys.modules.pop(module_name, None)
+        else:
+            sys.modules[module_name] = previous
+        raise
+    return module
+
+
+def get_page_importer_module():
+    """Load the Page Importer's ``app.py`` as the module ``page_importer_streamlit``.
+
+    The importer directory is put at the front of ``sys.path`` first (once),
+    so imports made by ``app.py`` can be resolved from that directory.
+    """
+    importer_dir = str(PAGE_IMPORTER_DIR)
+    if importer_dir not in sys.path:
+        sys.path.insert(0, importer_dir)
+    entry_point = PAGE_IMPORTER_DIR / "app.py"
+    return load_module("page_importer_streamlit", entry_point)
+
+
+def get_sitemap_module():
+    """Load the sitemap builder script as the module ``sitemap_builder_module``."""
+    sitemap_builder = load_module("sitemap_builder_module", SITEMAP_BUILDER_PATH)
+    return sitemap_builder
+
+
+def sanitize_job_name(value: str) -> str:
+    """Turn a user-supplied job name into a safe file-name stem.
+
+    Each run of characters other than ASCII letters, digits, ``.``, ``_``,
+    and ``-`` becomes a single ``-``; leading and trailing dots and hyphens
+    are then removed. An empty result (or an empty/``None`` input) falls back
+    to ``"sitemap"``.
+    """
+    raw = value.strip() if value else ""
+    slug = re.sub(r"[^A-Za-z0-9._-]+", "-", raw).strip(".-")
+    if not slug:
+        return "sitemap"
+    return slug
+
+
+def read_csv_preview(csv_bytes: bytes, limit: int = 200) -> list[dict[str, str]]:
+    """Parse CSV bytes and return at most ``limit`` rows as dictionaries.
+
+    The bytes are decoded as UTF-8 (a leading byte-order mark is dropped and
+    undecodable bytes are replaced). The first CSV row provides the keys.
+    """
+    decoded = csv_bytes.decode("utf-8-sig", errors="replace")
+    parsed_rows = csv.DictReader(io.StringIO(decoded))
+    # Zipping with a range caps the number of rows taken from the reader.
+    return [dict(entry) for _, entry in zip(range(limit), parsed_rows)]
+
+
+def render_sitemap_tab() -> None:
+    """Render the Sitemap Generator tab: crawl form, run, and result view.
+
+    Submitting the form runs the crawl synchronously via the sitemap builder
+    module's ``run_crawl`` and captures its console output. A summary of the
+    latest successful run is kept in ``st.session_state["sitemap_result"]``
+    and rendered on every pass, together with download buttons for the CSV,
+    state file, and log file. Files are read from the paths recorded in that
+    summary.
+    """
+    st.title("Sitemap Generator")
+    st.caption("Crawl a site, export a sitemap CSV, and keep resume data inside the container data volume.")
+
+    SITEMAP_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+    with st.form("sitemap-form"):
+        start_url = st.text_input("Starting URL", placeholder="https://example.com")
+        job_name = st.text_input(
+            "Output name",
+            value="sitemap",
+            help="Used for the CSV, crawl state, and log file names.",
+        )
+
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            max_pages = st.number_input("Max pages", min_value=1, value=10000, step=100)
+            workers = st.number_input("Worker threads", min_value=1, value=8, step=1)
+        with col2:
+            delay = st.number_input("Delay between requests (seconds)", min_value=0.0, value=0.0, step=0.25)
+            timeout = st.number_input("Request timeout (seconds)", min_value=1.0, value=15.0, step=1.0)
+        with col3:
+            save_every = st.number_input("Save progress every N pages", min_value=1, value=25, step=1)
+            include_subdomains = st.checkbox("Include subdomains", value=False)
+            include_documents = st.checkbox("Include document links", value=False)
+
+        resume_existing = st.checkbox("Resume from saved crawl state if present", value=True)
+        start_fresh = st.checkbox("Ignore any saved crawl state and start fresh", value=False)
+        submitted = st.form_submit_button("Run Sitemap Crawl", type="primary")
+
+    if submitted:
+        # Validate and crawl the same trimmed value, so stray whitespace from
+        # a pasted URL never reaches the crawler.
+        cleaned_url = start_url.strip()
+        if not cleaned_url:
+            st.error("Starting URL is required.")
+        else:
+            sitemap_builder = get_sitemap_module()
+            safe_name = sanitize_job_name(job_name)
+            output_path = SITEMAP_OUTPUT_DIR / f"{safe_name}.csv"
+            captured_stdout = io.StringIO()
+
+            try:
+                with st.spinner("Running sitemap crawl..."):
+                    # The crawler reports progress with print(); capture it so
+                    # it can be shown in the page afterwards.
+                    with contextlib.redirect_stdout(captured_stdout):
+                        result = sitemap_builder.run_crawl(
+                            start_url=cleaned_url,
+                            output_path=output_path,
+                            max_pages=int(max_pages),
+                            delay=float(delay),
+                            timeout=float(timeout),
+                            include_subdomains=include_subdomains,
+                            include_documents=include_documents,
+                            save_every=int(save_every),
+                            workers=int(workers),
+                            resume=resume_existing,
+                            fresh=start_fresh,
+                        )
+            except Exception as exc:
+                # UI boundary: show any failure in the page instead of
+                # letting it abort the script run.
+                st.error(str(exc))
+            else:
+                st.session_state["sitemap_result"] = {
+                    "summary": {
+                        "records": len(result.state.records),
+                        "visited": len(result.state.visited),
+                        "queued": len(result.state.queue),
+                        "errors": len(result.state.errors),
+                        "skipped": result.state.skipped_count,
+                        "from_sitemaps": result.state.discovered_from_sitemaps,
+                        "user_stopped": result.user_stopped,
+                        "max_pages": result.max_pages,
+                        "workers": result.workers,
+                    },
+                    "output_path": str(result.output_path),
+                    "state_path": str(result.state_path),
+                    "log_path": str(result.log_path),
+                    "stdout": captured_stdout.getvalue(),
+                }
+
+    result_data = st.session_state.get("sitemap_result")
+    if not result_data:
+        st.info("Run a crawl to generate a sitemap CSV.")
+        return
+
+    summary = result_data["summary"]
+    csv_path = Path(result_data["output_path"])
+    state_path = Path(result_data["state_path"])
+    log_path = Path(result_data["log_path"])
+
+    st.subheader("Crawl Summary")
+    metric_cols = st.columns(6)
+    metric_cols[0].metric("URLs Found", summary["records"])
+    metric_cols[1].metric("Visited", summary["visited"])
+    metric_cols[2].metric("Queued", summary["queued"])
+    metric_cols[3].metric("XML Seeded", summary["from_sitemaps"])
+    metric_cols[4].metric("Errors", summary["errors"])
+    metric_cols[5].metric("Skipped", summary["skipped"])
+
+    status_text = "Stopped by user." if summary["user_stopped"] else "Run completed."
+    st.caption(f"{status_text} Max pages used: {summary['max_pages']} | Worker threads: {summary['workers']}")
+
+    if csv_path.exists():
+        csv_bytes = csv_path.read_bytes()
+        st.download_button(
+            "Download Sitemap CSV",
+            data=csv_bytes,
+            file_name=csv_path.name,
+            mime="text/csv",
+        )
+        preview_rows = read_csv_preview(csv_bytes)
+        if preview_rows:
+            st.dataframe(preview_rows, width="stretch", hide_index=True)
+
+    file_cols = st.columns(2)
+    with file_cols[0]:
+        if state_path.exists():
+            st.download_button(
+                "Download Crawl State",
+                data=state_path.read_bytes(),
+                file_name=state_path.name,
+                mime="application/json",
+            )
+    with file_cols[1]:
+        if log_path.exists():
+            st.download_button(
+                "Download Crawl Log",
+                data=log_path.read_bytes(),
+                file_name=log_path.name,
+                mime="text/plain",
+            )
+
+    crawl_output = (result_data.get("stdout") or "").strip()
+    if crawl_output:
+        st.text_area("Crawler Output", value=crawl_output, height=220, disabled=True)
+
+    if log_path.exists():
+        log_text = log_path.read_text(encoding="utf-8", errors="replace")
+        # Show only the last 50 log lines.
+        st.text_area("Log Tail", value="\n".join(log_text.splitlines()[-50:]), height=220, disabled=True)
+
+
+def main() -> None:
+    """Configure the Streamlit page and render both tools in separate tabs.
+
+    The sitemap generator is rendered by ``render_sitemap_tab``; the page
+    importer is loaded from its own ``app.py`` and rendered through that
+    module's ``render_app``.
+    """
+    st.set_page_config(page_title="WDW Tools", layout="wide")
+    st.header("WDW Sitemap And Import Tools")
+
+    tab_labels = ["Sitemap Generator", "Page Importer"]
+    sitemap_tab, importer_tab = st.tabs(tab_labels)
+
+    with sitemap_tab:
+        render_sitemap_tab()
+
+    with importer_tab:
+        get_page_importer_module().render_app()
+
+
+# Build the page only when this file is run as the main script.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,4 @@
+streamlit>=1.43,<2
+requests>=2.32,<3
+beautifulsoup4>=4.12,<5
+python-dateutil>=2.9,<3