first commit

2026-04-09 10:42:10 -07:00
commit ead872a0a5
19 changed files with 2783 additions and 0 deletions
@@ -0,0 +1,13 @@
 .git
 .gitignore
 .codex
 **/.git
 **/.venv
 **/__pycache__
 **/*.pyc
 **/*.pyo
 **/*.pyd
 **/.pytest_cache
 **/.mypy_cache
 **/.DS_Store
 .data
@@ -0,0 +1,42 @@
 # Builds the Docker image on every push to main (or on manual dispatch) and, when all
 # registry settings are present, pushes it tagged with the commit SHA and `latest`.
 name: Build Docker Image
 on:
  push:
    branches:
      - main
  workflow_dispatch:
 env:
  IMAGE_NAME: wdw-sitemap-and-importer
  # Registry settings are read from repository secrets; the login/push steps below are
  # skipped whenever any of the three is empty.
  REGISTRY: ${{ secrets.REGISTRY_URL }}
  REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
  REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
 jobs:
  docker:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
      # The image is always built and tagged locally, even when nothing is pushed.
      - name: Build image
        run: docker build -t "${IMAGE_NAME}:${GITHUB_SHA}" .
      - name: Tag latest image
        run: docker tag "${IMAGE_NAME}:${GITHUB_SHA}" "${IMAGE_NAME}:latest"
      # The password is piped through stdin so it does not appear in the command line.
      - name: Log in to registry
        if: ${{ env.REGISTRY != '' && env.REGISTRY_USERNAME != '' && env.REGISTRY_PASSWORD != '' }}
        run: echo "${REGISTRY_PASSWORD}" | docker login "${REGISTRY}" -u "${REGISTRY_USERNAME}" --password-stdin
      - name: Push commit image
        if: ${{ env.REGISTRY != '' && env.REGISTRY_USERNAME != '' && env.REGISTRY_PASSWORD != '' }}
        run: |
          docker tag "${IMAGE_NAME}:${GITHUB_SHA}" "${REGISTRY}/${IMAGE_NAME}:${GITHUB_SHA}"
          docker push "${REGISTRY}/${IMAGE_NAME}:${GITHUB_SHA}"
      - name: Push latest image
        if: ${{ env.REGISTRY != '' && env.REGISTRY_USERNAME != '' && env.REGISTRY_PASSWORD != '' }}
        run: |
          docker tag "${IMAGE_NAME}:latest" "${REGISTRY}/${IMAGE_NAME}:latest"
          docker push "${REGISTRY}/${IMAGE_NAME}:latest"
@@ -0,0 +1,15 @@
 .codex
 .data/
 __pycache__/
 *.py[cod]
 .venv/
 **/.venv/
 **/__pycache__/
 .pytest_cache/
 .mypy_cache/
 *.crawl.log
 *.crawlstate.json
 streamlit_uploads/
@@ -0,0 +1,22 @@
 # Container image that runs the root Streamlit application (app.py) on port 8501.
 FROM python:3.14-slim
 # Python/pip housekeeping plus Streamlit server settings (headless, 0.0.0.0:8501);
 # APP_DATA_DIR is set to /data, which is created further down.
 ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    STREAMLIT_SERVER_HEADLESS=true \
    STREAMLIT_SERVER_PORT=8501 \
    STREAMLIT_SERVER_ADDRESS=0.0.0.0 \
    APP_DATA_DIR=/data
 WORKDIR /app
 # Copy and install requirements before the rest of the source so the dependency layer
 # is reused when only application code changes.
 COPY requirements.txt ./requirements.txt
 RUN pip install -r requirements.txt
 COPY . .
 RUN mkdir -p /data
 EXPOSE 8501
 CMD ["streamlit", "run", "app.py"]
@@ -0,0 +1,12 @@
 .venv/
 __pycache__/
 *.py[cod]
 *$py.class
 .pytest_cache/
 .mypy_cache/
 .ruff_cache/
 .streamlit/secrets.toml
 *.log
@@ -0,0 +1,63 @@
 # Page Importer
 This folder contains the WordPress import tool used by the combined application in the repository root.
 The importer still uses Streamlit internally, but it is now rendered as the `Page Importer` tab inside the shared app rather than being the main entrypoint for the repository.
 ## Features
 - Upload a CSV of submitted URLs
 - Choose the URL column and optional title override column
 - Optionally map post type from the CSV or force a single post type
 - Scrape only the listed URLs
 - Extract title, publish date, author, body HTML, categories, and tags
 - Retry failed rows
 - Export a WordPress WXR XML file
 ## Recommended Usage
 Run the root application:
 ```bash
 streamlit run ../app.py
 ```
 Or run the combined Docker container from the repository root.
 ## Standalone Usage
 If you need to run this importer by itself:
 ```bash
 python3 -m venv .venv
 source .venv/bin/activate
 pip install -r requirements.txt
 streamlit run app.py
 ```
 On Windows PowerShell:
 ```powershell
 python -m venv .venv
 .venv\Scripts\Activate.ps1
 pip install -r requirements.txt
 streamlit run app.py
 ```
 ## CSV Input
 The app accepts CSV files with any columns. You choose:
 - the URL column to scrape
 - an optional title or name column to override the scraped title
 - an optional post type column with values like `post` or `page`
 - an optional category column whose values are appended during export
 You can also enter additional categories in the sidebar (separate multiple values with commas); they are appended to every exported item.
 ## Notes
 - Exported posts default to `draft` unless changed in the UI
 - Image and link URLs remain pointed at the source site
 - Some themes need the heuristic fallback. The `Force heuristic scraping` option skips the JSON-LD-first extraction and relies on the page structure instead.
 - In the combined app, dependencies come from the root `requirements.txt`
@@ -0,0 +1,475 @@
 from __future__ import annotations
 import csv
 import datetime as dt
 import io
 import re
 from dataclasses import replace
 import streamlit as st
 from page_importer.dates import parse_datetime
 from page_importer.models import ScrapeOptions, ScrapedPost
 from page_importer.scraper import Scraper
 from page_importer.wxr import build_wxr
 def load_csv(file_data: bytes) -> tuple[list[str], list[dict[str, str]]]:
    """Parse uploaded CSV bytes into header names and row dictionaries.

    Args:
        file_data: Raw bytes of the uploaded CSV. Decoded as UTF-8 (a leading BOM is
            dropped); undecodable bytes are replaced rather than raising.

    Returns:
        A ``(headers, rows)`` tuple. ``headers`` is empty when the file has no header
        line; ``rows`` holds one dict per data line keyed by header name.
    """
    text = file_data.decode("utf-8-sig", errors="replace")
    # newline="" leaves line endings untranslated and lets the csv module split records
    # itself, so CR-only files parse instead of raising
    # "new-line character seen in unquoted field".
    reader = csv.DictReader(io.StringIO(text, newline=""))
    rows = list(reader)
    return list(reader.fieldnames or []), rows
 def render_app() -> None:
    """Render the Page Importer: sidebar options, CSV upload, scraping, results, export.

    Scrape results, the rows and headers they were scraped from, and the scrape context
    are kept in ``st.session_state`` so they survive the reruns Streamlit performs on
    every widget interaction.
    """
    st.title("Page Importer")
    st.caption("Scrape blog posts from CSV URLs and export a WordPress WXR file.")
    with st.sidebar:
        st.header("Options")
        include_author = st.checkbox("Include author", value=True)
        include_categories = st.checkbox("Include categories", value=True)
        include_tags = st.checkbox("Include tags", value=True)
        force_heuristics = st.checkbox("Force heuristic scraping", value=False)
        test_run = st.checkbox(
            "Test run only",
            value=False,
            help="Scrape only the first 10 rows that contain a URL.",
        )
        post_type_mode = st.selectbox(
            "WordPress post type mode",
            ["Single type for all rows", "Use a CSV column"],
            index=0,
        )
        default_post_type = st.selectbox("Default WordPress post type", ["post", "page"], index=0)
    uploaded = st.file_uploader("Upload CSV", type=["csv"])
    if not uploaded:
        st.info("Upload a CSV to begin.")
        return
    headers, rows = load_csv(uploaded.getvalue())
    if not rows:
        st.error("The CSV did not contain any rows.")
        return
    # Column pickers; _safe_index pre-selects a likely column by header name.
    col1, col2, col3 = st.columns(3)
    with col1:
        url_column = st.selectbox("URL column", headers, index=_safe_index(headers, ["url", "link"]))
    with col2:
        title_column = st.selectbox(
            "Optional title override column",
            ["(none)", *headers],
            index=_safe_index(["(none)", *headers], ["name", "title"]),
        )
    with col3:
        post_type_column = st.selectbox(
            "Optional post type column",
            ["(none)", *headers],
            index=_safe_index(["(none)", *headers], ["post_type", "type"]),
            disabled=post_type_mode != "Use a CSV column",
        )
    st.write(f"Loaded {len(rows)} row(s). Only the selected URL column will be scraped.")
    if test_run:
        st.caption("Test run is enabled. Only the first 10 rows with a URL will be scraped.")
    if st.button("Scrape URLs", type="primary"):
        context = build_scrape_context(
            include_author=include_author,
            include_categories=include_categories,
            include_tags=include_tags,
            force_heuristics=force_heuristics,
            test_run=test_run,
            post_type_mode=post_type_mode,
            post_type_column=post_type_column,
            default_post_type=default_post_type,
            url_column=url_column,
            title_column=title_column,
        )
        results = scrape_rows(rows, context, phase_label="Scraping")
        st.session_state["results"] = results
        st.session_state["input_rows"] = rows
        # Keep the headers alongside the rows so export stays aligned with the results.
        st.session_state["input_headers"] = headers
        st.session_state["scrape_context"] = context
    results = st.session_state.get("results", [])
    if not results:
        return
    successful = [post for post in results if post.success]
    failed = [post for post in results if not post.success]
    st.subheader("Results")
    st.write(f"Successful: {len(successful)} | Failed: {len(failed)}")
    if failed and st.button("Retry failed items"):
        # Retry against the rows and context of the original scrape, not the current widgets.
        stored_rows = st.session_state.get("input_rows", rows)
        context = st.session_state.get("scrape_context")
        if context:
            retried = scrape_rows(
                stored_rows,
                context,
                row_numbers=[post.row_number for post in failed if post.row_number],
                phase_label="Retrying",
            )
            results = merge_retry_results(results, retried)
            st.session_state["results"] = results
            successful = [post for post in results if post.success]
            failed = [post for post in results if not post.success]
    preview_rows = [
        {
            "Row": post.row_number,
            "URL": post.source_url,
            "CMS": post.cms,
            "Success": post.success,
            "Title": post.title,
            "Publish Date": post.publish_date,
            "Author": post.author,
            "Categories": ", ".join(post.categories),
            "Tags": ", ".join(post.tags),
            "Post Type": post.post_type,
            "Error": post.error,
        }
        for post in results
    ]
    st.dataframe(
        preview_rows,
        width="stretch",
        hide_index=True,
        column_config={
            "Row": st.column_config.NumberColumn(width="small"),
            "URL": st.column_config.TextColumn(width="medium"),
            "Title": st.column_config.TextColumn(width="medium"),
            "Publish Date": st.column_config.TextColumn(width="medium"),
            "Categories": st.column_config.TextColumn(width="medium"),
            "Tags": st.column_config.TextColumn(width="medium"),
            "Error": st.column_config.TextColumn(width="large"),
        },
    )
    if failed:
        selected_failed = st.selectbox(
            "Failed row details",
            failed,
            format_func=lambda post: f"Row {post.row_number}: {post.source_url or '(missing URL)'}",
        )
        st.text_area(
            "Error details",
            value=selected_failed.error_details or selected_failed.error,
            height=180,
            disabled=True,
        )
    if successful:
        selected_index = st.number_input(
            "Preview successful row",
            min_value=1,
            max_value=len(successful),
            value=1,
            step=1,
        )
        selected = successful[selected_index - 1]
        st.markdown("### Content Preview")
        st.write(f"**Title:** {selected.title}")
        st.write(f"**Source URL:** {selected.source_url}")
        st.write(f"**Publish Date:** {selected.publish_date or '(missing)'}")
        st.write(f"**Author:** {selected.author or '(missing)'}")
        st.write(f"**Post Type:** {selected.post_type}")
        # NOTE(review): scraped HTML is rendered unescaped; this relies on the scraper's
        # sanitize_html step having cleaned it - confirm that is sufficient.
        st.write(selected.body_html, unsafe_allow_html=True)
        # Export against the rows/headers that produced these results rather than
        # whatever CSV is uploaded now: row numbers in the results index the stored rows.
        export_rows = st.session_state.get("input_rows", rows)
        export_headers = st.session_state.get("input_headers", headers)
        render_export_sidebar(successful, export_rows, export_headers)
 def build_scrape_context(
    *,
    include_author: bool,
    include_categories: bool,
    include_tags: bool,
    force_heuristics: bool,
    test_run: bool,
    post_type_mode: str,
    post_type_column: str,
    default_post_type: str,
    url_column: str,
    title_column: str,
 ) -> dict[str, object]:
    """Bundle the UI selections into the context dict consumed by ``scrape_rows``.

    The four extraction flags are wrapped in a ``ScrapeOptions`` under ``"options"``;
    every other argument is stored under a key equal to its parameter name.
    """
    scrape_options = ScrapeOptions(
        include_author=include_author,
        include_categories=include_categories,
        include_tags=include_tags,
        force_heuristics=force_heuristics,
    )
    context: dict[str, object] = {"options": scrape_options}
    context.update(
        test_run=test_run,
        post_type_mode=post_type_mode,
        post_type_column=post_type_column,
        default_post_type=default_post_type,
        url_column=url_column,
        title_column=title_column,
    )
    return context
 def scrape_rows(
    rows: list[dict[str, str]],
    context: dict[str, object],
    row_numbers: list[int] | None = None,
    phase_label: str = "Scraping",
 ) -> list[ScrapedPost]:
    """Scrape the selected CSV rows and report progress in the Streamlit UI.

    Args:
        rows: All CSV rows; row numbers are 1-based positions in this list.
        context: Dict produced by ``build_scrape_context``.
        row_numbers: When given, only these row numbers are scraped (used for retries)
            and the test-run limit is ignored.
        phase_label: Verb shown in the status line, e.g. ``"Scraping"``.

    Returns:
        One ``ScrapedPost`` per targeted row, in row order. Rows without a URL yield a
        failed post instead of being skipped.

    Raises:
        TypeError: If ``context["options"]`` is not a ``ScrapeOptions``.
    """
    options = context["options"]
    if not isinstance(options, ScrapeOptions):
        raise TypeError("Invalid scrape options in session state.")
    # Resolve the column name once so filtering and scraping use the same key.
    url_column = str(context["url_column"])
    targets = list(enumerate(rows, start=1))
    if row_numbers is not None:
        requested_rows = set(row_numbers)
        targets = [(row_number, row) for row_number, row in targets if row_number in requested_rows]
    elif bool(context.get("test_run")):
        # Test run: only the first 10 rows that actually contain a URL.
        targets = [
            (row_number, row)
            for row_number, row in targets
            if (row.get(url_column) or "").strip()
        ][:10]
    results: list[ScrapedPost] = []
    progress = st.progress(0.0)
    status = st.empty()
    total = len(targets) or 1  # avoid dividing by zero when nothing is targeted
    scraper = Scraper(options)
    try:
        for index, (row_number, row) in enumerate(targets, start=1):
            url = (row.get(url_column) or "").strip()
            status.write(f"{phase_label} {index}/{len(targets)}: {url or f'row {row_number} has no URL'}")
            if url:
                post = scraper.scrape(url)
            else:
                post = ScrapedPost(
                    source_url="",
                    row_number=row_number,
                    error="Missing URL in the selected URL column.",
                    error_details=f"Row {row_number} does not contain a URL in column '{context['url_column']}'.",
                )
            post.row_number = row_number
            apply_row_overrides(post, row, context)
            results.append(post)
            progress.progress(index / total)
    finally:
        # The scraper owns a requests.Session; release its pooled connections.
        scraper.session.close()
    status.write(f"{phase_label} complete.")
    return results
 def apply_row_overrides(post: ScrapedPost, row: dict[str, str], context: dict[str, object]) -> None:
    """Apply per-row CSV overrides (title and post type) to ``post`` in place.

    Args:
        post: The scraped post to update.
        row: The CSV row the post was scraped from.
        context: Dict produced by ``build_scrape_context``.
    """
    title_column = context["title_column"]
    if isinstance(title_column, str) and title_column != "(none)":
        # Strip before testing so a whitespace-only cell does not blank out the
        # scraped title.
        title_override = (row.get(title_column) or "").strip()
        if title_override:
            post.title = title_override
    post.post_type = resolve_post_type(
        row=row,
        mode=str(context["post_type_mode"]),
        column=str(context["post_type_column"]),
        default_value=str(context["default_post_type"]),
    )
 def resolve_export_categories(
    row: dict[str, str],
    category_column: str,
    manual_categories: list[str],
 ) -> list[str]:
    """Return the row's CSV categories followed by the manual ones, without duplicates.

    No CSV categories are read when ``category_column`` is ``"(none)"``.
    """
    if category_column == "(none)":
        from_csv: list[str] = []
    else:
        from_csv = parse_terms(row.get(category_column, ""))
    return merge_unique_terms(from_csv, manual_categories)
 def parse_terms(value: str) -> list[str]:
    """Split ``value`` on ``,``, ``|`` or ``>`` and return the non-empty, trimmed pieces."""
    pieces = re.split(r"[,|>]", value or "")
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]
 def merge_unique_terms(*groups: list[str]) -> list[str]:
    """Flatten ``groups`` into one list of trimmed, non-empty terms.

    The first occurrence of each term wins and original order is preserved.
    """
    ordered: dict[str, None] = {}
    for group in groups:
        for term in group:
            cleaned = term.strip()
            if cleaned:
                # dict keys keep insertion order and ignore repeats
                ordered.setdefault(cleaned, None)
    return list(ordered)
 def merge_retry_results(existing: list[ScrapedPost], replacements: list[ScrapedPost]) -> list[ScrapedPost]:
    """Swap retried posts into ``existing`` by row number and return them sorted by row.

    Posts without a matching replacement are kept unchanged; a missing or zero row
    number sorts first.
    """
    retried_by_row = {}
    for retried in replacements:
        retried_by_row[retried.row_number] = retried
    combined = []
    for current in existing:
        combined.append(retried_by_row.get(current.row_number, current))
    combined.sort(key=lambda item: item.row_number or 0)
    return combined
 def build_export_posts(
    posts: list[ScrapedPost],
    rows: list[dict[str, str]],
    category_column: str,
    manual_categories: list[str],
    post_status: str,
    custom_post_type_slug: str,
 ) -> list[ScrapedPost]:
    """Return copies of ``posts`` prepared for export; the inputs are left untouched.

    Each copy gets ``post_status``, the custom post type slug when one is set, and its
    scraped categories extended with the row's CSV categories and the manual ones.
    """
    row_count = len(rows)
    prepared: list[ScrapedPost] = []
    for post in posts:
        # Row numbers are 1-based; anything out of range contributes no CSV categories.
        if 0 < post.row_number <= row_count:
            source_row = rows[post.row_number - 1]
        else:
            source_row = {}
        extra_categories = resolve_export_categories(source_row, category_column, manual_categories)
        export_copy = replace(
            post,
            status=post_status,
            post_type=custom_post_type_slug or post.post_type,
            categories=merge_unique_terms(post.categories, extra_categories),
        )
        prepared.append(export_copy)
    return prepared
 def render_export_sidebar(
    successful: list[ScrapedPost],
    rows: list[dict[str, str]],
    headers: list[str],
 ) -> None:
    """Render the sidebar export controls and the WXR download button.

    Args:
        successful: Successfully scraped posts that are candidates for export.
        rows: CSV rows the posts were scraped from (used for the category column).
        headers: CSV header names offered in the category-column picker.
    """
    default_output_name = "wordpress-import.xml"
    with st.sidebar:
        st.markdown("---")
        st.subheader("Export")
        post_status = st.selectbox(
            "Imported post status",
            ["draft", "publish", "private"],
            index=0,
            key="export_post_status",
        )
        category_column = st.selectbox(
            "CSV category column",
            ["(none)", *headers],
            index=_safe_index(["(none)", *headers], ["category", "categories", "department"]),
            key="export_category_column",
        )
        manual_categories = parse_terms(
            st.text_input(
                "Additional export categories",
                value="",
                help="Comma-separated categories to append to every exported item.",
                key="export_manual_categories",
            )
        )
        output_name = st.text_input(
            "Output filename",
            value=default_output_name,
            key="export_output_name",
        )
        custom_post_type_slug = normalize_post_type_slug(
            st.text_input(
                "Custom post type slug",
                value="",
                help="Optional. If set, all exported items will use this WordPress post type slug.",
                key="export_custom_post_type_slug",
            )
        )
        export_posts = build_export_posts(
            successful,
            rows,
            category_column,
            manual_categories,
            post_status,
            custom_post_type_slug,
        )
        if custom_post_type_slug:
            st.caption(f"Exporting all items as post type `{custom_post_type_slug}`.")
        # Pair each post with its parsed publish date; posts without one are left out.
        dated_export_posts = [(post, publish_date) for post in export_posts if (publish_date := parse_publish_date(post.publish_date))]
        if dated_export_posts:
            min_date = min(publish_date for _, publish_date in dated_export_posts)
            max_date = max(publish_date for _, publish_date in dated_export_posts)
            filter_by_publish_date = st.checkbox(
                "Filter export by publish date",
                value=False,
                key="export_filter_by_publish_date",
            )
            if filter_by_publish_date:
                export_start = st.date_input(
                    "Export start date",
                    value=min_date,
                    min_value=min_date,
                    max_value=max_date,
                    format="MM/DD/YYYY",
                    key="export_start_date",
                )
                export_end = st.date_input(
                    "Export end date",
                    value=max_date,
                    min_value=min_date,
                    max_value=max_date,
                    format="MM/DD/YYYY",
                    key="export_end_date",
                )
                if export_start > export_end:
                    st.error("Export start date must be on or before the end date.")
                    export_posts = []
                else:
                    # Reuse the dates parsed above instead of parsing every post again.
                    export_posts = [
                        post
                        for post, publish_date in dated_export_posts
                        if export_start <= publish_date <= export_end
                    ]
                    st.caption(
                        "Date filter: "
                        f"{export_start.strftime('%m/%d/%Y')} to {export_end.strftime('%m/%d/%Y')}."
                    )
                    undated_count = len(successful) - len(dated_export_posts)
                    if undated_count:
                        st.caption(f"Excluded {undated_count} successful item(s) with no publish date.")
        else:
            st.caption("No successful items have a publish date, so export date filtering is unavailable.")
        st.caption(f"Ready to export {len(export_posts)} post(s).")
        xml_data = build_wxr(export_posts)
        st.download_button(
            label="Download WXR XML",
            data=xml_data,
            # A cleared filename box would otherwise produce a download with no name.
            file_name=output_name.strip() or default_output_name,
            mime="application/xml",
            disabled=not export_posts,
        )
 def parse_publish_date(value: str) -> dt.date | None:
    """Return the calendar date parsed from ``value``, or ``None`` when it cannot be parsed."""
    moment = parse_datetime(value)
    return None if moment is None else moment.date()
 def _safe_index(values: list[str], candidates: list[str]) -> int:
    """Return the index in ``values`` of the first candidate found, or ``0`` if none match.

    ``values`` are compared lower-cased while ``candidates`` are used as given; when
    several values share a lower-cased form, the last one's index is reported.
    """
    positions: dict[str, int] = {}
    for position, value in enumerate(values):
        positions[value.lower()] = position
    return next((positions[name] for name in candidates if name in positions), 0)
 def resolve_post_type(
    row: dict[str, str],
    mode: str,
    column: str,
    default_value: str,
 ) -> str:
    """Choose the WordPress post type for one CSV row.

    The row's value is used only in ``"Use a CSV column"`` mode with a real column
    selected, and only when it normalizes to a non-empty slug; otherwise
    ``default_value`` is returned.
    """
    reads_from_csv = mode == "Use a CSV column" and column != "(none)"
    if reads_from_csv:
        slug = normalize_post_type_slug(row.get(column) or "")
        if slug:
            return slug
    return default_value
 def normalize_post_type_slug(value: str) -> str:
    """Lower-case and trim ``value``, then drop every character outside ``a-z0-9_-``."""
    lowered = (value or "").strip().lower()
    return re.sub(r"[^a-z0-9_-]", "", lowered)
 # Standalone entry point: configure the page and render the importer when this module
 # is executed directly.
 if __name__ == "__main__":
    st.set_page_config(page_title="Page Importer", layout="wide")
    render_app()
@@ -0,0 +1 @@
@@ -0,0 +1,26 @@
 from __future__ import annotations
 import datetime as dt
 from dateutil import parser as date_parser
 def parse_datetime(value: str | None) -> dt.datetime | None:
    """Parse ``value`` with dateutil, strictly first and then in fuzzy mode.

    Returns ``None`` for empty input or when both attempts fail.
    """
    if not value:
        return None
    # First pass uses dateutil's defaults; the second tolerates surrounding text.
    for parse_kwargs in ({}, {"fuzzy": True}):
        try:
            return date_parser.parse(value, **parse_kwargs)
        except (TypeError, ValueError, OverflowError):
            continue
    return None
 def normalize_date(value: str | None) -> str:
    """Format ``value`` as ``YYYY-MM-DD HH:MM:SS``, keeping the UTC offset when known.

    Returns an empty string when ``value`` cannot be parsed.
    """
    parsed = parse_datetime(value)
    if parsed is None:
        return ""
    has_offset = parsed.tzinfo is not None and parsed.utcoffset() is not None
    if has_offset:
        return parsed.isoformat(sep=" ", timespec="seconds")
    return parsed.strftime("%Y-%m-%d %H:%M:%S")
@@ -0,0 +1,34 @@
 from __future__ import annotations
 from dataclasses import dataclass, field
@dataclass
 class ScrapeOptions:
    """Settings that control how a page is fetched and which fields are extracted."""
    # Extraction toggles: the scraper fills the matching field only when its flag is set.
    include_author: bool = True
    include_categories: bool = True
    include_tags: bool = True
    # When set, the scraper does not apply JSON-LD article data and uses page structure only.
    force_heuristics: bool = False
    # Passed as the ``timeout`` of each HTTP request (seconds, per the requests API).
    request_timeout: int = 20
    # Sent as the User-Agent header on the scraper's HTTP session.
    user_agent: str = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0 Safari/537.36"
    )
@dataclass
 class ScrapedPost:
    """Result of scraping one URL: the extracted content plus success/error state."""
    # URL that was fetched; empty when the CSV row had no URL.
    source_url: str
    # 1-based position of the source row in the uploaded CSV; 0 until assigned.
    row_number: int = 0
    # One of "wordpress", "drupal", "joomla" or "unknown", as detected from the page.
    cms: str = "unknown"
    title: str = ""
    # Normalized date string, or "" when no publish date was found.
    publish_date: str = ""
    author: str = ""
    body_html: str = ""
    categories: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    # WordPress status and post type applied at export time.
    status: str = "draft"
    post_type: str = "post"
    # True only when both a title and body HTML were extracted.
    success: bool = False
    # Short failure summary and longer diagnostic text; both empty on success.
    error: str = ""
    error_details: str = ""
@@ -0,0 +1,555 @@
 from __future__ import annotations
 import json
 import re
 import traceback
 from html import unescape
 from typing import Iterable
 import requests
 from bs4 import BeautifulSoup
 from bs4.element import NavigableString, Tag
 from page_importer.dates import normalize_date
 from page_importer.models import ScrapeOptions, ScrapedPost
 # Lower-cased JSON-LD ``@type`` values that find_article_payload accepts as an article.
 JSON_ARTICLE_TYPES = {
    "article",
    "blogposting",
    "newsarticle",
    "report",
    "webpage",
 }
 # CSS selectors tried in order by extract_body; the first match with enough text wins.
 BODY_SELECTORS = [
    "article .entry-content",
    "article .post-content",
    "article .node__content",
    "article .node .content",
    "article .node-content",
    "article .field-name-body .field-item",
    "article .field-name-body",
    "article .field--name-body",
    "article .article-body",
    "article .content",
    ".post-content",
    ".entry-content",
    ".node__content",
    ".node .content",
    ".node-content",
    ".field-name-body .field-item",
    ".field-name-body",
    ".field--name-body",
    ".article-body",
    "#content-area .node .content",
    "article",
    "main article",
    "main",
 ]
 # Selectors whose link text is collected as categories by merge_fallback_data.
 # NOTE(review): ".terms a" is also listed in TAG_SELECTORS, so the same links can end
 # up as both categories and tags - confirm this is intended.
 CATEGORY_SELECTORS = [
    ".cat-links a",
    ".post-categories a",
    ".field--name-field-category a",
    ".tags a[rel='category tag']",
    ".terms a",
    ".taxonomy a",
 ]
 # Selectors whose link text is collected as tags when JSON-LD supplied none.
 TAG_SELECTORS = [
    ".tags-links a",
    ".post-tags a",
    ".field--name-field-tags a",
    "a[rel='tag']",
    ".terms a",
 ]
 # Selectors tried in order by extract_author after the <meta name="author"> lookup.
 AUTHOR_SELECTORS = [
    "[rel='author']",
    ".author a",
    ".byline a",
    ".submitted a",
    ".node__submitted a",
    ".node-info a",
    ".createdby",
 ]
 # Selectors tried in order by extract_date; the first one yielding a parseable date wins.
 DATE_SELECTORS = [
    "time[datetime]",
    "meta[property='article:published_time']",
    "meta[name='publish_date']",
    "meta[name='pubdate']",
    ".date-display-single",
    ".submitted",
    ".node-info",
 ]
 # Matches dates written like "Monday, January 5, 2026".
 # NOTE(review): presumably used by normalize_drupal_date, which is not visible here.
 DRUPAL_TITLE_DATE_PATTERN = re.compile(
    r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+"
    r"([A-Za-z]+)\s+\d{1,2},\s+\d{4}"
 )
 class Scraper:
    """Fetch pages over one shared HTTP session and turn them into ``ScrapedPost`` objects."""

    def __init__(self, options: ScrapeOptions) -> None:
        """Store ``options`` and open a session that sends the configured User-Agent."""
        self.options = options
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": options.user_agent})

    def scrape(self, url: str) -> ScrapedPost:
        """Download ``url`` and extract its content.

        JSON-LD article data is applied first (unless ``force_heuristics`` is set) and
        any gaps are filled from the page structure. This method never raises: every
        failure is recorded in the returned post's ``error`` and ``error_details``,
        leaving ``success`` False.
        """
        post = ScrapedPost(source_url=url)
        response: requests.Response | None = None
        try:
            response = self.session.get(url, timeout=self.options.request_timeout)
            response.raise_for_status()
            # requests falls back to ISO-8859-1 for text responses whose Content-Type has
            # no charset, which garbles UTF-8 pages. In that case hand BeautifulSoup the
            # raw bytes so it can detect the encoding (e.g. from <meta charset>) itself.
            content_type = response.headers.get("Content-Type", "")
            markup = response.text if "charset" in content_type.lower() else response.content
            soup = BeautifulSoup(markup, "html.parser")
            post.cms = detect_cms(soup)
            article_data = extract_article_json_ld(soup)
            if article_data and not self.options.force_heuristics:
                apply_article_data(post, article_data, soup, self.options)
            merge_fallback_data(post, soup, self.options)
            post.body_html = sanitize_html(post.body_html)
            # A post is only usable for export when it has both a title and a body.
            required = {"title": post.title, "body_html": post.body_html}
            missing_fields = [name for name, value in required.items() if not value]
            if missing_fields:
                raise ValueError(
                    "Unable to extract required field(s): "
                    f"{', '.join(missing_fields)}. "
                    f"Detected CMS: {post.cms}. "
                    f"Publish date found: {'yes' if post.publish_date else 'no'}. "
                    f"Author found: {'yes' if post.author else 'no'}."
                )
            post.success = True
            return post
        except Exception as exc:
            # Deliberate catch-all: one bad URL must not abort a whole batch, so the
            # failure is reported on the post instead of propagating.
            post.error = format_error_summary(url, exc, response, self.options.request_timeout)
            post.error_details = format_error_details(url, exc, response)
            return post
 def detect_cms(soup: BeautifulSoup) -> str:
    """Guess the CMS behind a page: ``wordpress``, ``drupal``, ``joomla`` or ``unknown``.

    The ``<meta name="generator">`` value is checked first; otherwise well-known
    markers are searched for in the lower-cased page markup.
    """
    generator = meta_content(soup, "meta", {"name": "generator"})
    if generator:
        generator_text = generator.lower()
        for cms_name in ("wordpress", "drupal", "joomla"):
            if cms_name in generator_text:
                return cms_name
    markup = str(soup).lower()
    markers_by_cms = (
        ("wordpress", ("/wp-content/",)),
        ("drupal", ("drupal-settings-json", "sites/default/files")),
        ("joomla", ("com_content", "joomla")),
    )
    for cms_name, markers in markers_by_cms:
        if any(marker in markup for marker in markers):
            return cms_name
    return "unknown"
 def extract_article_json_ld(soup: BeautifulSoup) -> dict | None:
    """Return the first article-like JSON-LD object found in the page, or ``None``."""
    for script in soup.select("script[type='application/ld+json']"):
        raw = script.string or script.get_text(" ", strip=True)
        if raw:
            # Both generators are lazy, so payloads are inspected one at a time and the
            # search stops at the first article.
            matches = (find_article_payload(payload) for payload in parse_json_candidates(raw))
            article = next((match for match in matches if match), None)
            if article:
                return article
    return None
 def parse_json_candidates(raw: str) -> Iterable[dict | list]:
    """Yield the decoded JSON document from ``raw``, or nothing if it cannot be decoded.

    If the text does not parse as-is, runs of ASCII control characters are replaced
    with a single space and parsing is attempted once more.
    """
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        scrubbed = re.sub(r"[\x00-\x1f]+", " ", raw).strip()
        try:
            decoded = json.loads(scrubbed)
        except json.JSONDecodeError:
            return
    yield decoded
 def find_article_payload(payload: dict | list) -> dict | None:
    """Search a decoded JSON-LD document, depth first, for an article-like object.

    Args:
        payload: Decoded JSON-LD - a dict, a list of nodes, or any other value (which
            is ignored).

    Returns:
        The first dict whose ``@type`` (a string or list of strings, compared
        case-insensitively) is in ``JSON_ARTICLE_TYPES``. Nodes under ``@graph`` are
        searched before the enclosing dict itself. ``None`` when nothing matches.
    """
    if isinstance(payload, list):
        for item in payload:
            found = find_article_payload(item)
            if found:
                return found
        return None
    if not isinstance(payload, dict):
        return None
    if "@graph" in payload:
        found = find_article_payload(payload["@graph"])
        if found:
            return found
    node_type = payload.get("@type")
    if isinstance(node_type, str):
        declared = [node_type]
    elif isinstance(node_type, (list, tuple)):
        declared = node_type
    else:
        # Missing or malformed @type (number, bool, object): treat as "no type" rather
        # than raising while trying to iterate it.
        declared = []
    types = {item.lower() for item in declared if isinstance(item, str)}
    if types and types & JSON_ARTICLE_TYPES:
        return payload
    return None
 def apply_article_data(
    post: ScrapedPost,
    article: dict,
    soup: BeautifulSoup,
    options: ScrapeOptions,
 ) -> None:
    """Copy fields from a JSON-LD ``article`` onto ``post`` in place.

    A value already on ``post`` is kept whenever the article offers nothing for that
    field. Author, categories and tags are touched only when enabled in ``options``.
    """
    headline = article.get("headline") or article.get("name")
    post.title = headline or post.title
    raw_date = article.get("datePublished") or article.get("dateCreated") or post.publish_date
    post.publish_date = normalize_date(raw_date)
    if options.include_author:
        json_author = extract_author_from_json_ld(article)
        if json_author:
            post.author = json_author
    if options.include_categories:
        sections = normalize_terms(article.get("articleSection"))
        if sections:
            post.categories = sections
    if options.include_tags:
        keywords = normalize_terms(article.get("keywords"))
        if keywords:
            post.tags = keywords
    body = extract_body_from_article(article, soup)
    if body:
        post.body_html = body
 def merge_fallback_data(post: ScrapedPost, soup: BeautifulSoup, options: ScrapeOptions) -> None:
    """Fill the still-empty fields of ``post`` from the page markup, in place.

    Title, date, author, body and tags are extracted only when missing. Categories
    found in the markup are always merged into the existing ones, and Drupal pages
    additionally contribute their department categories.
    """
    post.title = post.title or extract_title(soup)
    post.publish_date = post.publish_date or extract_date(soup, post.cms)
    if options.include_author and not post.author:
        post.author = extract_author(soup)
    post.body_html = post.body_html or extract_body(soup)
    if options.include_categories:
        markup_categories = extract_terms(soup, CATEGORY_SELECTORS)
        post.categories = merge_terms(post.categories, markup_categories)
        if post.cms == "drupal":
            departments = extract_drupal_department_categories(soup)
            post.categories = merge_terms(post.categories, departments)
    if options.include_tags and not post.tags:
        post.tags = extract_terms(soup, TAG_SELECTORS)
 def extract_title(soup: BeautifulSoup) -> str:
    """Return the page title from ``og:title``, else the first matching ``<h1>``, else ``<title>``.

    Returns an empty string when none of them is present.
    """
    og_title = meta_content(soup, "meta", {"property": "og:title"})
    if og_title:
        return og_title
    heading_selectors = ("article h1", "h1.entry-title", "h1.page-title", "h1.title", "h1")
    # map() is lazy, so selectors after the first hit are never evaluated.
    heading = next((node for node in map(soup.select_one, heading_selectors) if node), None)
    if heading is not None:
        return clean_text(heading.get_text(" ", strip=True))
    if soup.title:
        return clean_text(soup.title.get_text(" ", strip=True))
    return ""
 def extract_date(soup: BeautifulSoup, cms: str = "unknown") -> str:
    """Return the first parseable publish date found via ``DATE_SELECTORS``.

    For each matching node the ``datetime`` attribute, then ``content``, then the
    node text is used. Drupal pages fall back to a date next to the title. Returns an
    empty string when nothing parses.
    """
    for selector in DATE_SELECTORS:
        node = soup.select_one(selector)
        if node:
            raw_value = node.get("datetime") or node.get("content") or node.get_text(" ", strip=True)
            normalized = normalize_date(raw_value)
            if normalized:
                return normalized
    return extract_drupal_title_adjacent_date(soup) if cms == "drupal" else ""
 def extract_author(soup: BeautifulSoup) -> str:
    """Return the author from ``<meta name="author">`` or the first ``AUTHOR_SELECTORS`` match.

    Returns an empty string when neither source yields a node.
    """
    meta_author = meta_content(soup, "meta", {"name": "author"})
    if meta_author:
        return clean_text(meta_author)
    # map() is lazy, so selectors after the first hit are never evaluated.
    byline = next((node for node in map(soup.select_one, AUTHOR_SELECTORS) if node), None)
    if byline is None:
        return ""
    return clean_text(byline.get_text(" ", strip=True))
 def extract_body(soup: BeautifulSoup) -> str:
    """Return the inner HTML of the most likely content container.

    ``BODY_SELECTORS`` are tried in order on a cleaned copy of each match. The first
    candidate with at least 120 characters of visible text is returned at once;
    otherwise the first shorter candidate judged meaningful is returned, or ``""``.
    """
    short_candidate = ""
    for selector in BODY_SELECTORS:
        node = soup.select_one(selector)
        if not node:
            continue
        # Work on a copy so stripping unwanted markup does not alter the parsed page.
        working_copy = clone_tag(node)
        strip_unwanted(working_copy)
        inner_html = working_copy.decode_contents().strip()
        visible_text = BeautifulSoup(inner_html, "html.parser").get_text(" ", strip=True)
        if len(visible_text) >= 120:
            return inner_html
        if short_candidate:
            continue
        if has_meaningful_body_content(inner_html):
            short_candidate = inner_html
    return short_candidate
 def extract_terms(soup: BeautifulSoup, selectors: list[str]) -> list[str]:
    """Collect the distinct, non-empty text of every node matched by ``selectors``.

    Terms are returned in the order they are first encountered.
    """
    found: dict[str, None] = {}
    for selector in selectors:
        for node in soup.select(selector):
            label = clean_text(node.get_text(" ", strip=True))
            if label:
                # dict keys keep insertion order and ignore repeats
                found.setdefault(label, None)
    return list(found)
 def extract_drupal_title_adjacent_date(soup: BeautifulSoup) -> str:
    """Look for a date that sits next to the page title.

    The siblings following the title node are checked first. If none of them
    yields a date, the text of the nearest ``header``/``div``/``section``
    ancestor is checked, with the title text removed from its start. Returns
    the normalized date, or an empty string.
    """
    heading = find_title_node(soup)
    if not heading:
        return ""

    for neighbour in heading.next_siblings:
        from_sibling = normalize_drupal_date(text_from_node(neighbour))
        if from_sibling:
            return from_sibling

    container = heading.find_parent(["header", "div", "section"])
    if not container:
        return ""
    container_text = clean_text(container.get_text(" ", strip=True))
    heading_text = clean_text(heading.get_text(" ", strip=True))
    # Drop the leading title so only the trailing text is searched.
    remainder = clean_text(container_text.removeprefix(heading_text))
    return normalize_drupal_date(remainder) or ""
 def extract_drupal_department_categories(soup: BeautifulSoup) -> list[str]:
    """Collect category names from "Department:" labels on the page.

    Two passes are made and their results merged without duplicates:

    1. Text nodes that consist only of the label "Department:". The value is
       taken from the parent's own text when it is inline, otherwise from the
       first following sibling that yields an acceptable value.
    2. ``p``/``li``/``span``/``dt``/``dd`` elements whose text starts with
       "department:" (case-insensitive).

    Every value goes through ``normalize_department_category``, which rejects
    values that are too long or look like contact details.
    """
    categories: list[str] = []
    # Matches text nodes that contain nothing but the label itself.
    label_pattern = re.compile(r"^\s*Department:\s*$", re.IGNORECASE)
    for label_node in soup.find_all(string=label_pattern):
        parent = label_node.parent if isinstance(label_node.parent, Tag) else None
        if not parent:
            continue
        # Case A: label and value share the parent element.
        inline_value = extract_labeled_value(parent.get_text(" ", strip=True), "Department")
        normalized_inline_value = normalize_department_category(inline_value)
        if normalized_inline_value:
            categories = merge_terms(categories, [normalized_inline_value])
            continue
        # Case B: the value lives in a later sibling; take the first usable one.
        for sibling in parent.next_siblings:
            value = normalize_department_category(text_from_node(sibling))
            if value:
                categories = merge_terms(categories, [value])
                break
    # Second pass: elements whose own text begins with the label.
    for candidate in soup.find_all(["p", "li", "span", "dt", "dd"]):
        text = clean_text(candidate.get_text(" ", strip=True))
        if not text.lower().startswith("department:"):
            continue
        extracted = normalize_department_category(extract_labeled_value(text, "Department"))
        if extracted:
            categories = merge_terms(categories, [extracted])
    return categories
 def extract_author_from_json_ld(article: dict) -> str:
    """Return the author name(s) declared by a JSON-LD article object.

    The ``author`` value may be a plain string, an object with a ``name``
    key, or a list mixing both forms. Names are cleaned, empty ones are
    dropped, and multiple names are joined with ``", "``. Entries of any
    other shape (including objects whose ``name`` is not a string) are
    ignored instead of raising. Returns an empty string when no name is found.
    """
    author = article.get("author")
    entries = author if isinstance(author, list) else [author]
    names: list[str] = []
    for entry in entries:
        if isinstance(entry, dict):
            entry = entry.get("name", "")
        # Only strings can be cleaned; skip None, numbers, nested lists.
        if isinstance(entry, str):
            name = clean_text(entry)
            if name:
                names.append(name)
    return ", ".join(names)
 def extract_body_from_article(article: dict, soup: BeautifulSoup) -> str:
    """Return body HTML, preferring the JSON-LD ``articleBody`` when long enough.

    A string ``articleBody`` longer than 120 characters after trimming is
    unescaped and wrapped in a single ``<p>``. Anything else falls back to
    ``extract_body`` on the parsed page.
    """
    raw_body = article.get("articleBody")
    if not isinstance(raw_body, str):
        return extract_body(soup)
    trimmed = raw_body.strip()
    if len(trimmed) <= 120:
        return extract_body(soup)
    return f"<p>{unescape(trimmed)}</p>"
 def normalize_terms(value: object) -> list[str]:
    """Turn a loosely typed term value into a list of cleaned strings.

    A string is split on ``,``, ``|`` and ``>``; empty pieces are dropped and
    duplicates are kept. A list keeps only its string items, cleaned and
    de-duplicated in first-seen order. Any other type yields an empty list.
    """
    if isinstance(value, str):
        pieces = (clean_text(chunk) for chunk in re.split(r"[,|>]", value))
        return [piece for piece in pieces if piece]
    if isinstance(value, list):
        cleaned = (clean_text(entry) for entry in value if isinstance(entry, str))
        return list(dict.fromkeys(text for text in cleaned if text))
    return []
 def merge_terms(*groups: list[str]) -> list[str]:
    """Merge several term lists into one, cleaned and de-duplicated.

    Order follows first appearance across the groups, left to right.
    """
    flattened = (clean_text(item) for group in groups for item in group)
    return list(dict.fromkeys(term for term in flattened if term))
 def normalize_drupal_date(value: str | None) -> str:
    """Normalize the first ``DRUPAL_TITLE_DATE_PATTERN`` match in ``value``.

    Returns an empty string when ``value`` is empty or contains no match.
    """
    found = DRUPAL_TITLE_DATE_PATTERN.search(value) if value else None
    return normalize_date(found.group(0)) if found else ""
 def meta_content(soup: BeautifulSoup, tag_name: str, attrs: dict[str, str]) -> str:
    """Return the stripped ``content`` attribute of the first matching tag.

    Returns an empty string when no tag matches or ``content`` is missing
    or empty.
    """
    match = soup.find(tag_name, attrs=attrs)
    if not match:
        return ""
    content = match.get("content")
    return content.strip() if content else ""
 def clean_text(value: str) -> str:
    """Collapse whitespace runs to single spaces and trim both ends.

    A falsy ``value`` (empty string or ``None``) yields an empty string.
    """
    collapsed = re.sub(r"\s+", " ", value or "")
    return collapsed.strip()
 def text_from_node(node: object) -> str:
    """Return cleaned text for a text node or tag.

    Any object that is neither a ``NavigableString`` nor a ``Tag`` yields an
    empty string.
    """
    if isinstance(node, NavigableString):
        raw_text = str(node)
    elif isinstance(node, Tag):
        raw_text = node.get_text(" ", strip=True)
    else:
        return ""
    return clean_text(raw_text)
 def sanitize_html(html: str) -> str:
    """Return ``html`` with unwanted elements and dangerous attributes removed.

    The fragment is parsed, cleaned by ``strip_unwanted`` and
    ``strip_dangerous_attributes``, then re-serialized and trimmed. Empty
    input yields an empty string.
    """
    if not html:
        return ""
    fragment = BeautifulSoup(html, "html.parser")
    for scrub in (strip_unwanted, strip_dangerous_attributes):
        scrub(fragment)
    return fragment.decode_contents().strip()
 def has_meaningful_body_content(html: str) -> bool:
    """Report whether ``html`` contains visible text or embedded media/links.

    True when the parsed fragment has any text, or when the raw markup
    contains an ``<img``, ``<a ``, ``<embed`` or ``<object`` token.
    """
    if not html:
        return False
    if BeautifulSoup(html, "html.parser").get_text(" ", strip=True):
        return True
    lowered = html.lower()
    return any(marker in lowered for marker in ("<img", "<a ", "<embed", "<object"))
 def strip_unwanted(node: BeautifulSoup | Tag) -> None:
    """Remove non-content elements from ``node`` in place.

    Scripts, styles, noscript blocks, iframes, forms, navigation and share
    widgets are decomposed, one selector at a time.
    """
    unwanted_selectors = ("script", "style", "noscript", "iframe", "form", "nav", ".share", ".social-share")
    for selector in unwanted_selectors:
        for element in node.select(selector):
            element.decompose()
 def strip_dangerous_attributes(node: BeautifulSoup | Tag) -> None:
    """Remove script-capable attributes from every tag under ``node``, in place.

    Removed attributes:

    * any attribute whose name starts with ``on`` (inline event handlers);
    * ``srcdoc``;
    * URL-bearing attributes (``href``, ``src``, ``action``, ``formaction``,
      ``xlink:href``) whose value resolves to a ``javascript:``,
      ``vbscript:`` or ``data:text/html`` URL.

    The scheme check ignores ASCII whitespace and control characters anywhere
    in the value, because browsers strip tabs/newlines and leading control
    characters before parsing a URL. Without this, ``"java\\tscript:..."``
    would pass the check but still execute.
    """
    url_attributes = {"href", "src", "action", "formaction", "xlink:href"}
    blocked_schemes = ("javascript:", "vbscript:", "data:text/html")
    for child in node.find_all(True):
        # Iterate over a snapshot because attributes are deleted in the loop.
        for attr_name in list(child.attrs):
            normalized_name = attr_name.lower()
            if normalized_name.startswith("on") or normalized_name == "srcdoc":
                del child.attrs[attr_name]
                continue
            if normalized_name not in url_attributes:
                continue
            raw_value = child.attrs.get(attr_name)
            if isinstance(raw_value, list):
                candidate = " ".join(str(item) for item in raw_value)
            else:
                candidate = str(raw_value or "")
            # Used only for the check; the stored attribute value is untouched.
            compact = re.sub(r"[\x00-\x20\x7f]+", "", candidate).lower()
            if compact.startswith(blocked_schemes):
                del child.attrs[attr_name]
 def clone_tag(node: Tag) -> BeautifulSoup:
    """Return an independent copy of ``node`` by re-parsing its markup."""
    markup = str(node)
    return BeautifulSoup(markup, "html.parser")
 def find_title_node(soup: BeautifulSoup) -> Tag | None:
    """Return the first heading element that looks like the page title.

    Selectors run from most to least specific; ``None`` when nothing matches.
    """
    title_selectors = ("article h1", "h1.entry-title", "h1.page-title", "h1.title", "h1")
    candidates = (soup.select_one(selector) for selector in title_selectors)
    return next((node for node in candidates if node), None)
 def extract_labeled_value(text: str, label: str) -> str:
    """Return the value that follows ``"<label>:"`` in ``text``.

    The match is case-insensitive and runs on whitespace-normalized text.
    The value ends before the next ``Word:`` style label or at end of text.
    Returns an empty string when ``text`` is empty or the label is absent.
    """
    if not text:
        return ""
    found = re.search(
        rf"{re.escape(label)}:\s*(.+?)(?=\s+(?:[A-Z][a-z]+:)|\s{{2,}}|$)",
        clean_text(text),
        re.IGNORECASE,
    )
    return clean_text(found.group(1)) if found else ""
 def normalize_department_category(value: str) -> str:
    """Return a cleaned department name, or ``""`` when it looks implausible.

    A value is rejected when it is empty, longer than 80 characters, has more
    than 8 words, or contains contact/URL markers such as ``@`` or
    ``http://``.
    """
    label = clean_text(value)
    too_long = len(label) > 80 or len(label.split()) > 8
    lowered = label.lower()
    looks_like_contact = any(
        marker in lowered for marker in ("p.o. box", "contact us", "@", "http://", "https://")
    )
    return "" if (too_long or looks_like_contact) else label
 def format_error_summary(
    url: str,
    exc: Exception,
    response: requests.Response | None,
    timeout_seconds: int,
 ) -> str:
    """Build a one-line, human-readable summary of a fetch failure.

    Args:
        url: The URL that was requested.
        exc: The exception raised while fetching.
        response: A response captured by the caller, used when the exception
            does not carry one.
        timeout_seconds: The timeout that applied, quoted in timeout messages.

    Returns:
        A status-line style message for HTTP errors that have a response, a
        timeout message for ``requests.Timeout``, a typed message for other
        ``requests`` errors, and ``"<Type>: <message>"`` for anything else.
    """
    if isinstance(exc, requests.HTTPError):
        # ``requests.Response`` is falsy for 4xx/5xx statuses, so ``a or b``
        # would discard exactly the error response we want to report.
        failing_response = exc.response if exc.response is not None else response
        if failing_response is not None:
            return (
                f"HTTP {failing_response.status_code} {failing_response.reason} "
                f"while fetching {failing_response.url or url}"
            )
    if isinstance(exc, requests.Timeout):
        return f"Request timed out after {timeout_seconds}s while fetching {url}"
    if isinstance(exc, requests.RequestException):
        return f"{type(exc).__name__} while fetching {url}: {exc}"
    return f"{type(exc).__name__}: {exc}"
 def format_error_details(
    url: str,
    exc: Exception,
    response: requests.Response | None,
 ) -> str:
    """Build a multi-line diagnostic report for a fetch failure.

    Args:
        url: The URL that was requested.
        exc: The exception raised while fetching.
        response: A response captured by the caller, used when the exception
            has no ``response`` attribute or it is ``None``.

    Returns:
        Newline-joined ``Label: value`` lines: URL, error type and message,
        then HTTP status and resolved URL when a response is available, then
        the formatted exception line.
    """
    details = [
        f"URL: {url}",
        f"Error Type: {type(exc).__name__}",
        f"Message: {exc}",
    ]
    # Compare against None explicitly: ``requests.Response`` is falsy for
    # 4xx/5xx statuses, so ``a or b`` would drop the error response.
    failing_response = getattr(exc, "response", None)
    if failing_response is None:
        failing_response = response
    if failing_response is not None:
        details.extend(
            [
                f"HTTP Status: {failing_response.status_code} {failing_response.reason}",
                f"Resolved URL: {failing_response.url}",
            ]
        )
    trace = "".join(traceback.format_exception_only(type(exc), exc)).strip()
    if trace:
        details.append(f"Exception: {trace}")
    return "\n".join(details)
@@ -0,0 +1,91 @@
 from __future__ import annotations
 from email.utils import format_datetime
 from io import StringIO
 from xml.sax.saxutils import escape
 import datetime as dt
 from page_importer.dates import parse_datetime
 from page_importer.models import ScrapedPost
 def build_wxr(posts: list[ScrapedPost], channel_title: str = "Imported Content") -> str:
    """Serialize ``posts`` into a WordPress WXR 1.2 export document.

    Args:
        posts: Scraped posts to export, written in the given order.
        channel_title: Title of the RSS channel (XML-escaped).

    Returns:
        The complete XML document as a string. Posts whose publish date
        cannot be parsed get empty ``wp:post_date`` fields and the generation
        time as ``pubDate``.
    """
    generated_at = dt.datetime.now(dt.timezone.utc)
    buffer = StringIO()

    buffer.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
    buffer.write(
        '<rss version="2.0" xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/" '
        'xmlns:content="http://purl.org/rss/1.0/modules/content/" '
        'xmlns:wfw="http://wellformedweb.org/CommentAPI/" '
        'xmlns:dc="http://purl.org/dc/elements/1.1/" '
        'xmlns:wp="http://wordpress.org/export/1.2/">\n'
    )
    buffer.writelines(
        (
            "<channel>\n",
            f"<title>{escape(channel_title)}</title>\n",
            "<link>http://localhost/</link>\n",
            "<description>Generated by Page Importer</description>\n",
            f"<pubDate>{format_datetime(generated_at)}</pubDate>\n",
            "<language>en-US</language>\n",
            "<wp:wxr_version>1.2</wp:wxr_version>\n",
        )
    )

    for post in posts:
        local_date, gmt_date, item_pub_date = _resolve_post_dates(post.publish_date, generated_at)
        source_link = escape(post.source_url)
        buffer.writelines(
            (
                "<item>\n",
                f"<title>{escape(post.title)}</title>\n",
                f"<link>{source_link}</link>\n",
                f"<pubDate>{format_datetime(item_pub_date)}</pubDate>\n",
                f"<dc:creator>{cdata(post.author or 'importer')}</dc:creator>\n",
                f'<guid isPermaLink="false">{source_link}</guid>\n',
                "<description></description>\n",
                f"<content:encoded>{cdata(post.body_html)}</content:encoded>\n",
                f"<excerpt:encoded>{cdata('')}</excerpt:encoded>\n",
                f"<wp:post_date>{cdata(local_date)}</wp:post_date>\n",
                f"<wp:post_date_gmt>{cdata(gmt_date)}</wp:post_date_gmt>\n",
                "<wp:comment_status><![CDATA[closed]]></wp:comment_status>\n",
                "<wp:ping_status><![CDATA[closed]]></wp:ping_status>\n",
                "<wp:post_name><![CDATA[]]></wp:post_name>\n",
                f"<wp:status>{cdata(post.status)}</wp:status>\n",
                "<wp:post_parent>0</wp:post_parent>\n",
                "<wp:menu_order>0</wp:menu_order>\n",
                f"<wp:post_type>{cdata(post.post_type or 'post')}</wp:post_type>\n",
                "<wp:post_password><![CDATA[]]></wp:post_password>\n",
                "<wp:is_sticky>0</wp:is_sticky>\n",
            )
        )
        # Categories are written before tags; both use the same element shape.
        for domain, terms in (("category", post.categories), ("post_tag", post.tags)):
            for term in terms:
                buffer.write(
                    f'<category domain="{domain}" nicename="{escape(slugify(term))}">{cdata(term)}</category>\n'
                )
        buffer.write("</item>\n")

    buffer.write("</channel>\n</rss>\n")
    return buffer.getvalue()
 def slugify(value: str) -> str:
    """Return a lowercase, dash-separated slug for ``value``.

    Every run of non-alphanumeric characters becomes a single dash, and
    leading/trailing separators are dropped, so ``"Parks & Recreation"``
    becomes ``"parks-recreation"`` rather than ``"parks---recreation"``.
    An empty or ``None`` value yields an empty string.
    """
    # Map separators to spaces so ``split`` collapses runs and trims the ends.
    spaced = "".join(ch.lower() if ch.isalnum() else " " for ch in (value or ""))
    return "-".join(spaced.split())
 def cdata(value: str) -> str:
    """Wrap ``value`` in a CDATA section.

    Any ``]]>`` inside the value is split across two adjacent CDATA sections
    so it cannot close the section early. A falsy value yields an empty
    CDATA section.
    """
    text = value or ""
    safe_text = text.replace("]]>", "]]]]><![CDATA[>")
    return "<![CDATA[" + safe_text + "]]>"
 def _resolve_post_dates(value: str, fallback: dt.datetime) -> tuple[str, str, dt.datetime]:
    """Derive the local date, GMT date and pubDate for one post.

    Returns ``(local, gmt, pub_datetime)``:

    * unparseable ``value``: two empty strings and ``fallback``;
    * naive timestamp: the same wall-clock string for both dates, with the
      timestamp treated as UTC for ``pub_datetime``;
    * aware timestamp: the wall-clock string as written, plus its UTC
      conversion.
    """
    moment = parse_datetime(value)
    if moment is None:
        return "", "", fallback

    wall_clock = _format_wp_date(moment)
    is_naive = moment.tzinfo is None or moment.utcoffset() is None
    if is_naive:
        return wall_clock, wall_clock, moment.replace(tzinfo=dt.timezone.utc)

    utc_moment = moment.astimezone(dt.timezone.utc)
    return wall_clock, _format_wp_date(utc_moment), utc_moment
 def _format_wp_date(value: dt.datetime) -> str:
    return value.replace(tzinfo=None).strftime("%Y-%m-%d %H:%M:%S")
@@ -0,0 +1,4 @@
 streamlit>=1.43,<2
 requests>=2.32,<3
 beautifulsoup4>=4.12,<5
 python-dateutil>=2.9,<3
@@ -0,0 +1,79 @@
 from __future__ import annotations
 import unittest
 from bs4 import BeautifulSoup
 from page_importer.dates import normalize_date
 from page_importer.scraper import CATEGORY_SELECTORS, TAG_SELECTORS, extract_terms, sanitize_html
 from page_importer.wxr import build_wxr
 from page_importer.models import ScrapedPost
 class DateNormalizationTests(unittest.TestCase):
    """Checks on ``normalize_date`` output formatting."""

    def test_preserves_timezone_offset_in_normalized_value(self) -> None:
        """An ISO timestamp with an offset keeps that offset after normalizing."""
        source = "2024-05-01T09:30:00-07:00"
        expected = "2024-05-01 09:30:00-07:00"
        self.assertEqual(normalize_date(source), expected)
 class WxrSerializationTests(unittest.TestCase):
    """Checks on the XML produced by ``build_wxr``."""

    def test_writes_local_and_gmt_dates_from_offset_timestamp(self) -> None:
        """An offset timestamp yields local, GMT and RFC 2822 dates."""
        post = ScrapedPost(
            source_url="https://example.com/post",
            title="Example",
            body_html="<p>Body</p>",
            publish_date="2024-05-01 09:30:00-07:00",
            success=True,
        )
        xml = build_wxr([post])
        expected_fragments = (
            "<wp:post_date><![CDATA[2024-05-01 09:30:00]]></wp:post_date>",
            "<wp:post_date_gmt><![CDATA[2024-05-01 16:30:00]]></wp:post_date_gmt>",
            "<pubDate>Wed, 01 May 2024 16:30:00 +0000</pubDate>",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, xml)

    def test_splits_cdata_terminators_in_content(self) -> None:
        """A literal ``]]>`` in body or author is split across CDATA sections."""
        post = ScrapedPost(
            source_url="https://example.com/post",
            title="Example",
            body_html="<p>alpha ]]> omega</p>",
            author="Jane ]]> Doe",
            success=True,
        )
        xml = build_wxr([post])
        for fragment in ("alpha ]]]]><![CDATA[> omega", "Jane ]]]]><![CDATA[> Doe"):
            self.assertIn(fragment, xml)
 class HtmlSanitizationTests(unittest.TestCase):
    """Checks on ``sanitize_html``."""

    def test_removes_inline_event_handlers_and_script_uris(self) -> None:
        """Event-handler attributes and ``javascript:`` links are removed."""
        markup = '<div onclick="alert(1)"><a href="javascript:alert(1)">x</a><img src="x" onerror="alert(1)"></div>'
        sanitized = sanitize_html(markup)
        for forbidden in ("onclick", "onerror", "javascript:"):
            self.assertNotIn(forbidden, sanitized)
 class TaxonomySelectorTests(unittest.TestCase):
    """Checks that category and tag selectors do not overlap."""

    def test_drupal_tag_field_is_not_treated_as_category(self) -> None:
        """A Drupal tags field yields tags only, never categories."""
        markup = '<div class="field--name-field-tags"><a href="/tags/example">Example Tag</a></div>'
        soup = BeautifulSoup(markup, "html.parser")
        category_terms = extract_terms(soup, CATEGORY_SELECTORS)
        tag_terms = extract_terms(soup, TAG_SELECTORS)
        self.assertEqual(category_terms, [])
        self.assertEqual(tag_terms, ["Example Tag"])
 # Run the test cases when this module is executed directly as a script.
 if __name__ == "__main__":
    unittest.main()
@@ -0,0 +1,110 @@
 # WDW Sitemap And Import Tools
 This repository combines two internal tools into one web application and one Docker image:
 - `Sitemap Generator`
 - `Page Importer`
 The application uses Streamlit and presents both tools behind a single URL with two tabs at the top of the page.
 ## What It Does
 ### Sitemap Generator
 - Crawls a site from a starting URL
 - Discovers URLs from page links and XML sitemaps
 - Exports a sitemap CSV
 - Saves crawl state and logs so a crawl can be resumed later
 ### Page Importer
 - Reads a CSV of submitted URLs
 - Scrapes page content
 - Lets you review the extracted content
 - Exports a WordPress WXR XML import file
 ## Project Layout
 - `app.py`: top-level Streamlit app with both tabs
 - `requirements.txt`: shared Python dependencies for the combined app
 - `Dockerfile`: single image for the combined tool
 - `.gitea/workflows/docker-image.yml`: Gitea Actions workflow for Docker builds
 - `Sitemap Builder/`: sitemap crawler logic
 - `Page Importer/`: WordPress import logic
 ## Run Locally
 ### Linux or macOS
 ```bash
 python3 -m venv .venv
 source .venv/bin/activate
 pip install -r requirements.txt
 streamlit run app.py
 ```
 ### Windows PowerShell
 ```powershell
 python -m venv .venv
 .venv\Scripts\Activate.ps1
 pip install -r requirements.txt
 streamlit run app.py
 ```
 Then open:
 ```text
 http://localhost:8501
 ```
 ## Docker
 Build the image:
 ```bash
 docker build -t wdw-sitemap-and-importer .
 ```
 Run the container:
 ```bash
 docker run --rm -p 8501:8501 -v wdw-tools-data:/data wdw-sitemap-and-importer
 ```
 Then open:
 ```text
 http://localhost:8501
 ```
 The mounted `/data` volume stores sitemap CSV files, crawl state files, and crawl logs so sitemap jobs can survive container restarts.
 ## Gitea Automation
 The workflow file is:
 ```text
 .gitea/workflows/docker-image.yml
 ```
 It runs on pushes to `main` and on manual workflow dispatch.
 The workflow always builds the Docker image. If these secrets are configured in Gitea, it also logs in and pushes the image to your registry:
 - `REGISTRY_URL`
 - `REGISTRY_USERNAME`
 - `REGISTRY_PASSWORD`
 Published tags:
 - `${REGISTRY}/wdw-sitemap-and-importer:<commit-sha>`
 - `${REGISTRY}/wdw-sitemap-and-importer:latest`
 If the registry secrets are not configured, the workflow still performs the build as validation but skips the push steps.
 ## Notes
 - Sitemap output files are written under `/data` in Docker.
 - The sitemap crawler can resume previous runs when a matching crawl state file exists.
 - The importer keeps its existing scraping and WordPress export behavior, but it now runs inside the shared interface instead of as a separate app.
@@ -0,0 +1,80 @@
 # Sitemap Builder
 This folder contains the sitemap crawler used by the combined web application in the repository root.
 The crawler can still be used directly from Python, but the primary supported experience is now the shared Streamlit interface in the root project:
 ```text
 ../app.py
 ```
 ## Current Role In The Combined App
 The root application uses this module to:
 - crawl a site from a submitted starting URL
 - discover internal URLs from HTML links and XML sitemaps
 - export a sitemap CSV
 - save crawl state and crawl logs for resume support
 ## Output
 The crawler writes:
 - a CSV file
 - a sidecar crawl state file ending in `.crawlstate.json`
 - a crawl log file ending in `.crawl.log`
 The CSV contains these columns:
 - `URL`
 - `Title`
 - `Canonical URL`
 - `Type`
 ## Standalone CLI Usage
 Interactive mode:
 ```bash
 python3 sitemap_builder.py
 ```
 Command line mode:
 ```bash
 python3 sitemap_builder.py https://example.com -o ./sitemap.csv
 ```
 On Windows:
 ```powershell
 python .\sitemap_builder.py https://example.com -o .\sitemap.csv
 ```
 ## Useful Options
 ```bash
 python3 sitemap_builder.py https://example.com --max-pages 20000 --delay 0.25 --include-subdomains
 ```
 - `--max-pages`: stop after the given number of visited pages. Default: `10000`
 - `--delay`: wait between requests to reduce load on the site
 - `--timeout`: request timeout in seconds
 - `--include-subdomains`: crawl subdomains of the starting host
 - `--include-documents`: include document links such as PDF, CSV, DOC, DOCX, XLSX, and similar files
 - `--workers`: number of worker threads to use. Set `1` to disable multithreading
 - `--save-every`: save progress after every N pages. Default: `25`
 - `--resume`: resume from an existing state file
 - `--fresh`: ignore the existing state file and start over
 ## Discovery And Behavior
 - The crawler checks `robots.txt` for sitemap references and also tries `/sitemap.xml`
 - XML sitemap URLs are added to the crawl queue before page crawling begins
 - HTML pages store page title and canonical URL in the CSV when available
 - On Windows CLI runs, `P` pauses, `R` resumes, and `Q` stops cleanly and saves progress
 ## Recommendation
 For normal use, run the root application or Docker container instead of calling this script directly. That is now the intended user interface for this repository.
@@ -0,0 +1,947 @@
 from __future__ import annotations
 import argparse
 import csv
 import json
 import os
 import sys
 import time
 import xml.etree.ElementTree as ET
 from collections import deque
 from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
 from dataclasses import dataclass
 from html.parser import HTMLParser
 from pathlib import Path
 from typing import Iterable
 from urllib.error import HTTPError, URLError
 from urllib.parse import urljoin, urlsplit, urlunsplit
 from urllib.request import Request, urlopen
 if os.name == "nt":
    import msvcrt
 # User-Agent string for crawler requests (presumably the CLI default; confirm at the call site).
 DEFAULT_USER_AGENT = "SitemapBuilder/1.0 (+local script)"
 # File name used for the CSV output when none is given (usage not visible in this chunk).
 DEFAULT_OUTPUT_NAME = "sitemap.csv"
 # Appended to the output file name by get_state_path() to name the crawl-state sidecar.
 DEFAULT_STATE_SUFFIX = ".crawlstate.json"
 # Appended to the output file name by get_log_path() to name the crawl log.
 DEFAULT_LOG_SUFFIX = ".crawl.log"
 # Default page limit for a crawl (documented as the --max-pages default).
 DEFAULT_MAX_PAGES = 10000
 # NOTE(review): looks like the extra page allowance granted when resuming; confirm at the call site.
 DEFAULT_RESUME_PAGE_INCREMENT = 10000
 # Default number of pages between progress saves (documented as the --save-every default).
 DEFAULT_SAVE_EVERY = 25
 # Default worker-thread count (presumably the --workers default; confirm in the argument parser).
 DEFAULT_WORKERS = 8
 # Directory that contains this script.
 SCRIPT_DIR = Path(__file__).resolve().parent
 # Lowercase file extensions that is_document_url() treats as documents.
 DOCUMENT_EXTENSIONS = {
    ".pdf",
    ".csv",
    ".doc",
    ".docx",
    ".xls",
    ".xlsx",
    ".ppt",
    ".pptx",
    ".txt",
    ".rtf",
    ".zip",
    ".xml",
    ".json",
 }
@dataclass
 class CrawlResult:
    """Outcome of fetching one URL, as produced by ``fetch_page``."""

    # The URL that was requested.
    url: str
    # Raw ``href`` values of ``<a>`` tags found on the page (not yet resolved).
    links: list[str]
    # Text of the page's ``<title>``, or "" when absent.
    title: str = ""
    # Normalized absolute canonical URL, or "" when the page declares none.
    canonical_url: str = ""
    # True when the response was not HTML/XHTML and therefore was not parsed.
    skipped: bool = False
    # Short error description (e.g. "HTTP 404"); None when the fetch succeeded.
    error: str | None = None
@dataclass
 class CrawlState:
    """Resumable crawl progress; serialized by ``save_state`` and restored by ``load_state``."""

    # Normalized URL the crawl started from.
    start_url: str
    # Crawl options recorded with the state.
    include_subdomains: bool
    include_documents: bool
    # URLs already visited; its size is shown as the progress counter.
    visited: set[str]
    # URLs that have been queued (seeded with the start URL by initialize_state).
    queued: set[str]
    # Pending URLs in processing order.
    queue: deque[str]
    # URL -> {"title", "canonical_url", "type"}; the rows written to the CSV.
    records: dict[str, dict[str, str]]
    # Alias URL -> canonical URL; chains are followed by resolve_alias().
    alias_to_canonical: dict[str, str]
    # Recorded fetch errors (dict schema is not visible in this chunk).
    errors: list[dict[str, str]]
    # Counters; presumably skipped responses and URLs found via sitemaps (confirm at call sites).
    skipped_count: int
    discovered_from_sitemaps: int
@dataclass
 class RuntimeControl:
    """Mutable pause/stop flags updated by ``poll_runtime_control``."""

    # True while the user has paused the crawl.
    paused: bool = False
    # Set to True once the user asks the crawl to stop.
    stop_requested: bool = False
@dataclass
 class CrawlRunResult:
    """Summary of one crawl run (presumably returned by the crawl entry point; confirm)."""

    # Crawl state associated with the run.
    state: CrawlState
    # NOTE(review): looks like this records whether the user requested a stop; confirm.
    user_stopped: bool
    # Paths of the CSV output, the state sidecar file and the log file.
    output_path: Path
    state_path: Path
    log_path: Path
    # Page limit and worker count that applied to the run.
    max_pages: int
    workers: int
 class HTMLPageParser(HTMLParser):
    """Collect links, the document title and the canonical URL from an HTML page.

    After ``feed()``:

    * ``links`` holds the raw ``href`` of every ``<a>`` tag, in document order;
    * ``title`` is the text of the first ``<title>`` element only. Later
      ``<title>`` elements, such as accessibility titles inside inline
      ``<svg>`` icons, are ignored so they cannot leak into the page title;
    * ``canonical_href`` is the raw ``href`` of the last
      ``<link rel="...canonical...">`` seen, or ``""``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.links: list[str] = []
        self.title_parts: list[str] = []
        self.in_title = False
        self.canonical_href = ""
        # Becomes True once the first <title> element has been closed.
        self._title_captured = False

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Record anchors, the first title start and canonical links."""
        attrs_map = {key.lower(): value for key, value in attrs}
        lower_tag = tag.lower()
        if lower_tag == "a":
            href = attrs_map.get("href")
            if href:
                self.links.append(href)
        if lower_tag == "title" and not self._title_captured:
            self.in_title = True
        if lower_tag == "link":
            rel = (attrs_map.get("rel") or "").lower()
            href = attrs_map.get("href") or ""
            if "canonical" in rel and href:
                self.canonical_href = href

    def handle_endtag(self, tag: str) -> None:
        """Stop collecting title text when the first title element closes."""
        if tag.lower() == "title" and self.in_title:
            self.in_title = False
            self._title_captured = True

    def handle_data(self, data: str) -> None:
        """Accumulate text that appears inside the document title."""
        if self.in_title:
            self.title_parts.append(data)

    @property
    def title(self) -> str:
        """Title text with each collected chunk trimmed and joined by spaces."""
        return " ".join(part.strip() for part in self.title_parts if part.strip()).strip()
 def normalize_url(url: str) -> str:
    """Return a canonical form of ``url`` for de-duplication.

    The scheme and host are lowercased, the fragment is dropped, trailing
    slashes are removed from non-root paths, and an empty path becomes ``/``.
    A missing scheme defaults to ``https``. Input with neither scheme nor
    ``//host`` (for example ``"example.com/about"``) is treated as a host
    name, so it no longer produces a host-less ``https:///...`` URL.
    """
    raw = url.strip()
    parts = urlsplit(raw)
    if raw and not parts.scheme and not parts.netloc and not raw.startswith(("/", "?", "#")):
        # Bare "host/path" input: urlsplit would put the host into the path.
        parts = urlsplit(f"https://{raw}")
    scheme = parts.scheme.lower() or "https"
    netloc = parts.netloc.lower()
    # ``or "/"`` also covers paths made only of slashes, which strip to "".
    path = parts.path.rstrip("/") or "/"
    return urlunsplit((scheme, netloc, path, parts.query, ""))
 def is_http_url(url: str) -> bool:
    """Report whether ``url`` uses the ``http`` or ``https`` scheme."""
    scheme = urlsplit(url).scheme
    return scheme == "http" or scheme == "https"
 def build_allowed_hosts(start_url: str) -> set[str]:
    """Return a set holding the lowercased network location of ``start_url``."""
    host = urlsplit(start_url).netloc.lower()
    return {host}
 def should_visit(url: str, allowed_hosts: set[str], include_subdomains: bool) -> bool:
    """Decide whether ``url`` belongs to the crawl scope.

    Only http(s) URLs qualify. The host must be one of ``allowed_hosts`` or,
    when ``include_subdomains`` is set, a subdomain of one of them.
    """
    if not is_http_url(url):
        return False
    host = urlsplit(url).netloc.lower()
    if not include_subdomains:
        return host in allowed_hosts
    return any(host == allowed or host.endswith(f".{allowed}") for allowed in allowed_hosts)
 def is_document_url(url: str) -> bool:
    """Report whether the URL path ends in one of ``DOCUMENT_EXTENSIONS``."""
    extension = Path(urlsplit(url).path).suffix
    return extension.lower() in DOCUMENT_EXTENSIONS
 def should_record_url(url: str) -> bool:
    """Return False only for URLs whose whole query string is ``page=1``."""
    is_first_page_query = urlsplit(url).query.lower() == "page=1"
    return not is_first_page_query
 def get_state_path(output_path: Path) -> Path:
    """Return the crawl-state sidecar path that sits next to ``output_path``."""
    sidecar_suffix = f"{output_path.suffix}{DEFAULT_STATE_SUFFIX}"
    return output_path.with_suffix(sidecar_suffix)
 def get_log_path(output_path: Path) -> Path:
    """Return the crawl-log path that sits next to ``output_path``."""
    log_suffix = f"{output_path.suffix}{DEFAULT_LOG_SUFFIX}"
    return output_path.with_suffix(log_suffix)
 def log_message(log_path: Path, message: str) -> None:
    """Append a timestamped ``message`` line to the log file.

    Missing parent directories are created first.
    """
    log_path.parent.mkdir(parents=True, exist_ok=True)
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{stamp}] {message}\n"
    with log_path.open("a", encoding="utf-8") as handle:
        handle.write(entry)
 def resolve_alias(url: str, alias_to_canonical: dict[str, str]) -> str:
    """Follow alias links from ``url`` until a URL with no alias is reached.

    Cycles are detected: the walk stops at the first URL seen twice and
    returns it.
    """
    current = url
    visited: set[str] = set()
    while True:
        if current in visited or current not in alias_to_canonical:
            return current
        visited.add(current)
        current = alias_to_canonical[current]
 def register_record(
    state: CrawlState,
    url: str,
    record_type: str,
    title: str = "",
    canonical_url: str = "",
 ) -> None:
    """Create or update the CSV record for ``url`` in ``state.records``.

    URLs rejected by ``should_record_url`` are ignored. Existing non-empty
    ``title``/``canonical_url`` values are never overwritten. The type is set
    when missing, and a ``"document"`` record is upgraded to ``"page"`` when
    the URL is later seen as a page.
    """
    if not should_record_url(url):
        return

    record = state.records.get(url, {"title": "", "canonical_url": "", "type": record_type})

    current_type = record.get("type")
    if not current_type or (current_type == "document" and record_type == "page"):
        record["type"] = record_type

    if title and not record.get("title"):
        record["title"] = title
    if canonical_url and not record.get("canonical_url"):
        record["canonical_url"] = canonical_url

    # Records restored from older state files may lack these keys entirely.
    record.setdefault("canonical_url", canonical_url)
    record.setdefault("title", title)
    state.records[url] = record
 def save_state(state: CrawlState, state_path: Path, output_path: Path) -> None:
    """Write ``state`` to ``state_path`` as indented JSON.

    Sets are stored as sorted lists and the queue as a list in queue order.
    The save time and the CSV output path are stored alongside. Missing
    parent directories are created.
    """
    state_path.parent.mkdir(parents=True, exist_ok=True)
    snapshot = dict(
        start_url=state.start_url,
        include_subdomains=state.include_subdomains,
        include_documents=state.include_documents,
        visited=sorted(state.visited),
        queued=sorted(state.queued),
        queue=list(state.queue),
        records=state.records,
        alias_to_canonical=state.alias_to_canonical,
        errors=state.errors,
        skipped_count=state.skipped_count,
        discovered_from_sitemaps=state.discovered_from_sitemaps,
        saved_at=time.strftime("%Y-%m-%d %H:%M:%S"),
        output_path=str(output_path),
    )
    serialized = json.dumps(snapshot, indent=2)
    state_path.write_text(serialized, encoding="utf-8")
 def load_state(state_path: Path) -> CrawlState:
    """Rebuild a ``CrawlState`` from the JSON file at ``state_path``.

    ``start_url`` is required. Every other field falls back to an empty or
    zero default when missing from the file.
    """
    payload = json.loads(state_path.read_text(encoding="utf-8"))
    read = payload.get
    return CrawlState(
        start_url=payload["start_url"],
        include_subdomains=bool(read("include_subdomains", False)),
        include_documents=bool(read("include_documents", False)),
        visited=set(read("visited", [])),
        queued=set(read("queued", [])),
        queue=deque(read("queue", [])),
        records=dict(read("records", {})),
        alias_to_canonical=dict(read("alias_to_canonical", {})),
        errors=list(read("errors", [])),
        skipped_count=int(read("skipped_count", 0)),
        discovered_from_sitemaps=int(read("discovered_from_sitemaps", 0)),
    )
 def initialize_state(start_url: str, include_subdomains: bool, include_documents: bool) -> CrawlState:
    """Create a fresh ``CrawlState`` seeded with the normalized start URL.

    The start URL is the only entry in both the queue and the queued set.
    All other collections start empty and counters start at zero.
    """
    seed = normalize_url(start_url)
    return CrawlState(
        start_url=seed,
        include_subdomains=include_subdomains,
        include_documents=include_documents,
        visited=set(),
        queued={seed},
        queue=deque((seed,)),
        records={},
        alias_to_canonical={},
        errors=[],
        skipped_count=0,
        discovered_from_sitemaps=0,
    )
 def prompt_if_missing(value: str | None, prompt_text: str) -> str:
    """Return ``value`` when given; otherwise prompt for it and return the trimmed reply."""
    return value if value else input(prompt_text).strip()
 def prompt_yes_no(prompt_text: str, default: bool) -> bool:
    """Ask a yes/no question on the console.

    An empty reply returns ``default``. Otherwise only ``y`` or ``yes``
    (case-insensitive) count as yes.
    """
    hint = "Y/n" if default else "y/N"
    reply = input(f"{prompt_text} [{hint}]: ").strip().lower()
    return reply in {"y", "yes"} if reply else default
 def write_csv(records: dict[str, dict[str, str]], output_path: Path) -> None:
    """Write ``records`` to ``output_path`` as the sitemap CSV.

    Rows are sorted by URL under the header
    ``URL, Title, Canonical URL, Type``. Missing record fields are written as
    empty strings. Missing parent directories are created.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    header = ["URL", "Title", "Canonical URL", "Type"]
    with output_path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        writer.writerows(
            [url, record.get("title", ""), record.get("canonical_url", ""), record.get("type", "")]
            for url, record in sorted(records.items())
        )
 def fetch_text(url: str, timeout: float, user_agent: str, accept: str) -> tuple[str | None, str | None]:
    """Fetch ``url`` and return ``(text, error)``; exactly one of them is ``None``.

    The body is decoded with the charset declared by the response (UTF-8 when
    none is declared), replacing undecodable bytes. Failures are reported as
    a short error string instead of being raised.
    """
    request = Request(url, headers={"User-Agent": user_agent, "Accept": accept})
    try:
        with urlopen(request, timeout=timeout) as response:
            payload = response.read()
            charset = response.headers.get_content_charset() or "utf-8"
            return payload.decode(charset, errors="replace"), None
    except HTTPError as exc:
        return None, f"HTTP {exc.code}"
    except URLError as exc:
        return None, str(exc.reason)
    except TimeoutError:
        return None, "request timed out"
    except Exception as exc:  # pragma: no cover
        return None, str(exc)
 def fetch_page(url: str, timeout: float, user_agent: str) -> CrawlResult:
    """Fetch one page and extract its links, title and canonical URL.

    Returns a ``CrawlResult`` in all cases:

    * ``skipped=True`` when the response is not HTML/XHTML (the body is not read);
    * ``error`` set to a short description when the request fails;
    * otherwise the raw anchor hrefs, the page title, and the canonical URL
      resolved against ``url`` and normalized ("" when the page declares none).
    """
    request = Request(
        url,
        headers={
            "User-Agent": user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        },
    )
    try:
        with urlopen(request, timeout=timeout) as response:
            content_type = response.headers.get("Content-Type", "").lower()
            # Only HTML-like responses are parsed; anything else is skipped.
            if "text/html" not in content_type and "application/xhtml+xml" not in content_type:
                return CrawlResult(url=url, links=[], skipped=True)
            # Decode with the declared charset (UTF-8 when absent), replacing bad bytes.
            content = response.read().decode(response.headers.get_content_charset() or "utf-8", errors="replace")
    # HTTPError is caught before URLError so the status code can be reported.
    except HTTPError as exc:
        return CrawlResult(url=url, links=[], error=f"HTTP {exc.code}")
    except URLError as exc:
        return CrawlResult(url=url, links=[], error=str(exc.reason))
    except TimeoutError:
        return CrawlResult(url=url, links=[], error="request timed out")
    except Exception as exc:  # pragma: no cover
        return CrawlResult(url=url, links=[], error=str(exc))
    parser = HTMLPageParser()
    parser.feed(content)
    # The canonical href may be relative; resolve it against the requested URL.
    canonical_url = normalize_url(urljoin(url, parser.canonical_href)) if parser.canonical_href else ""
    return CrawlResult(
        url=url,
        links=parser.links,
        title=parser.title,
        canonical_url=canonical_url,
    )
 def fetch_page_with_delay(url: str, timeout: float, user_agent: str, delay: float) -> CrawlResult:
    """Sleep for ``delay`` seconds when positive, then fetch the page."""
    wait_seconds = delay
    if wait_seconds > 0:
        time.sleep(wait_seconds)
    result = fetch_page(url, timeout=timeout, user_agent=user_agent)
    return result
 def print_progress(state: CrawlState, max_pages: int, current_url: str) -> None:
    """Print a one-line progress report for the URL being processed."""
    visited_count = len(state.visited)
    record_count = len(state.records)
    pending_count = len(state.queue)
    print(
        f"[{visited_count}/{max_pages}] Found {record_count} URL(s), "
        f"queued {pending_count} more: {current_url}"
    )
 def poll_runtime_control(control: RuntimeControl, log_path: Path) -> None:
    """Drain pending console key presses and update ``control`` accordingly.

    Only active on Windows (``os.name == "nt"``); elsewhere it returns at
    once. Recognised keys (case-insensitive):

    * ``P`` - pause, if not already paused;
    * ``R`` - resume, if currently paused;
    * ``Q`` - request a stop.

    Args:
        control: Mutable flags object with ``paused`` and ``stop_requested``.
        log_path: Log file that receives a line for each state change.
    """
    if os.name != "nt":
        return
    while msvcrt.kbhit():
        key = msvcrt.getwch()
        if key in ("\x00", "\xe0"):
            # Special keys (arrows, Insert, Page Down, function keys) arrive
            # as this prefix followed by a key code. Several codes are
            # letters: Down Arrow is "P", Page Down is "Q", Insert is "R".
            # Discard the code so navigation keys cannot pause or stop the
            # crawl. Guard with kbhit() so a lone prefix character never
            # blocks waiting for more input.
            if msvcrt.kbhit():
                msvcrt.getwch()
            continue
        key = key.lower()
        if key == "p" and not control.paused:
            control.paused = True
            print("Paused. Press R to resume or Q to stop.")
            log_message(log_path, "Crawl paused by user")
        elif key == "r" and control.paused:
            control.paused = False
            print("Resuming crawl.")
            log_message(log_path, "Crawl resumed by user")
        elif key == "q":
            control.stop_requested = True
            log_message(log_path, "Stop requested by user")
 def discover_robots_sitemaps(
    start_url: str,
    timeout: float,
    user_agent: str,
    log_path: Path,
 ) -> set[str]:
    """Return the sitemap URLs advertised by the site's ``/robots.txt``.

    The ``Sitemap`` field name is matched case-insensitively. Leading
    whitespace, whitespace before the colon, and a leading byte-order mark
    are tolerated.

    Args:
        start_url: Any URL on the target site; ``/robots.txt`` is resolved
            against it.
        timeout: Request timeout in seconds.
        user_agent: ``User-Agent`` header value.
        log_path: Log file for diagnostics.

    Returns:
        Normalized sitemap URLs, or an empty set when robots.txt cannot be
        fetched or lists none.
    """
    robots_url = normalize_url(urljoin(start_url, "/robots.txt"))
    content, error = fetch_text(robots_url, timeout, user_agent, "text/plain,*/*;q=0.8")
    # A failure whose message is an empty string is falsy, so also check the
    # content itself before calling methods on it.
    if error or content is None:
        log_message(log_path, f"robots.txt not available at {robots_url}: {error or 'empty response'}")
        return set()
    sitemap_urls: set[str] = set()
    for line in content.splitlines():
        # Split on the first colon only: the URL value contains colons too.
        field, separator, value = line.partition(":")
        if not separator:
            continue
        if field.lstrip("\ufeff").strip().lower() != "sitemap":
            continue
        raw_url = value.strip()
        if raw_url:
            sitemap_urls.add(normalize_url(raw_url))
    if sitemap_urls:
        log_message(log_path, f"Discovered {len(sitemap_urls)} sitemap reference(s) from robots.txt")
    return sitemap_urls
 def xml_local_name(tag: str) -> str:
    """Strip a ``{namespace}`` prefix from an ElementTree tag name.

    Everything after the last ``}`` is returned; a tag without ``}`` is
    returned unchanged.
    """
    if "}" not in tag:
        return tag
    return tag[tag.rindex("}") + 1:]
 def parse_sitemap_urls(
    sitemap_url: str,
    allowed_hosts: set[str],
    include_subdomains: bool,
    timeout: float,
    user_agent: str,
    log_path: Path,
    seen_sitemaps: set[str],
 ) -> set[str]:
    """Fetch one XML sitemap and return the crawlable page URLs it lists.

    A ``<urlset>`` contributes the ``<loc>`` of each ``<url>`` entry. A
    ``<sitemapindex>`` is followed recursively through the ``<loc>`` of each
    ``<sitemap>`` entry. Only ``<loc>`` elements that are direct children of
    an entry are read, so extension elements with the same local name (for
    example ``image:loc`` in image sitemaps) are not mistaken for pages.

    Args:
        sitemap_url: URL of the sitemap to read.
        allowed_hosts: Hosts accepted by ``should_visit``.
        include_subdomains: Whether subdomains of allowed hosts are accepted.
        timeout: Request timeout in seconds.
        user_agent: ``User-Agent`` header value.
        log_path: Log file for diagnostics.
        seen_sitemaps: Sitemaps already handled; mutated in place so index
            files that reference each other cannot recurse forever.

    Returns:
        Normalized URLs accepted by ``should_visit``. Empty when the sitemap
        was already seen, is off-site, cannot be fetched or parsed, or has an
        unsupported root element.
    """
    normalized_sitemap = normalize_url(sitemap_url)
    if normalized_sitemap in seen_sitemaps:
        return set()
    seen_sitemaps.add(normalized_sitemap)
    if not should_visit(normalized_sitemap, allowed_hosts, include_subdomains):
        return set()
    content, error = fetch_text(normalized_sitemap, timeout, user_agent, "application/xml,text/xml;q=0.9,*/*;q=0.8")
    # A failure whose message is an empty string is falsy, so also check the
    # content itself before handing it to the XML parser.
    if error or content is None:
        log_message(log_path, f"Sitemap fetch failed for {normalized_sitemap}: {error or 'empty response'}")
        return set()
    # NOTE(review): the sitemap body is remote, untrusted XML. The Python
    # docs list entity-expansion attacks against xml.etree.ElementTree;
    # consider a hardened parser if hostile sites are in scope.
    try:
        root = ET.fromstring(content)
    except ET.ParseError as exc:
        log_message(log_path, f"Sitemap parse failed for {normalized_sitemap}: {exc}")
        return set()
    tag_name = xml_local_name(root.tag)
    # Map each supported root element to the entry element that carries <loc>.
    entry_tag = {"urlset": "url", "sitemapindex": "sitemap"}.get(tag_name)
    if entry_tag is None:
        log_message(log_path, f"Unsupported sitemap format at {normalized_sitemap}")
        return set()
    locations: list[str] = []
    for entry in root:
        if xml_local_name(entry.tag) != entry_tag:
            continue
        for child in entry:
            if xml_local_name(child.tag) != "loc" or not child.text:
                continue
            location = child.text.strip()
            if location:
                locations.append(location)
    discovered_urls: set[str] = set()
    if tag_name == "urlset":
        for location in locations:
            normalized = normalize_url(location)
            if should_visit(normalized, allowed_hosts, include_subdomains):
                discovered_urls.add(normalized)
    else:
        for location in locations:
            discovered_urls.update(
                parse_sitemap_urls(
                    normalize_url(location),
                    allowed_hosts,
                    include_subdomains,
                    timeout,
                    user_agent,
                    log_path,
                    seen_sitemaps,
                )
            )
    return discovered_urls
 def seed_from_xml_sitemaps(
    state: CrawlState,
    timeout: float,
    user_agent: str,
    log_path: Path,
 ) -> None:
    """Pre-populate the crawl state from the site's XML sitemaps.

    Candidates are the sitemaps named in robots.txt plus ``/sitemap.xml``.
    Each discovered URL is alias-resolved, then either recorded as a document
    (only when ``state.include_documents`` is set) or recorded as a page and
    queued unless it is already visited or queued. The number of accepted
    URLs is added to ``state.discovered_from_sitemaps`` and logged.
    """
    hosts = build_allowed_hosts(state.start_url)
    candidates = discover_robots_sitemaps(state.start_url, timeout, user_agent, log_path)
    candidates.add(normalize_url(urljoin(state.start_url, "/sitemap.xml")))
    handled_sitemaps: set[str] = set()
    found: set[str] = set()
    for candidate in candidates:
        urls_from_candidate = parse_sitemap_urls(
            candidate,
            hosts,
            state.include_subdomains,
            timeout,
            user_agent,
            log_path,
            handled_sitemaps,
        )
        found.update(urls_from_candidate)
    seeded = 0
    for found_url in found:
        target = resolve_alias(found_url, state.alias_to_canonical)
        if not is_document_url(target):
            register_record(state, target, "page")
            already_known = target in state.visited or target in state.queued
            if not already_known:
                state.queue.append(target)
                state.queued.add(target)
            seeded += 1
        elif state.include_documents:
            # Documents are recorded but never queued for fetching.
            register_record(state, target, "document")
            seeded += 1
    state.discovered_from_sitemaps += seeded
    log_message(log_path, f"Added {seeded} URL(s) from XML sitemap discovery")
 def process_crawl_result(
    state: CrawlState,
    result: CrawlResult,
    allowed_hosts: set[str],
    log_path: Path,
 ) -> None:
    """Merge one fetched page's outcome into the crawl state.

    * Errors are appended to ``state.errors`` and logged.
    * Skipped (non-HTML) responses bump ``state.skipped_count`` and are
      recorded as documents.
    * Otherwise the page and its in-scope canonical URL are recorded, the
      fetched URL is aliased to that canonical URL, and every in-scope link
      is recorded; page links not yet visited or queued are queued.
    """

    def enqueue(page_url: str) -> None:
        # Queue a page only once, and never one that was already visited.
        if page_url in state.visited or page_url in state.queued:
            return
        state.queue.append(page_url)
        state.queued.add(page_url)

    if result.error:
        state.errors.append({"url": result.url, "error": result.error})
        log_message(log_path, f"Error fetching {result.url}: {result.error}")
        return
    if result.skipped:
        state.skipped_count += 1
        register_record(state, result.url, "document")
        return

    declared_canonical = result.canonical_url
    canonical_url = ""
    if declared_canonical and should_visit(declared_canonical, allowed_hosts, state.include_subdomains):
        canonical_url = resolve_alias(declared_canonical, state.alias_to_canonical)
        state.alias_to_canonical[result.url] = canonical_url
        register_record(state, canonical_url, "page", title=result.title, canonical_url=canonical_url)
        enqueue(canonical_url)
    register_record(state, result.url, "page", title=result.title, canonical_url=canonical_url)

    for href in result.links:
        target = normalize_url(urljoin(result.url, href))
        if not should_visit(target, allowed_hosts, state.include_subdomains):
            continue
        target = resolve_alias(target, state.alias_to_canonical)
        if not is_document_url(target):
            register_record(state, target, "page")
            enqueue(target)
        elif state.include_documents:
            # Documents are recorded but never fetched.
            register_record(state, target, "document")
 def crawl_site(
    state: CrawlState,
    max_pages: int,
    delay: float,
    timeout: float,
    user_agent: str,
    state_path: Path,
    output_path: Path,
    log_path: Path,
    save_every: int,
    workers: int,
 ) -> tuple[CrawlState, bool]:
    """Drain the crawl queue, saving progress periodically.

    Runs sequentially when ``workers <= 1``; otherwise fetches on a
    ``ThreadPoolExecutor`` while all state mutation stays on the calling
    thread. A URL is added to ``state.visited`` when it is dequeued, before
    it is fetched. The CSV and state file are written after every
    ``save_every`` processed results and once more at the end.

    Args:
        state: Crawl state to advance; mutated in place.
        max_pages: Stop dequeuing once this many URLs are in ``state.visited``.
        delay: Seconds to sleep before each fetch.
        timeout: Per-request timeout in seconds.
        user_agent: ``User-Agent`` header value.
        state_path: Destination of the resumable state file.
        output_path: Destination of the CSV output.
        log_path: Log file for progress messages.
        save_every: Number of processed results between intermediate saves.
        workers: Thread count; values of 1 or less select the sequential path.

    Returns:
        ``(state, user_stopped)`` where ``user_stopped`` is True when the
        loop ended because a stop was requested through the runtime control.
    """
    allowed_hosts = build_allowed_hosts(state.start_url)
    processed_since_save = 0
    user_stopped = False
    control = RuntimeControl()
    if workers <= 1:
        # Sequential path: one fetch at a time on the calling thread.
        while state.queue and len(state.visited) < max_pages:
            poll_runtime_control(control, log_path)
            if control.stop_requested:
                user_stopped = True
                print("Stop requested. Saving progress and finishing cleanly...")
                break
            # While paused, keep polling so a resume or stop key is noticed.
            while control.paused and not control.stop_requested:
                time.sleep(0.2)
                poll_runtime_control(control, log_path)
            # A stop may have been requested during the pause.
            if control.stop_requested:
                user_stopped = True
                print("Stop requested. Saving progress and finishing cleanly...")
                break
            current = resolve_alias(state.queue.popleft(), state.alias_to_canonical)
            # Alias resolution can map a queued URL onto one already visited.
            if current in state.visited:
                continue
            state.visited.add(current)
            register_record(state, current, "page")
            print_progress(state, max_pages, current)
            result = fetch_page_with_delay(current, timeout=timeout, user_agent=user_agent, delay=delay)
            process_crawl_result(state, result, allowed_hosts, log_path)
            processed_since_save += 1
            if processed_since_save >= save_every:
                write_csv(state.records, output_path)
                save_state(state, state_path, output_path)
                log_message(log_path, f"Saved progress after {len(state.visited)} visited page(s)")
                processed_since_save = 0
    else:
        # Threaded path: fetches run on workers; results are merged here.
        with ThreadPoolExecutor(max_workers=workers) as executor:
            # Maps each in-flight future to the URL it is fetching.
            pending: dict[object, str] = {}
            while pending or (state.queue and len(state.visited) < max_pages):
                poll_runtime_control(control, log_path)
                if control.stop_requested:
                    user_stopped = True
                    print("Stop requested. No new pages will be queued. Waiting for active requests to finish...")
                    break
                if control.paused:
                    # Paused: submit nothing new, but keep collecting results
                    # from requests that are already running.
                    if pending:
                        completed, _ = wait(pending.keys(), timeout=0.2, return_when=FIRST_COMPLETED)
                        for future in completed:
                            pending.pop(future, None)
                            result = future.result()
                            process_crawl_result(state, result, allowed_hosts, log_path)
                            processed_since_save += 1
                    else:
                        time.sleep(0.2)
                    if processed_since_save >= save_every:
                        write_csv(state.records, output_path)
                        save_state(state, state_path, output_path)
                        log_message(log_path, f"Saved progress after {len(state.visited)} visited page(s)")
                        processed_since_save = 0
                    continue
                # Top up the pool: at most `workers` requests in flight.
                while state.queue and len(pending) < workers and len(state.visited) < max_pages:
                    current = resolve_alias(state.queue.popleft(), state.alias_to_canonical)
                    if current in state.visited:
                        continue
                    state.visited.add(current)
                    register_record(state, current, "page")
                    print_progress(state, max_pages, current)
                    future = executor.submit(fetch_page_with_delay, current, timeout, user_agent, delay)
                    pending[future] = current
                # Every dequeued URL was already visited; re-check the loop condition.
                if not pending:
                    continue
                # The short timeout keeps the loop responsive to key presses.
                completed, _ = wait(pending.keys(), timeout=0.2, return_when=FIRST_COMPLETED)
                for future in completed:
                    pending.pop(future, None)
                    result = future.result()
                    process_crawl_result(state, result, allowed_hosts, log_path)
                    processed_since_save += 1
                    if processed_since_save >= save_every:
                        write_csv(state.records, output_path)
                        save_state(state, state_path, output_path)
                        log_message(log_path, f"Saved progress after {len(state.visited)} visited page(s)")
                        processed_since_save = 0
            # After a stop request, wait for in-flight requests and merge
            # their results so the final save includes them.
            if user_stopped and pending:
                completed, _ = wait(pending.keys())
                for future in completed:
                    pending.pop(future, None)
                    result = future.result()
                    process_crawl_result(state, result, allowed_hosts, log_path)
    # Final save, whichever path ran and however the loop ended.
    write_csv(state.records, output_path)
    save_state(state, state_path, output_path)
    log_message(log_path, f"Final save completed with {len(state.records)} URL(s) recorded")
    return state, user_stopped
 def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse ``sys.argv``.

    Options are declared in a table and registered in order, so ``--help``
    lists them in the same sequence as the table.
    """
    parser = argparse.ArgumentParser(
        description="Crawl a website and export discovered internal URLs to a CSV sitemap.",
    )
    option_table = [
        (
            ("url",),
            {"nargs": "?", "help": "Starting URL to crawl, for example https://example.com"},
        ),
        (
            ("-o", "--output"),
            {"help": f"Output CSV path. Defaults to {DEFAULT_OUTPUT_NAME} in the script folder."},
        ),
        (
            ("--max-pages",),
            {
                "type": int,
                "default": DEFAULT_MAX_PAGES,
                "help": f"Maximum number of pages to crawl before stopping. Default: {DEFAULT_MAX_PAGES}",
            },
        ),
        (
            ("--delay",),
            {"type": float, "default": 0.0, "help": "Delay in seconds between requests. Default: 0"},
        ),
        (
            ("--timeout",),
            {"type": float, "default": 15.0, "help": "Request timeout in seconds. Default: 15"},
        ),
        (
            ("--include-subdomains",),
            {"action": "store_true", "help": "Also crawl subdomains of the starting host."},
        ),
        (
            ("--include-documents",),
            {
                "action": "store_true",
                "help": "Include document links like PDF, CSV, DOC, and DOCX in the sitemap output.",
            },
        ),
        (
            ("--save-every",),
            {
                "type": int,
                "default": DEFAULT_SAVE_EVERY,
                "help": f"Save progress after this many pages. Default: {DEFAULT_SAVE_EVERY}",
            },
        ),
        (
            ("--resume",),
            {
                "action": "store_true",
                "help": "Resume from the saved crawl state if a state file already exists.",
            },
        ),
        (
            ("--fresh",),
            {"action": "store_true", "help": "Ignore any saved crawl state and start over."},
        ),
        (
            ("--workers",),
            {
                "type": int,
                # 0 means "not given"; the caller then asks interactively.
                "default": 0,
                "help": f"Number of worker threads. Use 1 to disable multithreading. Default when prompted on: {DEFAULT_WORKERS}",
            },
        ),
    ]
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    return parser.parse_args()
 def run_crawl(
    *,
    start_url: str,
    output_path: Path,
    max_pages: int = DEFAULT_MAX_PAGES,
    delay: float = 0.0,
    timeout: float = 15.0,
    include_subdomains: bool = False,
    include_documents: bool = False,
    save_every: int = DEFAULT_SAVE_EVERY,
    workers: int = DEFAULT_WORKERS,
    resume: bool = True,
    fresh: bool = False,
    user_agent: str = DEFAULT_USER_AGENT,
 ) -> CrawlRunResult:
    """Run one crawl end to end and return a summary of the run.

    Loads the saved state when one exists and ``resume`` is set without
    ``fresh``; otherwise starts a new state. A state with no visited pages is
    seeded from XML sitemaps before crawling. Numeric settings are clamped to
    sane minimums (workers, pages and save interval to 1, timeout to 1.0,
    delay to 0.0).

    Args:
        start_url: Site to crawl. Surrounding whitespace is ignored and
            ``https://`` is assumed when no scheme is given.
        output_path: CSV destination; state and log paths derive from it.
        max_pages: Page budget. On resume it is raised to at least the
            visited count plus ``DEFAULT_RESUME_PAGE_INCREMENT``.
        delay: Seconds to wait before each request.
        timeout: Per-request timeout in seconds.
        include_subdomains: Crawl subdomains too (new states only).
        include_documents: Record document links; must match a resumed state.
        save_every: Processed pages between intermediate saves.
        workers: Worker thread count.
        resume: Continue from an existing state file when present.
        fresh: Ignore any saved state; takes precedence over ``resume``.
        user_agent: ``User-Agent`` header value.

    Returns:
        A ``CrawlRunResult`` with the final state, paths and effective limits.

    Raises:
        ValueError: If the URL is missing or blank, is not http/https, or the
            saved state was created for a different start URL or document
            setting.
    """
    # Strip first so whitespace-only input is rejected as missing, and padded
    # input such as " https://example.com" is accepted.
    start_url = (start_url or "").strip()
    if not start_url:
        raise ValueError("A starting URL is required.")
    if "://" not in start_url:
        start_url = f"https://{start_url}"
    normalized_start = normalize_url(start_url)
    if not is_http_url(normalized_start):
        raise ValueError("Only http and https URLs are supported.")
    output_path = Path(output_path)
    state_path = get_state_path(output_path)
    log_path = get_log_path(output_path)
    state: CrawlState
    if state_path.exists() and not fresh and resume:
        state = load_state(state_path)
        if state.start_url != normalized_start:
            raise ValueError(
                "The saved crawl state belongs to a different starting URL. "
                "Use a different output name or start a fresh crawl."
            )
        if state.include_documents != include_documents:
            raise ValueError(
                "The saved crawl state uses a different document setting. "
                "Keep the same choice or start a fresh crawl."
            )
    else:
        state = initialize_state(normalized_start, include_subdomains, include_documents)
    effective_workers = max(int(workers), 1)
    effective_max_pages = max(int(max_pages), 1)
    # Clamp once so sitemap seeding and crawling share the same timeout.
    effective_timeout = max(timeout, 1.0)
    if state.visited:
        # Resumed run: guarantee headroom beyond the pages already visited.
        effective_max_pages = max(effective_max_pages, len(state.visited) + DEFAULT_RESUME_PAGE_INCREMENT)
    else:
        seed_from_xml_sitemaps(state, effective_timeout, user_agent, log_path)
    log_message(log_path, f"Starting crawl for {state.start_url}")
    log_message(log_path, f"Output CSV: {output_path.resolve()}")
    log_message(log_path, f"State file: {state_path.resolve()}")
    log_message(log_path, f"Multithreading workers: {effective_workers}")
    log_message(log_path, f"Include documents: {state.include_documents}")
    state, user_stopped = crawl_site(
        state=state,
        max_pages=effective_max_pages,
        delay=max(delay, 0.0),
        timeout=effective_timeout,
        user_agent=user_agent,
        state_path=state_path,
        output_path=output_path,
        log_path=log_path,
        save_every=max(save_every, 1),
        workers=effective_workers,
    )
    # Record why the crawl ended.
    if user_stopped:
        log_message(log_path, "Crawl stopped by user")
    elif state.queue and len(state.visited) >= effective_max_pages:
        log_message(log_path, "Crawl stopped at max page limit")
    elif state.queue:
        log_message(log_path, "Crawl stopped before queue emptied")
    else:
        log_message(log_path, "Crawl completed with empty queue")
    return CrawlRunResult(
        state=state,
        user_stopped=user_stopped,
        output_path=output_path,
        state_path=state_path,
        log_path=log_path,
        max_pages=effective_max_pages,
        workers=effective_workers,
    )
 def main() -> int:
    """Interactive command-line entry point for the sitemap crawler.

    Collects missing settings through prompts, runs the crawl, and prints a
    summary.

    Returns:
        ``0`` on success, ``1`` when the input is invalid or the saved state
        does not match the requested crawl.
    """
    args = parse_args()
    start_url = prompt_if_missing(args.url, "Enter the website URL to crawl: ")
    if not start_url:
        print("A starting URL is required.", file=sys.stderr)
        return 1
    if "://" not in start_url:
        start_url = f"https://{start_url}"
    normalized_start = normalize_url(start_url)
    if not is_http_url(normalized_start):
        print("Only http and https URLs are supported.", file=sys.stderr)
        return 1
    output_value = prompt_if_missing(args.output, f"Enter output CSV path [{DEFAULT_OUTPUT_NAME}]: ")
    output_path = Path(output_value) if output_value else SCRIPT_DIR / DEFAULT_OUTPUT_NAME
    state_path = get_state_path(output_path)
    log_path = get_log_path(output_path)
    include_documents = args.include_documents or prompt_yes_no(
        "Include document links such as PDF, CSV, DOC, and DOCX in the sitemap?",
        default=False,
    )
    workers = args.workers
    if workers <= 0:
        # --workers was not given: ask whether to use the default pool size.
        enable_multithreading = prompt_yes_no(
            f"Enable multithreading for faster scanning? {DEFAULT_WORKERS} worker threads will be used.",
            default=True,
        )
        workers = DEFAULT_WORKERS if enable_multithreading else 1
    print(f"Crawling {normalized_start}")
    print(f"Output file: {output_path.resolve()}")
    print(f"State file: {state_path.resolve()}")
    print(f"Log file: {log_path.resolve()}")
    resume_existing = False
    if state_path.exists() and not args.fresh:
        resume_existing = args.resume or prompt_yes_no(
            f"Found saved crawl state at {state_path.name}. Resume from where it left off?",
            default=True,
        )
    # run_crawl blocks until the crawl ends, so the key hint must be shown
    # before it starts to be of any use.
    if os.name == "nt":
        print("Press P to pause, R to resume, or Q to stop cleanly and save progress.")
    try:
        run_result = run_crawl(
            start_url=normalized_start,
            output_path=output_path,
            max_pages=args.max_pages,
            delay=args.delay,
            timeout=args.timeout,
            include_subdomains=args.include_subdomains,
            include_documents=include_documents,
            save_every=args.save_every,
            workers=workers,
            resume=resume_existing,
            fresh=args.fresh,
            user_agent=DEFAULT_USER_AGENT,
        )
    except ValueError as exc:
        print(str(exc), file=sys.stderr)
        return 1
    state = run_result.state
    user_stopped = run_result.user_stopped
    effective_max_pages = run_result.max_pages
    print(f"Max pages: {effective_max_pages}")
    print(f"Include documents: {'Yes' if state.include_documents else 'No'}")
    print(f"Multithreading: {'Yes' if run_result.workers > 1 else 'No'}")
    print(f"Worker threads: {run_result.workers}")
    if resume_existing:
        print("Resumed from the existing crawl state file.")
        log_message(log_path, "Resumed from existing crawl state")
    print(f"Found {len(state.records)} unique URL(s).")
    print(f"Visited pages: {len(state.visited)}")
    print(f"Queued pages remaining: {len(state.queue)}")
    print(f"URLs added from XML sitemaps: {state.discovered_from_sitemaps}")
    if state.errors:
        print(f"Pages with errors: {len(state.errors)}")
        # Show only the first few errors on the console.
        for result in state.errors[:10]:
            print(f"  {result['url']} -> {result['error']}")
    if state.skipped_count:
        print(f"Non-HTML pages skipped while crawling: {state.skipped_count}")
    # run_crawl has already written the matching final-status line to the
    # log, so only the console message is produced here.
    if user_stopped:
        print("Stopped by user. Run it again to continue from the saved state.")
    elif state.queue and len(state.visited) >= effective_max_pages:
        print("Stopped because the max page limit was reached. Run it again to continue.")
    elif state.queue:
        print("Stopped before the queue was empty. Run it again to continue.")
    else:
        print("Crawl complete. No queued pages remain.")
    print("Done.")
    return 0
 # Script entry point: main()'s integer return value becomes the exit status.
 if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,210 @@
 from __future__ import annotations
 import contextlib
 import csv
 import importlib.util
 import io
 import os
 import re
 import sys
 from pathlib import Path
 import streamlit as st
 # Directory that contains this script.
 ROOT_DIR = Path(__file__).resolve().parent
 # Folder of the Page Importer app; its app.py is loaded by file path.
 PAGE_IMPORTER_DIR = ROOT_DIR / "Page Importer"
 # Crawler script, loaded by file path through load_module().
 SITEMAP_BUILDER_PATH = ROOT_DIR / "Sitemap Builder" / "sitemap_builder.py"
 # Writable data root: the APP_DATA_DIR environment variable, else ROOT_DIR/.data.
 APP_DATA_DIR = Path(os.environ.get("APP_DATA_DIR", ROOT_DIR / ".data")).resolve()
 # Where sitemap CSVs are written (state and log file names derive from the CSV).
 SITEMAP_OUTPUT_DIR = APP_DATA_DIR / "sitemaps"
 def load_module(module_name: str, file_path: Path):
    """Import a Python source file by path and register it in ``sys.modules``.

    The module is executed on every call. It is registered before execution
    so the code can find itself by name. If execution fails, the registration
    is rolled back so no half-initialised module is left behind.

    Args:
        module_name: Name to register the module under.
        file_path: Path of the source file to execute.

    Returns:
        The loaded module object.

    Raises:
        RuntimeError: If no import spec or loader can be built for the path.
        BaseException: Anything raised while executing the module's code is
            propagated unchanged.
    """
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Unable to load module from {file_path}")
    module = importlib.util.module_from_spec(spec)
    previous = sys.modules.get(module_name)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Roll back to whatever was registered before this call.
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module
 def get_page_importer_module():
    """Load the Page Importer app module, exposing its folder on ``sys.path``.

    The folder is inserted at the front of ``sys.path`` once, so modules that
    live next to ``app.py`` can be imported by the app.
    """
    importer_dir = str(PAGE_IMPORTER_DIR)
    if importer_dir not in sys.path:
        sys.path.insert(0, importer_dir)
    app_file = PAGE_IMPORTER_DIR / "app.py"
    return load_module("page_importer_streamlit", app_file)
 def get_sitemap_module():
    """Load the sitemap crawler script as ``sitemap_builder_module``."""
    crawler_module = load_module("sitemap_builder_module", SITEMAP_BUILDER_PATH)
    return crawler_module
 def sanitize_job_name(value: str) -> str:
    """Turn free-form text into a safe file-name stem.

    Runs of characters outside ``A-Z a-z 0-9 . _ -`` collapse to a single
    ``-``; leading and trailing dots and dashes are removed. An empty result
    falls back to ``"sitemap"``.
    """
    raw = value.strip() if value else ""
    dashed = re.sub(r"[^A-Za-z0-9._-]+", "-", raw)
    trimmed = dashed.strip(".-")
    if not trimmed:
        return "sitemap"
    return trimmed
 def read_csv_preview(csv_bytes: bytes, limit: int = 200) -> list[dict[str, str]]:
    """Return up to ``limit`` leading CSV rows as plain dictionaries.

    The bytes are decoded as UTF-8 (a byte-order mark is dropped, invalid
    sequences are replaced) and the first row supplies the column names.
    """
    decoded = csv_bytes.decode("utf-8-sig", errors="replace")
    preview: list[dict[str, str]] = []
    for record in csv.DictReader(io.StringIO(decoded)):
        if len(preview) >= limit:
            break
        preview.append(dict(record))
    return preview
 def render_sitemap_tab() -> None:
    """Render the Sitemap Generator tab: crawl form, summary, and downloads.

    Submitting the form runs the crawl synchronously and stores its summary
    in ``st.session_state["sitemap_result"]``, so later reruns of the script
    still show the most recent result.
    """
    st.title("Sitemap Generator")
    st.caption("Crawl a site, export a sitemap CSV, and keep resume data inside the container data volume.")
    SITEMAP_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    with st.form("sitemap-form"):
        start_url = st.text_input("Starting URL", placeholder="https://example.com")
        job_name = st.text_input(
            "Output name",
            value="sitemap",
            help="Used for the CSV, crawl state, and log file names.",
        )
        col1, col2, col3 = st.columns(3)
        with col1:
            max_pages = st.number_input("Max pages", min_value=1, value=10000, step=100)
            workers = st.number_input("Worker threads", min_value=1, value=8, step=1)
        with col2:
            delay = st.number_input("Delay between requests (seconds)", min_value=0.0, value=0.0, step=0.25)
            timeout = st.number_input("Request timeout (seconds)", min_value=1.0, value=15.0, step=1.0)
        with col3:
            save_every = st.number_input("Save progress every N pages", min_value=1, value=25, step=1)
            include_subdomains = st.checkbox("Include subdomains", value=False)
            include_documents = st.checkbox("Include document links", value=False)
        resume_existing = st.checkbox("Resume from saved crawl state if present", value=True)
        start_fresh = st.checkbox("Ignore any saved crawl state and start fresh", value=False)
        submitted = st.form_submit_button("Run Sitemap Crawl", type="primary")
    if submitted:
        # Validate and crawl with the same stripped value, so padded input
        # cannot pass the emptiness check and then fail URL parsing.
        cleaned_url = start_url.strip()
        if not cleaned_url:
            st.error("Starting URL is required.")
        else:
            sitemap_builder = get_sitemap_module()
            safe_name = sanitize_job_name(job_name)
            output_path = SITEMAP_OUTPUT_DIR / f"{safe_name}.csv"
            # Capture the crawler's console output so it can be shown in the UI.
            captured_stdout = io.StringIO()
            try:
                with st.spinner("Running sitemap crawl..."):
                    with contextlib.redirect_stdout(captured_stdout):
                        result = sitemap_builder.run_crawl(
                            start_url=cleaned_url,
                            output_path=output_path,
                            max_pages=int(max_pages),
                            delay=float(delay),
                            timeout=float(timeout),
                            include_subdomains=include_subdomains,
                            include_documents=include_documents,
                            save_every=int(save_every),
                            workers=int(workers),
                            resume=resume_existing,
                            fresh=start_fresh,
                        )
            except Exception as exc:
                # UI boundary: show the failure rather than crash the page.
                st.error(str(exc))
            else:
                # Store only plain values so the summary survives reruns.
                st.session_state["sitemap_result"] = {
                    "summary": {
                        "records": len(result.state.records),
                        "visited": len(result.state.visited),
                        "queued": len(result.state.queue),
                        "errors": len(result.state.errors),
                        "skipped": result.state.skipped_count,
                        "from_sitemaps": result.state.discovered_from_sitemaps,
                        "user_stopped": result.user_stopped,
                        "max_pages": result.max_pages,
                        "workers": result.workers,
                    },
                    "output_path": str(result.output_path),
                    "state_path": str(result.state_path),
                    "log_path": str(result.log_path),
                    "stdout": captured_stdout.getvalue(),
                }
    result_data = st.session_state.get("sitemap_result")
    if not result_data:
        st.info("Run a crawl to generate a sitemap CSV.")
        return
    summary = result_data["summary"]
    csv_path = Path(result_data["output_path"])
    state_path = Path(result_data["state_path"])
    log_path = Path(result_data["log_path"])
    st.subheader("Crawl Summary")
    metric_cols = st.columns(6)
    metric_cols[0].metric("URLs Found", summary["records"])
    metric_cols[1].metric("Visited", summary["visited"])
    metric_cols[2].metric("Queued", summary["queued"])
    metric_cols[3].metric("XML Seeded", summary["from_sitemaps"])
    metric_cols[4].metric("Errors", summary["errors"])
    metric_cols[5].metric("Skipped", summary["skipped"])
    status_text = "Stopped by user." if summary["user_stopped"] else "Run completed."
    st.caption(f"{status_text} Max pages used: {summary['max_pages']} | Worker threads: {summary['workers']}")
    if csv_path.exists():
        csv_bytes = csv_path.read_bytes()
        st.download_button(
            "Download Sitemap CSV",
            data=csv_bytes,
            file_name=csv_path.name,
            mime="text/csv",
        )
        preview_rows = read_csv_preview(csv_bytes)
        if preview_rows:
            # NOTE(review): width="stretch" needs a Streamlit release whose
            # st.dataframe accepts string widths - confirm the minimum
            # version pinned in requirements supports it.
            st.dataframe(preview_rows, width="stretch", hide_index=True)
    file_cols = st.columns(2)
    with file_cols[0]:
        if state_path.exists():
            st.download_button(
                "Download Crawl State",
                data=state_path.read_bytes(),
                file_name=state_path.name,
                mime="application/json",
            )
    with file_cols[1]:
        if log_path.exists():
            st.download_button(
                "Download Crawl Log",
                data=log_path.read_bytes(),
                file_name=log_path.name,
                mime="text/plain",
            )
    crawl_output = (result_data.get("stdout") or "").strip()
    if crawl_output:
        st.text_area("Crawler Output", value=crawl_output, height=220, disabled=True)
    if log_path.exists():
        log_text = log_path.read_text(encoding="utf-8", errors="replace")
        # Show only the last 50 log lines.
        st.text_area("Log Tail", value="\n".join(log_text.splitlines()[-50:]), height=220, disabled=True)
 def main() -> None:
    """Configure the Streamlit page and render both tool tabs."""
    st.set_page_config(page_title="WDW Tools", layout="wide")
    st.header("WDW Sitemap And Import Tools")
    tab_labels = ["Sitemap Generator", "Page Importer"]
    generator_tab, import_tab = st.tabs(tab_labels)
    with generator_tab:
        render_sitemap_tab()
    with import_tab:
        # The importer lives in a separate app file and is loaded on demand.
        importer_module = get_page_importer_module()
        importer_module.render_app()
 # Build the UI only when this file is executed as the main script.
 if __name__ == "__main__":
    main()
@@ -0,0 +1,4 @@
 streamlit>=1.43,<2
 requests>=2.32,<3
 beautifulsoup4>=4.12,<5
 python-dateutil>=2.9,<3