first commit
Build Docker Image / docker (push) Successful in 44s

This commit is contained in:
2026-04-09 10:42:10 -07:00
commit ead872a0a5
19 changed files with 2783 additions and 0 deletions
+13
View File
@@ -0,0 +1,13 @@
.git
.gitignore
.codex
**/.git
**/.venv
**/__pycache__
**/*.pyc
**/*.pyo
**/*.pyd
**/.pytest_cache
**/.mypy_cache
**/.DS_Store
.data
+42
View File
@@ -0,0 +1,42 @@
# Builds the application's Docker image on every push to main (or on manual
# dispatch) and pushes it to a registry when registry secrets are configured.
name: Build Docker Image
on:
push:
branches:
- main
workflow_dispatch:
# Registry settings come from repository secrets; empty values disable the
# login/push steps below.
env:
IMAGE_NAME: wdw-sitemap-and-importer
REGISTRY: ${{ secrets.REGISTRY_URL }}
REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
jobs:
docker:
runs-on: ubuntu-latest
steps:
- name: Check out repository
uses: actions/checkout@v4
# The image is always built and tagged locally, tagged with the commit SHA.
- name: Build image
run: docker build -t "${IMAGE_NAME}:${GITHUB_SHA}" .
- name: Tag latest image
run: docker tag "${IMAGE_NAME}:${GITHUB_SHA}" "${IMAGE_NAME}:latest"
# The remaining steps run only when the registry URL, username and password
# are all non-empty.
- name: Log in to registry
if: ${{ env.REGISTRY != '' && env.REGISTRY_USERNAME != '' && env.REGISTRY_PASSWORD != '' }}
run: echo "${REGISTRY_PASSWORD}" | docker login "${REGISTRY}" -u "${REGISTRY_USERNAME}" --password-stdin
- name: Push commit image
if: ${{ env.REGISTRY != '' && env.REGISTRY_USERNAME != '' && env.REGISTRY_PASSWORD != '' }}
run: |
docker tag "${IMAGE_NAME}:${GITHUB_SHA}" "${REGISTRY}/${IMAGE_NAME}:${GITHUB_SHA}"
docker push "${REGISTRY}/${IMAGE_NAME}:${GITHUB_SHA}"
- name: Push latest image
if: ${{ env.REGISTRY != '' && env.REGISTRY_USERNAME != '' && env.REGISTRY_PASSWORD != '' }}
run: |
docker tag "${IMAGE_NAME}:latest" "${REGISTRY}/${IMAGE_NAME}:latest"
docker push "${REGISTRY}/${IMAGE_NAME}:latest"
+15
View File
@@ -0,0 +1,15 @@
.codex
.data/
__pycache__/
*.py[cod]
.venv/
**/.venv/
**/__pycache__/
.pytest_cache/
.mypy_cache/
*.crawl.log
*.crawlstate.json
streamlit_uploads/
+22
View File
@@ -0,0 +1,22 @@
# Container image for the Streamlit application started by the CMD below.
FROM python:3.14-slim
# Python/pip settings plus Streamlit server settings (headless, port 8501,
# listening on all interfaces). APP_DATA_DIR points at the /data directory
# created further down.
ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
STREAMLIT_SERVER_HEADLESS=true \
STREAMLIT_SERVER_PORT=8501 \
STREAMLIT_SERVER_ADDRESS=0.0.0.0 \
APP_DATA_DIR=/data
WORKDIR /app
# requirements.txt is copied and installed before the rest of the source.
COPY requirements.txt ./requirements.txt
RUN pip install -r requirements.txt
COPY . .
RUN mkdir -p /data
# Matches STREAMLIT_SERVER_PORT above.
EXPOSE 8501
CMD ["streamlit", "run", "app.py"]
+12
View File
@@ -0,0 +1,12 @@
.venv/
__pycache__/
*.py[cod]
*$py.class
.pytest_cache/
.mypy_cache/
.ruff_cache/
.streamlit/secrets.toml
*.log
+63
View File
@@ -0,0 +1,63 @@
# Page Importer
This folder contains the WordPress import tool used by the combined application in the repository root.
The importer still uses Streamlit internally, but it is now rendered as the `Page Importer` tab inside the shared app rather than being the main entrypoint for the repository.
## Features
- Upload a CSV of submitted URLs
- Choose the URL column and optional title override column
- Optionally map post type from the CSV or force a single post type
- Scrape only the listed URLs
- Extract title, publish date, author, body HTML, categories, and tags
- Retry failed rows
- Export a WordPress WXR XML file
## Recommended Usage
Run the root application:
```bash
streamlit run ../app.py
```
Or run the combined Docker container from the repository root.
## Standalone Usage
If you need to run this importer by itself:
```bash
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
streamlit run app.py
```
On Windows PowerShell:
```powershell
python -m venv .venv
.venv\Scripts\Activate.ps1
pip install -r requirements.txt
streamlit run app.py
```
## CSV Input
The app accepts CSV files with any columns. You choose:
- the URL column to scrape
- an optional title or name column to override the scraped title
- an optional post type column with values like `post` or `page`
- an optional category column whose values are appended during export
You can also add manual categories in the sidebar to append them to every exported item.
## Notes
- Exported posts default to `draft` unless changed in the UI
- Image and link URLs remain pointed at the source site
- Some themes need a heuristic fallback: the `Force heuristic scraping` option skips the JSON-LD-first extraction and relies on the page structure instead.
- In the combined app, dependencies come from the root `requirements.txt`
+475
View File
@@ -0,0 +1,475 @@
from __future__ import annotations
import csv
import datetime as dt
import io
import re
from dataclasses import replace
import streamlit as st
from page_importer.dates import parse_datetime
from page_importer.models import ScrapeOptions, ScrapedPost
from page_importer.scraper import Scraper
from page_importer.wxr import build_wxr
def load_csv(file_data: bytes) -> tuple[list[str], list[dict[str, str]]]:
    """Parse uploaded CSV bytes into a header list and a list of row dicts.

    The bytes are decoded as UTF-8 (a leading BOM is dropped and undecodable
    bytes are replaced) and read with ``csv.DictReader``, so each row is keyed
    by the header line.

    Returns:
        ``(headers, rows)``; ``headers`` is an empty list when the input has
        no header line.
    """
    decoded = file_data.decode("utf-8-sig", errors="replace")
    reader = csv.DictReader(io.StringIO(decoded))
    # Read every row before asking for the header names.
    records = [record for record in reader]
    header = reader.fieldnames
    return (header if header else []), records
# Render the Page Importer UI: sidebar scrape options, CSV upload, column
# mapping, the scrape/retry actions, the results table, per-row previews and
# finally the export controls (via render_export_sidebar).
def render_app() -> None:
st.title("Page Importer")
st.caption("Scrape blog posts from CSV URLs and export a WordPress WXR file.")
# Sidebar: options that are later packed into the scrape context.
with st.sidebar:
st.header("Options")
include_author = st.checkbox("Include author", value=True)
include_categories = st.checkbox("Include categories", value=True)
include_tags = st.checkbox("Include tags", value=True)
force_heuristics = st.checkbox("Force heuristic scraping", value=False)
test_run = st.checkbox(
"Test run only",
value=False,
help="Scrape only the first 10 rows that contain a URL.",
)
post_type_mode = st.selectbox(
"WordPress post type mode",
["Single type for all rows", "Use a CSV column"],
index=0,
)
default_post_type = st.selectbox("Default WordPress post type", ["post", "page"], index=0)
# Nothing further is rendered until a CSV with at least one data row exists.
uploaded = st.file_uploader("Upload CSV", type=["csv"])
if not uploaded:
st.info("Upload a CSV to begin.")
return
headers, rows = load_csv(uploaded.getvalue())
if not rows:
st.error("The CSV did not contain any rows.")
return
# Column mapping. _safe_index pre-selects a column whose lower-cased header
# matches one of the candidate names, falling back to the first option.
col1, col2, col3 = st.columns(3)
with col1:
url_column = st.selectbox("URL column", headers, index=_safe_index(headers, ["url", "link"]))
with col2:
title_column = st.selectbox(
"Optional title override column",
["(none)", *headers],
index=_safe_index(["(none)", *headers], ["name", "title"]),
)
with col3:
post_type_column = st.selectbox(
"Optional post type column",
["(none)", *headers],
index=_safe_index(["(none)", *headers], ["post_type", "type"]),
disabled=post_type_mode != "Use a CSV column",
)
st.write(f"Loaded {len(rows)} row(s). Only the selected URL column will be scraped.")
if test_run:
st.caption("Test run is enabled. Only the first 10 rows with a URL will be scraped.")
# A fresh scrape stores its results, the input rows and the scrape context in
# session state, where the retry action and later reads pick them up.
if st.button("Scrape URLs", type="primary"):
context = build_scrape_context(
include_author=include_author,
include_categories=include_categories,
include_tags=include_tags,
force_heuristics=force_heuristics,
test_run=test_run,
post_type_mode=post_type_mode,
post_type_column=post_type_column,
default_post_type=default_post_type,
url_column=url_column,
title_column=title_column,
)
results = scrape_rows(rows, context, phase_label="Scraping")
st.session_state["results"] = results
st.session_state["input_rows"] = rows
st.session_state["scrape_context"] = context
# Results are always read back from session state; with none there is nothing
# more to show.
results = st.session_state.get("results", [])
if not results:
return
successful = [post for post in results if post.success]
failed = [post for post in results if not post.success]
st.subheader("Results")
st.write(f"Successful: {len(successful)} | Failed: {len(failed)}")
# Retry re-scrapes only the failed rows (by row number) with the stored rows
# and context, then merges the new outcomes over the previous results.
if failed and st.button("Retry failed items"):
stored_rows = st.session_state.get("input_rows", rows)
context = st.session_state.get("scrape_context")
if context:
retried = scrape_rows(
stored_rows,
context,
row_numbers=[post.row_number for post in failed if post.row_number],
phase_label="Retrying",
)
results = merge_retry_results(results, retried)
st.session_state["results"] = results
successful = [post for post in results if post.success]
failed = [post for post in results if not post.success]
# One table row per scraped post, successful or not.
preview_rows = []
for post in results:
preview_rows.append(
{
"Row": post.row_number,
"URL": post.source_url,
"CMS": post.cms,
"Success": post.success,
"Title": post.title,
"Publish Date": post.publish_date,
"Author": post.author,
"Categories": ", ".join(post.categories),
"Tags": ", ".join(post.tags),
"Post Type": post.post_type,
"Error": post.error,
}
)
st.dataframe(
preview_rows,
width="stretch",
hide_index=True,
column_config={
"Row": st.column_config.NumberColumn(width="small"),
"URL": st.column_config.TextColumn(width="medium"),
"Title": st.column_config.TextColumn(width="medium"),
"Publish Date": st.column_config.TextColumn(width="medium"),
"Categories": st.column_config.TextColumn(width="medium"),
"Tags": st.column_config.TextColumn(width="medium"),
"Error": st.column_config.TextColumn(width="large"),
},
)
# Detail view for one failed row; falls back to the short error when no
# detailed text was recorded.
if failed:
selected_failed = st.selectbox(
"Failed row details",
failed,
format_func=lambda post: f"Row {post.row_number}: {post.source_url or '(missing URL)'}",
)
st.text_area(
"Error details",
value=selected_failed.error_details or selected_failed.error,
height=180,
disabled=True,
)
# Content preview for one successful post. The number input is 1-based, so it
# is shifted by one to index the list.
if successful:
selected_index = st.number_input(
"Preview successful row",
min_value=1,
max_value=len(successful),
value=1,
step=1,
)
selected = successful[selected_index - 1]
st.markdown("### Content Preview")
st.write(f"**Title:** {selected.title}")
st.write(f"**Source URL:** {selected.source_url}")
st.write(f"**Publish Date:** {selected.publish_date or '(missing)'}")
st.write(f"**Author:** {selected.author or '(missing)'}")
st.write(f"**Post Type:** {selected.post_type}")
# The scraped body is rendered as raw HTML.
st.write(selected.body_html, unsafe_allow_html=True)
# NOTE(review): export receives `rows`/`headers` from the current upload, not
# the rows stored in session state at scrape time — confirm this is intended
# when a different CSV is uploaded after scraping.
render_export_sidebar(successful, rows, headers)
def build_scrape_context(
    *,
    include_author: bool,
    include_categories: bool,
    include_tags: bool,
    force_heuristics: bool,
    test_run: bool,
    post_type_mode: str,
    post_type_column: str,
    default_post_type: str,
    url_column: str,
    title_column: str,
) -> dict[str, object]:
    """Bundle the UI selections into the context dict consumed by ``scrape_rows``.

    The four scraper toggles are wrapped in a ``ScrapeOptions`` instance under
    the ``"options"`` key; every other argument is stored under a key equal to
    its parameter name.
    """
    scrape_options = ScrapeOptions(
        include_author=include_author,
        include_categories=include_categories,
        include_tags=include_tags,
        force_heuristics=force_heuristics,
    )
    context: dict[str, object] = {"options": scrape_options}
    context.update(
        test_run=test_run,
        post_type_mode=post_type_mode,
        post_type_column=post_type_column,
        default_post_type=default_post_type,
        url_column=url_column,
        title_column=title_column,
    )
    return context
def scrape_rows(
    rows: list[dict[str, str]],
    context: dict[str, object],
    row_numbers: list[int] | None = None,
    phase_label: str = "Scraping",
) -> list[ScrapedPost]:
    """Scrape the selected CSV rows and return one ``ScrapedPost`` per row.

    Args:
        rows: All CSV rows; row numbers are 1-based positions in this list.
        context: Dict produced by ``build_scrape_context``.
        row_numbers: When given, only these row numbers are scraped (used for
            retries). Takes precedence over the test-run limit.
        phase_label: Verb shown in the progress status line.

    Returns:
        Posts in row order. Rows without a URL yield a failed post carrying an
        explanatory error instead of being skipped.

    Raises:
        TypeError: If ``context["options"]`` is not a ``ScrapeOptions``.
    """
    options = context["options"]
    if not isinstance(options, ScrapeOptions):
        raise TypeError("Invalid scrape options in session state.")
    scraper = Scraper(options)

    # Resolve the column name once so the test-run filter and the scraping
    # loop read the URL from the row in exactly the same way.
    url_column = str(context["url_column"])

    def row_url(row: dict[str, str]) -> str:
        return (row.get(url_column) or "").strip()

    targets = list(enumerate(rows, start=1))
    if row_numbers is not None:
        requested_rows = set(row_numbers)
        targets = [(row_number, row) for row_number, row in targets if row_number in requested_rows]
    elif context.get("test_run"):
        # Test runs are limited to the first 10 rows that actually have a URL.
        targets = [(row_number, row) for row_number, row in targets if row_url(row)][:10]

    results: list[ScrapedPost] = []
    progress = st.progress(0.0)
    status = st.empty()
    total = len(targets)
    for index, (row_number, row) in enumerate(targets, start=1):
        url = row_url(row)
        status.write(f"{phase_label} {index}/{len(targets)}: {url or f'row {row_number} has no URL'}")
        if url:
            post = scraper.scrape(url)
        else:
            post = ScrapedPost(
                source_url="",
                row_number=row_number,
                error="Missing URL in the selected URL column.",
                error_details=f"Row {row_number} does not contain a URL in column '{url_column}'.",
            )
        post.row_number = row_number
        apply_row_overrides(post, row, context)
        results.append(post)
        # total is at least 1 whenever the loop body runs.
        progress.progress(index / total)
    status.write(f"{phase_label} complete.")
    return results
def apply_row_overrides(post: ScrapedPost, row: dict[str, str], context: dict[str, object]) -> None:
    """Apply per-row CSV overrides (title and post type) to ``post`` in place.

    The title is replaced only when a title column is selected and the row
    holds a non-blank value in it. A whitespace-only cell is treated as empty
    so it cannot wipe out the scraped title. The post type is always
    re-resolved from the context via ``resolve_post_type``.
    """
    title_column = context["title_column"]
    if isinstance(title_column, str) and title_column != "(none)":
        override = (row.get(title_column) or "").strip()
        if override:
            post.title = override
    post.post_type = resolve_post_type(
        row=row,
        mode=str(context["post_type_mode"]),
        column=str(context["post_type_column"]),
        default_value=str(context["default_post_type"]),
    )
def resolve_export_categories(
    row: dict[str, str],
    category_column: str,
    manual_categories: list[str],
) -> list[str]:
    """Return the row's CSV categories followed by the manual ones, de-duplicated.

    When ``category_column`` is ``"(none)"`` only the manual categories are used.
    """
    if category_column == "(none)":
        from_csv: list[str] = []
    else:
        from_csv = parse_terms(row.get(category_column, ""))
    return merge_unique_terms(from_csv, manual_categories)
def parse_terms(value: str) -> list[str]:
    """Split a term string on ``,``, ``|`` or ``>`` and drop blank pieces.

    A falsy ``value`` (empty string or ``None``) yields an empty list.
    """
    pieces = re.split(r"[,|>]", value or "")
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]
def merge_unique_terms(*groups: list[str]) -> list[str]:
    """Flatten ``groups`` into one list of stripped, non-empty, unique terms.

    Order follows first appearance across the groups, left to right.
    """
    stripped = (term.strip() for group in groups for term in group)
    # dict keys keep insertion order, which gives an order-preserving de-dupe.
    return list(dict.fromkeys(term for term in stripped if term))
def merge_retry_results(existing: list[ScrapedPost], replacements: list[ScrapedPost]) -> list[ScrapedPost]:
    """Overlay retried posts onto the existing results, matched by row number.

    Each existing post is replaced by the retried post with the same
    ``row_number`` when one exists. The merged list is returned sorted by row
    number (a falsy row number sorts as 0).
    """
    retried_by_row = {}
    for retried in replacements:
        retried_by_row[retried.row_number] = retried
    combined = [
        retried_by_row[current.row_number] if current.row_number in retried_by_row else current
        for current in existing
    ]
    combined.sort(key=lambda post: post.row_number or 0)
    return combined
def build_export_posts(
    posts: list[ScrapedPost],
    rows: list[dict[str, str]],
    category_column: str,
    manual_categories: list[str],
    post_status: str,
    custom_post_type_slug: str,
) -> list[ScrapedPost]:
    """Return copies of ``posts`` prepared for export.

    Each copy gets ``post_status``, the custom post type slug when one is set
    (otherwise its own post type), and its scraped categories extended with
    the row's CSV categories and the manual categories. The input posts are
    not modified.
    """

    def source_row(post: ScrapedPost) -> dict[str, str]:
        # Row numbers are 1-based; anything out of range maps to an empty row.
        position = post.row_number
        return rows[position - 1] if 0 < position <= len(rows) else {}

    return [
        replace(
            post,
            status=post_status,
            post_type=custom_post_type_slug or post.post_type,
            categories=merge_unique_terms(
                post.categories,
                resolve_export_categories(source_row(post), category_column, manual_categories),
            ),
        )
        for post in posts
    ]
# Render the export controls in the sidebar and offer the WXR download for the
# successful posts, optionally filtered by publish date.
def render_export_sidebar(
successful: list[ScrapedPost],
rows: list[dict[str, str]],
headers: list[str],
) -> None:
with st.sidebar:
st.markdown("---")
st.subheader("Export")
post_status = st.selectbox(
"Imported post status",
["draft", "publish", "private"],
index=0,
key="export_post_status",
)
# Pre-selects a header named category/categories/department when present.
category_column = st.selectbox(
"CSV category column",
["(none)", *headers],
index=_safe_index(["(none)", *headers], ["category", "categories", "department"]),
key="export_category_column",
)
manual_categories = parse_terms(
st.text_input(
"Additional export categories",
value="",
help="Comma-separated categories to append to every exported item.",
key="export_manual_categories",
)
)
output_name = st.text_input(
"Output filename",
value="wordpress-import.xml",
key="export_output_name",
)
# The slug is normalised (lower-cased, restricted to a-z, 0-9, "_" and "-").
custom_post_type_slug = normalize_post_type_slug(
st.text_input(
"Custom post type slug",
value="",
help="Optional. If set, all exported items will use this WordPress post type slug.",
key="export_custom_post_type_slug",
)
)
export_posts = build_export_posts(
successful,
rows,
category_column,
manual_categories,
post_status,
custom_post_type_slug,
)
if custom_post_type_slug:
st.caption(f"Exporting all items as post type `{custom_post_type_slug}`.")
# Pair each export post with its parsed publish date; posts whose date cannot
# be parsed are left out of this list.
dated_export_posts = [(post, publish_date) for post in export_posts if (publish_date := parse_publish_date(post.publish_date))]
if dated_export_posts:
min_date = min(publish_date for _, publish_date in dated_export_posts)
max_date = max(publish_date for _, publish_date in dated_export_posts)
filter_by_publish_date = st.checkbox(
"Filter export by publish date",
value=False,
key="export_filter_by_publish_date",
)
if filter_by_publish_date:
export_start = st.date_input(
"Export start date",
value=min_date,
min_value=min_date,
max_value=max_date,
format="MM/DD/YYYY",
key="export_start_date",
)
export_end = st.date_input(
"Export end date",
value=max_date,
min_value=min_date,
max_value=max_date,
format="MM/DD/YYYY",
key="export_end_date",
)
# An inverted range exports nothing; otherwise keep posts whose parsed date
# lies within the inclusive range.
if export_start > export_end:
st.error("Export start date must be on or before the end date.")
export_posts = []
else:
export_posts = [
post
for post in export_posts
if (publish_date := parse_publish_date(post.publish_date)) and export_start <= publish_date <= export_end
]
st.caption(
"Date filter: "
f"{export_start.strftime('%m/%d/%Y')} to {export_end.strftime('%m/%d/%Y')}."
)
undated_count = len(successful) - len(dated_export_posts)
if undated_count:
st.caption(f"Excluded {undated_count} successful item(s) with no publish date.")
else:
st.caption("No successful items have a publish date, so export date filtering is unavailable.")
st.caption(f"Ready to export {len(export_posts)} post(s).")
# The XML is built from the current export list; the button is disabled when
# that list is empty.
xml_data = build_wxr(export_posts)
st.download_button(
label="Download WXR XML",
data=xml_data,
file_name=output_name,
mime="application/xml",
disabled=not export_posts,
)
def parse_publish_date(value: str) -> dt.date | None:
    """Return the calendar date parsed from ``value``, or ``None`` if it cannot be parsed."""
    moment = parse_datetime(value)
    return moment.date() if moment is not None else None
def _safe_index(values: list[str], candidates: list[str]) -> int:
lowered = {value.lower(): idx for idx, value in enumerate(values)}
for candidate in candidates:
if candidate in lowered:
return lowered[candidate]
return 0
def resolve_post_type(
    row: dict[str, str],
    mode: str,
    column: str,
    default_value: str,
) -> str:
    """Pick the post type for one row.

    The CSV value is used only when ``mode`` is ``"Use a CSV column"``, a real
    column is selected, and the cell normalises to a non-empty slug. In every
    other case ``default_value`` is returned.
    """
    reads_from_csv = mode == "Use a CSV column" and column != "(none)"
    if not reads_from_csv:
        return default_value
    slug = normalize_post_type_slug(row.get(column) or "")
    return slug if slug else default_value
def normalize_post_type_slug(value: str) -> str:
    """Lower-case ``value`` and remove every character outside ``a-z0-9_-``.

    A falsy ``value`` yields an empty string.
    """
    lowered = (value or "").strip().lower()
    disallowed = re.compile(r"[^a-z0-9_-]")
    return disallowed.sub("", lowered)
# Standalone entry point: configure the Streamlit page and render the importer
# when this module is executed directly.
if __name__ == "__main__":
st.set_page_config(page_title="Page Importer", layout="wide")
render_app()
+1
View File
@@ -0,0 +1 @@
+26
View File
@@ -0,0 +1,26 @@
from __future__ import annotations
import datetime as dt
from dateutil import parser as date_parser
def parse_datetime(value: str | None) -> dt.datetime | None:
    """Parse ``value`` with dateutil, retrying in fuzzy mode on failure.

    Returns ``None`` for a falsy value or when both attempts fail.
    """
    if not value:
        return None
    # Strict parse first, then the fuzzy mode as a second attempt.
    for fuzzy in (False, True):
        try:
            return date_parser.parse(value, fuzzy=fuzzy)
        except (TypeError, ValueError, OverflowError):
            continue
    return None
def normalize_date(value: str | None) -> str:
    """Return ``value`` as a normalised date-time string, or ``""`` if unparseable.

    Naive results are formatted as ``YYYY-MM-DD HH:MM:SS``. Timezone-aware
    results use ISO format with a space separator and seconds precision, which
    keeps the UTC offset.
    """
    moment = parse_datetime(value)
    if moment is None:
        return ""
    is_aware = moment.tzinfo is not None and moment.utcoffset() is not None
    if is_aware:
        return moment.isoformat(sep=" ", timespec="seconds")
    return moment.strftime("%Y-%m-%d %H:%M:%S")
+34
View File
@@ -0,0 +1,34 @@
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass
class ScrapeOptions:
    """Settings that control what the scraper extracts and how it fetches pages."""

    # Toggles for the optional metadata fields.
    include_author: bool = True
    include_categories: bool = True
    include_tags: bool = True
    # When True, JSON-LD article data is ignored in favour of page-structure heuristics.
    force_heuristics: bool = False
    # Timeout passed to each HTTP request (reported in seconds in error messages).
    request_timeout: int = 20
    # Sent as the User-Agent header on every request.
    user_agent: str = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0 Safari/537.36"
    )
@dataclass
class ScrapedPost:
    """Result of scraping one URL: extracted content plus success/error state."""

    # URL the content was fetched from (empty when the CSV row had no URL).
    source_url: str
    # 1-based CSV row number assigned by the caller; 0 means "not set".
    row_number: int = 0
    # Detected CMS name, or "unknown".
    cms: str = "unknown"
    title: str = ""
    # Normalised date-time string, or "" when no date was found.
    publish_date: str = ""
    author: str = ""
    body_html: str = ""
    # default_factory gives every instance its own list.
    categories: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    # Export-related values.
    status: str = "draft"
    post_type: str = "post"
    # Outcome of the scrape; error/error_details are filled in on failure.
    success: bool = False
    error: str = ""
    error_details: str = ""
+555
View File
@@ -0,0 +1,555 @@
from __future__ import annotations
import json
import re
import traceback
from html import unescape
from typing import Iterable
import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
from page_importer.dates import normalize_date
from page_importer.models import ScrapeOptions, ScrapedPost
# Lower-cased JSON-LD "@type" values that find_article_payload accepts as an
# article payload.
JSON_ARTICLE_TYPES = {
"article",
"blogposting",
"newsarticle",
"report",
"webpage",
}
# CSS selectors tried in order by extract_body. The first match whose cleaned
# text is at least 120 characters long is returned; shorter matches only serve
# as a fallback.
BODY_SELECTORS = [
"article .entry-content",
"article .post-content",
"article .node__content",
"article .node .content",
"article .node-content",
"article .field-name-body .field-item",
"article .field-name-body",
"article .field--name-body",
"article .article-body",
"article .content",
".post-content",
".entry-content",
".node__content",
".node .content",
".node-content",
".field-name-body .field-item",
".field-name-body",
".field--name-body",
".article-body",
"#content-area .node .content",
"article",
"main article",
"main",
]
# Selectors whose link texts are collected as categories (via extract_terms).
CATEGORY_SELECTORS = [
".cat-links a",
".post-categories a",
".field--name-field-category a",
".tags a[rel='category tag']",
".terms a",
".taxonomy a",
]
# Selectors whose link texts are collected as tags (via extract_terms).
# Note that ".terms a" also appears in CATEGORY_SELECTORS.
TAG_SELECTORS = [
".tags-links a",
".post-tags a",
".field--name-field-tags a",
"a[rel='tag']",
".terms a",
]
# Selectors tried in order by extract_author after the <meta name="author"> tag.
AUTHOR_SELECTORS = [
"[rel='author']",
".author a",
".byline a",
".submitted a",
".node__submitted a",
".node-info a",
".createdby",
]
# Selectors tried in order by extract_date; the value is read from the
# "datetime" attribute, then the "content" attribute, then the element text.
DATE_SELECTORS = [
"time[datetime]",
"meta[property='article:published_time']",
"meta[name='publish_date']",
"meta[name='pubdate']",
".date-display-single",
".submitted",
".node-info",
]
# Matches dates written as "<Weekday>, <Month> <day>, <year>"; used by
# normalize_drupal_date.
DRUPAL_TITLE_DATE_PATTERN = re.compile(
r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+"
r"([A-Za-z]+)\s+\d{1,2},\s+\d{4}"
)
class Scraper:
    """Fetch a page over HTTP and extract its article fields into a ``ScrapedPost``."""

    def __init__(self, options: ScrapeOptions) -> None:
        """Create a scraper that reuses one HTTP session with the configured User-Agent."""
        self.options = options
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": options.user_agent})

    def scrape(self, url: str) -> ScrapedPost:
        """Scrape ``url`` and return the outcome; this method never raises.

        JSON-LD article data is applied first (unless heuristics are forced),
        then page-structure fallbacks fill the gaps, and the body HTML is
        sanitised. A post is successful only when both a title and a body were
        extracted. Any failure is recorded in ``error``/``error_details`` of
        the returned post.
        """
        post = ScrapedPost(source_url=url)
        response: requests.Response | None = None
        try:
            response = self.session.get(url, timeout=self.options.request_timeout)
            response.raise_for_status()
            # Parse the raw bytes instead of response.text: without a charset in
            # the Content-Type header, requests decodes text as ISO-8859-1,
            # which garbles UTF-8 pages. A charset the server did declare is
            # passed on; otherwise BeautifulSoup detects it from the document.
            content_type = response.headers.get("Content-Type", "")
            declared_encoding = response.encoding if "charset" in content_type.lower() else None
            soup = BeautifulSoup(response.content, "html.parser", from_encoding=declared_encoding)
            post.cms = detect_cms(soup)
            article_data = extract_article_json_ld(soup)
            if article_data and not self.options.force_heuristics:
                apply_article_data(post, article_data, soup, self.options)
            merge_fallback_data(post, soup, self.options)
            post.body_html = sanitize_html(post.body_html)
            required = {"title": post.title, "body_html": post.body_html}
            missing_fields = [name for name, value in required.items() if not value]
            if missing_fields:
                raise ValueError(
                    "Unable to extract required field(s): "
                    f"{', '.join(missing_fields)}. "
                    f"Detected CMS: {post.cms}. "
                    f"Publish date found: {'yes' if post.publish_date else 'no'}. "
                    f"Author found: {'yes' if post.author else 'no'}."
                )
            post.success = True
            return post
        except Exception as exc:
            # Deliberate catch-all boundary: one bad URL must not abort the batch.
            post.error = format_error_summary(url, exc, response, self.options.request_timeout)
            post.error_details = format_error_details(url, exc, response)
            return post
def detect_cms(soup: BeautifulSoup) -> str:
    """Guess the CMS behind a page: ``wordpress``, ``drupal``, ``joomla`` or ``unknown``.

    The ``<meta name="generator">`` value is checked first; if it names none of
    the known systems, the lower-cased page markup is searched for well-known
    markers.
    """
    generator = meta_content(soup, "meta", {"name": "generator"})
    if generator:
        lowered_generator = generator.lower()
        for name in ("wordpress", "drupal", "joomla"):
            if name in lowered_generator:
                return name

    markup = str(soup).lower()
    signatures = (
        ("wordpress", ("/wp-content/",)),
        ("drupal", ("drupal-settings-json", "sites/default/files")),
        ("joomla", ("com_content", "joomla")),
    )
    for cms, markers in signatures:
        if any(marker in markup for marker in markers):
            return cms
    return "unknown"
def extract_article_json_ld(soup: BeautifulSoup) -> dict | None:
    """Return the first article-like JSON-LD object found in the page, if any.

    Every ``<script type="application/ld+json">`` element is inspected in
    document order; empty scripts are skipped.
    """
    for script_tag in soup.select("script[type='application/ld+json']"):
        source = script_tag.string or script_tag.get_text(" ", strip=True)
        if source:
            articles = (find_article_payload(payload) for payload in parse_json_candidates(source))
            match = next((article for article in articles if article), None)
            if match:
                return match
    return None
def parse_json_candidates(raw: str) -> Iterable[dict | list]:
    """Yield the decoded JSON document from ``raw``, at most once.

    The text is decoded as-is first. If that fails, runs of ASCII control
    characters are replaced by a single space and decoding is tried again.
    Nothing is yielded when both attempts fail.
    """

    def attempts():
        yield raw
        # Only computed when the first attempt has failed.
        yield re.sub(r"[\x00-\x1f]+", " ", raw).strip()

    for candidate in attempts():
        try:
            decoded = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        yield decoded
        return
def find_article_payload(payload: dict | list) -> dict | None:
    """Search decoded JSON-LD for the first object with an article-like ``@type``.

    Lists are searched item by item and an ``@graph`` member is searched before
    the object itself. ``@type`` may be a string or a list of strings. Any
    other value (number, boolean, object, missing) is treated as "no type"
    instead of raising, so malformed JSON-LD cannot fail the whole scrape.

    Returns:
        The matching object, or ``None`` when nothing matches.
    """
    if isinstance(payload, list):
        for item in payload:
            found = find_article_payload(item)
            if found:
                return found
        return None
    if not isinstance(payload, dict):
        return None
    if "@graph" in payload:
        found = find_article_payload(payload["@graph"])
        if found:
            return found
    node_type = payload.get("@type")
    if isinstance(node_type, str):
        declared = [node_type]
    elif isinstance(node_type, (list, tuple)):
        declared = list(node_type)
    else:
        # Malformed or absent @type: nothing to compare against.
        declared = []
    types = {item.lower() for item in declared if isinstance(item, str)}
    if not types:
        return None
    return payload if types & JSON_ARTICLE_TYPES else None
def apply_article_data(
    post: ScrapedPost,
    article: dict,
    soup: BeautifulSoup,
    options: ScrapeOptions,
) -> None:
    """Copy fields from a JSON-LD article object onto ``post`` in place.

    Existing values on ``post`` are kept wherever the article has nothing to
    offer. Author, categories and tags are only touched when the matching
    option is enabled.
    """
    headline = article.get("headline") or article.get("name")
    post.title = headline or post.title

    raw_date = article.get("datePublished") or article.get("dateCreated") or post.publish_date
    post.publish_date = normalize_date(raw_date)

    if options.include_author:
        author = extract_author_from_json_ld(article)
        if author:
            post.author = author
    if options.include_categories:
        sections = normalize_terms(article.get("articleSection"))
        if sections:
            post.categories = sections
    if options.include_tags:
        keywords = normalize_terms(article.get("keywords"))
        if keywords:
            post.tags = keywords

    body = extract_body_from_article(article, soup)
    if body:
        post.body_html = body
# Fill in whatever is still missing on `post` from the page structure.
# Title, publish date, author, body and tags are only extracted when the post
# does not have them yet; categories found in the page are merged into the
# existing ones. Author, categories and tags respect the matching options.
def merge_fallback_data(post: ScrapedPost, soup: BeautifulSoup, options: ScrapeOptions) -> None:
if not post.title:
post.title = extract_title(soup)
if not post.publish_date:
post.publish_date = extract_date(soup, post.cms)
if options.include_author and not post.author:
post.author = extract_author(soup)
if not post.body_html:
post.body_html = extract_body(soup)
if options.include_categories:
post.categories = merge_terms(post.categories, extract_terms(soup, CATEGORY_SELECTORS))
# Drupal pages may also carry a "Department:" label that is used as a category.
# NOTE(review): presumably this branch is nested under include_categories — confirm.
if post.cms == "drupal":
post.categories = merge_terms(post.categories, extract_drupal_department_categories(soup))
if options.include_tags and not post.tags:
post.tags = extract_terms(soup, TAG_SELECTORS)
def extract_title(soup: BeautifulSoup) -> str:
    """Return the page title from ``og:title``, a heading, or ``<title>``.

    Sources are tried in that order. A heading that matches a selector but has
    no text (for example an ``<h1>`` wrapping only an image) is skipped, so
    later selectors and the ``<title>`` element still get a chance. Returns
    ``""`` when nothing usable is found.
    """
    og_title = meta_content(soup, "meta", {"property": "og:title"})
    if og_title:
        return og_title
    for selector in ("article h1", "h1.entry-title", "h1.page-title", "h1.title", "h1"):
        node = soup.select_one(selector)
        if node:
            heading = clean_text(node.get_text(" ", strip=True))
            if heading:
                return heading
    return clean_text(soup.title.get_text(" ", strip=True)) if soup.title else ""
def extract_date(soup: BeautifulSoup, cms: str = "unknown") -> str:
    """Return the first publish date found in the page, normalised, or ``""``.

    For each date selector the ``datetime`` attribute, the ``content``
    attribute and finally the element text are considered. Drupal pages get one
    extra attempt: a date printed next to the title.
    """
    for selector in DATE_SELECTORS:
        node = soup.select_one(selector)
        if node:
            raw = node.get("datetime") or node.get("content") or node.get_text(" ", strip=True)
            normalized = normalize_date(raw)
            if normalized:
                return normalized
    return extract_drupal_title_adjacent_date(soup) if cms == "drupal" else ""
def extract_author(soup: BeautifulSoup) -> str:
    """Return the author name from ``<meta name="author">`` or a byline element.

    Byline selectors are tried in order. A match without any text is skipped
    so that a later selector can still provide the name. Returns ``""`` when
    no author is found.
    """
    author = meta_content(soup, "meta", {"name": "author"})
    if author:
        return clean_text(author)
    for selector in AUTHOR_SELECTORS:
        node = soup.select_one(selector)
        if node:
            name = clean_text(node.get_text(" ", strip=True))
            if name:
                return name
    return ""
def extract_body(soup: BeautifulSoup) -> str:
    """Return the HTML of the most likely article body, or ``""``.

    Body selectors are tried in order on a cleaned copy of each match. The
    first candidate with at least 120 characters of text is returned at once;
    otherwise the first candidate with any meaningful content is used.
    """
    first_meaningful = ""
    for selector in BODY_SELECTORS:
        match = soup.select_one(selector)
        if match:
            # Work on a copy so removing unwanted elements leaves the page intact.
            working_copy = clone_tag(match)
            strip_unwanted(working_copy)
            markup = working_copy.decode_contents().strip()
            visible_text = BeautifulSoup(markup, "html.parser").get_text(" ", strip=True)
            if len(visible_text) >= 120:
                return markup
            if not first_meaningful and has_meaningful_body_content(markup):
                first_meaningful = markup
    return first_meaningful
def extract_terms(soup: BeautifulSoup, selectors: list[str]) -> list[str]:
    """Collect the unique, non-empty texts of all elements matching ``selectors``.

    Terms keep the order in which they are first encountered.
    """
    collected: dict[str, None] = {}
    for selector in selectors:
        for node in soup.select(selector):
            label = clean_text(node.get_text(" ", strip=True))
            if label:
                collected.setdefault(label, None)
    return list(collected)
def extract_drupal_title_adjacent_date(soup: BeautifulSoup) -> str:
    """Find a "Weekday, Month D, YYYY" date printed next to the page title.

    The siblings following the title node are checked first. Failing that, the
    text of the enclosing header/div/section is used, with the title itself
    removed from its start. Returns the normalised date or ``""``.
    """
    title_node = find_title_node(soup)
    if not title_node:
        return ""

    sibling_dates = (normalize_drupal_date(text_from_node(sibling)) for sibling in title_node.next_siblings)
    from_sibling = next((date for date in sibling_dates if date), "")
    if from_sibling:
        return from_sibling

    container = title_node.find_parent(["header", "div", "section"])
    if not container:
        return ""
    container_text = clean_text(container.get_text(" ", strip=True))
    title_text = clean_text(title_node.get_text(" ", strip=True))
    if title_text and container_text.startswith(title_text):
        container_text = clean_text(container_text[len(title_text):])
    return normalize_drupal_date(container_text) or ""
def extract_drupal_department_categories(soup: BeautifulSoup) -> list[str]:
    """Collect category names from "Department:" labels in the page.

    Two passes are made. The first looks at text nodes that consist solely of
    the label and takes the value from the label's parent text or, failing
    that, from the first following sibling that yields a usable value. The
    second scans common text elements whose text starts with the label.
    Values are validated by ``normalize_department_category`` and
    de-duplicated.
    """
    found: list[str] = []
    exact_label = re.compile(r"^\s*Department:\s*$", re.IGNORECASE)

    for label_text in soup.find_all(string=exact_label):
        holder = label_text.parent if isinstance(label_text.parent, Tag) else None
        if not holder:
            continue
        inline = normalize_department_category(
            extract_labeled_value(holder.get_text(" ", strip=True), "Department")
        )
        if inline:
            found = merge_terms(found, [inline])
            continue
        sibling_values = (
            normalize_department_category(text_from_node(sibling)) for sibling in holder.next_siblings
        )
        first_value = next((value for value in sibling_values if value), "")
        if first_value:
            found = merge_terms(found, [first_value])

    for element in soup.find_all(["p", "li", "span", "dt", "dd"]):
        text = clean_text(element.get_text(" ", strip=True))
        if text.lower().startswith("department:"):
            value = normalize_department_category(extract_labeled_value(text, "Department"))
            if value:
                found = merge_terms(found, [value])
    return found
def extract_author_from_json_ld(article: dict) -> str:
    """Return the author name(s) from a JSON-LD article's ``author`` member.

    Accepts a plain string, an object with a ``name``, or a list of such
    objects (names joined with ``", "``). Anything else yields ``""``.
    """
    author = article.get("author")
    if isinstance(author, str):
        return clean_text(author)
    if isinstance(author, dict):
        return clean_text(author.get("name", ""))
    if isinstance(author, list):
        cleaned = (clean_text(entry.get("name", "")) for entry in author if isinstance(entry, dict))
        return ", ".join(filter(None, cleaned))
    return ""
def extract_body_from_article(article: dict, soup: BeautifulSoup) -> str:
    """Return body HTML from the JSON-LD ``articleBody`` or from the page itself.

    ``articleBody`` is used only when it is a string longer than 120
    characters after trimming; it is entity-unescaped and wrapped in a single
    ``<p>``. Otherwise the body is extracted from the page structure.
    """
    body = article.get("articleBody")
    if not isinstance(body, str):
        return extract_body(soup)
    trimmed = body.strip()
    if len(trimmed) <= 120:
        return extract_body(soup)
    return f"<p>{unescape(trimmed)}</p>"
def normalize_terms(value: object) -> list[str]:
    """Turn a JSON-LD term value into a list of clean, unique terms.

    A string is split on ``,``, ``|`` or ``>``; a list contributes its string
    items as they are. In both cases terms are whitespace-normalised, empty
    ones are dropped and duplicates are removed while keeping first-seen
    order. The string form is de-duplicated the same way as the list form,
    which it previously was not. Any other value yields an empty list.
    """
    if isinstance(value, str):
        candidates = re.split(r"[,|>]", value)
    elif isinstance(value, list):
        candidates = [item for item in value if isinstance(item, str)]
    else:
        return []
    cleaned = (clean_text(candidate) for candidate in candidates)
    # dict keys keep insertion order, giving an order-preserving de-dupe.
    return list(dict.fromkeys(term for term in cleaned if term))
def merge_terms(*groups: list[str]) -> list[str]:
    """Flatten ``groups`` into one list of cleaned, non-empty, unique terms.

    Order follows first appearance across the groups, left to right.
    """
    cleaned = (clean_text(item) for group in groups for item in group)
    return list(dict.fromkeys(term for term in cleaned if term))
def normalize_drupal_date(value: str | None) -> str:
    """Normalise the first "Weekday, Month D, YYYY" date found in ``value``.

    Returns ``""`` when ``value`` is falsy or contains no such date.
    """
    match = DRUPAL_TITLE_DATE_PATTERN.search(value) if value else None
    return normalize_date(match.group(0)) if match else ""
def meta_content(soup: BeautifulSoup, tag_name: str, attrs: dict[str, str]) -> str:
    """Return the stripped ``content`` attribute of the first matching tag, or ``""``."""
    node = soup.find(tag_name, attrs=attrs)
    if not node:
        return ""
    if not node.get("content"):
        return ""
    return node["content"].strip()
def clean_text(value: str) -> str:
    """Collapse every whitespace run in ``value`` to one space and trim the ends.

    A falsy ``value`` yields an empty string.
    """
    source = value or ""
    collapsed = re.sub(r"\s+", " ", source)
    return collapsed.strip()
def text_from_node(node: object) -> str:
    """Return the cleaned text of a BeautifulSoup string or tag; ``""`` for anything else."""
    if isinstance(node, Tag):
        raw = node.get_text(" ", strip=True)
    elif isinstance(node, NavigableString):
        raw = str(node)
    else:
        return ""
    return clean_text(raw)
def sanitize_html(html: str) -> str:
    """Return ``html`` with unwanted elements and dangerous attributes removed.

    A falsy ``html`` yields an empty string.
    """
    if html:
        fragment = BeautifulSoup(html, "html.parser")
        for scrub in (strip_unwanted, strip_dangerous_attributes):
            scrub(fragment)
        return fragment.decode_contents().strip()
    return ""
def has_meaningful_body_content(html: str) -> bool:
    """Tell whether ``html`` holds any text or an image/link/embed/object tag."""
    if not html:
        return False
    if BeautifulSoup(html, "html.parser").get_text(" ", strip=True):
        return True
    lowered = html.lower()
    return any(marker in lowered for marker in ("<img", "<a ", "<embed", "<object"))
def strip_unwanted(node: BeautifulSoup | Tag) -> None:
    """Remove scripts, styles, frames, forms, navigation and share widgets from ``node`` in place."""
    unwanted = ("script", "style", "noscript", "iframe", "form", "nav", ".share", ".social-share")
    for selector in unwanted:
        matches = node.select(selector)
        for element in matches:
            element.decompose()
def strip_dangerous_attributes(node: BeautifulSoup | Tag) -> None:
    """Remove script-capable attributes from every element under ``node`` in place.

    Dropped are all ``on*`` event handlers, ``srcdoc``, and URL attributes
    (``href``, ``src``, ``action``, ``formaction``, ``xlink:href``) whose value
    uses a ``javascript:``, ``vbscript:`` or ``data:text/html`` scheme.
    """
    url_attributes = {"href", "src", "action", "formaction", "xlink:href"}
    blocked_schemes = ("javascript:", "vbscript:", "data:text/html")
    for child in node.find_all(True):
        # Iterate over a snapshot because attributes are deleted while looping.
        for attr_name in list(child.attrs):
            normalized_name = attr_name.lower()
            if normalized_name.startswith("on") or normalized_name == "srcdoc":
                del child.attrs[attr_name]
                continue
            if normalized_name not in url_attributes:
                continue
            raw_value = child.attrs.get(attr_name)
            if isinstance(raw_value, list):
                candidate = " ".join(str(item) for item in raw_value)
            else:
                candidate = str(raw_value or "")
            # Browsers discard tabs, newlines and leading control characters
            # when parsing a URL, so "java\tscript:" still runs as script.
            # Remove all ASCII whitespace/control characters before comparing.
            compact = re.sub(r"[\x00-\x20\x7f]+", "", candidate).lower()
            if compact.startswith(blocked_schemes):
                del child.attrs[attr_name]
def clone_tag(node: Tag) -> BeautifulSoup:
    """Return an independent copy of ``node`` by re-parsing its serialised markup."""
    serialized = str(node)
    return BeautifulSoup(serialized, "html.parser")
def find_title_node(soup: BeautifulSoup) -> Tag | None:
    """Return the first heading element matching the title selectors, or ``None``."""
    selectors = ("article h1", "h1.entry-title", "h1.page-title", "h1.title", "h1")
    # The generator is lazy, so lookups stop at the first hit.
    matches = (soup.select_one(selector) for selector in selectors)
    return next((match for match in matches if match), None)
def extract_labeled_value(text: str, label: str) -> str:
    """Return the value that follows ``"<label>:"`` in ``text``, or ``""``.

    The text is whitespace-normalised first. The value ends at the next
    ``Word:`` label, at a run of two or more whitespace characters, or at the
    end of the text. Matching ignores case.
    """
    if not text:
        return ""
    hit = re.search(
        rf"{re.escape(label)}:\s*(.+?)(?=\s+(?:[A-Z][a-z]+:)|\s{{2,}}|$)",
        clean_text(text),
        re.IGNORECASE,
    )
    return clean_text(hit.group(1)) if hit else ""
def normalize_department_category(value: str) -> str:
    """Return a cleaned category name, or ``""`` when it looks like noise.

    After ``clean_text`` normalisation the value is rejected when it is
    empty, longer than 80 characters, longer than 8 words, or contains
    address/contact/URL markers.
    """
    cleaned = clean_text(value)
    if not cleaned:
        return ""

    lowered = cleaned.lower()
    noise_markers = ("p.o. box", "contact us", "@", "http://", "https://")
    rejected = (
        len(cleaned) > 80
        or len(cleaned.split()) > 8
        or any(marker in lowered for marker in noise_markers)
    )
    return "" if rejected else cleaned
def format_error_summary(
    url: str,
    exc: Exception,
    response: requests.Response | None,
    timeout_seconds: int,
) -> str:
    """Build a one-line, human-readable description of a failed fetch.

    Args:
        url: The URL that was requested.
        exc: The exception raised while fetching.
        response: A response captured by the caller; used when the exception
            does not carry one itself.
        timeout_seconds: The timeout in effect, quoted in timeout messages.

    Returns:
        A status-line style message for HTTP errors with a known response, a
        timeout message for ``requests.Timeout``, a typed message for other
        ``requests`` errors, and ``"<Type>: <message>"`` for anything else.
    """
    if isinstance(exc, requests.HTTPError):
        # A requests.Response is falsy for 4xx/5xx statuses, so ``a or b``
        # would always discard the response attached to an HTTPError.
        # Compare against None explicitly instead.
        attached = exc.response
        failing_response = attached if attached is not None else response
        if failing_response is not None:
            return (
                f"HTTP {failing_response.status_code} {failing_response.reason} "
                f"while fetching {failing_response.url or url}"
            )
    if isinstance(exc, requests.Timeout):
        return f"Request timed out after {timeout_seconds}s while fetching {url}"
    if isinstance(exc, requests.RequestException):
        return f"{type(exc).__name__} while fetching {url}: {exc}"
    return f"{type(exc).__name__}: {exc}"
def format_error_details(
    url: str,
    exc: Exception,
    response: requests.Response | None,
) -> str:
    """Build a multi-line diagnostic block for a failed fetch.

    Args:
        url: The URL that was requested.
        exc: The exception raised while fetching.
        response: A response captured by the caller; used when the exception
            does not carry one itself.

    Returns:
        Newline-joined ``Label: value`` lines covering the URL, exception
        type and message, the HTTP status and resolved URL when a response
        is available, and the formatted exception line.
    """
    details = [
        f"URL: {url}",
        f"Error Type: {type(exc).__name__}",
        f"Message: {exc}",
    ]
    # A requests.Response is falsy for 4xx/5xx statuses, so the response
    # attached to the exception must be tested against None, not truthiness.
    attached = getattr(exc, "response", None)
    failing_response = attached if attached is not None else response
    if failing_response is not None:
        details.extend(
            [
                f"HTTP Status: {failing_response.status_code} {failing_response.reason}",
                f"Resolved URL: {failing_response.url}",
            ]
        )
    trace = "".join(traceback.format_exception_only(type(exc), exc)).strip()
    if trace:
        details.append(f"Exception: {trace}")
    return "\n".join(details)
+91
View File
@@ -0,0 +1,91 @@
from __future__ import annotations
from email.utils import format_datetime
from io import StringIO
from xml.sax.saxutils import escape
import datetime as dt
from page_importer.dates import parse_datetime
from page_importer.models import ScrapedPost
def build_wxr(posts: list[ScrapedPost], channel_title: str = "Imported Content") -> str:
    """Serialise scraped posts as a WordPress WXR 1.2 document.

    Args:
        posts: The posts to export, one ``<item>`` each, in the given order.
        channel_title: Text for the channel ``<title>`` element.

    Returns:
        The complete XML document as a string. Titles and URLs are
        XML-escaped; author, body, dates, status, post type and term names
        are wrapped in CDATA sections. Posts without a parseable publish
        date get empty ``wp:post_date`` values and the generation time as
        their ``pubDate``.
    """
    generated_at = dt.datetime.now(dt.timezone.utc)
    chunks: list[str] = [
        '<?xml version="1.0" encoding="UTF-8" ?>\n',
        '<rss version="2.0" xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/" '
        'xmlns:content="http://purl.org/rss/1.0/modules/content/" '
        'xmlns:wfw="http://wellformedweb.org/CommentAPI/" '
        'xmlns:dc="http://purl.org/dc/elements/1.1/" '
        'xmlns:wp="http://wordpress.org/export/1.2/">\n',
        "<channel>\n",
        f"<title>{escape(channel_title)}</title>\n",
        "<link>http://localhost/</link>\n",
        "<description>Generated by Page Importer</description>\n",
        f"<pubDate>{format_datetime(generated_at)}</pubDate>\n",
        "<language>en-US</language>\n",
        "<wp:wxr_version>1.2</wp:wxr_version>\n",
    ]

    for entry in posts:
        local_date, gmt_date, item_pub_date = _resolve_post_dates(entry.publish_date, generated_at)
        chunks.extend(
            (
                "<item>\n",
                f"<title>{escape(entry.title)}</title>\n",
                f"<link>{escape(entry.source_url)}</link>\n",
                f"<pubDate>{format_datetime(item_pub_date)}</pubDate>\n",
                f"<dc:creator>{cdata(entry.author or 'importer')}</dc:creator>\n",
                f'<guid isPermaLink="false">{escape(entry.source_url)}</guid>\n',
                "<description></description>\n",
                f"<content:encoded>{cdata(entry.body_html)}</content:encoded>\n",
                f"<excerpt:encoded>{cdata('')}</excerpt:encoded>\n",
                f"<wp:post_date>{cdata(local_date)}</wp:post_date>\n",
                f"<wp:post_date_gmt>{cdata(gmt_date)}</wp:post_date_gmt>\n",
                "<wp:comment_status><![CDATA[closed]]></wp:comment_status>\n",
                "<wp:ping_status><![CDATA[closed]]></wp:ping_status>\n",
                "<wp:post_name><![CDATA[]]></wp:post_name>\n",
                f"<wp:status>{cdata(entry.status)}</wp:status>\n",
                "<wp:post_parent>0</wp:post_parent>\n",
                "<wp:menu_order>0</wp:menu_order>\n",
                f"<wp:post_type>{cdata(entry.post_type or 'post')}</wp:post_type>\n",
                "<wp:post_password><![CDATA[]]></wp:post_password>\n",
                "<wp:is_sticky>0</wp:is_sticky>\n",
            )
        )
        # Categories first, then tags, each as a <category> element in its own domain.
        for term_domain, term_names in (("category", entry.categories), ("post_tag", entry.tags)):
            for term_name in term_names:
                chunks.append(
                    f'<category domain="{term_domain}" nicename="{escape(slugify(term_name))}">'
                    f"{cdata(term_name)}</category>\n"
                )
        chunks.append("</item>\n")

    chunks.append("</channel>\n</rss>\n")
    return "".join(chunks)
def slugify(value: str) -> str:
    """Turn a term name into a lowercase, hyphen-separated slug.

    Alphanumeric characters are lower-cased and kept; every other character
    acts as a separator. Consecutive separators collapse into a single
    hyphen and leading/trailing hyphens are dropped, so ``"News & Events"``
    becomes ``"news-events"`` rather than ``"news---events"``. A falsy value
    yields an empty string.
    """
    hyphenated = "".join(ch.lower() if ch.isalnum() else "-" for ch in value or "")
    # Splitting on "-" and dropping empty segments both trims the ends and
    # collapses runs of separators in one pass.
    return "-".join(segment for segment in hyphenated.split("-") if segment)
def cdata(value: str) -> str:
    """Wrap ``value`` in a CDATA section.

    Any ``]]>`` inside the value would end the section early, so each
    occurrence is split across two adjacent CDATA sections. A falsy value
    produces an empty section.
    """
    text = value or ""
    safe_text = "]]]]><![CDATA[>".join(text.split("]]>"))
    return "<![CDATA[" + safe_text + "]]>"
def _resolve_post_dates(value: str, fallback: dt.datetime) -> tuple[str, str, dt.datetime]:
    """Derive the local date, GMT date and pubDate for one post.

    Returns a ``(local, gmt, pub_date)`` tuple:

    * unparseable ``value``: two empty strings and ``fallback``;
    * naive timestamp: the same wall-clock string for both dates, with the
      timestamp treated as UTC for ``pub_date``;
    * aware timestamp: the original wall-clock string, the UTC-converted
      string, and the UTC datetime.
    """
    moment = parse_datetime(value)
    if moment is None:
        return "", "", fallback

    is_naive = moment.tzinfo is None or moment.utcoffset() is None
    wall_clock = _format_wp_date(moment)
    if is_naive:
        return wall_clock, wall_clock, moment.replace(tzinfo=dt.timezone.utc)

    as_utc = moment.astimezone(dt.timezone.utc)
    return wall_clock, _format_wp_date(as_utc), as_utc
def _format_wp_date(value: dt.datetime) -> str:
return value.replace(tzinfo=None).strftime("%Y-%m-%d %H:%M:%S")
+4
View File
@@ -0,0 +1,4 @@
streamlit>=1.43,<2
requests>=2.32,<3
beautifulsoup4>=4.12,<5
python-dateutil>=2.9,<3
+79
View File
@@ -0,0 +1,79 @@
from __future__ import annotations
import unittest
from bs4 import BeautifulSoup
from page_importer.dates import normalize_date
from page_importer.scraper import CATEGORY_SELECTORS, TAG_SELECTORS, extract_terms, sanitize_html
from page_importer.wxr import build_wxr
from page_importer.models import ScrapedPost
class DateNormalizationTests(unittest.TestCase):
    """Checks for ``normalize_date``."""

    def test_preserves_timezone_offset_in_normalized_value(self) -> None:
        """An ISO timestamp with an offset keeps that offset after normalisation."""
        source_value = "2024-05-01T09:30:00-07:00"
        expected = "2024-05-01 09:30:00-07:00"
        self.assertEqual(normalize_date(source_value), expected)
class WxrSerializationTests(unittest.TestCase):
    """Checks on the XML document produced by ``build_wxr``."""

    def test_writes_local_and_gmt_dates_from_offset_timestamp(self) -> None:
        """An offset timestamp yields the local date, the GMT date and a UTC pubDate."""
        post = ScrapedPost(
            source_url="https://example.com/post",
            title="Example",
            body_html="<p>Body</p>",
            publish_date="2024-05-01 09:30:00-07:00",
            success=True,
        )
        document = build_wxr([post])
        expected_fragments = (
            "<wp:post_date><![CDATA[2024-05-01 09:30:00]]></wp:post_date>",
            "<wp:post_date_gmt><![CDATA[2024-05-01 16:30:00]]></wp:post_date_gmt>",
            "<pubDate>Wed, 01 May 2024 16:30:00 +0000</pubDate>",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, document)

    def test_splits_cdata_terminators_in_content(self) -> None:
        """A ``]]>`` in the body or author is split across two CDATA sections."""
        post = ScrapedPost(
            source_url="https://example.com/post",
            title="Example",
            body_html="<p>alpha ]]> omega</p>",
            author="Jane ]]> Doe",
            success=True,
        )
        document = build_wxr([post])
        for fragment in ("alpha ]]]]><![CDATA[> omega", "Jane ]]]]><![CDATA[> Doe"):
            self.assertIn(fragment, document)
class HtmlSanitizationTests(unittest.TestCase):
    """Checks for ``sanitize_html``."""

    def test_removes_inline_event_handlers_and_script_uris(self) -> None:
        """Event-handler attributes and ``javascript:`` links do not survive sanitising."""
        dirty_markup = (
            '<div onclick="alert(1)"><a href="javascript:alert(1)">x</a><img src="x" onerror="alert(1)"></div>'
        )
        sanitized = sanitize_html(dirty_markup)
        for forbidden in ("onclick", "onerror", "javascript:"):
            self.assertNotIn(forbidden, sanitized)
class TaxonomySelectorTests(unittest.TestCase):
    """Checks that category and tag selectors stay separate."""

    def test_drupal_tag_field_is_not_treated_as_category(self) -> None:
        """A Drupal ``field-tags`` block yields a tag and no category."""
        markup = '<div class="field--name-field-tags"><a href="/tags/example">Example Tag</a></div>'
        soup = BeautifulSoup(markup, "html.parser")
        categories = extract_terms(soup, CATEGORY_SELECTORS)
        tags = extract_terms(soup, TAG_SELECTORS)
        self.assertEqual(categories, [])
        self.assertEqual(tags, ["Example Tag"])
# Run the test cases in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
+110
View File
@@ -0,0 +1,110 @@
# WDW Sitemap And Import Tools
This repository combines two internal tools into one web application and one Docker image:
- `Sitemap Generator`
- `Page Importer`
The application uses Streamlit and presents both tools behind a single URL with two tabs at the top of the page.
## What It Does
### Sitemap Generator
- Crawls a site from a starting URL
- Discovers URLs from page links and XML sitemaps
- Exports a sitemap CSV
- Saves crawl state and logs so a crawl can be resumed later
### Page Importer
- Reads a CSV of submitted URLs
- Scrapes page content
- Lets you review the extracted content
- Exports a WordPress WXR XML import file
## Project Layout
- `app.py`: top-level Streamlit app with both tabs
- `requirements.txt`: shared Python dependencies for the combined app
- `Dockerfile`: single image for the combined tool
- `.gitea/workflows/docker-image.yml`: Gitea Actions workflow for Docker builds
- `Sitemap Builder/`: sitemap crawler logic
- `Page Importer/`: WordPress import logic
## Run Locally
### Linux or macOS
```bash
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
streamlit run app.py
```
### Windows PowerShell
```powershell
python -m venv .venv
.venv\Scripts\Activate.ps1
pip install -r requirements.txt
streamlit run app.py
```
Then open:
```text
http://localhost:8501
```
## Docker
Build the image:
```bash
docker build -t wdw-sitemap-and-importer .
```
Run the container:
```bash
docker run --rm -p 8501:8501 -v wdw-tools-data:/data wdw-sitemap-and-importer
```
Then open:
```text
http://localhost:8501
```
The mounted `/data` volume stores sitemap CSV files, crawl state files, and crawl logs so sitemap jobs can survive container restarts.
## Gitea Automation
The workflow file is:
```text
.gitea/workflows/docker-image.yml
```
It runs on pushes to `main` and on manual workflow dispatch.
The workflow always builds the Docker image. If these secrets are configured in Gitea, it also logs in and pushes the image to your registry:
- `REGISTRY_URL`
- `REGISTRY_USERNAME`
- `REGISTRY_PASSWORD`
Published tags:
- `${REGISTRY}/wdw-sitemap-and-importer:<commit-sha>`
- `${REGISTRY}/wdw-sitemap-and-importer:latest`
If the registry secrets are not configured, the workflow still performs the build as validation but skips the push steps.
## Notes
- Sitemap output files are written under `/data` in Docker.
- The sitemap crawler can resume previous runs when a matching crawl state file exists.
- The importer keeps its existing scraping and WordPress export behavior, but it now runs inside the shared interface instead of as a separate app.
+80
View File
@@ -0,0 +1,80 @@
# Sitemap Builder
This folder contains the sitemap crawler used by the combined web application in the repository root.
The crawler can still be used directly from Python, but the primary supported experience is now the shared Streamlit interface in the root project:
```text
../app.py
```
## Current Role In The Combined App
The root application uses this module to:
- crawl a site from a submitted starting URL
- discover internal URLs from HTML links and XML sitemaps
- export a sitemap CSV
- save crawl state and crawl logs for resume support
## Output
The crawler writes:
- a CSV file
- a sidecar crawl state file ending in `.crawlstate.json`
- a crawl log file ending in `.crawl.log`
The CSV contains these columns:
- `URL`
- `Title`
- `Canonical URL`
- `Type`
## Standalone CLI Usage
Interactive mode:
```bash
python3 sitemap_builder.py
```
Command line mode:
```bash
python3 sitemap_builder.py https://example.com -o ./sitemap.csv
```
On Windows:
```powershell
python .\sitemap_builder.py https://example.com -o .\sitemap.csv
```
## Useful Options
```bash
python3 sitemap_builder.py https://example.com --max-pages 20000 --delay 0.25 --include-subdomains
```
- `--max-pages`: stop after the given number of visited pages. Default: `10000`
- `--delay`: seconds to wait before each request to reduce load on the site. Default: `0`
- `--timeout`: request timeout in seconds. Default: `15`
- `--include-subdomains`: crawl subdomains of the starting host
- `--include-documents`: include document links such as PDF, CSV, DOC, DOCX, XLSX, and similar files
- `--workers`: number of worker threads to use. Set `1` to disable multithreading
- `--save-every`: save progress after every N pages. Default: `25`
- `--resume`: resume from an existing state file
- `--fresh`: ignore the existing state file and start over
## Discovery And Behavior
- The crawler checks `robots.txt` for sitemap references and also tries `/sitemap.xml`
- XML sitemap URLs are added to the crawl queue before page crawling begins
- HTML pages store page title and canonical URL in the CSV when available
- On Windows CLI runs, `P` pauses, `R` resumes, and `Q` stops cleanly and saves progress
## Recommendation
For normal use, run the root application or Docker container instead of calling this script directly. That is now the intended user interface for this repository.
+947
View File
@@ -0,0 +1,947 @@
from __future__ import annotations
import argparse
import csv
import json
import os
import sys
import time
import xml.etree.ElementTree as ET
from collections import deque
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
from dataclasses import dataclass
from html.parser import HTMLParser
from pathlib import Path
from typing import Iterable
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin, urlsplit, urlunsplit
from urllib.request import Request, urlopen
if os.name == "nt":
import msvcrt
# User-Agent string used for crawler requests unless the caller supplies another.
DEFAULT_USER_AGENT = "SitemapBuilder/1.0 (+local script)"
# CSV file name used when no output path is given (see the --output help text).
DEFAULT_OUTPUT_NAME = "sitemap.csv"
# Appended to the CSV path to name the crawl-state sidecar file.
DEFAULT_STATE_SUFFIX = ".crawlstate.json"
# Appended to the CSV path to name the crawl log file.
DEFAULT_LOG_SUFFIX = ".crawl.log"
# Default cap on the number of visited pages per crawl.
DEFAULT_MAX_PAGES = 10000
# NOTE(review): presumably the extra page budget granted when resuming a
# crawl; its use is not visible in this part of the file - confirm.
DEFAULT_RESUME_PAGE_INCREMENT = 10000
# Default number of processed pages between progress saves.
DEFAULT_SAVE_EVERY = 25
# Default number of worker threads.
DEFAULT_WORKERS = 8
# Directory containing this script.
SCRIPT_DIR = Path(__file__).resolve().parent
# URL path suffixes that mark a link as a document rather than a page.
DOCUMENT_EXTENSIONS = {
    ".pdf",
    ".csv",
    ".doc",
    ".docx",
    ".xls",
    ".xlsx",
    ".ppt",
    ".pptx",
    ".txt",
    ".rtf",
    ".zip",
    ".xml",
    ".json",
}
@dataclass
class CrawlResult:
    """Outcome of fetching a single URL."""

    # The URL that was requested.
    url: str
    # Raw href values of the anchors found on the page, not yet resolved or normalised.
    links: list[str]
    # Text of the page title, or "" when none was found.
    title: str = ""
    # Normalised absolute canonical URL declared by the page, or "" when absent.
    canonical_url: str = ""
    # True when the response was not HTML/XHTML and its body was not parsed.
    skipped: bool = False
    # Short error description when the fetch failed, otherwise None.
    error: str | None = None
@dataclass
class CrawlState:
    """Mutable crawl progress; saved to and restored from the JSON state file."""

    # Normalised URL the crawl started from.
    start_url: str
    # Whether subdomains of the start host are in scope.
    include_subdomains: bool
    # Whether document links are recorded in the output.
    include_documents: bool
    # URLs that have already been taken off the queue for fetching.
    visited: set[str]
    # Every URL that has ever been placed on the queue (used to avoid re-queuing).
    queued: set[str]
    # URLs waiting to be fetched, in FIFO order.
    queue: deque[str]
    # Output rows keyed by URL; each holds "title", "canonical_url" and "type".
    records: dict[str, dict[str, str]]
    # Maps a fetched URL to the canonical URL its page declared.
    alias_to_canonical: dict[str, str]
    # One {"url": ..., "error": ...} entry per failed fetch.
    errors: list[dict[str, str]]
    # Number of responses skipped because they were not HTML.
    skipped_count: int
    # Running count of URLs added through XML sitemap discovery.
    discovered_from_sitemaps: int
@dataclass
class RuntimeControl:
    """Pause/stop flags toggled by keyboard input while a crawl is running."""

    # True while the user has paused the crawl.
    paused: bool = False
    # True once the user has asked the crawl to stop.
    stop_requested: bool = False
@dataclass
class CrawlRunResult:
    """Summary returned by ``run_crawl``."""

    # Crawl state associated with the run.
    state: CrawlState
    # Whether the run ended because the user requested a stop.
    user_stopped: bool
    # NOTE(review): the three paths below presumably point at the CSV, the
    # state sidecar and the log file of the run - confirm against run_crawl.
    output_path: Path
    state_path: Path
    log_path: Path
    # NOTE(review): presumably the effective page limit and worker count used
    # for the run - confirm against run_crawl.
    max_pages: int
    workers: int
class HTMLPageParser(HTMLParser):
def __init__(self) -> None:
super().__init__()
self.links: list[str] = []
self.title_parts: list[str] = []
self.in_title = False
self.canonical_href = ""
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
attrs_map = {key.lower(): value for key, value in attrs}
lower_tag = tag.lower()
if lower_tag == "a":
href = attrs_map.get("href")
if href:
self.links.append(href)
if lower_tag == "title":
self.in_title = True
if lower_tag == "link":
rel = (attrs_map.get("rel") or "").lower()
href = attrs_map.get("href") or ""
if "canonical" in rel and href:
self.canonical_href = href
def handle_endtag(self, tag: str) -> None:
if tag.lower() == "title":
self.in_title = False
def handle_data(self, data: str) -> None:
if self.in_title:
self.title_parts.append(data)
@property
def title(self) -> str:
return " ".join(part.strip() for part in self.title_parts if part.strip()).strip()
def normalize_url(url: str) -> str:
    """Return a canonical form of ``url`` for comparison and de-duplication.

    The scheme and host are lower-cased, a missing scheme defaults to
    ``https``, an empty path becomes ``/``, trailing slashes are removed
    from any other path, and the fragment is dropped. The query string is
    kept as is.

    A bare host such as ``example.com/docs`` has neither scheme nor network
    location when parsed directly (the whole string is treated as a path),
    which would produce a host-less URL. Such input is re-parsed with an
    ``https://`` prefix so the host ends up where it belongs.
    """
    cleaned = url.strip()
    parts = urlsplit(cleaned)
    if not parts.scheme and not parts.netloc and parts.path and not parts.path.startswith("/"):
        parts = urlsplit(f"https://{cleaned}")
    scheme = parts.scheme.lower() or "https"
    netloc = parts.netloc.lower()
    path = parts.path or "/"
    if path != "/" and path.endswith("/"):
        path = path.rstrip("/")
    return urlunsplit((scheme, netloc, path, parts.query, ""))
def is_http_url(url: str) -> bool:
    """Return True when ``url`` uses the ``http`` or ``https`` scheme."""
    scheme = urlsplit(url).scheme
    return scheme == "http" or scheme == "https"
def build_allowed_hosts(start_url: str) -> set[str]:
    """Return a one-element set holding the lower-cased host part of ``start_url``."""
    start_host = urlsplit(start_url).netloc.lower()
    return {start_host}
def should_visit(url: str, allowed_hosts: set[str], include_subdomains: bool) -> bool:
    """Decide whether ``url`` is in scope for the crawl.

    Only http(s) URLs qualify. The host must be one of ``allowed_hosts``
    or, when ``include_subdomains`` is set, a subdomain of one of them.
    """
    if not is_http_url(url):
        return False

    host = urlsplit(url).netloc.lower()
    if not include_subdomains:
        return host in allowed_hosts

    for allowed in allowed_hosts:
        if host == allowed or host.endswith(f".{allowed}"):
            return True
    return False
def is_document_url(url: str) -> bool:
    """Return True when the URL path ends in one of ``DOCUMENT_EXTENSIONS``."""
    extension = Path(urlsplit(url).path).suffix.lower()
    return extension in DOCUMENT_EXTENSIONS
def should_record_url(url: str) -> bool:
    """Return False for URLs whose entire query string is ``page=1``.

    The comparison ignores case; every other URL is recorded.
    """
    is_first_page_query = urlsplit(url).query.lower() == "page=1"
    return not is_first_page_query
def get_state_path(output_path: Path) -> Path:
    """Return the crawl-state sidecar path: the output path plus the state suffix."""
    combined_suffix = f"{output_path.suffix}{DEFAULT_STATE_SUFFIX}"
    return output_path.with_suffix(combined_suffix)
def get_log_path(output_path: Path) -> Path:
    """Return the crawl log path: the output path plus the log suffix."""
    combined_suffix = f"{output_path.suffix}{DEFAULT_LOG_SUFFIX}"
    return output_path.with_suffix(combined_suffix)
def log_message(log_path: Path, message: str) -> None:
    """Append one timestamped line to the log file.

    The parent directory is created when missing. Each line has the form
    ``[YYYY-MM-DD HH:MM:SS] message``, using local time.
    """
    log_path.parent.mkdir(parents=True, exist_ok=True)
    stamped_line = "[{}] {}\n".format(time.strftime("%Y-%m-%d %H:%M:%S"), message)
    with log_path.open("a", encoding="utf-8") as handle:
        handle.write(stamped_line)
def resolve_alias(url: str, alias_to_canonical: dict[str, str]) -> str:
    """Follow alias-to-canonical mappings from ``url`` until they end.

    Each URL is followed at most once, so a cycle in the mapping stops the
    walk instead of looping forever.
    """
    current = url
    followed: set[str] = set()
    while True:
        if current not in alias_to_canonical or current in followed:
            return current
        followed.add(current)
        current = alias_to_canonical[current]
def register_record(
    state: CrawlState,
    url: str,
    record_type: str,
    title: str = "",
    canonical_url: str = "",
) -> None:
    """Create or update the output record for ``url``.

    URLs rejected by ``should_record_url`` are ignored. For the rest, an
    existing type is kept unless it is empty or a "document" that is now
    known to be a "page". A title or canonical URL only fills a slot that is
    still empty; it never overwrites an existing value.
    """
    if not should_record_url(url):
        return

    entry = state.records.get(url, {"title": "", "canonical_url": "", "type": record_type})

    known_type = entry.get("type")
    if not known_type:
        entry["type"] = record_type
    elif known_type == "document" and record_type == "page":
        entry["type"] = "page"

    if title and not entry.get("title"):
        entry["title"] = title
    if canonical_url and not entry.get("canonical_url"):
        entry["canonical_url"] = canonical_url

    # Make sure both keys exist even when the stored record lacked them.
    entry.setdefault("canonical_url", canonical_url)
    entry.setdefault("title", title)

    state.records[url] = entry
def save_state(state: CrawlState, state_path: Path, output_path: Path) -> None:
    """Write the crawl state to ``state_path`` as indented JSON.

    The parent directory is created when missing. Sets are stored as sorted
    lists and the queue as a list in its current order; the save time and
    the output path are stored alongside the state fields.
    """
    state_path.parent.mkdir(parents=True, exist_ok=True)
    snapshot = dict(
        start_url=state.start_url,
        include_subdomains=state.include_subdomains,
        include_documents=state.include_documents,
        visited=sorted(state.visited),
        queued=sorted(state.queued),
        queue=list(state.queue),
        records=state.records,
        alias_to_canonical=state.alias_to_canonical,
        errors=state.errors,
        skipped_count=state.skipped_count,
        discovered_from_sitemaps=state.discovered_from_sitemaps,
        saved_at=time.strftime("%Y-%m-%d %H:%M:%S"),
        output_path=str(output_path),
    )
    serialized = json.dumps(snapshot, indent=2)
    state_path.write_text(serialized, encoding="utf-8")
def load_state(state_path: Path) -> CrawlState:
    """Rebuild a ``CrawlState`` from a JSON state file.

    ``start_url`` is required; every other field falls back to an empty or
    zero value when it is missing from the file.
    """
    data = json.loads(state_path.read_text(encoding="utf-8"))
    read = data.get
    return CrawlState(
        start_url=data["start_url"],
        include_subdomains=bool(read("include_subdomains", False)),
        include_documents=bool(read("include_documents", False)),
        visited=set(read("visited", [])),
        queued=set(read("queued", [])),
        queue=deque(read("queue", [])),
        records=dict(read("records", {})),
        alias_to_canonical=dict(read("alias_to_canonical", {})),
        errors=list(read("errors", [])),
        skipped_count=int(read("skipped_count", 0)),
        discovered_from_sitemaps=int(read("discovered_from_sitemaps", 0)),
    )
def initialize_state(start_url: str, include_subdomains: bool, include_documents: bool) -> CrawlState:
    """Create the state for a new crawl, with the normalised start URL as the only queued entry."""
    seed = normalize_url(start_url)
    return CrawlState(
        start_url=seed,
        include_subdomains=include_subdomains,
        include_documents=include_documents,
        visited=set(),
        queued={seed},
        queue=deque((seed,)),
        records={},
        alias_to_canonical={},
        errors=[],
        skipped_count=0,
        discovered_from_sitemaps=0,
    )
def prompt_if_missing(value: str | None, prompt_text: str) -> str:
if value:
return value
return input(prompt_text).strip()
def prompt_yes_no(prompt_text: str, default: bool) -> bool:
    """Ask a yes/no question on the console.

    An empty answer returns ``default``. Otherwise only ``y`` and ``yes``
    (in any case) count as yes.
    """
    hint = "Y/n" if default else "y/N"
    reply = input(f"{prompt_text} [{hint}]: ").strip().lower()
    if reply:
        return reply in {"y", "yes"}
    return default
def write_csv(records: dict[str, dict[str, str]], output_path: Path) -> None:
    """Write the sitemap CSV, one row per recorded URL, sorted by URL.

    The parent directory is created when missing and any existing file is
    overwritten. Missing record fields are written as empty cells.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["URL", "Title", "Canonical URL", "Type"])
        writer.writerows(
            [
                url,
                records[url].get("title", ""),
                records[url].get("canonical_url", ""),
                records[url].get("type", ""),
            ]
            for url in sorted(records)
        )
def fetch_text(url: str, timeout: float, user_agent: str, accept: str) -> tuple[str | None, str | None]:
    """Download ``url`` and return ``(text, None)`` or ``(None, error)``.

    The body is decoded with the charset from the response headers, or
    UTF-8 when none is given; undecodable bytes are replaced. Failures are
    reported as a short message instead of being raised.
    """
    headers = {"User-Agent": user_agent, "Accept": accept}
    request = Request(url, headers=headers)
    try:
        with urlopen(request, timeout=timeout) as response:
            payload = response.read()
            charset = response.headers.get_content_charset() or "utf-8"
            text = payload.decode(charset, errors="replace")
    except HTTPError as exc:
        failure = f"HTTP {exc.code}"
    except URLError as exc:
        failure = str(exc.reason)
    except TimeoutError:
        failure = "request timed out"
    except Exception as exc:  # pragma: no cover
        failure = str(exc)
    else:
        return text, None
    return None, failure
def fetch_page(url: str, timeout: float, user_agent: str) -> CrawlResult:
    """Fetch one page and extract its links, title and canonical URL.

    Returns:
        A ``CrawlResult`` that is:

        * marked ``skipped`` when the response is not HTML/XHTML;
        * carrying ``error`` when the request fails;
        * otherwise filled with the raw anchor hrefs, the title and the
          normalised absolute canonical URL (``""`` when the page declares
          none or declares one that cannot be parsed).
    """
    request = Request(
        url,
        headers={
            "User-Agent": user_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        },
    )
    try:
        with urlopen(request, timeout=timeout) as response:
            content_type = response.headers.get("Content-Type", "").lower()
            if "text/html" not in content_type and "application/xhtml+xml" not in content_type:
                return CrawlResult(url=url, links=[], skipped=True)
            content = response.read().decode(response.headers.get_content_charset() or "utf-8", errors="replace")
    except HTTPError as exc:
        return CrawlResult(url=url, links=[], error=f"HTTP {exc.code}")
    except URLError as exc:
        return CrawlResult(url=url, links=[], error=str(exc.reason))
    except TimeoutError:
        return CrawlResult(url=url, links=[], error="request timed out")
    except Exception as exc:  # pragma: no cover
        return CrawlResult(url=url, links=[], error=str(exc))

    parser = HTMLPageParser()
    parser.feed(content)

    canonical_url = ""
    if parser.canonical_href:
        try:
            canonical_url = normalize_url(urljoin(url, parser.canonical_href))
        except ValueError:
            # The URL parser rejects some malformed hosts (for example
            # unbalanced IPv6 brackets). A broken canonical hint must not
            # discard the page, so it is treated as absent.
            canonical_url = ""

    return CrawlResult(
        url=url,
        links=parser.links,
        title=parser.title,
        canonical_url=canonical_url,
    )
def fetch_page_with_delay(url: str, timeout: float, user_agent: str, delay: float) -> CrawlResult:
    """Sleep for ``delay`` seconds when it is positive, then fetch the page."""
    needs_pause = delay > 0
    if needs_pause:
        time.sleep(delay)
    result = fetch_page(url, timeout=timeout, user_agent=user_agent)
    return result
def print_progress(state: CrawlState, max_pages: int, current_url: str) -> None:
    """Print a one-line progress report for the URL about to be fetched."""
    visited_total = len(state.visited)
    recorded_total = len(state.records)
    waiting_total = len(state.queue)
    report = (
        f"[{visited_total}/{max_pages}] Found {recorded_total} URL(s), "
        f"queued {waiting_total} more: {current_url}"
    )
    print(report)
def poll_runtime_control(control: RuntimeControl, log_path: Path) -> None:
    """Apply any pending pause/resume/stop key presses to ``control``.

    Does nothing unless ``os.name`` is ``"nt"``; ``msvcrt`` is only imported
    on that platform. All key presses currently waiting are consumed:
    ``p`` pauses, ``r`` resumes a paused crawl and ``q`` requests a stop.
    Keys are compared case-insensitively and each state change is logged.
    """
    if os.name != "nt":
        return
    # Drain every buffered key press so none is left for the next poll.
    while msvcrt.kbhit():
        key = msvcrt.getwch().lower()
        if key == "p" and not control.paused:
            control.paused = True
            print("Paused. Press R to resume or Q to stop.")
            log_message(log_path, "Crawl paused by user")
        elif key == "r" and control.paused:
            control.paused = False
            print("Resuming crawl.")
            log_message(log_path, "Crawl resumed by user")
        elif key == "q":
            control.stop_requested = True
            log_message(log_path, "Stop requested by user")
def discover_robots_sitemaps(
    start_url: str,
    timeout: float,
    user_agent: str,
    log_path: Path,
) -> set[str]:
    """Return the normalised sitemap URLs listed in the site's robots.txt.

    An unavailable robots.txt is logged and yields an empty set. Lines are
    recognised by a case-insensitive ``sitemap:`` prefix.
    """
    robots_url = normalize_url(urljoin(start_url, "/robots.txt"))
    content, error = fetch_text(robots_url, timeout, user_agent, "text/plain,*/*;q=0.8")
    if error:
        log_message(log_path, f"robots.txt not available at {robots_url}: {error}")
        return set()

    found: set[str] = set()
    for line in content.splitlines():
        if not line.lower().startswith("sitemap:"):
            continue
        # Split only on the first colon so the colon inside the URL survives.
        _directive, _colon, remainder = line.partition(":")
        raw_url = remainder.strip()
        if raw_url:
            found.add(normalize_url(raw_url))

    if found:
        log_message(log_path, f"Discovered {len(found)} sitemap reference(s) from robots.txt")
    return found
def xml_local_name(tag: str) -> str:
    """Return ``tag`` without its ``{namespace}`` prefix, if it has one."""
    # With no "}" in the tag, rpartition puts the whole tag in the last slot.
    return tag.rpartition("}")[2]
def parse_sitemap_urls(
    sitemap_url: str,
    allowed_hosts: set[str],
    include_subdomains: bool,
    timeout: float,
    user_agent: str,
    log_path: Path,
    seen_sitemaps: set[str],
) -> set[str]:
    """Collect in-scope page URLs from a sitemap, following sitemap indexes.

    Args:
        sitemap_url: Location of a ``urlset`` or ``sitemapindex`` document.
        allowed_hosts: Hosts that are in scope.
        include_subdomains: Whether subdomains of those hosts are in scope.
        timeout: Request timeout in seconds.
        user_agent: User-Agent header value.
        log_path: Log file for fetch/parse problems.
        seen_sitemaps: Sitemaps already processed; updated in place so each
            sitemap is fetched once, even when indexes reference each other.

    Returns:
        The normalised URLs found. Out-of-scope sitemaps, fetch failures,
        parse failures and unsupported root elements yield an empty set
        (the last three are logged). A ``<loc>`` value that cannot be parsed
        as a URL is logged and skipped instead of aborting discovery.
    """
    normalized_sitemap = normalize_url(sitemap_url)
    if normalized_sitemap in seen_sitemaps:
        return set()
    seen_sitemaps.add(normalized_sitemap)

    if not should_visit(normalized_sitemap, allowed_hosts, include_subdomains):
        return set()

    content, error = fetch_text(normalized_sitemap, timeout, user_agent, "application/xml,text/xml;q=0.9,*/*;q=0.8")
    if error:
        log_message(log_path, f"Sitemap fetch failed for {normalized_sitemap}: {error}")
        return set()

    try:
        root = ET.fromstring(content)
    except ET.ParseError as exc:
        log_message(log_path, f"Sitemap parse failed for {normalized_sitemap}: {exc}")
        return set()

    tag_name = xml_local_name(root.tag)
    if tag_name not in {"urlset", "sitemapindex"}:
        log_message(log_path, f"Unsupported sitemap format at {normalized_sitemap}")
        return set()

    discovered_urls: set[str] = set()
    for element in root.findall(".//*"):
        if xml_local_name(element.tag) != "loc" or not element.text:
            continue
        try:
            location = normalize_url(element.text.strip())
        except ValueError as exc:
            # The URL parser rejects some malformed hosts; one bad entry
            # must not discard the rest of the sitemap.
            log_message(log_path, f"Skipping malformed <loc> in {normalized_sitemap}: {exc}")
            continue

        if tag_name == "urlset":
            if should_visit(location, allowed_hosts, include_subdomains):
                discovered_urls.add(location)
        else:
            # sitemapindex: each <loc> points at another sitemap.
            discovered_urls.update(
                parse_sitemap_urls(
                    location,
                    allowed_hosts,
                    include_subdomains,
                    timeout,
                    user_agent,
                    log_path,
                    seen_sitemaps,
                )
            )
    return discovered_urls
def seed_from_xml_sitemaps(
    state: CrawlState,
    timeout: float,
    user_agent: str,
    log_path: Path,
) -> None:
    """Add URLs discovered through XML sitemaps to the crawl state.

    Sitemap candidates are the ones referenced by robots.txt plus
    ``/sitemap.xml`` on the start host. Discovered page URLs are recorded
    and queued; document URLs are recorded only when documents are included
    and are never queued. The number of additions is logged and added to
    ``state.discovered_from_sitemaps``.
    """
    allowed_hosts = build_allowed_hosts(state.start_url)
    sitemap_candidates = discover_robots_sitemaps(state.start_url, timeout, user_agent, log_path)
    # Always try the conventional location, even when robots.txt lists none.
    sitemap_candidates.add(normalize_url(urljoin(state.start_url, "/sitemap.xml")))
    # Shared across candidates so a sitemap reachable by several routes is fetched once.
    seen_sitemaps: set[str] = set()
    discovered_urls: set[str] = set()
    for sitemap_url in sitemap_candidates:
        discovered_urls.update(
            parse_sitemap_urls(
                sitemap_url,
                allowed_hosts,
                state.include_subdomains,
                timeout,
                user_agent,
                log_path,
                seen_sitemaps,
            )
        )
    added = 0
    for url in discovered_urls:
        canonical_url = resolve_alias(url, state.alias_to_canonical)
        if is_document_url(canonical_url):
            if state.include_documents:
                register_record(state, canonical_url, "document")
                added += 1
            # Documents are never queued for fetching.
            continue
        register_record(state, canonical_url, "page")
        if canonical_url not in state.visited and canonical_url not in state.queued:
            state.queue.append(canonical_url)
            state.queued.add(canonical_url)
            added += 1
    state.discovered_from_sitemaps += added
    log_message(log_path, f"Added {added} URL(s) from XML sitemap discovery")
def process_crawl_result(
    state: CrawlState,
    result: CrawlResult,
    allowed_hosts: set[str],
    log_path: Path,
) -> None:
    """Fold the outcome of one fetch into the crawl state.

    * Errors are appended to ``state.errors`` and logged.
    * Skipped (non-HTML) responses are counted and recorded as documents.
    * For HTML pages, an in-scope canonical URL is registered as an alias
      target, recorded and queued; the page itself is recorded; and every
      in-scope link is recorded, with page links also queued.

    A link that cannot be parsed as a URL is logged and skipped, so one
    malformed ``href`` cannot abort the whole crawl.
    """
    if result.error:
        state.errors.append({"url": result.url, "error": result.error})
        log_message(log_path, f"Error fetching {result.url}: {result.error}")
        return
    if result.skipped:
        state.skipped_count += 1
        register_record(state, result.url, "document")
        return

    canonical_url = ""
    if result.canonical_url and should_visit(result.canonical_url, allowed_hosts, state.include_subdomains):
        canonical_url = resolve_alias(result.canonical_url, state.alias_to_canonical)
        state.alias_to_canonical[result.url] = canonical_url
        register_record(state, canonical_url, "page", title=result.title, canonical_url=canonical_url)
        if canonical_url not in state.visited and canonical_url not in state.queued:
            state.queue.append(canonical_url)
            state.queued.add(canonical_url)
    register_record(state, result.url, "page", title=result.title, canonical_url=canonical_url)

    for raw_link in result.links:
        try:
            absolute = normalize_url(urljoin(result.url, raw_link))
        except ValueError as exc:
            # The URL parser rejects some malformed hosts, such as
            # unbalanced IPv6 brackets.
            log_message(log_path, f"Skipping malformed link on {result.url}: {raw_link!r} ({exc})")
            continue
        if not should_visit(absolute, allowed_hosts, state.include_subdomains):
            continue
        absolute = resolve_alias(absolute, state.alias_to_canonical)
        if is_document_url(absolute):
            if state.include_documents:
                register_record(state, absolute, "document")
            # Documents are recorded at most; they are never queued.
            continue
        register_record(state, absolute, "page")
        if absolute not in state.queued and absolute not in state.visited:
            state.queue.append(absolute)
            state.queued.add(absolute)
def crawl_site(
    state: CrawlState,
    max_pages: int,
    delay: float,
    timeout: float,
    user_agent: str,
    state_path: Path,
    output_path: Path,
    log_path: Path,
    save_every: int,
    workers: int,
) -> tuple[CrawlState, bool]:
    """Run the crawl loop until the queue is empty, the page cap is hit, or the user stops it.

    With ``workers <= 1`` pages are fetched one at a time; otherwise up to
    ``workers`` fetches run in a thread pool. Fetch results are folded into
    ``state`` by the calling thread in both modes. Progress (CSV plus state
    file) is saved after every ``save_every`` processed pages and once more
    before returning.

    Returns:
        ``(state, user_stopped)``, where ``user_stopped`` is True when the
        loop ended because a stop was requested.
    """
    allowed_hosts = build_allowed_hosts(state.start_url)
    processed_since_save = 0
    user_stopped = False
    control = RuntimeControl()
    if workers <= 1:
        while state.queue and len(state.visited) < max_pages:
            poll_runtime_control(control, log_path)
            if control.stop_requested:
                user_stopped = True
                print("Stop requested. Saving progress and finishing cleanly...")
                break
            # While paused, keep polling so a resume or stop key is noticed.
            while control.paused and not control.stop_requested:
                time.sleep(0.2)
                poll_runtime_control(control, log_path)
            # A stop may have been requested during the pause.
            if control.stop_requested:
                user_stopped = True
                print("Stop requested. Saving progress and finishing cleanly...")
                break
            current = resolve_alias(state.queue.popleft(), state.alias_to_canonical)
            if current in state.visited:
                continue
            # Marked visited before the fetch so the URL is never fetched twice.
            state.visited.add(current)
            register_record(state, current, "page")
            print_progress(state, max_pages, current)
            result = fetch_page_with_delay(current, timeout=timeout, user_agent=user_agent, delay=delay)
            process_crawl_result(state, result, allowed_hosts, log_path)
            processed_since_save += 1
            if processed_since_save >= save_every:
                write_csv(state.records, output_path)
                save_state(state, state_path, output_path)
                log_message(log_path, f"Saved progress after {len(state.visited)} visited page(s)")
                processed_since_save = 0
    else:
        with ThreadPoolExecutor(max_workers=workers) as executor:
            # In-flight fetches, keyed by future, with the URL each one is fetching.
            pending: dict[object, str] = {}
            while pending or (state.queue and len(state.visited) < max_pages):
                poll_runtime_control(control, log_path)
                if control.stop_requested:
                    user_stopped = True
                    print("Stop requested. No new pages will be queued. Waiting for active requests to finish...")
                    break
                if control.paused:
                    # Paused: submit nothing new, but keep collecting fetches that finish.
                    if pending:
                        completed, _ = wait(pending.keys(), timeout=0.2, return_when=FIRST_COMPLETED)
                        for future in completed:
                            pending.pop(future, None)
                            result = future.result()
                            process_crawl_result(state, result, allowed_hosts, log_path)
                            processed_since_save += 1
                    else:
                        time.sleep(0.2)
                    if processed_since_save >= save_every:
                        write_csv(state.records, output_path)
                        save_state(state, state_path, output_path)
                        log_message(log_path, f"Saved progress after {len(state.visited)} visited page(s)")
                        processed_since_save = 0
                    continue
                # Top up the pool until every worker is busy or the page cap is reached.
                while state.queue and len(pending) < workers and len(state.visited) < max_pages:
                    current = resolve_alias(state.queue.popleft(), state.alias_to_canonical)
                    if current in state.visited:
                        continue
                    state.visited.add(current)
                    register_record(state, current, "page")
                    print_progress(state, max_pages, current)
                    future = executor.submit(fetch_page_with_delay, current, timeout, user_agent, delay)
                    pending[future] = current
                if not pending:
                    continue
                # The short timeout keeps key presses responsive while fetches run.
                completed, _ = wait(pending.keys(), timeout=0.2, return_when=FIRST_COMPLETED)
                for future in completed:
                    pending.pop(future, None)
                    result = future.result()
                    process_crawl_result(state, result, allowed_hosts, log_path)
                    processed_since_save += 1
                if processed_since_save >= save_every:
                    write_csv(state.records, output_path)
                    save_state(state, state_path, output_path)
                    log_message(log_path, f"Saved progress after {len(state.visited)} visited page(s)")
                    processed_since_save = 0
            # After a stop request, wait for the in-flight fetches and fold their results in.
            if user_stopped and pending:
                completed, _ = wait(pending.keys())
                for future in completed:
                    pending.pop(future, None)
                    result = future.result()
                    process_crawl_result(state, result, allowed_hosts, log_path)
    # Final save, whatever the reason the loop ended.
    write_csv(state.records, output_path)
    save_state(state, state_path, output_path)
    log_message(log_path, f"Final save completed with {len(state.records)} URL(s) recorded")
    return state, user_stopped
def parse_args() -> argparse.Namespace:
    """Build the command-line parser for the crawler and parse ``sys.argv``.

    Returns:
        The parsed namespace. ``url`` and ``output`` are optional (``None``
        when omitted); ``workers`` defaults to 0 when not supplied.
    """
    cli = argparse.ArgumentParser(
        description="Crawl a website and export discovered internal URLs to a CSV sitemap.",
    )

    # One entry per argument, in the order they should appear in --help:
    # (positional name or flag strings, keyword settings for add_argument).
    option_table = [
        (
            ("url",),
            {"nargs": "?", "help": "Starting URL to crawl, for example https://example.com"},
        ),
        (
            ("-o", "--output"),
            {"help": f"Output CSV path. Defaults to {DEFAULT_OUTPUT_NAME} in the script folder."},
        ),
        (
            ("--max-pages",),
            {
                "type": int,
                "default": DEFAULT_MAX_PAGES,
                "help": f"Maximum number of pages to crawl before stopping. Default: {DEFAULT_MAX_PAGES}",
            },
        ),
        (
            ("--delay",),
            {
                "type": float,
                "default": 0.0,
                "help": "Delay in seconds between requests. Default: 0",
            },
        ),
        (
            ("--timeout",),
            {
                "type": float,
                "default": 15.0,
                "help": "Request timeout in seconds. Default: 15",
            },
        ),
        (
            ("--include-subdomains",),
            {
                "action": "store_true",
                "help": "Also crawl subdomains of the starting host.",
            },
        ),
        (
            ("--include-documents",),
            {
                "action": "store_true",
                "help": "Include document links like PDF, CSV, DOC, and DOCX in the sitemap output.",
            },
        ),
        (
            ("--save-every",),
            {
                "type": int,
                "default": DEFAULT_SAVE_EVERY,
                "help": f"Save progress after this many pages. Default: {DEFAULT_SAVE_EVERY}",
            },
        ),
        (
            ("--resume",),
            {
                "action": "store_true",
                "help": "Resume from the saved crawl state if a state file already exists.",
            },
        ),
        (
            ("--fresh",),
            {
                "action": "store_true",
                "help": "Ignore any saved crawl state and start over.",
            },
        ),
        (
            ("--workers",),
            {
                "type": int,
                "default": 0,
                "help": f"Number of worker threads. Use 1 to disable multithreading. Default when prompted on: {DEFAULT_WORKERS}",
            },
        ),
    ]
    for names, settings in option_table:
        cli.add_argument(*names, **settings)

    return cli.parse_args()
def run_crawl(
    *,
    start_url: str,
    output_path: Path,
    max_pages: int = DEFAULT_MAX_PAGES,
    delay: float = 0.0,
    timeout: float = 15.0,
    include_subdomains: bool = False,
    include_documents: bool = False,
    save_every: int = DEFAULT_SAVE_EVERY,
    workers: int = DEFAULT_WORKERS,
    resume: bool = True,
    fresh: bool = False,
    user_agent: str = DEFAULT_USER_AGENT,
) -> CrawlRunResult:
    """Run a crawl (new or resumed) and report where its artifacts were written.

    Args:
        start_url: Site to crawl. ``https://`` is prepended when no scheme is given.
        output_path: CSV destination; the state and log paths are derived from it.
        max_pages: Requested page budget (clamped to at least 1).
        delay: Per-request delay in seconds (negative values are treated as 0).
        timeout: Request timeout in seconds (clamped to at least 1.0).
        include_subdomains: Passed to the initial state of a new crawl.
        include_documents: Must match the saved state's setting when resuming.
        save_every: Save progress after this many processed pages (at least 1).
        workers: Number of worker threads (clamped to at least 1).
        resume: Load the saved state when a state file exists.
        fresh: Ignore any saved state; takes precedence over ``resume``.
        user_agent: User agent handed to the sitemap seeding and the crawl.

    Returns:
        A ``CrawlRunResult`` with the final state, the stop flag, the output,
        state and log paths, and the effective page budget and worker count.

    Raises:
        ValueError: If the URL is missing or not http(s), or if the saved state
            was created for a different start URL or document setting.
    """
    # Strip padding first so a whitespace-only value is rejected here instead
    # of being turned into "https://   " below.
    start_url = (start_url or "").strip()
    if not start_url:
        raise ValueError("A starting URL is required.")
    if "://" not in start_url:
        start_url = f"https://{start_url}"
    normalized_start = normalize_url(start_url)
    if not is_http_url(normalized_start):
        raise ValueError("Only http and https URLs are supported.")

    output_path = Path(output_path)
    state_path = get_state_path(output_path)
    log_path = get_log_path(output_path)

    state: CrawlState
    if state_path.exists() and not fresh and resume:
        state = load_state(state_path)
        # A saved state is only reusable for the same site and the same
        # document setting it was created with.
        if state.start_url != normalized_start:
            raise ValueError(
                "The saved crawl state belongs to a different starting URL. "
                "Use a different output name or start a fresh crawl."
            )
        if state.include_documents != include_documents:
            raise ValueError(
                "The saved crawl state uses a different document setting. "
                "Keep the same choice or start a fresh crawl."
            )
    else:
        state = initialize_state(normalized_start, include_subdomains, include_documents)

    effective_workers = max(int(workers), 1)
    effective_max_pages = max(int(max_pages), 1)
    effective_timeout = max(timeout, 1.0)
    if state.visited:
        # Resumed crawl: make sure the budget leaves room beyond the pages
        # that were already visited.
        effective_max_pages = max(effective_max_pages, len(state.visited) + DEFAULT_RESUME_PAGE_INCREMENT)
    else:
        # Nothing visited yet: seed the state from the site's XML sitemaps.
        seed_from_xml_sitemaps(state, effective_timeout, user_agent, log_path)

    log_message(log_path, f"Starting crawl for {state.start_url}")
    log_message(log_path, f"Output CSV: {output_path.resolve()}")
    log_message(log_path, f"State file: {state_path.resolve()}")
    log_message(log_path, f"Multithreading workers: {effective_workers}")
    log_message(log_path, f"Include documents: {state.include_documents}")

    state, user_stopped = crawl_site(
        state=state,
        max_pages=effective_max_pages,
        delay=max(delay, 0.0),
        timeout=effective_timeout,
        user_agent=user_agent,
        state_path=state_path,
        output_path=output_path,
        log_path=log_path,
        save_every=max(save_every, 1),
        workers=effective_workers,
    )

    # Record why the crawl ended.
    if user_stopped:
        log_message(log_path, "Crawl stopped by user")
    elif state.queue and len(state.visited) >= effective_max_pages:
        log_message(log_path, "Crawl stopped at max page limit")
    elif state.queue:
        log_message(log_path, "Crawl stopped before queue emptied")
    else:
        log_message(log_path, "Crawl completed with empty queue")

    return CrawlRunResult(
        state=state,
        user_stopped=user_stopped,
        output_path=output_path,
        state_path=state_path,
        log_path=log_path,
        max_pages=effective_max_pages,
        workers=effective_workers,
    )
def main() -> int:
    """Interactive command-line entry point for the crawler.

    Collects any missing settings by prompting, runs the crawl through
    ``run_crawl`` and prints a summary of the result.

    Returns:
        0 on success, 1 when the URL is missing/unsupported or ``run_crawl``
        rejects the request with a ``ValueError``.
    """
    args = parse_args()

    start_url = prompt_if_missing(args.url, "Enter the website URL to crawl: ")
    if not start_url:
        print("A starting URL is required.", file=sys.stderr)
        return 1
    if "://" not in start_url:
        start_url = f"https://{start_url}"
    normalized_start = normalize_url(start_url)
    if not is_http_url(normalized_start):
        print("Only http and https URLs are supported.", file=sys.stderr)
        return 1

    output_value = prompt_if_missing(args.output, f"Enter output CSV path [{DEFAULT_OUTPUT_NAME}]: ")
    output_path = Path(output_value) if output_value else SCRIPT_DIR / DEFAULT_OUTPUT_NAME
    state_path = get_state_path(output_path)
    log_path = get_log_path(output_path)

    include_documents = args.include_documents or prompt_yes_no(
        "Include document links such as PDF, CSV, DOC, and DOCX in the sitemap?",
        default=False,
    )

    # --workers defaults to 0, which means "not specified": ask instead.
    workers = args.workers
    if workers <= 0:
        enable_multithreading = prompt_yes_no(
            f"Enable multithreading for faster scanning? {DEFAULT_WORKERS} worker threads will be used.",
            default=True,
        )
        workers = DEFAULT_WORKERS if enable_multithreading else 1

    print(f"Crawling {normalized_start}")
    print(f"Output file: {output_path.resolve()}")
    print(f"State file: {state_path.resolve()}")
    print(f"Log file: {log_path.resolve()}")

    resume_existing = False
    if state_path.exists() and not args.fresh:
        resume_existing = args.resume or prompt_yes_no(
            f"Found saved crawl state at {state_path.name}. Resume from where it left off?",
            default=True,
        )

    # Show the keyboard controls before the crawl starts: run_crawl() only
    # returns once the crawl is over, so printing this afterwards is too late.
    if os.name == "nt":
        print("Press P to pause, R to resume, or Q to stop cleanly and save progress.")

    try:
        run_result = run_crawl(
            start_url=normalized_start,
            output_path=output_path,
            max_pages=args.max_pages,
            delay=args.delay,
            timeout=args.timeout,
            include_subdomains=args.include_subdomains,
            include_documents=include_documents,
            save_every=args.save_every,
            workers=workers,
            resume=resume_existing,
            fresh=args.fresh,
            user_agent=DEFAULT_USER_AGENT,
        )
    except ValueError as exc:
        print(str(exc), file=sys.stderr)
        return 1

    state = run_result.state
    user_stopped = run_result.user_stopped
    effective_max_pages = run_result.max_pages

    print(f"Max pages: {effective_max_pages}")
    print(f"Include documents: {'Yes' if state.include_documents else 'No'}")
    print(f"Multithreading: {'Yes' if run_result.workers > 1 else 'No'}")
    print(f"Worker threads: {run_result.workers}")
    if resume_existing:
        print("Resumed from the existing crawl state file.")
        log_message(log_path, "Resumed from existing crawl state")

    print(f"Found {len(state.records)} unique URL(s).")
    print(f"Visited pages: {len(state.visited)}")
    print(f"Queued pages remaining: {len(state.queue)}")
    print(f"URLs added from XML sitemaps: {state.discovered_from_sitemaps}")
    if state.errors:
        print(f"Pages with errors: {len(state.errors)}")
        # Only list the first few failures to keep the summary readable.
        for result in state.errors[:10]:
            print(f"  {result['url']} -> {result['error']}")
    if state.skipped_count:
        print(f"Non-HTML pages skipped while crawling: {state.skipped_count}")

    # run_crawl() has already written the matching stop reason to the log
    # file, so only the console message is emitted here (logging it again
    # would duplicate the entry).
    if user_stopped:
        print("Stopped by user. Run it again to continue from the saved state.")
    elif state.queue and len(state.visited) >= effective_max_pages:
        print("Stopped because the max page limit was reached. Run it again to continue.")
    elif state.queue:
        print("Stopped before the queue was empty. Run it again to continue.")
    else:
        print("Crawl complete. No queued pages remain.")
    print("Done.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
+210
View File
@@ -0,0 +1,210 @@
from __future__ import annotations
import contextlib
import csv
import importlib.util
import io
import os
import re
import sys
from pathlib import Path
import streamlit as st
# Folder containing this script; the bundled tools are located relative to it.
ROOT_DIR = Path(__file__).resolve().parent
PAGE_IMPORTER_DIR = ROOT_DIR.joinpath("Page Importer")
SITEMAP_BUILDER_PATH = ROOT_DIR.joinpath("Sitemap Builder", "sitemap_builder.py")
# Data root: taken from the APP_DATA_DIR environment variable when it is set,
# otherwise a ".data" folder next to this script.
APP_DATA_DIR = Path(os.getenv("APP_DATA_DIR", ROOT_DIR / ".data")).resolve()
SITEMAP_OUTPUT_DIR = APP_DATA_DIR.joinpath("sitemaps")
def load_module(module_name: str, file_path: Path):
    """Import the Python source file at ``file_path`` under ``module_name``.

    The module is registered in ``sys.modules`` before its code runs. If
    executing it fails, ``sys.modules`` is put back the way it was before the
    call so a half-initialised module is never left registered.

    Args:
        module_name: Name to register the module under in ``sys.modules``.
        file_path: Path of the Python file to execute.

    Returns:
        The freshly executed module object.

    Raises:
        RuntimeError: If no import spec/loader can be created for ``file_path``.
        Exception: Whatever the module's own top-level code raises.
    """
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Unable to load module from {file_path}")

    module = importlib.util.module_from_spec(spec)
    missing = object()
    previous = sys.modules.get(module_name, missing)
    # Register first so the module can be found by name while its code runs.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Roll back the registration: restore the earlier module if there was
        # one, otherwise drop the broken entry.
        if previous is missing:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module
def get_page_importer_module():
    """Load the Page Importer's ``app.py`` as ``page_importer_streamlit``.

    The Page Importer folder is put at the front of ``sys.path`` first (unless
    it is already listed) -- presumably so that ``app.py`` can import modules
    that live next to it.
    """
    importer_dir = str(PAGE_IMPORTER_DIR)
    if importer_dir not in sys.path:
        sys.path.insert(0, importer_dir)
    app_file = PAGE_IMPORTER_DIR / "app.py"
    return load_module("page_importer_streamlit", app_file)
def get_sitemap_module():
    """Load the sitemap builder script as the module ``sitemap_builder_module``."""
    return load_module(
        module_name="sitemap_builder_module",
        file_path=SITEMAP_BUILDER_PATH,
    )
def sanitize_job_name(value: str) -> str:
    """Turn a user-supplied job name into a safe file-name stem.

    Every run of characters other than ASCII letters, digits, ``.``, ``_`` and
    ``-`` becomes a single ``-``; leading and trailing dots and dashes are then
    removed. Falls back to ``"sitemap"`` when nothing usable remains.
    """
    trimmed = value.strip() if value else ""
    # Splitting on the disallowed runs and re-joining with "-" replaces each
    # run by exactly one dash.
    pieces = re.split(r"[^A-Za-z0-9._-]+", trimmed)
    slug = "-".join(pieces).strip(".-")
    if not slug:
        return "sitemap"
    return slug
def read_csv_preview(csv_bytes: bytes, limit: int = 200) -> list[dict[str, str]]:
    """Parse CSV bytes and return at most ``limit`` rows as dictionaries.

    The bytes are decoded as UTF-8 (a leading BOM is dropped, undecodable
    bytes are replaced) and the first line is used as the header row.
    """
    source = io.StringIO(csv_bytes.decode("utf-8-sig", errors="replace"))
    preview: list[dict[str, str]] = []
    for row_number, record in enumerate(csv.DictReader(source), start=1):
        if row_number > limit:
            break
        preview.append(dict(record))
    return preview
def render_sitemap_tab() -> None:
    """Render the "Sitemap Generator" tab.

    Shows the crawl settings form, runs the crawl when the form is submitted,
    stores a summary of the run in ``st.session_state["sitemap_result"]`` and
    then renders the most recent stored result (metrics, downloads, CSV
    preview, captured crawler output and the tail of the log file).
    """
    st.title("Sitemap Generator")
    st.caption("Crawl a site, export a sitemap CSV, and keep resume data inside the container data volume.")
    SITEMAP_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    with st.form("sitemap-form"):
        start_url = st.text_input("Starting URL", placeholder="https://example.com")
        job_name = st.text_input(
            "Output name",
            value="sitemap",
            help="Used for the CSV, crawl state, and log file names.",
        )
        col1, col2, col3 = st.columns(3)
        with col1:
            max_pages = st.number_input("Max pages", min_value=1, value=10000, step=100)
            workers = st.number_input("Worker threads", min_value=1, value=8, step=1)
        with col2:
            delay = st.number_input("Delay between requests (seconds)", min_value=0.0, value=0.0, step=0.25)
            timeout = st.number_input("Request timeout (seconds)", min_value=1.0, value=15.0, step=1.0)
        with col3:
            save_every = st.number_input("Save progress every N pages", min_value=1, value=25, step=1)
            include_subdomains = st.checkbox("Include subdomains", value=False)
            include_documents = st.checkbox("Include document links", value=False)
        resume_existing = st.checkbox("Resume from saved crawl state if present", value=True)
        start_fresh = st.checkbox("Ignore any saved crawl state and start fresh", value=False)
        submitted = st.form_submit_button("Run Sitemap Crawl", type="primary")

    if submitted:
        # Validate and crawl the same trimmed value, so surrounding whitespace
        # typed into the field never reaches the crawler.
        cleaned_url = start_url.strip()
        if not cleaned_url:
            st.error("Starting URL is required.")
        else:
            sitemap_builder = get_sitemap_module()
            safe_name = sanitize_job_name(job_name)
            output_path = SITEMAP_OUTPUT_DIR / f"{safe_name}.csv"
            # Collect everything the crawler prints so it can be shown below.
            captured_stdout = io.StringIO()
            try:
                with st.spinner("Running sitemap crawl..."):
                    with contextlib.redirect_stdout(captured_stdout):
                        result = sitemap_builder.run_crawl(
                            start_url=cleaned_url,
                            output_path=output_path,
                            max_pages=int(max_pages),
                            delay=float(delay),
                            timeout=float(timeout),
                            include_subdomains=include_subdomains,
                            include_documents=include_documents,
                            save_every=int(save_every),
                            workers=int(workers),
                            resume=resume_existing,
                            fresh=start_fresh,
                        )
            except Exception as exc:
                # UI boundary: report the failure in the page instead of
                # letting it propagate.
                st.error(str(exc))
            else:
                # Keep only plain values in the session so the result can be
                # rendered again on later reruns.
                st.session_state["sitemap_result"] = {
                    "summary": {
                        "records": len(result.state.records),
                        "visited": len(result.state.visited),
                        "queued": len(result.state.queue),
                        "errors": len(result.state.errors),
                        "skipped": result.state.skipped_count,
                        "from_sitemaps": result.state.discovered_from_sitemaps,
                        "user_stopped": result.user_stopped,
                        "max_pages": result.max_pages,
                        "workers": result.workers,
                    },
                    "output_path": str(result.output_path),
                    "state_path": str(result.state_path),
                    "log_path": str(result.log_path),
                    "stdout": captured_stdout.getvalue(),
                }

    result_data = st.session_state.get("sitemap_result")
    if not result_data:
        st.info("Run a crawl to generate a sitemap CSV.")
        return

    summary = result_data["summary"]
    csv_path = Path(result_data["output_path"])
    state_path = Path(result_data["state_path"])
    log_path = Path(result_data["log_path"])

    st.subheader("Crawl Summary")
    metric_cols = st.columns(6)
    metric_cols[0].metric("URLs Found", summary["records"])
    metric_cols[1].metric("Visited", summary["visited"])
    metric_cols[2].metric("Queued", summary["queued"])
    metric_cols[3].metric("XML Seeded", summary["from_sitemaps"])
    metric_cols[4].metric("Errors", summary["errors"])
    metric_cols[5].metric("Skipped", summary["skipped"])
    status_text = "Stopped by user." if summary["user_stopped"] else "Run completed."
    st.caption(f"{status_text} Max pages used: {summary['max_pages']} | Worker threads: {summary['workers']}")

    # The files may have been removed since the run, so check before offering them.
    if csv_path.exists():
        csv_bytes = csv_path.read_bytes()
        st.download_button(
            "Download Sitemap CSV",
            data=csv_bytes,
            file_name=csv_path.name,
            mime="text/csv",
        )
        preview_rows = read_csv_preview(csv_bytes)
        if preview_rows:
            st.dataframe(preview_rows, width="stretch", hide_index=True)

    file_cols = st.columns(2)
    with file_cols[0]:
        if state_path.exists():
            st.download_button(
                "Download Crawl State",
                data=state_path.read_bytes(),
                file_name=state_path.name,
                mime="application/json",
            )
    with file_cols[1]:
        if log_path.exists():
            st.download_button(
                "Download Crawl Log",
                data=log_path.read_bytes(),
                file_name=log_path.name,
                mime="text/plain",
            )

    crawl_output = (result_data.get("stdout") or "").strip()
    if crawl_output:
        st.text_area("Crawler Output", value=crawl_output, height=220, disabled=True)
    if log_path.exists():
        log_text = log_path.read_text(encoding="utf-8", errors="replace")
        # Show only the last 50 log lines.
        st.text_area("Log Tail", value="\n".join(log_text.splitlines()[-50:]), height=220, disabled=True)
def main() -> None:
    """Configure the Streamlit page and render one tab per tool."""
    st.set_page_config(page_title="WDW Tools", layout="wide")
    st.header("WDW Sitemap And Import Tools")

    tabs = st.tabs(["Sitemap Generator", "Page Importer"])
    with tabs[0]:
        render_sitemap_tab()
    with tabs[1]:
        # The importer module is loaded on each run, then asked to draw itself.
        get_page_importer_module().render_app()


if __name__ == "__main__":
    main()
+4
View File
@@ -0,0 +1,4 @@
streamlit>=1.43,<2
requests>=2.32,<3
beautifulsoup4>=4.12,<5
python-dateutil>=2.9,<3