@@ -0,0 +1,13 @@
|
||||
.git
|
||||
.gitignore
|
||||
.codex
|
||||
**/.git
|
||||
**/.venv
|
||||
**/__pycache__
|
||||
**/*.pyc
|
||||
**/*.pyo
|
||||
**/*.pyd
|
||||
**/.pytest_cache
|
||||
**/.mypy_cache
|
||||
**/.DS_Store
|
||||
.data
|
||||
@@ -0,0 +1,42 @@
|
||||
name: Build Docker Image
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
IMAGE_NAME: wdw-sitemap-and-importer
|
||||
REGISTRY: ${{ secrets.REGISTRY_URL }}
|
||||
REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
|
||||
REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
|
||||
|
||||
jobs:
|
||||
docker:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Build image
|
||||
run: docker build -t "${IMAGE_NAME}:${GITHUB_SHA}" .
|
||||
|
||||
- name: Tag latest image
|
||||
run: docker tag "${IMAGE_NAME}:${GITHUB_SHA}" "${IMAGE_NAME}:latest"
|
||||
|
||||
- name: Log in to registry
|
||||
if: ${{ env.REGISTRY != '' && env.REGISTRY_USERNAME != '' && env.REGISTRY_PASSWORD != '' }}
|
||||
run: echo "${REGISTRY_PASSWORD}" | docker login "${REGISTRY}" -u "${REGISTRY_USERNAME}" --password-stdin
|
||||
|
||||
- name: Push commit image
|
||||
if: ${{ env.REGISTRY != '' && env.REGISTRY_USERNAME != '' && env.REGISTRY_PASSWORD != '' }}
|
||||
run: |
|
||||
docker tag "${IMAGE_NAME}:${GITHUB_SHA}" "${REGISTRY}/${IMAGE_NAME}:${GITHUB_SHA}"
|
||||
docker push "${REGISTRY}/${IMAGE_NAME}:${GITHUB_SHA}"
|
||||
|
||||
- name: Push latest image
|
||||
if: ${{ env.REGISTRY != '' && env.REGISTRY_USERNAME != '' && env.REGISTRY_PASSWORD != '' }}
|
||||
run: |
|
||||
docker tag "${IMAGE_NAME}:latest" "${REGISTRY}/${IMAGE_NAME}:latest"
|
||||
docker push "${REGISTRY}/${IMAGE_NAME}:latest"
|
||||
+15
@@ -0,0 +1,15 @@
|
||||
.codex
|
||||
.data/
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
|
||||
.venv/
|
||||
**/.venv/
|
||||
**/__pycache__/
|
||||
.pytest_cache/
|
||||
.mypy_cache/
|
||||
|
||||
*.crawl.log
|
||||
*.crawlstate.json
|
||||
|
||||
streamlit_uploads/
|
||||
+22
@@ -0,0 +1,22 @@
|
||||
FROM python:3.14-slim
|
||||
|
||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PIP_NO_CACHE_DIR=1 \
|
||||
STREAMLIT_SERVER_HEADLESS=true \
|
||||
STREAMLIT_SERVER_PORT=8501 \
|
||||
STREAMLIT_SERVER_ADDRESS=0.0.0.0 \
|
||||
APP_DATA_DIR=/data
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY requirements.txt ./requirements.txt
|
||||
RUN pip install -r requirements.txt
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN mkdir -p /data
|
||||
|
||||
EXPOSE 8501
|
||||
|
||||
CMD ["streamlit", "run", "app.py"]
|
||||
@@ -0,0 +1,12 @@
|
||||
.venv/
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
.pytest_cache/
|
||||
.mypy_cache/
|
||||
.ruff_cache/
|
||||
|
||||
.streamlit/secrets.toml
|
||||
|
||||
*.log
|
||||
@@ -0,0 +1,63 @@
|
||||
# Page Importer
|
||||
|
||||
This folder contains the WordPress import tool used by the combined application in the repository root.
|
||||
|
||||
The importer still uses Streamlit internally, but it is now rendered as the `Page Importer` tab inside the shared app rather than being the main entrypoint for the repository.
|
||||
|
||||
## Features
|
||||
|
||||
- Upload a CSV of submitted URLs
|
||||
- Choose the URL column and optional title override column
|
||||
- Optionally map post type from the CSV or force a single post type
|
||||
- Scrape only the listed URLs
|
||||
- Extract title, publish date, author, body HTML, categories, and tags
|
||||
- Retry failed rows
|
||||
- Export a WordPress WXR XML file
|
||||
|
||||
## Recommended Usage
|
||||
|
||||
Run the root application:
|
||||
|
||||
```bash
|
||||
streamlit run ../app.py
|
||||
```
|
||||
|
||||
Or run the combined Docker container from the repository root.
|
||||
|
||||
## Standalone Usage
|
||||
|
||||
If you need to run this importer by itself:
|
||||
|
||||
```bash
|
||||
python3 -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
streamlit run app.py
|
||||
```
|
||||
|
||||
On Windows PowerShell:
|
||||
|
||||
```powershell
|
||||
python -m venv .venv
|
||||
.venv\Scripts\Activate.ps1
|
||||
pip install -r requirements.txt
|
||||
streamlit run app.py
|
||||
```
|
||||
|
||||
## CSV Input
|
||||
|
||||
The app accepts CSV files with any columns. You choose:
|
||||
|
||||
- the URL column to scrape
|
||||
- an optional title or name column to override the scraped title
|
||||
- an optional post type column with values like `post` or `page`
|
||||
- an optional category column whose values are appended during export
|
||||
|
||||
You can also add manual categories in the sidebar to append them to every exported item.
|
||||
|
||||
## Notes
|
||||
|
||||
- Exported posts default to `draft` unless changed in the UI
|
||||
- Image and link URLs remain pointed at the source site
|
||||
- Some themes need heuristic fallback. The `Force heuristic scraping` option skips JSON-LD-first extraction and relies on page structure
|
||||
- In the combined app, dependencies come from the root `requirements.txt`
|
||||
@@ -0,0 +1,475 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import datetime as dt
|
||||
import io
|
||||
import re
|
||||
from dataclasses import replace
|
||||
|
||||
import streamlit as st
|
||||
|
||||
from page_importer.dates import parse_datetime
|
||||
from page_importer.models import ScrapeOptions, ScrapedPost
|
||||
from page_importer.scraper import Scraper
|
||||
from page_importer.wxr import build_wxr
|
||||
|
||||
def load_csv(file_data: bytes) -> tuple[list[str], list[dict[str, str]]]:
    """Decode uploaded CSV bytes and return ``(headers, rows)``.

    The bytes are decoded as UTF-8 with a leading BOM removed; undecodable
    bytes are replaced rather than raising. Rows are dictionaries keyed by
    the header names. An input without a header line yields an empty header
    list.
    """
    decoded = file_data.decode("utf-8-sig", errors="replace")
    parsed = csv.DictReader(io.StringIO(decoded))
    records = [*parsed]
    header_names = parsed.fieldnames
    if not header_names:
        return [], records
    return header_names, records
|
||||
|
||||
|
||||
def render_app() -> None:
    """Render the Page Importer page: options, upload, scraping, results, export.

    Streamlit re-runs this function on every interaction, so the flow is:

    1. Sidebar options and the CSV uploader; return early until a CSV with at
       least one row is available.
    2. Column-mapping widgets and the "Scrape URLs" button, which stores the
       results, the rows that were scraped, and the scrape context in
       ``st.session_state``.
    3. Results table, failed-row details, "Retry failed items", a content
       preview, and the export sidebar.

    Stored results are always paired with the rows they were scraped from
    (``st.session_state["input_rows"]``) for both retry and export, so a
    result's ``row_number`` keeps pointing at the CSV row it came from.
    """
    st.title("Page Importer")
    st.caption("Scrape blog posts from CSV URLs and export a WordPress WXR file.")

    with st.sidebar:
        st.header("Options")
        include_author = st.checkbox("Include author", value=True)
        include_categories = st.checkbox("Include categories", value=True)
        include_tags = st.checkbox("Include tags", value=True)
        force_heuristics = st.checkbox("Force heuristic scraping", value=False)
        test_run = st.checkbox(
            "Test run only",
            value=False,
            help="Scrape only the first 10 rows that contain a URL.",
        )
        post_type_mode = st.selectbox(
            "WordPress post type mode",
            ["Single type for all rows", "Use a CSV column"],
            index=0,
        )
        default_post_type = st.selectbox("Default WordPress post type", ["post", "page"], index=0)

    uploaded = st.file_uploader("Upload CSV", type=["csv"])
    if not uploaded:
        st.info("Upload a CSV to begin.")
        return

    headers, rows = load_csv(uploaded.getvalue())
    if not rows:
        st.error("The CSV did not contain any rows.")
        return

    optional_headers = ["(none)", *headers]
    col1, col2, col3 = st.columns(3)
    with col1:
        url_column = st.selectbox("URL column", headers, index=_safe_index(headers, ["url", "link"]))
    with col2:
        title_column = st.selectbox(
            "Optional title override column",
            optional_headers,
            index=_safe_index(optional_headers, ["name", "title"]),
        )
    with col3:
        post_type_column = st.selectbox(
            "Optional post type column",
            optional_headers,
            index=_safe_index(optional_headers, ["post_type", "type"]),
            disabled=post_type_mode != "Use a CSV column",
        )
    st.write(f"Loaded {len(rows)} row(s). Only the selected URL column will be scraped.")
    if test_run:
        st.caption("Test run is enabled. Only the first 10 rows with a URL will be scraped.")

    if st.button("Scrape URLs", type="primary"):
        context = build_scrape_context(
            include_author=include_author,
            include_categories=include_categories,
            include_tags=include_tags,
            force_heuristics=force_heuristics,
            test_run=test_run,
            post_type_mode=post_type_mode,
            post_type_column=post_type_column,
            default_post_type=default_post_type,
            url_column=url_column,
            title_column=title_column,
        )
        results = scrape_rows(rows, context, phase_label="Scraping")
        st.session_state["results"] = results
        st.session_state["input_rows"] = rows
        st.session_state["scrape_context"] = context

    results = st.session_state.get("results", [])
    if not results:
        return

    # Results carry row numbers relative to the rows they were scraped from,
    # which may differ from the CSV that is uploaded right now.
    result_rows = st.session_state.get("input_rows", rows)

    successful = [post for post in results if post.success]
    failed = [post for post in results if not post.success]

    st.subheader("Results")
    st.write(f"Successful: {len(successful)} | Failed: {len(failed)}")

    if failed and st.button("Retry failed items"):
        context = st.session_state.get("scrape_context")
        if context:
            retried = scrape_rows(
                result_rows,
                context,
                row_numbers=[post.row_number for post in failed if post.row_number],
                phase_label="Retrying",
            )
            results = merge_retry_results(results, retried)
            st.session_state["results"] = results
            successful = [post for post in results if post.success]
            failed = [post for post in results if not post.success]

    preview_rows = [
        {
            "Row": post.row_number,
            "URL": post.source_url,
            "CMS": post.cms,
            "Success": post.success,
            "Title": post.title,
            "Publish Date": post.publish_date,
            "Author": post.author,
            "Categories": ", ".join(post.categories),
            "Tags": ", ".join(post.tags),
            "Post Type": post.post_type,
            "Error": post.error,
        }
        for post in results
    ]
    st.dataframe(
        preview_rows,
        width="stretch",
        hide_index=True,
        column_config={
            "Row": st.column_config.NumberColumn(width="small"),
            "URL": st.column_config.TextColumn(width="medium"),
            "Title": st.column_config.TextColumn(width="medium"),
            "Publish Date": st.column_config.TextColumn(width="medium"),
            "Categories": st.column_config.TextColumn(width="medium"),
            "Tags": st.column_config.TextColumn(width="medium"),
            "Error": st.column_config.TextColumn(width="large"),
        },
    )

    if failed:
        selected_failed = st.selectbox(
            "Failed row details",
            failed,
            format_func=lambda post: f"Row {post.row_number}: {post.source_url or '(missing URL)'}",
        )
        st.text_area(
            "Error details",
            value=selected_failed.error_details or selected_failed.error,
            height=180,
            disabled=True,
        )

    if successful:
        selected_index = st.number_input(
            "Preview successful row",
            min_value=1,
            max_value=len(successful),
            value=1,
            step=1,
        )
        selected = successful[selected_index - 1]
        st.markdown("### Content Preview")
        st.write(f"**Title:** {selected.title}")
        st.write(f"**Source URL:** {selected.source_url}")
        st.write(f"**Publish Date:** {selected.publish_date or '(missing)'}")
        st.write(f"**Author:** {selected.author or '(missing)'}")
        st.write(f"**Post Type:** {selected.post_type}")
        # body_html has already been passed through sanitize_html by the scraper.
        st.write(selected.body_html, unsafe_allow_html=True)
        render_export_sidebar(successful, result_rows, headers)
|
||||
|
||||
|
||||
def build_scrape_context(
    *,
    include_author: bool,
    include_categories: bool,
    include_tags: bool,
    force_heuristics: bool,
    test_run: bool,
    post_type_mode: str,
    post_type_column: str,
    default_post_type: str,
    url_column: str,
    title_column: str,
) -> dict[str, object]:
    """Bundle the current UI selections into the dict consumed by ``scrape_rows``.

    The four scraping toggles are wrapped in a ``ScrapeOptions`` instance
    under the ``"options"`` key; every other argument is stored under a key
    equal to its parameter name.
    """
    scrape_options = ScrapeOptions(
        include_author=include_author,
        include_categories=include_categories,
        include_tags=include_tags,
        force_heuristics=force_heuristics,
    )
    context: dict[str, object] = {"options": scrape_options}
    context["test_run"] = test_run
    context["post_type_mode"] = post_type_mode
    context["post_type_column"] = post_type_column
    context["default_post_type"] = default_post_type
    context["url_column"] = url_column
    context["title_column"] = title_column
    return context
|
||||
|
||||
|
||||
def scrape_rows(
    rows: list[dict[str, str]],
    context: dict[str, object],
    row_numbers: list[int] | None = None,
    phase_label: str = "Scraping",
) -> list[ScrapedPost]:
    """Scrape the selected CSV rows and return one ``ScrapedPost`` per target row.

    Rows are numbered from 1. When ``row_numbers`` is given, only those rows
    are processed; otherwise, if the context has ``test_run`` set, only the
    first 10 rows with a non-blank URL are processed. Rows without a URL
    produce a failed post describing the problem. Progress is reported
    through a Streamlit progress bar and status line.

    Raises:
        TypeError: if ``context["options"]`` is not a ``ScrapeOptions``.
    """
    options = context["options"]
    if not isinstance(options, ScrapeOptions):
        raise TypeError("Invalid scrape options in session state.")

    scraper = Scraper(options)
    numbered_rows = list(enumerate(rows, start=1))
    if row_numbers is not None:
        wanted = set(row_numbers)
        targets = [pair for pair in numbered_rows if pair[0] in wanted]
    elif bool(context.get("test_run")):
        with_url = [
            (number, record)
            for number, record in numbered_rows
            if (record.get(str(context["url_column"])) or "").strip()
        ]
        targets = with_url[:10]
    else:
        targets = numbered_rows

    results: list[ScrapedPost] = []
    progress = st.progress(0.0)
    status = st.empty()

    target_count = len(targets)
    # Avoid dividing by zero when there is nothing to scrape.
    total = target_count or 1
    for position, (row_number, row) in enumerate(targets, start=1):
        url = (row.get(context["url_column"]) or "").strip()
        status.write(f"{phase_label} {position}/{target_count}: {url or f'row {row_number} has no URL'}")

        if not url:
            post = ScrapedPost(
                source_url="",
                row_number=row_number,
                error="Missing URL in the selected URL column.",
                error_details=f"Row {row_number} does not contain a URL in column '{context['url_column']}'.",
            )
        else:
            post = scraper.scrape(url)

        post.row_number = row_number
        apply_row_overrides(post, row, context)
        results.append(post)
        progress.progress(position / total)

    status.write(f"{phase_label} complete.")
    return results
|
||||
|
||||
|
||||
def apply_row_overrides(post: ScrapedPost, row: dict[str, str], context: dict[str, object]) -> None:
    """Apply per-row CSV overrides (title and post type) to ``post`` in place.

    Args:
        post: The scraped post to update.
        row: The CSV row the post was scraped from.
        context: Scrape context; reads ``title_column``, ``post_type_mode``,
            ``post_type_column`` and ``default_post_type``.

    The title is replaced only when the configured title column holds
    non-blank text, so a whitespace-only cell never erases a scraped title.
    """
    title_column = context["title_column"]
    if isinstance(title_column, str) and title_column != "(none)":
        # Strip before testing: "   " is truthy but is not a usable title.
        override = (row.get(title_column) or "").strip()
        if override:
            post.title = override

    post.post_type = resolve_post_type(
        row=row,
        mode=str(context["post_type_mode"]),
        column=str(context["post_type_column"]),
        default_value=str(context["default_post_type"]),
    )
|
||||
|
||||
|
||||
def resolve_export_categories(
    row: dict[str, str],
    category_column: str,
    manual_categories: list[str],
) -> list[str]:
    """Return the export categories for one row: CSV terms first, then manual ones.

    The CSV cell is ignored when ``category_column`` is ``"(none)"``.
    Duplicates are dropped while keeping first-seen order.
    """
    if category_column == "(none)":
        csv_categories: list[str] = []
    else:
        csv_categories = parse_terms(row.get(category_column, ""))
    return merge_unique_terms(csv_categories, manual_categories)
|
||||
|
||||
|
||||
def parse_terms(value: str) -> list[str]:
    """Split a term string on ``,``, ``|`` or ``>`` and return the non-blank, trimmed parts."""
    pieces = re.split(r"[,|>]", value or "")
    trimmed = (piece.strip() for piece in pieces)
    return [term for term in trimmed if term]
|
||||
|
||||
|
||||
def merge_unique_terms(*groups: list[str]) -> list[str]:
    """Concatenate term groups, trimming each term and keeping the first occurrence only.

    Blank terms are skipped; comparison is case-sensitive and the order of
    first appearance is preserved.
    """
    ordered: dict[str, None] = {}
    for group in groups:
        for term in group:
            cleaned = term.strip()
            if cleaned:
                # Dict keys keep insertion order, so first-seen wins.
                ordered.setdefault(cleaned, None)
    return list(ordered)
|
||||
|
||||
|
||||
def merge_retry_results(existing: list[ScrapedPost], replacements: list[ScrapedPost]) -> list[ScrapedPost]:
    """Swap retried posts into the existing results and return them ordered by row number.

    A replacement substitutes the existing post with the same ``row_number``;
    replacements whose row is not in ``existing`` are ignored.
    """
    by_row = {}
    for retried in replacements:
        by_row[retried.row_number] = retried

    merged = []
    for current in existing:
        merged.append(by_row.get(current.row_number, current))

    merged.sort(key=lambda post: post.row_number or 0)
    return merged
|
||||
|
||||
|
||||
def build_export_posts(
    posts: list[ScrapedPost],
    rows: list[dict[str, str]],
    category_column: str,
    manual_categories: list[str],
    post_status: str,
    custom_post_type_slug: str,
) -> list[ScrapedPost]:
    """Return export-ready copies of ``posts`` without mutating the originals.

    Each copy gets ``post_status``, the custom post type slug when one is
    set (otherwise its own post type), and its scraped categories extended
    with the row's CSV categories and the manual categories. A post whose
    ``row_number`` does not index into ``rows`` is treated as having an
    empty row.
    """
    row_count = len(rows)

    def source_row(post: ScrapedPost) -> dict[str, str]:
        # row_number is 1-based; anything out of range has no CSV row.
        if 0 < post.row_number <= row_count:
            return rows[post.row_number - 1]
        return {}

    return [
        replace(
            post,
            status=post_status,
            post_type=custom_post_type_slug or post.post_type,
            categories=merge_unique_terms(
                post.categories,
                resolve_export_categories(source_row(post), category_column, manual_categories),
            ),
        )
        for post in posts
    ]
|
||||
|
||||
|
||||
def render_export_sidebar(
    successful: list[ScrapedPost],
    rows: list[dict[str, str]],
    headers: list[str],
) -> None:
    """Render the sidebar export controls and the WXR download button.

    Args:
        successful: Successfully scraped posts to offer for export.
        rows: The CSV rows the posts were scraped from (used for per-row categories).
        headers: CSV header names offered as the category column.

    Publish dates are parsed once per post and reused for the date-range
    filter. A blank output filename falls back to ``wordpress-import.xml``.
    The download button is disabled when no posts remain to export.
    """
    default_filename = "wordpress-import.xml"

    with st.sidebar:
        st.markdown("---")
        st.subheader("Export")
        post_status = st.selectbox(
            "Imported post status",
            ["draft", "publish", "private"],
            index=0,
            key="export_post_status",
        )
        category_options = ["(none)", *headers]
        category_column = st.selectbox(
            "CSV category column",
            category_options,
            index=_safe_index(category_options, ["category", "categories", "department"]),
            key="export_category_column",
        )
        manual_categories = parse_terms(
            st.text_input(
                "Additional export categories",
                value="",
                help="Comma-separated categories to append to every exported item.",
                key="export_manual_categories",
            )
        )
        output_name = st.text_input(
            "Output filename",
            value=default_filename,
            key="export_output_name",
        )
        custom_post_type_slug = normalize_post_type_slug(
            st.text_input(
                "Custom post type slug",
                value="",
                help="Optional. If set, all exported items will use this WordPress post type slug.",
                key="export_custom_post_type_slug",
            )
        )

        export_posts = build_export_posts(
            successful,
            rows,
            category_column,
            manual_categories,
            post_status,
            custom_post_type_slug,
        )
        if custom_post_type_slug:
            st.caption(f"Exporting all items as post type `{custom_post_type_slug}`.")

        # Parse each publish date once; the pairs feed both the range bounds and the filter.
        dated_export_posts = [
            (post, publish_date)
            for post in export_posts
            if (publish_date := parse_publish_date(post.publish_date))
        ]

        if dated_export_posts:
            min_date = min(publish_date for _, publish_date in dated_export_posts)
            max_date = max(publish_date for _, publish_date in dated_export_posts)
            filter_by_publish_date = st.checkbox(
                "Filter export by publish date",
                value=False,
                key="export_filter_by_publish_date",
            )

            if filter_by_publish_date:
                export_start = st.date_input(
                    "Export start date",
                    value=min_date,
                    min_value=min_date,
                    max_value=max_date,
                    format="MM/DD/YYYY",
                    key="export_start_date",
                )
                export_end = st.date_input(
                    "Export end date",
                    value=max_date,
                    min_value=min_date,
                    max_value=max_date,
                    format="MM/DD/YYYY",
                    key="export_end_date",
                )

                if export_start > export_end:
                    st.error("Export start date must be on or before the end date.")
                    export_posts = []
                else:
                    export_posts = [
                        post
                        for post, publish_date in dated_export_posts
                        if export_start <= publish_date <= export_end
                    ]
                    st.caption(
                        "Date filter: "
                        f"{export_start.strftime('%m/%d/%Y')} to {export_end.strftime('%m/%d/%Y')}."
                    )
                    undated_count = len(successful) - len(dated_export_posts)
                    if undated_count:
                        st.caption(f"Excluded {undated_count} successful item(s) with no publish date.")
        else:
            st.caption("No successful items have a publish date, so export date filtering is unavailable.")

        st.caption(f"Ready to export {len(export_posts)} post(s).")
        xml_data = build_wxr(export_posts)
        st.download_button(
            label="Download WXR XML",
            data=xml_data,
            # A cleared filename box would otherwise produce a nameless download.
            file_name=output_name.strip() or default_filename,
            mime="application/xml",
            disabled=not export_posts,
        )
|
||||
|
||||
|
||||
def parse_publish_date(value: str) -> dt.date | None:
    """Return the calendar date of ``value``, or ``None`` when it cannot be parsed."""
    moment = parse_datetime(value)
    return None if moment is None else moment.date()
|
||||
|
||||
|
||||
def _safe_index(values: list[str], candidates: list[str]) -> int:
|
||||
lowered = {value.lower(): idx for idx, value in enumerate(values)}
|
||||
for candidate in candidates:
|
||||
if candidate in lowered:
|
||||
return lowered[candidate]
|
||||
return 0
|
||||
|
||||
|
||||
def resolve_post_type(
    row: dict[str, str],
    mode: str,
    column: str,
    default_value: str,
) -> str:
    """Pick the post type for a row.

    The CSV cell is consulted only when ``mode`` is ``"Use a CSV column"``
    and a real column is selected; its value is normalised to a slug. In
    every other case, or when the slug comes out empty, ``default_value``
    is returned.
    """
    reads_column = mode == "Use a CSV column" and column != "(none)"
    if not reads_column:
        return default_value

    slug = normalize_post_type_slug(row.get(column) or "")
    return slug or default_value
|
||||
|
||||
|
||||
def normalize_post_type_slug(value: str) -> str:
    """Trim and lower-case ``value``, then drop every character outside ``a-z0-9_-``."""
    lowered = (value or "").strip().lower()
    return re.sub(r"[^a-z0-9_-]", "", lowered)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Standalone entry point: configure the page, then draw the importer UI.
    st.set_page_config(layout="wide", page_title="Page Importer")
    render_app()
|
||||
@@ -0,0 +1 @@
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime as dt
|
||||
|
||||
from dateutil import parser as date_parser
|
||||
|
||||
|
||||
def parse_datetime(value: str | None) -> dt.datetime | None:
|
||||
if not value:
|
||||
return None
|
||||
try:
|
||||
return date_parser.parse(value)
|
||||
except (TypeError, ValueError, OverflowError):
|
||||
try:
|
||||
return date_parser.parse(value, fuzzy=True)
|
||||
except (TypeError, ValueError, OverflowError):
|
||||
return None
|
||||
|
||||
|
||||
def normalize_date(value: str | None) -> str:
    """Return ``value`` as a normalised timestamp string, or ``""`` if unparseable.

    Naive datetimes are formatted as ``YYYY-MM-DD HH:MM:SS``; timezone-aware
    ones use ISO format with a space separator, second precision, and their
    UTC offset.
    """
    moment = parse_datetime(value)
    if moment is None:
        return ""
    is_aware = moment.tzinfo is not None and moment.utcoffset() is not None
    if is_aware:
        return moment.isoformat(sep=" ", timespec="seconds")
    return moment.strftime("%Y-%m-%d %H:%M:%S")
|
||||
@@ -0,0 +1,34 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class ScrapeOptions:
    """Settings that control how a page is fetched and which fields are extracted."""

    # Field-extraction toggles.
    include_author: bool = True
    include_categories: bool = True
    include_tags: bool = True
    # When set, JSON-LD article data is not applied and page structure is used instead.
    force_heuristics: bool = False
    # Passed as the ``timeout`` argument of each HTTP request.
    request_timeout: int = 20
    # Sent as the User-Agent header of the scraper's HTTP session.
    user_agent: str = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0 Safari/537.36"
    )
|
||||
|
||||
|
||||
@dataclass
class ScrapedPost:
    """Outcome of scraping one URL: the extracted content plus success or error state."""

    source_url: str
    # 1-based CSV row the post came from; 0 until a caller assigns it.
    row_number: int = 0
    cms: str = "unknown"

    # Extracted content.
    title: str = ""
    publish_date: str = ""
    author: str = ""
    body_html: str = ""
    categories: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)

    # Export settings.
    status: str = "draft"
    post_type: str = "post"

    # Scrape outcome: a short error summary and a longer detail text on failure.
    success: bool = False
    error: str = ""
    error_details: str = ""
|
||||
@@ -0,0 +1,555 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import traceback
|
||||
from html import unescape
|
||||
from typing import Iterable
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import NavigableString, Tag
|
||||
|
||||
from page_importer.dates import normalize_date
|
||||
from page_importer.models import ScrapeOptions, ScrapedPost
|
||||
|
||||
# Lower-cased JSON-LD ``@type`` values that are accepted as the page's article node.
JSON_ARTICLE_TYPES = {"article", "blogposting", "newsarticle", "report", "webpage"}

# CSS selectors tried in order when locating the main body container.
BODY_SELECTORS = [
    # Containers nested inside an <article>.
    "article .entry-content",
    "article .post-content",
    "article .node__content",
    "article .node .content",
    "article .node-content",
    "article .field-name-body .field-item",
    "article .field-name-body",
    "article .field--name-body",
    "article .article-body",
    "article .content",
    # The same kinds of containers anywhere in the document.
    ".post-content",
    ".entry-content",
    ".node__content",
    ".node .content",
    ".node-content",
    ".field-name-body .field-item",
    ".field-name-body",
    ".field--name-body",
    ".article-body",
    "#content-area .node .content",
    # Broad last resorts.
    "article",
    "main article",
    "main",
]

# Selectors whose matching elements' text is collected as category terms.
CATEGORY_SELECTORS = [
    ".cat-links a",
    ".post-categories a",
    ".field--name-field-category a",
    ".tags a[rel='category tag']",
    ".terms a",
    ".taxonomy a",
]

# Selectors whose matching elements' text is collected as tag terms.
TAG_SELECTORS = [
    ".tags-links a",
    ".post-tags a",
    ".field--name-field-tags a",
    "a[rel='tag']",
    ".terms a",
]

# Selectors tried in order for the author name.
AUTHOR_SELECTORS = [
    "[rel='author']",
    ".author a",
    ".byline a",
    ".submitted a",
    ".node__submitted a",
    ".node-info a",
    ".createdby",
]

# Selectors tried in order for the publish date.
DATE_SELECTORS = [
    "time[datetime]",
    "meta[property='article:published_time']",
    "meta[name='publish_date']",
    "meta[name='pubdate']",
    ".date-display-single",
    ".submitted",
    ".node-info",
]

# Matches long-form dates such as "Monday, March 4, 2019".
DRUPAL_TITLE_DATE_PATTERN = re.compile(
    r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+"
    r"([A-Za-z]+)\s+\d{1,2},\s+\d{4}"
)
|
||||
|
||||
|
||||
class Scraper:
    """Fetch pages over a shared HTTP session and extract a ``ScrapedPost`` from each."""

    def __init__(self, options: ScrapeOptions) -> None:
        """Store ``options`` and open a session that sends the configured User-Agent."""
        self.options = options
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": options.user_agent})

    def scrape(self, url: str) -> ScrapedPost:
        """Fetch ``url`` and return the extracted post.

        Never raises: any failure (network, HTTP status, parsing, or a
        missing title/body) is recorded on the returned post's ``error`` and
        ``error_details`` with ``success`` left ``False``.
        """
        post = ScrapedPost(source_url=url)
        response: requests.Response | None = None
        try:
            response = self.session.get(url, timeout=self.options.request_timeout)
            response.raise_for_status()

            # Without a charset in Content-Type, requests decodes text/* as
            # ISO-8859-1, which garbles UTF-8 pages. In that case hand the raw
            # bytes to BeautifulSoup so it can detect the encoding itself
            # (BOM / <meta charset>).
            content_type = response.headers.get("Content-Type", "")
            has_declared_charset = "charset" in content_type.lower()
            markup = response.text if has_declared_charset else response.content
            soup = BeautifulSoup(markup, "html.parser")
            post.cms = detect_cms(soup)

            article_data = extract_article_json_ld(soup)
            if article_data and not self.options.force_heuristics:
                apply_article_data(post, article_data, soup, self.options)

            # Heuristics fill in whatever JSON-LD did not provide.
            merge_fallback_data(post, soup, self.options)
            post.body_html = sanitize_html(post.body_html)

            required = {"title": post.title, "body_html": post.body_html}
            missing_fields = [name for name, value in required.items() if not value]
            if missing_fields:
                raise ValueError(
                    "Unable to extract required field(s): "
                    f"{', '.join(missing_fields)}. "
                    f"Detected CMS: {post.cms}. "
                    f"Publish date found: {'yes' if post.publish_date else 'no'}. "
                    f"Author found: {'yes' if post.author else 'no'}."
                )

            post.success = True
            return post
        except Exception as exc:
            # Deliberate catch-all boundary: one bad URL must not abort the batch.
            post.error = format_error_summary(url, exc, response, self.options.request_timeout)
            post.error_details = format_error_details(url, exc, response)
            return post
|
||||
|
||||
|
||||
def detect_cms(soup: BeautifulSoup) -> str:
    """Guess the CMS behind a page: ``"wordpress"``, ``"drupal"``, ``"joomla"`` or ``"unknown"``.

    The ``<meta name="generator">`` value is checked first. Only when it is
    absent or names none of the known systems is the whole document
    serialised and searched for tell-tale markers, so the expensive step is
    skipped for pages that identify themselves.
    """
    generator = meta_content(soup, "meta", {"name": "generator"})
    if generator:
        lowered_generator = generator.lower()
        for name in ("wordpress", "drupal", "joomla"):
            if name in lowered_generator:
                return name

    html = str(soup).lower()
    if "/wp-content/" in html:
        return "wordpress"
    if "drupal-settings-json" in html or "sites/default/files" in html:
        return "drupal"
    if "com_content" in html or "joomla" in html:
        return "joomla"
    return "unknown"
|
||||
|
||||
|
||||
def extract_article_json_ld(soup: BeautifulSoup) -> dict | None:
    """Return the first article-like JSON-LD node found in the page, or ``None``.

    Every ``<script type="application/ld+json">`` element is examined in
    document order; empty scripts and unparseable payloads are skipped.
    """
    for script in soup.select("script[type='application/ld+json']"):
        raw = script.string or script.get_text(" ", strip=True)
        if raw:
            for payload in parse_json_candidates(raw):
                article = find_article_payload(payload)
                if article:
                    return article
    return None
|
||||
|
||||
|
||||
def parse_json_candidates(raw: str) -> Iterable[dict | list]:
|
||||
try:
|
||||
data = json.loads(raw)
|
||||
yield data
|
||||
return
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
cleaned = re.sub(r"[\x00-\x1f]+", " ", raw).strip()
|
||||
try:
|
||||
data = json.loads(cleaned)
|
||||
yield data
|
||||
except json.JSONDecodeError:
|
||||
return
|
||||
|
||||
|
||||
def find_article_payload(payload: dict | list) -> dict | None:
|
||||
if isinstance(payload, list):
|
||||
for item in payload:
|
||||
found = find_article_payload(item)
|
||||
if found:
|
||||
return found
|
||||
return None
|
||||
if not isinstance(payload, dict):
|
||||
return None
|
||||
if "@graph" in payload:
|
||||
found = find_article_payload(payload["@graph"])
|
||||
if found:
|
||||
return found
|
||||
node_type = payload.get("@type")
|
||||
types = {node_type.lower()} if isinstance(node_type, str) else {
|
||||
item.lower() for item in node_type or [] if isinstance(item, str)
|
||||
}
|
||||
if types & JSON_ARTICLE_TYPES:
|
||||
return payload
|
||||
return None
|
||||
|
||||
|
||||
def apply_article_data(
    post: ScrapedPost,
    article: dict,
    soup: BeautifulSoup,
    options: ScrapeOptions,
) -> None:
    """Copy fields from a JSON-LD article node onto ``post`` in place.

    Each field keeps its current value when the article provides nothing
    for it. Author, categories and tags are applied only when the matching
    option is enabled.
    """
    headline = article.get("headline") or article.get("name")
    post.title = headline or post.title

    raw_date = article.get("datePublished") or article.get("dateCreated") or post.publish_date
    post.publish_date = normalize_date(raw_date)

    if options.include_author:
        post.author = extract_author_from_json_ld(article) or post.author
    if options.include_categories:
        post.categories = normalize_terms(article.get("articleSection")) or post.categories
    if options.include_tags:
        post.tags = normalize_terms(article.get("keywords")) or post.tags

    body = extract_body_from_article(article, soup)
    post.body_html = body or post.body_html
|
||||
|
||||
|
||||
def merge_fallback_data(post: ScrapedPost, soup: BeautifulSoup, options: ScrapeOptions) -> None:
    """Fill fields still missing on ``post`` using page-structure heuristics.

    Title, publish date, author, body and tags are extracted only when
    empty. Categories are always merged with those found in the page (plus
    the Drupal department categories on Drupal sites). Author, categories
    and tags respect their ``options`` toggles.
    """
    post.title = post.title or extract_title(soup)
    post.publish_date = post.publish_date or extract_date(soup, post.cms)
    if options.include_author:
        post.author = post.author or extract_author(soup)
    post.body_html = post.body_html or extract_body(soup)

    if options.include_categories:
        post.categories = merge_terms(post.categories, extract_terms(soup, CATEGORY_SELECTORS))
        if post.cms == "drupal":
            post.categories = merge_terms(post.categories, extract_drupal_department_categories(soup))

    if options.include_tags:
        post.tags = post.tags or extract_terms(soup, TAG_SELECTORS)
|
||||
|
||||
|
||||
def extract_title(soup: BeautifulSoup) -> str:
    """Return the page title from ``og:title``, a heading, or ``<title>``, in that order.

    Every source is passed through ``clean_text`` for consistent output, and
    a source that cleans to an empty string is skipped so a blank
    ``og:title`` or an empty ``<h1>`` does not hide a usable title further
    down the list. Returns ``""`` when nothing is found.
    """
    og_title = meta_content(soup, "meta", {"property": "og:title"})
    if og_title:
        cleaned = clean_text(og_title)
        if cleaned:
            return cleaned

    for selector in ("article h1", "h1.entry-title", "h1.page-title", "h1.title", "h1"):
        node = soup.select_one(selector)
        if not node:
            continue
        heading = clean_text(node.get_text(" ", strip=True))
        if heading:
            return heading

    return clean_text(soup.title.get_text(" ", strip=True)) if soup.title else ""
|
||||
|
||||
|
||||
def extract_date(soup: BeautifulSoup, cms: str = "unknown") -> str:
    """Return the first normalisable publish date found in the page, or ``""``.

    For each selector in ``DATE_SELECTORS`` the first matching element is
    read via its ``datetime`` attribute, then ``content`` attribute, then
    visible text. On Drupal pages a date next to the title is tried as a
    last resort.
    """
    for selector in DATE_SELECTORS:
        node = soup.select_one(selector)
        if node:
            raw = node.get("datetime") or node.get("content") or node.get_text(" ", strip=True)
            normalized = normalize_date(raw)
            if normalized:
                return normalized

    return extract_drupal_title_adjacent_date(soup) if cms == "drupal" else ""
|
||||
|
||||
|
||||
def extract_author(soup: BeautifulSoup) -> str:
    """Return the author name from the markup, or an empty string.

    The ``<meta name="author">`` tag wins; otherwise the text of the first
    node matching one of ``AUTHOR_SELECTORS`` is used.
    """
    meta_author = meta_content(soup, "meta", {"name": "author"})
    if meta_author:
        return clean_text(meta_author)

    byline = next((hit for hit in map(soup.select_one, AUTHOR_SELECTORS) if hit), None)
    if byline:
        return clean_text(byline.get_text(" ", strip=True))
    return ""
|
||||
|
||||
|
||||
def extract_body(soup: BeautifulSoup) -> str:
    """Return the inner HTML of the most plausible body container.

    Containers are probed in ``BODY_SELECTORS`` order. The first one whose
    visible text is at least 120 characters long is returned immediately.
    If none qualifies, the first shorter container that still has meaningful
    content is returned; otherwise an empty string.
    """
    weak_candidate = ""
    for container in filter(None, map(soup.select_one, BODY_SELECTORS)):
        # Work on a copy so stripping unwanted nodes does not alter ``soup``.
        working_copy = clone_tag(container)
        strip_unwanted(working_copy)
        markup = working_copy.decode_contents().strip()

        visible_text = BeautifulSoup(markup, "html.parser").get_text(" ", strip=True)
        if len(visible_text) >= 120:
            return markup

        if not weak_candidate and has_meaningful_body_content(markup):
            weak_candidate = markup

    return weak_candidate
|
||||
|
||||
|
||||
def extract_terms(soup: BeautifulSoup, selectors: list[str]) -> list[str]:
    """Collect the distinct, non-empty text of every node matching ``selectors``.

    Terms keep the order in which they are first encountered.
    """
    cleaned_texts = (
        clean_text(node.get_text(" ", strip=True))
        for selector in selectors
        for node in soup.select(selector)
    )
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(term for term in cleaned_texts if term))
|
||||
|
||||
|
||||
def extract_drupal_title_adjacent_date(soup: BeautifulSoup) -> str:
    """Look for a date placed right next to the page title.

    The siblings following the title node are scanned first. If none of them
    yields a date, the text of the nearest ``header``/``div``/``section``
    ancestor is tried, with a leading copy of the title removed. Returns an
    empty string when no date is found.
    """
    heading = find_title_node(soup)
    if not heading:
        return ""

    sibling_dates = (
        normalize_drupal_date(text_from_node(sibling))
        for sibling in heading.next_siblings
    )
    adjacent_date = next((value for value in sibling_dates if value), "")
    if adjacent_date:
        return adjacent_date

    wrapper = heading.find_parent(["header", "div", "section"])
    if not wrapper:
        return ""

    wrapper_text = clean_text(wrapper.get_text(" ", strip=True))
    heading_text = clean_text(heading.get_text(" ", strip=True))
    if heading_text and wrapper_text.startswith(heading_text):
        # Drop the title so digits inside it are not mistaken for the date.
        wrapper_text = clean_text(wrapper_text[len(heading_text):])

    return normalize_drupal_date(wrapper_text) or ""
|
||||
|
||||
|
||||
def extract_drupal_department_categories(soup: BeautifulSoup) -> list[str]:
    """Collect category names from "Department:" labels in the markup.

    Two passes are made:

    1. Text nodes consisting solely of ``Department:`` — the value is taken
       from the label's parent text, or failing that from the first following
       sibling of the parent that yields an acceptable value.
    2. ``p``/``li``/``span``/``dt``/``dd`` elements whose text starts with
       ``department:`` — the value is taken from that text.

    Returns the distinct accepted values in discovery order.
    """
    found: list[str] = []
    department_label = re.compile(r"^\s*Department:\s*$", re.IGNORECASE)

    for label in soup.find_all(string=department_label):
        holder = label.parent
        if not isinstance(holder, Tag):
            continue

        inline = normalize_department_category(
            extract_labeled_value(holder.get_text(" ", strip=True), "Department")
        )
        if inline:
            found = merge_terms(found, [inline])
            continue

        # Generator is lazy: siblings after the first usable one are not examined.
        sibling_values = (
            normalize_department_category(text_from_node(sibling))
            for sibling in holder.next_siblings
        )
        first_value = next((value for value in sibling_values if value), None)
        if first_value:
            found = merge_terms(found, [first_value])

    for element in soup.find_all(["p", "li", "span", "dt", "dd"]):
        element_text = clean_text(element.get_text(" ", strip=True))
        if element_text.lower().startswith("department:"):
            value = normalize_department_category(
                extract_labeled_value(element_text, "Department")
            )
            if value:
                found = merge_terms(found, [value])

    return found
|
||||
|
||||
|
||||
def extract_author_from_json_ld(article: dict) -> str:
    """Return the author name(s) declared in a JSON-LD article object.

    The ``author`` value may be a string, an object with a ``name`` key, or a
    list mixing both forms. Multiple names are joined with ``", "``.

    Args:
        article: Decoded JSON-LD article dictionary.

    Returns:
        The cleaned author name(s), or an empty string when none is usable.
    """
    author = article.get("author")
    entries = author if isinstance(author, list) else [author]

    names: list[str] = []
    for entry in entries:
        # Objects carry the name under "name"; plain strings are the name itself.
        name = entry.get("name") if isinstance(entry, dict) else entry
        if not isinstance(name, str):
            # Skip missing or non-text names instead of failing inside clean_text.
            continue
        cleaned = clean_text(name)
        if cleaned:
            names.append(cleaned)

    return ", ".join(names)
|
||||
|
||||
|
||||
def extract_body_from_article(article: dict, soup: BeautifulSoup) -> str:
    """Return body HTML from JSON-LD ``articleBody`` or, failing that, the markup.

    ``articleBody`` is used only when it is a string longer than 120
    characters after trimming; it is entity-unescaped and wrapped in a single
    ``<p>`` element. Otherwise the body is extracted from ``soup``.
    """
    raw_body = article.get("articleBody")
    if not isinstance(raw_body, str):
        return extract_body(soup)

    trimmed = raw_body.strip()
    if len(trimmed) <= 120:
        return extract_body(soup)

    # NOTE(review): the unescaped text is embedded as-is, so entity-encoded
    # markup in articleBody becomes live HTML — confirm this is intended.
    return f"<p>{unescape(trimmed)}</p>"
|
||||
|
||||
|
||||
def normalize_terms(value: object) -> list[str]:
    """Turn a JSON-LD term value into a list of distinct, cleaned terms.

    Args:
        value: Either a delimited string (split on ``,``, ``|`` or ``>``) or a
            list whose string items are the terms. Anything else is ignored.

    Returns:
        The non-empty cleaned terms in first-seen order, without duplicates.
    """
    if isinstance(value, str):
        candidates = re.split(r"[,|>]", value)
    elif isinstance(value, list):
        candidates = [item for item in value if isinstance(item, str)]
    else:
        return []

    # Both input shapes share one clean-and-dedupe pass so that a string such
    # as "news, news" yields the same result as ["news", "news"].
    terms: list[str] = []
    for candidate in candidates:
        cleaned = clean_text(candidate)
        if cleaned and cleaned not in terms:
            terms.append(cleaned)
    return terms
|
||||
|
||||
|
||||
def merge_terms(*groups: list[str]) -> list[str]:
    """Concatenate term lists into one list of distinct, cleaned terms.

    Terms are cleaned with ``clean_text``; empty results are dropped and the
    first occurrence of each term determines its position.
    """
    cleaned_terms = (clean_text(term) for group in groups for term in group)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(term for term in cleaned_terms if term))
|
||||
|
||||
|
||||
def normalize_drupal_date(value: str | None) -> str:
    """Normalize the first date-like fragment found in ``value``.

    The fragment is located with ``DRUPAL_TITLE_DATE_PATTERN`` and passed to
    ``normalize_date``. Returns an empty string for empty input or when the
    pattern does not match.
    """
    found = DRUPAL_TITLE_DATE_PATTERN.search(value) if value else None
    if found is None:
        return ""
    return normalize_date(found.group(0))
|
||||
|
||||
|
||||
def meta_content(soup: BeautifulSoup, tag_name: str, attrs: dict[str, str]) -> str:
    """Return the stripped ``content`` attribute of the first matching tag.

    Returns an empty string when no tag matches or its ``content`` is empty.
    """
    match = soup.find(tag_name, attrs=attrs)
    content = match.get("content") if match else None
    return content.strip() if content else ""
|
||||
|
||||
|
||||
def clean_text(value: str) -> str:
    """Collapse every whitespace run to one space and trim both ends.

    A falsy ``value`` (including ``None``) yields an empty string.
    """
    # str.split() with no argument splits on whitespace runs and ignores
    # leading/trailing whitespace, so re-joining gives the collapsed form.
    return " ".join((value or "").split())
|
||||
|
||||
|
||||
def text_from_node(node: object) -> str:
    """Return the cleaned text of a BeautifulSoup string or tag.

    Any other kind of object yields an empty string.
    """
    if isinstance(node, NavigableString):
        raw_text = str(node)
    elif isinstance(node, Tag):
        raw_text = node.get_text(" ", strip=True)
    else:
        return ""
    return clean_text(raw_text)
|
||||
|
||||
|
||||
def sanitize_html(html: str) -> str:
    """Return ``html`` with unwanted elements and dangerous attributes removed.

    Empty input yields an empty string. The markup is re-parsed, cleaned by
    ``strip_unwanted`` and ``strip_dangerous_attributes``, then serialized.
    """
    if not html:
        return ""

    document = BeautifulSoup(html, "html.parser")
    for scrub in (strip_unwanted, strip_dangerous_attributes):
        scrub(document)
    return document.decode_contents().strip()
|
||||
|
||||
|
||||
def has_meaningful_body_content(html: str) -> bool:
    """Tell whether ``html`` holds visible text or media/link markup.

    Returns ``False`` for empty input. Otherwise the markup counts as
    meaningful when it has any visible text, or when it contains an image,
    anchor, embed, or object tag.
    """
    if not html:
        return False

    if BeautifulSoup(html, "html.parser").get_text(" ", strip=True):
        return True

    lowered_markup = html.lower()
    return any(marker in lowered_markup for marker in ("<img", "<a ", "<embed", "<object"))
|
||||
|
||||
|
||||
def strip_unwanted(node: BeautifulSoup | Tag) -> None:
    """Remove scripts, styling, forms, navigation, and share widgets in place."""
    unwanted_selectors = (
        "script",
        "style",
        "noscript",
        "iframe",
        "form",
        "nav",
        ".share",
        ".social-share",
    )
    for selector in unwanted_selectors:
        for match in node.select(selector):
            match.decompose()
|
||||
|
||||
|
||||
def strip_dangerous_attributes(node: BeautifulSoup | Tag) -> None:
    """Delete attributes that can execute script from every descendant tag.

    Removed in place:

    * every ``on*`` event-handler attribute and ``srcdoc``;
    * URL-bearing attributes (``href``, ``src``, ``action``, ``formaction``,
      ``xlink:href``) whose value uses a ``javascript:``, ``vbscript:`` or
      ``data:text/html`` scheme.

    Args:
        node: Parsed document or tag whose descendants are cleaned.
    """
    url_attributes = {"href", "src", "action", "formaction", "xlink:href"}
    blocked_schemes = ("javascript:", "vbscript:", "data:text/html")

    for child in node.find_all(True):
        # Iterate over a snapshot because attributes are deleted while looping.
        for attr_name in list(child.attrs):
            normalized_name = attr_name.lower()

            if normalized_name.startswith("on") or normalized_name == "srcdoc":
                del child.attrs[attr_name]
                continue

            if normalized_name not in url_attributes:
                continue

            raw_value = child.attrs.get(attr_name)
            if isinstance(raw_value, list):
                candidate = " ".join(str(item) for item in raw_value)
            else:
                candidate = str(raw_value or "")

            # Browsers ignore whitespace and control characters inside a URL
            # scheme, so "java\tscript:" still runs. Remove them before the
            # scheme comparison rather than trimming only the ends.
            compact = re.sub(r"[\x00-\x20\x7f]+", "", candidate).lower()
            if compact.startswith(blocked_schemes):
                del child.attrs[attr_name]
|
||||
|
||||
|
||||
def clone_tag(node: Tag) -> BeautifulSoup:
    """Return an independent copy of ``node`` by re-parsing its serialized HTML."""
    serialized = str(node)
    return BeautifulSoup(serialized, "html.parser")
|
||||
|
||||
|
||||
def find_title_node(soup: BeautifulSoup) -> Tag | None:
    """Return the first heading matched by the title selectors, or ``None``.

    Selectors are tried from most to least specific, ending with a bare ``h1``.
    """
    title_selectors = ("article h1", "h1.entry-title", "h1.page-title", "h1.title", "h1")
    # map() is lazy, so selectors after the first hit are never evaluated.
    return next((hit for hit in map(soup.select_one, title_selectors) if hit), None)
|
||||
|
||||
|
||||
def extract_labeled_value(text: str, label: str) -> str:
    """Return the value that follows ``"<label>:"`` inside ``text``.

    Matching is case-insensitive and runs on the whitespace-collapsed text.
    The value stops before the next ``Word:`` style label or at the end of
    the text. Returns an empty string when the label is absent or ``text``
    is empty.
    """
    if not text:
        return ""

    found = re.search(
        rf"{re.escape(label)}:\s*(.+?)(?=\s+(?:[A-Z][a-z]+:)|\s{{2,}}|$)",
        clean_text(text),
        re.IGNORECASE,
    )
    return clean_text(found.group(1)) if found else ""
|
||||
|
||||
|
||||
def normalize_department_category(value: str) -> str:
    """Return the cleaned department name, or ``""`` when it looks implausible.

    A value is rejected when it is empty, longer than 80 characters, longer
    than 8 words, or contains contact/address markers such as an e-mail
    ``@`` sign or a URL scheme.
    """
    cleaned = clean_text(value)
    if not cleaned:
        return ""

    oversized = len(cleaned) > 80 or len(cleaned.split()) > 8
    if oversized:
        return ""

    lowered = cleaned.lower()
    rejected_markers = ("p.o. box", "contact us", "@", "http://", "https://")
    if any(marker in lowered for marker in rejected_markers):
        return ""
    return cleaned
|
||||
|
||||
|
||||
def format_error_summary(
    url: str,
    exc: Exception,
    response: requests.Response | None,
    timeout_seconds: int,
) -> str:
    """Build a one-line, human-readable description of a fetch failure.

    Args:
        url: The URL that was requested.
        exc: The exception raised while fetching.
        response: The response obtained before the failure, if any.
        timeout_seconds: The timeout that was in effect, used in the message
            for timeout errors.

    Returns:
        A message tailored to HTTP errors, timeouts, other request errors, or
        a generic ``"<Type>: <message>"`` for anything else.
    """
    if isinstance(exc, requests.HTTPError):
        # A requests.Response is falsy for 4xx/5xx statuses, so `a or b` would
        # throw away exactly the error response we want. Compare to None.
        failing_response = exc.response if exc.response is not None else response
        if failing_response is not None:
            return (
                f"HTTP {failing_response.status_code} {failing_response.reason} "
                f"while fetching {failing_response.url or url}"
            )

    if isinstance(exc, requests.Timeout):
        return f"Request timed out after {timeout_seconds}s while fetching {url}"

    if isinstance(exc, requests.RequestException):
        return f"{type(exc).__name__} while fetching {url}: {exc}"

    return f"{type(exc).__name__}: {exc}"
|
||||
|
||||
|
||||
def format_error_details(
    url: str,
    exc: Exception,
    response: requests.Response | None,
) -> str:
    """Build a multi-line diagnostic report for a fetch failure.

    Args:
        url: The URL that was requested.
        exc: The exception raised while fetching.
        response: The response obtained before the failure, if any.

    Returns:
        Newline-joined lines holding the URL, error type, and message, then
        the HTTP status and resolved URL when a response is available, and
        finally the formatted exception.
    """
    details = [
        f"URL: {url}",
        f"Error Type: {type(exc).__name__}",
        f"Message: {exc}",
    ]

    # A requests.Response is falsy for 4xx/5xx statuses, so `a or b` would
    # skip the exception's own error response. Compare to None instead.
    failing_response = getattr(exc, "response", None)
    if failing_response is None:
        failing_response = response

    if failing_response is not None:
        details.extend(
            [
                f"HTTP Status: {failing_response.status_code} {failing_response.reason}",
                f"Resolved URL: {failing_response.url}",
            ]
        )

    trace = "".join(traceback.format_exception_only(type(exc), exc)).strip()
    if trace:
        details.append(f"Exception: {trace}")

    return "\n".join(details)
|
||||
@@ -0,0 +1,91 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from email.utils import format_datetime
|
||||
from io import StringIO
|
||||
from xml.sax.saxutils import escape
|
||||
import datetime as dt
|
||||
|
||||
from page_importer.dates import parse_datetime
|
||||
from page_importer.models import ScrapedPost
|
||||
|
||||
|
||||
def build_wxr(posts: list[ScrapedPost], channel_title: str = "Imported Content") -> str:
    """Serialize ``posts`` into a WordPress WXR 1.2 export document.

    Args:
        posts: Scraped posts to export, one ``<item>`` each.
        channel_title: Title written into the channel header.

    Returns:
        The complete XML document as a string. The current UTC time is used
        for the channel ``pubDate`` and as the fallback item ``pubDate``.
    """
    generated_at = dt.datetime.now(dt.timezone.utc)

    chunks: list[str] = [
        '<?xml version="1.0" encoding="UTF-8" ?>\n',
        '<rss version="2.0" xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/" '
        'xmlns:content="http://purl.org/rss/1.0/modules/content/" '
        'xmlns:wfw="http://wellformedweb.org/CommentAPI/" '
        'xmlns:dc="http://purl.org/dc/elements/1.1/" '
        'xmlns:wp="http://wordpress.org/export/1.2/">\n',
        "<channel>\n",
        f"<title>{escape(channel_title)}</title>\n",
        "<link>http://localhost/</link>\n",
        "<description>Generated by Page Importer</description>\n",
        f"<pubDate>{format_datetime(generated_at)}</pubDate>\n",
        "<language>en-US</language>\n",
        "<wp:wxr_version>1.2</wp:wxr_version>\n",
    ]

    for post in posts:
        local_date, gmt_date, item_pub_date = _resolve_post_dates(post.publish_date, generated_at)
        chunks.extend(
            [
                "<item>\n",
                f"<title>{escape(post.title)}</title>\n",
                f"<link>{escape(post.source_url)}</link>\n",
                f"<pubDate>{format_datetime(item_pub_date)}</pubDate>\n",
                f"<dc:creator>{cdata(post.author or 'importer')}</dc:creator>\n",
                f"<guid isPermaLink=\"false\">{escape(post.source_url)}</guid>\n",
                "<description></description>\n",
                f"<content:encoded>{cdata(post.body_html)}</content:encoded>\n",
                f"<excerpt:encoded>{cdata('')}</excerpt:encoded>\n",
                f"<wp:post_date>{cdata(local_date)}</wp:post_date>\n",
                f"<wp:post_date_gmt>{cdata(gmt_date)}</wp:post_date_gmt>\n",
                "<wp:comment_status><![CDATA[closed]]></wp:comment_status>\n",
                "<wp:ping_status><![CDATA[closed]]></wp:ping_status>\n",
                "<wp:post_name><![CDATA[]]></wp:post_name>\n",
                f"<wp:status>{cdata(post.status)}</wp:status>\n",
                "<wp:post_parent>0</wp:post_parent>\n",
                "<wp:menu_order>0</wp:menu_order>\n",
                f"<wp:post_type>{cdata(post.post_type or 'post')}</wp:post_type>\n",
                "<wp:post_password><![CDATA[]]></wp:post_password>\n",
                "<wp:is_sticky>0</wp:is_sticky>\n",
            ]
        )
        # Categories and tags share the <category> element; "domain" tells them apart.
        chunks.extend(
            f'<category domain="category" nicename="{escape(slugify(category))}">{cdata(category)}</category>\n'
            for category in post.categories
        )
        chunks.extend(
            f'<category domain="post_tag" nicename="{escape(slugify(tag))}">{cdata(tag)}</category>\n'
            for tag in post.tags
        )
        chunks.append("</item>\n")

    chunks.append("</channel>\n</rss>\n")
    return "".join(chunks)
|
||||
|
||||
|
||||
def slugify(value: str) -> str:
    """Return a lowercase, dash-separated slug for ``value``.

    Every run of non-alphanumeric characters becomes a single dash, and
    leading/trailing separators are dropped, so ``"News & Events"`` yields
    ``"news-events"`` rather than ``"news---events"``.

    Args:
        value: Term name to slugify; a falsy value yields an empty string.

    Returns:
        The slug, possibly empty when ``value`` has no alphanumeric characters.
    """
    # Map separators to spaces, then let split() collapse each run and trim
    # the ends in one pass.
    spaced = "".join(ch.lower() if ch.isalnum() else " " for ch in (value or ""))
    return "-".join(spaced.split())
|
||||
|
||||
|
||||
def cdata(value: str) -> str:
    """Wrap ``value`` in a CDATA section.

    Any ``]]>`` inside the text is split across two adjacent CDATA sections
    so it cannot terminate the section early. A falsy ``value`` produces an
    empty section.
    """
    text = value or ""
    protected = "]]]]><![CDATA[>".join(text.split("]]>"))
    return "<![CDATA[" + protected + "]]>"
|
||||
|
||||
|
||||
def _resolve_post_dates(value: str, fallback: dt.datetime) -> tuple[str, str, dt.datetime]:
    """Derive the local date, GMT date, and ``pubDate`` value for a post.

    Args:
        value: The post's publish date string.
        fallback: Datetime to use for ``pubDate`` when ``value`` cannot be parsed.

    Returns:
        ``(local_date, gmt_date, pub_date)``. Both date strings are empty when
        parsing fails. A timestamp without a usable UTC offset is treated as
        UTC, so its local and GMT strings are identical.
    """
    parsed = parse_datetime(value)
    if parsed is None:
        return "", "", fallback

    local_text = _format_wp_date(parsed)
    has_offset = parsed.tzinfo is not None and parsed.utcoffset() is not None
    if not has_offset:
        return local_text, local_text, parsed.replace(tzinfo=dt.timezone.utc)

    in_utc = parsed.astimezone(dt.timezone.utc)
    return local_text, _format_wp_date(in_utc), in_utc
|
||||
|
||||
|
||||
def _format_wp_date(value: dt.datetime) -> str:
|
||||
return value.replace(tzinfo=None).strftime("%Y-%m-%d %H:%M:%S")
|
||||
@@ -0,0 +1,4 @@
|
||||
streamlit>=1.43,<2
|
||||
requests>=2.32,<3
|
||||
beautifulsoup4>=4.12,<5
|
||||
python-dateutil>=2.9,<3
|
||||
@@ -0,0 +1,79 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import unittest
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from page_importer.dates import normalize_date
|
||||
from page_importer.scraper import CATEGORY_SELECTORS, TAG_SELECTORS, extract_terms, sanitize_html
|
||||
from page_importer.wxr import build_wxr
|
||||
from page_importer.models import ScrapedPost
|
||||
|
||||
|
||||
class DateNormalizationTests(unittest.TestCase):
    """Checks for ``normalize_date``."""

    def test_preserves_timezone_offset_in_normalized_value(self) -> None:
        # The UTC offset must survive normalization instead of being dropped.
        normalized = normalize_date("2024-05-01T09:30:00-07:00")
        self.assertEqual(normalized, "2024-05-01 09:30:00-07:00")
|
||||
|
||||
|
||||
class WxrSerializationTests(unittest.TestCase):
    """Checks for the XML produced by ``build_wxr``."""

    def test_writes_local_and_gmt_dates_from_offset_timestamp(self) -> None:
        post = ScrapedPost(
            source_url="https://example.com/post",
            title="Example",
            body_html="<p>Body</p>",
            publish_date="2024-05-01 09:30:00-07:00",
            success=True,
        )

        xml = build_wxr([post])

        # Local time is kept as written; GMT and pubDate are shifted by the offset.
        expected_fragments = (
            "<wp:post_date><![CDATA[2024-05-01 09:30:00]]></wp:post_date>",
            "<wp:post_date_gmt><![CDATA[2024-05-01 16:30:00]]></wp:post_date_gmt>",
            "<pubDate>Wed, 01 May 2024 16:30:00 +0000</pubDate>",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, xml)

    def test_splits_cdata_terminators_in_content(self) -> None:
        post = ScrapedPost(
            source_url="https://example.com/post",
            title="Example",
            body_html="<p>alpha ]]> omega</p>",
            author="Jane ]]> Doe",
            success=True,
        )

        xml = build_wxr([post])

        # A literal "]]>" must be split across two CDATA sections.
        for fragment in ("alpha ]]]]><![CDATA[> omega", "Jane ]]]]><![CDATA[> Doe"):
            self.assertIn(fragment, xml)
|
||||
|
||||
|
||||
class HtmlSanitizationTests(unittest.TestCase):
    """Checks for ``sanitize_html``."""

    def test_removes_inline_event_handlers_and_script_uris(self) -> None:
        hostile_markup = (
            '<div onclick="alert(1)"><a href="javascript:alert(1)">x</a><img src="x" onerror="alert(1)"></div>'
        )

        sanitized = sanitize_html(hostile_markup)

        for forbidden in ("onclick", "onerror", "javascript:"):
            self.assertNotIn(forbidden, sanitized)
|
||||
|
||||
|
||||
class TaxonomySelectorTests(unittest.TestCase):
    """Checks that category and tag selectors do not overlap."""

    def test_drupal_tag_field_is_not_treated_as_category(self) -> None:
        markup = '<div class="field--name-field-tags"><a href="/tags/example">Example Tag</a></div>'
        soup = BeautifulSoup(markup, "html.parser")

        categories = extract_terms(soup, CATEGORY_SELECTORS)
        tags = extract_terms(soup, TAG_SELECTORS)

        self.assertEqual(categories, [])
        self.assertEqual(tags, ["Example Tag"])
|
||||
|
||||
|
||||
# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -0,0 +1,110 @@
|
||||
# WDW Sitemap And Import Tools
|
||||
|
||||
This repository combines two internal tools into one web application and one Docker image:
|
||||
|
||||
- `Sitemap Generator`
|
||||
- `Page Importer`
|
||||
|
||||
The application uses Streamlit and presents both tools behind a single URL with two tabs at the top of the page.
|
||||
|
||||
## What It Does
|
||||
|
||||
### Sitemap Generator
|
||||
|
||||
- Crawls a site from a starting URL
|
||||
- Discovers URLs from page links and XML sitemaps
|
||||
- Exports a sitemap CSV
|
||||
- Saves crawl state and logs so a crawl can be resumed later
|
||||
|
||||
### Page Importer
|
||||
|
||||
- Reads a CSV of submitted URLs
|
||||
- Scrapes page content
|
||||
- Lets you review the extracted content
|
||||
- Exports a WordPress WXR XML import file
|
||||
|
||||
## Project Layout
|
||||
|
||||
- `app.py`: top-level Streamlit app with both tabs
|
||||
- `requirements.txt`: shared Python dependencies for the combined app
|
||||
- `Dockerfile`: single image for the combined tool
|
||||
- `.gitea/workflows/docker-image.yml`: Gitea Actions workflow for Docker builds
|
||||
- `Sitemap Builder/`: sitemap crawler logic
|
||||
- `Page Importer/`: WordPress import logic
|
||||
|
||||
## Run Locally
|
||||
|
||||
### Linux or macOS
|
||||
|
||||
```bash
|
||||
python3 -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
streamlit run app.py
|
||||
```
|
||||
|
||||
### Windows PowerShell
|
||||
|
||||
```powershell
|
||||
python -m venv .venv
|
||||
.venv\Scripts\Activate.ps1
|
||||
pip install -r requirements.txt
|
||||
streamlit run app.py
|
||||
```
|
||||
|
||||
Then open:
|
||||
|
||||
```text
|
||||
http://localhost:8501
|
||||
```
|
||||
|
||||
## Docker
|
||||
|
||||
Build the image:
|
||||
|
||||
```bash
|
||||
docker build -t wdw-sitemap-and-importer .
|
||||
```
|
||||
|
||||
Run the container:
|
||||
|
||||
```bash
|
||||
docker run --rm -p 8501:8501 -v wdw-tools-data:/data wdw-sitemap-and-importer
|
||||
```
|
||||
|
||||
Then open:
|
||||
|
||||
```text
|
||||
http://localhost:8501
|
||||
```
|
||||
|
||||
The mounted `/data` volume stores sitemap CSV files, crawl state files, and crawl logs so sitemap jobs can survive container restarts.
|
||||
|
||||
## Gitea Automation
|
||||
|
||||
The workflow file is:
|
||||
|
||||
```text
|
||||
.gitea/workflows/docker-image.yml
|
||||
```
|
||||
|
||||
It runs on pushes to `main` and on manual workflow dispatch.
|
||||
|
||||
The workflow always builds the Docker image. If these secrets are configured in Gitea, it also logs in and pushes the image to your registry:
|
||||
|
||||
- `REGISTRY_URL`
|
||||
- `REGISTRY_USERNAME`
|
||||
- `REGISTRY_PASSWORD`
|
||||
|
||||
Published tags:
|
||||
|
||||
- `${REGISTRY}/wdw-sitemap-and-importer:<commit-sha>`
|
||||
- `${REGISTRY}/wdw-sitemap-and-importer:latest`
|
||||
|
||||
If the registry secrets are not configured, the workflow still performs the build as validation but skips the push steps.
|
||||
|
||||
## Notes
|
||||
|
||||
- Sitemap output files are written under `/data` in Docker.
|
||||
- The sitemap crawler can resume previous runs when a matching crawl state file exists.
|
||||
- The importer keeps its existing scraping and WordPress export behavior, but it now runs inside the shared interface instead of as a separate app.
|
||||
@@ -0,0 +1,80 @@
|
||||
# Sitemap Builder
|
||||
|
||||
This folder contains the sitemap crawler used by the combined web application in the repository root.
|
||||
|
||||
The crawler can still be used directly from Python, but the primary supported experience is now the shared Streamlit interface in the root project:
|
||||
|
||||
```text
|
||||
../app.py
|
||||
```
|
||||
|
||||
## Current Role In The Combined App
|
||||
|
||||
The root application uses this module to:
|
||||
|
||||
- crawl a site from a submitted starting URL
|
||||
- discover internal URLs from HTML links and XML sitemaps
|
||||
- export a sitemap CSV
|
||||
- save crawl state and crawl logs for resume support
|
||||
|
||||
## Output
|
||||
|
||||
The crawler writes:
|
||||
|
||||
- a CSV file
|
||||
- a sidecar crawl state file ending in `.crawlstate.json`
|
||||
- a crawl log file ending in `.crawl.log`
|
||||
|
||||
The CSV contains these columns:
|
||||
|
||||
- `URL`
|
||||
- `Title`
|
||||
- `Canonical URL`
|
||||
- `Type`
|
||||
|
||||
## Standalone CLI Usage
|
||||
|
||||
Interactive mode:
|
||||
|
||||
```bash
|
||||
python3 sitemap_builder.py
|
||||
```
|
||||
|
||||
Command line mode:
|
||||
|
||||
```bash
|
||||
python3 sitemap_builder.py https://example.com -o ./sitemap.csv
|
||||
```
|
||||
|
||||
On Windows:
|
||||
|
||||
```powershell
|
||||
python .\sitemap_builder.py https://example.com -o .\sitemap.csv
|
||||
```
|
||||
|
||||
## Useful Options
|
||||
|
||||
```bash
|
||||
python3 sitemap_builder.py https://example.com --max-pages 20000 --delay 0.25 --include-subdomains
|
||||
```
|
||||
|
||||
- `--max-pages`: stop after the given number of visited pages. Default: `10000`
|
||||
- `--delay`: wait between requests to reduce load on the site
|
||||
- `--timeout`: request timeout in seconds
|
||||
- `--include-subdomains`: crawl subdomains of the starting host
|
||||
- `--include-documents`: include document links such as PDF, CSV, DOC, DOCX, XLSX, and similar files
|
||||
- `--workers`: number of worker threads to use. Set `1` to disable multithreading
|
||||
- `--save-every`: save progress after every N pages. Default: `25`
|
||||
- `--resume`: resume from an existing state file
|
||||
- `--fresh`: ignore the existing state file and start over
|
||||
|
||||
## Discovery And Behavior
|
||||
|
||||
- The crawler checks `robots.txt` for sitemap references and also tries `/sitemap.xml`
|
||||
- XML sitemap URLs are added to the crawl queue before page crawling begins
|
||||
- HTML pages store page title and canonical URL in the CSV when available
|
||||
- On Windows CLI runs, `P` pauses, `R` resumes, and `Q` stops cleanly and saves progress
|
||||
|
||||
## Recommendation
|
||||
|
||||
For normal use, run the root application or Docker container instead of calling this script directly. That is now the intended user interface for this repository.
|
||||
@@ -0,0 +1,947 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import xml.etree.ElementTree as ET
|
||||
from collections import deque
|
||||
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
|
||||
from dataclasses import dataclass
|
||||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from urllib.error import HTTPError, URLError
|
||||
from urllib.parse import urljoin, urlsplit, urlunsplit
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
if os.name == "nt":
|
||||
import msvcrt
|
||||
|
||||
|
||||
# User-Agent string; presumably the default sent with crawler requests — TODO confirm against callers.
DEFAULT_USER_AGENT = "SitemapBuilder/1.0 (+local script)"
# Presumably the default CSV file name when no output path is given — TODO confirm.
DEFAULT_OUTPUT_NAME = "sitemap.csv"
# Appended to the output file's suffix by get_state_path() to name the crawl state file.
DEFAULT_STATE_SUFFIX = ".crawlstate.json"
# Appended to the output file's suffix by get_log_path() to name the crawl log file.
DEFAULT_LOG_SUFFIX = ".crawl.log"
# Presumably the default page limit for a crawl — TODO confirm against the argument parser.
DEFAULT_MAX_PAGES = 10000
# NOTE(review): looks like the extra page budget granted when resuming — confirm against callers.
DEFAULT_RESUME_PAGE_INCREMENT = 10000
# Presumably the default number of pages between progress saves — TODO confirm.
DEFAULT_SAVE_EVERY = 25
# Presumably the default worker thread count — TODO confirm.
DEFAULT_WORKERS = 8
# Directory that contains this script.
SCRIPT_DIR = Path(__file__).resolve().parent
# Lowercase file extensions that is_document_url() treats as documents.
DOCUMENT_EXTENSIONS = {
    ".pdf",
    ".csv",
    ".doc",
    ".docx",
    ".xls",
    ".xlsx",
    ".ppt",
    ".pptx",
    ".txt",
    ".rtf",
    ".zip",
    ".xml",
    ".json",
}
|
||||
|
||||
|
||||
@dataclass
class CrawlResult:
    """Result record for a single crawled URL."""

    # URL this result refers to.
    url: str
    # Link targets associated with the URL; presumably the hrefs found on the page — TODO confirm.
    links: list[str]
    # Page title, empty when none is known.
    title: str = ""
    # Canonical URL, empty when none is known.
    canonical_url: str = ""
    # NOTE(review): presumably set when the URL was not processed — confirm against the producer.
    skipped: bool = False
    # Error description, or None when there is none.
    error: str | None = None
|
||||
|
||||
|
||||
@dataclass
class CrawlState:
    """Crawl progress that save_state() writes to JSON and load_state() restores."""

    # Start URL; initialize_state() stores it in normalized form.
    start_url: str
    # Crawl options recorded with the state.
    include_subdomains: bool
    include_documents: bool
    # URL sets; presumably pages already processed and pages ever enqueued — TODO confirm.
    visited: set[str]
    queued: set[str]
    # Pending URLs in processing order.
    queue: deque[str]
    # Per-URL record with "title", "canonical_url", and "type" keys (see register_record()).
    records: dict[str, dict[str, str]]
    # Alias URL -> target URL mapping followed by resolve_alias().
    alias_to_canonical: dict[str, str]
    # Collected error entries; the entry schema is not visible here.
    errors: list[dict[str, str]]
    # Counters persisted with the state.
    skipped_count: int
    discovered_from_sitemaps: int
|
||||
|
||||
|
||||
@dataclass
class RuntimeControl:
    """Mutable flags for controlling a running crawl; both start out False."""

    # NOTE(review): presumably toggled by the pause/resume keys — confirm against the crawl loop.
    paused: bool = False
    # NOTE(review): presumably set when the user asks the crawl to stop — confirm.
    stop_requested: bool = False
|
||||
|
||||
|
||||
@dataclass
class CrawlRunResult:
    """Summary of a crawl run: the crawl state plus the paths and limits involved."""

    # Crawl state associated with the run.
    state: CrawlState
    # NOTE(review): presumably True when the run ended on a user stop request — confirm.
    user_stopped: bool
    # Locations of the CSV output, the state file, and the log file.
    output_path: Path
    state_path: Path
    log_path: Path
    # Page limit and worker count recorded for the run.
    max_pages: int
    workers: int
|
||||
|
||||
|
||||
class HTMLPageParser(HTMLParser):
    """Collect anchor hrefs, the page title, and the canonical link from HTML.

    Attributes:
        links: ``href`` values of ``<a>`` tags in document order.
        title_parts: Text chunks read from the first ``<title>`` element.
        in_title: True while the parser is inside that ``<title>`` element.
        canonical_href: ``href`` of the last ``<link rel="canonical">`` seen.
    """

    def __init__(self) -> None:
        super().__init__()
        self.links: list[str] = []
        self.title_parts: list[str] = []
        self.in_title = False
        self.canonical_href = ""
        # Set once the first <title> has closed so later <title> elements
        # (for example inside inline SVG) do not leak into the page title.
        self._title_captured = False

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Record links, title start, and canonical hrefs for a start tag."""
        attrs_map = {key.lower(): value for key, value in attrs}
        lower_tag = tag.lower()

        if lower_tag == "a":
            href = attrs_map.get("href")
            if href:
                self.links.append(href)
        elif lower_tag == "title":
            if not self._title_captured:
                self.in_title = True
        elif lower_tag == "link":
            rel = (attrs_map.get("rel") or "").lower()
            href = attrs_map.get("href") or ""
            if "canonical" in rel and href:
                self.canonical_href = href

    def handle_endtag(self, tag: str) -> None:
        """Stop collecting title text when the first ``<title>`` closes."""
        if tag.lower() == "title" and self.in_title:
            self.in_title = False
            self._title_captured = True

    def handle_data(self, data: str) -> None:
        """Collect text that appears inside the page title."""
        if self.in_title:
            self.title_parts.append(data)

    @property
    def title(self) -> str:
        """The collected title chunks, stripped and joined with single spaces."""
        return " ".join(part.strip() for part in self.title_parts if part.strip()).strip()
|
||||
|
||||
|
||||
def normalize_url(url: str) -> str:
    """Return a canonical form of ``url`` for de-duplication.

    The scheme and host are lowercased, a missing scheme defaults to
    ``https``, trailing slashes are removed from non-root paths, an empty
    path becomes ``/``, and the fragment is dropped. The query string is
    kept unchanged.

    Args:
        url: Absolute URL, protocol-relative URL, or bare ``host/path`` input.

    Returns:
        The normalized URL string.
    """
    cleaned = url.strip()
    parts = urlsplit(cleaned)

    # Input such as "example.com/docs" parses with the host inside the path.
    # Re-parse it with the default scheme so the host lands in the netloc
    # instead of producing "https:///example.com/docs".
    if not parts.scheme and not parts.netloc and parts.path and not parts.path.startswith("/"):
        parts = urlsplit(f"https://{cleaned}")

    scheme = parts.scheme.lower() or "https"
    netloc = parts.netloc.lower()
    # A path made only of slashes must normalize to the root, not to "".
    path = parts.path.rstrip("/") or "/"

    return urlunsplit((scheme, netloc, path, parts.query, ""))
|
||||
|
||||
|
||||
def is_http_url(url: str) -> bool:
    """Tell whether ``url`` uses the ``http`` or ``https`` scheme."""
    scheme = urlsplit(url).scheme
    return scheme == "http" or scheme == "https"
|
||||
|
||||
|
||||
def build_allowed_hosts(start_url: str) -> set[str]:
    """Return a one-element set holding the lowercased host part of ``start_url``."""
    start_host = urlsplit(start_url).netloc.lower()
    return {start_host}
|
||||
|
||||
|
||||
def should_visit(url: str, allowed_hosts: set[str], include_subdomains: bool) -> bool:
    """Decide whether ``url`` is in scope for the crawl.

    Only ``http``/``https`` URLs qualify. The host must be one of
    ``allowed_hosts``; with ``include_subdomains`` any subdomain of an
    allowed host is accepted as well.
    """
    target = urlsplit(url)
    if target.scheme not in ("http", "https"):
        return False

    host = target.netloc.lower()
    if not include_subdomains:
        return host in allowed_hosts

    return any(
        host == allowed or host.endswith("." + allowed)
        for allowed in allowed_hosts
    )
|
||||
|
||||
|
||||
def is_document_url(url: str) -> bool:
    """Tell whether the URL path ends in one of ``DOCUMENT_EXTENSIONS``."""
    url_path = urlsplit(url).path
    extension = Path(url_path).suffix.lower()
    return extension in DOCUMENT_EXTENSIONS
|
||||
|
||||
|
||||
def should_record_url(url: str) -> bool:
    """Return ``False`` only for URLs whose whole query string is ``page=1``.

    The comparison ignores case.
    """
    return urlsplit(url).query.lower() != "page=1"
|
||||
|
||||
|
||||
def get_state_path(output_path: Path) -> Path:
    """Return the crawl state file path that sits next to ``output_path``.

    ``DEFAULT_STATE_SUFFIX`` is appended after the output file's own suffix.
    """
    combined_suffix = output_path.suffix + DEFAULT_STATE_SUFFIX
    return output_path.with_suffix(combined_suffix)
|
||||
|
||||
|
||||
def get_log_path(output_path: Path) -> Path:
    """Return the crawl log file path that sits next to ``output_path``.

    ``DEFAULT_LOG_SUFFIX`` is appended after the output file's own suffix.
    """
    combined_suffix = output_path.suffix + DEFAULT_LOG_SUFFIX
    return output_path.with_suffix(combined_suffix)
|
||||
|
||||
|
||||
def log_message(log_path: Path, message: str) -> None:
    """Append a timestamped line to the log file, creating directories as needed.

    Each line has the form ``[YYYY-MM-DD HH:MM:SS] message`` using local time.
    """
    log_path.parent.mkdir(parents=True, exist_ok=True)
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{stamp}] {message}\n"
    with log_path.open("a", encoding="utf-8") as handle:
        handle.write(entry)
|
||||
|
||||
|
||||
def resolve_alias(url: str, alias_to_canonical: dict[str, str]) -> str:
    """Follow the alias mapping from ``url`` until it ends or loops.

    Returns the last URL reached. When the mapping contains a cycle, the
    walk stops at the first URL that is visited a second time.
    """
    followed: set[str] = set()
    current = url
    while True:
        if current in followed or current not in alias_to_canonical:
            return current
        followed.add(current)
        current = alias_to_canonical[current]
|
||||
|
||||
|
||||
def register_record(
    state: CrawlState,
    url: str,
    record_type: str,
    title: str = "",
    canonical_url: str = "",
) -> None:
    """Create or update the record for ``url`` in ``state.records``.

    URLs rejected by ``should_record_url`` are ignored. Existing non-empty
    title and canonical values are never overwritten. A record's type is set
    when missing and upgraded from ``"document"`` to ``"page"``, never the
    other way round.
    """
    if not should_record_url(url):
        return

    record = state.records.get(url)
    if record is None:
        record = {"title": "", "canonical_url": "", "type": record_type}

    known_type = record.get("type")
    if not known_type or (known_type == "document" and record_type == "page"):
        record["type"] = record_type

    if title and not record.get("title"):
        record["title"] = title
    if canonical_url and not record.get("canonical_url"):
        record["canonical_url"] = canonical_url

    # Records restored from an older state file may lack these keys entirely.
    record.setdefault("canonical_url", canonical_url)
    record.setdefault("title", title)

    state.records[url] = record
|
||||
|
||||
|
||||
def save_state(state: CrawlState, state_path: Path, output_path: Path) -> None:
    """Persist the crawl state as JSON so the crawl can be resumed later.

    The file is written to a temporary sibling first and then moved over
    ``state_path``, so an interrupted save never leaves a truncated state
    file behind.

    Args:
        state: Crawl state to serialize.
        state_path: Destination of the JSON state file; parent directories
            are created when missing.
        output_path: CSV output path, recorded in the payload for reference.
    """
    state_path.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "start_url": state.start_url,
        "include_subdomains": state.include_subdomains,
        "include_documents": state.include_documents,
        # Sets are sorted so successive saves produce stable, diffable files.
        "visited": sorted(state.visited),
        "queued": sorted(state.queued),
        "queue": list(state.queue),
        "records": state.records,
        "alias_to_canonical": state.alias_to_canonical,
        "errors": state.errors,
        "skipped_count": state.skipped_count,
        "discovered_from_sitemaps": state.discovered_from_sitemaps,
        "saved_at": time.strftime("%Y-%m-%d %H:%M:%S"),
        "output_path": str(output_path),
    }

    temp_path = state_path.with_name(state_path.name + ".tmp")
    temp_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    # Path.replace overwrites the destination in a single rename step.
    temp_path.replace(state_path)
|
||||
|
||||
|
||||
def load_state(state_path: Path) -> CrawlState:
    """Rebuild a ``CrawlState`` from a JSON state file.

    ``start_url`` is required; every other field falls back to an empty or
    zero value when absent from the file.

    Raises:
        KeyError: If the file has no ``start_url`` entry.
    """
    raw_text = state_path.read_text(encoding="utf-8")
    data = json.loads(raw_text)

    def stored(key: str, default: object) -> object:
        # Missing keys fall back to the empty value for that field.
        return data.get(key, default)

    return CrawlState(
        start_url=data["start_url"],
        include_subdomains=bool(stored("include_subdomains", False)),
        include_documents=bool(stored("include_documents", False)),
        visited=set(stored("visited", [])),
        queued=set(stored("queued", [])),
        queue=deque(stored("queue", [])),
        records=dict(stored("records", {})),
        alias_to_canonical=dict(stored("alias_to_canonical", {})),
        errors=list(stored("errors", [])),
        skipped_count=int(stored("skipped_count", 0)),
        discovered_from_sitemaps=int(stored("discovered_from_sitemaps", 0)),
    )
|
||||
|
||||
|
||||
def initialize_state(start_url: str, include_subdomains: bool, include_documents: bool) -> CrawlState:
    """Create the state for a brand-new crawl.

    The normalized start URL is the only entry in both the queue and the
    ``queued`` set; every other collection starts empty and both counters
    start at zero.
    """
    seed_url = normalize_url(start_url)
    fresh_fields = {
        "start_url": seed_url,
        "include_subdomains": include_subdomains,
        "include_documents": include_documents,
        "visited": set(),
        "queued": {seed_url},
        "queue": deque([seed_url]),
        "records": {},
        "alias_to_canonical": {},
        "errors": [],
        "skipped_count": 0,
        "discovered_from_sitemaps": 0,
    }
    return CrawlState(**fresh_fields)
|
||||
|
||||
|
||||
def prompt_if_missing(value: str | None, prompt_text: str) -> str:
|
||||
if value:
|
||||
return value
|
||||
return input(prompt_text).strip()
|
||||
|
||||
|
||||
def prompt_yes_no(prompt_text: str, default: bool) -> bool:
    """Ask a yes/no question and return the answer as a boolean.

    An empty answer selects ``default``. Only ``y`` and ``yes`` (any case,
    surrounding whitespace ignored) count as yes; anything else is no.
    """
    hint = "y/N"
    if default:
        hint = "Y/n"
    reply = input(f"{prompt_text} [{hint}]: ").strip().lower()
    if reply == "":
        return default
    return reply == "y" or reply == "yes"
|
||||
|
||||
|
||||
def write_csv(records: dict[str, dict[str, str]], output_path: Path) -> None:
    """Write the collected records to ``output_path`` as a UTF-8 CSV.

    Rows are ordered by URL. Each row holds the URL followed by the record's
    ``title``, ``canonical_url`` and ``type`` values, with an empty string
    for any missing key. Parent directories are created when needed.
    """
    header = ["URL", "Title", "Canonical URL", "Type"]
    columns = ("title", "canonical_url", "type")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", newline="", encoding="utf-8") as handle:
        sheet = csv.writer(handle)
        sheet.writerow(header)
        ordered = ((url, records[url]) for url in sorted(records))
        sheet.writerows(
            [url, *(info.get(column, "") for column in columns)]
            for url, info in ordered
        )
|
||||
|
||||
|
||||
def fetch_text(url: str, timeout: float, user_agent: str, accept: str) -> tuple[str | None, str | None]:
|
||||
request = Request(url, headers={"User-Agent": user_agent, "Accept": accept})
|
||||
try:
|
||||
with urlopen(request, timeout=timeout) as response:
|
||||
return (
|
||||
response.read().decode(response.headers.get_content_charset() or "utf-8", errors="replace"),
|
||||
None,
|
||||
)
|
||||
except HTTPError as exc:
|
||||
return None, f"HTTP {exc.code}"
|
||||
except URLError as exc:
|
||||
return None, str(exc.reason)
|
||||
except TimeoutError:
|
||||
return None, "request timed out"
|
||||
except Exception as exc: # pragma: no cover
|
||||
return None, str(exc)
|
||||
|
||||
|
||||
def fetch_page(url: str, timeout: float, user_agent: str) -> CrawlResult:
    """Fetch one page and extract its links, title and canonical URL.

    Responses whose ``Content-Type`` is neither HTML nor XHTML are reported
    as skipped without reading the body. Any fetch failure is returned as a
    result with ``error`` set instead of being raised.
    """
    accept_header = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    request = Request(url, headers={"User-Agent": user_agent, "Accept": accept_header})

    def failed(reason: str) -> CrawlResult:
        return CrawlResult(url=url, links=[], error=reason)

    try:
        with urlopen(request, timeout=timeout) as response:
            content_type = response.headers.get("Content-Type", "").lower()
            is_html = "text/html" in content_type or "application/xhtml+xml" in content_type
            if not is_html:
                return CrawlResult(url=url, links=[], skipped=True)

            raw_body = response.read()
            charset = response.headers.get_content_charset() or "utf-8"
            content = raw_body.decode(charset, errors="replace")
    except HTTPError as exc:
        return failed(f"HTTP {exc.code}")
    except URLError as exc:
        return failed(str(exc.reason))
    except TimeoutError:
        return failed("request timed out")
    except Exception as exc:  # pragma: no cover
        return failed(str(exc))

    parser = HTMLPageParser()
    parser.feed(content)

    # A relative canonical href is resolved against the page URL.
    canonical_url = ""
    if parser.canonical_href:
        canonical_url = normalize_url(urljoin(url, parser.canonical_href))

    return CrawlResult(
        url=url,
        links=parser.links,
        title=parser.title,
        canonical_url=canonical_url,
    )
|
||||
|
||||
|
||||
def fetch_page_with_delay(url: str, timeout: float, user_agent: str, delay: float) -> CrawlResult:
    """Sleep for ``delay`` seconds when it is positive, then fetch ``url``."""
    wait_first = delay > 0
    if wait_first:
        time.sleep(delay)
    page = fetch_page(url, timeout=timeout, user_agent=user_agent)
    return page
|
||||
|
||||
|
||||
def print_progress(state: CrawlState, max_pages: int, current_url: str) -> None:
    """Print a one-line progress report for the URL about to be fetched.

    The line shows visited pages against the page limit, the number of
    recorded URLs and the number of URLs still waiting in the queue.
    """
    visited_count = len(state.visited)
    found_count = len(state.records)
    waiting_count = len(state.queue)
    summary = (
        f"[{visited_count}/{max_pages}] Found {found_count} URL(s), "
        f"queued {waiting_count} more: {current_url}"
    )
    print(summary)
|
||||
|
||||
|
||||
def poll_runtime_control(control: RuntimeControl, log_path: Path) -> None:
    """Apply pending pause/resume/stop key presses to ``control``.

    Does nothing unless ``os.name`` is ``"nt"``. Otherwise every buffered key
    is consumed: ``P`` pauses a running crawl, ``R`` resumes a paused one and
    ``Q`` requests a stop. Other keys, and ``P``/``R`` presses that would not
    change the paused flag, are ignored.
    """
    if os.name != "nt":
        return

    while msvcrt.kbhit():
        pressed = msvcrt.getwch().lower()
        if pressed == "q":
            control.stop_requested = True
            log_message(log_path, "Stop requested by user")
        elif pressed == "p":
            if not control.paused:
                control.paused = True
                print("Paused. Press R to resume or Q to stop.")
                log_message(log_path, "Crawl paused by user")
        elif pressed == "r":
            if control.paused:
                control.paused = False
                print("Resuming crawl.")
                log_message(log_path, "Crawl resumed by user")
|
||||
|
||||
|
||||
def discover_robots_sitemaps(
    start_url: str,
    timeout: float,
    user_agent: str,
    log_path: Path,
) -> set[str]:
    """Collect the sitemap URLs advertised in the site's ``robots.txt``.

    Args:
        start_url: Any URL on the site; ``/robots.txt`` is resolved against it.
        timeout: Request timeout in seconds.
        user_agent: ``User-Agent`` header sent with the request.
        log_path: Log file that receives progress and failure messages.

    Returns:
        The normalized URLs of all ``Sitemap:`` lines. The set is empty when
        ``robots.txt`` cannot be fetched or lists no sitemap.
    """
    robots_url = normalize_url(urljoin(start_url, "/robots.txt"))
    content, error = fetch_text(robots_url, timeout, user_agent, "text/plain,*/*;q=0.8")
    # ``content`` is also checked: a failure whose description is an empty
    # string would otherwise slip past ``if error`` and crash on ``None``.
    if error or content is None:
        log_message(log_path, f"robots.txt not available at {robots_url}: {error or 'empty response'}")
        return set()

    sitemap_urls: set[str] = set()
    for line in content.splitlines():
        # Tolerate indentation, a leading byte-order mark and whitespace
        # before the colon; the field name is matched case-insensitively.
        field, separator, value = line.strip().lstrip("\ufeff").partition(":")
        if not separator or field.strip().lower() != "sitemap":
            continue
        raw_url = value.strip()
        if raw_url:
            sitemap_urls.add(normalize_url(raw_url))

    if sitemap_urls:
        log_message(log_path, f"Discovered {len(sitemap_urls)} sitemap reference(s) from robots.txt")
    return sitemap_urls
|
||||
|
||||
|
||||
def xml_local_name(tag: str) -> str:
    """Return ``tag`` without its ``{namespace}`` prefix.

    Everything up to and including the last ``}`` is dropped; a tag without
    a ``}`` is returned unchanged.
    """
    _, _, local_name = tag.rpartition("}")
    return local_name
|
||||
|
||||
|
||||
def parse_sitemap_urls(
    sitemap_url: str,
    allowed_hosts: set[str],
    include_subdomains: bool,
    timeout: float,
    user_agent: str,
    log_path: Path,
    seen_sitemaps: set[str],
) -> set[str]:
    """Return the crawlable page URLs listed by a sitemap.

    A ``urlset`` document contributes its ``loc`` entries that pass
    ``should_visit``. A ``sitemapindex`` document is followed recursively,
    one child sitemap per ``loc`` entry. ``seen_sitemaps`` is updated in
    place so each sitemap is processed at most once. Fetch failures, parse
    failures and unknown root elements are logged and yield an empty set.
    """
    normalized_sitemap = normalize_url(sitemap_url)
    if normalized_sitemap in seen_sitemaps:
        return set()
    seen_sitemaps.add(normalized_sitemap)

    if not should_visit(normalized_sitemap, allowed_hosts, include_subdomains):
        return set()

    content, error = fetch_text(
        normalized_sitemap,
        timeout,
        user_agent,
        "application/xml,text/xml;q=0.9,*/*;q=0.8",
    )
    if error:
        log_message(log_path, f"Sitemap fetch failed for {normalized_sitemap}: {error}")
        return set()

    try:
        root = ET.fromstring(content)
    except ET.ParseError as exc:
        log_message(log_path, f"Sitemap parse failed for {normalized_sitemap}: {exc}")
        return set()

    def listed_locations():
        # Lazily yields each non-empty <loc> value, normalized, in document
        # order. The root element is never a <loc> in the handled formats.
        for node in root.iter():
            if xml_local_name(node.tag) == "loc" and node.text:
                yield normalize_url(node.text.strip())

    root_kind = xml_local_name(root.tag)
    discovered_urls: set[str] = set()

    if root_kind == "urlset":
        for page_url in listed_locations():
            if should_visit(page_url, allowed_hosts, include_subdomains):
                discovered_urls.add(page_url)
    elif root_kind == "sitemapindex":
        for child_sitemap in listed_locations():
            nested_urls = parse_sitemap_urls(
                child_sitemap,
                allowed_hosts,
                include_subdomains,
                timeout,
                user_agent,
                log_path,
                seen_sitemaps,
            )
            discovered_urls.update(nested_urls)
    else:
        log_message(log_path, f"Unsupported sitemap format at {normalized_sitemap}")

    return discovered_urls
|
||||
|
||||
|
||||
def seed_from_xml_sitemaps(
    state: CrawlState,
    timeout: float,
    user_agent: str,
    log_path: Path,
) -> None:
    """Pre-fill the crawl queue with URLs found in the site's XML sitemaps.

    Candidates are the sitemaps named in ``robots.txt`` plus ``/sitemap.xml``.
    Page URLs are recorded and queued unless already visited or queued.
    Document URLs are recorded only when ``state.include_documents`` is set
    and are never queued. The number of additions is added to
    ``state.discovered_from_sitemaps`` and logged.
    """
    allowed_hosts = build_allowed_hosts(state.start_url)
    sitemap_candidates = discover_robots_sitemaps(state.start_url, timeout, user_agent, log_path)
    sitemap_candidates.add(normalize_url(urljoin(state.start_url, "/sitemap.xml")))

    seen_sitemaps: set[str] = set()
    discovered_urls: set[str] = set()
    for candidate in sitemap_candidates:
        discovered_urls |= parse_sitemap_urls(
            candidate,
            allowed_hosts,
            state.include_subdomains,
            timeout,
            user_agent,
            log_path,
            seen_sitemaps,
        )

    added = 0
    for listed_url in discovered_urls:
        target = resolve_alias(listed_url, state.alias_to_canonical)
        if not is_document_url(target):
            register_record(state, target, "page")
            already_known = target in state.visited or target in state.queued
            if not already_known:
                state.queue.append(target)
                state.queued.add(target)
                added += 1
        elif state.include_documents:
            register_record(state, target, "document")
            added += 1

    state.discovered_from_sitemaps += added
    log_message(log_path, f"Added {added} URL(s) from XML sitemap discovery")
|
||||
|
||||
|
||||
def process_crawl_result(
    state: CrawlState,
    result: CrawlResult,
    allowed_hosts: set[str],
    log_path: Path,
) -> None:
    """Merge the outcome of one page fetch into the crawl state.

    Errors are appended to ``state.errors`` and logged. Skipped (non-HTML)
    responses are counted and recorded as documents. For a successful page,
    an allowed canonical URL is stored as the alias target of the fetched
    URL, recorded and queued. The page itself is then recorded, and each
    allowed link is recorded as a page (and queued) or as a document.
    """

    def enqueue(target: str) -> None:
        # Queue a URL once: skip anything already visited or waiting.
        if target in state.visited or target in state.queued:
            return
        state.queue.append(target)
        state.queued.add(target)

    if result.error:
        state.errors.append({"url": result.url, "error": result.error})
        log_message(log_path, f"Error fetching {result.url}: {result.error}")
        return

    if result.skipped:
        state.skipped_count += 1
        register_record(state, result.url, "document")
        return

    canonical_url = ""
    if result.canonical_url and should_visit(result.canonical_url, allowed_hosts, state.include_subdomains):
        canonical_url = resolve_alias(result.canonical_url, state.alias_to_canonical)
        state.alias_to_canonical[result.url] = canonical_url
        register_record(state, canonical_url, "page", title=result.title, canonical_url=canonical_url)
        enqueue(canonical_url)
    register_record(state, result.url, "page", title=result.title, canonical_url=canonical_url)

    for href in result.links:
        link = normalize_url(urljoin(result.url, href))
        if not should_visit(link, allowed_hosts, state.include_subdomains):
            continue

        link = resolve_alias(link, state.alias_to_canonical)
        if not is_document_url(link):
            register_record(state, link, "page")
            enqueue(link)
        elif state.include_documents:
            register_record(state, link, "document")
|
||||
|
||||
|
||||
def crawl_site(
    state: CrawlState,
    max_pages: int,
    delay: float,
    timeout: float,
    user_agent: str,
    state_path: Path,
    output_path: Path,
    log_path: Path,
    save_every: int,
    workers: int,
) -> tuple[CrawlState, bool]:
    """Run the crawl loop until the queue empties, the limit is hit or the user stops.

    With ``workers <= 1`` pages are fetched one at a time; otherwise up to
    ``workers`` fetches run concurrently in a thread pool while all state
    updates stay on the calling thread. Progress (CSV and state file) is
    saved after every ``save_every`` processed pages and once more at the end.

    Returns:
        The updated state and a flag telling whether the user requested the stop.
    """
    allowed_hosts = build_allowed_hosts(state.start_url)
    control = RuntimeControl()
    processed_since_save = 0
    user_stopped = False
    pending: dict[object, str] = {}

    def checkpoint_if_due() -> None:
        # Persist progress once enough pages were processed since the last save.
        nonlocal processed_since_save
        if processed_since_save < save_every:
            return
        write_csv(state.records, output_path)
        save_state(state, state_path, output_path)
        log_message(log_path, f"Saved progress after {len(state.visited)} visited page(s)")
        processed_since_save = 0

    def claim_next_url():
        # Take the next queue entry and mark it visited; None when it was
        # already visited (for example via an alias).
        candidate = resolve_alias(state.queue.popleft(), state.alias_to_canonical)
        if candidate in state.visited:
            return None
        state.visited.add(candidate)
        register_record(state, candidate, "page")
        print_progress(state, max_pages, candidate)
        return candidate

    def absorb(finished) -> None:
        # Fold completed fetches into the state and count them for checkpoints.
        nonlocal processed_since_save
        for done in finished:
            pending.pop(done, None)
            outcome = done.result()
            process_crawl_result(state, outcome, allowed_hosts, log_path)
            processed_since_save += 1

    if workers <= 1:
        while state.queue and len(state.visited) < max_pages:
            poll_runtime_control(control, log_path)
            # While paused, keep polling until the user resumes or stops.
            while control.paused and not control.stop_requested:
                time.sleep(0.2)
                poll_runtime_control(control, log_path)

            if control.stop_requested:
                user_stopped = True
                print("Stop requested. Saving progress and finishing cleanly...")
                break

            current = claim_next_url()
            if current is None:
                continue

            result = fetch_page_with_delay(current, timeout=timeout, user_agent=user_agent, delay=delay)
            process_crawl_result(state, result, allowed_hosts, log_path)

            processed_since_save += 1
            checkpoint_if_due()
    else:
        with ThreadPoolExecutor(max_workers=workers) as executor:
            while pending or (state.queue and len(state.visited) < max_pages):
                poll_runtime_control(control, log_path)

                if control.stop_requested:
                    user_stopped = True
                    print("Stop requested. No new pages will be queued. Waiting for active requests to finish...")
                    break

                if not control.paused:
                    # Top up the pool; nothing new is submitted while paused.
                    while state.queue and len(pending) < workers and len(state.visited) < max_pages:
                        current = claim_next_url()
                        if current is None:
                            continue
                        future = executor.submit(fetch_page_with_delay, current, timeout, user_agent, delay)
                        pending[future] = current

                    if not pending:
                        continue

                if pending:
                    completed, _ = wait(pending.keys(), timeout=0.2, return_when=FIRST_COMPLETED)
                    absorb(completed)
                else:
                    # Only reachable while paused with no request in flight.
                    time.sleep(0.2)

                checkpoint_if_due()

            if user_stopped and pending:
                # Let in-flight requests finish so their results are not lost.
                completed, _ = wait(pending.keys())
                for future in completed:
                    pending.pop(future, None)
                    process_crawl_result(state, future.result(), allowed_hosts, log_path)

    write_csv(state.records, output_path)
    save_state(state, state_path, output_path)
    log_message(log_path, f"Final save completed with {len(state.records)} URL(s) recorded")
    return state, user_stopped
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Define the crawler's command-line options and parse ``sys.argv``.

    The start URL is an optional positional argument. ``--workers`` defaults
    to ``0``; the help text describes the default used when prompting.
    """
    parser = argparse.ArgumentParser(
        description="Crawl a website and export discovered internal URLs to a CSV sitemap.",
    )
    add = parser.add_argument

    add("url", nargs="?", help="Starting URL to crawl, for example https://example.com")
    add("-o", "--output", help=f"Output CSV path. Defaults to {DEFAULT_OUTPUT_NAME} in the script folder.")
    add(
        "--max-pages",
        type=int,
        default=DEFAULT_MAX_PAGES,
        help=f"Maximum number of pages to crawl before stopping. Default: {DEFAULT_MAX_PAGES}",
    )
    add("--delay", type=float, default=0.0, help="Delay in seconds between requests. Default: 0")
    add("--timeout", type=float, default=15.0, help="Request timeout in seconds. Default: 15")
    add("--include-subdomains", action="store_true", help="Also crawl subdomains of the starting host.")
    add(
        "--include-documents",
        action="store_true",
        help="Include document links like PDF, CSV, DOC, and DOCX in the sitemap output.",
    )
    add(
        "--save-every",
        type=int,
        default=DEFAULT_SAVE_EVERY,
        help=f"Save progress after this many pages. Default: {DEFAULT_SAVE_EVERY}",
    )
    add(
        "--resume",
        action="store_true",
        help="Resume from the saved crawl state if a state file already exists.",
    )
    add("--fresh", action="store_true", help="Ignore any saved crawl state and start over.")
    add(
        "--workers",
        type=int,
        default=0,
        help=f"Number of worker threads. Use 1 to disable multithreading. Default when prompted on: {DEFAULT_WORKERS}",
    )

    namespace = parser.parse_args()
    return namespace
|
||||
|
||||
|
||||
def run_crawl(
    *,
    start_url: str,
    output_path: Path,
    max_pages: int = DEFAULT_MAX_PAGES,
    delay: float = 0.0,
    timeout: float = 15.0,
    include_subdomains: bool = False,
    include_documents: bool = False,
    save_every: int = DEFAULT_SAVE_EVERY,
    workers: int = DEFAULT_WORKERS,
    resume: bool = True,
    fresh: bool = False,
    user_agent: str = DEFAULT_USER_AGENT,
) -> CrawlRunResult:
    """Run one crawl without any interactive prompts.

    A saved state is loaded when the state file exists, ``resume`` is true
    and ``fresh`` is false; otherwise a new state is created. A new crawl is
    seeded from the site's XML sitemaps, while a resumed crawl gets its page
    limit raised to at least the visited count plus
    ``DEFAULT_RESUME_PAGE_INCREMENT``. Numeric settings are clamped to their
    minimums (one worker, one page, one second timeout, zero delay, save
    every page).

    Raises:
        ValueError: If the URL is empty or not http(s), or if the saved state
            was created for a different start URL or document setting.
    """
    if not start_url:
        raise ValueError("A starting URL is required.")

    # A bare host name is treated as an https URL.
    candidate_url = start_url if "://" in start_url else f"https://{start_url}"
    normalized_start = normalize_url(candidate_url)
    if not is_http_url(normalized_start):
        raise ValueError("Only http and https URLs are supported.")

    output_path = Path(output_path)
    state_path = get_state_path(output_path)
    log_path = get_log_path(output_path)

    state: CrawlState
    resuming = state_path.exists() and not fresh and resume
    if not resuming:
        state = initialize_state(normalized_start, include_subdomains, include_documents)
    else:
        state = load_state(state_path)
        if state.start_url != normalized_start:
            raise ValueError(
                "The saved crawl state belongs to a different starting URL. "
                "Use a different output name or start a fresh crawl."
            )
        if state.include_documents != include_documents:
            raise ValueError(
                "The saved crawl state uses a different document setting. "
                "Keep the same choice or start a fresh crawl."
            )

    effective_workers = max(int(workers), 1)
    effective_max_pages = max(int(max_pages), 1)
    if not state.visited:
        seed_from_xml_sitemaps(state, max(timeout, 1.0), user_agent, log_path)
    else:
        resume_floor = len(state.visited) + DEFAULT_RESUME_PAGE_INCREMENT
        effective_max_pages = max(effective_max_pages, resume_floor)

    log_message(log_path, f"Starting crawl for {state.start_url}")
    log_message(log_path, f"Output CSV: {output_path.resolve()}")
    log_message(log_path, f"State file: {state_path.resolve()}")
    log_message(log_path, f"Multithreading workers: {effective_workers}")
    log_message(log_path, f"Include documents: {state.include_documents}")

    state, user_stopped = crawl_site(
        state=state,
        max_pages=effective_max_pages,
        delay=max(delay, 0.0),
        timeout=max(timeout, 1.0),
        user_agent=user_agent,
        state_path=state_path,
        output_path=output_path,
        log_path=log_path,
        save_every=max(save_every, 1),
        workers=effective_workers,
    )

    if user_stopped:
        outcome = "Crawl stopped by user"
    elif state.queue and len(state.visited) >= effective_max_pages:
        outcome = "Crawl stopped at max page limit"
    elif state.queue:
        outcome = "Crawl stopped before queue emptied"
    else:
        outcome = "Crawl completed with empty queue"
    log_message(log_path, outcome)

    return CrawlRunResult(
        state=state,
        user_stopped=user_stopped,
        output_path=output_path,
        state_path=state_path,
        log_path=log_path,
        max_pages=effective_max_pages,
        workers=effective_workers,
    )
|
||||
|
||||
|
||||
def main() -> int:
    """Interactive command-line entry point for the crawler.

    Missing settings (start URL, output path, document and multithreading
    choices, resume decision) are prompted for, the crawl is delegated to
    ``run_crawl`` and a summary is printed afterwards.

    Returns:
        ``0`` on success, ``1`` when the input is invalid or ``run_crawl``
        rejects the request with a ``ValueError``.
    """
    args = parse_args()

    start_url = prompt_if_missing(args.url, "Enter the website URL to crawl: ")
    if not start_url:
        print("A starting URL is required.", file=sys.stderr)
        return 1

    if "://" not in start_url:
        start_url = f"https://{start_url}"

    normalized_start = normalize_url(start_url)
    if not is_http_url(normalized_start):
        print("Only http and https URLs are supported.", file=sys.stderr)
        return 1

    output_value = prompt_if_missing(args.output, f"Enter output CSV path [{DEFAULT_OUTPUT_NAME}]: ")
    output_path = Path(output_value) if output_value else SCRIPT_DIR / DEFAULT_OUTPUT_NAME
    state_path = get_state_path(output_path)
    log_path = get_log_path(output_path)
    include_documents = args.include_documents or prompt_yes_no(
        "Include document links such as PDF, CSV, DOC, and DOCX in the sitemap?",
        default=False,
    )
    workers = args.workers
    if workers <= 0:
        enable_multithreading = prompt_yes_no(
            f"Enable multithreading for faster scanning? {DEFAULT_WORKERS} worker threads will be used.",
            default=True,
        )
        workers = DEFAULT_WORKERS if enable_multithreading else 1

    print(f"Crawling {normalized_start}")
    print(f"Output file: {output_path.resolve()}")
    print(f"State file: {state_path.resolve()}")
    print(f"Log file: {log_path.resolve()}")
    resume_existing = False
    if state_path.exists() and not args.fresh:
        resume_existing = args.resume or prompt_yes_no(
            f"Found saved crawl state at {state_path.name}. Resume from where it left off?",
            default=True,
        )

    # The key hint must appear before the crawl starts: run_crawl only
    # returns once crawling is over, when the keys no longer do anything.
    if os.name == "nt":
        print("Press P to pause, R to resume, or Q to stop cleanly and save progress.")

    try:
        run_result = run_crawl(
            start_url=normalized_start,
            output_path=output_path,
            max_pages=args.max_pages,
            delay=args.delay,
            timeout=args.timeout,
            include_subdomains=args.include_subdomains,
            include_documents=include_documents,
            save_every=args.save_every,
            workers=workers,
            resume=resume_existing,
            fresh=args.fresh,
            user_agent=DEFAULT_USER_AGENT,
        )
    except ValueError as exc:
        print(str(exc), file=sys.stderr)
        return 1

    state = run_result.state
    user_stopped = run_result.user_stopped
    effective_max_pages = run_result.max_pages

    print(f"Max pages: {effective_max_pages}")
    print(f"Include documents: {'Yes' if state.include_documents else 'No'}")
    print(f"Multithreading: {'Yes' if run_result.workers > 1 else 'No'}")
    print(f"Worker threads: {run_result.workers}")
    if resume_existing:
        print("Resumed from the existing crawl state file.")
        log_message(log_path, "Resumed from existing crawl state")

    print(f"Found {len(state.records)} unique URL(s).")
    print(f"Visited pages: {len(state.visited)}")
    print(f"Queued pages remaining: {len(state.queue)}")
    print(f"URLs added from XML sitemaps: {state.discovered_from_sitemaps}")
    if state.errors:
        print(f"Pages with errors: {len(state.errors)}")
        # Only the first ten failures are listed to keep the summary short.
        for result in state.errors[:10]:
            print(f"  {result['url']} -> {result['error']}")
    if state.skipped_count:
        print(f"Non-HTML pages skipped while crawling: {state.skipped_count}")

    # run_crawl has already written the final status to the log file, so the
    # outcome is only printed here; logging it again would duplicate entries.
    if user_stopped:
        print("Stopped by user. Run it again to continue from the saved state.")
    elif state.queue and len(state.visited) >= effective_max_pages:
        print("Stopped because the max page limit was reached. Run it again to continue.")
    elif state.queue:
        print("Stopped before the queue was empty. Run it again to continue.")
    else:
        print("Crawl complete. No queued pages remain.")

    print("Done.")
    return 0
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script; the integer
# returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,210 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import csv
|
||||
import importlib.util
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
|
||||
|
||||
# Folder that contains this file; the tool locations below are relative to it.
ROOT_DIR = Path(__file__).resolve().parent
# Folder holding the page importer's Streamlit code (its app.py is loaded at runtime).
PAGE_IMPORTER_DIR = ROOT_DIR / "Page Importer"
# Source file of the sitemap crawler, loaded as a module on demand.
SITEMAP_BUILDER_PATH = ROOT_DIR / "Sitemap Builder" / "sitemap_builder.py"
# Data folder: taken from the APP_DATA_DIR environment variable when set,
# otherwise ".data" next to this file.
APP_DATA_DIR = Path(os.environ.get("APP_DATA_DIR", ROOT_DIR / ".data")).resolve()
# Where sitemap CSV files are written by the sitemap tab.
SITEMAP_OUTPUT_DIR = APP_DATA_DIR / "sitemaps"
|
||||
|
||||
|
||||
def load_module(module_name: str, file_path: Path):
    """Import the Python source file at ``file_path`` under ``module_name``.

    The module is registered in ``sys.modules`` before it is executed. If
    execution fails, the registration is rolled back (restoring any module
    previously stored under that name) so a half-initialised module is never
    left behind, and the original exception is re-raised.

    Args:
        module_name: Name to register the module under.
        file_path: Path of the source file to execute.

    Returns:
        The executed module object.

    Raises:
        RuntimeError: If no import spec or loader can be created for the file.
    """
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Unable to load module from {file_path}")
    module = importlib.util.module_from_spec(spec)
    previous = sys.modules.get(module_name)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Undo the registration so later lookups do not find a broken module.
        if previous is None:
            sys.modules.pop(module_name, None)
        else:
            sys.modules[module_name] = previous
        raise
    return module
|
||||
|
||||
|
||||
def get_page_importer_module():
    """Load the page importer's ``app.py`` as ``page_importer_streamlit``.

    The importer folder is put at the front of ``sys.path`` first, unless it
    is already listed there.
    """
    importer_dir = str(PAGE_IMPORTER_DIR)
    if importer_dir not in sys.path:
        sys.path.insert(0, importer_dir)
    app_file = PAGE_IMPORTER_DIR / "app.py"
    return load_module("page_importer_streamlit", app_file)
|
||||
|
||||
|
||||
def get_sitemap_module():
    """Load the sitemap builder source file as ``sitemap_builder_module``."""
    return load_module(module_name="sitemap_builder_module", file_path=SITEMAP_BUILDER_PATH)
|
||||
|
||||
|
||||
def sanitize_job_name(value: str) -> str:
    """Turn a user-supplied job name into a safe file-name stem.

    Each run of characters other than letters, digits, ``.``, ``_`` and ``-``
    becomes a single ``-``; leading and trailing dots and dashes are removed.
    An empty result falls back to ``"sitemap"``.
    """
    raw = value.strip() if value else ""
    slug = re.sub(r"[^A-Za-z0-9._-]+", "-", raw).strip(".-")
    if not slug:
        return "sitemap"
    return slug
|
||||
|
||||
|
||||
def read_csv_preview(csv_bytes: bytes, limit: int = 200) -> list[dict[str, str]]:
    """Parse CSV bytes and return at most ``limit`` rows as dictionaries.

    The bytes are decoded as UTF-8 (a leading byte-order mark is dropped and
    undecodable bytes are replaced); the first line supplies the column names.
    """
    decoded = csv_bytes.decode("utf-8-sig", errors="replace")
    preview: list[dict[str, str]] = []
    for row in csv.DictReader(io.StringIO(decoded)):
        if len(preview) >= limit:
            break
        preview.append(dict(row))
    return preview
|
||||
|
||||
|
||||
def render_sitemap_tab() -> None:
    """Render the sitemap generator tab.

    Shows the crawl settings form, runs the crawl when the form is submitted,
    stores a summary of the run in ``st.session_state["sitemap_result"]`` and
    renders the stored result: metrics, download buttons, a CSV preview, the
    captured crawler output and the tail of the log file.
    """
    st.title("Sitemap Generator")
    st.caption("Crawl a site, export a sitemap CSV, and keep resume data inside the container data volume.")

    SITEMAP_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Crawl settings; the values are read once the submit button is pressed.
    with st.form("sitemap-form"):
        start_url = st.text_input("Starting URL", placeholder="https://example.com")
        job_name = st.text_input(
            "Output name",
            value="sitemap",
            help="Used for the CSV, crawl state, and log file names.",
        )

        col1, col2, col3 = st.columns(3)
        with col1:
            max_pages = st.number_input("Max pages", min_value=1, value=10000, step=100)
            workers = st.number_input("Worker threads", min_value=1, value=8, step=1)
        with col2:
            delay = st.number_input("Delay between requests (seconds)", min_value=0.0, value=0.0, step=0.25)
            timeout = st.number_input("Request timeout (seconds)", min_value=1.0, value=15.0, step=1.0)
        with col3:
            save_every = st.number_input("Save progress every N pages", min_value=1, value=25, step=1)
            include_subdomains = st.checkbox("Include subdomains", value=False)
            include_documents = st.checkbox("Include document links", value=False)

        resume_existing = st.checkbox("Resume from saved crawl state if present", value=True)
        start_fresh = st.checkbox("Ignore any saved crawl state and start fresh", value=False)
        submitted = st.form_submit_button("Run Sitemap Crawl", type="primary")

    if submitted:
        if not start_url.strip():
            st.error("Starting URL is required.")
        else:
            sitemap_builder = get_sitemap_module()
            # The sanitized job name keeps the output file inside SITEMAP_OUTPUT_DIR.
            safe_name = sanitize_job_name(job_name)
            output_path = SITEMAP_OUTPUT_DIR / f"{safe_name}.csv"
            captured_stdout = io.StringIO()

            try:
                with st.spinner("Running sitemap crawl..."):
                    # Whatever the crawler prints is collected for display below.
                    # NOTE(review): redirect_stdout swaps sys.stdout for the whole
                    # process - confirm this is acceptable with concurrent sessions.
                    with contextlib.redirect_stdout(captured_stdout):
                        result = sitemap_builder.run_crawl(
                            start_url=start_url,
                            output_path=output_path,
                            max_pages=int(max_pages),
                            delay=float(delay),
                            timeout=float(timeout),
                            include_subdomains=include_subdomains,
                            include_documents=include_documents,
                            save_every=int(save_every),
                            workers=int(workers),
                            resume=resume_existing,
                            fresh=start_fresh,
                        )
            except Exception as exc:
                # Any failure is shown in the page instead of being raised.
                st.error(str(exc))
            else:
                # Only plain values and path strings are stored in the session.
                st.session_state["sitemap_result"] = {
                    "summary": {
                        "records": len(result.state.records),
                        "visited": len(result.state.visited),
                        "queued": len(result.state.queue),
                        "errors": len(result.state.errors),
                        "skipped": result.state.skipped_count,
                        "from_sitemaps": result.state.discovered_from_sitemaps,
                        "user_stopped": result.user_stopped,
                        "max_pages": result.max_pages,
                        "workers": result.workers,
                    },
                    "output_path": str(result.output_path),
                    "state_path": str(result.state_path),
                    "log_path": str(result.log_path),
                    "stdout": captured_stdout.getvalue(),
                }

    # Everything below renders the most recent stored result, if any.
    result_data = st.session_state.get("sitemap_result")
    if not result_data:
        st.info("Run a crawl to generate a sitemap CSV.")
        return

    summary = result_data["summary"]
    csv_path = Path(result_data["output_path"])
    state_path = Path(result_data["state_path"])
    log_path = Path(result_data["log_path"])

    st.subheader("Crawl Summary")
    metric_cols = st.columns(6)
    metric_cols[0].metric("URLs Found", summary["records"])
    metric_cols[1].metric("Visited", summary["visited"])
    metric_cols[2].metric("Queued", summary["queued"])
    metric_cols[3].metric("XML Seeded", summary["from_sitemaps"])
    metric_cols[4].metric("Errors", summary["errors"])
    metric_cols[5].metric("Skipped", summary["skipped"])

    status_text = "Stopped by user." if summary["user_stopped"] else "Run completed."
    st.caption(f"{status_text} Max pages used: {summary['max_pages']} | Worker threads: {summary['workers']}")

    # Files are re-read from disk on each render and skipped when missing.
    if csv_path.exists():
        csv_bytes = csv_path.read_bytes()
        st.download_button(
            "Download Sitemap CSV",
            data=csv_bytes,
            file_name=csv_path.name,
            mime="text/csv",
        )
        preview_rows = read_csv_preview(csv_bytes)
        if preview_rows:
            # NOTE(review): width="stretch" presumably needs a Streamlit release
            # that accepts string widths - confirm against the pinned version range.
            st.dataframe(preview_rows, width="stretch", hide_index=True)

    file_cols = st.columns(2)
    with file_cols[0]:
        if state_path.exists():
            st.download_button(
                "Download Crawl State",
                data=state_path.read_bytes(),
                file_name=state_path.name,
                mime="application/json",
            )
    with file_cols[1]:
        if log_path.exists():
            st.download_button(
                "Download Crawl Log",
                data=log_path.read_bytes(),
                file_name=log_path.name,
                mime="text/plain",
            )

    crawl_output = (result_data.get("stdout") or "").strip()
    if crawl_output:
        st.text_area("Crawler Output", value=crawl_output, height=220, disabled=True)

    if log_path.exists():
        log_text = log_path.read_text(encoding="utf-8", errors="replace")
        # Only the last 50 log lines are shown.
        st.text_area("Log Tail", value="\n".join(log_text.splitlines()[-50:]), height=220, disabled=True)
|
||||
|
||||
|
||||
def main() -> None:
    """Build the two-tab Streamlit page: sitemap generator and page importer.

    The page importer module is loaded inside its tab and its ``render_app``
    function draws that tab's content.
    """
    st.set_page_config(page_title="WDW Tools", layout="wide")
    st.header("WDW Sitemap And Import Tools")

    tabs = st.tabs(["Sitemap Generator", "Page Importer"])

    with tabs[0]:
        render_sitemap_tab()

    with tabs[1]:
        get_page_importer_module().render_app()
|
||||
|
||||
|
||||
# Build the page only when this file is run as the main script.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,4 @@
|
||||
streamlit>=1.43,<2
|
||||
requests>=2.32,<3
|
||||
beautifulsoup4>=4.12,<5
|
||||
python-dateutil>=2.9,<3
|
||||
Reference in New Issue
Block a user