@@ -0,0 +1,210 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import csv
|
||||
import importlib.util
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
|
||||
|
||||
# Filesystem layout; every path is anchored at the directory holding this file.
ROOT_DIR = Path(__file__).resolve().parent
# Locations of the two bundled tools (the folder names contain spaces).
PAGE_IMPORTER_DIR = ROOT_DIR.joinpath("Page Importer")
SITEMAP_BUILDER_PATH = ROOT_DIR.joinpath("Sitemap Builder", "sitemap_builder.py")
# Data directory: the APP_DATA_DIR environment variable when it is set,
# otherwise a ".data" folder next to this file. Always stored resolved.
APP_DATA_DIR = Path(os.getenv("APP_DATA_DIR", ROOT_DIR / ".data")).resolve()
# Crawl output CSVs are written below the data directory.
SITEMAP_OUTPUT_DIR = APP_DATA_DIR.joinpath("sitemaps")
|
||||
|
||||
|
||||
def load_module(module_name: str, file_path: Path):
    """Import the Python source file at ``file_path`` as ``module_name``.

    The module is registered in ``sys.modules`` under ``module_name`` before
    its code runs, and the executed module object is returned. Every call
    executes the file again, replacing any earlier entry with the same name.

    Args:
        module_name: Name to register the module under in ``sys.modules``.
        file_path: Path of the source file to execute.

    Returns:
        The freshly executed module object.

    Raises:
        RuntimeError: If no import spec/loader can be built for ``file_path``.
        Exception: Anything raised while executing the module's code is
            propagated unchanged.
    """
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Unable to load module from {file_path}")

    module = importlib.util.module_from_spec(spec)
    previous = sys.modules.get(module_name)
    # Register before executing so the module can be found by name while
    # its own top-level code is running.
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Never leave a half-initialised module behind: put back whatever
        # was registered before, or drop the entry entirely.
        if previous is not None:
            sys.modules[module_name] = previous
        else:
            sys.modules.pop(module_name, None)
        raise
    return module
|
||||
|
||||
|
||||
def get_page_importer_module():
    """Load and return the Page Importer Streamlit app module.

    The Page Importer directory is put at the front of ``sys.path`` when it
    is not already listed (presumably so ``app.py`` can import its sibling
    modules -- verify against that file), then ``app.py`` is executed via
    ``load_module`` under the name ``page_importer_streamlit``.
    """
    importer_dir = str(PAGE_IMPORTER_DIR)
    if importer_dir not in sys.path:
        sys.path.insert(0, importer_dir)

    app_file = PAGE_IMPORTER_DIR / "app.py"
    return load_module("page_importer_streamlit", app_file)
|
||||
|
||||
|
||||
def get_sitemap_module():
    """Load and return the sitemap builder module.

    Executes the file at ``SITEMAP_BUILDER_PATH`` through ``load_module``
    under the name ``sitemap_builder_module``.
    """
    builder_module = load_module("sitemap_builder_module", SITEMAP_BUILDER_PATH)
    return builder_module
|
||||
|
||||
|
||||
def sanitize_job_name(value: str) -> str:
    """Turn a user-supplied job name into a safe file-name stem.

    Surrounding whitespace is trimmed, every run of characters other than
    ASCII letters, digits, ``.``, ``_`` and ``-`` collapses into a single
    ``-``, and leading/trailing ``.`` and ``-`` characters are removed.
    A falsy input, or one that sanitises down to nothing, yields
    ``"sitemap"``.
    """
    trimmed = value.strip() if value else ""
    slug = re.sub(r"[^A-Za-z0-9._-]+", "-", trimmed).strip(".-")
    if not slug:
        return "sitemap"
    return slug
|
||||
|
||||
|
||||
def read_csv_preview(csv_bytes: bytes, limit: int = 200) -> list[dict[str, str]]:
    """Parse the first ``limit`` data rows of a CSV payload for display.

    The bytes are decoded as UTF-8 (a leading BOM is dropped and undecodable
    bytes are replaced rather than raising). The first line supplies the
    column names; each following row becomes a dict keyed by those names.

    Args:
        csv_bytes: Raw CSV file content.
        limit: Maximum number of data rows to return. Zero or a negative
            value yields an empty list.

    Returns:
        Up to ``limit`` row dicts, in file order.
    """
    rows: list[dict[str, str]] = []
    if limit <= 0:
        return rows

    text = csv_bytes.decode("utf-8-sig", errors="replace")
    # newline="" hands line endings to the csv module untranslated, as the
    # csv documentation requires; without it a file using bare "\r" line
    # endings arrives as one giant line and the parser raises csv.Error.
    reader = csv.DictReader(io.StringIO(text, newline=""))
    for row in reader:
        rows.append(dict(row))
        # Stop as soon as the preview is full so no extra row gets parsed.
        if len(rows) >= limit:
            break
    return rows
|
||||
|
||||
|
||||
def render_sitemap_tab() -> None:
    """Render the "Sitemap Generator" tab: crawl form, crawl run, results.

    Flow:
      1. Ensure ``SITEMAP_OUTPUT_DIR`` exists and draw the crawl form.
      2. On submit, load the sitemap builder module and call its
         ``run_crawl`` with the form values, capturing anything written to
         stdout while it runs. A failure is shown with ``st.error``.
      3. On success, store a plain-data summary under
         ``st.session_state["sitemap_result"]``.
      4. Whenever such a stored result exists, show summary metrics,
         download buttons for the CSV / state / log files that exist on
         disk, a CSV preview, the captured stdout and the last 50 log lines.

    Only the CSV path is built here (from the sanitised output name); the
    state and log paths are taken from the object ``run_crawl`` returns.
    """
    st.title("Sitemap Generator")
    st.caption("Crawl a site, export a sitemap CSV, and keep resume data inside the container data volume.")

    SITEMAP_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    with st.form("sitemap-form"):
        start_url = st.text_input("Starting URL", placeholder="https://example.com")
        job_name = st.text_input(
            "Output name",
            value="sitemap",
            help="Used for the CSV, crawl state, and log file names.",
        )

        col1, col2, col3 = st.columns(3)
        with col1:
            max_pages = st.number_input("Max pages", min_value=1, value=10000, step=100)
            workers = st.number_input("Worker threads", min_value=1, value=8, step=1)
        with col2:
            delay = st.number_input("Delay between requests (seconds)", min_value=0.0, value=0.0, step=0.25)
            timeout = st.number_input("Request timeout (seconds)", min_value=1.0, value=15.0, step=1.0)
        with col3:
            save_every = st.number_input("Save progress every N pages", min_value=1, value=25, step=1)
            include_subdomains = st.checkbox("Include subdomains", value=False)
            include_documents = st.checkbox("Include document links", value=False)

        resume_existing = st.checkbox("Resume from saved crawl state if present", value=True)
        start_fresh = st.checkbox("Ignore any saved crawl state and start fresh", value=False)
        submitted = st.form_submit_button("Run Sitemap Crawl", type="primary")

    if submitted:
        # Validate AND crawl the same, whitespace-trimmed URL; a value pasted
        # with stray spaces or a trailing newline must not reach the crawler.
        cleaned_url = start_url.strip()
        if not cleaned_url:
            st.error("Starting URL is required.")
        else:
            sitemap_builder = get_sitemap_module()
            safe_name = sanitize_job_name(job_name)
            output_path = SITEMAP_OUTPUT_DIR / f"{safe_name}.csv"
            captured_stdout = io.StringIO()

            try:
                # NOTE(review): redirect_stdout replaces sys.stdout for the
                # whole process while the crawl runs, so output printed by
                # other threads/sessions in that window is captured as well.
                with st.spinner("Running sitemap crawl..."), contextlib.redirect_stdout(captured_stdout):
                    result = sitemap_builder.run_crawl(
                        start_url=cleaned_url,
                        output_path=output_path,
                        max_pages=int(max_pages),
                        delay=float(delay),
                        timeout=float(timeout),
                        include_subdomains=include_subdomains,
                        include_documents=include_documents,
                        save_every=int(save_every),
                        workers=int(workers),
                        resume=resume_existing,
                        fresh=start_fresh,
                    )
            except Exception as exc:
                # UI boundary: report the failure instead of crashing the page.
                st.error(str(exc))
            else:
                state = result.state
                # Keep only plain values (counts and path strings) in the
                # session so the results section below can be rebuilt from
                # session state alone.
                st.session_state["sitemap_result"] = {
                    "summary": {
                        "records": len(state.records),
                        "visited": len(state.visited),
                        "queued": len(state.queue),
                        "errors": len(state.errors),
                        "skipped": state.skipped_count,
                        "from_sitemaps": state.discovered_from_sitemaps,
                        "user_stopped": result.user_stopped,
                        "max_pages": result.max_pages,
                        "workers": result.workers,
                    },
                    "output_path": str(result.output_path),
                    "state_path": str(result.state_path),
                    "log_path": str(result.log_path),
                    "stdout": captured_stdout.getvalue(),
                }

    result_data = st.session_state.get("sitemap_result")
    if not result_data:
        st.info("Run a crawl to generate a sitemap CSV.")
        return

    summary = result_data["summary"]
    csv_path = Path(result_data["output_path"])
    state_path = Path(result_data["state_path"])
    log_path = Path(result_data["log_path"])

    st.subheader("Crawl Summary")
    metric_cols = st.columns(6)
    metric_cols[0].metric("URLs Found", summary["records"])
    metric_cols[1].metric("Visited", summary["visited"])
    metric_cols[2].metric("Queued", summary["queued"])
    metric_cols[3].metric("XML Seeded", summary["from_sitemaps"])
    metric_cols[4].metric("Errors", summary["errors"])
    metric_cols[5].metric("Skipped", summary["skipped"])

    status_text = "Stopped by user." if summary["user_stopped"] else "Run completed."
    st.caption(f"{status_text} Max pages used: {summary['max_pages']} | Worker threads: {summary['workers']}")

    # Each file is offered only if it is still present on disk.
    if csv_path.exists():
        csv_bytes = csv_path.read_bytes()
        st.download_button(
            "Download Sitemap CSV",
            data=csv_bytes,
            file_name=csv_path.name,
            mime="text/csv",
        )
        preview_rows = read_csv_preview(csv_bytes)
        if preview_rows:
            st.dataframe(preview_rows, width="stretch", hide_index=True)

    file_cols = st.columns(2)
    with file_cols[0]:
        if state_path.exists():
            st.download_button(
                "Download Crawl State",
                data=state_path.read_bytes(),
                file_name=state_path.name,
                mime="application/json",
            )
    with file_cols[1]:
        if log_path.exists():
            st.download_button(
                "Download Crawl Log",
                data=log_path.read_bytes(),
                file_name=log_path.name,
                mime="text/plain",
            )

    crawl_output = (result_data.get("stdout") or "").strip()
    if crawl_output:
        st.text_area("Crawler Output", value=crawl_output, height=220, disabled=True)

    if log_path.exists():
        log_text = log_path.read_text(encoding="utf-8", errors="replace")
        # Show only the last 50 lines of the log.
        st.text_area("Log Tail", value="\n".join(log_text.splitlines()[-50:]), height=220, disabled=True)
|
||||
|
||||
|
||||
def main() -> None:
    """Configure the Streamlit page and render both tool tabs.

    The first tab hosts the sitemap generator defined in this file; the
    second loads the Page Importer app module and calls its ``render_app``.
    """
    st.set_page_config(page_title="WDW Tools", layout="wide")
    st.header("WDW Sitemap And Import Tools")

    tabs = st.tabs(["Sitemap Generator", "Page Importer"])

    with tabs[0]:
        render_sitemap_tab()

    with tabs[1]:
        # The importer module is loaded only once its tab is being built.
        get_page_importer_module().render_app()


# Run the app only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user