@@ -0,0 +1,947 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import xml.etree.ElementTree as ET
|
||||
from collections import deque
|
||||
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
|
||||
from dataclasses import dataclass
|
||||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
from urllib.error import HTTPError, URLError
|
||||
from urllib.parse import urljoin, urlsplit, urlunsplit
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
if os.name == "nt":
|
||||
import msvcrt
|
||||
|
||||
|
||||
# User-Agent header sent with every request the crawler makes.
DEFAULT_USER_AGENT = "SitemapBuilder/1.0 (+local script)"

# File naming: the CSV name used when none is given, and the suffixes appended
# to the CSV path to derive the resume-state file and the log file.
DEFAULT_OUTPUT_NAME = "sitemap.csv"
DEFAULT_STATE_SUFFIX = ".crawlstate.json"
DEFAULT_LOG_SUFFIX = ".crawl.log"

# Crawl sizing defaults.
DEFAULT_MAX_PAGES = 10000
DEFAULT_RESUME_PAGE_INCREMENT = 10000
DEFAULT_SAVE_EVERY = 25
DEFAULT_WORKERS = 8

# Folder containing this script; the default output location.
SCRIPT_DIR = Path(__file__).resolve().parent

# URL path suffixes that are treated as documents rather than crawlable pages.
DOCUMENT_EXTENSIONS = {
    # office / text documents
    ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".txt", ".rtf",
    # data and archive formats
    ".csv", ".zip", ".xml", ".json",
}
|
||||
|
||||
|
||||
@dataclass
class CrawlResult:
    """Outcome of fetching a single URL."""

    # URL that was requested.
    url: str
    # Link targets collected from the page's <a href> attributes.
    links: list[str]
    # Text of the page's <title>, or "" when none was found.
    title: str = ""
    # Normalized target of <link rel="canonical">, or "" when absent.
    canonical_url: str = ""
    # True when the response was not HTML/XHTML and its body was not parsed.
    skipped: bool = False
    # Short error description when the fetch failed, otherwise None.
    error: str | None = None
|
||||
|
||||
|
||||
@dataclass
class CrawlState:
    """Complete, resumable state of a crawl (persisted as JSON between runs)."""

    # Normalized URL the crawl started from.
    start_url: str
    # Whether hosts ending in ".<start host>" are also crawled.
    include_subdomains: bool
    # Whether document-type URLs are recorded in the output.
    include_documents: bool
    # URLs that have already been taken off the queue for fetching.
    visited: set[str]
    # Every URL that has been appended to `queue` so far (used for de-duplication).
    queued: set[str]
    # FIFO of URLs still waiting to be fetched.
    queue: deque[str]
    # Output rows keyed by URL; each value holds "title", "canonical_url" and "type".
    records: dict[str, dict[str, str]]
    # Maps a page URL to the canonical URL that page declared.
    alias_to_canonical: dict[str, str]
    # One {"url": ..., "error": ...} entry per failed fetch.
    errors: list[dict[str, str]]
    # Number of fetched responses that turned out not to be HTML.
    skipped_count: int
    # Number of URLs added through XML sitemap discovery.
    discovered_from_sitemaps: int
|
||||
|
||||
|
||||
@dataclass
class RuntimeControl:
    """Flags toggled by keyboard input while a crawl is running."""

    # True while the user has paused the crawl.
    paused: bool = False
    # True once the user has asked the crawl to stop.
    stop_requested: bool = False
|
||||
|
||||
|
||||
@dataclass
class CrawlRunResult:
    """Summary returned by ``run_crawl`` once a crawl run has ended."""

    # Final crawl state after the run.
    state: CrawlState
    # True when the run ended because the user requested a stop.
    user_stopped: bool
    # CSV file the records were written to.
    output_path: Path
    # JSON file holding the resumable crawl state.
    state_path: Path
    # Log file for this crawl.
    log_path: Path
    # Page limit that was actually applied (may exceed the requested one on resume).
    max_pages: int
    # Number of worker threads that was actually used (at least 1).
    workers: int
|
||||
|
||||
|
||||
class HTMLPageParser(HTMLParser):
    """Collect link targets, the page title and the canonical link from HTML.

    After ``feed()``:

    * ``links`` holds the non-empty ``href`` of every ``<a>`` tag, in order.
    * ``title`` is the text of the first ``<title>`` element.
    * ``canonical_href`` is the ``href`` of the last ``<link>`` whose ``rel``
      contains the ``canonical`` token, or ``""``.
    """

    def __init__(self) -> None:
        super().__init__()
        self.links: list[str] = []
        self.title_parts: list[str] = []
        self.in_title = False
        self.canonical_href = ""
        # Becomes True when the first <title> closes. Later <title> elements
        # (for example inside inline SVG) must not be appended to the page title.
        self._title_closed = False

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Record anchors, the start of the title and canonical links."""
        attrs_map = {key.lower(): value for key, value in attrs}
        lower_tag = tag.lower()

        if lower_tag == "a":
            href = attrs_map.get("href")
            if href:
                self.links.append(href)
        elif lower_tag == "title":
            if not self._title_closed:
                self.in_title = True
        elif lower_tag == "link":
            # `rel` is a whitespace-separated token list, so match whole tokens.
            rel_tokens = (attrs_map.get("rel") or "").lower().split()
            href = attrs_map.get("href") or ""
            if "canonical" in rel_tokens and href:
                self.canonical_href = href

    def handle_endtag(self, tag: str) -> None:
        """Stop collecting title text when the title element closes."""
        if tag.lower() == "title":
            if self.in_title:
                self._title_closed = True
            self.in_title = False

    def handle_data(self, data: str) -> None:
        """Accumulate text found inside the title element."""
        if self.in_title:
            self.title_parts.append(data)

    @property
    def title(self) -> str:
        """Title text with each chunk stripped and chunks joined by one space."""
        return " ".join(part.strip() for part in self.title_parts if part.strip()).strip()
|
||||
|
||||
|
||||
def normalize_url(url: str) -> str:
    """Return a canonical form of *url* suitable for de-duplication.

    Surrounding whitespace is removed, scheme and host are lower-cased
    (scheme defaults to ``https``), the fragment is dropped, an empty path
    becomes ``/`` and trailing slashes are removed from any other path.
    The query string is kept unchanged.

    Raises:
        ValueError: propagated from ``urlsplit`` for malformed URLs (for
            example unmatched square brackets in the host).
    """
    parts = urlsplit(url.strip())
    scheme = parts.scheme.lower() or "https"
    netloc = parts.netloc.lower()
    path = parts.path or "/"

    if path != "/" and path.endswith("/"):
        # A path made only of slashes ("//") would strip down to "", which
        # would not match the "/" form of the same root URL; fall back to "/".
        path = path.rstrip("/") or "/"

    return urlunsplit((scheme, netloc, path, parts.query, ""))
|
||||
|
||||
|
||||
def is_http_url(url: str) -> bool:
    """Tell whether *url* uses the ``http`` or ``https`` scheme."""
    scheme = urlsplit(url).scheme
    return scheme == "http" or scheme == "https"
|
||||
|
||||
|
||||
def build_allowed_hosts(start_url: str) -> set[str]:
    """Return the set holding the lower-cased network location of *start_url*."""
    start_host = urlsplit(start_url).netloc.lower()
    return {start_host}
|
||||
|
||||
|
||||
def should_visit(url: str, allowed_hosts: set[str], include_subdomains: bool) -> bool:
    """Decide whether *url* is in scope for the crawl.

    Only http(s) URLs qualify. The host must be one of *allowed_hosts*; with
    *include_subdomains* a host ending in ``.<allowed host>`` qualifies too.
    """
    if not is_http_url(url):
        return False

    hostname = urlsplit(url).netloc.lower()
    if not include_subdomains:
        return hostname in allowed_hosts

    for allowed in allowed_hosts:
        if hostname == allowed or hostname.endswith(f".{allowed}"):
            return True
    return False
|
||||
|
||||
|
||||
def is_document_url(url: str) -> bool:
    """Tell whether the path of *url* ends in one of ``DOCUMENT_EXTENSIONS``."""
    url_path = urlsplit(url).path
    extension = Path(url_path).suffix.lower()
    return extension in DOCUMENT_EXTENSIONS
|
||||
|
||||
|
||||
def should_record_url(url: str) -> bool:
    """Return False only for URLs whose entire query string is ``page=1``.

    The comparison ignores case; every other URL is recorded.
    """
    if urlsplit(url).query.lower() == "page=1":
        return False
    return True
|
||||
|
||||
|
||||
def get_state_path(output_path: Path) -> Path:
    """Derive the crawl-state file path by appending the state suffix to the CSV path."""
    combined_suffix = f"{output_path.suffix}{DEFAULT_STATE_SUFFIX}"
    return output_path.with_suffix(combined_suffix)
|
||||
|
||||
|
||||
def get_log_path(output_path: Path) -> Path:
    """Derive the log file path by appending the log suffix to the CSV path."""
    combined_suffix = f"{output_path.suffix}{DEFAULT_LOG_SUFFIX}"
    return output_path.with_suffix(combined_suffix)
|
||||
|
||||
|
||||
def log_message(log_path: Path, message: str) -> None:
    """Append *message* to the log file, prefixed with a local timestamp.

    The log's parent directory is created when it does not exist yet.
    """
    log_path.parent.mkdir(parents=True, exist_ok=True)
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{stamp}] {message}\n"
    with log_path.open("a", encoding="utf-8") as handle:
        handle.write(entry)
|
||||
|
||||
|
||||
def resolve_alias(url: str, alias_to_canonical: dict[str, str]) -> str:
    """Follow the alias chain starting at *url* and return where it ends.

    The walk stops at the first URL that has no alias, or as soon as a URL
    comes up a second time, so cyclic alias chains terminate.
    """
    current = url
    walked: set[str] = set()
    while True:
        if current not in alias_to_canonical or current in walked:
            return current
        walked.add(current)
        current = alias_to_canonical[current]
|
||||
|
||||
|
||||
def register_record(
    state: CrawlState,
    url: str,
    record_type: str,
    title: str = "",
    canonical_url: str = "",
) -> None:
    """Create or enrich the output record for *url* in ``state.records``.

    URLs rejected by ``should_record_url`` are ignored. For an existing record
    the type is only ever upgraded from "document" to "page", and a title or
    canonical URL is only filled in while the stored value is still empty.
    """
    if not should_record_url(url):
        return

    record = state.records.get(url)
    if record is None:
        record = {"title": "", "canonical_url": "", "type": record_type}

    current_type = record.get("type")
    if not current_type:
        record["type"] = record_type
    elif current_type == "document" and record_type == "page":
        record["type"] = "page"

    # Fill blanks only; never overwrite a value captured earlier.
    if title and not record.get("title"):
        record["title"] = title
    if canonical_url and not record.get("canonical_url"):
        record["canonical_url"] = canonical_url

    # Records without these keys still end up with both of them.
    if "canonical_url" not in record:
        record["canonical_url"] = canonical_url
    if "title" not in record:
        record["title"] = title

    state.records[url] = record
|
||||
|
||||
|
||||
def save_state(state: CrawlState, state_path: Path, output_path: Path) -> None:
    """Persist *state* as JSON at *state_path* so the crawl can be resumed.

    The payload is first written to a sibling ``<name>.tmp`` file and then
    moved over *state_path* with ``os.replace``. An interruption during the
    write therefore cannot leave a truncated state file behind; the previous
    state stays in place until the new one is complete.

    Args:
        state: Crawl state to serialize.
        state_path: Destination JSON file; parent directories are created.
        output_path: CSV path belonging to this crawl, stored for reference.
    """
    state_path.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "start_url": state.start_url,
        "include_subdomains": state.include_subdomains,
        "include_documents": state.include_documents,
        # Sets are not JSON serializable; sorting also keeps the file stable.
        "visited": sorted(state.visited),
        "queued": sorted(state.queued),
        "queue": list(state.queue),
        "records": state.records,
        "alias_to_canonical": state.alias_to_canonical,
        "errors": state.errors,
        "skipped_count": state.skipped_count,
        "discovered_from_sitemaps": state.discovered_from_sitemaps,
        "saved_at": time.strftime("%Y-%m-%d %H:%M:%S"),
        "output_path": str(output_path),
    }

    temp_path = state_path.with_name(state_path.name + ".tmp")
    temp_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    os.replace(temp_path, state_path)
|
||||
|
||||
|
||||
def load_state(state_path: Path) -> CrawlState:
    """Rebuild a ``CrawlState`` from the JSON file at *state_path*.

    Only ``start_url`` is mandatory; every other field falls back to an
    empty or zero value when it is missing from the file.
    """
    raw_text = state_path.read_text(encoding="utf-8")
    payload = json.loads(raw_text)

    start_url = payload["start_url"]
    include_subdomains = bool(payload.get("include_subdomains", False))
    include_documents = bool(payload.get("include_documents", False))
    visited = set(payload.get("visited", []))
    queued = set(payload.get("queued", []))
    queue = deque(payload.get("queue", []))
    records = dict(payload.get("records", {}))
    alias_to_canonical = dict(payload.get("alias_to_canonical", {}))
    errors = list(payload.get("errors", []))
    skipped_count = int(payload.get("skipped_count", 0))
    discovered_from_sitemaps = int(payload.get("discovered_from_sitemaps", 0))

    return CrawlState(
        start_url=start_url,
        include_subdomains=include_subdomains,
        include_documents=include_documents,
        visited=visited,
        queued=queued,
        queue=queue,
        records=records,
        alias_to_canonical=alias_to_canonical,
        errors=errors,
        skipped_count=skipped_count,
        discovered_from_sitemaps=discovered_from_sitemaps,
    )
|
||||
|
||||
|
||||
def initialize_state(start_url: str, include_subdomains: bool, include_documents: bool) -> CrawlState:
    """Create the state for a brand-new crawl with only the start URL queued."""
    seed = normalize_url(start_url)
    fresh_fields = {
        "start_url": seed,
        "include_subdomains": include_subdomains,
        "include_documents": include_documents,
        "visited": set(),
        "queued": {seed},
        "queue": deque([seed]),
        "records": {},
        "alias_to_canonical": {},
        "errors": [],
        "skipped_count": 0,
        "discovered_from_sitemaps": 0,
    }
    return CrawlState(**fresh_fields)
|
||||
|
||||
|
||||
def prompt_if_missing(value: str | None, prompt_text: str) -> str:
    """Return *value* when it is non-empty; otherwise ask the user for it.

    A value typed at the prompt is returned with surrounding whitespace removed.
    """
    if not value:
        return input(prompt_text).strip()
    return value
|
||||
|
||||
|
||||
def prompt_yes_no(prompt_text: str, default: bool) -> bool:
    """Ask a yes/no question on the console.

    An empty reply yields *default*; otherwise only "y" or "yes" (in any
    letter case) count as yes.
    """
    if default:
        hint = "Y/n"
    else:
        hint = "y/N"

    reply = input(f"{prompt_text} [{hint}]: ").strip().lower()
    if reply == "":
        return default
    return reply == "y" or reply == "yes"
|
||||
|
||||
|
||||
def write_csv(records: dict[str, dict[str, str]], output_path: Path) -> None:
    """Write *records* to *output_path* as CSV, one row per URL in sorted order.

    The columns are URL, Title, Canonical URL and Type; fields missing from a
    record are written as empty strings. Parent directories are created.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    field_keys = ("title", "canonical_url", "type")
    with output_path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(["URL", "Title", "Canonical URL", "Type"])
        out.writerows(
            [url, *(records[url].get(key, "") for key in field_keys)]
            for url in sorted(records)
        )
|
||||
|
||||
|
||||
def fetch_text(url: str, timeout: float, user_agent: str, accept: str) -> tuple[str | None, str | None]:
    """Download *url* and return ``(text, None)``, or ``(None, error)`` on failure.

    The body is decoded with the charset announced by the response (UTF-8
    when none is given); undecodable bytes are replaced.
    """
    headers = {"User-Agent": user_agent, "Accept": accept}
    request = Request(url, headers=headers)
    try:
        with urlopen(request, timeout=timeout) as response:
            raw_body = response.read()
            charset = response.headers.get_content_charset() or "utf-8"
            # Decoding stays inside the try block: an unknown charset name
            # must be reported as an error string like any other failure.
            text = raw_body.decode(charset, errors="replace")
            return text, None
    except HTTPError as exc:
        failure = f"HTTP {exc.code}"
    except URLError as exc:
        failure = str(exc.reason)
    except TimeoutError:
        failure = "request timed out"
    except Exception as exc:  # pragma: no cover
        failure = str(exc)
    return None, failure
|
||||
|
||||
|
||||
def fetch_page(url: str, timeout: float, user_agent: str) -> CrawlResult:
    """Fetch *url* and extract its links, title and canonical URL.

    Args:
        url: Absolute URL to request.
        timeout: Socket timeout in seconds passed to ``urlopen``.
        user_agent: Value for the ``User-Agent`` request header.

    Returns:
        A ``CrawlResult``. ``error`` is set when the request failed; this
        function never raises for a bad URL or network failure. ``skipped``
        is True when the response is neither HTML nor XHTML, in which case
        the body is not read. Otherwise ``links`` holds the raw anchor hrefs
        and ``canonical_url`` the normalized canonical link ("" when the page
        declares none or declares a malformed one).
    """
    try:
        # Building the Request is inside the try block as well: it raises
        # ValueError for a URL without a usable scheme, and that must become
        # an error result instead of escaping from a worker thread.
        request = Request(
            url,
            headers={
                "User-Agent": user_agent,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            },
        )
        with urlopen(request, timeout=timeout) as response:
            content_type = response.headers.get("Content-Type", "").lower()
            if "text/html" not in content_type and "application/xhtml+xml" not in content_type:
                return CrawlResult(url=url, links=[], skipped=True)

            content = response.read().decode(response.headers.get_content_charset() or "utf-8", errors="replace")
    except HTTPError as exc:
        return CrawlResult(url=url, links=[], error=f"HTTP {exc.code}")
    except URLError as exc:
        return CrawlResult(url=url, links=[], error=str(exc.reason))
    except TimeoutError:
        return CrawlResult(url=url, links=[], error="request timed out")
    except Exception as exc:  # pragma: no cover
        return CrawlResult(url=url, links=[], error=str(exc))

    parser = HTMLPageParser()
    parser.feed(content)

    canonical_url = ""
    if parser.canonical_href:
        try:
            canonical_url = normalize_url(urljoin(url, parser.canonical_href))
        except ValueError:
            # urllib.parse rejects some malformed hrefs (e.g. unmatched
            # brackets in the host); treat the page as having no canonical.
            canonical_url = ""

    return CrawlResult(
        url=url,
        links=parser.links,
        title=parser.title,
        canonical_url=canonical_url,
    )
|
||||
|
||||
|
||||
def fetch_page_with_delay(url: str, timeout: float, user_agent: str, delay: float) -> CrawlResult:
    """Sleep for *delay* seconds when it is positive, then fetch *url*."""
    wants_pause = delay > 0
    if wants_pause:
        time.sleep(delay)
    result = fetch_page(url, timeout=timeout, user_agent=user_agent)
    return result
|
||||
|
||||
|
||||
def print_progress(state: CrawlState, max_pages: int, current_url: str) -> None:
    """Print a one-line status: visited/limit, recorded URLs, queue size, current URL."""
    visited_total = len(state.visited)
    recorded_total = len(state.records)
    waiting_total = len(state.queue)
    status = (
        f"[{visited_total}/{max_pages}] Found {recorded_total} URL(s), "
        f"queued {waiting_total} more: {current_url}"
    )
    print(status)
|
||||
|
||||
|
||||
def poll_runtime_control(control: RuntimeControl, log_path: Path) -> None:
    """Apply any pending key presses to *control* (Windows consoles only).

    P pauses a running crawl, R resumes a paused one and Q requests a stop.
    On every other platform this returns immediately.
    """
    if os.name != "nt":
        return

    while msvcrt.kbhit():
        pressed = msvcrt.getwch().lower()

        if pressed == "q":
            control.stop_requested = True
            log_message(log_path, "Stop requested by user")
            continue

        if pressed == "p":
            if not control.paused:
                control.paused = True
                print("Paused. Press R to resume or Q to stop.")
                log_message(log_path, "Crawl paused by user")
            continue

        if pressed == "r" and control.paused:
            control.paused = False
            print("Resuming crawl.")
            log_message(log_path, "Crawl resumed by user")
|
||||
|
||||
|
||||
def discover_robots_sitemaps(
    start_url: str,
    timeout: float,
    user_agent: str,
    log_path: Path,
) -> set[str]:
    """Return the normalized sitemap URLs announced in the site's robots.txt.

    Args:
        start_url: Any URL on the site; ``/robots.txt`` is resolved against it.
        timeout: Request timeout in seconds.
        user_agent: Value for the ``User-Agent`` request header.
        log_path: Log file that receives progress and failure messages.

    Returns:
        The set of sitemap URLs found; empty when robots.txt is unavailable
        or lists none.
    """
    robots_url = normalize_url(urljoin(start_url, "/robots.txt"))
    content, error = fetch_text(robots_url, timeout, user_agent, "text/plain,*/*;q=0.8")
    # Test the content rather than the error text: an exception whose message
    # is empty yields a falsy error while content is still None.
    if content is None:
        log_message(log_path, f"robots.txt not available at {robots_url}: {error}")
        return set()

    sitemap_urls: set[str] = set()
    for line in content.splitlines():
        # Tolerate whitespace before the field name.
        entry = line.strip()
        if not entry.lower().startswith("sitemap:"):
            continue
        raw_url = entry.split(":", 1)[1].strip()
        if not raw_url:
            continue
        try:
            sitemap_urls.add(normalize_url(raw_url))
        except ValueError as exc:
            # urlsplit rejects some malformed URLs; skip that single entry.
            log_message(log_path, f"Ignoring malformed sitemap URL in robots.txt ({raw_url}): {exc}")

    if sitemap_urls:
        log_message(log_path, f"Discovered {len(sitemap_urls)} sitemap reference(s) from robots.txt")
    return sitemap_urls
|
||||
|
||||
|
||||
def xml_local_name(tag: str) -> str:
    """Strip an ElementTree ``{namespace}`` prefix from *tag*, if it has one."""
    _, _, local_part = tag.rpartition("}")
    return local_part
|
||||
|
||||
|
||||
def parse_sitemap_urls(
    sitemap_url: str,
    allowed_hosts: set[str],
    include_subdomains: bool,
    timeout: float,
    user_agent: str,
    log_path: Path,
    seen_sitemaps: set[str],
) -> set[str]:
    """Collect the in-scope page URLs listed by an XML sitemap.

    A ``<urlset>`` contributes its ``<loc>`` values directly; a
    ``<sitemapindex>`` is followed recursively into each child sitemap.

    Args:
        sitemap_url: Sitemap to download.
        allowed_hosts: Hosts that are in scope (see ``should_visit``).
        include_subdomains: Whether subdomains of the allowed hosts count.
        timeout: Request timeout in seconds.
        user_agent: Value for the ``User-Agent`` request header.
        log_path: Log file that receives failure messages.
        seen_sitemaps: Sitemaps already handled. It is updated in place and
            keeps index files that reference each other from looping.

    Returns:
        Normalized URLs that pass ``should_visit``. The set is empty when the
        sitemap was already seen, is out of scope, or cannot be fetched or
        parsed.
    """
    normalized_sitemap = normalize_url(sitemap_url)
    if normalized_sitemap in seen_sitemaps:
        return set()
    seen_sitemaps.add(normalized_sitemap)

    if not should_visit(normalized_sitemap, allowed_hosts, include_subdomains):
        return set()

    content, error = fetch_text(normalized_sitemap, timeout, user_agent, "application/xml,text/xml;q=0.9,*/*;q=0.8")
    # Test the content rather than the error text: an exception whose message
    # is empty yields a falsy error while content is still None.
    if content is None:
        log_message(log_path, f"Sitemap fetch failed for {normalized_sitemap}: {error}")
        return set()

    try:
        # NOTE(review): xml.etree is not hardened against maliciously crafted
        # XML and this document comes from the crawled site.
        root = ET.fromstring(content)
    except ET.ParseError as exc:
        log_message(log_path, f"Sitemap parse failed for {normalized_sitemap}: {exc}")
        return set()

    tag_name = xml_local_name(root.tag)
    discovered_urls: set[str] = set()

    if tag_name not in ("urlset", "sitemapindex"):
        log_message(log_path, f"Unsupported sitemap format at {normalized_sitemap}")
        return discovered_urls

    for element in root.findall(".//*"):
        if xml_local_name(element.tag) != "loc" or not element.text:
            continue
        try:
            location = normalize_url(element.text.strip())
        except ValueError as exc:
            # One malformed <loc> must not abort the whole sitemap import.
            log_message(log_path, f"Ignoring malformed URL in {normalized_sitemap}: {exc}")
            continue

        if tag_name == "sitemapindex":
            discovered_urls.update(
                parse_sitemap_urls(
                    location,
                    allowed_hosts,
                    include_subdomains,
                    timeout,
                    user_agent,
                    log_path,
                    seen_sitemaps,
                )
            )
        elif should_visit(location, allowed_hosts, include_subdomains):
            discovered_urls.add(location)

    return discovered_urls
|
||||
|
||||
|
||||
def seed_from_xml_sitemaps(
    state: CrawlState,
    timeout: float,
    user_agent: str,
    log_path: Path,
) -> None:
    """Pre-fill the crawl with URLs advertised by the site's XML sitemaps.

    Sitemaps come from robots.txt plus the conventional ``/sitemap.xml``.
    Page URLs are recorded and queued if they were not queued or visited
    before; document URLs are recorded only when ``state.include_documents``
    is set. ``state.discovered_from_sitemaps`` is increased by the number of
    URLs added.
    """
    hosts = build_allowed_hosts(state.start_url)
    candidates = discover_robots_sitemaps(state.start_url, timeout, user_agent, log_path)
    candidates.add(normalize_url(urljoin(state.start_url, "/sitemap.xml")))

    handled_sitemaps: set[str] = set()
    found: set[str] = set()
    for candidate in candidates:
        listed = parse_sitemap_urls(
            candidate,
            hosts,
            state.include_subdomains,
            timeout,
            user_agent,
            log_path,
            handled_sitemaps,
        )
        found.update(listed)

    added = 0
    for found_url in found:
        target = resolve_alias(found_url, state.alias_to_canonical)

        if not is_document_url(target):
            register_record(state, target, "page")
            if target in state.visited or target in state.queued:
                continue
            state.queue.append(target)
            state.queued.add(target)
            added += 1
        elif state.include_documents:
            register_record(state, target, "document")
            added += 1

    state.discovered_from_sitemaps += added
    log_message(log_path, f"Added {added} URL(s) from XML sitemap discovery")
|
||||
|
||||
|
||||
def process_crawl_result(
    state: CrawlState,
    result: CrawlResult,
    allowed_hosts: set[str],
    log_path: Path,
) -> None:
    """Merge one fetch result into the crawl state.

    * Failed fetches are appended to ``state.errors`` and logged.
    * Non-HTML responses increase ``state.skipped_count`` and are registered
      as "document".
    * For HTML pages, an in-scope canonical URL is remembered as the alias
      target of the page and queued; every in-scope link is recorded, and
      page links not seen before are queued.

    Args:
        state: Crawl state, mutated in place.
        result: Outcome of fetching one URL.
        allowed_hosts: Hosts that are in scope for the crawl.
        log_path: Log file that receives error messages.
    """
    if result.error:
        state.errors.append({"url": result.url, "error": result.error})
        log_message(log_path, f"Error fetching {result.url}: {result.error}")
        return

    if result.skipped:
        state.skipped_count += 1
        # NOTE(review): the caller has already registered this URL as "page",
        # and register_record never downgrades page -> document, so the type
        # presumably stays "page" here - confirm that this is intended.
        register_record(state, result.url, "document")
        return

    canonical_url = ""
    if result.canonical_url and should_visit(result.canonical_url, allowed_hosts, state.include_subdomains):
        canonical_url = resolve_alias(result.canonical_url, state.alias_to_canonical)
        # A page that is its own canonical needs no alias entry: resolve_alias
        # returns the same URL either way, and the entry only bloats the state.
        if canonical_url != result.url:
            state.alias_to_canonical[result.url] = canonical_url
        register_record(state, canonical_url, "page", title=result.title, canonical_url=canonical_url)
        if canonical_url not in state.visited and canonical_url not in state.queued:
            state.queue.append(canonical_url)
            state.queued.add(canonical_url)
    register_record(state, result.url, "page", title=result.title, canonical_url=canonical_url)

    for raw_link in result.links:
        try:
            absolute = normalize_url(urljoin(result.url, raw_link))
        except ValueError as exc:
            # urllib.parse raises ValueError for some malformed hrefs (e.g.
            # unmatched brackets in the host). Skip that one link instead of
            # letting it abort the whole crawl.
            log_message(log_path, f"Skipping malformed link {raw_link!r} on {result.url}: {exc}")
            continue

        if not should_visit(absolute, allowed_hosts, state.include_subdomains):
            continue

        absolute = resolve_alias(absolute, state.alias_to_canonical)
        if is_document_url(absolute):
            if state.include_documents:
                register_record(state, absolute, "document")
            continue

        register_record(state, absolute, "page")
        if absolute not in state.queued and absolute not in state.visited:
            state.queue.append(absolute)
            state.queued.add(absolute)
|
||||
|
||||
|
||||
def crawl_site(
    state: CrawlState,
    max_pages: int,
    delay: float,
    timeout: float,
    user_agent: str,
    state_path: Path,
    output_path: Path,
    log_path: Path,
    save_every: int,
    workers: int,
) -> tuple[CrawlState, bool]:
    """Drain the crawl queue, fetching pages until a stop condition is met.

    The loop ends when the queue is empty, when ``len(state.visited)`` reaches
    *max_pages*, or when a stop is requested through ``poll_runtime_control``.
    With ``workers <= 1`` pages are fetched one at a time on the calling
    thread; otherwise fetches run on a ``ThreadPoolExecutor`` while all state
    updates happen on the calling thread.

    Args:
        state: Crawl state, mutated in place.
        max_pages: Upper bound for ``len(state.visited)``.
        delay: Seconds each fetch sleeps before issuing its request.
        timeout: Request timeout in seconds.
        user_agent: Value for the ``User-Agent`` request header.
        state_path: Where the resumable state is saved.
        output_path: Where the CSV is written.
        log_path: Log file for progress messages.
        save_every: Number of processed results between intermediate saves.
        workers: Number of fetch threads; values <= 1 select the serial loop.

    Returns:
        ``(state, user_stopped)`` where *user_stopped* is True when the crawl
        ended because a stop was requested.
    """
    allowed_hosts = build_allowed_hosts(state.start_url)
    processed_since_save = 0
    user_stopped = False
    control = RuntimeControl()

    if workers <= 1:
        # Serial mode: fetch and process one URL per iteration.
        while state.queue and len(state.visited) < max_pages:
            poll_runtime_control(control, log_path)
            if control.stop_requested:
                user_stopped = True
                print("Stop requested. Saving progress and finishing cleanly...")
                break

            # Block here while paused, still watching for a stop request.
            while control.paused and not control.stop_requested:
                time.sleep(0.2)
                poll_runtime_control(control, log_path)

            # A stop may have been requested while paused.
            if control.stop_requested:
                user_stopped = True
                print("Stop requested. Saving progress and finishing cleanly...")
                break

            # Aliases learned after this URL was queued may redirect it to a
            # canonical URL that has been visited already.
            current = resolve_alias(state.queue.popleft(), state.alias_to_canonical)
            if current in state.visited:
                continue

            state.visited.add(current)
            register_record(state, current, "page")
            print_progress(state, max_pages, current)

            result = fetch_page_with_delay(current, timeout=timeout, user_agent=user_agent, delay=delay)
            process_crawl_result(state, result, allowed_hosts, log_path)

            processed_since_save += 1
            if processed_since_save >= save_every:
                write_csv(state.records, output_path)
                save_state(state, state_path, output_path)
                log_message(log_path, f"Saved progress after {len(state.visited)} visited page(s)")
                processed_since_save = 0
    else:
        with ThreadPoolExecutor(max_workers=workers) as executor:
            # In-flight fetches: future -> URL being fetched.
            pending: dict[object, str] = {}

            # Keep going while fetches are outstanding or more may be started.
            while pending or (state.queue and len(state.visited) < max_pages):
                poll_runtime_control(control, log_path)

                if control.stop_requested:
                    user_stopped = True
                    print("Stop requested. No new pages will be queued. Waiting for active requests to finish...")
                    break

                if control.paused:
                    # While paused, submit nothing new but keep collecting
                    # results of fetches that were already running.
                    if pending:
                        completed, _ = wait(pending.keys(), timeout=0.2, return_when=FIRST_COMPLETED)
                        for future in completed:
                            pending.pop(future, None)
                            result = future.result()
                            process_crawl_result(state, result, allowed_hosts, log_path)
                            processed_since_save += 1
                    else:
                        time.sleep(0.2)

                    if processed_since_save >= save_every:
                        write_csv(state.records, output_path)
                        save_state(state, state_path, output_path)
                        log_message(log_path, f"Saved progress after {len(state.visited)} visited page(s)")
                        processed_since_save = 0
                    continue

                # Top up the pool to at most `workers` in-flight fetches.
                while state.queue and len(pending) < workers and len(state.visited) < max_pages:
                    current = resolve_alias(state.queue.popleft(), state.alias_to_canonical)
                    if current in state.visited:
                        continue

                    # Marked visited at submission time so the URL cannot be
                    # submitted a second time while its fetch is running.
                    state.visited.add(current)
                    register_record(state, current, "page")
                    print_progress(state, max_pages, current)
                    future = executor.submit(fetch_page_with_delay, current, timeout, user_agent, delay)
                    pending[future] = current

                # Nothing was submitted (every queued URL was already visited).
                if not pending:
                    continue

                # The short timeout keeps keyboard polling responsive.
                completed, _ = wait(pending.keys(), timeout=0.2, return_when=FIRST_COMPLETED)
                for future in completed:
                    pending.pop(future, None)
                    result = future.result()
                    process_crawl_result(state, result, allowed_hosts, log_path)
                    processed_since_save += 1

                if processed_since_save >= save_every:
                    write_csv(state.records, output_path)
                    save_state(state, state_path, output_path)
                    log_message(log_path, f"Saved progress after {len(state.visited)} visited page(s)")
                    processed_since_save = 0

            # After a stop request, let in-flight fetches finish and keep
            # their results before the final save.
            if user_stopped and pending:
                completed, _ = wait(pending.keys())
                for future in completed:
                    pending.pop(future, None)
                    result = future.result()
                    process_crawl_result(state, result, allowed_hosts, log_path)

    # Always finish with a full save, whatever ended the loop.
    write_csv(state.records, output_path)
    save_state(state, state_path, output_path)
    log_message(log_path, f"Final save completed with {len(state.records)} URL(s) recorded")
    return state, user_stopped
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Define the command-line interface and parse ``sys.argv``."""
    parser = argparse.ArgumentParser(
        description="Crawl a website and export discovered internal URLs to a CSV sitemap.",
    )

    # (names, options) pairs, registered in this order so --help keeps its layout.
    argument_specs = [
        (("url",), {"nargs": "?", "help": "Starting URL to crawl, for example https://example.com"}),
        (
            ("-o", "--output"),
            {"help": f"Output CSV path. Defaults to {DEFAULT_OUTPUT_NAME} in the script folder."},
        ),
        (
            ("--max-pages",),
            {
                "type": int,
                "default": DEFAULT_MAX_PAGES,
                "help": f"Maximum number of pages to crawl before stopping. Default: {DEFAULT_MAX_PAGES}",
            },
        ),
        (
            ("--delay",),
            {"type": float, "default": 0.0, "help": "Delay in seconds between requests. Default: 0"},
        ),
        (
            ("--timeout",),
            {"type": float, "default": 15.0, "help": "Request timeout in seconds. Default: 15"},
        ),
        (
            ("--include-subdomains",),
            {"action": "store_true", "help": "Also crawl subdomains of the starting host."},
        ),
        (
            ("--include-documents",),
            {
                "action": "store_true",
                "help": "Include document links like PDF, CSV, DOC, and DOCX in the sitemap output.",
            },
        ),
        (
            ("--save-every",),
            {
                "type": int,
                "default": DEFAULT_SAVE_EVERY,
                "help": f"Save progress after this many pages. Default: {DEFAULT_SAVE_EVERY}",
            },
        ),
        (
            ("--resume",),
            {
                "action": "store_true",
                "help": "Resume from the saved crawl state if a state file already exists.",
            },
        ),
        (
            ("--fresh",),
            {"action": "store_true", "help": "Ignore any saved crawl state and start over."},
        ),
        (
            ("--workers",),
            {
                "type": int,
                "default": 0,
                "help": f"Number of worker threads. Use 1 to disable multithreading. Default when prompted on: {DEFAULT_WORKERS}",
            },
        ),
    ]
    for names, options in argument_specs:
        parser.add_argument(*names, **options)

    return parser.parse_args()
|
||||
|
||||
|
||||
def run_crawl(
    *,
    start_url: str,
    output_path: Path,
    max_pages: int = DEFAULT_MAX_PAGES,
    delay: float = 0.0,
    timeout: float = 15.0,
    include_subdomains: bool = False,
    include_documents: bool = False,
    save_every: int = DEFAULT_SAVE_EVERY,
    workers: int = DEFAULT_WORKERS,
    resume: bool = True,
    fresh: bool = False,
    user_agent: str = DEFAULT_USER_AGENT,
) -> CrawlRunResult:
    """Run one crawl without prompting and return a summary of the run.

    Args:
        start_url: Site to crawl; ``https://`` is assumed when no scheme is given.
        output_path: CSV destination; state and log paths are derived from it.
        max_pages: Page limit (at least 1). When resuming a state that already
            has visited pages, the limit is raised to at least
            ``visited + DEFAULT_RESUME_PAGE_INCREMENT``.
        delay: Seconds to sleep before each request (negative values become 0).
        timeout: Request timeout in seconds (values below 1 become 1).
        include_subdomains: Also crawl subdomains (used for new crawls only;
            a resumed crawl keeps the setting stored in its state).
        include_documents: Record document links; must match a resumed state.
        save_every: Processed results between intermediate saves (at least 1).
        workers: Fetch threads (at least 1).
        resume: Load an existing state file when one is present.
        fresh: Ignore any existing state file.
        user_agent: Value for the ``User-Agent`` request header.

    Returns:
        A ``CrawlRunResult`` describing the finished run.

    Raises:
        ValueError: if the URL is missing or not http(s), if the saved state
            cannot be read, or if it belongs to a different start URL or
            document setting.
    """
    if not start_url:
        raise ValueError("A starting URL is required.")

    if "://" not in start_url:
        start_url = f"https://{start_url}"

    normalized_start = normalize_url(start_url)
    if not is_http_url(normalized_start):
        raise ValueError("Only http and https URLs are supported.")

    output_path = Path(output_path)
    state_path = get_state_path(output_path)
    log_path = get_log_path(output_path)

    state: CrawlState
    if state_path.exists() and not fresh and resume:
        try:
            state = load_state(state_path)
        except (ValueError, KeyError, TypeError) as exc:
            # Covers invalid JSON or encoding (both ValueError subclasses), a
            # missing "start_url" key and a payload of the wrong shape. Report
            # them uniformly as the ValueError that callers already handle.
            raise ValueError(
                f"The saved crawl state at {state_path} could not be read ({exc}). "
                "Start a fresh crawl to replace it."
            ) from exc
        if state.start_url != normalized_start:
            raise ValueError(
                "The saved crawl state belongs to a different starting URL. "
                "Use a different output name or start a fresh crawl."
            )
        if state.include_documents != include_documents:
            raise ValueError(
                "The saved crawl state uses a different document setting. "
                "Keep the same choice or start a fresh crawl."
            )
    else:
        state = initialize_state(normalized_start, include_subdomains, include_documents)

    effective_workers = max(int(workers), 1)
    effective_max_pages = max(int(max_pages), 1)
    if state.visited:
        # Resuming: make sure the limit leaves room beyond what was already visited.
        effective_max_pages = max(effective_max_pages, len(state.visited) + DEFAULT_RESUME_PAGE_INCREMENT)
    else:
        # Nothing visited yet: seed the queue from the site's XML sitemaps.
        seed_from_xml_sitemaps(state, max(timeout, 1.0), user_agent, log_path)

    log_message(log_path, f"Starting crawl for {state.start_url}")
    log_message(log_path, f"Output CSV: {output_path.resolve()}")
    log_message(log_path, f"State file: {state_path.resolve()}")
    log_message(log_path, f"Multithreading workers: {effective_workers}")
    log_message(log_path, f"Include documents: {state.include_documents}")

    state, user_stopped = crawl_site(
        state=state,
        max_pages=effective_max_pages,
        delay=max(delay, 0.0),
        timeout=max(timeout, 1.0),
        user_agent=user_agent,
        state_path=state_path,
        output_path=output_path,
        log_path=log_path,
        save_every=max(save_every, 1),
        workers=effective_workers,
    )

    if user_stopped:
        log_message(log_path, "Crawl stopped by user")
    elif state.queue and len(state.visited) >= effective_max_pages:
        log_message(log_path, "Crawl stopped at max page limit")
    elif state.queue:
        log_message(log_path, "Crawl stopped before queue emptied")
    else:
        log_message(log_path, "Crawl completed with empty queue")

    return CrawlRunResult(
        state=state,
        user_stopped=user_stopped,
        output_path=output_path,
        state_path=state_path,
        log_path=log_path,
        max_pages=effective_max_pages,
        workers=effective_workers,
    )
|
||||
|
||||
|
||||
def main() -> int:
    """Interactive command-line entry point.

    Parses the arguments, prompts for anything that was not supplied, runs
    the crawl through ``run_crawl`` and prints a summary.

    Returns:
        Process exit code: 0 on success, 1 when the input is invalid or the
        crawl could not be started.
    """
    args = parse_args()

    start_url = prompt_if_missing(args.url, "Enter the website URL to crawl: ")
    if not start_url:
        print("A starting URL is required.", file=sys.stderr)
        return 1

    if "://" not in start_url:
        start_url = f"https://{start_url}"

    normalized_start = normalize_url(start_url)
    if not is_http_url(normalized_start):
        print("Only http and https URLs are supported.", file=sys.stderr)
        return 1

    output_value = prompt_if_missing(args.output, f"Enter output CSV path [{DEFAULT_OUTPUT_NAME}]: ")
    output_path = Path(output_value) if output_value else SCRIPT_DIR / DEFAULT_OUTPUT_NAME
    state_path = get_state_path(output_path)
    log_path = get_log_path(output_path)
    include_documents = args.include_documents or prompt_yes_no(
        "Include document links such as PDF, CSV, DOC, and DOCX in the sitemap?",
        default=False,
    )
    workers = args.workers
    if workers <= 0:
        enable_multithreading = prompt_yes_no(
            f"Enable multithreading for faster scanning? {DEFAULT_WORKERS} worker threads will be used.",
            default=True,
        )
        workers = DEFAULT_WORKERS if enable_multithreading else 1

    print(f"Crawling {normalized_start}")
    print(f"Output file: {output_path.resolve()}")
    print(f"State file: {state_path.resolve()}")
    print(f"Log file: {log_path.resolve()}")
    resume_existing = False
    if state_path.exists() and not args.fresh:
        resume_existing = args.resume or prompt_yes_no(
            f"Found saved crawl state at {state_path.name}. Resume from where it left off?",
            default=True,
        )

    # run_crawl blocks until the crawl has ended, so the key bindings must be
    # announced before it starts to be of any use.
    if os.name == "nt":
        print("Press P to pause, R to resume, or Q to stop cleanly and save progress.")

    try:
        run_result = run_crawl(
            start_url=normalized_start,
            output_path=output_path,
            max_pages=args.max_pages,
            delay=args.delay,
            timeout=args.timeout,
            include_subdomains=args.include_subdomains,
            include_documents=include_documents,
            save_every=args.save_every,
            workers=workers,
            resume=resume_existing,
            fresh=args.fresh,
            user_agent=DEFAULT_USER_AGENT,
        )
    except ValueError as exc:
        print(str(exc), file=sys.stderr)
        return 1

    state = run_result.state
    user_stopped = run_result.user_stopped
    effective_max_pages = run_result.max_pages

    print(f"Max pages: {effective_max_pages}")
    print(f"Include documents: {'Yes' if state.include_documents else 'No'}")
    print(f"Multithreading: {'Yes' if run_result.workers > 1 else 'No'}")
    print(f"Worker threads: {run_result.workers}")
    if resume_existing:
        print("Resumed from the existing crawl state file.")
        log_message(log_path, "Resumed from existing crawl state")

    print(f"Found {len(state.records)} unique URL(s).")
    print(f"Visited pages: {len(state.visited)}")
    print(f"Queued pages remaining: {len(state.queue)}")
    print(f"URLs added from XML sitemaps: {state.discovered_from_sitemaps}")
    if state.errors:
        print(f"Pages with errors: {len(state.errors)}")
        # Show only the first few errors on the console.
        for result in state.errors[:10]:
            print(f"  {result['url']} -> {result['error']}")
    if state.skipped_count:
        print(f"Non-HTML pages skipped while crawling: {state.skipped_count}")

    # run_crawl has already written the outcome to the log file; only the
    # console message is produced here so the log has no duplicate entries.
    if user_stopped:
        print("Stopped by user. Run it again to continue from the saved state.")
    elif state.queue and len(state.visited) >= effective_max_pages:
        print("Stopped because the max page limit was reached. Run it again to continue.")
    elif state.queue:
        print("Stopped before the queue was empty. Run it again to continue.")
    else:
        print("Crawl complete. No queued pages remain.")

    print("Done.")
    return 0
|
||||
|
||||
|
||||
# Run the command-line interface only when executed as a script, and hand
# main()'s return value to the interpreter as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user