# WDW-Sitemap-and-Scraper-Docker/Page Importer/page_importer/scraper.py

from __future__ import annotations

import json
import re
import traceback
from html import unescape
from typing import Iterable

import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag

from page_importer.dates import normalize_date
from page_importer.models import ScrapeOptions, ScrapedPost

# Lower-cased JSON-LD "@type" values that find_article_payload() accepts as
# the node describing the page's article. Comparison is done after
# lower-casing the declared type(s), so entries here must stay lower-case.
JSON_ARTICLE_TYPES = {
    "article",
    "blogposting",
    "newsarticle",
    "report",
    "webpage",
}

# CSS selectors tried in order by extract_body(). The first match holding at
# least 120 characters of text is returned; shorter matches are only kept as
# a fallback. Article-scoped selectors are listed before their unscoped
# equivalents, and whole-element fallbacks ("article", "main") come last.
BODY_SELECTORS = [
    "article .entry-content",
    "article .post-content",
    "article .node__content",
    "article .node .content",
    "article .node-content",
    "article .field-name-body .field-item",
    "article .field-name-body",
    "article .field--name-body",
    "article .article-body",
    "article .content",
    ".post-content",
    ".entry-content",
    ".node__content",
    ".node .content",
    ".node-content",
    ".field-name-body .field-item",
    ".field-name-body",
    ".field--name-body",
    ".article-body",
    "#content-area .node .content",
    "article",
    "main article",
    "main",
]

# Selectors whose link text is collected as categories (passed to
# extract_terms() from merge_fallback_data()).
CATEGORY_SELECTORS = [
    ".cat-links a",
    ".post-categories a",
    ".field--name-field-category a",
    ".tags a[rel='category tag']",
    ".terms a",
    ".taxonomy a",
]

# Selectors whose link text is collected as tags (passed to extract_terms()
# from merge_fallback_data()). Note that ".terms a" also appears in
# CATEGORY_SELECTORS, so the same link can be reported as both.
TAG_SELECTORS = [
    ".tags-links a",
    ".post-tags a",
    ".field--name-field-tags a",
    "a[rel='tag']",
    ".terms a",
]

# Selectors tried in order by extract_author() after the <meta name="author">
# tag has been checked.
AUTHOR_SELECTORS = [
    "[rel='author']",
    ".author a",
    ".byline a",
    ".submitted a",
    ".node__submitted a",
    ".node-info a",
    ".createdby",
]

# Selectors tried in order by extract_date(). For each match the "datetime"
# attribute is preferred, then "content", then the element's text.
DATE_SELECTORS = [
    "time[datetime]",
    "meta[property='article:published_time']",
    "meta[name='publish_date']",
    "meta[name='pubdate']",
    ".date-display-single",
    ".submitted",
    ".node-info",
]

# Matches a long-form date such as "Friday, January 5, 2024": a weekday name,
# a comma, an alphabetic month word (not validated against real month names),
# a 1-2 digit day, a comma and a 4 digit year. Used by normalize_drupal_date().
DRUPAL_TITLE_DATE_PATTERN = re.compile(
    r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+"
    r"([A-Za-z]+)\s+\d{1,2},\s+\d{4}"
)


class Scraper:
    """Fetch a page over HTTP and turn it into a ``ScrapedPost``.

    One ``requests.Session`` is created per instance and reused for every
    call to :meth:`scrape`; its ``User-Agent`` header comes from the options.
    """

    def __init__(self, options: ScrapeOptions) -> None:
        """Store *options* and prepare the HTTP session."""
        self.options = options
        self.session = requests.Session()
        user_agent_header = {"User-Agent": options.user_agent}
        self.session.headers.update(user_agent_header)

    def scrape(self, url: str) -> ScrapedPost:
        """Download *url* and extract its article fields.

        This method does not raise for ordinary failures: any ``Exception``
        is caught and recorded on the returned post (``error`` and
        ``error_details``), leaving ``success`` unset by this method.

        Args:
            url: Address of the page to fetch.

        Returns:
            The populated post. ``success`` is set to ``True`` only when both
            a title and a body were extracted.
        """
        post = ScrapedPost(source_url=url)
        response: requests.Response | None = None
        opts = self.options
        try:
            response = self.session.get(url, timeout=opts.request_timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            post.cms = detect_cms(soup)

            # Structured data is preferred unless the caller forces heuristics.
            json_ld_article = extract_article_json_ld(soup)
            if json_ld_article and not opts.force_heuristics:
                apply_article_data(post, json_ld_article, soup, opts)

            # Heuristics fill whatever the structured data left empty.
            merge_fallback_data(post, soup, opts)
            post.body_html = sanitize_html(post.body_html)

            required = (("title", post.title), ("body_html", post.body_html))
            absent = [name for name, value in required if not value]
            if absent:
                raise ValueError(
                    "Unable to extract required field(s): "
                    f"{', '.join(absent)}. "
                    f"Detected CMS: {post.cms}. "
                    f"Publish date found: {'yes' if post.publish_date else 'no'}. "
                    f"Author found: {'yes' if post.author else 'no'}."
                )

            post.success = True
        except Exception as exc:
            # Boundary handler: report the failure on the post instead of raising.
            post.error = format_error_summary(url, exc, response, opts.request_timeout)
            post.error_details = format_error_details(url, exc, response)
        return post


def detect_cms(soup: BeautifulSoup) -> str:
    """Guess which CMS produced the page.

    The ``<meta name="generator">`` tag is consulted first; if it does not
    name a known CMS, the serialised markup is searched for tell-tale paths
    and identifiers.

    Args:
        soup: Parsed document.

    Returns:
        ``"wordpress"``, ``"drupal"``, ``"joomla"`` or ``"unknown"``.
    """
    generator = meta_content(soup, "meta", {"name": "generator"}).lower()
    for cms in ("wordpress", "drupal", "joomla"):
        if cms in generator:
            return cms

    # Serialising the whole document is comparatively expensive, so it is
    # deferred until the generator tag has failed to answer the question.
    html = str(soup).lower()
    if "/wp-content/" in html:
        return "wordpress"
    if "drupal-settings-json" in html or "sites/default/files" in html:
        return "drupal"
    if "com_content" in html or "joomla" in html:
        return "joomla"
    return "unknown"


def extract_article_json_ld(soup: BeautifulSoup) -> dict | None:
    """Return the first article-like JSON-LD node found in the document.

    Every ``<script type="application/ld+json">`` element is examined in
    document order; scripts with no text are skipped.

    Args:
        soup: Parsed document.

    Returns:
        The matching JSON-LD dict, or ``None`` when no script yields one.
    """
    for script_tag in soup.select("script[type='application/ld+json']"):
        raw_json = script_tag.string or script_tag.get_text(" ", strip=True)
        if raw_json:
            for candidate in parse_json_candidates(raw_json):
                match = find_article_payload(candidate)
                if match:
                    return match
    return None


def parse_json_candidates(raw: str) -> Iterable[dict | list]:
    """Yield the decoded JSON value for *raw*, if it can be decoded.

    The text is parsed as-is first. If that fails, runs of ASCII control
    characters are replaced by a single space and parsing is retried once.
    Nothing is yielded when both attempts fail.

    Args:
        raw: Text content of a JSON-LD script element.

    Yields:
        At most one decoded JSON value.
    """
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        # Raw control characters inside strings are invalid JSON; scrub and retry.
        scrubbed = re.sub(r"[\x00-\x1f]+", " ", raw).strip()
        try:
            decoded = json.loads(scrubbed)
        except json.JSONDecodeError:
            return
    yield decoded


def find_article_payload(payload: dict | list) -> dict | None:
    """Search a decoded JSON-LD value for an article-like node.

    Lists are searched element by element. For a dict, the contents of its
    ``@graph`` entry are searched first, then the dict's own ``@type`` (a
    string or a list of strings, compared case-insensitively) is checked
    against ``JSON_ARTICLE_TYPES``.

    Args:
        payload: Decoded JSON value.

    Returns:
        The first matching dict, or ``None``.
    """
    if isinstance(payload, list):
        hits = (hit for hit in map(find_article_payload, payload) if hit)
        return next(hits, None)
    if not isinstance(payload, dict):
        return None

    if "@graph" in payload:
        nested = find_article_payload(payload["@graph"])
        if nested:
            return nested

    declared = payload.get("@type")
    if isinstance(declared, str):
        kinds = {declared.lower()}
    else:
        kinds = {entry.lower() for entry in declared or [] if isinstance(entry, str)}
    return payload if kinds & JSON_ARTICLE_TYPES else None


def apply_article_data(
    post: ScrapedPost,
    article: dict,
    soup: BeautifulSoup,
    options: ScrapeOptions,
) -> None:
    """Copy fields from a JSON-LD article node onto *post*.

    Each field keeps its current value on *post* when the JSON-LD node has
    nothing usable for it. Author, categories and tags are only touched when
    the corresponding ``include_*`` option is enabled.

    Args:
        post: Post being populated; modified in place.
        article: JSON-LD node describing the article.
        soup: Parsed document, used when the node carries no usable body.
        options: Flags controlling which optional fields are filled.
    """
    headline = article.get("headline") or article.get("name")
    post.title = headline or post.title

    published = article.get("datePublished") or article.get("dateCreated")
    post.publish_date = normalize_date(published or post.publish_date)

    if options.include_author:
        json_ld_author = extract_author_from_json_ld(article)
        post.author = json_ld_author or post.author
    if options.include_categories:
        sections = normalize_terms(article.get("articleSection"))
        post.categories = sections or post.categories
    if options.include_tags:
        keywords = normalize_terms(article.get("keywords"))
        post.tags = keywords or post.tags

    body = extract_body_from_article(article, soup)
    post.body_html = body or post.body_html


def merge_fallback_data(post: ScrapedPost, soup: BeautifulSoup, options: ScrapeOptions) -> None:
    """Fill fields still missing on *post* using HTML heuristics.

    Title, publish date, author, body and tags are only extracted when
    currently empty. Categories are always merged with whatever the category
    selectors find, plus "Department:" values on Drupal pages.

    Args:
        post: Post being populated; modified in place.
        soup: Parsed document.
        options: Flags controlling which optional fields are filled.
    """
    post.title = post.title or extract_title(soup)
    post.publish_date = post.publish_date or extract_date(soup, post.cms)
    if options.include_author:
        post.author = post.author or extract_author(soup)
    post.body_html = post.body_html or extract_body(soup)

    if options.include_categories:
        selector_categories = extract_terms(soup, CATEGORY_SELECTORS)
        post.categories = merge_terms(post.categories, selector_categories)
        if post.cms == "drupal":
            departments = extract_drupal_department_categories(soup)
            post.categories = merge_terms(post.categories, departments)

    needs_tags = options.include_tags and not post.tags
    if needs_tags:
        post.tags = extract_terms(soup, TAG_SELECTORS)


def extract_title(soup: BeautifulSoup) -> str:
    """Return the page title using the best available source.

    Sources are tried in order: the ``og:title`` meta tag, the first
    non-empty heading matching one of the known title selectors, and finally
    the document's ``<title>`` element.

    Args:
        soup: Parsed document.

    Returns:
        The whitespace-normalised title, or ``""`` when none is found.
    """
    # Normalise like every other source so callers get consistent whitespace.
    og_title = clean_text(meta_content(soup, "meta", {"property": "og:title"}))
    if og_title:
        return og_title

    for selector in ("article h1", "h1.entry-title", "h1.page-title", "h1.title", "h1"):
        for node in soup.select(selector):
            heading = clean_text(node.get_text(" ", strip=True))
            # A heading without text (e.g. one wrapping only an image) must
            # not block the remaining selectors or the <title> fallback.
            if heading:
                return heading

    return clean_text(soup.title.get_text(" ", strip=True)) if soup.title else ""


def extract_date(soup: BeautifulSoup, cms: str = "unknown") -> str:
    """Return the normalised publish date found in the document.

    The first element matching each entry of ``DATE_SELECTORS`` is examined;
    its ``datetime`` attribute, ``content`` attribute or text (in that order
    of preference) is passed to ``normalize_date``. On Drupal pages a date
    next to the title is tried as a last resort.

    Args:
        soup: Parsed document.
        cms: CMS name as reported by ``detect_cms``.

    Returns:
        The first value ``normalize_date`` accepts, or ``""``.
    """
    for selector in DATE_SELECTORS:
        element = soup.select_one(selector)
        if element:
            raw_date = (
                element.get("datetime")
                or element.get("content")
                or element.get_text(" ", strip=True)
            )
            parsed = normalize_date(raw_date)
            if parsed:
                return parsed
    return extract_drupal_title_adjacent_date(soup) if cms == "drupal" else ""


def extract_author(soup: BeautifulSoup) -> str:
    """Return the author name found in the document.

    The ``<meta name="author">`` tag is preferred; otherwise the first
    element with non-empty text matching one of ``AUTHOR_SELECTORS`` is used.

    Args:
        soup: Parsed document.

    Returns:
        The whitespace-normalised author name, or ``""`` when none is found.
    """
    meta_author = clean_text(meta_content(soup, "meta", {"name": "author"}))
    if meta_author:
        return meta_author

    for selector in AUTHOR_SELECTORS:
        for node in soup.select(selector):
            name = clean_text(node.get_text(" ", strip=True))
            # Elements such as <link rel="author" href="..."> match the
            # selector but carry no text; keep looking instead of giving up.
            if name:
                return name
    return ""


def extract_body(soup: BeautifulSoup) -> str:
    """Return the article body markup located via ``BODY_SELECTORS``.

    Each selector's first match is copied, stripped of unwanted elements and
    serialised. The first candidate whose text is at least 120 characters
    long is returned immediately; otherwise the earliest candidate with any
    meaningful content is returned.

    Args:
        soup: Parsed document.

    Returns:
        Body HTML, or ``""`` when no selector yields usable content.
    """
    short_candidate = ""
    for selector in BODY_SELECTORS:
        matched = soup.select_one(selector)
        if not matched:
            continue

        # Work on a copy so the caller's document is left untouched.
        working_copy = clone_tag(matched)
        strip_unwanted(working_copy)
        markup = working_copy.decode_contents().strip()

        visible_text = BeautifulSoup(markup, "html.parser").get_text(" ", strip=True)
        if len(visible_text) >= 120:
            return markup
        if not short_candidate and has_meaningful_body_content(markup):
            short_candidate = markup
    return short_candidate


def extract_terms(soup: BeautifulSoup, selectors: list[str]) -> list[str]:
    """Collect the distinct, non-empty texts of elements matching *selectors*.

    Args:
        soup: Parsed document.
        selectors: CSS selectors, processed in the given order.

    Returns:
        Whitespace-normalised texts in first-seen order, without duplicates.
    """
    collected: list[str] = []
    for selector in selectors:
        labels = (clean_text(link.get_text(" ", strip=True)) for link in soup.select(selector))
        for label in labels:
            if label and label not in collected:
                collected.append(label)
    return collected


def extract_drupal_title_adjacent_date(soup: BeautifulSoup) -> str:
    """Find a long-form date printed next to the page title.

    The title node's following siblings are checked first. If none contains
    a date, the text of the closest ``header``/``div``/``section`` ancestor
    is checked, with the title text removed from its start when present.

    Args:
        soup: Parsed document.

    Returns:
        The normalised date, or ``""`` when nothing matches.
    """
    heading = find_title_node(soup)
    if not heading:
        return ""

    sibling_texts = (text_from_node(sibling) for sibling in heading.next_siblings)
    for sibling_text in sibling_texts:
        found = normalize_drupal_date(sibling_text)
        if found:
            return found

    container = heading.find_parent(["header", "div", "section"])
    if not container:
        return ""

    container_text = clean_text(container.get_text(" ", strip=True))
    heading_text = clean_text(heading.get_text(" ", strip=True))
    if heading_text and container_text.startswith(heading_text):
        # Drop the title so only the text after it is searched.
        container_text = clean_text(container_text[len(heading_text):])
    return normalize_drupal_date(container_text) or ""


def extract_drupal_department_categories(soup: BeautifulSoup) -> list[str]:
    """Collect category names from "Department:" labels in the document.

    Two passes are made. The first looks for text nodes consisting only of
    the label "Department:" and takes the value from the label's parent
    element or, failing that, from the parent's following siblings. The
    second looks for p/li/span/dt/dd elements whose text starts with
    "department:" and takes the value that follows on the same element.

    Args:
        soup: Parsed document.

    Returns:
        Accepted department names in first-seen order, without duplicates.
    """
    categories: list[str] = []
    # Matches a text node that is just the label, ignoring case and
    # surrounding whitespace.
    label_pattern = re.compile(r"^\s*Department:\s*$", re.IGNORECASE)

    for label_node in soup.find_all(string=label_pattern):
        parent = label_node.parent if isinstance(label_node.parent, Tag) else None
        if not parent:
            continue

        # The parent's combined text may already hold "Department: <value>".
        inline_value = extract_labeled_value(parent.get_text(" ", strip=True), "Department")
        normalized_inline_value = normalize_department_category(inline_value)
        if normalized_inline_value:
            categories = merge_terms(categories, [normalized_inline_value])
            continue

        # Otherwise take the first following sibling that yields an accepted value.
        for sibling in parent.next_siblings:
            value = normalize_department_category(text_from_node(sibling))
            if value:
                categories = merge_terms(categories, [value])
                break

    # Second pass: elements whose own text begins with the label.
    for candidate in soup.find_all(["p", "li", "span", "dt", "dd"]):
        text = clean_text(candidate.get_text(" ", strip=True))
        if not text.lower().startswith("department:"):
            continue
        extracted = normalize_department_category(extract_labeled_value(text, "Department"))
        if extracted:
            # merge_terms drops values already found by the first pass.
            categories = merge_terms(categories, [extracted])

    return categories


def extract_author_from_json_ld(article: dict) -> str:
    """Return the author name(s) declared on a JSON-LD article node.

    The ``author`` value may be a string, an object with a ``name`` key, or
    a list mixing both forms. Entries whose name is missing or is not a
    string are ignored rather than passed on for cleaning.

    Args:
        article: JSON-LD node describing the article.

    Returns:
        The cleaned names joined by ``", "``, or ``""`` when none is usable.
    """
    author = article.get("author")
    entries = author if isinstance(author, list) else [author]

    names: list[str] = []
    for entry in entries:
        raw_name = entry.get("name", "") if isinstance(entry, dict) else entry
        # Non-string names (None, numbers, nested lists) cannot be cleaned.
        if not isinstance(raw_name, str):
            continue
        name = clean_text(raw_name)
        if name:
            names.append(name)
    return ", ".join(names)


def extract_body_from_article(article: dict, soup: BeautifulSoup) -> str:
    """Return body HTML for a JSON-LD article node.

    When ``articleBody`` is a string longer than 120 characters after
    trimming, it is entity-unescaped and wrapped in a single ``<p>``.
    Otherwise the body is extracted from the document instead.

    Args:
        article: JSON-LD node describing the article.
        soup: Parsed document used for the fallback extraction.

    Returns:
        Body HTML, possibly empty.
    """
    article_body = article.get("articleBody")
    if not isinstance(article_body, str):
        return extract_body(soup)

    trimmed = article_body.strip()
    if len(trimmed) <= 120:
        return extract_body(soup)
    return f"<p>{unescape(trimmed)}</p>"


def normalize_terms(value: object) -> list[str]:
    """Turn a JSON-LD term value into a clean list of distinct terms.

    A string is split on ``,``, ``|`` and ``>``; a list contributes its
    string items as they are. Any other type yields no terms. Both forms are
    whitespace-normalised and de-duplicated, so the result is consistent
    regardless of how the source encoded its terms.

    Args:
        value: Raw ``articleSection``/``keywords`` style value.

    Returns:
        Non-empty terms in first-seen order, without duplicates.
    """
    if isinstance(value, str):
        candidates = re.split(r"[,|>]", value)
    elif isinstance(value, list):
        candidates = [item for item in value if isinstance(item, str)]
    else:
        return []

    terms: list[str] = []
    for candidate in candidates:
        cleaned = clean_text(candidate)
        if cleaned and cleaned not in terms:
            terms.append(cleaned)
    return terms


def merge_terms(*groups: list[str]) -> list[str]:
    """Combine several term lists into one list of distinct terms.

    Args:
        *groups: Term lists, merged in the order given.

    Returns:
        Whitespace-normalised, non-empty terms in first-seen order.
    """
    cleaned_terms = (clean_text(item) for group in groups for item in group)
    # dict.fromkeys keeps the first occurrence of each term, in order.
    return list(dict.fromkeys(term for term in cleaned_terms if term))


def normalize_drupal_date(value: str | None) -> str:
    """Normalise the first long-form date found in *value*.

    Args:
        value: Text that may contain a "Weekday, Month D, YYYY" date.

    Returns:
        The result of ``normalize_date`` for the matched text, or ``""`` when
        *value* is empty or holds no such date.
    """
    found = DRUPAL_TITLE_DATE_PATTERN.search(value) if value else None
    return normalize_date(found.group(0)) if found else ""


def meta_content(soup: BeautifulSoup, tag_name: str, attrs: dict[str, str]) -> str:
    """Return the stripped ``content`` attribute of the first matching tag.

    Args:
        soup: Parsed document.
        tag_name: Name of the tag to look for.
        attrs: Attribute filter passed to ``soup.find``.

    Returns:
        The attribute value without surrounding whitespace, or ``""`` when
        the tag is missing or its ``content`` is empty.
    """
    match = soup.find(tag_name, attrs=attrs)
    if not match:
        return ""
    content = match.get("content")
    return content.strip() if content else ""


def clean_text(value: str) -> str:
    """Collapse every run of whitespace in *value* to one space and trim it.

    Falsy input (``None`` or ``""``) is treated as the empty string.
    """
    text = value or ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


def text_from_node(node: object) -> str:
    """Return the whitespace-normalised text of a parse-tree node.

    Args:
        node: A ``Tag``, a ``NavigableString`` or anything else.

    Returns:
        The node's text, or ``""`` for objects that are neither type.
    """
    if isinstance(node, Tag):
        raw_text = node.get_text(" ", strip=True)
    elif isinstance(node, NavigableString):
        raw_text = str(node)
    else:
        return ""
    return clean_text(raw_text)


def sanitize_html(html: str) -> str:
    """Return *html* with unwanted elements and dangerous attributes removed.

    Args:
        html: Markup fragment to clean; may be empty.

    Returns:
        The cleaned, trimmed markup, or ``""`` for empty input.
    """
    if not html:
        return ""
    document = BeautifulSoup(html, "html.parser")
    for scrub in (strip_unwanted, strip_dangerous_attributes):
        scrub(document)
    return document.decode_contents().strip()


def has_meaningful_body_content(html: str) -> bool:
    """Tell whether *html* holds any text or a link/media element.

    Args:
        html: Markup fragment to inspect.

    Returns:
        ``True`` when the fragment has visible text, or when its markup
        contains an image, anchor, embed or object tag; ``False`` otherwise.
    """
    if not html:
        return False
    if BeautifulSoup(html, "html.parser").get_text(" ", strip=True):
        return True
    lowered = html.lower()
    return any(marker in lowered for marker in ("<img", "<a ", "<embed", "<object"))


def strip_unwanted(node: BeautifulSoup | Tag) -> None:
    """Remove non-content elements from *node* in place.

    Scripts, styles, noscript blocks, iframes, forms, navigation and
    share widgets are destroyed together with everything inside them.
    """
    unwanted_selectors = (
        "script",
        "style",
        "noscript",
        "iframe",
        "form",
        "nav",
        ".share",
        ".social-share",
    )
    for css in unwanted_selectors:
        for element in node.select(css):
            element.decompose()


def strip_dangerous_attributes(node: BeautifulSoup | Tag) -> None:
    """Remove script-capable attributes from every element under *node*.

    Event handlers (any attribute whose name starts with ``on``) and
    ``srcdoc`` are always removed. URL-bearing attributes are removed when
    their value uses a ``javascript:``, ``vbscript:`` or ``data:text/html``
    scheme.

    Args:
        node: Parsed fragment; modified in place.
    """
    url_attributes = {"href", "src", "action", "formaction", "xlink:href"}
    blocked_schemes = ("javascript:", "vbscript:", "data:text/html")

    for child in node.find_all(True):
        # Iterate over a snapshot because attributes are deleted on the way.
        for attr_name in list(child.attrs):
            normalized_name = attr_name.lower()
            if normalized_name.startswith("on") or normalized_name == "srcdoc":
                del child.attrs[attr_name]
                continue

            if normalized_name not in url_attributes:
                continue

            raw_value = child.attrs.get(attr_name)
            if isinstance(raw_value, list):
                candidate = " ".join(str(item) for item in raw_value)
            else:
                candidate = str(raw_value or "")

            # Browsers ignore tabs, newlines and leading control characters
            # when reading a URL scheme, so "java\tscript:" still executes.
            # Drop every character at or below U+0020 before comparing; this
            # is only used for detection, the stored value is not rewritten.
            compact = "".join(ch for ch in candidate.strip() if ch > " ").lower()
            if compact.startswith(blocked_schemes):
                del child.attrs[attr_name]


def clone_tag(node: Tag) -> BeautifulSoup:
    """Return an independent copy of *node* by re-parsing its markup."""
    markup = str(node)
    return BeautifulSoup(markup, "html.parser")


def find_title_node(soup: BeautifulSoup) -> Tag | None:
    """Return the element most likely to hold the page title.

    Selectors are tried from most to least specific and the first match
    wins.

    Args:
        soup: Parsed document.

    Returns:
        The matching element, or ``None`` when no selector matches.
    """
    title_selectors = ("article h1", "h1.entry-title", "h1.page-title", "h1.title", "h1")
    matches = (soup.select_one(selector) for selector in title_selectors)
    return next((match for match in matches if match), None)


def extract_labeled_value(text: str, label: str) -> str:
    """Return the value that follows ``"<label>:"`` inside *text*.

    The text is whitespace-normalised before matching. The value ends at the
    next ``Word:`` style label or at the end of the text, and matching
    ignores case.

    Args:
        text: Text that may contain the labelled value.
        label: Label to look for, without the trailing colon.

    Returns:
        The cleaned value, or ``""`` when *text* is empty or has no match.
    """
    if not text:
        return ""

    found = re.search(
        rf"{re.escape(label)}:\s*(.+?)(?=\s+(?:[A-Z][a-z]+:)|\s{{2,}}|$)",
        clean_text(text),
        re.IGNORECASE,
    )
    return clean_text(found.group(1)) if found else ""


def normalize_department_category(value: str) -> str:
    """Return *value* as a category name, or ``""`` if it does not look like one.

    A candidate is rejected when it is empty, longer than 80 characters,
    longer than 8 words, or contains contact-detail markers such as an
    e-mail ``@``, a URL scheme, "p.o. box" or "contact us".

    Args:
        value: Raw text found next to a "Department:" label.

    Returns:
        The whitespace-normalised name, or ``""`` when rejected.
    """
    candidate = clean_text(value)
    too_long = len(candidate) > 80 or len(candidate.split()) > 8
    if not candidate or too_long:
        return ""

    lowered = candidate.lower()
    contact_markers = ("p.o. box", "contact us", "@", "http://", "https://")
    if any(marker in lowered for marker in contact_markers):
        return ""
    return candidate


def format_error_summary(
    url: str,
    exc: Exception,
    response: requests.Response | None,
    timeout_seconds: int,
) -> str:
    """Build a one-line description of a scrape failure.

    Args:
        url: Address that was being fetched.
        exc: Exception that aborted the scrape.
        response: Response received before the failure, if any.
        timeout_seconds: Timeout that was in effect, used in the timeout
            message.

    Returns:
        A short message tailored to HTTP errors, timeouts, other request
        errors, or any other exception type.
    """
    if isinstance(exc, requests.HTTPError):
        # Compare against None explicitly: a Response is falsy whenever its
        # status is 4xx/5xx, so `exc.response or response` would always
        # discard the very response that describes the error.
        failing_response = exc.response if exc.response is not None else response
        if failing_response is not None:
            return (
                f"HTTP {failing_response.status_code} {failing_response.reason} "
                f"while fetching {failing_response.url or url}"
            )
    if isinstance(exc, requests.Timeout):
        return f"Request timed out after {timeout_seconds}s while fetching {url}"
    if isinstance(exc, requests.RequestException):
        return f"{type(exc).__name__} while fetching {url}: {exc}"
    return f"{type(exc).__name__}: {exc}"


def format_error_details(
    url: str,
    exc: Exception,
    response: requests.Response | None,
) -> str:
    """Build a multi-line report describing a scrape failure.

    Args:
        url: Address that was being fetched.
        exc: Exception that aborted the scrape.
        response: Response received before the failure, if any.

    Returns:
        Newline-separated lines with the URL, the exception type and
        message, the HTTP status and resolved URL when a response is known,
        and the formatted exception.
    """
    details = [
        f"URL: {url}",
        f"Error Type: {type(exc).__name__}",
        f"Message: {exc}",
    ]

    # Compare against None explicitly: a Response is falsy whenever its
    # status is 4xx/5xx, so an `or` fallback would drop the response attached
    # to the exception exactly when it is most useful.
    attached_response = getattr(exc, "response", None)
    failing_response = attached_response if attached_response is not None else response
    if failing_response is not None:
        details.extend(
            [
                f"HTTP Status: {failing_response.status_code} {failing_response.reason}",
                f"Resolved URL: {failing_response.url}",
            ]
        )

    trace = "".join(traceback.format_exception_only(type(exc), exc)).strip()
    if trace:
        details.append(f"Exception: {trace}")

    return "\n".join(details)