first commit
Build Docker Image / docker (push) Successful in 44s

This commit is contained in:
2026-04-09 10:42:10 -07:00
commit ead872a0a5
19 changed files with 2783 additions and 0 deletions
+555
View File
@@ -0,0 +1,555 @@
from __future__ import annotations
import json
import re
import traceback
from html import unescape
from typing import Iterable
import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
from page_importer.dates import normalize_date
from page_importer.models import ScrapeOptions, ScrapedPost
# Lower-cased JSON-LD "@type" values; find_article_payload() lower-cases a
# node's declared type(s) and accepts the node when any of them is in this set.
JSON_ARTICLE_TYPES = {
"article",
"blogposting",
"newsarticle",
"report",
"webpage",
}
# CSS selectors that extract_body() tries in list order. Order matters: the
# first match with at least 120 characters of text is returned.
BODY_SELECTORS = [
"article .entry-content",
"article .post-content",
"article .node__content",
"article .node .content",
"article .node-content",
"article .field-name-body .field-item",
"article .field-name-body",
"article .field--name-body",
"article .article-body",
"article .content",
".post-content",
".entry-content",
".node__content",
".node .content",
".node-content",
".field-name-body .field-item",
".field-name-body",
".field--name-body",
".article-body",
"#content-area .node .content",
"article",
"main article",
"main",
]
# Selectors whose matching elements supply category names; passed to
# extract_terms() by merge_fallback_data().
CATEGORY_SELECTORS = [
".cat-links a",
".post-categories a",
".field--name-field-category a",
".tags a[rel='category tag']",
".terms a",
".taxonomy a",
]
# Selectors whose matching elements supply tag names; passed to
# extract_terms() by merge_fallback_data() when no tags were found earlier.
TAG_SELECTORS = [
".tags-links a",
".post-tags a",
".field--name-field-tags a",
"a[rel='tag']",
".terms a",
]
# Selectors tried in order by extract_author() after the <meta name="author">
# lookup; the text of the first matching element is used.
AUTHOR_SELECTORS = [
"[rel='author']",
".author a",
".byline a",
".submitted a",
".node__submitted a",
".node-info a",
".createdby",
]
# Selectors tried in order by extract_date(); for each match the "datetime"
# attribute, then the "content" attribute, then the element text is used.
DATE_SELECTORS = [
"time[datetime]",
"meta[property='article:published_time']",
"meta[name='publish_date']",
"meta[name='pubdate']",
".date-display-single",
".submitted",
".node-info",
]
# Matches a long-form date such as "Monday, January 5, 2026" anywhere in a
# string; normalize_drupal_date() hands the whole match to normalize_date().
DRUPAL_TITLE_DATE_PATTERN = re.compile(
r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+"
r"([A-Za-z]+)\s+\d{1,2},\s+\d{4}"
)
class Scraper:
    """Download a page and convert it into a ``ScrapedPost``.

    A single ``requests.Session`` is kept per instance; its ``User-Agent``
    header is taken from the supplied options.
    """

    def __init__(self, options: ScrapeOptions) -> None:
        """Store *options* and prepare the HTTP session used by ``scrape``."""
        self.options = options
        session = requests.Session()
        session.headers.update({"User-Agent": options.user_agent})
        self.session = session

    def scrape(self, url: str) -> ScrapedPost:
        """Fetch *url* and extract title, body and metadata from it.

        JSON-LD article data is applied first (unless
        ``options.force_heuristics`` is set), then DOM heuristics fill in
        whatever is still missing, and finally the body HTML is sanitized.

        Any exception raised while fetching or extracting is not propagated:
        it is summarised in ``post.error`` / ``post.error_details`` and the
        post is returned without ``success`` being set.
        """
        post = ScrapedPost(source_url=url)
        response: requests.Response | None = None
        try:
            response = self.session.get(url, timeout=self.options.request_timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")
            post.cms = detect_cms(soup)

            article_data = extract_article_json_ld(soup)
            use_json_ld = bool(article_data) and not self.options.force_heuristics
            if use_json_ld:
                apply_article_data(post, article_data, soup, self.options)
            merge_fallback_data(post, soup, self.options)
            post.body_html = sanitize_html(post.body_html)

            # A post without a title or a body is treated as a failed scrape.
            required = (("title", post.title), ("body_html", post.body_html))
            missing_fields = [name for name, value in required if not value]
            if missing_fields:
                date_found = "yes" if post.publish_date else "no"
                author_found = "yes" if post.author else "no"
                raise ValueError(
                    "Unable to extract required field(s): "
                    f"{', '.join(missing_fields)}. "
                    f"Detected CMS: {post.cms}. "
                    f"Publish date found: {date_found}. "
                    f"Author found: {author_found}."
                )
            post.success = True
        except Exception as exc:
            post.error = format_error_summary(
                url, exc, response, self.options.request_timeout
            )
            post.error_details = format_error_details(url, exc, response)
        return post
def detect_cms(soup: BeautifulSoup) -> str:
    """Guess which CMS produced the page.

    The ``<meta name="generator">`` content is consulted first. Only when it
    is missing or names none of the known systems is the document serialised
    and searched for CMS-specific markers.

    Returns:
        ``"wordpress"``, ``"drupal"``, ``"joomla"`` or ``"unknown"``.
    """
    generator = meta_content(soup, "meta", {"name": "generator"}).lower()
    for cms in ("wordpress", "drupal", "joomla"):
        if cms in generator:
            return cms

    # Serialising and lower-casing the whole document is the expensive part,
    # so it is deferred until the generator tag has failed to decide.
    html = str(soup).lower()
    if "/wp-content/" in html:
        return "wordpress"
    if "drupal-settings-json" in html or "sites/default/files" in html:
        return "drupal"
    if "com_content" in html or "joomla" in html:
        return "joomla"
    return "unknown"
def extract_article_json_ld(soup: BeautifulSoup) -> dict | None:
    """Return the first article-like JSON-LD node found in the document.

    Every ``<script type="application/ld+json">`` element is examined in
    document order; ``None`` is returned when none of them yields a match.
    """
    ld_scripts = soup.select("script[type='application/ld+json']")
    for script in ld_scripts:
        raw = script.string
        if not raw:
            raw = script.get_text(" ", strip=True)
        if raw:
            for payload in parse_json_candidates(raw):
                match = find_article_payload(payload)
                if match:
                    return match
    return None
def parse_json_candidates(raw: str) -> Iterable[dict | list]:
try:
data = json.loads(raw)
yield data
return
except json.JSONDecodeError:
pass
cleaned = re.sub(r"[\x00-\x1f]+", " ", raw).strip()
try:
data = json.loads(cleaned)
yield data
except json.JSONDecodeError:
return
def find_article_payload(payload: dict | list) -> dict | None:
    """Search decoded JSON-LD for the first article-like node.

    Lists are searched item by item. For a dict, the contents of ``@graph``
    (when present) are searched before the dict's own ``@type`` is compared,
    case-insensitively, against ``JSON_ARTICLE_TYPES``.

    Returns:
        The matching node, or ``None`` when nothing matches or *payload* is
        neither a list nor a dict.
    """
    if isinstance(payload, list):
        for item in payload:
            found = find_article_payload(item)
            if found:
                return found
        return None
    if not isinstance(payload, dict):
        return None

    if "@graph" in payload:
        found = find_article_payload(payload["@graph"])
        if found:
            return found

    node_type = payload.get("@type")
    if isinstance(node_type, str):
        types = {node_type.lower()}
    elif isinstance(node_type, (list, tuple)):
        types = {item.lower() for item in node_type if isinstance(item, str)}
    else:
        # Malformed markup (e.g. a number or an object as "@type") must not
        # raise here, otherwise the heuristic fallback would never run.
        types = set()

    if types & JSON_ARTICLE_TYPES:
        return payload
    return None
def apply_article_data(
    post: ScrapedPost,
    article: dict,
    soup: BeautifulSoup,
    options: ScrapeOptions,
) -> None:
    """Copy fields from a JSON-LD article node onto *post*.

    For every field except the publish date, a value already on *post* is
    kept when the node has nothing to offer. The publish date is always
    replaced by ``normalize_date(...)`` of the first available candidate.
    Author, categories and tags are only touched when the matching
    ``include_*`` option is enabled.
    """
    title = article.get("headline") or article.get("name")
    post.title = title or post.title

    raw_date = article.get("datePublished") or article.get("dateCreated")
    post.publish_date = normalize_date(raw_date or post.publish_date)

    if options.include_author:
        author = extract_author_from_json_ld(article)
        post.author = author or post.author
    if options.include_categories:
        categories = normalize_terms(article.get("articleSection"))
        post.categories = categories or post.categories
    if options.include_tags:
        tags = normalize_terms(article.get("keywords"))
        post.tags = tags or post.tags

    body_html = extract_body_from_article(article, soup)
    post.body_html = body_html or post.body_html
def merge_fallback_data(post: ScrapedPost, soup: BeautifulSoup, options: ScrapeOptions) -> None:
    """Fill the gaps left on *post* using DOM heuristics.

    Title, publish date, author, body and tags are only extracted when they
    are still empty. Categories are different: selector-based categories are
    always merged into the existing ones, and Drupal pages additionally get
    their "Department:" values merged in.
    """
    post.title = post.title or extract_title(soup)
    post.publish_date = post.publish_date or extract_date(soup, post.cms)

    if options.include_author and not post.author:
        post.author = extract_author(soup)

    post.body_html = post.body_html or extract_body(soup)

    if options.include_categories:
        from_selectors = extract_terms(soup, CATEGORY_SELECTORS)
        post.categories = merge_terms(post.categories, from_selectors)
        if post.cms == "drupal":
            departments = extract_drupal_department_categories(soup)
            post.categories = merge_terms(post.categories, departments)

    if options.include_tags and not post.tags:
        post.tags = extract_terms(soup, TAG_SELECTORS)
def extract_title(soup: BeautifulSoup) -> str:
    """Return the page title.

    Preference order: the ``og:title`` meta tag, the first matching heading
    selector, then the ``<title>`` element. Returns ``""`` if none exist.
    """
    og_title = meta_content(soup, "meta", {"property": "og:title"})
    if og_title:
        return og_title

    heading_selectors = ("article h1", "h1.entry-title", "h1.page-title", "h1.title", "h1")
    headings = (soup.select_one(selector) for selector in heading_selectors)
    heading = next(filter(None, headings), None)
    if heading:
        return clean_text(heading.get_text(" ", strip=True))

    if soup.title:
        return clean_text(soup.title.get_text(" ", strip=True))
    return ""
def extract_date(soup: BeautifulSoup, cms: str = "unknown") -> str:
    """Return the first publish date that ``normalize_date`` accepts.

    Each selector in ``DATE_SELECTORS`` is tried in order. Drupal pages fall
    back to a date printed next to the title. ``""`` means nothing was found.
    """
    for selector in DATE_SELECTORS:
        node = soup.select_one(selector)
        if node:
            raw = (
                node.get("datetime")
                or node.get("content")
                or node.get_text(" ", strip=True)
            )
            parsed = normalize_date(raw)
            if parsed:
                return parsed

    return extract_drupal_title_adjacent_date(soup) if cms == "drupal" else ""
def extract_author(soup: BeautifulSoup) -> str:
    """Return the author name, or ``""`` when none can be located.

    ``<meta name="author">`` wins; otherwise the text of the first element
    matching any of ``AUTHOR_SELECTORS`` (in list order) is used.
    """
    from_meta = meta_content(soup, "meta", {"name": "author"})
    if from_meta:
        return clean_text(from_meta)

    matches = (soup.select_one(selector) for selector in AUTHOR_SELECTORS)
    byline = next(filter(None, matches), None)
    if byline:
        return clean_text(byline.get_text(" ", strip=True))
    return ""
def extract_body(soup: BeautifulSoup) -> str:
    """Return the article body HTML chosen via ``BODY_SELECTORS``.

    Each matching element is copied and stripped of unwanted children. The
    first copy holding at least 120 characters of text is returned straight
    away. If none qualifies, the earliest copy that still had meaningful
    content is returned, or ``""`` when there was none.
    """
    short_candidate = ""
    for selector in BODY_SELECTORS:
        match = soup.select_one(selector)
        if not match:
            continue

        # Work on a copy so the caller's tree is left untouched.
        working_copy = clone_tag(match)
        strip_unwanted(working_copy)
        markup = working_copy.decode_contents().strip()

        visible_text = BeautifulSoup(markup, "html.parser").get_text(" ", strip=True)
        if len(visible_text) >= 120:
            return markup
        if not short_candidate and has_meaningful_body_content(markup):
            short_candidate = markup
    return short_candidate
def extract_terms(soup: BeautifulSoup, selectors: list[str]) -> list[str]:
    """Collect the distinct, non-empty element texts matched by *selectors*.

    Results keep first-seen order across selectors and matches.
    """
    collected: list[str] = []
    for selector in selectors:
        labels = (
            clean_text(node.get_text(" ", strip=True))
            for node in soup.select(selector)
        )
        for label in labels:
            if label and label not in collected:
                collected.append(label)
    return collected
def extract_drupal_title_adjacent_date(soup: BeautifulSoup) -> str:
    """Find a long-form date printed next to the page title.

    The title's following siblings are checked first. Failing that, the text
    of the nearest enclosing ``header``/``div``/``section`` is checked, with
    a leading copy of the title removed. Returns ``""`` when nothing is found.
    """
    title_node = find_title_node(soup)
    if not title_node:
        return ""

    for sibling in title_node.next_siblings:
        from_sibling = normalize_drupal_date(text_from_node(sibling))
        if from_sibling:
            return from_sibling

    container = title_node.find_parent(["header", "div", "section"])
    if not container:
        return ""

    container_text = clean_text(container.get_text(" ", strip=True))
    title_text = clean_text(title_node.get_text(" ", strip=True))
    if title_text and container_text.startswith(title_text):
        # Drop the title so only the text that follows it is searched.
        container_text = clean_text(container_text[len(title_text):])
    return normalize_drupal_date(container_text) or ""
def extract_drupal_department_categories(soup: BeautifulSoup) -> list[str]:
    """Collect category names from "Department:" labels on the page.

    Two passes feed the same de-duplicated list:

    1. Text nodes that consist solely of ``Department:``. The value is taken
       from the parent element's own text, or else from the first following
       sibling of the parent that yields an acceptable value.
    2. ``p``/``li``/``span``/``dt``/``dd`` elements whose text starts with
       ``department:`` and carries the value inline.
    """
    categories: list[str] = []

    label_pattern = re.compile(r"^\s*Department:\s*$", re.IGNORECASE)
    for label_node in soup.find_all(string=label_pattern):
        parent = label_node.parent if isinstance(label_node.parent, Tag) else None
        if not parent:
            continue

        parent_text = parent.get_text(" ", strip=True)
        inline = normalize_department_category(
            extract_labeled_value(parent_text, "Department")
        )
        if inline:
            categories = merge_terms(categories, [inline])
            continue

        # Lazily scan the siblings and stop at the first usable value.
        sibling_values = (
            normalize_department_category(text_from_node(sibling))
            for sibling in parent.next_siblings
        )
        first_value = next((value for value in sibling_values if value), "")
        if first_value:
            categories = merge_terms(categories, [first_value])

    for element in soup.find_all(["p", "li", "span", "dt", "dd"]):
        text = clean_text(element.get_text(" ", strip=True))
        if text.lower().startswith("department:"):
            extracted = normalize_department_category(
                extract_labeled_value(text, "Department")
            )
            if extracted:
                categories = merge_terms(categories, [extracted])
    return categories
def _json_ld_author_name(entry: object) -> str:
    """Return the cleaned name of one JSON-LD author entry, or ``""``."""
    if isinstance(entry, str):
        return clean_text(entry)
    if isinstance(entry, dict):
        name = entry.get("name")
        # "name" may be missing or not a string in malformed markup.
        return clean_text(name) if isinstance(name, str) else ""
    return ""


def extract_author_from_json_ld(article: dict) -> str:
    """Return the author name(s) declared by a JSON-LD article node.

    ``author`` may be a string, an object with a ``name``, or a list mixing
    both forms. Names from a list are joined with ``", "``. Entries without a
    usable name are skipped. Returns ``""`` when no author is available.
    """
    author = article.get("author")
    if isinstance(author, list):
        names = (_json_ld_author_name(entry) for entry in author)
        return ", ".join(name for name in names if name)
    return _json_ld_author_name(author)
def extract_body_from_article(article: dict, soup: BeautifulSoup) -> str:
    """Return body HTML for a JSON-LD article node.

    An ``articleBody`` string longer than 120 characters (after trimming) is
    entity-unescaped and wrapped in a single ``<p>``. Otherwise the body is
    extracted from the DOM via ``extract_body``.
    """
    body = article.get("articleBody")
    if not isinstance(body, str):
        return extract_body(soup)

    trimmed = body.strip()
    if len(trimmed) <= 120:
        return extract_body(soup)
    return f"<p>{unescape(trimmed)}</p>"
def normalize_terms(value: object) -> list[str]:
    """Turn a JSON-LD term value into a list of distinct, cleaned terms.

    A string is split on ``,``, ``|`` and ``>``. A list contributes its
    string items as-is (non-string items are ignored). Any other value gives
    ``[]``. Empty terms are dropped and duplicates removed, keeping
    first-seen order.
    """
    if isinstance(value, str):
        candidates = re.split(r"[,|>]", value)
    elif isinstance(value, list):
        candidates = [item for item in value if isinstance(item, str)]
    else:
        return []

    result: list[str] = []
    for candidate in candidates:
        cleaned = clean_text(candidate)
        # De-duplicate in both branches, matching merge_terms/extract_terms.
        if cleaned and cleaned not in result:
            result.append(cleaned)
    return result
def merge_terms(*groups: list[str]) -> list[str]:
    """Concatenate *groups* into one list of distinct, cleaned terms.

    Terms are whitespace-normalised; empty ones are dropped and the
    first-seen order is preserved.
    """
    merged: list[str] = []
    every_term = (term for group in groups for term in group)
    for term in every_term:
        normalized = clean_text(term)
        if not normalized or normalized in merged:
            continue
        merged.append(normalized)
    return merged
def normalize_drupal_date(value: str | None) -> str:
    """Normalise the first long-form date found inside *value*.

    Returns ``""`` when *value* is empty or contains no text matching
    ``DRUPAL_TITLE_DATE_PATTERN``.
    """
    found = DRUPAL_TITLE_DATE_PATTERN.search(value) if value else None
    if found is None:
        return ""
    return normalize_date(found.group(0))
def meta_content(soup: BeautifulSoup, tag_name: str, attrs: dict[str, str]) -> str:
    """Return the stripped ``content`` attribute of the first matching tag.

    ``""`` is returned when no tag matches or its ``content`` is empty.
    """
    node = soup.find(tag_name, attrs=attrs)
    if not node:
        return ""
    if not node.get("content"):
        return ""
    return node["content"].strip()
def clean_text(value: str) -> str:
    """Collapse each whitespace run in *value* to one space and trim the ends.

    Falsy input (``None``, ``""``) yields ``""``.
    """
    text = value if value else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def text_from_node(node: object) -> str:
    """Return the cleaned text of a BeautifulSoup node.

    Handles both text nodes and tags; any other object yields ``""``.
    """
    if isinstance(node, Tag):
        raw = node.get_text(" ", strip=True)
    elif isinstance(node, NavigableString):
        raw = str(node)
    else:
        return ""
    return clean_text(raw)
def sanitize_html(html: str) -> str:
    """Return *html* with unwanted elements and dangerous attributes removed.

    Empty input is returned as ``""`` without being parsed.
    """
    if html:
        fragment = BeautifulSoup(html, "html.parser")
        strip_unwanted(fragment)
        strip_dangerous_attributes(fragment)
        return fragment.decode_contents().strip()
    return ""
def has_meaningful_body_content(html: str) -> bool:
    """Tell whether *html* carries any text or embedded media/link markup.

    True when the parsed fragment has visible text, or when the raw markup
    contains an ``<img``, ``<a ``, ``<embed`` or ``<object`` token.
    """
    if not html:
        return False
    if BeautifulSoup(html, "html.parser").get_text(" ", strip=True):
        return True
    lowered = html.lower()
    media_tokens = ("<img", "<a ", "<embed", "<object")
    return any(token in lowered for token in media_tokens)
def strip_unwanted(node: BeautifulSoup | Tag) -> None:
    """Remove scripts, styles, forms, navigation and share widgets in place.

    Selectors are processed one after another, in the order listed.
    """
    unwanted_selectors = (
        "script",
        "style",
        "noscript",
        "iframe",
        "form",
        "nav",
        ".share",
        ".social-share",
    )
    for selector in unwanted_selectors:
        matches = node.select(selector)
        for match in matches:
            match.decompose()
def strip_dangerous_attributes(node: BeautifulSoup | Tag) -> None:
    """Delete script-capable attributes from every element under *node*.

    Removed in place:

    * every ``on*`` event-handler attribute and ``srcdoc``;
    * URL attributes (``href``, ``src``, ``action``, ``formaction``,
      ``xlink:href``) whose value starts with ``javascript:``, ``vbscript:``
      or ``data:text/html``.

    The scheme check runs on a normalised copy of the value. Browsers discard
    ASCII tab/newline characters anywhere in a URL and leading C0 control
    characters or spaces before reading the scheme, so values such as
    ``"java\\tscript:..."`` or ``"\\x01javascript:..."`` must be caught too.
    """
    url_attributes = {"href", "src", "action", "formaction", "xlink:href"}
    blocked_prefixes = ("javascript:", "vbscript:", "data:text/html")
    # U+0000..U+0020: the C0 controls plus space.
    c0_and_space = "".join(chr(code) for code in range(0x21))

    for child in node.find_all(True):
        # Iterate over a snapshot because attributes are deleted in the loop.
        for attr_name in list(child.attrs):
            normalized_name = attr_name.lower()
            if normalized_name.startswith("on") or normalized_name == "srcdoc":
                del child.attrs[attr_name]
                continue
            if normalized_name not in url_attributes:
                continue

            raw_value = child.attrs.get(attr_name)
            if isinstance(raw_value, list):
                candidate = " ".join(str(item) for item in raw_value)
            else:
                candidate = str(raw_value or "")

            probe = re.sub(r"[\t\n\r]", "", candidate)
            probe = probe.strip().lstrip(c0_and_space).lower()
            if probe.startswith(blocked_prefixes):
                del child.attrs[attr_name]
def clone_tag(node: Tag) -> BeautifulSoup:
    """Return an independent copy of *node* by re-parsing its markup."""
    markup = str(node)
    return BeautifulSoup(markup, "html.parser")
def find_title_node(soup: BeautifulSoup) -> Tag | None:
    """Return the first heading element that looks like the page title.

    Selectors run from most to least specific; ``None`` if nothing matches.
    """
    title_selectors = ("article h1", "h1.entry-title", "h1.page-title", "h1.title", "h1")
    matches = (soup.select_one(selector) for selector in title_selectors)
    return next((match for match in matches if match), None)
def extract_labeled_value(text: str, label: str) -> str:
    """Return the value that follows ``"<label>:"`` in *text*.

    Matching is case-insensitive and runs on whitespace-normalised text. The
    value stops before the next ``Word:`` style label or at the end of the
    text. ``""`` is returned when *text* is empty or the label is absent.
    """
    if not text:
        return ""
    found = re.search(
        rf"{re.escape(label)}:\s*(.+?)(?=\s+(?:[A-Z][a-z]+:)|\s{{2,}}|$)",
        clean_text(text),
        re.IGNORECASE,
    )
    return clean_text(found.group(1)) if found else ""
def normalize_department_category(value: str) -> str:
    """Return *value* as a cleaned category name, or ``""`` if implausible.

    Rejected: empty text, text longer than 80 characters or 8 words, and
    text containing contact-detail markers (P.O. box, "contact us", ``@``,
    an ``http(s)://`` URL).
    """
    cleaned = clean_text(value)
    if not cleaned:
        return ""

    within_limits = len(cleaned) <= 80 and len(cleaned.split()) <= 8
    if not within_limits:
        return ""

    lowered = cleaned.lower()
    contact_markers = ("p.o. box", "contact us", "@", "http://", "https://")
    for marker in contact_markers:
        if marker in lowered:
            return ""
    return cleaned
def format_error_summary(
    url: str,
    exc: Exception,
    response: requests.Response | None,
    timeout_seconds: int,
) -> str:
    """Build a one-line, human-readable description of a scrape failure.

    Args:
        url: The URL that was requested.
        exc: The exception that aborted the scrape.
        response: The response received before the failure, if any.
        timeout_seconds: The request timeout, quoted in timeout messages.

    Returns:
        An HTTP status line for ``HTTPError`` when a response is available, a
        timeout message for ``Timeout``, a typed message for any other
        ``RequestException``, and ``"<ExceptionType>: <message>"`` otherwise.
    """
    if isinstance(exc, requests.HTTPError):
        # Compare against None explicitly: a requests.Response is falsy for
        # 4xx/5xx statuses, so `exc.response or response` would discard the
        # very response that describes the failure.
        failing_response = exc.response if exc.response is not None else response
        if failing_response is not None:
            return (
                f"HTTP {failing_response.status_code} {failing_response.reason} "
                f"while fetching {failing_response.url or url}"
            )
    if isinstance(exc, requests.Timeout):
        return f"Request timed out after {timeout_seconds}s while fetching {url}"
    if isinstance(exc, requests.RequestException):
        return f"{type(exc).__name__} while fetching {url}: {exc}"
    return f"{type(exc).__name__}: {exc}"
def format_error_details(
    url: str,
    exc: Exception,
    response: requests.Response | None,
) -> str:
    """Build a multi-line diagnostic report for a scrape failure.

    The report always lists the URL, exception type and message. HTTP status
    and resolved URL are added when a response is attached to *exc* or was
    passed in, followed by the formatted exception line.

    Args:
        url: The URL that was requested.
        exc: The exception that aborted the scrape.
        response: The response received before the failure, if any.

    Returns:
        The report lines joined with newlines.
    """
    details = [
        f"URL: {url}",
        f"Error Type: {type(exc).__name__}",
        f"Message: {exc}",
    ]

    # Compare against None explicitly: a requests.Response is falsy for
    # 4xx/5xx statuses, so `... or response` would discard an error response.
    attached = getattr(exc, "response", None)
    failing_response = attached if attached is not None else response
    if failing_response is not None:
        details.extend(
            [
                f"HTTP Status: {failing_response.status_code} {failing_response.reason}",
                f"Resolved URL: {failing_response.url}",
            ]
        )

    trace = "".join(traceback.format_exception_only(type(exc), exc)).strip()
    if trace:
        details.append(f"Exception: {trace}")
    return "\n".join(details)