@@ -0,0 +1,12 @@
|
||||
.venv/
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
.pytest_cache/
|
||||
.mypy_cache/
|
||||
.ruff_cache/
|
||||
|
||||
.streamlit/secrets.toml
|
||||
|
||||
*.log
|
||||
@@ -0,0 +1,63 @@
|
||||
# Page Importer
|
||||
|
||||
This folder contains the WordPress import tool used by the combined application in the repository root.
|
||||
|
||||
The importer still uses Streamlit internally, but it is now rendered as the `Page Importer` tab inside the shared app rather than being the main entrypoint for the repository.
|
||||
|
||||
## Features
|
||||
|
||||
- Upload a CSV of submitted URLs
|
||||
- Choose the URL column and optional title override column
|
||||
- Optionally map post type from the CSV or force a single post type
|
||||
- Scrape only the listed URLs
|
||||
- Extract title, publish date, author, body HTML, categories, and tags
|
||||
- Retry failed rows
|
||||
- Export a WordPress WXR XML file
|
||||
|
||||
## Recommended Usage
|
||||
|
||||
Run the root application:
|
||||
|
||||
```bash
|
||||
streamlit run ../app.py
|
||||
```
|
||||
|
||||
Or run the combined Docker container from the repository root.
|
||||
|
||||
## Standalone Usage
|
||||
|
||||
If you need to run this importer by itself:
|
||||
|
||||
```bash
|
||||
python3 -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
streamlit run app.py
|
||||
```
|
||||
|
||||
On Windows PowerShell:
|
||||
|
||||
```powershell
|
||||
python -m venv .venv
|
||||
.venv\Scripts\Activate.ps1
|
||||
pip install -r requirements.txt
|
||||
streamlit run app.py
|
||||
```
|
||||
|
||||
## CSV Input
|
||||
|
||||
The app accepts CSV files with any columns. You choose:
|
||||
|
||||
- the URL column to scrape
|
||||
- an optional title or name column to override the scraped title
|
||||
- an optional post type column with values like `post` or `page`
|
||||
- an optional category column whose values are appended during export
|
||||
|
||||
You can also add manual categories in the sidebar to append them to every exported item.
|
||||
|
||||
## Notes
|
||||
|
||||
- Exported posts default to `draft` unless changed in the UI
|
||||
- Image and link URLs remain pointed at the source site
|
||||
- Some themes need heuristic fallback. The `Force heuristic scraping` option skips JSON-LD-first extraction and relies on page structure
|
||||
- In the combined app, dependencies come from the root `requirements.txt`
|
||||
@@ -0,0 +1,475 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import datetime as dt
|
||||
import io
|
||||
import re
|
||||
from dataclasses import replace
|
||||
|
||||
import streamlit as st
|
||||
|
||||
from page_importer.dates import parse_datetime
|
||||
from page_importer.models import ScrapeOptions, ScrapedPost
|
||||
from page_importer.scraper import Scraper
|
||||
from page_importer.wxr import build_wxr
|
||||
|
||||
def load_csv(file_data: bytes) -> tuple[list[str], list[dict[str, str]]]:
    """Decode uploaded CSV bytes and return ``(headers, rows)``.

    The bytes are decoded as UTF-8 with ``utf-8-sig`` so a leading BOM is
    dropped; undecodable bytes are replaced instead of raising. Rows are the
    mappings produced by ``csv.DictReader``. When the input has no header
    line the header list is empty.
    """
    decoded = file_data.decode("utf-8-sig", errors="replace")
    reader = csv.DictReader(io.StringIO(decoded))
    records = [record for record in reader]
    # ``fieldnames`` is None when the input is empty, hence the fallback.
    header_names = reader.fieldnames
    return header_names or [], records
|
||||
|
||||
|
||||
def render_app() -> None:
    """Render the Page Importer page: options, upload, scrape, preview, export.

    Widgets are created in the order they appear on the page. Scrape results,
    the CSV rows they were scraped from and the scrape settings are stored in
    ``st.session_state`` under ``"results"``, ``"input_rows"`` and
    ``"scrape_context"`` so they remain available on later reruns.
    """
    st.title("Page Importer")
    st.caption("Scrape blog posts from CSV URLs and export a WordPress WXR file.")

    with st.sidebar:
        st.header("Options")
        include_author = st.checkbox("Include author", value=True)
        include_categories = st.checkbox("Include categories", value=True)
        include_tags = st.checkbox("Include tags", value=True)
        force_heuristics = st.checkbox("Force heuristic scraping", value=False)
        test_run = st.checkbox(
            "Test run only",
            value=False,
            help="Scrape only the first 10 rows that contain a URL.",
        )
        post_type_mode = st.selectbox(
            "WordPress post type mode",
            ["Single type for all rows", "Use a CSV column"],
            index=0,
        )
        default_post_type = st.selectbox("Default WordPress post type", ["post", "page"], index=0)

    uploaded = st.file_uploader("Upload CSV", type=["csv"])
    if not uploaded:
        st.info("Upload a CSV to begin.")
        return

    headers, rows = load_csv(uploaded.getvalue())
    if not rows:
        st.error("The CSV did not contain any rows.")
        return

    # Column pickers; the optional ones offer "(none)" as their first entry.
    optional_columns = ["(none)", *headers]
    col1, col2, col3 = st.columns(3)
    with col1:
        url_column = st.selectbox("URL column", headers, index=_safe_index(headers, ["url", "link"]))
    with col2:
        title_column = st.selectbox(
            "Optional title override column",
            optional_columns,
            index=_safe_index(optional_columns, ["name", "title"]),
        )
    with col3:
        post_type_column = st.selectbox(
            "Optional post type column",
            optional_columns,
            index=_safe_index(optional_columns, ["post_type", "type"]),
            disabled=post_type_mode != "Use a CSV column",
        )
    st.write(f"Loaded {len(rows)} row(s). Only the selected URL column will be scraped.")
    if test_run:
        st.caption("Test run is enabled. Only the first 10 rows with a URL will be scraped.")

    if st.button("Scrape URLs", type="primary"):
        context = build_scrape_context(
            include_author=include_author,
            include_categories=include_categories,
            include_tags=include_tags,
            force_heuristics=force_heuristics,
            test_run=test_run,
            post_type_mode=post_type_mode,
            post_type_column=post_type_column,
            default_post_type=default_post_type,
            url_column=url_column,
            title_column=title_column,
        )
        results = scrape_rows(rows, context, phase_label="Scraping")
        st.session_state["results"] = results
        st.session_state["input_rows"] = rows
        st.session_state["scrape_context"] = context

    results = st.session_state.get("results", [])
    if not results:
        return

    successful = [post for post in results if post.success]
    failed = [post for post in results if not post.success]

    st.subheader("Results")
    st.write(f"Successful: {len(successful)} | Failed: {len(failed)}")

    if failed and st.button("Retry failed items"):
        # Retry against the rows and settings of the original scrape.
        stored_rows = st.session_state.get("input_rows", rows)
        context = st.session_state.get("scrape_context")
        if context:
            retried = scrape_rows(
                stored_rows,
                context,
                row_numbers=[post.row_number for post in failed if post.row_number],
                phase_label="Retrying",
            )
            results = merge_retry_results(results, retried)
            st.session_state["results"] = results
            successful = [post for post in results if post.success]
            failed = [post for post in results if not post.success]

    preview_rows = [
        {
            "Row": post.row_number,
            "URL": post.source_url,
            "CMS": post.cms,
            "Success": post.success,
            "Title": post.title,
            "Publish Date": post.publish_date,
            "Author": post.author,
            "Categories": ", ".join(post.categories),
            "Tags": ", ".join(post.tags),
            "Post Type": post.post_type,
            "Error": post.error,
        }
        for post in results
    ]
    st.dataframe(
        preview_rows,
        width="stretch",
        hide_index=True,
        column_config={
            "Row": st.column_config.NumberColumn(width="small"),
            "URL": st.column_config.TextColumn(width="medium"),
            "Title": st.column_config.TextColumn(width="medium"),
            "Publish Date": st.column_config.TextColumn(width="medium"),
            "Categories": st.column_config.TextColumn(width="medium"),
            "Tags": st.column_config.TextColumn(width="medium"),
            "Error": st.column_config.TextColumn(width="large"),
        },
    )

    if failed:
        selected_failed = st.selectbox(
            "Failed row details",
            failed,
            format_func=lambda post: f"Row {post.row_number}: {post.source_url or '(missing URL)'}",
        )
        st.text_area(
            "Error details",
            value=selected_failed.error_details or selected_failed.error,
            height=180,
            disabled=True,
        )

    if successful:
        selected_index = st.number_input(
            "Preview successful row",
            min_value=1,
            max_value=len(successful),
            value=1,
            step=1,
        )
        selected = successful[selected_index - 1]
        st.markdown("### Content Preview")
        st.write(f"**Title:** {selected.title}")
        st.write(f"**Source URL:** {selected.source_url}")
        st.write(f"**Publish Date:** {selected.publish_date or '(missing)'}")
        st.write(f"**Author:** {selected.author or '(missing)'}")
        st.write(f"**Post Type:** {selected.post_type}")
        st.write(selected.body_html, unsafe_allow_html=True)
        # Results are indexed by row number into the CSV they were scraped
        # from. Use those stored rows (as the retry path does) so that a
        # newly uploaded CSV cannot mis-map categories onto old results.
        export_rows = st.session_state.get("input_rows", rows)
        render_export_sidebar(successful, export_rows, headers)
|
||||
|
||||
|
||||
def build_scrape_context(
    *,
    include_author: bool,
    include_categories: bool,
    include_tags: bool,
    force_heuristics: bool,
    test_run: bool,
    post_type_mode: str,
    post_type_column: str,
    default_post_type: str,
    url_column: str,
    title_column: str,
) -> dict[str, object]:
    """Bundle the current UI selections into one scrape-context mapping.

    The four scraper toggles are wrapped in a ``ScrapeOptions`` stored under
    ``"options"``; every other argument is stored under a key equal to its
    parameter name.
    """
    scraper_options = ScrapeOptions(
        include_author=include_author,
        include_categories=include_categories,
        include_tags=include_tags,
        force_heuristics=force_heuristics,
    )
    context: dict[str, object] = {"options": scraper_options}
    context["test_run"] = test_run
    context["post_type_mode"] = post_type_mode
    context["post_type_column"] = post_type_column
    context["default_post_type"] = default_post_type
    context["url_column"] = url_column
    context["title_column"] = title_column
    return context
|
||||
|
||||
|
||||
def scrape_rows(
    rows: list[dict[str, str]],
    context: dict[str, object],
    row_numbers: list[int] | None = None,
    phase_label: str = "Scraping",
) -> list[ScrapedPost]:
    """Scrape the URL of each selected CSV row and return one post per row.

    Rows are numbered from 1. When ``row_numbers`` is given only those rows
    are processed; otherwise, if the context's ``"test_run"`` flag is set,
    only the first 10 rows with a non-blank URL are processed. Rows without
    a URL yield a failed ``ScrapedPost`` describing the missing value. A
    Streamlit progress bar and status line are updated while running.

    Raises:
        TypeError: if ``context["options"]`` is not a ``ScrapeOptions``.
    """
    options = context["options"]
    if not isinstance(options, ScrapeOptions):
        raise TypeError("Invalid scrape options in session state.")

    scraper = Scraper(options)
    numbered_rows = list(enumerate(rows, start=1))
    if row_numbers is not None:
        wanted = set(row_numbers)
        numbered_rows = [pair for pair in numbered_rows if pair[0] in wanted]
    elif bool(context.get("test_run")):
        rows_with_url = [
            pair
            for pair in numbered_rows
            if (pair[1].get(str(context["url_column"])) or "").strip()
        ]
        numbered_rows = rows_with_url[:10]

    collected: list[ScrapedPost] = []
    progress = st.progress(0.0)
    status = st.empty()

    # Avoid dividing by zero when nothing was selected.
    denominator = len(numbered_rows) or 1
    for position, (row_number, row) in enumerate(numbered_rows, start=1):
        url = (row.get(context["url_column"]) or "").strip()
        status.write(f"{phase_label} {position}/{len(numbered_rows)}: {url or f'row {row_number} has no URL'}")

        if not url:
            post = ScrapedPost(
                source_url="",
                row_number=row_number,
                error="Missing URL in the selected URL column.",
                error_details=f"Row {row_number} does not contain a URL in column '{context['url_column']}'.",
            )
        else:
            post = scraper.scrape(url)

        post.row_number = row_number
        apply_row_overrides(post, row, context)
        collected.append(post)
        progress.progress(position / denominator)

    status.write(f"{phase_label} complete.")
    return collected
|
||||
|
||||
|
||||
def apply_row_overrides(post: ScrapedPost, row: dict[str, str], context: dict[str, object]) -> None:
    """Apply per-row CSV overrides (title and post type) to ``post`` in place.

    The title is replaced only when a title column is selected and the row's
    cell contains non-whitespace text. The post type is always recomputed via
    ``resolve_post_type`` from the context's post-type settings.
    """
    title_column = context["title_column"]
    if isinstance(title_column, str) and title_column != "(none)":
        # Strip before testing: a whitespace-only cell must not replace the
        # scraped title with an empty string.
        override = (row.get(title_column) or "").strip()
        if override:
            post.title = override

    post.post_type = resolve_post_type(
        row=row,
        mode=str(context["post_type_mode"]),
        column=str(context["post_type_column"]),
        default_value=str(context["default_post_type"]),
    )
|
||||
|
||||
|
||||
def resolve_export_categories(
    row: dict[str, str],
    category_column: str,
    manual_categories: list[str],
) -> list[str]:
    """Return the row's CSV categories followed by the manual ones, de-duplicated.

    No CSV categories are read when ``category_column`` is ``"(none)"``.
    """
    if category_column == "(none)":
        from_csv: list[str] = []
    else:
        from_csv = parse_terms(row.get(category_column, ""))
    return merge_unique_terms(from_csv, manual_categories)
|
||||
|
||||
|
||||
def parse_terms(value: str) -> list[str]:
    """Split ``value`` on ``,``, ``|`` or ``>`` and return the non-blank, trimmed parts.

    A falsy ``value`` yields an empty list.
    """
    pieces = re.split(r"[,|>]", value or "")
    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece]
|
||||
|
||||
|
||||
def merge_unique_terms(*groups: list[str]) -> list[str]:
    """Concatenate term lists, trimming each term and keeping first occurrences only.

    Blank terms are dropped; the order of first appearance is preserved.
    """
    # A dict keeps insertion order, which gives an ordered set of terms.
    ordered: dict[str, None] = {}
    for group in groups:
        for term in group:
            key = term.strip()
            if key:
                ordered.setdefault(key, None)
    return list(ordered)
|
||||
|
||||
|
||||
def merge_retry_results(existing: list[ScrapedPost], replacements: list[ScrapedPost]) -> list[ScrapedPost]:
    """Swap retried posts into ``existing`` by row number and sort by row number.

    Posts without a replacement are kept as they are. A falsy ``row_number``
    sorts as 0.
    """
    by_row = {}
    for replacement in replacements:
        by_row[replacement.row_number] = replacement

    combined = []
    for current in existing:
        combined.append(by_row.get(current.row_number, current))
    combined.sort(key=lambda post: post.row_number or 0)
    return combined
|
||||
|
||||
|
||||
def build_export_posts(
    posts: list[ScrapedPost],
    rows: list[dict[str, str]],
    category_column: str,
    manual_categories: list[str],
    post_status: str,
    custom_post_type_slug: str,
) -> list[ScrapedPost]:
    """Return copies of ``posts`` with export status, post type and categories applied.

    Each post is matched to its CSV row through its 1-based ``row_number``;
    an out-of-range number is treated as an empty row. A non-empty
    ``custom_post_type_slug`` overrides the post's own post type. The input
    posts are not modified.
    """
    prepared: list[ScrapedPost] = []
    row_count = len(rows)
    for post in posts:
        if 0 < post.row_number <= row_count:
            source_row = rows[post.row_number - 1]
        else:
            source_row = {}
        export_type = custom_post_type_slug or post.post_type
        extra_categories = resolve_export_categories(source_row, category_column, manual_categories)
        export_copy = replace(
            post,
            status=post_status,
            post_type=export_type,
            categories=merge_unique_terms(post.categories, extra_categories),
        )
        prepared.append(export_copy)
    return prepared
|
||||
|
||||
|
||||
def render_export_sidebar(
    successful: list[ScrapedPost],
    rows: list[dict[str, str]],
    headers: list[str],
) -> None:
    """Render the export controls and the WXR download button in the sidebar.

    The controls set the post status, an optional CSV category column, extra
    categories, the output filename and an optional post type slug. When any
    post has a parseable publish date, an optional date-range filter is
    offered; an enabled filter keeps only dated posts inside the range.
    """
    with st.sidebar:
        st.markdown("---")
        st.subheader("Export")
        post_status = st.selectbox(
            "Imported post status",
            ["draft", "publish", "private"],
            index=0,
            key="export_post_status",
        )
        category_choices = ["(none)", *headers]
        category_column = st.selectbox(
            "CSV category column",
            category_choices,
            index=_safe_index(category_choices, ["category", "categories", "department"]),
            key="export_category_column",
        )
        manual_categories_text = st.text_input(
            "Additional export categories",
            value="",
            help="Comma-separated categories to append to every exported item.",
            key="export_manual_categories",
        )
        manual_categories = parse_terms(manual_categories_text)
        output_name = st.text_input(
            "Output filename",
            value="wordpress-import.xml",
            key="export_output_name",
        )
        custom_slug_text = st.text_input(
            "Custom post type slug",
            value="",
            help="Optional. If set, all exported items will use this WordPress post type slug.",
            key="export_custom_post_type_slug",
        )
        custom_post_type_slug = normalize_post_type_slug(custom_slug_text)

        export_posts = build_export_posts(
            successful,
            rows,
            category_column,
            manual_categories,
            post_status,
            custom_post_type_slug,
        )
        if custom_post_type_slug:
            st.caption(f"Exporting all items as post type `{custom_post_type_slug}`.")

        # Pair every post that has a parseable publish date with that date.
        dated_export_posts = []
        for post in export_posts:
            published = parse_publish_date(post.publish_date)
            if published:
                dated_export_posts.append((post, published))

        if not dated_export_posts:
            st.caption("No successful items have a publish date, so export date filtering is unavailable.")
        else:
            known_dates = [published for _, published in dated_export_posts]
            min_date = min(known_dates)
            max_date = max(known_dates)
            filter_by_publish_date = st.checkbox(
                "Filter export by publish date",
                value=False,
                key="export_filter_by_publish_date",
            )

            if filter_by_publish_date:
                export_start = st.date_input(
                    "Export start date",
                    value=min_date,
                    min_value=min_date,
                    max_value=max_date,
                    format="MM/DD/YYYY",
                    key="export_start_date",
                )
                export_end = st.date_input(
                    "Export end date",
                    value=max_date,
                    min_value=min_date,
                    max_value=max_date,
                    format="MM/DD/YYYY",
                    key="export_end_date",
                )

                if export_start > export_end:
                    st.error("Export start date must be on or before the end date.")
                    export_posts = []
                else:
                    # Undated posts are absent from the pairs, so they are
                    # excluded by the filter as well.
                    export_posts = [
                        post
                        for post, published in dated_export_posts
                        if export_start <= published <= export_end
                    ]
                    st.caption(
                        "Date filter: "
                        f"{export_start.strftime('%m/%d/%Y')} to {export_end.strftime('%m/%d/%Y')}."
                    )
                    undated_count = len(successful) - len(dated_export_posts)
                    if undated_count:
                        st.caption(f"Excluded {undated_count} successful item(s) with no publish date.")

        st.caption(f"Ready to export {len(export_posts)} post(s).")
        xml_data = build_wxr(export_posts)
        st.download_button(
            label="Download WXR XML",
            data=xml_data,
            file_name=output_name,
            mime="application/xml",
            disabled=not export_posts,
        )
|
||||
|
||||
|
||||
def parse_publish_date(value: str) -> dt.date | None:
    """Return the calendar date parsed from ``value``, or ``None`` if it cannot be parsed."""
    moment = parse_datetime(value)
    return None if moment is None else moment.date()
|
||||
|
||||
|
||||
def _safe_index(values: list[str], candidates: list[str]) -> int:
|
||||
lowered = {value.lower(): idx for idx, value in enumerate(values)}
|
||||
for candidate in candidates:
|
||||
if candidate in lowered:
|
||||
return lowered[candidate]
|
||||
return 0
|
||||
|
||||
|
||||
def resolve_post_type(
    row: dict[str, str],
    mode: str,
    column: str,
    default_value: str,
) -> str:
    """Pick the WordPress post type for one CSV row.

    ``default_value`` is returned unless ``mode`` is ``"Use a CSV column"``,
    a real column is selected, and the row's cell normalizes to a non-empty
    slug.
    """
    reads_from_csv = mode == "Use a CSV column" and column != "(none)"
    if not reads_from_csv:
        return default_value

    slug = normalize_post_type_slug(row.get(column) or "")
    return slug if slug else default_value
|
||||
|
||||
|
||||
def normalize_post_type_slug(value: str) -> str:
    """Lower-case and trim ``value``, then drop every character outside ``a-z0-9_-``.

    A falsy ``value`` yields an empty string.
    """
    lowered = (value or "").strip().lower()
    disallowed = re.compile(r"[^a-z0-9_-]")
    return disallowed.sub("", lowered)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Executed as a script: configure the Streamlit page, then draw the UI.
    st.set_page_config(layout="wide", page_title="Page Importer")
    render_app()
|
||||
@@ -0,0 +1 @@
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime as dt
|
||||
|
||||
from dateutil import parser as date_parser
|
||||
|
||||
|
||||
def parse_datetime(value: str | None) -> dt.datetime | None:
    """Parse ``value`` into a ``datetime``; return ``None`` if that is not possible.

    A strict ``dateutil`` parse is tried first and a fuzzy parse second.
    Falsy input returns ``None`` without attempting to parse.
    """
    if not value:
        return None

    recoverable = (TypeError, ValueError, OverflowError)
    for fuzzy in (False, True):
        try:
            return date_parser.parse(value, fuzzy=fuzzy)
        except recoverable:
            # Fall through to the fuzzy attempt, or give up after it.
            continue
    return None
|
||||
|
||||
|
||||
def normalize_date(value: str | None) -> str:
    """Return ``value`` formatted as a date-time string, or ``""`` if unparseable.

    Values without a usable UTC offset are formatted as
    ``YYYY-MM-DD HH:MM:SS``; values with an offset use ISO format with a
    space separator and second precision.
    """
    moment = parse_datetime(value)
    if moment is None:
        return ""

    lacks_offset = moment.tzinfo is None or moment.utcoffset() is None
    if lacks_offset:
        return moment.strftime("%Y-%m-%d %H:%M:%S")
    return moment.isoformat(sep=" ", timespec="seconds")
|
||||
@@ -0,0 +1,34 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class ScrapeOptions:
    """Settings for a scrape run; every field has a default."""

    # Toggles for the optional metadata fields.
    include_author: bool = True
    include_categories: bool = True
    include_tags: bool = True
    # When set, the scraper skips JSON-LD data and uses page structure only.
    force_heuristics: bool = False
    # Passed as ``timeout`` to the HTTP request.
    request_timeout: int = 20
    # Sent as the ``User-Agent`` header of the scraper's HTTP session.
    user_agent: str = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/135.0 Safari/537.36"
    )
|
||||
|
||||
|
||||
@dataclass
class ScrapedPost:
    """Result of scraping one URL, successful or not."""

    # URL the post was scraped from; the only required field.
    source_url: str
    # 1-based CSV row number; 0 until the caller assigns one.
    row_number: int = 0
    cms: str = "unknown"
    title: str = ""
    publish_date: str = ""
    author: str = ""
    body_html: str = ""
    # Each instance gets its own fresh list.
    categories: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    status: str = "draft"
    post_type: str = "post"
    # Outcome flag plus short and detailed failure descriptions.
    success: bool = False
    error: str = ""
    error_details: str = ""
|
||||
@@ -0,0 +1,555 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import traceback
|
||||
from html import unescape
|
||||
from typing import Iterable
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import NavigableString, Tag
|
||||
|
||||
from page_importer.dates import normalize_date
|
||||
from page_importer.models import ScrapeOptions, ScrapedPost
|
||||
|
||||
# Lower-cased JSON-LD ``@type`` values that count as an article payload.
JSON_ARTICLE_TYPES = {"article", "blogposting", "newsarticle", "report", "webpage"}

# CSS selectors for the post body, tried in order.
BODY_SELECTORS = [
    # Body containers nested inside an <article> element.
    "article .entry-content",
    "article .post-content",
    "article .node__content",
    "article .node .content",
    "article .node-content",
    "article .field-name-body .field-item",
    "article .field-name-body",
    "article .field--name-body",
    "article .article-body",
    "article .content",
    # The same containers anywhere in the document.
    ".post-content",
    ".entry-content",
    ".node__content",
    ".node .content",
    ".node-content",
    ".field-name-body .field-item",
    ".field-name-body",
    ".field--name-body",
    ".article-body",
    "#content-area .node .content",
    # Broad structural fallbacks.
    "article",
    "main article",
    "main",
]

# CSS selectors for category links.
CATEGORY_SELECTORS = [
    ".cat-links a",
    ".post-categories a",
    ".field--name-field-category a",
    ".tags a[rel='category tag']",
    ".terms a",
    ".taxonomy a",
]

# CSS selectors for tag links.
TAG_SELECTORS = [
    ".tags-links a",
    ".post-tags a",
    ".field--name-field-tags a",
    "a[rel='tag']",
    ".terms a",
]

# CSS selectors for the author name, tried in order.
AUTHOR_SELECTORS = [
    "[rel='author']",
    ".author a",
    ".byline a",
    ".submitted a",
    ".node__submitted a",
    ".node-info a",
    ".createdby",
]

# CSS selectors for the publish date, tried in order.
DATE_SELECTORS = [
    "time[datetime]",
    "meta[property='article:published_time']",
    "meta[name='publish_date']",
    "meta[name='pubdate']",
    ".date-display-single",
    ".submitted",
    ".node-info",
]

# Matches "<weekday name>, <word> <1-2 digit day>, <4 digit year>".
DRUPAL_TITLE_DATE_PATTERN = re.compile(
    r"(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+"
    r"([A-Za-z]+)\s+\d{1,2},\s+\d{4}"
)
|
||||
|
||||
|
||||
class Scraper:
    """Fetch pages over one shared HTTP session and turn them into ``ScrapedPost`` objects."""

    def __init__(self, options: ScrapeOptions) -> None:
        """Store ``options`` and open a session that sends its user agent."""
        self.options = options
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": options.user_agent})

    def scrape(self, url: str) -> ScrapedPost:
        """Download ``url`` and extract a post from it.

        JSON-LD article data is applied first unless heuristics are forced;
        missing fields are then filled from the page structure. This method
        does not raise: any failure is recorded in the returned post's
        ``error`` and ``error_details``, and ``success`` stays ``False``.
        """
        post = ScrapedPost(source_url=url)
        response: requests.Response | None = None
        try:
            response = self.session.get(url, timeout=self.options.request_timeout)
            response.raise_for_status()

            # Parse the raw bytes rather than ``response.text``. When the
            # Content-Type header has no charset, requests decodes text as
            # ISO-8859-1, which garbles UTF-8 pages. Only a charset the
            # server actually declared is passed on; otherwise BeautifulSoup
            # detects the encoding from the document itself.
            content_type = response.headers.get("Content-Type", "")
            declared_encoding = response.encoding if "charset" in content_type.lower() else None
            soup = BeautifulSoup(response.content, "html.parser", from_encoding=declared_encoding)
            post.cms = detect_cms(soup)

            article_data = extract_article_json_ld(soup)
            if article_data and not self.options.force_heuristics:
                apply_article_data(post, article_data, soup, self.options)

            merge_fallback_data(post, soup, self.options)
            post.body_html = sanitize_html(post.body_html)

            # A usable post needs at least a title and a body.
            required = {"title": post.title, "body_html": post.body_html}
            missing_fields = [name for name, value in required.items() if not value]
            if missing_fields:
                raise ValueError(
                    "Unable to extract required field(s): "
                    f"{', '.join(missing_fields)}. "
                    f"Detected CMS: {post.cms}. "
                    f"Publish date found: {'yes' if post.publish_date else 'no'}. "
                    f"Author found: {'yes' if post.author else 'no'}."
                )

            post.success = True
            return post
        except Exception as exc:
            # Deliberately broad: one bad URL must not abort the whole batch.
            post.error = format_error_summary(url, exc, response, self.options.request_timeout)
            post.error_details = format_error_details(url, exc, response)
            return post
|
||||
|
||||
|
||||
def detect_cms(soup: BeautifulSoup) -> str:
    """Guess the CMS that produced the page.

    The ``generator`` meta tag is checked first; if it names none of the
    known systems, marker substrings are searched in the lower-cased markup.
    Returns ``"wordpress"``, ``"drupal"``, ``"joomla"`` or ``"unknown"``.
    """
    generator = meta_content(soup, "meta", {"name": "generator"})
    if generator:
        declared = generator.lower()
        for cms_name in ("wordpress", "drupal", "joomla"):
            if cms_name in declared:
                return cms_name

    markup = str(soup).lower()
    marker_table = (
        ("wordpress", ("/wp-content/",)),
        ("drupal", ("drupal-settings-json", "sites/default/files")),
        ("joomla", ("com_content", "joomla")),
    )
    for cms_name, markers in marker_table:
        if any(marker in markup for marker in markers):
            return cms_name
    return "unknown"
|
||||
|
||||
|
||||
def extract_article_json_ld(soup: BeautifulSoup) -> dict | None:
    """Return the first article-like JSON-LD object on the page, or ``None``.

    Every ``<script type="application/ld+json">`` element is examined in
    document order.
    """
    for script in soup.select("script[type='application/ld+json']"):
        raw = script.string or script.get_text(" ", strip=True)
        if raw:
            for payload in parse_json_candidates(raw):
                article = find_article_payload(payload)
                if article:
                    return article
    return None
|
||||
|
||||
|
||||
def parse_json_candidates(raw: str) -> Iterable[dict | list]:
    """Yield the parsed JSON value of ``raw``, or nothing if it cannot be parsed.

    If ``raw`` is not valid JSON, runs of ASCII control characters are
    replaced by a single space and parsing is tried once more.
    """
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        scrubbed = re.sub(r"[\x00-\x1f]+", " ", raw).strip()
        try:
            parsed = json.loads(scrubbed)
        except json.JSONDecodeError:
            return
    yield parsed
|
||||
|
||||
|
||||
def find_article_payload(payload: dict | list) -> dict | None:
    """Search a parsed JSON-LD value for the first article-like object.

    Lists are searched item by item and a dict's ``@graph`` entry is searched
    before the dict itself. An object matches when its ``@type`` (a string or
    a list of strings, compared case-insensitively) is in
    ``JSON_ARTICLE_TYPES``.
    """
    if isinstance(payload, list):
        hits = (find_article_payload(item) for item in payload)
        return next((hit for hit in hits if hit), None)

    if not isinstance(payload, dict):
        return None

    if "@graph" in payload:
        nested = find_article_payload(payload["@graph"])
        if nested:
            return nested

    declared = payload.get("@type")
    if isinstance(declared, str):
        kinds = {declared.lower()}
    else:
        kinds = {entry.lower() for entry in declared or [] if isinstance(entry, str)}
    return payload if kinds & JSON_ARTICLE_TYPES else None
|
||||
|
||||
|
||||
def apply_article_data(
    post: ScrapedPost,
    article: dict,
    soup: BeautifulSoup,
    options: ScrapeOptions,
) -> None:
    """Copy fields from a JSON-LD article object onto ``post`` in place.

    Title, author, categories, tags and body keep their current value when
    the article has nothing for them; author, categories and tags are only
    touched when enabled in ``options``. The publish date is always set to
    the normalized form of the best available date.
    """
    headline = article.get("headline") or article.get("name")
    post.title = headline or post.title

    raw_date = article.get("datePublished") or article.get("dateCreated") or post.publish_date
    post.publish_date = normalize_date(raw_date)

    if options.include_author:
        post.author = extract_author_from_json_ld(article) or post.author
    if options.include_categories:
        post.categories = normalize_terms(article.get("articleSection")) or post.categories
    if options.include_tags:
        post.tags = normalize_terms(article.get("keywords")) or post.tags

    post.body_html = extract_body_from_article(article, soup) or post.body_html
|
||||
|
||||
|
||||
def merge_fallback_data(post: ScrapedPost, soup: BeautifulSoup, options: ScrapeOptions) -> None:
    """Fill the still-empty fields of ``post`` from the page structure, in place.

    Title, publish date, author, body and tags are extracted only when
    empty. Categories found on the page are merged into the existing ones;
    Drupal pages also contribute "Department" categories.
    """
    post.title = post.title or extract_title(soup)
    post.publish_date = post.publish_date or extract_date(soup, post.cms)
    if options.include_author:
        post.author = post.author or extract_author(soup)
    post.body_html = post.body_html or extract_body(soup)

    if options.include_categories:
        page_categories = extract_terms(soup, CATEGORY_SELECTORS)
        post.categories = merge_terms(post.categories, page_categories)
        if post.cms == "drupal":
            departments = extract_drupal_department_categories(soup)
            post.categories = merge_terms(post.categories, departments)

    if options.include_tags and not post.tags:
        post.tags = extract_terms(soup, TAG_SELECTORS)
|
||||
|
||||
|
||||
def extract_title(soup: BeautifulSoup) -> str:
    """Return the page title from ``og:title``, a heading, or ``<title>``.

    Sources are tried in that order. Every source is passed through
    ``clean_text``; an ``og:title`` that cleans to an empty string falls
    through to the headings.
    """
    og_title = meta_content(soup, "meta", {"property": "og:title"})
    if og_title:
        # Clean the meta value like every other title source so the
        # result does not depend on where the title was found.
        cleaned_og_title = clean_text(og_title)
        if cleaned_og_title:
            return cleaned_og_title

    for selector in ("article h1", "h1.entry-title", "h1.page-title", "h1.title", "h1"):
        node = soup.select_one(selector)
        if node:
            return clean_text(node.get_text(" ", strip=True))

    return clean_text(soup.title.get_text(" ", strip=True)) if soup.title else ""
|
||||
|
||||
|
||||
def extract_date(soup: BeautifulSoup, cms: str = "unknown") -> str:
    """Return the first normalizable publish date found on the page, else ``""``.

    For each selector in ``DATE_SELECTORS`` the first matching node is read
    from its ``datetime`` attribute, its ``content`` attribute, or its text.
    Drupal pages fall back to a date next to the title.
    """
    for selector in DATE_SELECTORS:
        node = soup.select_one(selector)
        if node:
            raw_value = node.get("datetime") or node.get("content") or node.get_text(" ", strip=True)
            normalized = normalize_date(raw_value)
            if normalized:
                return normalized

    return extract_drupal_title_adjacent_date(soup) if cms == "drupal" else ""
|
||||
|
||||
|
||||
def extract_author(soup: BeautifulSoup) -> str:
    """Return the author from the ``author`` meta tag or the first matching byline node.

    Returns ``""`` when neither source yields a node or value.
    """
    meta_author = meta_content(soup, "meta", {"name": "author"})
    if meta_author:
        return clean_text(meta_author)

    matches = (soup.select_one(selector) for selector in AUTHOR_SELECTORS)
    for node in matches:
        if node:
            return clean_text(node.get_text(" ", strip=True))
    return ""
|
||||
|
||||
|
||||
def extract_body(soup: BeautifulSoup) -> str:
    """Return the inner HTML of the best body container on the page.

    ``BODY_SELECTORS`` are tried in order on a cleaned copy of each match.
    The first candidate with at least 120 characters of text wins. Failing
    that, the first shorter candidate judged meaningful is returned, or
    ``""`` if there is none.
    """
    short_candidate = ""
    for selector in BODY_SELECTORS:
        node = soup.select_one(selector)
        if node:
            # Work on a copy so stripping does not alter the parsed page.
            working_copy = clone_tag(node)
            strip_unwanted(working_copy)
            markup = working_copy.decode_contents().strip()
            visible_text = BeautifulSoup(markup, "html.parser").get_text(" ", strip=True)
            if len(visible_text) >= 120:
                return markup
            if not short_candidate and has_meaningful_body_content(markup):
                short_candidate = markup
    return short_candidate
|
||||
|
||||
|
||||
def extract_terms(soup: BeautifulSoup, selectors: list[str]) -> list[str]:
    """Collect the cleaned text of every node matching any selector.

    Empty texts and repeats are skipped; order of first appearance is kept.
    """
    collected: list[str] = []
    matched_nodes = (node for selector in selectors for node in soup.select(selector))
    for node in matched_nodes:
        label = clean_text(node.get_text(" ", strip=True))
        if label and label not in collected:
            collected.append(label)
    return collected
|
||||
|
||||
|
||||
def extract_drupal_title_adjacent_date(soup: BeautifulSoup) -> str:
    """Look for a date next to the page title and return it normalized, else ``""``.

    The title node's following siblings are checked first. If none contains
    a date, the text of the nearest enclosing ``header``/``div``/``section``
    is checked, with a leading copy of the title removed.
    """
    title_node = find_title_node(soup)
    if not title_node:
        return ""

    for sibling in title_node.next_siblings:
        from_sibling = normalize_drupal_date(text_from_node(sibling))
        if from_sibling:
            return from_sibling

    container = title_node.find_parent(["header", "div", "section"])
    if not container:
        return ""

    container_text = clean_text(container.get_text(" ", strip=True))
    title_text = clean_text(title_node.get_text(" ", strip=True))
    if title_text and container_text.startswith(title_text):
        container_text = clean_text(container_text[len(title_text):])

    from_container = normalize_drupal_date(container_text)
    return from_container if from_container else ""
|
||||
|
||||
|
||||
def extract_drupal_department_categories(soup: BeautifulSoup) -> list[str]:
    """Gather category names from "Department:" labels in *soup*.

    Two passes are made:

    1. Text nodes consisting solely of ``Department:`` - the value is taken
       from the parent's own text when present, otherwise from the first
       following sibling of the parent that yields an acceptable value.
    2. ``p``/``li``/``span``/``dt``/``dd`` elements whose text starts with
       ``department:`` (case-insensitive).

    Every value passes through ``normalize_department_category`` and results
    are de-duplicated via ``merge_terms``.
    """
    found: list[str] = []
    bare_label = re.compile(r"^\s*Department:\s*$", re.IGNORECASE)

    for label_node in soup.find_all(string=bare_label):
        holder = label_node.parent
        if not isinstance(holder, Tag):
            continue

        inline = normalize_department_category(
            extract_labeled_value(holder.get_text(" ", strip=True), "Department")
        )
        if inline:
            found = merge_terms(found, [inline])
            continue

        # No inline value: take the first sibling that yields a usable one.
        sibling_values = (
            normalize_department_category(text_from_node(neighbour))
            for neighbour in holder.next_siblings
        )
        adjacent = next((item for item in sibling_values if item), "")
        if adjacent:
            found = merge_terms(found, [adjacent])

    for element in soup.find_all(["p", "li", "span", "dt", "dd"]):
        line = clean_text(element.get_text(" ", strip=True))
        if line.lower().startswith("department:"):
            labelled = normalize_department_category(extract_labeled_value(line, "Department"))
            if labelled:
                found = merge_terms(found, [labelled])

    return found
|
||||
|
||||
|
||||
def extract_author_from_json_ld(article: dict) -> str:
    """Return the author name(s) declared in a JSON-LD article object.

    The ``author`` value may be a plain string, an object with a ``name``
    key, or a list mixing both forms. Names from a list are joined with
    ``", "``. Entries of any other type, and ``name`` values that are not
    strings, are ignored rather than raising.

    Args:
        article: Decoded JSON-LD object describing the article.

    Returns:
        The cleaned author text, or ``""`` when no usable author is present.
    """

    def _name_of(entry: object) -> str:
        # Resolve one author entry (string or {"name": ...}) to a cleaned name.
        if isinstance(entry, str):
            return clean_text(entry)
        if isinstance(entry, dict):
            name = entry.get("name")
            # A non-string name (list, number, None) has no usable text.
            return clean_text(name) if isinstance(name, str) else ""
        return ""

    author = article.get("author")
    if isinstance(author, list):
        return ", ".join(name for name in map(_name_of, author) if name)
    return _name_of(author)
|
||||
|
||||
|
||||
def extract_body_from_article(article: dict, soup: BeautifulSoup) -> str:
    """Prefer the JSON-LD ``articleBody`` over scraping the page markup.

    When ``articleBody`` is a string longer than 120 characters after
    trimming, it is entity-unescaped and wrapped in a single ``<p>``.
    Otherwise the body is extracted from *soup* via ``extract_body``.
    """
    raw = article.get("articleBody")
    trimmed = raw.strip() if isinstance(raw, str) else ""
    if len(trimmed) > 120:
        return "<p>" + unescape(trimmed) + "</p>"
    return extract_body(soup)
|
||||
|
||||
|
||||
def normalize_terms(value: object) -> list[str]:
    """Turn a raw taxonomy value into a list of cleaned term strings.

    * A string is split on ``,``, ``|`` and ``>``; empty pieces are dropped.
    * A list keeps its string items, cleaned and de-duplicated in order.
    * Anything else produces an empty list.
    """
    if isinstance(value, str):
        pieces = (clean_text(chunk) for chunk in re.split(r"[,|>]", value))
        return [piece for piece in pieces if piece]

    if not isinstance(value, list):
        return []

    unique: list[str] = []
    for entry in value:
        if not isinstance(entry, str):
            continue
        label = clean_text(entry)
        if label and label not in unique:
            unique.append(label)
    return unique
|
||||
|
||||
|
||||
def merge_terms(*groups: list[str]) -> list[str]:
    """Concatenate term lists, cleaning each term and dropping repeats.

    Terms keep the position of their first occurrence; terms that are empty
    after cleaning are discarded.
    """
    combined: list[str] = []
    for term in (clean_text(raw) for group in groups for raw in group):
        if term and term not in combined:
            combined.append(term)
    return combined
|
||||
|
||||
|
||||
def normalize_drupal_date(value: str | None) -> str:
    """Find a date in *value* and return it normalized.

    The text is searched with ``DRUPAL_TITLE_DATE_PATTERN``; the matched
    portion is handed to ``normalize_date``. Empty input or no match yields
    ``""``.
    """
    found = DRUPAL_TITLE_DATE_PATTERN.search(value) if value else None
    if found is None:
        return ""
    return normalize_date(found.group(0))
|
||||
|
||||
|
||||
def meta_content(soup: BeautifulSoup, tag_name: str, attrs: dict[str, str]) -> str:
    """Return the stripped ``content`` attribute of the first matching tag.

    Yields ``""`` when no tag matches or its ``content`` is missing or empty.
    """
    element = soup.find(tag_name, attrs=attrs)
    if not element:
        return ""
    if not element.get("content"):
        return ""
    return element["content"].strip()
|
||||
|
||||
|
||||
def clean_text(value: str) -> str:
    """Collapse every whitespace run to one space and trim both ends.

    A falsy *value* (``None`` or ``""``) is treated as the empty string.
    """
    text = value or ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
|
||||
|
||||
|
||||
def text_from_node(node: object) -> str:
    """Return the cleaned text of a parse-tree node.

    ``NavigableString`` nodes contribute their own text, ``Tag`` nodes the
    text of their whole subtree; any other object yields ``""``.
    """
    if isinstance(node, NavigableString):
        raw = str(node)
    elif isinstance(node, Tag):
        raw = node.get_text(" ", strip=True)
    else:
        return ""
    return clean_text(raw)
|
||||
|
||||
|
||||
def sanitize_html(html: str) -> str:
    """Parse *html*, remove unwanted elements and risky attributes, re-serialize.

    Empty input short-circuits to ``""``. Otherwise the markup goes through
    ``strip_unwanted`` and then ``strip_dangerous_attributes`` before being
    rendered back to a trimmed string.
    """
    if html:
        tree = BeautifulSoup(html, "html.parser")
        for scrub in (strip_unwanted, strip_dangerous_attributes):
            scrub(tree)
        return tree.decode_contents().strip()
    return ""
|
||||
|
||||
|
||||
def has_meaningful_body_content(html: str) -> bool:
    """Tell whether *html* carries anything worth importing.

    True when the markup has visible text, or when it contains an image,
    link, embed or object tag (matched case-insensitively on the raw markup).
    """
    if not html:
        return False
    if BeautifulSoup(html, "html.parser").get_text(" ", strip=True):
        return True
    lowered = html.lower()
    return any(marker in lowered for marker in ("<img", "<a ", "<embed", "<object"))
|
||||
|
||||
|
||||
def strip_unwanted(node: BeautifulSoup | Tag) -> None:
    """Remove scripts, styles, frames, forms, navigation and share widgets.

    Matching descendants of *node* are destroyed in place; nothing is
    returned.
    """
    unwanted = ("script", "style", "noscript", "iframe", "form", "nav", ".share", ".social-share")
    for css in unwanted:
        for hit in node.select(css):
            hit.decompose()
|
||||
|
||||
|
||||
def strip_dangerous_attributes(node: BeautifulSoup | Tag) -> None:
    """Delete attributes that can execute script from every descendant tag.

    Removed in place:

    * every attribute whose name starts with ``on`` (event handlers) and
      ``srcdoc``;
    * ``href``/``src``/``action``/``formaction``/``xlink:href`` values whose
      scheme is ``javascript:``, ``vbscript:`` or ``data:text/html``.

    The scheme check ignores ASCII control characters and spaces anywhere in
    the value, because browsers discard tabs/newlines inside URLs and leading
    control characters. A value such as ``"java\\tscript:alert(1)"`` is
    therefore treated as a ``javascript:`` URL.

    Args:
        node: Parsed document or element whose descendants are cleaned.
    """
    url_attributes = {"href", "src", "action", "formaction", "xlink:href"}
    blocked_schemes = ("javascript:", "vbscript:", "data:text/html")

    for child in node.find_all(True):
        # Iterate over a snapshot of the names: attributes are deleted below.
        for attr_name in list(child.attrs):
            normalized_name = attr_name.lower()
            if normalized_name.startswith("on") or normalized_name == "srcdoc":
                del child.attrs[attr_name]
                continue

            if normalized_name not in url_attributes:
                continue

            raw_value = child.attrs.get(attr_name)
            if isinstance(raw_value, list):
                candidate = " ".join(str(item) for item in raw_value)
            else:
                candidate = str(raw_value or "")

            # Strip C0 controls and spaces so obfuscated schemes still match.
            compact = re.sub(r"[\x00-\x20]+", "", candidate).lower()
            if compact.startswith(blocked_schemes):
                del child.attrs[attr_name]
|
||||
|
||||
|
||||
def clone_tag(node: Tag) -> BeautifulSoup:
    """Return an independent copy of *node* by re-parsing its serialized markup."""
    serialized = str(node)
    return BeautifulSoup(serialized, "html.parser")
|
||||
|
||||
|
||||
def find_title_node(soup: BeautifulSoup) -> Tag | None:
    """Locate the page's main heading.

    Selectors are tried from most to least specific and the first hit is
    returned; ``None`` means no ``h1`` was found at all.
    """
    candidates = ("article h1", "h1.entry-title", "h1.page-title", "h1.title", "h1")
    hits = (soup.select_one(css) for css in candidates)
    return next((hit for hit in hits if hit), None)
|
||||
|
||||
|
||||
def extract_labeled_value(text: str, label: str) -> str:
    """Pull the value that follows ``"<label>:"`` out of *text*.

    The text is whitespace-normalized first. The value ends at the next
    ``Word:`` style label or at the end of the text. Matching is
    case-insensitive. Returns ``""`` for empty input or when the label is
    not present.
    """
    if not text:
        return ""

    found = re.search(
        rf"{re.escape(label)}:\s*(.+?)(?=\s+(?:[A-Z][a-z]+:)|\s{{2,}}|$)",
        clean_text(text),
        re.IGNORECASE,
    )
    return clean_text(found.group(1)) if found else ""
|
||||
|
||||
|
||||
def normalize_department_category(value: str) -> str:
    """Clean a candidate department name, rejecting text that is not one.

    Returns ``""`` when the cleaned value is empty, longer than 80
    characters or 8 words, or contains markers of contact details
    (``p.o. box``, ``contact us``, ``@``, or an http(s) URL).
    """
    label = clean_text(value)
    too_long = len(label) > 80 or len(label.split()) > 8
    if not label or too_long:
        return ""

    lowered = label.lower()
    noise = ("p.o. box", "contact us", "@", "http://", "https://")
    if any(marker in lowered for marker in noise):
        return ""
    return label
|
||||
|
||||
|
||||
def format_error_summary(
    url: str,
    exc: Exception,
    response: requests.Response | None,
    timeout_seconds: int,
) -> str:
    """Build a one-line description of a failed fetch.

    Args:
        url: The URL that was requested.
        exc: The exception raised while fetching or processing the page.
        response: The response obtained before the failure, if any. It is
            used only when the exception carries no response of its own.
        timeout_seconds: The timeout that was in effect, quoted in timeout
            messages.

    Returns:
        A message reporting the HTTP status for ``requests.HTTPError`` (when
        a response is available), the timeout for ``requests.Timeout``, the
        exception type and text for other ``requests`` errors, and
        ``"<Type>: <message>"`` for anything else.
    """
    if isinstance(exc, requests.HTTPError):
        # A requests.Response is falsy for 4xx/5xx statuses, so
        # `exc.response or response` would throw away the very response that
        # describes the failure. Compare against None instead.
        failing_response = exc.response if exc.response is not None else response
        if failing_response is not None:
            return (
                f"HTTP {failing_response.status_code} {failing_response.reason} "
                f"while fetching {failing_response.url or url}"
            )
    if isinstance(exc, requests.Timeout):
        return f"Request timed out after {timeout_seconds}s while fetching {url}"
    if isinstance(exc, requests.RequestException):
        return f"{type(exc).__name__} while fetching {url}: {exc}"
    return f"{type(exc).__name__}: {exc}"
|
||||
|
||||
|
||||
def format_error_details(
    url: str,
    exc: Exception,
    response: requests.Response | None,
) -> str:
    """Build a multi-line diagnostic report for a failed fetch.

    Args:
        url: The URL that was requested.
        exc: The exception raised while fetching or processing the page.
        response: The response obtained before the failure, if any. It is
            used only when the exception carries no ``response`` attribute
            of its own.

    Returns:
        Newline-separated ``Label: value`` lines: the URL, the exception
        type and message, the HTTP status and resolved URL when a response
        is available, and the formatted exception.
    """
    details = [
        f"URL: {url}",
        f"Error Type: {type(exc).__name__}",
        f"Message: {exc}",
    ]

    # A requests.Response is falsy for 4xx/5xx statuses, so chaining with
    # `or` would discard the error response attached to the exception.
    attached = getattr(exc, "response", None)
    failing_response = attached if attached is not None else response
    if failing_response is not None:
        details.extend(
            [
                f"HTTP Status: {failing_response.status_code} {failing_response.reason}",
                f"Resolved URL: {failing_response.url}",
            ]
        )

    trace = "".join(traceback.format_exception_only(type(exc), exc)).strip()
    if trace:
        details.append(f"Exception: {trace}")

    return "\n".join(details)
|
||||
@@ -0,0 +1,91 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from email.utils import format_datetime
|
||||
from io import StringIO
|
||||
from xml.sax.saxutils import escape
|
||||
import datetime as dt
|
||||
|
||||
from page_importer.dates import parse_datetime
|
||||
from page_importer.models import ScrapedPost
|
||||
|
||||
|
||||
def build_wxr(posts: list[ScrapedPost], channel_title: str = "Imported Content") -> str:
    """Serialize *posts* into a WordPress WXR 1.2 export document.

    The channel header carries *channel_title* and the current UTC time.
    Each post becomes one ``<item>`` whose dates come from
    ``_resolve_post_dates`` (falling back to the current time), followed by
    one ``<category>`` element per category and per tag.
    """
    now = dt.datetime.now(dt.timezone.utc)

    chunks: list[str] = [
        '<?xml version="1.0" encoding="UTF-8" ?>\n',
        '<rss version="2.0" xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/" '
        'xmlns:content="http://purl.org/rss/1.0/modules/content/" '
        'xmlns:wfw="http://wellformedweb.org/CommentAPI/" '
        'xmlns:dc="http://purl.org/dc/elements/1.1/" '
        'xmlns:wp="http://wordpress.org/export/1.2/">\n',
        "<channel>\n",
        f"<title>{escape(channel_title)}</title>\n",
        "<link>http://localhost/</link>\n",
        "<description>Generated by Page Importer</description>\n",
        f"<pubDate>{format_datetime(now)}</pubDate>\n",
        "<language>en-US</language>\n",
        "<wp:wxr_version>1.2</wp:wxr_version>\n",
    ]

    for post in posts:
        local_date, gmt_date, item_pub_date = _resolve_post_dates(post.publish_date, now)
        chunks.extend(
            (
                "<item>\n",
                f"<title>{escape(post.title)}</title>\n",
                f"<link>{escape(post.source_url)}</link>\n",
                f"<pubDate>{format_datetime(item_pub_date)}</pubDate>\n",
                f"<dc:creator>{cdata(post.author or 'importer')}</dc:creator>\n",
                f'<guid isPermaLink="false">{escape(post.source_url)}</guid>\n',
                "<description></description>\n",
                f"<content:encoded>{cdata(post.body_html)}</content:encoded>\n",
                f"<excerpt:encoded>{cdata('')}</excerpt:encoded>\n",
                f"<wp:post_date>{cdata(local_date)}</wp:post_date>\n",
                f"<wp:post_date_gmt>{cdata(gmt_date)}</wp:post_date_gmt>\n",
                "<wp:comment_status><![CDATA[closed]]></wp:comment_status>\n",
                "<wp:ping_status><![CDATA[closed]]></wp:ping_status>\n",
                "<wp:post_name><![CDATA[]]></wp:post_name>\n",
                f"<wp:status>{cdata(post.status)}</wp:status>\n",
                "<wp:post_parent>0</wp:post_parent>\n",
                "<wp:menu_order>0</wp:menu_order>\n",
                f"<wp:post_type>{cdata(post.post_type or 'post')}</wp:post_type>\n",
                "<wp:post_password><![CDATA[]]></wp:post_password>\n",
                "<wp:is_sticky>0</wp:is_sticky>\n",
            )
        )
        # Categories first, then tags; both share the <category> element.
        for domain, terms in (("category", post.categories), ("post_tag", post.tags)):
            for term in terms:
                chunks.append(
                    f'<category domain="{domain}" nicename="{escape(slugify(term))}">{cdata(term)}</category>\n'
                )
        chunks.append("</item>\n")

    chunks.append("</channel>\n</rss>\n")
    return "".join(chunks)
|
||||
|
||||
|
||||
def slugify(value: str) -> str:
    """Convert *value* into a lowercase, hyphen-separated slug.

    Alphanumeric characters are lowercased; every other character acts as a
    separator. Consecutive separators collapse into a single hyphen and no
    hyphen is left at either end, so ``"Parks & Recreation"`` becomes
    ``"parks-recreation"`` rather than ``"parks---recreation"``.

    Args:
        value: Human-readable term name.

    Returns:
        The slug, or ``""`` when *value* has no alphanumeric characters.
    """
    dashed = "".join(ch.lower() if ch.isalnum() else "-" for ch in value)
    # Splitting on "-" and dropping empty pieces trims the ends and collapses runs.
    return "-".join(piece for piece in dashed.split("-") if piece)
|
||||
|
||||
|
||||
def cdata(value: str) -> str:
    """Wrap *value* in a CDATA section.

    Any ``]]>`` inside the text is split across two adjacent sections so it
    cannot close the section early. A falsy value produces an empty section.
    """
    payload = value or ""
    safe = "]]]]><![CDATA[>".join(payload.split("]]>"))
    return "<![CDATA[" + safe + "]]>"
|
||||
|
||||
|
||||
def _resolve_post_dates(value: str, fallback: dt.datetime) -> tuple[str, str, dt.datetime]:
    """Derive the local date, GMT date and pubDate for one post.

    Returns ``(local, gmt, pub_date)``:

    * unparseable *value*: two empty strings and *fallback*;
    * naive timestamp: used as-is for both strings and tagged as UTC for
      the pubDate;
    * aware timestamp: its own wall-clock time as local, and its UTC
      equivalent for the GMT string and the pubDate.
    """
    moment = parse_datetime(value)
    if moment is None:
        return "", "", fallback

    wall_clock = _format_wp_date(moment)
    is_naive = moment.tzinfo is None or moment.utcoffset() is None
    if is_naive:
        return wall_clock, wall_clock, moment.replace(tzinfo=dt.timezone.utc)

    in_utc = moment.astimezone(dt.timezone.utc)
    return wall_clock, _format_wp_date(in_utc), in_utc
|
||||
|
||||
|
||||
def _format_wp_date(value: dt.datetime) -> str:
|
||||
return value.replace(tzinfo=None).strftime("%Y-%m-%d %H:%M:%S")
|
||||
@@ -0,0 +1,4 @@
|
||||
streamlit>=1.43,<2
|
||||
requests>=2.32,<3
|
||||
beautifulsoup4>=4.12,<5
|
||||
python-dateutil>=2.9,<3
|
||||
@@ -0,0 +1,79 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import unittest
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from page_importer.dates import normalize_date
|
||||
from page_importer.scraper import CATEGORY_SELECTORS, TAG_SELECTORS, extract_terms, sanitize_html
|
||||
from page_importer.wxr import build_wxr
|
||||
from page_importer.models import ScrapedPost
|
||||
|
||||
|
||||
class DateNormalizationTests(unittest.TestCase):
    """Checks on ``normalize_date`` output formatting."""

    def test_preserves_timezone_offset_in_normalized_value(self) -> None:
        # The UTC offset of an ISO timestamp must survive normalization.
        source = "2024-05-01T09:30:00-07:00"
        expected = "2024-05-01 09:30:00-07:00"
        self.assertEqual(normalize_date(source), expected)
|
||||
|
||||
|
||||
class WxrSerializationTests(unittest.TestCase):
    """Checks on the XML produced by ``build_wxr``."""

    def test_writes_local_and_gmt_dates_from_offset_timestamp(self) -> None:
        # An offset-aware publish date yields local, GMT and RSS dates.
        post = ScrapedPost(
            source_url="https://example.com/post",
            title="Example",
            body_html="<p>Body</p>",
            publish_date="2024-05-01 09:30:00-07:00",
            success=True,
        )
        xml = build_wxr([post])

        expected_fragments = (
            "<wp:post_date><![CDATA[2024-05-01 09:30:00]]></wp:post_date>",
            "<wp:post_date_gmt><![CDATA[2024-05-01 16:30:00]]></wp:post_date_gmt>",
            "<pubDate>Wed, 01 May 2024 16:30:00 +0000</pubDate>",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, xml)

    def test_splits_cdata_terminators_in_content(self) -> None:
        # A "]]>" inside body or author must not close the CDATA section.
        post = ScrapedPost(
            source_url="https://example.com/post",
            title="Example",
            body_html="<p>alpha ]]> omega</p>",
            author="Jane ]]> Doe",
            success=True,
        )
        xml = build_wxr([post])

        for fragment in ("alpha ]]]]><![CDATA[> omega", "Jane ]]]]><![CDATA[> Doe"):
            self.assertIn(fragment, xml)
|
||||
|
||||
|
||||
class HtmlSanitizationTests(unittest.TestCase):
    """Checks on ``sanitize_html`` attribute stripping."""

    def test_removes_inline_event_handlers_and_script_uris(self) -> None:
        # Event-handler attributes and javascript: links must be gone.
        dirty = '<div onclick="alert(1)"><a href="javascript:alert(1)">x</a><img src="x" onerror="alert(1)"></div>'
        sanitized = sanitize_html(dirty)

        for forbidden in ("onclick", "onerror", "javascript:"):
            self.assertNotIn(forbidden, sanitized)
|
||||
|
||||
|
||||
class TaxonomySelectorTests(unittest.TestCase):
    """Checks that category and tag selectors stay disjoint."""

    def test_drupal_tag_field_is_not_treated_as_category(self) -> None:
        # A Drupal tags field must match the tag selectors only.
        markup = '<div class="field--name-field-tags"><a href="/tags/example">Example Tag</a></div>'
        soup = BeautifulSoup(markup, "html.parser")

        self.assertEqual(extract_terms(soup, CATEGORY_SELECTORS), [])
        self.assertEqual(extract_terms(soup, TAG_SELECTORS), ["Example Tag"])
|
||||
|
||||
|
||||
# Run the suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user