92 lines
3.9 KiB
Python
92 lines
3.9 KiB
Python
from __future__ import annotations
|
|
|
|
from email.utils import format_datetime
|
|
from io import StringIO
|
|
from xml.sax.saxutils import escape
|
|
import datetime as dt
|
|
|
|
from page_importer.dates import parse_datetime
|
|
from page_importer.models import ScrapedPost
|
|
|
|
|
|
def build_wxr(posts: list[ScrapedPost], channel_title: str = "Imported Content") -> str:
    """Serialize *posts* into a WXR 1.2 (WordPress export) XML document.

    The document consists of a fixed channel header (title, a hard-coded
    ``http://localhost/`` link, description, generation time, language and
    WXR version) followed by one ``<item>`` per post.

    For each post the following attributes are read: ``publish_date``,
    ``title``, ``source_url``, ``author``, ``body_html``, ``status``,
    ``post_type``, ``categories`` and ``tags``.  A falsy ``author`` is
    written as ``importer`` and a falsy ``post_type`` as ``post``.

    Args:
        posts: Posts to export, emitted in the given order.
        channel_title: Text for the channel ``<title>``; XML-escaped.

    Returns:
        The complete XML document as a string, ending with a newline.
    """
    # One timestamp serves both the channel pubDate and the per-post fallback.
    generated_at = dt.datetime.now(dt.timezone.utc)

    rss_open = " ".join(
        (
            '<rss version="2.0"',
            'xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"',
            'xmlns:content="http://purl.org/rss/1.0/modules/content/"',
            'xmlns:wfw="http://wellformedweb.org/CommentAPI/"',
            'xmlns:dc="http://purl.org/dc/elements/1.1/"',
            'xmlns:wp="http://wordpress.org/export/1.2/">',
        )
    )

    # Every entry is one output line; the newline is appended when joining.
    lines = [
        '<?xml version="1.0" encoding="UTF-8" ?>',
        rss_open,
        "<channel>",
        f"<title>{escape(channel_title)}</title>",
        "<link>http://localhost/</link>",
        "<description>Generated by Page Importer</description>",
        f"<pubDate>{format_datetime(generated_at)}</pubDate>",
        "<language>en-US</language>",
        "<wp:wxr_version>1.2</wp:wxr_version>",
    ]

    for post in posts:
        wp_local, wp_gmt, pub_moment = _resolve_post_dates(post.publish_date, generated_at)
        lines += [
            "<item>",
            f"<title>{escape(post.title)}</title>",
            f"<link>{escape(post.source_url)}</link>",
            f"<pubDate>{format_datetime(pub_moment)}</pubDate>",
            f"<dc:creator>{cdata(post.author or 'importer')}</dc:creator>",
            f'<guid isPermaLink="false">{escape(post.source_url)}</guid>',
            "<description></description>",
            f"<content:encoded>{cdata(post.body_html)}</content:encoded>",
            f"<excerpt:encoded>{cdata('')}</excerpt:encoded>",
            f"<wp:post_date>{cdata(wp_local)}</wp:post_date>",
            f"<wp:post_date_gmt>{cdata(wp_gmt)}</wp:post_date_gmt>",
            "<wp:comment_status><![CDATA[closed]]></wp:comment_status>",
            "<wp:ping_status><![CDATA[closed]]></wp:ping_status>",
            "<wp:post_name><![CDATA[]]></wp:post_name>",
            f"<wp:status>{cdata(post.status)}</wp:status>",
            "<wp:post_parent>0</wp:post_parent>",
            "<wp:menu_order>0</wp:menu_order>",
            f"<wp:post_type>{cdata(post.post_type or 'post')}</wp:post_type>",
            "<wp:post_password><![CDATA[]]></wp:post_password>",
            "<wp:is_sticky>0</wp:is_sticky>",
        ]
        # Categories and tags share the <category> element; only the domain differs.
        lines.extend(
            f'<category domain="category" nicename="{escape(slugify(term))}">{cdata(term)}</category>'
            for term in post.categories
        )
        lines.extend(
            f'<category domain="post_tag" nicename="{escape(slugify(term))}">{cdata(term)}</category>'
            for term in post.tags
        )
        lines.append("</item>")

    lines.append("</channel>")
    lines.append("</rss>")
    return "".join(f"{line}\n" for line in lines)
|
|
|
|
|
|
def slugify(value: str) -> str:
    """Turn *value* into a lowercase, hyphen-separated slug.

    Alphanumeric characters (as defined by ``str.isalnum``, so non-ASCII
    letters and digits are kept) are lowercased.  Every run of other
    characters becomes a single hyphen, and no hyphen is left at either end.

    Runs are collapsed so that text such as ``"Hello, World"`` yields
    ``"hello-world"`` rather than ``"hello--world"``.

    Args:
        value: Human-readable label, e.g. a category or tag name.

    Returns:
        The slug; an empty string when *value* has no alphanumeric characters.
    """
    # Map every separator to a space so str.split() can both collapse runs
    # and drop leading/trailing separators in one step.
    spaced = "".join(ch.lower() if ch.isalnum() else " " for ch in value)
    return "-".join(spaced.split())
|
|
|
|
|
|
def cdata(value: str) -> str:
    """Wrap *value* in an XML CDATA section.

    A falsy *value* (``None`` or ``""``) produces an empty section.  A
    literal ``]]>`` inside the text would terminate the section early, so
    each occurrence is split across two adjacent CDATA sections.

    Args:
        value: Text to embed; may be ``None``.

    Returns:
        The text enclosed in ``<![CDATA[`` ... ``]]>``.
    """
    text = value if value else ""
    # Close the section after "]]" and reopen it right before ">".
    guarded = "]]]]><![CDATA[>".join(text.split("]]>"))
    return "<![CDATA[" + guarded + "]]>"
|
|
|
|
|
|
def _resolve_post_dates(value: str, fallback: dt.datetime) -> tuple[str, str, dt.datetime]:
    """Derive the three date values written for a post.

    *value* is handed to ``parse_datetime``; the outcome decides the result:

    * ``None`` (nothing parsed): both strings are empty and *fallback* is
      returned as the datetime.
    * A datetime without a usable UTC offset: the same formatted string is
      returned for both the local and the GMT slot, and the datetime is
      returned with its clock fields unchanged but tagged as UTC.
    * A datetime with a UTC offset: the local string keeps the original
      clock fields, while the GMT string and the returned datetime are the
      value converted to UTC.

    Args:
        value: Raw publish date to parse.
        fallback: Datetime returned when *value* cannot be parsed.

    Returns:
        ``(local_date, gmt_date, pub_date)`` where the first two are
        strings produced by ``_format_wp_date`` (or empty) and the third is
        a datetime.
    """
    moment = parse_datetime(value)
    if moment is None:
        return "", "", fallback

    has_offset = moment.tzinfo is not None and moment.utcoffset() is not None
    wp_local = _format_wp_date(moment)

    if has_offset:
        as_utc = moment.astimezone(dt.timezone.utc)
        return wp_local, _format_wp_date(as_utc), as_utc

    # No offset information: treat the clock fields as already being UTC.
    return wp_local, wp_local, moment.replace(tzinfo=dt.timezone.utc)
|
|
|
|
|
|
def _format_wp_date(value: dt.datetime) -> str:
|
|
return value.replace(tzinfo=None).strftime("%Y-%m-%d %H:%M:%S")
|