This commit is contained in:
+24
-2
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import datetime as dt
|
||||
import hashlib
|
||||
import io
|
||||
import re
|
||||
from dataclasses import replace
|
||||
@@ -13,6 +14,7 @@ from page_importer.models import ScrapeOptions, ScrapedPost
|
||||
from page_importer.scraper import Scraper
|
||||
from page_importer.wxr import build_wxr
|
||||
|
||||
|
||||
def load_csv(file_data: bytes) -> tuple[list[str], list[dict[str, str]]]:
|
||||
text = file_data.decode("utf-8-sig", errors="replace")
|
||||
reader = csv.DictReader(io.StringIO(text))
|
||||
@@ -20,6 +22,20 @@ def load_csv(file_data: bytes) -> tuple[list[str], list[dict[str, str]]]:
|
||||
return reader.fieldnames or [], rows
|
||||
|
||||
|
||||
def build_upload_fingerprint(file_data: bytes) -> str:
    """Return a content fingerprint for an uploaded file.

    The fingerprint is the hexadecimal SHA-256 digest of the raw bytes, so
    two uploads with identical content yield the same string regardless of
    any other attribute of the upload.

    Args:
        file_data: Raw bytes of the uploaded file.

    Returns:
        The 64-character lowercase hex digest of ``file_data``.
    """
    return hashlib.sha256(file_data).hexdigest()
|
||||
|
||||
|
||||
def sync_uploaded_file_state(session_state: dict[str, object], upload_fingerprint: str) -> None:
    """Drop cached scrape state when a different file has been uploaded.

    Compares ``upload_fingerprint`` with the value stored under
    ``"uploaded_csv_fingerprint"``. When they match, nothing is changed.
    Otherwise (including when no fingerprint has been stored yet) the cached
    ``"results"``, ``"input_rows"``, ``"input_headers"`` and
    ``"scrape_context"`` entries are removed and the new fingerprint is
    recorded.

    Args:
        session_state: Mutable mapping holding per-session values; modified
            in place.
        upload_fingerprint: Fingerprint of the currently uploaded file.
    """
    previous_fingerprint = session_state.get("uploaded_csv_fingerprint")
    if previous_fingerprint == upload_fingerprint:
        # Same file as last time: keep whatever has been cached.
        return

    # Entries derived from the previous upload no longer apply to this file.
    for key in ("results", "input_rows", "input_headers", "scrape_context"):
        # pop with a default so keys that were never set are ignored.
        session_state.pop(key, None)
    session_state["uploaded_csv_fingerprint"] = upload_fingerprint
|
||||
|
||||
|
||||
def render_app() -> None:
|
||||
st.title("Page Importer")
|
||||
st.caption("Scrape blog posts from CSV URLs and export a WordPress WXR file.")
|
||||
@@ -47,7 +63,10 @@ def render_app() -> None:
|
||||
st.info("Upload a CSV to begin.")
|
||||
return
|
||||
|
||||
headers, rows = load_csv(uploaded.getvalue())
|
||||
uploaded_bytes = uploaded.getvalue()
|
||||
sync_uploaded_file_state(st.session_state, build_upload_fingerprint(uploaded_bytes))
|
||||
|
||||
headers, rows = load_csv(uploaded_bytes)
|
||||
if not rows:
|
||||
st.error("The CSV did not contain any rows.")
|
||||
return
|
||||
@@ -88,6 +107,7 @@ def render_app() -> None:
|
||||
results = scrape_rows(rows, context, phase_label="Scraping")
|
||||
st.session_state["results"] = results
|
||||
st.session_state["input_rows"] = rows
|
||||
st.session_state["input_headers"] = headers
|
||||
st.session_state["scrape_context"] = context
|
||||
|
||||
results = st.session_state.get("results", [])
|
||||
@@ -176,7 +196,9 @@ def render_app() -> None:
|
||||
st.write(f"**Author:** {selected.author or '(missing)'}")
|
||||
st.write(f"**Post Type:** {selected.post_type}")
|
||||
st.write(selected.body_html, unsafe_allow_html=True)
|
||||
render_export_sidebar(successful, rows, headers)
|
||||
stored_rows = st.session_state.get("input_rows", rows)
|
||||
stored_headers = st.session_state.get("input_headers", headers)
|
||||
render_export_sidebar(successful, stored_rows, stored_headers)
|
||||
|
||||
|
||||
def build_scrape_context(
|
||||
|
||||
Reference in New Issue
Block a user