From 287566716f2d9ab6e5c32ec5aecc0eb4a4fcee16 Mon Sep 17 00:00:00 2001 From: Jeffrey Long Date: Thu, 9 Apr 2026 11:27:13 -0700 Subject: [PATCH] Update UI, and Update Sitemap tool to get proper thread count --- Page Importer/tests/test_sitemap_builder.py | 3 +++ README.md | 1 + Sitemap Builder/README.md | 2 +- Sitemap Builder/sitemap_builder.py | 19 ++++++++++++++++++- app.py | 12 +++++++++--- 5 files changed, 32 insertions(+), 5 deletions(-) diff --git a/Page Importer/tests/test_sitemap_builder.py b/Page Importer/tests/test_sitemap_builder.py index c0731f7..5d8fa7a 100644 --- a/Page Importer/tests/test_sitemap_builder.py +++ b/Page Importer/tests/test_sitemap_builder.py @@ -17,6 +17,9 @@ SPEC.loader.exec_module(MODULE) class SitemapBuilderTests(unittest.TestCase): + def test_detect_default_workers_returns_at_least_one(self) -> None: + self.assertGreaterEqual(MODULE.detect_default_workers(), 1) + def test_cleanup_run_files_deletes_generated_artifacts(self) -> None: with tempfile.TemporaryDirectory() as temp_dir: output_path = pathlib.Path(temp_dir) / "crawl.csv" diff --git a/README.md b/README.md index d1b4181..6cad5cf 100644 --- a/README.md +++ b/README.md @@ -146,5 +146,6 @@ If the registry secrets are not configured, the workflow still performs the buil ## Notes - Sitemap output files are written under `/data` in Docker. +- Sitemap Generator worker threads default to the number of CPUs visible inside the Docker container. - The sitemap crawler can resume previous runs when a matching crawl state file exists. - The importer keeps its existing scraping and WordPress export behavior, but it now runs inside the shared interface instead of as a separate app. diff --git a/Sitemap Builder/README.md b/Sitemap Builder/README.md index 4d6f4a4..69a102b 100644 --- a/Sitemap Builder/README.md +++ b/Sitemap Builder/README.md @@ -63,7 +63,7 @@ python3 sitemap_builder.py https://example.com --max-pages 20000 --delay 0.25 -- - `--timeout`: request timeout in seconds - `--include-subdomains`: crawl subdomains of the starting host - `--include-documents`: include document links such as PDF, CSV, DOC, DOCX, XLSX, and similar files -- `--workers`: number of worker threads to use. Set `1` to disable multithreading +- `--workers`: number of worker threads to use. Set `1` to disable multithreading. Default: all CPUs visible to the current machine or container - `--save-every`: save progress after every N pages. Default: `25` - `--resume`: resume from an existing state file - `--fresh`: ignore the existing state file and start over diff --git a/Sitemap Builder/sitemap_builder.py b/Sitemap Builder/sitemap_builder.py index 8611897..469440a 100644 --- a/Sitemap Builder/sitemap_builder.py +++ b/Sitemap Builder/sitemap_builder.py @@ -28,7 +28,6 @@ DEFAULT_LOG_SUFFIX = ".crawl.log" DEFAULT_MAX_PAGES = 10000 DEFAULT_RESUME_PAGE_INCREMENT = 10000 DEFAULT_SAVE_EVERY = 25 -DEFAULT_WORKERS = 8 SCRIPT_DIR = Path(__file__).resolve().parent DOCUMENT_EXTENSIONS = { ".pdf", @@ -47,6 +46,24 @@ DOCUMENT_EXTENSIONS = { } +def detect_default_workers() -> int: + affinity_count: int | None = None + get_affinity = getattr(os, "sched_getaffinity", None) + if callable(get_affinity): + try: + affinity_count = len(get_affinity(0)) + except OSError: + affinity_count = None + + cpu_count = os.cpu_count() or 1 + if affinity_count: + return max(affinity_count, 1) + return max(cpu_count, 1) + + +DEFAULT_WORKERS = detect_default_workers() + + @dataclass class CrawlResult: url: str diff --git a/app.py b/app.py index 4def6e5..8718782 100644 --- a/app.py +++ b/app.py @@ -61,6 +61,8 @@ def render_sitemap_tab() -> None: st.caption("Crawl a site, export a sitemap CSV, and keep resume data inside the container data volume.") SITEMAP_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + sitemap_builder = get_sitemap_module() + default_workers = sitemap_builder.DEFAULT_WORKERS with st.form("sitemap-form"): start_url = st.text_input("Starting URL", placeholder="https://example.com") @@ -73,7 +75,13 @@ def render_sitemap_tab() -> None: col1, col2, col3 = st.columns(3) with col1: max_pages = st.number_input("Max pages", min_value=1, value=10000, step=100) - workers = st.number_input("Worker threads", min_value=1, value=8, step=1) + workers = st.number_input( + "Worker threads", + min_value=1, + value=default_workers, + step=1, + help="Defaults to the number of CPUs visible inside the Docker container.", + ) with col2: delay = st.number_input("Delay between requests (seconds)", min_value=0.0, value=0.0, step=0.25) timeout = st.number_input("Request timeout (seconds)", min_value=1.0, value=15.0, step=1.0) @@ -90,7 +98,6 @@ def render_sitemap_tab() -> None: if not start_url.strip(): st.error("Starting URL is required.") else: - sitemap_builder = get_sitemap_module() safe_name = sanitize_job_name(job_name) output_path = SITEMAP_OUTPUT_DIR / f"{safe_name}.csv" captured_stdout = io.StringIO() @@ -137,7 +144,6 @@ def render_sitemap_tab() -> None: st.info("Run a crawl to generate a sitemap CSV.") return - sitemap_builder = get_sitemap_module() summary = result_data["summary"] csv_path = Path(result_data["output_path"]) state_path = Path(result_data["state_path"])