From 287566716f2d9ab6e5c32ec5aecc0eb4a4fcee16 Mon Sep 17 00:00:00 2001
From: Jeffrey Long <support@wdwalrus.com>
Date: Thu, 9 Apr 2026 11:27:13 -0700
Subject: [PATCH] Update UI and Sitemap tool to detect the proper worker thread count

---
 Page Importer/tests/test_sitemap_builder.py |  3 +++
 README.md                                   |  1 +
 Sitemap Builder/README.md                   |  2 +-
 Sitemap Builder/sitemap_builder.py          | 19 ++++++++++++++++++-
 app.py                                      | 12 +++++++++---
 5 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/Page Importer/tests/test_sitemap_builder.py b/Page Importer/tests/test_sitemap_builder.py
index c0731f7..5d8fa7a 100644
--- a/Page Importer/tests/test_sitemap_builder.py	
+++ b/Page Importer/tests/test_sitemap_builder.py	
@@ -17,6 +17,9 @@ SPEC.loader.exec_module(MODULE)
 
 
 class SitemapBuilderTests(unittest.TestCase):
+    def test_detect_default_workers_returns_at_least_one(self) -> None:
+        self.assertGreaterEqual(MODULE.detect_default_workers(), 1)  # the detected default must never drop below one worker
+
     def test_cleanup_run_files_deletes_generated_artifacts(self) -> None:
         with tempfile.TemporaryDirectory() as temp_dir:
             output_path = pathlib.Path(temp_dir) / "crawl.csv"
diff --git a/README.md b/README.md
index d1b4181..6cad5cf 100644
--- a/README.md
+++ b/README.md
@@ -146,5 +146,6 @@ If the registry secrets are not configured, the workflow still performs the buil
 ## Notes
 
 - Sitemap output files are written under `/data` in Docker.
+- Sitemap Generator worker threads default to the number of CPUs visible inside the Docker container.
 - The sitemap crawler can resume previous runs when a matching crawl state file exists.
 - The importer keeps its existing scraping and WordPress export behavior, but it now runs inside the shared interface instead of as a separate app.
diff --git a/Sitemap Builder/README.md b/Sitemap Builder/README.md
index 4d6f4a4..69a102b 100644
--- a/Sitemap Builder/README.md	
+++ b/Sitemap Builder/README.md	
@@ -63,7 +63,7 @@ python3 sitemap_builder.py https://example.com --max-pages 20000 --delay 0.25 --
 - `--timeout`: request timeout in seconds
 - `--include-subdomains`: crawl subdomains of the starting host
 - `--include-documents`: include document links such as PDF, CSV, DOC, DOCX, XLSX, and similar files
-- `--workers`: number of worker threads to use. Set `1` to disable multithreading
+- `--workers`: number of worker threads to use. Set `1` to disable multithreading. Default: the number of CPUs available to the current process (on the host machine or inside a container)
 - `--save-every`: save progress after every N pages. Default: `25`
 - `--resume`: resume from an existing state file
 - `--fresh`: ignore the existing state file and start over
diff --git a/Sitemap Builder/sitemap_builder.py b/Sitemap Builder/sitemap_builder.py
index 8611897..469440a 100644
--- a/Sitemap Builder/sitemap_builder.py	
+++ b/Sitemap Builder/sitemap_builder.py	
@@ -28,7 +28,6 @@ DEFAULT_LOG_SUFFIX = ".crawl.log"
 DEFAULT_MAX_PAGES = 10000
 DEFAULT_RESUME_PAGE_INCREMENT = 10000
 DEFAULT_SAVE_EVERY = 25
-DEFAULT_WORKERS = 8
 SCRIPT_DIR = Path(__file__).resolve().parent
 DOCUMENT_EXTENSIONS = {
     ".pdf",
@@ -47,6 +46,24 @@ DOCUMENT_EXTENSIONS = {
 }
 
 
+def detect_default_workers() -> int:
+    """Return the default worker count: CPUs this process may run on, at least 1."""
+    affinity_fn = getattr(os, "sched_getaffinity", None)
+    if callable(affinity_fn):
+        try:
+            usable_cpus = len(affinity_fn(0))
+        except OSError:
+            usable_cpus = 0
+        if usable_cpus:
+            return usable_cpus
+
+    # Affinity is unavailable, failed, or empty: use the overall CPU count.
+    return max(os.cpu_count() or 1, 1)
+
+
+DEFAULT_WORKERS = detect_default_workers()  # evaluated once, when this module is imported
+
+
 @dataclass
 class CrawlResult:
     url: str
diff --git a/app.py b/app.py
index 4def6e5..8718782 100644
--- a/app.py
+++ b/app.py
@@ -61,6 +61,8 @@ def render_sitemap_tab() -> None:
     st.caption("Crawl a site, export a sitemap CSV, and keep resume data inside the container data volume.")
 
     SITEMAP_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+    sitemap_builder = get_sitemap_module()
+    default_workers = sitemap_builder.DEFAULT_WORKERS
 
     with st.form("sitemap-form"):
         start_url = st.text_input("Starting URL", placeholder="https://example.com")
@@ -73,7 +75,13 @@ def render_sitemap_tab() -> None:
         col1, col2, col3 = st.columns(3)
         with col1:
             max_pages = st.number_input("Max pages", min_value=1, value=10000, step=100)
-            workers = st.number_input("Worker threads", min_value=1, value=8, step=1)
+            workers = st.number_input(
+                "Worker threads",
+                min_value=1,
+                value=default_workers,
+                step=1,
+                help="Defaults to the number of CPUs visible inside the Docker container.",
+            )
         with col2:
             delay = st.number_input("Delay between requests (seconds)", min_value=0.0, value=0.0, step=0.25)
             timeout = st.number_input("Request timeout (seconds)", min_value=1.0, value=15.0, step=1.0)
@@ -90,7 +98,6 @@ def render_sitemap_tab() -> None:
         if not start_url.strip():
             st.error("Starting URL is required.")
         else:
-            sitemap_builder = get_sitemap_module()
             safe_name = sanitize_job_name(job_name)
             output_path = SITEMAP_OUTPUT_DIR / f"{safe_name}.csv"
             captured_stdout = io.StringIO()
@@ -137,7 +144,6 @@ def render_sitemap_tab() -> None:
         st.info("Run a crawl to generate a sitemap CSV.")
         return
 
-    sitemap_builder = get_sitemap_module()
     summary = result_data["summary"]
     csv_path = Path(result_data["output_path"])
     state_path = Path(result_data["state_path"])