Update UI and update the sitemap tool to detect the proper thread count

2026-04-09 11:27:13 -07:00
parent 0e410a1f6c
commit 287566716f
5 changed files with 32 additions and 5 deletions
@@ -63,7 +63,7 @@ python3 sitemap_builder.py https://example.com --max-pages 20000 --delay 0.25 --
 - `--timeout`: request timeout in seconds
 - `--include-subdomains`: crawl subdomains of the starting host
 - `--include-documents`: include document links such as PDF, CSV, DOC, DOCX, XLSX, and similar files
-- `--workers`: number of worker threads to use. Set `1` to disable multithreading
+- `--workers`: number of worker threads to use. Set `1` to disable multithreading. Default: the number of CPUs available to the current process (its CPU affinity set where the platform exposes one, otherwise the machine's CPU count)
 - `--save-every`: save progress after every N pages. Default: `25`
 - `--resume`: resume from an existing state file
 - `--fresh`: ignore the existing state file and start over
@@ -28,7 +28,6 @@ DEFAULT_LOG_SUFFIX = ".crawl.log"
 DEFAULT_MAX_PAGES = 10000
 DEFAULT_RESUME_PAGE_INCREMENT = 10000
 DEFAULT_SAVE_EVERY = 25
-DEFAULT_WORKERS = 8
 SCRIPT_DIR = Path(__file__).resolve().parent
 DOCUMENT_EXTENSIONS = {
    ".pdf",
@@ -47,6 +46,24 @@ DOCUMENT_EXTENSIONS = {
 }


+def detect_default_workers() -> int:
+    """Return the default worker count: CPUs usable by this process, min 1.
+
+    Prefers the scheduler affinity set; otherwise uses os.cpu_count().
+    """
+    affinity_fn = getattr(os, "sched_getaffinity", None)
+    usable = 0
+    if callable(affinity_fn):
+        try:
+            usable = len(affinity_fn(0))
+        except OSError:
+            usable = 0
+    return max(usable or os.cpu_count() or 1, 1)
+
+
+DEFAULT_WORKERS = detect_default_workers()  # resolved once, at import time
+
+
@dataclass
 class CrawlResult:
    url: str