Update UI, and Update Sitemap tool to get proper thread count
Build Docker Image / docker (push) Successful in 6s
Build Docker Image / docker (push) Successful in 6s
This commit is contained in:
@@ -17,6 +17,9 @@ SPEC.loader.exec_module(MODULE)
|
||||
|
||||
|
||||
class SitemapBuilderTests(unittest.TestCase):
|
||||
def test_detect_default_workers_returns_at_least_one(self) -> None:
    """The detected default worker count must never drop below one."""
    detected_workers = MODULE.detect_default_workers()
    self.assertGreaterEqual(detected_workers, 1)
|
||||
|
||||
def test_cleanup_run_files_deletes_generated_artifacts(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
output_path = pathlib.Path(temp_dir) / "crawl.csv"
|
||||
|
||||
@@ -146,5 +146,6 @@ If the registry secrets are not configured, the workflow still performs the buil
|
||||
## Notes
|
||||
|
||||
- Sitemap output files are written under `/data` in Docker.
|
||||
- Sitemap Generator worker threads default to the number of CPUs visible inside the Docker container.
|
||||
- The sitemap crawler can resume previous runs when a matching crawl state file exists.
|
||||
- The importer keeps its existing scraping and WordPress export behavior, but it now runs inside the shared interface instead of as a separate app.
|
||||
|
||||
@@ -63,7 +63,7 @@ python3 sitemap_builder.py https://example.com --max-pages 20000 --delay 0.25 --
|
||||
- `--timeout`: request timeout in seconds
|
||||
- `--include-subdomains`: crawl subdomains of the starting host
|
||||
- `--include-documents`: include document links such as PDF, CSV, DOC, DOCX, XLSX, and similar files
|
||||
- `--workers`: number of worker threads to use. Set `1` to disable multithreading
|
||||
- `--workers`: number of worker threads to use. Set it to `1` to disable multithreading. Default: the number of CPUs visible to the current machine or container
|
||||
- `--save-every`: save progress after every N pages. Default: `25`
|
||||
- `--resume`: resume from an existing state file
|
||||
- `--fresh`: ignore the existing state file and start over
|
||||
|
||||
@@ -28,7 +28,6 @@ DEFAULT_LOG_SUFFIX = ".crawl.log"
|
||||
DEFAULT_MAX_PAGES = 10000
|
||||
DEFAULT_RESUME_PAGE_INCREMENT = 10000
|
||||
DEFAULT_SAVE_EVERY = 25
|
||||
DEFAULT_WORKERS = 8
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
DOCUMENT_EXTENSIONS = {
|
||||
".pdf",
|
||||
@@ -47,6 +46,24 @@ DOCUMENT_EXTENSIONS = {
|
||||
}
|
||||
|
||||
|
||||
def detect_default_workers() -> int:
    """Return the default number of worker threads for this process.

    The size of the CPU affinity set reported by ``os.sched_getaffinity(0)``
    is used when that function exists and the call succeeds with a non-empty
    result. Otherwise the value of ``os.cpu_count()`` is used, with ``1``
    substituted when it reports nothing.

    Returns:
        A worker count that is always at least 1.
    """
    usable_cpus: int | None = None

    # ``sched_getaffinity`` is looked up dynamically because it is not
    # present in ``os`` on every platform.
    affinity_fn = getattr(os, "sched_getaffinity", None)
    if callable(affinity_fn):
        try:
            usable_cpus = len(affinity_fn(0))
        except OSError:
            # Affinity lookup failed; fall back to the plain CPU count below.
            usable_cpus = None

    total_cpus = os.cpu_count() or 1

    # A missing or empty affinity set falls through to the total CPU count.
    chosen = usable_cpus if usable_cpus else total_cpus
    return max(chosen, 1)
|
||||
|
||||
|
||||
DEFAULT_WORKERS = detect_default_workers()
|
||||
|
||||
|
||||
@dataclass
|
||||
class CrawlResult:
|
||||
url: str
|
||||
|
||||
@@ -61,6 +61,8 @@ def render_sitemap_tab() -> None:
|
||||
st.caption("Crawl a site, export a sitemap CSV, and keep resume data inside the container data volume.")
|
||||
|
||||
SITEMAP_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
sitemap_builder = get_sitemap_module()
|
||||
default_workers = sitemap_builder.DEFAULT_WORKERS
|
||||
|
||||
with st.form("sitemap-form"):
|
||||
start_url = st.text_input("Starting URL", placeholder="https://example.com")
|
||||
@@ -73,7 +75,13 @@ def render_sitemap_tab() -> None:
|
||||
col1, col2, col3 = st.columns(3)
|
||||
with col1:
|
||||
max_pages = st.number_input("Max pages", min_value=1, value=10000, step=100)
|
||||
workers = st.number_input("Worker threads", min_value=1, value=8, step=1)
|
||||
workers = st.number_input(
|
||||
"Worker threads",
|
||||
min_value=1,
|
||||
value=default_workers,
|
||||
step=1,
|
||||
help="Defaults to the number of CPUs visible inside the Docker container.",
|
||||
)
|
||||
with col2:
|
||||
delay = st.number_input("Delay between requests (seconds)", min_value=0.0, value=0.0, step=0.25)
|
||||
timeout = st.number_input("Request timeout (seconds)", min_value=1.0, value=15.0, step=1.0)
|
||||
@@ -90,7 +98,6 @@ def render_sitemap_tab() -> None:
|
||||
if not start_url.strip():
|
||||
st.error("Starting URL is required.")
|
||||
else:
|
||||
sitemap_builder = get_sitemap_module()
|
||||
safe_name = sanitize_job_name(job_name)
|
||||
output_path = SITEMAP_OUTPUT_DIR / f"{safe_name}.csv"
|
||||
captured_stdout = io.StringIO()
|
||||
@@ -137,7 +144,6 @@ def render_sitemap_tab() -> None:
|
||||
st.info("Run a crawl to generate a sitemap CSV.")
|
||||
return
|
||||
|
||||
sitemap_builder = get_sitemap_module()
|
||||
summary = result_data["summary"]
|
||||
csv_path = Path(result_data["output_path"])
|
||||
state_path = Path(result_data["state_path"])
|
||||
|
||||
Reference in New Issue
Block a user