This commit is contained in:
@@ -175,6 +175,15 @@ def get_log_path(output_path: Path) -> Path:
|
||||
return output_path.with_suffix(output_path.suffix + DEFAULT_LOG_SUFFIX)
|
||||
|
||||
|
||||
def cleanup_run_files(output_path: Path) -> list[Path]:
    """Delete the crawl output file and its companion state and log files.

    Args:
        output_path: Location of the crawl output file. The state and log
            file locations are derived from it with ``get_state_path`` and
            ``get_log_path``.

    Returns:
        The paths that were actually removed, in the order output, state,
        log. Paths that were not present are omitted.
    """
    output_path = Path(output_path)
    removed_paths: list[Path] = []
    for path in (output_path, get_state_path(output_path), get_log_path(output_path)):
        # Attempt the removal directly rather than testing exists() first:
        # the file can disappear between the check and the unlink, and
        # exists() follows symlinks, so a dangling link would be left behind.
        try:
            path.unlink()
        except FileNotFoundError:
            continue
        removed_paths.append(path)
    return removed_paths
|
||||
|
||||
|
||||
def log_message(log_path: Path, message: str) -> None:
|
||||
log_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
|
||||
@@ -599,6 +608,7 @@ def crawl_site(
|
||||
break
|
||||
|
||||
current = resolve_alias(state.queue.popleft(), state.alias_to_canonical)
|
||||
state.queued.discard(current)
|
||||
if current in state.visited:
|
||||
continue
|
||||
|
||||
@@ -647,6 +657,7 @@ def crawl_site(
|
||||
|
||||
while state.queue and len(pending) < workers and len(state.visited) < max_pages:
|
||||
current = resolve_alias(state.queue.popleft(), state.alias_to_canonical)
|
||||
state.queued.discard(current)
|
||||
if current in state.visited:
|
||||
continue
|
||||
|
||||
@@ -790,6 +801,11 @@ def run_crawl(
|
||||
"The saved crawl state uses a different document setting. "
|
||||
"Keep the same choice or start a fresh crawl."
|
||||
)
|
||||
if state.include_subdomains != include_subdomains:
|
||||
raise ValueError(
|
||||
"The saved crawl state uses a different subdomain setting. "
|
||||
"Keep the same choice or start a fresh crawl."
|
||||
)
|
||||
else:
|
||||
state = initialize_state(normalized_start, include_subdomains, include_documents)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user