# Patch summary (reviewer notes placed ahead of the first "diff --git" header, outside every hunk).
# - .gitignore: adds url_map.pickle under a new "# Cache files" heading so the cache is not committed.
# - src/__main__.py: imports pickle and pathlib.Path, and defines CACHE_RESULTS_DIR as
#   Path.cwd() / "url_map.pickle".
# - main(): if that file exists, prints "> Using cached URLs" and loads url_map from it with
#   pickle.load; otherwise it scrapes with get_urlmap using the same arguments as before, and the
#   result is written to the file with pickle.dump.
# NOTE(review): indentation is lost in this view; the pickle.dump presumably sits inside the
#   scraping (else) branch rather than running on every invocation — confirm against the real patch.
# NOTE(review): CACHE_RESULTS_DIR holds the path of a file, not a directory — the name is misleading.
# NOTE(review): the path is built from Path.cwd(), so the cache location follows the directory the
#   program is started from.
# NOTE(review): nothing in this patch expires or invalidates the cache; once written, the file is
#   reused until it is deleted.
# NOTE(review): pickle.load (S301 is suppressed) can execute code from a tampered file; presumably
#   the cache is trusted local output — confirm. A corrupt or truncated file is not handled here.
# NOTE(review): start/took bracket both branches, so the printed "Took" value also measures the
#   cached load and the cache write, not only scraping.
diff --git a/.gitignore b/.gitignore index 73a6444..9adac0d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Cache files +url_map.pickle + # Output graphs output/ diff --git a/src/__main__.py b/src/__main__.py index 1e61621..433be82 100644 --- a/src/__main__.py +++ b/src/__main__.py @@ -1,5 +1,7 @@ import asyncio +import pickle import re +from pathlib import Path from time import perf_counter import httpx @@ -10,6 +12,7 @@ from src.visualization import display_top_urls URL = httpx.URL("https://ailab.fai.utb.cz") ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz") +CACHE_RESULTS_DIR = Path.cwd() / "url_map.pickle" async def main() -> None: @@ -21,14 +24,22 @@ async def main() -> None: print("Scraping...") start = perf_counter() - async with httpx.AsyncClient() as client: - url_map = await get_urlmap( - client, - URL, - max_depth=2, - filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None, - suppress_exception=standard_urlmap_exception_suppressor, - ) + if CACHE_RESULTS_DIR.exists(): + print("> Using cached URLs") + with CACHE_RESULTS_DIR.open("rb") as fp: + url_map = pickle.load(fp) # noqa: S301 + else: + async with httpx.AsyncClient() as client: + url_map = await get_urlmap( + client, + URL, + max_depth=2, + filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None, + suppress_exception=standard_urlmap_exception_suppressor, + ) + + with CACHE_RESULTS_DIR.open("wb") as fp: + pickle.dump(url_map, fp) took = perf_counter() - start print(f"Took: {round(took, 2)} seconds")