From bdb9529b773b3e4105094f02e7efb3e141254f9f Mon Sep 17 00:00:00 2001
From: Peter Vacho
Date: Sun, 24 Nov 2024 19:53:08 +0100
Subject: [PATCH] Add caching support

---
 .gitignore      |  3 +++
 src/__main__.py | 27 +++++++++++++++++++--------
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/.gitignore b/.gitignore
index 73a6444..9adac0d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# Cache files
+url_map.pickle
+
 # Output graphs
 output/
 
diff --git a/src/__main__.py b/src/__main__.py
index 1e61621..433be82 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -1,5 +1,7 @@
 import asyncio
+import pickle
 import re
+from pathlib import Path
 from time import perf_counter
 
 import httpx
@@ -10,6 +12,7 @@ from src.visualization import display_top_urls
 
 URL = httpx.URL("https://ailab.fai.utb.cz")
 ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
+CACHE_RESULTS_DIR = Path.cwd() / "url_map.pickle"
 
 
 async def main() -> None:
@@ -21,14 +24,22 @@ async def main() -> None:
     print("Scraping...")
     start = perf_counter()
 
-    async with httpx.AsyncClient() as client:
-        url_map = await get_urlmap(
-            client,
-            URL,
-            max_depth=2,
-            filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
-            suppress_exception=standard_urlmap_exception_suppressor,
-        )
+    if CACHE_RESULTS_DIR.exists():
+        print("> Using cached URLs")
+        with CACHE_RESULTS_DIR.open("rb") as fp:
+            url_map = pickle.load(fp)  # noqa: S301
+    else:
+        async with httpx.AsyncClient() as client:
+            url_map = await get_urlmap(
+                client,
+                URL,
+                max_depth=2,
+                filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
+                suppress_exception=standard_urlmap_exception_suppressor,
+            )
+
+        with CACHE_RESULTS_DIR.open("wb") as fp:
+            pickle.dump(url_map, fp)
 
     took = perf_counter() - start
     print(f"Took: {round(took, 2)} seconds")
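
For reference, the change above follows a load-or-compute caching pattern: if the pickle file exists, deserialize it; otherwise do the expensive work, then pickle the result for next time. The `# noqa: S301` suppresses the lint warning that unpickling untrusted data is unsafe, which is acceptable here only because the cache file is produced locally by this same script. A minimal standalone sketch of the pattern follows; `load_or_compute` and the demo values are illustrative names, not part of the patch:

    import pickle
    from pathlib import Path
    from typing import Callable, TypeVar

    T = TypeVar("T")


    def load_or_compute(cache_file: Path, compute: Callable[[], T]) -> T:
        """Return the pickled value from cache_file, or compute and cache it."""
        if cache_file.exists():
            with cache_file.open("rb") as fp:
                # Only safe when the cache file comes from a trusted local
                # source, hence the `noqa: S301` in the patch above.
                return pickle.load(fp)  # noqa: S301
        result = compute()
        with cache_file.open("wb") as fp:
            pickle.dump(result, fp)
        return result


    if __name__ == "__main__":
        # First run computes and writes squares.pickle; later runs load it.
        cached = load_or_compute(
            Path("squares.pickle"),
            lambda: {n: n * n for n in range(10)},
        )
        print(cached)

Note that, as in the patch, the cache never expires on its own: deleting the pickle file (here `url_map.pickle`, which the patch also adds to `.gitignore`) is what forces a fresh scrape.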