Add caching support

This commit is contained in:
Peter Vacho 2024-11-24 19:53:08 +01:00
parent 299350a90a
commit bdb9529b77
Signed by: school
GPG key ID: 8CFC3837052871B4
2 changed files with 22 additions and 8 deletions

3
.gitignore vendored
View file

@@ -1,3 +1,6 @@
# Cache files
url_map.pickle
# Output graphs # Output graphs
output/ output/

View file

@@ -1,5 +1,7 @@
import asyncio import asyncio
import pickle
import re import re
from pathlib import Path
from time import perf_counter from time import perf_counter
import httpx import httpx
@@ -10,6 +12,7 @@ from src.visualization import display_top_urls
URL = httpx.URL("https://ailab.fai.utb.cz") URL = httpx.URL("https://ailab.fai.utb.cz")
ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz") ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
CACHE_RESULTS_DIR = Path.cwd() / "url_map.pickle"
async def main() -> None: async def main() -> None:
@@ -21,14 +24,22 @@ async def main() -> None:
print("Scraping...") print("Scraping...")
start = perf_counter() start = perf_counter()
async with httpx.AsyncClient() as client: if CACHE_RESULTS_DIR.exists():
url_map = await get_urlmap( print("> Using cached URLs")
client, with CACHE_RESULTS_DIR.open("rb") as fp:
URL, url_map = pickle.load(fp) # noqa: S301
max_depth=2, else:
filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None, async with httpx.AsyncClient() as client:
suppress_exception=standard_urlmap_exception_suppressor, url_map = await get_urlmap(
) client,
URL,
max_depth=2,
filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
suppress_exception=standard_urlmap_exception_suppressor,
)
with CACHE_RESULTS_DIR.open("wb") as fp:
pickle.dump(url_map, fp)
took = perf_counter() - start took = perf_counter() - start
print(f"Took: {round(took, 2)} seconds") print(f"Took: {round(took, 2)} seconds")