Add caching support
parent 299350a90a
commit bdb9529b77
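
Scrape results are now cached to disk: when url_map.pickle exists in the working directory, the URL map is unpickled from it instead of being scraped again; otherwise the map is built as before and pickled to that file for subsequent runs. The cache file is also added to .gitignore.
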
.gitignore (vendored) | 3 +++
@@ -1,3 +1,6 @@
+# Cache files
+url_map.pickle
+
 # Output graphs
 output/
@@ -1,5 +1,7 @@
 import asyncio
+import pickle
 import re
+from pathlib import Path
 from time import perf_counter
 
 import httpx
|
@ -10,6 +12,7 @@ from src.visualization import display_top_urls
|
|||
|
||||
URL = httpx.URL("https://ailab.fai.utb.cz")
|
||||
ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
|
||||
CACHE_RESULTS_DIR = Path.cwd() / "url_map.pickle"
|
||||
|
||||
|
||||
async def main() -> None:
|
||||
|
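
(Note: despite its _DIR suffix, CACHE_RESULTS_DIR points at the pickle file itself, not at a directory.)
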
@@ -21,14 +24,22 @@ async def main() -> None:
     print("Scraping...")
     start = perf_counter()
 
-    async with httpx.AsyncClient() as client:
-        url_map = await get_urlmap(
-            client,
-            URL,
-            max_depth=2,
-            filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
-            suppress_exception=standard_urlmap_exception_suppressor,
-        )
+    if CACHE_RESULTS_DIR.exists():
+        print("> Using cached URLs")
+        with CACHE_RESULTS_DIR.open("rb") as fp:
+            url_map = pickle.load(fp)  # noqa: S301
+    else:
+        async with httpx.AsyncClient() as client:
+            url_map = await get_urlmap(
+                client,
+                URL,
+                max_depth=2,
+                filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
+                suppress_exception=standard_urlmap_exception_suppressor,
+            )
+
+        with CACHE_RESULTS_DIR.open("wb") as fp:
+            pickle.dump(url_map, fp)
 
     took = perf_counter() - start
     print(f"Took: {round(took, 2)} seconds")
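
Taken together, the change is a plain load-or-compute cache around the scrape. Below is a minimal standalone sketch of that pattern; compute_url_map and load_or_compute are hypothetical names standing in for the real async scrape in main(), and only the pickle handling mirrors the commit.

    import pickle
    from pathlib import Path

    CACHE_FILE = Path.cwd() / "url_map.pickle"  # same location the commit uses


    def compute_url_map() -> dict[str, list[str]]:
        # Hypothetical stand-in for the expensive work; the real code
        # awaits get_urlmap() inside an httpx.AsyncClient instead.
        return {"https://ailab.fai.utb.cz": ["https://www.utb.cz"]}


    def load_or_compute() -> dict[str, list[str]]:
        if CACHE_FILE.exists():
            # Cache hit: reuse the previously pickled map.
            with CACHE_FILE.open("rb") as fp:
                return pickle.load(fp)  # noqa: S301
        # Cache miss: do the work once, then persist the result.
        url_map = compute_url_map()
        with CACHE_FILE.open("wb") as fp:
            pickle.dump(url_map, fp)
        return url_map


    if __name__ == "__main__":
        print(f"{len(load_or_compute())} root URL(s) in map")

The noqa: S301 marker silences the linter rule that flags unpickling data from an untrusted source; here the file is written by the same program, so the risk is accepted. Note that the cache never expires: deleting url_map.pickle is the only way to force a fresh scrape.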
|
Loading…
Reference in a new issue