Add caching support
This commit is contained in:
parent
299350a90a
commit
bdb9529b77
3
.gitignore
vendored
3
.gitignore
vendored
|
@@ -1,3 +1,6 @@
|
||||||
|
# Cache files
|
||||||
|
url_map.pickle
|
||||||
|
|
||||||
# Output graphs
|
# Output graphs
|
||||||
output/
|
output/
|
||||||
|
|
||||||
|
|
|
@@ -1,5 +1,7 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import pickle
|
||||||
import re
|
import re
|
||||||
|
from pathlib import Path
|
||||||
from time import perf_counter
|
from time import perf_counter
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
@@ -10,6 +12,7 @@ from src.visualization import display_top_urls
|
||||||
|
|
||||||
URL = httpx.URL("https://ailab.fai.utb.cz")
|
URL = httpx.URL("https://ailab.fai.utb.cz")
|
||||||
ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
|
ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
|
||||||
|
CACHE_RESULTS_DIR = Path.cwd() / "url_map.pickle"
|
||||||
|
|
||||||
|
|
||||||
async def main() -> None:
|
async def main() -> None:
|
||||||
|
@@ -21,14 +24,22 @@ async def main() -> None:
|
||||||
print("Scraping...")
|
print("Scraping...")
|
||||||
start = perf_counter()
|
start = perf_counter()
|
||||||
|
|
||||||
async with httpx.AsyncClient() as client:
|
if CACHE_RESULTS_DIR.exists():
|
||||||
url_map = await get_urlmap(
|
print("> Using cached URLs")
|
||||||
client,
|
with CACHE_RESULTS_DIR.open("rb") as fp:
|
||||||
URL,
|
url_map = pickle.load(fp) # noqa: S301
|
||||||
max_depth=2,
|
else:
|
||||||
filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
|
async with httpx.AsyncClient() as client:
|
||||||
suppress_exception=standard_urlmap_exception_suppressor,
|
url_map = await get_urlmap(
|
||||||
)
|
client,
|
||||||
|
URL,
|
||||||
|
max_depth=2,
|
||||||
|
filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
|
||||||
|
suppress_exception=standard_urlmap_exception_suppressor,
|
||||||
|
)
|
||||||
|
|
||||||
|
with CACHE_RESULTS_DIR.open("wb") as fp:
|
||||||
|
pickle.dump(url_map, fp)
|
||||||
|
|
||||||
took = perf_counter() - start
|
took = perf_counter() - start
|
||||||
print(f"Took: {round(took, 2)} seconds")
|
print(f"Took: {round(took, 2)} seconds")
|
||||||
|
|
Loading…
Reference in a new issue