Use dataframe-based caching

Peter Vacho 2024-11-25 10:38:22 +01:00
parent 56947296b5
commit fd563ef46c
Signed by: school
GPG key ID: 8CFC3837052871B4
3 changed files with 47 additions and 11 deletions

.gitignore (4 changes)

@@ -1,5 +1,5 @@
-# Cache files
-url_map.pickle
+# Cache (scraped dataframes)
+cache/
 
 # Output graphs
 output/


@@ -1,18 +1,20 @@
 import asyncio
-import pickle
+import base64
 import re
 from pathlib import Path
 from time import perf_counter
 
 import httpx
+import pandas as pd
 
 from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
 from src.pagerank import pagerank, test_pagerank
+from src.util import dataframe_to_urlmap, urlmap_to_dataframe
 from src.visualization import display_top_urls
 
 URL = httpx.URL("https://ailab.fai.utb.cz")
 ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
-CACHE_RESULTS_DIR = Path.cwd() / "url_map.pickle"
+CACHE_RESULTS_DIR = Path.cwd() / "cache"
 
 
 async def main() -> None:
@@ -21,14 +23,20 @@ async def main() -> None:
     # in the assignment.
     test_pagerank()
 
+    CACHE_RESULTS_DIR.mkdir(exist_ok=True)
+
     print("Scraping...")
     start = perf_counter()
 
-    if CACHE_RESULTS_DIR.exists():
-        print("> Using cached URLs")
-        with CACHE_RESULTS_DIR.open("rb") as fp:
-            url_map = pickle.load(fp)  # noqa: S301
+    # Construct a unique file-name for each URL & ALLOWED_HOSTS_RE combination
+    cache_file = CACHE_RESULTS_DIR / base64.b64encode((str(URL) + str(ALLOWED_HOSTS_RE)).encode("utf-8")).hex()
+
+    if cache_file.exists():
+        print("> Using cached urlmap dataframe")
+        url_map_df = pd.read_csv(cache_file)
+        url_map = dataframe_to_urlmap(url_map_df)
     else:
+        print("> Cache not found, proceeding to scrape")
         async with httpx.AsyncClient() as client:
             url_map = await get_urlmap(
                 client,
@@ -37,9 +45,9 @@ async def main() -> None:
                 filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
                 suppress_exception=standard_urlmap_exception_suppressor,
             )
-
-        with CACHE_RESULTS_DIR.open("wb") as fp:
-            pickle.dump(url_map, fp)
+        print("> Storing results to cache")
+        url_map_df = urlmap_to_dataframe(url_map)
+        url_map_df.to_csv(cache_file)
 
     took = perf_counter() - start
     print(f"Took: {round(took, 2)} seconds")

src/util.py (new file, 28 additions)

@@ -0,0 +1,28 @@
+import httpx
+import pandas as pd
+
+
+def urlmap_to_dataframe(urlmap: dict[httpx.URL, set[httpx.URL]]) -> pd.DataFrame:
+    """Convert a `dict[httpx.URL, set[httpx.URL]]` to a pandas DataFrame.
+
+    :param urlmap: A dictionary where the key is a source URL, and the value is a set of target URLs.
+    :return: A pandas DataFrame with columns 'source' and 'target'.
+    """
+    rows = [{"source": str(source), "target": str(target)} for source, targets in urlmap.items() for target in targets]
+    return pd.DataFrame(rows)
+
+
+def dataframe_to_urlmap(df: pd.DataFrame) -> dict[httpx.URL, set[httpx.URL]]:
+    """Convert a pandas DataFrame with 'source' and 'target' columns back to a `dict[httpx.URL, set[httpx.URL]]`.
+
+    :param df: A DataFrame with columns 'source' and 'target'.
+    :return: A dictionary where the key is a source URL, and the value is a set of target URLs.
+    """
+    urlmap = {}
+    for source, target in zip(df["source"], df["target"], strict=True):
+        source_url = httpx.URL(source)
+        target_url = httpx.URL(target)
+        if source_url not in urlmap:
+            urlmap[source_url] = set()
+        urlmap[source_url].add(target_url)
+    return urlmap
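
A quick round-trip of the two new helpers, to illustrate the intended flow. This is a sketch, not part of the commit: the import assumes the repository's src/ package is importable, "cache.csv" is a hypothetical file name, and the equality check relies on httpx.URL supporting hashing and comparison (which the dict[httpx.URL, set[httpx.URL]] type in the diff already implies):

import httpx
import pandas as pd

from src.util import dataframe_to_urlmap, urlmap_to_dataframe  # added in this commit

urlmap = {
    httpx.URL("https://ailab.fai.utb.cz"): {
        httpx.URL("https://www.utb.cz"),
        httpx.URL("https://fai.utb.cz"),
    },
}

# Flatten to a two-column ('source', 'target') frame and persist it as CSV.
df = urlmap_to_dataframe(urlmap)
df.to_csv("cache.csv", index=False)  # index=False keeps only the two columns

# Reading the CSV back and rebuilding the dict yields an equal mapping,
# since httpx.URL -> str -> httpx.URL is a stable round trip.
restored = dataframe_to_urlmap(pd.read_csv("cache.csv"))
assert restored == urlmap

Note that the diff above calls to_csv(cache_file) without index=False, so the cached CSV also carries pandas' default index column; dataframe_to_urlmap ignores it, because it only reads the 'source' and 'target' columns.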