Use dataframe-based caching
parent 56947296b5
commit fd563ef46c
.gitignore (vendored), 4 changed lines
@@ -1,5 +1,5 @@
-# Cache files
-url_map.pickle
+# Cache (scraped dataframes)
+cache/
 
 # Output graphs
 output/
@@ -1,18 +1,20 @@
 import asyncio
-import pickle
+import base64
 import re
 from pathlib import Path
 from time import perf_counter
 
 import httpx
+import pandas as pd
 
 from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
 from src.pagerank import pagerank, test_pagerank
+from src.util import dataframe_to_urlmap, urlmap_to_dataframe
 from src.visualization import display_top_urls
 
 URL = httpx.URL("https://ailab.fai.utb.cz")
 ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
-CACHE_RESULTS_DIR = Path.cwd() / "url_map.pickle"
+CACHE_RESULTS_DIR = Path.cwd() / "cache"
 
 
 async def main() -> None:
@@ -21,14 +23,20 @@ async def main() -> None:
     # in the assignment.
     test_pagerank()
 
+    CACHE_RESULTS_DIR.mkdir(exist_ok=True)
+
     print("Scraping...")
     start = perf_counter()
 
-    if CACHE_RESULTS_DIR.exists():
-        print("> Using cached URLs")
-        with CACHE_RESULTS_DIR.open("rb") as fp:
-            url_map = pickle.load(fp)  # noqa: S301
+    # Construct a unique file-name for each URL & ALLOWED_HOSTS_RE combination
+    cache_file = CACHE_RESULTS_DIR / base64.b64encode((str(URL) + str(ALLOWED_HOSTS_RE)).encode("utf-8")).hex()
+
+    if cache_file.exists():
+        print("> Using cached urlmap dataframe")
+        url_map_df = pd.read_csv(cache_file)
+        url_map = dataframe_to_urlmap(url_map_df)
     else:
+        print("> Cache not found, proceeding to scrape")
         async with httpx.AsyncClient() as client:
             url_map = await get_urlmap(
                 client,
@@ -37,9 +45,9 @@ async def main() -> None:
                 filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
                 suppress_exception=standard_urlmap_exception_suppressor,
             )
+        print("> Storing results to cache")
-        with CACHE_RESULTS_DIR.open("wb") as fp:
-            pickle.dump(url_map, fp)
+        url_map_df = urlmap_to_dataframe(url_map)
+        url_map_df.to_csv(cache_file)
 
     took = perf_counter() - start
     print(f"Took: {round(took, 2)} seconds")
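For reference, the cache file name introduced above is deterministic for a given URL and ALLOWED_HOSTS_RE pair: the combined string is base64-encoded and the resulting bytes are hex-encoded, which keeps the name filesystem-safe. A minimal standalone sketch of the same expression (not part of the commit; the values are copied from the diff):

import base64
import re

import httpx

URL = httpx.URL("https://ailab.fai.utb.cz")
ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")

# Same expression as in main(): b64encode() returns bytes, and bytes.hex()
# turns them into a stable, filesystem-safe cache file name.
cache_name = base64.b64encode((str(URL) + str(ALLOWED_HOSTS_RE)).encode("utf-8")).hex()
print(cache_name)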
src/util.py (new file), 28 added lines
@@ -0,0 +1,28 @@
+import httpx
+import pandas as pd
+
+
+def urlmap_to_dataframe(urlmap: dict[httpx.URL, set[httpx.URL]]) -> pd.DataFrame:
+    """Convert a `dict[httpx.URL, set[httpx.URL]]` to a pandas DataFrame.
+
+    :param urlmap: A dictionary where the key is a source URL, and the value is a set of target URLs.
+    :return: A pandas DataFrame with columns 'source' and 'target'.
+    """
+    rows = [{"source": str(source), "target": str(target)} for source, targets in urlmap.items() for target in targets]
+    return pd.DataFrame(rows)
+
+
+def dataframe_to_urlmap(df: pd.DataFrame) -> dict[httpx.URL, set[httpx.URL]]:
+    """Convert a pandas DataFrame with 'source' and 'target' columns back to a `dict[httpx.URL, set[httpx.URL]]`.
+
+    :param df: A DataFrame with columns 'source' and 'target'.
+    :return: A dictionary where the key is a source URL, and the value is a set of target URLs.
+    """
+    urlmap = {}
+    for source, target in zip(df["source"], df["target"], strict=True):
+        source_url = httpx.URL(source)
+        target_url = httpx.URL(target)
+        if source_url not in urlmap:
+            urlmap[source_url] = set()
+        urlmap[source_url].add(target_url)
+    return urlmap
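An illustrative round-trip through these helpers (not part of the commit; the target URLs and the roundtrip.csv path are made-up examples, only the base URL comes from the diff):

import httpx
import pandas as pd

from src.util import dataframe_to_urlmap, urlmap_to_dataframe

# A tiny urlmap: one source page linking to two targets.
urlmap = {
    httpx.URL("https://ailab.fai.utb.cz"): {
        httpx.URL("https://ailab.fai.utb.cz/contact"),
        httpx.URL("https://www.utb.cz"),
    },
}

df = urlmap_to_dataframe(urlmap)    # columns: 'source', 'target'
df.to_csv("roundtrip.csv")          # same call shape as the cache write in main()
restored = dataframe_to_urlmap(pd.read_csv("roundtrip.csv"))
assert restored == urlmap           # httpx.URL supports equality and hashing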