Use dataframe-based caching
parent 56947296b5
commit fd563ef46c
.gitignore (vendored): 4 changes
@@ -1,5 +1,5 @@
-# Cache files
-url_map.pickle
+# Cache (scraped dataframes)
+cache/
 
 # Output graphs
 output/
@@ -1,18 +1,20 @@
 import asyncio
-import pickle
+import base64
 import re
 from pathlib import Path
 from time import perf_counter
 
 import httpx
+import pandas as pd
 
 from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
 from src.pagerank import pagerank, test_pagerank
+from src.util import dataframe_to_urlmap, urlmap_to_dataframe
 from src.visualization import display_top_urls
 
 URL = httpx.URL("https://ailab.fai.utb.cz")
 ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
-CACHE_RESULTS_DIR = Path.cwd() / "url_map.pickle"
+CACHE_RESULTS_DIR = Path.cwd() / "cache"
 
 
 async def main() -> None:
@@ -21,14 +23,20 @@ async def main() -> None:
     # in the assignment.
     test_pagerank()
 
+    CACHE_RESULTS_DIR.mkdir(exist_ok=True)
+
     print("Scraping...")
     start = perf_counter()
 
-    if CACHE_RESULTS_DIR.exists():
-        print("> Using cached URLs")
-        with CACHE_RESULTS_DIR.open("rb") as fp:
-            url_map = pickle.load(fp)  # noqa: S301
+    # Construct a unique file-name for each URL & ALLOWED_HOSTS_RE combination
+    cache_file = CACHE_RESULTS_DIR / base64.b64encode((str(URL) + str(ALLOWED_HOSTS_RE)).encode("utf-8")).hex()
+
+    if cache_file.exists():
+        print("> Using cached urlmap dataframe")
+        url_map_df = pd.read_csv(cache_file)
+        url_map = dataframe_to_urlmap(url_map_df)
     else:
         print("> Cache not found, proceeding to scrape")
         async with httpx.AsyncClient() as client:
             url_map = await get_urlmap(
                 client,
@@ -37,9 +45,9 @@ async def main() -> None:
                 filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
                 suppress_exception=standard_urlmap_exception_suppressor,
             )
 
-        with CACHE_RESULTS_DIR.open("wb") as fp:
-            pickle.dump(url_map, fp)
+        print("> Storing results to cache")
+        url_map_df = urlmap_to_dataframe(url_map)
+        url_map_df.to_csv(cache_file)
 
     took = perf_counter() - start
     print(f"Took: {round(took, 2)} seconds")
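Note on the cache key: `cache_file` is named by base64-encoding the root URL concatenated with the repr of the host-filter regex, then hex-encoding those base64 bytes, so each URL / ALLOWED_HOSTS_RE combination maps to a stable, filesystem-safe file name. A minimal standalone sketch of the derivation; the local names `url` and `allowed_hosts_re` are just stand-ins for the module constants shown above:

    import base64
    import re

    # Same values as the URL and ALLOWED_HOSTS_RE constants in the diff above.
    url = "https://ailab.fai.utb.cz"
    allowed_hosts_re = re.compile(r"(?:.*\.)?utb\.cz")

    # Mirror of the `cache_file` expression in main(): base64 of "<url><pattern repr>",
    # then hex of those base64 bytes -> a deterministic, ASCII-only file name.
    raw_key = (url + str(allowed_hosts_re)).encode("utf-8")
    file_name = base64.b64encode(raw_key).hex()

    print(file_name)  # starts with "6148523063484d36..." and is identical on every run

Because this is a reversible encoding rather than a hash, the name grows with the length of the inputs; hashing the same bytes (e.g. hashlib.sha256(raw_key).hexdigest()) would give fixed-length names, but the current scheme needs nothing beyond the standard library and can be decoded back when debugging.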
src/util.py (new file): 28 additions
@@ -0,0 +1,28 @@
+import httpx
+import pandas as pd
+
+
+def urlmap_to_dataframe(urlmap: dict[httpx.URL, set[httpx.URL]]) -> pd.DataFrame:
+    """Convert a `dict[httpx.URL, set[httpx.URL]]` to a pandas DataFrame.
+
+    :param urlmap: A dictionary where the key is a source URL, and the value is a set of target URLs.
+    :return: A pandas DataFrame with columns 'source' and 'target'.
+    """
+    rows = [{"source": str(source), "target": str(target)} for source, targets in urlmap.items() for target in targets]
+    return pd.DataFrame(rows)
+
+
+def dataframe_to_urlmap(df: pd.DataFrame) -> dict[httpx.URL, set[httpx.URL]]:
+    """Convert a pandas DataFrame with 'source' and 'target' columns back to a `dict[httpx.URL, set[httpx.URL]]`.
+
+    :param df: A DataFrame with columns 'source' and 'target'.
+    :return: A dictionary where the key is a source URL, and the value is a set of target URLs.
+    """
+    urlmap = {}
+    for source, target in zip(df["source"], df["target"], strict=True):
+        source_url = httpx.URL(source)
+        target_url = httpx.URL(target)
+        if source_url not in urlmap:
+            urlmap[source_url] = set()
+        urlmap[source_url].add(target_url)
+    return urlmap
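A quick round-trip sketch for the two helpers; the URLs below are hypothetical, chosen only to show that each (source, target) edge becomes one row and comes back as the same dict:

    import httpx

    from src.util import dataframe_to_urlmap, urlmap_to_dataframe

    # Hypothetical two-edge url_map, purely for illustration.
    url_map = {
        httpx.URL("https://ailab.fai.utb.cz"): {
            httpx.URL("https://www.utb.cz"),
            httpx.URL("https://fai.utb.cz"),
        },
    }

    df = urlmap_to_dataframe(url_map)   # two rows, columns 'source' and 'target'
    restored = dataframe_to_urlmap(df)  # back to dict[httpx.URL, set[httpx.URL]]

    assert restored == url_map

In main() the DataFrame additionally passes through to_csv/read_csv; to_csv writes an index column by default, which read_csv brings back as an extra 'Unnamed: 0' column, but dataframe_to_urlmap only reads 'source' and 'target', so the round trip still holds (passing index=False to to_csv would avoid the extra column entirely).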