From fd563ef46c89dd67dd28cb3a7f6d1240820fd44a Mon Sep 17 00:00:00 2001
From: Peter Vacho
Date: Mon, 25 Nov 2024 10:38:22 +0100
Subject: [PATCH] Use dataframe based caching

---
 .gitignore      |  4 ++--
 src/__main__.py | 26 +++++++++++++++++---------
 src/util.py     | 28 ++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+), 11 deletions(-)
 create mode 100644 src/util.py

diff --git a/.gitignore b/.gitignore
index 9adac0d..446a883 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
-# Cache files
-url_map.pickle
+# Cache (scraped dataframes)
+cache/
 
 # Output graphs
 output/
diff --git a/src/__main__.py b/src/__main__.py
index 433be82..c98a603 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -1,18 +1,20 @@
 import asyncio
-import pickle
+import base64
 import re
 from pathlib import Path
 from time import perf_counter
 
 import httpx
+import pandas as pd
 
 from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
 from src.pagerank import pagerank, test_pagerank
+from src.util import dataframe_to_urlmap, urlmap_to_dataframe
 from src.visualization import display_top_urls
 
 URL = httpx.URL("https://ailab.fai.utb.cz")
 ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
-CACHE_RESULTS_DIR = Path.cwd() / "url_map.pickle"
+CACHE_RESULTS_DIR = Path.cwd() / "cache"
 
 
 async def main() -> None:
@@ -21,14 +23,20 @@ async def main() -> None:
     # in the assignment.
     test_pagerank()
 
+    CACHE_RESULTS_DIR.mkdir(exist_ok=True)
+
     print("Scraping...")
     start = perf_counter()
-    if CACHE_RESULTS_DIR.exists():
-        print("> Using cached URLs")
-        with CACHE_RESULTS_DIR.open("rb") as fp:
-            url_map = pickle.load(fp)  # noqa: S301
+    # Construct a unique file-name for each URL & ALLOWED_HOSTS_RE combination
+    cache_file = CACHE_RESULTS_DIR / base64.b64encode((str(URL) + str(ALLOWED_HOSTS_RE)).encode("utf-8")).hex()
+
+    if cache_file.exists():
+        print("> Using cached urlmap dataframe")
+        url_map_df = pd.read_csv(cache_file)
+        url_map = dataframe_to_urlmap(url_map_df)
     else:
+        print("> Cache not found, proceeding to scrape")
         async with httpx.AsyncClient() as client:
             url_map = await get_urlmap(
                 client,
@@ -37,9 +45,9 @@
                 filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
                 suppress_exception=standard_urlmap_exception_suppressor,
             )
-
-        with CACHE_RESULTS_DIR.open("wb") as fp:
-            pickle.dump(url_map, fp)
+        print("> Storing results to cache")
+        url_map_df = urlmap_to_dataframe(url_map)
+        url_map_df.to_csv(cache_file)
 
     took = perf_counter() - start
     print(f"Took: {round(took, 2)} seconds")
diff --git a/src/util.py b/src/util.py
new file mode 100644
index 0000000..cc454b6
--- /dev/null
+++ b/src/util.py
@@ -0,0 +1,28 @@
+import httpx
+import pandas as pd
+
+
+def urlmap_to_dataframe(urlmap: dict[httpx.URL, set[httpx.URL]]) -> pd.DataFrame:
+    """Convert a `dict[httpx.URL, set[httpx.URL]]` to a pandas DataFrame.
+
+    :param urlmap: A dictionary where the key is a source URL, and the value is a set of target URLs.
+    :return: A pandas DataFrame with columns 'source' and 'target'.
+    """
+    rows = [{"source": str(source), "target": str(target)} for source, targets in urlmap.items() for target in targets]
+    return pd.DataFrame(rows)
+
+
+def dataframe_to_urlmap(df: pd.DataFrame) -> dict[httpx.URL, set[httpx.URL]]:
+    """Convert a pandas DataFrame with 'source' and 'target' columns back to a `dict[httpx.URL, set[httpx.URL]]`.
+
+    :param df: A DataFrame with columns 'source' and 'target'.
+    :return: A dictionary where the key is a source URL, and the value is a set of target URLs.
+    """
+    urlmap = {}
+    for source, target in zip(df["source"], df["target"], strict=True):
+        source_url = httpx.URL(source)
+        target_url = httpx.URL(target)
+        if source_url not in urlmap:
+            urlmap[source_url] = set()
+        urlmap[source_url].add(target_url)
+    return urlmap