Store ranking into the cache

2024-11-25 11:50:06 +01:00 · 2024-11-25 11:50:06 +01:00 · ac0c2a8c21
parent 50543bd22a
commit ac0c2a8c21
2 changed files with 46 additions and 2 deletions
--- a/src/main.py
+++ b/src/main.py
@ -9,7 +9,7 @@ import pandas as pd

 from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
 from src.pagerank import pagerank, test_pagerank
-from src.util import dataframe_to_urlmap, urlmap_to_dataframe
+from src.util import add_ratings_to_dataframe, dataframe_to_urlmap, extract_ratings_from_dataframe, urlmap_to_dataframe
 from src.visualization import display_top_urls

 URL = httpx.URL("https://ailab.fai.utb.cz")
@ -60,10 +60,22 @@ async def main() -> None:
    print("Ranking...")
    start = perf_counter()

-    ranking = pagerank(url_map)
+    if "rating" in url_map_df:
+        print("> Rankings recovered from cache")
+        ranking = extract_ratings_from_dataframe(url_map_df)
+    else:
+        print("> Cache doesn't contain rankings, computing")
+        del url_map_df
+        ranking = pagerank(url_map)
+
+        print("> Storing rankings into the cache")
+        url_map_df = urlmap_to_dataframe(url_map)
+        add_ratings_to_dataframe(url_map_df, ranking)
+        url_map_df.to_csv(cache_file)

    took = perf_counter() - start
    print(f"Took: {round(took, 2)} seconds")
+
    print("Done")

    display_top_urls(ranking)
--- a/src/util.py
+++ b/src/util.py
@ -26,3 +26,35 @@ def dataframe_to_urlmap(df: pd.DataFrame) -> dict[httpx.URL, set[httpx.URL]]:
            urlmap[source_url] = set()
        urlmap[source_url].add(target_url)
    return urlmap
+
+
+def add_ratings_to_dataframe(df: pd.DataFrame, ratings: dict[httpx.URL, float]) -> None:
+    """Attach a 'rating' column to the DataFrame in place, keyed by each row's 'source' URL.
+
+    :param df: The DataFrame to mutate; ratings are looked up through its 'source' column.
+    :param ratings: A dictionary mapping each URL to its PageRank rating.
+    :raises ValueError: If the DataFrame has no 'source' column while ratings is non-empty.
+    """
+    if "source" not in df:
+        # Without a 'source' column there is nowhere to attach ratings; fine only if there are none.
+        if ratings:
+            raise ValueError("Got an empty dataframe with non-empty ratings dict")
+        return
+
+    # Each 'source' value is parsed into an httpx.URL so it matches the keys of `ratings`;
+    # a URL with no entry gets None from dict.get.
+    df["rating"] = df["source"].apply(lambda raw_url: ratings.get(httpx.URL(raw_url)))
+
+
+def extract_ratings_from_dataframe(df: pd.DataFrame) -> dict[httpx.URL, float]:
+    """Build a new URL -> rating dictionary from the 'source' and 'rating' columns.
+
+    The DataFrame is only read, never modified. When a source appears in several rows,
+    the rating from its last row is the one kept.
+
+    :param df: DataFrame containing 'source' and 'rating' columns.
+    :return: A dictionary where the key is the URL and the value is the rating.
+    """
+    # Walk the two columns directly; iterrows() would build a full Series for every row.
+    pairs = zip(df["source"], df["rating"])
+    return {httpx.URL(source): rating for source, rating in pairs}