Store ranking into the cache
This commit is contained in:
parent
50543bd22a
commit
ac0c2a8c21
|
@ -9,7 +9,7 @@ import pandas as pd
|
||||||
|
|
||||||
from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
|
from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
|
||||||
from src.pagerank import pagerank, test_pagerank
|
from src.pagerank import pagerank, test_pagerank
|
||||||
from src.util import dataframe_to_urlmap, urlmap_to_dataframe
|
from src.util import add_ratings_to_dataframe, dataframe_to_urlmap, extract_ratings_from_dataframe, urlmap_to_dataframe
|
||||||
from src.visualization import display_top_urls
|
from src.visualization import display_top_urls
|
||||||
|
|
||||||
URL = httpx.URL("https://ailab.fai.utb.cz")
|
URL = httpx.URL("https://ailab.fai.utb.cz")
|
||||||
|
@ -60,10 +60,22 @@ async def main() -> None:
|
||||||
print("Ranking...")
|
print("Ranking...")
|
||||||
start = perf_counter()
|
start = perf_counter()
|
||||||
|
|
||||||
|
if "rating" in url_map_df:
|
||||||
|
print("> Rankings recovered from cache")
|
||||||
|
ranking = extract_ratings_from_dataframe(url_map_df)
|
||||||
|
else:
|
||||||
|
print("> Cache doesn't contain rankings, computing")
|
||||||
|
del url_map_df
|
||||||
ranking = pagerank(url_map)
|
ranking = pagerank(url_map)
|
||||||
|
|
||||||
|
print("> Storing rankings into the cache")
|
||||||
|
url_map_df = urlmap_to_dataframe(url_map)
|
||||||
|
add_ratings_to_dataframe(url_map_df, ranking)
|
||||||
|
url_map_df.to_csv(cache_file)
|
||||||
|
|
||||||
took = perf_counter() - start
|
took = perf_counter() - start
|
||||||
print(f"Took: {round(took, 2)} seconds")
|
print(f"Took: {round(took, 2)} seconds")
|
||||||
|
|
||||||
print("Done")
|
print("Done")
|
||||||
|
|
||||||
display_top_urls(ranking)
|
display_top_urls(ranking)
|
||||||
|
|
32
src/util.py
32
src/util.py
|
@ -26,3 +26,35 @@ def dataframe_to_urlmap(df: pd.DataFrame) -> dict[httpx.URL, set[httpx.URL]]:
|
||||||
urlmap[source_url] = set()
|
urlmap[source_url] = set()
|
||||||
urlmap[source_url].add(target_url)
|
urlmap[source_url].add(target_url)
|
||||||
return urlmap
|
return urlmap
|
||||||
|
|
||||||
|
|
||||||
|
def add_ratings_to_dataframe(df: pd.DataFrame, ratings: dict[httpx.URL, float]) -> None:
    """Attach a 'rating' column to the DataFrame in place.

    :param df: The DataFrame containing 'source' and 'target' columns.
    :param ratings: A dictionary mapping each URL to its PageRank rating.
    :raises ValueError: If the DataFrame has no 'source' column while the
        ratings dict is non-empty (i.e. we were given ratings but nowhere
        to put them).
    """
    if "source" not in df:
        # An empty frame paired with an empty ratings dict is a valid no-op;
        # anything else means the caller handed us inconsistent data.
        if len(ratings) == 0:
            return
        raise ValueError("Got an empty dataframe with non-empty ratings dict")

    # Derive the new 'rating' column from the 'source' column; URLs absent
    # from the ratings dict map to None.
    df["rating"] = df["source"].apply(lambda x: ratings.get(httpx.URL(x), None))
||||||
|
|
||||||
|
def extract_ratings_from_dataframe(df: pd.DataFrame) -> dict[httpx.URL, float]:
    """Extract ratings from a DataFrame.

    :param df: DataFrame containing 'source' and 'rating' columns.
    :return: A dictionary where the key is the URL and the value is the rating.
    """
    # Preserve the degenerate case: an empty frame (even one lacking the
    # expected columns) yields an empty mapping.
    if df.empty:
        return {}

    # Iterate the two columns directly; this avoids the per-row Series
    # construction that df.iterrows() performs, which is slow on large frames.
    return {
        httpx.URL(source): rating
        for source, rating in zip(df["source"], df["rating"])
    }
|
||||||
|
|
Loading…
Reference in a new issue