From ac0c2a8c211756356f07e60969adbab632328cb7 Mon Sep 17 00:00:00 2001
From: Peter Vacho
Date: Mon, 25 Nov 2024 11:50:06 +0100
Subject: [PATCH] Store ranking into the cache

---
 src/__main__.py | 16 ++++++++++++++--
 src/util.py     | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/src/__main__.py b/src/__main__.py
index f9283aa..9489590 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -9,7 +9,7 @@ import pandas as pd
 
 from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
 from src.pagerank import pagerank, test_pagerank
-from src.util import dataframe_to_urlmap, urlmap_to_dataframe
+from src.util import add_ratings_to_dataframe, dataframe_to_urlmap, extract_ratings_from_dataframe, urlmap_to_dataframe
 from src.visualization import display_top_urls
 
 URL = httpx.URL("https://ailab.fai.utb.cz")
@@ -60,10 +60,22 @@ async def main() -> None:
 
     print("Ranking...")
     start = perf_counter()
-    ranking = pagerank(url_map)
+    if "rating" in url_map_df:
+        print("> Rankings recovered from cache")
+        ranking = extract_ratings_from_dataframe(url_map_df)
+    else:
+        print("> Cache doesn't contain rankings, computing")
+        del url_map_df
+        ranking = pagerank(url_map)
+
+        print("> Storing rankings into the cache")
+        url_map_df = urlmap_to_dataframe(url_map)
+        add_ratings_to_dataframe(url_map_df, ranking)
+        url_map_df.to_csv(cache_file)
     took = perf_counter() - start
     print(f"Took: {round(took, 2)} seconds")
 
+    print("Done")
     display_top_urls(ranking)
 
 
diff --git a/src/util.py b/src/util.py
index 4fe77b7..b6453a6 100644
--- a/src/util.py
+++ b/src/util.py
@@ -26,3 +26,35 @@ def dataframe_to_urlmap(df: pd.DataFrame) -> dict[httpx.URL, set[httpx.URL]]:
             urlmap[source_url] = set()
         urlmap[source_url].add(target_url)
     return urlmap
+
+
+def add_ratings_to_dataframe(df: pd.DataFrame, ratings: dict[httpx.URL, float]) -> None:
+    """Add a 'rating' column to the DataFrame (mutating it), keyed on each row's 'source' URL.
+
+    :param df: The DataFrame containing 'source' and 'target' columns.
+    :param ratings: A dictionary mapping each URL to its PageRank rating.
+    """
+    # Ratings are looked up per 'source' row; NOTE(review): URLs that never occur as a 'source' are not stored
+    if "source" in df:
+        df["rating"] = df["source"].apply(lambda x: ratings.get(httpx.URL(x), None))
+        return
+
+    # No 'source' column is treated as an empty dataframe, which is only valid with no ratings to add
+    if len(ratings) == 0:
+        return
+
+    raise ValueError("Got an empty dataframe with non-empty ratings dict")
+
+
+def extract_ratings_from_dataframe(df: pd.DataFrame) -> dict[httpx.URL, float]:
+    """Build a new URL -> rating dictionary from the 'source' and 'rating' columns of a DataFrame.
+
+    :param df: DataFrame containing 'source' and 'rating' columns.
+    :return: A dictionary where the key is the URL and the value is the rating.
+    """
+    ratings: dict[httpx.URL, float] = {}
+    for _, row in df.iterrows():
+        url = httpx.URL(row["source"])  # Parse the row's source URL
+        rating = row["rating"]  # Rating stored alongside that source
+        ratings[url] = rating  # Repeated sources overwrite the same key
+    return ratings