Store ranking into the cache

This commit is contained in:
Peter Vacho 2024-11-25 11:50:06 +01:00
parent 50543bd22a
commit ac0c2a8c21
Signed by: school
GPG key ID: 8CFC3837052871B4
2 changed files with 46 additions and 2 deletions

View file

@ -9,7 +9,7 @@ import pandas as pd
from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
from src.pagerank import pagerank, test_pagerank
from src.util import dataframe_to_urlmap, urlmap_to_dataframe
from src.util import add_ratings_to_dataframe, dataframe_to_urlmap, extract_ratings_from_dataframe, urlmap_to_dataframe
from src.visualization import display_top_urls
URL = httpx.URL("https://ailab.fai.utb.cz")
@ -60,10 +60,22 @@ async def main() -> None:
print("Ranking...")
start = perf_counter()
ranking = pagerank(url_map)
if "rating" in url_map_df:
print("> Rankings recovered from cache")
ranking = extract_ratings_from_dataframe(url_map_df)
else:
print("> Cache doesn't contain rankings, computing")
del url_map_df
ranking = pagerank(url_map)
print("> Storing rankings into the cache")
url_map_df = urlmap_to_dataframe(url_map)
add_ratings_to_dataframe(url_map_df, ranking)
url_map_df.to_csv(cache_file)
took = perf_counter() - start
print(f"Took: {round(took, 2)} seconds")
print("Done")
display_top_urls(ranking)

View file

@ -26,3 +26,35 @@ def dataframe_to_urlmap(df: pd.DataFrame) -> dict[httpx.URL, set[httpx.URL]]:
urlmap[source_url] = set()
urlmap[source_url].add(target_url)
return urlmap
def add_ratings_to_dataframe(df: pd.DataFrame, ratings: dict[httpx.URL, float]) -> None:
    """Attach a 'rating' column to the DataFrame, modifying it in place.

    Every row receives the rating looked up for the URL in its 'source' column;
    a source URL that has no entry in ``ratings`` yields ``None`` for that row.

    :param df: The DataFrame to mutate; ratings are keyed on its 'source' column.
    :param ratings: A dictionary mapping each URL to its PageRank rating.
    :raises ValueError: If ``df`` has no 'source' column while ``ratings`` is non-empty.
    """
    if "source" not in df:
        # Without a 'source' column there is nothing to attach ratings to, which is
        # only acceptable when there are no ratings to store either.
        if ratings:
            raise ValueError("Got an empty dataframe with non-empty ratings dict")
        return

    def lookup_rating(source):
        # The column holds URLs as plain values, while the dict is keyed by httpx.URL.
        return ratings.get(httpx.URL(source))

    df["rating"] = df["source"].apply(lookup_rating)
def extract_ratings_from_dataframe(df: pd.DataFrame) -> dict[httpx.URL, float]:
    """Build a ratings dictionary from the 'source' and 'rating' columns of a DataFrame.

    A new dictionary is returned; the DataFrame is only read. When the same source
    URL appears in several rows, the rating from the last such row is kept.

    :param df: DataFrame containing 'source' and 'rating' columns.
    :return: A dictionary where the key is the URL and the value is the rating.
    :raises KeyError: If ``df`` has rows but lacks a 'source' or 'rating' column.
    """
    # A frame without rows yields no ratings, regardless of which columns it has.
    if len(df.index) == 0:
        return {}

    # Walk the two columns in lockstep instead of materialising a Series per row.
    return {httpx.URL(source): rating for source, rating in zip(df["source"], df["rating"])}