Store ranking into the cache
This commit is contained in:
parent 50543bd22a
commit ac0c2a8c21
@@ -9,7 +9,7 @@ import pandas as pd
 from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
 from src.pagerank import pagerank, test_pagerank
-from src.util import dataframe_to_urlmap, urlmap_to_dataframe
+from src.util import add_ratings_to_dataframe, dataframe_to_urlmap, extract_ratings_from_dataframe, urlmap_to_dataframe
 from src.visualization import display_top_urls
 
 URL = httpx.URL("https://ailab.fai.utb.cz")
 
@@ -60,10 +60,22 @@ async def main() -> None:
     print("Ranking...")
     start = perf_counter()
 
+    if "rating" in url_map_df:
+        print("> Rankings recovered from cache")
+        ranking = extract_ratings_from_dataframe(url_map_df)
+    else:
+        print("> Cache doesn't contain rankings, computing")
+        del url_map_df
+        ranking = pagerank(url_map)
+
+        print("> Storing rankings into the cache")
+        url_map_df = urlmap_to_dataframe(url_map)
+        add_ratings_to_dataframe(url_map_df, ranking)
+        url_map_df.to_csv(cache_file)
 
     took = perf_counter() - start
     print(f"Took: {round(took, 2)} seconds")
 
     print("Done")
 
     display_top_urls(ranking)
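
Note on the new branch: the check "rating" in url_map_df tests column labels on a pandas DataFrame, so it asks whether the cached CSV was written before any rankings existed. A minimal sketch of that check (the column values below are made up):

    import pandas as pd

    # Membership on a DataFrame tests column labels, not cell values.
    df = pd.DataFrame({"source": ["https://a.example"], "target": ["https://b.example"]})
    print("rating" in df)  # False -> the cache holds only the url map, rankings must be computed

    df["rating"] = [0.5]   # roughly what add_ratings_to_dataframe does for real data
    print("rating" in df)  # True  -> rankings can be recovered from the cache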

src/util.py (+32 lines)
@@ -26,3 +26,35 @@ def dataframe_to_urlmap(df: pd.DataFrame) -> dict[httpx.URL, set[httpx.URL]]:
             urlmap[source_url] = set()
         urlmap[source_url].add(target_url)
     return urlmap
+
+
+def add_ratings_to_dataframe(df: pd.DataFrame, ratings: dict[httpx.URL, float]) -> None:
+    """Add ratings to the DataFrame (mutating it).
+
+    :param df: The DataFrame containing 'source' and 'target' columns.
+    :param ratings: A dictionary mapping each URL to its PageRank rating.
+    """
+    # Add a new 'rating' column computed from the 'source' column
+    if "source" in df:
+        df["rating"] = df["source"].apply(lambda x: ratings.get(httpx.URL(x), None))
+        return
+
+    # No 'source' column means an empty DataFrame; only valid if ratings is empty too
+    if len(ratings) == 0:
+        return
+
+    raise ValueError("Got an empty dataframe with non-empty ratings dict")
+
+
+def extract_ratings_from_dataframe(df: pd.DataFrame) -> dict[httpx.URL, float]:
+    """Extract ratings from a DataFrame into a dictionary keyed by source URL.
+
+    :param df: DataFrame containing 'source' and 'rating' columns.
+    :return: A dictionary where the key is the URL and the value is the rating.
+    """
+    ratings = {}
+    for _, row in df.iterrows():
+        url = httpx.URL(row["source"])  # Extract the source URL
+        rating = row["rating"]  # Extract the rating
+        ratings[url] = rating  # Add it to the dictionary
+    return ratings
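
For a quick round trip of the two new helpers, a sketch: the page URL beyond the site root and the 0.5 scores are made up, and it assumes urlmap_to_dataframe produces the 'source'/'target' columns these helpers expect.

    import httpx

    from src.util import add_ratings_to_dataframe, extract_ratings_from_dataframe, urlmap_to_dataframe

    # Hypothetical two-page url map with dummy ratings, for illustration only.
    url_map = {
        httpx.URL("https://ailab.fai.utb.cz"): {httpx.URL("https://ailab.fai.utb.cz/contact")},
        httpx.URL("https://ailab.fai.utb.cz/contact"): {httpx.URL("https://ailab.fai.utb.cz")},
    }
    ratings = {url: 0.5 for url in url_map}

    df = urlmap_to_dataframe(url_map)
    add_ratings_to_dataframe(df, ratings)      # adds the 'rating' column in place
    print(extract_ratings_from_dataframe(df))  # expected to round-trip back to the ratings dict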