Store ranking into the cache
This commit is contained in:
parent
50543bd22a
commit
ac0c2a8c21
|
@ -9,7 +9,7 @@ import pandas as pd
|
||||||
|
|
||||||
from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
|
from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
|
||||||
from src.pagerank import pagerank, test_pagerank
|
from src.pagerank import pagerank, test_pagerank
|
||||||
from src.util import dataframe_to_urlmap, urlmap_to_dataframe
|
from src.util import add_ratings_to_dataframe, dataframe_to_urlmap, extract_ratings_from_dataframe, urlmap_to_dataframe
|
||||||
from src.visualization import display_top_urls
|
from src.visualization import display_top_urls
|
||||||
|
|
||||||
URL = httpx.URL("https://ailab.fai.utb.cz")
|
URL = httpx.URL("https://ailab.fai.utb.cz")
|
||||||
|
@ -60,10 +60,22 @@ async def main() -> None:
|
||||||
print("Ranking...")
|
print("Ranking...")
|
||||||
start = perf_counter()
|
start = perf_counter()
|
||||||
|
|
||||||
ranking = pagerank(url_map)
|
if "rating" in url_map_df:
|
||||||
|
print("> Rankings recovered from cache")
|
||||||
|
ranking = extract_ratings_from_dataframe(url_map_df)
|
||||||
|
else:
|
||||||
|
print("> Cache doesn't contain rankings, computing")
|
||||||
|
del url_map_df
|
||||||
|
ranking = pagerank(url_map)
|
||||||
|
|
||||||
|
print("> Storing rankings into the cache")
|
||||||
|
url_map_df = urlmap_to_dataframe(url_map)
|
||||||
|
add_ratings_to_dataframe(url_map_df, ranking)
|
||||||
|
url_map_df.to_csv(cache_file)
|
||||||
|
|
||||||
took = perf_counter() - start
|
took = perf_counter() - start
|
||||||
print(f"Took: {round(took, 2)} seconds")
|
print(f"Took: {round(took, 2)} seconds")
|
||||||
|
|
||||||
print("Done")
|
print("Done")
|
||||||
|
|
||||||
display_top_urls(ranking)
|
display_top_urls(ranking)
|
||||||
|
|
32
src/util.py
32
src/util.py
|
@ -26,3 +26,35 @@ def dataframe_to_urlmap(df: pd.DataFrame) -> dict[httpx.URL, set[httpx.URL]]:
|
||||||
urlmap[source_url] = set()
|
urlmap[source_url] = set()
|
||||||
urlmap[source_url].add(target_url)
|
urlmap[source_url].add(target_url)
|
||||||
return urlmap
|
return urlmap
|
||||||
|
|
||||||
|
|
||||||
|
def add_ratings_to_dataframe(df: pd.DataFrame, ratings: dict[httpx.URL, float]) -> None:
    """Add a 'rating' column to the DataFrame (mutating it in place).

    Every row is given the rating of the URL found in its 'source' column.
    Rows whose source URL has no entry in ``ratings`` get the ``None`` that
    the dictionary lookup returns.

    :param df: The DataFrame containing 'source' and 'target' columns.
    :param ratings: A dictionary mapping each URL to its PageRank rating.
    :raises ValueError: If the DataFrame has no 'source' column while
        ``ratings`` is non-empty.
    """
    if "source" in df:
        # Column values are converted to httpx.URL so they match the dict keys.
        df["rating"] = df["source"].apply(lambda source: ratings.get(httpx.URL(source)))
        return

    # No 'source' column (empty dataframe): there is nothing to annotate,
    # which is only consistent with an empty ratings dict.
    if not ratings:
        return

    raise ValueError("Got an empty dataframe with non-empty ratings dict")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_ratings_from_dataframe(df: pd.DataFrame) -> dict[httpx.URL, float]:
    """Build a ratings dictionary from the 'source' and 'rating' columns of a DataFrame.

    A new dictionary is created and returned; the DataFrame is not modified.
    When the same source URL appears in several rows, the rating from the
    last such row is the one kept.

    :param df: DataFrame containing 'source' and 'rating' columns.
    :return: A dictionary where the key is the URL and the value is the rating.
    :raises KeyError: If the 'source' or 'rating' column is missing.
    """
    # Walk the two columns in lockstep instead of materialising a Series per
    # row with iterrows(); only these two values are needed from each row.
    return {
        httpx.URL(source): rating
        for source, rating in zip(df["source"], df["rating"])
    }
|
||||||
|
|
Loading…
Reference in a new issue