diff --git a/.gitignore b/.gitignore
index 9adac0d..446a883 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
-# Cache files
-url_map.pickle
+# Cache (scraped dataframes)
+cache/
 
 # Output graphs
 output/
diff --git a/src/__main__.py b/src/__main__.py
index 433be82..c98a603 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -1,18 +1,20 @@
 import asyncio
-import pickle
+import base64
 import re
 from pathlib import Path
 from time import perf_counter
 
 import httpx
+import pandas as pd
 
 from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
 from src.pagerank import pagerank, test_pagerank
+from src.util import dataframe_to_urlmap, urlmap_to_dataframe
 from src.visualization import display_top_urls
 
 URL = httpx.URL("https://ailab.fai.utb.cz")
 ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
-CACHE_RESULTS_DIR = Path.cwd() / "url_map.pickle"
+CACHE_RESULTS_DIR = Path.cwd() / "cache"
 
 
 async def main() -> None:
@@ -21,14 +23,20 @@ async def main() -> None:
     # in the assignment.
     test_pagerank()
 
+    CACHE_RESULTS_DIR.mkdir(exist_ok=True)
+
     print("Scraping...")
     start = perf_counter()
 
-    if CACHE_RESULTS_DIR.exists():
-        print("> Using cached URLs")
-        with CACHE_RESULTS_DIR.open("rb") as fp:
-            url_map = pickle.load(fp)  # noqa: S301
+    # Construct a unique file-name for each URL & ALLOWED_HOSTS_RE combination
+    cache_file = CACHE_RESULTS_DIR / base64.b64encode((str(URL) + str(ALLOWED_HOSTS_RE)).encode("utf-8")).hex()
+
+    if cache_file.exists():
+        print("> Using cached urlmap dataframe")
+        url_map_df = pd.read_csv(cache_file)
+        url_map = dataframe_to_urlmap(url_map_df)
     else:
+        print("> Cache not found, proceeding to scrape")
         async with httpx.AsyncClient() as client:
             url_map = await get_urlmap(
                 client,
@@ -37,9 +45,9 @@ async def main() -> None:
                 filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
                 suppress_exception=standard_urlmap_exception_suppressor,
             )
-
-        with CACHE_RESULTS_DIR.open("wb") as fp:
-            pickle.dump(url_map, fp)
+        print("> Storing results to cache")
+        url_map_df = urlmap_to_dataframe(url_map)
+        url_map_df.to_csv(cache_file)
 
     took = perf_counter() - start
     print(f"Took: {round(took, 2)} seconds")
diff --git a/src/util.py b/src/util.py
new file mode 100644
index 0000000..cc454b6
--- /dev/null
+++ b/src/util.py
@@ -0,0 +1,28 @@
+import httpx
+import pandas as pd
+
+
+def urlmap_to_dataframe(urlmap: dict[httpx.URL, set[httpx.URL]]) -> pd.DataFrame:
+    """Convert a `dict[httpx.URL, set[httpx.URL]]` to a pandas DataFrame.
+
+    :param urlmap: A dictionary where the key is a source URL, and the value is a set of target URLs.
+    :return: A pandas DataFrame with columns 'source' and 'target'.
+    """
+    rows = [{"source": str(source), "target": str(target)} for source, targets in urlmap.items() for target in targets]
+    return pd.DataFrame(rows)
+
+
+def dataframe_to_urlmap(df: pd.DataFrame) -> dict[httpx.URL, set[httpx.URL]]:
+    """Convert a pandas DataFrame with 'source' and 'target' columns back to a `dict[httpx.URL, set[httpx.URL]]`.
+
+    :param df: A DataFrame with columns 'source' and 'target'.
+    :return: A dictionary where the key is a source URL, and the value is a set of target URLs.
+    """
+    urlmap = {}
+    for source, target in zip(df["source"], df["target"], strict=True):
+        source_url = httpx.URL(source)
+        target_url = httpx.URL(target)
+        if source_url not in urlmap:
+            urlmap[source_url] = set()
+        urlmap[source_url].add(target_url)
+    return urlmap
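
For reference, a small sketch of how the cache key is derived; it re-evaluates the same expression used in src/__main__.py so the resulting file name can be eyeballed, and adds nothing beyond what the diff already contains:

```python
import base64
import re

import httpx

URL = httpx.URL("https://ailab.fai.utb.cz")
ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")

# base64-encode the combined (URL + pattern) string, then hex-encode those
# bytes so the key only contains [0-9a-f] and is safe to use as a file name.
cache_name = base64.b64encode((str(URL) + str(ALLOWED_HOSTS_RE)).encode("utf-8")).hex()
print(cache_name)
```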
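
And a minimal round-trip sketch for the new helpers in src/util.py, mirroring what the cache in src/__main__.py does; the example URLs and the temporary CSV path are placeholders for illustration, not part of the change:

```python
from pathlib import Path
from tempfile import TemporaryDirectory

import httpx
import pandas as pd

from src.util import dataframe_to_urlmap, urlmap_to_dataframe

# Hypothetical urlmap: one source page linking to two targets.
url_map = {
    httpx.URL("https://ailab.fai.utb.cz"): {
        httpx.URL("https://www.utb.cz"),
        httpx.URL("https://ailab.fai.utb.cz"),
    },
}

with TemporaryDirectory() as tmp:
    cache_file = Path(tmp) / "urlmap.csv"
    # One row per (source, target) edge, URLs stored as strings.
    urlmap_to_dataframe(url_map).to_csv(cache_file)
    # Reading the CSV back and rebuilding the dict restores the original mapping.
    restored = dataframe_to_urlmap(pd.read_csv(cache_file))

assert restored == url_map
```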