Use dataframe-based caching
parent 56947296b5
commit fd563ef46c
.gitignore (vendored): 4 changes
@@ -1,5 +1,5 @@
-# Cache files
-url_map.pickle
+# Cache (scraped dataframes)
+cache/
 
 # Output graphs
 output/
@@ -1,18 +1,20 @@
 import asyncio
-import pickle
+import base64
 import re
 from pathlib import Path
 from time import perf_counter
 
 import httpx
+import pandas as pd
 
 from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
 from src.pagerank import pagerank, test_pagerank
+from src.util import dataframe_to_urlmap, urlmap_to_dataframe
 from src.visualization import display_top_urls
 
 URL = httpx.URL("https://ailab.fai.utb.cz")
 ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
-CACHE_RESULTS_DIR = Path.cwd() / "url_map.pickle"
+CACHE_RESULTS_DIR = Path.cwd() / "cache"
 
 
 async def main() -> None:
@@ -21,14 +23,20 @@ async def main() -> None:
     # in the assignment.
     test_pagerank()
 
+    CACHE_RESULTS_DIR.mkdir(exist_ok=True)
+
     print("Scraping...")
     start = perf_counter()
 
-    if CACHE_RESULTS_DIR.exists():
-        print("> Using cached URLs")
-        with CACHE_RESULTS_DIR.open("rb") as fp:
-            url_map = pickle.load(fp)  # noqa: S301
+    # Construct a unique file-name for each URL & ALLOWED_HOSTS_RE combination
+    cache_file = CACHE_RESULTS_DIR / base64.b64encode((str(URL) + str(ALLOWED_HOSTS_RE)).encode("utf-8")).hex()
+
+    if cache_file.exists():
+        print("> Using cached urlmap dataframe")
+        url_map_df = pd.read_csv(cache_file)
+        url_map = dataframe_to_urlmap(url_map_df)
     else:
         print("> Cache not found, proceeding to scrape")
         async with httpx.AsyncClient() as client:
             url_map = await get_urlmap(
                 client,
@@ -37,9 +45,9 @@ async def main() -> None:
                 filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
                 suppress_exception=standard_urlmap_exception_suppressor,
             )
 
-        with CACHE_RESULTS_DIR.open("wb") as fp:
-            pickle.dump(url_map, fp)
+        print("> Storing results to cache")
+        url_map_df = urlmap_to_dataframe(url_map)
+        url_map_df.to_csv(cache_file)
 
     took = perf_counter() - start
     print(f"Took: {round(took, 2)} seconds")
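Note on the cache key: `cache_file` is named by base64-encoding the root URL concatenated with the repr of the host-filter regex, then hex-encoding those base64 bytes, so each URL / ALLOWED_HOSTS_RE combination maps to a stable, filesystem-safe file name. A minimal standalone sketch of the derivation; the local names `url` and `allowed_hosts_re` are just stand-ins for the module constants shown above:

    import base64
    import re

    # Same values as the URL and ALLOWED_HOSTS_RE constants in the diff above.
    url = "https://ailab.fai.utb.cz"
    allowed_hosts_re = re.compile(r"(?:.*\.)?utb\.cz")

    # Mirror of the `cache_file` expression in main(): base64 of "<url><pattern repr>",
    # then hex of those base64 bytes -> a deterministic, ASCII-only file name.
    raw_key = (url + str(allowed_hosts_re)).encode("utf-8")
    file_name = base64.b64encode(raw_key).hex()

    print(file_name)  # starts with "6148523063484d36..." and is identical on every run

Because this is a reversible encoding rather than a hash, the name grows with the length of the inputs; hashing the same bytes (e.g. hashlib.sha256(raw_key).hexdigest()) would give fixed-length names, but the current scheme needs nothing beyond the standard library and can be decoded back when debugging.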
src/util.py (new file): 28 additions
@@ -0,0 +1,28 @@
+import httpx
+import pandas as pd
+
+
+def urlmap_to_dataframe(urlmap: dict[httpx.URL, set[httpx.URL]]) -> pd.DataFrame:
+    """Convert a `dict[httpx.URL, set[httpx.URL]]` to a pandas DataFrame.
+
+    :param urlmap: A dictionary where the key is a source URL, and the value is a set of target URLs.
+    :return: A pandas DataFrame with columns 'source' and 'target'.
+    """
+    rows = [{"source": str(source), "target": str(target)} for source, targets in urlmap.items() for target in targets]
+    return pd.DataFrame(rows)
+
+
+def dataframe_to_urlmap(df: pd.DataFrame) -> dict[httpx.URL, set[httpx.URL]]:
+    """Convert a pandas DataFrame with 'source' and 'target' columns back to a `dict[httpx.URL, set[httpx.URL]]`.
+
+    :param df: A DataFrame with columns 'source' and 'target'.
+    :return: A dictionary where the key is a source URL, and the value is a set of target URLs.
+    """
+    urlmap = {}
+    for source, target in zip(df["source"], df["target"], strict=True):
+        source_url = httpx.URL(source)
+        target_url = httpx.URL(target)
+        if source_url not in urlmap:
+            urlmap[source_url] = set()
+        urlmap[source_url].add(target_url)
+    return urlmap
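A quick round-trip sketch for the two helpers; the URLs below are hypothetical, chosen only to show that each (source, target) edge becomes one row and comes back as the same dict:

    import httpx

    from src.util import dataframe_to_urlmap, urlmap_to_dataframe

    # Hypothetical two-edge url_map, purely for illustration.
    url_map = {
        httpx.URL("https://ailab.fai.utb.cz"): {
            httpx.URL("https://www.utb.cz"),
            httpx.URL("https://fai.utb.cz"),
        },
    }

    df = urlmap_to_dataframe(url_map)   # two rows, columns 'source' and 'target'
    restored = dataframe_to_urlmap(df)  # back to dict[httpx.URL, set[httpx.URL]]

    assert restored == url_map

In main() the DataFrame additionally passes through to_csv/read_csv; to_csv writes an index column by default, which read_csv brings back as an extra 'Unnamed: 0' column, but dataframe_to_urlmap only reads 'source' and 'target', so the round trip still holds (passing index=False to to_csv would avoid the extra column entirely).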