Use dataframe-based caching
parent 56947296b5
commit fd563ef46c
.gitignore (vendored), 4 changed lines
@@ -1,5 +1,5 @@
-# Cache files
-url_map.pickle
+# Cache (scraped dataframes)
+cache/
 
 # Output graphs
 output/
@@ -1,18 +1,20 @@
 import asyncio
-import pickle
+import base64
 import re
 from pathlib import Path
 from time import perf_counter
 
 import httpx
+import pandas as pd
 
 from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
 from src.pagerank import pagerank, test_pagerank
+from src.util import dataframe_to_urlmap, urlmap_to_dataframe
 from src.visualization import display_top_urls
 
 URL = httpx.URL("https://ailab.fai.utb.cz")
 ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
-CACHE_RESULTS_DIR = Path.cwd() / "url_map.pickle"
+CACHE_RESULTS_DIR = Path.cwd() / "cache"
 
 
 async def main() -> None:
@@ -21,14 +23,20 @@ async def main() -> None:
     # in the assignment.
     test_pagerank()
 
+    CACHE_RESULTS_DIR.mkdir(exist_ok=True)
+
     print("Scraping...")
     start = perf_counter()
 
-    if CACHE_RESULTS_DIR.exists():
-        print("> Using cached URLs")
-        with CACHE_RESULTS_DIR.open("rb") as fp:
-            url_map = pickle.load(fp)  # noqa: S301
+    # Construct a unique file-name for each URL & ALLOWED_HOSTS_RE combination
+    cache_file = CACHE_RESULTS_DIR / base64.b64encode((str(URL) + str(ALLOWED_HOSTS_RE)).encode("utf-8")).hex()
+
+    if cache_file.exists():
+        print("> Using cached urlmap dataframe")
+        url_map_df = pd.read_csv(cache_file)
+        url_map = dataframe_to_urlmap(url_map_df)
     else:
+        print("> Cache not found, proceeding to scrape")
         async with httpx.AsyncClient() as client:
             url_map = await get_urlmap(
                 client,
@@ -37,9 +45,9 @@ async def main() -> None:
                 filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
                 suppress_exception=standard_urlmap_exception_suppressor,
             )
+        print("> Storing results to cache")
-        with CACHE_RESULTS_DIR.open("wb") as fp:
-            pickle.dump(url_map, fp)
+        url_map_df = urlmap_to_dataframe(url_map)
+        url_map_df.to_csv(cache_file)
 
     took = perf_counter() - start
     print(f"Took: {round(took, 2)} seconds")
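For reference, the cache file name introduced above is deterministic for a given URL and ALLOWED_HOSTS_RE pair: the combined string is base64-encoded and the resulting bytes are hex-encoded, which keeps the name filesystem-safe. A minimal standalone sketch of the same expression (not part of the commit; the values are copied from the diff):

import base64
import re

import httpx

URL = httpx.URL("https://ailab.fai.utb.cz")
ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")

# Same expression as in main(): b64encode() returns bytes, and bytes.hex()
# turns them into a stable, filesystem-safe cache file name.
cache_name = base64.b64encode((str(URL) + str(ALLOWED_HOSTS_RE)).encode("utf-8")).hex()
print(cache_name)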
src/util.py (new file), 28 added lines
@@ -0,0 +1,28 @@
+import httpx
+import pandas as pd
+
+
+def urlmap_to_dataframe(urlmap: dict[httpx.URL, set[httpx.URL]]) -> pd.DataFrame:
+    """Convert a `dict[httpx.URL, set[httpx.URL]]` to a pandas DataFrame.
+
+    :param urlmap: A dictionary where the key is a source URL, and the value is a set of target URLs.
+    :return: A pandas DataFrame with columns 'source' and 'target'.
+    """
+    rows = [{"source": str(source), "target": str(target)} for source, targets in urlmap.items() for target in targets]
+    return pd.DataFrame(rows)
+
+
+def dataframe_to_urlmap(df: pd.DataFrame) -> dict[httpx.URL, set[httpx.URL]]:
+    """Convert a pandas DataFrame with 'source' and 'target' columns back to a `dict[httpx.URL, set[httpx.URL]]`.
+
+    :param df: A DataFrame with columns 'source' and 'target'.
+    :return: A dictionary where the key is a source URL, and the value is a set of target URLs.
+    """
+    urlmap = {}
+    for source, target in zip(df["source"], df["target"], strict=True):
+        source_url = httpx.URL(source)
+        target_url = httpx.URL(target)
+        if source_url not in urlmap:
+            urlmap[source_url] = set()
+        urlmap[source_url].add(target_url)
+    return urlmap
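An illustrative round-trip through these helpers (not part of the commit; the target URLs and the roundtrip.csv path are made-up examples, only the base URL comes from the diff):

import httpx
import pandas as pd

from src.util import dataframe_to_urlmap, urlmap_to_dataframe

# A tiny urlmap: one source page linking to two targets.
urlmap = {
    httpx.URL("https://ailab.fai.utb.cz"): {
        httpx.URL("https://ailab.fai.utb.cz/contact"),
        httpx.URL("https://www.utb.cz"),
    },
}

df = urlmap_to_dataframe(urlmap)    # columns: 'source', 'target'
df.to_csv("roundtrip.csv")          # same call shape as the cache write in main()
restored = dataframe_to_urlmap(pd.read_csv("roundtrip.csv"))
assert restored == urlmap           # httpx.URL supports equality and hashing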