Add pagerank algorithm
commit e7f0b5ce4e
parent e853747cdd
@@ -1,10 +1,12 @@
 import asyncio
 import re
 from pprint import pprint
+from time import perf_counter

 import httpx

 from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
+from src.pagerank import pagerank, test_pagerank

 URL = httpx.URL("https://ailab.fai.utb.cz")
 ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
@@ -12,17 +14,37 @@ ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")

 async def main() -> None:
     """Program entrypoint."""
+    # Run a unit-test for the page-rank algorithm, according to the sample/test data
+    # in the assignment.
+    test_pagerank()
+
     print("Scraping...")
+    start = perf_counter()

     async with httpx.AsyncClient() as client:
         url_map = await get_urlmap(
             client,
             URL,
-            max_depth=3,
+            max_depth=2,
             filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
             suppress_exception=standard_urlmap_exception_suppressor,
         )

-    pprint(url_map)
+    took = perf_counter() - start
+    print(f"Took: {round(took, 2)} seconds")
+
+    print("Ranking...")
+    start = perf_counter()
+
+    ranking = pagerank(url_map)
+    sorted_urls = list(ranking.keys())
+    sorted_urls.sort(key=lambda url: ranking[url])
+
+    took = perf_counter() - start
+    print(f"Took: {round(took, 2)} seconds")
+    print("Done")
+
+    pprint(sorted_urls)


 if __name__ == "__main__":
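For context: pagerank() consumes the same {source -> set of targets} shape that get_urlmap appears to produce, so the ranking step can be tried without any scraping. A minimal offline sketch; the URLs here are made up, and the dict[httpx.URL, set[httpx.URL]] return shape of get_urlmap is an assumption this diff does not confirm:

import httpx

from src.pagerank import pagerank

# Hypothetical url_map standing in for a get_urlmap result:
# each page maps to the set of pages it links to (assumed shape).
url_map: dict[httpx.URL, set[httpx.URL]] = {
    httpx.URL("https://ailab.fai.utb.cz/"): {httpx.URL("https://ailab.fai.utb.cz/about")},
    httpx.URL("https://ailab.fai.utb.cz/about"): {httpx.URL("https://ailab.fai.utb.cz/")},
}

ranking = pagerank(url_map)
# Ascending sort, matching main(): the highest-ranked URL prints last.
for url in sorted(ranking, key=lambda url: ranking[url]):
    print(url, round(ranking[url], 4))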
src/pagerank.py (new file, 93 lines)
@@ -0,0 +1,93 @@
+import numpy as np
+
+
+def pagerank[T](
+    link_map: dict[T, set[T]],
+    beta: float = 0.85,
+    max_iter: int = 100,
+    tol: float = 1e-6,
+) -> dict[T, float]:
+    """Implementation of the PageRank algorithm based on the given procedure.
+
+    :param link_map: A map of links {source URL -> set of target URLs}.
+    :param beta: Probability of following existing links (damping factor).
+    :param max_iter: Maximum number of iterations for the algorithm.
+    :param tol: Tolerance for the difference between iterations (convergence threshold).
+    :return: A dictionary where the key is a URL and the value is its rank.
+    """
+    # Get unique pages
+    pages = list(link_map.keys() | {link for links in link_map.values() for link in links})
+    n = len(pages)
+    page_indices = {page: i for i, page in enumerate(pages)}  # Map pages to indices
+
+    # Build the transition matrix (column j holds the outgoing-link probabilities of page j)
+    m = np.zeros((n, n))
+    for src, targets in link_map.items():
+        if not targets:
+            # If the page has no outgoing links, assume it links to all pages (including itself)
+            m[:, page_indices[src]] = 1 / n
+        else:
+            # Calculate probabilities for all outgoing links from the given page
+            for target in targets:
+                m[page_indices[target], page_indices[src]] = 1 / len(targets)
+
+    # Create the E matrix (uniform teleportation)
+    e = np.ones((n, n)) / n
+
+    # Create the A matrix
+    a = beta * m + (1 - beta) * e
+
+    # Initialize ranks (r(0))
+    ranks = np.ones(n) / n
+
+    # Iteratively calculate PageRank
+    for _ in range(max_iter):
+        new_ranks = a @ ranks  # r(t+1) = A . r(t)
+        if np.linalg.norm(new_ranks - ranks, ord=1) < tol:  # Convergence check
+            break
+        ranks = new_ranks
+
+    # Return ranks as {page: rank}
+    return {page: ranks[idx] for page, idx in page_indices.items()}
+
+
+def test_pagerank() -> None:
+    """A simple function to test out the functionality of the pagerank algorithm.
+
+    This uses the provided sample numbers given in the task assignment.
+    """
+    # The assignment only had the following as input; this test therefore also
+    # contains the logic to parse it out into a proper Python dict
+    inp = "{1 -> 2, 1 -> 3, 2 -> 4, 3 -> 1, 3 -> 2, 3 -> 4, 4 -> 3}"
+
+    link_map: dict[int, set[int]] = {}
+    for pairing in inp.strip("{}").split(", "):
+        key, val = pairing.split(" -> ")
+
+        key = int(key)
+        val = int(val)
+        link_map.setdefault(key, set()).add(val)
+
+    # The results were shown in an image; they are rewritten here as
+    # (iteration count -> results):
+    results = {
+        0: [0.25, 0.25, 0.25, 0.25],
+        1: [0.10833333, 0.21458333, 0.35625, 0.32083333],
+        2: [0.1384375, 0.18447917, 0.35625, 0.32083333],
+        3: [0.1384375, 0.19727344, 0.36904427, 0.29524479],
+        4: [0.14206254, 0.20089848, 0.34729401, 0.30974497],
+        5: [0.13589997, 0.19627655, 0.3611598, 0.30666368],
+    }
+    # To how many digits the results should be verified
+    check_precision = 5
+
+    for it_count, expected in results.items():
+        # Tolerance of 0 ensures we run for exactly the given number of iterations
+        actual = pagerank(link_map, max_iter=it_count, tol=0)
+        # Convert to a list (matching the format of the expected results)
+        actual = [actual[1], actual[2], actual[3], actual[4]]
+
+        for pos, (expected_num, actual_num) in enumerate(zip(expected, actual, strict=True)):
+            assert round(expected_num, check_precision) == round(  # noqa: S101
+                actual_num, check_precision
+            ), f"Invalid result for it={it_count} at position {pos + 1}: {expected_num} != {actual_num}"
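As a quick sanity check beyond test_pagerank, the function can also be run to convergence on the same sample graph. The update the code implements is r(t+1) = A . r(t) with A = beta * M + (1 - beta) * E, and the iteration-5 row above already suggests pages 3 and 4 end up ranked highest. A minimal sketch using the parsed form of the sample input:

from src.pagerank import pagerank

# The sample graph from the assignment: {1 -> 2, 1 -> 3, 2 -> 4, 3 -> 1, 3 -> 2, 3 -> 4, 4 -> 3}
link_map = {1: {2, 3}, 2: {4}, 3: {1, 2, 4}, 4: {3}}

ranks = pagerank(link_map)  # defaults: beta=0.85, max_iter=100, tol=1e-6
for page, rank in sorted(ranks.items(), key=lambda kv: kv[1], reverse=True):
    print(page, round(rank, 4))

One caveat worth noting: a page that appears only as a link target (never as a key of link_map) gets an all-zero column in m, so unlike an explicit empty-set entry it is not given the uniform 1/n fallback, and its column of A sums to 1 - beta rather than 1, leaking rank mass each iteration. For real scrape data it may be worth ensuring every discovered URL has an entry in link_map before ranking.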