Improve comments in pagerank algo

Peter Vacho 2024-11-24 22:09:24 +01:00
parent 2fdb600c50
commit 422b0d5880
Signed by: school
GPG key ID: 8CFC3837052871B4


@@ -15,12 +15,12 @@ def pagerank[T](
     :param tol: Tolerance for the difference between iterations (convergence threshold).
     :return: A dictionary where the key is a URL and the value is its rank.
     """
-    # Get unique pages
+    # Step 1: Identify all unique pages
     pages = list(link_map.keys() | {link for links in link_map.values() for link in links})
     n = len(pages)
     page_indices = {page: i for i, page in enumerate(pages)}  # Map pages to indices
 
-    # Build the adjacency matrix
+    # Step 2: Construct the adjacency matrix (m) that represents the links between pages
     m = np.zeros((n, n))
     for src, targets in link_map.items():
         if not targets:
@@ -31,23 +31,27 @@ def pagerank[T](
         for target in targets:
             m[page_indices[target], page_indices[src]] = 1 / len(targets)
 
-    # Create the E matrix
-    e = np.ones((n, n)) / n
+    # Step 3: Create the "E" matrix, which represents the random jump factor
+    # This represents the idea that a user can randomly jump to any page, with equal probability for each.
+    e = np.ones((n, n)) / n  # A matrix where each entry is 1/n (uniform probability for each page)
 
-    # Create the A matrix
+    # Step 4: Create the "A" matrix that combines the link structure (m) and the random jump factor (e)
+    # This matrix represents the full model combining the link structure and random jumps
     a = beta * m + (1 - beta) * e
 
-    # Initialize ranks (r(0))
-    ranks = np.ones(n) / n
+    # Step 5: Initialize the PageRank scores r(0).
+    ranks = np.ones(n) / n  # Start with a uniform rank for all pages
 
-    # Iteratively calculate PageRank
+    # Step 6: Iterate to update the PageRank scores
     for _ in range(max_iter):
-        new_ranks = a @ ranks  # r(t+1) = A . r(t)
+        new_ranks = a @ ranks  # Calculate new ranks based on the current ranks
         if np.linalg.norm(new_ranks - ranks, ord=1) < tol:  # Convergence check
+            # If the change is smaller than the tolerance, we stop
             break
         ranks = new_ranks
 
-    # Return ranks as {httpx.URL: rank}
+    # Step 7: Return the final PageRank scores
+    # Map the final ranks back to the original page names and return the result
     return {page: ranks[idx] for page, idx in page_indices.items()}