Improve comments in pagerank algo
parent 2fdb600c50
commit 422b0d5880
@@ -15,12 +15,12 @@ def pagerank[T](
     :param tol: Tolerance for the difference between iterations (convergence threshold).
     :return: A dictionary where the key is a URL and the value is its rank.
     """
-    # Get unique pages
+    # Step 1: Identify all unique pages
     pages = list(link_map.keys() | {link for links in link_map.values() for link in links})
     n = len(pages)
     page_indices = {page: i for i, page in enumerate(pages)}  # Map pages to indices

-    # Build the adjacency matrix
+    # Step 2: Construct the adjacency matrix (m) that represents the links between pages
     m = np.zeros((n, n))
     for src, targets in link_map.items():
         if not targets:
@@ -31,23 +31,27 @@ def pagerank[T](
         for target in targets:
             m[page_indices[target], page_indices[src]] = 1 / len(targets)

-    # Create the E matrix
-    e = np.ones((n, n)) / n
+    # Step 3: Create the "E" matrix, which represents the random jump factor
+    # This represents the idea that a user can randomly jump to any page, with equal probability for each.
+    e = np.ones((n, n)) / n  # A matrix where each entry is 1/n (uniform probability for each page)

-    # Create the A matrix
+    # Step 4: Create the "A" matrix that combines the link structure (m) and the random jump factor (e)
+    # This matrix represents the full model combining the link structure and random jumps
     a = beta * m + (1 - beta) * e

-    # Initialize ranks (r(0))
-    ranks = np.ones(n) / n
+    # Step 5: Initialize the PageRank scores r(0).
+    ranks = np.ones(n) / n  # Start with a uniform rank for all pages

-    # Iteratively calculate PageRank
+    # Step 6: Iterate to update the PageRank scores
     for _ in range(max_iter):
-        new_ranks = a @ ranks  # r(t+1) = A . r(t)
+        new_ranks = a @ ranks  # Calculate new ranks based on the current ranks
         if np.linalg.norm(new_ranks - ranks, ord=1) < tol:  # Convergence check
+            # If the change is smaller than the tolerance, we stop
             break
         ranks = new_ranks

-    # Return ranks as {httpx.URL: rank}
+    # Step 7: Return the final PageRank scores
+    # Map the final ranks back to the original page names and return the result
     return {page: ranks[idx] for page, idx in page_indices.items()}
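For reference, the loop this commit annotates implements the power-iteration update r(t+1) = A . r(t), with A = beta * M + (1 - beta) * E. Below is a minimal standalone sketch of the same computation on a hypothetical three-page graph; the page names and the beta/max_iter/tol values are illustrative assumptions, not values taken from this repository.

import numpy as np

# Hypothetical graph: a -> {b, c}, b -> {c}, c -> {a}
pages = ["a", "b", "c"]
# Column j holds 1/len(targets) for each page that page j links to
m = np.array([
    [0.0, 0.0, 1.0],
    [0.5, 0.0, 0.0],
    [0.5, 1.0, 0.0],
])
beta, max_iter, tol = 0.85, 100, 1e-6  # illustrative values, not the file's defaults
n = len(pages)
e = np.ones((n, n)) / n        # uniform random-jump matrix
a = beta * m + (1 - beta) * e  # full transition matrix
ranks = np.ones(n) / n         # r(0): start with uniform ranks
for _ in range(max_iter):
    new_ranks = a @ ranks      # r(t+1) = A . r(t)
    if np.linalg.norm(new_ranks - ranks, ord=1) < tol:
        break                  # change smaller than tolerance: converged
    ranks = new_ranks
print(dict(zip(pages, ranks))) # final PageRank scores

Because every column of a sums to 1, each update preserves the total rank mass, so the scores remain a probability distribution over the pages throughout the iteration.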