From 422b0d5880b4db0a59eeb25635c3e2f670e59af9 Mon Sep 17 00:00:00 2001
From: Peter Vacho
Date: Sun, 24 Nov 2024 22:09:24 +0100
Subject: [PATCH] Improve comments in pagerank algo

---
 src/pagerank.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/pagerank.py b/src/pagerank.py
index 0741834..3a39037 100644
--- a/src/pagerank.py
+++ b/src/pagerank.py
@@ -15,12 +15,12 @@ def pagerank[T](
     :param tol: Tolerance for the difference between iterations (convergence threshold).
     :return: A dictionary where the key is a URL and the value is its rank.
     """
-    # Get unique pages
+    # Step 1: Identify all unique pages
     pages = list(link_map.keys() | {link for links in link_map.values() for link in links})
     n = len(pages)
     page_indices = {page: i for i, page in enumerate(pages)}  # Map pages to indices
 
-    # Build the adjacency matrix
+    # Step 2: Construct the adjacency matrix (m) that represents the links between pages
     m = np.zeros((n, n))
     for src, targets in link_map.items():
         if not targets:
@@ -31,23 +31,27 @@ def pagerank[T](
         for target in targets:
             m[page_indices[target], page_indices[src]] = 1 / len(targets)
 
-    # Create the E matrix
-    e = np.ones((n, n)) / n
+    # Step 3: Create the "E" matrix, which represents the random jump factor
+    # This represents the idea that a user can randomly jump to any page, with equal probability for each.
+    e = np.ones((n, n)) / n  # A matrix where each entry is 1/n (uniform probability for each page)
 
-    # Create the A matrix
+    # Step 4: Create the "A" matrix that combines the link structure (m) and the random jump factor (e)
+    # This matrix represents the full model combining the link structure and random jumps
     a = beta * m + (1 - beta) * e
 
-    # Initialize ranks (r(0))
-    ranks = np.ones(n) / n
+    # Step 5: Initialize the PageRank scores r(0).
+    ranks = np.ones(n) / n  # Start with a uniform rank for all pages
 
-    # Iteratively calculate PageRank
+    # Step 6: Iterate to update the PageRank scores
     for _ in range(max_iter):
-        new_ranks = a @ ranks  # r(t+1) = A . r(t)
+        new_ranks = a @ ranks  # Calculate new ranks based on the current ranks
         if np.linalg.norm(new_ranks - ranks, ord=1) < tol:  # Convergence check
+            # If the change is smaller than the tolerance, we stop
             break
         ranks = new_ranks
 
-    # Return ranks as {httpx.URL: rank}
+    # Step 7: Return the final PageRank scores
+    # Map the final ranks back to the original page names and return the result
     return {page: ranks[idx] for page, idx in page_indices.items()}