Improve comments in pagerank algo

Peter Vacho 2024-11-24 22:09:24 +01:00
parent 2fdb600c50
commit 422b0d5880
Signed by: school
GPG key ID: 8CFC3837052871B4


@@ -15,12 +15,12 @@ def pagerank[T](
     :param tol: Tolerance for the difference between iterations (convergence threshold).
     :return: A dictionary where the key is a URL and the value is its rank.
     """
-    # Get unique pages
+    # Step 1: Identify all unique pages
     pages = list(link_map.keys() | {link for links in link_map.values() for link in links})
     n = len(pages)
     page_indices = {page: i for i, page in enumerate(pages)}  # Map pages to indices
 
-    # Build the adjacency matrix
+    # Step 2: Construct the adjacency matrix (m) that represents the links between pages
     m = np.zeros((n, n))
     for src, targets in link_map.items():
         if not targets:
@@ -31,23 +31,27 @@ def pagerank[T](
         for target in targets:
             m[page_indices[target], page_indices[src]] = 1 / len(targets)
 
-    # Create the E matrix
-    e = np.ones((n, n)) / n
+    # Step 3: Create the "E" matrix, which represents the random jump factor
+    # This represents the idea that a user can randomly jump to any page, with equal probability for each.
+    e = np.ones((n, n)) / n  # A matrix where each entry is 1/n (uniform probability for each page)
 
-    # Create the A matrix
+    # Step 4: Create the "A" matrix that combines the link structure (m) and the random jump factor (e)
+    # This matrix represents the full model combining the link structure and random jumps
     a = beta * m + (1 - beta) * e
 
-    # Initialize ranks (r(0))
-    ranks = np.ones(n) / n
+    # Step 5: Initialize the PageRank scores r(0).
+    ranks = np.ones(n) / n  # Start with a uniform rank for all pages
 
-    # Iteratively calculate PageRank
+    # Step 6: Iterate to update the PageRank scores
     for _ in range(max_iter):
-        new_ranks = a @ ranks  # r(t+1) = A . r(t)
+        new_ranks = a @ ranks  # Calculate new ranks based on the current ranks
         if np.linalg.norm(new_ranks - ranks, ord=1) < tol:  # Convergence check
+            # If the change is smaller than the tolerance, we stop
             break
         ranks = new_ranks
 
-    # Return ranks as {httpx.URL: rank}
+    # Step 7: Return the final PageRank scores
+    # Map the final ranks back to the original page names and return the result
     return {page: ranks[idx] for page, idx in page_indices.items()}