From 16373bc014bb46004b55a0ad577303d2aecdcebb Mon Sep 17 00:00:00 2001
From: Peter Vacho
Date: Sun, 24 Nov 2024 17:23:31 +0100
Subject: [PATCH] Fix depth handling

---
 src/__main__.py     |  3 ++-
 src/link_scraper.py | 18 ++++++++----------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/__main__.py b/src/__main__.py
index bad8a47..ac85514 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -12,11 +12,12 @@ ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
 
 async def main() -> None:
     """Program entrypoint."""
+    print("Scraping...")
     async with httpx.AsyncClient() as client:
         url_map = await get_urlmap(
             client,
             URL,
-            max_depth=5,
+            max_depth=2,
             filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
             suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
         )
diff --git a/src/link_scraper.py b/src/link_scraper.py
index 79a46ea..23315af 100644
--- a/src/link_scraper.py
+++ b/src/link_scraper.py
@@ -74,14 +74,13 @@ async def get_urlmap(
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
     url_map: dict[httpx.URL, set[httpx.URL]] = {}
-    urls: set[httpx.URL] = {start_url}
+    urls: list[tuple[int, httpx.URL]] = [(0, start_url)]  # (depth, url)
     traversed: set[httpx.URL] = set()
-    depth = 0
 
     while len(urls) > 0:
-        url = urls.pop()
+        depth, url = urls.pop()
 
-        if url in traversed:
+        if url in traversed or depth > max_depth:
             continue
         traversed.add(url)
 
@@ -94,11 +93,10 @@ async def get_urlmap(
             page_links = set()
 
         url_map.setdefault(url, set()).update(page_links)
-        # Update the list of URLs to scrape next, filtering unwanted ones.
-        urls.update(filter(filter_condition, page_links))
-
-        if depth > max_depth:
-            break
-        depth += 1
+        # Add the list of URLs to scrape next, filtering unwanted ones.
+        for found_link in page_links:
+            if filter_condition and not filter_condition(found_link):
+                continue
+            urls.append((depth + 1, found_link))
 
     return url_map
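
For reference, below is a minimal, self-contained sketch of the per-URL depth tracking this patch introduces: each queued entry carries its own depth instead of relying on a single global counter. The names (crawl, get_links) and the hard-coded link graph are illustrative stand-ins, not the project's httpx-based get_urlmap API, so the sketch runs without any network access.

    # Illustrative sketch only; crawl/get_links/FAKE_LINKS are not part of the project.
    from collections.abc import Callable

    FAKE_LINKS: dict[str, set[str]] = {
        "a": {"b", "c"},
        "b": {"d"},
        "c": {"a"},
        "d": set(),
    }


    def get_links(url: str) -> set[str]:
        """Stand-in for an HTTP fetch + link extraction step."""
        return FAKE_LINKS.get(url, set())


    def crawl(
        start_url: str,
        max_depth: int,
        filter_condition: Callable[[str], bool] | None = None,
    ) -> dict[str, set[str]]:
        """Traverse links, carrying each URL's own depth in the work queue."""
        url_map: dict[str, set[str]] = {}
        urls: list[tuple[int, str]] = [(0, start_url)]  # (depth, url)
        traversed: set[str] = set()

        while urls:
            depth, url = urls.pop()

            # Skip already-visited URLs and anything past the depth limit.
            if url in traversed or depth > max_depth:
                continue
            traversed.add(url)

            page_links = get_links(url)
            url_map.setdefault(url, set()).update(page_links)

            # Queue found links one level deeper, filtering unwanted ones.
            for found_link in page_links:
                if filter_condition and not filter_condition(found_link):
                    continue
                urls.append((depth + 1, found_link))

        return url_map


    if __name__ == "__main__":
        # With max_depth=1, "d" is discovered via "b" but never fetched itself.
        print(crawl("a", max_depth=1))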