Fix depth handling
This commit is contained in:
parent
bd5347c299
commit
16373bc014
@@ -12,11 +12,12 @@ ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
 
 async def main() -> None:
     """Program entrypoint."""
+    print("Scraping...")
     async with httpx.AsyncClient() as client:
         url_map = await get_urlmap(
             client,
             URL,
-            max_depth=5,
+            max_depth=2,
             filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
             suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
         )
@@ -74,14 +74,13 @@ async def get_urlmap(
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
     url_map: dict[httpx.URL, set[httpx.URL]] = {}
-    urls: set[httpx.URL] = {start_url}
+    urls: list[tuple[int, httpx.URL]] = [(0, start_url)]  # (depth, url)
     traversed: set[httpx.URL] = set()
-    depth = 0
 
     while len(urls) > 0:
-        url = urls.pop()
+        depth, url = urls.pop()
 
-        if url in traversed:
+        if url in traversed or depth > max_depth:
             continue
         traversed.add(url)
 
@@ -94,11 +93,10 @@ async def get_urlmap(
         page_links = set()
         url_map.setdefault(url, set()).update(page_links)
 
-        # Update the list of URLs to scrape next, filtering unwanted ones.
-        urls.update(filter(filter_condition, page_links))
-
-        if depth > max_depth:
-            break
-        depth += 1
+        # Add the list of URLs to scrape next, filtering unwanted ones.
+        for found_link in page_links:
+            if filter_condition and not filter_condition(found_link):
+                continue
+            urls.append((depth + 1, found_link))
 
     return url_map
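For context, here is a minimal, self-contained sketch of the traversal as it works after this change. The removed lines show that the old version kept one global depth counter, incremented once per popped URL and used to break out of the loop, so it measured how many pages had been processed rather than how far a link actually was from start_url; carrying a (depth, url) pair in the work list is what fixes that. The fetch_links helper, the simplified get_urlmap signature, and the example URLs below are assumptions for illustration only; the real module also takes a suppress_exception callback and does its link extraction outside this diff.

import asyncio
import re
from collections.abc import Callable

import httpx

# Assumed helper: the real link extraction lives outside this diff.
HREF_RE = re.compile(r'href="([^"#]+)"')


async def fetch_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]:
    """Hypothetical stand-in: fetch a page and resolve every href against its URL."""
    try:
        response = await client.get(url)
        response.raise_for_status()
    except httpx.HTTPError:
        # The real code routes errors through a suppress_exception callback instead.
        return set()
    return {url.join(href) for href in HREF_RE.findall(response.text)}


async def get_urlmap(
    client: httpx.AsyncClient,
    start_url: httpx.URL,
    *,
    max_depth: int,
    filter_condition: Callable[[httpx.URL], bool] | None = None,
) -> dict[httpx.URL, set[httpx.URL]]:
    """Traversal after this commit: every queued URL carries its own depth."""
    url_map: dict[httpx.URL, set[httpx.URL]] = {}
    urls: list[tuple[int, httpx.URL]] = [(0, start_url)]  # (depth, url)
    traversed: set[httpx.URL] = set()

    while len(urls) > 0:
        depth, url = urls.pop()
        if url in traversed or depth > max_depth:
            continue
        traversed.add(url)

        page_links = await fetch_links(client, url)
        url_map.setdefault(url, set()).update(page_links)

        # Queue the discovered links one level deeper, filtering unwanted ones.
        for found_link in page_links:
            if filter_condition and not filter_condition(found_link):
                continue
            urls.append((depth + 1, found_link))

    return url_map


async def main() -> None:
    print("Scraping...")
    async with httpx.AsyncClient() as client:
        # Placeholder start URL and filter; the real module uses its own URL constant
        # and the ALLOWED_HOSTS_RE host check shown in the diff above.
        url_map = await get_urlmap(
            client,
            httpx.URL("https://example.com"),
            max_depth=2,
            filter_condition=lambda url: url.host == "example.com",
        )
    print(f"Scraped {len(url_map)} pages")


if __name__ == "__main__":
    asyncio.run(main())

Note that the depth check happens when a URL is popped rather than when it is queued, matching the diff: links discovered at the maximum depth are still recorded in url_map under their parent page, but are never fetched themselves.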