Add logging & better exc handling

Peter Vacho 2024-11-24 18:11:38 +01:00
parent 16373bc014
commit e853747cdd
Signed by: school
GPG key ID: 8CFC3837052871B4
2 changed files with 43 additions and 7 deletions

View file

@@ -4,7 +4,7 @@ from pprint import pprint
 import httpx

-from src.link_scraper import get_urlmap
+from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor


 URL = httpx.URL("https://ailab.fai.utb.cz")
 ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
@@ -17,9 +17,9 @@ async def main() -> None:
     url_map = await get_urlmap(
         client,
         URL,
-        max_depth=2,
+        max_depth=3,
         filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
-        suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
+        suppress_exception=standard_urlmap_exception_suppressor,
     )

     pprint(url_map)
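
Note: the hunks above only show the lines that changed inside main(); the client setup and the program entry point sit outside the diff context. A minimal sketch of how the updated call is typically driven, where the AsyncClient context manager and the asyncio.run wiring are assumptions and not part of this commit:

import asyncio
import re
from pprint import pprint

import httpx

from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor

URL = httpx.URL("https://ailab.fai.utb.cz")
ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")


async def main() -> None:
    # Assumed setup: a single shared client for all requests made while building the url map.
    async with httpx.AsyncClient() as client:
        url_map = await get_urlmap(
            client,
            URL,
            max_depth=3,
            filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
            suppress_exception=standard_urlmap_exception_suppressor,
        )
    pprint(url_map)


if __name__ == "__main__":
    asyncio.run(main())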

View file

@@ -1,8 +1,12 @@
 from collections.abc import Callable
+from queue import Queue
 from urllib.parse import urljoin

 import httpx
 from bs4 import BeautifulSoup
+from rich.console import Console
+from rich.style import StyleType
+from rich.text import Text


 async def get_page_links(
@@ -40,6 +44,31 @@ async def get_page_links(
     return {link for link in links if link.scheme in {"http", "https"}}


+def standard_urlmap_exception_suppressor(exc: Exception, url: httpx.URL) -> bool:
+    """This function can be used as the `suppress_exception` parameter to :func:`get_urlmap`.
+
+    The function attempts to ignore most exceptions that are reasonable to ignore during
+    the scraping process, producing a log message when an ignore occurs.
+    """
+
+    def print_exc(*msg: str | Text | tuple[str, StyleType]) -> None:
+        text = Text.assemble("--> ", *msg, " from ", (str(url), "yellow"))
+        Console().print(text)
+
+    if isinstance(exc, httpx.HTTPStatusError):
+        if exc.response.is_redirect:
+            print_exc("Skipping ", (f"redirect ({exc.response.status_code})", "red"))
+        else:
+            print_exc("Got ", (f"code {exc.response.status_code}", "red"))
+        return True
+
+    if isinstance(exc, httpx.TransportError):
+        print_exc("Got ", (exc.__class__.__qualname__, "red"), ", (", (str(exc), "orange"), ")")
+        return True
+
+    return False
+
+
 async def get_urlmap(
     client: httpx.AsyncClient,
     start_url: httpx.URL,
@@ -48,6 +77,7 @@ async def get_urlmap(
     filter_condition: Callable[[httpx.URL], bool] | None = None,
     suppress_exception: Callable[[Exception, httpx.URL], bool] | None = None,
     follow_redirects: bool = False,
+    show_progress: bool = True,
 ) -> dict[httpx.URL, set[httpx.URL]]:
     """Obtain all of the links from given url, working recursively until given max_depth.
@@ -70,15 +100,18 @@ async def get_urlmap(
         A function that will determine whether an exception should be suppressed (ignored) or whether
         it should get raised. By default, all exceptions will be raised and will interrupt the scraping
         process.
+    :param show_progress:
+        When true, a log for each scraped URL will be shown.
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
     url_map: dict[httpx.URL, set[httpx.URL]] = {}
-    urls: list[tuple[int, httpx.URL]] = [(0, start_url)]  # (depth, url)
+    urls: Queue[tuple[int, httpx.URL]] = Queue()  # (depth, url)
+    urls.put((0, start_url))
     traversed: set[httpx.URL] = set()

-    while len(urls) > 0:
-        depth, url = urls.pop()
+    while urls.qsize() > 0:
+        depth, url = urls.get()
         if url in traversed or depth > max_depth:
             continue
@@ -86,6 +119,9 @@ async def get_urlmap(
         # Include all found links in the result dict
         try:
+            if show_progress:
+                text = Text.assemble("> Scraping from (", (f"{depth=}", "green"), "): ", (str(url), "blue"))
+                Console().print(text)
             page_links = await get_page_links(client, url, follow_redirects=follow_redirects)
         except Exception as exc:
             if suppress_exception is None or suppress_exception(exc, url) is False:
@@ -97,6 +133,6 @@ async def get_urlmap(
         for found_link in page_links:
             if filter_condition and not filter_condition(found_link):
                 continue
-            urls.append((depth + 1, found_link))
+            urls.put((depth + 1, found_link))

     return url_map
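
Note: swapping the pending-URL container from a list popped at its end to a FIFO Queue also changes the traversal order from depth-first to breadth-first, so pages closer to start_url are scraped before deeper ones. For reference, the contract of the new suppressor can be exercised on its own; a rough sketch using synthetic httpx objects, where the constructed request/response and the example URL are purely illustrative:

import httpx

from src.link_scraper import standard_urlmap_exception_suppressor

url = httpx.URL("https://ailab.fai.utb.cz/missing-page")
request = httpx.Request("GET", url)
response = httpx.Response(404, request=request)

# An HTTP status error is logged ("--> Got code 404 from ...") and suppressed,
# so get_urlmap keeps scraping the remaining URLs.
status_error = httpx.HTTPStatusError("Client error", request=request, response=response)
assert standard_urlmap_exception_suppressor(status_error, url) is True

# httpx.TransportError subclasses (DNS failures, timeouts, ...) are suppressed the same way.

# Anything else is reported as not suppressed, so get_urlmap re-raises it.
assert standard_urlmap_exception_suppressor(ValueError("unexpected"), url) is False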