Allow exception suppression

Peter Vacho 2024-11-19 20:16:24 +01:00
parent 7f9798ed28
commit 9dfac02aab
Signed by: school
GPG key ID: 8CFC3837052871B4
2 changed files with 12 additions and 1 deletion

View file

@@ -18,6 +18,7 @@ async def main() -> None:
         URL,
         max_depth=5,
         filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
+        suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
     )
     pprint(url_map)
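
For callers that want visibility into what gets skipped, the inline lambda could be swapped for a named predicate. A minimal sketch (the log_and_suppress helper and logging setup are illustrative, not part of this commit):

import logging

import httpx

log = logging.getLogger(__name__)


def log_and_suppress(exc: Exception, url: httpx.URL) -> bool:
    """Suppress HTTP status errors like the lambda above, but log the skipped URL."""
    if isinstance(exc, httpx.HTTPStatusError):
        log.warning("Skipping %s: %s", url, exc)
        return True
    return False

It would be passed the same way: suppress_exception=log_and_suppress.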

View file

@@ -29,6 +29,7 @@ async def get_urlmap(
     start_url: httpx.URL,
     max_depth: int,
     filter_condition: Callable[[httpx.URL], bool] | None = None,
+    suppress_exception: Callable[[Exception, httpx.URL], bool] | None = None,
 ) -> dict[httpx.URL, set[httpx.URL]]:
     """Obtain all of the links from given url, working recursively until given max_depth.
@@ -47,6 +48,10 @@ async def get_urlmap(
         A common use-case is to limit scraping only to URLs with the same hostname as the start url
         (to avoid scraping large sites, like facebook).
+    :param suppress_exception:
+        A function that will determine whether an exception should be suppressed (ignored) or whether
+        it should get raised. By default, all exceptions will be raised and will interrupt the scraping
+        process.
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
@@ -63,7 +68,12 @@ async def get_urlmap(
             traversed.add(url)

             # Include all found links in the result dict
-            page_links = await get_page_links(client, url)
+            try:
+                page_links = await get_page_links(client, url)
+            except Exception as exc:
+                if suppress_exception is None or suppress_exception(exc, url) is False:
+                    raise exc from exc
+                page_links = set()
             url_map.setdefault(url, set()).update(page_links)

             # Update the list of URLs to scrape next, filtering unwanted ones.
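
To exercise the new control flow in isolation, the standalone sketch below mirrors the try/except above against a fake fetcher that always yields a 404 (suppress_404 and fake_get_page_links are hypothetical stand-ins, not project code):

import httpx


def suppress_404(exc: Exception, url: httpx.URL) -> bool:
    return isinstance(exc, httpx.HTTPStatusError) and exc.response.status_code == 404


def fake_get_page_links(url: httpx.URL) -> set[httpx.URL]:
    # Stand-in for get_page_links that always "receives" a 404 response.
    response = httpx.Response(404, request=httpx.Request("GET", url))
    response.raise_for_status()  # raises httpx.HTTPStatusError
    return set()


url = httpx.URL("https://example.com/missing")
try:
    page_links = fake_get_page_links(url)
except Exception as exc:
    if suppress_404(exc, url) is False:
        raise
    page_links = set()  # suppressed: treat the page as having no outgoing links

print(page_links)  # prints set() instead of raising, because the predicate returned True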