From 9dfac02aabbab545426e8ccd96f73da531dd5e0f Mon Sep 17 00:00:00 2001
From: Peter Vacho
Date: Tue, 19 Nov 2024 20:16:24 +0100
Subject: [PATCH] Allow exception suppressing

---
 src/__main__.py     |  1 +
 src/link_scraper.py | 12 +++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/__main__.py b/src/__main__.py
index bffed95..bad8a47 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -18,6 +18,7 @@ async def main() -> None:
         URL,
         max_depth=5,
         filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
+        suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
     )
 
     pprint(url_map)

diff --git a/src/link_scraper.py b/src/link_scraper.py
index 06aaf87..78369a7 100644
--- a/src/link_scraper.py
+++ b/src/link_scraper.py
@@ -29,6 +29,7 @@ async def get_urlmap(
     start_url: httpx.URL,
     max_depth: int,
     filter_condition: Callable[[httpx.URL], bool] | None = None,
+    suppress_exception: Callable[[Exception, httpx.URL], bool] | None = None,
 ) -> dict[httpx.URL, set[httpx.URL]]:
     """Obtain all of the links from given url, working recursively until given max_depth.
 
@@ -47,6 +48,10 @@
 
         A common use-case is to limit scraping only to URLs with the same hostname as the start
         url (to avoid scraping large sites, like facebook).
+    :param suppress_exception:
+        A function that will determine whether an exception should be suppressed (ignored) or whether
+        it should get raised. By default, all exceptions will be raised and will interrupt the scraping
+        process.
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
 
@@ -63,7 +68,12 @@
         traversed.add(url)
 
         # Include all found links in the result dict
-        page_links = await get_page_links(client, url)
+        try:
+            page_links = await get_page_links(client, url)
+        except Exception as exc:
+            if suppress_exception is None or suppress_exception(exc, url) is False:
+                raise exc from exc
+            page_links = set()
         url_map.setdefault(url, set()).update(page_links)
 
         # Update the list of URLs to scrape next, filtering unwanted ones.
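
A quick usage sketch (not part of the patch): instead of the inline lambda in src/__main__.py, a
caller could pass a small named predicate as suppress_exception, for example to skip HTTP status
errors and timeouts while logging which URL was skipped. The logger setup and the exact set of
suppressed exception types below are illustrative assumptions, not something this patch prescribes.

    import logging

    import httpx

    log = logging.getLogger(__name__)


    def suppress_scrape_errors(exc: Exception, url: httpx.URL) -> bool:
        """Return True to skip this URL, False to let the exception propagate to the caller."""
        if isinstance(exc, (httpx.HTTPStatusError, httpx.TimeoutException)):
            # Log the failed URL so suppressed errors are still visible in the output.
            log.warning("Skipping %s: %s", url, exc)
            return True
        return False

Such a predicate would then be passed as suppress_exception=suppress_scrape_errors in the
get_urlmap(...) call, in place of the lambda shown above.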