From 9dfac02aabbab545426e8ccd96f73da531dd5e0f Mon Sep 17 00:00:00 2001
From: Peter Vacho
Date: Tue, 19 Nov 2024 20:16:24 +0100
Subject: [PATCH] Allow exception suppressing

---
 src/__main__.py     |  1 +
 src/link_scraper.py | 12 +++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/__main__.py b/src/__main__.py
index bffed95..bad8a47 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -18,6 +18,7 @@ async def main() -> None:
         URL,
         max_depth=5,
         filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
+        suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
     )
 
     pprint(url_map)

diff --git a/src/link_scraper.py b/src/link_scraper.py
index 06aaf87..78369a7 100644
--- a/src/link_scraper.py
+++ b/src/link_scraper.py
@@ -29,6 +29,7 @@ async def get_urlmap(
     start_url: httpx.URL,
     max_depth: int,
     filter_condition: Callable[[httpx.URL], bool] | None = None,
+    suppress_exception: Callable[[Exception, httpx.URL], bool] | None = None,
 ) -> dict[httpx.URL, set[httpx.URL]]:
     """Obtain all of the links from given url, working recursively until given max_depth.
 
@@ -47,6 +48,10 @@
 
         A common use-case is to limit scraping only to URLs with the same hostname as the start
         url (to avoid scraping large sites, like facebook).
+    :param suppress_exception:
+        A function that will determine whether an exception should be suppressed (ignored) or whether
+        it should get raised. By default, all exceptions will be raised and will interrupt the scraping
+        process.
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
 
@@ -63,7 +68,12 @@
         traversed.add(url)
 
         # Include all found links in the result dict
-        page_links = await get_page_links(client, url)
+        try:
+            page_links = await get_page_links(client, url)
+        except Exception as exc:
+            if suppress_exception is None or suppress_exception(exc, url) is False:
+                raise exc from exc
+            page_links = set()
         url_map.setdefault(url, set()).update(page_links)
 
         # Update the list of URLs to scrape next, filtering unwanted ones.
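
A quick usage sketch (not part of the patch): instead of the inline lambda in src/__main__.py, a
caller could pass a small named predicate as suppress_exception, for example to skip HTTP status
errors and timeouts while logging which URL was skipped. The logger setup and the exact set of
suppressed exception types below are illustrative assumptions, not something this patch prescribes.

    import logging

    import httpx

    log = logging.getLogger(__name__)


    def suppress_scrape_errors(exc: Exception, url: httpx.URL) -> bool:
        """Return True to skip this URL, False to let the exception propagate to the caller."""
        if isinstance(exc, (httpx.HTTPStatusError, httpx.TimeoutException)):
            # Log the failed URL so suppressed errors are still visible in the output.
            log.warning("Skipping %s: %s", url, exc)
            return True
        return False

Such a predicate would then be passed as suppress_exception=suppress_scrape_errors in the
get_urlmap(...) call, in place of the lambda shown above.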