Allow exception suppressing

parent 7f9798ed28
commit 9dfac02aab
@@ -18,6 +18,7 @@ async def main() -> None:
         URL,
         max_depth=5,
         filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
+        suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
     )
 
     pprint(url_map)
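At the call site above, the predicate is an inline lambda. The same check can also be written as a named function, which is easier to document and reuse; a minimal sketch (the name `ignore_http_status_errors` is illustrative, not part of this commit):

import httpx

def ignore_http_status_errors(exc: Exception, url: httpx.URL) -> bool:
    # Suppress only httpx.HTTPStatusError (raised by httpx's raise_for_status()
    # on 4xx/5xx responses); any other exception keeps propagating.
    return isinstance(exc, httpx.HTTPStatusError)

# Equivalent call: get_urlmap(URL, max_depth=5, ..., suppress_exception=ignore_http_status_errors)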
@@ -29,6 +29,7 @@ async def get_urlmap(
     start_url: httpx.URL,
     max_depth: int,
     filter_condition: Callable[[httpx.URL], bool] | None = None,
+    suppress_exception: Callable[[Exception, httpx.URL], bool] | None = None,
 ) -> dict[httpx.URL, set[httpx.URL]]:
     """Obtain all of the links from given url, working recursively until given max_depth.
 
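Per the signature above, the new parameter is a callable that receives the raised exception and the URL being scraped, and returns True to suppress. Callers could capture that contract in a type alias; a hypothetical sketch (the alias name is not part of this commit):

from collections.abc import Callable

import httpx

# Returning True suppresses the exception; returning False (or passing no
# callable at all) lets it propagate and interrupt the scrape.
SuppressPredicate = Callable[[Exception, httpx.URL], bool]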
@@ -47,6 +48,10 @@ async def get_urlmap(
 
         A common use-case is to limit scraping only to URLs with the same hostname as the start url
         (to avoid scraping large sites, like facebook).
+    :param suppress_exception:
+        A function that will determine whether an exception should be suppressed (ignored) or whether
+        it should get raised. By default, all exceptions will be raised and will interrupt the scraping
+        process.
 
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
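The docstring leaves the suppression policy entirely to the caller. One possible refinement, sketched here as a hypothetical example (not part of this commit), suppresses only 404 responses while letting any other failure interrupt the scrape:

import httpx

def suppress_not_found(exc: Exception, url: httpx.URL) -> bool:
    # httpx.HTTPStatusError carries the offending response; swallow only 404s.
    return isinstance(exc, httpx.HTTPStatusError) and exc.response.status_code == 404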
@@ -63,7 +68,12 @@ async def get_urlmap(
         traversed.add(url)
 
         # Include all found links in the result dict
-        page_links = await get_page_links(client, url)
+        try:
+            page_links = await get_page_links(client, url)
+        except Exception as exc:
+            if suppress_exception is None or suppress_exception(exc, url) is False:
+                raise
+            page_links = set()
         url_map.setdefault(url, set()).update(page_links)
 
         # Update the list of URLs to scrape next, filtering unwanted ones.
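The added control flow can be exercised on its own. A self-contained sketch, with `fetch` standing in for `get_page_links` and ValueError standing in for a real scraping error (both names are illustrative, not from this commit):

import asyncio
from collections.abc import Callable

async def fetch(url: str) -> set[str]:
    raise ValueError(f"failed to fetch {url}")  # always fails, to demonstrate suppression

async def scrape_one(
    url: str,
    suppress_exception: Callable[[Exception, str], bool] | None = None,
) -> set[str]:
    # Mirrors the try/except added in this commit: a suppressed failure
    # yields an empty link set instead of aborting the whole scrape.
    try:
        return await fetch(url)
    except Exception as exc:
        if suppress_exception is None or suppress_exception(exc, url) is False:
            raise
        return set()

print(asyncio.run(scrape_one("https://example.com", lambda exc, _: isinstance(exc, ValueError))))  # set()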