Add logging & better exc handling

Peter Vacho 2024-11-24 18:11:38 +01:00
parent 16373bc014
commit e853747cdd
Signed by: school
GPG key ID: 8CFC3837052871B4
2 changed files with 43 additions and 7 deletions

View file

@@ -4,7 +4,7 @@ from pprint import pprint
 import httpx

-from src.link_scraper import get_urlmap
+from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor


 URL = httpx.URL("https://ailab.fai.utb.cz")
 ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
@@ -17,9 +17,9 @@ async def main() -> None:
     url_map = await get_urlmap(
         client,
         URL,
-        max_depth=2,
+        max_depth=3,
         filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
-        suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
+        suppress_exception=standard_urlmap_exception_suppressor,
     )

     pprint(url_map)
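
Note: the hunks above only show the lines that changed inside main(); the client setup and the program entry point sit outside the diff context. A minimal sketch of how the updated call is typically driven, where the AsyncClient context manager and the asyncio.run wiring are assumptions and not part of this commit:

import asyncio
import re
from pprint import pprint

import httpx

from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor

URL = httpx.URL("https://ailab.fai.utb.cz")
ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")


async def main() -> None:
    # Assumed setup: a single shared client for all requests made while building the url map.
    async with httpx.AsyncClient() as client:
        url_map = await get_urlmap(
            client,
            URL,
            max_depth=3,
            filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
            suppress_exception=standard_urlmap_exception_suppressor,
        )
    pprint(url_map)


if __name__ == "__main__":
    asyncio.run(main())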

View file

@@ -1,8 +1,12 @@
 from collections.abc import Callable
+from queue import Queue
 from urllib.parse import urljoin

 import httpx
 from bs4 import BeautifulSoup
+from rich.console import Console
+from rich.style import StyleType
+from rich.text import Text


 async def get_page_links(
@@ -40,6 +44,31 @@ async def get_page_links(
     return {link for link in links if link.scheme in {"http", "https"}}


+def standard_urlmap_exception_suppressor(exc: Exception, url: httpx.URL) -> bool:
+    """This function can be used as the `suppress_exception` parameter to :func:`get_urlmap`.
+
+    The function attempts to ignore most exceptions that are reasonable to ignore during
+    the scraping process, producing a log message when an ignore occurs.
+    """
+
+    def print_exc(*msg: str | Text | tuple[str, StyleType]) -> None:
+        text = Text.assemble("--> ", *msg, " from ", (str(url), "yellow"))
+        Console().print(text)
+
+    if isinstance(exc, httpx.HTTPStatusError):
+        if exc.response.is_redirect:
+            print_exc("Skipping ", (f"redirect ({exc.response.status_code})", "red"))
+        else:
+            print_exc("Got ", (f"code {exc.response.status_code}", "red"))
+        return True
+
+    if isinstance(exc, httpx.TransportError):
+        print_exc("Got ", (exc.__class__.__qualname__, "red"), ", (", (str(exc), "orange"), ")")
+        return True
+
+    return False
+
+
 async def get_urlmap(
     client: httpx.AsyncClient,
     start_url: httpx.URL,
@@ -48,6 +77,7 @@ async def get_urlmap(
     filter_condition: Callable[[httpx.URL], bool] | None = None,
     suppress_exception: Callable[[Exception, httpx.URL], bool] | None = None,
     follow_redirects: bool = False,
+    show_progress: bool = True,
 ) -> dict[httpx.URL, set[httpx.URL]]:
     """Obtain all of the links from given url, working recursively until given max_depth.
@@ -70,15 +100,18 @@ async def get_urlmap(
         A function that will determine whether an exception should be suppressed (ignored) or whether
         it should get raised. By default, all exceptions will be raised and will interrupt the scraping
         process.
+    :param show_progress:
+        When true, a log for each scraped URL will be shown.
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
     url_map: dict[httpx.URL, set[httpx.URL]] = {}
-    urls: list[tuple[int, httpx.URL]] = [(0, start_url)]  # (depth, url)
+    urls: Queue[tuple[int, httpx.URL]] = Queue()  # (depth, url)
+    urls.put((0, start_url))
     traversed: set[httpx.URL] = set()

-    while len(urls) > 0:
-        depth, url = urls.pop()
+    while urls.qsize() > 0:
+        depth, url = urls.get()
         if url in traversed or depth > max_depth:
             continue
@@ -86,6 +119,9 @@ async def get_urlmap(
         # Include all found links in the result dict
         try:
+            if show_progress:
+                text = Text.assemble("> Scraping from (", (f"{depth=}", "green"), "): ", (str(url), "blue"))
+                Console().print(text)
             page_links = await get_page_links(client, url, follow_redirects=follow_redirects)
         except Exception as exc:
             if suppress_exception is None or suppress_exception(exc, url) is False:
@@ -97,6 +133,6 @@ async def get_urlmap(
         for found_link in page_links:
             if filter_condition and not filter_condition(found_link):
                 continue
-            urls.append((depth + 1, found_link))
+            urls.put((depth + 1, found_link))

     return url_map
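
Note: swapping the pending-URL container from a list popped at its end to a FIFO Queue also changes the traversal order from depth-first to breadth-first, so pages closer to start_url are scraped before deeper ones. For reference, the contract of the new suppressor can be exercised on its own; a rough sketch using synthetic httpx objects, where the constructed request/response and the example URL are purely illustrative:

import httpx

from src.link_scraper import standard_urlmap_exception_suppressor

url = httpx.URL("https://ailab.fai.utb.cz/missing-page")
request = httpx.Request("GET", url)
response = httpx.Response(404, request=request)

# An HTTP status error is logged ("--> Got code 404 from ...") and suppressed,
# so get_urlmap keeps scraping the remaining URLs.
status_error = httpx.HTTPStatusError("Client error", request=request, response=response)
assert standard_urlmap_exception_suppressor(status_error, url) is True

# httpx.TransportError subclasses (DNS failures, timeouts, ...) are suppressed the same way.

# Anything else is reported as not suppressed, so get_urlmap re-raises it.
assert standard_urlmap_exception_suppressor(ValueError("unexpected"), url) is False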