diff --git a/src/__main__.py b/src/__main__.py
index ac85514..83e0e57 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -4,7 +4,7 @@ from pprint import pprint
 
 import httpx
 
-from src.link_scraper import get_urlmap
+from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
 
 URL = httpx.URL("https://ailab.fai.utb.cz")
 ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
@@ -17,9 +17,9 @@ async def main() -> None:
         url_map = await get_urlmap(
             client,
             URL,
-            max_depth=2,
+            max_depth=3,
             filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
-            suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
+            suppress_exception=standard_urlmap_exception_suppressor,
         )
 
     pprint(url_map)
diff --git a/src/link_scraper.py b/src/link_scraper.py
index 23315af..e6a8fb6 100644
--- a/src/link_scraper.py
+++ b/src/link_scraper.py
@@ -1,8 +1,12 @@
 from collections.abc import Callable
+from queue import Queue
 from urllib.parse import urljoin
 
 import httpx
 from bs4 import BeautifulSoup
+from rich.console import Console
+from rich.style import StyleType
+from rich.text import Text
 
 
 async def get_page_links(
@@ -40,6 +44,31 @@ async def get_page_links(
     return {link for link in links if link.scheme in {"http", "https"}}
 
 
+def standard_urlmap_exception_suppressor(exc: Exception, url: httpx.URL) -> bool:
+    """This function can be used as the `suppress_exception` parameter to :func:`get_urlmap`.
+
+    The function attempts to ignore most exceptions that are reasonable to ignore during
+    the scraping process, producing a log message when an ignore occurs.
+    """
+
+    def print_exc(*msg: str | Text | tuple[str, StyleType]) -> None:
+        text = Text.assemble("--> ", *msg, " from ", (str(url), "yellow"))
+        Console().print(text)
+
+    if isinstance(exc, httpx.HTTPStatusError):
+        if exc.response.is_redirect:
+            print_exc("Skipping ", (f"redirect ({exc.response.status_code})", "red"))
+        else:
+            print_exc("Got ", (f"code {exc.response.status_code}", "red"))
+        return True
+
+    if isinstance(exc, httpx.TransportError):
+        print_exc("Got ", (exc.__class__.__qualname__, "red"), ", (", (str(exc), "orange3"), ")")
+        return True
+
+    return False
+
+
 async def get_urlmap(
     client: httpx.AsyncClient,
     start_url: httpx.URL,
@@ -48,6 +77,7 @@ async def get_urlmap(
     filter_condition: Callable[[httpx.URL], bool] | None = None,
     suppress_exception: Callable[[Exception, httpx.URL], bool] | None = None,
     follow_redirects: bool = False,
+    show_progress: bool = True,
 ) -> dict[httpx.URL, set[httpx.URL]]:
     """Obtain all of the links from given url, working recursively until given max_depth.
 
@@ -70,15 +100,18 @@ async def get_urlmap(
         A function that will determine whether an exception should be suppressed (ignored)
         or whether it should get raised. By default, all exceptions will be raised and will
         interrupt the scraping process.
+    :param show_progress:
+        When true, a log for each scraped URL will be shown.
 
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
     url_map: dict[httpx.URL, set[httpx.URL]] = {}
-    urls: list[tuple[int, httpx.URL]] = [(0, start_url)]  # (depth, url)
+    urls: Queue[tuple[int, httpx.URL]] = Queue()  # (depth, url)
+    urls.put((0, start_url))
     traversed: set[httpx.URL] = set()
 
-    while len(urls) > 0:
-        depth, url = urls.pop()
+    while urls.qsize() > 0:
+        depth, url = urls.get()
 
         if url in traversed or depth > max_depth:
             continue
@@ -86,6 +119,9 @@ async def get_urlmap(
 
         # Include all found links in the result dict
         try:
+            if show_progress:
+                text = Text.assemble("> Scraping from (", (f"{depth=}", "green"), "): ", (str(url), "blue"))
+                Console().print(text)
             page_links = await get_page_links(client, url, follow_redirects=follow_redirects)
         except Exception as exc:
             if suppress_exception is None or suppress_exception(exc, url) is False:
@@ -97,6 +133,6 @@ async def get_urlmap(
             for found_link in page_links:
                 if filter_condition and not filter_condition(found_link):
                     continue
-                urls.append((depth + 1, found_link))
+                urls.put((depth + 1, found_link))
 
     return url_map
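Usage note: the `suppress_exception` callback contract is unchanged by this diff, return `True` to swallow the exception and keep crawling, or `False` to let it propagate. A minimal sketch of how a caller might layer its own policy on top of the new `standard_urlmap_exception_suppressor` (the `only_404_suppressor` helper and the target URL below are illustrative assumptions, not part of this change):

```python
import asyncio

import httpx

from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor


def only_404_suppressor(exc: Exception, url: httpx.URL) -> bool:
    # Swallow 404 responses silently; defer everything else to the standard
    # suppressor, which logs and decides whether to keep crawling.
    if isinstance(exc, httpx.HTTPStatusError) and exc.response.status_code == 404:
        return True
    return standard_urlmap_exception_suppressor(exc, url)


async def main() -> None:
    async with httpx.AsyncClient() as client:
        url_map = await get_urlmap(
            client,
            httpx.URL("https://example.com"),  # illustrative start URL
            max_depth=1,
            suppress_exception=only_404_suppressor,
            show_progress=False,  # silence the per-URL progress log added above
        )
    print(f"Scraped {len(url_map)} pages")


if __name__ == "__main__":
    asyncio.run(main())
```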