Add logging & better exception handling
parent 16373bc014
commit e853747cdd
@@ -4,7 +4,7 @@ from pprint import pprint
 import httpx
 
-from src.link_scraper import get_urlmap
+from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
 
 URL = httpx.URL("https://ailab.fai.utb.cz")
 ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
 
@@ -17,9 +17,9 @@ async def main() -> None:
         url_map = await get_urlmap(
             client,
             URL,
-            max_depth=2,
+            max_depth=3,
             filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
-            suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
+            suppress_exception=standard_urlmap_exception_suppressor,
         )
 
         pprint(url_map)
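For context, the updated call site above presumably lives inside a small driver script along the lines of the sketch below. Only the `get_urlmap(...)` call and the module-level constants appear in the diff; the `async with httpx.AsyncClient()` block and the `asyncio.run` entry point are assumptions added here to keep the example self-contained and runnable.

import asyncio
import re
from pprint import pprint

import httpx

from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor

URL = httpx.URL("https://ailab.fai.utb.cz")
ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")


async def main() -> None:
    # Assumed: the diff does not show how the client is created.
    async with httpx.AsyncClient() as client:
        url_map = await get_urlmap(
            client,
            URL,
            max_depth=3,
            filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
            suppress_exception=standard_urlmap_exception_suppressor,
        )

        pprint(url_map)


if __name__ == "__main__":
    asyncio.run(main())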
src/link_scraper.py
@@ -1,8 +1,12 @@
 from collections.abc import Callable
+from queue import Queue
 from urllib.parse import urljoin
 
 import httpx
 from bs4 import BeautifulSoup
+from rich.console import Console
+from rich.style import StyleType
+from rich.text import Text
 
 
 async def get_page_links(
@@ -40,6 +44,31 @@ async def get_page_links(
     return {link for link in links if link.scheme in {"http", "https"}}
 
 
+def standard_urlmap_exception_suppressor(exc: Exception, url: httpx.URL) -> bool:
+    """This function can be used as the `suppress_exception` parameter to :func:`get_urlmap`.
+
+    The function attempts to ignore most exceptions that are reasonable to ignore during
+    the scraping process, producing a log message when an ignore occurs.
+    """
+
+    def print_exc(*msg: str | Text | tuple[str, StyleType]) -> None:
+        text = Text.assemble("--> ", *msg, " from ", (str(url), "yellow"))
+        Console().print(text)
+
+    if isinstance(exc, httpx.HTTPStatusError):
+        if exc.response.is_redirect:
+            print_exc("Skipping ", (f"redirect ({exc.response.status_code})", "red"))
+        else:
+            print_exc("Got ", (f"code {exc.response.status_code}", "red"))
+        return True
+
+    if isinstance(exc, httpx.TransportError):
+        print_exc("Got ", (exc.__class__.__qualname__, "red"), ", (", (str(exc), "orange"), ")")
+        return True
+
+    return False
+
+
 async def get_urlmap(
     client: httpx.AsyncClient,
     start_url: httpx.URL,
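As its docstring notes, `standard_urlmap_exception_suppressor` is just one possible value for `suppress_exception`; any `Callable[[Exception, httpx.URL], bool]` that returns `True` to swallow an exception and `False` to let it propagate will work. A hypothetical alternative (not part of this commit) that only tolerates timeouts and 404 responses could look like this:

import httpx


def lenient_suppressor(exc: Exception, url: httpx.URL) -> bool:
    # Hypothetical example, shown only to illustrate the callback contract.
    if isinstance(exc, httpx.TimeoutException):
        print(f"Timed out on {url}, skipping")
        return True
    if isinstance(exc, httpx.HTTPStatusError) and exc.response.status_code == 404:
        print(f"Got a 404 for {url}, skipping")
        return True
    return False  # anything else propagates and stops the crawl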
@@ -48,6 +77,7 @@ async def get_urlmap(
     filter_condition: Callable[[httpx.URL], bool] | None = None,
     suppress_exception: Callable[[Exception, httpx.URL], bool] | None = None,
     follow_redirects: bool = False,
+    show_progress: bool = True,
 ) -> dict[httpx.URL, set[httpx.URL]]:
     """Obtain all of the links from given url, working recursively until given max_depth.
 
@@ -70,15 +100,18 @@ async def get_urlmap(
         A function that will determine whether an exception should be suppressed (ignored) or whether
         it should get raised. By default, all exceptions will be raised and will interrupt the scraping
         process.
+    :param show_progress:
+        When true, a log for each scraped URL will be shown.
 
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
     url_map: dict[httpx.URL, set[httpx.URL]] = {}
-    urls: list[tuple[int, httpx.URL]] = [(0, start_url)]  # (depth, url)
+    urls: Queue[tuple[int, httpx.URL]] = Queue()  # (depth, url)
+    urls.put((0, start_url))
     traversed: set[httpx.URL] = set()
 
-    while len(urls) > 0:
-        depth, url = urls.pop()
+    while urls.qsize() > 0:
+        depth, url = urls.get()
 
         if url in traversed or depth > max_depth:
             continue
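The data-structure change in this hunk is what turns the crawl from depth-first into breadth-first: `list.pop()` removes the most recently appended URL (LIFO), while `queue.Queue.get()` hands URLs back in the order they were `put()` (FIFO), so pages are now visited level by level in order of increasing depth. A minimal illustration of the ordering difference (not part of the commit):

from queue import Queue

items = ["a", "b", "c"]

stack = list(items)
print([stack.pop() for _ in items])  # ['c', 'b', 'a'] -- LIFO, depth-first order

fifo: Queue[str] = Queue()
for item in items:
    fifo.put(item)
print([fifo.get() for _ in items])  # ['a', 'b', 'c'] -- FIFO, breadth-first order

Since the scraper runs in a single task, the thread-safe `queue.Queue` is not strictly required; a `collections.deque` with `popleft()` would give the same FIFO behaviour, but the committed code opts for `Queue`.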
@@ -86,6 +119,9 @@ async def get_urlmap(
 
         # Include all found links in the result dict
         try:
+            if show_progress:
+                text = Text.assemble("> Scraping from (", (f"{depth=}", "green"), "): ", (str(url), "blue"))
+                Console().print(text)
             page_links = await get_page_links(client, url, follow_redirects=follow_redirects)
         except Exception as exc:
             if suppress_exception is None or suppress_exception(exc, url) is False:
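The progress log relies on `rich.text.Text.assemble`, which accepts a mix of plain strings and `(text, style)` tuples and joins them into a single styled `Text`, so the depth renders in green and the URL in blue. A standalone sketch of the same pattern (assuming `rich` is installed):

from rich.console import Console
from rich.text import Text

depth = 1
url = "https://ailab.fai.utb.cz"

# Plain strings keep the default style; (text, style) tuples are coloured.
text = Text.assemble("> Scraping from (", (f"{depth=}", "green"), "): ", (str(url), "blue"))
Console().print(text)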
@@ -97,6 +133,6 @@ async def get_urlmap(
         for found_link in page_links:
             if filter_condition and not filter_condition(found_link):
                 continue
-            urls.append((depth + 1, found_link))
+            urls.put((depth + 1, found_link))
 
     return url_map