Add logging & better exception handling
commit e853747cdd
parent 16373bc014
@@ -4,7 +4,7 @@ from pprint import pprint
 
 import httpx
 
-from src.link_scraper import get_urlmap
+from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor
 
 URL = httpx.URL("https://ailab.fai.utb.cz")
 ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
@@ -17,9 +17,9 @@ async def main() -> None:
         url_map = await get_urlmap(
             client,
             URL,
-            max_depth=2,
+            max_depth=3,
             filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
-            suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
+            suppress_exception=standard_urlmap_exception_suppressor,
         )
 
         pprint(url_map)
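The hunks below are against src/link_scraper.py, the module the script above imports from. For context on the call-site change: `suppress_exception` receives the raised exception together with the URL being scraped and returns True to swallow the error, which is why the inline lambda (HTTP status errors only) can be swapped for the new shared helper. Any callable with that shape works; a minimal sketch of a narrower policy (`suppress_only_404` is an illustrative name, not part of this commit):

import httpx

# Hypothetical suppressor matching the Callable[[Exception, httpx.URL], bool]
# contract from src/link_scraper.py: True skips the URL, False re-raises.
def suppress_only_404(exc: Exception, url: httpx.URL) -> bool:
    return isinstance(exc, httpx.HTTPStatusError) and exc.response.status_code == 404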
@@ -1,8 +1,12 @@
 from collections.abc import Callable
+from queue import Queue
 from urllib.parse import urljoin
 
 import httpx
 from bs4 import BeautifulSoup
+from rich.console import Console
+from rich.style import StyleType
+from rich.text import Text
 
 
 async def get_page_links(
@@ -40,6 +44,31 @@ async def get_page_links(
     return {link for link in links if link.scheme in {"http", "https"}}
 
 
+def standard_urlmap_exception_suppressor(exc: Exception, url: httpx.URL) -> bool:
+    """This function can be used as the `suppress_exception` parameter to :func:`get_urlmap`.
+
+    The function attempts to ignore most exceptions that are reasonable to ignore during
+    the scraping process, producing a log message when an ignore occurs.
+    """
+
+    def print_exc(*msg: str | Text | tuple[str, StyleType]) -> None:
+        text = Text.assemble("--> ", *msg, " from ", (str(url), "yellow"))
+        Console().print(text)
+
+    if isinstance(exc, httpx.HTTPStatusError):
+        if exc.response.is_redirect:
+            print_exc("Skipping ", (f"redirect ({exc.response.status_code})", "red"))
+        else:
+            print_exc("Got ", (f"code {exc.response.status_code}", "red"))
+        return True
+
+    if isinstance(exc, httpx.TransportError):
+        print_exc("Got ", (exc.__class__.__qualname__, "red"), ", (", (str(exc), "orange3"), ")")
+        return True
+
+    return False
+
+
 async def get_urlmap(
     client: httpx.AsyncClient,
     start_url: httpx.URL,
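The helper above logs and swallows `HTTPStatusError` and `TransportError`; anything else still propagates and aborts the scrape. A caller who wants a wider net can wrap it rather than reimplement the logging — a sketch, where the wrapper name and the extra `httpx.InvalidURL` case are illustrative assumptions:

import httpx

from src.link_scraper import standard_urlmap_exception_suppressor

def lenient_suppressor(exc: Exception, url: httpx.URL) -> bool:
    # Also ignore malformed URLs; defer everything else to the standard helper.
    if isinstance(exc, httpx.InvalidURL):
        return True
    return standard_urlmap_exception_suppressor(exc, url)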
@@ -48,6 +77,7 @@ async def get_urlmap(
     filter_condition: Callable[[httpx.URL], bool] | None = None,
     suppress_exception: Callable[[Exception, httpx.URL], bool] | None = None,
     follow_redirects: bool = False,
+    show_progress: bool = True,
 ) -> dict[httpx.URL, set[httpx.URL]]:
     """Obtain all of the links from given url, working recursively until given max_depth.
 
@@ -70,15 +100,18 @@ async def get_urlmap(
         A function that will determine whether an exception should be suppressed (ignored) or whether
         it should get raised. By default, all exceptions will be raised and will interrupt the scraping
         process.
+    :param show_progress:
+        When true, a log for each scraped URL will be shown.
 
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
     url_map: dict[httpx.URL, set[httpx.URL]] = {}
-    urls: list[tuple[int, httpx.URL]] = [(0, start_url)]  # (depth, url)
+    urls: Queue[tuple[int, httpx.URL]] = Queue()  # (depth, url)
+    urls.put((0, start_url))
     traversed: set[httpx.URL] = set()
 
-    while len(urls) > 0:
-        depth, url = urls.pop()
+    while urls.qsize() > 0:
+        depth, url = urls.get()
 
         if url in traversed or depth > max_depth:
             continue
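Worth noting about the hunk above: swapping the list for a `Queue` changes traversal order, not just the container. A bare `list.pop()` removes the last element (LIFO, so the crawl previously ran roughly depth-first), while `Queue.get()` is FIFO, so URLs are now visited breadth-first, level by level. A tiny ordering demo:

from queue import Queue

stack = [1, 2, 3]
print([stack.pop() for _ in range(3)])  # [3, 2, 1] -- LIFO, the old list behaviour

fifo: Queue[int] = Queue()
for n in (1, 2, 3):
    fifo.put(n)
print([fifo.get() for _ in range(3)])  # [1, 2, 3] -- FIFO, the new Queue behaviour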
@@ -86,6 +119,9 @@ async def get_urlmap(
 
         # Include all found links in the result dict
         try:
+            if show_progress:
+                text = Text.assemble("> Scraping from (", (f"{depth=}", "green"), "): ", (str(url), "blue"))
+                Console().print(text)
             page_links = await get_page_links(client, url, follow_redirects=follow_redirects)
         except Exception as exc:
             if suppress_exception is None or suppress_exception(exc, url) is False:
@@ -97,6 +133,6 @@ async def get_urlmap(
         for found_link in page_links:
            if filter_condition and not filter_condition(found_link):
                 continue
-            urls.append((depth + 1, found_link))
+            urls.put((depth + 1, found_link))
 
     return url_map
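Putting the pieces together, a minimal end-to-end sketch of the new surface (assumes this repo's src/ layout; the target URL and depth are placeholders, and the repo's actual entry script may differ):

import asyncio
from pprint import pprint

import httpx

from src.link_scraper import get_urlmap, standard_urlmap_exception_suppressor

async def main() -> None:
    async with httpx.AsyncClient() as client:
        url_map = await get_urlmap(
            client,
            httpx.URL("https://example.com"),
            max_depth=1,
            suppress_exception=standard_urlmap_exception_suppressor,
            show_progress=True,  # new in this commit: logs each scraped URL
        )
    pprint(url_map)

if __name__ == "__main__":
    asyncio.run(main())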