Use regex for filter condition

This commit is contained in:
Peter Vacho 2024-11-19 20:14:52 +01:00
parent 47c9a9f555
commit 7f9798ed28
Signed by: school
GPG key ID: 8CFC3837052871B4

View file

@@ -1,4 +1,5 @@
import asyncio
import re
from pprint import pprint
import httpx
@@ -6,12 +7,18 @@ import httpx
from src.link_scraper import get_urlmap
# URL handed to get_urlmap in main(); its host was also the reference for the older same-host filter.
URL = httpx.URL("https://ailab.fai.utb.cz")
# Host allow-list pattern. Used with fullmatch() in main(), it accepts exactly "utb.cz"
# or any host ending in ".utb.cz" (e.g. "ailab.fai.utb.cz"); a host such as "notutb.cz" is rejected
# because the optional prefix must end with a literal dot.
ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
async def main() -> None:
"""Program entrypoint.

Open an ``httpx.AsyncClient``, call ``get_urlmap`` with that client and ``URL``,
and pretty-print whatever it returns.
"""
async with httpx.AsyncClient() as client:
# NOTE(review): the single-line call below (same-host filter, max_depth=2) looks like the
# pre-change line shown by the diff; the multi-line call after it is presumably its
# replacement. If both were really present, the first result would simply be overwritten — confirm
# against the committed file.
url_map = await get_urlmap(client, URL, max_depth=2, filter_condition=lambda url: url.host == URL.host)
# Keep only URLs whose entire host string matches ALLOWED_HOSTS_RE (fullmatch, not search),
# so the filter is no longer limited to URL.host itself.
url_map = await get_urlmap(
client,
URL,
max_depth=5,
filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
)
pprint(url_map)