From 7f9798ed28dffac3060d8df958497f1f8bd5de22 Mon Sep 17 00:00:00 2001
From: Peter Vacho
Date: Tue, 19 Nov 2024 20:14:52 +0100
Subject: [PATCH] Use regex for filter condition

---
Reviewer note (not part of the commit message):
The URL filter passed to get_urlmap changes from an exact comparison with
the start URL's host to a full match against ALLOWED_HOSTS_RE, which accepts
"utb.cz" itself and any host ending in ".utb.cz". The same call also raises
max_depth from 2 to 5.

 src/__main__.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/__main__.py b/src/__main__.py
index 0637324..bffed95 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -1,4 +1,5 @@
 import asyncio
+import re
 from pprint import pprint
 
 import httpx
@@ -6,12 +7,18 @@ import httpx
 from src.link_scraper import get_urlmap
 
 URL = httpx.URL("https://ailab.fai.utb.cz")
+ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
 
 
 async def main() -> None:
     """Program entrypoint."""
     async with httpx.AsyncClient() as client:
-        url_map = await get_urlmap(client, URL, max_depth=2, filter_condition=lambda url: url.host == URL.host)
+        url_map = await get_urlmap(
+            client,
+            URL,
+            max_depth=5,
+            filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
+        )
 
     pprint(url_map)
 