Use regex for filter condition
This commit is contained in:
parent
47c9a9f555
commit
7f9798ed28
|
@@ -1,4 +1,5 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import re
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
@@ -6,12 +7,18 @@ import httpx
|
||||||
from src.link_scraper import get_urlmap
|
from src.link_scraper import get_urlmap
|
||||||
|
|
||||||
URL = httpx.URL("https://ailab.fai.utb.cz")
|
URL = httpx.URL("https://ailab.fai.utb.cz")
|
||||||
|
ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
|
||||||
|
|
||||||
|
|
||||||
async def main() -> None:
|
async def main() -> None:
|
||||||
"""Program entrypoint."""
|
"""Program entrypoint."""
|
||||||
async with httpx.AsyncClient() as client:
|
async with httpx.AsyncClient() as client:
|
||||||
url_map = await get_urlmap(client, URL, max_depth=2, filter_condition=lambda url: url.host == URL.host)
|
url_map = await get_urlmap(
|
||||||
|
client,
|
||||||
|
URL,
|
||||||
|
max_depth=5,
|
||||||
|
filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
|
||||||
|
)
|
||||||
|
|
||||||
pprint(url_map)
|
pprint(url_map)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue