From 16373bc014bb46004b55a0ad577303d2aecdcebb Mon Sep 17 00:00:00 2001
From: Peter Vacho
Date: Sun, 24 Nov 2024 17:23:31 +0100
Subject: [PATCH] Fix depth handling

---
 src/__main__.py     |  3 ++-
 src/link_scraper.py | 18 ++++++++----------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/__main__.py b/src/__main__.py
index bad8a47..ac85514 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -12,11 +12,12 @@ ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
 
 async def main() -> None:
     """Program entrypoint."""
+    print("Scraping...")
     async with httpx.AsyncClient() as client:
         url_map = await get_urlmap(
             client,
             URL,
-            max_depth=5,
+            max_depth=2,
             filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
             suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
         )
diff --git a/src/link_scraper.py b/src/link_scraper.py
index 79a46ea..23315af 100644
--- a/src/link_scraper.py
+++ b/src/link_scraper.py
@@ -74,14 +74,13 @@ async def get_urlmap(
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
     url_map: dict[httpx.URL, set[httpx.URL]] = {}
-    urls: set[httpx.URL] = {start_url}
+    urls: list[tuple[int, httpx.URL]] = [(0, start_url)]  # (depth, url)
     traversed: set[httpx.URL] = set()
-    depth = 0
 
     while len(urls) > 0:
-        url = urls.pop()
+        depth, url = urls.pop()
 
-        if url in traversed:
+        if url in traversed or depth > max_depth:
             continue
         traversed.add(url)
 
@@ -94,11 +93,10 @@ async def get_urlmap(
             page_links = set()
 
         url_map.setdefault(url, set()).update(page_links)
-        # Update the list of URLs to scrape next, filtering unwanted ones.
-        urls.update(filter(filter_condition, page_links))
-
-        if depth > max_depth:
-            break
-        depth += 1
+        # Add the list of URLs to scrape next, filtering unwanted ones.
+        for found_link in page_links:
+            if filter_condition and not filter_condition(found_link):
+                continue
+            urls.append((depth + 1, found_link))
 
     return url_map
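
For reference, below is a minimal, self-contained sketch of the per-URL depth tracking this patch introduces: each queued entry carries its own depth instead of relying on a single global counter. The names (crawl, get_links) and the hard-coded link graph are illustrative stand-ins, not the project's httpx-based get_urlmap API, so the sketch runs without any network access.

    # Illustrative sketch only; crawl/get_links/FAKE_LINKS are not part of the project.
    from collections.abc import Callable

    FAKE_LINKS: dict[str, set[str]] = {
        "a": {"b", "c"},
        "b": {"d"},
        "c": {"a"},
        "d": set(),
    }


    def get_links(url: str) -> set[str]:
        """Stand-in for an HTTP fetch + link extraction step."""
        return FAKE_LINKS.get(url, set())


    def crawl(
        start_url: str,
        max_depth: int,
        filter_condition: Callable[[str], bool] | None = None,
    ) -> dict[str, set[str]]:
        """Traverse links, carrying each URL's own depth in the work queue."""
        url_map: dict[str, set[str]] = {}
        urls: list[tuple[int, str]] = [(0, start_url)]  # (depth, url)
        traversed: set[str] = set()

        while urls:
            depth, url = urls.pop()

            # Skip already-visited URLs and anything past the depth limit.
            if url in traversed or depth > max_depth:
                continue
            traversed.add(url)

            page_links = get_links(url)
            url_map.setdefault(url, set()).update(page_links)

            # Queue found links one level deeper, filtering unwanted ones.
            for found_link in page_links:
                if filter_condition and not filter_condition(found_link):
                    continue
                urls.append((depth + 1, found_link))

        return url_map


    if __name__ == "__main__":
        # With max_depth=1, "d" is discovered via "b" but never fetched itself.
        print(crawl("a", max_depth=1))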