Fix depth handling

Peter Vacho 2024-11-24 17:23:31 +01:00
parent bd5347c299
commit 16373bc014
Signed by: school
GPG key ID: 8CFC3837052871B4
2 changed files with 10 additions and 11 deletions


@@ -12,11 +12,12 @@ ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
 async def main() -> None:
     """Program entrypoint."""
+    print("Scraping...")
     async with httpx.AsyncClient() as client:
         url_map = await get_urlmap(
             client,
             URL,
-            max_depth=5,
+            max_depth=2,
             filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
             suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
         )

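The filter_condition passed above keeps a link only when its host is utb.cz or one of its subdomains. A quick standalone illustration of how that regex behaves (the example hosts here are made up):

import re

ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")

# fullmatch accepts the bare domain and any subdomain, and rejects other hosts.
print(ALLOWED_HOSTS_RE.fullmatch("utb.cz") is not None)       # True
print(ALLOWED_HOSTS_RE.fullmatch("www.utb.cz") is not None)   # True
print(ALLOWED_HOSTS_RE.fullmatch("example.com") is not None)  # False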

@@ -74,14 +74,13 @@ async def get_urlmap(
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
     url_map: dict[httpx.URL, set[httpx.URL]] = {}
-    urls: set[httpx.URL] = {start_url}
+    urls: list[tuple[int, httpx.URL]] = [(0, start_url)]  # (depth, url)
     traversed: set[httpx.URL] = set()
-    depth = 0
 
     while len(urls) > 0:
-        url = urls.pop()
-        if url in traversed:
+        depth, url = urls.pop()
+        if url in traversed or depth > max_depth:
             continue
         traversed.add(url)
@@ -94,11 +93,10 @@ async def get_urlmap(
             page_links = set()
         url_map.setdefault(url, set()).update(page_links)
 
-        # Update the list of URLs to scrape next, filtering unwanted ones.
-        urls.update(filter(filter_condition, page_links))
-
-        if depth > max_depth:
-            break
-        depth += 1
+        # Add the list of URLs to scrape next, filtering unwanted ones.
+        for found_link in page_links:
+            if filter_condition and not filter_condition(found_link):
+                continue
+            urls.append((depth + 1, found_link))
 
     return url_map
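
In short, the single shared depth counter is replaced by a worklist where every URL carries its own depth, so max_depth limits the actual link distance from the start URL rather than the number of loop iterations. A minimal synchronous sketch of the same idea, with a hypothetical fetch_links callback standing in for the real HTTP fetching and link extraction:

from typing import Callable


def crawl(start_url: str, max_depth: int, fetch_links: Callable[[str], set[str]]) -> dict[str, set[str]]:
    """Map each visited URL to the links found on it, never going past max_depth."""
    url_map: dict[str, set[str]] = {}
    urls: list[tuple[int, str]] = [(0, start_url)]  # each entry carries its own depth
    traversed: set[str] = set()

    while urls:
        depth, url = urls.pop()
        if url in traversed or depth > max_depth:
            continue  # already visited, or too far from the start URL
        traversed.add(url)

        page_links = fetch_links(url)
        url_map.setdefault(url, set()).update(page_links)

        # Links found here are one level deeper than the page they came from.
        for found_link in page_links:
            urls.append((depth + 1, found_link))

    return url_map

With max_depth=2, pages up to two links away from the start URL are still fetched, and anything found beyond that is skipped instead of being cut off by an iteration counter.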