Fix depth handling

Peter Vacho 2024-11-24 17:23:31 +01:00
parent bd5347c299
commit 16373bc014
Signed by: school
GPG key ID: 8CFC3837052871B4
2 changed files with 10 additions and 11 deletions

@@ -12,11 +12,12 @@ ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
 async def main() -> None:
     """Program entrypoint."""
     print("Scraping...")
     async with httpx.AsyncClient() as client:
         url_map = await get_urlmap(
             client,
             URL,
-            max_depth=5,
+            max_depth=2,
             filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
             suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
         )

@@ -74,14 +74,13 @@ async def get_urlmap(
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
     url_map: dict[httpx.URL, set[httpx.URL]] = {}
-    urls: set[httpx.URL] = {start_url}
+    urls: list[tuple[int, httpx.URL]] = [(0, start_url)]  # (depth, url)
     traversed: set[httpx.URL] = set()
-    depth = 0
     while len(urls) > 0:
-        url = urls.pop()
+        depth, url = urls.pop()
-        if url in traversed:
+        if url in traversed or depth > max_depth:
             continue
         traversed.add(url)
@@ -94,11 +93,10 @@ async def get_urlmap(
             page_links = set()
         url_map.setdefault(url, set()).update(page_links)
-        # Update the list of URLs to scrape next, filtering unwanted ones.
-        urls.update(filter(filter_condition, page_links))
-        if depth > max_depth:
-            break
-        depth += 1
+        # Add the list of URLs to scrape next, filtering unwanted ones.
+        for found_link in page_links:
+            if filter_condition and not filter_condition(found_link):
+                continue
+            urls.append((depth + 1, found_link))
     return url_map
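
For context, the fix replaces a single global depth counter with per-URL depth tracking: each queued entry carries the depth at which it was discovered, and entries beyond max_depth are skipped when popped. A minimal sketch of that pattern, using a hypothetical in-memory adjacency map in place of live httpx requests:

def crawl(graph: dict[str, set[str]], start: str, max_depth: int) -> dict[str, set[str]]:
    """Traverse `graph` from `start`, visiting nodes at most `max_depth` hops away."""
    url_map: dict[str, set[str]] = {}
    urls: list[tuple[int, str]] = [(0, start)]  # (depth, url)
    traversed: set[str] = set()

    while urls:
        depth, url = urls.pop()
        # Skip already-visited URLs and anything deeper than the limit.
        if url in traversed or depth > max_depth:
            continue
        traversed.add(url)

        links = graph.get(url, set())
        url_map.setdefault(url, set()).update(links)

        # Discovered links inherit the parent's depth + 1.
        for link in links:
            urls.append((depth + 1, link))

    return url_map


if __name__ == "__main__":
    # Hypothetical illustration data, not from the commit.
    graph = {"a": {"b"}, "b": {"c"}, "c": {"d"}}
    print(crawl(graph, "a", max_depth=1))  # {'a': {'b'}, 'b': {'c'}}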