Fix depth handling
This commit is contained in:
parent
bd5347c299
commit
16373bc014
@@ -12,11 +12,12 @@ ALLOWED_HOSTS_RE = re.compile(r"(?:.*\.)?utb\.cz")
 
 async def main() -> None:
     """Program entrypoint."""
+    print("Scraping...")
     async with httpx.AsyncClient() as client:
         url_map = await get_urlmap(
             client,
             URL,
-            max_depth=5,
+            max_depth=2,
             filter_condition=lambda url: ALLOWED_HOSTS_RE.fullmatch(url.host) is not None,
             suppress_exception=lambda exc, _: isinstance(exc, httpx.HTTPStatusError),
         )
@@ -74,14 +74,13 @@ async def get_urlmap(
     :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
     """
     url_map: dict[httpx.URL, set[httpx.URL]] = {}
-    urls: set[httpx.URL] = {start_url}
+    urls: list[tuple[int, httpx.URL]] = [(0, start_url)]  # (depth, url)
     traversed: set[httpx.URL] = set()
-    depth = 0
 
     while len(urls) > 0:
-        url = urls.pop()
+        depth, url = urls.pop()
 
-        if url in traversed:
+        if url in traversed or depth > max_depth:
             continue
         traversed.add(url)
 
@@ -94,11 +93,10 @@ async def get_urlmap(
         page_links = set()
         url_map.setdefault(url, set()).update(page_links)
 
-        # Update the list of URLs to scrape next, filtering unwanted ones.
-        urls.update(filter(filter_condition, page_links))
-
-        if depth > max_depth:
-            break
-        depth += 1
+        # Add the list of URLs to scrape next, filtering unwanted ones.
+        for found_link in page_links:
+            if filter_condition and not filter_condition(found_link):
+                continue
+            urls.append((depth + 1, found_link))
 
     return url_map
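For context, here is a minimal, self-contained sketch of the traversal as it works after this change. The removed lines show that the old version kept one global depth counter, incremented once per popped URL and used to break out of the loop, so it measured how many pages had been processed rather than how far a link actually was from start_url; carrying a (depth, url) pair in the work list is what fixes that. The fetch_links helper, the simplified get_urlmap signature, and the example URLs below are assumptions for illustration only; the real module also takes a suppress_exception callback and does its link extraction outside this diff.

import asyncio
import re
from collections.abc import Callable

import httpx

# Assumed helper: the real link extraction lives outside this diff.
HREF_RE = re.compile(r'href="([^"#]+)"')


async def fetch_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]:
    """Hypothetical stand-in: fetch a page and resolve every href against its URL."""
    try:
        response = await client.get(url)
        response.raise_for_status()
    except httpx.HTTPError:
        # The real code routes errors through a suppress_exception callback instead.
        return set()
    return {url.join(href) for href in HREF_RE.findall(response.text)}


async def get_urlmap(
    client: httpx.AsyncClient,
    start_url: httpx.URL,
    *,
    max_depth: int,
    filter_condition: Callable[[httpx.URL], bool] | None = None,
) -> dict[httpx.URL, set[httpx.URL]]:
    """Traversal after this commit: every queued URL carries its own depth."""
    url_map: dict[httpx.URL, set[httpx.URL]] = {}
    urls: list[tuple[int, httpx.URL]] = [(0, start_url)]  # (depth, url)
    traversed: set[httpx.URL] = set()

    while len(urls) > 0:
        depth, url = urls.pop()
        if url in traversed or depth > max_depth:
            continue
        traversed.add(url)

        page_links = await fetch_links(client, url)
        url_map.setdefault(url, set()).update(page_links)

        # Queue the discovered links one level deeper, filtering unwanted ones.
        for found_link in page_links:
            if filter_condition and not filter_condition(found_link):
                continue
            urls.append((depth + 1, found_link))

    return url_map


async def main() -> None:
    print("Scraping...")
    async with httpx.AsyncClient() as client:
        # Placeholder start URL and filter; the real module uses its own URL constant
        # and the ALLOWED_HOSTS_RE host check shown in the diff above.
        url_map = await get_urlmap(
            client,
            httpx.URL("https://example.com"),
            max_depth=2,
            filter_condition=lambda url: url.host == "example.com",
        )
    print(f"Scraped {len(url_map)} pages")


if __name__ == "__main__":
    asyncio.run(main())

Note that the depth check happens when a URL is popped rather than when it is queued, matching the diff: links discovered at the maximum depth are still recorded in url_map under their parent page, but are never fetched themselves.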