From bd5347c2994399bbc10d854f118ec0ceffd86118 Mon Sep 17 00:00:00 2001 From: Peter Vacho Date: Tue, 19 Nov 2024 20:26:15 +0100 Subject: [PATCH] Support following redirects --- src/link_scraper.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/link_scraper.py b/src/link_scraper.py index 78369a7..79a46ea 100644 --- a/src/link_scraper.py +++ b/src/link_scraper.py @@ -5,13 +5,29 @@ import httpx from bs4 import BeautifulSoup -async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]: +async def get_page_links( + client: httpx.AsyncClient, + url: httpx.URL, + *, + follow_redirects: bool = False, +) -> set[httpx.URL]: """Scrape all links from given page (url). This function will also resolve relative URLs. Non http/https schemas will not be included. """ res = await client.get(url) + if res.is_redirect and follow_redirects: + if not res.has_redirect_location: + raise httpx.HTTPStatusError( + f"Redirect response '{res.status_code} {res.reason_phrase}' " + f"for url '{res.url}' without Location header", + request=res.request, + response=res, + ) + location = res.headers["Location"] + return await get_page_links(client, httpx.URL(urljoin(str(url), location)), follow_redirects=follow_redirects) + res.raise_for_status() html = res.text @@ -27,9 +43,11 @@ async def get_urlmap( client: httpx.AsyncClient, start_url: httpx.URL, + *, max_depth: int, filter_condition: Callable[[httpx.URL], bool] | None = None, suppress_exception: Callable[[Exception, httpx.URL], bool] | None = None, + follow_redirects: bool = False, ) -> dict[httpx.URL, set[httpx.URL]]: """Obtain all of the links from given url, working recursively until given max_depth. 
@@ -69,7 +87,7 @@ async def get_urlmap( # Include all found links in the result dict try: - page_links = await get_page_links(client, url) + page_links = await get_page_links(client, url, follow_redirects=follow_redirects) except Exception as exc: if suppress_exception is None or suppress_exception(exc, url) is False: raise exc from exc