Support following redirects

2024-11-19 20:26:15 +01:00 · 2024-11-19 20:26:15 +01:00 · bd5347c299
parent 9dfac02aab
commit bd5347c299
1 changed files with 20 additions and 2 deletions
--- a/src/link_scraper.py
+++ b/src/link_scraper.py
@ -5,13 +5,29 @@ import httpx
 from bs4 import BeautifulSoup


-async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]:
+async def get_page_links(
+    client: httpx.AsyncClient,
+    url: httpx.URL,
+    *,
+    follow_redirects: bool = False,
+) -> set[httpx.URL]:
    """Scrape all links from given page (url).

    This function will also resolve relative URLs.
    Non http/https schemas will not be included.
    """
    res = await client.get(url)
+    if res.is_redirect and follow_redirects:
+        if not res.has_redirect_location:
+            raise httpx.HTTPStatusError(
+                f"Redirect response '{res.status_code} {res.reason_phrase}' "
+                "for url '{res.url} without Location header",
+                request=res.request,
+                response=res,
+            )
+        location = res.headers["Location"]
+        return await get_page_links(client, httpx.URL(urljoin(str(url), location)), follow_redirects=follow_redirects)
+
    res.raise_for_status()
    html = res.text

@ -27,9 +43,11 @@ async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx
 async def get_urlmap(
    client: httpx.AsyncClient,
    start_url: httpx.URL,
+    *,
    max_depth: int,
    filter_condition: Callable[[httpx.URL], bool] | None = None,
    suppress_exception: Callable[[Exception, httpx.URL], bool] | None = None,
+    follow_redirects: bool = False,
 ) -> dict[httpx.URL, set[httpx.URL]]:
    """Obtain all of the links from given url, working recursively until given max_depth.

@ -69,7 +87,7 @@ async def get_urlmap(

        # Include all found links in the result dict
        try:
-            page_links = await get_page_links(client, url)
+            page_links = await get_page_links(client, url, follow_redirects=follow_redirects)
        except Exception as exc:
            if suppress_exception is None or suppress_exception(exc, url) is False:
                raise exc from exc