Support following redirects

Peter Vacho 2024-11-19 20:26:15 +01:00
parent 9dfac02aab
commit bd5347c299
Signed by: school
GPG key ID: 8CFC3837052871B4


@@ -5,13 +5,29 @@ import httpx
 from bs4 import BeautifulSoup
 
 
-async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]:
+async def get_page_links(
+    client: httpx.AsyncClient,
+    url: httpx.URL,
+    *,
+    follow_redirects: bool = False,
+) -> set[httpx.URL]:
     """Scrape all links from given page (url).
 
     This function will also resolve relative URLs.
     Non http/https schemes will not be included.
     """
     res = await client.get(url)
+    if res.is_redirect and follow_redirects:
+        if not res.has_redirect_location:
+            raise httpx.HTTPStatusError(
+                f"Redirect response '{res.status_code} {res.reason_phrase}' "
+                f"for url '{res.url}' without Location header",
+                request=res.request,
+                response=res,
+            )
+        location = res.headers["Location"]
+        return await get_page_links(client, httpx.URL(urljoin(str(url), location)), follow_redirects=follow_redirects)
+
     res.raise_for_status()
     html = res.text
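For context, a minimal usage sketch of the new parameter follows. It assumes the patched module is importable as `scraper` (the actual file path isn't shown in this diff); everything else follows the signatures above. Worth noting: httpx clients also accept a built-in `follow_redirects` flag, so resolving the `Location` header by hand here is a deliberate choice that keeps every hop going back through `get_page_links` itself.

```python
import asyncio

import httpx

# Hypothetical import: the module name "scraper" is an assumption,
# since the diff does not show the file path.
from scraper import get_page_links


async def main() -> None:
    async with httpx.AsyncClient() as client:
        # With follow_redirects=True, a 3xx response is resolved via its
        # Location header (relative URLs included) before links are scraped.
        links = await get_page_links(
            client,
            httpx.URL("https://example.com"),
            follow_redirects=True,
        )
        for link in sorted(links, key=str):
            print(link)


asyncio.run(main())
```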
@@ -27,9 +43,11 @@ async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]:
 async def get_urlmap(
     client: httpx.AsyncClient,
     start_url: httpx.URL,
     *,
     max_depth: int,
     filter_condition: Callable[[httpx.URL], bool] | None = None,
     suppress_exception: Callable[[Exception, httpx.URL], bool] | None = None,
+    follow_redirects: bool = False,
 ) -> dict[httpx.URL, set[httpx.URL]]:
     """Obtain all of the links from given url, working recursively until given max_depth.
@@ -69,7 +87,7 @@ async def get_urlmap(
         # Include all found links in the result dict
         try:
-            page_links = await get_page_links(client, url)
+            page_links = await get_page_links(client, url, follow_redirects=follow_redirects)
         except Exception as exc:
             if suppress_exception is None or suppress_exception(exc, url) is False:
                 raise exc from exc
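And a sketch of how the flag threads through `get_urlmap`, again assuming the module is importable as `scraper`. The `ignore_http_errors` callback is a hypothetical example of `suppress_exception`; per the diff above, the exception is suppressed unless the callback returns `False`.

```python
import asyncio

import httpx

# Hypothetical import: the module name "scraper" is an assumption.
from scraper import get_urlmap


def ignore_http_errors(exc: Exception, url: httpx.URL) -> bool:
    # Returning True suppresses the exception for this URL;
    # returning False makes get_urlmap re-raise it.
    return isinstance(exc, httpx.HTTPError)


async def main() -> None:
    async with httpx.AsyncClient() as client:
        urlmap = await get_urlmap(
            client,
            httpx.URL("https://example.com"),
            max_depth=2,
            suppress_exception=ignore_http_errors,
            follow_redirects=True,
        )
        for page, links in urlmap.items():
            print(f"{page} -> {len(links)} links")


asyncio.run(main())
```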