Support following redirects
This commit is contained in:
parent
9dfac02aab
commit
bd5347c299
|
@ -5,13 +5,29 @@ import httpx
|
|||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]:
|
||||
async def get_page_links(
|
||||
client: httpx.AsyncClient,
|
||||
url: httpx.URL,
|
||||
*,
|
||||
follow_redirects: bool = False,
|
||||
) -> set[httpx.URL]:
|
||||
"""Scrape all links from given page (url).
|
||||
|
||||
This function will also resolve relative URLs.
|
||||
Non http/https schemes will not be included.
|
||||
"""
|
||||
res = await client.get(url)
|
||||
if res.is_redirect and follow_redirects:
|
||||
if not res.has_redirect_location:
|
||||
raise httpx.HTTPStatusError(
|
||||
f"Redirect response '{res.status_code} {res.reason_phrase}' "
|
||||
"for url '{res.url} without Location header",
|
||||
request=res.request,
|
||||
response=res,
|
||||
)
|
||||
location = res.headers["Location"]
|
||||
return await get_page_links(client, httpx.URL(urljoin(str(url), location)), follow_redirects=follow_redirects)
|
||||
|
||||
res.raise_for_status()
|
||||
html = res.text
|
||||
|
||||
|
@ -27,9 +43,11 @@ async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx
|
|||
async def get_urlmap(
|
||||
client: httpx.AsyncClient,
|
||||
start_url: httpx.URL,
|
||||
*,
|
||||
max_depth: int,
|
||||
filter_condition: Callable[[httpx.URL], bool] | None = None,
|
||||
suppress_exception: Callable[[Exception, httpx.URL], bool] | None = None,
|
||||
follow_redirects: bool = False,
|
||||
) -> dict[httpx.URL, set[httpx.URL]]:
|
||||
"""Obtain all of the links from given url, working recursively until given max_depth.
|
||||
|
||||
|
@ -69,7 +87,7 @@ async def get_urlmap(
|
|||
|
||||
# Include all found links in the result dict
|
||||
try:
|
||||
page_links = await get_page_links(client, url)
|
||||
page_links = await get_page_links(client, url, follow_redirects=follow_redirects)
|
||||
except Exception as exc:
|
||||
if suppress_exception is None or suppress_exception(exc, url) is False:
|
||||
raise exc from exc
|
||||
|
|
Loading…
Reference in a new issue