Support following redirects

parent 9dfac02aab
commit bd5347c299
@@ -5,13 +5,29 @@ import httpx
 from bs4 import BeautifulSoup
 
 
-async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]:
+async def get_page_links(
+    client: httpx.AsyncClient,
+    url: httpx.URL,
+    *,
+    follow_redirects: bool = False,
+) -> set[httpx.URL]:
     """Scrape all links from given page (url).
 
     This function will also resolve relative URLs.
     Non http/https schemas will not be included.
     """
     res = await client.get(url)
+
+    if res.is_redirect and follow_redirects:
+        if not res.has_redirect_location:
+            raise httpx.HTTPStatusError(
+                f"Redirect response '{res.status_code} {res.reason_phrase}' "
+                f"for url '{res.url}' without Location header",
+                request=res.request,
+                response=res,
+            )
+        location = res.headers["Location"]
+        return await get_page_links(client, httpx.URL(urljoin(str(url), location)), follow_redirects=follow_redirects)
     res.raise_for_status()
     html = res.text
 
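Note that the manual handling above (rather than passing follow_redirects=True through to httpx itself) also resolves a relative Location header against the current URL before recursing. A minimal usage sketch of the new keyword follows; the module name crawler, the client setup, and the URL are illustrative assumptions, not part of this commit:

import asyncio

import httpx

from crawler import get_page_links  # hypothetical module name


async def main() -> None:
    async with httpx.AsyncClient() as client:
        # With follow_redirects=True, a 3xx response recurses into the
        # redirect target instead of failing in raise_for_status().
        links = await get_page_links(
            client,
            httpx.URL("https://example.com/old-page"),  # illustrative URL
            follow_redirects=True,
        )
        print(sorted(str(link) for link in links))


asyncio.run(main())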
@@ -27,9 +43,11 @@ async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]:
 async def get_urlmap(
     client: httpx.AsyncClient,
     start_url: httpx.URL,
+    *,
     max_depth: int,
     filter_condition: Callable[[httpx.URL], bool] | None = None,
     suppress_exception: Callable[[Exception, httpx.URL], bool] | None = None,
+    follow_redirects: bool = False,
 ) -> dict[httpx.URL, set[httpx.URL]]:
     """Obtain all of the links from given url, working recursively until given max_depth.
 
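The bare * added here makes max_depth and everything after it keyword-only, so a previously valid positional call needs updating; the call shape below is illustrative only:

# Before this commit: urlmap = await get_urlmap(client, start_url, 2)
# After this commit:
urlmap = await get_urlmap(client, start_url, max_depth=2, follow_redirects=True)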
@@ -69,7 +87,7 @@ async def get_urlmap(
 
     # Include all found links in the result dict
     try:
-        page_links = await get_page_links(client, url)
+        page_links = await get_page_links(client, url, follow_redirects=follow_redirects)
     except Exception as exc:
         if suppress_exception is None or suppress_exception(exc, url) is False:
             raise exc from exc
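An end-to-end sketch of the updated get_urlmap, which forwards the flag to get_page_links for every crawled page; the module name crawler and the start URL are assumptions for illustration, not part of this commit:

import asyncio

import httpx

from crawler import get_urlmap  # hypothetical module name


async def main() -> None:
    async with httpx.AsyncClient() as client:
        urlmap = await get_urlmap(
            client,
            httpx.URL("https://example.com"),  # illustrative start URL
            max_depth=2,
            follow_redirects=True,  # forwarded to get_page_links
        )
        for page, links in urlmap.items():
            print(page, "->", len(links), "links")


asyncio.run(main())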