Don't download non-html content

parent fd563ef46c
commit ea03f0cf75
@@ -10,6 +10,15 @@ from rich.style import StyleType
 from rich.text import Text
 
 
+class NonHtmlContentError(httpx.HTTPError):
+    """Raised when the site's Content-Type header indicates non-HTML content."""
+
+    def __init__(self, message: str, *, request: httpx.Request, response: httpx.Response) -> None:
+        super().__init__(message)
+        self.request = request
+        self.response = response
+
+
 async def get_page_links(
     client: httpx.AsyncClient,
     url: httpx.URL,
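
Since NonHtmlContentError subclasses httpx.HTTPError, existing broad handlers keep catching it, while the attached request and response let callers inspect the offending header. A minimal sketch of that behaviour (the sample request and URL here are made up):

import httpx

# Hypothetical demonstration; NonHtmlContentError is the class added above.
req = httpx.Request("GET", "https://example.com/file.pdf")
resp = httpx.Response(200, headers={"Content-Type": "application/pdf"}, request=req)
try:
    raise NonHtmlContentError("The site content type isn't HTML", request=req, response=resp)
except httpx.HTTPError as exc:
    # Prints: NonHtmlContentError application/pdf
    print(type(exc).__name__, exc.response.headers["Content-Type"])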
@@ -21,20 +30,30 @@ async def get_page_links(
     This function will also resolve relative URLs.
     Non http/https schemas will not be included.
     """
-    res = await client.get(url)
-    if res.is_redirect and follow_redirects:
-        if not res.has_redirect_location:
-            raise httpx.HTTPStatusError(
-                f"Redirect response '{res.status_code} {res.reason_phrase}' "
-                "for url '{res.url} without Location header",
-                request=res.request,
-                response=res,
-            )
-        location = res.headers["Location"]
-        return await get_page_links(client, httpx.URL(urljoin(str(url), location)), follow_redirects=follow_redirects)
+    async with client.stream("GET", url) as res:
+        if res.is_redirect and follow_redirects:
+            if not res.has_redirect_location:
+                raise httpx.HTTPStatusError(
+                    f"Redirect response '{res.status_code} {res.reason_phrase}' "
+                    "for url '{res.url} without Location header",
+                    request=res.request,
+                    response=res,
+                )
+            location = res.headers["Location"]
+            return await get_page_links(
+                client, httpx.URL(urljoin(str(url), location)), follow_redirects=follow_redirects
+            )
 
-    res.raise_for_status()
-    html = res.text
+        res.raise_for_status()
+
+        # Make sure that we're getting back HTML content
+        content_type = res.headers.get("Content-Type", "")
+        if not content_type.startswith("text/html"):
+            raise NonHtmlContentError("The site content type isn't HTML", request=res.request, response=res)
+
+        # Only read the rest of the data here, this prevents pulling large non-HTML files
+        await res.aread()
+        html = res.text
 
     soup = BeautifulSoup(html, features="html.parser")
     anchors = soup.find_all("a")
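
The key point of the switch from client.get() to client.stream() is that httpx hands the response back after reading only the headers; the body is not transferred until it is explicitly read. A standalone sketch of the same pattern, assuming a placeholder URL:

import asyncio
import httpx

async def fetch_html_only(url: str) -> str:
    async with httpx.AsyncClient() as client:
        async with client.stream("GET", url) as res:
            res.raise_for_status()
            # Headers are already available here, but no body bytes have been pulled.
            if not res.headers.get("Content-Type", "").startswith("text/html"):
                raise ValueError(f"not HTML: {res.headers.get('Content-Type', '')!r}")
            await res.aread()  # the body is only downloaded at this point
            return res.text

# asyncio.run(fetch_html_only("https://example.com/"))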
@@ -63,12 +82,23 @@ def standard_urlmap_exception_suppressor(exc: Exception, url: httpx.URL) -> bool
         print_exc("Got ", (f"code {exc.response.status_code}", "red"))
         return True
 
+    if isinstance(exc, NonHtmlContentError):
+        print_exc(
+            "Got ",
+            ("Non-HTML Content-Type Header", "red"),
+            ", (",
+            (str(exc.response.headers.get("Content-Type", "")), "orange"),
+            ")",
+        )
+        return True
+
     if isinstance(exc, httpx.TransportError):
         print_exc("Got ", (exc.__class__.__qualname__, "red"), ", (", (str(exc), "orange"), ")")
         return True
 
     if isinstance(exc, ParserRejectedMarkup):
         print_exc("Parsing failure: ", ("Invalid HTML", "red"))
+        return True
 
     return False
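
For context, the suppressor's contract is: return True when the exception was recognized and reported, so the caller can swallow it and move on; return False to let it propagate. A hedged sketch of how a crawl loop presumably drives it (try_get_links is a hypothetical helper, and the list return type is assumed):

import httpx

async def try_get_links(client: httpx.AsyncClient, url: httpx.URL) -> list:
    try:
        return await get_page_links(client, url)
    except Exception as exc:
        if standard_urlmap_exception_suppressor(exc, url):
            return []  # failure was reported; treat the page as having no links
        raise  # unrecognized failure, let it bubble up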