Don't download non-HTML content

Peter Vacho 2024-11-25 10:54:21 +01:00
parent fd563ef46c
commit ea03f0cf75
Signed by: school
GPG key ID: 8CFC3837052871B4


@@ -10,6 +10,15 @@ from rich.style import StyleType
 from rich.text import Text
+class NonHtmlContentError(httpx.HTTPError):
+    """Raised when the site's Content-Type header indicates non-HTML content."""
+
+    def __init__(self, message: str, *, request: httpx.Request, response: httpx.Response) -> None:
+        super().__init__(message)
+        self.request = request
+        self.response = response
 async def get_page_links(
     client: httpx.AsyncClient,
     url: httpx.URL,
@@ -21,20 +30,30 @@ async def get_page_links(
     This function will also resolve relative URLs.
     Non-http/https schemes will not be included.
     """
-    res = await client.get(url)
-    if res.is_redirect and follow_redirects:
-        if not res.has_redirect_location:
-            raise httpx.HTTPStatusError(
-                f"Redirect response '{res.status_code} {res.reason_phrase}' "
-                f"for url '{res.url}' without Location header",
-                request=res.request,
-                response=res,
-            )
-        location = res.headers["Location"]
-        return await get_page_links(client, httpx.URL(urljoin(str(url), location)), follow_redirects=follow_redirects)
-    res.raise_for_status()
-    html = res.text
+    async with client.stream("GET", url) as res:
+        if res.is_redirect and follow_redirects:
+            if not res.has_redirect_location:
+                raise httpx.HTTPStatusError(
+                    f"Redirect response '{res.status_code} {res.reason_phrase}' "
+                    f"for url '{res.url}' without Location header",
+                    request=res.request,
+                    response=res,
+                )
+            location = res.headers["Location"]
+            return await get_page_links(
+                client, httpx.URL(urljoin(str(url), location)), follow_redirects=follow_redirects
+            )
+        res.raise_for_status()
+        # Make sure that we're getting back HTML content
+        content_type = res.headers.get("Content-Type", "")
+        if not content_type.startswith("text/html"):
+            raise NonHtmlContentError("The site content type isn't HTML", request=res.request, response=res)
+        # Only read the rest of the data here; this prevents pulling large non-HTML files
+        await res.aread()
+        html = res.text
     soup = BeautifulSoup(html, features="html.parser")
     anchors = soup.find_all("a")
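
The point of switching to client.stream() is that the response headers arrive before any of the body is transferred, so the Content-Type check can reject non-HTML URLs without downloading them; the body is only pulled once aread() is called. A minimal standalone sketch of the same pattern, assuming only httpx (the fetch_html_only name and the example URL are illustrative, not part of this repository):

    # Sketch of the streaming pattern used in the change above.
    import asyncio

    import httpx

    async def fetch_html_only(url: str) -> str | None:
        async with httpx.AsyncClient(follow_redirects=True) as client:
            async with client.stream("GET", url) as res:
                res.raise_for_status()
                # Headers are already available here, but the body has not been read yet.
                if not res.headers.get("Content-Type", "").startswith("text/html"):
                    return None  # skip binaries, PDFs, images, ...
                await res.aread()  # explicitly pull the body now
                return res.text

    print(asyncio.run(fetch_html_only("https://example.com")))
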
@@ -63,12 +82,23 @@ def standard_urlmap_exception_suppressor(exc: Exception, url: httpx.URL) -> bool
         print_exc("Got ", (f"code {exc.response.status_code}", "red"))
         return True
+    if isinstance(exc, NonHtmlContentError):
+        print_exc(
+            "Got ",
+            ("Non-HTML Content-Type Header", "red"),
+            ", (",
+            (str(exc.response.headers.get("Content-Type", "")), "orange"),
+            ")",
+        )
+        return True
     if isinstance(exc, httpx.TransportError):
         print_exc("Got ", (exc.__class__.__qualname__, "red"), ", (", (str(exc), "orange"), ")")
         return True
     if isinstance(exc, ParserRejectedMarkup):
         print_exc("Parsing failure: ", ("Invalid HTML", "red"))
         return True
     return False
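
standard_urlmap_exception_suppressor returns True when an exception was recognized and already reported via print_exc, and False when it should keep propagating. A hypothetical call site illustrating that contract (the visit helper and its return value are not from this commit, and the real return type of get_page_links is not shown in this diff):

    # Hypothetical wiring; assumes the names from the patched module are importable.
    async def visit(client: httpx.AsyncClient, url: httpx.URL):
        try:
            return await get_page_links(client, url, follow_redirects=True)
        except Exception as exc:
            if standard_urlmap_exception_suppressor(exc, url):
                return []  # treat as "no links found"; the failure was already printed
            raise  # unrecognized exceptions propagate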