Basic link scraper
parent b1e815e588
commit 47c9a9f555

@@ -11,6 +11,8 @@ dependencies = [
     "polars[all]>=1.9.0",
     "seaborn>=0.13.2",
     "rich>=13.9.2",
+    "httpx>=0.27.2",
+    "beautifulsoup4>=4.12.3",
 ]
 readme = "README.md"
 requires-python = ">= 3.12"
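
The two additions above, httpx and beautifulsoup4, are the scraper's only new direct dependencies: httpx performs the asynchronous HTTP requests and BeautifulSoup parses the fetched HTML. A minimal sketch (not part of this commit) of how the pair is typically combined, reusing the start URL from the entrypoint change further below:

import asyncio

import httpx
from bs4 import BeautifulSoup


async def show_title(url: str) -> None:
    # Fetch the page asynchronously, then parse the HTML it returned.
    async with httpx.AsyncClient() as client:
        res = await client.get(url)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, features="html.parser")
        print(soup.title.string if soup.title else "(no title)")


asyncio.run(show_title("https://ailab.fai.utb.cz"))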

@@ -19,6 +19,8 @@ altair==5.4.1
     # via polars
 annotated-types==0.7.0
     # via pydantic
+anyio==4.6.2.post1
+    # via httpx
 asttokens==2.4.1
     # via stack-data
 attrs==24.2.0
@@ -27,7 +29,11 @@ attrs==24.2.0
 babel==2.16.0
     # via great-tables
 basedpyright==1.18.0
+beautifulsoup4==4.12.3
+    # via task2
 certifi==2024.8.30
+    # via httpcore
+    # via httpx
     # via requests
 cfgv==3.4.0
     # via pre-commit
@@ -71,11 +77,19 @@ great-tables==0.13.0
 greenlet==3.1.1
     # via gevent
     # via sqlalchemy
+h11==0.14.0
+    # via httpcore
 htmltools==0.5.3
     # via great-tables
+httpcore==1.0.7
+    # via httpx
+httpx==0.27.2
+    # via task2
 identify==2.6.1
     # via pre-commit
 idna==3.10
+    # via anyio
+    # via httpx
     # via requests
 importlib-metadata==8.5.0
     # via great-tables
@@ -215,8 +229,13 @@ setuptools==75.1.0
 six==1.16.0
     # via asttokens
     # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+    # via httpx
 sortedcontainers==2.4.0
     # via pyiceberg
+soupsieve==2.6
+    # via beautifulsoup4
 sqlalchemy==2.0.35
     # via polars
 stack-data==0.6.3

@@ -19,12 +19,18 @@ altair==5.4.1
     # via polars
 annotated-types==0.7.0
     # via pydantic
+anyio==4.6.2.post1
+    # via httpx
 attrs==24.2.0
     # via jsonschema
     # via referencing
 babel==2.16.0
     # via great-tables
+beautifulsoup4==4.12.3
+    # via task2
 certifi==2024.8.30
+    # via httpcore
+    # via httpx
     # via requests
 charset-normalizer==3.4.0
     # via requests
@@ -58,9 +64,17 @@ great-tables==0.13.0
 greenlet==3.1.1
     # via gevent
     # via sqlalchemy
+h11==0.14.0
+    # via httpcore
 htmltools==0.5.3
     # via great-tables
+httpcore==1.0.7
+    # via httpx
+httpx==0.27.2
+    # via task2
 idna==3.10
+    # via anyio
+    # via httpx
     # via requests
 importlib-metadata==8.5.0
     # via great-tables
@@ -171,8 +185,13 @@ setuptools==75.1.0
     # via zope-interface
 six==1.16.0
     # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+    # via httpx
 sortedcontainers==2.4.0
     # via pyiceberg
+soupsieve==2.6
+    # via beautifulsoup4
 sqlalchemy==2.0.35
     # via polars
 strictyaml==1.7.3

@@ -1,6 +1,20 @@
-def main() -> None:
+import asyncio
+from pprint import pprint
+
+import httpx
+
+from src.link_scraper import get_urlmap
+
+URL = httpx.URL("https://ailab.fai.utb.cz")
+
+
+async def main() -> None:
     """Program entrypoint."""
+    async with httpx.AsyncClient() as client:
+        url_map = await get_urlmap(client, URL, max_depth=2, filter_condition=lambda url: url.host == URL.host)
+
+        pprint(url_map)


 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
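
The filter_condition lambda above keeps the crawl on the starting host, which is exactly the use case called out in the get_urlmap docstring below. Any predicate over httpx.URL works here; a hedged sketch of a stricter variant (the /docs/ path prefix is purely illustrative):

import httpx

START = httpx.URL("https://ailab.fai.utb.cz")


def same_host_docs_only(url: httpx.URL) -> bool:
    # Stay on the starting host and (illustratively) only follow /docs/ pages.
    return url.host == START.host and url.path.startswith("/docs/")

Passing filter_condition=same_host_docs_only to get_urlmap would then skip every other link when deciding what to scrape next, while the skipped links still appear in the returned map.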

src/link_scraper.py (new file, 76 lines)
@@ -0,0 +1,76 @@
+from collections.abc import Callable
+from urllib.parse import urljoin
+
+import httpx
+from bs4 import BeautifulSoup
+
+
+async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]:
+    """Scrape all links from given page (url).
+
+    This function will also resolve relative URLs.
+    Non http/https schemas will not be included.
+    """
+    res = await client.get(url)
+    res.raise_for_status()
+    html = res.text
+
+    soup = BeautifulSoup(html, features="html.parser")
+    anchors = soup.find_all("a")
+    hrefs = [anchor.get("href") for anchor in anchors if anchor.get("href") is not None]
+    # Handle relative links (e.g. home, /home, #home)
+    links = [httpx.URL(urljoin(str(url), href)) for href in hrefs]
+    # Ignore mailto:, ftp:, ... schemes
+    return {link for link in links if link.scheme in {"http", "https"}}
+
+
+async def get_urlmap(
+    client: httpx.AsyncClient,
+    start_url: httpx.URL,
+    max_depth: int,
+    filter_condition: Callable[[httpx.URL], bool] | None = None,
+) -> dict[httpx.URL, set[httpx.URL]]:
+    """Obtain all of the links from given url, working recursively until given max_depth.
+
+    :param client: Asynchronous HTTPX client used to make the requests.
+    :param start_url: URL to begin the link scraping process from.
+    :param max_depth:
+        How many recursive iterations to perform:
+
+        - 0: Only get links on the start URL.
+        - 1: Get links from the start URL alongside all links from the URLs found from the start url.
+        - 2: Links from links from links from start_url.
+    :param filter_condition:
+        By default, all found URLs will be scraped in the next iteration, this function can be used
+        to apply an excluding condition, skipping those URLs for which this function returns False.
+        Note that the filtered URLs will still be included in the resulting url map.
+
+        A common use-case is to limit scraping only to URLs with the same hostname as the start url
+        (to avoid scraping large sites, like facebook).
+
+    :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
+    """
+    url_map: dict[httpx.URL, set[httpx.URL]] = {}
+    urls: set[httpx.URL] = {start_url}
+    traversed: set[httpx.URL] = set()
+    depth = 0
+
+    while len(urls) > 0:
+        url = urls.pop()
+
+        if url in traversed:
+            continue
+        traversed.add(url)
+
+        # Include all found links in the result dict
+        page_links = await get_page_links(client, url)
+        url_map.setdefault(url, set()).update(page_links)
+
+        # Update the list of URLs to scrape next, filtering unwanted ones.
+        urls.update(filter(filter_condition, page_links))
+
+        if depth > max_depth:
+            break
+        depth += 1
+
+    return url_map
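
A short usage sketch of the new module (assuming the same src.link_scraper import path used by the entrypoint above); it exercises both helpers, with the crawl restricted to the starting host as the docstring suggests:

import asyncio

import httpx

from src.link_scraper import get_page_links, get_urlmap


async def demo() -> None:
    start = httpx.URL("https://ailab.fai.utb.cz")
    async with httpx.AsyncClient() as client:
        # Single page: just the outgoing http/https links of the start URL.
        links = await get_page_links(client, start)
        print(f"{len(links)} links found on the start page")

        # Bounded crawl restricted to the starting host; links filtered out
        # here are still recorded as values in the returned mapping.
        url_map = await get_urlmap(
            client,
            start,
            max_depth=1,
            filter_condition=lambda url: url.host == start.host,
        )
        for page, found in url_map.items():
            print(page, "->", len(found), "links")


asyncio.run(demo())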