From 47c9a9f55546385b0d7ab7204d6133816ad05b06 Mon Sep 17 00:00:00 2001
From: Peter Vacho
Date: Tue, 19 Nov 2024 19:51:44 +0100
Subject: [PATCH] Basic link scraper

---
 pyproject.toml        |  2 ++
 requirements-dev.lock | 19 +++++++++++
 requirements.lock     | 19 +++++++++++
 src/__main__.py       | 18 ++++++++--
 src/link_scraper.py   | 76 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 132 insertions(+), 2 deletions(-)
 create mode 100644 src/link_scraper.py

diff --git a/pyproject.toml b/pyproject.toml
index 79c190f..e42aa9e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,8 @@ dependencies = [
     "polars[all]>=1.9.0",
     "seaborn>=0.13.2",
     "rich>=13.9.2",
+    "httpx>=0.27.2",
+    "beautifulsoup4>=4.12.3",
 ]
 readme = "README.md"
 requires-python = ">= 3.12"
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 603bead..2005ece 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -19,6 +19,8 @@ altair==5.4.1
     # via polars
 annotated-types==0.7.0
     # via pydantic
+anyio==4.6.2.post1
+    # via httpx
 asttokens==2.4.1
     # via stack-data
 attrs==24.2.0
@@ -27,7 +29,11 @@ attrs==24.2.0
 babel==2.16.0
     # via great-tables
 basedpyright==1.18.0
+beautifulsoup4==4.12.3
+    # via task2
 certifi==2024.8.30
+    # via httpcore
+    # via httpx
     # via requests
 cfgv==3.4.0
     # via pre-commit
@@ -71,11 +77,19 @@ great-tables==0.13.0
 greenlet==3.1.1
     # via gevent
     # via sqlalchemy
+h11==0.14.0
+    # via httpcore
 htmltools==0.5.3
     # via great-tables
+httpcore==1.0.7
+    # via httpx
+httpx==0.27.2
+    # via task2
 identify==2.6.1
     # via pre-commit
 idna==3.10
+    # via anyio
+    # via httpx
     # via requests
 importlib-metadata==8.5.0
     # via great-tables
@@ -215,8 +229,13 @@ setuptools==75.1.0
 six==1.16.0
     # via asttokens
     # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+    # via httpx
 sortedcontainers==2.4.0
     # via pyiceberg
+soupsieve==2.6
+    # via beautifulsoup4
 sqlalchemy==2.0.35
     # via polars
 stack-data==0.6.3
diff --git a/requirements.lock b/requirements.lock
index c968a3b..6344d3f 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -19,12 +19,18 @@ altair==5.4.1
     # via polars
 annotated-types==0.7.0
     # via pydantic
+anyio==4.6.2.post1
+    # via httpx
 attrs==24.2.0
     # via jsonschema
     # via referencing
 babel==2.16.0
     # via great-tables
+beautifulsoup4==4.12.3
+    # via task2
 certifi==2024.8.30
+    # via httpcore
+    # via httpx
     # via requests
 charset-normalizer==3.4.0
     # via requests
@@ -58,9 +64,17 @@ great-tables==0.13.0
 greenlet==3.1.1
     # via gevent
     # via sqlalchemy
+h11==0.14.0
+    # via httpcore
 htmltools==0.5.3
     # via great-tables
+httpcore==1.0.7
+    # via httpx
+httpx==0.27.2
+    # via task2
 idna==3.10
+    # via anyio
+    # via httpx
     # via requests
 importlib-metadata==8.5.0
     # via great-tables
@@ -171,8 +185,13 @@ setuptools==75.1.0
     # via zope-interface
 six==1.16.0
     # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+    # via httpx
 sortedcontainers==2.4.0
     # via pyiceberg
+soupsieve==2.6
+    # via beautifulsoup4
 sqlalchemy==2.0.35
     # via polars
 strictyaml==1.7.3
diff --git a/src/__main__.py b/src/__main__.py
index 23fb5a3..0637324 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -1,6 +1,20 @@
-def main() -> None:
+import asyncio
+from pprint import pprint
+
+import httpx
+
+from src.link_scraper import get_urlmap
+
+URL = httpx.URL("https://ailab.fai.utb.cz")
+
+
+async def main() -> None:
     """Program entrypoint."""
+    async with httpx.AsyncClient() as client:
+        url_map = await get_urlmap(client, URL, max_depth=2, filter_condition=lambda url: url.host == URL.host)
+
+    pprint(url_map)
 
 
 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
diff --git a/src/link_scraper.py b/src/link_scraper.py
new file mode 100644
index 0000000..06aaf87
--- /dev/null
+++ b/src/link_scraper.py
@@ -0,0 +1,76 @@
+from collections.abc import Callable
+from urllib.parse import urljoin
+
+import httpx
+from bs4 import BeautifulSoup
+
+
+async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]:
+    """Scrape all links from the given page (url).
+
+    This function will also resolve relative URLs.
+    Non-http/https schemes will not be included.
+    """
+    res = await client.get(url)
+    res.raise_for_status()
+    html = res.text
+
+    soup = BeautifulSoup(html, features="html.parser")
+    anchors = soup.find_all("a")
+    hrefs = [anchor.get("href") for anchor in anchors if anchor.get("href") is not None]
+    # Handle relative links (e.g. home, /home, #home)
+    links = [httpx.URL(urljoin(str(url), href)) for href in hrefs]
+    # Ignore mailto:, ftp:, ... schemes
+    return {link for link in links if link.scheme in {"http", "https"}}
+
+
+async def get_urlmap(
+    client: httpx.AsyncClient,
+    start_url: httpx.URL,
+    max_depth: int,
+    filter_condition: Callable[[httpx.URL], bool] | None = None,
+) -> dict[httpx.URL, set[httpx.URL]]:
+    """Obtain all of the links from the given URL, following found links recursively up to max_depth.
+
+    :param client: Asynchronous HTTPX client used to make the requests.
+    :param start_url: URL to begin the link scraping process from.
+    :param max_depth:
+        How many recursive iterations to perform:
+
+        - 0: Only get links on the start URL.
+        - 1: Also get links from every URL found on the start URL.
+        - 2: Follow the links found in iteration 1 as well, and so on for higher values.
+    :param filter_condition:
+        By default, all found URLs will be scraped in the next iteration; this function can be used
+        to apply an excluding condition, skipping any URL for which it returns False.
+        Note that filtered-out URLs will still be included in the resulting url map.
+
+        A common use-case is to limit scraping to URLs with the same hostname as the start URL
+        (to avoid crawling large external sites, like Facebook).
+
+    :return: A dictionary mapping each scraped URL to the links found on that URL (url -> set(links)).
+    """
+    url_map: dict[httpx.URL, set[httpx.URL]] = {}
+    urls: set[httpx.URL] = {start_url}
+    traversed: set[httpx.URL] = set()
+    depth = 0
+
+    while len(urls) > 0:
+        url = urls.pop()
+
+        if url in traversed:
+            continue
+        traversed.add(url)
+
+        # Include all found links in the result dict
+        page_links = await get_page_links(client, url)
+        url_map.setdefault(url, set()).update(page_links)
+
+        # Update the set of URLs to scrape next, filtering unwanted ones.
+        urls.update(filter(filter_condition, page_links))
+
+        if depth > max_depth:
+            break
+        depth += 1
+
+    return url_map
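
Usage note (not part of the diff above): the sketch below shows one way the new get_urlmap helper might be driven from a standalone script, mirroring what src/__main__.py does but against a hypothetical start URL. The START constant, the scrape coroutine name, and the follow_redirects/timeout client settings are illustrative assumptions, not anything this patch adds.

    import asyncio

    import httpx

    from src.link_scraper import get_urlmap

    START = httpx.URL("https://example.com")  # hypothetical start URL, not from the patch


    async def scrape() -> None:
        # follow_redirects/timeout are illustrative client settings, not required by get_urlmap.
        async with httpx.AsyncClient(follow_redirects=True, timeout=10) as client:
            # Only URLs on the starting host get scraped further; external links still
            # appear as values in the returned mapping.
            url_map = await get_urlmap(
                client,
                START,
                max_depth=1,
                filter_condition=lambda url: url.host == START.host,
            )
        for page, links in url_map.items():
            print(f"{page} -> {len(links)} links")


    if __name__ == "__main__":
        asyncio.run(scrape())

Leaving filter_condition as None queues every discovered link for scraping, so keeping max_depth small is advisable when crawling beyond a single host.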