From 47c9a9f55546385b0d7ab7204d6133816ad05b06 Mon Sep 17 00:00:00 2001
From: Peter Vacho
Date: Tue, 19 Nov 2024 19:51:44 +0100
Subject: [PATCH] Basic link scraper

---
 pyproject.toml        |  2 ++
 requirements-dev.lock | 19 +++++++++++
 requirements.lock     | 19 +++++++++++
 src/__main__.py       | 18 ++++++++--
 src/link_scraper.py   | 76 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 132 insertions(+), 2 deletions(-)
 create mode 100644 src/link_scraper.py

diff --git a/pyproject.toml b/pyproject.toml
index 79c190f..e42aa9e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,8 @@ dependencies = [
     "polars[all]>=1.9.0",
     "seaborn>=0.13.2",
     "rich>=13.9.2",
+    "httpx>=0.27.2",
+    "beautifulsoup4>=4.12.3",
 ]
 readme = "README.md"
 requires-python = ">= 3.12"
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 603bead..2005ece 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -19,6 +19,8 @@ altair==5.4.1
     # via polars
 annotated-types==0.7.0
     # via pydantic
+anyio==4.6.2.post1
+    # via httpx
 asttokens==2.4.1
     # via stack-data
 attrs==24.2.0
@@ -27,7 +29,11 @@ attrs==24.2.0
 babel==2.16.0
     # via great-tables
 basedpyright==1.18.0
+beautifulsoup4==4.12.3
+    # via task2
 certifi==2024.8.30
+    # via httpcore
+    # via httpx
     # via requests
 cfgv==3.4.0
     # via pre-commit
@@ -71,11 +77,19 @@ great-tables==0.13.0
 greenlet==3.1.1
     # via gevent
     # via sqlalchemy
+h11==0.14.0
+    # via httpcore
 htmltools==0.5.3
     # via great-tables
+httpcore==1.0.7
+    # via httpx
+httpx==0.27.2
+    # via task2
 identify==2.6.1
     # via pre-commit
 idna==3.10
+    # via anyio
+    # via httpx
     # via requests
 importlib-metadata==8.5.0
     # via great-tables
@@ -215,8 +229,13 @@ setuptools==75.1.0
 six==1.16.0
     # via asttokens
     # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+    # via httpx
 sortedcontainers==2.4.0
     # via pyiceberg
+soupsieve==2.6
+    # via beautifulsoup4
 sqlalchemy==2.0.35
     # via polars
 stack-data==0.6.3
diff --git a/requirements.lock b/requirements.lock
index c968a3b..6344d3f 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -19,12 +19,18 @@ altair==5.4.1
     # via polars
 annotated-types==0.7.0
     # via pydantic
+anyio==4.6.2.post1
+    # via httpx
 attrs==24.2.0
     # via jsonschema
     # via referencing
 babel==2.16.0
     # via great-tables
+beautifulsoup4==4.12.3
+    # via task2
 certifi==2024.8.30
+    # via httpcore
+    # via httpx
     # via requests
 charset-normalizer==3.4.0
     # via requests
@@ -58,9 +64,17 @@ great-tables==0.13.0
 greenlet==3.1.1
     # via gevent
     # via sqlalchemy
+h11==0.14.0
+    # via httpcore
 htmltools==0.5.3
     # via great-tables
+httpcore==1.0.7
+    # via httpx
+httpx==0.27.2
+    # via task2
 idna==3.10
+    # via anyio
+    # via httpx
     # via requests
 importlib-metadata==8.5.0
     # via great-tables
@@ -171,8 +185,13 @@ setuptools==75.1.0
     # via zope-interface
 six==1.16.0
     # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+    # via httpx
 sortedcontainers==2.4.0
     # via pyiceberg
+soupsieve==2.6
+    # via beautifulsoup4
 sqlalchemy==2.0.35
     # via polars
 strictyaml==1.7.3
diff --git a/src/__main__.py b/src/__main__.py
index 23fb5a3..0637324 100644
--- a/src/__main__.py
+++ b/src/__main__.py
@@ -1,6 +1,20 @@
-def main() -> None:
+import asyncio
+from pprint import pprint
+
+import httpx
+
+from src.link_scraper import get_urlmap
+
+URL = httpx.URL("https://ailab.fai.utb.cz")
+
+
+async def main() -> None:
     """Program entrypoint."""
+    async with httpx.AsyncClient() as client:
+        url_map = await get_urlmap(client, URL, max_depth=2, filter_condition=lambda url: url.host == URL.host)
+
+    pprint(url_map)
 
 
 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
diff --git a/src/link_scraper.py b/src/link_scraper.py
new file mode 100644
index 0000000..06aaf87
--- /dev/null
+++ b/src/link_scraper.py
@@ -0,0 +1,76 @@
+from collections.abc import Callable
+from urllib.parse import urljoin
+
+import httpx
+from bs4 import BeautifulSoup
+
+
+async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]:
+    """Scrape all links from the given page (url).
+
+    This function will also resolve relative URLs.
+    Non-http/https schemes will not be included.
+    """
+    res = await client.get(url)
+    res.raise_for_status()
+    html = res.text
+
+    soup = BeautifulSoup(html, features="html.parser")
+    anchors = soup.find_all("a")
+    hrefs = [anchor.get("href") for anchor in anchors if anchor.get("href") is not None]
+    # Handle relative links (e.g. home, /home, #home)
+    links = [httpx.URL(urljoin(str(url), href)) for href in hrefs]
+    # Ignore mailto:, ftp:, ... schemes
+    return {link for link in links if link.scheme in {"http", "https"}}
+
+
+async def get_urlmap(
+    client: httpx.AsyncClient,
+    start_url: httpx.URL,
+    max_depth: int,
+    filter_condition: Callable[[httpx.URL], bool] | None = None,
+) -> dict[httpx.URL, set[httpx.URL]]:
+    """Obtain all of the links from the given URL, following found links recursively up to max_depth.
+
+    :param client: Asynchronous HTTPX client used to make the requests.
+    :param start_url: URL to begin the link scraping process from.
+    :param max_depth:
+        How many recursive iterations to perform:
+
+        - 0: Only get links on the start URL.
+        - 1: Also get links from every URL found on the start URL.
+        - 2: Follow the links found in iteration 1 as well, and so on for higher values.
+    :param filter_condition:
+        By default, all found URLs will be scraped in the next iteration; this function can be used
+        to apply an excluding condition, skipping any URL for which it returns False.
+        Note that filtered-out URLs will still be included in the resulting url map.
+
+        A common use-case is to limit scraping to URLs with the same hostname as the start URL
+        (to avoid crawling large external sites, like Facebook).
+
+    :return: A dictionary mapping each scraped URL to the links found on that URL (url -> set(links)).
+    """
+    url_map: dict[httpx.URL, set[httpx.URL]] = {}
+    urls: set[httpx.URL] = {start_url}
+    traversed: set[httpx.URL] = set()
+    depth = 0
+
+    while len(urls) > 0:
+        url = urls.pop()
+
+        if url in traversed:
+            continue
+        traversed.add(url)
+
+        # Include all found links in the result dict
+        page_links = await get_page_links(client, url)
+        url_map.setdefault(url, set()).update(page_links)
+
+        # Update the set of URLs to scrape next, filtering unwanted ones.
+        urls.update(filter(filter_condition, page_links))
+
+        if depth > max_depth:
+            break
+        depth += 1
+
+    return url_map
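
Usage note (not part of the diff above): the sketch below shows one way the new get_urlmap helper might be driven from a standalone script, mirroring what src/__main__.py does but against a hypothetical start URL. The START constant, the scrape coroutine name, and the follow_redirects/timeout client settings are illustrative assumptions, not anything this patch adds.

    import asyncio

    import httpx

    from src.link_scraper import get_urlmap

    START = httpx.URL("https://example.com")  # hypothetical start URL, not from the patch


    async def scrape() -> None:
        # follow_redirects/timeout are illustrative client settings, not required by get_urlmap.
        async with httpx.AsyncClient(follow_redirects=True, timeout=10) as client:
            # Only URLs on the starting host get scraped further; external links still
            # appear as values in the returned mapping.
            url_map = await get_urlmap(
                client,
                START,
                max_depth=1,
                filter_condition=lambda url: url.host == START.host,
            )
        for page, links in url_map.items():
            print(f"{page} -> {len(links)} links")


    if __name__ == "__main__":
        asyncio.run(scrape())

Leaving filter_condition as None queues every discovered link for scraping, so keeping max_depth small is advisable when crawling beyond a single host.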