Basic link scraper

Peter Vacho 2024-11-19 19:51:44 +01:00
parent b1e815e588
commit 47c9a9f555
Signed by: school
GPG key ID: 8CFC3837052871B4
5 changed files with 132 additions and 2 deletions

pyproject.toml

@@ -11,6 +11,8 @@ dependencies = [
    "polars[all]>=1.9.0",
    "seaborn>=0.13.2",
    "rich>=13.9.2",
    "httpx>=0.27.2",
    "beautifulsoup4>=4.12.3",
]
readme = "README.md"
requires-python = ">= 3.12"
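The two new dependencies split the scraping work: httpx performs the (async) HTTP requests and beautifulsoup4 parses the returned HTML for anchor tags. A minimal sketch of how the pair is typically combined (the helper name below is illustrative, not part of the commit):

import asyncio

import httpx
from bs4 import BeautifulSoup


async def print_links(page: str) -> None:
    # Illustrative only: fetch one page and print every href found on it.
    async with httpx.AsyncClient() as client:
        res = await client.get(page)
        res.raise_for_status()
    soup = BeautifulSoup(res.text, features="html.parser")
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if href is not None:
            print(href)


asyncio.run(print_links("https://ailab.fai.utb.cz"))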

Requirements lockfile (dev)

@@ -19,6 +19,8 @@ altair==5.4.1
    # via polars
annotated-types==0.7.0
    # via pydantic
anyio==4.6.2.post1
    # via httpx
asttokens==2.4.1
    # via stack-data
attrs==24.2.0
@@ -27,7 +29,11 @@ attrs==24.2.0
babel==2.16.0
    # via great-tables
basedpyright==1.18.0
beautifulsoup4==4.12.3
    # via task2
certifi==2024.8.30
    # via httpcore
    # via httpx
    # via requests
cfgv==3.4.0
    # via pre-commit
@@ -71,11 +77,19 @@ great-tables==0.13.0
greenlet==3.1.1
    # via gevent
    # via sqlalchemy
h11==0.14.0
    # via httpcore
htmltools==0.5.3
    # via great-tables
httpcore==1.0.7
    # via httpx
httpx==0.27.2
    # via task2
identify==2.6.1
    # via pre-commit
idna==3.10
    # via anyio
    # via httpx
    # via requests
importlib-metadata==8.5.0
    # via great-tables
@@ -215,8 +229,13 @@ setuptools==75.1.0
six==1.16.0
    # via asttokens
    # via python-dateutil
sniffio==1.3.1
    # via anyio
    # via httpx
sortedcontainers==2.4.0
    # via pyiceberg
soupsieve==2.6
    # via beautifulsoup4
sqlalchemy==2.0.35
    # via polars
stack-data==0.6.3

Requirements lockfile

@@ -19,12 +19,18 @@ altair==5.4.1
    # via polars
annotated-types==0.7.0
    # via pydantic
anyio==4.6.2.post1
    # via httpx
attrs==24.2.0
    # via jsonschema
    # via referencing
babel==2.16.0
    # via great-tables
beautifulsoup4==4.12.3
    # via task2
certifi==2024.8.30
    # via httpcore
    # via httpx
    # via requests
charset-normalizer==3.4.0
    # via requests
@@ -58,9 +64,17 @@ great-tables==0.13.0
greenlet==3.1.1
    # via gevent
    # via sqlalchemy
h11==0.14.0
    # via httpcore
htmltools==0.5.3
    # via great-tables
httpcore==1.0.7
    # via httpx
httpx==0.27.2
    # via task2
idna==3.10
    # via anyio
    # via httpx
    # via requests
importlib-metadata==8.5.0
    # via great-tables
@@ -171,8 +185,13 @@ setuptools==75.1.0
    # via zope-interface
six==1.16.0
    # via python-dateutil
sniffio==1.3.1
    # via anyio
    # via httpx
sortedcontainers==2.4.0
    # via pyiceberg
soupsieve==2.6
    # via beautifulsoup4
sqlalchemy==2.0.35
    # via polars
strictyaml==1.7.3

Main entrypoint module

@@ -1,6 +1,20 @@
import asyncio
from pprint import pprint

import httpx

from src.link_scraper import get_urlmap

URL = httpx.URL("https://ailab.fai.utb.cz")


async def main() -> None:
    """Program entrypoint."""
    async with httpx.AsyncClient() as client:
        url_map = await get_urlmap(client, URL, max_depth=2, filter_condition=lambda url: url.host == URL.host)

    pprint(url_map)


if __name__ == "__main__":
    asyncio.run(main())
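The filter_condition lambda above keeps the crawl on the starting host, as get_urlmap's docstring suggests. Any callable taking an httpx.URL and returning a bool works here; a hypothetical stricter filter (not part of the commit) might also skip obvious non-HTML targets:

import httpx

START = httpx.URL("https://ailab.fai.utb.cz")

# Hypothetical: besides staying on the same host, skip links that clearly
# point at binary downloads rather than HTML pages.
SKIPPED_SUFFIXES = (".pdf", ".zip", ".jpg", ".png")


def same_host_html_only(url: httpx.URL) -> bool:
    return url.host == START.host and not url.path.lower().endswith(SKIPPED_SUFFIXES)


# Would be passed as: get_urlmap(client, START, max_depth=2, filter_condition=same_host_html_only)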

src/link_scraper.py (new file, 76 lines)

@@ -0,0 +1,76 @@
from collections.abc import Callable
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup


async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]:
    """Scrape all links from the given page (url).

    This function will also resolve relative URLs.
    Non http/https schemes will not be included.
    """
    res = await client.get(url)
    res.raise_for_status()
    html = res.text

    soup = BeautifulSoup(html, features="html.parser")
    anchors = soup.find_all("a")
    hrefs = [anchor.get("href") for anchor in anchors if anchor.get("href") is not None]

    # Handle relative links (e.g. home, /home, #home)
    links = [httpx.URL(urljoin(str(url), href)) for href in hrefs]

    # Ignore mailto:, ftp:, ... schemes
    return {link for link in links if link.scheme in {"http", "https"}}


async def get_urlmap(
    client: httpx.AsyncClient,
    start_url: httpx.URL,
    max_depth: int,
    filter_condition: Callable[[httpx.URL], bool] | None = None,
) -> dict[httpx.URL, set[httpx.URL]]:
    """Obtain all of the links from the given url, working recursively until the given max_depth.

    :param client: Asynchronous HTTPX client used to make the requests.
    :param start_url: URL to begin the link scraping process from.
    :param max_depth:
        How many recursive iterations to perform:
        - 0: Only get links on the start URL.
        - 1: Get links from the start URL alongside all links from the URLs found from the start url.
        - 2: Links from links from links from start_url.
    :param filter_condition:
        By default, all found URLs will be scraped in the next iteration; this function can be used
        to apply an excluding condition, skipping those URLs for which this function returns False.
        Note that the filtered URLs will still be included in the resulting url map.

        A common use-case is to limit scraping only to URLs with the same hostname as the start url
        (to avoid scraping large sites, like facebook).
    :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links)).
    """
    url_map: dict[httpx.URL, set[httpx.URL]] = {}
    urls: set[httpx.URL] = {start_url}
    traversed: set[httpx.URL] = set()

    depth = 0
    while len(urls) > 0:
        url = urls.pop()
        if url in traversed:
            continue
        traversed.add(url)

        # Include all found links in the result dict
        page_links = await get_page_links(client, url)
        url_map.setdefault(url, set()).update(page_links)

        # Update the list of URLs to scrape next, filtering unwanted ones.
        urls.update(filter(filter_condition, page_links))

        if depth > max_depth:
            break
        depth += 1

    return url_map
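For reference, the relative-link handling in get_page_links comes down to urllib.parse.urljoin plus the scheme filter. A small standalone illustration (the page path and hrefs below are made up):

from urllib.parse import urljoin

import httpx

page = "https://ailab.fai.utb.cz/projects/"  # illustrative page URL
hrefs = ["team", "/contact", "#top", "mailto:someone@example.com", "https://example.com/x"]

links = {httpx.URL(urljoin(page, href)) for href in hrefs}
kept = {link for link in links if link.scheme in {"http", "https"}}

for link in sorted(str(u) for u in kept):
    print(link)
# The mailto: entry is dropped; "team", "/contact" and "#top" all resolve to
# absolute https://ailab.fai.utb.cz/... URLs, mirroring what get_page_links returns.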