Basic link scraper
commit 47c9a9f555
parent b1e815e588
@@ -11,6 +11,8 @@ dependencies = [
     "polars[all]>=1.9.0",
     "seaborn>=0.13.2",
     "rich>=13.9.2",
+    "httpx>=0.27.2",
+    "beautifulsoup4>=4.12.3",
 ]
 readme = "README.md"
 requires-python = ">= 3.12"
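
Aside: the two runtime dependencies added above work as a pair, with httpx fetching pages and beautifulsoup4 parsing the returned HTML. A minimal synchronous sketch of that pairing (the URL is a placeholder, not part of this commit):

import httpx
from bs4 import BeautifulSoup

# Placeholder URL; any reachable page works the same way.
response = httpx.get("https://example.com")
response.raise_for_status()

soup = BeautifulSoup(response.text, features="html.parser")
# Print the raw href attribute of every anchor tag on the page.
print([anchor.get("href") for anchor in soup.find_all("a") if anchor.get("href")])
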
@@ -19,6 +19,8 @@ altair==5.4.1
     # via polars
 annotated-types==0.7.0
     # via pydantic
+anyio==4.6.2.post1
+    # via httpx
 asttokens==2.4.1
     # via stack-data
 attrs==24.2.0
@@ -27,7 +29,11 @@ attrs==24.2.0
 babel==2.16.0
     # via great-tables
 basedpyright==1.18.0
+beautifulsoup4==4.12.3
+    # via task2
 certifi==2024.8.30
+    # via httpcore
+    # via httpx
     # via requests
 cfgv==3.4.0
     # via pre-commit
@@ -71,11 +77,19 @@ great-tables==0.13.0
 greenlet==3.1.1
     # via gevent
     # via sqlalchemy
+h11==0.14.0
+    # via httpcore
 htmltools==0.5.3
     # via great-tables
+httpcore==1.0.7
+    # via httpx
+httpx==0.27.2
+    # via task2
 identify==2.6.1
     # via pre-commit
 idna==3.10
+    # via anyio
+    # via httpx
     # via requests
 importlib-metadata==8.5.0
     # via great-tables
@@ -215,8 +229,13 @@ setuptools==75.1.0
 six==1.16.0
     # via asttokens
     # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+    # via httpx
 sortedcontainers==2.4.0
     # via pyiceberg
+soupsieve==2.6
+    # via beautifulsoup4
 sqlalchemy==2.0.35
     # via polars
 stack-data==0.6.3

@@ -19,12 +19,18 @@ altair==5.4.1
     # via polars
 annotated-types==0.7.0
     # via pydantic
+anyio==4.6.2.post1
+    # via httpx
 attrs==24.2.0
     # via jsonschema
     # via referencing
 babel==2.16.0
     # via great-tables
+beautifulsoup4==4.12.3
+    # via task2
 certifi==2024.8.30
+    # via httpcore
+    # via httpx
     # via requests
 charset-normalizer==3.4.0
     # via requests
@@ -58,9 +64,17 @@ great-tables==0.13.0
 greenlet==3.1.1
     # via gevent
     # via sqlalchemy
+h11==0.14.0
+    # via httpcore
 htmltools==0.5.3
     # via great-tables
+httpcore==1.0.7
+    # via httpx
+httpx==0.27.2
+    # via task2
 idna==3.10
+    # via anyio
+    # via httpx
     # via requests
 importlib-metadata==8.5.0
     # via great-tables
@@ -171,8 +185,13 @@ setuptools==75.1.0
     # via zope-interface
 six==1.16.0
     # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+    # via httpx
 sortedcontainers==2.4.0
     # via pyiceberg
+soupsieve==2.6
+    # via beautifulsoup4
 sqlalchemy==2.0.35
     # via polars
 strictyaml==1.7.3

@@ -1,6 +1,20 @@
-def main() -> None:
+import asyncio
+from pprint import pprint
+
+import httpx
+
+from src.link_scraper import get_urlmap
+
+URL = httpx.URL("https://ailab.fai.utb.cz")
+
+
+async def main() -> None:
     """Program entrypoint."""
+    async with httpx.AsyncClient() as client:
+        url_map = await get_urlmap(client, URL, max_depth=2, filter_condition=lambda url: url.host == URL.host)
+
+    pprint(url_map)
 
 
 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
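
The filter_condition lambda above keeps the crawl on the starting host. Any callable taking an httpx.URL and returning a bool can be passed instead; the variant below is purely illustrative (not part of the commit) and additionally skips links that point at common binary assets:

import httpx

START_URL = httpx.URL("https://ailab.fai.utb.cz")


def same_host_pages_only(url: httpx.URL) -> bool:
    """Stay on the starting host and skip links to common binary assets."""
    return url.host == START_URL.host and not url.path.lower().endswith((".pdf", ".zip", ".png", ".jpg"))


# Drop-in replacement for the lambda used in main():
# url_map = await get_urlmap(client, START_URL, max_depth=2, filter_condition=same_host_pages_only)
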
src/link_scraper.py (new file, +76 lines)
@@ -0,0 +1,76 @@
from collections.abc import Callable
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup


async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]:
    """Scrape all links from the given page (url).

    This function will also resolve relative URLs.
    Non-http/https schemes will not be included.
    """
    res = await client.get(url)
    res.raise_for_status()
    html = res.text

    soup = BeautifulSoup(html, features="html.parser")
    anchors = soup.find_all("a")
    hrefs = [anchor.get("href") for anchor in anchors if anchor.get("href") is not None]
    # Handle relative links (e.g. home, /home, #home)
    links = [httpx.URL(urljoin(str(url), href)) for href in hrefs]
    # Ignore mailto:, ftp:, ... schemes
    return {link for link in links if link.scheme in {"http", "https"}}


async def get_urlmap(
    client: httpx.AsyncClient,
    start_url: httpx.URL,
    max_depth: int,
    filter_condition: Callable[[httpx.URL], bool] | None = None,
) -> dict[httpx.URL, set[httpx.URL]]:
    """Obtain all of the links from the given URL, following found links recursively up to max_depth.

    :param client: Asynchronous HTTPX client used to make the requests.
    :param start_url: URL to begin the link scraping process from.
    :param max_depth:
        How many recursive iterations to perform:

        - 0: Only get links on the start URL.
        - 1: Get links from the start URL, alongside all links from the URLs found on the start URL.
        - 2: Go one level deeper: links from the links found at depth 1, and so on.
    :param filter_condition:
        By default, all found URLs will be scraped in the next iteration; this function can be used
        to apply an excluding condition, skipping those URLs for which it returns False.
        Note that the filtered-out URLs will still be included in the resulting URL map.

        A common use-case is to limit scraping to URLs with the same hostname as the start URL
        (to avoid wandering into large external sites, like Facebook).

    :return: A dictionary mapping each scraped URL to the set of links found on it (url -> set(links)).
    """
    url_map: dict[httpx.URL, set[httpx.URL]] = {}
    # Track each URL to visit together with the depth (level) at which it was found.
    urls: set[tuple[httpx.URL, int]] = {(start_url, 0)}
    traversed: set[httpx.URL] = set()

    while len(urls) > 0:
        url, depth = urls.pop()

        if url in traversed:
            continue
        traversed.add(url)

        # Include all found links in the result dict
        page_links = await get_page_links(client, url)
        url_map.setdefault(url, set()).update(page_links)

        if depth >= max_depth:
            # Links found at the maximum depth are recorded above, but not scraped further.
            continue

        # Queue the found URLs to scrape next, filtering unwanted ones.
        urls.update((link, depth + 1) for link in filter(filter_condition, page_links))

    return url_map
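
For reference, both helpers can be exercised without network access using httpx's MockTransport, which serves canned responses. The pages, hostname, and handler below are made up purely for illustration, and the import assumes the module is importable as src.link_scraper from the project root:

import asyncio

import httpx

from src.link_scraper import get_page_links, get_urlmap

# Two fake pages linking to each other, plus one off-site link and one non-http link.
FAKE_PAGES = {
    "/": '<a href="/about">About</a> <a href="mailto:someone@example.test">Mail</a>',
    "/about": '<a href="/">Home</a> <a href="https://other.test/">Elsewhere</a>',
}


def handler(request: httpx.Request) -> httpx.Response:
    # Serve the canned HTML for the requested path (an empty page if unknown).
    return httpx.Response(200, text=FAKE_PAGES.get(request.url.path, ""))


async def demo() -> None:
    start = httpx.URL("https://example.test/")
    async with httpx.AsyncClient(transport=httpx.MockTransport(handler)) as client:
        # Links found on the start page only (the mailto: link is dropped).
        print(await get_page_links(client, start))
        # Start page plus one level of same-host links.
        url_map = await get_urlmap(client, start, max_depth=1, filter_condition=lambda url: url.host == start.host)
        print(url_map)


asyncio.run(demo())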