Basic link scraper
parent b1e815e588
commit 47c9a9f555

@@ -11,6 +11,8 @@ dependencies = [
     "polars[all]>=1.9.0",
     "seaborn>=0.13.2",
     "rich>=13.9.2",
+    "httpx>=0.27.2",
+    "beautifulsoup4>=4.12.3",
 ]
 readme = "README.md"
 requires-python = ">= 3.12"
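
The two additions above, httpx and beautifulsoup4, are the scraper's only new direct dependencies: httpx performs the asynchronous HTTP requests and BeautifulSoup parses the fetched HTML. A minimal sketch (not part of this commit) of how the pair is typically combined, reusing the start URL from the entrypoint change further below:

import asyncio

import httpx
from bs4 import BeautifulSoup


async def show_title(url: str) -> None:
    # Fetch the page asynchronously, then parse the HTML it returned.
    async with httpx.AsyncClient() as client:
        res = await client.get(url)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, features="html.parser")
        print(soup.title.string if soup.title else "(no title)")


asyncio.run(show_title("https://ailab.fai.utb.cz"))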

@@ -19,6 +19,8 @@ altair==5.4.1
     # via polars
 annotated-types==0.7.0
     # via pydantic
+anyio==4.6.2.post1
+    # via httpx
 asttokens==2.4.1
     # via stack-data
 attrs==24.2.0
@@ -27,7 +29,11 @@ attrs==24.2.0
 babel==2.16.0
     # via great-tables
 basedpyright==1.18.0
+beautifulsoup4==4.12.3
+    # via task2
 certifi==2024.8.30
+    # via httpcore
+    # via httpx
     # via requests
 cfgv==3.4.0
     # via pre-commit
@@ -71,11 +77,19 @@ great-tables==0.13.0
 greenlet==3.1.1
     # via gevent
     # via sqlalchemy
+h11==0.14.0
+    # via httpcore
 htmltools==0.5.3
     # via great-tables
+httpcore==1.0.7
+    # via httpx
+httpx==0.27.2
+    # via task2
 identify==2.6.1
     # via pre-commit
 idna==3.10
+    # via anyio
+    # via httpx
     # via requests
 importlib-metadata==8.5.0
     # via great-tables
@@ -215,8 +229,13 @@ setuptools==75.1.0
 six==1.16.0
     # via asttokens
     # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+    # via httpx
 sortedcontainers==2.4.0
     # via pyiceberg
+soupsieve==2.6
+    # via beautifulsoup4
 sqlalchemy==2.0.35
     # via polars
 stack-data==0.6.3

@@ -19,12 +19,18 @@ altair==5.4.1
     # via polars
 annotated-types==0.7.0
     # via pydantic
+anyio==4.6.2.post1
+    # via httpx
 attrs==24.2.0
     # via jsonschema
     # via referencing
 babel==2.16.0
     # via great-tables
+beautifulsoup4==4.12.3
+    # via task2
 certifi==2024.8.30
+    # via httpcore
+    # via httpx
     # via requests
 charset-normalizer==3.4.0
     # via requests
@@ -58,9 +64,17 @@ great-tables==0.13.0
 greenlet==3.1.1
     # via gevent
     # via sqlalchemy
+h11==0.14.0
+    # via httpcore
 htmltools==0.5.3
     # via great-tables
+httpcore==1.0.7
+    # via httpx
+httpx==0.27.2
+    # via task2
 idna==3.10
+    # via anyio
+    # via httpx
     # via requests
 importlib-metadata==8.5.0
     # via great-tables
@@ -171,8 +185,13 @@ setuptools==75.1.0
     # via zope-interface
 six==1.16.0
     # via python-dateutil
+sniffio==1.3.1
+    # via anyio
+    # via httpx
 sortedcontainers==2.4.0
     # via pyiceberg
+soupsieve==2.6
+    # via beautifulsoup4
 sqlalchemy==2.0.35
     # via polars
 strictyaml==1.7.3

@@ -1,6 +1,20 @@
-def main() -> None:
+import asyncio
+from pprint import pprint
+
+import httpx
+
+from src.link_scraper import get_urlmap
+
+URL = httpx.URL("https://ailab.fai.utb.cz")
+
+
+async def main() -> None:
     """Program entrypoint."""
+    async with httpx.AsyncClient() as client:
+        url_map = await get_urlmap(client, URL, max_depth=2, filter_condition=lambda url: url.host == URL.host)
+
+        pprint(url_map)


 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
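
The filter_condition lambda above keeps the crawl on the starting host, which is exactly the use case called out in the get_urlmap docstring below. Any predicate over httpx.URL works here; a hedged sketch of a stricter variant (the /docs/ path prefix is purely illustrative):

import httpx

START = httpx.URL("https://ailab.fai.utb.cz")


def same_host_docs_only(url: httpx.URL) -> bool:
    # Stay on the starting host and (illustratively) only follow /docs/ pages.
    return url.host == START.host and url.path.startswith("/docs/")

Passing filter_condition=same_host_docs_only to get_urlmap would then skip every other link when deciding what to scrape next, while the skipped links still appear in the returned map.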

src/link_scraper.py (new file, 76 lines)
@@ -0,0 +1,76 @@
+from collections.abc import Callable
+from urllib.parse import urljoin
+
+import httpx
+from bs4 import BeautifulSoup
+
+
+async def get_page_links(client: httpx.AsyncClient, url: httpx.URL) -> set[httpx.URL]:
+    """Scrape all links from given page (url).
+
+    This function will also resolve relative URLs.
+    Non http/https schemas will not be included.
+    """
+    res = await client.get(url)
+    res.raise_for_status()
+    html = res.text
+
+    soup = BeautifulSoup(html, features="html.parser")
+    anchors = soup.find_all("a")
+    hrefs = [anchor.get("href") for anchor in anchors if anchor.get("href") is not None]
+    # Handle relative links (e.g. home, /home, #home)
+    links = [httpx.URL(urljoin(str(url), href)) for href in hrefs]
+    # Ignore mailto:, ftp:, ... schemes
+    return {link for link in links if link.scheme in {"http", "https"}}
+
+
+async def get_urlmap(
+    client: httpx.AsyncClient,
+    start_url: httpx.URL,
+    max_depth: int,
+    filter_condition: Callable[[httpx.URL], bool] | None = None,
+) -> dict[httpx.URL, set[httpx.URL]]:
+    """Obtain all of the links from given url, working recursively until given max_depth.
+
+    :param client: Asynchronous HTTPX client used to make the requests.
+    :param start_url: URL to begin the link scraping process from.
+    :param max_depth:
+        How many recursive iterations to perform:
+
+        - 0: Only get links on the start URL.
+        - 1: Get links from the start URL alongside all links from the URLs found from the start url.
+        - 2: Links from links from links from start_url.
+    :param filter_condition:
+        By default, all found URLs will be scraped in the next iteration, this function can be used
+        to apply an excluding condition, skipping those URLs for which this function returns False.
+        Note that the filtered URLs will still be included in the resulting url map.
+
+        A common use-case is to limit scraping only to URLs with the same hostname as the start url
+        (to avoid scraping large sites, like facebook).
+
+    :return: A dictionary mapping the scraped URLs to the found links on that URL (url -> set(links))
+    """
+    url_map: dict[httpx.URL, set[httpx.URL]] = {}
+    urls: set[httpx.URL] = {start_url}
+    traversed: set[httpx.URL] = set()
+    depth = 0
+
+    while len(urls) > 0:
+        url = urls.pop()
+
+        if url in traversed:
+            continue
+        traversed.add(url)
+
+        # Include all found links in the result dict
+        page_links = await get_page_links(client, url)
+        url_map.setdefault(url, set()).update(page_links)
+
+        # Update the list of URLs to scrape next, filtering unwanted ones.
+        urls.update(filter(filter_condition, page_links))
+
+        if depth > max_depth:
+            break
+        depth += 1
+
+    return url_map
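
A short usage sketch of the new module (assuming the same src.link_scraper import path used by the entrypoint above); it exercises both helpers, with the crawl restricted to the starting host as the docstring suggests:

import asyncio

import httpx

from src.link_scraper import get_page_links, get_urlmap


async def demo() -> None:
    start = httpx.URL("https://ailab.fai.utb.cz")
    async with httpx.AsyncClient() as client:
        # Single page: just the outgoing http/https links of the start URL.
        links = await get_page_links(client, start)
        print(f"{len(links)} links found on the start page")

        # Bounded crawl restricted to the starting host; links filtered out
        # here are still recorded as values in the returned mapping.
        url_map = await get_urlmap(
            client,
            start,
            max_depth=1,
            filter_condition=lambda url: url.host == start.host,
        )
        for page, found in url_map.items():
            print(page, "->", len(found), "links")


asyncio.run(demo())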