import asyncio
from collections import deque
from typing import List
from urllib.parse import urljoin

import aiohttp
from bs4 import BeautifulSoup
async def fetch_page(session: aiohttp.ClientSession, url: str) -> str:
    """Fetch *url* with the given session and return the response body as text.

    Raises:
        aiohttp.ClientResponseError: if the server responds with a 4xx/5xx
            status. Previously error pages were silently returned and their
            bodies parsed as content; raising lets the caller's error
            handling skip them instead.
    """
    async with session.get(url) as response:
        response.raise_for_status()
        return await response.text()
async def parse_links(html: str, base_url: str) -> List[str]:
    """Extract every anchor href from *html*, resolved to an absolute URL.

    Args:
        html: Raw HTML document.
        base_url: URL of the page the HTML came from; relative links are
            resolved against it.

    Returns:
        List of absolute link targets (may contain duplicates).
    """
    soup = BeautifulSoup(html, 'html.parser')
    # urljoin handles all relative forms correctly ("page.html", "../x",
    # "/abs", "//host/p") and leaves absolute URLs untouched; the previous
    # naive f"{base_url}{link}" concatenation produced broken URLs such as
    # "https://example.compage.html" for relative hrefs.
    return [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True)]
async def scrape_website(url: str, max_depth: int = 1) -> List[str]:
    """Breadth-first crawl starting at *url*, collecting links up to *max_depth*.

    Args:
        url: Root URL to start crawling from.
        max_depth: Maximum link depth to follow (0 = only the root page).

    Returns:
        Deduplicated list of every link discovered (order unspecified).
    """
    async with aiohttp.ClientSession() as session:
        visited = set()
        # deque gives O(1) popleft; list.pop(0) was O(n) per dequeue.
        to_visit = deque([(url, 0)])
        all_links = set()
        while to_visit:
            current_url, depth = to_visit.popleft()
            if depth > max_depth or current_url in visited:
                continue
            # Mark before fetching so a URL that fails (and was queued more
            # than once) is not pointlessly retried.
            visited.add(current_url)
            try:
                html = await fetch_page(session, current_url)
            except Exception as e:
                print(f"Error fetching {current_url}: {e}")
                continue
            # Bug fix: resolve relative links against the page they appear
            # on (current_url), not against the root url.
            links = await parse_links(html, current_url)
            all_links.update(links)
            if depth < max_depth:
                for link in links:
                    if link not in visited:
                        to_visit.append((link, depth + 1))
        return list(all_links)
async def main():
    """Demo entry point: crawl example.com two levels deep and list results."""
    # Example usage
    target_url = "https://example.com"
    found = await scrape_website(target_url, max_depth=2)
    print(f"Found {len(found)} unique links:")
    for item in sorted(found):
        print(item)


if __name__ == "__main__":
    asyncio.run(main())
# Notes
# -----
# This asynchronous web scraper collects all links from a website up to a
# specified depth using modern Python async/await syntax. aiohttp performs
# the HTTP requests asynchronously, making it much faster than synchronous
# scraping when many pages are involved.
#
# Usage:
#   1. Install dependencies:  pip install aiohttp beautifulsoup4
#   2. Save this file (e.g. as async_scraper.py).
#   3. Set target_url in main() to your desired website.
#   4. Run:  python async_scraper.py
#
# Tips:
#   - Adjust the max_depth parameter to control crawl depth.
#   - Add asyncio.sleep() between requests to throttle the crawler if needed.
#   - Extend parse_links() to extract other page elements (text, images, etc.).
#
# This scraper is useful for building site maps, checking for broken links,
# or gathering data for SEO analysis, while remaining respectful of server
# resources through proper async handling.