import aiohttp
import asyncio
from bs4 import BeautifulSoup


async def fetch_url(session, url):
    """Fetch a single URL, returning its HTML or None on failure."""
    try:
        async with session.get(url) as response:
            if response.status == 200:
                return await response.text()
            return None
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None


async def scrape_urls(urls):
    """Fetch all URLs concurrently and extract each page's title."""
    async with aiohttp.ClientSession() as session:
        # Launch every request at once and wait for them all to finish.
        tasks = [fetch_url(session, url) for url in urls]
        html_pages = await asyncio.gather(*tasks)

        results = []
        for url, html in zip(urls, html_pages):
            if html:  # skip pages that failed to download
                soup = BeautifulSoup(html, 'html.parser')
                title = soup.title.string if soup.title else "No title"
                results.append((url, title))
        return results


async def main():
    urls = [
        'https://python.org',
        'https://github.com',
        'https://stackoverflow.com',
        'https://pypi.org'
    ]
    scraped_data = await scrape_urls(urls)
    for url, title in scraped_data:
        print(f"{url}: {title}")


if __name__ == "__main__":
    asyncio.run(main())
This code snippet demonstrates a modern, asynchronous web scraper that efficiently fetches and processes multiple web pages concurrently. Here's how it works:

- fetch_url(): an async function that requests a single URL, returns the page HTML when the response status is 200, and returns None (after printing the error) if the request fails for any reason.
- scrape_urls(): the main scraping function that opens one shared aiohttp.ClientSession, fires off every request at once with asyncio.gather(), and parses each successfully fetched page's title with BeautifulSoup (a variant with a timeout is sketched just after this list).
- main(): demonstrates usage with a few example URLs.
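One thing the snippet does not guard against is a server that hangs without ever failing. A minimal sketch of one remedy, not part of the original code: give the shared session an overall timeout. The function name scrape_urls_with_timeout and the 10-second limit are arbitrary illustration choices; a timed-out request raises inside fetch_url(), whose except clause turns it into a None result like any other failure.

import aiohttp
import asyncio

async def scrape_urls_with_timeout(urls):
    # Sketch only: same concurrent pattern, but the session enforces a
    # total per-request timeout (10 s is an arbitrary illustration value).
    timeout = aiohttp.ClientTimeout(total=10)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        # Reuses fetch_url() from the snippet above.
        tasks = [fetch_url(session, url) for url in urls]
        return await asyncio.gather(*tasks)  # raw HTML (or None) per URL

Parsing the downloaded pages would then proceed exactly as in scrape_urls().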
To try it out, first install the dependencies:

pip install aiohttp beautifulsoup4

Then save the code to a file (e.g., async_scraper.py) and run it:

python async_scraper.py
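A small usage note for embedding the scraper in a larger program rather than running it as a script: asyncio.run() cannot be called from inside an already running event loop, so from other async code you would simply await scrape_urls() directly. A minimal, hypothetical example (the function name my_app is made up for illustration):

async def my_app():
    # Hypothetical caller: await scrape_urls() directly instead of
    # calling asyncio.run() a second time.
    pages = await scrape_urls(['https://python.org'])
    for url, title in pages:
        print(f"{url}: {title}")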
To customize it:

- Change the parsing logic in scrape_urls() to extract whatever data you need beyond the page title.
- Edit the urls list in main() to point at the pages you want to scrape.
- Add an asyncio.sleep() between requests if needed to avoid overwhelming a server (one way to do this is sketched below).

This pattern is particularly useful for data collection tasks where you need to process multiple web pages efficiently.
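For the asyncio.sleep() suggestion above, here is one minimal way to throttle the scraper, shown only as a sketch: a semaphore caps how many requests run at once, and a short pause spaces them out. The names polite_fetch and scrape_urls_politely, the limit of 3 concurrent requests, and the 1-second delay are hypothetical illustration choices, not part of the original snippet.

import aiohttp
import asyncio

async def polite_fetch(session, url, semaphore, delay=1.0):
    # Wait for a free slot, then pause briefly before each request.
    async with semaphore:
        await asyncio.sleep(delay)
        return await fetch_url(session, url)  # reuses fetch_url() from above

async def scrape_urls_politely(urls, max_concurrent=3):
    # The semaphore is created here so it binds to the running event loop.
    semaphore = asyncio.Semaphore(max_concurrent)
    async with aiohttp.ClientSession() as session:
        tasks = [polite_fetch(session, url, semaphore) for url in urls]
        return await asyncio.gather(*tasks)

The downloaded pages would then be parsed exactly as in scrape_urls().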