import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urljoin, urlparse
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class WebScraper:
    def __init__(self, base_url, headers=None):
        """
        Initialize the web scraper with base URL and optional headers
        Args:
            base_url (str): The base URL to scrape
            headers (dict): Optional headers for requests
        """
        self.base_url = base_url
        self.headers = headers or {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
    def fetch_page(self, url, timeout=10, retries=3):
        """
        Fetch a web page with error handling and retries
        Args:
            url (str): URL to fetch
            timeout (int): Request timeout in seconds
            retries (int): Number of attempts before giving up
        Returns:
            BeautifulSoup object or None
        """
        for attempt in range(1, retries + 1):
            try:
                response = self.session.get(url, timeout=timeout)
                response.raise_for_status()
                return BeautifulSoup(response.content, 'html.parser')
            except requests.RequestException as e:
                logger.warning(f"Attempt {attempt}/{retries} failed for {url}: {e}")
                if attempt < retries:
                    time.sleep(attempt)  # brief, growing pause before retrying
        logger.error(f"Giving up on {url} after {retries} attempts")
        return None
    def extract_links(self, soup, selector='a[href]'):
        """
        Extract all links from a page
        Args:
            soup (BeautifulSoup): Parsed HTML content
            selector (str): CSS selector for links
        Returns:
            list: List of absolute URLs
        """
        links = []
        for link in soup.select(selector):
            href = link.get('href')
            if href:
                # Convert relative URLs to absolute URLs
                absolute_url = urljoin(self.base_url, href)
                # Only include links from the same domain
                if urlparse(absolute_url).netloc == urlparse(self.base_url).netloc:
                    links.append(absolute_url)
        return list(set(links))  # Remove duplicates
    def extract_data(self, soup, data_selectors):
        """
        Extract data from a page using CSS selectors
        Args:
            soup (BeautifulSoup): Parsed HTML content
            data_selectors (dict): Dictionary mapping field names to CSS selectors
        Returns:
            dict: Extracted data
        """
        data = {}
        for field, selector in data_selectors.items():
            element = soup.select_one(selector)
            data[field] = element.get_text(strip=True) if element else None
        return data
    def scrape_pages(self, urls, data_selectors, delay=1):
        """
        Scrape multiple pages and extract data
        Args:
            urls (list): List of URLs to scrape
            data_selectors (dict): Dictionary mapping field names to CSS selectors
            delay (float): Delay between requests in seconds
        Returns:
            list: List of dictionaries containing extracted data
        """
        results = []
        for i, url in enumerate(urls):
            logger.info(f"Scraping {i+1}/{len(urls)}: {url}")
            soup = self.fetch_page(url)
            if soup:
                data = self.extract_data(soup, data_selectors)
                data['url'] = url
                results.append(data)
            # Be respectful to the server
            time.sleep(delay)
        return results
    def save_to_csv(self, data, filename):
        """
        Save data to CSV file
        Args:
            data (list): List of dictionaries to save
            filename (str): Output filename
        """
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)
        logger.info(f"Data saved to {filename}")
# Example usage
if __name__ == "__main__":
    # Demo: point the scraper at httpbin.org test endpoints
    scraper = WebScraper("https://httpbin.org")

    # Example URLs for demonstration (using httpbin.org for testing)
    urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml"
    ]

    # Define what data to extract (example selectors)
    data_selectors = {
        'title': 'title',
        'heading': 'h1',
        'content': 'body'
    }

    # Scrape data
    scraped_data = scraper.scrape_pages(urls, data_selectors, delay=0.5)

    # Display results
    for item in scraped_data:
        print(item)

    # Save to CSV
    # scraper.save_to_csv(scraped_data, 'scraped_data.csv')
This code provides a comprehensive web scraping framework that handles common web scraping tasks while being respectful and robust. It includes:

- a WebScraper class, initialized with a base URL and optional headers
- a fetch_page method that handles HTTP errors, timeouts, and retries
- an extract_links method that extracts and normalizes all links from a page
- an extract_data method that uses CSS selectors to extract specific data from pages
- a scrape_pages method that can scrape multiple URLs with a built-in delay (combined with extract_links in the sketch below)
- a save_to_csv method that saves data to CSV files

This scraper is useful for collecting structured data from many pages of a single site.
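As a quick illustration of how the pieces fit together, here is a minimal crawl sketch. The site URL and the selectors are hypothetical placeholders, not part of the code above: fetch a start page, gather its same-domain links with extract_links, then hand them to scrape_pages.

# Minimal same-domain crawl sketch (hypothetical site and selectors)
scraper = WebScraper("https://example-blog.com")
start = scraper.fetch_page("https://example-blog.com/")
if start:
    page_urls = scraper.extract_links(start)  # same-domain links only
    posts = scraper.scrape_pages(page_urls, {'title': 'h1', 'summary': 'p'}, delay=1)
    scraper.save_to_csv(posts, 'posts.csv')

Because extract_links keeps only URLs whose domain matches base_url, the crawl stays on the starting site.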
To adapt it to your own project:

1. Install the dependencies: pip install requests beautifulsoup4 pandas lxml
2. Change base_url and the URLs to match your target site
3. Adjust data_selectors to match your data extraction needs
4. Update headers to match the user agent you want to use (sketched below)
5. Tune the delay parameter to control the scraping rate
6. Run the script: python web_scraper.py
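For steps 4 and 5, a hedged sketch of what that customization might look like; the user agent string, contact address, site, and selector are placeholders:

# Possible customization of headers and delay (all values are placeholders)
polite_headers = {
    'User-Agent': 'MyResearchBot/1.0 (contact: you@example.com)'
}
scraper = WebScraper("https://example.com", headers=polite_headers)
data = scraper.scrape_pages(["https://example.com/page1"], {'title': 'h1'}, delay=3)  # slower request rate

A fuller example for a news site follows.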
# Example for scraping news headlines
scraper = WebScraper("https://example-news.com")
urls = ["https://example-news.com/page1", "https://example-news.com/page2"]
data_selectors = {
    'title': 'h1.headline',
    'author': '.author-name',
    'date': '.publish-date',
    'content': '.article-content'
}
data = scraper.scrape_pages(urls, data_selectors)
scraper.save_to_csv(data, 'news_articles.csv')
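Before running either example against a real site, it is also worth checking robots.txt. The class above does not do this, but the standard library's urllib.robotparser makes it a small addition; this sketch reuses the urls and data_selectors from the news example:

# Respect robots.txt before scraping (sketch; reuses urls/data_selectors from above)
from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("https://example-news.com/robots.txt")
rp.read()

# Keep only URLs the site allows for this user agent
allowed = [u for u in urls if rp.can_fetch(scraper.headers['User-Agent'], u)]
data = scraper.scrape_pages(allowed, data_selectors)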
This scraper provides a solid foundation for web scraping tasks while respecting websites and handling common issues gracefully.