Python Snippets

Web Scraping with BeautifulSoup and Requests for Data Extraction

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urljoin, urlparse
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class WebScraper:
    def __init__(self, base_url, headers=None):
        """
        Initialize the web scraper with base URL and optional headers
        
        Args:
            base_url (str): The base URL to scrape
            headers (dict): Optional headers for requests
        """
        self.base_url = base_url
        self.headers = headers or {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        
    def fetch_page(self, url, timeout=10):
        """
        Fetch a web page with basic error handling
        
        Args:
            url (str): URL to fetch
            timeout (int): Request timeout in seconds
            
        Returns:
            BeautifulSoup object or None
        """
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.RequestException as e:
            logger.error(f"Error fetching {url}: {e}")
            return None
    
    def extract_links(self, soup, selector='a[href]'):
        """
        Extract all links from a page
        
        Args:
            soup (BeautifulSoup): Parsed HTML content
            selector (str): CSS selector for links
            
        Returns:
            list: List of absolute URLs
        """
        links = []
        for link in soup.select(selector):
            href = link.get('href')
            if href:
                # Convert relative URLs to absolute URLs
                absolute_url = urljoin(self.base_url, href)
                # Only include links from the same domain
                if urlparse(absolute_url).netloc == urlparse(self.base_url).netloc:
                    links.append(absolute_url)
        return list(set(links))  # Remove duplicates
    
    def extract_data(self, soup, data_selectors):
        """
        Extract data from page using CSS selectors
        
        Args:
            soup (BeautifulSoup): Parsed HTML content
            data_selectors (dict): Dictionary mapping field names to CSS selectors
            
        Returns:
            dict: Extracted data
        """
        data = {}
        for field, selector in data_selectors.items():
            element = soup.select_one(selector)
            data[field] = element.get_text(strip=True) if element else None
        return data
    
    def scrape_pages(self, urls, data_selectors, delay=1):
        """
        Scrape multiple pages and extract data
        
        Args:
            urls (list): List of URLs to scrape
            data_selectors (dict): Dictionary mapping field names to CSS selectors
            delay (float): Delay between requests in seconds
            
        Returns:
            list: List of dictionaries containing extracted data
        """
        results = []
        for i, url in enumerate(urls):
            logger.info(f"Scraping {i+1}/{len(urls)}: {url}")
            
            soup = self.fetch_page(url)
            if soup:
                data = self.extract_data(soup, data_selectors)
                data['url'] = url
                results.append(data)
            
            # Be respectful to the server
            time.sleep(delay)
        
        return results
    
    def save_to_csv(self, data, filename):
        """
        Save data to CSV file
        
        Args:
            data (list): List of dictionaries to save
            filename (str): Output filename
        """
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)
        logger.info(f"Data saved to {filename}")

# Example usage
if __name__ == "__main__":
    # Example: using httpbin.org endpoints as safe test targets
    scraper = WebScraper("https://httpbin.org")
    
    # Example URLs for demonstration (using httpbin.org for testing)
    urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml"
    ]
    
    # Define what data to extract (example selectors)
    data_selectors = {
        'title': 'title',
        'heading': 'h1',
        'content': 'body'
    }
    
    # Scrape data
    scraped_data = scraper.scrape_pages(urls, data_selectors, delay=0.5)
    
    # Display results
    for item in scraped_data:
        print(item)
    
    # Save to CSV
    # scraper.save_to_csv(scraped_data, 'scraped_data.csv')

What This Code Does

This code provides a small, reusable web scraping framework that handles common extraction tasks while staying robust and respectful to the servers it queries. It includes:

  1. Configurable Web Scraping: The WebScraper class is initialized with a base URL and optional headers
  2. Robust Page Fetching: The fetch_page method handles HTTP errors and timeouts (it does not retry; see the sketch after this list)
  3. Link Extraction: The extract_links method extracts and normalizes all links from a page
  4. Data Extraction: The extract_data method uses CSS selectors to extract specific data from pages
  5. Multi-page Scraping: The scrape_pages method scrapes multiple URLs with a built-in delay between requests
  6. Error Handling: Proper logging and error handling for robust scraping
  7. Respectful Scraping: Built-in delays to avoid overwhelming servers
  8. Data Export: The save_to_csv method saves data to CSV files
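
As noted in point 2, fetch_page makes a single attempt per URL and does not retry. If you want automatic retries on transient failures, one option (a sketch, not part of the class above) is to mount requests' HTTPAdapter configured with urllib3's Retry onto the session:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    def make_retrying_session(total=3, backoff_factor=0.5):
        """Build a requests.Session that retries transient failures (sketch only)."""
        retry = Retry(
            total=total,                                  # retry up to `total` times
            backoff_factor=backoff_factor,                # exponential backoff between attempts
            status_forcelist=[429, 500, 502, 503, 504],   # retry on these HTTP status codes
            allowed_methods=["GET", "HEAD"],              # requires urllib3 >= 1.26
        )
        session = requests.Session()
        adapter = HTTPAdapter(max_retries=retry)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        return session

One way to wire this in without changing the class: assign scraper.session = make_retrying_session() and then re-apply the headers with scraper.session.headers.update(scraper.headers).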

Why It’s Useful

This scraper is useful for:

  1. Data Collection: Collecting structured data from websites for analysis or research
  2. Monitoring: Monitoring websites for changes or updates (see the fingerprinting sketch after this list)
  3. Price Tracking: Tracking prices on e-commerce sites
  4. Content Analysis: Extracting content for SEO or content analysis
  5. Learning: Understanding web scraping techniques and best practices
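
For the monitoring use case (point 2 above), here is a minimal sketch that fingerprints the extracted fields and flags changes between runs. It assumes the class above is saved as web_scraper.py; the URL and selector are placeholders:

    import hashlib
    import json

    from web_scraper import WebScraper  # assumes the class above is saved as web_scraper.py

    def content_fingerprint(url, selectors):
        """Scrape one page and return a stable hash of the extracted fields."""
        scraper = WebScraper(url)
        records = scraper.scrape_pages([url], selectors, delay=0)
        payload = json.dumps(records, sort_keys=True).encode("utf-8")
        return hashlib.sha256(payload).hexdigest()

    # Placeholder URL and selector: compare against the hash stored from the previous run
    previous_hash = None  # in a real setup, load this from a file or database
    current_hash = content_fingerprint("https://example.com/status", {"status": "h1"})
    if previous_hash is not None and current_hash != previous_hash:
        print("Page content changed since the last check")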

How to Run It

  1. Install Required Libraries:
    pip install requests beautifulsoup4 pandas   # lxml is optional; the script uses the built-in html.parser
    
  2. Customize the Code:
    • Change the base_url and URLs to match your target site (a link-discovery sketch follows this section)
    • Modify data_selectors to match the fields you want to extract
    • Adjust the headers if you need a different User-Agent string
    • Set the delay parameter to control the scraping rate
  3. Run the Script:
    python web_scraper.py
    
  4. Example Usage:
    # Example for scraping news headlines
    scraper = WebScraper("https://example-news.com")
    urls = ["https://example-news.com/page1", "https://example-news.com/page2"]
    data_selectors = {
        'title': 'h1.headline',
        'author': '.author-name',
        'date': '.publish-date',
        'content': '.article-content'
    }
    data = scraper.scrape_pages(urls, data_selectors)
    scraper.save_to_csv(data, 'news_articles.csv')
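
The example in step 4 passes a fixed list of URLs, but the extract_links method can also discover same-domain pages to feed into scrape_pages. A minimal sketch (the start URL and selector are placeholders):

    # Discover same-domain links from a start page, then scrape a small subset of them.
    # The start URL and selector below are placeholders, not real targets.
    scraper = WebScraper("https://example.com")
    start_soup = scraper.fetch_page("https://example.com")
    if start_soup:
        discovered = scraper.extract_links(start_soup)    # absolute, same-domain URLs
        subset = discovered[:10]                          # keep the crawl small and polite
        data = scraper.scrape_pages(subset, {'title': 'title'}, delay=1)
        scraper.save_to_csv(data, 'discovered_pages.csv')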
    

This scraper provides a solid foundation for web scraping tasks while respecting websites and handling common issues gracefully.
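
Note that the class above is respectful only through fixed delays; it does not consult robots.txt. If you want that check as well, a minimal sketch using Python's standard urllib.robotparser (the URLs are placeholders, and the helper is not part of WebScraper):

    from urllib import robotparser
    from urllib.parse import urljoin

    def allowed_by_robots(base_url, url, user_agent="*"):
        """Return True if robots.txt permits fetching `url` (sketch, not part of WebScraper)."""
        rp = robotparser.RobotFileParser()
        rp.set_url(urljoin(base_url, "/robots.txt"))
        try:
            rp.read()              # fetch and parse robots.txt
        except OSError:
            return True            # robots.txt unreachable; choose your own policy here
        return rp.can_fetch(user_agent, url)

    # Placeholder usage:
    # if allowed_by_robots("https://example.com", "https://example.com/page1"):
    #     soup = scraper.fetch_page("https://example.com/page1")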