Python Snippets

Web Scraping with BeautifulSoup and Requests

import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin, urlparse

def scrape_webpage_links(base_url, output_file='links.csv'):
    """
    Scrapes all links from a webpage and saves them to a CSV file.
    
    Args:
        base_url (str): The URL of the webpage to scrape
        output_file (str): Name of the CSV file to save results
    
    Returns:
        list: List of dictionaries containing link data
    """
    try:
        # Add headers to mimic a real browser request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        # Send GET request to the webpage
        response = requests.get(base_url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes
        
        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all anchor tags with href attributes
        links = soup.find_all('a', href=True)
        
        # Process and collect link data
        link_data = []
        for link in links:
            href = link['href']
            text = link.get_text(strip=True) or '[No Text]'
            
            # Convert relative URLs to absolute URLs
            absolute_url = urljoin(base_url, href)
            
            # Get domain name for reference
            domain = urlparse(absolute_url).netloc
            
            link_data.append({
                'text': text,
                'url': absolute_url,
                'domain': domain
            })
        
        # Remove duplicates based on URL
        unique_links = list({link['url']: link for link in link_data}.values())
        
        # Save to CSV file
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['text', 'url', 'domain']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(unique_links)
        
        print(f"Successfully scraped {len(unique_links)} unique links from {base_url}")
        print(f"Results saved to {output_file}")
        
        return unique_links
        
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

# Example usage
if __name__ == "__main__":
    # Example: Scrape links from a sample website
    url = "https://httpbin.org/"
    results = scrape_webpage_links(url, 'scraped_links.csv')
    
    # Display first 5 results
    for i, link in enumerate(results[:5]):
        print(f"{i+1}. {link['text']} -> {link['url']}")

This code snippet extracts every hyperlink from a given webpage, resolves relative links to absolute URLs, and saves the results to a CSV file. It’s particularly useful for:

  1. SEO Analysis: Collecting all links on a website to analyze internal linking structure
  2. Content Discovery: Finding all resources linked from a specific page
  3. Competitive Research: Seeing which pages and external sites a competitor links out to
  4. Link Validation: Creating a list of URLs to check for broken links (a sketch of such a check follows this list)

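For the link-validation use case, a minimal sketch might feed the scraped results into a checker that sends a HEAD request to each URL. The helper below is illustrative, not part of the original script; the HEAD-versus-GET choice and the 10-second timeout are assumptions you may want to adjust.

import requests

def check_links(links, timeout=10):
    """Return (url, problem) pairs for links that fail or respond with an error status."""
    broken = []
    for link in links:
        try:
            # HEAD keeps the check lightweight; some servers reject HEAD, so a GET fallback may be needed
            response = requests.head(link['url'], allow_redirects=True, timeout=timeout)
            if response.status_code >= 400:
                broken.append((link['url'], response.status_code))
        except requests.RequestException as exc:
            broken.append((link['url'], str(exc)))
    return broken

# Example: validate the links scraped earlier
# problems = check_links(scrape_webpage_links("https://httpbin.org/"))
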
The script uses requests to fetch the webpage content and BeautifulSoup to parse the HTML. Key features include:

  1. A browser-like User-Agent header and a 10-second request timeout
  2. Error handling via raise_for_status() and requests.RequestException
  3. Conversion of relative links to absolute URLs with urljoin (illustrated below)
  4. Deduplication of links by URL before writing
  5. UTF-8 CSV output via csv.DictWriter
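
The standard-library URL helpers do the normalisation work; the URLs below are just placeholders to show the behaviour:

from urllib.parse import urljoin, urlparse

base = "https://example.com/docs/index.html"

print(urljoin(base, "style.css"))           # https://example.com/docs/style.css
print(urljoin(base, "../img/logo.png"))     # https://example.com/img/logo.png
print(urljoin(base, "https://other.org/"))  # absolute URLs pass through unchanged
print(urlparse(urljoin(base, "style.css")).netloc)  # example.com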

To run the script:

  1. Install required packages: pip install requests beautifulsoup4
  2. Replace the URL in the example with your target webpage
  3. Execute the script - it will create a CSV file with all discovered links (a quick way to inspect the output is shown below)
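
Assuming the example above has produced scraped_links.csv, the output can be spot-checked with the standard csv module:

import csv

with open('scraped_links.csv', newline='', encoding='utf-8') as csvfile:
    for row in csv.DictReader(csvfile):
        print(row['domain'], '->', row['url'])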

The script includes basic error handling, a 10-second request timeout, and a browser-like User-Agent header. Note that it issues only a single request per call; if you adapt it to crawl multiple pages, add a delay between requests and respect the site’s robots.txt and terms of service so you don’t overload the target server.
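
As a rough sketch of that multi-page case (the page list, output file names, and one-second delay are illustrative assumptions, not part of the original script):

import time

pages = [
    "https://httpbin.org/",
    "https://httpbin.org/html",
]

all_links = []
for i, page in enumerate(pages, start=1):
    # Reuse scrape_webpage_links() from the script above, one output file per page
    all_links.extend(scrape_webpage_links(page, output_file=f"links_{i}.csv"))
    time.sleep(1)  # pause between requests to avoid hammering the server

print(f"Collected {len(all_links)} links across {len(pages)} pages")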