Python Snippets

Web Scraping with BeautifulSoup and Requests

import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin, urlparse

def scrape_webpage_links(base_url, output_file='links.csv'):
    """
    Scrapes all links from a webpage and saves them to a CSV file.
    
    Args:
        base_url (str): The URL of the webpage to scrape
        output_file (str): Name of the CSV file to save results
    
    Returns:
        list: List of dictionaries containing link data
    """
    try:
        # Add headers to mimic a real browser request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        # Send GET request to the webpage
        response = requests.get(base_url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes
        
        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all anchor tags with href attributes
        links = soup.find_all('a', href=True)
        
        # Process and collect link data
        link_data = []
        for link in links:
            href = link['href']
            text = link.get_text(strip=True) or '[No Text]'
            
            # Convert relative URLs to absolute URLs
            absolute_url = urljoin(base_url, href)
            
            # Get domain name for reference
            domain = urlparse(absolute_url).netloc
            
            link_data.append({
                'text': text,
                'url': absolute_url,
                'domain': domain
            })
        
        # Remove duplicates based on URL
        unique_links = list({link['url']: link for link in link_data}.values())
        
        # Save to CSV file
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['text', 'url', 'domain']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(unique_links)
        
        print(f"Successfully scraped {len(unique_links)} unique links from {base_url}")
        print(f"Results saved to {output_file}")
        
        return unique_links
        
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

# Example usage
if __name__ == "__main__":
    # Example: Scrape links from a sample website
    url = "https://httpbin.org/"
    results = scrape_webpage_links(url, 'scraped_links.csv')
    
    # Display first 5 results
    for i, link in enumerate(results[:5]):
        print(f"{i+1}. {link['text']} -> {link['url']}")

This code snippet extracts every hyperlink from a given webpage, resolves relative links to absolute URLs, and saves the results to a CSV file. It’s particularly useful for:

  1. SEO Analysis: Collecting all links on a website to analyze internal linking structure
  2. Content Discovery: Finding all resources linked from a specific page
  3. Competitive Research: Seeing which pages and external sites a competitor links out to
  4. Link Validation: Creating a list of URLs to check for broken links (a sketch of such a check follows this list)

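For the link-validation use case, a minimal sketch might feed the scraped results into a checker that sends a HEAD request to each URL. The helper below is illustrative, not part of the original script; the HEAD-versus-GET choice and the 10-second timeout are assumptions you may want to adjust.

import requests

def check_links(links, timeout=10):
    """Return (url, problem) pairs for links that fail or respond with an error status."""
    broken = []
    for link in links:
        try:
            # HEAD keeps the check lightweight; some servers reject HEAD, so a GET fallback may be needed
            response = requests.head(link['url'], allow_redirects=True, timeout=timeout)
            if response.status_code >= 400:
                broken.append((link['url'], response.status_code))
        except requests.RequestException as exc:
            broken.append((link['url'], str(exc)))
    return broken

# Example: validate the links scraped earlier
# problems = check_links(scrape_webpage_links("https://httpbin.org/"))
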
The script uses requests to fetch the webpage content and BeautifulSoup to parse the HTML. Key features include:

  1. A browser-like User-Agent header and a 10-second request timeout
  2. Error handling via raise_for_status() and requests.RequestException
  3. Conversion of relative links to absolute URLs with urljoin (illustrated below)
  4. Deduplication of links by URL before writing
  5. UTF-8 CSV output via csv.DictWriter
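
The standard-library URL helpers do the normalisation work; the URLs below are just placeholders to show the behaviour:

from urllib.parse import urljoin, urlparse

base = "https://example.com/docs/index.html"

print(urljoin(base, "style.css"))           # https://example.com/docs/style.css
print(urljoin(base, "../img/logo.png"))     # https://example.com/img/logo.png
print(urljoin(base, "https://other.org/"))  # absolute URLs pass through unchanged
print(urlparse(urljoin(base, "style.css")).netloc)  # example.com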

To run the script:

  1. Install required packages: pip install requests beautifulsoup4
  2. Replace the URL in the example with your target webpage
  3. Execute the script - it will create a CSV file with all discovered links (a quick way to inspect the output is shown below)
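
Assuming the example above has produced scraped_links.csv, the output can be spot-checked with the standard csv module:

import csv

with open('scraped_links.csv', newline='', encoding='utf-8') as csvfile:
    for row in csv.DictReader(csvfile):
        print(row['domain'], '->', row['url'])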

The script includes basic error handling, a 10-second request timeout, and a browser-like User-Agent header. Note that it issues only a single request per call; if you adapt it to crawl multiple pages, add a delay between requests and respect the site’s robots.txt and terms of service so you don’t overload the target server.
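
As a rough sketch of that multi-page case (the page list, output file names, and one-second delay are illustrative assumptions, not part of the original script):

import time

pages = [
    "https://httpbin.org/",
    "https://httpbin.org/html",
]

all_links = []
for i, page in enumerate(pages, start=1):
    # Reuse scrape_webpage_links() from the script above, one output file per page
    all_links.extend(scrape_webpage_links(page, output_file=f"links_{i}.csv"))
    time.sleep(1)  # pause between requests to avoid hammering the server

print(f"Collected {len(all_links)} links across {len(pages)} pages")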