import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin, urlparse


def scrape_webpage_links(base_url, output_file='links.csv'):
"""
Scrapes all links from a webpage and saves them to a CSV file.
Args:
base_url (str): The URL of the webpage to scrape
output_file (str): Name of the CSV file to save results
Returns:
list: List of dictionaries containing link data
"""
try:
# Add headers to mimic a real browser request
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Send GET request to the webpage
response = requests.get(base_url, headers=headers, timeout=10)
response.raise_for_status() # Raise an exception for bad status codes
# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')
# Find all anchor tags with href attributes
links = soup.find_all('a', href=True)
# Process and collect link data
link_data = []
for link in links:
href = link['href']
text = link.get_text(strip=True) or '[No Text]'
# Convert relative URLs to absolute URLs
absolute_url = urljoin(base_url, href)
# Get domain name for reference
domain = urlparse(absolute_url).netloc
link_data.append({
'text': text,
'url': absolute_url,
'domain': domain
})
# Remove duplicates based on URL
unique_links = list({link['url']: link for link in link_data}.values())
# Save to CSV file
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['text', 'url', 'domain']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(unique_links)
print(f"Successfully scraped {len(unique_links)} unique links from {base_url}")
print(f"Results saved to {output_file}")
return unique_links
except requests.RequestException as e:
print(f"Error fetching the webpage: {e}")
return []
except Exception as e:
print(f"An error occurred: {e}")
return []

# Example usage
if __name__ == "__main__":
    # Example: Scrape links from a sample website
    url = "https://httpbin.org/"
    results = scrape_webpage_links(url, 'scraped_links.csv')

    # Display first 5 results
    for i, link in enumerate(results[:5]):
        print(f"{i+1}. {link['text']} -> {link['url']}")
This code snippet provides a complete web scraping solution that extracts all hyperlinks from a given webpage. It's particularly useful for quick link audits, since the output CSV records each link's text, absolute URL, and domain.
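As a quick sanity check of the output, the CSV can be read back with the standard library. The snippet below is a small sketch that assumes the scraped_links.csv file produced by the example usage above.

import csv

# Read the exported CSV back in and count links per domain (illustrative check)
with open('scraped_links.csv', newline='', encoding='utf-8') as f:
    rows = list(csv.DictReader(f))

domains = {}
for row in rows:
    domains[row['domain']] = domains.get(row['domain'], 0) + 1

print(f"{len(rows)} links across {len(domains)} domains")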
The script uses requests to fetch the webpage content and BeautifulSoup to parse the HTML. Key features include a browser-like User-Agent header, a 10-second request timeout, conversion of relative links to absolute URLs, de-duplication by URL, and UTF-8 CSV export with basic error handling.
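For example, if you only care about links that stay on the same site, you can filter the returned list by domain. The helper below is a minimal sketch built on the function's return value; the internal_links name is illustrative and not part of the original script.

from urllib.parse import urlparse

def internal_links(link_data, base_url):
    """Keep only the links whose domain matches the base URL's domain."""
    base_domain = urlparse(base_url).netloc
    return [link for link in link_data if link['domain'] == base_domain]

# Usage sketch: scrape a page, then keep only same-site links
# results = scrape_webpage_links("https://httpbin.org/")
# print(internal_links(results, "https://httpbin.org/"))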
To run the script, first install the dependencies:

pip install requests beautifulsoup4

The code handles request failures gracefully, applies a timeout so it never hangs on a slow server, and sends a browser-like User-Agent header. It issues only a single request per call, so it places minimal load on the target site; for larger crawls you would still want to add your own rate limiting.
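If you do adapt the script to fetch several pages in one run, a short pause between requests keeps that load low. The sketch below assumes a hypothetical urls_to_scrape list and reuses scrape_webpage_links from the script above; the one-second delay is only an illustrative value.

import time

# Hypothetical list of pages to scrape in one run (illustrative only)
urls_to_scrape = [
    "https://httpbin.org/",
    "https://httpbin.org/links/10/0",
]

for i, url in enumerate(urls_to_scrape):
    scrape_webpage_links(url, f"links_{i}.csv")
    time.sleep(1)  # Polite pause so requests aren't sent in a tight loop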