import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin
import time
def scrape_website_data(base_url, output_file='scraped_data.csv'):
    """
    Scrape product information from a website and save it to a CSV file.

    Args:
        base_url (str): The URL of the website to scrape
        output_file (str): Name of the output CSV file
    """
    # Add headers to mimic a real browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Send a GET request to the website
        response = requests.get(base_url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises an HTTPError for bad responses

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all product containers (adjust selectors based on target website)
        products = soup.find_all('div', class_='product-item')  # Example class name

        # List to store scraped data
        scraped_data = []

        # Extract information from each product
        for product in products:
            try:
                # Extract product name (adjust selectors as needed)
                name_elem = product.find('h3', class_='product-title')
                name = name_elem.get_text(strip=True) if name_elem else 'N/A'

                # Extract product price
                price_elem = product.find('span', class_='price')
                price = price_elem.get_text(strip=True) if price_elem else 'N/A'

                # Extract product link, converting relative URLs to absolute
                link_elem = product.find('a')
                link = link_elem.get('href') if link_elem else ''
                if link:
                    link = urljoin(base_url, link)

                # Extract product image, converting relative URLs to absolute
                img_elem = product.find('img')
                image_url = img_elem.get('src') if img_elem else None
                image_url = urljoin(base_url, image_url) if image_url else 'N/A'

                # Add to the data list
                scraped_data.append({
                    'name': name,
                    'price': price,
                    'link': link,
                    'image_url': image_url
                })
            except Exception as e:
                print(f"Error parsing product: {e}")
                continue

        # Save data to CSV
        if scraped_data:
            with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
                fieldnames = ['name', 'price', 'link', 'image_url']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(scraped_data)
            print(f"Successfully scraped {len(scraped_data)} items and saved to {output_file}")
        else:
            print("No data was scraped. Check the selectors and website structure.")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the website: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")


# Example usage
if __name__ == "__main__":
    # Replace with the actual URL you want to scrape
    target_url = "https://example-ecommerce-site.com/products"

    # Scrape the data
    scrape_website_data(target_url)

    # Be respectful - pause between requests when scraping multiple pages
    time.sleep(1)
This Python script is a web scraping tool that extracts product information from e-commerce websites. It uses the requests library to fetch web pages and BeautifulSoup to parse the HTML content. The scraped data (product names, prices, links, and image URLs) is then saved to a CSV file for further analysis or processing.
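The script above fetches a single page. As a minimal sketch of how it could be extended to paginated listings, assuming (hypothetically) that the site exposes pages through a ?page=N query parameter, a polite multi-page loop might look like this:

import time

def scrape_multiple_pages(base_url, num_pages, delay_seconds=1.0):
    """Scrape several listing pages, pausing between requests.

    The ?page=N URL scheme is an assumption; adjust it to the target site.
    """
    for page in range(1, num_pages + 1):
        page_url = f"{base_url}?page={page}"
        scrape_website_data(page_url, output_file=f"scraped_page_{page}.csv")
        time.sleep(delay_seconds)  # pause between requests to stay polite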
Web scraping is a valuable technique for collecting structured data from websites that don't expose an API. Before running the script, install the two third-party libraries it depends on:

pip install requests beautifulsoup4
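It's also good practice to check a site's robots.txt before scraping it. Python's standard library ships urllib.robotparser for exactly this; a small helper (the URL in the example reuses the placeholder from the script) could look like:

from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

def is_allowed(base_url, path='/products', user_agent='*'):
    """Return True if the site's robots.txt permits fetching the given path."""
    parser = RobotFileParser()
    parser.set_url(urljoin(base_url, '/robots.txt'))
    parser.read()
    return parser.can_fetch(user_agent, urljoin(base_url, path))

# Example: is_allowed('https://example-ecommerce-site.com')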
To adapt the script to a particular site, update the selectors to match its markup (a CSS-selector alternative is sketched below):

- 'div', class_='product-item' to match product container elements
- 'h3', class_='product-title' for product names
- 'span', class_='price' for price elements

Then run the script:

python scraper.py
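If the target site's markup doesn't match these class names, BeautifulSoup's select() and select_one() methods accept CSS selectors, which can be quicker to tweak. The class names below are the same example values used in the script:

# Extraction loop rewritten with CSS selectors (class names are examples)
for product in soup.select('div.product-item'):
    name_elem = product.select_one('h3.product-title')
    price_elem = product.select_one('span.price')
    name = name_elem.get_text(strip=True) if name_elem else 'N/A'
    price = price_elem.get_text(strip=True) if price_elem else 'N/A'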
When scraping multiple pages, keep the time.sleep(1) delay between requests to avoid overwhelming the server. The script creates a CSV file with all the scraped product information, which can be opened in Excel or any other spreadsheet application for further analysis.
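To sanity-check the output programmatically rather than in a spreadsheet, the file can also be read back with the standard library's csv module:

import csv

with open('scraped_data.csv', newline='', encoding='utf-8') as csvfile:
    for row in csv.DictReader(csvfile):
        print(row['name'], row['price'])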