Python Snippets

Concurrent File Downloader with Progress Tracking

import asyncio
import aiohttp
import aiofiles
from pathlib import Path
from typing import Awaitable, Callable, List, Optional, Tuple
import sys

async def download_file(
    session: aiohttp.ClientSession,
    url: str,
    filename: str,
    progress_callback: Optional[Callable[[str, float], Awaitable[None]]] = None,
) -> Tuple[str, bool, str]:
    """
    Download a single file asynchronously with progress tracking.
    
    Args:
        session: aiohttp client session
        url: URL to download from
        filename: local filename to save as
        progress_callback: optional callback for progress updates
        
    Returns:
        Tuple of (filename, success, message)
    """
    try:
        async with session.get(url) as response:
            if response.status != 200:
                return filename, False, f"HTTP {response.status}"
            
            # Get file size for progress tracking
            file_size = int(response.headers.get('content-length', 0))
            downloaded = 0
            
            # Create directory if needed
            Path(filename).parent.mkdir(parents=True, exist_ok=True)
            
            async with aiofiles.open(filename, 'wb') as f:
                async for chunk in response.content.iter_chunked(8192):
                    await f.write(chunk)
                    downloaded += len(chunk)
                    
                    # Report progress if callback provided
                    if progress_callback and file_size > 0:
                        progress = (downloaded / file_size) * 100
                        await progress_callback(filename, progress)
            
            return filename, True, f"Downloaded {downloaded} bytes"
    except Exception as e:
        return filename, False, str(e)

async def progress_reporter(filename: str, progress: float):
    """Simple progress reporter that updates console line."""
    sys.stdout.write(f"\r{filename}: {progress:.1f}%")
    sys.stdout.flush()

async def download_files_concurrently(urls_and_filenames: List[Tuple[str, str]], max_concurrent: int = 5):
    """
    Download multiple files concurrently with progress tracking.
    
    Args:
        urls_and_filenames: List of (url, filename) tuples
        max_concurrent: Maximum number of concurrent downloads
    """
    # Create semaphore to limit concurrent downloads
    semaphore = asyncio.Semaphore(max_concurrent)
    
    async def limited_download(session, url, filename):
        async with semaphore:
            return await download_file(session, url, filename, progress_reporter)
    
    async with aiohttp.ClientSession() as session:
        # Create a download coroutine for each file; gather schedules them as tasks
        tasks = [
            limited_download(session, url, filename) 
            for url, filename in urls_and_filenames
        ]
        
        # Execute downloads concurrently
        results = await asyncio.gather(*tasks, return_exceptions=True)
        
        # Print results
        print("\n\nDownload Results:")
        success_count = 0
        for result in results:
            if isinstance(result, Exception):
                print(f"Error: {result}")
                continue
                
            filename, success, message = result
            status = "✓" if success else "✗"
            print(f"{status} {filename}: {message}")
            if success:
                success_count += 1
                
        print(f"\nCompleted: {success_count}/{len(urls_and_filenames)} downloads")

# Example usage
if __name__ == "__main__":
    # List of files to download (URL, local filename)
    downloads = [
        ("https://httpbin.org/json", "downloads/sample1.json"),
        ("https://httpbin.org/xml", "downloads/sample2.xml"),
        ("https://httpbin.org/html", "downloads/sample3.html"),
        ("https://httpbin.org/robots.txt", "downloads/robots.txt"),
        ("https://httpbin.org/uuid", "downloads/uuid.json")
    ]
    
    # Run the downloader
    asyncio.run(download_files_concurrently(downloads, max_concurrent=3))

Explanation

This concurrent file downloader addresses a common need: fetching many files efficiently. Downloads are I/O-bound, meaning the program spends most of its time waiting on the network, so fetching files one at a time wastes that waiting time. This snippet uses asyncio to run several downloads at once while providing real-time progress updates.
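
To see why this helps, here is a toy timing comparison (separate from the downloader itself) that uses asyncio.sleep as a stand-in for network I/O. Three one-second waits take about three seconds sequentially but about one second concurrently:

import asyncio
import time

async def fake_download(i: int):
    await asyncio.sleep(1)  # stands in for waiting on the network

async def sequential():
    for i in range(3):
        await fake_download(i)  # each wait starts only after the previous finishes

async def concurrent():
    await asyncio.gather(*(fake_download(i) for i in range(3)))  # waits overlap

for label, coro in (("sequential", sequential()), ("concurrent", concurrent())):
    start = time.perf_counter()
    asyncio.run(coro)
    print(f"{label}: {time.perf_counter() - start:.1f}s")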

Key Features

  1. Concurrent Downloads: Uses asyncio and aiohttp to download multiple files simultaneously, significantly reducing total download time.
  2. Progress Tracking: Shows real-time progress for each file as it downloads.
  3. Resource Control: Limits the number of concurrent downloads with a semaphore to avoid overwhelming the local machine or the remote server (this pattern is shown in isolation in the sketch after this list).
  4. Error Handling: Gracefully handles network errors and reports successes/failures clearly.
  5. Automatic Directory Creation: Creates local directories as needed.
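
The resource-control pattern from point 3 is worth seeing on its own. Below is a minimal, self-contained sketch in which work() is a hypothetical stand-in for a download:

import asyncio

async def work(i: int) -> int:
    await asyncio.sleep(0.1)  # hypothetical stand-in for a download
    return i

async def main():
    semaphore = asyncio.Semaphore(3)  # at most 3 coroutines inside the guarded block

    async def limited(i: int) -> int:
        async with semaphore:  # waits here while all 3 slots are taken
            return await work(i)

    # All 10 coroutines are scheduled immediately, but only 3 run work() at a time
    print(await asyncio.gather(*(limited(i) for i in range(10))))

asyncio.run(main())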

How It Works

  1. The download_file function handles downloading a single file using an async HTTP session. It streams data chunks to disk as they arrive, so even large files never need to be held in memory all at once.
  2. The progress_reporter callback updates the console with download progress percentages; any coroutine with the same (filename, progress) signature can be passed instead (see the sketch after this list).
  3. The main download_files_concurrently function manages the concurrent execution, limiting how many downloads happen at once.
  4. Results are collected and summarized at the end, showing which downloads succeeded or failed.
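
Because download_file accepts any coroutine taking (filename, progress), the reporter is easy to swap out. Here is a hypothetical variant that prints only when a file crosses a 25% milestone, which cuts down console churn when many files download at once:

# Hypothetical alternative to progress_reporter: report 25% milestones only
_last_milestone = {}  # filename -> last milestone printed

async def milestone_reporter(filename: str, progress: float):
    milestone = int(progress // 25) * 25
    if _last_milestone.get(filename) != milestone:
        _last_milestone[filename] = milestone
        print(f"{filename}: {milestone}%")

To use it, pass milestone_reporter instead of progress_reporter inside limited_download.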

How to Run

  1. Install required dependencies: pip install aiohttp aiofiles
  2. Customize the downloads list with your URLs and desired filenames
  3. Run the script: python downloader.py (or reuse it from another module, as sketched below)
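
If the snippet is saved as downloader.py (step 3), the entry point can also be reused from other code; a minimal sketch, with a placeholder URL and filename:

import asyncio

from downloader import download_files_concurrently  # assumes the file name from step 3

asyncio.run(download_files_concurrently(
    [("https://httpbin.org/json", "downloads/example.json")],  # placeholder list
    max_concurrent=2,
))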

Customization Options

The easiest knob to turn is max_concurrent, which trades overall speed against load on your machine and on the remote server. Session-wide behavior such as timeouts and default request headers can also be configured where the aiohttp.ClientSession is created. This approach is particularly useful when downloading many files from stable connections, handling large datasets, or scraping content from multiple sources.
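
For instance, aiohttp supports a session-wide timeout and default headers, which could be applied where download_files_concurrently creates its session. A self-contained sketch; the 60-second limit and User-Agent value are illustrative, not part of the original snippet:

import asyncio

import aiohttp

async def main():
    timeout = aiohttp.ClientTimeout(total=60)       # illustrative: abort a request after 60s total
    headers = {"User-Agent": "snippet-downloader"}  # illustrative header value
    async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
        async with session.get("https://httpbin.org/json") as response:
            print(response.status)

asyncio.run(main())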