Python Snippets

File Integrity Checker with SHA-256 Hashing

This Python snippet creates a file integrity verification tool that generates SHA-256 hashes for files and can later verify whether they have been modified. This is particularly useful for security auditing, backup verification, or ensuring file integrity during transfers.

import hashlib
import os
import json
from pathlib import Path
from typing import Dict, Optional

def calculate_file_hash(filepath: str, algorithm: str = 'sha256', chunk_size: int = 4096) -> str:
    """Calculate the hash of a file using the specified algorithm.

    The file is opened in binary mode and fed to the hash object one chunk at
    a time, so the whole file is never held in memory at once.

    Args:
        filepath: Path of the file to hash.
        algorithm: Any algorithm name accepted by ``hashlib.new``.
        chunk_size: Number of bytes to read per iteration; must be positive.

    Returns:
        The hexadecimal digest of the file's contents.

    Raises:
        ValueError: If ``chunk_size`` is not positive (``hashlib.new`` also
            raises ``ValueError`` for an unsupported algorithm name).
        FileNotFoundError: If the file does not exist.
        PermissionError: If the file cannot be opened for reading.
    """
    # read(0) returns b"" immediately, which would silently produce the digest
    # of empty input; a negative size would read the whole file in one go.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    hash_obj = hashlib.new(algorithm)
    try:
        with open(filepath, 'rb') as f:
            # iter() with a sentinel keeps reading until read() returns b"" (EOF)
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hash_obj.update(chunk)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"File not found: {filepath}") from err
    except PermissionError as err:
        raise PermissionError(f"Permission denied: {filepath}") from err
    return hash_obj.hexdigest()

def generate_integrity_report(directory: str, output_file: Optional[str] = None) -> Dict[str, str]:
    """Generate a hash report for all files in a directory.

    Walks ``directory`` recursively and hashes every file with
    ``calculate_file_hash``. Files that cannot be read are skipped with a
    printed warning rather than aborting the scan.

    Args:
        directory: Root directory to scan.
        output_file: Optional path; when given, the report is also written
            there as indented JSON. If this path lies inside ``directory`` it
            is left out of the report.

    Returns:
        A mapping of each file's path (relative to ``directory``) to its hash.

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    if not Path(directory).is_dir():
        raise ValueError(f"Path is not a directory: {directory}")

    # The report file is rewritten at the end of this function, so a hash of
    # its previous contents would be stale the moment it is saved and would
    # always show up as MODIFIED on verification. Exclude it from the scan.
    own_path = None
    if output_file:
        own_path = os.path.normcase(os.path.realpath(output_file))

    report = {}
    # Walk through all files in directory and subdirectories
    for root, _, files in os.walk(directory):
        for file in files:
            filepath = os.path.join(root, file)
            if own_path is not None and os.path.normcase(os.path.realpath(filepath)) == own_path:
                continue
            try:
                report[os.path.relpath(filepath, directory)] = calculate_file_hash(filepath)
            except OSError as e:
                # Covers missing/unreadable files as well as any other I/O
                # failure, so a single bad entry cannot abort the whole scan.
                print(f"Warning: Skipping {filepath} - {e}")

    # Save report to file if requested
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)
        print(f"Integrity report saved to {output_file}")

    return report

def verify_integrity(directory: str, report_file: str) -> Dict[str, str]:
    """Verify file integrity by comparing current hashes with a saved report.

    Only files listed in the report are checked; files present on disk but
    absent from the report are not mentioned in the result.

    Args:
        directory: Root directory the report's relative paths refer to.
        report_file: Path to a JSON report mapping relative paths to hashes.

    Returns:
        A mapping of each reported path to one of ``"OK"``, ``"MODIFIED"``,
        ``"MISSING"`` or ``"ERROR: <details>"``.

    Raises:
        FileNotFoundError: If ``report_file`` does not exist.
        ValueError: If the report is not valid JSON, is not a JSON object,
            or ``directory`` is not an existing directory.
    """
    try:
        with open(report_file, 'r', encoding='utf-8') as f:
            saved_report = json.load(f)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Report file not found: {report_file}") from err
    except json.JSONDecodeError as err:
        raise ValueError(f"Invalid JSON format in report file: {report_file}") from err

    # Valid JSON is not necessarily a path -> hash mapping (it could be a list,
    # a string, ...); fail clearly instead of crashing on .items() below.
    if not isinstance(saved_report, dict):
        raise ValueError(f"Invalid report format (expected a JSON object): {report_file}")

    if not Path(directory).is_dir():
        raise ValueError(f"Path is not a directory: {directory}")

    results = {}
    # Check each file in the saved report
    for relative_path, saved_hash in saved_report.items():
        filepath = os.path.join(directory, relative_path)

        if not os.path.exists(filepath):
            results[relative_path] = "MISSING"
            continue

        try:
            current_hash = calculate_file_hash(filepath)
        except OSError as e:
            # Any I/O failure (permission denied, path is now a directory,
            # file removed after the exists() check, ...) is recorded for this
            # entry so the remaining files are still verified.
            results[relative_path] = f"ERROR: {e}"
        else:
            results[relative_path] = "OK" if current_hash == saved_hash else "MODIFIED"

    return results

def print_verification_results(results: Dict[str, str]) -> None:
    """Write a colourised, per-file listing of verification statuses to stdout.

    Each entry is printed as ``<symbol> <path padded to 40> [<status>]``
    wrapped in an ANSI colour code, followed by a one-line summary of how
    many entries were OK, modified, missing or in error.

    Args:
        results: Mapping of file path to status string, as produced by
            ``verify_integrity``.
    """
    # (symbol, ANSI colour) for each known status; anything else, including
    # "ERROR: ..." strings, falls back to the yellow warning style.
    known_styles = {
        "OK": ("✓", "\033[92m"),        # Green
        "MODIFIED": ("✗", "\033[91m"),  # Red
        "MISSING": ("⚠", "\033[93m"),   # Yellow
    }
    fallback_style = ("⚠", "\033[93m")
    ansi_reset = "\033[0m"
    divider = "-" * 50

    print("\nFile Integrity Verification Results:")
    print(divider)

    # Tally every category up front, before any row is printed.
    all_statuses = list(results.values())
    n_modified = all_statuses.count("MODIFIED")
    n_missing = all_statuses.count("MISSING")
    n_errors = len([s for s in all_statuses if s.startswith("ERROR")])
    n_ok = all_statuses.count("OK")

    for name, state in results.items():
        glyph, colour = known_styles.get(state, fallback_style)
        print(f"{colour}{glyph} {name:<40} [{state}]{ansi_reset}")

    print(divider)
    print(f"Summary: {n_ok} OK, {n_modified} Modified, {n_missing} Missing, {n_errors} Errors")

# Example usage
if __name__ == "__main__":
    # Example: Create an integrity report for a directory
    # generate_integrity_report("./my_project", "integrity_report.json")

    # Example: Verify integrity using a saved report
    # results = verify_integrity("./my_project", "integrity_report.json")
    # print_verification_results(results)

    # Self-contained demonstration: build a throwaway directory, record its
    # hashes, verify it, then tamper with one file and verify again.
    import tempfile

    # The report is kept in its own temporary directory so that it is never
    # part of the tree whose integrity it describes. Both directories are
    # removed automatically when the with-block exits.
    with tempfile.TemporaryDirectory() as temp_dir, tempfile.TemporaryDirectory() as report_dir:
        # Create test files
        test_files = {
            "document.txt": "This is a sample document.",
            "data.csv": "name,age\nAlice,30\nBob,25\nCharlie,35",
            "script.py": "print('Hello, World!')\nprint('File integrity checking')",
        }

        for filename, content in test_files.items():
            with open(os.path.join(temp_dir, filename), "w", encoding="utf-8") as f:
                f.write(content)

        print(f"Created test files in: {temp_dir}")

        # Generate integrity report
        report_file = os.path.join(report_dir, "integrity_report.json")
        report = generate_integrity_report(temp_dir, report_file)
        print(f"Generated report for {len(report)} files")

        # Verify integrity (should all be OK)
        results = verify_integrity(temp_dir, report_file)
        print_verification_results(results)

        # Modify one file to demonstrate detection
        doc_path = os.path.join(temp_dir, "document.txt")
        with open(doc_path, "a", encoding="utf-8") as f:
            f.write("\nModified content!")

        print("\nAfter modifying document.txt:")
        results = verify_integrity(temp_dir, report_file)
        print_verification_results(results)

What This Code Does

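This file integrity checker provides three main functions, plus a print_verification_results() helper that prints a color-coded listing and summary of the results: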

calculate_file_hash(): Computes the hash of a file (SHA-256 by default) by reading it in small chunks, which keeps it memory-efficient for large files
generate_integrity_report(): Scans a directory recursively and creates a JSON report mapping file paths to their hashes
verify_integrity(): Compares current file hashes with a previously saved report to detect modifications, missing files, or access errors

Why This Is Useful

File integrity checking is essential for:

Security Auditing: Detect unauthorized changes to system files
Backup Verification: Ensure backups haven’t been corrupted
Transfer Validation: Verify files weren’t altered during copying/moving
Version Control: Track unexpected changes in project files
Compliance: Meet regulatory requirements for data integrity

The tool handles large files efficiently by reading them in 4KB chunks rather than loading everything into memory. It also gracefully handles common issues like missing files or permission errors.

How to Run It

Generate an integrity report:

# Hash every file under the directory (subdirectories included) and save the
# resulting path -> hash mapping as JSON in integrity_report.json
generate_integrity_report("/path/to/your/project", "integrity_report.json")

Verify file integrity later:

# Re-hash the files listed in the saved report and print, for each one,
# whether it is OK, MODIFIED, MISSING, or could not be read (ERROR)
results = verify_integrity("/path/to/your/project", "integrity_report.json")
print_verification_results(results)

The output uses color-coding (when supported by your terminal):

Green checkmark (✓) for files that match their original hash
Red cross (✗) for modified files
Yellow warning (⚠) for missing or inaccessible files

The example at the bottom demonstrates the complete workflow with temporary files, showing how modifications are detected.