Python Snippets

File Integrity Checker Using SHA-256 Hashing

import hashlib
import os
import json
import sys
from pathlib import Path
from typing import Dict, List, Optional

def calculate_file_hash(file_path: str, algorithm: str = 'sha256') -> str:
    """
    Calculate the hash of a file using the specified algorithm.
    
    Args:
        file_path: Path to the file
        algorithm: Hash algorithm to use (default: sha256)
        
    Returns:
        Hexadecimal hash string
    """
    hash_obj = hashlib.new(algorithm)
    
    try:
        with open(file_path, 'rb') as f:
            # Read file in chunks to handle large files efficiently
            for chunk in iter(lambda: f.read(8192), b""):
                hash_obj.update(chunk)
        return hash_obj.hexdigest()
    except OSError as e:
        # IOError is an alias of OSError in Python 3; chain the original error
        raise IOError(f"Error reading file {file_path}: {e}") from e
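
# Note: on Python 3.11+, hashlib.file_digest() can replace the manual chunked
# read above, e.g. `return hashlib.file_digest(f, algorithm).hexdigest()`
# inside the `with open(...)` block.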

def generate_integrity_report(directory: str, output_file: Optional[str] = None) -> Dict[str, str]:
    """
    Generate a file integrity report for all files in a directory.
    
    Args:
        directory: Directory path to scan
        output_file: Optional path to save the report as JSON
        
    Returns:
        Dictionary mapping file paths to their SHA-256 hashes
    """
    if not os.path.isdir(directory):
        raise ValueError(f"Directory '{directory}' does not exist")
    
    integrity_report = {}
    dir_path = Path(directory)
    
    # Walk through directory and calculate hashes
    for file_path in dir_path.rglob('*'):
        if file_path.is_file():
            try:
                relative_path = str(file_path.relative_to(dir_path))
                file_hash = calculate_file_hash(str(file_path))
                integrity_report[relative_path] = file_hash
            except IOError as e:
                print(f"Warning: {e}")
    
    # Optionally save to file
    if output_file:
        with open(output_file, 'w') as f:
            json.dump(integrity_report, f, indent=2)
        print(f"Integrity report saved to {output_file}")
    
    return integrity_report

def verify_integrity(directory: str, reference_file: str) -> Dict[str, List[str]]:
    """
    Verify file integrity by comparing current hashes with reference values.
    
    Args:
        directory: Directory to check
        reference_file: JSON file with reference hashes
        
    Returns:
        Dictionary with 'modified', 'missing', and 'new' file lists
    """
    # Load reference hashes
    try:
        with open(reference_file, 'r') as f:
            reference_hashes = json.load(f)
    except (IOError, json.JSONDecodeError) as e:
        raise ValueError(f"Error loading reference file: {e}")
    
    # Generate current hashes
    current_hashes = generate_integrity_report(directory)
    
    # Compare hashes
    results = {
        'modified': [],
        'missing': [],
        'new': []
    }
    
    # Check for modified or missing files
    for file_path, ref_hash in reference_hashes.items():
        if file_path not in current_hashes:
            results['missing'].append(file_path)
        elif current_hashes[file_path] != ref_hash:
            results['modified'].append(file_path)
    
    # Check for new files
    for file_path in current_hashes:
        if file_path not in reference_hashes:
            results['new'].append(file_path)
    
    return results

def main():
    """
    Command-line interface for the integrity checker.
    """
    import argparse
    
    parser = argparse.ArgumentParser(description="File Integrity Checker")
    parser.add_argument("directory", help="Directory to check")
    parser.add_argument("--generate", metavar="REPORT_FILE", 
                        help="Generate integrity report and save to file")
    parser.add_argument("--verify", metavar="REFERENCE_FILE",
                        help="Verify integrity against reference report")
    
    args = parser.parse_args()
    
    if args.generate:
        try:
            generate_integrity_report(args.directory, args.generate)
        except Exception as e:
            print(f"Error: {e}")
            return 1
    elif args.verify:
        try:
            results = verify_integrity(args.directory, args.verify)
            
            print("File Integrity Check Results:")
            print("=" * 40)
            
            if results['modified']:
                print(f"\nModified files ({len(results['modified'])}):")
                for file_path in results['modified']:
                    print(f"  - {file_path}")
            
            if results['missing']:
                print(f"\nMissing files ({len(results['missing'])}):")
                for file_path in results['missing']:
                    print(f"  - {file_path}")
            
            if results['new']:
                print(f"\nNew files ({len(results['new'])}):")
                for file_path in results['new']:
                    print(f"  - {file_path}")
            
            if not any(results.values()):
                print("\nAll files verified successfully! No changes detected.")
                
        except Exception as e:
            print(f"Error: {e}")
            return 1
    else:
        parser.print_help()
        return 1
    
    return 0

if __name__ == "__main__":
    sys.exit(main())

This script provides a file integrity checking system built on SHA-256 cryptographic hashes. The main components are:

  1. calculate_file_hash() - Computes SHA-256 hash of a file efficiently by reading in chunks
  2. generate_integrity_report() - Creates a report of all files in a directory with their hashes
  3. verify_integrity() - Compares current file hashes against a saved reference report
  4. main() - Command-line interface with --generate and --verify options for generating reports and checking integrity
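
Beyond the CLI, the functions can also be imported and used directly. A minimal sketch, assuming the script is saved as integrity_checker.py and using placeholder paths:

from integrity_checker import generate_integrity_report, verify_integrity

# Create a baseline for a directory and save it to JSON (paths are placeholders)
generate_integrity_report("/srv/app", output_file="baseline.json")

# Later: compare the directory's current state against the saved baseline
results = verify_integrity("/srv/app", "baseline.json")
if any(results.values()):
    print("Changes detected:", results)
else:
    print("No changes detected.")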

Use Cases:

  - Security monitoring of critical directories
  - Deployment verification against a known-good baseline
  - Change detection for files that should remain static

How to run:

Generate an integrity report:

python integrity_checker.py /path/to/directory --generate baseline.json
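
The generated baseline.json simply maps each file's path (relative to the scanned directory) to its SHA-256 hex digest, along these lines (paths are illustrative; digests abbreviated to placeholders):

{
  "app/main.py": "<64-character hex digest>",
  "docs/readme.txt": "<64-character hex digest>"
}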

Verify against a saved report:

python integrity_checker.py /path/to/directory --verify baseline.json
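
For example, if one file had changed since the baseline was generated, the verification output would look roughly like this (the file name is purely illustrative):

File Integrity Check Results:
========================================

Modified files (1):
  - config/settings.yaml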

The output lists any files that were modified, removed, or added since the baseline was created, which makes the tool useful for security monitoring, deployment verification, and general change detection in critical directories.
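
One way to wire this into ongoing monitoring (the paths below are examples, not part of the script) is a small wrapper whose exit code tells cron or a CI job whether anything changed:

#!/usr/bin/env python3
"""Hypothetical monitoring wrapper around integrity_checker; paths are examples."""
import sys

from integrity_checker import verify_integrity

def check(directory: str, baseline: str) -> int:
    results = verify_integrity(directory, baseline)
    change_count = sum(len(files) for files in results.values())
    if change_count:
        # A non-zero exit code lets cron or CI flag the run as failed
        print(f"Integrity check failed: {change_count} change(s): {results}")
        return 1
    print("Integrity check passed: no changes detected.")
    return 0

if __name__ == "__main__":
    sys.exit(check("/etc/myapp", "/var/lib/myapp/baseline.json"))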