Python Snippets

File Integrity Checker with SHA-256 Hashing

import hashlib
import os
import json
from pathlib import Path
from typing import Dict, List, Optional

class FileIntegrityChecker:
    """Track file changes under a directory by comparing SHA-256 hashes.

    A baseline mapping of relative path -> hex digest is built with
    generate_hashes() and persisted as JSON. check_integrity() re-hashes the
    tree and classifies every path as 'ok', 'modified', 'new' or 'deleted'.
    """

    # Bytes read per iteration while hashing; keeps memory use flat for
    # arbitrarily large files.
    _CHUNK_SIZE = 65536

    def __init__(self, root_directory: str, hash_file: str = ".file_hashes.json"):
        """
        Initialize the File Integrity Checker.

        Any baseline already present in ``hash_file`` is loaded immediately.

        Args:
            root_directory: Directory to monitor for file changes
            hash_file: JSON file to store file hashes. A relative path is
                interpreted against the current working directory, not
                against ``root_directory``.
        """
        self.root_directory = Path(root_directory).resolve()
        self.hash_file = Path(hash_file)
        self.hashes: Dict[str, str] = {}
        self._load_hashes()

    def _calculate_hash(self, file_path: Path) -> str:
        """Calculate SHA-256 hash of a file.

        Args:
            file_path: File to hash.

        Returns:
            The hex digest, or an empty string if the file could not be read
            (the error is printed rather than raised).
        """
        hash_sha256 = hashlib.sha256()
        try:
            with open(file_path, "rb") as f:
                # Read file in chunks to handle large files efficiently
                for chunk in iter(lambda: f.read(self._CHUNK_SIZE), b""):
                    hash_sha256.update(chunk)
        except OSError as e:
            print(f"Error reading file {file_path}: {e}")
            return ""
        return hash_sha256.hexdigest()

    def _load_hashes(self) -> None:
        """Load existing hashes from JSON file.

        A missing file leaves the baseline empty. An unreadable, malformed or
        wrongly shaped file (anything other than an object mapping strings to
        strings) is reported and also results in an empty baseline, so later
        calls never operate on a non-dict value.
        """
        if not self.hash_file.exists():
            return

        try:
            with open(self.hash_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers both json.JSONDecodeError and decoding errors.
            print(f"Error loading hash file: {e}")
            self.hashes = {}
            return

        is_valid = isinstance(loaded, dict) and all(
            isinstance(digest, str) for digest in loaded.values()
        )
        if not is_valid:
            print(
                "Error loading hash file: expected a JSON object "
                "mapping file paths to hashes"
            )
            self.hashes = {}
            return

        self.hashes = loaded

    def _save_hashes(self) -> None:
        """Save hashes to JSON file.

        Keys are written in sorted order so the file content does not depend
        on directory traversal order. Write errors are printed, not raised.
        """
        try:
            with open(self.hash_file, "w", encoding="utf-8") as f:
                json.dump(self.hashes, f, indent=2, sort_keys=True)
        except OSError as e:
            print(f"Error saving hash file: {e}")

    @staticmethod
    def _normalize_extensions(file_extensions: Optional[List[str]]) -> set:
        """Return the extension filter as a set of suffixes with leading dots.

        An empty or missing filter yields an empty set, meaning "no filter".
        A bare string is treated as a single extension, and entries given
        without a leading dot (e.g. 'py') are accepted.
        """
        if not file_extensions:
            return set()
        if isinstance(file_extensions, str):
            file_extensions = [file_extensions]
        return {
            ext if not ext or ext.startswith(".") else f".{ext}"
            for ext in file_extensions
        }

    def _iter_files(self, file_extensions: Optional[List[str]] = None):
        """Yield (relative_path, absolute_path) for every monitored file.

        Only regular files are yielded, so directories, broken symlinks and
        special files such as FIFOs are never opened. Any file whose name
        equals the hash file's name is skipped, wherever it sits in the tree.

        Args:
            file_extensions: Optional extension filter; see generate_hashes().
        """
        wanted = self._normalize_extensions(file_extensions)
        for file_path in self.root_directory.rglob("*"):
            if not file_path.is_file() or file_path.name == self.hash_file.name:
                continue
            if wanted and file_path.suffix not in wanted:
                continue
            yield str(file_path.relative_to(self.root_directory)), file_path

    def generate_hashes(self, file_extensions: Optional[List[str]] = None) -> None:
        """
        Generate and save hashes for all files in the directory.

        The previous baseline is discarded. Files that cannot be read are
        reported and left out of the new baseline.

        Args:
            file_extensions: List of file extensions to include (e.g., ['.py', '.txt']).
                ``None`` or an empty list includes every file.
        """
        self.hashes = {}
        print(f"Generating hashes for files in {self.root_directory}")

        for relative_path, file_path in self._iter_files(file_extensions):
            file_hash = self._calculate_hash(file_path)
            if file_hash:
                self.hashes[relative_path] = file_hash
                print(f"Hashed: {relative_path}")

        self._save_hashes()
        print(f"Generated hashes for {len(self.hashes)} files")

    def check_integrity(
        self, file_extensions: Optional[List[str]] = None
    ) -> Dict[str, str]:
        """
        Check file integrity by comparing current hashes with stored hashes.

        Args:
            file_extensions: Optional list of extensions to restrict the check
                to. Pass the same filter that was given to generate_hashes()
                so that files outside the baseline's scope are not reported
                as 'new'. ``None`` or an empty list checks every file.

        Returns:
            Dictionary with file paths as keys and status as values, ordered
            by path. Status can be: 'modified', 'new', 'deleted', 'ok'.
            A file that exists but cannot be read is treated as absent.
        """
        print("Checking file integrity...")

        # Calculate current hashes
        current_hashes: Dict[str, str] = {}
        for relative_path, file_path in self._iter_files(file_extensions):
            file_hash = self._calculate_hash(file_path)
            if file_hash:
                current_hashes[relative_path] = file_hash

        # Apply the same filter to the baseline, otherwise stored entries
        # outside the filter would all show up as 'deleted'.
        wanted = self._normalize_extensions(file_extensions)
        if wanted:
            stored_hashes = {
                path: digest
                for path, digest in self.hashes.items()
                if Path(path).suffix in wanted
            }
        else:
            stored_hashes = self.hashes

        # Compare with stored hashes; sorting makes the result and the
        # printed report deterministic.
        results: Dict[str, str] = {}
        for path in sorted(stored_hashes.keys() | current_hashes.keys()):
            if path not in stored_hashes:
                results[path] = "new"
            elif path not in current_hashes:
                results[path] = "deleted"
            elif stored_hashes[path] == current_hashes[path]:
                results[path] = "ok"
            else:
                results[path] = "modified"

        self._print_report(results)
        return results

    @staticmethod
    def _print_report(results: Dict[str, str]) -> None:
        """Print the per-status counts followed by the affected paths."""
        modified = [f for f, s in results.items() if s == "modified"]
        new_files = [f for f, s in results.items() if s == "new"]
        deleted = [f for f, s in results.items() if s == "deleted"]

        print("Integrity check complete:")
        print(f"  Modified files: {len(modified)}")
        print(f"  New files: {len(new_files)}")
        print(f"  Deleted files: {len(deleted)}")

        sections = (
            ("\nModified files:", modified),
            ("\nNew files:", new_files),
            ("\nDeleted files:", deleted),
        )
        for heading, paths in sections:
            if paths:
                print(heading)
                for f in paths:
                    print(f"  {f}")

# Example usage: build a baseline for a sample directory when run as a script
if __name__ == "__main__":
    # Point the checker at the directory to monitor; any previously saved
    # baseline is loaded by the constructor.
    integrity_checker = FileIntegrityChecker("./test_directory")

    # Hash every file under that directory and persist the result.
    integrity_checker.generate_hashes()

    # On a later run, compare the tree against the saved baseline with:
    # integrity_checker.check_integrity()

What This Code Does

This Python snippet implements a File Integrity Checker that uses SHA-256 cryptographic hashing to detect changes in files. It’s particularly useful for:

  1. Security monitoring - Detect unauthorized changes to system files
  2. Backup verification - Ensure backup files haven’t been corrupted
  3. Development workflow - Track unintended modifications during development
  4. Compliance auditing - Verify file integrity for regulatory requirements

The tool works by:

  1. Scanning a directory recursively and calculating SHA-256 hashes for all files
  2. Storing these hashes in a JSON file for future reference
  3. Comparing current file hashes with stored hashes to detect modifications, additions, or deletions

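Key Features

  - Recursive scanning of the monitored directory, including subdirectories
  - Chunked file reading, so large files are hashed without being loaded into memory all at once
  - Optional filtering by file extension when generating the baseline
  - Hashes persisted in a human-readable JSON file
  - Reports modified, new, and deleted files
  - Read errors are printed and the affected file is skipped instead of aborting the run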

How to Run

  1. Save the code to a Python file (e.g., integrity_checker.py)
  2. Create a directory to monitor (the tool will work with any existing directory)
  3. Initialize the checker with:
    checker = FileIntegrityChecker("/path/to/your/directory")
    
  4. Generate initial hashes:
    checker.generate_hashes()  # For all files
    # Or for specific file types:
    checker.generate_hashes(['.py', '.txt', '.json'])
    
  5. After making changes to your files, check integrity:
    checker.check_integrity()
    

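The first run will create a .file_hashes.json file in the current working directory (the default hash_file path is relative, so it is not placed inside the monitored directory unless you run the script from there or pass an explicit path). Subsequent runs will compare against this baseline to detect changes.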