import hashlib
import os
import json
from pathlib import Path
from typing import Dict, List, Optional
class FileIntegrityChecker:
    """Detect new, modified and deleted files under a directory via SHA-256.

    A baseline mapping of ``relative path -> hex digest`` is kept in a JSON
    file. ``generate_hashes`` (re)creates that baseline and
    ``check_integrity`` compares the current directory tree against it.
    """

    # Bytes read per call while hashing; files are streamed, never loaded whole.
    _CHUNK_SIZE = 64 * 1024

    def __init__(self, root_directory: str, hash_file: str = ".file_hashes.json"):
        """
        Initialize the File Integrity Checker.

        Args:
            root_directory: Directory to monitor for file changes
            hash_file: JSON file to store file hashes. A relative path is
                used as given, i.e. relative to the current working directory.
        """
        self.root_directory = Path(root_directory).resolve()
        self.hash_file = Path(hash_file)
        self.hashes: Dict[str, str] = {}
        self._load_hashes()

    def _calculate_hash(self, file_path: Path) -> str:
        """Calculate SHA-256 hash of a file.

        Returns:
            The hex digest, or an empty string if the file could not be read
            (the error is printed rather than raised).
        """
        hash_sha256 = hashlib.sha256()
        try:
            with open(file_path, "rb") as f:
                # Read file in chunks to handle large files efficiently
                for chunk in iter(lambda: f.read(self._CHUNK_SIZE), b""):
                    hash_sha256.update(chunk)
        except OSError as e:
            print(f"Error reading file {file_path}: {e}")
            return ""
        return hash_sha256.hexdigest()

    def _load_hashes(self) -> None:
        """Load existing hashes from JSON file.

        A missing file leaves ``self.hashes`` untouched. An unreadable,
        malformed or non-object JSON file resets it to an empty dict so later
        dictionary operations cannot fail on an unexpected type.
        """
        if not self.hash_file.exists():
            return
        try:
            with open(self.hash_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading hash file: {e}")
            self.hashes = {}
            return
        if not isinstance(loaded, dict):
            # Valid JSON that is not an object (e.g. a list) is not a baseline.
            print("Error loading hash file: expected a JSON object")
            self.hashes = {}
            return
        self.hashes = loaded

    def _save_hashes(self) -> None:
        """Save hashes to JSON file (keys sorted so the output is stable)."""
        try:
            with open(self.hash_file, "w", encoding="utf-8") as f:
                json.dump(self.hashes, f, indent=2, sort_keys=True)
        except OSError as e:
            print(f"Error saving hash file: {e}")

    def _scan(
        self,
        file_extensions: Optional[List[str]] = None,
        verbose: bool = False,
    ) -> Dict[str, str]:
        """Hash every regular file under the root directory.

        Args:
            file_extensions: If given, only files whose suffix is in this
                list are included.
            verbose: Print a "Hashed: ..." line for each file hashed.

        Returns:
            Mapping of path relative to the root directory -> SHA-256 digest.
            Files that cannot be read are left out.
        """
        hash_name = self.hash_file.name
        hash_path = self.hash_file.resolve()
        found: Dict[str, str] = {}
        for file_path in self.root_directory.rglob("*"):
            # Only regular files: directories have no content to hash and
            # opening a FIFO or socket could block indefinitely.
            if not file_path.is_file():
                continue
            # Skip only the hash store itself. The name comparison is a cheap
            # pre-filter; the resolved-path comparison keeps same-named files
            # elsewhere in the tree under integrity control.
            if file_path.name == hash_name and file_path.resolve() == hash_path:
                continue
            # Filter by file extensions if specified
            if file_extensions and file_path.suffix not in file_extensions:
                continue
            relative_path = str(file_path.relative_to(self.root_directory))
            file_hash = self._calculate_hash(file_path)
            if file_hash:
                found[relative_path] = file_hash
                if verbose:
                    print(f"Hashed: {relative_path}")
        return found

    def generate_hashes(self, file_extensions: Optional[List[str]] = None) -> None:
        """
        Generate and save hashes for all files in the directory.

        Replaces any previously loaded baseline.

        Args:
            file_extensions: List of file extensions to include (e.g., ['.py', '.txt'])
        """
        print(f"Generating hashes for files in {self.root_directory}")
        self.hashes = self._scan(file_extensions, verbose=True)
        self._save_hashes()
        print(f"Generated hashes for {len(self.hashes)} files")

    def check_integrity(
        self, file_extensions: Optional[List[str]] = None
    ) -> Dict[str, str]:
        """
        Check file integrity by comparing current hashes with stored hashes.

        Args:
            file_extensions: Optional list of file extensions to restrict the
                check to, matching the filter accepted by ``generate_hashes``.
                When given, both the files on disk and the stored entries are
                filtered, so files outside the filter are not reported.

        Returns:
            Dictionary with file paths as keys and status as values
            Status can be: 'modified', 'new', 'deleted', 'ok'
        """
        print("Checking file integrity...")
        current_hashes = self._scan(file_extensions)
        if file_extensions:
            stored = {
                path: digest
                for path, digest in self.hashes.items()
                if Path(path).suffix in file_extensions
            }
        else:
            stored = self.hashes

        # Sorted so the result order and the printed report are deterministic.
        results: Dict[str, str] = {}
        for path in sorted(set(stored) | set(current_hashes)):
            if path not in stored:
                results[path] = "new"
            elif path not in current_hashes:
                results[path] = "deleted"
            elif stored[path] == current_hashes[path]:
                results[path] = "ok"
            else:
                results[path] = "modified"

        # Print results
        modified = [f for f, s in results.items() if s == "modified"]
        new_files = [f for f, s in results.items() if s == "new"]
        deleted = [f for f, s in results.items() if s == "deleted"]
        print("Integrity check complete:")
        print(f" Modified files: {len(modified)}")
        print(f" New files: {len(new_files)}")
        print(f" Deleted files: {len(deleted)}")
        for heading, paths in (
            ("\nModified files:", modified),
            ("\nNew files:", new_files),
            ("\nDeleted files:", deleted),
        ):
            if paths:
                print(heading)
                for f in paths:
                    print(f" {f}")
        return results
# Example usage
def main() -> None:
    """Record a baseline of file hashes for ``./test_directory``."""
    # Initialize the checker
    integrity_checker = FileIntegrityChecker("./test_directory")
    # Generate hashes for all files
    integrity_checker.generate_hashes()
    # A later run would verify the tree against that baseline with:
    #     integrity_checker.check_integrity()


if __name__ == "__main__":
    main()
# This Python snippet implements a File Integrity Checker that uses SHA-256 cryptographic hashing to detect changes in files. It is particularly useful for spotting files that have been modified, added, or deleted since a baseline was recorded.
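# The tool works by hashing every file under the root directory, saving those hashes to a JSON file, and later recomputing the hashes and comparing them with the saved values.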
# Usage: save the script to a file (e.g. integrity_checker.py), then create a checker:
#     checker = FileIntegrityChecker("/path/to/your/directory")
#     checker.generate_hashes()  # For all files
#     # Or for specific file types:
#     checker.generate_hashes(['.py', '.txt', '.json'])
#     checker.check_integrity()
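# The first run will create a .file_hashes.json file in the current working directory (the default hash_file path is relative, so it is not placed inside the monitored directory unless you run from there). Subsequent runs will compare against this baseline to detect changes.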