This Python snippet implements a file integrity verification tool: it generates SHA-256 hashes for files and can later verify whether those files have been modified. This is particularly useful for security auditing, backup verification, or ensuring file integrity during transfers.
import hashlib
import os
import json
from pathlib import Path
from typing import Dict, Optional
def calculate_file_hash(filepath: str, algorithm: str = 'sha256', chunk_size: int = 4096) -> str:
    """Return the hexadecimal digest of a file's contents.

    The file is opened in binary mode and fed to the hash object one chunk
    at a time, so memory use is bounded by ``chunk_size`` rather than by the
    size of the file.

    Args:
        filepath: Path of the file to hash.
        algorithm: Algorithm name passed to ``hashlib.new``.
        chunk_size: Number of bytes read per iteration; must be positive.

    Returns:
        The digest as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``hashlib.new``
            rejects ``algorithm``.
        FileNotFoundError: If ``filepath`` does not exist.
        PermissionError: If ``filepath`` cannot be opened for reading.
    """
    # read(0) returns b"" immediately (yielding the empty-input digest) and a
    # negative size reads the whole file at once; reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer: {chunk_size}")

    hash_obj = hashlib.new(algorithm)
    try:
        with open(filepath, 'rb') as f:
            # iter(callable, sentinel) keeps reading until read() returns b"".
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hash_obj.update(chunk)
    except FileNotFoundError as err:
        # Re-raise with a message naming the path, keeping the original
        # exception attached as the cause.
        raise FileNotFoundError(f"File not found: {filepath}") from err
    except PermissionError as err:
        raise PermissionError(f"Permission denied: {filepath}") from err
    return hash_obj.hexdigest()
def generate_integrity_report(directory: str, output_file: Optional[str] = None) -> Dict[str, str]:
    """Hash every file under a directory and optionally save the result as JSON.

    The directory is walked recursively. Each file is hashed with
    ``calculate_file_hash`` and recorded under its path relative to
    ``directory``. Files that cannot be read are skipped with a warning
    printed to stdout instead of aborting the whole scan.

    Args:
        directory: Root directory to scan.
        output_file: If given, the report is written to this path as
            indented JSON. When this path lies inside ``directory`` it is
            excluded from the scan.

    Returns:
        A mapping of relative file path to hex digest.

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    directory_path = Path(directory)
    if not directory_path.is_dir():
        raise ValueError(f"Path is not a directory: {directory}")

    # A report left inside the scanned tree by an earlier run must not be
    # hashed: it is about to be overwritten, so its recorded hash would be
    # stale immediately and the file would always verify as MODIFIED.
    skip_path = None
    if output_file:
        skip_path = os.path.normcase(os.path.abspath(output_file))

    report: Dict[str, str] = {}
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a stable order, so two
        # runs over the same tree produce identically ordered reports.
        dirs.sort()
        for filename in sorted(files):
            filepath = os.path.join(root, filename)
            if skip_path is not None and os.path.normcase(os.path.abspath(filepath)) == skip_path:
                continue
            try:
                file_hash = calculate_file_hash(filepath)
            except OSError as e:
                # Covers FileNotFoundError and PermissionError as well as any
                # other I/O failure on a single file.
                print(f"Warning: Skipping {filepath} - {e}")
                continue
            report[os.path.relpath(filepath, directory)] = file_hash

    # Save report to file if requested
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)
        print(f"Integrity report saved to {output_file}")
    return report
def verify_integrity(directory: str, report_file: str) -> Dict[str, str]:
    """Compare the current state of files against a saved hash report.

    Every path listed in the report is resolved relative to ``directory``
    and classified as one of:

    * ``"OK"`` - the current hash equals the saved hash.
    * ``"MODIFIED"`` - the current hash differs from the saved hash.
    * ``"MISSING"`` - the path no longer exists.
    * ``"ERROR: <message>"`` - the path exists but could not be hashed.

    Files present in ``directory`` but absent from the report are not
    examined.

    Args:
        directory: Root directory the report's relative paths refer to.
        report_file: Path of a JSON report mapping relative path to hash.

    Returns:
        A mapping of relative path to status string.

    Raises:
        FileNotFoundError: If ``report_file`` does not exist.
        ValueError: If the report is not valid JSON, is not a JSON object,
            or ``directory`` is not an existing directory.
    """
    try:
        with open(report_file, 'r', encoding='utf-8') as f:
            saved_report = json.load(f)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Report file not found: {report_file}") from err
    except json.JSONDecodeError as err:
        raise ValueError(f"Invalid JSON format in report file: {report_file}") from err

    # Valid JSON may still be a list, string or number; only an object can be
    # iterated as path -> hash pairs.
    if not isinstance(saved_report, dict):
        raise ValueError(f"Invalid report format (expected a JSON object): {report_file}")

    if not Path(directory).is_dir():
        raise ValueError(f"Path is not a directory: {directory}")

    results: Dict[str, str] = {}
    # Check each file in the saved report
    for relative_path, saved_hash in saved_report.items():
        filepath = os.path.join(directory, relative_path)
        if not os.path.exists(filepath):
            results[relative_path] = "MISSING"
            continue
        try:
            current_hash = calculate_file_hash(filepath)
        except OSError as e:
            # Includes FileNotFoundError and PermissionError, and also cases
            # such as the path now being a directory, which would otherwise
            # abort verification of the remaining entries.
            results[relative_path] = f"ERROR: {e}"
            continue
        results[relative_path] = "OK" if current_hash == saved_hash else "MODIFIED"
    return results
def print_verification_results(results: Dict[str, str]) -> None:
    """Print one status line per file followed by a summary of the counts.

    Each line shows a status symbol, the file path left-aligned in a
    40-character column, and the status in brackets. Lines are wrapped in
    ANSI colour codes (green for OK, red for MODIFIED, yellow for anything
    else) only when stdout is a terminal, so redirected or captured output
    stays free of escape sequences.

    Args:
        results: Mapping of file path to status string. Statuses starting
            with ``"ERROR"`` are counted as errors; statuses other than OK,
            MODIFIED, MISSING and ERROR are printed but not counted.
    """
    import sys

    # Lookup tables are loop-invariant, so they are built once here.
    symbols = {
        "OK": "✓",
        "MODIFIED": "✗",
        "MISSING": "⚠",
    }
    colors = {
        "OK": "\033[92m",  # Green
        "MODIFIED": "\033[91m",  # Red
        "MISSING": "\033[93m",  # Yellow
    }
    default_symbol = "⚠"
    default_color = "\033[93m"  # Yellow for errors and unknown statuses
    reset_color = "\033[0m"

    # Some stdout replacements lack isatty(); treat those as non-terminals.
    isatty = getattr(sys.stdout, "isatty", None)
    use_color = bool(isatty and isatty())

    counts = {"OK": 0, "MODIFIED": 0, "MISSING": 0, "ERROR": 0}

    print("\nFile Integrity Verification Results:")
    print("-" * 50)
    for filepath, status in results.items():
        # Error statuses carry a message suffix, so they are matched by prefix.
        bucket = "ERROR" if status.startswith("ERROR") else status
        if bucket in counts:
            counts[bucket] += 1

        line = f"{symbols.get(status, default_symbol)} {filepath:<40} [{status}]"
        if use_color:
            line = f"{colors.get(status, default_color)}{line}{reset_color}"
        print(line)
    print("-" * 50)
    print(
        f"Summary: {counts['OK']} OK, {counts['MODIFIED']} Modified, "
        f"{counts['MISSING']} Missing, {counts['ERROR']} Errors"
    )
# Example usage
def main() -> None:
    """Run a self-contained demonstration of the integrity workflow.

    Creates a few files in a temporary directory, generates a report,
    verifies it (every file is expected to be OK), then appends to one file
    and verifies again to show the change being detected.

    Typical use against a real directory looks like::

        generate_integrity_report("./my_project", "integrity_report.json")
        results = verify_integrity("./my_project", "integrity_report.json")
        print_verification_results(results)
    """
    import tempfile

    with tempfile.TemporaryDirectory() as temp_dir:
        # Keep the scanned files in their own subdirectory so the report file
        # written next to it is never part of the tree being hashed.
        project_dir = os.path.join(temp_dir, "project")
        os.mkdir(project_dir)

        # Create test files
        test_files = {
            "document.txt": "This is a sample document.",
            "data.csv": "name,age\nAlice,30\nBob,25\nCharlie,35",
            "script.py": "print('Hello, World!')\nprint('File integrity checking')",
        }
        for filename, content in test_files.items():
            with open(os.path.join(project_dir, filename), "w", encoding="utf-8") as f:
                f.write(content)
        print(f"Created test files in: {project_dir}")

        # Generate integrity report
        report_file = os.path.join(temp_dir, "integrity_report.json")
        report = generate_integrity_report(project_dir, report_file)
        print(f"Generated report for {len(report)} files")

        # Verify integrity (should all be OK)
        results = verify_integrity(project_dir, report_file)
        print_verification_results(results)

        # Modify one file to demonstrate detection
        doc_path = os.path.join(project_dir, "document.txt")
        with open(doc_path, "a", encoding="utf-8") as f:
            f.write("\nModified content!")
        print("\nAfter modifying document.txt:")
        results = verify_integrity(project_dir, report_file)
        print_verification_results(results)


if __name__ == "__main__":
    main()
This file integrity checker provides three main functions:
- calculate_file_hash(): Computes the SHA-256 hash of a file by reading it in small chunks (making it memory-efficient for large files)
- generate_integrity_report(): Scans a directory recursively and creates a JSON report mapping file paths to their hashes
- verify_integrity(): Compares current file hashes with a previously saved report to detect modifications, missing files, or access errors

File integrity checking is essential for security auditing, backup verification, and ensuring file integrity during transfers.
The tool handles large files efficiently by reading them in 4KB chunks rather than loading everything into memory. It also gracefully handles common issues like missing files or permission errors.
# Hash every file under the directory (recursively) and save the path-to-hash mapping as JSON
generate_integrity_report("/path/to/your/project", "integrity_report.json")
# Re-hash the files listed in the saved report and classify each as OK, MODIFIED, MISSING or ERROR
results = verify_integrity("/path/to/your/project", "integrity_report.json")
# Print one status line per file followed by a summary of the counts
print_verification_results(results)
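The output uses color-coding (when supported by your terminal): a green ✓ for files that are OK, a red ✗ for modified files, and a yellow ⚠ for missing files and errors.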
The example at the bottom demonstrates the complete workflow with temporary files, showing how modifications are detected.