import hashlib
import os
import json
from pathlib import Path
from typing import Dict, List, Optional
def calculate_file_hash(file_path: str, algorithm: str = 'sha256',
                        chunk_size: int = 8192) -> str:
    """
    Calculate the hash of a file using the specified algorithm.

    Args:
        file_path: Path to the file.
        algorithm: Hash algorithm to use (default: sha256). Must be a name
            accepted by ``hashlib.new``; an unknown name raises ValueError.
        chunk_size: Number of bytes to read per iteration (default: 8192).

    Returns:
        Hexadecimal hash string.

    Raises:
        IOError: If the file cannot be read.
    """
    hash_obj = hashlib.new(algorithm)
    try:
        with open(file_path, 'rb') as f:
            # Read file in chunks so arbitrarily large files are hashed
            # without loading them fully into memory.
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hash_obj.update(chunk)
    except (IOError, OSError) as e:
        # Chain the original exception so the underlying OS error
        # (permissions, missing file, ...) stays visible in tracebacks.
        raise IOError(f"Error reading file {file_path}: {e}") from e
    return hash_obj.hexdigest()
def generate_integrity_report(directory: str, output_file: Optional[str] = None) -> Dict[str, str]:
    """
    Generate a file integrity report for all files in a directory.

    Args:
        directory: Directory path to scan recursively.
        output_file: Optional path to save the report as JSON.

    Returns:
        Dictionary mapping file paths (relative to ``directory``) to their
        SHA-256 hashes.

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise ValueError(f"Directory '{directory}' does not exist")
    integrity_report: Dict[str, str] = {}
    dir_path = Path(directory)
    # Sort the walk so repeated runs over an identical tree produce
    # identically-ordered reports (rglob order is filesystem-dependent),
    # which keeps the saved JSON diff-friendly.
    for file_path in sorted(dir_path.rglob('*')):
        if not file_path.is_file():
            continue
        relative_path = str(file_path.relative_to(dir_path))
        try:
            integrity_report[relative_path] = calculate_file_hash(str(file_path))
        except IOError as e:
            # Best-effort scan: skip unreadable files but keep going.
            print(f"Warning: {e}")
    # Optionally save to file
    if output_file:
        with open(output_file, 'w') as f:
            json.dump(integrity_report, f, indent=2)
        print(f"Integrity report saved to {output_file}")
    return integrity_report
def verify_integrity(directory: str, reference_file: str) -> Dict[str, List[str]]:
    """
    Verify file integrity by comparing current hashes with reference values.

    Args:
        directory: Directory to check.
        reference_file: JSON file with reference hashes.

    Returns:
        Dictionary with 'modified', 'missing', and 'new' file lists.

    Raises:
        ValueError: If the reference file cannot be read or parsed.
    """
    # Load the baseline hashes from the reference report.
    try:
        with open(reference_file, 'r') as f:
            reference_hashes = json.load(f)
    except (IOError, json.JSONDecodeError) as e:
        raise ValueError(f"Error loading reference file: {e}")

    # Hash the directory as it stands right now.
    current_hashes = generate_integrity_report(directory)

    # A reference entry that is absent now is missing; one whose hash
    # differs is modified; a current entry absent from the reference is new.
    missing = [p for p in reference_hashes if p not in current_hashes]
    modified = [
        p for p, ref_hash in reference_hashes.items()
        if p in current_hashes and current_hashes[p] != ref_hash
    ]
    new = [p for p in current_hashes if p not in reference_hashes]

    return {
        'modified': modified,
        'missing': missing,
        'new': new,
    }
def main():
    """
    Command-line interface for the integrity checker.

    Returns 0 on success, 1 on error or when no action flag is given.
    """
    import argparse
    parser = argparse.ArgumentParser(description="File Integrity Checker")
    parser.add_argument("directory", help="Directory to check")
    parser.add_argument("--generate", metavar="REPORT_FILE",
                        help="Generate integrity report and save to file")
    parser.add_argument("--verify", metavar="REFERENCE_FILE",
                        help="Verify integrity against reference report")
    args = parser.parse_args()

    # Generate mode: write a fresh baseline report and exit.
    if args.generate:
        try:
            generate_integrity_report(args.directory, args.generate)
        except Exception as e:
            print(f"Error: {e}")
            return 1
        return 0

    # Verify mode: diff the directory against a saved baseline.
    if args.verify:
        try:
            results = verify_integrity(args.directory, args.verify)
        except Exception as e:
            print(f"Error: {e}")
            return 1
        print("File Integrity Check Results:")
        print("=" * 40)
        # The three result categories share one output format.
        for label, key in (("Modified", "modified"),
                           ("Missing", "missing"),
                           ("New", "new")):
            paths = results[key]
            if paths:
                print(f"\n{label} files ({len(paths)}):")
                for file_path in paths:
                    print(f" - {file_path}")
        if not any(results.values()):
            print("\nAll files verified successfully! No changes detected.")
        return 0

    # Neither flag given: show usage and signal failure.
    parser.print_help()
    return 1
# Run the CLI only when executed as a script, propagating main()'s
# integer status to the shell exit code.
if __name__ == "__main__":
    exit(main())
This script provides a comprehensive file integrity checking system using SHA-256 cryptographic hashes. The main components are:

- calculate_file_hash() - Computes the SHA-256 hash of a file efficiently by reading it in chunks
- generate_integrity_report() - Creates a report of all files in a directory with their hashes
- verify_integrity() - Compares current file hashes against a saved reference report

Use cases:
How to run:
Generate an integrity report:
python integrity_checker.py /path/to/directory --generate baseline.json
Verify against a saved report:
python integrity_checker.py /path/to/directory --verify baseline.json
The output will show any modified, missing, or new files compared to the baseline. This is especially valuable for security monitoring, deployment verification, and change detection in critical directories.