import re
from collections import Counter
from datetime import datetime
from pathlib import Path
import argparse
def analyze_log_file(log_file_path, pattern=None, top_n=10):
"""
Analyze a log file and extract useful statistics.
Args:
log_file_path (str): Path to the log file
pattern (str): Custom regex pattern to search for
top_n (int): Number of top results to show
Returns:
dict: Analysis results
"""
    # Common log patterns; keys mirror the counters in `stats` below
    patterns = {
        'ip_addresses': r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}',
        'status_codes': r'\" (200|301|302|400|401|403|404|500|502|503) ',
        'user_agents': r'\"(Mozilla|Opera|Chrome|Safari|Firefox)[^"]*\"',
        'timestamps': r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})\]',
        'requests': r'\"(GET|POST|PUT|DELETE|HEAD|OPTIONS) [^"]*\"'
    }
# Initialize counters
stats = {
'total_lines': 0,
'ip_addresses': Counter(),
'status_codes': Counter(),
'user_agents': Counter(),
'requests': Counter(),
'timestamps': [],
'custom_matches': Counter() if pattern else None
}
# Compile regex patterns for performance
    compiled_patterns = {key: re.compile(regex) for key, regex in patterns.items()}
custom_pattern = re.compile(pattern) if pattern else None
# Read and process log file
with open(log_file_path, 'r', encoding='utf-8', errors='ignore') as file:
        for line in file:
            stats['total_lines'] += 1
# Extract information using regex
            for key, regex in compiled_patterns.items():
                matches = regex.findall(line)
                if not matches:
                    continue
                if key == 'timestamps':
                    try:
                        # Parse the first timestamp on the line into a datetime object
                        dt = datetime.strptime(matches[0].split()[0], '%d/%b/%Y:%H:%M:%S')
                        stats[key].append(dt)
                    except ValueError:
                        pass
                else:
                    # findall already yields the value of interest (IP address,
                    # status code, browser family, or HTTP method), so the
                    # matches can be counted directly
                    stats[key].update(matches)
# Apply custom pattern if provided
if custom_pattern:
custom_matches = custom_pattern.findall(line)
if custom_matches:
stats['custom_matches'].update(custom_matches)
# Calculate additional statistics
results = {
'summary': {
'total_lines': stats['total_lines'],
'unique_ips': len(stats['ip_addresses']),
'start_time': min(stats['timestamps']) if stats['timestamps'] else None,
'end_time': max(stats['timestamps']) if stats['timestamps'] else None
},
'top_ips': dict(stats['ip_addresses'].most_common(top_n)),
'status_codes': dict(stats['status_codes'].most_common(top_n)),
'top_user_agents': dict(stats['user_agents'].most_common(top_n)),
'http_methods': dict(stats['requests'].most_common(top_n))
}
if pattern:
results['custom_matches'] = dict(stats['custom_matches'].most_common(top_n))
return results
def print_results(results, log_file_path):
"""Print formatted analysis results."""
print(f"\n=== Log Analysis Results for {log_file_path} ===\n")
# Summary
summary = results['summary']
print("Summary:")
print(f" Total lines processed: {summary['total_lines']}")
print(f" Unique IP addresses: {summary['unique_ips']}")
if summary['start_time'] and summary['end_time']:
print(f" Time range: {summary['start_time']} to {summary['end_time']}")
print()
# Top IP addresses
print("Top IP addresses:")
for ip, count in results['top_ips'].items():
print(f" {ip}: {count} requests")
print()
# Status codes
print("HTTP Status Codes:")
for code, count in results['status_codes'].items():
print(f" {code}: {count} responses")
print()
# User agents
print("Top User Agents:")
for agent, count in results['top_user_agents'].items():
print(f" {agent}: {count} requests")
print()
# HTTP methods
print("HTTP Methods:")
for method, count in results['http_methods'].items():
print(f" {method}: {count} requests")
print()
# Custom matches if any
if 'custom_matches' in results and results['custom_matches']:
print("Custom Pattern Matches:")
for match, count in results['custom_matches'].items():
print(f" {match}: {count} occurrences")
print()
def main():
parser = argparse.ArgumentParser(description='Analyze log files and extract statistics')
parser.add_argument('log_file', help='Path to the log file')
parser.add_argument('-p', '--pattern', help='Custom regex pattern to search for')
parser.add_argument('-n', '--top', type=int, default=10, help='Number of top results to show (default: 10)')
args = parser.parse_args()
# Check if file exists
if not Path(args.log_file).exists():
print(f"Error: File '{args.log_file}' not found.")
return
try:
# Analyze the log file
results = analyze_log_file(args.log_file, args.pattern, args.top)
# Print results
print_results(results, args.log_file)
except Exception as e:
print(f"Error analyzing log file: {e}")
if __name__ == "__main__":
main()
This Log File Analyzer is a command-line tool for parsing and extracting insights from web server log files (such as Apache or Nginx access logs). It automatically detects:

- Client IP addresses
- HTTP status codes (200, 301, 404, 500, and other common codes)
- Browser user agents (Mozilla, Opera, Chrome, Safari, Firefox)
- Request timestamps
- HTTP methods (GET, POST, PUT, DELETE, HEAD, OPTIONS)
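As a quick illustration, here is a minimal sketch (the log line is hypothetical, in the Apache/Nginx combined format the built-in patterns assume) of what those patterns pull out of a single entry:

import re

# Hypothetical combined-format log line, used only for illustration
sample = '203.0.113.7 - - [10/Oct/2023:13:55:36 +0000] "GET /index.html HTTP/1.1" 200 2326 "-" "Mozilla/5.0 (X11; Linux x86_64)"'

print(re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', sample))             # ['203.0.113.7']
print(re.findall(r'\" (200|301|302|400|401|403|404|500|502|503) ', sample))  # ['200']
print(re.findall(r'\"(GET|POST|PUT|DELETE|HEAD|OPTIONS) [^"]*\"', sample))   # ['GET']
print(re.findall(r'\"(Mozilla|Opera|Chrome|Safari|Firefox)[^"]*\"', sample)) # ['Mozilla']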
Save the script as log_analyzer.py and run it against a log file:

python log_analyzer.py /path/to/access.log

Search each line for a custom regex pattern:

python log_analyzer.py /path/to/access.log -p "error|exception|warning"

Show the top 20 results instead of the default 10:

python log_analyzer.py /path/to/access.log -n 20
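Because the analysis logic lives in analyze_log_file(), the script can also be used as a module. A minimal sketch, assuming the file above is saved as log_analyzer.py on the import path and the log path is replaced with a real one:

from log_analyzer import analyze_log_file, print_results

# Hypothetical path; point this at an actual access log
results = analyze_log_file('/path/to/access.log', pattern=r'error|exception', top_n=5)
print(results['summary']['unique_ips'])
print_results(results, '/path/to/access.log')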
The tool is especially useful for system administrators, DevOps engineers, or anyone who needs to understand web traffic patterns, identify potential security issues, or troubleshoot server problems. It provides immediate insights without requiring complex log analysis platforms.
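For example, to flag probes against endpoints a site does not actually serve, the custom-pattern flag can be pointed at common attack paths (the pattern below is only an illustrative starting point, not a complete rule set):

python log_analyzer.py /path/to/access.log -p "wp-login|phpmyadmin|\.env|/etc/passwd"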