Python Snippets

Log File Analyzer with Pattern Matching

import re
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
import argparse

def analyze_log_file(log_file_path, pattern=None, top_n=10):
    """
    Analyze a log file and extract useful statistics.
    
    Args:
        log_file_path (str): Path to the log file
        pattern (str): Custom regex pattern to search for
        top_n (int): Number of top results to show
    
    Returns:
        dict: Analysis results
    """
    # Common log patterns; keys deliberately match the stats counters below
    patterns = {
        'ip_addresses': r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}',
        'status_codes': r'" (200|301|302|400|401|403|404|500|502|503) ',
        'user_agents': r'"(Mozilla|Opera|Chrome|Safari|Firefox)[^"]*"',
        'timestamps': r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})\]',
        'requests': r'"(GET|POST|PUT|DELETE|HEAD|OPTIONS) [^"]*"'
    }
    
    # Initialize counters
    stats = {
        'total_lines': 0,
        'ip_addresses': Counter(),
        'status_codes': Counter(),
        'user_agents': Counter(),
        'requests': Counter(),
        'timestamps': [],
        'custom_matches': Counter() if pattern else None
    }
    
    # Compile regex patterns once for performance; use a distinct loop
    # variable so the 'pattern' argument is not shadowed
    compiled_patterns = {key: re.compile(regex) for key, regex in patterns.items()}
    custom_pattern = re.compile(pattern) if pattern else None
    
    # Read and process log file
    with open(log_file_path, 'r', encoding='utf-8', errors='ignore') as file:
        for line in file:
            stats['total_lines'] += 1
            
            # Extract information using the compiled patterns
            for key, regex in compiled_patterns.items():
                matches = regex.findall(line)
                if not matches:
                    continue
                if key == 'timestamps':
                    try:
                        # Drop the timezone offset and parse into a datetime
                        dt = datetime.strptime(matches[0].split()[0],
                                               '%d/%b/%Y:%H:%M:%S')
                        stats[key].append(dt)
                    except ValueError:
                        pass
                else:
                    # The capture groups already isolate the value of interest
                    # (IP address, status code, browser family, or HTTP method)
                    stats[key].update(matches)
            
            # Apply custom pattern if provided
            if custom_pattern:
                custom_matches = custom_pattern.findall(line)
                if custom_matches:
                    stats['custom_matches'].update(custom_matches)
    
    # Calculate additional statistics
    results = {
        'summary': {
            'total_lines': stats['total_lines'],
            'unique_ips': len(stats['ip_addresses']),
            'start_time': min(stats['timestamps']) if stats['timestamps'] else None,
            'end_time': max(stats['timestamps']) if stats['timestamps'] else None
        },
        'top_ips': dict(stats['ip_addresses'].most_common(top_n)),
        'status_codes': dict(stats['status_codes'].most_common(top_n)),
        'top_user_agents': dict(stats['user_agents'].most_common(top_n)),
        'http_methods': dict(stats['requests'].most_common(top_n))
    }
    
    if pattern:
        results['custom_matches'] = dict(stats['custom_matches'].most_common(top_n))
    
    return results

def print_results(results, log_file_path):
    """Print formatted analysis results."""
    print(f"\n=== Log Analysis Results for {log_file_path} ===\n")
    
    # Summary
    summary = results['summary']
    print("Summary:")
    print(f"  Total lines processed: {summary['total_lines']}")
    print(f"  Unique IP addresses: {summary['unique_ips']}")
    if summary['start_time'] and summary['end_time']:
        print(f"  Time range: {summary['start_time']} to {summary['end_time']}")
    print()
    
    # Top IP addresses
    print("Top IP addresses:")
    for ip, count in results['top_ips'].items():
        print(f"  {ip}: {count} requests")
    print()
    
    # Status codes
    print("HTTP Status Codes:")
    for code, count in results['status_codes'].items():
        print(f"  {code}: {count} responses")
    print()
    
    # User agents
    print("Top User Agents:")
    for agent, count in results['top_user_agents'].items():
        print(f"  {agent}: {count} requests")
    print()
    
    # HTTP methods
    print("HTTP Methods:")
    for method, count in results['http_methods'].items():
        print(f"  {method}: {count} requests")
    print()
    
    # Custom matches if any
    if 'custom_matches' in results and results['custom_matches']:
        print("Custom Pattern Matches:")
        for match, count in results['custom_matches'].items():
            print(f"  {match}: {count} occurrences")
        print()

def main():
    parser = argparse.ArgumentParser(description='Analyze log files and extract statistics')
    parser.add_argument('log_file', help='Path to the log file')
    parser.add_argument('-p', '--pattern', help='Custom regex pattern to search for')
    parser.add_argument('-n', '--top', type=int, default=10, help='Number of top results to show (default: 10)')
    
    args = parser.parse_args()
    
    # Check if file exists
    if not Path(args.log_file).exists():
        print(f"Error: File '{args.log_file}' not found.")
        return
    
    try:
        # Analyze the log file
        results = analyze_log_file(args.log_file, args.pattern, args.top)
        
        # Print results
        print_results(results, args.log_file)
        
    except Exception as e:
        print(f"Error analyzing log file: {e}")

if __name__ == "__main__":
    main()

This log file analyzer parses web server log files (such as Apache or Nginx access logs) and extracts traffic statistics from them. It automatically detects (a worked example follows this list):

  1. IP addresses and their request frequencies
  2. HTTP status codes (200, 404, 500, etc.)
  3. User agents (browsers, bots, crawlers)
  4. HTTP methods (GET, POST, PUT, etc.)
  5. Timestamps to show activity patterns
  6. Custom patterns you define with regex
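
To make the pattern matching concrete, here is a minimal, hypothetical example: one invented line in Apache combined log format run through the same regexes the analyzer compiles. The sample line and the results shown in comments are illustrative assumptions, not output from a real server.

import re

# A made-up line in Apache combined log format (illustrative only)
sample = ('203.0.113.7 - - [10/Oct/2024:13:55:36 +0000] '
          '"GET /index.html HTTP/1.1" 200 2326 "-" '
          '"Mozilla/5.0 (X11; Linux x86_64)"')

print(re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', sample))
# ['203.0.113.7']
print(re.findall(r'" (200|301|302|400|401|403|404|500|502|503) ', sample))
# ['200']
print(re.findall(r'"(GET|POST|PUT|DELETE|HEAD|OPTIONS) [^"]*"', sample))
# ['GET']
print(re.findall(r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4})\]', sample))
# ['10/Oct/2024:13:55:36 +0000']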

Features:

  1. Top-N rankings (configurable with -n) for IPs, status codes, user agents, and HTTP methods
  2. Custom regex matching via the -p flag, counted alongside the built-in patterns
  3. Pre-compiled regex patterns for fast processing of large files
  4. Tolerant file reading: encoding errors are ignored instead of aborting the run
  5. Start/end time range derived from parsed timestamps

How to Run:

  1. Save the code as log_analyzer.py
  2. Run with a log file:
    python log_analyzer.py /path/to/access.log
    
  3. For custom pattern matching:
    python log_analyzer.py /path/to/access.log -p "error|exception|warning"
    
  4. To change the number of results shown:
    python log_analyzer.py /path/to/access.log -n 20
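
If you don't have a real access log handy, a quick way to smoke-test the script is to generate a few synthetic lines first. The file name sample_access.log and the log lines below are invented for illustration:

# make_sample_log.py -- writes a tiny synthetic access log for testing
lines = [
    '203.0.113.7 - - [10/Oct/2024:13:55:36 +0000] "GET / HTTP/1.1" 200 512 "-" "Mozilla/5.0"',
    '198.51.100.2 - - [10/Oct/2024:13:55:40 +0000] "POST /login HTTP/1.1" 401 128 "-" "Mozilla/5.0"',
    '203.0.113.7 - - [10/Oct/2024:13:55:44 +0000] "GET /missing HTTP/1.1" 404 64 "-" "Opera/9.80"',
]
with open('sample_access.log', 'w', encoding='utf-8') as f:
    f.write('\n'.join(lines) + '\n')

Running python log_analyzer.py sample_access.log against this file should report three total lines, two unique IPs, and one 404 response.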
    

The tool is especially useful for system administrators, DevOps engineers, or anyone who needs to understand web traffic patterns, identify potential security issues, or troubleshoot server problems. It provides immediate insights without requiring complex log analysis platforms.
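
Because analyze_log_file returns a plain dict, the script can also be imported and used programmatically. A minimal sketch, assuming the code was saved as log_analyzer.py; the 1000-request threshold and the alert logic are arbitrary illustrations, not part of the script:

from log_analyzer import analyze_log_file

results = analyze_log_file('sample_access.log', top_n=25)

# Hypothetical threshold: flag any IP with an unusually high request count
THRESHOLD = 1000
for ip, count in results['top_ips'].items():
    if count > THRESHOLD:
        print(f'Possible abuse from {ip}: {count} requests')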