import re
from collections import Counter
from pathlib import Path
def extract_ips_from_log(log_file_path, top_n=10):
    """
    Extract and count IPv4 addresses from a log file.

    The file is scanned line by line, so memory use does not grow with the
    size of the log (only with the number of distinct addresses found).

    Args:
        log_file_path (str): Path to the log file
        top_n (int): Number of top IPs to return

    Returns:
        list: ``(ip, count)`` tuples for the top N addresses, most frequent
        first. Addresses with equal counts keep the order in which they
        were first seen in the file.

    Raises:
        FileNotFoundError: If ``log_file_path`` does not exist.
    """
    # Candidate matcher: four dot-separated groups of 1-3 digits. On its own
    # it also accepts out-of-range groups such as 999, so every candidate is
    # range-checked below before it is counted.
    ip_pattern = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')

    log_path = Path(log_file_path)
    if not log_path.exists():
        raise FileNotFoundError(f"Log file not found: {log_file_path}")

    ip_counts = Counter()
    # errors='ignore' drops undecodable bytes instead of aborting the scan.
    with open(log_path, 'r', encoding='utf-8', errors='ignore') as file:
        # A match consists only of digits and dots, so it can never span a
        # line break; scanning per line finds exactly the same addresses as
        # scanning the whole content at once.
        for line in file:
            ip_counts.update(
                ip
                for ip in ip_pattern.findall(line)
                # Groups are digits only, so just the upper bound matters.
                if all(int(part) <= 255 for part in ip.split('.'))
            )

    return ip_counts.most_common(top_n)
def main():
    """Demo entry point: print the ten most frequent IPs found in ``access.log``.

    Any failure is reported as a printed message instead of a traceback.
    """
    log_file = "access.log"  # Point this at the log you want to analyse
    try:
        ranking = extract_ips_from_log(log_file, top_n=10)
        print(f"Top 10 IP addresses in {log_file}:")
        print("-" * 40)
        for position, entry in enumerate(ranking, start=1):
            address, hits = entry
            print(f"{position:2d}. {address:<15} ({hits} occurrences)")
    except FileNotFoundError as missing:
        print(f"Error: {missing}")
    except Exception as failure:
        print(f"An error occurred: {failure}")


# Run the demo only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
This code snippet analyzes log files to extract and count IP addresses, which is extremely useful whenever you need to see which addresses appear most often in a log.
The script uses regular expressions to find IP addresses and validates them to ensure they follow proper IP formatting (each octet between 0-255). It then uses Python’s Counter from the collections module to efficiently count occurrences and return the most frequent addresses.
To use this snippet:
1. Save the code to a file named log_analyzer.py.
2. Update the log_file variable with your actual log file path.
3. Run it with: python log_analyzer.py

The output will show the top 10 IP addresses found in your log file along with how many times each appeared. You can adjust the top_n parameter to show more or fewer results.