Python Snippets

Automatic Email Validator and Bulk Processor

import re
import json
from typing import List, Dict, Tuple
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor
import time

@dataclass
class EmailValidationResult:
    """Outcome of validating a single email address.

    Instances are produced by ``EmailValidator.validate_email`` and consumed
    by the summary and export helpers on the same class.
    """
    # The address that was checked. The validator stores the stripped,
    # lower-cased form here, or the raw input when it was empty or not a string.
    email: str
    # True only when no validation errors were recorded.
    is_valid: bool
    # The part after '@'; left as "" when validation stopped before the
    # domain could be extracted.
    domain: str
    # Human-readable reasons the address was rejected; empty for valid addresses.
    validation_errors: List[str]

class EmailValidator:
    """Validate email addresses syntactically, singly or in bulk.

    Validation is purely local: a regex check for the overall shape of the
    address, followed by rule checks (disposable domain, dot placement,
    length limits). No DNS or mailbox lookup is performed.
    """

    def __init__(self):
        # Overall address shape: a run of permitted local-part characters,
        # an '@', then one or more dot-separated host labels of 1-63
        # alphanumeric/hyphen characters that neither start nor end with '-'.
        self.email_pattern = re.compile(
            r'^[a-zA-Z0-9.!#$%&\'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$'
        )

        # Common disposable email domains to flag
        self.disposable_domains = {
            'mailinator.com', 'tempmail.org', 'guerrillamail.com',
            '10minutemail.com', 'throwaway.email', 'yopmail.com'
        }

    def validate_email(self, email: str) -> EmailValidationResult:
        """Validate a single email address and return detailed results.

        Args:
            email: The address to check. Surrounding whitespace is stripped
                and the address is lower-cased before validation.

        Returns:
            An ``EmailValidationResult`` holding the normalized address, the
            extracted domain (``""`` if the address never got that far), and
            every rule violation found. Empty/non-string input and a failed
            format match return immediately with a single error; all later
            rule checks are cumulative.
        """
        errors = []
        domain = ""

        # Reject empty or non-string input before any string operations.
        if not email or not isinstance(email, str):
            errors.append("Email must be a non-empty string")
            return EmailValidationResult(email, False, domain, errors)

        email = email.strip().lower()

        # Overall shape check; nothing else is meaningful if this fails.
        if not self.email_pattern.match(email):
            errors.append("Email format is invalid")
            return EmailValidationResult(email, False, domain, errors)

        # The pattern admits '@' only as the single separator, so the
        # address is guaranteed to split into exactly local part and domain.
        local_part, _, domain = email.partition('@')

        if domain in self.disposable_domains:
            errors.append("Disposable email addresses are not allowed")

        # The pattern allows '.' anywhere in the local part, so dot placement
        # has to be policed separately.
        if '..' in email:
            errors.append("Email contains consecutive dots")

        # Check the local part rather than the whole address: the pattern
        # already prevents the domain from starting or ending with a dot, so
        # testing the full string would miss "name.@example.com".
        if local_part.startswith('.') or local_part.endswith('.'):
            errors.append("Email cannot start or end with a dot")

        # Length checks
        if len(email) > 254:
            errors.append("Email is too long (max 254 characters)")

        if len(local_part) > 64:
            errors.append("Local part of email is too long (max 64 characters)")

        return EmailValidationResult(email, not errors, domain, errors)

    def bulk_validate(self, emails: List[str], max_workers: int = 10) -> List[EmailValidationResult]:
        """Validate multiple emails using a thread pool.

        Args:
            emails: Addresses to validate.
            max_workers: Size of the thread pool; passed straight to
                ``ThreadPoolExecutor``.

        Returns:
            One result per input address, in the same order as ``emails``.
        """
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            results = list(executor.map(self.validate_email, emails))
        return results

    def get_validation_summary(self, results: List[EmailValidationResult]) -> Dict:
        """Generate a summary report of validation results.

        Args:
            results: Results as returned by ``validate_email`` or
                ``bulk_validate``.

        Returns:
            A dict with total/valid/invalid counts, the validity rate as a
            percentage rounded to two decimals (0 for an empty input), a
            count per error message, and the ten most frequent domains.
        """
        total = len(results)
        valid = sum(1 for r in results if r.is_valid)
        invalid = total - valid

        # Tally error messages and domains in a single pass.
        error_counts = {}
        domain_counts = {}
        for result in results:
            for error in result.validation_errors:
                error_counts[error] = error_counts.get(error, 0) + 1
            # Results that failed before domain extraction carry "".
            if result.domain:
                domain_counts[result.domain] = domain_counts.get(result.domain, 0) + 1

        # Keep only the ten most common domains, most frequent first.
        top_domains = sorted(domain_counts.items(), key=lambda x: x[1], reverse=True)[:10]

        return {
            'total_emails': total,
            'valid_emails': valid,
            'invalid_emails': invalid,
            'validity_rate': round((valid/total)*100, 2) if total > 0 else 0,
            'error_distribution': error_counts,
            'domain_distribution': dict(top_domains)
        }

    def export_results(self, results: List[EmailValidationResult], filename: str = "email_validation_results.json"):
        """Export validation results to JSON file.

        Writes a local-time timestamp, one record per result, and the summary
        from ``get_validation_summary`` to ``filename`` (overwriting it), then
        prints a confirmation line.

        Args:
            results: Results to export.
            filename: Path of the JSON file to write.
        """
        export_data = {
            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
            'results': [
                {
                    'email': r.email,
                    'is_valid': r.is_valid,
                    'domain': r.domain,
                    'errors': r.validation_errors
                }
                for r in results
            ],
            'summary': self.get_validation_summary(results)
        }

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)

        print(f"Results exported to {filename}")

# Example usage
if __name__ == "__main__":
    # Demo run: validate a mixed set of good and bad addresses, print a
    # per-address report and a summary, then write the JSON export.
    separator = "-" * 50

    sample_emails = [
        "user@example.com",
        "invalid.email",
        "test@nonexistentdomain12345.com",
        "user@gmail.com",
        "disposable@mailinator.com",
        "valid.email+tag@domain.org",
        "too..many..dots@example.com",
        ".startswithdot@example.com",
        "endswithdot.@example.com",
        "normal@outlook.com",
        "another.valid@yahoo.com"
    ]

    validator = EmailValidator()

    print("Validating emails...")
    results = validator.bulk_validate(sample_emails)

    # One line per address, with each failure reason indented beneath it.
    print("\nValidation Results:")
    print(separator)
    for result in results:
        if result.is_valid:
            status = "✓ VALID"
        else:
            status = "✗ INVALID"
        print(f"{status:10} {result.email}")
        # An empty error list simply prints nothing here.
        for error in result.validation_errors:
            print(f"           └─ {error}")
        print()

    # Aggregate figures for the whole batch.
    summary = validator.get_validation_summary(results)
    print("Validation Summary:")
    print(separator)
    print(f"Total emails: {summary['total_emails']}")
    print(f"Valid: {summary['valid_emails']}")
    print(f"Invalid: {summary['invalid_emails']}")
    print(f"Validity rate: {summary['validity_rate']}%")

    # Error messages ranked from most to least frequent.
    if summary['error_distribution']:
        print("\nCommon validation errors:")
        ranked_errors = sorted(
            summary['error_distribution'].items(),
            key=lambda entry: entry[1],
            reverse=True,
        )
        for error, count in ranked_errors:
            print(f"  {error}: {count}")

    # Write the JSON report to the default filename.
    validator.export_results(results)

What This Code Does

This is a comprehensive email validation system that provides detailed validation for individual or bulk email addresses. It goes beyond simple regex matching to provide meaningful error messages and insights about email quality.

Key Features:

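Robust Validation: Uses the HTML5-style email regex (a practical subset of the RFC 5322 address syntax), followed by explicit checks for consecutive dots, leading or trailing dots, and length limits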
Detailed Error Reporting: Provides specific reasons why an email is invalid
Disposable Email Detection: Identifies temporary email services
Bulk Processing: Concurrent validation using ThreadPoolExecutor for performance
Statistical Analysis: Generates summary reports with validation metrics
Data Export: Exports results to JSON for further analysis
Domain Analysis: Reports the ten most common domains in your email list

Why It’s Useful

Email validation is crucial for:

Data Quality: Maintaining clean user databases
Deliverability: Ensuring emails reach their destination
Security: Preventing fake account registrations
Marketing: Improving email campaign effectiveness
Compliance: Meeting anti-spam regulations

This implementation is particularly valuable for developers who need to validate email lists for applications, marketing campaigns, or user registration systems.

How to Run It

Save the code to a file (e.g., email_validator.py)
Run directly: python email_validator.py

For custom use, import the class:

from email_validator import EmailValidator
   
validator = EmailValidator()
result = validator.validate_email("user@example.com")
print(result.is_valid)  # True or False

Customization Options

Adjust max_workers in bulk_validate() for performance tuning
Add more disposable domains to the disposable_domains set
Modify the regex pattern for stricter or looser validation
Extend the validation rules by adding more checks in validate_email()

When run as a script, the code automatically writes a JSON report (email_validation_results.json by default, in the current working directory) containing the detailed validation results and summary statistics.