Python Snippets

JSON Data Validator and Cleaner with Pandas

import pandas as pd
from typing import Any, Dict, List, Optional
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class JSONDataValidator:
    """
    A utility class for validating and cleaning JSON data using pandas.
    Useful for preprocessing data before database insertion or analysis.
    """
    
    def __init__(self, required_fields: Optional[List[str]] = None):
        self.required_fields = required_fields or []
    
    def validate_and_clean(self, json_data: List[Dict[str, Any]]) -> pd.DataFrame:
        """
        Validate and clean a list of JSON objects.
        
        Args:
            json_data: List of dictionaries representing JSON records
            
        Returns:
            Cleaned pandas DataFrame
        """
        # Convert to DataFrame
        df = pd.DataFrame(json_data)
        
        # Check for required fields
        missing_fields = set(self.required_fields) - set(df.columns)
        if missing_fields:
            logger.warning(f"Missing required fields: {missing_fields}")
            # Add missing columns with None values
            for field in missing_fields:
                df[field] = None
        
        # Remove duplicates
        initial_count = len(df)
        df = df.drop_duplicates()
        if len(df) < initial_count:
            logger.info(f"Removed {initial_count - len(df)} duplicate records")
        
        # Handle missing values
        df = self._handle_missing_values(df)
        
        # Data type corrections
        df = self._correct_data_types(df)
        
        # Validate email formats (basic validation)
        df = self._validate_emails(df)
        
        logger.info(f"Data validation completed. Final record count: {len(df)}")
        return df
    
    def _handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
        """Handle missing values in the DataFrame."""
        # Fill string columns with 'Unknown'
        string_cols = df.select_dtypes(include=['object']).columns
        df[string_cols] = df[string_cols].fillna('Unknown')
        
        # Fill numeric columns with 0
        numeric_cols = df.select_dtypes(include=['number']).columns
        df[numeric_cols] = df[numeric_cols].fillna(0)
        
        return df
    
    def _correct_data_types(self, df: pd.DataFrame) -> pd.DataFrame:
        """Correct common data type issues."""
        # Convert date strings to datetime if column name contains 'date'
        date_cols = [col for col in df.columns if 'date' in col.lower()]
        for col in date_cols:
            try:
                df[col] = pd.to_datetime(df[col], errors='coerce')
            except Exception as e:
                logger.warning(f"Could not convert {col} to datetime: {e}")
        
        # Coerce commonly numeric fields to numeric dtype (extend this list for your schema)
        numeric_cols = ['age', 'price', 'salary', 'score']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        
        return df
    
    def _validate_emails(self, df: pd.DataFrame) -> pd.DataFrame:
        """Basic email validation."""
        email_cols = [col for col in df.columns if 'email' in col.lower()]
        for col in email_cols:
            # Simple email pattern, anchored to match the whole value
            valid_mask = df[col].str.contains(
                r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$',
                na=False
            )
            df[f'{col}_valid'] = valid_mask
            invalid_count = int((~valid_mask).sum())
            if invalid_count > 0:
                logger.warning(f"Found {invalid_count} invalid emails in {col}")
                # Clear invalid emails so downstream consumers don't use them
                df.loc[~valid_mask, col] = None
        return df
    
    def export_clean_data(self, df: pd.DataFrame, filename: str) -> None:
        """Export cleaned data to various formats."""
        if filename.endswith('.csv'):
            df.to_csv(filename, index=False)
        elif filename.endswith('.json'):
            df.to_json(filename, orient='records', indent=2)
        elif filename.endswith('.xlsx'):
            df.to_excel(filename, index=False)
        else:
            raise ValueError("Unsupported file format. Use .csv, .json, or .xlsx")
        
        logger.info(f"Data exported to {filename}")

# Example usage
if __name__ == "__main__":
    # Sample JSON data with various issues
    sample_data = [
        {"name": "John Doe", "email": "john@example.com", "age": "25", "signup_date": "2023-01-15"},
        {"name": "Jane Smith", "email": "invalid-email", "age": 30, "signup_date": "2023-02-20"},
        {"name": "Bob Johnson", "email": "bob@example.com", "age": None, "signup_date": "2023-03-10"},
        {"name": "John Doe", "email": "john@example.com", "age": "25", "signup_date": "2023-01-15"},  # Duplicate
        {"name": None, "email": "sara@example.com", "age": "28", "signup_date": "invalid-date"},
        {"email": "missing_name@example.com", "age": "35", "signup_date": "2023-05-01"},  # Missing name
    ]
    
    # Initialize validator with required fields
    validator = JSONDataValidator(required_fields=['name', 'email', 'age'])
    
    # Validate and clean data
    cleaned_df = validator.validate_and_clean(sample_data)
    
    # Display results
    print("Cleaned Data:")
    print(cleaned_df)
    print("\nData Info:")
    cleaned_df.info()  # info() prints its report directly and returns None
    
    # Export to file
    validator.export_clean_data(cleaned_df, "cleaned_data.csv")

Explanation

This Python snippet provides a reusable utility for validating and cleaning JSON data before processing or storage. It’s particularly useful for data from external APIs, user input, or file imports, where data quality is inconsistent.

Key Features

  1. Duplicate Removal: Automatically identifies and removes exact duplicate records
  2. Missing Value Handling: Fills missing values by column type (strings with 'Unknown', numbers with 0)
  3. Data Type Correction: Parses columns whose names contain 'date' as datetimes and coerces common numeric fields to numbers
  4. Email Validation: Flags entries that fail a basic email pattern and clears the invalid addresses (see the sketch after this list)
  5. Required Field Validation: Warns about missing required fields and adds them as empty columns
  6. Flexible Export Options: Supports CSV, JSON, and Excel export formats
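
For example, the email check in feature 4 adds a boolean flag column alongside each email column. A minimal sketch, assuming the snippet above is saved as data_validator.py (the sample records are made up):

from data_validator import JSONDataValidator

records = [
    {"name": "Ann", "email": "ann@example.com"},
    {"name": "Ben", "email": "not-an-email"},  # fails the pattern
]
df = JSONDataValidator().validate_and_clean(records)
# Ben's row gets email_valid == False and his email is cleared to None
print(df[['email', 'email_valid']])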

Why This is Useful

Real-world data is rarely perfect. This tool handles common data quality issues that would otherwise cause errors in downstream processes: duplicate records, missing values, numbers and dates stored as strings, and malformed email addresses.

How to Run

  1. Install required packages (openpyxl is only needed if you use the .xlsx export):
    pip install pandas openpyxl
    
  2. Save the code to a file (e.g., data_validator.py); a sketch of importing it from another script follows this list

  3. Run directly:
    python data_validator.py
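
If you'd rather call the validator from your own pipeline than run the script directly, here is a minimal sketch (the users.json input file and its field names are assumptions):

import json

from data_validator import JSONDataValidator

# Load a list of JSON records from disk (hypothetical input file)
with open("users.json") as f:
    records = json.load(f)

validator = JSONDataValidator(required_fields=["name", "email"])
clean = validator.validate_and_clean(records)
validator.export_clean_data(clean, "users_clean.csv")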
    

Customization

You can customize the validator by:

  1. Passing your own required_fields list when constructing JSONDataValidator
  2. Extending the numeric_cols list in _correct_data_types to match your schema
  3. Changing the fill values in _handle_missing_values (e.g., column means instead of 0)
  4. Adding further _validate_* methods and calling them from validate_and_clean, as sketched below

The tool logs all operations, making it easy to monitor what changes were made to your data during the cleaning process.
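
As an example of customization point 4, here is a minimal sketch of a subclass that adds a phone-number check, mirroring the email check above (the 'phone' column convention and the pattern are assumptions):

import pandas as pd
from typing import Any, Dict, List

from data_validator import JSONDataValidator

class StrictValidator(JSONDataValidator):
    def validate_and_clean(self, json_data: List[Dict[str, Any]]) -> pd.DataFrame:
        df = super().validate_and_clean(json_data)
        return self._validate_phones(df)

    def _validate_phones(self, df: pd.DataFrame) -> pd.DataFrame:
        phone_cols = [col for col in df.columns if 'phone' in col.lower()]
        for col in phone_cols:
            # Accept an optional leading '+' followed by at least 7 digits,
            # spaces, dashes, or parentheses
            df[f'{col}_valid'] = df[col].str.contains(
                r'^\+?[\d\s()-]{7,}$', na=False
            )
        return df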