import pandas as pd
import json
from typing import Dict, List, Any, Optional
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class JSONDataValidator:
    """Validate and clean lists of JSON records using pandas.

    Useful for preprocessing data before database insertion or analysis:
    ensures required fields exist, removes duplicates, fills missing
    values, coerces common column types, and flags malformed emails.
    """

    def __init__(self, required_fields: Optional[List[str]] = None) -> None:
        """
        Args:
            required_fields: Column names that must exist in the data.
                Missing ones are added (filled with None) during validation.
        """
        # Copy so later mutation of the caller's list can't change our config.
        self.required_fields: List[str] = list(required_fields) if required_fields else []

    def validate_and_clean(self, json_data: List[Dict[str, Any]]) -> pd.DataFrame:
        """
        Validate and clean a list of JSON objects.

        Args:
            json_data: List of dictionaries representing JSON records.

        Returns:
            Cleaned pandas DataFrame with required columns present,
            duplicates removed, missing values filled, types corrected,
            and an ``<email_col>_valid`` flag for each email column.
        """
        df = pd.DataFrame(json_data)

        # Ensure every required field exists; absent ones become all-None columns.
        missing_fields = set(self.required_fields) - set(df.columns)
        if missing_fields:
            logger.warning(f"Missing required fields: {missing_fields}")
            for field in missing_fields:
                df[field] = None

        # Drop exact duplicate rows.
        initial_count = len(df)
        df = df.drop_duplicates()
        if len(df) < initial_count:
            logger.info(f"Removed {initial_count - len(df)} duplicate records")

        df = self._handle_missing_values(df)
        df = self._correct_data_types(df)
        df = self._validate_emails(df)

        logger.info(f"Data validation completed. Final record count: {len(df)}")
        return df

    def _handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fill missing values: 'Unknown' for string columns, 0 for numeric."""
        string_cols = df.select_dtypes(include=['object']).columns
        df[string_cols] = df[string_cols].fillna('Unknown')

        numeric_cols = df.select_dtypes(include=['number']).columns
        df[numeric_cols] = df[numeric_cols].fillna(0)
        return df

    def _correct_data_types(self, df: pd.DataFrame) -> pd.DataFrame:
        """Coerce date-named columns to datetime and known columns to numeric.

        Unparseable values become NaT (dates) or 0 (numerics) rather than raising.
        """
        date_cols = [col for col in df.columns if 'date' in col.lower()]
        for col in date_cols:
            try:
                df[col] = pd.to_datetime(df[col], errors='coerce')
            except Exception as e:
                logger.warning(f"Could not convert {col} to datetime: {e}")

        # Columns that should always be numeric, if present.
        numeric_cols = ['age', 'price', 'salary', 'score']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
        return df

    def _validate_emails(self, df: pd.DataFrame) -> pd.DataFrame:
        """Flag malformed emails in '<col>_valid' and null out invalid values."""
        email_cols = [col for col in df.columns if 'email' in col.lower()]
        for col in email_cols:
            flag_col = f'{col}_valid'
            # Simple email validation pattern (anchored full-string match).
            df[flag_col] = df[col].str.contains(
                r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$',
                na=False
            )
            invalid_mask = ~df[flag_col]
            invalid_count = int(invalid_mask.sum())
            if invalid_count > 0:
                logger.warning(f"Found {invalid_count} invalid emails in {col}")
                # Clear invalid emails so downstream consumers never see them.
                df.loc[invalid_mask, col] = None
        return df

    def export_clean_data(self, df: pd.DataFrame, filename: str) -> None:
        """Export cleaned data to CSV, JSON, or Excel based on file extension.

        Raises:
            ValueError: If the extension is not .csv, .json, or .xlsx.
        """
        if filename.endswith('.csv'):
            df.to_csv(filename, index=False)
        elif filename.endswith('.json'):
            df.to_json(filename, orient='records', indent=2)
        elif filename.endswith('.xlsx'):
            df.to_excel(filename, index=False)
        else:
            raise ValueError("Unsupported file format. Use .csv, .json, or .xlsx")
        # Bug fix: the message previously lacked the {filename} placeholder.
        logger.info(f"Data exported to {filename}")
# Example usage
if __name__ == "__main__":
    # Demo records exhibiting typical quality problems: a string-typed age,
    # a malformed email, a null age, an exact duplicate row, an unparseable
    # date, and a record missing the "name" key entirely.
    sample_data = [
        {"name": "John Doe", "email": "john@example.com", "age": "25", "signup_date": "2023-01-15"},
        {"name": "Jane Smith", "email": "invalid-email", "age": 30, "signup_date": "2023-02-20"},
        {"name": "Bob Johnson", "email": "bob@example.com", "age": None, "signup_date": "2023-03-10"},
        {"name": "John Doe", "email": "john@example.com", "age": "25", "signup_date": "2023-01-15"},  # Duplicate
        {"name": None, "email": "sara@example.com", "age": "28", "signup_date": "invalid-date"},
        {"email": "missing_name@example.com", "age": "35", "signup_date": "2023-05-01"},  # Missing name
    ]

    # Build a validator that insists on name, email, and age columns,
    # then run the full clean-up pipeline over the sample records.
    validator = JSONDataValidator(required_fields=['name', 'email', 'age'])
    cleaned_df = validator.validate_and_clean(sample_data)

    # Show the cleaned frame and its dtype/null summary.
    print("Cleaned Data:")
    print(cleaned_df)
    print("\nData Info:")
    print(cleaned_df.info())

    # Persist the result as CSV.
    validator.export_clean_data(cleaned_df, "cleaned_data.csv")
This Python snippet provides a comprehensive solution for validating and cleaning JSON data before processing or storage. It’s particularly useful when dealing with data from external APIs, user inputs, or file imports where data quality is inconsistent.
Real-world data is rarely perfect. This tool handles common data quality issues that would otherwise cause errors in downstream processes:
To use it, first install pandas (`pip install pandas`), then save the code to a file (e.g., `data_validator.py`) and run it with `python data_validator.py`.
You can customize the validator by passing your own list of required_fields during initialization. The tool logs all operations, making it easy to monitor what changes were made to your data during the cleaning process.