This snippet demonstrates how to validate CSV data using Pandas, ensuring data quality by checking for missing values, data types, and custom constraints. This is useful for data preprocessing in machine learning pipelines, ETL processes, or any application requiring clean input data.
import pandas as pd
def validate_csv(
file_path: str,
required_columns: list,
dtype_constraints: dict,
nullable_columns: list = None
) -> tuple[bool, pd.DataFrame, list[str]]:
"""
Validates a CSV file against specified constraints.
Args:
file_path: Path to the CSV file
required_columns: Columns that must be present
dtype_constraints: Expected data types for columns (e.g., {'age': 'int64'})
nullable_columns: Columns allowed to have null values (default: none)
Returns:
Tuple of (is_valid, dataframe, error_messages)
"""
errors = []
try:
df = pd.read_csv(file_path)
except Exception as e:
return False, None, [f"File read error: {str(e)}"]
# Check required columns
missing_cols = [col for col in required_columns if col not in df.columns]
if missing_cols:
errors.append(f"Missing required columns: {', '.join(missing_cols)}")
# Check data types
for col, expected_type in dtype_constraints.items():
if col in df.columns and str(df[col].dtype) != expected_type:
errors.append(
f"Column '{col}' has type {df[col].dtype}, expected {expected_type}"
)
# Check for unexpected null values
if nullable_columns is None:
nullable_columns = []
null_check_cols = [col for col in df.columns if col not in nullable_columns]
null_counts = df[null_check_cols].isnull().sum()
for col, count in null_counts.items():
if count > 0:
errors.append(f"Column '{col}' has {count} null values")
return len(errors) == 0, df, errors
# Example usage
if __name__ == "__main__":
is_valid, data, messages = validate_csv(
file_path="sample_data.csv",
required_columns=["id", "name", "age"],
dtype_constraints={"id": "int64", "age": "int64", "name": "object"},
nullable_columns=["name"]
)
if is_valid:
print("Data validation successful!")
print(data.head())
else:
print("Validation errors found:")
for error in messages:
print(f"- {error}")
csv_validator.py
sample_data.csv
with columns matching the examplepython csv_validator.py
required_columns
, dtype_constraints
, and nullable_columns
parameters for your specific use casepip install pandas
)The script can be easily extended with additional validation rules like value ranges, pattern matching for strings, or cross-column validation logic.