This Python snippet extracts text content from PDF files while preserving document metadata and providing word frequency analysis. It’s particularly useful for processing research papers, reports, or any document where both content and structure matter.
import PyPDF2
from collections import Counter
import re
from pathlib import Path
import json
from datetime import datetime
# PDF document-information keys copied into the result, keyed by output name.
_METADATA_FIELDS = {
    "title": "/Title",
    "author": "/Author",
    "subject": "/Subject",
    "creator": "/Creator",
    "producer": "/Producer",
    "creation_date": "/CreationDate",
}

# Alphabetic words of three or more letters (applied to lower-cased text).
_WORD_RE = re.compile(r'\b[a-zA-Z]{3,}\b')

# Whitespace that follows sentence-ending punctuation; the lookbehind keeps
# the punctuation attached to the sentence it terminates.
_SENTENCE_END_RE = re.compile(r'(?<=[.!?])\s+')


def _metadata_to_dict(pdf_metadata):
    """Return the standard metadata fields as plain strings.

    Every key in ``_METADATA_FIELDS`` is always present; a field that is
    missing (or a PDF without any metadata at all) yields ``"Unknown"``.
    """
    cleaned = {}
    for name, pdf_key in _METADATA_FIELDS.items():
        value = pdf_metadata.get(pdf_key) if pdf_metadata else None
        if value is None:
            cleaned[name] = "Unknown"
        elif isinstance(value, bytes):
            # Raw byte strings are not JSON serializable; decode leniently.
            cleaned[name] = value.decode("utf-8", errors="replace")
        else:
            cleaned[name] = str(value)
    return cleaned


def _top_words(text, limit=20):
    """Return the *limit* most common words in *text* as word/frequency dicts."""
    word_freq = Counter(_WORD_RE.findall(text.lower()))
    return [
        {"word": word, "frequency": count}
        for word, count in word_freq.most_common(limit)
    ]


def _summarize(text, max_sentences=3, max_chars=500):
    """Return the first *max_sentences* sentences of *text*, capped at *max_chars*."""
    # Collapse newlines and runs of spaces so the summary reads as one line.
    normalized = " ".join(text.split())
    sentences = _SENTENCE_END_RE.split(normalized)
    summary = " ".join(sentences[:max_sentences])
    if len(summary) > max_chars:
        summary = summary[:max_chars] + "..."
    return summary


def extract_pdf_text_with_analysis(pdf_path, output_json=False):
    """
    Extract text from a PDF and provide metadata and word frequency analysis.

    Args:
        pdf_path (str or path-like): Path to the PDF file.
        output_json (bool): Whether to also save the result next to the PDF,
            using the same file name with a ``.json`` suffix.

    Returns:
        dict: A dictionary with four sections:
            ``file_info`` (filename, size in KB, extraction timestamp),
            ``metadata`` (title, author, subject, creator, producer,
            creation_date; each ``"Unknown"`` when absent),
            ``content`` (page count, extracted text, word and character
            counts) and ``analysis`` (the 20 most frequent words and a short
            summary).

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        ValueError: If the file does not have a ``.pdf`` suffix.
        RuntimeError: If the PDF cannot be opened or parsed; the original
            exception is chained as ``__cause__``.
    """
    # Validate file path
    pdf_file = Path(pdf_path)
    if not pdf_file.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")
    if pdf_file.suffix.lower() != '.pdf':
        raise ValueError("File must be a PDF")

    # Initialize result structure
    result = {
        "file_info": {
            "filename": pdf_file.name,
            "file_size_kb": round(pdf_file.stat().st_size / 1024, 2),
            "extraction_date": datetime.now().isoformat(),
        },
        # Pre-filled with "Unknown" so callers can always index these keys.
        "metadata": _metadata_to_dict(None),
        "content": {
            "total_pages": 0,
            "extracted_text": "",
            "word_count": 0,
            "character_count": 0,
        },
        "analysis": {
            "most_frequent_words": [],
            "text_summary": "",
        },
    }

    text_content = []
    try:
        # Open and read PDF
        with open(pdf_file, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            result["metadata"] = _metadata_to_dict(pdf_reader.metadata)

            pages = pdf_reader.pages
            result["content"]["total_pages"] = len(pages)
            for page_num, page in enumerate(pages, 1):
                # Best effort: one unreadable page must not abort the run.
                try:
                    page_text = page.extract_text()
                except Exception as e:
                    print(f"Warning: Could not extract text from page {page_num}: {e}")
                    continue
                if page_text:
                    text_content.append(page_text)
    except Exception as e:
        raise RuntimeError(f"Error processing PDF: {e}") from e

    # Combine all text
    full_text = "\n".join(text_content)
    content = result["content"]
    content["extracted_text"] = full_text
    content["word_count"] = len(full_text.split())
    content["character_count"] = len(full_text)

    # Perform text analysis
    if full_text:
        result["analysis"]["most_frequent_words"] = _top_words(full_text)
        result["analysis"]["text_summary"] = _summarize(full_text)

    # Optionally save to JSON
    if output_json:
        json_filename = pdf_file.with_suffix('.json')
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f"Results saved to {json_filename}")

    return result
# Example usage
if __name__ == "__main__":
    # Replace with your PDF path
    pdf_path = "sample_document.pdf"

    # Only the extraction call is guarded; the report is printed in the
    # ``else`` branch so a reporting bug is not mislabelled as a PDF error.
    try:
        # Extract and analyze
        results = extract_pdf_text_with_analysis(pdf_path, output_json=True)
    except FileNotFoundError:
        print("Please provide a valid PDF file path.")
    except Exception as e:
        print(f"An error occurred: {e}")
    else:
        # The metadata section may be empty when the PDF carries no document
        # information, so fall back to "Unknown" instead of raising KeyError.
        metadata = results.get("metadata") or {}

        # Display key information
        print(f"Document: {results['file_info']['filename']}")
        print(f"Pages: {results['content']['total_pages']}")
        print(f"Words: {results['content']['word_count']}")
        print(f"Author: {metadata.get('author', 'Unknown')}")
        print(f"Title: {metadata.get('title', 'Unknown')}")
        print("\nMost frequent words:")
        for item in results['analysis']['most_frequent_words'][:10]:
            print(f" {item['word']}: {item['frequency']}")
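This snippet provides a comprehensive solution for extracting and analyzing text content from PDF documents. It combines several useful features: document metadata extraction, page-by-page text extraction, word and character counts, a list of the most frequent words, a short text summary, and optional export of the results to JSON.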
Install the dependency: pip install PyPDF2
Prepare a PDF file: place a PDF document in your working directory or specify its path.
Run the script: python pdf_extractor.py
Customize: update the pdf_path variable to point to your PDF file.
The script will output key document information to the console and, if output_json=True, create a detailed JSON file with all extracted data and analysis results. The JSON file preserves the complete structure of the extracted information for further processing or archival purposes.