Python Snippets

Automatic PDF Text Extractor with Metadata Analysis

This Python snippet extracts text content from PDF files while preserving document metadata and providing word frequency analysis. It’s particularly useful for processing research papers, reports, or any document where both content and structure matter.

import PyPDF2
from collections import Counter
import re
from pathlib import Path
import json
from datetime import datetime

# Maps the keys of the returned "metadata" dict to PDF document-info keys.
_METADATA_KEYS = (
    ("title", "/Title"),
    ("author", "/Author"),
    ("subject", "/Subject"),
    ("creator", "/Creator"),
    ("producer", "/Producer"),
    ("creation_date", "/CreationDate"),
)

# A sentence ends at '.', '!' or '?' followed by whitespace; the lookbehind
# keeps the terminator attached to its sentence when splitting.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def _metadata_value(pdf_metadata, key):
    """Return the metadata entry for *key* as a plain str, or "Unknown"."""
    value = pdf_metadata.get(key)
    # str() keeps the result JSON-serializable even if the reader hands back
    # a PDF object that is not already a plain string.
    return "Unknown" if value is None else str(value)


def _summarize(text, max_sentences=3, max_chars=500):
    """Return the first *max_sentences* sentences of *text*, capped in length."""
    # Extracted PDF text is full of hard line breaks; collapse all whitespace
    # so the summary reads as a single line.
    normalized = ' '.join(text.split())
    summary = ' '.join(_SENTENCE_BOUNDARY.split(normalized)[:max_sentences])
    if len(summary) > max_chars:
        summary = summary[:max_chars] + "..."
    return summary


def extract_pdf_text_with_analysis(pdf_path, output_json=False):
    """
    Extract text from PDF and provide metadata and word frequency analysis.

    Args:
        pdf_path (str): Path to the PDF file
        output_json (bool): Whether to save results to a JSON file written
            next to the PDF (same name, ``.json`` suffix)

    Returns:
        dict: Extracted content with four sections:
            "file_info" (filename, size in KB, extraction timestamp),
            "metadata" (title/author/subject/creator/producer/creation_date
            as strings, or an empty dict when the PDF has no metadata),
            "content" (page count, full text, word and character counts),
            "analysis" (top 20 words of 3+ letters, short text summary).

    Raises:
        FileNotFoundError: If pdf_path does not exist.
        ValueError: If the path does not have a .pdf suffix.
        RuntimeError: If reading or analysing the PDF fails; the original
            exception is attached as ``__cause__``.
    """
    # Validate file path
    pdf_file = Path(pdf_path)
    if not pdf_file.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    if pdf_file.suffix.lower() != '.pdf':
        raise ValueError("File must be a PDF")

    # Initialize result structure
    result = {
        "file_info": {
            "filename": pdf_file.name,
            "file_size_kb": round(pdf_file.stat().st_size / 1024, 2),
            "extraction_date": datetime.now().isoformat()
        },
        "metadata": {},
        "content": {
            "total_pages": 0,
            "extracted_text": "",
            "word_count": 0,
            "character_count": 0
        },
        "analysis": {
            "most_frequent_words": [],
            "text_summary": ""
        }
    }

    try:
        text_content = []
        with pdf_file.open('rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            # Extract metadata (left as {} when the document carries none)
            pdf_metadata = pdf_reader.metadata
            if pdf_metadata:
                result["metadata"] = {
                    field: _metadata_value(pdf_metadata, key)
                    for field, key in _METADATA_KEYS
                }

            # Extract text page by page; one unreadable page must not abort
            # the whole document, so failures are reported and skipped.
            pages = pdf_reader.pages
            result["content"]["total_pages"] = len(pages)
            for page_num, page in enumerate(pages, 1):
                try:
                    page_text = page.extract_text()
                except Exception as e:
                    print(f"Warning: Could not extract text from page {page_num}: {e}")
                    continue
                if page_text:
                    text_content.append(page_text)

        # Combine all text
        full_text = "\n".join(text_content)
        content = result["content"]
        content["extracted_text"] = full_text
        content["word_count"] = len(full_text.split())
        content["character_count"] = len(full_text)

        # Perform text analysis
        if full_text:
            # Only alphabetic words of three or more letters are counted
            words = re.findall(r'\b[a-zA-Z]{3,}\b', full_text.lower())
            result["analysis"]["most_frequent_words"] = [
                {"word": word, "frequency": count}
                for word, count in Counter(words).most_common(20)
            ]
            result["analysis"]["text_summary"] = _summarize(full_text)

    except Exception as e:
        # Chain explicitly so callers can inspect the underlying failure.
        raise RuntimeError(f"Error processing PDF: {e}") from e

    # Optionally save to JSON
    if output_json:
        json_filename = pdf_file.with_suffix('.json')
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f"Results saved to {json_filename}")

    return result

# Example usage: analyse one PDF, save the JSON report, print a short overview.
if __name__ == "__main__":
    try:
        # Replace with your PDF path
        pdf_path = "sample_document.pdf"

        # Extract and analyze
        results = extract_pdf_text_with_analysis(pdf_path, output_json=True)

        # The metadata dict is empty when the PDF carries no document info,
        # so read it with defaults instead of indexing it directly.
        metadata = results['metadata']

        # Display key information
        print(f"Document: {results['file_info']['filename']}")
        print(f"Pages: {results['content']['total_pages']}")
        print(f"Words: {results['content']['word_count']}")
        print(f"Author: {metadata.get('author', 'Unknown')}")
        print(f"Title: {metadata.get('title', 'Unknown')}")
        print("\nMost frequent words:")
        for item in results['analysis']['most_frequent_words'][:10]:
            print(f"  {item['word']}: {item['frequency']}")

    except FileNotFoundError:
        print("Please provide a valid PDF file path.")
    except Exception as e:
        # Top-level boundary of the example script: report and exit normally.
        print(f"An error occurred: {e}")

What This Code Does

This snippet provides a comprehensive solution for extracting and analyzing text content from PDF documents. It combines several useful features:

  1. Text Extraction: Uses PyPDF2 to extract text from all pages of a PDF document
  2. Metadata Retrieval: Captures document metadata like title, author, subject, and creation date
  3. Content Analysis: Performs word frequency analysis to identify the most common terms
  4. Summary Generation: Builds a short summary from the first three sentences of the extracted text, truncated to 500 characters
  5. Export Capability: Optionally saves results to a structured JSON file
  6. Error Handling: Includes robust error handling for common issues like missing files or corrupted PDFs

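Why This Is Useful

A single function call turns a PDF into structured data: the text, the document metadata, and a word-frequency breakdown come back together, which is handy when processing research papers, reports, or any document where both content and structure matter.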

How to Run

  1. Install Dependencies:
    pip install PyPDF2
    
  2. Prepare a PDF File: Place a PDF document in your working directory or specify its path

  3. Run the Script (after saving the snippet as pdf_extractor.py):
    python pdf_extractor.py
    
  4. Customize: Modify the pdf_path variable to point to your PDF file

The script will output key document information to the console and, if output_json=True, create a detailed JSON file with all extracted data and analysis results. The JSON file preserves the complete structure of the extracted information for further processing or archival purposes.