117 lines
3.8 KiB
Python
117 lines
3.8 KiB
Python
"""
|
|
PDF extraction module for BZO documents.
|
|
Extracts page-aware text blocks from PDF files.
|
|
"""
|
|
|
|
import logging
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import fitz  # PyMuPDF
|
|
|
|
# Module-level logger named after this module; the extractor methods log through it.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class TextBlock:
    """Represents a text block from a PDF page.

    Attributes:
        page: Page number the block belongs to.
        text: Text content of the block.
        block_id: Identifier string for the block.
        bbox: Optional ``(x0, y0, x1, y1)`` bounding box; ``None`` when no
            box was supplied.
    """

    page: int
    text: str
    block_id: str
    # The default is None, so the annotation is Optional rather than a bare
    # ``tuple`` (which would contradict the default value).
    bbox: Optional[Tuple[float, float, float, float]] = None  # (x0, y0, x1, y1) bounding box
|
|
|
|
|
|
class BZOPdfExtractor:
    """Extracts text from PDF files with page awareness.

    Both extraction methods take the raw PDF bytes, open them with PyMuPDF
    (``fitz``) and report page numbers 1-indexed. The extractor itself keeps
    no state between calls.
    """

    def __init__(self):
        """Initialize the PDF extractor (nothing to configure)."""

    def extract_text_blocks(self, pdf_bytes: bytes, pdf_id: str) -> List[TextBlock]:
        """
        Extract page-aware text blocks from PDF.

        Args:
            pdf_bytes: PDF file content as bytes
            pdf_id: Identifier for the PDF (used in block ids and log messages)

        Returns:
            List of TextBlock objects with 1-indexed page numbers, in page
            order. Blocks whose text is empty after stripping are omitted.

        Raises:
            Exception: Any error raised while opening or reading the PDF is
                logged with its traceback and re-raised unchanged.
        """
        text_blocks: List[TextBlock] = []

        try:
            # The context manager closes the document even when extraction
            # fails part-way, so a failing page no longer leaks the handle.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
                # Captured while the document is still open, for the log line.
                page_count = len(pdf_document)

                for page_number, page in enumerate(pdf_document, start=1):
                    text_blocks.extend(
                        self._page_text_blocks(page, page_number, pdf_id)
                    )

            logger.info(f"Extracted {len(text_blocks)} text blocks from PDF {pdf_id} ({page_count} pages)")

        except Exception as e:
            logger.error(f"Error extracting text from PDF {pdf_id}: {str(e)}", exc_info=True)
            raise

        return text_blocks

    @staticmethod
    def _page_text_blocks(page, page_number: int, pdf_id: str) -> List[TextBlock]:
        """Build TextBlock objects for the non-empty text blocks of one page.

        Args:
            page: Page object taken from the opened document.
            page_number: 1-indexed number of ``page``.
            pdf_id: Identifier for the PDF, used as the block id prefix.

        Returns:
            TextBlock objects in the order the page reports its blocks.
        """
        page_blocks: List[TextBlock] = []

        # block format: (x0, y0, x1, y1, "text", block_no, block_type)
        for block_idx, block in enumerate(page.get_text("blocks")):
            # Entries too short to carry a text field are ignored.
            if len(block) < 5:
                continue

            bbox = (block[0], block[1], block[2], block[3])
            text = block[4].strip()

            # Skip empty blocks
            if not text:
                continue

            # block_idx is the position within the page's block list, so ids
            # stay stable even when empty blocks are skipped.
            page_blocks.append(
                TextBlock(
                    page=page_number,
                    text=text,
                    block_id=f"{pdf_id}_p{page_number}_b{block_idx}",
                    bbox=bbox,
                )
            )

        return page_blocks

    def extract_text_by_page(self, pdf_bytes: bytes, pdf_id: str) -> Dict[int, str]:
        """
        Extract full text per page (alternative method).

        Args:
            pdf_bytes: PDF file content as bytes
            pdf_id: Identifier for the PDF (used in log messages)

        Returns:
            Dictionary mapping 1-indexed page number to full page text

        Raises:
            Exception: Any error raised while opening or reading the PDF is
                logged with its traceback and re-raised unchanged.
        """
        page_texts: Dict[int, str] = {}

        try:
            # Closing is handled by the context manager on every exit path.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
                # Captured while the document is still open, for the log line.
                page_count = len(pdf_document)

                for page_number, page in enumerate(pdf_document, start=1):
                    page_texts[page_number] = page.get_text()

            logger.debug(f"Extracted text from {page_count} pages for PDF {pdf_id}")

        except Exception as e:
            logger.error(f"Error extracting page text from PDF {pdf_id}: {str(e)}", exc_info=True)
            raise

        return page_texts
|