""" PDF extraction module for BZO documents. Extracts page-aware text blocks from PDF files. """ import logging from typing import List, Dict, Any from dataclasses import dataclass import fitz # PyMuPDF logger = logging.getLogger(__name__) @dataclass class TextBlock: """Represents a text block from a PDF page.""" page: int text: str block_id: str bbox: tuple = None # (x0, y0, x1, y1) bounding box class BZOPdfExtractor: """Extracts text blocks from PDF files with page awareness.""" def __init__(self): """Initialize the PDF extractor.""" pass def extract_text_blocks(self, pdf_bytes: bytes, pdf_id: str) -> List[TextBlock]: """ Extract page-aware text blocks from PDF. Args: pdf_bytes: PDF file content as bytes pdf_id: Identifier for the PDF (for logging) Returns: List of TextBlock objects with page numbers """ text_blocks = [] try: # Open PDF from bytes pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf") # Extract text from each page for page_num in range(len(pdf_document)): page = pdf_document[page_num] # Extract text blocks from page blocks = page.get_text("blocks") for block_idx, block in enumerate(blocks): # block format: (x0, y0, x1, y1, "text", block_no, block_type) if len(block) >= 5: bbox = (block[0], block[1], block[2], block[3]) text = block[4].strip() # Skip empty blocks if not text: continue # Create TextBlock block_id = f"{pdf_id}_p{page_num + 1}_b{block_idx}" text_block = TextBlock( page=page_num + 1, # 1-indexed pages text=text, block_id=block_id, bbox=bbox ) text_blocks.append(text_block) # Store page count before closing page_count = len(pdf_document) pdf_document.close() logger.info(f"Extracted {len(text_blocks)} text blocks from PDF {pdf_id} ({page_count} pages)") except Exception as e: logger.error(f"Error extracting text from PDF {pdf_id}: {str(e)}", exc_info=True) raise return text_blocks def extract_text_by_page(self, pdf_bytes: bytes, pdf_id: str) -> Dict[int, str]: """ Extract full text per page (alternative method). Args: pdf_bytes: PDF file content as bytes pdf_id: Identifier for the PDF Returns: Dictionary mapping page number to full page text """ page_texts = {} try: pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf") for page_num in range(len(pdf_document)): page = pdf_document[page_num] text = page.get_text() page_texts[page_num + 1] = text # 1-indexed # Store page count before closing page_count = len(pdf_document) pdf_document.close() logger.debug(f"Extracted text from {page_count} pages for PDF {pdf_id}") except Exception as e: logger.error(f"Error extracting page text from PDF {pdf_id}: {str(e)}", exc_info=True) raise return page_texts