gateway/modules/features/realEstate/bzoPdfExtractor.py

117 lines
3.8 KiB
Python

"""
PDF extraction module for BZO documents.
Extracts page-aware text blocks from PDF files.
"""
import logging
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import fitz  # PyMuPDF
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)
@dataclass
class TextBlock:
    """Represents a text block from a PDF page.

    Attributes:
        page: Page number the block was found on.
        text: Text content of the block.
        block_id: Identifier string for the block.
        bbox: Bounding box as (x0, y0, x1, y1), or None when not provided.
    """

    page: int
    text: str
    block_id: str
    # Optional because the default is None; the bare ``tuple`` annotation
    # did not admit that default.
    bbox: Optional[Tuple[float, float, float, float]] = None
class BZOPdfExtractor:
    """Extracts text from PDF files with page awareness.

    The extractor keeps no state of its own: each method opens the PDF from
    an in-memory byte string with PyMuPDF (``fitz``), reads it, and closes
    the document again before returning.
    """

    def __init__(self):
        """Initialize the PDF extractor (there is nothing to configure)."""

    def extract_text_blocks(self, pdf_bytes: bytes, pdf_id: str) -> List[TextBlock]:
        """
        Extract page-aware text blocks from PDF.

        Args:
            pdf_bytes: PDF file content as bytes
            pdf_id: Identifier for the PDF (used in block IDs and logging)

        Returns:
            List of TextBlock objects in page order, with 1-indexed page
            numbers. Blocks whose text is empty after stripping whitespace
            are omitted.

        Raises:
            Exception: Any error raised while opening or reading the PDF is
                logged with its traceback and then re-raised unchanged.
        """
        text_blocks: List[TextBlock] = []
        try:
            # The context manager closes the document even when extraction
            # fails part-way, so the handle is not leaked on errors.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
                # Captured while the document is open, for the log line below.
                page_count = len(pdf_document)
                for page_number, page in enumerate(pdf_document, start=1):
                    # block format: (x0, y0, x1, y1, "text", block_no, block_type)
                    for block_idx, block in enumerate(page.get_text("blocks")):
                        if len(block) < 5:
                            continue
                        text = block[4].strip()
                        # Skip empty blocks. block_idx still counts them, so
                        # block IDs on a page may have gaps.
                        if not text:
                            continue
                        text_blocks.append(
                            TextBlock(
                                page=page_number,  # 1-indexed pages
                                text=text,
                                block_id=f"{pdf_id}_p{page_number}_b{block_idx}",
                                bbox=(block[0], block[1], block[2], block[3]),
                            )
                        )
            logger.info(f"Extracted {len(text_blocks)} text blocks from PDF {pdf_id} ({page_count} pages)")
        except Exception as e:
            logger.error(f"Error extracting text from PDF {pdf_id}: {str(e)}", exc_info=True)
            raise
        return text_blocks

    def extract_text_by_page(self, pdf_bytes: bytes, pdf_id: str) -> Dict[int, str]:
        """
        Extract full text per page (alternative method).

        Args:
            pdf_bytes: PDF file content as bytes
            pdf_id: Identifier for the PDF (used in logging)

        Returns:
            Dictionary mapping 1-indexed page number to the full text of
            that page.

        Raises:
            Exception: Any error raised while opening or reading the PDF is
                logged with its traceback and then re-raised unchanged.
        """
        page_texts: Dict[int, str] = {}
        try:
            # The context manager closes the document even when extraction
            # fails part-way, so the handle is not leaked on errors.
            with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
                # Captured while the document is open, for the log line below.
                page_count = len(pdf_document)
                for page_number, page in enumerate(pdf_document, start=1):
                    page_texts[page_number] = page.get_text()  # 1-indexed
            logger.debug(f"Extracted text from {page_count} pages for PDF {pdf_id}")
        except Exception as e:
            logger.error(f"Error extracting page text from PDF {pdf_id}: {str(e)}", exc_info=True)
            raise
        return page_texts