# Copyright (c) 2025 Patrick Motsch
# All rights reserved.

"""
LangGraph-based pipeline for extracting context from uploaded documents.

Creates chat context from PDF and text files (no domain-specific goals).
"""

import logging
from typing import TypedDict, List, Dict, Any, Optional

logger = logging.getLogger(__name__)


class ContextExtractionState(TypedDict):
    """State for context extraction pipeline."""

    # Input: list of {fileId, bytes, mimeType, fileName}
    files: List[Dict[str, Any]]
    # Extracted text blocks per file: [{fileId, fileName, blocks: [{page, text, block_id}]}]
    textBlocks: List[Dict[str, Any]]
    # Structured sections for chat context (simplified articles/sections)
    sections: List[Dict[str, Any]]
    # Optional summaries (empty for now - no LLM in extraction)
    summaries: List[Dict[str, Any]]
    # Human-readable error messages accumulated across nodes
    errors: List[str]


def extract_text_node(state: ContextExtractionState) -> ContextExtractionState:
    """Extract text from each file. PDF via BZOPdfExtractor, TXT as plain text.

    Unsupported formats, empty payloads, and extraction failures are recorded
    in ``errors`` rather than raised, so one bad file never aborts the batch.

    Args:
        state: Pipeline state; reads ``files`` and ``errors``.

    Returns:
        A new state dict with ``textBlocks`` populated (one entry per file
        that yielded at least one block) and ``errors`` extended.
    """
    text_blocks: List[Dict[str, Any]] = []
    # Copy so the incoming state is never mutated in place.
    errors = list(state.get("errors", []))

    for idx, file_info in enumerate(state.get("files", [])):
        file_id = file_info.get("fileId", f"file_{idx}")
        file_bytes = file_info.get("bytes")
        mime_type = (file_info.get("mimeType") or "").lower()
        file_name = file_info.get("fileName", f"document_{idx}")

        if not file_bytes:
            errors.append(f"No content for file {file_name} ({file_id})")
            continue

        blocks: List[Dict[str, Any]] = []
        try:
            if "pdf" in mime_type or file_name.lower().endswith(".pdf"):
                # Lazy import keeps this module usable when the real-estate
                # feature package is absent (matches the original's style).
                from modules.features.realEstate.bzoPdfExtractor import BZOPdfExtractor

                extractor = BZOPdfExtractor()
                for tb in extractor.extract_text_blocks(file_bytes, file_id):
                    blocks.append({
                        "page": tb.page,
                        "text": tb.text,
                        "block_id": tb.block_id,
                        "bbox": tb.bbox
                    })
                logger.info("Extracted %d blocks from PDF %s", len(blocks), file_name)
            elif "text" in mime_type or file_name.lower().endswith(".txt"):
                # "replace" keeps going on malformed UTF-8 instead of raising.
                text = file_bytes.decode("utf-8", errors="replace")
                for i, line in enumerate(text.split("\n")):
                    if line.strip():
                        blocks.append({
                            "page": 1,
                            "text": line.strip(),
                            "block_id": f"{file_id}_line_{i}",
                            "bbox": None
                        })
                logger.info("Extracted %d lines from text file %s", len(blocks), file_name)
            else:
                errors.append(f"Unsupported format for {file_name}: {mime_type}")
        except Exception as e:
            logger.error("Error extracting %s: %s", file_name, e, exc_info=True)
            errors.append(f"Extraction failed for {file_name}: {str(e)}")

        if blocks:
            text_blocks.append({
                "fileId": file_id,
                "fileName": file_name,
                "blocks": blocks
            })

    return {
        **state,
        "textBlocks": text_blocks,
        "errors": errors
    }


def structure_content_node(state: ContextExtractionState) -> ContextExtractionState:
    """Assemble text blocks into sections for chat context.

    Blocks on the same page are joined with a single newline and page breaks
    are marked with a blank line. (Previously, same-page blocks were
    concatenated with no separator at all, gluing adjacent words together.)

    Args:
        state: Pipeline state; reads ``textBlocks``.

    Returns:
        A new state dict with ``sections`` populated (one per non-empty file).
    """
    sections: List[Dict[str, Any]] = []
    for doc in state.get("textBlocks", []):
        file_name = doc.get("fileName", "document")
        blocks = doc.get("blocks", [])
        if not blocks:
            continue

        # Build section text with page awareness.
        text_parts: List[str] = []
        current_page = 0  # 0 never matches a real page, so no leading break
        for b in blocks:
            page = b.get("page", 1)
            if text_parts:
                # Blank line between pages, single newline within a page.
                text_parts.append("\n\n" if page != current_page else "\n")
            text_parts.append(b.get("text", ""))
            current_page = page

        full_text = "".join(text_parts).strip()
        if full_text:
            sections.append({
                "fileId": doc.get("fileId"),
                "fileName": file_name,
                "text": full_text,
                "blockCount": len(blocks)
            })

    return {
        **state,
        "sections": sections
    }


def create_context_extraction_graph():
    """Create and compile the context extraction LangGraph.

    Topology: START -> extract_text -> structure_content -> END.
    """
    # Imported here (not at module top) so the pure extraction helpers above
    # remain importable and testable without langgraph installed.
    from langgraph.graph import StateGraph, START, END

    workflow = StateGraph(ContextExtractionState)
    workflow.add_node("extract_text", extract_text_node)
    workflow.add_node("structure_content", structure_content_node)
    workflow.add_edge(START, "extract_text")
    workflow.add_edge("extract_text", "structure_content")
    workflow.add_edge("structure_content", END)
    return workflow.compile()


def run_extraction(files: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Run the context extraction pipeline on uploaded files.

    Args:
        files: List of {fileId, bytes, mimeType, fileName}

    Returns:
        {
            "textBlocks": [...],
            "sections": [...],
            "summaries": [],
            "errors": [...]
        }
    """
    state: ContextExtractionState = {
        "files": files,
        "textBlocks": [],
        "sections": [],
        "summaries": [],
        "errors": []
    }
    graph = create_context_extraction_graph()
    final_state = graph.invoke(state)
    # Project only the documented keys; drop the raw input payload.
    return {
        "textBlocks": final_state.get("textBlocks", []),
        "sections": final_state.get("sections", []),
        "summaries": final_state.get("summaries", []),
        "errors": final_state.get("errors", [])
    }