# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
LangGraph-based pipeline for extracting context from uploaded documents.

Creates chat context from PDF and text files (no domain-specific goals).
"""

import logging
from typing import TypedDict, List, Dict, Any, Optional

from langgraph.graph import StateGraph, START, END

logger = logging.getLogger(__name__)


class ContextExtractionState(TypedDict):
    """State for context extraction pipeline."""

    # Input: list of {fileId, bytes, mimeType, fileName}
    files: List[Dict[str, Any]]
    # Extracted text blocks per file: [{fileId, fileName, blocks: [{page, text, block_id}]}]
    textBlocks: List[Dict[str, Any]]
    # Structured sections for chat context (simplified articles/sections)
    sections: List[Dict[str, Any]]
    # Optional summaries (empty for now - no LLM in extraction)
    summaries: List[Dict[str, Any]]
    errors: List[str]


def extract_text_node(state: ContextExtractionState) -> ContextExtractionState:
    """Extract text from each file. PDF via BZOPdfExtractor, TXT as plain text."""
    text_blocks = []
    errors = list(state.get("errors", []))

    for idx, file_info in enumerate(state.get("files", [])):
        file_id = file_info.get("fileId", f"file_{idx}")
        file_bytes = file_info.get("bytes")
        mime_type = (file_info.get("mimeType") or "").lower()
        file_name = file_info.get("fileName", f"document_{idx}")

        if not file_bytes:
            errors.append(f"No content for file {file_name} ({file_id})")
            continue

        blocks = []
        try:
            if "pdf" in mime_type or file_name.lower().endswith(".pdf"):
                from modules.features.realEstate.bzoPdfExtractor import BZOPdfExtractor

                extractor = BZOPdfExtractor()
                tb_list = extractor.extract_text_blocks(file_bytes, file_id)
                for tb in tb_list:
                    blocks.append({
                        "page": tb.page,
                        "text": tb.text,
                        "block_id": tb.block_id,
                        "bbox": tb.bbox,
                    })
                logger.info(f"Extracted {len(blocks)} blocks from PDF {file_name}")
            elif "text" in mime_type or file_name.lower().endswith(".txt"):
                text = file_bytes.decode("utf-8", errors="replace")
                lines = text.split("\n")
                for i, line in enumerate(lines):
                    if line.strip():
                        blocks.append({
                            "page": 1,
                            "text": line.strip(),
                            "block_id": f"{file_id}_line_{i}",
                            "bbox": None,
                        })
                logger.info(f"Extracted {len(blocks)} lines from text file {file_name}")
            else:
                errors.append(f"Unsupported format for {file_name}: {mime_type}")
        except Exception as e:
            logger.error(f"Error extracting {file_name}: {e}", exc_info=True)
            errors.append(f"Extraction failed for {file_name}: {str(e)}")

        if blocks:
            text_blocks.append({
                "fileId": file_id,
                "fileName": file_name,
                "blocks": blocks,
            })

    return {
        **state,
        "textBlocks": text_blocks,
        "errors": errors,
    }


def structure_content_node(state: ContextExtractionState) -> ContextExtractionState:
    """Assemble text blocks into sections for chat context."""
    sections = []
    for doc in state.get("textBlocks", []):
        file_name = doc.get("fileName", "document")
        blocks = doc.get("blocks", [])
        if not blocks:
            continue

        # Combine blocks into one text body: blocks on the same page are
        # separated by a newline, page changes by a blank line.
        text_parts = []
        current_page = 0
        for b in blocks:
            page = b.get("page", 1)
            if text_parts:
                text_parts.append("\n\n" if page != current_page else "\n")
            text_parts.append(b.get("text", ""))
            current_page = page

        full_text = "".join(text_parts).strip()
        if full_text:
            sections.append({
                "fileId": doc.get("fileId"),
                "fileName": file_name,
                "text": full_text,
                "blockCount": len(blocks),
            })

    return {
        **state,
        "sections": sections,
    }


def create_context_extraction_graph():
    """Create and compile the context extraction LangGraph."""
    workflow = StateGraph(ContextExtractionState)
    workflow.add_node("extract_text", extract_text_node)
    workflow.add_node("structure_content", structure_content_node)
    workflow.add_edge(START, "extract_text")
    workflow.add_edge("extract_text", "structure_content")
    workflow.add_edge("structure_content", END)
    return workflow.compile()


def run_extraction(files: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Run the context extraction pipeline on uploaded files.

    Args:
        files: List of {fileId, bytes, mimeType, fileName}

    Returns:
        {
            "textBlocks": [...],
            "sections": [...],
            "summaries": [],
            "errors": [...]
        }
    """
    state: ContextExtractionState = {
        "files": files,
        "textBlocks": [],
        "sections": [],
        "summaries": [],
        "errors": [],
    }
    graph = create_context_extraction_graph()
    final_state = graph.invoke(state)
    return {
        "textBlocks": final_state.get("textBlocks", []),
        "sections": final_state.get("sections", []),
        "summaries": final_state.get("summaries", []),
        "errors": final_state.get("errors", []),
    }
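

# Illustrative usage sketch (not part of the original module): feeds a small
# in-memory text file through run_extraction and prints the resulting sections.
# The file id, file name, and content below are made-up example values.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    demo_files = [
        {
            "fileId": "demo-1",
            "fileName": "notes.txt",
            "mimeType": "text/plain",
            "bytes": b"First line of context.\nSecond line of context.\n",
        }
    ]
    result = run_extraction(demo_files)
    for section in result["sections"]:
        print(f"{section['fileName']}: {section['blockCount']} blocks")
        print(section["text"])
    if result["errors"]:
        print("Errors:", result["errors"])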