# gateway/modules/features/chatbotV2/contextExtractionLangGraph.py
# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
LangGraph-based pipeline for extracting context from uploaded documents.
Creates chat context from PDF and text files (no domain-specific goals).
"""
import logging
from typing import TypedDict, List, Dict, Any, Optional
from langgraph.graph import StateGraph, START, END
logger = logging.getLogger(__name__)
class ContextExtractionState(TypedDict):
    """Shared state threaded through the context-extraction pipeline.

    Keys:
        files:      Pipeline input; each entry is ``{fileId, bytes, mimeType, fileName}``.
        textBlocks: Per-file extraction output:
                    ``[{fileId, fileName, blocks: [{page, text, block_id, bbox}]}]``.
        sections:   Simplified per-file sections assembled for chat context.
        summaries:  Reserved for future LLM summaries; stays empty in this pipeline.
        errors:     Human-readable extraction/processing error messages.
    """
    files: List[Dict[str, Any]]
    textBlocks: List[Dict[str, Any]]
    sections: List[Dict[str, Any]]
    summaries: List[Dict[str, Any]]
    errors: List[str]
def extract_text_node(state: ContextExtractionState) -> ContextExtractionState:
    """Extract text from each uploaded file into per-file block lists.

    PDFs go through BZOPdfExtractor; ``.txt`` / text MIME types are decoded as
    UTF-8 and split into one block per non-blank line. Unsupported formats and
    per-file failures are recorded in ``errors`` without aborting the batch.
    Writes ``textBlocks`` and ``errors`` back into the state.
    """
    collected: List[Dict[str, Any]] = []
    errors = list(state.get("errors", []))  # copy: never mutate incoming state

    for idx, file_info in enumerate(state.get("files", [])):
        file_id = file_info.get("fileId", f"file_{idx}")
        file_name = file_info.get("fileName", f"document_{idx}")
        payload = file_info.get("bytes")
        mime = (file_info.get("mimeType") or "").lower()

        if not payload:
            errors.append(f"No content for file {file_name} ({file_id})")
            continue

        blocks: List[Dict[str, Any]] = []
        lowered = file_name.lower()
        try:
            if "pdf" in mime or lowered.endswith(".pdf"):
                # Lazy import keeps this module loadable without the PDF stack.
                from modules.features.realEstate.bzoPdfExtractor import BZOPdfExtractor
                for tb in BZOPdfExtractor().extract_text_blocks(payload, file_id):
                    blocks.append({
                        "page": tb.page,
                        "text": tb.text,
                        "block_id": tb.block_id,
                        "bbox": tb.bbox,
                    })
                logger.info(f"Extracted {len(blocks)} blocks from PDF {file_name}")
            elif "text" in mime or lowered.endswith(".txt"):
                decoded = payload.decode("utf-8", errors="replace")
                for i, raw_line in enumerate(decoded.split("\n")):
                    stripped = raw_line.strip()
                    if not stripped:
                        continue  # skip blank lines
                    blocks.append({
                        "page": 1,
                        "text": stripped,
                        "block_id": f"{file_id}_line_{i}",
                        "bbox": None,
                    })
                logger.info(f"Extracted {len(blocks)} lines from text file {file_name}")
            else:
                errors.append(f"Unsupported format for {file_name}: {mime}")
        except Exception as e:
            # Best-effort batch: log and record, keep processing remaining files.
            logger.error(f"Error extracting {file_name}: {e}", exc_info=True)
            errors.append(f"Extraction failed for {file_name}: {str(e)}")

        if blocks:
            collected.append({
                "fileId": file_id,
                "fileName": file_name,
                "blocks": blocks,
            })

    return {
        **state,
        "textBlocks": collected,
        "errors": errors,
    }
def structure_content_node(state: "ContextExtractionState") -> "ContextExtractionState":
    """Assemble extracted text blocks into per-file sections for chat context.

    Reads ``state["textBlocks"]`` and writes ``state["sections"]`` — one
    section per file that produced any non-empty text, carrying
    ``fileId``/``fileName``/``text``/``blockCount``.

    Fix: the previous version inserted a separator only on page changes, so
    consecutive blocks on the SAME page were concatenated with no whitespace
    at all, fusing adjacent lines into one unreadable run. Blocks on the same
    page are now joined with a newline; a blank line still marks page breaks.
    (Annotations are forward-reference strings so the function is importable
    independently of definition order.)
    """
    sections: List[Dict[str, Any]] = []
    for doc in state.get("textBlocks", []):
        file_name = doc.get("fileName", "document")
        blocks = doc.get("blocks", [])
        if not blocks:
            continue
        parts: List[str] = []
        current_page = 0  # sentinel: real pages start at 1
        for block in blocks:
            page = block.get("page", 1)
            if parts:
                # Blank line between pages, single newline between blocks.
                parts.append("\n\n" if page != current_page else "\n")
            parts.append(block.get("text", ""))
            current_page = page
        full_text = "".join(parts).strip()
        if full_text:
            sections.append({
                "fileId": doc.get("fileId"),
                "fileName": file_name,
                "text": full_text,
                "blockCount": len(blocks),
            })
    return {
        **state,
        "sections": sections,
    }
def create_context_extraction_graph():
    """Build and compile the context-extraction LangGraph.

    Linear pipeline: START -> extract_text -> structure_content -> END.
    Returns the compiled, invokable graph.
    """
    builder = StateGraph(ContextExtractionState)
    builder.add_node("extract_text", extract_text_node)
    builder.add_node("structure_content", structure_content_node)
    for src, dst in (
        (START, "extract_text"),
        ("extract_text", "structure_content"),
        ("structure_content", END),
    ):
        builder.add_edge(src, dst)
    return builder.compile()
def run_extraction(files: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Run the context-extraction pipeline over uploaded files.

    Args:
        files: List of ``{fileId, bytes, mimeType, fileName}`` dicts.

    Returns:
        Dict with ``textBlocks``, ``sections``, ``summaries`` (currently
        always empty — no LLM stage), and ``errors``, each defaulting to
        an empty list.
    """
    initial: ContextExtractionState = {
        "files": files,
        "textBlocks": [],
        "sections": [],
        "summaries": [],
        "errors": [],
    }
    final_state = create_context_extraction_graph().invoke(initial)
    # Project only the output keys; drop the raw input files from the result.
    return {
        key: final_state.get(key, [])
        for key in ("textBlocks", "sections", "summaries", "errors")
    }