# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
"""
LangGraph-based pipeline for extracting context from uploaded documents.

Creates chat context from PDF and text files (no domain-specific goals).
"""

import logging
from typing import TypedDict, List, Dict, Any, Optional

from langgraph.graph import StateGraph, START, END

logger = logging.getLogger(__name__)


class ContextExtractionState(TypedDict):
    """State for context extraction pipeline."""

    # Input: list of {fileId, bytes, mimeType, fileName}
    files: List[Dict[str, Any]]
    # Extracted text blocks per file: [{fileId, fileName, blocks: [{page, text, block_id}]}]
    textBlocks: List[Dict[str, Any]]
    # Structured sections for chat context (simplified articles/sections)
    sections: List[Dict[str, Any]]
    # Optional summaries (empty for now - no LLM in extraction)
    summaries: List[Dict[str, Any]]
    errors: List[str]


def extract_text_node(state: ContextExtractionState) -> ContextExtractionState:
    """Extract text from each file. PDF via BZOPdfExtractor, TXT as plain text."""
    text_blocks = []
    errors = list(state.get("errors", []))

    for idx, file_info in enumerate(state.get("files", [])):
        file_id = file_info.get("fileId", f"file_{idx}")
        file_bytes = file_info.get("bytes")
        mime_type = (file_info.get("mimeType") or "").lower()
        file_name = file_info.get("fileName", f"document_{idx}")

        if not file_bytes:
            errors.append(f"No content for file {file_name} ({file_id})")
            continue

        blocks = []
        try:
            if "pdf" in mime_type or file_name.lower().endswith(".pdf"):
                from modules.features.realEstate.bzoPdfExtractor import BZOPdfExtractor

                extractor = BZOPdfExtractor()
                tb_list = extractor.extract_text_blocks(file_bytes, file_id)
                for tb in tb_list:
                    blocks.append({
                        "page": tb.page,
                        "text": tb.text,
                        "block_id": tb.block_id,
                        "bbox": tb.bbox,
                    })
                logger.info(f"Extracted {len(blocks)} blocks from PDF {file_name}")
            elif "text" in mime_type or file_name.lower().endswith(".txt"):
                text = file_bytes.decode("utf-8", errors="replace")
                lines = text.split("\n")
                for i, line in enumerate(lines):
                    if line.strip():
                        blocks.append({
                            "page": 1,
                            "text": line.strip(),
                            "block_id": f"{file_id}_line_{i}",
                            "bbox": None,
                        })
                logger.info(f"Extracted {len(blocks)} lines from text file {file_name}")
            else:
                errors.append(f"Unsupported format for {file_name}: {mime_type}")
        except Exception as e:
            logger.error(f"Error extracting {file_name}: {e}", exc_info=True)
            errors.append(f"Extraction failed for {file_name}: {str(e)}")

        if blocks:
            text_blocks.append({
                "fileId": file_id,
                "fileName": file_name,
                "blocks": blocks,
            })

    return {
        **state,
        "textBlocks": text_blocks,
        "errors": errors,
    }


def structure_content_node(state: ContextExtractionState) -> ContextExtractionState:
    """Assemble text blocks into sections for chat context."""
    sections = []
    for doc in state.get("textBlocks", []):
        file_name = doc.get("fileName", "document")
        blocks = doc.get("blocks", [])
        if not blocks:
            continue

        # Combine blocks into one text body: blocks on the same page are
        # separated by a newline, page changes by a blank line.
        text_parts = []
        current_page = 0
        for b in blocks:
            page = b.get("page", 1)
            if text_parts:
                text_parts.append("\n\n" if page != current_page else "\n")
            text_parts.append(b.get("text", ""))
            current_page = page

        full_text = "".join(text_parts).strip()
        if full_text:
            sections.append({
                "fileId": doc.get("fileId"),
                "fileName": file_name,
                "text": full_text,
                "blockCount": len(blocks),
            })

    return {
        **state,
        "sections": sections,
    }


def create_context_extraction_graph():
    """Create and compile the context extraction LangGraph."""
    workflow = StateGraph(ContextExtractionState)
    workflow.add_node("extract_text", extract_text_node)
    workflow.add_node("structure_content", structure_content_node)
    workflow.add_edge(START, "extract_text")
    workflow.add_edge("extract_text", "structure_content")
    workflow.add_edge("structure_content", END)
    return workflow.compile()


def run_extraction(files: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Run the context extraction pipeline on uploaded files.

    Args:
        files: List of {fileId, bytes, mimeType, fileName}

    Returns:
        {
            "textBlocks": [...],
            "sections": [...],
            "summaries": [],
            "errors": [...]
        }
    """
    state: ContextExtractionState = {
        "files": files,
        "textBlocks": [],
        "sections": [],
        "summaries": [],
        "errors": [],
    }
    graph = create_context_extraction_graph()
    final_state = graph.invoke(state)
    return {
        "textBlocks": final_state.get("textBlocks", []),
        "sections": final_state.get("sections", []),
        "summaries": final_state.get("summaries", []),
        "errors": final_state.get("errors", []),
    }
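

# Illustrative usage sketch (not part of the original module): feeds a small
# in-memory text file through run_extraction and prints the resulting sections.
# The file id, file name, and content below are made-up example values.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    demo_files = [
        {
            "fileId": "demo-1",
            "fileName": "notes.txt",
            "mimeType": "text/plain",
            "bytes": b"First line of context.\nSecond line of context.\n",
        }
    ]
    result = run_extraction(demo_files)
    for section in result["sections"]:
        print(f"{section['fileName']}: {section['blockCount']} blocks")
        print(section["text"])
    if result["errors"]:
        print("Errors:", result["errors"])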