gateway/modules/agentservice_dataextraction.py
2025-04-16 21:42:26 +02:00

921 lines
No EOL
40 KiB
Python

"""
Refactored helper function for intelligent data extraction (continued).
"""
import asyncio
import json
import logging
import re
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
async def data_extraction(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    lucydom_interface=None,
    workflow_id: Optional[str] = None,
    add_log_func=None,
    document_handler=None  # Optional handler for structured document operations
) -> Dict[str, Any]:
    """
    Perform AI-driven data extraction with improved document and image handling.

    Args:
        prompt: Specification of what data to extract
        files: List of all available files with metadata
        messages: List of all messages in the workflow
        ai_service: Service for AI requests
        lucydom_interface: Interface for database access (optional; only used
            by the fallback path when no document handler is provided)
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs
        document_handler: Optional document handler for structured document operations

    Returns:
        Structured result object with extracted data and context information,
        or an error object (``status == "error"``) if extraction fails.
    """
    try:
        _log(add_log_func, workflow_id, f"Starting data extraction with {len(files)} files", "info")

        # Create enhanced extraction plan using AI
        _log(add_log_func, workflow_id, "Creating extraction plan", "info")
        extraction_plan = await _create_extraction_plan(prompt, files, messages, ai_service, workflow_id, add_log_func)
        if extraction_plan:
            extract_needed_count = sum(1 for item in extraction_plan if item.get("extract_needed", False))
            _log(add_log_func, workflow_id,
                 f"Extraction plan created: {len(extraction_plan)} files, {extract_needed_count} need extraction", "info")

        # Execute extractions, preferring the document handler if available
        if document_handler:
            _log(add_log_func, workflow_id, "Using document handler for extraction", "info")
            extracted_data = await _execute_extractions_with_handler(
                extraction_plan,
                files,
                messages,
                document_handler,
                ai_service,
                workflow_id,
                add_log_func
            )
        else:
            # Fall back to the direct (LucyDOM-based) implementation
            _log(add_log_func, workflow_id, "Using fallback extraction method", "info")
            extracted_data = await _execute_extractions(
                extraction_plan,
                files,
                messages,
                lucydom_interface,
                ai_service,
                workflow_id,
                add_log_func
            )

        # Structure extracted data
        _log(add_log_func, workflow_id, f"Structuring extracted data from {len(extracted_data)} files", "info")
        structured_result = _structure_extracted_data(extracted_data, files, prompt)

        # Enhance with contextual summaries using AI
        if ai_service and structured_result["extracted_content"]:
            _log(add_log_func, workflow_id, "Creating contextual summaries for extracted content", "info")
            try:
                summary_header = f"""
Create concise, contextual summaries of the following extracted content according to this requirement:
REQUIREMENT: {prompt}
EXTRACTED CONTENT:
"""
                # Content previews are truncated to 500 chars to keep the prompt small
                structured_result["contextual_summary"] = await _summarize_items_with_ai(
                    ai_service, summary_header, structured_result["extracted_content"],
                    default_name="Unnamed file", truncate=500
                )
                _log(add_log_func, workflow_id, "Added contextual summaries to extracted data", "info")
            except Exception as e:
                _log(add_log_func, workflow_id, f"Error creating contextual summaries: {str(e)}", "warning")

        # Handle image-specific content separately
        image_content = [item for item in structured_result["extracted_content"]
                         if "Image Analysis" in item.get("content", "") or item.get("type") == "image"]
        if image_content:  # empty list is falsy; explicit len() > 0 was redundant
            _log(add_log_func, workflow_id, f"Processing {len(image_content)} image-related content items", "info")
            # Add image analysis summary if we have an AI service
            if ai_service:
                try:
                    image_header = f"""
Summarize the key visual information from these image analyses according to this requirement:
REQUIREMENT: {prompt}
IMAGE ANALYSES:
"""
                    structured_result["image_analysis_summary"] = await _summarize_items_with_ai(
                        ai_service, image_header, image_content,
                        default_name="Unnamed image"
                    )
                    _log(add_log_func, workflow_id, "Added image analysis summary to extracted data", "info")
                except Exception as e:
                    _log(add_log_func, workflow_id, f"Error creating image analysis summary: {str(e)}", "warning")

        return structured_result
    except Exception as e:
        logger.error(f"Error in data extraction: {str(e)}", exc_info=True)
        # Add error log
        if add_log_func and workflow_id:
            add_log_func(workflow_id, f"Data extraction error: {str(e)}", "error")
        # Return error result
        return {
            "error": str(e),
            "status": "error",
            "files_processed": len(files),
            "message": f"Data extraction failed: {str(e)}"
        }


async def _summarize_items_with_ai(
    ai_service,
    header: str,
    items: List[Dict[str, Any]],
    default_name: str,
    truncate: Optional[int] = None
) -> Any:
    """
    Build a summary prompt from content items and request a summary from the AI service.

    Args:
        ai_service: Service exposing ``call_api``
        header: Prompt preamble (requirement + section title)
        items: Content items with ``name`` and ``content`` keys
        default_name: Name used when an item has no ``name``
        truncate: If given, content longer than this many chars is cut with "..."

    Returns:
        Whatever ``ai_service.call_api`` returns for the assembled prompt.
    """
    summary_prompt = header
    for item in items:
        name = item.get("name", default_name)
        content = item.get("content", "")
        if truncate is not None and len(content) > truncate:
            content = content[:truncate] + "..."
        summary_prompt += f"\n--- {name} ---\n{content}\n"
    return await ai_service.call_api([{"role": "user", "content": summary_prompt}])
def _first_text_content(document: Dict[str, Any]) -> Tuple[str, bool, Dict[str, Any]]:
    """Return (text, is_extracted, content_dict) for the first text content of a document."""
    for content in document.get("contents", []):
        if content.get("type") == "text":
            return content.get("text", ""), content.get("is_extracted", False), content
    return "", False, {}


def _append_handler_documents(
    result_message: Dict[str, Any],
    file_id,
    file_name: str,
    file_type: str,
    extraction_prompt: str,
    method: str,
    extracted_data: List[Dict[str, Any]],
    workflow_id: Optional[str] = None,
    add_log_func=None,
    main_log: Optional[str] = None
) -> bool:
    """
    Append extraction results from a document-handler result message.

    The first document is treated as the main file; subsequent documents whose
    source type is "extracted" (e.g. images embedded in the file) are appended
    as separate extraction results.

    Args:
        result_message: Message returned by document_handler.add_file_to_message
        file_id: ID of the main file
        file_name: Name of the main file (used for logs and derived names)
        file_type: Type of the main file
        extraction_prompt: Context prompt the extraction was performed with
        method: extraction_method value recorded for the main document
        extracted_data: List to append results to (mutated in place)
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs
        main_log: Optional info message logged right after the main append

    Returns:
        False when the result contains no documents, True otherwise.
    """
    documents = result_message.get("documents") or []
    if not documents:
        return False

    # Main document (first entry)
    content_text, is_extracted, _ = _first_text_content(documents[0])
    extracted_data.append({
        "file_id": file_id,
        "name": file_name,
        "type": file_type,
        "content": content_text,
        "is_extracted": is_extracted,
        "extraction_method": method,
        "extraction_context": extraction_prompt
    })
    if main_log:
        _log(add_log_func, workflow_id, main_log, "info")

    # Additional documents (e.g. extracted embedded images)
    for additional_doc in documents[1:]:
        source = additional_doc.get("source", {})
        if source.get("type") != "extracted":
            continue
        add_text, add_is_extracted, add_content = _first_text_content(additional_doc)
        if add_text:
            extracted_data.append({
                "file_id": source.get("id", f"extracted_{uuid.uuid4()}"),
                "name": source.get("name", f"Extracted from {file_name}"),
                "type": source.get("content_type", "image"),
                "content": add_text,
                "is_extracted": add_is_extracted,
                # The original read the leaked loop variable `content` here;
                # use the matched content dict explicitly instead.
                "extraction_context": add_content.get("extraction_context", extraction_prompt),
                "parent_file_id": file_id
            })
            _log(add_log_func, workflow_id,
                 f"Extracted embedded content from {file_name}", "info")
    return True


async def _execute_extractions_with_handler(
    extraction_plan: List[Dict[str, Any]],
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    document_handler,
    ai_service,
    workflow_id: Optional[str] = None,
    add_log_func=None
) -> List[Dict[str, Any]]:
    """
    Execute extractions using the document handler with enhanced image processing.

    Args:
        extraction_plan: List of extraction instructions
        files: List of all available files
        messages: List of all messages
        document_handler: Document handler for structured operations
        ai_service: Service for AI requests (currently unused here; kept for
            interface compatibility with _execute_extractions)
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs

    Returns:
        List with extracted data per file
    """
    extracted_data: List[Dict[str, Any]] = []
    # Sort by importance (highest first)
    sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)
    for extraction_item in sorted_plan:
        file_id = extraction_item.get("file_id")
        extract_needed = extraction_item.get("extract_needed", False)
        extraction_prompt = extraction_item.get("extraction_prompt", "")

        # Find file metadata
        file_metadata = next((f for f in files if f.get("id") == file_id), None)
        if not file_metadata:
            logger.warning(f"File with ID {file_id} not found")
            continue
        file_name = file_metadata.get("name", "")
        file_type = file_metadata.get("type", "")

        _log(add_log_func, workflow_id,
             f"Processing file: {file_name} (Extraction needed: {extract_needed})", "info")

        if not extract_needed:
            # No extraction needed — reuse existing content from the messages
            existing_content = _find_document_in_messages(file_id, messages)
            if existing_content:
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": existing_content.get("content", ""),
                    "is_extracted": existing_content.get("is_extracted", False),
                    "extraction_method": "existing_content",
                    "extraction_context": existing_content.get("extraction_context", "")
                })
                _log(add_log_func, workflow_id,
                     f"Using existing content for {file_name}", "info")
            else:
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": f"No content available for {file_name}",
                    "is_extracted": False,
                    "extraction_method": "none"
                })
                _log(add_log_func, workflow_id,
                     f"No content available for {file_name}", "warning")
            continue

        # Extraction required: check whether usable content already exists
        existing_content = _find_document_in_messages(file_id, messages)
        if existing_content and existing_content.get("content"):
            current_context = existing_content.get("extraction_context", "")
            # Re-extract only if the new prompt differs from the stored context
            if extraction_prompt and extraction_prompt != current_context:
                _log(add_log_func, workflow_id,
                     f"Re-extracting {file_name} with new prompt: {extraction_prompt}", "info")
                try:
                    result_message = await document_handler.add_file_to_message(
                        {},  # extract into a fresh, empty message
                        file_id,
                        extraction_prompt
                    )
                    if _append_handler_documents(
                        result_message, file_id, file_name, file_type,
                        extraction_prompt, "document_handler_reextract",
                        extracted_data, workflow_id, add_log_func
                    ):
                        _log(add_log_func, workflow_id,
                             f"Re-extracted {file_name} with new context", "info")
                        continue
                except Exception as e:
                    logger.error(f"Error re-extracting {file_name}: {str(e)}")
                    _log(add_log_func, workflow_id,
                         f"Error re-extracting {file_name}: {str(e)}", "warning")
            # Fall back to the content already present in the messages
            extracted_data.append({
                "file_id": file_id,
                "name": file_name,
                "type": file_type,
                "content": existing_content.get("content", ""),
                "is_extracted": existing_content.get("is_extracted", False),
                "extraction_method": "existing_content",
                "extraction_context": current_context
            })
            _log(add_log_func, workflow_id,
                 f"Using existing content for {file_name}", "info")
            continue

        # Fresh extraction via the document handler
        try:
            result_message = await document_handler.add_file_to_message(
                {},  # extract into a fresh, empty message
                file_id,
                extraction_prompt
            )
            appended = _append_handler_documents(
                result_message, file_id, file_name, file_type,
                extraction_prompt, "document_handler",
                extracted_data, workflow_id, add_log_func,
                main_log=f"Extracted {file_name} using document handler"
            )
            if not appended:
                # Handler returned no documents — record the failure
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": f"Failed to extract content from {file_name}",
                    "is_extracted": False,
                    "extraction_method": "failed"
                })
                _log(add_log_func, workflow_id,
                     f"Failed to extract content from {file_name}", "warning")
        except Exception as e:
            logger.error(f"Error extracting {file_name}: {str(e)}")
            _log(add_log_func, workflow_id,
                 f"Error extracting {file_name}: {str(e)}", "warning")
            extracted_data.append({
                "file_id": file_id,
                "name": file_name,
                "type": file_type,
                "content": f"Error extracting: {str(e)}",
                "is_extracted": False,
                "extraction_method": "error"
            })
    return extracted_data
def _find_document_in_messages(file_id: int, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Find a document by file ID in workflow messages.
Args:
file_id: ID of the file to find
messages: List of messages to search
Returns:
Dictionary with document information or empty dict if not found
"""
for message in messages:
for doc_index, document in enumerate(message.get("documents", [])):
source = document.get("source", {})
# Check if file ID matches
if source.get("id") == str(file_id) or source.get("id") == file_id:
# Found the document
content_text = ""
is_extracted = False
# Look for text content
for content in document.get("contents", []):
if content.get("type") == "text":
content_text = content.get("text", "")
is_extracted = content.get("is_extracted", False)
break
return {
"document_id": document.get("id"),
"message_id": message.get("id"),
"content": content_text,
"is_extracted": is_extracted
}
return {}
async def _create_extraction_plan(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    workflow_id: Optional[str] = None,
    add_log_func=None
) -> List[Dict[str, Any]]:
    """
    Create an extraction plan with AI support.

    Args:
        prompt: Specification of what data to extract
        files: List of all available files with metadata
        messages: List of all messages in the workflow
        ai_service: Service for AI requests
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs

    Returns:
        Extraction plan (list of extraction instructions per file). Falls back
        to a default plan on response-parsing problems, and to an empty list
        when the AI call itself fails.
    """
    # Build context information for the AI call
    file_infos = []
    for file in files:
        # Base metadata
        file_info = {
            "id": file.get("id", ""),
            "name": file.get("name", ""),
            "type": file.get("type", ""),
            "content_type": file.get("content_type", ""),
            "size": file.get("size", "")
        }
        # Check extraction status (if any content already exists in messages)
        doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
        if doc_contents:
            # Extracted if at least one content carries is_extracted=True
            file_info["already_extracted"] = any(
                content.get("is_extracted", False) for content in doc_contents
            )
            # Add a short preview of the first text content (if available)
            for content in doc_contents:
                if content.get("type") == "text" and content.get("text"):
                    text = content.get("text", "")
                    file_info["content_preview"] = text[:200] + "..." if len(text) > 200 else text
                    break
        else:
            file_info["already_extracted"] = False
        file_infos.append(file_info)

    # Build the AI prompt (German, matching the workflow's prompt language)
    extraction_prompt = f"""
Du bist ein Datenextraktionsexperte, der mithilfe von KI-Analyse entscheidet, welche Dateien
und Inhalte für eine bestimmte Aufgabe extrahiert werden müssen.
AUFGABE:
{prompt}
VERFÜGBARE DATEIEN:
{json.dumps(file_infos, indent=2)}
Für jede Datei, die für die Aufgabe relevant ist, erstelle eine Extraktionsanweisung mit den folgenden Informationen:
1. file_id: Die ID der zu extrahierenden Datei
2. extract_needed: Boolean, ob eine Extraktion erforderlich ist (True, wenn die Datei noch nicht extrahiert wurde und für die Aufgabe benötigt wird)
3. extraction_prompt: Ein spezifischer Prompt für die Extraktion der Datei (besonders wichtig für Bilder und nicht-textbasierte Dateien)
4. importance: Priorität/Wichtigkeit für die Aufgabe (1-5, wobei 5 am wichtigsten ist)
Format:
[
{{
"file_id": 1234,
"extract_needed": true,
"extraction_prompt": "Extrahiere die Tabellendaten mit Fokus auf die Umsatzzahlen",
"importance": 5
}},
...
]
Gib nur das JSON-Array zurück, ohne weitere Erklärungen.
"""
    if add_log_func and workflow_id:
        add_log_func(workflow_id, "Extraktionsplan wird erstellt...", "info")
    try:
        # Perform the AI call
        extraction_plan_response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}])
        # Extract the JSON array from the response
        json_match = re.search(r'\[.*\]', extraction_plan_response, re.DOTALL)
        if json_match:
            try:
                extraction_plan = json.loads(json_match.group(0))
            except json.JSONDecodeError:
                # Malformed JSON is a parsing problem, not an AI failure:
                # fall through to the default plan instead of returning []
                extraction_plan = None
            if extraction_plan is not None:
                if add_log_func and workflow_id:
                    add_log_func(
                        workflow_id,
                        f"Extraktionsplan erstellt für {len(extraction_plan)} Dateien",
                        "info"
                    )
                return extraction_plan
        # Fallback on parsing problems
        if add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                "Parsing-Fehler beim Extraktionsplan, erstelle Standard-Plan",
                "warning"
            )
        return _default_extraction_plan(files, messages)
    except Exception as e:
        logger.error(f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}", exc_info=True)
        if add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}",
                "error"
            )
        # Empty plan on errors
        return []


def _default_extraction_plan(
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Build the fallback plan: extract every file that is not yet extracted (importance 3)."""
    default_plan = []
    for file in files:
        doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
        already_extracted = any(
            content.get("is_extracted", False) for content in doc_contents
        ) if doc_contents else False
        default_plan.append({
            "file_id": file.get("id", 0),
            "extract_needed": not already_extracted,
            "extraction_prompt": f"Extrahiere alle relevanten Informationen aus {file.get('name', '')}",
            "importance": 3
        })
    return default_plan
def _dual_log(level: str, message: str, logging_utils, add_log_func, workflow_id) -> None:
    """Route a log message to logging_utils if present, else to add_log_func (if usable)."""
    if logging_utils:
        # logging_utils exposes level-named methods taking (message, category)
        getattr(logging_utils, level)(message, "extraction")
    elif add_log_func and workflow_id:
        add_log_func(workflow_id, message, level)


async def _execute_extractions(
    extraction_plan: List[Dict[str, Any]],
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    lucydom_interface,
    ai_service,
    workflow_id: Optional[str] = None,
    add_log_func=None,
    logging_utils=None
) -> List[Dict[str, Any]]:
    """
    Execute the planned extractions.

    Args:
        extraction_plan: List of extraction instructions
        files: List of all available files
        messages: List of all messages (searched for existing content)
        lucydom_interface: Interface for database access
        ai_service: Service for AI requests
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs
        logging_utils: Optional logging utility (takes precedence over add_log_func)

    Returns:
        List with extracted data per file
    """
    extracted_data: List[Dict[str, Any]] = []
    # Sort by importance (highest first)
    sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)
    for extraction_item in sorted_plan:
        file_id = extraction_item.get("file_id")
        extract_needed = extraction_item.get("extract_needed", False)
        extraction_prompt = extraction_item.get("extraction_prompt", "")

        # Find file metadata
        file_metadata = next((f for f in files if f.get("id") == file_id), None)
        if not file_metadata:
            logger.warning(f"File with ID {file_id} not found")
            continue
        file_name = file_metadata.get("name", "")
        file_type = file_metadata.get("type", "")
        content_type = file_metadata.get("content_type", "")

        _dual_log("info", f"Processing file: {file_name} (Extraction needed: {extract_needed})",
                  logging_utils, add_log_func, workflow_id)

        # Only perform extraction if needed
        if extract_needed:
            if not lucydom_interface:
                logger.warning(f"No LucyDOM interface available for file {file_name}")
                _dual_log("warning", f"No LucyDOM interface available for file {file_name}",
                          logging_utils, add_log_func, workflow_id)
                continue
            # Get file content via the LucyDOM interface
            try:
                file_content = await lucydom_interface.read_file_content(file_id)
                if not file_content:
                    _dual_log("warning", f"File {file_name} not found",
                              logging_utils, add_log_func, workflow_id)
                    continue
                # Perform extraction based on file type
                if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
                    # Image analysis with the AI service
                    if ai_service and hasattr(ai_service, "analyze_image"):
                        try:
                            image_analysis = await ai_service.analyze_image(
                                image_data=file_content,
                                prompt=extraction_prompt,
                                mime_type=content_type
                            )
                            extracted_data.append({
                                "file_id": file_id,
                                "name": file_name,
                                "type": file_type,
                                "content": image_analysis,
                                "is_extracted": True,
                                "extraction_method": "image_analysis"
                            })
                            _dual_log("info", f"Image {file_name} successfully analyzed",
                                      logging_utils, add_log_func, workflow_id)
                        except Exception as e:
                            logger.error(f"Error analyzing image {file_name}: {str(e)}")
                            _dual_log("error", f"Error analyzing image {file_name}: {str(e)}",
                                      logging_utils, add_log_func, workflow_id)
                    else:
                        # Fallback if no image analysis is available
                        extracted_data.append({
                            "file_id": file_id,
                            "name": file_name,
                            "type": file_type,
                            "content": f"Image: {file_name} (Analysis not available)",
                            "is_extracted": False,
                            "extraction_method": "none"
                        })
                else:
                    # Text-based extraction for all other file types
                    try:
                        # Import directly here to avoid circular imports
                        from modules.agentservice_utils import extract_text_from_file_content
                        content, is_extracted = extract_text_from_file_content(
                            file_content, file_name, content_type
                        )
                        extracted_data.append({
                            "file_id": file_id,
                            "name": file_name,
                            "type": file_type,
                            "content": content,
                            "is_extracted": is_extracted,
                            "extraction_method": "text_extraction"
                        })
                        _dual_log("info", f"File {file_name} extracted (Status: {is_extracted})",
                                  logging_utils, add_log_func, workflow_id)
                    except Exception as e:
                        logger.error(f"Error extracting text from {file_name}: {str(e)}")
                        _dual_log("error", f"Error extracting text from {file_name}: {str(e)}",
                                  logging_utils, add_log_func, workflow_id)
            except Exception as e:
                logger.error(f"Error reading file {file_name}: {str(e)}")
                _dual_log("error", f"Error reading file {file_name}: {str(e)}",
                          logging_utils, add_log_func, workflow_id)
        else:
            # No extraction needed, use existing content
            doc_contents = _extract_document_contents_from_messages(file_id, messages)
            if doc_contents:
                # Use the first text content
                for content in doc_contents:
                    if content.get("type") == "text":
                        extracted_data.append({
                            "file_id": file_id,
                            "name": file_name,
                            "type": file_type,
                            "content": content.get("text", ""),
                            "is_extracted": content.get("is_extracted", False),
                            "extraction_method": "existing_content"
                        })
                        break
            else:
                # No existing content found
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": f"No content available for {file_name}",
                    "is_extracted": False,
                    "extraction_method": "none"
                })
    return extracted_data
def _structure_extracted_data(
extracted_data: List[Dict[str, Any]],
files: List[Dict[str, Any]],
prompt: str
) -> Dict[str, Any]:
"""
Structure the extracted data into a formatted result.
Args:
extracted_data: List of extracted data per file
files: List of all available files
prompt: Original extraction prompt
Returns:
Structured result object
"""
# Create base structure
result = {
"prompt": prompt,
"files_processed": len(extracted_data),
"total_files": len(files),
"extraction_timestamp": datetime.now().isoformat(),
"status": "success",
"extracted_content": []
}
# Add extracted content
for data_item in extracted_data:
# Enrich with file metadata
file_id = data_item.get("file_id", 0)
file_metadata = next((f for f in files if f.get("id") == file_id), {})
content_item = {
"file_id": file_id,
"name": data_item.get("name", file_metadata.get("name", "")),
"type": data_item.get("type", file_metadata.get("type", "")),
"content_type": file_metadata.get("content_type", ""),
"size": file_metadata.get("size", ""),
"is_extracted": data_item.get("is_extracted", False),
"extraction_method": data_item.get("extraction_method", ""),
"content": data_item.get("content", "")
}
result["extracted_content"].append(content_item)
return result
def _extract_document_contents_from_messages(file_id: int, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Extract document contents for a specific file from workflow messages.
Enhanced to handle the new document structure.
Args:
file_id: ID of the file
messages: List of all messages in the workflow
Returns:
List of document contents for the specified file
"""
contents = []
for message in messages:
# Search documents in the message
for document in message.get("documents", []):
source = document.get("source", {})
# Check if file ID matches (handle both string and int comparison)
if (source.get("id") == file_id or
(isinstance(source.get("id"), str) and source.get("id") == str(file_id)) or
(isinstance(file_id, str) and source.get("id") == file_id)):
# Add contents of the file
doc_contents = document.get("contents", [])
if doc_contents:
# Ensure each content has document reference
for content in doc_contents:
content_copy = content.copy()
content_copy["document_id"] = document.get("id")
content_copy["message_id"] = message.get("id")
contents.append(content_copy)
return contents
def _log(add_log_func, workflow_id, message, log_type, agent_id=None, agent_name=None):
    """
    Emit a message to the module logger and, when possible, the workflow log.

    Args:
        add_log_func: Optional workflow log function
        workflow_id: Workflow the message belongs to (required for add_log_func)
        message: Log message text
        log_type: "error", "warning" or anything else (treated as info)
        agent_id: Optional agent ID forwarded to add_log_func
        agent_name: Optional agent name forwarded to add_log_func
    """
    # Map the log type onto the matching logger method (info is the default)
    emit = {"error": logger.error, "warning": logger.warning}.get(log_type, logger.info)
    emit(message)
    # Forward to the workflow log function when both pieces are available
    if add_log_func and workflow_id:
        add_log_func(workflow_id, message, log_type, agent_id, agent_name)