"""
Refactored helper functions for intelligent data extraction (continued).
"""
|
|
|
|
import logging
|
|
import json
|
|
from typing import List, Dict, Any, Optional, Tuple
|
|
import asyncio
|
|
from datetime import datetime
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
async def data_extraction(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    lucydom_interface = None,
    workflow_id: str = None,
    add_log_func = None,
    document_handler = None
) -> Dict[str, Any]:
    """
    Perform AI-driven data extraction across the workflow's files.

    An extraction plan is produced first; it is then executed either through
    the document handler (preferred, when supplied) or through the legacy
    LucyDOM-based path, and the results are folded into a single structured
    response object.

    Args:
        prompt: Specification of what data to extract
        files: List of all available files with metadata
        messages: List of all messages in the workflow
        ai_service: Service for AI requests
        lucydom_interface: Interface for database access (optional)
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs
        document_handler: Optional document handler for structured document operations

    Returns:
        Structured result object with extracted data and context information;
        on failure, a dict with "status": "error" and diagnostic fields.
    """
    try:
        plan = await _create_extraction_plan(
            prompt, files, messages, ai_service, workflow_id, add_log_func
        )

        # Prefer the document handler; otherwise fall back to the legacy path.
        if document_handler:
            results = await _execute_extractions_with_handler(
                plan, files, messages, document_handler,
                ai_service, workflow_id, add_log_func
            )
        else:
            results = await _execute_extractions(
                plan, files, messages, lucydom_interface,
                ai_service, workflow_id, add_log_func
            )

        return _structure_extracted_data(results, files, prompt)

    except Exception as e:
        logger.error(f"Error in data extraction: {str(e)}", exc_info=True)

        if add_log_func and workflow_id:
            add_log_func(workflow_id, f"Data extraction error: {str(e)}", "error")

        return {
            "error": str(e),
            "status": "error",
            "files_processed": len(files),
            "message": f"Data extraction failed: {str(e)}"
        }
|
|
|
|
|
|
async def _execute_extractions_with_handler(
    extraction_plan: List[Dict[str, Any]],
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    document_handler,
    ai_service,
    workflow_id: str = None,
    add_log_func = None
) -> List[Dict[str, Any]]:
    """
    Execute extractions using the document handler.

    Bug fix over the previous version: after a successful contextual
    extraction the old code used ``continue`` inside the *inner* message
    loop, so control still fell through to a second, redundant handler
    extraction and a duplicate entry was appended for the same file. The
    two strategies are now separate helpers; the fallback runs only when
    the contextual attempt returns None.

    Args:
        extraction_plan: List of extraction instructions
        files: List of all available files
        messages: List of all messages
        document_handler: Document handler for structured operations
        ai_service: Service for AI requests (unused here, kept for interface parity)
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs

    Returns:
        List with extracted data per file
    """
    extracted_data: List[Dict[str, Any]] = []

    # Process the most important files first
    sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)

    for extraction_item in sorted_plan:
        file_id = extraction_item.get("file_id")
        extract_needed = extraction_item.get("extract_needed", False)
        extraction_prompt = extraction_item.get("extraction_prompt", "")

        # Find file metadata; skip plan entries pointing at unknown files
        file_metadata = next((f for f in files if f.get("id") == file_id), None)
        if not file_metadata:
            logger.warning(f"File with ID {file_id} not found")
            continue

        file_name = file_metadata.get("name", "")
        file_type = file_metadata.get("type", "")

        if add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                f"Processing file: {file_name} (Extraction needed: {extract_needed})",
                "info"
            )

        if extract_needed:
            # Try a contextual extraction of an already-known document first.
            entry = await _contextual_extraction_entry(
                file_id, file_name, file_type, extraction_prompt,
                messages, document_handler, workflow_id, add_log_func
            )
            if entry is None:
                # No usable existing document (or the attempt failed):
                # perform a fresh extraction through the handler.
                entry = await _handler_extraction_entry(
                    file_id, file_name, file_type, extraction_prompt,
                    document_handler, workflow_id, add_log_func
                )
            extracted_data.append(entry)
        else:
            # No extraction needed: reuse content already present in messages.
            existing_content = _find_document_in_messages(file_id, messages)

            if existing_content:
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": existing_content.get("content", ""),
                    "is_extracted": existing_content.get("is_extracted", False),
                    "extraction_method": "existing_content"
                })
            else:
                extracted_data.append({
                    "file_id": file_id,
                    "name": file_name,
                    "type": file_type,
                    "content": f"No content available for {file_name}",
                    "is_extracted": False,
                    "extraction_method": "none"
                })

    return extracted_data


async def _contextual_extraction_entry(
    file_id,
    file_name: str,
    file_type: str,
    extraction_prompt: str,
    messages: List[Dict[str, Any]],
    document_handler,
    workflow_id: str = None,
    add_log_func = None
) -> Optional[Dict[str, Any]]:
    """
    Attempt a contextual extraction of a document already referenced in the
    workflow messages.

    Returns:
        The result entry dict on success, or None when no matching document
        exists or the extraction fails (caller then falls back to a fresh
        handler extraction).
    """
    existing_content = _find_document_in_messages(file_id, messages)
    if not existing_content:
        return None

    document_id = existing_content.get("document_id")
    message_id = existing_content.get("message_id")
    if not (document_id and message_id):
        return None

    # Locate the message and the document reference inside it.
    message = next((m for m in messages if m.get("id") == message_id), None)
    if message is None:
        return None

    doc_reference = next(
        (d for d in message.get("documents", []) if d.get("id") == document_id),
        None
    )
    if doc_reference is None:
        return None

    try:
        # Use the document handler to perform the contextual extraction.
        extracted_text = await document_handler.extract_document_content(
            document_id,
            file_id,
            extraction_prompt
        )
    except Exception as e:
        logger.error(f"Error in contextual extraction for {file_name}: {str(e)}")
        return None

    if add_log_func and workflow_id:
        add_log_func(
            workflow_id,
            f"Contextual extraction for {file_name}: {extraction_prompt}",
            "info"
        )

    return {
        "file_id": file_id,
        "name": file_name,
        "type": file_type,
        "content": extracted_text,
        "is_extracted": True,
        "extraction_method": "contextual_extraction"
    }


async def _handler_extraction_entry(
    file_id,
    file_name: str,
    file_type: str,
    extraction_prompt: str,
    document_handler,
    workflow_id: str = None,
    add_log_func = None
) -> Dict[str, Any]:
    """
    Extract a file's content through the document handler.

    Always returns an entry dict; failures are reported via the
    "extraction_method" field ("failed" / "error") rather than raised.
    """
    try:
        # Empty message: we only want the extracted document back.
        file_content = await document_handler.add_file_to_message(
            {},
            file_id,
            extraction_prompt
        )

        if "documents" in file_content and file_content["documents"]:
            doc = file_content["documents"][0]
            content_text = ""
            is_extracted = False

            # Use the first text content of the returned document.
            for content in doc.get("contents", []):
                if content.get("type") == "text":
                    content_text = content.get("text", "")
                    is_extracted = content.get("is_extracted", False)
                    break

            if add_log_func and workflow_id:
                add_log_func(
                    workflow_id,
                    f"Extracted {file_name} using document handler",
                    "info"
                )

            return {
                "file_id": file_id,
                "name": file_name,
                "type": file_type,
                "content": content_text,
                "is_extracted": is_extracted,
                "extraction_method": "document_handler"
            }

        # Handler returned no documents -> extraction failed.
        return {
            "file_id": file_id,
            "name": file_name,
            "type": file_type,
            "content": f"Failed to extract content from {file_name}",
            "is_extracted": False,
            "extraction_method": "failed"
        }
    except Exception as e:
        logger.error(f"Error extracting {file_name}: {str(e)}")
        return {
            "file_id": file_id,
            "name": file_name,
            "type": file_type,
            "content": f"Error extracting: {str(e)}",
            "is_extracted": False,
            "extraction_method": "error"
        }
|
|
|
|
|
|
def _find_document_in_messages(file_id: int, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
|
|
"""
|
|
Find a document by file ID in workflow messages.
|
|
|
|
Args:
|
|
file_id: ID of the file to find
|
|
messages: List of messages to search
|
|
|
|
Returns:
|
|
Dictionary with document information or empty dict if not found
|
|
"""
|
|
for message in messages:
|
|
for doc_index, document in enumerate(message.get("documents", [])):
|
|
source = document.get("source", {})
|
|
|
|
# Check if file ID matches
|
|
if source.get("id") == str(file_id) or source.get("id") == file_id:
|
|
# Found the document
|
|
content_text = ""
|
|
is_extracted = False
|
|
|
|
# Look for text content
|
|
for content in document.get("contents", []):
|
|
if content.get("type") == "text":
|
|
content_text = content.get("text", "")
|
|
is_extracted = content.get("is_extracted", False)
|
|
break
|
|
|
|
return {
|
|
"document_id": document.get("id"),
|
|
"message_id": message.get("id"),
|
|
"content": content_text,
|
|
"is_extracted": is_extracted
|
|
}
|
|
|
|
return {}
|
|
|
|
|
|
async def _create_extraction_plan(
    prompt: str,
    files: List[Dict[str, Any]],
    messages: List[Dict[str, Any]],
    ai_service,
    workflow_id: str = None,
    add_log_func = None
) -> List[Dict[str, Any]]:
    """
    Create an extraction plan with AI assistance.

    Builds a per-file metadata summary (including whether content was
    already extracted and a short preview), asks the AI service to decide
    which files need extraction, and parses the returned JSON array.
    Falls back to a default plan ("extract everything not yet extracted")
    when the AI response cannot be parsed, and to an empty plan on errors.

    NOTE(review): the prompt sent to the AI service and the workflow log
    messages are German — presumably the deployment language; do not
    translate them without confirming downstream expectations.

    Args:
        prompt: Specification of which data should be extracted
        files: List of all available files with metadata
        messages: List of all messages in the workflow
        ai_service: Service for AI requests
        workflow_id: Optional workflow ID for logging
        add_log_func: Optional function for adding logs

    Returns:
        Extraction plan (list of extraction instructions per file);
        empty list if plan creation fails.
    """
    # Build context information for the AI call
    file_infos = []
    for file in files:
        # Base metadata
        file_info = {
            "id": file.get("id", ""),
            "name": file.get("name", ""),
            "type": file.get("type", ""),
            "content_type": file.get("content_type", ""),
            "size": file.get("size", "")
        }

        # Check extraction status (if already present in the messages)
        doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)

        if doc_contents:
            # Check whether at least one content entry has is_extracted=True
            already_extracted = any(
                content.get("is_extracted", False) for content in doc_contents
            )
            file_info["already_extracted"] = already_extracted

            # Add a short preview of the content (if available)
            for content in doc_contents:
                if content.get("type") == "text" and content.get("text"):
                    preview_text = content.get("text", "")[:200] + "..." if len(content.get("text", "")) > 200 else content.get("text", "")
                    file_info["content_preview"] = preview_text
                    break
        else:
            file_info["already_extracted"] = False

        file_infos.append(file_info)

    # Build the AI prompt (German; runtime text, kept verbatim)
    extraction_prompt = f"""
Du bist ein Datenextraktionsexperte, der mithilfe von KI-Analyse entscheidet, welche Dateien
und Inhalte für eine bestimmte Aufgabe extrahiert werden müssen.

AUFGABE:
{prompt}

VERFÜGBARE DATEIEN:
{json.dumps(file_infos, indent=2)}

Für jede Datei, die für die Aufgabe relevant ist, erstelle eine Extraktionsanweisung mit den folgenden Informationen:
1. file_id: Die ID der zu extrahierenden Datei
2. extract_needed: Boolean, ob eine Extraktion erforderlich ist (True, wenn die Datei noch nicht extrahiert wurde und für die Aufgabe benötigt wird)
3. extraction_prompt: Ein spezifischer Prompt für die Extraktion der Datei (besonders wichtig für Bilder und nicht-textbasierte Dateien)
4. importance: Priorität/Wichtigkeit für die Aufgabe (1-5, wobei 5 am wichtigsten ist)

Format:
[
  {{
    "file_id": 1234,
    "extract_needed": true,
    "extraction_prompt": "Extrahiere die Tabellendaten mit Fokus auf die Umsatzzahlen",
    "importance": 5
  }},
  ...
]

Gib nur das JSON-Array zurück, ohne weitere Erklärungen.
"""

    # Add log
    if add_log_func and workflow_id:
        add_log_func(workflow_id, "Extraktionsplan wird erstellt...", "info")

    try:
        # Perform the AI call
        extraction_plan_response = await ai_service.call_api([{"role": "user", "content": extraction_prompt}])

        # Extract the JSON array from the free-text response
        import re
        json_match = re.search(r'\[.*\]', extraction_plan_response, re.DOTALL)

        if json_match:
            extraction_plan = json.loads(json_match.group(0))

            # Add log
            if add_log_func and workflow_id:
                add_log_func(
                    workflow_id,
                    f"Extraktionsplan erstellt für {len(extraction_plan)} Dateien",
                    "info"
                )

            return extraction_plan
        else:
            # Fallback on parsing problems
            if add_log_func and workflow_id:
                add_log_func(
                    workflow_id,
                    "Parsing-Fehler beim Extraktionsplan, erstelle Standard-Plan",
                    "warning"
                )

            # Default plan: extract every file that has not been extracted yet
            default_plan = []
            for file in files:
                doc_contents = _extract_document_contents_from_messages(file.get("id", ""), messages)
                already_extracted = any(
                    content.get("is_extracted", False) for content in doc_contents
                ) if doc_contents else False

                default_plan.append({
                    "file_id": file.get("id", 0),
                    "extract_needed": not already_extracted,
                    "extraction_prompt": f"Extrahiere alle relevanten Informationen aus {file.get('name', '')}",
                    "importance": 3
                })

            return default_plan

    except Exception as e:
        logger.error(f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}", exc_info=True)

        if add_log_func and workflow_id:
            add_log_func(
                workflow_id,
                f"Fehler bei der Erstellung des Extraktionsplans: {str(e)}",
                "error"
            )

        # Empty plan on errors
        return []
|
|
|
|
async def _execute_extractions(
|
|
extraction_plan: List[Dict[str, Any]],
|
|
files: List[Dict[str, Any]],
|
|
messages: List[Dict[str, Any]],
|
|
lucydom_interface,
|
|
ai_service,
|
|
workflow_id: str = None,
|
|
add_log_func = None,
|
|
logging_utils = None
|
|
) -> List[Dict[str, Any]]:
|
|
"""
|
|
Execute the planned extractions.
|
|
|
|
Args:
|
|
extraction_plan: List of extraction instructions
|
|
files: List of all available files
|
|
lucydom_interface: Interface for database access
|
|
ai_service: Service for AI requests
|
|
workflow_id: Optional workflow ID for logging
|
|
add_log_func: Optional function for adding logs
|
|
logging_utils: Optional logging utility
|
|
|
|
Returns:
|
|
List with extracted data per file
|
|
"""
|
|
extracted_data = []
|
|
|
|
# Sort by importance
|
|
sorted_plan = sorted(extraction_plan, key=lambda x: x.get("importance", 0), reverse=True)
|
|
|
|
for extraction_item in sorted_plan:
|
|
file_id = extraction_item.get("file_id")
|
|
extract_needed = extraction_item.get("extract_needed", False)
|
|
extraction_prompt = extraction_item.get("extraction_prompt", "")
|
|
|
|
# Find file metadata
|
|
file_metadata = next((f for f in files if f.get("id") == file_id), None)
|
|
|
|
if not file_metadata:
|
|
logger.warning(f"File with ID {file_id} not found")
|
|
continue
|
|
|
|
file_name = file_metadata.get("name", "")
|
|
file_type = file_metadata.get("type", "")
|
|
content_type = file_metadata.get("content_type", "")
|
|
|
|
# Add log
|
|
if logging_utils:
|
|
logging_utils.info(f"Processing file: {file_name} (Extraction needed: {extract_needed})", "extraction")
|
|
elif add_log_func and workflow_id:
|
|
add_log_func(
|
|
workflow_id,
|
|
f"Processing file: {file_name} (Extraction needed: {extract_needed})",
|
|
"info"
|
|
)
|
|
|
|
# Only perform extraction if needed
|
|
if extract_needed:
|
|
# Get file content via LucyDOM interface
|
|
if lucydom_interface:
|
|
try:
|
|
file_content = await lucydom_interface.read_file_content(file_id)
|
|
|
|
if not file_content:
|
|
if logging_utils:
|
|
logging_utils.warning(f"File {file_name} not found", "extraction")
|
|
elif add_log_func and workflow_id:
|
|
add_log_func(workflow_id, f"File {file_name} not found", "warning")
|
|
continue
|
|
|
|
# Perform extraction based on file type
|
|
if file_type == "image" or file_name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
|
|
# Image analysis with AI service
|
|
if ai_service and hasattr(ai_service, "analyze_image"):
|
|
try:
|
|
image_analysis = await ai_service.analyze_image(
|
|
image_data=file_content,
|
|
prompt=extraction_prompt,
|
|
mime_type=content_type
|
|
)
|
|
|
|
extracted_data.append({
|
|
"file_id": file_id,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content": image_analysis,
|
|
"is_extracted": True,
|
|
"extraction_method": "image_analysis"
|
|
})
|
|
|
|
if logging_utils:
|
|
logging_utils.info(f"Image {file_name} successfully analyzed", "extraction")
|
|
elif add_log_func and workflow_id:
|
|
add_log_func(workflow_id, f"Image {file_name} successfully analyzed", "info")
|
|
except Exception as e:
|
|
logger.error(f"Error analyzing image {file_name}: {str(e)}")
|
|
if logging_utils:
|
|
logging_utils.error(f"Error analyzing image {file_name}: {str(e)}", "extraction")
|
|
elif add_log_func and workflow_id:
|
|
add_log_func(workflow_id, f"Error analyzing image {file_name}: {str(e)}", "error")
|
|
else:
|
|
# Fallback if no image analysis available
|
|
extracted_data.append({
|
|
"file_id": file_id,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content": f"Image: {file_name} (Analysis not available)",
|
|
"is_extracted": False,
|
|
"extraction_method": "none"
|
|
})
|
|
else:
|
|
# Text-based extraction for all other file types
|
|
try:
|
|
# Import directly here to avoid circular imports
|
|
from modules.agentservice_utils import extract_text_from_file_content
|
|
|
|
content, is_extracted = extract_text_from_file_content(
|
|
file_content, file_name, content_type
|
|
)
|
|
|
|
extracted_data.append({
|
|
"file_id": file_id,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content": content,
|
|
"is_extracted": is_extracted,
|
|
"extraction_method": "text_extraction"
|
|
})
|
|
|
|
if logging_utils:
|
|
logging_utils.info(f"File {file_name} extracted (Status: {is_extracted})", "extraction")
|
|
elif add_log_func and workflow_id:
|
|
add_log_func(
|
|
workflow_id,
|
|
f"File {file_name} extracted (Status: {is_extracted})",
|
|
"info"
|
|
)
|
|
except Exception as e:
|
|
logger.error(f"Error extracting text from {file_name}: {str(e)}")
|
|
if logging_utils:
|
|
logging_utils.error(f"Error extracting text from {file_name}: {str(e)}", "extraction")
|
|
elif add_log_func and workflow_id:
|
|
add_log_func(workflow_id, f"Error extracting text from {file_name}: {str(e)}", "error")
|
|
except Exception as e:
|
|
logger.error(f"Error reading file {file_name}: {str(e)}")
|
|
if logging_utils:
|
|
logging_utils.error(f"Error reading file {file_name}: {str(e)}", "extraction")
|
|
elif add_log_func and workflow_id:
|
|
add_log_func(workflow_id, f"Error reading file {file_name}: {str(e)}", "error")
|
|
else:
|
|
logger.warning(f"No LucyDOM interface available for file {file_name}")
|
|
if logging_utils:
|
|
logging_utils.warning(f"No LucyDOM interface available for file {file_name}", "extraction")
|
|
elif add_log_func and workflow_id:
|
|
add_log_func(workflow_id, f"No LucyDOM interface available for file {file_name}", "warning")
|
|
else:
|
|
# No extraction needed, use existing content
|
|
doc_contents = _extract_document_contents_from_messages(file_id, messages)
|
|
|
|
if doc_contents:
|
|
# Use first text content
|
|
for content in doc_contents:
|
|
if content.get("type") == "text":
|
|
extracted_data.append({
|
|
"file_id": file_id,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content": content.get("text", ""),
|
|
"is_extracted": content.get("is_extracted", False),
|
|
"extraction_method": "existing_content"
|
|
})
|
|
break
|
|
else:
|
|
# No existing content found
|
|
extracted_data.append({
|
|
"file_id": file_id,
|
|
"name": file_name,
|
|
"type": file_type,
|
|
"content": f"No content available for {file_name}",
|
|
"is_extracted": False,
|
|
"extraction_method": "none"
|
|
})
|
|
|
|
return extracted_data
|
|
|
|
def _structure_extracted_data(
|
|
extracted_data: List[Dict[str, Any]],
|
|
files: List[Dict[str, Any]],
|
|
prompt: str
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Structure the extracted data into a formatted result.
|
|
|
|
Args:
|
|
extracted_data: List of extracted data per file
|
|
files: List of all available files
|
|
prompt: Original extraction prompt
|
|
|
|
Returns:
|
|
Structured result object
|
|
"""
|
|
# Create base structure
|
|
result = {
|
|
"prompt": prompt,
|
|
"files_processed": len(extracted_data),
|
|
"total_files": len(files),
|
|
"extraction_timestamp": datetime.now().isoformat(),
|
|
"status": "success",
|
|
"extracted_content": []
|
|
}
|
|
|
|
# Add extracted content
|
|
for data_item in extracted_data:
|
|
# Enrich with file metadata
|
|
file_id = data_item.get("file_id", 0)
|
|
file_metadata = next((f for f in files if f.get("id") == file_id), {})
|
|
|
|
content_item = {
|
|
"file_id": file_id,
|
|
"name": data_item.get("name", file_metadata.get("name", "")),
|
|
"type": data_item.get("type", file_metadata.get("type", "")),
|
|
"content_type": file_metadata.get("content_type", ""),
|
|
"size": file_metadata.get("size", ""),
|
|
"is_extracted": data_item.get("is_extracted", False),
|
|
"extraction_method": data_item.get("extraction_method", ""),
|
|
"content": data_item.get("content", "")
|
|
}
|
|
|
|
result["extracted_content"].append(content_item)
|
|
|
|
return result
|
|
|
|
def _extract_document_contents_from_messages(file_id: int, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
"""
|
|
Extract document contents for a specific file from workflow messages.
|
|
Enhanced to handle the new document structure.
|
|
|
|
Args:
|
|
file_id: ID of the file
|
|
messages: List of all messages in the workflow
|
|
|
|
Returns:
|
|
List of document contents for the specified file
|
|
"""
|
|
contents = []
|
|
|
|
for message in messages:
|
|
# Search documents in the message
|
|
for document in message.get("documents", []):
|
|
source = document.get("source", {})
|
|
|
|
# Check if file ID matches (handle both string and int comparison)
|
|
if (source.get("id") == file_id or
|
|
(isinstance(source.get("id"), str) and source.get("id") == str(file_id)) or
|
|
(isinstance(file_id, str) and source.get("id") == int(file_id))):
|
|
|
|
# Add contents of the file
|
|
doc_contents = document.get("contents", [])
|
|
|
|
if doc_contents:
|
|
# Ensure each content has document reference
|
|
for content in doc_contents:
|
|
content_copy = content.copy()
|
|
content_copy["document_id"] = document.get("id")
|
|
content_copy["message_id"] = message.get("id")
|
|
contents.append(content_copy)
|
|
|
|
return contents
|
|
|
|
def _log(add_log_func, workflow_id, message, log_type, agent_id=None, agent_name=None):
    """
    Forward a log line to the module logger and, when both add_log_func and
    workflow_id are supplied, to the workflow log as well.
    """
    # Pick the logger method matching the level; anything unknown -> info
    emit = {"error": logger.error, "warning": logger.warning}.get(log_type, logger.info)
    emit(message)

    # Mirror into the workflow log when a sink is configured
    if add_log_func and workflow_id:
        add_log_func(workflow_id, message, log_type, agent_id, agent_name)