552 lines
No EOL
24 KiB
Python
552 lines
No EOL
24 KiB
Python
import re
|
|
import json
|
|
import logging
|
|
import time
|
|
from datetime import datetime, UTC
|
|
from typing import Dict, Any, List, Optional, Tuple, Union
|
|
from modules.datamodels.datamodelChat import ChatDocument
|
|
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class SubDocumentGeneration:
    """Document generation operations including single-file and multi-file generation."""

    def __init__(self, services, aiObjects, documentProcessor):
        """Wire up the collaborators this service delegates to.

        Args:
            services: Service center instance for accessing other services
            aiObjects: Initialized AiObjects instance
            documentProcessor: Document processing service instance
        """
        # Keep plain attribute references; no copies, no validation --
        # the service center owns the lifecycle of these collaborators.
        self.documentProcessor = documentProcessor
        self.aiObjects = aiObjects
        self.services = services
|
|
|
|
async def callAiWithDocumentGeneration(
|
|
self,
|
|
prompt: str,
|
|
documents: Optional[List[ChatDocument]],
|
|
options: AiCallOptions,
|
|
outputFormat: str,
|
|
title: Optional[str]
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Unified document generation method that handles both single and multi-file cases.
|
|
Always uses multi-file approach internally.
|
|
|
|
Args:
|
|
prompt: The main prompt for the AI call
|
|
documents: Optional list of documents to process
|
|
options: AI call configuration options
|
|
outputFormat: Target output format (html, pdf, docx, txt, md, json, csv, xlsx)
|
|
title: Optional title for generated documents
|
|
|
|
Returns:
|
|
Dict with generated documents and metadata in unified structure
|
|
"""
|
|
try:
|
|
# 1. Analyze prompt intent
|
|
promptAnalysis = await self._analyzePromptIntent(prompt, self)
|
|
logger.info(f"Prompt analysis result: {promptAnalysis}")
|
|
|
|
# 2. Get unified extraction prompt
|
|
from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
|
|
generationService = GenerationService(self.services)
|
|
|
|
extractionPrompt = await generationService.getAdaptiveExtractionPrompt(
|
|
outputFormat=outputFormat,
|
|
userPrompt=prompt,
|
|
title=title,
|
|
promptAnalysis=promptAnalysis,
|
|
aiService=self
|
|
)
|
|
|
|
# 3. Process with unified pipeline (always multi-file approach)
|
|
aiResponse = await self._processDocumentsUnified(
|
|
documents, extractionPrompt, options
|
|
)
|
|
|
|
# 4. Return unified result structure
|
|
return await self._buildUnifiedResult(aiResponse, outputFormat, title, promptAnalysis)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error in unified document generation: {str(e)}")
|
|
return self._buildErrorResult(str(e), outputFormat, title)
|
|
|
|
    async def _processDocumentsUnified(
        self,
        documents: Optional[List[ChatDocument]],
        extractionPrompt: str,
        options: AiCallOptions
    ) -> Dict[str, Any]:
        """
        Unified document processing that handles both single and multi-file cases.
        Always processes as multi-file structure internally.

        Wraps the continuation-based processing pipeline with progress-log
        bookkeeping and debug-file dumps of both the prompt and the response.

        Args:
            documents: Source documents to process; may be None or empty.
            extractionPrompt: Fully-built extraction prompt for the AI call.
            options: AI call configuration options.

        Returns:
            The validated AI response dict (possibly normalized in place by
            _validateUnifiedResponseStructure).

        Raises:
            Exception: when the AI response fails structure validation, or on
            any underlying service/AI error; the progress entry is marked
            failed before re-raising.
        """

        # Init progress logger; operationId embeds the workflow id plus epoch
        # seconds so concurrent/repeated runs get distinct progress entries.
        workflow = self.services.currentWorkflow
        operationId = f"docGenUnified_{workflow.id}_{int(time.time())}"

        try:
            # Start progress tracking
            self.services.workflow.progressLogStart(
                operationId,
                "Generate",
                "Unified Document Generation",
                f"Processing {len(documents) if documents else 0} documents"
            )

            # Update progress - generating extraction prompt
            self.services.workflow.progressLogUpdate(operationId, 0.1, "Generating prompt")

            # Write prompt to debug file
            self.services.utils.writeDebugFile(extractionPrompt, "extraction_prompt", documents)

            # Process with unified JSON pipeline using continuation logic
            aiResponse = await self.documentProcessor.processDocumentsWithContinuation(
                documents, extractionPrompt, options
            )

            # Update progress - AI processing completed
            self.services.workflow.progressLogUpdate(operationId, 0.6, "Processing done")

            # Write AI response to debug file (pretty-printed JSON when it is a dict)
            response_json = json.dumps(aiResponse, indent=2, ensure_ascii=False) if isinstance(aiResponse, dict) else str(aiResponse)
            self.services.utils.writeDebugFile(response_json, "ai_response", documents)

            # Validate response structure; NOTE: this call may also convert a
            # single-file ("sections") payload into the multi-file shape in place.
            if not self._validateUnifiedResponseStructure(aiResponse):
                raise Exception("AI response is not valid unified document structure")

            # Emit raw extracted data as a chat message attachment (best effort;
            # a failure here must not abort the generation run)
            try:
                await self._postRawDataChatMessage(aiResponse, label="raw_extraction_unified")
            except Exception:
                logger.warning("Failed to emit raw extraction chat message (unified)")

            # Complete progress tracking
            self.services.workflow.progressLogFinish(operationId, True)

            return aiResponse

        except Exception as e:
            logger.error(f"Error in unified document processing: {str(e)}")
            # Mark the progress entry failed before propagating to the caller
            self.services.workflow.progressLogFinish(operationId, False)
            raise
|
|
|
|
def _validateUnifiedResponseStructure(self, response: Dict[str, Any]) -> bool:
|
|
"""
|
|
Unified validation that checks for document structure.
|
|
Handles both multi-file (documents array) and single-file (sections array) structures.
|
|
"""
|
|
try:
|
|
if not isinstance(response, dict):
|
|
logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}")
|
|
return False
|
|
|
|
# Check for documents array (multi-file structure)
|
|
hasDocuments = "documents" in response
|
|
isDocumentsList = isinstance(response.get("documents"), list)
|
|
|
|
# Check for sections array (single-file structure)
|
|
hasSections = "sections" in response
|
|
isSectionsList = isinstance(response.get("sections"), list)
|
|
|
|
if hasDocuments and isDocumentsList:
|
|
# Multi-file structure
|
|
documents = response.get("documents", [])
|
|
if not documents:
|
|
logger.warning("Unified validation failed: documents array is empty")
|
|
return False
|
|
|
|
# Validate each document individually
|
|
validDocuments = 0
|
|
for i, doc in enumerate(documents):
|
|
if self._validateDocumentStructure(doc, i):
|
|
validDocuments += 1
|
|
else:
|
|
logger.warning(f"Document {i} failed validation, but continuing with others")
|
|
|
|
# Process succeeds if at least one document is valid
|
|
if validDocuments == 0:
|
|
logger.error("Unified validation failed: no valid documents found")
|
|
return False
|
|
|
|
logger.info(f"Unified validation passed: {validDocuments}/{len(documents)} documents valid")
|
|
return True
|
|
|
|
elif hasSections and isSectionsList:
|
|
# Single-file structure - convert to multi-file format
|
|
logger.info("Converting single-file structure to multi-file format")
|
|
sections = response.get("sections", [])
|
|
if not sections:
|
|
logger.warning("Unified validation failed: sections array is empty")
|
|
return False
|
|
|
|
# Convert to documents array format
|
|
response["documents"] = [{
|
|
"id": "document_1",
|
|
"title": response.get("metadata", {}).get("title", "Generated Document"),
|
|
"filename": "document_1",
|
|
"sections": sections
|
|
}]
|
|
|
|
logger.info("Successfully converted single-file structure to multi-file format")
|
|
return True
|
|
|
|
else:
|
|
# No valid structure found - fail with clear error details
|
|
logger.error("Unified validation failed: No valid structure found")
|
|
logger.error(f"Response type: {type(response)}")
|
|
logger.error(f"Available keys: {list(response.keys()) if isinstance(response, dict) else 'Not a dict'}")
|
|
logger.error(f"hasDocuments={hasDocuments}, isDocumentsList={isDocumentsList}")
|
|
logger.error(f"hasSections={hasSections}, isSectionsList={isSectionsList}")
|
|
logger.error(f"Full response: {response}")
|
|
return False
|
|
|
|
except Exception as e:
|
|
logger.warning(f"Unified response validation failed with exception: {str(e)}")
|
|
return False
|
|
|
|
def _validateDocumentStructure(self, document: Dict[str, Any], documentIndex: int) -> bool:
|
|
"""
|
|
Validate individual document structure.
|
|
Returns True if document is valid, False otherwise.
|
|
Does not fail the entire process if one document is invalid.
|
|
"""
|
|
try:
|
|
if not isinstance(document, dict):
|
|
logger.error(f"Document {documentIndex} validation failed: not a dict, got {type(document)}")
|
|
logger.error(f"Document {documentIndex} content: {document}")
|
|
return False
|
|
|
|
# Check for required fields
|
|
hasTitle = "title" in document
|
|
hasSections = "sections" in document
|
|
isSectionsList = isinstance(document.get("sections"), list)
|
|
|
|
logger.debug(f"Document {documentIndex} structure check:")
|
|
logger.debug(f" - hasTitle: {hasTitle}")
|
|
logger.debug(f" - hasSections: {hasSections}")
|
|
logger.debug(f" - isSectionsList: {isSectionsList}")
|
|
logger.debug(f" - available keys: {list(document.keys())}")
|
|
|
|
if not (hasTitle and hasSections and isSectionsList):
|
|
logger.error(f"Document {documentIndex} validation failed:")
|
|
logger.error(f" - title present: {hasTitle}")
|
|
logger.error(f" - sections present: {hasSections}")
|
|
logger.error(f" - sections is list: {isSectionsList}")
|
|
logger.error(f" - document content: {document}")
|
|
return False
|
|
|
|
sections = document.get("sections", [])
|
|
if not sections:
|
|
logger.error(f"Document {documentIndex} validation failed: sections array is empty")
|
|
logger.error(f" - document content: {document}")
|
|
return False
|
|
|
|
logger.info(f"Document {documentIndex} validation passed")
|
|
return True
|
|
|
|
except Exception as e:
|
|
logger.error(f"Document {documentIndex} validation failed with exception: {str(e)}")
|
|
logger.error(f" - document content: {document}")
|
|
return False
|
|
|
|
async def _buildUnifiedResult(
|
|
self,
|
|
aiResponse: Dict[str, Any],
|
|
outputFormat: str,
|
|
title: str,
|
|
promptAnalysis: Dict[str, Any]
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Build unified result structure that always returns array-based format.
|
|
Content is always a multi-document structure.
|
|
"""
|
|
try:
|
|
# Process all documents uniformly
|
|
generatedDocuments = []
|
|
documents = aiResponse.get("documents", [])
|
|
|
|
for i, docData in enumerate(documents):
|
|
try:
|
|
processedDocument = await self._processDocument(
|
|
docData, outputFormat, title, i
|
|
)
|
|
generatedDocuments.append(processedDocument)
|
|
except Exception as e:
|
|
logger.warning(f"Failed to process document {i}: {str(e)}, skipping")
|
|
continue
|
|
|
|
if not generatedDocuments:
|
|
raise Exception("No documents could be processed successfully")
|
|
|
|
# Build unified result
|
|
result = {
|
|
"success": True,
|
|
"content": aiResponse, # Always multi-document structure
|
|
"documents": generatedDocuments, # Always array
|
|
"is_multi_file": len(generatedDocuments) > 1,
|
|
"format": outputFormat,
|
|
"title": title,
|
|
"split_strategy": promptAnalysis.get("strategy", "single"),
|
|
"total_documents": len(generatedDocuments),
|
|
"processed_documents": len(generatedDocuments)
|
|
}
|
|
|
|
return result
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error building unified result: {str(e)}")
|
|
return self._buildErrorResult(str(e), outputFormat, title)
|
|
|
|
    async def _processDocument(
        self,
        docData: Dict[str, Any],
        outputFormat: str,
        title: str,
        documentIndex: int
    ) -> Dict[str, Any]:
        """
        Process individual document with content enhancement and rendering.

        Pipeline: (1) optionally ask the AI to enhance the extracted JSON,
        falling back to the original docData on any enhancement failure;
        (2) render the JSON into the target format; (3) derive a filename
        with the proper extension.

        Args:
            docData: Extracted document dict; reads optional "sections",
                "title" and "filename" keys.
            outputFormat: Target format identifier, e.g. "html", "pdf", "docx".
            title: Fallback title; also forwarded as the user prompt to the
                generation service.
            documentIndex: Zero-based position, used for logging and default
                filenames.

        Returns:
            Dict with documentName, documentData (rendered output), mimeType,
            title and documentIndex.

        Raises:
            Exception: re-raised when rendering (or any non-enhancement step)
            fails; enhancement failures are swallowed and logged instead.
        """
        try:
            # Get generation service (local import -- presumably avoids a
            # circular import at module load; verify)
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generationService = GenerationService(self.services)

            # Use AI generation to enhance the extracted JSON before rendering
            enhancedContent = docData  # Default to original
            if docData.get("sections"):
                try:
                    # Get generation prompt
                    generationPrompt = await generationService.getGenerationPrompt(
                        outputFormat=outputFormat,
                        userPrompt=title,
                        title=docData.get("title", title),
                        aiService=self
                    )

                    # Prepare the AI call
                    from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
                    requestOptions = AiCallOptions()
                    requestOptions.operationType = OperationTypeEnum.GENERAL

                    # Create context with the extracted JSON content
                    context = f"Extracted JSON content:\n{json.dumps(docData, indent=2)}"

                    request = AiCallRequest(
                        prompt=generationPrompt,
                        context=context,
                        options=requestOptions
                    )

                    # Call AI to enhance the content
                    response = await self.aiObjects.call(request)

                    if response and response.content:
                        # Parse the AI response as JSON
                        try:
                            result = response.content.strip()

                            # Extract JSON from markdown if present: first a
                            # complete ```json fenced block, then an
                            # unterminated ```json prefix, then bare ``` fences
                            jsonMatch = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
                            if jsonMatch:
                                result = jsonMatch.group(1).strip()
                            elif result.startswith('```json'):
                                result = re.sub(r'^```json\s*', '', result)
                                result = re.sub(r'\s*```$', '', result)
                            elif result.startswith('```'):
                                result = re.sub(r'^```\s*', '', result)
                                result = re.sub(r'\s*```$', '', result)

                            # Try to parse JSON
                            enhancedContent = json.loads(result)
                            logger.info(f"AI enhanced JSON content successfully for document {documentIndex}")

                        except json.JSONDecodeError as e:
                            logger.warning(f"AI generation returned invalid JSON for document {documentIndex}: {str(e)}, using original content")
                            enhancedContent = docData
                    else:
                        logger.warning(f"AI generation returned empty response for document {documentIndex}, using original content")
                        enhancedContent = docData

                except Exception as e:
                    # Enhancement is best-effort: fall back to the raw extraction
                    logger.warning(f"AI generation failed for document {documentIndex}: {str(e)}, using original content")
                    enhancedContent = docData

            # Render the enhanced JSON content
            renderedContent, mimeType = await generationService.renderReport(
                extractedContent=enhancedContent,
                outputFormat=outputFormat,
                title=docData.get("title", title),
                userPrompt=title,
                aiService=self
            )

            # Generate proper filename: strip any existing extension first
            baseFilename = docData.get("filename", f"document_{documentIndex + 1}")
            if '.' in baseFilename:
                baseFilename = baseFilename.rsplit('.', 1)[0]

            # Add proper extension based on output format.
            # NOTE(review): the else branch keeps outputFormat's original
            # casing, so e.g. "TXT" yields ".TXT" while "DOCX" yields ".docx"
            # -- confirm this asymmetry is intended.
            if outputFormat.lower() == "docx":
                filename = f"{baseFilename}.docx"
            elif outputFormat.lower() == "pdf":
                filename = f"{baseFilename}.pdf"
            elif outputFormat.lower() == "html":
                filename = f"{baseFilename}.html"
            else:
                filename = f"{baseFilename}.{outputFormat}"

            return {
                "documentName": filename,
                "documentData": renderedContent,
                "mimeType": mimeType,
                "title": docData.get("title", title),
                "documentIndex": documentIndex
            }

        except Exception as e:
            logger.error(f"Error processing document {documentIndex}: {str(e)}")
            raise
|
|
|
|
def _buildErrorResult(self, errorMessage: str, outputFormat: str, title: str) -> Dict[str, Any]:
|
|
"""
|
|
Build error result with unified structure.
|
|
"""
|
|
return {
|
|
"success": False,
|
|
"error": errorMessage,
|
|
"content": {},
|
|
"documents": [],
|
|
"is_multi_file": False,
|
|
"format": outputFormat,
|
|
"title": title,
|
|
"split_strategy": "error",
|
|
"total_documents": 0,
|
|
"processed_documents": 0
|
|
}
|
|
|
|
async def _callAiJson(
|
|
self,
|
|
prompt: str,
|
|
documents: Optional[List[ChatDocument]],
|
|
options: AiCallOptions
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Handle AI calls with document processing for JSON output.
|
|
Returns structured JSON document instead of text.
|
|
"""
|
|
# Process documents with JSON merging
|
|
return await self.documentProcessor.processDocumentsPerChunkJson(documents, prompt, options)
|
|
|
|
    async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]:
        """Use AI to analyze user prompt and determine processing requirements.

        Args:
            prompt: Raw user request; sanitized via services.ai before being
                embedded in the analysis prompt.
            ai_service: Object exposing ``aiObjects.call``; callers pass the
                owning service (``self``). When falsy, analysis is skipped.
                NOTE(review): sanitization still goes through ``self.services``
                rather than ``ai_service`` -- confirm that is intentional.

        Returns:
            Dict with at least is_multi_file, strategy and criteria. On
            success the model's JSON is returned as-is (which also carries
            file_naming_pattern and reasoning); any failure falls back to the
            single-file default.
        """
        if not ai_service:
            # No AI available: assume the simplest (single-file) case
            return {"is_multi_file": False, "strategy": "single", "criteria": None}

        try:
            analysis_prompt = f"""
Analyze this user request and determine if it requires multiple file output or single file output.

User request: "{self.services.ai.sanitizePromptContent(prompt, 'userinput')}"

Respond with JSON only in this exact format:
{{
"is_multi_file": true/false,
"strategy": "single|per_entity|by_section|by_criteria|custom",
"criteria": "description of how to split content",
"file_naming_pattern": "suggested pattern for filenames",
"reasoning": "brief explanation of the analysis"
}}

Consider:
- Does the user want separate files for different entities (customers, products, etc.)?
- Does the user want to split content into multiple documents?
- What would be the most logical way to organize the content?
- What language is the request in? (analyze in the original language)

Return only the JSON response.
"""

            # Local import -- presumably avoids a circular import; verify
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationTypeEnum
            request_options = AiCallOptions()
            request_options.operationType = OperationTypeEnum.GENERAL

            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await ai_service.aiObjects.call(request)

            if response and response.content:
                # Extract JSON from response: grab the outermost {...} span so
                # surrounding prose or markdown fences are discarded
                result = response.content.strip()
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)

                # A parse failure here is caught by the outer except below
                analysis = json.loads(result)
                return analysis
            else:
                return {"is_multi_file": False, "strategy": "single", "criteria": None}

        except Exception as e:
            logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file")
            return {"is_multi_file": False, "strategy": "single", "criteria": None}
|
|
|
|
async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None:
|
|
"""
|
|
Create a ChatMessage with the extracted raw JSON attached as a file so the user
|
|
has access to the data even if downstream processing fails.
|
|
"""
|
|
try:
|
|
services = self.services
|
|
workflow = services.currentWorkflow
|
|
|
|
# Serialize payload
|
|
ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
|
|
content_text = json.dumps(payload, ensure_ascii=False, indent=2)
|
|
content_bytes = content_text.encode('utf-8')
|
|
|
|
# Store as file via component storage
|
|
file_name = f"{label}_{ts}.json"
|
|
file_item = services.interfaceDbComponent.createFile(
|
|
name=file_name,
|
|
mimeType="application/json",
|
|
content=content_bytes
|
|
)
|
|
services.interfaceDbComponent.createFileData(file_item.id, content_bytes)
|
|
|
|
# Lookup file info for ChatDocument
|
|
file_info = services.workflow.getFileInfo(file_item.id)
|
|
doc = ChatDocument(
|
|
messageId="", # set after message creation
|
|
fileId=file_item.id,
|
|
fileName=file_info.get("fileName", file_name) if file_info else file_name,
|
|
fileSize=file_info.get("size", len(content_bytes)) if file_info else len(content_bytes),
|
|
mimeType=file_info.get("mimeType", "application/json") if file_info else "application/json"
|
|
)
|
|
|
|
# Create message referencing the file - include document in initial call
|
|
messageData = {
|
|
"workflowId": workflow.id,
|
|
"role": "assistant",
|
|
"message": "Raw extraction data saved",
|
|
"status": "data",
|
|
"sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1,
|
|
"publishedAt": services.utils.timestampGetUtc(),
|
|
"documentsLabel": label,
|
|
"documents": []
|
|
}
|
|
|
|
# Store message with document included from the start
|
|
services.workflow.storeMessageWithDocuments(services.workflow.workflow, messageData, [doc])
|
|
except Exception:
|
|
# Non-fatal; ignore if storage or chat creation fails
|
|
return |