# gateway/modules/services/serviceAi/subDocumentGeneration.py

import re
import json
import logging
import time
from datetime import datetime, UTC
from typing import Dict, Any, List, Optional
from modules.datamodels.datamodelChat import ChatDocument
from modules.datamodels.datamodelAi import AiCallOptions, OperationTypeEnum
logger = logging.getLogger(__name__)


class SubDocumentGeneration:
    """Document generation operations including single-file and multi-file generation."""

    def __init__(self, services, aiObjects, documentProcessor):
        """Initialize document generation service.

        Args:
            services: Service center instance for accessing other services
            aiObjects: Initialized AiObjects instance
            documentProcessor: Document processing service instance
        """
        self.services = services
        self.aiObjects = aiObjects
        self.documentProcessor = documentProcessor

    async def callAiWithDocumentGeneration(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions,
        outputFormat: str,
        title: Optional[str]
    ) -> Dict[str, Any]:
        """
        Unified document generation method that handles both single and multi-file cases.
        Always uses the multi-file approach internally.

        Args:
            prompt: The main prompt for the AI call
            documents: Optional list of documents to process
            options: AI call configuration options
            outputFormat: Target output format (html, pdf, docx, txt, md, json, csv, xlsx)
            title: Optional title for generated documents

        Returns:
            Dict with generated documents and metadata in unified structure
        """
        try:
            # 1. Analyze prompt intent
            promptAnalysis = await self._analyzePromptIntent(prompt, self)
            logger.info(f"Prompt analysis result: {promptAnalysis}")

            # 2. Get unified extraction prompt
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generationService = GenerationService(self.services)
            extractionPrompt = await generationService.getAdaptiveExtractionPrompt(
                outputFormat=outputFormat,
                userPrompt=prompt,
                title=title,
                promptAnalysis=promptAnalysis,
                aiService=self
            )

            # 3. Process with unified pipeline (always multi-file approach)
            aiResponse = await self._processDocumentsUnified(
                documents, extractionPrompt, options
            )

            # 4. Return unified result structure
            return await self._buildUnifiedResult(aiResponse, outputFormat, title, promptAnalysis)
        except Exception as e:
            logger.error(f"Error in unified document generation: {str(e)}")
            return self._buildErrorResult(str(e), outputFormat, title)
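
    # For orientation (illustrative, mirroring _buildUnifiedResult and
    # _processDocument below), a successful call returns a structure shaped like:
    #   {
    #       "success": True,
    #       "content": {...},            # raw unified AI response
    #       "documents": [               # always a list, one entry per rendered file
    #           {"documentName": "report.pdf", "documentData": b"...",
    #            "mimeType": "application/pdf", "title": "Report", "documentIndex": 0}
    #       ],
    #       "is_multi_file": False,
    #       "format": "pdf",
    #       "title": "Report",
    #       "split_strategy": "single",
    #       "total_documents": 1,
    #       "processed_documents": 1
    #   }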

    async def _processDocumentsUnified(
        self,
        documents: Optional[List[ChatDocument]],
        extractionPrompt: str,
        options: AiCallOptions
    ) -> Dict[str, Any]:
        """
        Unified document processing that handles both single and multi-file cases.
        Always processes as multi-file structure internally.
        """
        # Init progress logger
        workflow = self.services.currentWorkflow
        operationId = f"docGenUnified_{workflow.id}_{int(time.time())}"
        try:
            # Start progress tracking
            self.services.workflow.progressLogStart(
                operationId,
                "Generate",
                "Unified Document Generation",
                f"Processing {len(documents) if documents else 0} documents"
            )

            # Update progress - generating extraction prompt
            self.services.workflow.progressLogUpdate(operationId, 0.1, "Generating prompt")

            # Write prompt to debug file
            self.services.utils.writeDebugFile(extractionPrompt, "extraction_prompt", documents)

            # Process with unified JSON pipeline using continuation logic
            aiResponse = await self.documentProcessor.processDocumentsWithContinuation(
                documents, extractionPrompt, options
            )

            # Update progress - AI processing completed
            self.services.workflow.progressLogUpdate(operationId, 0.6, "Processing done")

            # Write AI response to debug file
            responseJson = json.dumps(aiResponse, indent=2, ensure_ascii=False) if isinstance(aiResponse, dict) else str(aiResponse)
            self.services.utils.writeDebugFile(responseJson, "ai_response", documents)

            # Validate response structure
            if not self._validateUnifiedResponseStructure(aiResponse):
                raise Exception("AI response is not a valid unified document structure")

            # Emit raw extracted data as a chat message attachment
            try:
                await self._postRawDataChatMessage(aiResponse, label="raw_extraction_unified")
            except Exception:
                logger.warning("Failed to emit raw extraction chat message (unified)")

            # Complete progress tracking
            self.services.workflow.progressLogFinish(operationId, True)
            return aiResponse
        except Exception as e:
            logger.error(f"Error in unified document processing: {str(e)}")
            self.services.workflow.progressLogFinish(operationId, False)
            raise
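
    # Note on the progress-log lifecycle assumed above (inferred from how the
    # services.workflow API is used in this module, not from its definition):
    # progressLogStart opens an operation keyed by operationId, progressLogUpdate
    # reports a completion fraction in [0, 1], and progressLogFinish(operationId,
    # success) must close the operation on both the success and the error path.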

    def _validateUnifiedResponseStructure(self, response: Dict[str, Any]) -> bool:
        """
        Unified validation that checks for document structure.
        Handles both multi-file (documents array) and single-file (sections array)
        structures; a single-file response is normalized in place into the
        multi-file format (a one-element "documents" array).
        """
        try:
            if not isinstance(response, dict):
                logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}")
                return False

            # Check for documents array (multi-file structure)
            hasDocuments = "documents" in response
            isDocumentsList = isinstance(response.get("documents"), list)

            # Check for sections array (single-file structure)
            hasSections = "sections" in response
            isSectionsList = isinstance(response.get("sections"), list)

            if hasDocuments and isDocumentsList:
                # Multi-file structure
                documents = response.get("documents", [])
                if not documents:
                    logger.warning("Unified validation failed: documents array is empty")
                    return False

                # Validate each document individually
                validDocuments = 0
                for i, doc in enumerate(documents):
                    if self._validateDocumentStructure(doc, i):
                        validDocuments += 1
                    else:
                        logger.warning(f"Document {i} failed validation, but continuing with others")

                # Validation succeeds if at least one document is valid
                if validDocuments == 0:
                    logger.error("Unified validation failed: no valid documents found")
                    return False
                logger.info(f"Unified validation passed: {validDocuments}/{len(documents)} documents valid")
                return True
            elif hasSections and isSectionsList:
                # Single-file structure - convert to multi-file format
                logger.info("Converting single-file structure to multi-file format")
                sections = response.get("sections", [])
                if not sections:
                    logger.warning("Unified validation failed: sections array is empty")
                    return False

                # Convert to documents array format
                response["documents"] = [{
                    "id": "document_1",
                    "title": response.get("metadata", {}).get("title", "Generated Document"),
                    "filename": "document_1",
                    "sections": sections
                }]
                logger.info("Successfully converted single-file structure to multi-file format")
                return True
            else:
                # No valid structure found - fail with clear error details
                logger.error("Unified validation failed: no valid structure found")
                logger.error(f"Response type: {type(response)}")
                logger.error(f"Available keys: {list(response.keys())}")
                logger.error(f"hasDocuments={hasDocuments}, isDocumentsList={isDocumentsList}")
                logger.error(f"hasSections={hasSections}, isSectionsList={isSectionsList}")
                logger.error(f"Full response: {response}")
                return False
        except Exception as e:
            logger.warning(f"Unified response validation failed with exception: {str(e)}")
            return False
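
    # Examples (illustrative) of payloads the validator above accepts:
    #
    #   Multi-file, passed through unchanged:
    #       {"documents": [{"title": "Q1", "sections": [{"heading": "Revenue"}]},
    #                      {"title": "Q2", "sections": [{"heading": "Revenue"}]}]}
    #
    #   Single-file, normalized in place into the multi-file shape:
    #       {"metadata": {"title": "Annual Report"}, "sections": [{"heading": "Intro"}]}
    #   becomes
    #       {"metadata": {...}, "sections": [...],
    #        "documents": [{"id": "document_1", "title": "Annual Report",
    #                       "filename": "document_1", "sections": [...]}]}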

    def _validateDocumentStructure(self, document: Dict[str, Any], documentIndex: int) -> bool:
        """
        Validate individual document structure.
        Returns True if document is valid, False otherwise.
        Does not fail the entire process if one document is invalid.
        """
        try:
            if not isinstance(document, dict):
                logger.error(f"Document {documentIndex} validation failed: not a dict, got {type(document)}")
                logger.error(f"Document {documentIndex} content: {document}")
                return False

            # Check for required fields
            hasTitle = "title" in document
            hasSections = "sections" in document
            isSectionsList = isinstance(document.get("sections"), list)

            logger.debug(f"Document {documentIndex} structure check:")
            logger.debug(f" - hasTitle: {hasTitle}")
            logger.debug(f" - hasSections: {hasSections}")
            logger.debug(f" - isSectionsList: {isSectionsList}")
            logger.debug(f" - available keys: {list(document.keys())}")

            if not (hasTitle and hasSections and isSectionsList):
                logger.error(f"Document {documentIndex} validation failed:")
                logger.error(f" - title present: {hasTitle}")
                logger.error(f" - sections present: {hasSections}")
                logger.error(f" - sections is list: {isSectionsList}")
                logger.error(f" - document content: {document}")
                return False

            sections = document.get("sections", [])
            if not sections:
                logger.error(f"Document {documentIndex} validation failed: sections array is empty")
                logger.error(f" - document content: {document}")
                return False

            logger.info(f"Document {documentIndex} validation passed")
            return True
        except Exception as e:
            logger.error(f"Document {documentIndex} validation failed with exception: {str(e)}")
            logger.error(f" - document content: {document}")
            return False
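
    # The minimal document this check accepts (illustrative): a dict with a
    # "title" key and a non-empty "sections" list; everything else is optional.
    #   {"title": "Summary", "sections": [{"heading": "Overview", "content": "..."}]}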

    async def _buildUnifiedResult(
        self,
        aiResponse: Dict[str, Any],
        outputFormat: str,
        title: Optional[str],
        promptAnalysis: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Build unified result structure that always returns an array-based format.
        Content is always a multi-document structure.
        """
        try:
            # Process all documents uniformly
            generatedDocuments = []
            documents = aiResponse.get("documents", [])
            for i, docData in enumerate(documents):
                try:
                    processedDocument = await self._processDocument(
                        docData, outputFormat, title, i
                    )
                    generatedDocuments.append(processedDocument)
                except Exception as e:
                    logger.warning(f"Failed to process document {i}: {str(e)}, skipping")
                    continue

            if not generatedDocuments:
                raise Exception("No documents could be processed successfully")

            # Build unified result
            result = {
                "success": True,
                "content": aiResponse,  # Always multi-document structure
                "documents": generatedDocuments,  # Always array
                "is_multi_file": len(generatedDocuments) > 1,
                "format": outputFormat,
                "title": title,
                "split_strategy": promptAnalysis.get("strategy", "single"),
                "total_documents": len(generatedDocuments),
                "processed_documents": len(generatedDocuments)
            }
            return result
        except Exception as e:
            logger.error(f"Error building unified result: {str(e)}")
            return self._buildErrorResult(str(e), outputFormat, title)

    async def _processDocument(
        self,
        docData: Dict[str, Any],
        outputFormat: str,
        title: Optional[str],
        documentIndex: int
    ) -> Dict[str, Any]:
        """
        Process individual document with content enhancement and rendering.
        """
        try:
            # Get generation service
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generationService = GenerationService(self.services)

            # Use AI generation to enhance the extracted JSON before rendering
            enhancedContent = docData  # Default to original
            if docData.get("sections"):
                try:
                    # Get generation prompt
                    generationPrompt = await generationService.getGenerationPrompt(
                        outputFormat=outputFormat,
                        userPrompt=title,
                        title=docData.get("title", title),
                        aiService=self
                    )

                    # Prepare the AI call (AiCallOptions and OperationTypeEnum are
                    # already imported at module level)
                    from modules.datamodels.datamodelAi import AiCallRequest
                    requestOptions = AiCallOptions()
                    requestOptions.operationType = OperationTypeEnum.DATA_GENERATE

                    # Create context with the extracted JSON content
                    context = f"Extracted JSON content:\n{json.dumps(docData, indent=2)}"
                    request = AiCallRequest(
                        prompt=generationPrompt,
                        context=context,
                        options=requestOptions
                    )

                    # Write document generation prompt to debug file
                    self.services.utils.writeDebugFile(generationPrompt, "document_generation_enhancement_prompt")

                    # Call AI to enhance the content; guard against a None response
                    response = await self.aiObjects.call(request)
                    responseText = response.content if (response and response.content) else ''

                    # Write document generation response to debug file
                    self.services.utils.writeDebugFile(responseText, "document_generation_enhancement_response")

                    if responseText:
                        # Parse the AI response as JSON
                        try:
                            result = responseText.strip()
                            # Extract JSON from a markdown code fence if present
                            jsonMatch = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
                            if jsonMatch:
                                result = jsonMatch.group(1).strip()
                            elif result.startswith('```json'):
                                result = re.sub(r'^```json\s*', '', result)
                                result = re.sub(r'\s*```$', '', result)
                            elif result.startswith('```'):
                                result = re.sub(r'^```\s*', '', result)
                                result = re.sub(r'\s*```$', '', result)
                            # Try to parse JSON
                            enhancedContent = json.loads(result)
                            logger.info(f"AI enhanced JSON content successfully for document {documentIndex}")
                        except json.JSONDecodeError as e:
                            logger.warning(f"AI generation returned invalid JSON for document {documentIndex}: {str(e)}, using original content")
                            enhancedContent = docData
                    else:
                        logger.warning(f"AI generation returned empty response for document {documentIndex}, using original content")
                        enhancedContent = docData
                except Exception as e:
                    logger.warning(f"AI generation failed for document {documentIndex}: {str(e)}, using original content")
                    enhancedContent = docData

            # Render the enhanced JSON content
            renderedContent, mimeType = await generationService.renderReport(
                extractedContent=enhancedContent,
                outputFormat=outputFormat,
                title=docData.get("title", title),
                userPrompt=title,
                aiService=self
            )

            # Generate a proper filename: strip any existing extension, then
            # append the extension matching the output format
            baseFilename = docData.get("filename", f"document_{documentIndex + 1}")
            if '.' in baseFilename:
                baseFilename = baseFilename.rsplit('.', 1)[0]
            filename = f"{baseFilename}.{outputFormat.lower()}"
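
            # For example (illustrative): docData["filename"] == "summary.v1.txt"
            # with outputFormat == "pdf" yields "summary.v1.pdf"; a missing
            # filename for the third document yields "document_3.pdf".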
            return {
                "documentName": filename,
                "documentData": renderedContent,
                "mimeType": mimeType,
                "title": docData.get("title", title),
                "documentIndex": documentIndex
            }
        except Exception as e:
            logger.error(f"Error processing document {documentIndex}: {str(e)}")
            raise
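
    # The fence-stripping above handles responses like (illustrative):
    #   '```json\n{"title": "Report", "sections": []}\n```'
    # which the regex reduces to '{"title": "Report", "sections": []}' before
    # json.loads; bare and unlabeled ``` fences are handled by the fallbacks.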

    def _buildErrorResult(self, errorMessage: str, outputFormat: str, title: Optional[str]) -> Dict[str, Any]:
        """
        Build an error result with the unified structure.
        """
        return {
            "success": False,
            "error": errorMessage,
            "content": {},
            "documents": [],
            "is_multi_file": False,
            "format": outputFormat,
            "title": title,
            "split_strategy": "error",
            "total_documents": 0,
            "processed_documents": 0
        }

    async def _callAiJson(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions
    ) -> Dict[str, Any]:
        """
        Handle AI calls with document processing for JSON output.
        Returns a structured JSON document instead of text.
        """
        # Process documents with JSON merging
        return await self.documentProcessor.processDocumentsPerChunkJson(documents, prompt, options)

    async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]:
        """Use AI to analyze the user prompt and determine processing requirements."""
        if not ai_service:
            return {"is_multi_file": False, "strategy": "single", "criteria": None}
        try:
            analysisPrompt = f"""
Analyze this user request and determine if it requires multiple file output or single file output.

User request: "{self.services.ai.sanitizePromptContent(prompt, 'userinput')}"

Respond with JSON only in this exact format:
{{
    "is_multi_file": true/false,
    "strategy": "single|per_entity|by_section|by_criteria|custom",
    "criteria": "description of how to split content",
    "file_naming_pattern": "suggested pattern for filenames",
    "reasoning": "brief explanation of the analysis"
}}

Consider:
- Does the user want separate files for different entities (customers, products, etc.)?
- Does the user want to split content into multiple documents?
- What would be the most logical way to organize the content?
- What language is the request in? (analyze in the original language)

Return only the JSON response.
"""
            # AiCallOptions and OperationTypeEnum are already imported at module level
            from modules.datamodels.datamodelAi import AiCallRequest
            requestOptions = AiCallOptions()
            requestOptions.operationType = OperationTypeEnum.DATA_GENERATE
            request = AiCallRequest(prompt=analysisPrompt, context="", options=requestOptions)
            response = await ai_service.aiObjects.call(request)
            if response and response.content:
                # Extract the first JSON object from the response
                result = response.content.strip()
                jsonMatch = re.search(r'\{.*\}', result, re.DOTALL)
                if jsonMatch:
                    result = jsonMatch.group(0)
                analysis = json.loads(result)
                return analysis
            else:
                return {"is_multi_file": False, "strategy": "single", "criteria": None}
        except Exception as e:
            logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file")
            return {"is_multi_file": False, "strategy": "single", "criteria": None}
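
    # Example (illustrative) of the analysis this prompt is expected to yield
    # for a request like "create one fact sheet per product":
    #   {"is_multi_file": true, "strategy": "per_entity",
    #    "criteria": "one document per product",
    #    "file_naming_pattern": "factsheet_<product>",
    #    "reasoning": "the user asked for a separate sheet for each product"}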

    async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None:
        """
        Create a ChatMessage with the extracted raw JSON attached as a file so the user
        has access to the data even if downstream processing fails.
        """
        try:
            services = self.services
            workflow = services.currentWorkflow

            # Serialize payload
            ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            contentText = json.dumps(payload, ensure_ascii=False, indent=2)
            contentBytes = contentText.encode('utf-8')

            # Store as file via component storage
            fileName = f"{label}_{ts}.json"
            fileItem = services.interfaceDbComponent.createFile(
                name=fileName,
                mimeType="application/json",
                content=contentBytes
            )
            services.interfaceDbComponent.createFileData(fileItem.id, contentBytes)

            # Look up file info for the ChatDocument
            fileInfo = services.workflow.getFileInfo(fileItem.id)
            doc = ChatDocument(
                messageId="",  # set after message creation
                fileId=fileItem.id,
                fileName=fileInfo.get("fileName", fileName) if fileInfo else fileName,
                fileSize=fileInfo.get("size", len(contentBytes)) if fileInfo else len(contentBytes),
                mimeType=fileInfo.get("mimeType", "application/json") if fileInfo else "application/json"
            )

            # Create message referencing the file - include document in initial call
            messageData = {
                "workflowId": workflow.id,
                "role": "assistant",
                "message": "Raw extraction data saved",
                "status": "data",
                "sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1,
                "publishedAt": services.utils.timestampGetUtc(),
                "documentsLabel": label,
                "documents": []
            }

            # Store message with the document included from the start
            services.workflow.storeMessageWithDocuments(services.workflow.workflow, messageData, [doc])
        except Exception:
            # Non-fatal; the attachment is a convenience, so swallow storage or
            # chat creation failures but leave a trace for debugging
            logger.debug("Failed to post raw data chat message", exc_info=True)
            return
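

# Minimal smoke test (illustrative). The structural validators above do not
# touch the injected services, so they can be exercised with stubs; the async
# generation entry points, by contrast, need a real service center, AiObjects
# instance, and document processor from this codebase.
if __name__ == "__main__":
    gen = SubDocumentGeneration(services=None, aiObjects=None, documentProcessor=None)

    multiFile = {"documents": [{"title": "Report", "sections": [{"heading": "Intro"}]}]}
    assert gen._validateUnifiedResponseStructure(multiFile)

    # A single-file response is normalized in place into the multi-file shape
    singleFile = {"metadata": {"title": "Report"}, "sections": [{"heading": "Intro"}]}
    assert gen._validateUnifiedResponseStructure(singleFile)
    assert singleFile["documents"][0]["title"] == "Report"

    print("validator smoke test passed")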