# Document generation sub-service for chat workflows (single- and multi-file output).
import logging
|
|
from typing import Dict, Any, List, Optional, Tuple, Union
|
|
from modules.datamodels.datamodelChat import ChatDocument
|
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class SubDocumentGeneration:
    """Document generation operations including single-file and multi-file generation."""

    def __init__(self, services, aiObjects, documentProcessor):
        """Initialize document generation service.

        Args:
            services: Service center instance for accessing other services
            aiObjects: Initialized AiObjects instance
            documentProcessor: Document processing service instance
        """
        # Only wire up collaborators here; no work happens at construction time.
        self.documentProcessor = documentProcessor
        self.aiObjects = aiObjects
        self.services = services
async def callAiWithDocumentGeneration(
|
|
self,
|
|
prompt: str,
|
|
documents: Optional[List[ChatDocument]],
|
|
options: AiCallOptions,
|
|
outputFormat: str,
|
|
title: Optional[str]
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Handle AI calls with document generation in specific output format.
|
|
Now supports both single-file and multi-file generation.
|
|
|
|
Args:
|
|
prompt: The main prompt for the AI call
|
|
documents: Optional list of documents to process
|
|
options: AI call configuration options
|
|
outputFormat: Target output format (html, pdf, docx, txt, md, json, csv, xlsx)
|
|
title: Optional title for generated documents
|
|
|
|
Returns:
|
|
Dict with generated documents and metadata
|
|
"""
|
|
try:
|
|
# Use AI to analyze prompt intent
|
|
prompt_analysis = await self._analyzePromptIntent(prompt, self)
|
|
logger.info(f"Prompt analysis result: {prompt_analysis}")
|
|
|
|
if prompt_analysis.get("is_multi_file", False):
|
|
return await self._callAiWithMultiFileGeneration(
|
|
prompt, documents, options, outputFormat, title, prompt_analysis
|
|
)
|
|
else:
|
|
return await self._callAiWithSingleFileGeneration(
|
|
prompt, documents, options, outputFormat, title
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error in document generation: {str(e)}")
|
|
return {
|
|
"success": False,
|
|
"error": str(e),
|
|
"content": "",
|
|
"rendered_content": "",
|
|
"mime_type": "text/plain",
|
|
"filename": f"error_{outputFormat}",
|
|
"format": outputFormat,
|
|
"title": title or "Error",
|
|
"documents": []
|
|
}
|
|
|
|
    async def _callAiWithSingleFileGeneration(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions,
        outputFormat: str,
        title: Optional[str],
        generationPrompt: Optional[str] = None
    ) -> Dict[str, Any]:
        """Handle single-file document generation (existing functionality).

        Pipeline: extraction prompt -> JSON extraction from documents ->
        optional AI enhancement of the extracted JSON -> rendering to the
        requested output format. Progress is reported via a workflow progress
        logger; any raw extracted data is posted to chat before rendering so
        the user keeps it even if rendering fails.

        NOTE(review): the `generationPrompt` parameter is never read — it is
        unconditionally overwritten below before first use. Confirm no caller
        relies on passing it before removing.

        Raises:
            Exception: re-raised after logging and completing the progress
            operation as failed (callAiWithDocumentGeneration converts it).
        """
        import time

        # Create progress logger
        workflow = self.services.currentWorkflow
        progressLogger = self.services.workflow.createProgressLogger(workflow)
        # Operation id is unique per workflow + wall-clock second.
        operationId = f"docGenSingle_{workflow.id}_{int(time.time())}"

        try:
            # Start progress tracking
            progressLogger.startOperation(
                operationId,
                "Generate",
                "Single-file Generation",
                f"Processing {len(documents) if documents else 0} documents"
            )

            # Get format-specific extraction prompt from generation service
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generation_service = GenerationService(self.services)

            # Use default title if not provided
            if not title:
                title = "AI Generated Document"

            # Update progress - generating extraction prompt
            progressLogger.updateProgress(operationId, 0.1, "Generating prompt")

            # Get format-specific extraction prompt
            extractionPrompt = await generation_service.getExtractionPrompt(
                outputFormat=outputFormat,
                userPrompt=prompt,
                title=title,
                aiService=self
            )

            # Update progress - starting AI processing
            progressLogger.updateProgress(operationId, 0.3, "AI processing")

            # Process documents with format-specific prompt using JSON mode
            # This ensures structured JSON output instead of text
            aiResponseJson = await self._callAiJson(extractionPrompt, documents, options)

            # Update progress - AI processing completed
            progressLogger.updateProgress(operationId, 0.6, "Processing done")

            # Validate JSON response - a "sections" list is the minimum contract.
            if not isinstance(aiResponseJson, dict) or "sections" not in aiResponseJson:
                raise Exception("AI response is not valid JSON document structure")

            # Emit raw extracted data as a chat message attachment before rendering
            try:
                await self._postRawDataChatMessage(aiResponseJson, label="raw_extraction_single")
            except Exception:
                # Best-effort only: losing the raw-data message must not abort generation.
                logger.warning("Failed to emit raw extraction chat message (single-file)")

            # Generate filename from document metadata (AI-provided title wins).
            parsedFilename = None
            try:
                if aiResponseJson.get("metadata", {}).get("title"):
                    title = aiResponseJson["metadata"]["title"]
                    # Clean title for filename
                    import re
                    parsed = re.sub(r"[^a-zA-Z0-9._-]", "-", title)
                    parsed = re.sub(r"-+", "-", parsed).strip('-')
                    if parsed:
                        parsedFilename = f"{parsed}.{outputFormat}"
            except Exception:
                parsedFilename = None

            # Use AI generation to enhance the extracted JSON before rendering
            enhancedContent = aiResponseJson  # Default to original
            if prompt:
                try:
                    from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

                    # Get generation prompt
                    generationPrompt = await generation_service.getGenerationPrompt(
                        outputFormat=outputFormat,
                        userPrompt=prompt,
                        title=title,
                        aiService=self
                    )

                    # Prepare the AI call
                    request_options = AiCallOptions()
                    request_options.operationType = OperationType.GENERAL

                    # Create context with the extracted JSON content
                    import json
                    context = f"Extracted JSON content:\n{json.dumps(aiResponseJson, indent=2)}"

                    request = AiCallRequest(
                        prompt=generationPrompt,
                        context=context,
                        options=request_options
                    )

                    # Call AI to enhance the content
                    response = await self.aiObjects.call(request)

                    # Save generation prompt and response to debug
                    try:
                        from modules.shared.debugLogger import writeDebugFile
                        debugData = {
                            "output_format": outputFormat,
                            "title": title,
                            "context_length": len(context),
                            "extracted_content_keys": list(aiResponseJson.keys()) if isinstance(aiResponseJson, dict) else []
                        }
                        writeDebugFile(generationPrompt, "generation_single", debugData)
                        writeDebugFile(response.content or '', "generation_single_response")
                    except Exception:
                        pass

                    if response and response.content:
                        # Parse the AI response as JSON
                        try:
                            import re
                            result = response.content.strip()

                            # Check if result is empty after stripping
                            if not result:
                                logger.warning("AI generation returned empty content after stripping, using original content")
                                enhancedContent = aiResponseJson
                            else:
                                # Extract JSON from markdown if present
                                json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
                                if json_match:
                                    result = json_match.group(1).strip()
                                elif result.startswith('```json'):
                                    result = re.sub(r'^```json\s*', '', result)
                                    result = re.sub(r'\s*```$', '', result)
                                elif result.startswith('```'):
                                    result = re.sub(r'^```\s*', '', result)
                                    result = re.sub(r'\s*```$', '', result)

                                # Check if result is still empty after markdown extraction
                                if not result:
                                    logger.warning("AI generation returned empty content after markdown extraction, using original content")
                                    enhancedContent = aiResponseJson
                                else:
                                    # Try to parse JSON with better error handling
                                    try:
                                        enhancedContent = json.loads(result)
                                        logger.info(f"AI enhanced JSON content successfully")
                                    except json.JSONDecodeError as jsonError:
                                        # Try to fix common JSON issues
                                        fixed_result = self._attemptJsonFix(result)
                                        if fixed_result != result:
                                            try:
                                                enhancedContent = json.loads(fixed_result)
                                                logger.info(f"AI enhanced JSON content successfully after fixing")
                                            except json.JSONDecodeError:
                                                logger.warning(f"AI generation returned invalid JSON even after fixing: {str(jsonError)}, using original content")
                                                enhancedContent = aiResponseJson
                                        else:
                                            logger.warning(f"AI generation returned invalid JSON: {str(jsonError)}, using original content")
                                            enhancedContent = aiResponseJson

                        # NOTE(review): this handler appears unreachable — json.loads is
                        # already guarded by the inner try above; confirm and remove.
                        except json.JSONDecodeError as e:
                            logger.warning(f"AI generation returned invalid JSON: {str(e)}, using original content")
                            enhancedContent = aiResponseJson
                    else:
                        logger.warning("AI generation returned empty response, using original content")
                        enhancedContent = aiResponseJson

                except Exception as e:
                    # Enhancement is optional; fall back to the raw extraction.
                    logger.warning(f"AI generation failed: {str(e)}, using original content")
                    enhancedContent = aiResponseJson

            # Render the enhanced JSON content
            renderedContent, mimeType = await generation_service.renderReport(
                extractedContent=enhancedContent,
                outputFormat=outputFormat,
                title=title,
                userPrompt=prompt,
                aiService=self
            )

            # Generate meaningful filename (use AI-provided if valid, else fallback)
            from datetime import datetime, UTC
            timestamp = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            if parsedFilename and parsedFilename.lower().endswith(f".{outputFormat.lower()}"):
                filename = parsedFilename
            else:
                safeTitle = ''.join(c if c.isalnum() else '-' for c in (title or 'document')).strip('-')
                filename = f"{safeTitle or 'document'}-{timestamp}.{outputFormat}"

            # Update progress - generation completed
            progressLogger.updateProgress(operationId, 0.9, "Rendering")

            # "content" keeps the UNenhanced extraction; the rendered output is
            # built from enhancedContent above.
            result = {
                "success": True,
                "content": aiResponseJson,  # Structured JSON document
                "rendered_content": renderedContent,  # Formatted content
                "mime_type": mimeType,
                "filename": filename,
                "format": outputFormat,
                "title": title,
                "documents": [{
                    "documentName": filename,
                    "documentData": renderedContent,
                    "mimeType": mimeType
                }],
                "is_multi_file": False
            }

            # Complete progress tracking
            progressLogger.completeOperation(operationId, True)

            return result

        except Exception as e:
            logger.error(f"Error in single-file document generation: {str(e)}")
            # Complete progress tracking with failure
            progressLogger.completeOperation(operationId, False)
            raise
    async def _callAiWithMultiFileGeneration(
        self,
        prompt: str,
        documents: Optional[List[ChatDocument]],
        options: AiCallOptions,
        outputFormat: str,
        title: Optional[str],
        prompt_analysis: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Handle multi-file document generation using AI analysis.

        Extracts a {"documents": [...]} structure from the inputs using an
        adaptive prompt, then per generated document: transforms sections to
        the renderer format, optionally AI-enhances the JSON (with rule-based
        and AI-powered JSON repair fallbacks), and renders it. Falls back to
        single-file generation if the response structure is invalid or any
        unhandled error occurs.

        Args:
            prompt_analysis: result of _analyzePromptIntent (strategy, criteria).

        Returns:
            Dict with "documents" holding one entry per rendered file;
            top-level rendered_content/mime_type/filename are None by design.
        """
        import time

        # Create progress logger
        workflow = self.services.currentWorkflow
        progressLogger = self.services.workflow.createProgressLogger(workflow)
        operationId = f"docGen_{workflow.id}_{int(time.time())}"

        try:
            # Start progress tracking
            progressLogger.startOperation(
                operationId,
                "Generate",
                "Multi-file Generation",
                f"Processing {len(documents) if documents else 0} documents"
            )

            # Get multi-file extraction prompt based on AI analysis
            from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
            generation_service = GenerationService(self.services)

            # Use default title if not provided
            if not title:
                title = "AI Generated Documents"

            # Update progress - generating extraction prompt
            progressLogger.updateProgress(operationId, 0.1, "Generating prompt")

            # Get adaptive extraction prompt
            extraction_prompt = await generation_service.getAdaptiveExtractionPrompt(
                outputFormat=outputFormat,
                userPrompt=prompt,
                title=title,
                promptAnalysis=prompt_analysis,
                aiService=self
            )

            logger.info(f"Adaptive extraction prompt length: {len(extraction_prompt)} characters")
            logger.debug(f"Adaptive extraction prompt preview: {extraction_prompt[:500]}...")

            # Update progress - starting document processing
            progressLogger.updateProgress(operationId, 0.2, "Processing docs")

            # Process with adaptive JSON schema - use the existing pipeline but with adaptive prompt
            logger.info(f"Using adaptive prompt with existing pipeline: {len(extraction_prompt)} chars")
            logger.debug(f"Processing documents: {len(documents) if documents else 0} documents")

            # Use the existing pipeline but replace the prompt with our adaptive one
            # This ensures proper document processing while using the multi-file prompt
            ai_response = await self.documentProcessor.processDocumentsPerChunkJsonWithPrompt(documents, extraction_prompt, options)

            logger.info(f"AI response type: {type(ai_response)}")
            logger.info(f"AI response keys: {list(ai_response.keys()) if isinstance(ai_response, dict) else 'Not a dict'}")
            logger.debug(f"AI response preview: {str(ai_response)[:500]}...")

            # Validate response structure
            if not self._validateResponseStructure(ai_response, prompt_analysis):
                # Fallback to single-file if multi-file fails
                logger.warning(f"Multi-file processing failed - Invalid response structure. Expected multi-file but got: {list(ai_response.keys()) if isinstance(ai_response, dict) else type(ai_response)}")
                logger.warning(f"Prompt analysis: {prompt_analysis}")
                logger.warning("Falling back to single-file generation")
                return await self._callAiWithSingleFileGeneration(
                    prompt, documents, options, outputFormat, title
                )

            # Emit raw extracted data as a chat message attachment before transformation/rendering
            try:
                await self._postRawDataChatMessage(ai_response, label="raw_extraction_multi")
            except Exception:
                logger.warning("Failed to emit raw extraction chat message (multi-file)")

            # Process multiple documents
            generated_documents = []
            for i, doc_data in enumerate(ai_response.get("documents", [])):
                # Transform AI-generated sections to renderer-compatible format
                transformed_sections = []
                for section in doc_data.get("sections", []):
                    # Convert AI format to renderer format; defaults fill gaps.
                    transformed_section = {
                        "id": section.get("id", f"section_{len(transformed_sections) + 1}"),
                        "content_type": section.get("content_type", "paragraph"),
                        "elements": section.get("elements", []),
                        "order": section.get("order", len(transformed_sections) + 1)
                    }

                    # Extract text from elements for simple text-based sections
                    if section.get("content_type") in ["paragraph", "heading"]:
                        text_parts = []
                        for element in section.get("elements", []):
                            if "text" in element:
                                text_parts.append(element["text"])
                        # Add text to the first element or create a new one.
                        # NOTE(review): this mutates the element dict shared with
                        # ai_response (elements list is not copied) — confirm intended.
                        if transformed_section["elements"]:
                            transformed_section["elements"][0]["text"] = "\n".join(text_parts)
                        else:
                            transformed_section["elements"] = [{"text": "\n".join(text_parts)}]

                    transformed_sections.append(transformed_section)

                # Create complete document structure for rendering
                # (doc_data["title"] is required; a missing title raises KeyError
                # and triggers the single-file fallback below.)
                complete_document = {
                    "metadata": {
                        "title": doc_data["title"],
                        "source_document": "multi_file_generation",
                        "document_id": doc_data.get("id", f"doc_{i+1}"),
                        "filename": doc_data.get("filename", f"document_{i+1}"),
                        "split_strategy": prompt_analysis.get("strategy", "custom")
                    },
                    "sections": transformed_sections,
                    "summary": f"Generated document: {doc_data['title']}",
                    "tags": ["multi_file", "ai_generated"]
                }

                # Use AI generation to enhance the extracted JSON before rendering
                enhancedContent = complete_document  # Default to original
                if prompt:
                    try:
                        from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType

                        # Get generation prompt
                        generationPrompt = await generation_service.getGenerationPrompt(
                            outputFormat=outputFormat,
                            userPrompt=prompt,
                            title=doc_data["title"],
                            aiService=self
                        )

                        # Prepare the AI call
                        request_options = AiCallOptions()
                        request_options.operationType = OperationType.GENERAL

                        # Create context with the extracted JSON content
                        import json
                        context = f"Extracted JSON content:\n{json.dumps(complete_document, indent=2)}"

                        request = AiCallRequest(
                            prompt=generationPrompt,
                            context=context,
                            options=request_options
                        )

                        # Call AI to enhance the content
                        response = await self.aiObjects.call(request)

                        # Save generation prompt and response to debug
                        try:
                            from modules.shared.debugLogger import writeDebugFile
                            debugData = {
                                "output_format": outputFormat,
                                "title": doc_data["title"],
                                "document_index": i,
                                "context_length": len(context),
                                "extracted_content_keys": list(complete_document.keys()) if isinstance(complete_document, dict) else []
                            }
                            writeDebugFile(generationPrompt, f"generation_multi_doc_{i}", debugData)
                            writeDebugFile(response.content or '', f"generation_multi_doc_{i}_response")
                        except Exception:
                            pass

                        if response and response.content:
                            # Parse the AI response as JSON
                            try:
                                import re
                                result = response.content.strip()

                                # Extract JSON from markdown if present
                                json_match = re.search(r'```json\s*\n(.*?)\n```', result, re.DOTALL)
                                if json_match:
                                    result = json_match.group(1).strip()
                                elif result.startswith('```json'):
                                    result = re.sub(r'^```json\s*', '', result)
                                    result = re.sub(r'\s*```$', '', result)
                                elif result.startswith('```'):
                                    result = re.sub(r'^```\s*', '', result)
                                    result = re.sub(r'\s*```$', '', result)

                                # Try to parse JSON
                                enhancedContent = json.loads(result)
                                logger.info(f"AI enhanced JSON content successfully")

                            except json.JSONDecodeError as e:
                                logger.warning(f"AI generation returned invalid JSON: {str(e)}, attempting to repair...")
                                # Try to repair common JSON issues (rule-based first)
                                try:
                                    repaired_result = self._repairJson(result)
                                    enhancedContent = json.loads(repaired_result)
                                    logger.info(f"Successfully repaired JSON content")
                                except (json.JSONDecodeError, Exception) as repair_error:
                                    logger.warning(f"JSON repair failed: {str(repair_error)}, trying AI repair...")
                                    # Try AI-powered JSON repair as last resort
                                    try:
                                        ai_repaired = await self._repairJsonWithAI(result)
                                        enhancedContent = json.loads(ai_repaired)
                                        logger.info(f"AI successfully repaired JSON content")
                                    except Exception as ai_repair_error:
                                        logger.warning(f"AI JSON repair also failed: {str(ai_repair_error)}, using original content")
                                        enhancedContent = complete_document
                        else:
                            logger.warning("AI generation returned empty response, using original content")
                            enhancedContent = complete_document

                    except Exception as e:
                        logger.warning(f"AI generation failed: {str(e)}, using original content")
                        enhancedContent = complete_document

                # Render the enhanced JSON content
                rendered_content, mime_type = await generation_service.renderReport(
                    extractedContent=enhancedContent,
                    outputFormat=outputFormat,
                    title=doc_data["title"],
                    userPrompt=prompt,
                    aiService=self
                )

                # Generate proper filename with correct extension
                base_filename = doc_data.get("filename", f"document_{i+1}")
                # Remove any existing extension and add the correct one
                if '.' in base_filename:
                    base_filename = base_filename.rsplit('.', 1)[0]

                # Add proper extension based on output format
                if outputFormat.lower() == "docx":
                    filename = f"{base_filename}.docx"
                elif outputFormat.lower() == "pdf":
                    filename = f"{base_filename}.pdf"
                elif outputFormat.lower() == "html":
                    filename = f"{base_filename}.html"
                else:
                    filename = f"{base_filename}.{outputFormat}"

                generated_documents.append({
                    "documentName": filename,
                    "documentData": rendered_content,
                    "mimeType": mime_type
                })

            # Save debug files for multi-file generation - only if debug enabled
            debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
            if debug_enabled:
                try:
                    import os
                    from datetime import datetime, UTC
                    ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
                    debug_root = "./test-chat/ai"
                    debug_dir = os.path.join(debug_root, f"multifile_output_{ts}")
                    os.makedirs(debug_dir, exist_ok=True)

                    # Save metadata
                    with open(os.path.join(debug_dir, "metadata.txt"), "w", encoding="utf-8") as f:
                        f.write(f"title: {title}\n")
                        f.write(f"format: {outputFormat}\n")
                        f.write(f"documents_count: {len(generated_documents)}\n")
                        f.write(f"split_strategy: {prompt_analysis.get('strategy', 'custom')}\n")
                        f.write(f"prompt_analysis: {prompt_analysis}\n")

                    # Save each generated document (reuses loop variable names
                    # i / doc_data from above; safe here since the main loop ended).
                    for i, doc in enumerate(generated_documents):
                        doc_filename = doc["documentName"]
                        doc_data = doc["documentData"]
                        doc_mime = doc["mimeType"]

                        # Determine file extension
                        if outputFormat.lower() == "docx":
                            file_ext = ".docx"
                        elif outputFormat.lower() == "pdf":
                            file_ext = ".pdf"
                        elif outputFormat.lower() == "html":
                            file_ext = ".html"
                        else:
                            file_ext = f".{outputFormat}"

                        # Save the rendered document
                        output_path = os.path.join(debug_dir, f"document_{i+1}_{doc_filename}")

                        if file_ext in ['.md', '.txt', '.html', '.json', '.csv']:
                            # Text-based formats
                            with open(output_path, 'w', encoding='utf-8') as f:
                                f.write(doc_data)
                        else:
                            # Binary formats - decode from base64 if needed
                            # (assumes binary renderers return base64 strings — TODO confirm)
                            try:
                                import base64
                                doc_bytes = base64.b64decode(doc_data)
                                with open(output_path, 'wb') as f:
                                    f.write(doc_bytes)
                            except Exception:
                                # If not base64, save as text
                                with open(output_path, 'w', encoding='utf-8') as f:
                                    f.write(doc_data)

                        logger.info(f"💾 Debug: Saved multi-file document {i+1}: {output_path}")

                    logger.info(f"💾 Debug: Multi-file output saved to: {debug_dir}")

                except Exception as e:
                    logger.warning(f"Failed to save multi-file debug output: {e}")

            # Update progress - generation completed
            progressLogger.updateProgress(operationId, 0.9, "Rendering")

            result = {
                "success": True,
                "content": ai_response,
                "rendered_content": None,  # Not applicable for multi-file
                "mime_type": None,  # Not applicable for multi-file
                "filename": None,  # Not applicable for multi-file
                "format": outputFormat,
                "title": title,
                "documents": generated_documents,
                "is_multi_file": True,
                "split_strategy": prompt_analysis.get("strategy", "custom")
            }

            # Complete progress tracking
            progressLogger.completeOperation(operationId, True)

            return result

        except Exception as e:
            logger.error(f"Error in multi-file document generation: {str(e)}")
            # Complete progress tracking with failure
            progressLogger.completeOperation(operationId, False)
            # Fallback to single-file
            return await self._callAiWithSingleFileGeneration(
                prompt, documents, options, outputFormat, title
            )
async def _callAiJson(
|
|
self,
|
|
prompt: str,
|
|
documents: Optional[List[ChatDocument]],
|
|
options: AiCallOptions
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Handle AI calls with document processing for JSON output.
|
|
Returns structured JSON document instead of text.
|
|
"""
|
|
# Process documents with JSON merging
|
|
return await self.documentProcessor.processDocumentsPerChunkJson(documents, prompt, options)
|
|
|
|
async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]:
|
|
"""Use AI to analyze user prompt and determine processing requirements."""
|
|
if not ai_service:
|
|
return {"is_multi_file": False, "strategy": "single", "criteria": None}
|
|
|
|
try:
|
|
analysis_prompt = f"""
|
|
Analyze this user request and determine if it requires multiple file output or single file output.
|
|
|
|
User request: "{prompt}"
|
|
|
|
Respond with JSON only in this exact format:
|
|
{{
|
|
"is_multi_file": true/false,
|
|
"strategy": "single|per_entity|by_section|by_criteria|custom",
|
|
"criteria": "description of how to split content",
|
|
"file_naming_pattern": "suggested pattern for filenames",
|
|
"reasoning": "brief explanation of the analysis"
|
|
}}
|
|
|
|
Consider:
|
|
- Does the user want separate files for different entities (customers, products, etc.)?
|
|
- Does the user want to split content into multiple documents?
|
|
- What would be the most logical way to organize the content?
|
|
- What language is the request in? (analyze in the original language)
|
|
|
|
Return only the JSON response.
|
|
"""
|
|
|
|
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
|
|
request_options = AiCallOptions()
|
|
request_options.operationType = OperationType.GENERAL
|
|
|
|
request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
|
|
response = await ai_service.aiObjects.call(request)
|
|
|
|
if response and response.content:
|
|
import json
|
|
import re
|
|
|
|
# Extract JSON from response
|
|
result = response.content.strip()
|
|
json_match = re.search(r'\{.*\}', result, re.DOTALL)
|
|
if json_match:
|
|
result = json_match.group(0)
|
|
|
|
analysis = json.loads(result)
|
|
return analysis
|
|
else:
|
|
return {"is_multi_file": False, "strategy": "single", "criteria": None}
|
|
|
|
except Exception as e:
|
|
logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file")
|
|
return {"is_multi_file": False, "strategy": "single", "criteria": None}
|
|
|
|
def _validateResponseStructure(self, response: Dict[str, Any], prompt_analysis: Dict[str, Any]) -> bool:
|
|
"""Validate that AI response matches the expected structure."""
|
|
try:
|
|
if not isinstance(response, dict):
|
|
logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}")
|
|
return False
|
|
|
|
# Check for multi-file structure
|
|
if prompt_analysis.get("is_multi_file", False):
|
|
has_documents = "documents" in response
|
|
is_documents_list = isinstance(response.get("documents"), list)
|
|
logger.info(f"Multi-file validation: has_documents={has_documents}, is_documents_list={is_documents_list}")
|
|
if has_documents and is_documents_list:
|
|
logger.info(f"Multi-file validation passed: {len(response['documents'])} documents found")
|
|
else:
|
|
logger.warning(f"Multi-file validation failed: documents key present={has_documents}, documents is list={is_documents_list}")
|
|
logger.warning(f"Available keys: {list(response.keys())}")
|
|
return has_documents and is_documents_list
|
|
else:
|
|
has_sections = "sections" in response
|
|
is_sections_list = isinstance(response.get("sections"), list)
|
|
logger.info(f"Single-file validation: has_sections={has_sections}, is_sections_list={is_sections_list}")
|
|
return has_sections and is_sections_list
|
|
except Exception as e:
|
|
logger.warning(f"Response validation failed with exception: {str(e)}")
|
|
return False
|
|
|
|
    async def _postRawDataChatMessage(self, payload: Any, label: str = "raw_extraction") -> None:
        """
        Create a ChatMessage with the extracted raw JSON attached as a file so the user
        has access to the data even if downstream processing fails.

        The payload must be JSON-serializable. All failures are swallowed:
        this is strictly best-effort and must never break the caller's flow.

        Args:
            payload: JSON-serializable data to attach.
            label: prefix used for both the attachment filename and the
                message's documentsLabel.
        """
        try:
            services = self.services
            workflow = services.currentWorkflow

            # Serialize payload (timestamped filename keeps attachments unique)
            import json as _json
            from datetime import datetime, UTC
            ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
            content_text = _json.dumps(payload, ensure_ascii=False, indent=2)
            content_bytes = content_text.encode('utf-8')

            # Store as file via component storage
            file_name = f"{label}_{ts}.json"
            file_item = services.interfaceDbComponent.createFile(
                name=file_name,
                mimeType="application/json",
                content=content_bytes
            )
            # NOTE(review): content is passed both to createFile and
            # createFileData — confirm the double write is intentional.
            services.interfaceDbComponent.createFileData(file_item.id, content_bytes)

            # Lookup file info for ChatDocument; fall back to local values
            # if the lookup returns nothing.
            file_info = services.workflow.getFileInfo(file_item.id)
            doc = ChatDocument(
                messageId="",  # set after message creation
                fileId=file_item.id,
                fileName=file_info.get("fileName", file_name) if file_info else file_name,
                fileSize=file_info.get("size", len(content_bytes)) if file_info else len(content_bytes),
                mimeType=file_info.get("mimeType", "application/json") if file_info else "application/json"
            )

            # Create message referencing the file - include document in initial call
            messageData = {
                "workflowId": workflow.id,
                "role": "assistant",
                "message": "Raw extraction data saved",
                "status": "data",
                "sequenceNr": len(getattr(workflow, 'messages', []) or []) + 1,
                "publishedAt": services.utils.getUtcTimestamp(),
                "documentsLabel": label,
                "documents": []
            }

            # Store message with document included from the start.
            # NOTE(review): passes services.workflow.workflow here although the
            # local `workflow` came from services.currentWorkflow — confirm these
            # refer to the same object.
            services.workflow.storeMessageWithDocuments(services.workflow.workflow, messageData, [doc])
        except Exception:
            # Non-fatal; ignore if storage or chat creation fails
            return
def _repairJson(self, json_string: str) -> str:
|
|
"""Repair common JSON syntax errors efficiently for large JSON."""
|
|
try:
|
|
import re
|
|
import json
|
|
|
|
# Remove any leading/trailing whitespace
|
|
json_string = json_string.strip()
|
|
|
|
# For large JSON, skip substring extraction and go straight to targeted repairs
|
|
logger.info(f"Attempting JSON repair for {len(json_string)} characters...")
|
|
|
|
# Try to parse first to see what specific error we get
|
|
try:
|
|
json.loads(json_string)
|
|
return json_string # Already valid
|
|
except json.JSONDecodeError as e:
|
|
error_msg = str(e)
|
|
logger.info(f"JSON error: {error_msg}")
|
|
|
|
# Apply targeted fixes based on the specific error
|
|
if "Expecting ',' delimiter" in error_msg:
|
|
# Fix missing commas between array elements
|
|
json_string = re.sub(r'\]\s*\[', '], [', json_string)
|
|
json_string = re.sub(r'\}\s*\{', '}, {', json_string)
|
|
# Fix missing commas between object properties
|
|
json_string = re.sub(r'("\s*:\s*[^,}]+)\s*(")', r'\1, \2', json_string)
|
|
|
|
if "Expecting value" in error_msg:
|
|
# Fix missing values (replace empty with null)
|
|
json_string = re.sub(r':\s*,', ': null,', json_string)
|
|
json_string = re.sub(r':\s*}', ': null}', json_string)
|
|
|
|
if "Expecting property name" in error_msg:
|
|
# Fix unquoted property names
|
|
json_string = re.sub(r'(\w+):', r'"\1":', json_string)
|
|
|
|
# Fix trailing commas before closing brackets/braces
|
|
json_string = re.sub(r',(\s*[}\]])', r'\1', json_string)
|
|
|
|
# Fix missing closing brackets/braces (only if reasonable)
|
|
open_braces = json_string.count('{')
|
|
close_braces = json_string.count('}')
|
|
open_brackets = json_string.count('[')
|
|
close_brackets = json_string.count(']')
|
|
|
|
# Only add missing brackets if the difference is small (avoid runaway)
|
|
if 0 < (open_braces - close_braces) <= 5:
|
|
missing_braces = open_braces - close_braces
|
|
json_string += '}' * missing_braces
|
|
|
|
if 0 < (open_brackets - close_brackets) <= 5:
|
|
missing_brackets = open_brackets - close_brackets
|
|
json_string += ']' * missing_brackets
|
|
|
|
# Try to parse again
|
|
try:
|
|
json.loads(json_string)
|
|
logger.info("JSON repair successful")
|
|
return json_string
|
|
except json.JSONDecodeError:
|
|
logger.warning("JSON repair failed - will try AI repair")
|
|
return json_string
|
|
|
|
except Exception as e:
|
|
logger.warning(f"JSON repair failed: {str(e)}")
|
|
return json_string
|
|
|
|
    async def _repairJsonWithAI(self, malformed_json: str) -> str:
        """Use AI to repair malformed JSON efficiently for large files.

        Truncates oversized input, asks the AI service to return corrected
        JSON, strips markdown fencing, and validates the result. On any
        failure the original string is returned unchanged.

        Args:
            malformed_json: the JSON text that failed to parse.

        Returns:
            Valid JSON text on success, otherwise the original input.
        """
        try:
            # Limit JSON size for AI processing (max 50KB to avoid token limits)
            max_json_size = 50000
            json_to_repair = malformed_json

            if len(malformed_json) > max_json_size:
                logger.warning(f"JSON too large ({len(malformed_json)} chars), truncating to {max_json_size} chars for AI repair")
                # Try to find a good truncation point (end of a complete object/array)
                truncate_at = max_json_size
                # Scan at most 1000 chars backwards for a closing bracket.
                for i in range(max_json_size, max(0, max_json_size - 1000), -1):
                    if malformed_json[i] in ['}', ']']:
                        truncate_at = i + 1
                        break
                # NOTE(review): the literal "..." marker keeps the payload
                # non-JSON; presumably the AI is expected to drop it — confirm.
                json_to_repair = malformed_json[:truncate_at] + "..."

            repair_prompt = f"""
You are a JSON repair expert. Fix the following malformed JSON and return ONLY the corrected JSON, no explanations.

Malformed JSON:
{json_to_repair}

Return only the valid JSON:
"""

            # Use AI to repair the JSON.
            # NOTE(review): assumes callAi returns a plain string — .strip() below
            # would fail on a structured response; confirm the service contract.
            repaired_json = await self.services.ai.callAi(
                prompt=repair_prompt,
                documents=None,
                options={
                    "process_type": "text",
                    "operation_type": "generate_content",
                    "priority": "speed",
                    "max_cost": 0.01
                }
            )

            # Clean up the response (remove any markdown formatting)
            repaired_json = repaired_json.strip()
            if repaired_json.startswith('```json'):
                repaired_json = repaired_json[7:]
            if repaired_json.endswith('```'):
                repaired_json = repaired_json[:-3]
            repaired_json = repaired_json.strip()

            # Validate the repaired JSON (raises on failure -> except path below)
            import json
            json.loads(repaired_json)
            logger.info("AI JSON repair successful")
            return repaired_json

        except Exception as e:
            logger.warning(f"AI JSON repair failed: {str(e)}")
            return malformed_json
def _attemptJsonFix(self, json_string: str) -> str:
|
|
"""Attempt to fix common JSON issues"""
|
|
try:
|
|
# Remove any trailing commas before closing braces/brackets
|
|
import re
|
|
fixed = re.sub(r',(\s*[}\]])', r'\1', json_string)
|
|
|
|
# Try to fix unterminated strings by adding quotes at the end
|
|
if '"' in fixed and not fixed.strip().endswith('"'):
|
|
# Count quotes to see if we have an odd number (unterminated string)
|
|
quote_count = fixed.count('"')
|
|
if quote_count % 2 == 1:
|
|
# Find the last quote and add a closing quote
|
|
last_quote_pos = fixed.rfind('"')
|
|
if last_quote_pos != -1:
|
|
# Check if there's content after the last quote that needs to be quoted
|
|
after_quote = fixed[last_quote_pos + 1:].strip()
|
|
if after_quote and not after_quote.startswith(','):
|
|
# Add closing quote before any trailing content
|
|
fixed = fixed[:last_quote_pos + 1] + '"' + after_quote
|
|
|
|
return fixed
|
|
except Exception:
|
|
return json_string
|