From 0bc71c99d589cbe3f9dd86228ba7ffb2c8dc2223 Mon Sep 17 00:00:00 2001
From: ValueOn AG
Date: Tue, 14 Oct 2025 00:23:59 +0200
Subject: [PATCH] Multi-document output implemented
---
modules/services/serviceAi/mainServiceAi.py | 539 +++++++++++++++++-
.../mainServiceGeneration.py | 107 +++-
.../serviceGeneration/subJsonSchema.py | 293 +++++++++-
.../serviceGeneration/subPromptBuilder.py | 294 +++++++++-
test_document_processing.py | 2 +
test_multifile_processing.py | 263 +++++++++
6 files changed, 1448 insertions(+), 50 deletions(-)
create mode 100644 test_multifile_processing.py
diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py
index f3be97b2..927f696e 100644
--- a/modules/services/serviceAi/mainServiceAi.py
+++ b/modules/services/serviceAi/mainServiceAi.py
@@ -649,6 +649,11 @@ class AiService:
for part in ec.parts:
if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
+ # Skip empty container chunks (they're just metadata containers)
+ if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
+ logger.debug(f"Skipping empty container chunk: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}")
+ continue
+
chunks_to_process.append({
'part': part,
'chunk_index': chunk_index,
@@ -764,7 +769,14 @@ class AiService:
elif part.typeGroup in ("container", "binary"):
# Handle ALL container and binary content generically - let AI process any document type
self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: typeGroup={part.typeGroup}, mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
- if part.mimeType and part.data and len(part.data.strip()) > 0:
+
+ # Skip empty container chunks (they're just metadata containers)
+ if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
+ self.services.utils.debugLogToFile(f"DEBUG: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
+ logger.info(f"Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}")
+ # Skip processing this chunk
+ pass
+ elif part.mimeType and part.data and len(part.data.strip()) > 0:
# Process any document container as text content
request_options = options if options is not None else AiCallOptions()
request_options.operationType = OperationType.GENERAL
@@ -869,12 +881,19 @@ class AiService:
# Log extraction context length
self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")
+ # Debug: Log the actual prompt being sent to AI
+ logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...")
+ logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...")
+
request = AiCallRequest(
prompt=prompt,
context=part.data,
options=request_options
)
response = await self.aiObjects.call(request)
+
+ # Debug: Log what AI actually returned
+ logger.debug(f"AI RESPONSE PREVIEW: {response.content[:300] if response.content else 'None'}...")
ai_result = response.content
# Log extraction response length
@@ -900,16 +919,20 @@ class AiService:
import json
import re
- # Clean the response - remove markdown code blocks if present
+ # Clean the response - remove markdown code blocks and extra formatting
cleaned_result = ai_result.strip()
- if cleaned_result.startswith('```json'):
- # Remove ```json from start and ``` from end
- cleaned_result = re.sub(r'^```json\s*', '', cleaned_result)
- cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
- elif cleaned_result.startswith('```'):
- # Remove ``` from start and end
- cleaned_result = re.sub(r'^```\s*', '', cleaned_result)
- cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
+
+ # Remove any markdown code block markers (```json, ```, etc.)
+ cleaned_result = re.sub(r'^```(?:json)?\s*', '', cleaned_result, flags=re.MULTILINE)
+ cleaned_result = re.sub(r'\s*```\s*$', '', cleaned_result, flags=re.MULTILINE)
+
+ # Remove any remaining ``` markers anywhere in the text
+ cleaned_result = re.sub(r'```', '', cleaned_result)
+
+ # Try to extract JSON from the response if it's embedded in other text
+ json_match = re.search(r'\{.*\}', cleaned_result, re.DOTALL)
+ if json_match:
+ cleaned_result = json_match.group(0)
# Validate JSON
json.loads(cleaned_result)
@@ -1193,7 +1216,13 @@ class AiService:
# Parse JSON from AI result
chunk_json = json.loads(chunk_result.aiResult)
- # Extract sections from this chunk
+ # Check if this is a multi-file response (has "documents" key)
+ if isinstance(chunk_json, dict) and "documents" in chunk_json:
+ # This is a multi-file response - return it as-is
+ logger.info("Detected multi-file response from AI - preserving structure")
+ return chunk_json
+
+ # Extract sections from single-file response
if isinstance(chunk_json, dict) and "sections" in chunk_json:
for section in chunk_json["sections"]:
# Add document context to section
@@ -1527,6 +1556,152 @@ class AiService:
# This ensures MIME-type checking, chunk mapping, and parallel processing
return await self._processDocumentsPerChunk(documents, prompt, options)
+ async def _callAiDirect(
+ self,
+ prompt: str,
+ documents: Optional[List[ChatDocument]],
+ options: AiCallOptions
+ ) -> Dict[str, Any]:
+ """
+ Call AI directly with prompt and documents for JSON output.
+ Used for multi-file generation - uses the existing generation pipeline.
+ """
+ # Use the existing generation pipeline that already works
+ # This ensures proper document processing and content extraction
+ logger.info(f"Using existing generation pipeline for {len(documents) if documents else 0} documents")
+
+ # Process documents with JSON merging using the existing pipeline
+ result = await self._processDocumentsPerChunkJson(documents, prompt, options)
+
+ # Convert single-file result to multi-file format if needed
+ if "sections" in result and "documents" not in result:
+ logger.info("Converting single-file result to multi-file format")
+ # This is a single-file result, convert it to multi-file format
+ return {
+ "metadata": result.get("metadata", {"title": "Converted Document"}),
+ "documents": [{
+ "id": "doc_1",
+ "title": result.get("metadata", {}).get("title", "Document"),
+ "filename": "document.txt",
+ "sections": result.get("sections", [])
+ }]
+ }
+
+ return result
+
+ async def _processDocumentsPerChunkJsonWithPrompt(
+ self,
+ documents: List[ChatDocument],
+ custom_prompt: str,
+ options: Optional[AiCallOptions] = None
+ ) -> Dict[str, Any]:
+ """
+ Process documents with per-chunk AI calls and merge results in JSON mode.
+ Uses a custom prompt instead of the default extraction prompt.
+ """
+ if not documents:
+ return {"metadata": {"title": "Empty Document"}, "sections": []}
+
+ # Get model capabilities for size calculation
+ model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options)
+
+ # Build extraction options for chunking with intelligent merging
+ extractionOptions: Dict[str, Any] = {
+ "prompt": custom_prompt, # Use the custom prompt instead of default
+ "operationType": options.operationType if options else "general",
+ "processDocumentsIndividually": True, # Process each document separately
+ "maxSize": model_capabilities["maxContextBytes"],
+ "chunkAllowed": True,
+ "textChunkSize": model_capabilities["textChunkSize"],
+ "imageChunkSize": model_capabilities["imageChunkSize"],
+ "imageMaxPixels": 1024 * 1024,
+ "imageQuality": 85,
+ "mergeStrategy": {
+ "useIntelligentMerging": True, # Enable intelligent token-aware merging
+ "modelCapabilities": model_capabilities,
+ "prompt": custom_prompt, # Use the custom prompt
+ "groupBy": "typeGroup",
+ "orderBy": "id",
+ "mergeType": "concatenate"
+ },
+ }
+
+ logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}")
+
+ try:
+ # Extract content with chunking
+ extractionResult = self.extractionService.extractContent(documents, extractionOptions)
+
+ if not isinstance(extractionResult, list):
+ return {"metadata": {"title": "Error Document"}, "sections": []}
+
+ # Process chunks with proper mapping
+ logger.info(f"Processing {len(extractionResult)} chunks with custom prompt")
+ logger.debug(f"Custom prompt preview: {custom_prompt[:200]}...")
+
+ # Debug: Show what content is being processed (before filtering)
+ for i, ec in enumerate(extractionResult):
+ logger.debug(f"ContentExtracted {i}: id={ec.id}, parts={len(ec.parts) if hasattr(ec, 'parts') else 'no parts'}")
+
+ # Check each part within the ContentExtracted
+ if hasattr(ec, 'parts'):
+ for j, part in enumerate(ec.parts):
+ if hasattr(part, 'data') and part.data:
+ logger.debug(f" Part {j} content preview: {part.data[:200]}...")
+ else:
+ # Check what attributes the part actually has
+ part_attrs = [attr for attr in dir(part) if not attr.startswith('_')]
+ part_type = getattr(part, 'typeGroup', None)
+ part_mime = getattr(part, 'mimeType', '')
+ has_data = hasattr(part, 'data') and bool(part.data)
+
+ logger.debug(f" Part {j} DEBUG: available_attrs={part_attrs}")
+ logger.debug(f" Part {j} DEBUG: typeGroup='{part_type}', mimeType='{part_mime}', has_data={has_data}")
+
+ # Check if this is an empty container chunk (which is expected)
+ is_empty_container = False
+ if part_type == "container" and part_mime and 'document' in part_mime.lower():
+ is_empty_container = True
+
+ if is_empty_container:
+ logger.debug(f" Part {j} is empty container (will be filtered out) - mimeType={part_mime}")
+ else:
+ logger.warning(f" Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}', attrs={part_attrs}")
+ else:
+ logger.warning(f"ContentExtracted {i} has no parts attribute")
+
+ chunkResults = await self._processChunksWithMapping(extractionResult, custom_prompt, options, generate_json=True)
+
+ # Debug: Show what chunks were actually processed (after filtering)
+ logger.info(f"After filtering: {len(chunkResults)} chunks will be processed")
+ for i, chunk_result in enumerate(chunkResults):
+ if chunk_result and chunk_result.metadata.get("success", False):
+ logger.debug(f"Processed chunk {i}: {chunk_result.metadata.get('typeGroup', 'unknown')} - {len(chunk_result.aiResult)} chars")
+ else:
+ logger.debug(f"Processed chunk {i}: error or skipped")
+
+ # Merge with JSON mode
+ mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)
+
+ # Debug: Show what the AI actually returned
+ logger.info(f"AI returned document with keys: {list(mergedJsonDocument.keys())}")
+ if 'sections' in mergedJsonDocument:
+ logger.info(f"Number of sections: {len(mergedJsonDocument['sections'])}")
+ if mergedJsonDocument['sections']:
+ logger.debug(f"First section preview: {str(mergedJsonDocument['sections'][0])[:200]}...")
+ else:
+ logger.warning("AI returned empty sections array")
+ if 'documents' in mergedJsonDocument:
+ logger.info(f"Number of documents: {len(mergedJsonDocument['documents'])}")
+ else:
+ logger.warning("AI did not return 'documents' key - this is single-file format")
+
+ return mergedJsonDocument
+
+ except Exception as e:
+ logger.error(f"Error in per-chunk JSON processing: {str(e)}")
+ return {"metadata": {"title": "Error Document"}, "sections": []}
+
async def _callAiJson(
self,
prompt: str,
@@ -1821,6 +1996,88 @@ class AiService:
target_length = int(len(text) * reduction_factor)
return text[:target_length] + "... [reduced]"
+ async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]:
+ """Use AI to analyze user prompt and determine processing requirements."""
+ if not ai_service:
+ return {"is_multi_file": False, "strategy": "single", "criteria": None}
+
+ try:
+ analysis_prompt = f"""
+Analyze this user request and determine if it requires multiple file output or single file output.
+
+User request: "{prompt}"
+
+Respond with JSON only in this exact format:
+{{
+ "is_multi_file": true/false,
+ "strategy": "single|per_entity|by_section|by_criteria|custom",
+ "criteria": "description of how to split content",
+ "file_naming_pattern": "suggested pattern for filenames",
+ "reasoning": "brief explanation of the analysis"
+}}
+
+Consider:
+- Does the user want separate files for different entities (customers, products, etc.)?
+- Does the user want to split content into multiple documents?
+- What would be the most logical way to organize the content?
+- What language is the request in? (analyze in the original language)
+
+Return only the JSON response.
+"""
+
+ from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
+ request_options = AiCallOptions()
+ request_options.operationType = OperationType.GENERAL
+
+ request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
+ response = await ai_service.aiObjects.call(request)
+
+ if response and response.content:
+ import json
+ import re
+
+ # Extract JSON from response
+ result = response.content.strip()
+ json_match = re.search(r'\{.*\}', result, re.DOTALL)
+ if json_match:
+ result = json_match.group(0)
+
+ analysis = json.loads(result)
+ return analysis
+ else:
+ return {"is_multi_file": False, "strategy": "single", "criteria": None}
+
+ except Exception as e:
+ logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file")
+ return {"is_multi_file": False, "strategy": "single", "criteria": None}
+
+ def _validateResponseStructure(self, response: Dict[str, Any], prompt_analysis: Dict[str, Any]) -> bool:
+ """Validate that AI response matches the expected structure."""
+ try:
+ if not isinstance(response, dict):
+ logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}")
+ return False
+
+ # Check for multi-file structure
+ if prompt_analysis.get("is_multi_file", False):
+ has_documents = "documents" in response
+ is_documents_list = isinstance(response.get("documents"), list)
+ logger.info(f"Multi-file validation: has_documents={has_documents}, is_documents_list={is_documents_list}")
+ if has_documents and is_documents_list:
+ logger.info(f"Multi-file validation passed: {len(response['documents'])} documents found")
+ else:
+ logger.warning(f"Multi-file validation failed: documents key present={has_documents}, documents is list={is_documents_list}")
+ logger.warning(f"Available keys: {list(response.keys())}")
+ return has_documents and is_documents_list
+ else:
+ has_sections = "sections" in response
+ is_sections_list = isinstance(response.get("sections"), list)
+ logger.info(f"Single-file validation: has_sections={has_sections}, is_sections_list={is_sections_list}")
+ return has_sections and is_sections_list
+ except Exception as e:
+ logger.warning(f"Response validation failed with exception: {str(e)}")
+ return False
+
async def _callAiWithDocumentGeneration(
self,
prompt: str,
@@ -1831,6 +2088,7 @@ class AiService:
) -> Dict[str, Any]:
"""
Handle AI calls with document generation in specific output format.
+ Now supports both single-file and multi-file generation.
Args:
prompt: The main prompt for the AI call
@@ -1842,6 +2100,43 @@ class AiService:
Returns:
Dict with generated documents and metadata
"""
+ try:
+ # Use AI to analyze prompt intent
+ prompt_analysis = await self._analyzePromptIntent(prompt, self)
+ logger.info(f"Prompt analysis result: {prompt_analysis}")
+
+ if prompt_analysis.get("is_multi_file", False):
+ return await self._callAiWithMultiFileGeneration(
+ prompt, documents, options, outputFormat, title, prompt_analysis
+ )
+ else:
+ return await self._callAiWithSingleFileGeneration(
+ prompt, documents, options, outputFormat, title
+ )
+
+ except Exception as e:
+ logger.error(f"Error in document generation: {str(e)}")
+ return {
+ "success": False,
+ "error": str(e),
+ "content": "",
+ "rendered_content": "",
+ "mime_type": "text/plain",
+ "filename": f"error_{outputFormat}",
+ "format": outputFormat,
+ "title": title or "Error",
+ "documents": []
+ }
+
+ async def _callAiWithSingleFileGeneration(
+ self,
+ prompt: str,
+ documents: Optional[List[ChatDocument]],
+ options: AiCallOptions,
+ outputFormat: str,
+ title: Optional[str]
+ ) -> Dict[str, Any]:
+ """Handle single-file document generation (existing functionality)."""
try:
# Get format-specific extraction prompt from generation service
from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
@@ -1912,20 +2207,216 @@ class AiService:
"documentName": filename,
"documentData": renderedContent,
"mimeType": mimeType
- }]
+ }],
+ "is_multi_file": False
}
except Exception as e:
- logger.error(f"Error in document generation: {str(e)}")
- return {
- "success": False,
- "error": str(e),
- "content": "",
- "rendered_content": "",
- "mime_type": "text/plain",
- "filename": f"error_{outputFormat}",
- "format": outputFormat,
- "title": title or "Error",
- "documents": []
- }
+ logger.error(f"Error in single-file document generation: {str(e)}")
+ raise
+
+ async def _callAiWithMultiFileGeneration(
+ self,
+ prompt: str,
+ documents: Optional[List[ChatDocument]],
+ options: AiCallOptions,
+ outputFormat: str,
+ title: Optional[str],
+ prompt_analysis: Dict[str, Any]
+ ) -> Dict[str, Any]:
+ """Handle multi-file document generation using AI analysis."""
+ try:
+ # Get multi-file extraction prompt based on AI analysis
+ from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
+ generation_service = GenerationService(self.services)
+
+ # Use default title if not provided
+ if not title:
+ title = "AI Generated Documents"
+
+ # Get adaptive extraction prompt
+ extraction_prompt = await generation_service.getAdaptiveExtractionPrompt(
+ outputFormat=outputFormat,
+ userPrompt=prompt,
+ title=title,
+ promptAnalysis=prompt_analysis,
+ aiService=self
+ )
+
+ logger.info(f"Adaptive extraction prompt length: {len(extraction_prompt)} characters")
+ logger.debug(f"Adaptive extraction prompt preview: {extraction_prompt[:500]}...")
+
+ # Process with adaptive JSON schema - use the existing pipeline but with adaptive prompt
+ logger.info(f"Using adaptive prompt with existing pipeline: {len(extraction_prompt)} chars")
+ logger.debug(f"Processing documents: {len(documents) if documents else 0} documents")
+
+ # Use the existing pipeline but replace the prompt with our adaptive one
+ # This ensures proper document processing while using the multi-file prompt
+ ai_response = await self._processDocumentsPerChunkJsonWithPrompt(documents, extraction_prompt, options)
+
+ logger.info(f"AI response type: {type(ai_response)}")
+ logger.info(f"AI response keys: {list(ai_response.keys()) if isinstance(ai_response, dict) else 'Not a dict'}")
+ logger.debug(f"AI response preview: {str(ai_response)[:500]}...")
+
+ # Validate response structure
+ if not self._validateResponseStructure(ai_response, prompt_analysis):
+ # Fallback to single-file if multi-file fails
+ logger.warning(f"Multi-file processing failed - Invalid response structure. Expected multi-file but got: {list(ai_response.keys()) if isinstance(ai_response, dict) else type(ai_response)}")
+ logger.warning(f"Prompt analysis: {prompt_analysis}")
+ logger.warning("Falling back to single-file generation")
+ return await self._callAiWithSingleFileGeneration(
+ prompt, documents, options, outputFormat, title
+ )
+
+ # Process multiple documents
+ generated_documents = []
+ for i, doc_data in enumerate(ai_response.get("documents", [])):
+ # Transform AI-generated sections to renderer-compatible format
+ transformed_sections = []
+ for section in doc_data.get("sections", []):
+ # Convert AI format to renderer format
+ transformed_section = {
+ "id": section.get("id", f"section_{len(transformed_sections) + 1}"),
+ "type": section.get("content_type", "paragraph"),
+ "data": {
+ "text": "",
+ "elements": section.get("elements", [])
+ },
+ "order": section.get("order", len(transformed_sections) + 1)
+ }
+
+ # Extract text from elements for simple text-based sections
+ if section.get("content_type") in ["paragraph", "heading"]:
+ text_parts = []
+ for element in section.get("elements", []):
+ if "text" in element:
+ text_parts.append(element["text"])
+ transformed_section["data"]["text"] = "\n".join(text_parts)
+
+ transformed_sections.append(transformed_section)
+
+ # Create complete document structure for rendering
+ complete_document = {
+ "metadata": {
+ "title": doc_data["title"],
+ "source_document": "multi_file_generation",
+ "document_id": doc_data.get("id", f"doc_{i+1}"),
+ "filename": doc_data.get("filename", f"document_{i+1}"),
+ "split_strategy": prompt_analysis.get("strategy", "custom")
+ },
+ "sections": transformed_sections,
+ "summary": f"Generated document: {doc_data['title']}",
+ "tags": ["multi_file", "ai_generated"]
+ }
+
+ rendered_content, mime_type = await generation_service.renderReport(
+ extractedContent=complete_document,
+ outputFormat=outputFormat,
+ title=doc_data["title"],
+ userPrompt=prompt,
+ aiService=self
+ )
+
+ # Generate proper filename with correct extension
+ base_filename = doc_data.get("filename", f"document_{i+1}")
+ # Remove any existing extension and add the correct one
+ if '.' in base_filename:
+ base_filename = base_filename.rsplit('.', 1)[0]
+
+ # Add proper extension based on output format
+ if outputFormat.lower() == "docx":
+ filename = f"{base_filename}.docx"
+ elif outputFormat.lower() == "pdf":
+ filename = f"{base_filename}.pdf"
+ elif outputFormat.lower() == "html":
+ filename = f"{base_filename}.html"
+ else:
+ filename = f"{base_filename}.{outputFormat}"
+
+ generated_documents.append({
+ "documentName": filename,
+ "documentData": rendered_content,
+ "mimeType": mime_type
+ })
+
+ # Save debug files for multi-file generation - only if debug enabled
+ debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False)
+ if debug_enabled:
+ try:
+ import os
+ from datetime import datetime, UTC
+ ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
+ debug_root = "./test-chat/ai"
+ debug_dir = os.path.join(debug_root, f"multifile_output_{ts}")
+ os.makedirs(debug_dir, exist_ok=True)
+
+ # Save metadata
+ with open(os.path.join(debug_dir, "metadata.txt"), "w", encoding="utf-8") as f:
+ f.write(f"title: {title}\n")
+ f.write(f"format: {outputFormat}\n")
+ f.write(f"documents_count: {len(generated_documents)}\n")
+ f.write(f"split_strategy: {prompt_analysis.get('strategy', 'custom')}\n")
+ f.write(f"prompt_analysis: {prompt_analysis}\n")
+
+ # Save each generated document
+ for i, doc in enumerate(generated_documents):
+ doc_filename = doc["documentName"]
+ doc_data = doc["documentData"]
+ doc_mime = doc["mimeType"]
+
+ # Determine file extension
+ if outputFormat.lower() == "docx":
+ file_ext = ".docx"
+ elif outputFormat.lower() == "pdf":
+ file_ext = ".pdf"
+ elif outputFormat.lower() == "html":
+ file_ext = ".html"
+ else:
+ file_ext = f".{outputFormat}"
+
+ # Save the rendered document
+ output_path = os.path.join(debug_dir, f"document_{i+1}_{doc_filename}")
+
+ if file_ext in ['.md', '.txt', '.html', '.json', '.csv']:
+ # Text-based formats
+ with open(output_path, 'w', encoding='utf-8') as f:
+ f.write(doc_data)
+ else:
+ # Binary formats - decode from base64 if needed
+ try:
+ import base64
+ doc_bytes = base64.b64decode(doc_data)
+ with open(output_path, 'wb') as f:
+ f.write(doc_bytes)
+ except Exception:
+ # If not base64, save as text
+ with open(output_path, 'w', encoding='utf-8') as f:
+ f.write(doc_data)
+
+ logger.info(f"💾 Debug: Saved multi-file document {i+1}: {output_path}")
+
+ logger.info(f"💾 Debug: Multi-file output saved to: {debug_dir}")
+
+ except Exception as e:
+ logger.warning(f"Failed to save multi-file debug output: {e}")
+
+ return {
+ "success": True,
+ "content": ai_response,
+ "rendered_content": None, # Not applicable for multi-file
+ "mime_type": None, # Not applicable for multi-file
+ "filename": None, # Not applicable for multi-file
+ "format": outputFormat,
+ "title": title,
+ "documents": generated_documents,
+ "is_multi_file": True,
+ "split_strategy": prompt_analysis.get("strategy", "custom")
+ }
+
+ except Exception as e:
+ logger.error(f"Error in multi-file document generation: {str(e)}")
+ # Fallback to single-file
+ return await self._callAiWithSingleFileGeneration(
+ prompt, documents, options, outputFormat, title
+ )
diff --git a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py
index 4c76c95e..340cb8ce 100644
--- a/modules/services/serviceGeneration/mainServiceGeneration.py
+++ b/modules/services/serviceGeneration/mainServiceGeneration.py
@@ -1,6 +1,6 @@
import logging
import uuid
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union, Tuple
from datetime import datetime, UTC
import re
from modules.shared.timezoneUtils import get_utc_timestamp
@@ -372,6 +372,42 @@ class GenerationService:
logger.error(f"Error rendering JSON report to {outputFormat}: {str(e)}")
raise
+ async def getAdaptiveExtractionPrompt(
+ self,
+ outputFormat: str,
+ userPrompt: str,
+ title: str,
+ promptAnalysis: Dict[str, Any],
+ aiService=None
+ ) -> str:
+ """Get adaptive extraction prompt based on AI analysis."""
+ from .subPromptBuilder import buildAdaptiveExtractionPrompt
+ return await buildAdaptiveExtractionPrompt(
+ outputFormat=outputFormat,
+ userPrompt=userPrompt,
+ title=title,
+ promptAnalysis=promptAnalysis,
+ aiService=aiService,
+ services=self.services
+ )
+
+ async def getGenericExtractionPrompt(
+ self,
+ outputFormat: str,
+ userPrompt: str,
+ title: str,
+ aiService=None
+ ) -> str:
+ """Get generic extraction prompt that works for both single and multi-file."""
+ from .subPromptBuilder import buildGenericExtractionPrompt
+ return await buildGenericExtractionPrompt(
+ outputFormat=outputFormat,
+ userPrompt=userPrompt,
+ title=title,
+ aiService=aiService,
+ services=self.services
+ )
+
async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
"""
Get the format-specific extraction prompt for AI content extraction.
@@ -409,6 +445,75 @@ class GenerationService:
logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}")
raise
+ async def renderAdaptiveReport(
+ self,
+ extractedContent: Dict[str, Any],
+ outputFormat: str,
+ title: str,
+ userPrompt: str = None,
+ aiService=None,
+ isMultiFile: bool = False
+ ) -> Union[Tuple[str, str], List[Dict[str, Any]]]:
+ """Render report adaptively based on content structure."""
+
+ if isMultiFile and "documents" in extractedContent:
+ return await self._renderMultiFileReport(
+ extractedContent, outputFormat, title, userPrompt, aiService
+ )
+ else:
+ return await self._renderSingleFileReport(
+ extractedContent, outputFormat, title, userPrompt, aiService
+ )
+
+ async def _renderMultiFileReport(
+ self,
+ extractedContent: Dict[str, Any],
+ outputFormat: str,
+ title: str,
+ userPrompt: str = None,
+ aiService=None
+ ) -> List[Dict[str, Any]]:
+ """Render multiple documents from extracted content."""
+
+ generated_documents = []
+
+ for doc_data in extractedContent.get("documents", []):
+ # Use existing single-file renderer for each document
+ renderer = self._getFormatRenderer(outputFormat)
+ if not renderer:
+ continue
+
+ # Render individual document
+ rendered_content, mime_type = await renderer.render(
+ extractedContent={"sections": doc_data["sections"]},
+ title=doc_data["title"],
+ userPrompt=userPrompt,
+ aiService=aiService
+ )
+
+ generated_documents.append({
+ "filename": doc_data["filename"],
+ "content": rendered_content,
+ "mime_type": mime_type,
+ "title": doc_data["title"]
+ })
+
+ return generated_documents
+
+ async def _renderSingleFileReport(
+ self,
+ extractedContent: Dict[str, Any],
+ outputFormat: str,
+ title: str,
+ userPrompt: str = None,
+ aiService=None
+ ) -> Tuple[str, str]:
+ """Render single file report (existing functionality)."""
+ # Use existing renderReport method
+ return await self.renderReport(
+ extractedContent, outputFormat, title, userPrompt, aiService
+ )
+
def _getFormatRenderer(self, output_format: str):
"""Get the appropriate renderer for the specified format using auto-discovery."""
try:
diff --git a/modules/services/serviceGeneration/subJsonSchema.py b/modules/services/serviceGeneration/subJsonSchema.py
index 581e2037..868a6ca4 100644
--- a/modules/services/serviceGeneration/subJsonSchema.py
+++ b/modules/services/serviceGeneration/subJsonSchema.py
@@ -6,8 +6,197 @@ This module provides schemas that guide AI to generate structured JSON output.
from typing import Dict, Any
+def get_multi_document_subJsonSchema() -> Dict[str, Any]:
+ """Get the JSON schema for multi-document generation."""
+ return {
+ "type": "object",
+ "required": ["metadata", "documents"],
+ "properties": {
+ "metadata": {
+ "type": "object",
+ "required": ["title", "splitStrategy"],
+ "properties": {
+ "title": {"type": "string", "description": "Document title"},
+ "splitStrategy": {
+ "type": "string",
+ "enum": ["per_entity", "by_section", "by_criteria", "by_data_type", "custom"],
+ "description": "Strategy for splitting content into multiple files"
+ },
+ "splitCriteria": {
+ "type": "object",
+ "description": "Custom criteria for splitting (e.g., entity_id, category, etc.)"
+ },
+ "fileNamingPattern": {
+ "type": "string",
+ "description": "Pattern for generating filenames (e.g., '{entity_name}_data.docx')"
+ },
+ "author": {"type": "string", "description": "Document author (optional)"},
+ "source_documents": {
+ "type": "array",
+ "items": {"type": "string"},
+ "description": "List of source document IDs"
+ },
+ "extraction_method": {
+ "type": "string",
+ "default": "ai_extraction",
+ "description": "Method used for extraction"
+ }
+ }
+ },
+ "documents": {
+ "type": "array",
+ "description": "Array of individual documents to generate",
+ "items": {
+ "type": "object",
+ "required": ["id", "title", "sections", "filename"],
+ "properties": {
+ "id": {"type": "string", "description": "Unique document identifier"},
+ "title": {"type": "string", "description": "Document title"},
+ "filename": {"type": "string", "description": "Generated filename"},
+ "sections": {
+ "type": "array",
+ "description": "Document sections containing structured content",
+ "items": {
+ "type": "object",
+ "required": ["id", "content_type", "elements", "order"],
+ "properties": {
+ "id": {"type": "string", "description": "Unique section identifier"},
+ "title": {"type": "string", "description": "Section title (optional)"},
+ "content_type": {
+ "type": "string",
+ "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
+ "description": "Primary content type of this section"
+ },
+ "elements": {
+ "type": "array",
+ "description": "Content elements in this section",
+ "items": {
+ "oneOf": [
+ {"$ref": "#/definitions/table"},
+ {"$ref": "#/definitions/bullet_list"},
+ {"$ref": "#/definitions/paragraph"},
+ {"$ref": "#/definitions/heading"},
+ {"$ref": "#/definitions/code_block"}
+ ]
+ }
+ },
+ "order": {"type": "integer", "description": "Section order in document"},
+ "metadata": {
+ "type": "object",
+ "description": "Additional section metadata"
+ }
+ }
+ }
+ },
+ "metadata": {
+ "type": "object",
+ "description": "Document-specific metadata"
+ }
+ }
+ }
+ }
+ },
+ "definitions": {
+ "table": {
+ "type": "object",
+ "required": ["headers", "rows"],
+ "properties": {
+ "headers": {
+ "type": "array",
+ "items": {"type": "string"},
+ "description": "Table column headers"
+ },
+ "rows": {
+ "type": "array",
+ "items": {
+ "type": "array",
+ "items": {"type": "string"}
+ },
+ "description": "Table data rows"
+ },
+ "caption": {
+ "type": "string",
+ "description": "Table caption (optional)"
+ }
+ }
+ },
+ "bullet_list": {
+ "type": "object",
+ "required": ["items"],
+ "properties": {
+ "items": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "required": ["text"],
+ "properties": {
+ "text": {"type": "string", "description": "List item text"},
+ "subitems": {
+ "type": "array",
+ "items": {"$ref": "#/definitions/list_item"},
+ "description": "Nested sub-items (optional)"
+ }
+ }
+ },
+ "description": "List items"
+ },
+ "list_type": {
+ "type": "string",
+ "enum": ["bullet", "numbered", "checklist"],
+ "default": "bullet",
+ "description": "Type of list"
+ }
+ }
+ },
+ "list_item": {
+ "type": "object",
+ "required": ["text"],
+ "properties": {
+ "text": {"type": "string", "description": "List item text"},
+ "subitems": {
+ "type": "array",
+ "items": {"$ref": "#/definitions/list_item"},
+ "description": "Nested sub-items (optional)"
+ }
+ }
+ },
+ "paragraph": {
+ "type": "object",
+ "required": ["text"],
+ "properties": {
+ "text": {"type": "string", "description": "Paragraph text"},
+ "formatting": {
+ "type": "object",
+ "description": "Text formatting (bold, italic, etc.)"
+ }
+ }
+ },
+ "heading": {
+ "type": "object",
+ "required": ["text", "level"],
+ "properties": {
+ "text": {"type": "string", "description": "Heading text"},
+ "level": {
+ "type": "integer",
+ "minimum": 1,
+ "maximum": 6,
+ "description": "Heading level (1-6)"
+ }
+ }
+ },
+ "code_block": {
+ "type": "object",
+ "required": ["code"],
+ "properties": {
+ "code": {"type": "string", "description": "Code content"},
+ "language": {"type": "string", "description": "Programming language (optional)"}
+ }
+ }
+ }
+ }
+
def get_document_subJsonSchema() -> Dict[str, Any]:
- """Get the JSON schema for structured document generation."""
+ """Get the JSON schema for structured document generation (single document)."""
return {
"type": "object",
"required": ["metadata", "sections"],
@@ -227,6 +416,13 @@ Return only the enhanced JSON structure following the schema. Do not include any
"""
+def get_adaptive_json_schema(prompt_analysis: Dict[str, Any] = None) -> Dict[str, Any]:
+ """Automatically select appropriate schema based on prompt analysis."""
+ if prompt_analysis and prompt_analysis.get("is_multi_file", False):
+ return get_multi_document_subJsonSchema()
+ else:
+ return get_document_subJsonSchema()
+
def validate_json_document(json_data: Dict[str, Any]) -> bool:
"""Validate that the JSON data follows the document schema."""
try:
@@ -234,35 +430,86 @@ def validate_json_document(json_data: Dict[str, Any]) -> bool:
if not isinstance(json_data, dict):
return False
- if "metadata" not in json_data or "sections" not in json_data:
- return False
-
- metadata = json_data["metadata"]
- if not isinstance(metadata, dict) or "title" not in metadata:
- return False
-
- sections = json_data["sections"]
- if not isinstance(sections, list):
- return False
-
- # Validate each section
- for i, section in enumerate(sections):
- if not isinstance(section, dict):
+ # Check if it's multi-document or single-document structure
+ if "documents" in json_data:
+ # Multi-document structure
+ if "metadata" not in json_data:
return False
- required_fields = ["id", "content_type", "elements", "order"]
- for field in required_fields:
- if field not in section:
+ metadata = json_data["metadata"]
+ if not isinstance(metadata, dict) or "title" not in metadata or "splitStrategy" not in metadata:
+ return False
+
+ documents = json_data["documents"]
+ if not isinstance(documents, list):
+ return False
+
+ # Validate each document
+ for doc in documents:
+ if not isinstance(doc, dict):
return False
+
+ required_fields = ["id", "title", "sections", "filename"]
+ for field in required_fields:
+ if field not in doc:
+ return False
+
+ # Validate sections in each document
+ sections = doc.get("sections", [])
+ if not isinstance(sections, list):
+ return False
+
+ for section in sections:
+ if not isinstance(section, dict):
+ return False
+
+ section_required = ["id", "content_type", "elements", "order"]
+ for field in section_required:
+ if field not in section:
+ return False
+
+ # Validate content_type
+ valid_types = ["table", "list", "paragraph", "heading", "code", "image", "mixed"]
+ if section["content_type"] not in valid_types:
+ return False
+
+ # Validate elements
+ if not isinstance(section["elements"], list):
+ return False
- # Validate content_type
- valid_types = ["table", "list", "paragraph", "heading", "code", "image", "mixed"]
- if section["content_type"] not in valid_types:
+ elif "sections" in json_data:
+ # Single-document structure (existing validation)
+ if "metadata" not in json_data:
return False
- # Validate elements
- if not isinstance(section["elements"], list):
+ metadata = json_data["metadata"]
+ if not isinstance(metadata, dict) or "title" not in metadata:
return False
+
+ sections = json_data["sections"]
+ if not isinstance(sections, list):
+ return False
+
+ # Validate each section
+ for i, section in enumerate(sections):
+ if not isinstance(section, dict):
+ return False
+
+ required_fields = ["id", "content_type", "elements", "order"]
+ for field in required_fields:
+ if field not in section:
+ return False
+
+ # Validate content_type
+ valid_types = ["table", "list", "paragraph", "heading", "code", "image", "mixed"]
+ if section["content_type"] not in valid_types:
+ return False
+
+ # Validate elements
+ if not isinstance(section["elements"], list):
+ return False
+ else:
+ return False
return True
diff --git a/modules/services/serviceGeneration/subPromptBuilder.py b/modules/services/serviceGeneration/subPromptBuilder.py
index dd2a6717..31ffb26e 100644
--- a/modules/services/serviceGeneration/subPromptBuilder.py
+++ b/modules/services/serviceGeneration/subPromptBuilder.py
@@ -8,7 +8,8 @@ Builds a robust prompt that:
- Requires the AI to output a filename header that we can parse and use
"""
-from typing import Protocol
+import json
+from typing import Protocol, Dict, Any
class _RendererLike(Protocol):
@@ -16,6 +17,291 @@ class _RendererLike(Protocol):
...
+async def buildAdaptiveExtractionPrompt(
+ outputFormat: str,
+ userPrompt: str,
+ title: str,
+ promptAnalysis: Dict[str, Any],
+ aiService=None,
+ services=None
+) -> str:
+ """Build adaptive extraction prompt based on AI analysis."""
+
+ # Get appropriate JSON schema based on analysis
+ from .subJsonSchema import get_adaptive_json_schema
+ json_schema = get_adaptive_json_schema(promptAnalysis)
+
+ if promptAnalysis.get("is_multi_file", False):
+ schema_type = "multi-document"
+ else:
+ schema_type = "single-document"
+
+ # Build adaptive prompt using AI analysis - match single-file style
+ if promptAnalysis.get("is_multi_file", False):
+ # Check if this is JSON email data
+ is_json_email = any(keyword in userPrompt.lower() for keyword in ['email', 'mail', 'json', 'message', 'conversation'])
+
+ if is_json_email:
+ # Specialized prompt for JSON email data
+ multi_file_example = {
+ "metadata": {
+ "title": "Email Conversations",
+ "splitStrategy": "per_entity"
+ },
+ "documents": [
+ {
+ "id": "doc_1",
+ "title": "Email from SENDER to RECIPIENT",
+ "filename": "email_sender_to_recipient.txt",
+ "sections": [
+ {
+ "id": "section_1",
+ "content_type": "heading",
+ "elements": [
+ {
+ "text": "Email from SENDER to RECIPIENT",
+ "level": 1
+ }
+ ],
+ "order": 1
+ },
+ {
+ "id": "section_2",
+ "content_type": "paragraph",
+ "elements": [
+ {
+ "text": "FULL_EMAIL_CONTENT_HERE"
+ }
+ ],
+ "order": 2
+ }
+ ]
+ }
+ ]
+ }
+ else:
+ # Generic multi-file prompt
+ multi_file_example = {
+ "metadata": {
+ "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE",
+ "splitStrategy": "by_section"
+ },
+ "documents": [
+ {
+ "id": "doc_1",
+ "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE",
+ "filename": "REPLACE_WITH_ACTUAL_FILENAME",
+ "sections": [
+ {
+ "id": "section_1",
+ "content_type": "heading",
+ "elements": [
+ {
+ "text": "REPLACE_WITH_ACTUAL_HEADING_TEXT",
+ "level": 1
+ }
+ ],
+ "order": 1
+ },
+ {
+ "id": "section_2",
+ "content_type": "paragraph",
+ "elements": [
+ {
+ "text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT"
+ }
+ ],
+ "order": 2
+ }
+ ]
+ }
+ ]
+ }
+
+ adaptive_prompt = f"""
+{userPrompt}
+
+You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.
+
+TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file.
+
+REQUIREMENTS:
+1. Analyze the document content provided in the context below
+2. Identify distinct sections in the document (by headings, topics, or logical breaks)
+3. Create one JSON document entry for each section found
+4. Extract the real content from each section (headings, paragraphs, lists, etc.)
+5. Generate appropriate filenames for each section
+
+CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array.
+
+OUTPUT FORMAT: Return only valid JSON in this exact structure:
+{json.dumps(multi_file_example, indent=2)}
+
+IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have:
+- "id": unique identifier
+- "title": section title from the document
+- "filename": appropriate filename for the section
+- "sections": array of content sections
+
+DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level.
+
+INSTRUCTIONS:
+- Replace "REPLACE_WITH_ACTUAL_*" placeholders with real content from the document
+- Use actual section titles, headings, and text from the document
+- Create meaningful filenames based on section content
+- Ensure each section contains the complete content for that part of the document
+- Do not use generic placeholder text like "Section 1", "Section 2"
+- Extract real headings, paragraphs, lists, and other content elements
+- CRITICAL: Return JSON with "documents" array, not "sections" array
+
+CONTEXT (Document Content):
+
+Content Types to Extract:
+1. Tables: Extract all rows and columns with proper headers
+2. Lists: Extract all items with proper nesting
+3. Headings: Extract with appropriate levels
+4. Paragraphs: Extract as structured text
+5. Code: Extract code blocks with language identification
+6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
+
+Image Analysis Requirements:
+- If you cannot analyze an image for any reason, explain why in the JSON response
+- Describe everything you see in the image
+- Include all text content, tables, logos, graphics, layout, and visual elements
+- If the image is too small, corrupted, or unclear, explain this
+- Always provide feedback - never return empty responses
+
+Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
+
+Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
+""".strip()
+ else:
+ # Single-file prompt - use original style
+ adaptive_prompt = f"""
+{userPrompt}
+
+You are extracting structured content from documents and must respond with valid JSON only.
+
+IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.
+
+Extract the actual data from the source documents and structure it as JSON with this format:
+{json.dumps(json_schema, indent=2)}
+
+Content Types to Extract:
+1. Tables: Extract all rows and columns with proper headers
+2. Lists: Extract all items with proper nesting
+3. Headings: Extract with appropriate levels
+4. Paragraphs: Extract as structured text
+5. Code: Extract code blocks with language identification
+6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
+
+Image Analysis Requirements:
+- If you cannot analyze an image for any reason, explain why in the JSON response
+- Describe everything you see in the image
+- Include all text content, tables, logos, graphics, layout, and visual elements
+- If the image is too small, corrupted, or unclear, explain this
+- Always provide feedback - never return empty responses
+
+Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
+
+Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
+""".strip()
+
+ return adaptive_prompt
+
+async def buildGenericExtractionPrompt(
+ outputFormat: str,
+ userPrompt: str,
+ title: str,
+ aiService=None,
+ services=None
+) -> str:
+ """Build generic extraction prompt that works for both single and multi-file."""
+
+ # Use AI to determine the best approach
+ if aiService:
+ try:
+ analysis_prompt = f"""
+Analyze this user request and determine the best JSON structure for document extraction.
+
+User request: "{userPrompt}"
+
+Respond with JSON only:
+{{
+ "requires_multi_file": true/false,
+ "recommended_schema": "single_document|multi_document",
+ "split_approach": "description of how to organize content",
+ "file_naming": "suggested naming pattern"
+}}
+
+Consider the user's intent and the most logical way to organize the extracted content.
+"""
+
+ from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
+ request_options = AiCallOptions()
+ request_options.operationType = OperationType.GENERAL
+
+ request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
+ response = await aiService.aiObjects.call(request)
+
+ if response and response.content:
+ import re
+
+ result = response.content.strip()
+ json_match = re.search(r'\{.*\}', result, re.DOTALL)
+ if json_match:
+ result = json_match.group(0)
+
+ analysis = json.loads(result)
+
+ # Use analysis to build appropriate prompt
+ return await buildAdaptiveExtractionPrompt(
+ outputFormat, userPrompt, title, analysis, aiService, services
+ )
+ except Exception as e:
+ services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER")
+
+ # Fallback to single-file prompt
+ from .subJsonSchema import get_document_subJsonSchema
+ json_schema = get_document_subJsonSchema()
+
+ return f"""
+{userPrompt}
+
+You are extracting structured content from documents and must respond with valid JSON only.
+
+CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
+
+Extract the actual data from the source documents and structure it as JSON with this format:
+{json.dumps(json_schema, indent=2)}
+
+Requirements:
+- Preserve all original data - do not summarize or interpret
+- Use the exact JSON schema provided
+- Maintain data integrity and structure
+
+Content Types to Extract:
+1. Tables: Extract all rows and columns with proper headers
+2. Lists: Extract all items with proper nesting
+3. Headings: Extract with appropriate levels
+4. Paragraphs: Extract as structured text
+5. Code: Extract code blocks with language identification
+6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements
+
+Image Analysis Requirements:
+- If you cannot analyze an image for any reason, explain why in the JSON response
+- Describe everything you see in the image
+- Include all text content, tables, logos, graphics, layout, and visual elements
+- If the image is too small, corrupted, or unclear, explain this
+- Always provide feedback - never return empty responses
+
+Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
+
+Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
+
+DO NOT return a schema description - return actual extracted content in the JSON format shown above.
+"""
+
async def buildExtractionPrompt(
outputFormat: str,
renderer: _RendererLike,
@@ -48,7 +334,7 @@ async def buildExtractionPrompt(
You are extracting structured content from documents and must respond with valid JSON only.
-IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.
+CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
Extract the actual data from the source documents and structure it as JSON with this format:
{{
@@ -106,6 +392,10 @@ Image Analysis Requirements:
- Always provide feedback - never return empty responses
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
+
+Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
+
+DO NOT return a schema description - return actual extracted content in the JSON format shown above.
""".strip()
# Final assembly
diff --git a/test_document_processing.py b/test_document_processing.py
index 41e3a3a2..49a42f72 100644
--- a/test_document_processing.py
+++ b/test_document_processing.py
@@ -220,6 +220,8 @@ async def process_documents_and_generate_summary():
userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?"
+ # userPrompt = "Can you create one file for each section in the document"
+
# userPrompt = "Analyze these documents and create a fitting image for the content"
# userPrompt = "Extract the table from file and produce 2 lists in excel. one list with all entries, one list only with entries that are yellow highlighted."
diff --git a/test_multifile_processing.py b/test_multifile_processing.py
new file mode 100644
index 00000000..737127bf
--- /dev/null
+++ b/test_multifile_processing.py
@@ -0,0 +1,263 @@
+#!/usr/bin/env python3
+"""
+Test script for multi-file processing implementation.
+This script tests the new multi-file functionality without breaking existing single-file processing.
+"""
+
+import asyncio
+import json
+import logging
+from typing import Dict, Any, List
+
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+async def test_multi_file_detection():
+ """Test AI-powered multi-file detection."""
+ print("=== Testing Multi-File Detection ===")
+
+ # Mock AI service for testing
+ class MockAiService:
+ async def call(self, request):
+ class MockResponse:
+ def __init__(self, content):
+ self.content = content
+ return MockResponse('{"is_multi_file": true, "strategy": "per_entity", "criteria": "customer_id", "file_naming_pattern": "{customer_name}_data.docx", "reasoning": "User wants separate files for each customer"}')
+
+ class MockAiObjects:
+ def __init__(self):
+ self.call = MockAiService().call
+
+ # Import the AI service
+ try:
+ from modules.services.serviceAi.mainServiceAi import AiService
+
+ # Create mock service center
+ class MockServiceCenter:
+ def __init__(self):
+ self.utils = MockUtils()
+
+ class MockUtils:
+ def debugLogToFile(self, message, category):
+ print(f"[{category}] {message}")
+
+ # Create AI service instance
+ ai_service = AiService(MockServiceCenter())
+ ai_service.aiObjects = MockAiObjects()
+
+ # Test prompts
+ test_prompts = [
+ "Create one file for each customer in the document",
+ "Split the data into separate files by category",
+ "Generate individual files for each product",
+ "Create a single report with all data",
+ "Erstelle eine Datei für jeden Kunden", # German
+ "Créer un fichier par section" # French
+ ]
+
+ for prompt in test_prompts:
+ print(f"\nTesting prompt: '{prompt}'")
+ try:
+ analysis = await ai_service._analyzePromptIntent(prompt, ai_service)
+ print(f" Analysis: {analysis}")
+
+ if analysis.get("is_multi_file"):
+ print(f" ✓ Detected as multi-file with strategy: {analysis.get('strategy')}")
+ else:
+ print(f" ✓ Detected as single-file")
+
+ except Exception as e:
+ print(f" ✗ Error: {str(e)}")
+
+ print("\n=== Multi-File Detection Test Complete ===")
+ return True
+
+ except ImportError as e:
+ print(f"Import error: {e}")
+ print("Make sure you're running from the gateway directory")
+ return False
+ except Exception as e:
+ print(f"Error during testing: {e}")
+ return False
+
+async def test_json_schema_validation():
+ """Test JSON schema validation for both single and multi-file."""
+ print("\n=== Testing JSON Schema Validation ===")
+
+ try:
+ from modules.services.serviceGeneration.subJsonSchema import (
+ get_document_subJsonSchema,
+ get_multi_document_subJsonSchema,
+ get_adaptive_json_schema,
+ validate_json_document
+ )
+
+ # Test single document schema
+ single_doc_schema = get_document_subJsonSchema()
+ print(f"✓ Single document schema loaded: {len(single_doc_schema)} properties")
+
+ # Test multi-document schema
+ multi_doc_schema = get_multi_document_subJsonSchema()
+ print(f"✓ Multi-document schema loaded: {len(multi_doc_schema)} properties")
+
+ # Test adaptive schema selection
+ single_analysis = {"is_multi_file": False}
+ multi_analysis = {"is_multi_file": True}
+
+ single_schema = get_adaptive_json_schema(single_analysis)
+ multi_schema = get_adaptive_json_schema(multi_analysis)
+
+ print(f"✓ Adaptive schema selection working")
+ print(f" Single-file schema type: {single_schema.get('type', 'unknown')}")
+ print(f" Multi-file schema type: {multi_schema.get('type', 'unknown')}")
+
+ # Test validation with sample data
+ single_doc_data = {
+ "metadata": {"title": "Test Document"},
+ "sections": [
+ {
+ "id": "section_1",
+ "content_type": "paragraph",
+ "elements": [{"text": "Test content"}],
+ "order": 1
+ }
+ ]
+ }
+
+ multi_doc_data = {
+ "metadata": {
+ "title": "Test Documents",
+ "splitStrategy": "per_entity"
+ },
+ "documents": [
+ {
+ "id": "doc_1",
+ "title": "Document 1",
+ "filename": "doc1.docx",
+ "sections": [
+ {
+ "id": "section_1",
+ "content_type": "paragraph",
+ "elements": [{"text": "Content 1"}],
+ "order": 1
+ }
+ ]
+ }
+ ]
+ }
+
+ single_valid = validate_json_document(single_doc_data)
+ multi_valid = validate_json_document(multi_doc_data)
+
+ print(f"✓ Single document validation: {'PASS' if single_valid else 'FAIL'}")
+ print(f"✓ Multi-document validation: {'PASS' if multi_valid else 'FAIL'}")
+
+ print("\n=== JSON Schema Validation Test Complete ===")
+ return True
+
+ except ImportError as e:
+ print(f"Import error: {e}")
+ return False
+ except Exception as e:
+ print(f"Error during schema testing: {e}")
+ return False
+
+async def test_prompt_builder():
+ """Test adaptive prompt building."""
+ print("\n=== Testing Prompt Builder ===")
+
+ try:
+ from modules.services.serviceGeneration.subPromptBuilder import (
+ buildAdaptiveExtractionPrompt,
+ buildGenericExtractionPrompt
+ )
+
+ # Mock services
+ class MockServices:
+ def __init__(self):
+ self.utils = MockUtils()
+
+ class MockUtils:
+ def debugLogToFile(self, message, category):
+ print(f"[{category}] {message}")
+
+ services = MockServices()
+
+ # Test adaptive prompt building
+ prompt_analysis = {
+ "is_multi_file": True,
+ "strategy": "per_entity",
+ "criteria": "customer_id",
+ "file_naming_pattern": "{customer_name}_data.docx"
+ }
+
+ adaptive_prompt = await buildAdaptiveExtractionPrompt(
+ outputFormat="docx",
+ userPrompt="Create one file for each customer",
+ title="Customer Data",
+ promptAnalysis=prompt_analysis,
+ aiService=None,
+ services=services
+ )
+
+ print(f"✓ Adaptive prompt generated: {len(adaptive_prompt)} characters")
+ print(f" Contains multi-file instructions: {'documents' in adaptive_prompt}")
+
+ # Test generic prompt building
+ generic_prompt = await buildGenericExtractionPrompt(
+ outputFormat="docx",
+ userPrompt="Create a single report",
+ title="Report",
+ aiService=None,
+ services=services
+ )
+
+ print(f"✓ Generic prompt generated: {len(generic_prompt)} characters")
+ print(f" Contains single-file instructions: {'sections' in generic_prompt}")
+
+ print("\n=== Prompt Builder Test Complete ===")
+ return True
+
+ except ImportError as e:
+ print(f"Import error: {e}")
+ return False
+ except Exception as e:
+ print(f"Error during prompt builder testing: {e}")
+ return False
+
+async def main():
+ """Run all tests."""
+ print("Starting Multi-File Processing Tests...")
+ print("=" * 50)
+
+ tests = [
+ test_multi_file_detection,
+ test_json_schema_validation,
+ test_prompt_builder
+ ]
+
+ results = []
+ for test in tests:
+ try:
+ result = await test()
+ results.append(result)
+ except Exception as e:
+ print(f"Test failed with exception: {e}")
+ results.append(False)
+
+ print("\n" + "=" * 50)
+ print("Test Results Summary:")
+ print(f" Tests run: {len(tests)}")
+ print(f" Passed: {sum(results)}")
+ print(f" Failed: {len(tests) - sum(results)}")
+
+ if all(results):
+ print("\n🎉 All tests passed! Multi-file processing is ready.")
+ else:
+ print("\n⚠️ Some tests failed. Check the implementation.")
+
+ return all(results)
+
+if __name__ == "__main__":
+ asyncio.run(main())