Multi-document output implemented

This commit is contained in:
ValueOn AG 2025-10-14 00:23:59 +02:00
parent 0c357dc8a9
commit 0bc71c99d5
6 changed files with 1448 additions and 50 deletions

View file

@ -649,6 +649,11 @@ class AiService:
for part in ec.parts:
if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"):
# Skip empty container chunks (they're just metadata containers)
if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
logger.debug(f"Skipping empty container chunk: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}")
continue
chunks_to_process.append({
'part': part,
'chunk_index': chunk_index,
@ -764,7 +769,14 @@ class AiService:
elif part.typeGroup in ("container", "binary"):
# Handle ALL container and binary content generically - let AI process any document type
self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: typeGroup={part.typeGroup}, mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
if part.mimeType and part.data and len(part.data.strip()) > 0:
# Skip empty container chunks (they're just metadata containers)
if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0):
self.services.utils.debugLogToFile(f"DEBUG: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE")
logger.info(f"Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}")
# Skip processing this chunk
pass
elif part.mimeType and part.data and len(part.data.strip()) > 0:
# Process any document container as text content
request_options = options if options is not None else AiCallOptions()
request_options.operationType = OperationType.GENERAL
@ -869,12 +881,19 @@ class AiService:
# Log extraction context length
self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE")
# Debug: Log the actual prompt being sent to AI
logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...")
logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...")
request = AiCallRequest(
prompt=prompt,
context=part.data,
options=request_options
)
response = await self.aiObjects.call(request)
# Debug: Log what AI actually returned
logger.debug(f"AI RESPONSE PREVIEW: {response.content[:300] if response.content else 'None'}...")
ai_result = response.content
# Log extraction response length
@ -900,16 +919,20 @@ class AiService:
import json
import re
# Clean the response - remove markdown code blocks if present
# Clean the response - remove markdown code blocks and extra formatting
cleaned_result = ai_result.strip()
if cleaned_result.startswith('```json'):
# Remove ```json from start and ``` from end
cleaned_result = re.sub(r'^```json\s*', '', cleaned_result)
cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
elif cleaned_result.startswith('```'):
# Remove ``` from start and end
cleaned_result = re.sub(r'^```\s*', '', cleaned_result)
cleaned_result = re.sub(r'\s*```$', '', cleaned_result)
# Remove any markdown code block markers (```json, ```, etc.)
cleaned_result = re.sub(r'^```(?:json)?\s*', '', cleaned_result, flags=re.MULTILINE)
cleaned_result = re.sub(r'\s*```\s*$', '', cleaned_result, flags=re.MULTILINE)
# Remove any remaining ``` markers anywhere in the text
cleaned_result = re.sub(r'```', '', cleaned_result)
# Try to extract JSON from the response if it's embedded in other text
json_match = re.search(r'\{.*\}', cleaned_result, re.DOTALL)
if json_match:
cleaned_result = json_match.group(0)
# Validate JSON
json.loads(cleaned_result)
@ -1193,7 +1216,13 @@ class AiService:
# Parse JSON from AI result
chunk_json = json.loads(chunk_result.aiResult)
# Extract sections from this chunk
# Check if this is a multi-file response (has "documents" key)
if isinstance(chunk_json, dict) and "documents" in chunk_json:
# This is a multi-file response - return it as-is
logger.info("Detected multi-file response from AI - preserving structure")
return chunk_json
# Extract sections from single-file response
if isinstance(chunk_json, dict) and "sections" in chunk_json:
for section in chunk_json["sections"]:
# Add document context to section
@ -1527,6 +1556,152 @@ class AiService:
# This ensures MIME-type checking, chunk mapping, and parallel processing
return await self._processDocumentsPerChunk(documents, prompt, options)
async def _callAiDirect(
self,
prompt: str,
documents: Optional[List[ChatDocument]],
options: AiCallOptions
) -> Dict[str, Any]:
"""
Call AI directly with prompt and documents for JSON output.
Used for multi-file generation - uses the existing generation pipeline.
"""
# Use the existing generation pipeline that already works
# This ensures proper document processing and content extraction
logger.info(f"Using existing generation pipeline for {len(documents) if documents else 0} documents")
# Process documents with JSON merging using the existing pipeline
result = await self._processDocumentsPerChunkJson(documents, prompt, options)
# Convert single-file result to multi-file format if needed
if "sections" in result and "documents" not in result:
logger.info("Converting single-file result to multi-file format")
# This is a single-file result, convert it to multi-file format
return {
"metadata": result.get("metadata", {"title": "Converted Document"}),
"documents": [{
"id": "doc_1",
"title": result.get("metadata", {}).get("title", "Document"),
"filename": "document.txt",
"sections": result.get("sections", [])
}]
}
return result
def _debugLogExtractedParts(self, extractionResult) -> None:
    """Debug-log every extracted chunk/part before filtering.

    Empty "container" parts (pure metadata wrappers) are expected and are
    logged at DEBUG; any other data-less part is flagged at WARNING.
    Uses lazy %-style logging so previews are not formatted when DEBUG is off.
    """
    for i, ec in enumerate(extractionResult):
        logger.debug("ContentExtracted %s: id=%s, parts=%s",
                     i, ec.id, len(ec.parts) if hasattr(ec, 'parts') else 'no parts')
        if not hasattr(ec, 'parts'):
            logger.warning("ContentExtracted %s has no parts attribute", i)
            continue
        for j, part in enumerate(ec.parts):
            if hasattr(part, 'data') and part.data:
                logger.debug("  Part %s content preview: %s...", j, part.data[:200])
                continue
            # No data: dump what the part actually carries to aid diagnosis.
            part_attrs = [attr for attr in dir(part) if not attr.startswith('_')]
            part_type = getattr(part, 'typeGroup', None)
            part_mime = getattr(part, 'mimeType', '')
            has_data = hasattr(part, 'data') and bool(part.data)
            logger.debug("  Part %s DEBUG: available_attrs=%s", j, part_attrs)
            logger.debug("  Part %s DEBUG: typeGroup='%s', mimeType='%s', has_data=%s",
                         j, part_type, part_mime, has_data)
            is_empty_container = (part_type == "container" and bool(part_mime)
                                  and 'document' in part_mime.lower())
            if is_empty_container:
                logger.debug("  Part %s is empty container (will be filtered out) - mimeType=%s",
                             j, part_mime)
            else:
                logger.warning("  Part %s has no data - typeGroup='%s', mimeType='%s', attrs=%s",
                               j, part_type, part_mime, part_attrs)

def _debugLogMergedJson(self, mergedJsonDocument: Dict[str, Any]) -> None:
    """Log the shape of the merged AI result (single- vs multi-file)."""
    logger.info("AI returned document with keys: %s", list(mergedJsonDocument.keys()))
    if 'sections' in mergedJsonDocument:
        logger.info("Number of sections: %s", len(mergedJsonDocument['sections']))
        if mergedJsonDocument['sections']:
            logger.debug("First section preview: %s...", str(mergedJsonDocument['sections'][0])[:200])
        else:
            logger.warning("AI returned empty sections array")
    if 'documents' in mergedJsonDocument:
        logger.info("Number of documents: %s", len(mergedJsonDocument['documents']))
    else:
        logger.warning("AI did not return 'documents' key - this is single-file format")

async def _processDocumentsPerChunkJsonWithPrompt(
    self,
    documents: List[ChatDocument],
    custom_prompt: str,
    options: Optional[AiCallOptions] = None
) -> Dict[str, Any]:
    """Process documents with per-chunk AI calls and merge results in JSON mode.

    Same pipeline as the default per-chunk JSON processing, except that
    ``custom_prompt`` replaces the default extraction prompt (used for
    adaptive multi-file generation).

    Returns the merged JSON document. On any failure a minimal
    ``{"metadata": ..., "sections": []}`` error document is returned
    instead of raising, so callers can fall back gracefully.
    """
    if not documents:
        return {"metadata": {"title": "Empty Document"}, "sections": []}
    # Model limits drive chunk sizing and the intelligent merge below.
    model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options)
    extractionOptions: Dict[str, Any] = {
        "prompt": custom_prompt,  # use the custom prompt instead of default
        "operationType": options.operationType if options else "general",
        "processDocumentsIndividually": True,  # process each document separately
        "maxSize": model_capabilities["maxContextBytes"],
        "chunkAllowed": True,
        "textChunkSize": model_capabilities["textChunkSize"],
        "imageChunkSize": model_capabilities["imageChunkSize"],
        "imageMaxPixels": 1024 * 1024,
        "imageQuality": 85,
        "mergeStrategy": {
            "useIntelligentMerging": True,  # enable intelligent token-aware merging
            "modelCapabilities": model_capabilities,
            "prompt": custom_prompt,  # use the custom prompt
            "groupBy": "typeGroup",
            "orderBy": "id",
            "mergeType": "concatenate"
        },
    }
    logger.debug("Per-chunk extraction options (JSON mode): prompt length=%s chars, operationType=%s",
                 len(extractionOptions.get('prompt', '')), extractionOptions.get('operationType'))
    try:
        # Extract content with chunking.
        extractionResult = self.extractionService.extractContent(documents, extractionOptions)
        if not isinstance(extractionResult, list):
            return {"metadata": {"title": "Error Document"}, "sections": []}
        logger.info("Processing %s chunks with custom prompt", len(extractionResult))
        logger.debug("Custom prompt preview: %s...", custom_prompt[:200])
        # Show what content is being processed (before filtering).
        self._debugLogExtractedParts(extractionResult)
        chunkResults = await self._processChunksWithMapping(extractionResult, custom_prompt, options, generate_json=True)
        # Show what chunks were actually processed (after filtering).
        logger.info("After filtering: %s chunks will be processed", len(chunkResults))
        for i, chunk_result in enumerate(chunkResults):
            if chunk_result and chunk_result.metadata.get("success", False):
                logger.debug("Processed chunk %s: %s - %s chars",
                             i, chunk_result.metadata.get('typeGroup', 'unknown'), len(chunk_result.aiResult))
            else:
                logger.debug("Processed chunk %s: error or skipped", i)
        # Merge with JSON mode and log the resulting shape.
        mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options)
        self._debugLogMergedJson(mergedJsonDocument)
        return mergedJsonDocument
    except Exception as e:
        logger.error("Error in per-chunk JSON processing: %s", str(e))
        return {"metadata": {"title": "Error Document"}, "sections": []}
async def _callAiJson(
self,
prompt: str,
@ -1821,6 +1996,88 @@ class AiService:
target_length = int(len(text) * reduction_factor)
return text[:target_length] + "... [reduced]"
async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]:
"""Use AI to analyze user prompt and determine processing requirements."""
if not ai_service:
return {"is_multi_file": False, "strategy": "single", "criteria": None}
try:
analysis_prompt = f"""
Analyze this user request and determine if it requires multiple file output or single file output.
User request: "{prompt}"
Respond with JSON only in this exact format:
{{
"is_multi_file": true/false,
"strategy": "single|per_entity|by_section|by_criteria|custom",
"criteria": "description of how to split content",
"file_naming_pattern": "suggested pattern for filenames",
"reasoning": "brief explanation of the analysis"
}}
Consider:
- Does the user want separate files for different entities (customers, products, etc.)?
- Does the user want to split content into multiple documents?
- What would be the most logical way to organize the content?
- What language is the request in? (analyze in the original language)
Return only the JSON response.
"""
from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
request_options = AiCallOptions()
request_options.operationType = OperationType.GENERAL
request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
response = await ai_service.aiObjects.call(request)
if response and response.content:
import json
import re
# Extract JSON from response
result = response.content.strip()
json_match = re.search(r'\{.*\}', result, re.DOTALL)
if json_match:
result = json_match.group(0)
analysis = json.loads(result)
return analysis
else:
return {"is_multi_file": False, "strategy": "single", "criteria": None}
except Exception as e:
logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file")
return {"is_multi_file": False, "strategy": "single", "criteria": None}
def _validateResponseStructure(self, response: Dict[str, Any], prompt_analysis: Dict[str, Any]) -> bool:
"""Validate that AI response matches the expected structure."""
try:
if not isinstance(response, dict):
logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}")
return False
# Check for multi-file structure
if prompt_analysis.get("is_multi_file", False):
has_documents = "documents" in response
is_documents_list = isinstance(response.get("documents"), list)
logger.info(f"Multi-file validation: has_documents={has_documents}, is_documents_list={is_documents_list}")
if has_documents and is_documents_list:
logger.info(f"Multi-file validation passed: {len(response['documents'])} documents found")
else:
logger.warning(f"Multi-file validation failed: documents key present={has_documents}, documents is list={is_documents_list}")
logger.warning(f"Available keys: {list(response.keys())}")
return has_documents and is_documents_list
else:
has_sections = "sections" in response
is_sections_list = isinstance(response.get("sections"), list)
logger.info(f"Single-file validation: has_sections={has_sections}, is_sections_list={is_sections_list}")
return has_sections and is_sections_list
except Exception as e:
logger.warning(f"Response validation failed with exception: {str(e)}")
return False
async def _callAiWithDocumentGeneration(
self,
prompt: str,
@ -1831,6 +2088,7 @@ class AiService:
) -> Dict[str, Any]:
"""
Handle AI calls with document generation in specific output format.
Now supports both single-file and multi-file generation.
Args:
prompt: The main prompt for the AI call
@ -1842,6 +2100,43 @@ class AiService:
Returns:
Dict with generated documents and metadata
"""
try:
# Use AI to analyze prompt intent
prompt_analysis = await self._analyzePromptIntent(prompt, self)
logger.info(f"Prompt analysis result: {prompt_analysis}")
if prompt_analysis.get("is_multi_file", False):
return await self._callAiWithMultiFileGeneration(
prompt, documents, options, outputFormat, title, prompt_analysis
)
else:
return await self._callAiWithSingleFileGeneration(
prompt, documents, options, outputFormat, title
)
except Exception as e:
logger.error(f"Error in document generation: {str(e)}")
return {
"success": False,
"error": str(e),
"content": "",
"rendered_content": "",
"mime_type": "text/plain",
"filename": f"error_{outputFormat}",
"format": outputFormat,
"title": title or "Error",
"documents": []
}
async def _callAiWithSingleFileGeneration(
self,
prompt: str,
documents: Optional[List[ChatDocument]],
options: AiCallOptions,
outputFormat: str,
title: Optional[str]
) -> Dict[str, Any]:
"""Handle single-file document generation (existing functionality)."""
try:
# Get format-specific extraction prompt from generation service
from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
@ -1912,20 +2207,216 @@ class AiService:
"documentName": filename,
"documentData": renderedContent,
"mimeType": mimeType
}]
}],
"is_multi_file": False
}
except Exception as e:
logger.error(f"Error in document generation: {str(e)}")
return {
"success": False,
"error": str(e),
"content": "",
"rendered_content": "",
"mime_type": "text/plain",
"filename": f"error_{outputFormat}",
"format": outputFormat,
"title": title or "Error",
"documents": []
}
logger.error(f"Error in single-file document generation: {str(e)}")
raise
def _extensionForOutputFormat(self, outputFormat: str) -> str:
    """Return the file extension (with leading dot) for a generation output format.

    Single source of truth for the format->extension mapping that was
    previously duplicated between the render loop and the debug dump.
    """
    fmt = outputFormat.lower()
    if fmt in ("docx", "pdf", "html"):
        return f".{fmt}"
    return f".{outputFormat}"

def _transformAiSections(self, ai_sections: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert AI-schema sections (content_type/elements) into renderer sections (type/data)."""
    transformed_sections: List[Dict[str, Any]] = []
    for section in ai_sections:
        transformed_section = {
            "id": section.get("id", f"section_{len(transformed_sections) + 1}"),
            "type": section.get("content_type", "paragraph"),
            "data": {
                "text": "",
                "elements": section.get("elements", [])
            },
            "order": section.get("order", len(transformed_sections) + 1)
        }
        # Flatten element text so renderers that only read data["text"] still work.
        if section.get("content_type") in ("paragraph", "heading"):
            text_parts = [element["text"] for element in section.get("elements", []) if "text" in element]
            transformed_section["data"]["text"] = "\n".join(text_parts)
        transformed_sections.append(transformed_section)
    return transformed_sections

def _saveMultiFileDebugOutput(self, generated_documents, title, outputFormat, prompt_analysis) -> None:
    """Best-effort dump of rendered multi-file output for debugging; never raises."""
    try:
        import os
        from datetime import datetime, UTC
        ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S")
        debug_dir = os.path.join("./test-chat/ai", f"multifile_output_{ts}")
        os.makedirs(debug_dir, exist_ok=True)
        # Save run metadata.
        with open(os.path.join(debug_dir, "metadata.txt"), "w", encoding="utf-8") as f:
            f.write(f"title: {title}\n")
            f.write(f"format: {outputFormat}\n")
            f.write(f"documents_count: {len(generated_documents)}\n")
            f.write(f"split_strategy: {prompt_analysis.get('strategy', 'custom')}\n")
            f.write(f"prompt_analysis: {prompt_analysis}\n")
        # The extension is the same for every document in one run.
        file_ext = self._extensionForOutputFormat(outputFormat)
        for i, doc in enumerate(generated_documents):
            output_path = os.path.join(debug_dir, f"document_{i+1}_{doc['documentName']}")
            doc_data = doc["documentData"]
            if file_ext in ('.md', '.txt', '.html', '.json', '.csv'):
                # Text-based formats.
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(doc_data)
            else:
                # Binary formats are expected to be base64-encoded; fall back to text.
                try:
                    import base64
                    doc_bytes = base64.b64decode(doc_data)
                    with open(output_path, 'wb') as f:
                        f.write(doc_bytes)
                except Exception:
                    with open(output_path, 'w', encoding='utf-8') as f:
                        f.write(doc_data)
            logger.info(f"💾 Debug: Saved multi-file document {i+1}: {output_path}")
        logger.info(f"💾 Debug: Multi-file output saved to: {debug_dir}")
    except Exception as e:
        logger.warning(f"Failed to save multi-file debug output: {e}")

async def _callAiWithMultiFileGeneration(
    self,
    prompt: str,
    documents: Optional[List[ChatDocument]],
    options: AiCallOptions,
    outputFormat: str,
    title: Optional[str],
    prompt_analysis: Dict[str, Any]
) -> Dict[str, Any]:
    """Handle multi-file document generation driven by the AI prompt analysis.

    Builds an adaptive extraction prompt, runs the chunked JSON pipeline,
    renders every returned document, and falls back to single-file
    generation when the AI response does not match the multi-file
    structure or any error occurs.
    """
    try:
        from modules.services.serviceGeneration.mainServiceGeneration import GenerationService
        generation_service = GenerationService(self.services)
        if not title:
            title = "AI Generated Documents"
        # Adaptive extraction prompt tailored to the analyzed split strategy.
        extraction_prompt = await generation_service.getAdaptiveExtractionPrompt(
            outputFormat=outputFormat,
            userPrompt=prompt,
            title=title,
            promptAnalysis=prompt_analysis,
            aiService=self
        )
        logger.info(f"Adaptive extraction prompt length: {len(extraction_prompt)} characters")
        logger.debug(f"Adaptive extraction prompt preview: {extraction_prompt[:500]}...")
        logger.debug(f"Processing documents: {len(documents) if documents else 0} documents")
        # Existing pipeline, with the adaptive prompt swapped in.
        ai_response = await self._processDocumentsPerChunkJsonWithPrompt(documents, extraction_prompt, options)
        logger.info(f"AI response type: {type(ai_response)}")
        logger.info(f"AI response keys: {list(ai_response.keys()) if isinstance(ai_response, dict) else 'Not a dict'}")
        logger.debug(f"AI response preview: {str(ai_response)[:500]}...")
        if not self._validateResponseStructure(ai_response, prompt_analysis):
            logger.warning(f"Multi-file processing failed - Invalid response structure. Expected multi-file but got: {list(ai_response.keys()) if isinstance(ai_response, dict) else type(ai_response)}")
            logger.warning(f"Prompt analysis: {prompt_analysis}")
            logger.warning("Falling back to single-file generation")
            return await self._callAiWithSingleFileGeneration(
                prompt, documents, options, outputFormat, title
            )
        # Render each returned document; extension is loop-invariant.
        extension = self._extensionForOutputFormat(outputFormat)
        generated_documents = []
        for i, doc_data in enumerate(ai_response.get("documents", [])):
            # Tolerate a missing title instead of aborting the whole batch
            # with a KeyError (which previously forced the single-file fallback).
            doc_title = doc_data.get("title", f"Document {i+1}")
            complete_document = {
                "metadata": {
                    "title": doc_title,
                    "source_document": "multi_file_generation",
                    "document_id": doc_data.get("id", f"doc_{i+1}"),
                    "filename": doc_data.get("filename", f"document_{i+1}"),
                    "split_strategy": prompt_analysis.get("strategy", "custom")
                },
                "sections": self._transformAiSections(doc_data.get("sections", [])),
                "summary": f"Generated document: {doc_title}",
                "tags": ["multi_file", "ai_generated"]
            }
            rendered_content, mime_type = await generation_service.renderReport(
                extractedContent=complete_document,
                outputFormat=outputFormat,
                title=doc_title,
                userPrompt=prompt,
                aiService=self
            )
            # Normalize the filename: strip any AI-provided extension, add the real one.
            base_filename = doc_data.get("filename", f"document_{i+1}")
            if '.' in base_filename:
                base_filename = base_filename.rsplit('.', 1)[0]
            generated_documents.append({
                "documentName": f"{base_filename}{extension}",
                "documentData": rendered_content,
                "mimeType": mime_type
            })
        # Save debug files only when the debug workflow flag is enabled.
        if self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False):
            self._saveMultiFileDebugOutput(generated_documents, title, outputFormat, prompt_analysis)
        return {
            "success": True,
            "content": ai_response,
            "rendered_content": None,  # not applicable for multi-file
            "mime_type": None,  # not applicable for multi-file
            "filename": None,  # not applicable for multi-file
            "format": outputFormat,
            "title": title,
            "documents": generated_documents,
            "is_multi_file": True,
            "split_strategy": prompt_analysis.get("strategy", "custom")
        }
    except Exception as e:
        logger.error(f"Error in multi-file document generation: {str(e)}")
        # Fall back to single-file generation rather than failing the request.
        return await self._callAiWithSingleFileGeneration(
            prompt, documents, options, outputFormat, title
        )

View file

@ -1,6 +1,6 @@
import logging
import uuid
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Union, Tuple
from datetime import datetime, UTC
import re
from modules.shared.timezoneUtils import get_utc_timestamp
@ -372,6 +372,42 @@ class GenerationService:
logger.error(f"Error rendering JSON report to {outputFormat}: {str(e)}")
raise
async def getAdaptiveExtractionPrompt(
    self,
    outputFormat: str,
    userPrompt: str,
    title: str,
    promptAnalysis: Dict[str, Any],
    aiService=None
) -> str:
    """Build an extraction prompt tailored to the analyzed user intent.

    Delegates to the prompt-builder module, forwarding the prompt analysis
    so the single- vs multi-file schema can be chosen adaptively.
    """
    from .subPromptBuilder import buildAdaptiveExtractionPrompt
    builder_kwargs = {
        "outputFormat": outputFormat,
        "userPrompt": userPrompt,
        "title": title,
        "promptAnalysis": promptAnalysis,
        "aiService": aiService,
        "services": self.services,
    }
    return await buildAdaptiveExtractionPrompt(**builder_kwargs)
async def getGenericExtractionPrompt(
    self,
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None
) -> str:
    """Build the format-agnostic extraction prompt.

    Works for both single- and multi-file generation; the actual prompt
    text is produced by the prompt-builder module.
    """
    from .subPromptBuilder import buildGenericExtractionPrompt
    builder_kwargs = {
        "outputFormat": outputFormat,
        "userPrompt": userPrompt,
        "title": title,
        "aiService": aiService,
        "services": self.services,
    }
    return await buildGenericExtractionPrompt(**builder_kwargs)
async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str:
"""
Get the format-specific extraction prompt for AI content extraction.
@ -409,6 +445,75 @@ class GenerationService:
logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}")
raise
async def renderAdaptiveReport(
    self,
    extractedContent: Dict[str, Any],
    outputFormat: str,
    title: str,
    userPrompt: str = None,
    aiService=None,
    isMultiFile: bool = False
) -> Union[Tuple[str, str], List[Dict[str, Any]]]:
    """Render extracted content as one file or many, based on its structure.

    Dispatches to the multi-file renderer only when the caller requested
    multi-file output AND the content actually carries a "documents" array;
    everything else goes through the single-file renderer.
    """
    use_multi = isMultiFile and "documents" in extractedContent
    render_fn = self._renderMultiFileReport if use_multi else self._renderSingleFileReport
    return await render_fn(extractedContent, outputFormat, title, userPrompt, aiService)
async def _renderMultiFileReport(
self,
extractedContent: Dict[str, Any],
outputFormat: str,
title: str,
userPrompt: str = None,
aiService=None
) -> List[Dict[str, Any]]:
"""Render multiple documents from extracted content."""
generated_documents = []
for doc_data in extractedContent.get("documents", []):
# Use existing single-file renderer for each document
renderer = self._getFormatRenderer(outputFormat)
if not renderer:
continue
# Render individual document
rendered_content, mime_type = await renderer.render(
extractedContent={"sections": doc_data["sections"]},
title=doc_data["title"],
userPrompt=userPrompt,
aiService=aiService
)
generated_documents.append({
"filename": doc_data["filename"],
"content": rendered_content,
"mime_type": mime_type,
"title": doc_data["title"]
})
return generated_documents
async def _renderSingleFileReport(
self,
extractedContent: Dict[str, Any],
outputFormat: str,
title: str,
userPrompt: str = None,
aiService=None
) -> Tuple[str, str]:
"""Render single file report (existing functionality)."""
# Use existing renderReport method
return await self.renderReport(
extractedContent, outputFormat, title, userPrompt, aiService
)
def _getFormatRenderer(self, output_format: str):
"""Get the appropriate renderer for the specified format using auto-discovery."""
try:

View file

@ -6,8 +6,197 @@ This module provides schemas that guide AI to generate structured JSON output.
from typing import Dict, Any
def get_multi_document_subJsonSchema() -> Dict[str, Any]:
    """Build the JSON schema that guides AI multi-document generation.

    The top level requires ``metadata`` (with a split strategy) and a
    ``documents`` array; each document carries ordered ``sections`` whose
    elements reference the shared ``definitions`` (table, bullet_list,
    list_item, paragraph, heading, code_block).
    """
    # --- shared element definitions ------------------------------------
    table_def = {
        "type": "object",
        "required": ["headers", "rows"],
        "properties": {
            "headers": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Table column headers"
            },
            "rows": {
                "type": "array",
                "items": {
                    "type": "array",
                    "items": {"type": "string"}
                },
                "description": "Table data rows"
            },
            "caption": {
                "type": "string",
                "description": "Table caption (optional)"
            }
        }
    }
    bullet_list_def = {
        "type": "object",
        "required": ["items"],
        "properties": {
            "items": {
                "type": "array",
                "items": {
                    "type": "object",
                    "required": ["text"],
                    "properties": {
                        "text": {"type": "string", "description": "List item text"},
                        "subitems": {
                            "type": "array",
                            "items": {"$ref": "#/definitions/list_item"},
                            "description": "Nested sub-items (optional)"
                        }
                    }
                },
                "description": "List items"
            },
            "list_type": {
                "type": "string",
                "enum": ["bullet", "numbered", "checklist"],
                "default": "bullet",
                "description": "Type of list"
            }
        }
    }
    list_item_def = {
        "type": "object",
        "required": ["text"],
        "properties": {
            "text": {"type": "string", "description": "List item text"},
            "subitems": {
                "type": "array",
                "items": {"$ref": "#/definitions/list_item"},
                "description": "Nested sub-items (optional)"
            }
        }
    }
    paragraph_def = {
        "type": "object",
        "required": ["text"],
        "properties": {
            "text": {"type": "string", "description": "Paragraph text"},
            "formatting": {
                "type": "object",
                "description": "Text formatting (bold, italic, etc.)"
            }
        }
    }
    heading_def = {
        "type": "object",
        "required": ["text", "level"],
        "properties": {
            "text": {"type": "string", "description": "Heading text"},
            "level": {
                "type": "integer",
                "minimum": 1,
                "maximum": 6,
                "description": "Heading level (1-6)"
            }
        }
    }
    code_block_def = {
        "type": "object",
        "required": ["code"],
        "properties": {
            "code": {"type": "string", "description": "Code content"},
            "language": {"type": "string", "description": "Programming language (optional)"}
        }
    }
    # --- section / document / metadata sub-schemas ---------------------
    section_schema = {
        "type": "object",
        "required": ["id", "content_type", "elements", "order"],
        "properties": {
            "id": {"type": "string", "description": "Unique section identifier"},
            "title": {"type": "string", "description": "Section title (optional)"},
            "content_type": {
                "type": "string",
                "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"],
                "description": "Primary content type of this section"
            },
            "elements": {
                "type": "array",
                "description": "Content elements in this section",
                "items": {
                    "oneOf": [
                        {"$ref": "#/definitions/table"},
                        {"$ref": "#/definitions/bullet_list"},
                        {"$ref": "#/definitions/paragraph"},
                        {"$ref": "#/definitions/heading"},
                        {"$ref": "#/definitions/code_block"}
                    ]
                }
            },
            "order": {"type": "integer", "description": "Section order in document"},
            "metadata": {
                "type": "object",
                "description": "Additional section metadata"
            }
        }
    }
    document_schema = {
        "type": "object",
        "required": ["id", "title", "sections", "filename"],
        "properties": {
            "id": {"type": "string", "description": "Unique document identifier"},
            "title": {"type": "string", "description": "Document title"},
            "filename": {"type": "string", "description": "Generated filename"},
            "sections": {
                "type": "array",
                "description": "Document sections containing structured content",
                "items": section_schema
            },
            "metadata": {
                "type": "object",
                "description": "Document-specific metadata"
            }
        }
    }
    metadata_schema = {
        "type": "object",
        "required": ["title", "splitStrategy"],
        "properties": {
            "title": {"type": "string", "description": "Document title"},
            "splitStrategy": {
                "type": "string",
                "enum": ["per_entity", "by_section", "by_criteria", "by_data_type", "custom"],
                "description": "Strategy for splitting content into multiple files"
            },
            "splitCriteria": {
                "type": "object",
                "description": "Custom criteria for splitting (e.g., entity_id, category, etc.)"
            },
            "fileNamingPattern": {
                "type": "string",
                "description": "Pattern for generating filenames (e.g., '{entity_name}_data.docx')"
            },
            "author": {"type": "string", "description": "Document author (optional)"},
            "source_documents": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of source document IDs"
            },
            "extraction_method": {
                "type": "string",
                "default": "ai_extraction",
                "description": "Method used for extraction"
            }
        }
    }
    # --- assembled top-level schema ------------------------------------
    return {
        "type": "object",
        "required": ["metadata", "documents"],
        "properties": {
            "metadata": metadata_schema,
            "documents": {
                "type": "array",
                "description": "Array of individual documents to generate",
                "items": document_schema
            }
        },
        "definitions": {
            "table": table_def,
            "bullet_list": bullet_list_def,
            "list_item": list_item_def,
            "paragraph": paragraph_def,
            "heading": heading_def,
            "code_block": code_block_def
        }
    }
def get_document_subJsonSchema() -> Dict[str, Any]:
"""Get the JSON schema for structured document generation."""
"""Get the JSON schema for structured document generation (single document)."""
return {
"type": "object",
"required": ["metadata", "sections"],
@ -227,6 +416,13 @@ Return only the enhanced JSON structure following the schema. Do not include any
"""
def get_adaptive_json_schema(prompt_analysis: Dict[str, Any] = None) -> Dict[str, Any]:
    """Pick the output schema matching the analyzed prompt intent.

    Returns the multi-document schema only when the analysis explicitly
    flags ``is_multi_file``; a missing or None analysis defaults to the
    single-document schema.
    """
    wants_multi = bool(prompt_analysis and prompt_analysis.get("is_multi_file", False))
    return get_multi_document_subJsonSchema() if wants_multi else get_document_subJsonSchema()
def validate_json_document(json_data: Dict[str, Any]) -> bool:
    """Validate that the JSON data follows the document schema.

    Accepts either the multi-document structure (root key "documents") or the
    single-document structure (root key "sections").

    Args:
        json_data: Parsed JSON payload to check.

    Returns:
        True when the required metadata and section fields are present and
        well-typed; False otherwise.
    """
    # Allowed section content types (shared by both structures).
    valid_types = ("table", "list", "paragraph", "heading", "code", "image", "mixed")

    def _valid_section(section: Any) -> bool:
        # A section needs id/content_type/elements/order, a known
        # content_type, and a list of elements.
        if not isinstance(section, dict):
            return False
        for field in ("id", "content_type", "elements", "order"):
            if field not in section:
                return False
        if section["content_type"] not in valid_types:
            return False
        return isinstance(section["elements"], list)

    try:
        if not isinstance(json_data, dict):
            return False
        if "documents" in json_data:
            # Multi-document structure: metadata needs title + splitStrategy.
            if "metadata" not in json_data:
                return False
            metadata = json_data["metadata"]
            if not isinstance(metadata, dict) or "title" not in metadata or "splitStrategy" not in metadata:
                return False
            documents = json_data["documents"]
            if not isinstance(documents, list):
                return False
            for doc in documents:
                if not isinstance(doc, dict):
                    return False
                for field in ("id", "title", "sections", "filename"):
                    if field not in doc:
                        return False
                sections = doc.get("sections", [])
                if not isinstance(sections, list):
                    return False
                if not all(_valid_section(s) for s in sections):
                    return False
        elif "sections" in json_data:
            # Single-document structure: metadata only needs a title.
            if "metadata" not in json_data:
                return False
            metadata = json_data["metadata"]
            if not isinstance(metadata, dict) or "title" not in metadata:
                return False
            sections = json_data["sections"]
            if not isinstance(sections, list):
                return False
            if not all(_valid_section(s) for s in sections):
                return False
        else:
            # Neither known root structure.
            return False
        return True
    except Exception:
        # Any unexpected lookup/type error means the payload is invalid.
        return False

View file

@ -8,7 +8,8 @@ Builds a robust prompt that:
- Requires the AI to output a filename header that we can parse and use
"""
from typing import Protocol
import json
from typing import Protocol, Dict, Any
class _RendererLike(Protocol):
@ -16,6 +17,291 @@ class _RendererLike(Protocol):
...
async def buildAdaptiveExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    promptAnalysis: Dict[str, Any],
    aiService=None,
    services=None
) -> str:
    """Build adaptive extraction prompt based on AI analysis.

    Args:
        outputFormat: Target output format (kept for API parity; not used here).
        userPrompt: Raw user request; prepended verbatim to the prompt.
        title: Document title (kept for API parity; not used here).
        promptAnalysis: AI analysis dict; key "is_multi_file" selects the template.
        aiService: Optional AI service handle (kept for API parity; not used here).
        services: Optional service center (kept for API parity; not used here).

    Returns:
        The fully assembled extraction prompt string.

    Note: the previously computed-but-unused ``schema_type`` local was removed,
    and the schema lookup now happens only on the single-file path that uses it.
    """
    if promptAnalysis.get("is_multi_file", False):
        # Multi-file output: show the model a concrete "documents"-array example.
        # Heuristic: requests mentioning mails/JSON get a specialized email example.
        is_json_email = any(keyword in userPrompt.lower() for keyword in ['email', 'mail', 'json', 'message', 'conversation'])
        if is_json_email:
            # Specialized prompt example for JSON email data
            multi_file_example = {
                "metadata": {
                    "title": "Email Conversations",
                    "splitStrategy": "per_entity"
                },
                "documents": [
                    {
                        "id": "doc_1",
                        "title": "Email from SENDER to RECIPIENT",
                        "filename": "email_sender_to_recipient.txt",
                        "sections": [
                            {
                                "id": "section_1",
                                "content_type": "heading",
                                "elements": [
                                    {
                                        "text": "Email from SENDER to RECIPIENT",
                                        "level": 1
                                    }
                                ],
                                "order": 1
                            },
                            {
                                "id": "section_2",
                                "content_type": "paragraph",
                                "elements": [
                                    {
                                        "text": "FULL_EMAIL_CONTENT_HERE"
                                    }
                                ],
                                "order": 2
                            }
                        ]
                    }
                ]
            }
        else:
            # Generic multi-file example with explicit placeholders
            multi_file_example = {
                "metadata": {
                    "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE",
                    "splitStrategy": "by_section"
                },
                "documents": [
                    {
                        "id": "doc_1",
                        "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE",
                        "filename": "REPLACE_WITH_ACTUAL_FILENAME",
                        "sections": [
                            {
                                "id": "section_1",
                                "content_type": "heading",
                                "elements": [
                                    {
                                        "text": "REPLACE_WITH_ACTUAL_HEADING_TEXT",
                                        "level": 1
                                    }
                                ],
                                "order": 1
                            },
                            {
                                "id": "section_2",
                                "content_type": "paragraph",
                                "elements": [
                                    {
                                        "text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT"
                                    }
                                ],
                                "order": 2
                            }
                        ]
                    }
                ]
            }
        adaptive_prompt = f"""
{userPrompt}

You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output.

TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file.

REQUIREMENTS:
1. Analyze the document content provided in the context below
2. Identify distinct sections in the document (by headings, topics, or logical breaks)
3. Create one JSON document entry for each section found
4. Extract the real content from each section (headings, paragraphs, lists, etc.)
5. Generate appropriate filenames for each section

CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array.

OUTPUT FORMAT: Return only valid JSON in this exact structure:
{json.dumps(multi_file_example, indent=2)}

IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have:
- "id": unique identifier
- "title": section title from the document
- "filename": appropriate filename for the section
- "sections": array of content sections

DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level.

INSTRUCTIONS:
- Replace "REPLACE_WITH_ACTUAL_*" placeholders with real content from the document
- Use actual section titles, headings, and text from the document
- Create meaningful filenames based on section content
- Ensure each section contains the complete content for that part of the document
- Do not use generic placeholder text like "Section 1", "Section 2"
- Extract real headings, paragraphs, lists, and other content elements
- CRITICAL: Return JSON with "documents" array, not "sections" array

CONTEXT (Document Content):

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()
    else:
        # Single-file output: embed the schema selected for this analysis.
        # Local import keeps the multi-file path free of the schema dependency.
        from .subJsonSchema import get_adaptive_json_schema
        json_schema = get_adaptive_json_schema(promptAnalysis)
        adaptive_prompt = f"""
{userPrompt}

You are extracting structured content from documents and must respond with valid JSON only.

IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.

Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(json_schema, indent=2)}

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
""".strip()
    return adaptive_prompt
async def buildGenericExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """Build generic extraction prompt that works for both single and multi-file.

    When an AI service is available, it is asked to classify the request and
    the result is delegated to buildAdaptiveExtractionPrompt. On any failure
    (or without an AI service) a single-document prompt is returned.

    Args:
        outputFormat: Target output format (forwarded to the adaptive builder).
        userPrompt: Raw user request; prepended verbatim to the prompt.
        title: Document title (forwarded to the adaptive builder).
        aiService: Optional AI service used for the intent analysis.
        services: Optional service center used only for debug logging.

    Returns:
        The assembled extraction prompt string.
    """
    # Use AI to determine the best approach
    if aiService:
        try:
            analysis_prompt = f"""
Analyze this user request and determine the best JSON structure for document extraction.

User request: "{userPrompt}"

Respond with JSON only:
{{
    "requires_multi_file": true/false,
    "recommended_schema": "single_document|multi_document",
    "split_approach": "description of how to organize content",
    "file_naming": "suggested naming pattern"
}}

Consider the user's intent and the most logical way to organize the extracted content.
"""
            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL
            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await aiService.aiObjects.call(request)
            if response and response.content:
                import re
                result = response.content.strip()
                # Models often wrap JSON in prose; keep only the outermost object.
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)
                analysis = json.loads(result)
                # Use analysis to build appropriate prompt
                return await buildAdaptiveExtractionPrompt(
                    outputFormat, userPrompt, title, analysis, aiService, services
                )
        except Exception as e:
            # BUGFIX: guard against services being None — the old handler
            # dereferenced services.utils unconditionally and could crash here.
            if services:
                services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER")

    # Fallback to single-file prompt
    from .subJsonSchema import get_document_subJsonSchema
    json_schema = get_document_subJsonSchema()
    # .strip() added for consistency with the other prompt builders.
    return f"""
{userPrompt}

You are extracting structured content from documents and must respond with valid JSON only.

CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.

Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(json_schema, indent=2)}

Requirements:
- Preserve all original data - do not summarize or interpret
- Use the exact JSON schema provided
- Maintain data integrity and structure

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.

DO NOT return a schema description - return actual extracted content in the JSON format shown above.
""".strip()
async def buildExtractionPrompt(
outputFormat: str,
renderer: _RendererLike,
@ -48,7 +334,7 @@ async def buildExtractionPrompt(
You are extracting structured content from documents and must respond with valid JSON only.
IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure.
CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.
Extract the actual data from the source documents and structure it as JSON with this format:
{{
@ -106,6 +392,10 @@ Image Analysis Requirements:
- Always provide feedback - never return empty responses
Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.
Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.
DO NOT return a schema description - return actual extracted content in the JSON format shown above.
""".strip()
# Final assembly

View file

@ -220,6 +220,8 @@ async def process_documents_and_generate_summary():
userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?"
# userPrompt = "Can you create one file for each section in the document"
# userPrompt = "Analyze these documents and create a fitting image for the content"
# userPrompt = "Extract the table from file and produce 2 lists in excel. one list with all entries, one list only with entries that are yellow highlighted."

View file

@ -0,0 +1,263 @@
#!/usr/bin/env python3
"""
Test script for multi-file processing implementation.
This script tests the new multi-file functionality without breaking existing single-file processing.
"""
import asyncio
import json
import logging
from typing import Dict, Any, List

# Setup logging: configure the root logger once; all test output also uses print().
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
async def test_multi_file_detection():
    """Test AI-powered multi-file detection.

    Returns:
        True only when every sample prompt is analyzed without error;
        False on import failure or any per-prompt analysis error.
    """
    print("=== Testing Multi-File Detection ===")

    # Mock AI service that always reports a multi-file, per-entity intent.
    class MockAiService:
        async def call(self, request):
            class MockResponse:
                def __init__(self, content):
                    self.content = content
            return MockResponse('{"is_multi_file": true, "strategy": "per_entity", "criteria": "customer_id", "file_naming_pattern": "{customer_name}_data.docx", "reasoning": "User wants separate files for each customer"}')

    class MockAiObjects:
        def __init__(self):
            self.call = MockAiService().call

    # Import the AI service
    try:
        from modules.services.serviceAi.mainServiceAi import AiService

        class MockUtils:
            def debugLogToFile(self, message, category):
                print(f"[{category}] {message}")

        class MockServiceCenter:
            def __init__(self):
                self.utils = MockUtils()

        # Create AI service instance with mocked dependencies
        ai_service = AiService(MockServiceCenter())
        ai_service.aiObjects = MockAiObjects()

        # Sample prompts, including non-English phrasings
        test_prompts = [
            "Create one file for each customer in the document",
            "Split the data into separate files by category",
            "Generate individual files for each product",
            "Create a single report with all data",
            "Erstelle eine Datei für jeden Kunden",  # German
            "Créer un fichier par section"  # French
        ]

        # BUGFIX: per-prompt errors previously only printed a ✗ and the
        # function still returned True; now failures affect the result.
        all_ok = True
        for prompt in test_prompts:
            print(f"\nTesting prompt: '{prompt}'")
            try:
                analysis = await ai_service._analyzePromptIntent(prompt, ai_service)
                print(f" Analysis: {analysis}")
                if analysis.get("is_multi_file"):
                    print(f" ✓ Detected as multi-file with strategy: {analysis.get('strategy')}")
                else:
                    print(f" ✓ Detected as single-file")
            except Exception as e:
                print(f" ✗ Error: {str(e)}")
                all_ok = False

        print("\n=== Multi-File Detection Test Complete ===")
        return all_ok
    except ImportError as e:
        print(f"Import error: {e}")
        print("Make sure you're running from the gateway directory")
        return False
    except Exception as e:
        print(f"Error during testing: {e}")
        return False
async def test_json_schema_validation():
    """Test JSON schema validation for both single and multi-file.

    Returns:
        True only when the schema helpers import AND both sample payloads
        pass validate_json_document; False otherwise.
    """
    print("\n=== Testing JSON Schema Validation ===")
    try:
        from modules.services.serviceGeneration.subJsonSchema import (
            get_document_subJsonSchema,
            get_multi_document_subJsonSchema,
            get_adaptive_json_schema,
            validate_json_document
        )

        # Test single document schema
        single_doc_schema = get_document_subJsonSchema()
        print(f"✓ Single document schema loaded: {len(single_doc_schema)} properties")

        # Test multi-document schema
        multi_doc_schema = get_multi_document_subJsonSchema()
        print(f"✓ Multi-document schema loaded: {len(multi_doc_schema)} properties")

        # Test adaptive schema selection
        single_analysis = {"is_multi_file": False}
        multi_analysis = {"is_multi_file": True}
        single_schema = get_adaptive_json_schema(single_analysis)
        multi_schema = get_adaptive_json_schema(multi_analysis)
        print(f"✓ Adaptive schema selection working")
        print(f" Single-file schema type: {single_schema.get('type', 'unknown')}")
        print(f" Multi-file schema type: {multi_schema.get('type', 'unknown')}")

        # Test validation with sample data
        single_doc_data = {
            "metadata": {"title": "Test Document"},
            "sections": [
                {
                    "id": "section_1",
                    "content_type": "paragraph",
                    "elements": [{"text": "Test content"}],
                    "order": 1
                }
            ]
        }
        multi_doc_data = {
            "metadata": {
                "title": "Test Documents",
                "splitStrategy": "per_entity"
            },
            "documents": [
                {
                    "id": "doc_1",
                    "title": "Document 1",
                    "filename": "doc1.docx",
                    "sections": [
                        {
                            "id": "section_1",
                            "content_type": "paragraph",
                            "elements": [{"text": "Content 1"}],
                            "order": 1
                        }
                    ]
                }
            ]
        }

        single_valid = validate_json_document(single_doc_data)
        multi_valid = validate_json_document(multi_doc_data)
        print(f"✓ Single document validation: {'PASS' if single_valid else 'FAIL'}")
        print(f"✓ Multi-document validation: {'PASS' if multi_valid else 'FAIL'}")

        print("\n=== JSON Schema Validation Test Complete ===")
        # BUGFIX: previously returned True unconditionally, even when the
        # sample payloads failed validation.
        return bool(single_valid and multi_valid)
    except ImportError as e:
        print(f"Import error: {e}")
        return False
    except Exception as e:
        print(f"Error during schema testing: {e}")
        return False
async def test_prompt_builder():
    """Test adaptive prompt building.

    Returns:
        True only when both generated prompts contain their expected markers
        ("documents" for multi-file, "sections" for single-file).
    """
    print("\n=== Testing Prompt Builder ===")
    try:
        from modules.services.serviceGeneration.subPromptBuilder import (
            buildAdaptiveExtractionPrompt,
            buildGenericExtractionPrompt
        )

        # Mock services used only for debug logging
        class MockUtils:
            def debugLogToFile(self, message, category):
                print(f"[{category}] {message}")

        class MockServices:
            def __init__(self):
                self.utils = MockUtils()

        services = MockServices()

        # Test adaptive (multi-file) prompt building
        prompt_analysis = {
            "is_multi_file": True,
            "strategy": "per_entity",
            "criteria": "customer_id",
            "file_naming_pattern": "{customer_name}_data.docx"
        }
        adaptive_prompt = await buildAdaptiveExtractionPrompt(
            outputFormat="docx",
            userPrompt="Create one file for each customer",
            title="Customer Data",
            promptAnalysis=prompt_analysis,
            aiService=None,
            services=services
        )
        has_multi = 'documents' in adaptive_prompt
        print(f"✓ Adaptive prompt generated: {len(adaptive_prompt)} characters")
        print(f" Contains multi-file instructions: {has_multi}")

        # Test generic (single-file) prompt building
        generic_prompt = await buildGenericExtractionPrompt(
            outputFormat="docx",
            userPrompt="Create a single report",
            title="Report",
            aiService=None,
            services=services
        )
        has_single = 'sections' in generic_prompt
        print(f"✓ Generic prompt generated: {len(generic_prompt)} characters")
        print(f" Contains single-file instructions: {has_single}")

        print("\n=== Prompt Builder Test Complete ===")
        # BUGFIX: previously returned True even when the markers were missing.
        return has_multi and has_single
    except ImportError as e:
        print(f"Import error: {e}")
        return False
    except Exception as e:
        print(f"Error during prompt builder testing: {e}")
        return False
async def main():
    """Run all tests."""
    print("Starting Multi-File Processing Tests...")
    print("=" * 50)

    # The suite: each entry is an async test returning a bool.
    tests = [
        test_multi_file_detection,
        test_json_schema_validation,
        test_prompt_builder
    ]

    results = []
    for test in tests:
        try:
            results.append(await test())
        except Exception as e:
            print(f"Test failed with exception: {e}")
            results.append(False)

    passed = sum(results)
    print("\n" + "=" * 50)
    print("Test Results Summary:")
    print(f" Tests run: {len(tests)}")
    print(f" Passed: {passed}")
    print(f" Failed: {len(tests) - passed}")

    success = all(results)
    if success:
        print("\n🎉 All tests passed! Multi-file processing is ready.")
    else:
        print("\n⚠️ Some tests failed. Check the implementation.")
    return success
# Script entry point: run the async test suite when executed directly.
if __name__ == "__main__":
    asyncio.run(main())