From 0bc71c99d589cbe3f9dd86228ba7ffb2c8dc2223 Mon Sep 17 00:00:00 2001 From: ValueOn AG Date: Tue, 14 Oct 2025 00:23:59 +0200 Subject: [PATCH] Multi-document output implemented --- modules/services/serviceAi/mainServiceAi.py | 539 +++++++++++++++++- .../mainServiceGeneration.py | 107 +++- .../serviceGeneration/subJsonSchema.py | 293 +++++++++- .../serviceGeneration/subPromptBuilder.py | 294 +++++++++- test_document_processing.py | 2 + test_multifile_processing.py | 263 +++++++++ 6 files changed, 1448 insertions(+), 50 deletions(-) create mode 100644 test_multifile_processing.py diff --git a/modules/services/serviceAi/mainServiceAi.py b/modules/services/serviceAi/mainServiceAi.py index f3be97b2..927f696e 100644 --- a/modules/services/serviceAi/mainServiceAi.py +++ b/modules/services/serviceAi/mainServiceAi.py @@ -649,6 +649,11 @@ class AiService: for part in ec.parts: if part.typeGroup in ("text", "table", "structure", "image", "container", "binary"): + # Skip empty container chunks (they're just metadata containers) + if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0): + logger.debug(f"Skipping empty container chunk: mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}") + continue + chunks_to_process.append({ 'part': part, 'chunk_index': chunk_index, @@ -764,7 +769,14 @@ class AiService: elif part.typeGroup in ("container", "binary"): # Handle ALL container and binary content generically - let AI process any document type self.services.utils.debugLogToFile(f"DEBUG: Chunk {chunk_index}: typeGroup={part.typeGroup}, mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE") - if part.mimeType and part.data and len(part.data.strip()) > 0: + + # Skip empty container chunks (they're just metadata containers) + if part.typeGroup == "container" and (not part.data or len(part.data.strip()) == 0): + self.services.utils.debugLogToFile(f"DEBUG: Skipping empty container - 
mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}", "AI_SERVICE") + logger.info(f"Chunk {chunk_index}: Skipping empty container - mimeType={part.mimeType}, data_length={len(part.data) if part.data else 0}") + # Skip processing this chunk + pass + elif part.mimeType and part.data and len(part.data.strip()) > 0: # Process any document container as text content request_options = options if options is not None else AiCallOptions() request_options.operationType = OperationType.GENERAL @@ -869,12 +881,19 @@ class AiService: # Log extraction context length self.services.utils.debugLogToFile(f"EXTRACTION CONTEXT LENGTH: {len(part.data) if part.data else 0} characters", "AI_SERVICE") + # Debug: Log the actual prompt being sent to AI + logger.debug(f"AI PROMPT PREVIEW: {prompt[:300]}...") + logger.debug(f"AI CONTEXT PREVIEW: {part.data[:200] if part.data else 'None'}...") + request = AiCallRequest( prompt=prompt, context=part.data, options=request_options ) response = await self.aiObjects.call(request) + + # Debug: Log what AI actually returned + logger.debug(f"AI RESPONSE PREVIEW: {response.content[:300] if response.content else 'None'}...") ai_result = response.content # Log extraction response length @@ -900,16 +919,20 @@ class AiService: import json import re - # Clean the response - remove markdown code blocks if present + # Clean the response - remove markdown code blocks and extra formatting cleaned_result = ai_result.strip() - if cleaned_result.startswith('```json'): - # Remove ```json from start and ``` from end - cleaned_result = re.sub(r'^```json\s*', '', cleaned_result) - cleaned_result = re.sub(r'\s*```$', '', cleaned_result) - elif cleaned_result.startswith('```'): - # Remove ``` from start and end - cleaned_result = re.sub(r'^```\s*', '', cleaned_result) - cleaned_result = re.sub(r'\s*```$', '', cleaned_result) + + # Remove any markdown code block markers (```json, ```, etc.) 
+ cleaned_result = re.sub(r'^```(?:json)?\s*', '', cleaned_result, flags=re.MULTILINE) + cleaned_result = re.sub(r'\s*```\s*$', '', cleaned_result, flags=re.MULTILINE) + + # Remove any remaining ``` markers anywhere in the text + cleaned_result = re.sub(r'```', '', cleaned_result) + + # Try to extract JSON from the response if it's embedded in other text + json_match = re.search(r'\{.*\}', cleaned_result, re.DOTALL) + if json_match: + cleaned_result = json_match.group(0) # Validate JSON json.loads(cleaned_result) @@ -1193,7 +1216,13 @@ class AiService: # Parse JSON from AI result chunk_json = json.loads(chunk_result.aiResult) - # Extract sections from this chunk + # Check if this is a multi-file response (has "documents" key) + if isinstance(chunk_json, dict) and "documents" in chunk_json: + # This is a multi-file response - return it as-is + logger.info("Detected multi-file response from AI - preserving structure") + return chunk_json + + # Extract sections from single-file response if isinstance(chunk_json, dict) and "sections" in chunk_json: for section in chunk_json["sections"]: # Add document context to section @@ -1527,6 +1556,152 @@ class AiService: # This ensures MIME-type checking, chunk mapping, and parallel processing return await self._processDocumentsPerChunk(documents, prompt, options) + async def _callAiDirect( + self, + prompt: str, + documents: Optional[List[ChatDocument]], + options: AiCallOptions + ) -> Dict[str, Any]: + """ + Call AI directly with prompt and documents for JSON output. + Used for multi-file generation - uses the existing generation pipeline. 
+ """ + # Use the existing generation pipeline that already works + # This ensures proper document processing and content extraction + logger.info(f"Using existing generation pipeline for {len(documents) if documents else 0} documents") + + # Process documents with JSON merging using the existing pipeline + result = await self._processDocumentsPerChunkJson(documents, prompt, options) + + # Convert single-file result to multi-file format if needed + if "sections" in result and "documents" not in result: + logger.info("Converting single-file result to multi-file format") + # This is a single-file result, convert it to multi-file format + return { + "metadata": result.get("metadata", {"title": "Converted Document"}), + "documents": [{ + "id": "doc_1", + "title": result.get("metadata", {}).get("title", "Document"), + "filename": "document.txt", + "sections": result.get("sections", []) + }] + } + + return result + + async def _processDocumentsPerChunkJsonWithPrompt( + self, + documents: List[ChatDocument], + custom_prompt: str, + options: Optional[AiCallOptions] = None + ) -> Dict[str, Any]: + """ + Process documents with per-chunk AI calls and merge results in JSON mode. + Uses a custom prompt instead of the default extraction prompt. 
+ """ + if not documents: + return {"metadata": {"title": "Empty Document"}, "sections": []} + + # Get model capabilities for size calculation + model_capabilities = self._getModelCapabilitiesForContent(custom_prompt, documents, options) + + # Build extraction options for chunking with intelligent merging + extractionOptions: Dict[str, Any] = { + "prompt": custom_prompt, # Use the custom prompt instead of default + "operationType": options.operationType if options else "general", + "processDocumentsIndividually": True, # Process each document separately + "maxSize": model_capabilities["maxContextBytes"], + "chunkAllowed": True, + "textChunkSize": model_capabilities["textChunkSize"], + "imageChunkSize": model_capabilities["imageChunkSize"], + "imageMaxPixels": 1024 * 1024, + "imageQuality": 85, + "mergeStrategy": { + "useIntelligentMerging": True, # Enable intelligent token-aware merging + "modelCapabilities": model_capabilities, + "prompt": custom_prompt, # Use the custom prompt + "groupBy": "typeGroup", + "orderBy": "id", + "mergeType": "concatenate" + }, + } + + logger.debug(f"Per-chunk extraction options (JSON mode): prompt length={len(extractionOptions.get('prompt', ''))} chars, operationType={extractionOptions.get('operationType')}") + + try: + # Extract content with chunking + extractionResult = self.extractionService.extractContent(documents, extractionOptions) + + if not isinstance(extractionResult, list): + return {"metadata": {"title": "Error Document"}, "sections": []} + + # Process chunks with proper mapping + logger.info(f"Processing {len(extractionResult)} chunks with custom prompt") + logger.debug(f"Custom prompt preview: {custom_prompt[:200]}...") + + # Debug: Show what content is being processed (before filtering) + for i, ec in enumerate(extractionResult): + logger.debug(f"ContentExtracted {i}: id={ec.id}, parts={len(ec.parts) if hasattr(ec, 'parts') else 'no parts'}") + + # Check each part within the ContentExtracted + if hasattr(ec, 'parts'): + 
for j, part in enumerate(ec.parts): + if hasattr(part, 'data') and part.data: + logger.debug(f" Part {j} content preview: {part.data[:200]}...") + else: + # Check what attributes the part actually has + part_attrs = [attr for attr in dir(part) if not attr.startswith('_')] + part_type = getattr(part, 'typeGroup', None) + part_mime = getattr(part, 'mimeType', '') + has_data = hasattr(part, 'data') and bool(part.data) + + logger.debug(f" Part {j} DEBUG: available_attrs={part_attrs}") + logger.debug(f" Part {j} DEBUG: typeGroup='{part_type}', mimeType='{part_mime}', has_data={has_data}") + + # Check if this is an empty container chunk (which is expected) + is_empty_container = False + if part_type == "container" and part_mime and 'document' in part_mime.lower(): + is_empty_container = True + + if is_empty_container: + logger.debug(f" Part {j} is empty container (will be filtered out) - mimeType={part_mime}") + else: + logger.warning(f" Part {j} has no data - typeGroup='{part_type}', mimeType='{part_mime}', attrs={part_attrs}") + else: + logger.warning(f"ContentExtracted {i} has no parts attribute") + + chunkResults = await self._processChunksWithMapping(extractionResult, custom_prompt, options, generate_json=True) + + # Debug: Show what chunks were actually processed (after filtering) + logger.info(f"After filtering: {len(chunkResults)} chunks will be processed") + for i, chunk_result in enumerate(chunkResults): + if chunk_result and chunk_result.metadata.get("success", False): + logger.debug(f"Processed chunk {i}: {chunk_result.metadata.get('typeGroup', 'unknown')} - {len(chunk_result.aiResult)} chars") + else: + logger.debug(f"Processed chunk {i}: error or skipped") + + # Merge with JSON mode + mergedJsonDocument = self._mergeChunkResultsJson(chunkResults, options) + + # Debug: Show what the AI actually returned + logger.info(f"AI returned document with keys: {list(mergedJsonDocument.keys())}") + if 'sections' in mergedJsonDocument: + logger.info(f"Number of 
sections: {len(mergedJsonDocument['sections'])}") + if mergedJsonDocument['sections']: + logger.debug(f"First section preview: {str(mergedJsonDocument['sections'][0])[:200]}...") + else: + logger.warning("AI returned empty sections array") + if 'documents' in mergedJsonDocument: + logger.info(f"Number of documents: {len(mergedJsonDocument['documents'])}") + else: + logger.warning("AI did not return 'documents' key - this is single-file format") + + return mergedJsonDocument + + except Exception as e: + logger.error(f"Error in per-chunk JSON processing: {str(e)}") + return {"metadata": {"title": "Error Document"}, "sections": []} + async def _callAiJson( self, prompt: str, @@ -1821,6 +1996,88 @@ class AiService: target_length = int(len(text) * reduction_factor) return text[:target_length] + "... [reduced]" + async def _analyzePromptIntent(self, prompt: str, ai_service=None) -> Dict[str, Any]: + """Use AI to analyze user prompt and determine processing requirements.""" + if not ai_service: + return {"is_multi_file": False, "strategy": "single", "criteria": None} + + try: + analysis_prompt = f""" +Analyze this user request and determine if it requires multiple file output or single file output. + +User request: "{prompt}" + +Respond with JSON only in this exact format: +{{ + "is_multi_file": true/false, + "strategy": "single|per_entity|by_section|by_criteria|custom", + "criteria": "description of how to split content", + "file_naming_pattern": "suggested pattern for filenames", + "reasoning": "brief explanation of the analysis" +}} + +Consider: +- Does the user want separate files for different entities (customers, products, etc.)? +- Does the user want to split content into multiple documents? +- What would be the most logical way to organize the content? +- What language is the request in? (analyze in the original language) + +Return only the JSON response. 
+""" + + from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType + request_options = AiCallOptions() + request_options.operationType = OperationType.GENERAL + + request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options) + response = await ai_service.aiObjects.call(request) + + if response and response.content: + import json + import re + + # Extract JSON from response + result = response.content.strip() + json_match = re.search(r'\{.*\}', result, re.DOTALL) + if json_match: + result = json_match.group(0) + + analysis = json.loads(result) + return analysis + else: + return {"is_multi_file": False, "strategy": "single", "criteria": None} + + except Exception as e: + logger.warning(f"AI prompt analysis failed: {str(e)}, defaulting to single file") + return {"is_multi_file": False, "strategy": "single", "criteria": None} + + def _validateResponseStructure(self, response: Dict[str, Any], prompt_analysis: Dict[str, Any]) -> bool: + """Validate that AI response matches the expected structure.""" + try: + if not isinstance(response, dict): + logger.warning(f"Response validation failed: Response is not a dict, got {type(response)}") + return False + + # Check for multi-file structure + if prompt_analysis.get("is_multi_file", False): + has_documents = "documents" in response + is_documents_list = isinstance(response.get("documents"), list) + logger.info(f"Multi-file validation: has_documents={has_documents}, is_documents_list={is_documents_list}") + if has_documents and is_documents_list: + logger.info(f"Multi-file validation passed: {len(response['documents'])} documents found") + else: + logger.warning(f"Multi-file validation failed: documents key present={has_documents}, documents is list={is_documents_list}") + logger.warning(f"Available keys: {list(response.keys())}") + return has_documents and is_documents_list + else: + has_sections = "sections" in response + is_sections_list = 
isinstance(response.get("sections"), list) + logger.info(f"Single-file validation: has_sections={has_sections}, is_sections_list={is_sections_list}") + return has_sections and is_sections_list + except Exception as e: + logger.warning(f"Response validation failed with exception: {str(e)}") + return False + async def _callAiWithDocumentGeneration( self, prompt: str, @@ -1831,6 +2088,7 @@ class AiService: ) -> Dict[str, Any]: """ Handle AI calls with document generation in specific output format. + Now supports both single-file and multi-file generation. Args: prompt: The main prompt for the AI call @@ -1842,6 +2100,43 @@ class AiService: Returns: Dict with generated documents and metadata """ + try: + # Use AI to analyze prompt intent + prompt_analysis = await self._analyzePromptIntent(prompt, self) + logger.info(f"Prompt analysis result: {prompt_analysis}") + + if prompt_analysis.get("is_multi_file", False): + return await self._callAiWithMultiFileGeneration( + prompt, documents, options, outputFormat, title, prompt_analysis + ) + else: + return await self._callAiWithSingleFileGeneration( + prompt, documents, options, outputFormat, title + ) + + except Exception as e: + logger.error(f"Error in document generation: {str(e)}") + return { + "success": False, + "error": str(e), + "content": "", + "rendered_content": "", + "mime_type": "text/plain", + "filename": f"error_{outputFormat}", + "format": outputFormat, + "title": title or "Error", + "documents": [] + } + + async def _callAiWithSingleFileGeneration( + self, + prompt: str, + documents: Optional[List[ChatDocument]], + options: AiCallOptions, + outputFormat: str, + title: Optional[str] + ) -> Dict[str, Any]: + """Handle single-file document generation (existing functionality).""" try: # Get format-specific extraction prompt from generation service from modules.services.serviceGeneration.mainServiceGeneration import GenerationService @@ -1912,20 +2207,216 @@ class AiService: "documentName": filename, 
"documentData": renderedContent, "mimeType": mimeType - }] + }], + "is_multi_file": False } except Exception as e: - logger.error(f"Error in document generation: {str(e)}") - return { - "success": False, - "error": str(e), - "content": "", - "rendered_content": "", - "mime_type": "text/plain", - "filename": f"error_{outputFormat}", - "format": outputFormat, - "title": title or "Error", - "documents": [] - } + logger.error(f"Error in single-file document generation: {str(e)}") + raise + + async def _callAiWithMultiFileGeneration( + self, + prompt: str, + documents: Optional[List[ChatDocument]], + options: AiCallOptions, + outputFormat: str, + title: Optional[str], + prompt_analysis: Dict[str, Any] + ) -> Dict[str, Any]: + """Handle multi-file document generation using AI analysis.""" + try: + # Get multi-file extraction prompt based on AI analysis + from modules.services.serviceGeneration.mainServiceGeneration import GenerationService + generation_service = GenerationService(self.services) + + # Use default title if not provided + if not title: + title = "AI Generated Documents" + + # Get adaptive extraction prompt + extraction_prompt = await generation_service.getAdaptiveExtractionPrompt( + outputFormat=outputFormat, + userPrompt=prompt, + title=title, + promptAnalysis=prompt_analysis, + aiService=self + ) + + logger.info(f"Adaptive extraction prompt length: {len(extraction_prompt)} characters") + logger.debug(f"Adaptive extraction prompt preview: {extraction_prompt[:500]}...") + + # Process with adaptive JSON schema - use the existing pipeline but with adaptive prompt + logger.info(f"Using adaptive prompt with existing pipeline: {len(extraction_prompt)} chars") + logger.debug(f"Processing documents: {len(documents) if documents else 0} documents") + + # Use the existing pipeline but replace the prompt with our adaptive one + # This ensures proper document processing while using the multi-file prompt + ai_response = await 
self._processDocumentsPerChunkJsonWithPrompt(documents, extraction_prompt, options) + + logger.info(f"AI response type: {type(ai_response)}") + logger.info(f"AI response keys: {list(ai_response.keys()) if isinstance(ai_response, dict) else 'Not a dict'}") + logger.debug(f"AI response preview: {str(ai_response)[:500]}...") + + # Validate response structure + if not self._validateResponseStructure(ai_response, prompt_analysis): + # Fallback to single-file if multi-file fails + logger.warning(f"Multi-file processing failed - Invalid response structure. Expected multi-file but got: {list(ai_response.keys()) if isinstance(ai_response, dict) else type(ai_response)}") + logger.warning(f"Prompt analysis: {prompt_analysis}") + logger.warning("Falling back to single-file generation") + return await self._callAiWithSingleFileGeneration( + prompt, documents, options, outputFormat, title + ) + + # Process multiple documents + generated_documents = [] + for i, doc_data in enumerate(ai_response.get("documents", [])): + # Transform AI-generated sections to renderer-compatible format + transformed_sections = [] + for section in doc_data.get("sections", []): + # Convert AI format to renderer format + transformed_section = { + "id": section.get("id", f"section_{len(transformed_sections) + 1}"), + "type": section.get("content_type", "paragraph"), + "data": { + "text": "", + "elements": section.get("elements", []) + }, + "order": section.get("order", len(transformed_sections) + 1) + } + + # Extract text from elements for simple text-based sections + if section.get("content_type") in ["paragraph", "heading"]: + text_parts = [] + for element in section.get("elements", []): + if "text" in element: + text_parts.append(element["text"]) + transformed_section["data"]["text"] = "\n".join(text_parts) + + transformed_sections.append(transformed_section) + + # Create complete document structure for rendering + complete_document = { + "metadata": { + "title": doc_data["title"], + 
"source_document": "multi_file_generation", + "document_id": doc_data.get("id", f"doc_{i+1}"), + "filename": doc_data.get("filename", f"document_{i+1}"), + "split_strategy": prompt_analysis.get("strategy", "custom") + }, + "sections": transformed_sections, + "summary": f"Generated document: {doc_data['title']}", + "tags": ["multi_file", "ai_generated"] + } + + rendered_content, mime_type = await generation_service.renderReport( + extractedContent=complete_document, + outputFormat=outputFormat, + title=doc_data["title"], + userPrompt=prompt, + aiService=self + ) + + # Generate proper filename with correct extension + base_filename = doc_data.get("filename", f"document_{i+1}") + # Remove any existing extension and add the correct one + if '.' in base_filename: + base_filename = base_filename.rsplit('.', 1)[0] + + # Add proper extension based on output format + if outputFormat.lower() == "docx": + filename = f"{base_filename}.docx" + elif outputFormat.lower() == "pdf": + filename = f"{base_filename}.pdf" + elif outputFormat.lower() == "html": + filename = f"{base_filename}.html" + else: + filename = f"{base_filename}.{outputFormat}" + + generated_documents.append({ + "documentName": filename, + "documentData": rendered_content, + "mimeType": mime_type + }) + + # Save debug files for multi-file generation - only if debug enabled + debug_enabled = self.services.utils.configGet("APP_DEBUG_CHAT_WORKFLOW_ENABLED", False) + if debug_enabled: + try: + import os + from datetime import datetime, UTC + ts = datetime.now(UTC).strftime("%Y%m%d-%H%M%S") + debug_root = "./test-chat/ai" + debug_dir = os.path.join(debug_root, f"multifile_output_{ts}") + os.makedirs(debug_dir, exist_ok=True) + + # Save metadata + with open(os.path.join(debug_dir, "metadata.txt"), "w", encoding="utf-8") as f: + f.write(f"title: {title}\n") + f.write(f"format: {outputFormat}\n") + f.write(f"documents_count: {len(generated_documents)}\n") + f.write(f"split_strategy: {prompt_analysis.get('strategy', 
'custom')}\n") + f.write(f"prompt_analysis: {prompt_analysis}\n") + + # Save each generated document + for i, doc in enumerate(generated_documents): + doc_filename = doc["documentName"] + doc_data = doc["documentData"] + doc_mime = doc["mimeType"] + + # Determine file extension + if outputFormat.lower() == "docx": + file_ext = ".docx" + elif outputFormat.lower() == "pdf": + file_ext = ".pdf" + elif outputFormat.lower() == "html": + file_ext = ".html" + else: + file_ext = f".{outputFormat}" + + # Save the rendered document + output_path = os.path.join(debug_dir, f"document_{i+1}_{doc_filename}") + + if file_ext in ['.md', '.txt', '.html', '.json', '.csv']: + # Text-based formats + with open(output_path, 'w', encoding='utf-8') as f: + f.write(doc_data) + else: + # Binary formats - decode from base64 if needed + try: + import base64 + doc_bytes = base64.b64decode(doc_data) + with open(output_path, 'wb') as f: + f.write(doc_bytes) + except Exception: + # If not base64, save as text + with open(output_path, 'w', encoding='utf-8') as f: + f.write(doc_data) + + logger.info(f"๐Ÿ’พ Debug: Saved multi-file document {i+1}: {output_path}") + + logger.info(f"๐Ÿ’พ Debug: Multi-file output saved to: {debug_dir}") + + except Exception as e: + logger.warning(f"Failed to save multi-file debug output: {e}") + + return { + "success": True, + "content": ai_response, + "rendered_content": None, # Not applicable for multi-file + "mime_type": None, # Not applicable for multi-file + "filename": None, # Not applicable for multi-file + "format": outputFormat, + "title": title, + "documents": generated_documents, + "is_multi_file": True, + "split_strategy": prompt_analysis.get("strategy", "custom") + } + + except Exception as e: + logger.error(f"Error in multi-file document generation: {str(e)}") + # Fallback to single-file + return await self._callAiWithSingleFileGeneration( + prompt, documents, options, outputFormat, title + ) diff --git 
a/modules/services/serviceGeneration/mainServiceGeneration.py b/modules/services/serviceGeneration/mainServiceGeneration.py index 4c76c95e..340cb8ce 100644 --- a/modules/services/serviceGeneration/mainServiceGeneration.py +++ b/modules/services/serviceGeneration/mainServiceGeneration.py @@ -1,6 +1,6 @@ import logging import uuid -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union, Tuple from datetime import datetime, UTC import re from modules.shared.timezoneUtils import get_utc_timestamp @@ -372,6 +372,42 @@ class GenerationService: logger.error(f"Error rendering JSON report to {outputFormat}: {str(e)}") raise + async def getAdaptiveExtractionPrompt( + self, + outputFormat: str, + userPrompt: str, + title: str, + promptAnalysis: Dict[str, Any], + aiService=None + ) -> str: + """Get adaptive extraction prompt based on AI analysis.""" + from .subPromptBuilder import buildAdaptiveExtractionPrompt + return await buildAdaptiveExtractionPrompt( + outputFormat=outputFormat, + userPrompt=userPrompt, + title=title, + promptAnalysis=promptAnalysis, + aiService=aiService, + services=self.services + ) + + async def getGenericExtractionPrompt( + self, + outputFormat: str, + userPrompt: str, + title: str, + aiService=None + ) -> str: + """Get generic extraction prompt that works for both single and multi-file.""" + from .subPromptBuilder import buildGenericExtractionPrompt + return await buildGenericExtractionPrompt( + outputFormat=outputFormat, + userPrompt=userPrompt, + title=title, + aiService=aiService, + services=self.services + ) + async def getExtractionPrompt(self, outputFormat: str, userPrompt: str, title: str, aiService=None) -> str: """ Get the format-specific extraction prompt for AI content extraction. 
@@ -409,6 +445,75 @@ class GenerationService: logger.error(f"Error getting extraction prompt for {outputFormat}: {str(e)}") raise + async def renderAdaptiveReport( + self, + extractedContent: Dict[str, Any], + outputFormat: str, + title: str, + userPrompt: str = None, + aiService=None, + isMultiFile: bool = False + ) -> Union[Tuple[str, str], List[Dict[str, Any]]]: + """Render report adaptively based on content structure.""" + + if isMultiFile and "documents" in extractedContent: + return await self._renderMultiFileReport( + extractedContent, outputFormat, title, userPrompt, aiService + ) + else: + return await self._renderSingleFileReport( + extractedContent, outputFormat, title, userPrompt, aiService + ) + + async def _renderMultiFileReport( + self, + extractedContent: Dict[str, Any], + outputFormat: str, + title: str, + userPrompt: str = None, + aiService=None + ) -> List[Dict[str, Any]]: + """Render multiple documents from extracted content.""" + + generated_documents = [] + + for doc_data in extractedContent.get("documents", []): + # Use existing single-file renderer for each document + renderer = self._getFormatRenderer(outputFormat) + if not renderer: + continue + + # Render individual document + rendered_content, mime_type = await renderer.render( + extractedContent={"sections": doc_data["sections"]}, + title=doc_data["title"], + userPrompt=userPrompt, + aiService=aiService + ) + + generated_documents.append({ + "filename": doc_data["filename"], + "content": rendered_content, + "mime_type": mime_type, + "title": doc_data["title"] + }) + + return generated_documents + + async def _renderSingleFileReport( + self, + extractedContent: Dict[str, Any], + outputFormat: str, + title: str, + userPrompt: str = None, + aiService=None + ) -> Tuple[str, str]: + """Render single file report (existing functionality).""" + # Use existing renderReport method + return await self.renderReport( + extractedContent, outputFormat, title, userPrompt, aiService + ) + def 
_getFormatRenderer(self, output_format: str): """Get the appropriate renderer for the specified format using auto-discovery.""" try: diff --git a/modules/services/serviceGeneration/subJsonSchema.py b/modules/services/serviceGeneration/subJsonSchema.py index 581e2037..868a6ca4 100644 --- a/modules/services/serviceGeneration/subJsonSchema.py +++ b/modules/services/serviceGeneration/subJsonSchema.py @@ -6,8 +6,197 @@ This module provides schemas that guide AI to generate structured JSON output. from typing import Dict, Any +def get_multi_document_subJsonSchema() -> Dict[str, Any]: + """Get the JSON schema for multi-document generation.""" + return { + "type": "object", + "required": ["metadata", "documents"], + "properties": { + "metadata": { + "type": "object", + "required": ["title", "splitStrategy"], + "properties": { + "title": {"type": "string", "description": "Document title"}, + "splitStrategy": { + "type": "string", + "enum": ["per_entity", "by_section", "by_criteria", "by_data_type", "custom"], + "description": "Strategy for splitting content into multiple files" + }, + "splitCriteria": { + "type": "object", + "description": "Custom criteria for splitting (e.g., entity_id, category, etc.)" + }, + "fileNamingPattern": { + "type": "string", + "description": "Pattern for generating filenames (e.g., '{entity_name}_data.docx')" + }, + "author": {"type": "string", "description": "Document author (optional)"}, + "source_documents": { + "type": "array", + "items": {"type": "string"}, + "description": "List of source document IDs" + }, + "extraction_method": { + "type": "string", + "default": "ai_extraction", + "description": "Method used for extraction" + } + } + }, + "documents": { + "type": "array", + "description": "Array of individual documents to generate", + "items": { + "type": "object", + "required": ["id", "title", "sections", "filename"], + "properties": { + "id": {"type": "string", "description": "Unique document identifier"}, + "title": {"type": "string", 
"description": "Document title"}, + "filename": {"type": "string", "description": "Generated filename"}, + "sections": { + "type": "array", + "description": "Document sections containing structured content", + "items": { + "type": "object", + "required": ["id", "content_type", "elements", "order"], + "properties": { + "id": {"type": "string", "description": "Unique section identifier"}, + "title": {"type": "string", "description": "Section title (optional)"}, + "content_type": { + "type": "string", + "enum": ["table", "list", "paragraph", "heading", "code", "image", "mixed"], + "description": "Primary content type of this section" + }, + "elements": { + "type": "array", + "description": "Content elements in this section", + "items": { + "oneOf": [ + {"$ref": "#/definitions/table"}, + {"$ref": "#/definitions/bullet_list"}, + {"$ref": "#/definitions/paragraph"}, + {"$ref": "#/definitions/heading"}, + {"$ref": "#/definitions/code_block"} + ] + } + }, + "order": {"type": "integer", "description": "Section order in document"}, + "metadata": { + "type": "object", + "description": "Additional section metadata" + } + } + } + }, + "metadata": { + "type": "object", + "description": "Document-specific metadata" + } + } + } + } + }, + "definitions": { + "table": { + "type": "object", + "required": ["headers", "rows"], + "properties": { + "headers": { + "type": "array", + "items": {"type": "string"}, + "description": "Table column headers" + }, + "rows": { + "type": "array", + "items": { + "type": "array", + "items": {"type": "string"} + }, + "description": "Table data rows" + }, + "caption": { + "type": "string", + "description": "Table caption (optional)" + } + } + }, + "bullet_list": { + "type": "object", + "required": ["items"], + "properties": { + "items": { + "type": "array", + "items": { + "type": "object", + "required": ["text"], + "properties": { + "text": {"type": "string", "description": "List item text"}, + "subitems": { + "type": "array", + "items": {"$ref": 
"#/definitions/list_item"}, + "description": "Nested sub-items (optional)" + } + } + }, + "description": "List items" + }, + "list_type": { + "type": "string", + "enum": ["bullet", "numbered", "checklist"], + "default": "bullet", + "description": "Type of list" + } + } + }, + "list_item": { + "type": "object", + "required": ["text"], + "properties": { + "text": {"type": "string", "description": "List item text"}, + "subitems": { + "type": "array", + "items": {"$ref": "#/definitions/list_item"}, + "description": "Nested sub-items (optional)" + } + } + }, + "paragraph": { + "type": "object", + "required": ["text"], + "properties": { + "text": {"type": "string", "description": "Paragraph text"}, + "formatting": { + "type": "object", + "description": "Text formatting (bold, italic, etc.)" + } + } + }, + "heading": { + "type": "object", + "required": ["text", "level"], + "properties": { + "text": {"type": "string", "description": "Heading text"}, + "level": { + "type": "integer", + "minimum": 1, + "maximum": 6, + "description": "Heading level (1-6)" + } + } + }, + "code_block": { + "type": "object", + "required": ["code"], + "properties": { + "code": {"type": "string", "description": "Code content"}, + "language": {"type": "string", "description": "Programming language (optional)"} + } + } + } + } + def get_document_subJsonSchema() -> Dict[str, Any]: - """Get the JSON schema for structured document generation.""" + """Get the JSON schema for structured document generation (single document).""" return { "type": "object", "required": ["metadata", "sections"], @@ -227,6 +416,13 @@ Return only the enhanced JSON structure following the schema. 
def get_adaptive_json_schema(prompt_analysis: Dict[str, Any] = None) -> Dict[str, Any]:
    """Select the appropriate extraction schema from a prompt analysis.

    Returns the multi-document schema when the analysis flags the request as
    multi-file, otherwise the single-document schema.
    """
    if prompt_analysis and prompt_analysis.get("is_multi_file", False):
        return get_multi_document_subJsonSchema()
    return get_document_subJsonSchema()


# Content types accepted for a section's "content_type" field (shared by the
# single- and multi-document layouts).
_VALID_CONTENT_TYPES = ("table", "list", "paragraph", "heading", "code", "image", "mixed")


def _validate_sections(sections: Any) -> bool:
    """Validate a list of section dicts (shared by both document layouts)."""
    if not isinstance(sections, list):
        return False
    for section in sections:
        if not isinstance(section, dict):
            return False
        # Every section must carry all four required fields.
        for field in ("id", "content_type", "elements", "order"):
            if field not in section:
                return False
        if section["content_type"] not in _VALID_CONTENT_TYPES:
            return False
        if not isinstance(section["elements"], list):
            return False
    return True


def validate_json_document(json_data: Dict[str, Any]) -> bool:
    """Validate that *json_data* follows the document schema.

    Accepts either the multi-document layout (root keys ``metadata`` and
    ``documents``) or the single-document layout (root keys ``metadata`` and
    ``sections``).  Returns True when the structure is valid, False otherwise;
    never raises.

    The previous implementation duplicated the per-section checks verbatim in
    both branches (and looped with an unused ``enumerate`` index); the shared
    checks are factored into ``_validate_sections``.
    """
    if not isinstance(json_data, dict):
        return False

    # Both layouts require a metadata dict with at least a title.
    metadata = json_data.get("metadata")
    if not isinstance(metadata, dict) or "title" not in metadata:
        return False

    if "documents" in json_data:
        # Multi-document layout additionally requires a split strategy.
        if "splitStrategy" not in metadata:
            return False
        documents = json_data["documents"]
        if not isinstance(documents, list):
            return False
        for doc in documents:
            if not isinstance(doc, dict):
                return False
            for field in ("id", "title", "sections", "filename"):
                if field not in doc:
                    return False
            if not _validate_sections(doc["sections"]):
                return False
        return True

    if "sections" in json_data:
        # Single-document layout.
        return _validate_sections(json_data["sections"])

    # Neither recognised layout is present.
    return False
_RendererLike(Protocol): @@ -16,6 +17,291 @@ class _RendererLike(Protocol): ... +async def buildAdaptiveExtractionPrompt( + outputFormat: str, + userPrompt: str, + title: str, + promptAnalysis: Dict[str, Any], + aiService=None, + services=None +) -> str: + """Build adaptive extraction prompt based on AI analysis.""" + + # Get appropriate JSON schema based on analysis + from .subJsonSchema import get_adaptive_json_schema + json_schema = get_adaptive_json_schema(promptAnalysis) + + if promptAnalysis.get("is_multi_file", False): + schema_type = "multi-document" + else: + schema_type = "single-document" + + # Build adaptive prompt using AI analysis - match single-file style + if promptAnalysis.get("is_multi_file", False): + # Check if this is JSON email data + is_json_email = any(keyword in userPrompt.lower() for keyword in ['email', 'mail', 'json', 'message', 'conversation']) + + if is_json_email: + # Specialized prompt for JSON email data + multi_file_example = { + "metadata": { + "title": "Email Conversations", + "splitStrategy": "per_entity" + }, + "documents": [ + { + "id": "doc_1", + "title": "Email from SENDER to RECIPIENT", + "filename": "email_sender_to_recipient.txt", + "sections": [ + { + "id": "section_1", + "content_type": "heading", + "elements": [ + { + "text": "Email from SENDER to RECIPIENT", + "level": 1 + } + ], + "order": 1 + }, + { + "id": "section_2", + "content_type": "paragraph", + "elements": [ + { + "text": "FULL_EMAIL_CONTENT_HERE" + } + ], + "order": 2 + } + ] + } + ] + } + else: + # Generic multi-file prompt + multi_file_example = { + "metadata": { + "title": "REPLACE_WITH_ACTUAL_DOCUMENT_TITLE", + "splitStrategy": "by_section" + }, + "documents": [ + { + "id": "doc_1", + "title": "REPLACE_WITH_ACTUAL_SECTION_TITLE", + "filename": "REPLACE_WITH_ACTUAL_FILENAME", + "sections": [ + { + "id": "section_1", + "content_type": "heading", + "elements": [ + { + "text": "REPLACE_WITH_ACTUAL_HEADING_TEXT", + "level": 1 + } + ], + "order": 1 + }, + { + 
"id": "section_2", + "content_type": "paragraph", + "elements": [ + { + "text": "REPLACE_WITH_ACTUAL_PARAGRAPH_CONTENT" + } + ], + "order": 2 + } + ] + } + ] + } + + adaptive_prompt = f""" +{userPrompt} + +You are a document processing assistant that extracts and structures content from documents. Your task is to analyze the provided document content and create a structured JSON output. + +TASK: Extract the actual content from the document and organize it into separate sections, where each section will become a separate file. + +REQUIREMENTS: +1. Analyze the document content provided in the context below +2. Identify distinct sections in the document (by headings, topics, or logical breaks) +3. Create one JSON document entry for each section found +4. Extract the real content from each section (headings, paragraphs, lists, etc.) +5. Generate appropriate filenames for each section + +CRITICAL: You MUST return a JSON structure with a "documents" array, NOT a "sections" array. + +OUTPUT FORMAT: Return only valid JSON in this exact structure: +{json.dumps(multi_file_example, indent=2)} + +IMPORTANT: The JSON must have a "documents" key containing an array of document objects. Each document object must have: +- "id": unique identifier +- "title": section title from the document +- "filename": appropriate filename for the section +- "sections": array of content sections + +DO NOT return a JSON with "sections" at the root level. Return a JSON with "documents" at the root level. 
+ +INSTRUCTIONS: +- Replace "REPLACE_WITH_ACTUAL_*" placeholders with real content from the document +- Use actual section titles, headings, and text from the document +- Create meaningful filenames based on section content +- Ensure each section contains the complete content for that part of the document +- Do not use generic placeholder text like "Section 1", "Section 2" +- Extract real headings, paragraphs, lists, and other content elements +- CRITICAL: Return JSON with "documents" array, not "sections" array + +CONTEXT (Document Content): + +Content Types to Extract: +1. Tables: Extract all rows and columns with proper headers +2. Lists: Extract all items with proper nesting +3. Headings: Extract with appropriate levels +4. Paragraphs: Extract as structured text +5. Code: Extract code blocks with language identification +6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements + +Image Analysis Requirements: +- If you cannot analyze an image for any reason, explain why in the JSON response +- Describe everything you see in the image +- Include all text content, tables, logos, graphics, layout, and visual elements +- If the image is too small, corrupted, or unclear, explain this +- Always provide feedback - never return empty responses + +Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON. + +Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents. +""".strip() + else: + # Single-file prompt - use original style + adaptive_prompt = f""" +{userPrompt} + +You are extracting structured content from documents and must respond with valid JSON only. + +IMPORTANT: You must respond with valid JSON only. No additional text, explanations, or formatting outside the JSON structure. 
async def buildGenericExtractionPrompt(
    outputFormat: str,
    userPrompt: str,
    title: str,
    aiService=None,
    services=None
) -> str:
    """Build a generic extraction prompt that works for both single and multi-file.

    When an AI service is supplied, the user's request is first analysed to
    decide between the single-document and multi-document JSON structures; the
    analysis is then handed to ``buildAdaptiveExtractionPrompt``.  If the
    analysis fails for any reason (or no AI service is available), the function
    falls back to the single-document extraction prompt.

    Args:
        outputFormat: Target output format (e.g. "docx", "pdf").
        userPrompt: The user's original request text.
        title: Title for the generated document(s).
        aiService: Optional AI service used for intent analysis.
        services: Optional service center, used only for debug logging.

    Returns:
        The fully assembled extraction prompt string.
    """
    # Use AI to determine the best approach
    if aiService:
        try:
            analysis_prompt = f"""
Analyze this user request and determine the best JSON structure for document extraction.

User request: "{userPrompt}"

Respond with JSON only:
{{
    "requires_multi_file": true/false,
    "recommended_schema": "single_document|multi_document",
    "split_approach": "description of how to organize content",
    "file_naming": "suggested naming pattern"
}}

Consider the user's intent and the most logical way to organize the extracted content.
"""

            from modules.datamodels.datamodelAi import AiCallRequest, AiCallOptions, OperationType
            request_options = AiCallOptions()
            request_options.operationType = OperationType.GENERAL

            request = AiCallRequest(prompt=analysis_prompt, context="", options=request_options)
            response = await aiService.aiObjects.call(request)

            if response and response.content:
                import re

                result = response.content.strip()
                # The model may wrap the JSON in extra prose; keep only the
                # outermost {...} span before parsing.
                json_match = re.search(r'\{.*\}', result, re.DOTALL)
                if json_match:
                    result = json_match.group(0)

                analysis = json.loads(result)

                # Bug fix: this analysis prompt asks the model for
                # "requires_multi_file", but buildAdaptiveExtractionPrompt and
                # get_adaptive_json_schema key off "is_multi_file", so the
                # multi-file path could never trigger from here.  Normalise the
                # parsed analysis before handing it on.
                if "is_multi_file" not in analysis:
                    analysis["is_multi_file"] = bool(analysis.get("requires_multi_file", False))

                # Use analysis to build appropriate prompt
                return await buildAdaptiveExtractionPrompt(
                    outputFormat, userPrompt, title, analysis, aiService, services
                )
        except Exception as e:
            # Bug fix: the original dereferenced `services` unconditionally,
            # raising AttributeError (and masking the real failure) whenever
            # the caller used the documented default services=None.
            if services is not None:
                services.utils.debugLogToFile(f"Generic prompt analysis failed: {str(e)}", "PROMPT_BUILDER")

    # Fallback to single-file prompt
    from .subJsonSchema import get_document_subJsonSchema
    json_schema = get_document_subJsonSchema()

    return f"""
{userPrompt}

You are extracting structured content from documents and must respond with valid JSON only.

CRITICAL: You must respond with valid JSON only. No additional text, explanations, markdown formatting, code blocks, or any other content outside the JSON structure. Do not use ``` markers or any other formatting.

Extract the actual data from the source documents and structure it as JSON with this format:
{json.dumps(json_schema, indent=2)}

Requirements:
- Preserve all original data - do not summarize or interpret
- Use the exact JSON schema provided
- Maintain data integrity and structure

Content Types to Extract:
1. Tables: Extract all rows and columns with proper headers
2. Lists: Extract all items with proper nesting
3. Headings: Extract with appropriate levels
4. Paragraphs: Extract as structured text
5. Code: Extract code blocks with language identification
6. Images: Analyze images and describe all visible content including text, tables, logos, graphics, layout, and visual elements

Image Analysis Requirements:
- If you cannot analyze an image for any reason, explain why in the JSON response
- Describe everything you see in the image
- Include all text content, tables, logos, graphics, layout, and visual elements
- If the image is too small, corrupted, or unclear, explain this
- Always provide feedback - never return empty responses

Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON.

Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents.

DO NOT return a schema description - return actual extracted content in the JSON format shown above.
"""
Extract the actual data from the source documents and structure it as JSON with this format: {{ @@ -106,6 +392,10 @@ Image Analysis Requirements: - Always provide feedback - never return empty responses Return only the JSON structure with actual data from the documents. Do not include any text before or after the JSON. + +Extract the ACTUAL CONTENT from the source documents. Do not use placeholder text like "Section 1", "Section 2", etc. Extract the real headings, paragraphs, and content from the documents. + +DO NOT return a schema description - return actual extracted content in the JSON format shown above. """.strip() # Final assembly diff --git a/test_document_processing.py b/test_document_processing.py index 41e3a3a2..49a42f72 100644 --- a/test_document_processing.py +++ b/test_document_processing.py @@ -220,6 +220,8 @@ async def process_documents_and_generate_summary(): userPrompt = "Analyze the document containing mails for customer use cases. Can you create one file for each email in plain text format?" + # userPrompt = "Can you create one file for each section in the document" + # userPrompt = "Analyze these documents and create a fitting image for the content" # userPrompt = "Extract the table from file and produce 2 lists in excel. one list with all entries, one list only with entries that are yellow highlighted." diff --git a/test_multifile_processing.py b/test_multifile_processing.py new file mode 100644 index 00000000..737127bf --- /dev/null +++ b/test_multifile_processing.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 +""" +Test script for multi-file processing implementation. +This script tests the new multi-file functionality without breaking existing single-file processing. 
+""" + +import asyncio +import json +import logging +from typing import Dict, Any, List + +# Setup logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +async def test_multi_file_detection(): + """Test AI-powered multi-file detection.""" + print("=== Testing Multi-File Detection ===") + + # Mock AI service for testing + class MockAiService: + async def call(self, request): + class MockResponse: + def __init__(self, content): + self.content = content + return MockResponse('{"is_multi_file": true, "strategy": "per_entity", "criteria": "customer_id", "file_naming_pattern": "{customer_name}_data.docx", "reasoning": "User wants separate files for each customer"}') + + class MockAiObjects: + def __init__(self): + self.call = MockAiService().call + + # Import the AI service + try: + from modules.services.serviceAi.mainServiceAi import AiService + + # Create mock service center + class MockServiceCenter: + def __init__(self): + self.utils = MockUtils() + + class MockUtils: + def debugLogToFile(self, message, category): + print(f"[{category}] {message}") + + # Create AI service instance + ai_service = AiService(MockServiceCenter()) + ai_service.aiObjects = MockAiObjects() + + # Test prompts + test_prompts = [ + "Create one file for each customer in the document", + "Split the data into separate files by category", + "Generate individual files for each product", + "Create a single report with all data", + "Erstelle eine Datei fรผr jeden Kunden", # German + "Crรฉer un fichier par section" # French + ] + + for prompt in test_prompts: + print(f"\nTesting prompt: '{prompt}'") + try: + analysis = await ai_service._analyzePromptIntent(prompt, ai_service) + print(f" Analysis: {analysis}") + + if analysis.get("is_multi_file"): + print(f" โœ“ Detected as multi-file with strategy: {analysis.get('strategy')}") + else: + print(f" โœ“ Detected as single-file") + + except Exception as e: + print(f" โœ— Error: {str(e)}") + + print("\n=== Multi-File 
Detection Test Complete ===") + return True + + except ImportError as e: + print(f"Import error: {e}") + print("Make sure you're running from the gateway directory") + return False + except Exception as e: + print(f"Error during testing: {e}") + return False + +async def test_json_schema_validation(): + """Test JSON schema validation for both single and multi-file.""" + print("\n=== Testing JSON Schema Validation ===") + + try: + from modules.services.serviceGeneration.subJsonSchema import ( + get_document_subJsonSchema, + get_multi_document_subJsonSchema, + get_adaptive_json_schema, + validate_json_document + ) + + # Test single document schema + single_doc_schema = get_document_subJsonSchema() + print(f"โœ“ Single document schema loaded: {len(single_doc_schema)} properties") + + # Test multi-document schema + multi_doc_schema = get_multi_document_subJsonSchema() + print(f"โœ“ Multi-document schema loaded: {len(multi_doc_schema)} properties") + + # Test adaptive schema selection + single_analysis = {"is_multi_file": False} + multi_analysis = {"is_multi_file": True} + + single_schema = get_adaptive_json_schema(single_analysis) + multi_schema = get_adaptive_json_schema(multi_analysis) + + print(f"โœ“ Adaptive schema selection working") + print(f" Single-file schema type: {single_schema.get('type', 'unknown')}") + print(f" Multi-file schema type: {multi_schema.get('type', 'unknown')}") + + # Test validation with sample data + single_doc_data = { + "metadata": {"title": "Test Document"}, + "sections": [ + { + "id": "section_1", + "content_type": "paragraph", + "elements": [{"text": "Test content"}], + "order": 1 + } + ] + } + + multi_doc_data = { + "metadata": { + "title": "Test Documents", + "splitStrategy": "per_entity" + }, + "documents": [ + { + "id": "doc_1", + "title": "Document 1", + "filename": "doc1.docx", + "sections": [ + { + "id": "section_1", + "content_type": "paragraph", + "elements": [{"text": "Content 1"}], + "order": 1 + } + ] + } + ] + } + + 
single_valid = validate_json_document(single_doc_data) + multi_valid = validate_json_document(multi_doc_data) + + print(f"โœ“ Single document validation: {'PASS' if single_valid else 'FAIL'}") + print(f"โœ“ Multi-document validation: {'PASS' if multi_valid else 'FAIL'}") + + print("\n=== JSON Schema Validation Test Complete ===") + return True + + except ImportError as e: + print(f"Import error: {e}") + return False + except Exception as e: + print(f"Error during schema testing: {e}") + return False + +async def test_prompt_builder(): + """Test adaptive prompt building.""" + print("\n=== Testing Prompt Builder ===") + + try: + from modules.services.serviceGeneration.subPromptBuilder import ( + buildAdaptiveExtractionPrompt, + buildGenericExtractionPrompt + ) + + # Mock services + class MockServices: + def __init__(self): + self.utils = MockUtils() + + class MockUtils: + def debugLogToFile(self, message, category): + print(f"[{category}] {message}") + + services = MockServices() + + # Test adaptive prompt building + prompt_analysis = { + "is_multi_file": True, + "strategy": "per_entity", + "criteria": "customer_id", + "file_naming_pattern": "{customer_name}_data.docx" + } + + adaptive_prompt = await buildAdaptiveExtractionPrompt( + outputFormat="docx", + userPrompt="Create one file for each customer", + title="Customer Data", + promptAnalysis=prompt_analysis, + aiService=None, + services=services + ) + + print(f"โœ“ Adaptive prompt generated: {len(adaptive_prompt)} characters") + print(f" Contains multi-file instructions: {'documents' in adaptive_prompt}") + + # Test generic prompt building + generic_prompt = await buildGenericExtractionPrompt( + outputFormat="docx", + userPrompt="Create a single report", + title="Report", + aiService=None, + services=services + ) + + print(f"โœ“ Generic prompt generated: {len(generic_prompt)} characters") + print(f" Contains single-file instructions: {'sections' in generic_prompt}") + + print("\n=== Prompt Builder Test Complete 
===") + return True + + except ImportError as e: + print(f"Import error: {e}") + return False + except Exception as e: + print(f"Error during prompt builder testing: {e}") + return False + +async def main(): + """Run all tests.""" + print("Starting Multi-File Processing Tests...") + print("=" * 50) + + tests = [ + test_multi_file_detection, + test_json_schema_validation, + test_prompt_builder + ] + + results = [] + for test in tests: + try: + result = await test() + results.append(result) + except Exception as e: + print(f"Test failed with exception: {e}") + results.append(False) + + print("\n" + "=" * 50) + print("Test Results Summary:") + print(f" Tests run: {len(tests)}") + print(f" Passed: {sum(results)}") + print(f" Failed: {len(tests) - sum(results)}") + + if all(results): + print("\n๐ŸŽ‰ All tests passed! Multi-file processing is ready.") + else: + print("\nโš ๏ธ Some tests failed. Check the implementation.") + + return all(results) + +if __name__ == "__main__": + asyncio.run(main())